From 7a934d0bbc43656c8acf3ea9e1003610b8f70ced Mon Sep 17 00:00:00 2001 From: Matthew Brookhart Date: Thu, 28 Feb 2019 19:39:51 -0800 Subject: [PATCH 1/8] Add a blocked convolution implementation as a proof of concept --- src/impl/blocked_conv.jl | 129 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 src/impl/blocked_conv.jl diff --git a/src/impl/blocked_conv.jl b/src/impl/blocked_conv.jl new file mode 100644 index 000000000..3be53d9d3 --- /dev/null +++ b/src/impl/blocked_conv.jl @@ -0,0 +1,129 @@ +using BenchmarkTools +using NNlib +using LinearAlgebra +using Primes +using SIMD +using Flux +# using CuArrays + +BLAS.set_num_threads(4) + + + +# unoptimized blocking +function block(x, block_axis = 3, block_len = 8) + @assert size(x)[block_axis]%block_len == 0 + shape = [i for i in size(x)] + shape[block_axis] /= block_len + insert!(shape, block_axis, block_len) + permute = vcat([block_axis], [i for i=1:length(shape) if i != block_axis]) + permutedims(reshape(x,Tuple(shape)), permute) +end + +function deblock(x, block_axis = 3) + permute = [i for i = 2:length(size(x))] + insert!(permute, block_axis, 1) + shape = [size(x)[i] for i = 2:length(size(x))] + shape[block_axis] *= size(x)[1] + reshape(permutedims(x, permute), Tuple(shape)) +end + +function blocked_conv2d_inner_loop!(Out::Array{T,5}, + X::Array{T,5}, + W::Array{T,6}, + ol::Int64, + ::Type{Vec{B,T}}, + pad = 0, + stride = 1, + dilation = 1) where {B,T} + cₒ, cᵢ, Wf, Hf, Cᵢ, Cₒ = size(W) + cₒ, Wₒ, Hₒ, Cₒ, N = size(Out) + cᵢ, Wᵢ, Hᵢ, Cᵢ, N = size(X) + # get fused loop indexes + ool = ol - 1 + batch = div(ool, Cₒ * Cᵢ * Hₒ) + ool -= (batch) * Cₒ * Cᵢ * Hₒ + j′ = div(ool, Cᵢ * Hₒ) + ool -= (j′) * Cᵢ * Hₒ + i′ = div(ool, Hₒ) + ool -= i′ * Hₒ + l = ool + batch += 1 + j′ += 1 + i′ += 1 + l += 1 + @inbounds for n = 1:Hf, m = 1:Wf, k′ = 1:Wₒ + # pre-calculate indexes for the inner loop + I_w = 1 + stride * (k′ - 1) + dilation * (m - 1) - pad + I_h = 1 + stride * (l - 1) + dilation * (n - 1) - pad + if (I_w < 1 || I_h < 1 || I_h > Hᵢ || I_w > Wᵢ) + continue + end + F_w = Wf - (m - 1) + F_h = Hf - (n - 1) + @inbounds for ii = 1:B + tmpI = Vec{8, T}(X[ii, I_w, I_h, i′, batch]) + tmpO = vload(Vec{B, T}, view(Out, :, k′, l, j′, batch), 1) + tmpW = vload(Vec{B, T}, view(W, :, ii, F_w, F_h, i′, j′), 1) + tmpOut = fma(tmpI, tmpW, tmpO) + vstore(tmpOut, view(Out, :, k′, l, j′, batch), 1) + end + end +end + +function blocked_conv2d!(Out::Array{T,5}, X::Array{T,5}, W::Array{T,6}, pad = 0, stride = 1, dilation = 1) where T<:Number + @assert size(Out)[1] == size(W)[1] + @assert size(X)[1] == size(W)[2] + ## Fuse a few outer loops to make sure we have enough jobs for the threads + ## Most important if it's a low batch size kernel + out_loop_size = size(Out)[5] * size(W)[5] * size(W)[6] * size(Out)[3] + @inbounds Threads.@threads for ol = 1:out_loop_size + blocked_conv2d_inner_loop!(Out, X, W, ol, Vec{size(X)[1],T}, pad, stride, dilation) + end +end + +function blocked_conv2d(X::Array{T,5}, W::Array{T,6}; pad = 0, stride = 1, dilation = 1) where T<:Number + @assert size(X)[1] == size(W)[2] + @assert size(X)[4] == size(W)[5] + cₒ, cᵢ, Wf, Hf, Cᵢ, Cₒ = size(W) + cᵢ, Wᵢ, Hᵢ, Cᵢ, N = size(X) + Wₒ = 1 + div(Wᵢ - (dilation * (Wf - 1) + 1) + 2 * pad, stride) + Hₒ = 1 + div(Hᵢ - (dilation * (Hf - 1) + 1) + 2 * pad, stride) + + Out = zeros(T, cₒ, Wₒ, Hₒ, Cₒ, N) + blocked_conv2d!(Out, X, W, pad, stride, dilation) + Out +end + + +for im_size = [32, 64, 128, 192] + for k_size = [5] + for pad = [3], stride = [2], dilation = [2] + 
X = rand(Float32, im_size, im_size, 32, 128)
+            W = rand(Float32, k_size, k_size, 32, 16)
+
+            println("Data Shape: $(size(X))")
+            println("Weight Shape: $(size(W))")
+            println("pad=$pad, stride=$stride, dilation=$dilation")
+
+            # print("block_data: ")
+            # @btime block($X)
+            # print("block_weights: ")
+            # @btime block(block($W,3),5)
+
+            bX = block(X)
+            bW = block(block(W,3),5)
+
+            print("blocked_conv2d: ")
+            @btime Out1 = blocked_conv2d($bX, $bW, pad = $pad, stride = $stride, dilation = $dilation)
+            print("NNlib.conv: ")
+            @btime Out2 = NNlib.conv($gpu($X), $gpu($W), pad = $pad, stride = $stride, dilation = $dilation)
+
+            Out1 = blocked_conv2d(bX, bW, pad = pad, stride = stride, dilation = dilation)
+
+            Out2 = NNlib.conv(X, W, pad = pad, stride = stride, dilation = dilation)
+            @assert isapprox(deblock(Out1), Out2)
+            println()
+        end
+    end
+end

From b36f18d4f0cfb17d46b3a8dc01c0caea95826908 Mon Sep 17 00:00:00 2001
From: Matthew Brookhart
Date: Sun, 3 Mar 2019 20:12:14 -0800
Subject: [PATCH 2/8] clean up loop variable/shape names

---
 src/impl/blocked_conv.jl | 66 +++++++++++++++++++++++++---------------
 1 file changed, 41 insertions(+), 25 deletions(-)

diff --git a/src/impl/blocked_conv.jl b/src/impl/blocked_conv.jl
index 3be53d9d3..c96c2cf23 100644
--- a/src/impl/blocked_conv.jl
+++ b/src/impl/blocked_conv.jl
@@ -28,6 +28,18 @@ function deblock(x, block_axis = 3)
     reshape(permutedims(x, permute), Tuple(shape))
 end
 
+## Iteration indices, outer to inner:
+# batch - n
+# out channels - j
+# in channels - i
+# out height - hₒ
+# out width - wₒ
+# filter height - hₖ
+# filter width - wₖ
+# filter depth - dₖ
+# out depth - dₒ
+# in blocked channels - ii
+# out blocked channels (SIMD'd) - jj
 function blocked_conv2d_inner_loop!(Out::Array{T,5},
                                     X::Array{T,5},
                                     W::Array{T,6},
@@ -36,37 +48,41 @@ function blocked_conv2d_inner_loop!(Out::Array{T,5},
                                     ol::Int64,
                                     ::Type{Vec{B,T}},
                                     pad = 0,
                                     stride = 1,
                                     dilation = 1) where {B,T}
-    cₒ, cᵢ, Wf, Hf, Cᵢ, Cₒ = size(W)
+    cₒ, cᵢ, Wₖ, Hₖ, Cᵢ, Cₒ = size(W)
     cₒ, Wₒ, Hₒ, Cₒ, N = size(Out)
     cᵢ, Wᵢ, Hᵢ, Cᵢ, N = size(X)
+
     # get fused loop indexes
     ool = ol - 1
-    batch = div(ool, Cₒ * Cᵢ * Hₒ)
-    ool -= (batch) * Cₒ * Cᵢ * Hₒ
-    j′ = div(ool, Cᵢ * Hₒ)
-    ool -= (j′) * Cᵢ * Hₒ
-    i′ = div(ool, Hₒ)
-    ool -= i′ * Hₒ
-    l = ool
-    batch += 1
-    j′ += 1
-    i′ += 1
-    l += 1
+    n = div(ool, Cₒ * Cᵢ * Hₒ)
+    ool -= (n) * Cₒ * Cᵢ * Hₒ
+    j = div(ool, Cᵢ * Hₒ)
+    ool -= (j) * Cᵢ * Hₒ
+    i = div(ool, Hₒ)
+    ool -= i * Hₒ
+    hₒ = ool
+
+    n += 1
+    j += 1
+    i += 1
+    hₒ += 1
+
-    @inbounds for n = 1:Hf, m = 1:Wf, k′ = 1:Wₒ
+    @inbounds for hₖ = 1:Hₖ, wₖ = 1:Wₖ, wₒ = 1:Wₒ
         # pre-calculate indexes for the inner loop
-        I_w = 1 + stride * (k′ - 1) + dilation * (m - 1) - pad
-        I_h = 1 + stride * (l - 1) + dilation * (n - 1) - pad
-        if (I_w < 1 || I_h < 1 || I_h > Hᵢ || I_w > Wᵢ)
+        hᵢ = 1 + stride * (hₒ - 1) + dilation * (hₖ - 1) - pad
+        wᵢ = 1 + stride * (wₒ - 1) + dilation * (wₖ - 1) - pad
+        # Check for over-input TODO(mbrookhart): move to compile step
+        if (hᵢ < 1 || wᵢ < 1 || hᵢ > Hᵢ || wᵢ > Wᵢ)
             continue
         end
-        F_w = Wf - (m - 1)
-        F_h = Hf - (n - 1)
+        F_w = Wₖ - (wₖ - 1)
+        F_h = Hₖ - (hₖ - 1)
         @inbounds for ii = 1:B
-            tmpI = Vec{8, T}(X[ii, I_w, I_h, i′, batch])
-            tmpO = vload(Vec{B, T}, view(Out, :, k′, l, j′, batch), 1)
-            tmpW = vload(Vec{B, T}, view(W, :, ii, F_w, F_h, i′, j′), 1)
+            tmpI = Vec{8, T}(X[ii, wᵢ, hᵢ, i, n])
+            tmpO = vload(Vec{B, T}, view(Out, :, wₒ, hₒ, j, n), 1)
+            tmpW = vload(Vec{B, T}, view(W, :, ii, F_w, F_h, i, j), 1)
             tmpOut = fma(tmpI, tmpW, tmpO)
-            vstore(tmpOut, view(Out, :, k′, l, j′, batch), 1)
+            vstore(tmpOut,
view(Out, :, wₒ, hₒ, j, n), 1) end end end @@ -85,10 +101,10 @@ end function blocked_conv2d(X::Array{T,5}, W::Array{T,6}; pad = 0, stride = 1, dilation = 1) where T<:Number @assert size(X)[1] == size(W)[2] @assert size(X)[4] == size(W)[5] - cₒ, cᵢ, Wf, Hf, Cᵢ, Cₒ = size(W) + cₒ, cᵢ, Wₖ, Hₖ, Cᵢ, Cₒ = size(W) cᵢ, Wᵢ, Hᵢ, Cᵢ, N = size(X) - Wₒ = 1 + div(Wᵢ - (dilation * (Wf - 1) + 1) + 2 * pad, stride) - Hₒ = 1 + div(Hᵢ - (dilation * (Hf - 1) + 1) + 2 * pad, stride) + Wₒ = 1 + div(Wᵢ - (dilation * (Wₖ - 1) + 1) + 2 * pad, stride) + Hₒ = 1 + div(Hᵢ - (dilation * (Hₖ - 1) + 1) + 2 * pad, stride) Out = zeros(T, cₒ, Wₒ, Hₒ, Cₒ, N) blocked_conv2d!(Out, X, W, pad, stride, dilation) From 595a14bc21a05619af9a4628901adb2da1daf188 Mon Sep 17 00:00:00 2001 From: Matthew Brookhart Date: Tue, 5 Mar 2019 07:07:14 -0800 Subject: [PATCH 3/8] add 1D and 3D kernels, test utility --- src/impl/blocked_conv.jl | 197 +++++++++++++++++++++++++++++---------- 1 file changed, 146 insertions(+), 51 deletions(-) diff --git a/src/impl/blocked_conv.jl b/src/impl/blocked_conv.jl index c96c2cf23..db357c6c5 100644 --- a/src/impl/blocked_conv.jl +++ b/src/impl/blocked_conv.jl @@ -1,4 +1,5 @@ using BenchmarkTools +using Test using NNlib using LinearAlgebra using Primes @@ -8,7 +9,27 @@ using Flux BLAS.set_num_threads(4) +@inline function insert_singleton_spatial_dimension(x::AbstractArray) + return reshape(x, size(x)[1:end-2]..., 1, size(x)[end-1:end]...) +end + +@inline function remove_singleton_spatial_dimension(x::AbstractArray) + return reshape(x, size(x)[1:end-3]..., size(x)[end-1:end]...) +end + +@inline function remove_singleton_spatial_dimension(x, reps::Int) + for r in 1:reps + x = remove_singleton_spatial_dimension(x) + end + return x +end +@inline function insert_singleton_spatial_dimension(x, reps::Int) + for r in 1:reps + x = insert_singleton_spatial_dimension(x) + end + return x +end # unoptimized blocking function block(x, block_axis = 3, block_len = 8) @@ -40,17 +61,17 @@ end # out depth - dₒ # in blocked channels - ii # out blocked channels (simd'd), jj -function blocked_conv2d_inner_loop!(Out::Array{T,5}, - X::Array{T,5}, - W::Array{T,6}, - ol::Int64, - ::Type{Vec{B,T}}, - pad = 0, - stride = 1, - dilation = 1) where {B,T} - cₒ, cᵢ, Wₖ, Hₖ, Cᵢ, Cₒ = size(W) - cₒ, Wₒ, Hₒ, Cₒ, N = size(Out) - cᵢ, Wᵢ, Hᵢ, Cᵢ, N = size(X) +function blocked_conv_inner_loop!(Out::Array{T,6}, + X::Array{T,6}, + W::Array{T,7}, + ol::Int64, + ::Type{Vec{B,T}}, + pad::NTuple{3,Int64}, + stride::NTuple{3,Int64}, + dilation::NTuple{3,Int64}) where {B,T} + cₒ, cᵢ, Dₖ, Wₖ, Hₖ, Cᵢ, Cₒ = size(W) + cₒ, Dₒ, Wₒ, Hₒ, Cₒ, N = size(Out) + cᵢ, Dᵢ, Wᵢ, Hᵢ, Cᵢ, N = size(X) # get fused loop indexes ool = ol - 1 @@ -67,79 +88,153 @@ function blocked_conv2d_inner_loop!(Out::Array{T,5}, i += 1 hₒ += 1 - @inbounds for hₖ = 1:Hₖ, wₖ = 1:Wₖ, wₒ = 1:Wₒ + @inbounds for wₒ = 1:Wₒ, hₖ = 1:Hₖ, wₖ = 1:Wₖ, dₖ = 1:Dₖ, dₒ = 1:Dₒ # pre-calculate indexes for the inner loop - hᵢ = 1 + stride * (hₒ - 1) + dilation * (hₖ - 1) - pad - wᵢ = 1 + stride * (wₒ - 1) + dilation * (wₖ - 1) - pad - # Check for over-input TODO(mbrookhart): move to compile step - if (hᵢ < 1 || wᵢ < 1 || hᵢ > Hᵢ || wᵢ > Wᵢ) + hᵢ = 1 + stride[3] * (hₒ - 1) + dilation[3] * (hₖ - 1) - pad[3] + wᵢ = 1 + stride[2] * (wₒ - 1) + dilation[2] * (wₖ - 1) - pad[2] + dᵢ = 1 + stride[1] * (dₒ - 1) + dilation[1] * (dₖ - 1) - pad[1] + # Check for over-input TODO(mbrookhart): move to compile step? 
+ if (hᵢ < 1 || wᵢ < 1 || dᵢ < 1 || hᵢ > Hᵢ || wᵢ > Wᵢ || dᵢ > Dᵢ) continue end + F_d = Dₖ - (dₖ - 1) F_w = Wₖ - (wₖ - 1) F_h = Hₖ - (hₖ - 1) @inbounds for ii = 1:B - tmpI = Vec{8, T}(X[ii, wᵢ, hᵢ, i, n]) - tmpO = vload(Vec{B, T}, view(Out, :, wₒ, hₒ, j, n), 1) - tmpW = vload(Vec{B, T}, view(W, :, ii, F_w, F_h, i, j), 1) + tmpI = Vec{8, T}(X[ii, dᵢ, wᵢ, hᵢ, i, n]) + tmpO = vload(Vec{B, T}, view(Out, :, dₒ, wₒ, hₒ, j, n), 1) + tmpW = vload(Vec{B, T}, view(W, :, ii, F_d, F_w, F_h, i, j), 1) tmpOut = fma(tmpI, tmpW, tmpO) - vstore(tmpOut, view(Out, :, wₒ, hₒ, j, n), 1) + vstore(tmpOut, view(Out, :, dₒ, wₒ, hₒ, j, n), 1) end end end -function blocked_conv2d!(Out::Array{T,5}, X::Array{T,5}, W::Array{T,6}, pad = 0, stride = 1, dilation = 1) where T<:Number +function blocked_conv!(Out::Array{T,6}, + X::Array{T,6}, + W::Array{T,7}, + pad::NTuple{3,Int64}, + stride::NTuple{3,Int64}, + dilation::NTuple{3,Int64}) where T<:Number @assert size(Out)[1] == size(W)[1] @assert size(X)[1] == size(W)[2] ## Fuse a few outer loops to make sure we have enough jobs for the threads ## Most important if it's a low batch size kernel - out_loop_size = size(Out)[5] * size(W)[5] * size(W)[6] * size(Out)[3] + out_loop_size = size(Out)[6] * size(W)[6] * size(W)[7] * size(Out)[4] @inbounds Threads.@threads for ol = 1:out_loop_size - blocked_conv2d_inner_loop!(Out, X, W, ol, Vec{size(X)[1],T}, pad, stride, dilation) + blocked_conv_inner_loop!(Out, X, W, ol, Vec{size(X)[1],T}, pad, stride, dilation) end end -function blocked_conv2d(X::Array{T,5}, W::Array{T,6}; pad = 0, stride = 1, dilation = 1) where T<:Number +@inline function calc_conv_dim(input_dim, kernel_dim, pad, stride, dilation) + 1 + div(input_dim - (dilation * (kernel_dim - 1) + 1) + 2 * pad, stride) +end + +function blocked_conv(X::Array{T,6}, W::Array{T,7}; + pad = 0, stride = 1, dilation = 1) where T<:Number + @assert size(X)[1] == size(W)[2] + @assert size(X)[5] == size(W)[6] + cₒ, cᵢ, Dₖ, Wₖ, Hₖ, Cᵢ, Cₒ = size(W) + cᵢ, Dᵢ, Wᵢ, Hᵢ, Cᵢ, N = size(X) + Hₒ = calc_conv_dim(Hᵢ, Hₖ, pad, stride, dilation) + Wₒ = calc_conv_dim(Wᵢ, Wₖ, pad, stride, dilation) + Dₒ = calc_conv_dim(Dᵢ, Dₖ, pad, stride, dilation) + Out = zeros(T, cₒ, Dₒ, Wₒ, Hₒ, Cₒ, N) + pad = (pad, pad, pad) + stride = (stride, stride, stride) + dilation = (dilation, dilation, dilation) + blocked_conv!(Out, X, W, pad, stride, dilation) + Out +end + +function blocked_conv(X::Array{T,5}, W::Array{T,6}; + pad = 0, stride = 1, dilation = 1) where T<:Number @assert size(X)[1] == size(W)[2] @assert size(X)[4] == size(W)[5] cₒ, cᵢ, Wₖ, Hₖ, Cᵢ, Cₒ = size(W) cᵢ, Wᵢ, Hᵢ, Cᵢ, N = size(X) - Wₒ = 1 + div(Wᵢ - (dilation * (Wₖ - 1) + 1) + 2 * pad, stride) - Hₒ = 1 + div(Hᵢ - (dilation * (Hₖ - 1) + 1) + 2 * pad, stride) + Hₒ = calc_conv_dim(Hᵢ, Hₖ, pad, stride, dilation) + Wₒ = calc_conv_dim(Wᵢ, Wₖ, pad, stride, dilation) + + Out = zeros(T, cₒ, Wₒ, Hₒ, 1, Cₒ, N) + X = insert_singleton_spatial_dimension(X) + W = insert_singleton_spatial_dimension(W) + pad = (pad, pad, 0) + stride = (stride, stride, 1) + dilation = (dilation, dilation, 1) + blocked_conv!(Out, X, W, pad, stride, dilation) + remove_singleton_spatial_dimension(Out) +end - Out = zeros(T, cₒ, Wₒ, Hₒ, Cₒ, N) - blocked_conv2d!(Out, X, W, pad, stride, dilation) - Out +function blocked_conv(X::Array{T,4}, W::Array{T,5}; + pad = 0, stride = 1, dilation = 1) where T<:Number + @assert size(X)[1] == size(W)[2] + @assert size(X)[3] == size(W)[4] + cₒ, cᵢ, Wₖ, Cᵢ, Cₒ = size(W) + cᵢ, Wᵢ, Cᵢ, N = size(X) + Wₒ = calc_conv_dim(Wᵢ, Wₖ, pad, stride, dilation) + + Out = 
zeros(T, cₒ, Wₒ, 1, 1, Cₒ, N) + X = insert_singleton_spatial_dimension(X, 2) + W = insert_singleton_spatial_dimension(W, 2) + pad = (pad, 0, 0) + stride = (stride, 1, 1) + dilation = (dilation, 1, 1) + blocked_conv!(Out, X, W, pad, stride, dilation) + remove_singleton_spatial_dimension(Out, 2) end +function test_blocked_conv(im_size, + k_size, + rank, + pad, + stride, + dilation; + benchmark = false) + X_shape = vcat([im_size for i in 1:rank], [32, 128]) + W_shape = vcat([k_size for i in 1:rank], [32, 16]) -for im_size = [32, 64, 128, 192] - for k_size = [5] - for pad = [3], stride = [2], dilation = [2] - X = rand(Float32, im_size, im_size, 32, 128) - W = rand(Float32, k_size, k_size, 32, 16) + X = rand(Float32, X_shape...) + W = rand(Float32, W_shape...) - println("Data Shape: $(size(X))") - println("Weight Shape: $(size(W))") - println("pad=$pad, stride=$stride, dilation=$dilation") + bX = block(X, rank + 1) + bW = block(block(W, rank + 1), rank + 3) - # print("block_data: ") - # @btime block($X) - # print("block_weights: ") - # @btime block(block($W,3),5) - bX = block(X) - bW = block(block(W,3),5) + if benchmark + println("Data Shape: $(size(X))") + println("Weight Shape: $(size(W))") + println("pad=$pad, stride=$stride, dilation=$dilation") + # print("block_data: ") + # @btime block($X, $(rank + 1)) + # print("block_weights: ") + # @btime block(block($W, $(rank + 1)), $(rank + 3)) - print("blocked_conv2d: ") - @btime Out1 = blocked_conv2d($bX, $bW, pad = $pad, stride = $stride, dilation = $dilation) - print("NNlib.conv: ") - @btime Out2 = NNlib.conv($gpu($X), $gpu($W), pad = $pad, stride = $stride, dilation = $dilation) - Out1 = blocked_conv2d(bX, bW, pad = pad, stride = stride, dilation = dilation) - Out2 = NNlib.conv(X, W, pad = pad, stride = stride, dilation = dilation) - @assert isapprox(deblock(Out1), Out2) - println() + print("blocked_conv2d: ") + @btime Out1 = blocked_conv($bX, $bW, pad = $pad, stride = $stride, dilation = $dilation) + print("NNlib.conv: ") + @btime Out2 = NNlib.conv($gpu($X), $gpu($W), pad = $pad, stride = $stride, dilation = $dilation) + end + + Out1 = blocked_conv(bX, bW, pad = pad, stride = stride, dilation = dilation) + + Out2 = NNlib.conv(X, W, pad = pad, stride = stride, dilation = dilation) + @test isapprox(deblock(Out1, rank + 1), Out2) + println() +end + +do_benchmarking = false + +for im_size = [32, 64, 128, 192] + for k_size = [5] + for pad = [3], stride = [2], dilation = [2] + test_blocked_conv(im_size, k_size, 1, pad, stride, dilation, benchmark = do_benchmarking) + test_blocked_conv(im_size, k_size, 2, pad, stride, dilation, benchmark = do_benchmarking) + if im_size <= 32 + test_blocked_conv(im_size, k_size, 3, pad, stride, dilation, benchmark = do_benchmarking) + end end end end From 986f3d66d710fae5bb65eeace65a402241eb06ff Mon Sep 17 00:00:00 2001 From: Matthew Brookhart Date: Mon, 20 May 2019 13:46:39 -0700 Subject: [PATCH 4/8] first crack at moving things into NNlib --- src/NNlib.jl | 2 +- src/impl/blocked_conv.jl | 81 +++++----------------------------------- test/blocked_conv.jl | 61 ++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 72 deletions(-) create mode 100644 test/blocked_conv.jl diff --git a/src/NNlib.jl b/src/NNlib.jl index 0a7d6a142..8e2643f71 100644 --- a/src/NNlib.jl +++ b/src/NNlib.jl @@ -29,7 +29,7 @@ include("impl/depthwiseconv_direct.jl") # im2col implementations of convolutional and depthwise-convolutional algorithms include("impl/conv_im2col.jl") include("impl/depthwiseconv_im2col.jl") - 
+include("impl/blocked_conv.jl") # Direct implementations of pooling include("impl/pooling_direct.jl") diff --git a/src/impl/blocked_conv.jl b/src/impl/blocked_conv.jl index db357c6c5..914c4b153 100644 --- a/src/impl/blocked_conv.jl +++ b/src/impl/blocked_conv.jl @@ -1,17 +1,11 @@ -using BenchmarkTools -using Test -using NNlib -using LinearAlgebra -using Primes +export blocked_conv, block, unblock using SIMD -using Flux -# using CuArrays BLAS.set_num_threads(4) -@inline function insert_singleton_spatial_dimension(x::AbstractArray) - return reshape(x, size(x)[1:end-2]..., 1, size(x)[end-1:end]...) -end +# @inline function insert_singleton_spatial_dimension(x::AbstractArray) +# return reshape(x, size(x)[1:end-2]..., 1, size(x)[end-1:end]...) +# end @inline function remove_singleton_spatial_dimension(x::AbstractArray) return reshape(x, size(x)[1:end-3]..., size(x)[end-1:end]...) @@ -24,12 +18,12 @@ end return x end -@inline function insert_singleton_spatial_dimension(x, reps::Int) - for r in 1:reps - x = insert_singleton_spatial_dimension(x) - end - return x -end +# @inline function insert_singleton_spatial_dimension(x, reps::Int) +# for r in 1:reps +# x = insert_singleton_spatial_dimension(x) +# end +# return x +# end # unoptimized blocking function block(x, block_axis = 3, block_len = 8) @@ -183,58 +177,3 @@ function blocked_conv(X::Array{T,4}, W::Array{T,5}; blocked_conv!(Out, X, W, pad, stride, dilation) remove_singleton_spatial_dimension(Out, 2) end - -function test_blocked_conv(im_size, - k_size, - rank, - pad, - stride, - dilation; - benchmark = false) - X_shape = vcat([im_size for i in 1:rank], [32, 128]) - W_shape = vcat([k_size for i in 1:rank], [32, 16]) - - X = rand(Float32, X_shape...) - W = rand(Float32, W_shape...) - - bX = block(X, rank + 1) - bW = block(block(W, rank + 1), rank + 3) - - - if benchmark - println("Data Shape: $(size(X))") - println("Weight Shape: $(size(W))") - println("pad=$pad, stride=$stride, dilation=$dilation") - # print("block_data: ") - # @btime block($X, $(rank + 1)) - # print("block_weights: ") - # @btime block(block($W, $(rank + 1)), $(rank + 3)) - - - - print("blocked_conv2d: ") - @btime Out1 = blocked_conv($bX, $bW, pad = $pad, stride = $stride, dilation = $dilation) - print("NNlib.conv: ") - @btime Out2 = NNlib.conv($gpu($X), $gpu($W), pad = $pad, stride = $stride, dilation = $dilation) - end - - Out1 = blocked_conv(bX, bW, pad = pad, stride = stride, dilation = dilation) - - Out2 = NNlib.conv(X, W, pad = pad, stride = stride, dilation = dilation) - @test isapprox(deblock(Out1, rank + 1), Out2) - println() -end - -do_benchmarking = false - -for im_size = [32, 64, 128, 192] - for k_size = [5] - for pad = [3], stride = [2], dilation = [2] - test_blocked_conv(im_size, k_size, 1, pad, stride, dilation, benchmark = do_benchmarking) - test_blocked_conv(im_size, k_size, 2, pad, stride, dilation, benchmark = do_benchmarking) - if im_size <= 32 - test_blocked_conv(im_size, k_size, 3, pad, stride, dilation, benchmark = do_benchmarking) - end - end - end -end diff --git a/test/blocked_conv.jl b/test/blocked_conv.jl new file mode 100644 index 000000000..c02ce443f --- /dev/null +++ b/test/blocked_conv.jl @@ -0,0 +1,61 @@ +using BenchmarkTools +using Test +using NNlib + + +BLAS.set_num_threads(4) + +function test_blocked_conv(im_size, + k_size, + rank, + pad, + stride, + dilation; + benchmark = false) + X_shape = vcat([im_size for i in 1:rank], [32, 128]) + W_shape = vcat([k_size for i in 1:rank], [32, 16]) + + X = rand(Float32, X_shape...) 
+ W = rand(Float32, W_shape...) + + bX = block(X, rank + 1) + bW = block(block(W, rank + 1), rank + 3) + + + if benchmark + println("Data Shape: $(size(X))") + println("Weight Shape: $(size(W))") + println("pad=$pad, stride=$stride, dilation=$dilation") + # print("block_data: ") + # @btime block($X, $(rank + 1)) + # print("block_weights: ") + # @btime block(block($W, $(rank + 1)), $(rank + 3)) + + + + print("blocked_conv2d: ") + @btime Out1 = blocked_conv($bX, $bW, pad = $pad, stride = $stride, dilation = $dilation) + # print("NNlib.conv: ") + # @btime Out2 = NNlib.conv($X, $W, pad = $pad, stride = $stride, dilation = $dilation) + end + + Out1 = blocked_conv(bX, bW, pad = pad, stride = stride, dilation = dilation) + + # Out2 = NNlib.conv(X, W, pad = pad, stride = stride, dilation = dilation) + # @test isapprox(deblock(Out1, rank + 1), Out2) + println() +end + +do_benchmarking = true + +for im_size = [32, 64, 128, 192] + for k_size = [5] + for pad = [3], stride = [2], dilation = [2] + # test_blocked_conv(im_size, k_size, 1, pad, stride, dilation, benchmark = do_benchmarking) + test_blocked_conv(im_size, k_size, 2, pad, stride, dilation, benchmark = do_benchmarking) + if im_size <= 32 + # test_blocked_conv(im_size, k_size, 3, pad, stride, dilation, benchmark = do_benchmarking) + end + end + end +end From 30349118db5f85044af3903e087bbd9ac3ec3132 Mon Sep 17 00:00:00 2001 From: Matthew Brookhart Date: Mon, 20 May 2019 14:38:21 -0700 Subject: [PATCH 5/8] use DenseConvDims API in first layer --- src/impl/blocked_conv.jl | 73 +++++++++++++--------------------------- test/blocked_conv.jl | 17 +++++----- 2 files changed, 31 insertions(+), 59 deletions(-) diff --git a/src/impl/blocked_conv.jl b/src/impl/blocked_conv.jl index 914c4b153..7da4591ad 100644 --- a/src/impl/blocked_conv.jl +++ b/src/impl/blocked_conv.jl @@ -1,4 +1,4 @@ -export blocked_conv, block, unblock +export blocked_conv, block, deblock using SIMD BLAS.set_num_threads(4) @@ -60,7 +60,7 @@ function blocked_conv_inner_loop!(Out::Array{T,6}, W::Array{T,7}, ol::Int64, ::Type{Vec{B,T}}, - pad::NTuple{3,Int64}, + pad::NTuple{6,Int64}, stride::NTuple{3,Int64}, dilation::NTuple{3,Int64}) where {B,T} cₒ, cᵢ, Dₖ, Wₖ, Hₖ, Cᵢ, Cₒ = size(W) @@ -84,8 +84,8 @@ function blocked_conv_inner_loop!(Out::Array{T,6}, @inbounds for wₒ = 1:Wₒ, hₖ = 1:Hₖ, wₖ = 1:Wₖ, dₖ = 1:Dₖ, dₒ = 1:Dₒ # pre-calculate indexes for the inner loop - hᵢ = 1 + stride[3] * (hₒ - 1) + dilation[3] * (hₖ - 1) - pad[3] - wᵢ = 1 + stride[2] * (wₒ - 1) + dilation[2] * (wₖ - 1) - pad[2] + hᵢ = 1 + stride[3] * (hₒ - 1) + dilation[3] * (hₖ - 1) - pad[5] + wᵢ = 1 + stride[2] * (wₒ - 1) + dilation[2] * (wₖ - 1) - pad[3] dᵢ = 1 + stride[1] * (dₒ - 1) + dilation[1] * (dₖ - 1) - pad[1] # Check for over-input TODO(mbrookhart): move to compile step? 
if (hᵢ < 1 || wᵢ < 1 || dᵢ < 1 || hᵢ > Hᵢ || wᵢ > Wᵢ || dᵢ > Dᵢ) @@ -107,7 +107,7 @@ end function blocked_conv!(Out::Array{T,6}, X::Array{T,6}, W::Array{T,7}, - pad::NTuple{3,Int64}, + pad::NTuple{6,Int64}, stride::NTuple{3,Int64}, dilation::NTuple{3,Int64}) where T<:Number @assert size(Out)[1] == size(W)[1] @@ -124,56 +124,29 @@ end 1 + div(input_dim - (dilation * (kernel_dim - 1) + 1) + 2 * pad, stride) end -function blocked_conv(X::Array{T,6}, W::Array{T,7}; - pad = 0, stride = 1, dilation = 1) where T<:Number - @assert size(X)[1] == size(W)[2] - @assert size(X)[5] == size(W)[6] - cₒ, cᵢ, Dₖ, Wₖ, Hₖ, Cᵢ, Cₒ = size(W) - cᵢ, Dᵢ, Wᵢ, Hᵢ, Cᵢ, N = size(X) - Hₒ = calc_conv_dim(Hᵢ, Hₖ, pad, stride, dilation) - Wₒ = calc_conv_dim(Wᵢ, Wₖ, pad, stride, dilation) - Dₒ = calc_conv_dim(Dᵢ, Dₖ, pad, stride, dilation) - Out = zeros(T, cₒ, Dₒ, Wₒ, Hₒ, Cₒ, N) - pad = (pad, pad, pad) - stride = (stride, stride, stride) - dilation = (dilation, dilation, dilation) - blocked_conv!(Out, X, W, pad, stride, dilation) - Out +function blocked_conv(X::Array{T,6}, W::Array{T,7}, cdims::DenseConvDims) where T<:Number + cdims = insert_singleton_spatial_dimension(cdims, 0) + X = insert_singleton_spatial_dimension(X, 0) + W = insert_singleton_spatial_dimension(W, 0) + Out = zeros(T, size(W,1), output_size(cdims)..., div(channels_out(cdims),size(W, 1)), size(X, 6)) + blocked_conv!(Out, X, W, padding(cdims), stride(cdims), dilation(cdims)) + remove_singleton_spatial_dimension(Out, 0) end -function blocked_conv(X::Array{T,5}, W::Array{T,6}; - pad = 0, stride = 1, dilation = 1) where T<:Number - @assert size(X)[1] == size(W)[2] - @assert size(X)[4] == size(W)[5] - cₒ, cᵢ, Wₖ, Hₖ, Cᵢ, Cₒ = size(W) - cᵢ, Wᵢ, Hᵢ, Cᵢ, N = size(X) - Hₒ = calc_conv_dim(Hᵢ, Hₖ, pad, stride, dilation) - Wₒ = calc_conv_dim(Wᵢ, Wₖ, pad, stride, dilation) - - Out = zeros(T, cₒ, Wₒ, Hₒ, 1, Cₒ, N) - X = insert_singleton_spatial_dimension(X) - W = insert_singleton_spatial_dimension(W) - pad = (pad, pad, 0) - stride = (stride, stride, 1) - dilation = (dilation, dilation, 1) - blocked_conv!(Out, X, W, pad, stride, dilation) - remove_singleton_spatial_dimension(Out) +function blocked_conv(X::Array{T,5}, W::Array{T,6}, cdims::DenseConvDims) where T<:Number + cdims = insert_singleton_spatial_dimension(cdims, 1) + X = insert_singleton_spatial_dimension(X, 1) + W = insert_singleton_spatial_dimension(W, 1) + Out = zeros(T, size(W,1), output_size(cdims)..., div(channels_out(cdims),size(W, 1)), size(X, 6)) + blocked_conv!(Out, X, W, padding(cdims), stride(cdims), dilation(cdims)) + remove_singleton_spatial_dimension(Out, 1) end -function blocked_conv(X::Array{T,4}, W::Array{T,5}; - pad = 0, stride = 1, dilation = 1) where T<:Number - @assert size(X)[1] == size(W)[2] - @assert size(X)[3] == size(W)[4] - cₒ, cᵢ, Wₖ, Cᵢ, Cₒ = size(W) - cᵢ, Wᵢ, Cᵢ, N = size(X) - Wₒ = calc_conv_dim(Wᵢ, Wₖ, pad, stride, dilation) - - Out = zeros(T, cₒ, Wₒ, 1, 1, Cₒ, N) +function blocked_conv(X::Array{T,4}, W::Array{T,5}, cdims::DenseConvDims) where T<:Number + cdims = insert_singleton_spatial_dimension(cdims, 2) X = insert_singleton_spatial_dimension(X, 2) W = insert_singleton_spatial_dimension(W, 2) - pad = (pad, 0, 0) - stride = (stride, 1, 1) - dilation = (dilation, 1, 1) - blocked_conv!(Out, X, W, pad, stride, dilation) + Out = zeros(T, size(W,1), output_size(cdims)..., div(channels_out(cdims),size(W, 1)), size(X, 6)) + blocked_conv!(Out, X, W, padding(cdims), stride(cdims), dilation(cdims)) remove_singleton_spatial_dimension(Out, 2) end diff --git a/test/blocked_conv.jl 
b/test/blocked_conv.jl index c02ce443f..7aa2ed1dc 100644 --- a/test/blocked_conv.jl +++ b/test/blocked_conv.jl @@ -32,17 +32,16 @@ function test_blocked_conv(im_size, # @btime block(block($W, $(rank + 1)), $(rank + 3)) - + c = DenseConvDims(X, W; stride = stride, dilation = dilation, padding = pad) print("blocked_conv2d: ") - @btime Out1 = blocked_conv($bX, $bW, pad = $pad, stride = $stride, dilation = $dilation) - # print("NNlib.conv: ") - # @btime Out2 = NNlib.conv($X, $W, pad = $pad, stride = $stride, dilation = $dilation) + @btime Out1 = blocked_conv($bX, $bW, $c) + print("NNlib.conv: ") + @btime Out2 = conv($X, $W, $c) end - - Out1 = blocked_conv(bX, bW, pad = pad, stride = stride, dilation = dilation) - - # Out2 = NNlib.conv(X, W, pad = pad, stride = stride, dilation = dilation) - # @test isapprox(deblock(Out1, rank + 1), Out2) + c = DenseConvDims(X, W; stride = stride, dilation = dilation, padding = pad) + Out1 = blocked_conv(bX, bW, c) + Out2 = conv(X, W, c) + @test isapprox(deblock(Out1, rank + 1), Out2) println() end From ecf7d27e303e1be13a4a7366ade53e93323cc66c Mon Sep 17 00:00:00 2001 From: Matthew Brookhart Date: Mon, 20 May 2019 20:11:20 -0700 Subject: [PATCH 6/8] update blocked_conv and blocked_conv! to use cdims, but don't change the inner loop --- src/impl/blocked_conv.jl | 80 +++++++++++++++++++--------------------- test/blocked_conv.jl | 6 +-- 2 files changed, 41 insertions(+), 45 deletions(-) diff --git a/src/impl/blocked_conv.jl b/src/impl/blocked_conv.jl index 7da4591ad..f956128d3 100644 --- a/src/impl/blocked_conv.jl +++ b/src/impl/blocked_conv.jl @@ -1,12 +1,6 @@ export blocked_conv, block, deblock using SIMD -BLAS.set_num_threads(4) - -# @inline function insert_singleton_spatial_dimension(x::AbstractArray) -# return reshape(x, size(x)[1:end-2]..., 1, size(x)[end-1:end]...) -# end - @inline function remove_singleton_spatial_dimension(x::AbstractArray) return reshape(x, size(x)[1:end-3]..., size(x)[end-1:end]...) 
end @@ -18,13 +12,6 @@ end return x end -# @inline function insert_singleton_spatial_dimension(x, reps::Int) -# for r in 1:reps -# x = insert_singleton_spatial_dimension(x) -# end -# return x -# end - # unoptimized blocking function block(x, block_axis = 3, block_len = 8) @assert size(x)[block_axis]%block_len == 0 @@ -107,46 +94,55 @@ end function blocked_conv!(Out::Array{T,6}, X::Array{T,6}, W::Array{T,7}, - pad::NTuple{6,Int64}, - stride::NTuple{3,Int64}, - dilation::NTuple{3,Int64}) where T<:Number + cdims::DenseConvDims) where T<:Number @assert size(Out)[1] == size(W)[1] @assert size(X)[1] == size(W)[2] ## Fuse a few outer loops to make sure we have enough jobs for the threads ## Most important if it's a low batch size kernel out_loop_size = size(Out)[6] * size(W)[6] * size(W)[7] * size(Out)[4] @inbounds Threads.@threads for ol = 1:out_loop_size - blocked_conv_inner_loop!(Out, X, W, ol, Vec{size(X)[1],T}, pad, stride, dilation) + blocked_conv_inner_loop!(Out, X, W, ol, Vec{size(X)[1],T}, + padding(cdims), stride(cdims), dilation(cdims)) end end -@inline function calc_conv_dim(input_dim, kernel_dim, pad, stride, dilation) - 1 + div(input_dim - (dilation * (kernel_dim - 1) + 1) + 2 * pad, stride) -end function blocked_conv(X::Array{T,6}, W::Array{T,7}, cdims::DenseConvDims) where T<:Number - cdims = insert_singleton_spatial_dimension(cdims, 0) - X = insert_singleton_spatial_dimension(X, 0) - W = insert_singleton_spatial_dimension(W, 0) - Out = zeros(T, size(W,1), output_size(cdims)..., div(channels_out(cdims),size(W, 1)), size(X, 6)) - blocked_conv!(Out, X, W, padding(cdims), stride(cdims), dilation(cdims)) - remove_singleton_spatial_dimension(Out, 0) -end - -function blocked_conv(X::Array{T,5}, W::Array{T,6}, cdims::DenseConvDims) where T<:Number - cdims = insert_singleton_spatial_dimension(cdims, 1) - X = insert_singleton_spatial_dimension(X, 1) - W = insert_singleton_spatial_dimension(W, 1) - Out = zeros(T, size(W,1), output_size(cdims)..., div(channels_out(cdims),size(W, 1)), size(X, 6)) - blocked_conv!(Out, X, W, padding(cdims), stride(cdims), dilation(cdims)) - remove_singleton_spatial_dimension(Out, 1) + Out = zeros(T, size(W,1), output_size(cdims)..., + div(channels_out(cdims),size(W, 1)), size(X, 6)) + blocked_conv!(Out, X, W, cdims) + Out end -function blocked_conv(X::Array{T,4}, W::Array{T,5}, cdims::DenseConvDims) where T<:Number - cdims = insert_singleton_spatial_dimension(cdims, 2) - X = insert_singleton_spatial_dimension(X, 2) - W = insert_singleton_spatial_dimension(W, 2) - Out = zeros(T, size(W,1), output_size(cdims)..., div(channels_out(cdims),size(W, 1)), size(X, 6)) - blocked_conv!(Out, X, W, padding(cdims), stride(cdims), dilation(cdims)) - remove_singleton_spatial_dimension(Out, 2) +for N in (3, 4) + @eval begin + function $(Symbol("blocked_conv!"))( + y::AbstractArray{yT,$(N+1)}, x::AbstractArray{xT,$(N+1)}, + w::AbstractArray{wT,$(N+2)}, cdims::ConvDims) where {yT, xT, wT} + $(Symbol("blocked_conv!"))( + insert_singleton_spatial_dimension(y, $(5 - N)), + insert_singleton_spatial_dimension(x, $(5 - N)), + insert_singleton_spatial_dimension(w, $(5 - N)), + insert_singleton_spatial_dimension(cdims, $(5 - N)) + ) + + # We explicitly return `y` here, because the backend call + # itself may return a reshaped view, which we don't want. 
+            return y
+        end
+    end
+    @eval begin
+        function $(Symbol("blocked_conv"))(
+                x::AbstractArray{xT,$(N+1)},
+                w::AbstractArray{wT,$(N+2)}, cdims::ConvDims) where {xT, wT}
+            remove_singleton_spatial_dimension(
+                $(Symbol("blocked_conv"))(
+                    insert_singleton_spatial_dimension(x, $(5 - N)),
+                    insert_singleton_spatial_dimension(w, $(5 - N)),
+                    insert_singleton_spatial_dimension(cdims, $(5 - N))
+                ),
+                $(5 - N)
+            )
+        end
+    end
 end
diff --git a/test/blocked_conv.jl b/test/blocked_conv.jl
index 7aa2ed1dc..766168802 100644
--- a/test/blocked_conv.jl
+++ b/test/blocked_conv.jl
@@ -1,7 +1,7 @@
 using BenchmarkTools
 using Test
 using NNlib
-
+using LinearAlgebra
 
 BLAS.set_num_threads(4)
 
@@ -50,10 +50,10 @@ do_benchmarking = true
 for im_size = [32, 64, 128, 192]
     for k_size = [5]
         for pad = [3], stride = [2], dilation = [2]
-            # test_blocked_conv(im_size, k_size, 1, pad, stride, dilation, benchmark = do_benchmarking)
+            test_blocked_conv(im_size, k_size, 1, pad, stride, dilation, benchmark = do_benchmarking)
             test_blocked_conv(im_size, k_size, 2, pad, stride, dilation, benchmark = do_benchmarking)
             if im_size <= 32
-                # test_blocked_conv(im_size, k_size, 3, pad, stride, dilation, benchmark = do_benchmarking)
+                test_blocked_conv(im_size, k_size, 3, pad, stride, dilation, benchmark = do_benchmarking)
             end
         end
     end
 end

From e7076104e821cce0e2ba81a9b562631a986197d9 Mon Sep 17 00:00:00 2001
From: Matthew Brookhart
Date: Mon, 20 May 2019 21:30:23 -0700
Subject: [PATCH 7/8] update to padded/central region loops

---
 src/impl/blocked_conv.jl | 89 ++++++++++++++++++++++++----------------
 test/blocked_conv.jl     |  4 +-
 2 files changed, 55 insertions(+), 38 deletions(-)

diff --git a/src/impl/blocked_conv.jl b/src/impl/blocked_conv.jl
index f956128d3..a0451c483 100644
--- a/src/impl/blocked_conv.jl
+++ b/src/impl/blocked_conv.jl
@@ -33,13 +33,13 @@ end
 ## Iteration indices, outer to inner:
 # batch - n
 # out channels - j
+# out depth - dₒ
 # in channels - i
-# out height - hₒ
-# out width - wₒ
+# filter depth - dₖ
 # filter height - hₖ
 # filter width - wₖ
-# filter depth - dₖ
-# out depth - dₒ
+# out height - hₒ
+# out width - wₒ
 # in blocked channels - ii
 # out blocked channels (SIMD'd) - jj
 function blocked_conv_inner_loop!(Out::Array{T,6},
                                   X::Array{T,6},
                                   W::Array{T,7},
                                   ol::Int64,
                                   ::Type{Vec{B,T}},
-                                  pad::NTuple{6,Int64},
-                                  stride::NTuple{3,Int64},
-                                  dilation::NTuple{3,Int64}) where {B,T}
-    cₒ, cᵢ, Dₖ, Wₖ, Hₖ, Cᵢ, Cₒ = size(W)
-    cₒ, Dₒ, Wₒ, Hₒ, Cₒ, N = size(Out)
-    cᵢ, Dᵢ, Wᵢ, Hᵢ, Cᵢ, N = size(X)
-
+                                  cdims::DenseConvDims) where {B,T}
+    cₒ, cᵢ, Wₖ, Hₖ, Dₖ, Cᵢ, Cₒ = size(W)
+    cₒ, Wₒ, Hₒ, Dₒ, Cₒ, N = size(Out)
+    cᵢ, Wᵢ, Hᵢ, Dᵢ, Cᵢ, N = size(X)
+    p = padding(cdims)
+    s = stride(cdims)
+    d = dilation(cdims)
+    padded_regions, central_region = calc_padding_regions(cdims)
     # get fused loop indexes
     ool = ol - 1
-    n = div(ool, Cₒ * Cᵢ * Hₒ)
-    ool -= (n) * Cₒ * Cᵢ * Hₒ
-    j = div(ool, Cᵢ * Hₒ)
-    ool -= (j) * Cᵢ * Hₒ
-    i = div(ool, Hₒ)
-    ool -= i * Hₒ
-    hₒ = ool
+    n = div(ool, Cₒ)
+    ool -= (n) * Cₒ
+    j = ool
 
     n += 1
     j += 1
-    i += 1
-    hₒ += 1
 
-    @inbounds for wₒ = 1:Wₒ, hₖ = 1:Hₖ, wₖ = 1:Wₖ, dₖ = 1:Dₖ, dₒ = 1:Dₒ
+    #calculate the central region without conditionals
+    w_region, h_region, d_region = central_region
+    @inbounds for i = 1:Cᵢ, dₒ = d_region, hₒ = h_region, dₖ = 1:Dₖ, hₖ = 1:Hₖ, wₖ = 1:Wₖ, wₒ = w_region
         # pre-calculate indexes for the inner loop
-        hᵢ = 1 + stride[3] * (hₒ - 1) + dilation[3] * (hₖ - 1) - pad[5]
-        wᵢ = 1 + stride[2] * (wₒ - 1) + dilation[2] * (wₖ - 1) - pad[3]
-        dᵢ = 1 +
stride[1] * (dₒ - 1) + dilation[1] * (dₖ - 1) - pad[1] - # Check for over-input TODO(mbrookhart): move to compile step? - if (hᵢ < 1 || wᵢ < 1 || dᵢ < 1 || hᵢ > Hᵢ || wᵢ > Wᵢ || dᵢ > Dᵢ) - continue - end - F_d = Dₖ - (dₖ - 1) + dᵢ = 1 + s[3] * (dₒ - 1) + d[3] * (dₖ - 1) - p[5] + hᵢ = 1 + s[2] * (hₒ - 1) + d[2] * (hₖ - 1) - p[3] + wᵢ = 1 + s[1] * (wₒ - 1) + d[1] * (wₖ - 1) - p[1] + F_w = Wₖ - (wₖ - 1) F_h = Hₖ - (hₖ - 1) + F_d = Dₖ - (dₖ - 1) @inbounds for ii = 1:B - tmpI = Vec{8, T}(X[ii, dᵢ, wᵢ, hᵢ, i, n]) - tmpO = vload(Vec{B, T}, view(Out, :, dₒ, wₒ, hₒ, j, n), 1) - tmpW = vload(Vec{B, T}, view(W, :, ii, F_d, F_w, F_h, i, j), 1) + tmpI = Vec{8, T}(X[ii, wᵢ, hᵢ, dᵢ, i, n]) + tmpO = vload(Vec{B, T}, view(Out, :, wₒ, hₒ, dₒ, j, n), 1) + tmpW = vload(Vec{B, T}, view(W, :, ii, F_w, F_h, F_d, i, j), 1) tmpOut = fma(tmpI, tmpW, tmpO) - vstore(tmpOut, view(Out, :, dₒ, wₒ, hₒ, j, n), 1) + vstore(tmpOut, view(Out, :, wₒ, hₒ, dₒ, j, n), 1) + end + end + + #calculate the regions with conditionals + @inbounds for (w_region, h_region, d_region) in padded_regions + @inbounds for i =1:Cᵢ, dₒ = d_region, hₒ = h_region, dₖ = 1:Dₖ, hₖ = 1:Hₖ, wₖ = 1:Wₖ, wₒ = w_region + # pre-calculate indexes for the inner loop + dᵢ = 1 + s[3] * (dₒ - 1) + d[3] * (dₖ - 1) - p[5] + hᵢ = 1 + s[2] * (hₒ - 1) + d[2] * (hₖ - 1) - p[3] + wᵢ = 1 + s[1] * (wₒ - 1) + d[1] * (wₖ - 1) - p[1] + # Check for over-input + if (hᵢ < 1 || wᵢ < 1 || dᵢ < 1 || hᵢ > Hᵢ || wᵢ > Wᵢ || dᵢ > Dᵢ) + continue + end + F_w = Wₖ - (wₖ - 1) + F_h = Hₖ - (hₖ - 1) + F_d = Dₖ - (dₖ - 1) + @inbounds for ii = 1:B + tmpI = Vec{8, T}(X[ii, wᵢ, hᵢ, dᵢ, i, n]) + tmpO = vload(Vec{B, T}, view(Out, :, wₒ, hₒ, dₒ, j, n), 1) + tmpW = vload(Vec{B, T}, view(W, :, ii, F_w, F_h, F_d, i, j), 1) + tmpOut = fma(tmpI, tmpW, tmpO) + vstore(tmpOut, view(Out, :, wₒ, hₒ, dₒ, j, n), 1) + end end end end @@ -99,10 +117,9 @@ function blocked_conv!(Out::Array{T,6}, @assert size(X)[1] == size(W)[2] ## Fuse a few outer loops to make sure we have enough jobs for the threads ## Most important if it's a low batch size kernel - out_loop_size = size(Out)[6] * size(W)[6] * size(W)[7] * size(Out)[4] + out_loop_size = size(Out)[6] * size(Out)[5] @inbounds Threads.@threads for ol = 1:out_loop_size - blocked_conv_inner_loop!(Out, X, W, ol, Vec{size(X)[1],T}, - padding(cdims), stride(cdims), dilation(cdims)) + blocked_conv_inner_loop!(Out, X, W, ol, Vec{size(X)[1],T}, cdims) end end diff --git a/test/blocked_conv.jl b/test/blocked_conv.jl index 766168802..3133b1ba7 100644 --- a/test/blocked_conv.jl +++ b/test/blocked_conv.jl @@ -37,15 +37,15 @@ function test_blocked_conv(im_size, @btime Out1 = blocked_conv($bX, $bW, $c) print("NNlib.conv: ") @btime Out2 = conv($X, $W, $c) + println() end c = DenseConvDims(X, W; stride = stride, dilation = dilation, padding = pad) Out1 = blocked_conv(bX, bW, c) Out2 = conv(X, W, c) @test isapprox(deblock(Out1, rank + 1), Out2) - println() end -do_benchmarking = true +do_benchmarking = false for im_size = [32, 64, 128, 192] for k_size = [5] From ca98fed4bd384f75ee5d162c3d92c7d0cd8d83b0 Mon Sep 17 00:00:00 2001 From: Matthew Brookhart Date: Mon, 20 May 2019 21:42:32 -0700 Subject: [PATCH 8/8] add SIMD to dependencies --- Manifest.toml | 6 ++++++ Project.toml | 1 + 2 files changed, 7 insertions(+) diff --git a/Manifest.toml b/Manifest.toml index 6c2dba115..5895a6ede 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -69,6 +69,12 @@ version = "0.5.2" [[SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" +[[SIMD]] +deps = ["InteractiveUtils", "Test"] +git-tree-sha1 = 
"a81f30058aa91fb53c794169436b402c3102a960" +uuid = "fdea26ae-647d-5447-a871-4b548cad5224" +version = "2.6.0" + [[Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" diff --git a/Project.toml b/Project.toml index 8d33535e8..07dc0bf1a 100644 --- a/Project.toml +++ b/Project.toml @@ -7,6 +7,7 @@ BinaryProvider = "b99e7846-7c00-51b0-8f62-c81ae34c0232" Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Requires = "ae029012-a4dd-5104-9daa-d747884805df" +SIMD = "fdea26ae-647d-5447-a871-4b548cad5224" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"