Revert LoopVectorization #226

Merged: 5 commits on Aug 1, 2020
Changes from 3 commits
2 changes: 0 additions & 2 deletions Project.toml
@@ -5,14 +5,12 @@ version = "0.7.3"
[deps]
Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
NNPACK_jll = "a6bfbf70-4841-5cb9-aa18-3a8ad3c413ee"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

[compat]
LoopVectorization = "0.8.10"
Requires = "0.5, 1.0"
julia = "1.3"

1 change: 0 additions & 1 deletion src/NNlib.jl
@@ -1,7 +1,6 @@
module NNlib
using Pkg
using Requires
using LoopVectorization
using NNPACK_jll

# Include APIs
55 changes: 23 additions & 32 deletions src/activation.jl
@@ -1,11 +1,6 @@
export σ, sigmoid, hardσ, hardsigmoid, hardtanh, relu, leakyrelu, relu6, rrelu, elu, gelu, swish, selu, celu, softplus, softsign, logσ,
logsigmoid, logcosh, mish, tanhshrink, softshrink, thresholdrelu, trelu, lisht

import LoopVectorization: vifelse
using LoopVectorization.SLEEFPirates: FloatType

const RealOrFloatType = Union{Real, FloatType}

## Activation functions
#
# Some of activation functions have its wrapper function for GPU in CuArrays.jl.
@@ -17,9 +12,9 @@ const RealOrFloatType = Union{Real, FloatType}
Classic [sigmoid](https://en.wikipedia.org/wiki/Sigmoid_function) activation
function.
"""
function σ(x::RealOrFloatType)
function σ(x::Real)
t = exp(-abs(x))
vifelse(x ≥ 0, inv(one(t) + t), t / (one(t) + t))
ifelse(x ≥ 0, inv(one(t) + t), t / (one(t) + t))
end
const sigmoid = σ

@@ -29,7 +24,7 @@ const sigmoid = σ
Segment-wise linear approximation of sigmoid.
See [BinaryConnect: Training Deep Neural Networks withbinary weights during propagations](https://arxiv.org/pdf/1511.00363.pdf).
"""
hardσ(x::RealOrFloatType, a=0.2) = oftype(x/1, max(zero(x/1), min(one(x/1), oftype(x/1,a) * x + oftype(x/1,0.5))))
hardσ(x::Real, a=0.2) = oftype(x/1, max(zero(x/1), min(one(x/1), oftype(x/1,a) * x + oftype(x/1,0.5))))
const hardsigmoid = hardσ


@@ -46,7 +41,7 @@ Return `log(σ(x))` which is computed in a numerically stable way.
-10.000045398899218
-3.720075976020836e-44
"""
logσ(x::RealOrFloatType) = -softplus(-x)
logσ(x::Real) = -softplus(-x)
const logsigmoid = logσ


@@ -56,7 +51,7 @@ const logsigmoid = logσ
Segment-wise linear approximation of tanh. Cheaper and more computational efficient version of tanh.
See [Large Scale Machine Learning](http://ronan.collobert.org/pub/matos/2004_phdthesis_lip6.pdf).
"""
hardtanh(x::RealOrFloatType) = max(-one(x), min( one(x), x))
hardtanh(x::Real) = max(-one(x), min( one(x), x))


"""
@@ -65,7 +60,7 @@ hardtanh(x::RealOrFloatType) = max(-one(x), min( one(x), x))
[Rectified Linear Unit](https://en.wikipedia.org/wiki/Rectifier_(neural_networks))
activation function.
"""
relu(x::RealOrFloatType) = max(zero(x), x)
relu(x::Real) = max(zero(x), x)


"""
@@ -75,7 +70,7 @@ Leaky [Rectified Linear Unit](https://en.wikipedia.org/wiki/Rectifier_(neural_ne
activation function.
You can also specify the coefficient explicitly, e.g. `leakyrelu(x, 0.01)`.
"""
leakyrelu(x::RealOrFloatType, a = oftype(x / 1, 0.01)) = max(a * x, x / one(x))
leakyrelu(x::Real, a = oftype(x / 1, 0.01)) = max(a * x, x / one(x))

"""
relu6(x) = min(max(0, x), 6)
@@ -84,7 +79,7 @@ leakyrelu(x::RealOrFloatType, a = oftype(x / 1, 0.01)) = max(a * x, x / one(x))
activation function capped at 6.
See [Convolutional Deep Belief Networks on CIFAR-10](http://www.cs.utoronto.ca/%7Ekriz/conv-cifar10-aug2010.pdf)
"""
relu6(x::RealOrFloatType) = min(relu(x), oftype(x, 6))
relu6(x::Real) = min(relu(x), oftype(x, 6))

"""
rrelu(x, l=1/8, u=1/3) = max(a*x, x)
@@ -95,7 +90,7 @@ Randomized Leaky [Rectified Linear Unit](https://arxiv.org/pdf/1505.00853.pdf)
activation function.
You can also specify the bound explicitly, e.g. `rrelu(x, 0.0, 1.0)`.
"""
function rrelu(x::RealOrFloatType, l::Real = 1 / 8.0, u::Real = 1 / 3.0)
function rrelu(x::Real, l::Real = 1 / 8.0, u::Real = 1 / 3.0)
a = oftype(x / 1, (u - l) * rand() + l)
return leakyrelu(x, a)
end
@@ -108,7 +103,7 @@ Exponential Linear Unit activation function.
See [Fast and Accurate Deep Network Learning by Exponential Linear Units](https://arxiv.org/abs/1511.07289).
You can also specify the coefficient explicitly, e.g. `elu(x, 1)`.
"""
elu(x::RealOrFloatType, α = one(x)) = vifelse(x ≥ 0, x / one(x), α * (exp(x) - one(x)))
elu(x::Real, α = one(x)) = ifelse(x ≥ 0, x / one(x), α * (exp(x) - one(x)))


"""
@@ -117,7 +112,7 @@ elu(x::RealOrFloatType, α = one(x)) = vifelse(x ≥ 0, x / one(x), α * (exp(x)
[Gaussian Error Linear Unit](https://arxiv.org/pdf/1606.08415.pdf)
activation function.
"""
function gelu(x::RealOrFloatType)
function gelu(x::Real)
p = oftype(x / 1, π)
λ = oftype(x / 1, √(2 / p))
α = oftype(x / 1, 0.044715)
@@ -132,7 +127,7 @@ end
Self-gated activation function.
See [Swish: a Self-Gated Activation Function](https://arxiv.org/pdf/1710.05941.pdf).
"""
swish(x::RealOrFloatType) = x * σ(x)
swish(x::Real) = x * σ(x)


"""
@@ -141,7 +136,7 @@ swish(x::RealOrFloatType) = x * σ(x)
Non-Parametric Linearly Scaled Hyperbolic Tangent Activation Function.
See [LiSHT](https://arxiv.org/abs/1901.05894)
"""
lisht(x::RealOrFloatType) = x * tanh(x)
lisht(x::Real) = x * tanh(x)


"""
@@ -153,10 +148,10 @@ lisht(x::RealOrFloatType) = x * tanh(x)
Scaled exponential linear units.
See [Self-Normalizing Neural Networks](https://arxiv.org/pdf/1706.02515.pdf).
"""
function selu(x::RealOrFloatType)
function selu(x::Real)
λ = oftype(x / 1, 1.0507009873554804934193349852946)
α = oftype(x / 1, 1.6732632423543772848170429916717)
λ * vifelse(x > 0, x / one(x), α * (exp(x) - one(x)))
λ * ifelse(x > 0, x / one(x), α * (exp(x) - one(x)))
end

"""
@@ -166,7 +161,7 @@ end
Continuously Differentiable Exponential Linear Units
See [Continuously Differentiable Exponential Linear Units](https://arxiv.org/pdf/1704.07483.pdf).
"""
celu(x::RealOrFloatType, α::Real = one(x)) = vifelse(x ≥ 0, x / one(x), α * (exp(x/α) - one(x)))
celu(x::Real, α::Real = one(x)) = ifelse(x ≥ 0, x / one(x), α * (exp(x/α) - one(x)))


"""
@@ -175,7 +170,7 @@ celu(x::RealOrFloatType, α::Real = one(x)) = vifelse(x ≥ 0, x / one(x), α *
Threshold Gated Rectified Linear.
See [ThresholdRelu](https://arxiv.org/pdf/1402.3337.pdf)
"""
trelu(x::RealOrFloatType,theta = one(x)) = vifelse(x> theta, x, zero(x))
trelu(x::Real,theta = one(x)) = ifelse(x> theta, x, zero(x))
const thresholdrelu = trelu


@@ -184,23 +179,23 @@ const thresholdrelu = trelu

See [Quadratic Polynomials Learn Better Image Features](http://www.iro.umontreal.ca/~lisa/publications2/index.php/attachments/single/205).
"""
softsign(x::RealOrFloatType) = x / (one(x) + abs(x))
softsign(x::Real) = x / (one(x) + abs(x))


"""
softplus(x) = log(exp(x) + 1)

See [Deep Sparse Rectifier Neural Networks](http://proceedings.mlr.press/v15/glorot11a/glorot11a.pdf).
"""
softplus(x::RealOrFloatType) = vifelse(x > 0, x + log1p(exp(-x)), log1p(exp(x)))
softplus(x::Real) = ifelse(x > 0, x + log1p(exp(-x)), log1p(exp(x)))


"""
logcosh(x)

Return `log(cosh(x))` which is computed in a numerically stable way.
"""
logcosh(x::RealOrFloatType) = x + softplus(-2x) - log(oftype(x, 2))
logcosh(x::Real) = x + softplus(-2x) - log(oftype(x, 2))


"""
@@ -209,29 +204,25 @@ logcosh(x::RealOrFloatType) = x + softplus(-2x) - log(oftype(x, 2))
Self Regularized Non-Monotonic Neural Activation Function.
See [Mish: A Self Regularized Non-Monotonic Neural Activation Function](https://arxiv.org/abs/1908.08681).
"""
mish(x::RealOrFloatType) = x * tanh(softplus(x))
mish(x::Real) = x * tanh(softplus(x))

"""
tanhshrink(x) = x - tanh(x)

See [Tanhshrink Activation Function](https://www.gabormelli.com/RKB/Tanhshrink_Activation_Function).
"""
tanhshrink(x::RealOrFloatType) = x - tanh(x)
tanhshrink(x::Real) = x - tanh(x)

"""
softshrink(x, λ=0.5) =
(x ≥ λ ? x - λ : (-λ ≥ x ? x + λ : 0))

See [Softshrink Activation Function](https://www.gabormelli.com/RKB/Softshrink_Activation_Function).
"""
softshrink(x::RealOrFloatType, λ = oftype(x/1, 0.5)) = min(max(zero(x), x - λ), x + λ)
softshrink(x::Real, λ = oftype(x/1, 0.5)) = min(max(zero(x), x - λ), x + λ)

# Provide an informative error message if activation functions are called with an array
for f in (:σ, :hardσ, :logσ, :hardtanh, :relu, :leakyrelu, :relu6, :rrelu, :elu, :gelu, :swish, :lisht, :selu, :celu, :trelu, :softsign, :softplus, :logcosh, :mish, :tanhshrink, :softshrink)
@eval $(f)(x::AbstractArray, args...) =
error("Use broadcasting (`", $(string(f)), ".(x)`) to apply activation functions to arrays.")
end

for f in (:σ, :tanh)
@eval Base.broadcasted(::typeof($f), x::Array{T, N}) where {T <: Union{Float64, Float32}, N} = vmap($f, x)
Member Author:
This is also some aggressive piracy that is good to get rid of

end
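
The deleted `Base.broadcasted` methods are what the comment above calls piracy: NNlib owns neither `Base.broadcasted` nor `tanh`, `σ`, or `Array`, so the overload silently rerouted `tanh.(x)` on `Float32`/`Float64` arrays through `vmap` for every package in the session. A minimal sketch of the behaviour the revert restores (illustration only, not part of the diff):

# After the revert, broadcasting an activation over an Array goes through
# Base's ordinary broadcast machinery instead of LoopVectorization's vmap.
x = randn(Float64, 4)
y = tanh.(x)              # plain Base broadcasting
@assert y ≈ map(tanh, x)  # same values, no pirated method involved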
86 changes: 44 additions & 42 deletions src/softmax.jl
@@ -1,13 +1,6 @@
export softmax, softmax!, ∇softmax, ∇softmax!,
logsoftmax, logsoftmax!, ∇logsoftmax, ∇logsoftmax!

fsum(x; dims) = sum(x, dims = dims)
fsum(x::Array; dims) = vreduce(+, x, dims = dims)
fmaximum(x; dims) = maximum(x, dims = dims)
fmaximum(x::Array; dims) = vreduce(max, x, dims = dims)
fmap(f, x) = map(f, x)
fmap(f, x::Array) = vmap(f, x)

"""
softmax(x; dims=1)

@@ -33,38 +26,45 @@ julia> softmax([1, 2, 3])
See also [`logsoftmax`](@ref).
"""
function softmax(xs::AbstractArray; dims=1)
max_ = fmaximum(xs, dims=dims)
exp_ = fmap(exp, xs .- max_)
exp_ ./ fsum(exp_, dims=dims)
max_ = maximum(xs, dims=dims)
exp_ = exp.(xs .- max_)
exp_ ./ sum(exp_, dims=dims)
end

function softmax!(out::AbstractVecOrMat, xs::AbstractVecOrMat)
for j = 1:size(xs, 2)
xi_max = xs[1, j]
@avx for i = 1:size(xs, 1)
xi_max = max(xi_max, xs[i, j])
end
@avx for i = 1:size(out, 1)
out[i, j] = exp(xs[i, j] - xi_max)
end
s = zero(eltype(out))
@avx for i = 1:size(out, 1)
function softmax!(out::AbstractVecOrMat{T}, xs::AbstractVecOrMat{T}) where {T}
@inbounds for j = 1:size(xs, 2)
# First, store column-wise maximum in the last element of `out`
out[end, j] = xs[end, j]
@inbounds for i = 1:(size(xs, 1) - 1)
out[end, j] = max(out[end, j], xs[i, j])
end

# Subtract the column-wise maximums to normalize, take exp()
# out .= exp(xs .- out[end, :])
@inbounds for i = 1:size(out, 1)
out[i, j] = exp(xs[i, j] - out[end, j])
end

# Normalize by sum of the entire thing
# out ./= sum(out, 1)
s = T(0)
@inbounds for i = 1:size(out, 1)
s += out[i, j]
end
@avx for i = 1:size(out, 1)
end
@inbounds for i = 1:size(out, 1)
out[i, j] /= s
end
end
out
end
return out
end

function ∇softmax!(out::AbstractVecOrMat, Δ::AbstractVecOrMat, xs::AbstractVecOrMat)
sf = softmax(xs)
out .= sf .* (Δ .- fsum(Δ .* sf, dims = 1))
out .= sf .* (Δ .- sum(Δ .* sf, dims = 1))
end
function ∇softmax(Δ, xs; dims=1)
sf = softmax(xs, dims=dims)
sf .* (Δ .- fsum(Δ .* sf, dims=dims))
sf .* (Δ .- sum(Δ .* sf, dims=dims))
end
∇softmax!(Δ, xs) = ∇softmax!(Δ, Δ, xs)
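
As the new inline comments note, the rewritten `softmax!` reuses the last row of `out` as scratch space for the column-wise maximum before overwriting it with exponentials, so the sketch below keeps the output buffer separate from the input. Usage sketch only (illustrative sizes, assuming NNlib is loaded; not part of the diff):

using NNlib

xs  = randn(Float32, 10, 3)
out = similar(xs)
softmax!(out, xs)                     # column-wise softmax, written into `out`
@assert all(sum(out, dims=1) .≈ 1)    # each column is a probability distribution
@assert out ≈ softmax(xs)             # matches the out-of-place version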

@@ -83,32 +83,34 @@ It is semantically equivalent to the following:
See also [`softmax`](@ref).
"""
function logsoftmax(xs::AbstractArray; dims=1)
max_ = fmaximum(xs, dims=dims)
exp_ = fmap(exp, xs .- max_)
log_ = fmap(log, fsum(exp_, dims=dims))
max_ = maximum(xs, dims=dims)
exp_ = exp.(xs .- max_)
log_ = log.(sum(exp_, dims=dims))
(xs .- max_) .- log_
end

function logsoftmax!(out::AbstractVecOrMat, xs::AbstractVecOrMat)
for j = 1:size(xs, 2)
xi_max = xs[1, j]
@avx for i = 1:size(out, 1)
xi_max = max(xi_max, xs[i, j])
end
s = zero(eltype(out))
@avx for i = 1:size(out, 1)
s += exp(xs[i, j] - xi_max)
end
@avx for i = 1:size(out, 1)
out[i, j] = xs[i, j] - log(s) - xi_max
@inbounds begin
xi_max = xs[1, j]
for i = 1:size(out, 1)
xi_max = max(xi_max, xs[i, j])
end
s = zero(eltype(out))
for i = 1:size(out, 1)
s += exp(xs[i, j] - xi_max)
end
for i = 1:size(out, 1)
out[i, j] = xs[i, j] - log(s) - xi_max
end
end
end
return out
end

function ∇logsoftmax!(out::AbstractVecOrMat, Δ::AbstractVecOrMat, xs::AbstractVecOrMat)
out .= Δ .- fsum(Δ, dims=1) .* softmax(xs, dims=1)
out .= Δ .- sum(Δ, dims=1) .* softmax(xs, dims=1)
end

∇logsoftmax(Δ, xs; dims=1) = Δ .- fsum(Δ, dims=dims) .* softmax(xs, dims=dims)
∇logsoftmax(Δ, xs; dims=1) = Δ .- sum(Δ, dims=dims) .* softmax(xs, dims=dims)
∇logsoftmax!(Δ, xs) = ∇logsoftmax!(Δ, Δ, xs)
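
With LoopVectorization gone, the softmax family falls back to Base `maximum`/`sum` and ordinary broadcasting, and activation functions are applied to arrays by broadcasting, as the error message in activation.jl instructs. A quick end-to-end sketch of the reverted code paths (illustration only; names and values are arbitrary):

using NNlib

x  = randn(Float32, 5, 4)
y  = relu.(x)                     # activations broadcast over arrays
p  = softmax(x; dims=1)
lp = logsoftmax(x; dims=1)

@assert all(sum(p; dims=1) .≈ 1)  # softmax columns sum to one
@assert lp ≈ log.(p)              # logsoftmax agrees with log of softmax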