diff --git a/Project.toml b/Project.toml
index 34e98362c..a363c55ab 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,13 +5,11 @@ version = "0.7.3"
 [deps]
 Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 
 [compat]
-LoopVectorization = "0.8.10"
 Requires = "0.5, 1.0"
 julia = "1.3"
 
diff --git a/src/NNlib.jl b/src/NNlib.jl
index ef8acc987..d954a68cd 100644
--- a/src/NNlib.jl
+++ b/src/NNlib.jl
@@ -1,8 +1,6 @@
 module NNlib
 using Pkg
 using Requires
-using LoopVectorization
-using Requires
 
 # Include APIs
 include("dim_helpers.jl")
diff --git a/src/activation.jl b/src/activation.jl
index 3e7dd22c4..870006b69 100644
--- a/src/activation.jl
+++ b/src/activation.jl
@@ -1,11 +1,6 @@
 export σ, sigmoid, hardσ, hardsigmoid, hardtanh, relu, leakyrelu, relu6, rrelu, elu, gelu, swish, selu, celu, softplus, softsign, logσ, logsigmoid, logcosh,
        mish, tanhshrink, softshrink, thresholdrelu, trelu, lisht
 
-import LoopVectorization: vifelse
-using LoopVectorization.SLEEFPirates: FloatType
-
-const RealOrFloatType = Union{Real, FloatType}
-
 ## Activation functions
 #
 # Some of activation functions have its wrapper function for GPU in CuArrays.jl.
@@ -17,9 +12,9 @@ const RealOrFloatType = Union{Real, FloatType}
 Classic [sigmoid](https://en.wikipedia.org/wiki/Sigmoid_function)
 activation function.
 """
-function σ(x::RealOrFloatType)
+function σ(x)
     t = exp(-abs(x))
-    vifelse(x ≥ 0, inv(one(t) + t), t / (one(t) + t))
+    ifelse(x ≥ 0, inv(one(t) + t), t / (one(t) + t))
 end
 
 const sigmoid = σ
@@ -29,7 +24,7 @@ const sigmoid = σ
 Segment-wise linear approximation of sigmoid.
 See [BinaryConnect: Training Deep Neural Networks withbinary weights during propagations](https://arxiv.org/pdf/1511.00363.pdf).
 """
-hardσ(x::RealOrFloatType, a=0.2) = oftype(x/1, max(zero(x/1), min(one(x/1), oftype(x/1,a) * x + oftype(x/1,0.5))))
+hardσ(x, a=0.2) = oftype(x/1, max(zero(x/1), min(one(x/1), oftype(x/1,a) * x + oftype(x/1,0.5))))
 
 const hardsigmoid = hardσ
 
@@ -46,7 +41,7 @@ Return `log(σ(x))` which is computed in a numerically stable way.
   -10.000045398899218
   -3.720075976020836e-44
 """
-logσ(x::RealOrFloatType) = -softplus(-x)
+logσ(x) = -softplus(-x)
 
 const logsigmoid = logσ
 
@@ -56,7 +51,7 @@ const logsigmoid = logσ
 Segment-wise linear approximation of tanh. Cheaper and more computational efficient version of tanh.
 See [Large Scale Machine Learning](http://ronan.collobert.org/pub/matos/2004_phdthesis_lip6.pdf).
 """
-hardtanh(x::RealOrFloatType) = max(-one(x), min( one(x), x))
+hardtanh(x) = max(-one(x), min( one(x), x))
 
 
 """
@@ -65,7 +60,7 @@ hardtanh(x::RealOrFloatType) = max(-one(x), min( one(x), x))
 [Rectified Linear Unit](https://en.wikipedia.org/wiki/Rectifier_(neural_networks))
 activation function.
 """
-relu(x::RealOrFloatType) = max(zero(x), x)
+relu(x) = max(zero(x), x)
 
 
 """
@@ -75,7 +70,7 @@ Leaky [Rectified Linear Unit](https://en.wikipedia.org/wiki/Rectifier_(neural_ne
 activation function.
 You can also specify the coefficient explicitly, e.g. `leakyrelu(x, 0.01)`.
 """
-leakyrelu(x::RealOrFloatType, a = oftype(x / 1, 0.01)) = max(a * x, x / one(x))
+leakyrelu(x, a = oftype(x / 1, 0.01)) = max(a * x, x / one(x))
 
 """
     relu6(x) = min(max(0, x), 6)
@@ -84,7 +79,7 @@ leakyrelu(x::RealOrFloatType, a = oftype(x / 1, 0.01)) = max(a * x, x / one(x))
 activation function capped at 6.
 See [Convolutional Deep Belief Networks on CIFAR-10](http://www.cs.utoronto.ca/%7Ekriz/conv-cifar10-aug2010.pdf)
 """
-relu6(x::RealOrFloatType) = min(relu(x), oftype(x, 6))
+relu6(x) = min(relu(x), oftype(x, 6))
 
 """
     rrelu(x, l=1/8, u=1/3) = max(a*x, x)
@@ -95,7 +90,7 @@ Randomized Leaky [Rectified Linear Unit](https://arxiv.org/pdf/1505.00853.pdf)
 activation function.
 You can also specify the bound explicitly, e.g. `rrelu(x, 0.0, 1.0)`.
 """
-function rrelu(x::RealOrFloatType, l::Real = 1 / 8.0, u::Real = 1 / 3.0)
+function rrelu(x, l = 1 / 8.0, u = 1 / 3.0)
     a = oftype(x / 1, (u - l) * rand() + l)
     return leakyrelu(x, a)
 end
@@ -108,7 +103,7 @@ Exponential Linear Unit activation function.
 See [Fast and Accurate Deep Network Learning by Exponential Linear Units](https://arxiv.org/abs/1511.07289).
 You can also specify the coefficient explicitly, e.g. `elu(x, 1)`.
 """
-elu(x::RealOrFloatType, α = one(x)) = vifelse(x ≥ 0, x / one(x), α * (exp(x) - one(x)))
+elu(x, α = one(x)) = ifelse(x ≥ 0, x / one(x), α * (exp(x) - one(x)))
 
 
 """
@@ -117,7 +112,7 @@ elu(x::RealOrFloatType, α = one(x)) = vifelse(x ≥ 0, x / one(x), α * (exp(x)
 [Gaussian Error Linear Unit](https://arxiv.org/pdf/1606.08415.pdf)
 activation function.
 """
-function gelu(x::RealOrFloatType)
+function gelu(x)
     p = oftype(x / 1, π)
     λ = oftype(x / 1, √(2 / p))
     α = oftype(x / 1, 0.044715)
@@ -132,7 +127,7 @@ end
 Self-gated activation function.
 See [Swish: a Self-Gated Activation Function](https://arxiv.org/pdf/1710.05941.pdf).
 """
-swish(x::RealOrFloatType) = x * σ(x)
+swish(x) = x * σ(x)
 
 
 """
@@ -141,7 +136,7 @@ swish(x::RealOrFloatType) = x * σ(x)
 Non-Parametric Linearly Scaled Hyperbolic Tangent Activation Function.
 See [LiSHT](https://arxiv.org/abs/1901.05894)
 """
-lisht(x::RealOrFloatType) = x * tanh(x)
+lisht(x) = x * tanh(x)
 
 
 """
@@ -153,10 +148,10 @@ lisht(x::RealOrFloatType) = x * tanh(x)
 Scaled exponential linear units.
 See [Self-Normalizing Neural Networks](https://arxiv.org/pdf/1706.02515.pdf).
 """
-function selu(x::RealOrFloatType)
+function selu(x)
     λ = oftype(x / 1, 1.0507009873554804934193349852946)
     α = oftype(x / 1, 1.6732632423543772848170429916717)
-    λ * vifelse(x > 0, x / one(x), α * (exp(x) - one(x)))
+    λ * ifelse(x > 0, x / one(x), α * (exp(x) - one(x)))
 end
 
 """
@@ -166,7 +161,7 @@ end
 Continuously Differentiable Exponential Linear Units
 See [Continuously Differentiable Exponential Linear Units](https://arxiv.org/pdf/1704.07483.pdf).
 """
-celu(x::RealOrFloatType, α::Real = one(x)) = vifelse(x ≥ 0, x / one(x), α * (exp(x/α) - one(x)))
+celu(x, α = one(x)) = ifelse(x ≥ 0, x / one(x), α * (exp(x/α) - one(x)))
 
 
 """
@@ -175,7 +170,7 @@ celu(x::RealOrFloatType, α::Real = one(x)) = vifelse(x ≥ 0, x / one(x), α *
 Threshold Gated Rectified Linear.
 See [ThresholdRelu](https://arxiv.org/pdf/1402.3337.pdf)
 """
-trelu(x::RealOrFloatType,theta = one(x)) = vifelse(x> theta, x, zero(x))
+trelu(x, theta = one(x)) = ifelse(x> theta, x, zero(x))
 
 const thresholdrelu = trelu
 
@@ -184,7 +179,7 @@ const thresholdrelu = trelu
 
 See [Quadratic Polynomials Learn Better Image Features](http://www.iro.umontreal.ca/~lisa/publications2/index.php/attachments/single/205).
 """
-softsign(x::RealOrFloatType) = x / (one(x) + abs(x))
+softsign(x) = x / (one(x) + abs(x))
 
 
 """
@@ -192,7 +187,7 @@ softsign(x::RealOrFloatType) = x / (one(x) + abs(x))
 
 See [Deep Sparse Rectifier Neural Networks](http://proceedings.mlr.press/v15/glorot11a/glorot11a.pdf).
""" -softplus(x::RealOrFloatType) = vifelse(x > 0, x + log1p(exp(-x)), log1p(exp(x))) +softplus(x) = ifelse(x > 0, x + log1p(exp(-x)), log1p(exp(x))) """ @@ -200,7 +195,7 @@ softplus(x::RealOrFloatType) = vifelse(x > 0, x + log1p(exp(-x)), log1p(exp(x))) Return `log(cosh(x))` which is computed in a numerically stable way. """ -logcosh(x::RealOrFloatType) = x + softplus(-2x) - log(oftype(x, 2)) +logcosh(x) = x + softplus(-2x) - log(oftype(x, 2)) """ @@ -209,14 +204,14 @@ logcosh(x::RealOrFloatType) = x + softplus(-2x) - log(oftype(x, 2)) Self Regularized Non-Monotonic Neural Activation Function. See [Mish: A Self Regularized Non-Monotonic Neural Activation Function](https://arxiv.org/abs/1908.08681). """ -mish(x::RealOrFloatType) = x * tanh(softplus(x)) +mish(x) = x * tanh(softplus(x)) """ tanhshrink(x) = x - tanh(x) See [Tanhshrink Activation Function](https://www.gabormelli.com/RKB/Tanhshrink_Activation_Function). """ -tanhshrink(x::RealOrFloatType) = x - tanh(x) +tanhshrink(x) = x - tanh(x) """ softshrink(x, λ=0.5) = @@ -224,14 +219,10 @@ tanhshrink(x::RealOrFloatType) = x - tanh(x) See [Softshrink Activation Function](https://www.gabormelli.com/RKB/Softshrink_Activation_Function). """ -softshrink(x::RealOrFloatType, λ = oftype(x/1, 0.5)) = min(max(zero(x), x - λ), x + λ) +softshrink(x, λ = oftype(x/1, 0.5)) = min(max(zero(x), x - λ), x + λ) # Provide an informative error message if activation functions are called with an array for f in (:σ, :hardσ, :logσ, :hardtanh, :relu, :leakyrelu, :relu6, :rrelu, :elu, :gelu, :swish, :lisht, :selu, :celu, :trelu, :softsign, :softplus, :logcosh, :mish, :tanhshrink, :softshrink) @eval $(f)(x::AbstractArray, args...) = error("Use broadcasting (`", $(string(f)), ".(x)`) to apply activation functions to arrays.") end - -for f in (:σ, :tanh) - @eval Base.broadcasted(::typeof($f), x::Array{T, N}) where {T <: Union{Float64, Float32}, N} = vmap($f, x) -end diff --git a/src/softmax.jl b/src/softmax.jl index 24ed88027..4fec286c3 100644 --- a/src/softmax.jl +++ b/src/softmax.jl @@ -1,13 +1,6 @@ export softmax, softmax!, ∇softmax, ∇softmax!, logsoftmax, logsoftmax!, ∇logsoftmax, ∇logsoftmax! -fsum(x; dims) = sum(x, dims = dims) -fsum(x::Array; dims) = vreduce(+, x, dims = dims) -fmaximum(x; dims) = maximum(x, dims = dims) -fmaximum(x::Array; dims) = vreduce(max, x, dims = dims) -fmap(f, x) = map(f, x) -fmap(f, x::Array) = vmap(f, x) - """ softmax(x; dims=1) @@ -33,38 +26,45 @@ julia> softmax([1, 2, 3]) See also [`logsoftmax`](@ref). 
""" function softmax(xs::AbstractArray; dims=1) - max_ = fmaximum(xs, dims=dims) - exp_ = fmap(exp, xs .- max_) - exp_ ./ fsum(exp_, dims=dims) + max_ = maximum(xs, dims=dims) + exp_ = exp.(xs .- max_) + exp_ ./ sum(exp_, dims=dims) end -function softmax!(out::AbstractVecOrMat, xs::AbstractVecOrMat) - for j = 1:size(xs, 2) - xi_max = xs[1, j] - @avx for i = 1:size(xs, 1) - xi_max = max(xi_max, xs[i, j]) - end - @avx for i = 1:size(out, 1) - out[i, j] = exp(xs[i, j] - xi_max) - end - s = zero(eltype(out)) - @avx for i = 1:size(out, 1) +function softmax!(out::AbstractVecOrMat{T}, xs::AbstractVecOrMat{T}) where {T} + @inbounds for j = 1:size(xs, 2) + # First, store column-wise maximum in the last element of `out` + out[end, j] = xs[end, j] + @inbounds for i = 1:(size(xs, 1) - 1) + out[end, j] = max(out[end, j], xs[i, j]) + end + + # Subtract the column-wise maximums to normalize, take exp() + # out .= exp(xs .- out[end, :]) + @inbounds for i = 1:size(out, 1) + out[i, j] = exp(xs[i, j] - out[end, j]) + end + + # Normalize by sum of the entire thing + # out ./= sum(out, 1) + s = T(0) + @inbounds for i = 1:size(out, 1) s += out[i, j] - end - @avx for i = 1:size(out, 1) + end + @inbounds for i = 1:size(out, 1) out[i, j] /= s - end + end + out end - return out end function ∇softmax!(out::AbstractVecOrMat, Δ::AbstractVecOrMat, xs::AbstractVecOrMat) sf = softmax(xs) - out .= sf .* (Δ .- fsum(Δ .* sf, dims = 1)) + out .= sf .* (Δ .- sum(Δ .* sf, dims = 1)) end function ∇softmax(Δ, xs; dims=1) sf = softmax(xs, dims=dims) - sf .* (Δ .- fsum(Δ .* sf, dims=dims)) + sf .* (Δ .- sum(Δ .* sf, dims=dims)) end ∇softmax!(Δ, xs) = ∇softmax!(Δ, Δ, xs) @@ -83,32 +83,34 @@ It is semantically equivalent to the following: See also [`softmax`](@ref). """ function logsoftmax(xs::AbstractArray; dims=1) - max_ = fmaximum(xs, dims=dims) - exp_ = fmap(exp, xs .- max_) - log_ = fmap(log, fsum(exp_, dims=dims)) + max_ = maximum(xs, dims=dims) + exp_ = exp.(xs .- max_) + log_ = log.(sum(exp_, dims=dims)) (xs .- max_) .- log_ end function logsoftmax!(out::AbstractVecOrMat, xs::AbstractVecOrMat) for j = 1:size(xs, 2) - xi_max = xs[1, j] - @avx for i = 1:size(out, 1) - xi_max = max(xi_max, xs[i, j]) - end - s = zero(eltype(out)) - @avx for i = 1:size(out, 1) - s += exp(xs[i, j] - xi_max) - end - @avx for i = 1:size(out, 1) - out[i, j] = xs[i, j] - log(s) - xi_max + @inbounds begin + xi_max = xs[1, j] + for i = 1:size(out, 1) + xi_max = max(xi_max, xs[i, j]) + end + s = zero(eltype(out)) + for i = 1:size(out, 1) + s += exp(xs[i, j] - xi_max) + end + for i = 1:size(out, 1) + out[i, j] = xs[i, j] - log(s) - xi_max + end end end return out end function ∇logsoftmax!(out::AbstractVecOrMat, Δ::AbstractVecOrMat, xs::AbstractVecOrMat) - out .= Δ .- fsum(Δ, dims=1) .* softmax(xs, dims=1) + out .= Δ .- sum(Δ, dims=1) .* softmax(xs, dims=1) end -∇logsoftmax(Δ, xs; dims=1) = Δ .- fsum(Δ, dims=dims) .* softmax(xs, dims=dims) +∇logsoftmax(Δ, xs; dims=1) = Δ .- sum(Δ, dims=dims) .* softmax(xs, dims=dims) ∇logsoftmax!(Δ, xs) = ∇logsoftmax!(Δ, Δ, xs)