diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md
index e8c31e235b..e904ed6514 100644
--- a/docs/src/models/layers.md
+++ b/docs/src/models/layers.md
@@ -50,5 +50,6 @@ These layers don't affect the structure of the network but may improve training
 Flux.testmode!
 BatchNorm
 Dropout
+AlphaDropout
 LayerNorm
 ```
diff --git a/src/Flux.jl b/src/Flux.jl
index a8bd4f0bf9..c806716d94 100644
--- a/src/Flux.jl
+++ b/src/Flux.jl
@@ -7,7 +7,7 @@ using MacroTools, Juno, Requires, Reexport, Statistics, Random
 using MacroTools: @forward
 
 export Chain, Dense, RNN, LSTM, GRU, Conv, ConvTranspose, MaxPool, MeanPool,
-       DepthwiseConv, Dropout, LayerNorm, BatchNorm, InstanceNorm,
+       DepthwiseConv, Dropout, AlphaDropout, LayerNorm, BatchNorm, InstanceNorm,
        params, mapleaves, cpu, gpu, f32, f64
 
 @reexport using NNlib
diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index 054ca08b78..5fd93e9db2 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -43,6 +43,37 @@ end
 
 _testmode!(a::Dropout, test) = (a.active = !test)
 
+"""
+    AlphaDropout(p)
+A dropout layer for Self-Normalizing Neural Networks
+(https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf).
+It ensures that the mean and variance of the activations remain the same as before.
+"""
+mutable struct AlphaDropout{F}
+  p::F
+  active::Bool
+end
+
+function AlphaDropout(p)
+  @assert 0 ≤ p ≤ 1
+  AlphaDropout(p, true)
+end
+
+function (a::AlphaDropout)(x)
+  a.active || return x
+  λ = eltype(x)(1.0507009873554804934193349852946)  # SELU scale λ
+  α = eltype(x)(1.6732632423543772848170429916717)  # SELU α
+  α1 = eltype(x)(-λ*α)                              # value assigned to dropped units
+  noise = rand(eltype(x), size(x))                  # uniform noise for the Bernoulli mask
+  x = @. x*(noise > (1 - a.p)) + α1 * (noise <= (1 - a.p))  # keep each input with probability p
+  A = (a.p + a.p * (1 - a.p) * α1 ^ 2)^(-0.5)       # affine correction restores unit variance
+  B = -A * α1 * (1 - a.p)                           # ...and zero mean
+  x = @. A * x + B
+  return x
+end
+
+_testmode!(a::AlphaDropout, test) = (a.active = !test)
+
 """
     LayerNorm(h::Integer)
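
For quick review, a minimal usage sketch of the layer added above, assuming the diff has been applied to a Flux checkout of the same era. The 0.9 rate, the 1000-element vector, and the variable names are illustrative choices rather than values taken from the PR; `Flux.testmode!` is used exactly as with the existing `Dropout` layer.

```julia
using Flux, Statistics

# Illustrative only: apply the new layer to a standard-normal vector and check
# that its statistics are (approximately) preserved while the layer is active.
d = AlphaDropout(0.9)      # in this implementation inputs are kept with probability p
x = randn(1000)
y = d(x)
mean(y), var(y)            # both remain close to 0 and 1

Flux.testmode!(d)          # deactivate the layer for evaluation, as with Dropout
d(x) == x                  # in test mode the input passes through unchanged
```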