diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md
index e8c31e235b..e904ed6514 100644
--- a/docs/src/models/layers.md
+++ b/docs/src/models/layers.md
@@ -50,5 +50,6 @@ These layers don't affect the structure of the network but may improve training
 Flux.testmode!
 BatchNorm
 Dropout
+AlphaDropout
 LayerNorm
 ```
diff --git a/src/Flux.jl b/src/Flux.jl
index a8bd4f0bf9..c806716d94 100644
--- a/src/Flux.jl
+++ b/src/Flux.jl
@@ -7,7 +7,7 @@ using MacroTools, Juno, Requires, Reexport, Statistics, Random
 using MacroTools: @forward
 
 export Chain, Dense, RNN, LSTM, GRU, Conv, ConvTranspose, MaxPool, MeanPool,
-       DepthwiseConv, Dropout, LayerNorm, BatchNorm, InstanceNorm,
+       DepthwiseConv, Dropout, AlphaDropout, LayerNorm, BatchNorm, InstanceNorm,
        params, mapleaves, cpu, gpu, f32, f64
 
 @reexport using NNlib
diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index 054ca08b78..5fd93e9db2 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -43,6 +43,37 @@ end
 
 _testmode!(a::Dropout, test) = (a.active = !test)
 
+"""
+    AlphaDropout(p)
+A dropout layer for Self-Normalizing Neural Networks
+(https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf).
+It ensures that the mean and variance of the activations remain the same as before.
+"""
+mutable struct AlphaDropout{F}
+  p::F
+  active::Bool
+end
+
+function AlphaDropout(p)
+  @assert 0 ≤ p ≤ 1
+  AlphaDropout(p, true)
+end
+
+function (a::AlphaDropout)(x)
+  a.active || return x
+  λ = eltype(x)(1.0507009873554804934193349852946)  # SELU scale λ
+  α = eltype(x)(1.6732632423543772848170429916717)  # SELU α
+  α1 = eltype(x)(-λ*α)                              # value assigned to dropped units
+  noise = rand(eltype(x), size(x))                  # uniform noise for the Bernoulli mask
+  x = @. x*(noise > (1 - a.p)) + α1 * (noise <= (1 - a.p))  # keep each input with probability p
+  A = (a.p + a.p * (1 - a.p) * α1 ^ 2)^(-0.5)       # affine correction restores unit variance
+  B = -A * α1 * (1 - a.p)                           # ...and zero mean
+  x = @. A * x + B
+  return x
+end
+
+_testmode!(a::AlphaDropout, test) = (a.active = !test)
+
 """
     LayerNorm(h::Integer)
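
For quick review, a minimal usage sketch of the layer added above, assuming the diff has been applied to a Flux checkout of the same era. The 0.9 rate, the 1000-element vector, and the variable names are illustrative choices rather than values taken from the PR; `Flux.testmode!` is used exactly as with the existing `Dropout` layer.

```julia
using Flux, Statistics

# Illustrative only: apply the new layer to a standard-normal vector and check
# that its statistics are (approximately) preserved while the layer is active.
d = AlphaDropout(0.9)      # in this implementation inputs are kept with probability p
x = randn(1000)
y = d(x)
mean(y), var(y)            # both remain close to 0 and 1

Flux.testmode!(d)          # deactivate the layer for evaluation, as with Dropout
d(x) == x                  # in test mode the input passes through unchanged
```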