From e96e4efda8d14885457e3ae5002b6d752dc9c9b7 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Tue, 3 Jan 2023 17:45:45 -0500 Subject: [PATCH 01/11] use NNlib.dropout, deprecate Flux.dropout --- docs/src/models/layers.md | 2 +- src/deprecations.jl | 12 +++++++ src/layers/normalise.jl | 63 ++++++------------------------------ test/layers/normalisation.jl | 2 +- 4 files changed, 24 insertions(+), 55 deletions(-) diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index b8345c3f78..c0e1c57307 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -123,7 +123,7 @@ LayerNorm InstanceNorm GroupNorm Flux.normalise -Flux.dropout +NNlib.dropout ``` ### Test vs. Train diff --git a/src/deprecations.jl b/src/deprecations.jl index 8625c458e0..27c7bc2264 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -185,6 +185,18 @@ function update!(opt::Optimise.AbstractOptimiser, ::Params, grads::Union{Tuple, """) end + +function dropout(rng, x, p; dims=:, active::Bool=true) + if active + NNlib.dropout(rng, x, p; dims) + else + Base.depwarn("Flux.dropout(...; active=false) is deprecated. Please branch outside the function, or call dropout(x, 0) if you must.", :dropout) + return x + end +end +dropout(x, p; kwargs...) = dropout(NNlib._rng_from_array(x), x, p; kwargs...) + + # v0.14 deprecations # Enable these when 0.14 is released, and delete const ClipGrad = Optimise.ClipValue etc: diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 7ee28f64e9..7982361b14 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -1,52 +1,6 @@ - +# Internal function, used only for layers defined in this file. _isactive(m, x) = isnothing(m.active) ? NNlib.within_gradient(x) : m.active -_dropout_shape(s, ::Colon) = size(s) -_dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(size(s)))...) - -_dropout_kernel(y::T, p, q) where {T} = y > p ? T(1 / q) : T(0) - -""" - dropout([rng = rng_from_array(x)], x, p; dims=:, active=true) - -The dropout function. If `active` is `true`, -for each input, either sets that input to `0` (with probability -`p`) or scales it by `1 / (1 - p)`. `dims` specifies the unbroadcasted dimensions, -e.g. `dims=1` applies dropout along columns and `dims=2` along rows. -If `active` is `false`, it just returns the input `x`. - -Specify `rng` for custom RNGs instead of the default RNG. -Note that custom RNGs are only supported on the CPU. - -Warning: when using this function, you have to manually manage the activation -state. Usually in fact, dropout is used while training -but is deactivated in the inference phase. This can be -automatically managed using the [`Dropout`](@ref) layer instead of the -`dropout` function. - -The [`Dropout`](@ref) layer is what you should use in most scenarios. -""" -function dropout(rng, x, p; dims=:, active::Bool=true) - active || return x - y = dropout_mask(rng, x, p, dims=dims) - return x .* y -end -dropout(x, p; kwargs...) = dropout(rng_from_array(x), x, p; kwargs...) - -dropout_mask(rng::CUDA.RNG, x::CuArray, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...) -dropout_mask(rng, x::CuArray, p; kwargs...) = - throw(ArgumentError("x isa CuArray, but rng isa $(typeof(rng)). dropout_mask only support CUDA.RNG for CuArrays.")) -dropout_mask(rng, x, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...) 
-function _dropout_mask(rng, x, p; dims=:) - realfptype = float(real(eltype(x))) - y = rand!(rng, similar(x, realfptype, _dropout_shape(x, dims))) - y .= _dropout_kernel.(y, p, 1 - p) - return y -end - -# TODO move this to NNlib -ChainRulesCore.@non_differentiable dropout_mask(::Any, ::Any, ::Any) - """ Dropout(p; dims=:, rng = default_rng_value()) @@ -87,16 +41,16 @@ julia> isapprox(count(==(0), y) / length(y), 0.5, atol=0.1) true ``` """ -mutable struct Dropout{F,D,R<:AbstractRNG} +mutable struct Dropout{F<:Real,D,R<:AbstractRNG} p::F dims::D active::Union{Bool, Nothing} rng::R end -Dropout(p, dims, active) = Dropout(p, dims, active, default_rng_value()) +Dropout(p::Real, dims, active) = Dropout(p, dims, active, default_rng_value()) -function Dropout(p; dims=:, rng = default_rng_value()) - @assert 0 ≤ p ≤ 1 +function Dropout(p::Real; dims=:, rng = default_rng_value()) + 0 ≤ p ≤ 1 || throw(ArgumentError("Dropout expexts 0 ≤ p ≤ 1")) Dropout(p, dims, nothing, rng) end @@ -104,8 +58,11 @@ end trainable(a::Dropout) = (;) function (a::Dropout)(x) - _isactive(a, x) || return x - return dropout(a.rng, x, a.p; dims=a.dims, active=true) + if _isactive(a, x) && a.p != 0 + dropout(a.rng, x, a.p; dims=a.dims) + else + x + end end testmode!(m::Dropout, mode=true) = diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index 2aa26bb2a7..3385775b2f 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -1,4 +1,4 @@ -using Flux, Test, Statistics +using Flux, Test, Statistics, Random using Zygote: pullback, ForwardDiff evalwgrad(f, x...) = pullback(f, x...)[1] From fec2d8ef2d8d4ce53edeb335cadcb73a78c33299 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Tue, 3 Jan 2023 17:46:00 -0500 Subject: [PATCH 02/11] improve Dropout's docstring --- src/layers/normalise.jl | 58 +++++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 7982361b14..7812654f51 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -4,41 +4,55 @@ _isactive(m, x) = isnothing(m.active) ? NNlib.within_gradient(x) : m.active """ Dropout(p; dims=:, rng = default_rng_value()) -Dropout layer. +Layer implementing [dropout](https://arxiv.org/abs/1207.0580) with the given probability. +This is used as a regularisation, i.e. to reduce overfitting. -While training, for each input, this layer either sets that input to `0` (with probability -`p`) or scales it by `1 / (1 - p)`. To apply dropout along certain dimension(s), specify the -`dims` keyword. e.g. `Dropout(p; dims = 3)` will randomly zero out entire channels on WHCN input -(also called 2D dropout). This is used as a regularisation, i.e. it reduces overfitting during -training. +While training, it sets each input to `0` (with probability `p`) +or else scales it by `1 / (1 - p)`, using the [`NNlib.dropout`](@ref) function. +While testing, it has no effect. -In the forward pass, this layer applies the [`Flux.dropout`](@ref) function. See that for more -details. +By defaul the mode will switch automatically, but it can also +be controlled manually via [`Flux.testmode!`](@ref). -Specify `rng` to use a custom RNG instead of the default. -Custom RNGs are only supported on the CPU. +By default every input is treated independently. The `dims` keyword +instead takes a random choice only along that dimension. 
+For example `Dropout(p; dims = 3)` will randomly zero out entire channels on WHCN input +(also called 2D dropout). -Does nothing to the input once [`Flux.testmode!`](@ref) is `true`. +Keyword `rng` lets you specify a custom random number generator. +(Only supported on the CPU.) # Examples -```jldoctest -julia> m = Chain(Dense(1 => 1), Dropout(1)); +```julia +julia> m = Chain(Dense(ones(3,2)), Dropout(0.4)) +Chain( + Dense(2 => 3), # 9 parameters + Dropout(0.4), +) -julia> Flux.trainmode!(m); +julia> m(ones(2, 7)) # test mode, no effect +3×7 Matrix{Float64}: + 2.0 2.0 2.0 2.0 2.0 2.0 2.0 + 2.0 2.0 2.0 2.0 2.0 2.0 2.0 + 2.0 2.0 2.0 2.0 2.0 2.0 2.0 -julia> y = m([1]); +julia> Flux.trainmode!(m); # would happen within gradient -julia> y == [0] -true +julia> m(ones(2, 7)) +3×7 Matrix{Float64}: + 0.0 0.0 3.33333 0.0 0.0 0.0 0.0 + 3.33333 0.0 3.33333 0.0 3.33333 0.0 3.33333 + 3.33333 3.33333 0.0 3.33333 0.0 0.0 3.33333 -julia> m = Chain(Dense(1000 => 1000), Dropout(0.5)); +julia> y = m(ones(2, 10_000)); -julia> Flux.trainmode!(m); +julia> using Statistics -julia> y = m(ones(1000)); +julia> mean(y) # is about 2.0, as for test mode +1.9892222222222182 -julia> isapprox(count(==(0), y) / length(y), 0.5, atol=0.1) -true +julia> mean(iszero, y) # is about 0.4 +0.40323333333333333 ``` """ mutable struct Dropout{F<:Real,D,R<:AbstractRNG} From 6c84a6cf2c6b737be39f8beba41acec6dd8e220c Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Thu, 5 Jan 2023 12:07:43 -0500 Subject: [PATCH 03/11] make Dropout(0) === identity, cannot mutate --- src/layers/normalise.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 7812654f51..cc8b9e55ad 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -64,7 +64,10 @@ end Dropout(p::Real, dims, active) = Dropout(p, dims, active, default_rng_value()) function Dropout(p::Real; dims=:, rng = default_rng_value()) - 0 ≤ p ≤ 1 || throw(ArgumentError("Dropout expexts 0 ≤ p ≤ 1")) + 0 ≤ p ≤ 1 || throw(ArgumentError("Dropout expexts 0 ≤ p ≤ 1, got p = $p")) + if p isa Integer # Dropout(0) + return p==0 ? 
identity : zero + end Dropout(p, dims, nothing, rng) end From eab0b15857fd7f99dad081cde4322815c4badfc0 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Thu, 5 Jan 2023 20:55:48 -0500 Subject: [PATCH 04/11] NNlibCUDA = 0.2.5 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index b4c3dc6d75..130cd28d7e 100644 --- a/Project.toml +++ b/Project.toml @@ -31,7 +31,7 @@ Functors = "0.3, 0.4" MLUtils = "0.2, 0.3.1, 0.4" MacroTools = "0.5" NNlib = "0.8.14" -NNlibCUDA = "0.2.4" +NNlibCUDA = "0.2.5" OneHotArrays = "0.1, 0.2" Optimisers = "0.2.12" ProgressLogging = "0.1" From 4ab93b321ef22609f4183962df68bd6a712bbec0 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sat, 7 Jan 2023 13:01:55 -0500 Subject: [PATCH 05/11] NNlibCUDA = 0.2.6 --- Project.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index 130cd28d7e..d189304f78 100644 --- a/Project.toml +++ b/Project.toml @@ -30,8 +30,8 @@ ChainRulesCore = "1.12" Functors = "0.3, 0.4" MLUtils = "0.2, 0.3.1, 0.4" MacroTools = "0.5" -NNlib = "0.8.14" -NNlibCUDA = "0.2.5" +NNlib = "0.8.15" +NNlibCUDA = "0.2.6" OneHotArrays = "0.1, 0.2" Optimisers = "0.2.12" ProgressLogging = "0.1" From 0e396a6f09b97c23d4b16ffcc2cd06ccd29353ae Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sat, 7 Jan 2023 14:20:57 -0500 Subject: [PATCH 06/11] simplify default_rng etc --- src/Flux.jl | 1 + src/deprecations.jl | 15 ++------------- src/layers/normalise.jl | 12 ++++++------ src/utils.jl | 28 ++++------------------------ test/layers/normalisation.jl | 4 ++-- 5 files changed, 15 insertions(+), 45 deletions(-) diff --git a/src/Flux.jl b/src/Flux.jl index 3b4b4b9237..f74b781ec5 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -2,6 +2,7 @@ module Flux using Base: tail using LinearAlgebra, Statistics, Random # standard lib +using Random: default_rng using MacroTools, Reexport, ProgressLogging, SpecialFunctions using MacroTools: @forward diff --git a/src/deprecations.jl b/src/deprecations.jl index 27c7bc2264..a15b88f956 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -84,8 +84,6 @@ Base.@deprecate_binding ADADelta AdaDelta # Remove sub-module Data, while making sure Flux.Data.DataLoader keeps working Base.@deprecate_binding Data Flux false "Sub-module Flux.Data has been removed. The only thing it contained may be accessed as Flux.DataLoader" -@deprecate rng_from_array() default_rng_value() - function istraining() Base.depwarn("Flux.istraining() is deprecated, use NNlib.within_gradient(x) instead", :istraining) false @@ -185,17 +183,8 @@ function update!(opt::Optimise.AbstractOptimiser, ::Params, grads::Union{Tuple, """) end - -function dropout(rng, x, p; dims=:, active::Bool=true) - if active - NNlib.dropout(rng, x, p; dims) - else - Base.depwarn("Flux.dropout(...; active=false) is deprecated. Please branch outside the function, or call dropout(x, 0) if you must.", :dropout) - return x - end -end -dropout(x, p; kwargs...) = dropout(NNlib._rng_from_array(x), x, p; kwargs...) - +@deprecate rng_from_array() default_rng_value() +@deprecate default_rng_value() Random.default_rng() # v0.14 deprecations diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index cc8b9e55ad..76ea72f9e8 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -2,7 +2,7 @@ _isactive(m, x) = isnothing(m.active) ? 
NNlib.within_gradient(x) : m.active """ - Dropout(p; dims=:, rng = default_rng_value()) + Dropout(p; dims=:, rng = default_rng()) Layer implementing [dropout](https://arxiv.org/abs/1207.0580) with the given probability. This is used as a regularisation, i.e. to reduce overfitting. @@ -61,9 +61,9 @@ mutable struct Dropout{F<:Real,D,R<:AbstractRNG} active::Union{Bool, Nothing} rng::R end -Dropout(p::Real, dims, active) = Dropout(p, dims, active, default_rng_value()) +Dropout(p::Real, dims, active) = Dropout(p, dims, active, default_rng()) -function Dropout(p::Real; dims=:, rng = default_rng_value()) +function Dropout(p::Real; dims=:, rng = default_rng()) 0 ≤ p ≤ 1 || throw(ArgumentError("Dropout expexts 0 ≤ p ≤ 1, got p = $p")) if p isa Integer # Dropout(0) return p==0 ? identity : zero @@ -92,7 +92,7 @@ function Base.show(io::IO, d::Dropout) end """ - AlphaDropout(p; rng = default_rng_value()) + AlphaDropout(p; rng = default_rng()) A dropout layer. Used in [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515). @@ -126,8 +126,8 @@ mutable struct AlphaDropout{F,R<:AbstractRNG} new{typeof(p), typeof(rng)}(p, active, rng) end end -AlphaDropout(p, active) = AlphaDropout(p, active, default_rng_value()) -AlphaDropout(p; rng = default_rng_value()) = AlphaDropout(p, nothing, rng) +AlphaDropout(p, active) = AlphaDropout(p, active, default_rng()) +AlphaDropout(p; rng = default_rng()) = AlphaDropout(p, nothing, rng) @functor AlphaDropout trainable(a::AlphaDropout) = (;) diff --git a/src/utils.jl b/src/utils.jl index 884fcd7465..81ebffaf5a 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -36,32 +36,12 @@ epseltype(x) = eps(float(eltype(x))) """ rng_from_array([x]) -Create an instance of the RNG most appropriate for `x`. -The current defaults are: -- `x isa CuArray`: `CUDA.default_rng()`, else: -- `x isa AbstractArray`, or no `x` provided: - - Julia version is < 1.7: `Random.GLOBAL_RNG` - - Julia version is >= 1.7: `Random.default_rng()` -""" -rng_from_array(::AbstractArray) = default_rng_value() -rng_from_array(::CuArray) = CUDA.default_rng() - -@non_differentiable rng_from_array(::Any) - -if VERSION >= v"1.7" - default_rng_value() = Random.default_rng() -else - default_rng_value() = Random.GLOBAL_RNG -end - +Create an instance of the RNG most appropriate for array `x`. +If `x isa CuArray` then this is `CUDA.default_rng()`, +otherwise `Random.default_rng()`. """ - default_rng_value() +rng_from_array(x::AbstractArray) = NNlib._rng_from_array(x) -Create an instance of the default RNG depending on Julia's version. -- Julia version is < 1.7: `Random.GLOBAL_RNG` -- Julia version is >= 1.7: `Random.default_rng()` -""" -default_rng_value """ glorot_uniform([rng = default_rng_value()], size...; gain = 1) -> Array diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index 3385775b2f..ffb1b905cd 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -56,10 +56,10 @@ evalwgrad(f, x...) 
= pullback(f, x...)[1] y = m(x) @test count(a->a == 0, y) > 50 - y = Flux.dropout(values(rng_kwargs)..., x, 0.9, active=true) + y = Flux.dropout(values(rng_kwargs)..., x, 0.9) # , active=true) @test count(a->a == 0, y) > 50 - y = Flux.dropout(values(rng_kwargs)..., x, 0.9, active=false) + y = Flux.dropout(values(rng_kwargs)..., x, 0.9 * 0) #, active=false) @test count(a->a == 0, y) == 0 # CPU RNGs map onto CPU ok From f42f475864b65367f0a2eb0095a174955ef638b4 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sat, 7 Jan 2023 20:59:16 -0500 Subject: [PATCH 07/11] Revert "simplify default_rng etc" This reverts commit 0e396a6f09b97c23d4b16ffcc2cd06ccd29353ae. --- src/Flux.jl | 1 - src/deprecations.jl | 15 +++++++++++++-- src/layers/normalise.jl | 12 ++++++------ src/utils.jl | 28 ++++++++++++++++++++++++---- test/layers/normalisation.jl | 4 ++-- 5 files changed, 45 insertions(+), 15 deletions(-) diff --git a/src/Flux.jl b/src/Flux.jl index f74b781ec5..3b4b4b9237 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -2,7 +2,6 @@ module Flux using Base: tail using LinearAlgebra, Statistics, Random # standard lib -using Random: default_rng using MacroTools, Reexport, ProgressLogging, SpecialFunctions using MacroTools: @forward diff --git a/src/deprecations.jl b/src/deprecations.jl index a15b88f956..27c7bc2264 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -84,6 +84,8 @@ Base.@deprecate_binding ADADelta AdaDelta # Remove sub-module Data, while making sure Flux.Data.DataLoader keeps working Base.@deprecate_binding Data Flux false "Sub-module Flux.Data has been removed. The only thing it contained may be accessed as Flux.DataLoader" +@deprecate rng_from_array() default_rng_value() + function istraining() Base.depwarn("Flux.istraining() is deprecated, use NNlib.within_gradient(x) instead", :istraining) false @@ -183,8 +185,17 @@ function update!(opt::Optimise.AbstractOptimiser, ::Params, grads::Union{Tuple, """) end -@deprecate rng_from_array() default_rng_value() -@deprecate default_rng_value() Random.default_rng() + +function dropout(rng, x, p; dims=:, active::Bool=true) + if active + NNlib.dropout(rng, x, p; dims) + else + Base.depwarn("Flux.dropout(...; active=false) is deprecated. Please branch outside the function, or call dropout(x, 0) if you must.", :dropout) + return x + end +end +dropout(x, p; kwargs...) = dropout(NNlib._rng_from_array(x), x, p; kwargs...) + # v0.14 deprecations diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 76ea72f9e8..cc8b9e55ad 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -2,7 +2,7 @@ _isactive(m, x) = isnothing(m.active) ? NNlib.within_gradient(x) : m.active """ - Dropout(p; dims=:, rng = default_rng()) + Dropout(p; dims=:, rng = default_rng_value()) Layer implementing [dropout](https://arxiv.org/abs/1207.0580) with the given probability. This is used as a regularisation, i.e. to reduce overfitting. @@ -61,9 +61,9 @@ mutable struct Dropout{F<:Real,D,R<:AbstractRNG} active::Union{Bool, Nothing} rng::R end -Dropout(p::Real, dims, active) = Dropout(p, dims, active, default_rng()) +Dropout(p::Real, dims, active) = Dropout(p, dims, active, default_rng_value()) -function Dropout(p::Real; dims=:, rng = default_rng()) +function Dropout(p::Real; dims=:, rng = default_rng_value()) 0 ≤ p ≤ 1 || throw(ArgumentError("Dropout expexts 0 ≤ p ≤ 1, got p = $p")) if p isa Integer # Dropout(0) return p==0 ? 
identity : zero @@ -92,7 +92,7 @@ function Base.show(io::IO, d::Dropout) end """ - AlphaDropout(p; rng = default_rng()) + AlphaDropout(p; rng = default_rng_value()) A dropout layer. Used in [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515). @@ -126,8 +126,8 @@ mutable struct AlphaDropout{F,R<:AbstractRNG} new{typeof(p), typeof(rng)}(p, active, rng) end end -AlphaDropout(p, active) = AlphaDropout(p, active, default_rng()) -AlphaDropout(p; rng = default_rng()) = AlphaDropout(p, nothing, rng) +AlphaDropout(p, active) = AlphaDropout(p, active, default_rng_value()) +AlphaDropout(p; rng = default_rng_value()) = AlphaDropout(p, nothing, rng) @functor AlphaDropout trainable(a::AlphaDropout) = (;) diff --git a/src/utils.jl b/src/utils.jl index 81ebffaf5a..884fcd7465 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -36,12 +36,32 @@ epseltype(x) = eps(float(eltype(x))) """ rng_from_array([x]) -Create an instance of the RNG most appropriate for array `x`. -If `x isa CuArray` then this is `CUDA.default_rng()`, -otherwise `Random.default_rng()`. +Create an instance of the RNG most appropriate for `x`. +The current defaults are: +- `x isa CuArray`: `CUDA.default_rng()`, else: +- `x isa AbstractArray`, or no `x` provided: + - Julia version is < 1.7: `Random.GLOBAL_RNG` + - Julia version is >= 1.7: `Random.default_rng()` +""" +rng_from_array(::AbstractArray) = default_rng_value() +rng_from_array(::CuArray) = CUDA.default_rng() + +@non_differentiable rng_from_array(::Any) + +if VERSION >= v"1.7" + default_rng_value() = Random.default_rng() +else + default_rng_value() = Random.GLOBAL_RNG +end + """ -rng_from_array(x::AbstractArray) = NNlib._rng_from_array(x) + default_rng_value() +Create an instance of the default RNG depending on Julia's version. +- Julia version is < 1.7: `Random.GLOBAL_RNG` +- Julia version is >= 1.7: `Random.default_rng()` +""" +default_rng_value """ glorot_uniform([rng = default_rng_value()], size...; gain = 1) -> Array diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index ffb1b905cd..3385775b2f 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -56,10 +56,10 @@ evalwgrad(f, x...) = pullback(f, x...)[1] y = m(x) @test count(a->a == 0, y) > 50 - y = Flux.dropout(values(rng_kwargs)..., x, 0.9) # , active=true) + y = Flux.dropout(values(rng_kwargs)..., x, 0.9, active=true) @test count(a->a == 0, y) > 50 - y = Flux.dropout(values(rng_kwargs)..., x, 0.9 * 0) #, active=false) + y = Flux.dropout(values(rng_kwargs)..., x, 0.9, active=false) @test count(a->a == 0, y) == 0 # CPU RNGs map onto CPU ok From d7cc49d8b438280bda714a4d3690770738262cce Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sat, 7 Jan 2023 21:00:28 -0500 Subject: [PATCH 08/11] un-revert the removal of the active=true method --- src/deprecations.jl | 11 ----------- test/layers/normalisation.jl | 4 ++-- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/src/deprecations.jl b/src/deprecations.jl index 27c7bc2264..a99297649b 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -186,17 +186,6 @@ function update!(opt::Optimise.AbstractOptimiser, ::Params, grads::Union{Tuple, end -function dropout(rng, x, p; dims=:, active::Bool=true) - if active - NNlib.dropout(rng, x, p; dims) - else - Base.depwarn("Flux.dropout(...; active=false) is deprecated. Please branch outside the function, or call dropout(x, 0) if you must.", :dropout) - return x - end -end -dropout(x, p; kwargs...) 
= dropout(NNlib._rng_from_array(x), x, p; kwargs...) - - # v0.14 deprecations # Enable these when 0.14 is released, and delete const ClipGrad = Optimise.ClipValue etc: diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index 3385775b2f..6a3d85756d 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -56,10 +56,10 @@ evalwgrad(f, x...) = pullback(f, x...)[1] y = m(x) @test count(a->a == 0, y) > 50 - y = Flux.dropout(values(rng_kwargs)..., x, 0.9, active=true) + y = Flux.dropout(values(rng_kwargs)..., x, 0.9) # , active=true) @test count(a->a == 0, y) > 50 - y = Flux.dropout(values(rng_kwargs)..., x, 0.9, active=false) + y = Flux.dropout(values(rng_kwargs)..., x, 0.9 * 0) # , active=false) @test count(a->a == 0, y) == 0 # CPU RNGs map onto CPU ok From 28ac4c4731bdad4241d085d337129d435a65b6c6 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Fri, 27 Jan 2023 12:38:49 -0500 Subject: [PATCH 09/11] avoid a branch --- src/layers/normalise.jl | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index cc8b9e55ad..7937a311b4 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -74,13 +74,7 @@ end @functor Dropout trainable(a::Dropout) = (;) -function (a::Dropout)(x) - if _isactive(a, x) && a.p != 0 - dropout(a.rng, x, a.p; dims=a.dims) - else - x - end -end +(a::Dropout)(x) = dropout(a.rng, x, a.p * _isactive(a, x); dims=a.dims) testmode!(m::Dropout, mode=true) = (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) From 9e99422eca9a6bf000bcada4bcb0a4f2d2fd700c Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sun, 8 Jan 2023 06:58:45 -0500 Subject: [PATCH 10/11] Update src/layers/normalise.jl Co-authored-by: Carlo Lucibello --- src/layers/normalise.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 7937a311b4..37be2fb8e2 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -11,7 +11,7 @@ While training, it sets each input to `0` (with probability `p`) or else scales it by `1 / (1 - p)`, using the [`NNlib.dropout`](@ref) function. While testing, it has no effect. -By defaul the mode will switch automatically, but it can also +By default the mode will switch automatically, but it can also be controlled manually via [`Flux.testmode!`](@ref). By default every input is treated independently. The `dims` keyword From fc9855b085750fd00ca55cd3279c057f36afea73 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sun, 29 Jan 2023 11:56:25 -0500 Subject: [PATCH 11/11] Apply suggestions from code review Co-authored-by: Kyle Daruwalla --- src/layers/normalise.jl | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 37be2fb8e2..e9362313ab 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -2,7 +2,7 @@ _isactive(m, x) = isnothing(m.active) ? NNlib.within_gradient(x) : m.active """ - Dropout(p; dims=:, rng = default_rng_value()) + Dropout(p; [dims, rng]) Layer implementing [dropout](https://arxiv.org/abs/1207.0580) with the given probability. This is used as a regularisation, i.e. to reduce overfitting. @@ -14,8 +14,8 @@ While testing, it has no effect. 
By default the mode will switch automatically, but it can also be controlled manually via [`Flux.testmode!`](@ref). -By default every input is treated independently. The `dims` keyword -instead takes a random choice only along that dimension. +By default every input is treated independently. With the `dims` keyword, +instead it takes a random choice only along that dimension. For example `Dropout(p; dims = 3)` will randomly zero out entire channels on WHCN input (also called 2D dropout). @@ -64,10 +64,7 @@ end Dropout(p::Real, dims, active) = Dropout(p, dims, active, default_rng_value()) function Dropout(p::Real; dims=:, rng = default_rng_value()) - 0 ≤ p ≤ 1 || throw(ArgumentError("Dropout expexts 0 ≤ p ≤ 1, got p = $p")) - if p isa Integer # Dropout(0) - return p==0 ? identity : zero - end + 0 ≤ p ≤ 1 || throw(ArgumentError("Dropout expects 0 ≤ p ≤ 1, got p = $p")) Dropout(p, dims, nothing, rng) end
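
A rough usage sketch of the migration this series asks for (not part of any patch): `Flux.dropout(rng, x, p; dims, active)` is deprecated in favour of calling `NNlib.dropout` directly and branching on activity outside the call, as the deprecation message in PATCH 01 suggests. The sketch assumes NNlib ≥ 0.8.15, as pinned in Project.toml by PATCH 05; the array size and `p = 0.5` are arbitrary choices for illustration.

```julia
# Sketch only, not part of the patch series.
using NNlib, Random

x = randn(Float32, 4, 8)

# Deprecated:  Flux.dropout(rng, x, 0.5; dims=1, active=true)
# Replacement: call NNlib.dropout and handle `active` yourself.
active = true
y = active ? NNlib.dropout(Random.default_rng(), x, 0.5; dims=1) : x

# With dims=1 the keep/drop choice is shared along the other dimensions,
# so each row of `x` is either zeroed out or scaled by 1/(1 - 0.5) == 2.
size(y) == size(x)
```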
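
PATCH 09's "avoid a branch" change works because `_isactive` returns a `Bool` and Julia folds it into the probability: `p * true == p` and `p * false == 0`, so test mode becomes a dropout call with probability zero, which leaves the values untouched. A minimal stand-alone illustration of that arithmetic follows; `ToyDropout` and `toy_dropout` are made up for this sketch, whereas the real layer delegates to `NNlib.dropout`.

```julia
# Illustration of the `a.p * _isactive(a, x)` trick; ToyDropout is hypothetical.
using Random

# Keep each element with probability 1-p, scaling survivors by 1/(1-p).
toy_dropout(rng, x, p) = p == 0 ? x : x .* (rand(rng, size(x)...) .> p) ./ (1 - p)

struct ToyDropout{F<:Real}
    p::F
    active::Bool
end

# One call instead of `active ? toy_dropout(rng, x, p) : x`:
(d::ToyDropout)(x) = toy_dropout(Random.default_rng(), x, d.p * d.active)

x = ones(3, 4)
ToyDropout(0.75, false)(x) == x           # true: 0.75 * false == 0.0
count(iszero, ToyDropout(0.75, true)(x))  # roughly 9 of the 12 entries
```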