Move dropout to NNlib #2150

Merged: 11 commits, Feb 1, 2023
Project.toml: 4 changes (2 additions, 2 deletions)
@@ -30,8 +30,8 @@ ChainRulesCore = "1.12"
Functors = "0.3, 0.4"
MLUtils = "0.2, 0.3.1, 0.4"
MacroTools = "0.5"
NNlib = "0.8.14"
NNlibCUDA = "0.2.4"
NNlib = "0.8.15"
NNlibCUDA = "0.2.6"
OneHotArrays = "0.1, 0.2"
Optimisers = "0.2.12"
ProgressLogging = "0.1"
docs/src/models/layers.md: 2 changes (1 addition, 1 deletion)
@@ -123,7 +123,7 @@ LayerNorm
InstanceNorm
GroupNorm
Flux.normalise
Flux.dropout
NNlib.dropout
```

### Test vs. Train
src/Flux.jl: 1 change (1 addition, 0 deletions)
@@ -2,6 +2,7 @@ module Flux

using Base: tail
using LinearAlgebra, Statistics, Random # standard lib
using Random: default_rng
using MacroTools, Reexport, ProgressLogging, SpecialFunctions
using MacroTools: @forward

src/deprecations.jl: 5 changes (3 additions, 2 deletions)
@@ -84,8 +84,6 @@ Base.@deprecate_binding ADADelta AdaDelta
# Remove sub-module Data, while making sure Flux.Data.DataLoader keeps working
Base.@deprecate_binding Data Flux false "Sub-module Flux.Data has been removed. The only thing it contained may be accessed as Flux.DataLoader"

@deprecate rng_from_array() default_rng_value()

function istraining()
Base.depwarn("Flux.istraining() is deprecated, use NNlib.within_gradient(x) instead", :istraining)
false
@@ -185,6 +183,9 @@ function update!(opt::Optimise.AbstractOptimiser, ::Params, grads::Union{Tuple,
""")
end

@deprecate rng_from_array() default_rng_value()
@deprecate default_rng_value() Random.default_rng()
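
A quick sketch of what these forwarding deprecations mean in practice (warnings only appear if deprecation warnings are enabled via `--depwarn`; otherwise the calls are silent):

```julia
using Flux, Random

rng = Flux.default_rng_value()   # old name, now forwards to Random.default_rng()
rng === Random.default_rng()     # true: the same default RNG object
```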

# v0.14 deprecations

# Enable these when 0.14 is released, and delete const ClipGrad = Optimise.ClipValue etc:
src/layers/normalise.jl: 132 changes (53 additions, 79 deletions)
@@ -1,111 +1,85 @@

# Internal function, used only for layers defined in this file.
_isactive(m, x) = isnothing(m.active) ? NNlib.within_gradient(x) : m.active

_dropout_shape(s, ::Colon) = size(s)
_dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(size(s)))...)

_dropout_kernel(y::T, p, q) where {T} = y > p ? T(1 / q) : T(0)

"""
dropout([rng = rng_from_array(x)], x, p; dims=:, active=true)

The dropout function. If `active` is `true`,
for each input, either sets that input to `0` (with probability
`p`) or scales it by `1 / (1 - p)`. `dims` specifies the unbroadcasted dimensions,
e.g. `dims=1` applies dropout along columns and `dims=2` along rows.
If `active` is `false`, it just returns the input `x`.

Specify `rng` for custom RNGs instead of the default RNG.
Note that custom RNGs are only supported on the CPU.

Warning: when using this function, you have to manually manage the activation
state. Usually in fact, dropout is used while training
but is deactivated in the inference phase. This can be
automatically managed using the [`Dropout`](@ref) layer instead of the
`dropout` function.

The [`Dropout`](@ref) layer is what you should use in most scenarios.
"""
function dropout(rng, x, p; dims=:, active::Bool=true)
active || return x
y = dropout_mask(rng, x, p, dims=dims)
return x .* y
end
dropout(x, p; kwargs...) = dropout(rng_from_array(x), x, p; kwargs...)

dropout_mask(rng::CUDA.RNG, x::CuArray, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...)
dropout_mask(rng, x::CuArray, p; kwargs...) =
throw(ArgumentError("x isa CuArray, but rng isa $(typeof(rng)). dropout_mask only support CUDA.RNG for CuArrays."))
dropout_mask(rng, x, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...)
function _dropout_mask(rng, x, p; dims=:)
realfptype = float(real(eltype(x)))
y = rand!(rng, similar(x, realfptype, _dropout_shape(x, dims)))
y .= _dropout_kernel.(y, p, 1 - p)
return y
end
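
To make the arithmetic of `_dropout_kernel` concrete, here is a minimal stand-alone sketch (a toy re-implementation for illustration only, not the code being moved):

```julia
using Random, Statistics

# Keep each entry with probability q = 1 - p and scale the survivors by 1/q,
# so the output has the same expected value as the input.
function toy_dropout(rng::AbstractRNG, x::AbstractArray, p::Real)
    q = 1 - p
    mask = map(r -> r > p ? 1 / q : 0.0, rand(rng, size(x)...))
    return x .* mask
end

x = ones(1000);
y = toy_dropout(Random.default_rng(), x, 0.4);

mean(y)          # ≈ 1.0, since survivors are scaled by 1/0.6 ≈ 1.667
mean(iszero, y)  # ≈ 0.4
```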

# TODO move this to NNlib
ChainRulesCore.@non_differentiable dropout_mask(::Any, ::Any, ::Any)
Dropout(p; dims=:, rng = default_rng())

"""
Dropout(p; dims=:, rng = default_rng_value())
Layer implementing [dropout](https://arxiv.org/abs/1207.0580) with the given probability.
This is used as a regularisation, i.e. to reduce overfitting.

Dropout layer.
While training, it sets each input to `0` (with probability `p`)
or else scales it by `1 / (1 - p)`, using the [`NNlib.dropout`](@ref) function.
While testing, it has no effect.

While training, for each input, this layer either sets that input to `0` (with probability
`p`) or scales it by `1 / (1 - p)`. To apply dropout along certain dimension(s), specify the
`dims` keyword. e.g. `Dropout(p; dims = 3)` will randomly zero out entire channels on WHCN input
(also called 2D dropout). This is used as a regularisation, i.e. it reduces overfitting during
training.
By default the mode will switch automatically, but it can also
be controlled manually via [`Flux.testmode!`](@ref).

In the forward pass, this layer applies the [`Flux.dropout`](@ref) function. See that for more
details.
By default every input is treated independently. The `dims` keyword
instead makes one random choice for each index along that dimension, shared across all the others.
For example `Dropout(p; dims = 3)` will randomly zero out entire channels on WHCN input
(also called 2D dropout).

Specify `rng` to use a custom RNG instead of the default.
Custom RNGs are only supported on the CPU.

Does nothing to the input once [`Flux.testmode!`](@ref) is `true`.
Keyword `rng` lets you specify a custom random number generator.
(Only supported on the CPU.)

# Examples
```jldoctest
julia> m = Chain(Dense(1 => 1), Dropout(1));
```julia
julia> m = Chain(Dense(ones(3,2)), Dropout(0.4))
Chain(
Dense(2 => 3), # 9 parameters
Dropout(0.4),
)

julia> Flux.trainmode!(m);
julia> m(ones(2, 7)) # test mode, no effect
3×7 Matrix{Float64}:
2.0 2.0 2.0 2.0 2.0 2.0 2.0
2.0 2.0 2.0 2.0 2.0 2.0 2.0
2.0 2.0 2.0 2.0 2.0 2.0 2.0

julia> y = m([1]);
julia> Flux.trainmode!(m); # would happen within gradient

julia> y == [0]
true
julia> m(ones(2, 7))
3×7 Matrix{Float64}:
0.0 0.0 3.33333 0.0 0.0 0.0 0.0
3.33333 0.0 3.33333 0.0 3.33333 0.0 3.33333
3.33333 3.33333 0.0 3.33333 0.0 0.0 3.33333

julia> m = Chain(Dense(1000 => 1000), Dropout(0.5));
julia> y = m(ones(2, 10_000));

julia> Flux.trainmode!(m);
julia> using Statistics

julia> y = m(ones(1000));
julia> mean(y) # is about 2.0, as for test mode
1.9892222222222182

julia> isapprox(count(==(0), y) / length(y), 0.5, atol=0.1)
true
julia> mean(iszero, y) # is about 0.4
0.40323333333333333
```
"""
mutable struct Dropout{F,D,R<:AbstractRNG}
mutable struct Dropout{F<:Real,D,R<:AbstractRNG}
p::F
dims::D
active::Union{Bool, Nothing}
rng::R
end
Dropout(p, dims, active) = Dropout(p, dims, active, default_rng_value())
Dropout(p::Real, dims, active) = Dropout(p, dims, active, default_rng())

function Dropout(p; dims=:, rng = default_rng_value())
@assert 0 ≤ p ≤ 1
function Dropout(p::Real; dims=:, rng = default_rng())
0 ≤ p ≤ 1 || throw(ArgumentError("Dropout expects 0 ≤ p ≤ 1, got p = $p"))
if p isa Integer # Dropout(0)
return p==0 ? identity : zero
end
Dropout(p, dims, nothing, rng)
end

@functor Dropout
trainable(a::Dropout) = (;)

function (a::Dropout)(x)
_isactive(a, x) || return x
return dropout(a.rng, x, a.p; dims=a.dims, active=true)
if _isactive(a, x) && a.p != 0
dropout(a.rng, x, a.p; dims=a.dims)
else
x
end
end
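
As a usage sketch of the branching above: outside of training the input is returned untouched, and `p == 0` short-circuits even when the layer is active.

```julia
using Flux

d = Dropout(0.3)
x = randn(Float32, 8)

d(x) === x                    # inactive (no gradient, no trainmode!): input passes through

Flux.trainmode!(d)
d(x) !== x                    # active: a new array, with some entries zeroed

Flux.trainmode!(Dropout(0.0))(x) === x   # p == 0 skips the call to dropout entirely
```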

testmode!(m::Dropout, mode=true) =
@@ -118,7 +92,7 @@ function Base.show(io::IO, d::Dropout)
end

"""
AlphaDropout(p; rng = default_rng_value())
AlphaDropout(p; rng = default_rng())

A dropout layer. Used in
[Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515).
@@ -152,8 +126,8 @@ mutable struct AlphaDropout{F,R<:AbstractRNG}
new{typeof(p), typeof(rng)}(p, active, rng)
end
end
AlphaDropout(p, active) = AlphaDropout(p, active, default_rng_value())
AlphaDropout(p; rng = default_rng_value()) = AlphaDropout(p, nothing, rng)
AlphaDropout(p, active) = AlphaDropout(p, active, default_rng())
AlphaDropout(p; rng = default_rng()) = AlphaDropout(p, nothing, rng)

@functor AlphaDropout
trainable(a::AlphaDropout) = (;)
src/utils.jl: 28 changes (4 additions, 24 deletions)
@@ -36,32 +36,12 @@ epseltype(x) = eps(float(eltype(x)))
"""
rng_from_array([x])

Create an instance of the RNG most appropriate for `x`.
The current defaults are:
- `x isa CuArray`: `CUDA.default_rng()`, else:
- `x isa AbstractArray`, or no `x` provided:
- Julia version is < 1.7: `Random.GLOBAL_RNG`
- Julia version is >= 1.7: `Random.default_rng()`
"""
rng_from_array(::AbstractArray) = default_rng_value()
rng_from_array(::CuArray) = CUDA.default_rng()

@non_differentiable rng_from_array(::Any)

if VERSION >= v"1.7"
default_rng_value() = Random.default_rng()
else
default_rng_value() = Random.GLOBAL_RNG
end

Create an instance of the RNG most appropriate for array `x`.
If `x isa CuArray` then this is `CUDA.default_rng()`,
otherwise `Random.default_rng()`.
"""
default_rng_value()
rng_from_array(x::AbstractArray) = NNlib._rng_from_array(x)

Create an instance of the default RNG depending on Julia's version.
- Julia version is < 1.7: `Random.GLOBAL_RNG`
- Julia version is >= 1.7: `Random.default_rng()`
"""
default_rng_value
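
A small check of the behaviour the new `rng_from_array` docstring describes (on the CPU; a `CuArray` would instead give `CUDA.default_rng()`):

```julia
using Flux, Random

rng = Flux.rng_from_array(rand(Float32, 3))
rng === Random.default_rng()     # true for ordinary CPU arrays
```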

"""
glorot_uniform([rng = default_rng_value()], size...; gain = 1) -> Array
test/layers/normalisation.jl: 6 changes (3 additions, 3 deletions)
@@ -1,4 +1,4 @@
using Flux, Test, Statistics
using Flux, Test, Statistics, Random
using Zygote: pullback, ForwardDiff

evalwgrad(f, x...) = pullback(f, x...)[1]
@@ -56,10 +56,10 @@ evalwgrad(f, x...) = pullback(f, x...)[1]
y = m(x)
@test count(a->a == 0, y) > 50

y = Flux.dropout(values(rng_kwargs)..., x, 0.9, active=true)
y = Flux.dropout(values(rng_kwargs)..., x, 0.9) # , active=true)
@test count(a->a == 0, y) > 50

y = Flux.dropout(values(rng_kwargs)..., x, 0.9, active=false)
y = Flux.dropout(values(rng_kwargs)..., x, 0.9 * 0) #, active=false)
@test count(a->a == 0, y) == 0
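
The dropped `active` keyword is covered by the probability argument itself: `p = 0` drops nothing, which is what the second assertion checks. A sketch of this behaviour outside the test harness (assuming the moved `NNlib.dropout`):

```julia
using NNlib, Statistics

x = ones(Float32, 100, 100)

y = NNlib.dropout(x, 0.9)              # roughly 90% of entries zeroed, survivors scaled to 10
mean(iszero, y)                        # ≈ 0.9

count(iszero, NNlib.dropout(x, 0.0))   # 0: with p = 0 nothing is dropped
```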

# CPU RNGs map onto CPU ok