From e96e4efda8d14885457e3ae5002b6d752dc9c9b7 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Tue, 3 Jan 2023 17:45:45 -0500 Subject: [PATCH 01/11] use NNlib.dropout, deprecate Flux.dropout --- docs/src/models/layers.md | 2 +- src/deprecations.jl | 12 +++++++ src/layers/normalise.jl | 63 ++++++------------------------------ test/layers/normalisation.jl | 2 +- 4 files changed, 24 insertions(+), 55 deletions(-) diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index b8345c3f78..c0e1c57307 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -123,7 +123,7 @@ LayerNorm InstanceNorm GroupNorm Flux.normalise -Flux.dropout +NNlib.dropout ``` ### Test vs. Train diff --git a/src/deprecations.jl b/src/deprecations.jl index 8625c458e0..27c7bc2264 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -185,6 +185,18 @@ function update!(opt::Optimise.AbstractOptimiser, ::Params, grads::Union{Tuple, """) end + +function dropout(rng, x, p; dims=:, active::Bool=true) + if active + NNlib.dropout(rng, x, p; dims) + else + Base.depwarn("Flux.dropout(...; active=false) is deprecated. Please branch outside the function, or call dropout(x, 0) if you must.", :dropout) + return x + end +end +dropout(x, p; kwargs...) = dropout(NNlib._rng_from_array(x), x, p; kwargs...) + + # v0.14 deprecations # Enable these when 0.14 is released, and delete const ClipGrad = Optimise.ClipValue etc: diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 7ee28f64e9..7982361b14 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -1,52 +1,6 @@ - +# Internal function, used only for layers defined in this file. _isactive(m, x) = isnothing(m.active) ? NNlib.within_gradient(x) : m.active -_dropout_shape(s, ::Colon) = size(s) -_dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(size(s)))...) - -_dropout_kernel(y::T, p, q) where {T} = y > p ? T(1 / q) : T(0) - -""" - dropout([rng = rng_from_array(x)], x, p; dims=:, active=true) - -The dropout function. If `active` is `true`, -for each input, either sets that input to `0` (with probability -`p`) or scales it by `1 / (1 - p)`. `dims` specifies the unbroadcasted dimensions, -e.g. `dims=1` applies dropout along columns and `dims=2` along rows. -If `active` is `false`, it just returns the input `x`. - -Specify `rng` for custom RNGs instead of the default RNG. -Note that custom RNGs are only supported on the CPU. - -Warning: when using this function, you have to manually manage the activation -state. Usually in fact, dropout is used while training -but is deactivated in the inference phase. This can be -automatically managed using the [`Dropout`](@ref) layer instead of the -`dropout` function. - -The [`Dropout`](@ref) layer is what you should use in most scenarios. -""" -function dropout(rng, x, p; dims=:, active::Bool=true) - active || return x - y = dropout_mask(rng, x, p, dims=dims) - return x .* y -end -dropout(x, p; kwargs...) = dropout(rng_from_array(x), x, p; kwargs...) - -dropout_mask(rng::CUDA.RNG, x::CuArray, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...) -dropout_mask(rng, x::CuArray, p; kwargs...) = - throw(ArgumentError("x isa CuArray, but rng isa $(typeof(rng)). dropout_mask only support CUDA.RNG for CuArrays.")) -dropout_mask(rng, x, p; kwargs...) = _dropout_mask(rng, x, p; kwargs...) 
-function _dropout_mask(rng, x, p; dims=:) - realfptype = float(real(eltype(x))) - y = rand!(rng, similar(x, realfptype, _dropout_shape(x, dims))) - y .= _dropout_kernel.(y, p, 1 - p) - return y -end - -# TODO move this to NNlib -ChainRulesCore.@non_differentiable dropout_mask(::Any, ::Any, ::Any) - """ Dropout(p; dims=:, rng = default_rng_value()) @@ -87,16 +41,16 @@ julia> isapprox(count(==(0), y) / length(y), 0.5, atol=0.1) true ``` """ -mutable struct Dropout{F,D,R<:AbstractRNG} +mutable struct Dropout{F<:Real,D,R<:AbstractRNG} p::F dims::D active::Union{Bool, Nothing} rng::R end -Dropout(p, dims, active) = Dropout(p, dims, active, default_rng_value()) +Dropout(p::Real, dims, active) = Dropout(p, dims, active, default_rng_value()) -function Dropout(p; dims=:, rng = default_rng_value()) - @assert 0 ≤ p ≤ 1 +function Dropout(p::Real; dims=:, rng = default_rng_value()) + 0 ≤ p ≤ 1 || throw(ArgumentError("Dropout expexts 0 ≤ p ≤ 1")) Dropout(p, dims, nothing, rng) end @@ -104,8 +58,11 @@ end trainable(a::Dropout) = (;) function (a::Dropout)(x) - _isactive(a, x) || return x - return dropout(a.rng, x, a.p; dims=a.dims, active=true) + if _isactive(a, x) && a.p != 0 + dropout(a.rng, x, a.p; dims=a.dims) + else + x + end end testmode!(m::Dropout, mode=true) = diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index 2aa26bb2a7..3385775b2f 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -1,4 +1,4 @@ -using Flux, Test, Statistics +using Flux, Test, Statistics, Random using Zygote: pullback, ForwardDiff evalwgrad(f, x...) = pullback(f, x...)[1] From fec2d8ef2d8d4ce53edeb335cadcb73a78c33299 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Tue, 3 Jan 2023 17:46:00 -0500 Subject: [PATCH 02/11] improve Dropout's docstring --- src/layers/normalise.jl | 58 +++++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 7982361b14..7812654f51 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -4,41 +4,55 @@ _isactive(m, x) = isnothing(m.active) ? NNlib.within_gradient(x) : m.active """ Dropout(p; dims=:, rng = default_rng_value()) -Dropout layer. +Layer implementing [dropout](https://arxiv.org/abs/1207.0580) with the given probability. +This is used as a regularisation, i.e. to reduce overfitting. -While training, for each input, this layer either sets that input to `0` (with probability -`p`) or scales it by `1 / (1 - p)`. To apply dropout along certain dimension(s), specify the -`dims` keyword. e.g. `Dropout(p; dims = 3)` will randomly zero out entire channels on WHCN input -(also called 2D dropout). This is used as a regularisation, i.e. it reduces overfitting during -training. +While training, it sets each input to `0` (with probability `p`) +or else scales it by `1 / (1 - p)`, using the [`NNlib.dropout`](@ref) function. +While testing, it has no effect. -In the forward pass, this layer applies the [`Flux.dropout`](@ref) function. See that for more -details. +By defaul the mode will switch automatically, but it can also +be controlled manually via [`Flux.testmode!`](@ref). -Specify `rng` to use a custom RNG instead of the default. -Custom RNGs are only supported on the CPU. +By default every input is treated independently. The `dims` keyword +instead takes a random choice only along that dimension. 
+For example `Dropout(p; dims = 3)` will randomly zero out entire channels on WHCN input +(also called 2D dropout). -Does nothing to the input once [`Flux.testmode!`](@ref) is `true`. +Keyword `rng` lets you specify a custom random number generator. +(Only supported on the CPU.) # Examples -```jldoctest -julia> m = Chain(Dense(1 => 1), Dropout(1)); +```julia +julia> m = Chain(Dense(ones(3,2)), Dropout(0.4)) +Chain( + Dense(2 => 3), # 9 parameters + Dropout(0.4), +) -julia> Flux.trainmode!(m); +julia> m(ones(2, 7)) # test mode, no effect +3×7 Matrix{Float64}: + 2.0 2.0 2.0 2.0 2.0 2.0 2.0 + 2.0 2.0 2.0 2.0 2.0 2.0 2.0 + 2.0 2.0 2.0 2.0 2.0 2.0 2.0 -julia> y = m([1]); +julia> Flux.trainmode!(m); # would happen within gradient -julia> y == [0] -true +julia> m(ones(2, 7)) +3×7 Matrix{Float64}: + 0.0 0.0 3.33333 0.0 0.0 0.0 0.0 + 3.33333 0.0 3.33333 0.0 3.33333 0.0 3.33333 + 3.33333 3.33333 0.0 3.33333 0.0 0.0 3.33333 -julia> m = Chain(Dense(1000 => 1000), Dropout(0.5)); +julia> y = m(ones(2, 10_000)); -julia> Flux.trainmode!(m); +julia> using Statistics -julia> y = m(ones(1000)); +julia> mean(y) # is about 2.0, as for test mode +1.9892222222222182 -julia> isapprox(count(==(0), y) / length(y), 0.5, atol=0.1) -true +julia> mean(iszero, y) # is about 0.4 +0.40323333333333333 ``` """ mutable struct Dropout{F<:Real,D,R<:AbstractRNG} From 6c84a6cf2c6b737be39f8beba41acec6dd8e220c Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Thu, 5 Jan 2023 12:07:43 -0500 Subject: [PATCH 03/11] make Dropout(0) === identity, cannot mutate --- src/layers/normalise.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 7812654f51..cc8b9e55ad 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -64,7 +64,10 @@ end Dropout(p::Real, dims, active) = Dropout(p, dims, active, default_rng_value()) function Dropout(p::Real; dims=:, rng = default_rng_value()) - 0 ≤ p ≤ 1 || throw(ArgumentError("Dropout expexts 0 ≤ p ≤ 1")) + 0 ≤ p ≤ 1 || throw(ArgumentError("Dropout expexts 0 ≤ p ≤ 1, got p = $p")) + if p isa Integer # Dropout(0) + return p==0 ? 
identity : zero + end Dropout(p, dims, nothing, rng) end From eab0b15857fd7f99dad081cde4322815c4badfc0 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Thu, 5 Jan 2023 20:55:48 -0500 Subject: [PATCH 04/11] NNlibCUDA = 0.2.5 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index b4c3dc6d75..130cd28d7e 100644 --- a/Project.toml +++ b/Project.toml @@ -31,7 +31,7 @@ Functors = "0.3, 0.4" MLUtils = "0.2, 0.3.1, 0.4" MacroTools = "0.5" NNlib = "0.8.14" -NNlibCUDA = "0.2.4" +NNlibCUDA = "0.2.5" OneHotArrays = "0.1, 0.2" Optimisers = "0.2.12" ProgressLogging = "0.1" From 4ab93b321ef22609f4183962df68bd6a712bbec0 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sat, 7 Jan 2023 13:01:55 -0500 Subject: [PATCH 05/11] NNlibCUDA = 0.2.6 --- Project.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index 130cd28d7e..d189304f78 100644 --- a/Project.toml +++ b/Project.toml @@ -30,8 +30,8 @@ ChainRulesCore = "1.12" Functors = "0.3, 0.4" MLUtils = "0.2, 0.3.1, 0.4" MacroTools = "0.5" -NNlib = "0.8.14" -NNlibCUDA = "0.2.5" +NNlib = "0.8.15" +NNlibCUDA = "0.2.6" OneHotArrays = "0.1, 0.2" Optimisers = "0.2.12" ProgressLogging = "0.1" From 0e396a6f09b97c23d4b16ffcc2cd06ccd29353ae Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sat, 7 Jan 2023 14:20:57 -0500 Subject: [PATCH 06/11] simplify default_rng etc --- src/Flux.jl | 1 + src/deprecations.jl | 15 ++------------- src/layers/normalise.jl | 12 ++++++------ src/utils.jl | 28 ++++------------------------ test/layers/normalisation.jl | 4 ++-- 5 files changed, 15 insertions(+), 45 deletions(-) diff --git a/src/Flux.jl b/src/Flux.jl index 3b4b4b9237..f74b781ec5 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -2,6 +2,7 @@ module Flux using Base: tail using LinearAlgebra, Statistics, Random # standard lib +using Random: default_rng using MacroTools, Reexport, ProgressLogging, SpecialFunctions using MacroTools: @forward diff --git a/src/deprecations.jl b/src/deprecations.jl index 27c7bc2264..a15b88f956 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -84,8 +84,6 @@ Base.@deprecate_binding ADADelta AdaDelta # Remove sub-module Data, while making sure Flux.Data.DataLoader keeps working Base.@deprecate_binding Data Flux false "Sub-module Flux.Data has been removed. The only thing it contained may be accessed as Flux.DataLoader" -@deprecate rng_from_array() default_rng_value() - function istraining() Base.depwarn("Flux.istraining() is deprecated, use NNlib.within_gradient(x) instead", :istraining) false @@ -185,17 +183,8 @@ function update!(opt::Optimise.AbstractOptimiser, ::Params, grads::Union{Tuple, """) end - -function dropout(rng, x, p; dims=:, active::Bool=true) - if active - NNlib.dropout(rng, x, p; dims) - else - Base.depwarn("Flux.dropout(...; active=false) is deprecated. Please branch outside the function, or call dropout(x, 0) if you must.", :dropout) - return x - end -end -dropout(x, p; kwargs...) = dropout(NNlib._rng_from_array(x), x, p; kwargs...) - +@deprecate rng_from_array() default_rng_value() +@deprecate default_rng_value() Random.default_rng() # v0.14 deprecations diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index cc8b9e55ad..76ea72f9e8 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -2,7 +2,7 @@ _isactive(m, x) = isnothing(m.active) ? 
NNlib.within_gradient(x) : m.active """ - Dropout(p; dims=:, rng = default_rng_value()) + Dropout(p; dims=:, rng = default_rng()) Layer implementing [dropout](https://arxiv.org/abs/1207.0580) with the given probability. This is used as a regularisation, i.e. to reduce overfitting. @@ -61,9 +61,9 @@ mutable struct Dropout{F<:Real,D,R<:AbstractRNG} active::Union{Bool, Nothing} rng::R end -Dropout(p::Real, dims, active) = Dropout(p, dims, active, default_rng_value()) +Dropout(p::Real, dims, active) = Dropout(p, dims, active, default_rng()) -function Dropout(p::Real; dims=:, rng = default_rng_value()) +function Dropout(p::Real; dims=:, rng = default_rng()) 0 ≤ p ≤ 1 || throw(ArgumentError("Dropout expexts 0 ≤ p ≤ 1, got p = $p")) if p isa Integer # Dropout(0) return p==0 ? identity : zero @@ -92,7 +92,7 @@ function Base.show(io::IO, d::Dropout) end """ - AlphaDropout(p; rng = default_rng_value()) + AlphaDropout(p; rng = default_rng()) A dropout layer. Used in [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515). @@ -126,8 +126,8 @@ mutable struct AlphaDropout{F,R<:AbstractRNG} new{typeof(p), typeof(rng)}(p, active, rng) end end -AlphaDropout(p, active) = AlphaDropout(p, active, default_rng_value()) -AlphaDropout(p; rng = default_rng_value()) = AlphaDropout(p, nothing, rng) +AlphaDropout(p, active) = AlphaDropout(p, active, default_rng()) +AlphaDropout(p; rng = default_rng()) = AlphaDropout(p, nothing, rng) @functor AlphaDropout trainable(a::AlphaDropout) = (;) diff --git a/src/utils.jl b/src/utils.jl index 884fcd7465..81ebffaf5a 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -36,32 +36,12 @@ epseltype(x) = eps(float(eltype(x))) """ rng_from_array([x]) -Create an instance of the RNG most appropriate for `x`. -The current defaults are: -- `x isa CuArray`: `CUDA.default_rng()`, else: -- `x isa AbstractArray`, or no `x` provided: - - Julia version is < 1.7: `Random.GLOBAL_RNG` - - Julia version is >= 1.7: `Random.default_rng()` -""" -rng_from_array(::AbstractArray) = default_rng_value() -rng_from_array(::CuArray) = CUDA.default_rng() - -@non_differentiable rng_from_array(::Any) - -if VERSION >= v"1.7" - default_rng_value() = Random.default_rng() -else - default_rng_value() = Random.GLOBAL_RNG -end - +Create an instance of the RNG most appropriate for array `x`. +If `x isa CuArray` then this is `CUDA.default_rng()`, +otherwise `Random.default_rng()`. """ - default_rng_value() +rng_from_array(x::AbstractArray) = NNlib._rng_from_array(x) -Create an instance of the default RNG depending on Julia's version. -- Julia version is < 1.7: `Random.GLOBAL_RNG` -- Julia version is >= 1.7: `Random.default_rng()` -""" -default_rng_value """ glorot_uniform([rng = default_rng_value()], size...; gain = 1) -> Array diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index 3385775b2f..ffb1b905cd 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -56,10 +56,10 @@ evalwgrad(f, x...) 
= pullback(f, x...)[1] y = m(x) @test count(a->a == 0, y) > 50 - y = Flux.dropout(values(rng_kwargs)..., x, 0.9, active=true) + y = Flux.dropout(values(rng_kwargs)..., x, 0.9) # , active=true) @test count(a->a == 0, y) > 50 - y = Flux.dropout(values(rng_kwargs)..., x, 0.9, active=false) + y = Flux.dropout(values(rng_kwargs)..., x, 0.9 * 0) #, active=false) @test count(a->a == 0, y) == 0 # CPU RNGs map onto CPU ok From f42f475864b65367f0a2eb0095a174955ef638b4 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sat, 7 Jan 2023 20:59:16 -0500 Subject: [PATCH 07/11] Revert "simplify default_rng etc" This reverts commit 0e396a6f09b97c23d4b16ffcc2cd06ccd29353ae. --- src/Flux.jl | 1 - src/deprecations.jl | 15 +++++++++++++-- src/layers/normalise.jl | 12 ++++++------ src/utils.jl | 28 ++++++++++++++++++++++++---- test/layers/normalisation.jl | 4 ++-- 5 files changed, 45 insertions(+), 15 deletions(-) diff --git a/src/Flux.jl b/src/Flux.jl index f74b781ec5..3b4b4b9237 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -2,7 +2,6 @@ module Flux using Base: tail using LinearAlgebra, Statistics, Random # standard lib -using Random: default_rng using MacroTools, Reexport, ProgressLogging, SpecialFunctions using MacroTools: @forward diff --git a/src/deprecations.jl b/src/deprecations.jl index a15b88f956..27c7bc2264 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -84,6 +84,8 @@ Base.@deprecate_binding ADADelta AdaDelta # Remove sub-module Data, while making sure Flux.Data.DataLoader keeps working Base.@deprecate_binding Data Flux false "Sub-module Flux.Data has been removed. The only thing it contained may be accessed as Flux.DataLoader" +@deprecate rng_from_array() default_rng_value() + function istraining() Base.depwarn("Flux.istraining() is deprecated, use NNlib.within_gradient(x) instead", :istraining) false @@ -183,8 +185,17 @@ function update!(opt::Optimise.AbstractOptimiser, ::Params, grads::Union{Tuple, """) end -@deprecate rng_from_array() default_rng_value() -@deprecate default_rng_value() Random.default_rng() + +function dropout(rng, x, p; dims=:, active::Bool=true) + if active + NNlib.dropout(rng, x, p; dims) + else + Base.depwarn("Flux.dropout(...; active=false) is deprecated. Please branch outside the function, or call dropout(x, 0) if you must.", :dropout) + return x + end +end +dropout(x, p; kwargs...) = dropout(NNlib._rng_from_array(x), x, p; kwargs...) + # v0.14 deprecations diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 76ea72f9e8..cc8b9e55ad 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -2,7 +2,7 @@ _isactive(m, x) = isnothing(m.active) ? NNlib.within_gradient(x) : m.active """ - Dropout(p; dims=:, rng = default_rng()) + Dropout(p; dims=:, rng = default_rng_value()) Layer implementing [dropout](https://arxiv.org/abs/1207.0580) with the given probability. This is used as a regularisation, i.e. to reduce overfitting. @@ -61,9 +61,9 @@ mutable struct Dropout{F<:Real,D,R<:AbstractRNG} active::Union{Bool, Nothing} rng::R end -Dropout(p::Real, dims, active) = Dropout(p, dims, active, default_rng()) +Dropout(p::Real, dims, active) = Dropout(p, dims, active, default_rng_value()) -function Dropout(p::Real; dims=:, rng = default_rng()) +function Dropout(p::Real; dims=:, rng = default_rng_value()) 0 ≤ p ≤ 1 || throw(ArgumentError("Dropout expexts 0 ≤ p ≤ 1, got p = $p")) if p isa Integer # Dropout(0) return p==0 ? 
identity : zero @@ -92,7 +92,7 @@ function Base.show(io::IO, d::Dropout) end """ - AlphaDropout(p; rng = default_rng()) + AlphaDropout(p; rng = default_rng_value()) A dropout layer. Used in [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515). @@ -126,8 +126,8 @@ mutable struct AlphaDropout{F,R<:AbstractRNG} new{typeof(p), typeof(rng)}(p, active, rng) end end -AlphaDropout(p, active) = AlphaDropout(p, active, default_rng()) -AlphaDropout(p; rng = default_rng()) = AlphaDropout(p, nothing, rng) +AlphaDropout(p, active) = AlphaDropout(p, active, default_rng_value()) +AlphaDropout(p; rng = default_rng_value()) = AlphaDropout(p, nothing, rng) @functor AlphaDropout trainable(a::AlphaDropout) = (;) diff --git a/src/utils.jl b/src/utils.jl index 81ebffaf5a..884fcd7465 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -36,12 +36,32 @@ epseltype(x) = eps(float(eltype(x))) """ rng_from_array([x]) -Create an instance of the RNG most appropriate for array `x`. -If `x isa CuArray` then this is `CUDA.default_rng()`, -otherwise `Random.default_rng()`. +Create an instance of the RNG most appropriate for `x`. +The current defaults are: +- `x isa CuArray`: `CUDA.default_rng()`, else: +- `x isa AbstractArray`, or no `x` provided: + - Julia version is < 1.7: `Random.GLOBAL_RNG` + - Julia version is >= 1.7: `Random.default_rng()` +""" +rng_from_array(::AbstractArray) = default_rng_value() +rng_from_array(::CuArray) = CUDA.default_rng() + +@non_differentiable rng_from_array(::Any) + +if VERSION >= v"1.7" + default_rng_value() = Random.default_rng() +else + default_rng_value() = Random.GLOBAL_RNG +end + """ -rng_from_array(x::AbstractArray) = NNlib._rng_from_array(x) + default_rng_value() +Create an instance of the default RNG depending on Julia's version. +- Julia version is < 1.7: `Random.GLOBAL_RNG` +- Julia version is >= 1.7: `Random.default_rng()` +""" +default_rng_value """ glorot_uniform([rng = default_rng_value()], size...; gain = 1) -> Array diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index ffb1b905cd..3385775b2f 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -56,10 +56,10 @@ evalwgrad(f, x...) = pullback(f, x...)[1] y = m(x) @test count(a->a == 0, y) > 50 - y = Flux.dropout(values(rng_kwargs)..., x, 0.9) # , active=true) + y = Flux.dropout(values(rng_kwargs)..., x, 0.9, active=true) @test count(a->a == 0, y) > 50 - y = Flux.dropout(values(rng_kwargs)..., x, 0.9 * 0) #, active=false) + y = Flux.dropout(values(rng_kwargs)..., x, 0.9, active=false) @test count(a->a == 0, y) == 0 # CPU RNGs map onto CPU ok From d7cc49d8b438280bda714a4d3690770738262cce Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sat, 7 Jan 2023 21:00:28 -0500 Subject: [PATCH 08/11] un-revert the removal of the active=true method --- src/deprecations.jl | 11 ----------- test/layers/normalisation.jl | 4 ++-- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/src/deprecations.jl b/src/deprecations.jl index 27c7bc2264..a99297649b 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -186,17 +186,6 @@ function update!(opt::Optimise.AbstractOptimiser, ::Params, grads::Union{Tuple, end -function dropout(rng, x, p; dims=:, active::Bool=true) - if active - NNlib.dropout(rng, x, p; dims) - else - Base.depwarn("Flux.dropout(...; active=false) is deprecated. Please branch outside the function, or call dropout(x, 0) if you must.", :dropout) - return x - end -end -dropout(x, p; kwargs...) 
= dropout(NNlib._rng_from_array(x), x, p; kwargs...) - - # v0.14 deprecations # Enable these when 0.14 is released, and delete const ClipGrad = Optimise.ClipValue etc: diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index 3385775b2f..6a3d85756d 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -56,10 +56,10 @@ evalwgrad(f, x...) = pullback(f, x...)[1] y = m(x) @test count(a->a == 0, y) > 50 - y = Flux.dropout(values(rng_kwargs)..., x, 0.9, active=true) + y = Flux.dropout(values(rng_kwargs)..., x, 0.9) # , active=true) @test count(a->a == 0, y) > 50 - y = Flux.dropout(values(rng_kwargs)..., x, 0.9, active=false) + y = Flux.dropout(values(rng_kwargs)..., x, 0.9 * 0) # , active=false) @test count(a->a == 0, y) == 0 # CPU RNGs map onto CPU ok From 28ac4c4731bdad4241d085d337129d435a65b6c6 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Fri, 27 Jan 2023 12:38:49 -0500 Subject: [PATCH 09/11] avoid a branch --- src/layers/normalise.jl | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index cc8b9e55ad..7937a311b4 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -74,13 +74,7 @@ end @functor Dropout trainable(a::Dropout) = (;) -function (a::Dropout)(x) - if _isactive(a, x) && a.p != 0 - dropout(a.rng, x, a.p; dims=a.dims) - else - x - end -end +(a::Dropout)(x) = dropout(a.rng, x, a.p * _isactive(a, x); dims=a.dims) testmode!(m::Dropout, mode=true) = (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) From 9e99422eca9a6bf000bcada4bcb0a4f2d2fd700c Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sun, 8 Jan 2023 06:58:45 -0500 Subject: [PATCH 10/11] Update src/layers/normalise.jl Co-authored-by: Carlo Lucibello --- src/layers/normalise.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 7937a311b4..37be2fb8e2 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -11,7 +11,7 @@ While training, it sets each input to `0` (with probability `p`) or else scales it by `1 / (1 - p)`, using the [`NNlib.dropout`](@ref) function. While testing, it has no effect. -By defaul the mode will switch automatically, but it can also +By default the mode will switch automatically, but it can also be controlled manually via [`Flux.testmode!`](@ref). By default every input is treated independently. The `dims` keyword From fc9855b085750fd00ca55cd3279c057f36afea73 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sun, 29 Jan 2023 11:56:25 -0500 Subject: [PATCH 11/11] Apply suggestions from code review Co-authored-by: Kyle Daruwalla --- src/layers/normalise.jl | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 37be2fb8e2..e9362313ab 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -2,7 +2,7 @@ _isactive(m, x) = isnothing(m.active) ? NNlib.within_gradient(x) : m.active """ - Dropout(p; dims=:, rng = default_rng_value()) + Dropout(p; [dims, rng]) Layer implementing [dropout](https://arxiv.org/abs/1207.0580) with the given probability. This is used as a regularisation, i.e. to reduce overfitting. @@ -14,8 +14,8 @@ While testing, it has no effect. 
By default the mode will switch automatically, but it can also be controlled manually via [`Flux.testmode!`](@ref). -By default every input is treated independently. The `dims` keyword -instead takes a random choice only along that dimension. +By default every input is treated independently. With the `dims` keyword, +instead it takes a random choice only along that dimension. For example `Dropout(p; dims = 3)` will randomly zero out entire channels on WHCN input (also called 2D dropout). @@ -64,10 +64,7 @@ end Dropout(p::Real, dims, active) = Dropout(p, dims, active, default_rng_value()) function Dropout(p::Real; dims=:, rng = default_rng_value()) - 0 ≤ p ≤ 1 || throw(ArgumentError("Dropout expexts 0 ≤ p ≤ 1, got p = $p")) - if p isa Integer # Dropout(0) - return p==0 ? identity : zero - end + 0 ≤ p ≤ 1 || throw(ArgumentError("Dropout expects 0 ≤ p ≤ 1, got p = $p")) Dropout(p, dims, nothing, rng) end
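
A rough usage sketch of the migration this series asks for (not part of any patch): `Flux.dropout(rng, x, p; dims, active)` is deprecated in favour of calling `NNlib.dropout` directly and branching on activity outside the call, as the deprecation message in PATCH 01 suggests. The sketch assumes NNlib ≥ 0.8.15, as pinned in Project.toml by PATCH 05; the array size and `p = 0.5` are arbitrary choices for illustration.

```julia
# Sketch only, not part of the patch series.
using NNlib, Random

x = randn(Float32, 4, 8)

# Deprecated:  Flux.dropout(rng, x, 0.5; dims=1, active=true)
# Replacement: call NNlib.dropout and handle `active` yourself.
active = true
y = active ? NNlib.dropout(Random.default_rng(), x, 0.5; dims=1) : x

# With dims=1 the keep/drop choice is shared along the other dimensions,
# so each row of `x` is either zeroed out or scaled by 1/(1 - 0.5) == 2.
size(y) == size(x)
```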
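
PATCH 09's "avoid a branch" change works because `_isactive` returns a `Bool` and Julia folds it into the probability: `p * true == p` and `p * false == 0`, so test mode becomes a dropout call with probability zero, which leaves the values untouched. A minimal stand-alone illustration of that arithmetic follows; `ToyDropout` and `toy_dropout` are made up for this sketch, whereas the real layer delegates to `NNlib.dropout`.

```julia
# Illustration of the `a.p * _isactive(a, x)` trick; ToyDropout is hypothetical.
using Random

# Keep each element with probability 1-p, scaling survivors by 1/(1-p).
toy_dropout(rng, x, p) = p == 0 ? x : x .* (rand(rng, size(x)...) .> p) ./ (1 - p)

struct ToyDropout{F<:Real}
    p::F
    active::Bool
end

# One call instead of `active ? toy_dropout(rng, x, p) : x`:
(d::ToyDropout)(x) = toy_dropout(Random.default_rng(), x, d.p * d.active)

x = ones(3, 4)
ToyDropout(0.75, false)(x) == x           # true: 0.75 * false == 0.0
count(iszero, ToyDropout(0.75, true)(x))  # roughly 9 of the 12 entries
```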