diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md
index 6f5f3978cb..93a58a87fd 100644
--- a/docs/src/models/layers.md
+++ b/docs/src/models/layers.md
@@ -145,6 +145,7 @@ Several normalisation layers behave differently under training and inference (te
 The functions `Flux.trainmode!` and `Flux.testmode!` let you manually specify which behaviour you want. When called on a model, they will place all layers within the model into the specified mode.

 ```@docs
-Flux.testmode!
+testmode!(::Any)
+testmode!(::Any, ::Any)
 trainmode!
 ```
diff --git a/src/deprecations.jl b/src/deprecations.jl
index e5bc552946..627e5bd5b8 100644
--- a/src/deprecations.jl
+++ b/src/deprecations.jl
@@ -187,6 +187,23 @@ function update!(opt::Optimise.AbstractOptimiser, ::Params, grads::Union{Tuple,
   """)
 end

+"""
+    trainmode!(m, active)
+
+!!! warning
+    This two-argument method is deprecated.
+
+Possible values of `active` are:
+- `true` for training
+- `false` for testing, same as [`testmode!`](@ref)`(m)`
+- `:auto` or `nothing` for Flux to detect training automatically.
+"""
+function trainmode!(m, active::Bool)
+  Base.depwarn("trainmode!(m, active::Bool) is deprecated", :trainmode)
+  testmode!(m, !active)
+end
+
+
 # v0.14 deprecations

 # Enable these when 0.14 is released, and delete const ClipGrad = Optimise.ClipValue etc:
diff --git a/src/functor.jl b/src/functor.jl
index e04a9b1fbd..d0cb9c5d80 100644
--- a/src/functor.jl
+++ b/src/functor.jl
@@ -5,36 +5,75 @@ import Functors: Functors, @functor, functor, fmap, isleaf
 using SparseArrays: AbstractSparseArray

 """
-    testmode!(m, mode = true)
+    testmode!(model, [mode]) -> model

-Set a layer or model's test mode (see below).
-Using `:auto` mode will treat any gradient computation as training.
+Set a layer, or all layers in a model, to test mode.
+This disables the effect of [`Dropout`](@ref) and
+some other regularisation layers.

-_Note_: if you manually set a model into test mode, you need to manually place
-it back into train mode during training phase.
+If you manually set a model into test mode, you need to manually place
+it back into train mode during the training phase, using [`trainmode!`](@ref).

-Possible values include:
-- `false` for training
-- `true` for testing
-- `:auto` or `nothing` for Flux to detect the mode automatically
+There is an optional second argument, which takes the symbol `:auto` to
+reset all layers back to the default automatic mode.
+
+# Example
+
+```jldoctest
+julia> d = Dropout(0.3)
+Dropout(0.3)
+
+julia> testmode!(d) # dropout is now always disabled
+Dropout(0.3, active=false)
+
+julia> trainmode!(d) # dropout is now always enabled
+Dropout(0.3, active=true)
+
+julia> testmode!(d, :auto) # back to default
+Dropout(0.3)
+```
 """
-testmode!(m, mode = true) = (foreach(x -> testmode!(x, mode), trainable(m)); m)
+testmode!(m) = testmode!(m, true)

 """
-    trainmode!(m, mode = true)
+    trainmode!(model) -> model

-Set a layer of model's train mode (see below).
-Symmetric to [`testmode!`](@ref) (i.e. `trainmode!(m, mode) == testmode!(m, !mode)`).
+Set a layer, or all layers in a model, to training mode.
+Opposite to [`testmode!`](@ref); see further details there.
+"""
+trainmode!(m) = testmode!(m, false)
+trainmode!(m, mode::Symbol) = testmode!(m, mode)
+trainmode!(m, ::Nothing) = testmode!(m, nothing) # why do we have so much API?
+
+"""
+    testmode!(model, inactive)
+
+This two-argument method is largely internal. It recurses into the `model`,
+where a method such as `testmode!(d::Dropout, inactive)` alters the activity of each layer.
+Custom layers can support manual `testmode!` / `trainmode!` switching
+by defining such a method.

-_Note_: if you manually set a model into train mode, you need to manually place
-it into test mode during testing phase.
+Possible values of `inactive` are:
+- `true` for testing, i.e. `active=false`
+- `false` for training, same as [`trainmode!`](@ref)`(m)`
+- `:auto` or `nothing` for Flux to detect training automatically.

-Possible values include:
-- `true` for training
-- `false` for testing
-- `:auto` or `nothing` for Flux to detect the mode automatically
+!!! compat
+    This method may be removed in a future breaking change, to separate
+    the user-facing `testmode!` from the internal recursion.
 """
-trainmode!(m, mode = true) = mode isa Bool ? testmode!(m, !mode) : testmode!(m, mode)
+function testmode!(m, mode)
+  inactive = if mode isa Symbol
+    mode === :auto || throw(ArgumentError("testmode! accepts only the symbol :auto, got :$mode"))
+    nothing
+  elseif mode isa Union{Bool,Nothing}
+    mode
+  else
+    throw(ArgumentError("testmode! does not accept $(repr(mode)) as the 2nd argument"))
+  end
+  foreach(x -> testmode!(x, inactive), trainable(m))
+  m
+end

 function params!(p::Params, x, seen = IdSet())
   if x isa AbstractArray{<:Number} && Functors.isleaf(x)
diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl
index 9af9044c7b..3794576e76 100644
--- a/src/layers/normalise.jl
+++ b/src/layers/normalise.jl
@@ -1,8 +1,13 @@
 # Internal function, used only for layers defined in this file.
 _isactive(m, x) = isnothing(m.active) ? NNlib.within_gradient(x) : m.active

+# Internal function, used only in this file.
+_tidy_active(mode::Bool) = mode
+_tidy_active(::Nothing) = nothing
+_tidy_active(mode) = mode === :auto ? nothing : throw(ArgumentError("active = $(repr(mode)) is not accepted, must be true/false/nothing or :auto"))
+
 """
-    Dropout(p; [dims, rng])
+    Dropout(p; [dims, rng, active])

 Layer implementing [dropout](https://arxiv.org/abs/1207.0580) with the given probability.
 This is used as a regularisation, i.e. to reduce overfitting.
@@ -12,7 +17,8 @@ or else scales it by `1 / (1 - p)`, using the [`NNlib.dropout`](@ref) function.
 While testing, it has no effect.

 By default the mode will switch automatically, but it can also
-be controlled manually via [`Flux.testmode!`](@ref).
+be controlled manually via [`Flux.testmode!`](@ref),
+or by passing the keyword `active=true` for training mode.

 By default every input is treated independently. With the `dims` keyword,
 instead it takes a random choice only along that dimension.
@@ -36,7 +42,11 @@ julia> m(ones(2, 7))  # test mode, no effect
  2.0  2.0  2.0  2.0  2.0  2.0  2.0
  2.0  2.0  2.0  2.0  2.0  2.0  2.0

-julia> Flux.trainmode!(m); # equivalent to use within gradient
+julia> Flux.trainmode!(m) # equivalent to use within gradient
+Chain(
+  Dense(2 => 3),                # 9 parameters
+  Dropout(0.4, active=true),
+)

 julia> m(ones(2, 7))
 3×7 Matrix{Float64}:
@@ -63,9 +73,9 @@ mutable struct Dropout{F<:Real,D,R<:AbstractRNG}
 end

 Dropout(p::Real, dims, active) = Dropout(p, dims, active, default_rng_value())
-function Dropout(p::Real; dims=:, rng = default_rng_value())
+function Dropout(p::Real; dims=:, active::Union{Bool,Nothing} = nothing, rng = default_rng_value())
   0 ≤ p ≤ 1 || throw(ArgumentError("Dropout expects 0 ≤ p ≤ 1, got p = $p"))
-  Dropout(p, dims, nothing, rng)
+  Dropout(p, dims, active, rng)
 end

 @functor Dropout
@@ -74,16 +84,17 @@ trainable(a::Dropout) = (;)
 (a::Dropout)(x) = dropout(a.rng, x, a.p * _isactive(a, x); dims=a.dims)

 testmode!(m::Dropout, mode=true) =
-  (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m)
+  (m.active = isnothing(_tidy_active(mode)) ? nothing : !mode; m)

 function Base.show(io::IO, d::Dropout)
   print(io, "Dropout(", d.p)
-  d.dims != (:) && print(io, ", dims = $(repr(d.dims))")
+  d.dims != (:) && print(io, ", dims=", d.dims)
+  d.active == nothing || print(io, ", active=", d.active)
   print(io, ")")
 end

 """
-    AlphaDropout(p; rng = default_rng_value())
+    AlphaDropout(p; [rng, active])

 A dropout layer. Used in
 [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515).
@@ -112,13 +123,13 @@ mutable struct AlphaDropout{F,R<:AbstractRNG}
   p::F
   active::Union{Bool, Nothing}
   rng::R
-  function AlphaDropout(p, active, rng)
-    @assert 0 ≤ p ≤ 1
-    new{typeof(p), typeof(rng)}(p, active, rng)
-  end
 end
+
 AlphaDropout(p, active) = AlphaDropout(p, active, default_rng_value())
-AlphaDropout(p; rng = default_rng_value()) = AlphaDropout(p, nothing, rng)
+function AlphaDropout(p; rng = default_rng_value(), active::Union{Bool,Nothing} = nothing)
+  0 ≤ p ≤ 1 || throw(ArgumentError("AlphaDropout expects 0 ≤ p ≤ 1, got p = $p"))
+  AlphaDropout(p, active, rng)
+end

 @functor AlphaDropout
 trainable(a::AlphaDropout) = (;)
@@ -138,7 +149,7 @@ function (a::AlphaDropout)(x::AbstractArray{T}) where T
 end

 testmode!(m::AlphaDropout, mode=true) =
-  (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m)
+  (m.active = isnothing(_tidy_active(mode)) ? nothing : !mode; m)

 """
     LayerNorm(size..., λ=identity; affine=true, ϵ=1fe-5)
@@ -257,7 +268,7 @@ ChainRulesCore.@non_differentiable _track_stats!(::Any...)

 """
     BatchNorm(channels::Integer, λ=identity; initβ=zeros32, initγ=ones32,
-              affine = true, track_stats = true,
+              affine=true, track_stats=true, active=nothing,
               ϵ=1f-5, momentum= 0.1f0)

 [Batch Normalization](https://arxiv.org/abs/1502.03167) layer.
@@ -310,7 +321,7 @@ end

 function BatchNorm(chs::Int, λ=identity;
           initβ=zeros32, initγ=ones32,
-          affine=true, track_stats=true,
+          affine=true, track_stats=true, active::Union{Bool,Nothing}=nothing,
           ϵ=1f-5, momentum=0.1f0)

   β = affine ? initβ(chs) : nothing
@@ -321,7 +332,7 @@ function BatchNorm(chs::Int, λ=identity;
   return BatchNorm(λ, β, γ,
             μ, σ², ϵ, momentum,
             affine, track_stats,
-            nothing, chs)
+            active, chs)
 end

 @functor BatchNorm
@@ -335,12 +346,13 @@ function (BN::BatchNorm)(x::AbstractArray{T,N}) where {T,N}
 end

 testmode!(m::BatchNorm, mode=true) =
-  (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m)
+  (m.active = isnothing(_tidy_active(mode)) ? nothing : !mode; m)

 function Base.show(io::IO, l::BatchNorm)
   print(io, "BatchNorm($(l.chs)")
   (l.λ == identity) || print(io, ", $(l.λ)")
   hasaffine(l) || print(io, ", affine=false")
+  l.active == nothing || print(io, ", active=", l.active)
   print(io, ")")
 end

@@ -399,7 +411,7 @@ end

 function InstanceNorm(chs::Int, λ=identity;
           initβ=zeros32, initγ=ones32,
-          affine=false, track_stats=false,
+          affine=false, track_stats=false, active::Union{Bool,Nothing}=nothing,
           ϵ=1f-5, momentum=0.1f0)

   β = affine ? initβ(chs) : nothing
@@ -410,7 +422,7 @@ function InstanceNorm(chs::Int, λ=identity;
   return InstanceNorm(λ, β, γ,
             μ, σ², ϵ, momentum,
             affine, track_stats,
-            nothing, chs)
+            active, chs)
 end

 @functor InstanceNorm
@@ -424,12 +436,13 @@ function (l::InstanceNorm)(x::AbstractArray{T,N}) where {T,N}
 end

 testmode!(m::InstanceNorm, mode=true) =
-  (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m)
+  (m.active = isnothing(_tidy_active(mode)) ? nothing : !mode; m)

 function Base.show(io::IO, l::InstanceNorm)
   print(io, "InstanceNorm($(l.chs)")
   l.λ == identity || print(io, ", $(l.λ)")
   hasaffine(l) || print(io, ", affine=false")
+  l.active == nothing || print(io, ", active=", l.active)
   print(io, ")")
 end

@@ -495,7 +508,7 @@ trainable(gn::GroupNorm) = hasaffine(gn) ? (β = gn.β, γ = gn.γ) : (;)

 function GroupNorm(chs::Int, G::Int, λ=identity;
           initβ=zeros32, initγ=ones32,
-          affine=true, track_stats=false,
+          affine=true, track_stats=false, active::Union{Bool,Nothing}=nothing,
           ϵ=1f-5, momentum=0.1f0)

   if track_stats
@@ -514,7 +527,7 @@ end
             μ, σ², ϵ, momentum,
             affine, track_stats,
-            nothing, chs)
+            active, chs)
 end

 function (gn::GroupNorm)(x::AbstractArray)
@@ -529,13 +542,14 @@ function (gn::GroupNorm)(x::AbstractArray)
 end

 testmode!(m::GroupNorm, mode = true) =
-  (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m)
+  (m.active = isnothing(_tidy_active(mode)) ? nothing : !mode; m)

 function Base.show(io::IO, l::GroupNorm)
   # print(io, "GroupNorm($(join(size(l.β), ", "))", ", ", l.G)
   print(io, "GroupNorm($(l.chs), $(l.G)")
   l.λ == identity || print(io, ", $(l.λ)")
   hasaffine(l) || print(io, ", affine=false")
+  l.active == nothing || print(io, ", active=", l.active)
   print(io, ")")
 end

diff --git a/test/layers/basic.jl b/test/layers/basic.jl
index 45e4750a6f..e143af8cf1 100644
--- a/test/layers/basic.jl
+++ b/test/layers/basic.jl
@@ -1,5 +1,5 @@
 using Test, Random
-import Flux: activations
+using Flux: activations

 @testset "basic" begin
   @testset "helpers" begin
@@ -16,11 +16,11 @@ end

   @testset "Chain" begin
-    @test_nowarn Chain(Dense(10, 5, σ), Dense(5, 2))(randn(10))
-    @test_throws DimensionMismatch Chain(Dense(10, 5, σ),Dense(2, 1))(randn(10))
+    @test_nowarn Chain(Dense(10, 5, σ), Dense(5, 2))(randn32(10))
+    @test_throws DimensionMismatch Chain(Dense(10, 5, σ),Dense(2, 1))(randn32(10))
     # numeric test should be put into testset of corresponding layer
-    @test_nowarn Chain(first = Dense(10, 5, σ), second = Dense(5, 2))(randn(10))
+    @test_nowarn Chain(first = Dense(10, 5, σ), second = Dense(5, 2))(randn32(10))

     m = Chain(first = Dense(10, 5, σ), second = Dense(5, 2))
     @test m[:first] == m[1]
     @test m[1:2] == m
@@ -72,10 +72,10 @@ import Flux: activations
      @test_throws MethodError Dense(rand(5), rand(5), tanh)
     end
     @testset "dimensions" begin
-      @test length(Dense(10, 5)(randn(10))) == 5
-      @test_throws DimensionMismatch Dense(10, 5)(randn(1))
-      @test_throws MethodError Dense(10, 5)(1) # avoid broadcasting
-      @test_throws MethodError Dense(10, 5).(randn(10)) # avoid broadcasting
+      @test length(Dense(10 => 5)(randn32(10))) == 5
+      @test_throws DimensionMismatch Dense(10 => 5)(randn32(1))
+      @test_throws MethodError Dense(10 => 5)(1) # avoid broadcasting
+      @test_throws MethodError Dense(10 => 5).(randn32(10)) # avoid broadcasting
      @test size(Dense(10, 5)(randn(10))) == (5,)
      @test size(Dense(10, 5)(randn(10,2))) == (5,2)
      @test size(Dense(10, 5)(randn(10,2,3))) == (5,2,3)
@@ -333,7 +333,7 @@ import Flux: activations
     y = m(x)
     @test y isa Array{Float32, 3}
     @test size(y) == (embed_size, 3, 4)
-    x3 = onehotbatch(x, 1:1:vocab_size)
+    x3 = Flux.onehotbatch(x, 1:1:vocab_size)
     @test size(x3) == (vocab_size, 3, 4)
     y3 = m(x3)
     @test size(y3) == (embed_size, 3, 4)
diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl
index e21c307901..917ca20a17 100644
--- a/test/layers/normalisation.jl
+++ b/test/layers/normalisation.jl
@@ -26,6 +26,11 @@ evalwgrad(f, x...) = pullback(f, x...)[1]
     y = evalwgrad(m, x)
     @test count(a->a==0, y) > 50

+    # Keyword active=false
+    m2 = Dropout(0.9; active=false, rng_kwargs...)
+    y2 = evalwgrad(m2, x)
+    @test count(iszero, y2) == 0
+
     x = rand(Float32, 100)
     m = Chain(Dense(100,100), Dropout(0.9; rng_kwargs...))

@@ -73,6 +78,9 @@ evalwgrad(f, x...) = pullback(f, x...)[1]
       @test cpu(m).rng === only(values(rng_kwargs))
     end
   end
+
+  @test Dropout(0.5; active=true).active === true
+  @test_throws Exception Dropout(0.5; active=:something_else)
 end

 @testset "AlphaDropout" begin
@@ -119,6 +127,9 @@ end
       @test cpu(m).rng === only(values(rng_kwargs))
     end
   end
+
+  @test AlphaDropout(0.5; active=true).active === true
+  @test_throws Exception AlphaDropout(0.5; active=:something_else)
 end

 @testset "BatchNorm" begin
@@ -211,6 +222,9 @@ end
   @test length(Flux.params(BatchNorm(10))) == 2
   @test length(Flux.params(BatchNorm(10, affine=true))) == 2
   @test length(Flux.params(BatchNorm(10, affine=false))) == 0
+
+  @test BatchNorm(5; active=true).active === true
+  @test_throws Exception BatchNorm(5; active=:something_else)
 end

 @testset "InstanceNorm" begin
@@ -342,6 +356,9 @@ end
   @test length(Flux.params(InstanceNorm(10))) == 0
   @test length(Flux.params(InstanceNorm(10, affine=true))) == 2
   @test length(Flux.params(InstanceNorm(10, affine=false))) == 0
+
+  @test InstanceNorm(5; active=true).active === true
+  @test_throws Exception InstanceNorm(5; active=:something_else)
 end

 @testset "LayerNorm" begin
@@ -465,6 +482,9 @@ end
     x = Float32.(reshape(collect(1:prod(sizes)), sizes))
     @test BN(x) ≈ GN(x)
   end
+
+  @test GroupNorm(5, 5; active=true).active === true
+  @test_throws Exception GroupNorm(5, 5; active=:something_else)
 end

 @testset "second derivatives" begin
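
To make the contract of the new two-argument `testmode!` concrete, here is a minimal sketch of how a user-defined layer could opt in to the manual switching described in the docstring above. The layer `NoisyIdentity`, its `scale` field, and its forward-pass behaviour are hypothetical illustrations and are not part of this diff; the `testmode!` method it defines simply mirrors the pattern used by `Dropout` and the norm layers changed here.

```julia
using Flux

# Hypothetical layer (illustration only): adds noise to its input while training,
# does nothing in test mode. It stores its mode the same way Dropout does.
mutable struct NoisyIdentity{F}
    scale::F
    active::Union{Bool,Nothing}   # true/false = forced on/off, nothing = automatic
end

NoisyIdentity(scale = 0.1f0; active::Union{Bool,Nothing} = nothing) = NoisyIdentity(scale, active)

Flux.@functor NoisyIdentity
Flux.trainable(::NoisyIdentity) = (;)   # no trainable parameters, like Dropout

# Forward pass. A real layer would fall back to gradient detection when
# `active === nothing` (as Flux's internal `_isactive` does); this sketch
# simply treats `nothing` as "off" to stay short.
(n::NoisyIdentity)(x) = n.active === true ? x .+ n.scale .* randn(Float32, size(x)) : x

# The hook described in the docstring: the recursion in `testmode!(m, mode)`
# passes the tidied value `true`, `false` or `nothing` down to each layer.
Flux.testmode!(n::NoisyIdentity, inactive) =
    (n.active = isnothing(inactive) ? nothing : !inactive; n)

m = Chain(Dense(2 => 2), NoisyIdentity())
testmode!(m)         # noise always off   (active = false)
trainmode!(m)        # noise always on    (active = true)
testmode!(m, :auto)  # back to automatic  (active = nothing)
```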