From 49d13030ab86a16586e7565af0007eb405480dee Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Fri, 31 Mar 2023 01:15:33 -0400 Subject: [PATCH 01/10] print the state of Dropout etc. --- src/functor.jl | 34 +++++++++++++++++++------- src/layers/normalise.jl | 54 ++++++++++++++++++++++++++--------------- 2 files changed, 59 insertions(+), 29 deletions(-) diff --git a/src/functor.jl b/src/functor.jl index e04a9b1fbd..9fc997d56b 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -7,32 +7,48 @@ using SparseArrays: AbstractSparseArray """ testmode!(m, mode = true) -Set a layer or model's test mode (see below). -Using `:auto` mode will treat any gradient computation as training. +Set a layer, or all layers in a model, to test mode. +This disables the effect of [`Dropout`](@ref), and similar layers. _Note_: if you manually set a model into test mode, you need to manually place it back into train mode during training phase. -Possible values include: -- `false` for training +Possible values of optional 2nd argument `mode` are: - `true` for testing -- `:auto` or `nothing` for Flux to detect the mode automatically +- `false` for training, same as [`trainmode!`](@ref) +- `:auto` or `nothing` for Flux to detect training automatically. + +# Example + +```jldoctest +julia> d = Dropout(0.3) +Dropout(0.3) + +julia> testmode!(d) # dropout is now always disabled +Dropout(0.3, active=false) + +julia> trainmode!(d) # dropout is now always enabled +Dropout(0.3, active=true) + +julia> trainmode!(d, :auto) # back to default +Dropout(0.3) +``` """ testmode!(m, mode = true) = (foreach(x -> testmode!(x, mode), trainable(m)); m) """ trainmode!(m, mode = true) -Set a layer of model's train mode (see below). -Symmetric to [`testmode!`](@ref) (i.e. `trainmode!(m, mode) == testmode!(m, !mode)`). +Set a layer, or all layers in a model, to training mode. +Opposite to [`testmode!`](@ref) (i.e. `trainmode!(m, mode) == testmode!(m, !mode)`). _Note_: if you manually set a model into train mode, you need to manually place it into test mode during testing phase. -Possible values include: +Possible values of optional 2nd argument `mode` are: - `true` for training - `false` for testing -- `:auto` or `nothing` for Flux to detect the mode automatically +- `:auto` or `nothing` for Flux to detect training automatically """ trainmode!(m, mode = true) = mode isa Bool ? testmode!(m, !mode) : testmode!(m, mode) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 9af9044c7b..e3f3d90f36 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -1,8 +1,13 @@ # Internal function, used only for layers defined in this file. _isactive(m, x) = isnothing(m.active) ? NNlib.within_gradient(x) : m.active +# Internal function, used only in this file. +_tidy_active(mode::Bool) = mode +_tidy_active(::Nothing) = nothing +_tidy_active(mode) = mode === :auto ? nothing : throw(ArgumentError("active = $(repr(mode)) is not accepted, must be true/false/nothing or :auto")) + """ - Dropout(p; [dims, rng]) + Dropout(p; [dims, rng, active]) Layer implementing [dropout](https://arxiv.org/abs/1207.0580) with the given probability. This is used as a regularisation, i.e. to reduce overfitting. @@ -12,7 +17,8 @@ or else scales it by `1 / (1 - p)`, using the [`NNlib.dropout`](@ref) function. While testing, it has no effect. By default the mode will switch automatically, but it can also -be controlled manually via [`Flux.testmode!`](@ref). 
+be controlled manually via [`Flux.testmode!`](@ref), +or by passing keyword `active=true` for training mode. By default every input is treated independently. With the `dims` keyword, instead it takes a random choice only along that dimension. @@ -36,7 +42,11 @@ julia> m(ones(2, 7)) # test mode, no effect 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 -julia> Flux.trainmode!(m); # equivalent to use within gradient +julia> Flux.trainmode!(m) # equivalent to use within gradient +Chain( + Dense(2 => 3), # 9 parameters + Dropout(0.4, active=true), +) julia> m(ones(2, 7)) 3×7 Matrix{Float64}: @@ -63,9 +73,9 @@ mutable struct Dropout{F<:Real,D,R<:AbstractRNG} end Dropout(p::Real, dims, active) = Dropout(p, dims, active, default_rng_value()) -function Dropout(p::Real; dims=:, rng = default_rng_value()) +function Dropout(p::Real; dims=:, active = nothing, rng = default_rng_value()) 0 ≤ p ≤ 1 || throw(ArgumentError("Dropout expects 0 ≤ p ≤ 1, got p = $p")) - Dropout(p, dims, nothing, rng) + Dropout(p, dims, _tidy_active(active), rng) end @functor Dropout @@ -74,16 +84,17 @@ trainable(a::Dropout) = (;) (a::Dropout)(x) = dropout(a.rng, x, a.p * _isactive(a, x); dims=a.dims) testmode!(m::Dropout, mode=true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) + (m.active = isnothing(_tidy_active(mode)) ? nothing : !mode; m) function Base.show(io::IO, d::Dropout) print(io, "Dropout(", d.p) - d.dims != (:) && print(io, ", dims = $(repr(d.dims))") + d.dims != (:) && print(io, ", dims=", d.dims) + d.active == nothing || print(io, ", active=", d.active) print(io, ")") end """ - AlphaDropout(p; rng = default_rng_value()) + AlphaDropout(p; [rng, active]) A dropout layer. Used in [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515). @@ -114,11 +125,11 @@ mutable struct AlphaDropout{F,R<:AbstractRNG} rng::R function AlphaDropout(p, active, rng) @assert 0 ≤ p ≤ 1 - new{typeof(p), typeof(rng)}(p, active, rng) + new{typeof(p), typeof(rng)}(p, _tidy_active(active), rng) end end AlphaDropout(p, active) = AlphaDropout(p, active, default_rng_value()) -AlphaDropout(p; rng = default_rng_value()) = AlphaDropout(p, nothing, rng) +AlphaDropout(p; rng = default_rng_value(), active=nothing) = AlphaDropout(p, active, rng) @functor AlphaDropout trainable(a::AlphaDropout) = (;) @@ -138,7 +149,7 @@ function (a::AlphaDropout)(x::AbstractArray{T}) where T end testmode!(m::AlphaDropout, mode=true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) + (m.active = isnothing(_tidy_active(mode)) ? nothing : !mode; m) """ LayerNorm(size..., λ=identity; affine=true, ϵ=1fe-5) @@ -310,7 +321,7 @@ end function BatchNorm(chs::Int, λ=identity; initβ=zeros32, initγ=ones32, - affine=true, track_stats=true, + affine=true, track_stats=true, active=nothing, ϵ=1f-5, momentum=0.1f0) β = affine ? initβ(chs) : nothing @@ -321,7 +332,7 @@ function BatchNorm(chs::Int, λ=identity; return BatchNorm(λ, β, γ, μ, σ², ϵ, momentum, affine, track_stats, - nothing, chs) + _tidy_active(active), chs) end @functor BatchNorm @@ -335,12 +346,13 @@ function (BN::BatchNorm)(x::AbstractArray{T,N}) where {T,N} end testmode!(m::BatchNorm, mode=true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) + (m.active = isnothing(_tidy_active(mode)) ? 
nothing : !mode; m) function Base.show(io::IO, l::BatchNorm) print(io, "BatchNorm($(l.chs)") (l.λ == identity) || print(io, ", $(l.λ)") hasaffine(l) || print(io, ", affine=false") + l.active == nothing || print(io, ", active=", l.active) print(io, ")") end @@ -399,7 +411,7 @@ end function InstanceNorm(chs::Int, λ=identity; initβ=zeros32, initγ=ones32, - affine=false, track_stats=false, + affine=false, track_stats=false, active=nothing, ϵ=1f-5, momentum=0.1f0) β = affine ? initβ(chs) : nothing @@ -410,7 +422,7 @@ function InstanceNorm(chs::Int, λ=identity; return InstanceNorm(λ, β, γ, μ, σ², ϵ, momentum, affine, track_stats, - nothing, chs) + _tidy_active(active), chs) end @functor InstanceNorm @@ -424,12 +436,13 @@ function (l::InstanceNorm)(x::AbstractArray{T,N}) where {T,N} end testmode!(m::InstanceNorm, mode=true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) + (m.active = isnothing(_tidy_active(mode)) ? nothing : !mode; m) function Base.show(io::IO, l::InstanceNorm) print(io, "InstanceNorm($(l.chs)") l.λ == identity || print(io, ", $(l.λ)") hasaffine(l) || print(io, ", affine=false") + l.active == nothing || print(io, ", active=", l.active) print(io, ")") end @@ -495,7 +508,7 @@ trainable(gn::GroupNorm) = hasaffine(gn) ? (β = gn.β, γ = gn.γ) : (;) function GroupNorm(chs::Int, G::Int, λ=identity; initβ=zeros32, initγ=ones32, - affine=true, track_stats=false, + affine=true, track_stats=false, active=nothing, ϵ=1f-5, momentum=0.1f0) if track_stats @@ -514,7 +527,7 @@ end μ, σ², ϵ, momentum, affine, track_stats, - nothing, chs) + _tidy_active(active), chs) end function (gn::GroupNorm)(x::AbstractArray) @@ -529,13 +542,14 @@ function (gn::GroupNorm)(x::AbstractArray) end testmode!(m::GroupNorm, mode = true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) + (m.active = isnothing(_tidy_active(mode)) ? nothing : !mode; m) function Base.show(io::IO, l::GroupNorm) # print(io, "GroupNorm($(join(size(l.β), ", "))", ", ", l.G) print(io, "GroupNorm($(l.chs), $(l.G)") l.λ == identity || print(io, ", ", l.λ) hasaffine(l) || print(io, ", affine=false") + l.active == nothing || print(io, ", active=", l.active) print(io, ")") end From 061440ebfb530dec504431c4b0d9164cc8dcdf84 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Fri, 31 Mar 2023 17:46:03 -0400 Subject: [PATCH 02/10] add tests --- test/layers/normalisation.jl | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index e21c307901..64051ac1fd 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -26,6 +26,11 @@ evalwgrad(f, x...) = pullback(f, x...)[1] y = evalwgrad(m, x) @test count(a->a==0, y) > 50 + # Keyword active=false + m2 = Dropout(0.9; active=false, rng_kwargs...) + y2 = evalwgrad(m2, x) + @test count(a->a==0, y2) == 0 + x = rand(Float32, 100) m = Chain(Dense(100,100), Dropout(0.9; rng_kwargs...)) @@ -73,6 +78,10 @@ evalwgrad(f, x...) 
= pullback(f, x...)[1] @test cpu(m).rng === only(values(rng_kwargs)) end end + + @test Dropout(0.5; active=:auto).active === nothing + @test Dropout(0.5; active=true).active === true + @test_throws ArgumentError Dropout(0.5; active=:something_else) end @testset "AlphaDropout" begin @@ -119,6 +128,10 @@ end @test cpu(m).rng === only(values(rng_kwargs)) end end + + @test AlphaDropout(0.5; active=:auto).active === nothing + @test AlphaDropout(0.5; active=true).active === true + @test_throws ArgumentError AlphaDropout(0.5; active=:something_else) end @testset "BatchNorm" begin @@ -211,6 +224,10 @@ end @test length(Flux.params(BatchNorm(10))) == 2 @test length(Flux.params(BatchNorm(10, affine=true))) == 2 @test length(Flux.params(BatchNorm(10, affine=false))) == 0 + + @test BatchNorm(5; active=:auto).active === nothing + @test BatchNorm(5; active=true).active === true + @test_throws ArgumentError BatchNorm(5; active=:something_else) end @testset "InstanceNorm" begin @@ -342,6 +359,10 @@ end @test length(Flux.params(InstanceNorm(10))) == 0 @test length(Flux.params(InstanceNorm(10, affine=true))) == 2 @test length(Flux.params(InstanceNorm(10, affine=false))) == 0 + + @test InstanceNorm(5; active=:auto).active === nothing + @test InstanceNorm(5; active=true).active === true + @test_throws ArgumentError InstanceNorm(5; active=:something_else) end @testset "LayerNorm" begin @@ -465,6 +486,10 @@ end x = Float32.(reshape(collect(1:prod(sizes)), sizes)) @test BN(x) ≈ GN(x) end + + @test GroupNorm(5, 5; active=:auto).active === nothing + @test GroupNorm(5, 5; active=true).active === true + @test_throws ArgumentError GroupNorm(5, 5; active=:something_else) end @testset "second derivatives" begin From 1153f5794bed4ad3563dd503498008e4a2911a3c Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Fri, 31 Mar 2023 17:48:41 -0400 Subject: [PATCH 03/10] doc improvements --- src/functor.jl | 16 ++++++++-------- src/layers/normalise.jl | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/functor.jl b/src/functor.jl index 9fc997d56b..a3af614d41 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -5,7 +5,7 @@ import Functors: Functors, @functor, functor, fmap, isleaf using SparseArrays: AbstractSparseArray """ - testmode!(m, mode = true) + testmode!(m, inactive = true) Set a layer, or all layers in a model, to test mode. This disables the effect of [`Dropout`](@ref), and similar layers. @@ -13,9 +13,9 @@ This disables the effect of [`Dropout`](@ref), and similar layers. _Note_: if you manually set a model into test mode, you need to manually place it back into train mode during training phase. -Possible values of optional 2nd argument `mode` are: +Possible values of optional 2nd argument `inactive` are: - `true` for testing -- `false` for training, same as [`trainmode!`](@ref) +- `false` for training, same as [`trainmode!`](@ref)`(m)` - `:auto` or `nothing` for Flux to detect training automatically. # Example @@ -34,23 +34,23 @@ julia> trainmode!(d, :auto) # back to default Dropout(0.3) ``` """ -testmode!(m, mode = true) = (foreach(x -> testmode!(x, mode), trainable(m)); m) +testmode!(m, inactive = true) = (foreach(x -> testmode!(x, inactive), trainable(m)); m) """ - trainmode!(m, mode = true) + trainmode!(m, active = true) Set a layer, or all layers in a model, to training mode. -Opposite to [`testmode!`](@ref) (i.e. `trainmode!(m, mode) == testmode!(m, !mode)`). +Opposite to [`testmode!`](@ref) (i.e. `trainmode!(m, active) == testmode!(m, !active)`). 
_Note_: if you manually set a model into train mode, you need to manually place it into test mode during testing phase. -Possible values of optional 2nd argument `mode` are: +Possible values of optional 2nd argument `active` are: - `true` for training - `false` for testing - `:auto` or `nothing` for Flux to detect training automatically """ -trainmode!(m, mode = true) = mode isa Bool ? testmode!(m, !mode) : testmode!(m, mode) +trainmode!(m, active = true) = active isa Bool ? testmode!(m, !active) : testmode!(m, active) function params!(p::Params, x, seen = IdSet()) if x isa AbstractArray{<:Number} && Functors.isleaf(x) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index e3f3d90f36..7ca65b4ab9 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -268,7 +268,7 @@ ChainRulesCore.@non_differentiable _track_stats!(::Any...) """ BatchNorm(channels::Integer, λ=identity; initβ=zeros32, initγ=ones32, - affine = true, track_stats = true, + affine=true, track_stats=true, active=:auto, ϵ=1f-5, momentum= 0.1f0) [Batch Normalization](https://arxiv.org/abs/1502.03167) layer. From a22d7d00a0e8c49b552a7f32bc9aa5006c93906e Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sat, 1 Apr 2023 23:10:21 -0400 Subject: [PATCH 04/10] simpler scheme for testmode/trainmode --- docs/src/models/layers.md | 3 ++- src/deprecations.jl | 17 ++++++++++++++ src/functor.jl | 48 ++++++++++++++++++++++++--------------- 3 files changed, 49 insertions(+), 19 deletions(-) diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index 6f5f3978cb..93a58a87fd 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -145,6 +145,7 @@ Several normalisation layers behave differently under training and inference (te The functions `Flux.trainmode!` and `Flux.testmode!` let you manually specify which behaviour you want. When called on a model, they will place all layers within the model into the specified mode. ```@docs -Flux.testmode! +testmode!(::Any) +testmode!(::Any, ::Any) trainmode! ``` diff --git a/src/deprecations.jl b/src/deprecations.jl index e5bc552946..627e5bd5b8 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -187,6 +187,23 @@ function update!(opt::Optimise.AbstractOptimiser, ::Params, grads::Union{Tuple, """) end +""" + trainmode!(m, active) + +!!! warning + This two-argument method is deprecated. + +Possible values of `active` are: +- `true` for training, or +- `false` for testing, same as [`testmode!`](@ref)`(m)` +- `:auto` or `nothing` for Flux to detect training automatically. +""" +function trainmode!(m, active::Bool) + Base.depwarn("trainmode!(m, active::Bool) is deprecated", :trainmode) + testmode!(m, !active) +end + + # v0.14 deprecations # Enable these when 0.14 is released, and delete const ClipGrad = Optimise.ClipValue etc: diff --git a/src/functor.jl b/src/functor.jl index a3af614d41..8331ad8e59 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -5,18 +5,17 @@ import Functors: Functors, @functor, functor, fmap, isleaf using SparseArrays: AbstractSparseArray """ - testmode!(m, inactive = true) + testmode!(m, [mode]) Set a layer, or all layers in a model, to test mode. -This disables the effect of [`Dropout`](@ref), and similar layers. +This disables the effect of [`Dropout`](@ref) and +some other regularisation layers. -_Note_: if you manually set a model into test mode, you need to manually place -it back into train mode during training phase. 
+If you manually set a model into test mode, you need to manually place +it back into train mode during training phase, using [`trainmode!`](@ref). -Possible values of optional 2nd argument `inactive` are: -- `true` for testing -- `false` for training, same as [`trainmode!`](@ref)`(m)` -- `:auto` or `nothing` for Flux to detect training automatically. +There is an optional second argument, which takes a symbol `:auto` to +reset all layers back to the default automatic mode. # Example @@ -34,23 +33,36 @@ julia> trainmode!(d, :auto) # back to default Dropout(0.3) ``` """ -testmode!(m, inactive = true) = (foreach(x -> testmode!(x, inactive), trainable(m)); m) +testmode!(m) = testmode!(m, true) """ - trainmode!(m, active = true) + trainmode!(m) Set a layer, or all layers in a model, to training mode. -Opposite to [`testmode!`](@ref) (i.e. `trainmode!(m, active) == testmode!(m, !active)`). +Opposite to [`testmode!`](@ref), see further details there. +""" +trainmode!(m) = testmode!(m, false) +trainmode!(m, mode::Symbol) = testmode!(m, mode) +trainmode!(m, ::Nothing) = testmode!(m, nothing) # why do we have so much API? + +""" + testmode!(model, inactive) -_Note_: if you manually set a model into train mode, you need to manually place -it into test mode during testing phase. +This two-argument method is largely internal. It recurses into the `model`, +and until a method like `testmode!(d::Dropout, mode)` alters the activity of a layer. -Possible values of optional 2nd argument `active` are: -- `true` for training -- `false` for testing -- `:auto` or `nothing` for Flux to detect training automatically +Possible values of `inactive` are: +- `true` for testing, i.e. `active=false` +- `false` for training, same as [`trainmode!`](@ref)`(m)` +- `:auto` or `nothing` for Flux to detect training automatically. """ -trainmode!(m, active = true) = active isa Bool ? testmode!(m, !active) : testmode!(m, active) +function testmode!(m, mode) + if inactive isa Symbol && mode !== :auto + throw(ArgumentError("testmode! 
accepts only the symbol :auto, got :$mode")) + end + foreach(x -> testmode!(x, mode), trainable(m)) + m +end function params!(p::Params, x, seen = IdSet()) if x isa AbstractArray{<:Number} && Functors.isleaf(x) From a36cd594bee55b3489123d54971dab4a463254e6 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sat, 1 Apr 2023 23:14:20 -0400 Subject: [PATCH 05/10] simplify active keyword a bit --- src/layers/normalise.jl | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 7ca65b4ab9..3794576e76 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -73,9 +73,9 @@ mutable struct Dropout{F<:Real,D,R<:AbstractRNG} end Dropout(p::Real, dims, active) = Dropout(p, dims, active, default_rng_value()) -function Dropout(p::Real; dims=:, active = nothing, rng = default_rng_value()) +function Dropout(p::Real; dims=:, active::Union{Bool,Nothing} = nothing, rng = default_rng_value()) 0 ≤ p ≤ 1 || throw(ArgumentError("Dropout expects 0 ≤ p ≤ 1, got p = $p")) - Dropout(p, dims, _tidy_active(active), rng) + Dropout(p, dims, active, rng) end @functor Dropout @@ -123,13 +123,13 @@ mutable struct AlphaDropout{F,R<:AbstractRNG} p::F active::Union{Bool, Nothing} rng::R - function AlphaDropout(p, active, rng) - @assert 0 ≤ p ≤ 1 - new{typeof(p), typeof(rng)}(p, _tidy_active(active), rng) - end end + AlphaDropout(p, active) = AlphaDropout(p, active, default_rng_value()) -AlphaDropout(p; rng = default_rng_value(), active=nothing) = AlphaDropout(p, active, rng) +function AlphaDropout(p; rng = default_rng_value(), active::Union{Bool,Nothing} = nothing) + 0 ≤ p ≤ 1 || throw(ArgumentError("AlphaDropout expects 0 ≤ p ≤ 1, got p = $p")) + AlphaDropout(p, active, rng) +end @functor AlphaDropout trainable(a::AlphaDropout) = (;) @@ -268,7 +268,7 @@ ChainRulesCore.@non_differentiable _track_stats!(::Any...) """ BatchNorm(channels::Integer, λ=identity; initβ=zeros32, initγ=ones32, - affine=true, track_stats=true, active=:auto, + affine=true, track_stats=true, active=nothing, ϵ=1f-5, momentum= 0.1f0) [Batch Normalization](https://arxiv.org/abs/1502.03167) layer. @@ -321,7 +321,7 @@ end function BatchNorm(chs::Int, λ=identity; initβ=zeros32, initγ=ones32, - affine=true, track_stats=true, active=nothing, + affine=true, track_stats=true, active::Union{Bool,Nothing}=nothing, ϵ=1f-5, momentum=0.1f0) β = affine ? initβ(chs) : nothing @@ -332,7 +332,7 @@ function BatchNorm(chs::Int, λ=identity; return BatchNorm(λ, β, γ, μ, σ², ϵ, momentum, affine, track_stats, - _tidy_active(active), chs) + active, chs) end @functor BatchNorm @@ -411,7 +411,7 @@ end function InstanceNorm(chs::Int, λ=identity; initβ=zeros32, initγ=ones32, - affine=false, track_stats=false, active=nothing, + affine=false, track_stats=false, active::Union{Bool,Nothing}=nothing, ϵ=1f-5, momentum=0.1f0) β = affine ? initβ(chs) : nothing @@ -422,7 +422,7 @@ function InstanceNorm(chs::Int, λ=identity; return InstanceNorm(λ, β, γ, μ, σ², ϵ, momentum, affine, track_stats, - _tidy_active(active), chs) + active, chs) end @functor InstanceNorm @@ -508,7 +508,7 @@ trainable(gn::GroupNorm) = hasaffine(gn) ? 
(β = gn.β, γ = gn.γ) : (;) function GroupNorm(chs::Int, G::Int, λ=identity; initβ=zeros32, initγ=ones32, - affine=true, track_stats=false, active=nothing, + affine=true, track_stats=false, active::Union{Bool,Nothing}=nothing, ϵ=1f-5, momentum=0.1f0) if track_stats @@ -527,7 +527,7 @@ end μ, σ², ϵ, momentum, affine, track_stats, - _tidy_active(active), chs) + active, chs) end function (gn::GroupNorm)(x::AbstractArray) From aa38847274a3b43a7fb1caaac069d3a9dbeaf140 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sat, 1 Apr 2023 23:15:55 -0400 Subject: [PATCH 06/10] a bug --- src/functor.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/functor.jl b/src/functor.jl index 8331ad8e59..accfd879b8 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -49,7 +49,7 @@ trainmode!(m, ::Nothing) = testmode!(m, nothing) # why do we have so much API? testmode!(model, inactive) This two-argument method is largely internal. It recurses into the `model`, -and until a method like `testmode!(d::Dropout, mode)` alters the activity of a layer. +and until a method like `testmode!(d::Dropout, inactive)` alters the activity of a layer. Possible values of `inactive` are: - `true` for testing, i.e. `active=false` @@ -57,7 +57,7 @@ Possible values of `inactive` are: - `:auto` or `nothing` for Flux to detect training automatically. """ function testmode!(m, mode) - if inactive isa Symbol && mode !== :auto + if mode isa Symbol && mode !== :auto throw(ArgumentError("testmode! accepts only the symbol :auto, got :$mode")) end foreach(x -> testmode!(x, mode), trainable(m)) From fd59571ad5a20b07665c51fe2a90e899c685f905 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Tue, 4 Apr 2023 11:26:17 -0400 Subject: [PATCH 07/10] fix tests --- test/layers/basic.jl | 18 +++++++++--------- test/layers/normalisation.jl | 15 +++++---------- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/test/layers/basic.jl b/test/layers/basic.jl index 45e4750a6f..e143af8cf1 100644 --- a/test/layers/basic.jl +++ b/test/layers/basic.jl @@ -1,5 +1,5 @@ using Test, Random -import Flux: activations +using Flux: activations @testset "basic" begin @testset "helpers" begin @@ -16,11 +16,11 @@ import Flux: activations end @testset "Chain" begin - @test_nowarn Chain(Dense(10, 5, σ), Dense(5, 2))(randn(10)) - @test_throws DimensionMismatch Chain(Dense(10, 5, σ),Dense(2, 1))(randn(10)) + @test_nowarn Chain(Dense(10, 5, σ), Dense(5, 2))(randn32(10)) + @test_throws DimensionMismatch Chain(Dense(10, 5, σ),Dense(2, 1))(randn32(10)) # numeric test should be put into testset of corresponding layer - @test_nowarn Chain(first = Dense(10, 5, σ), second = Dense(5, 2))(randn(10)) + @test_nowarn Chain(first = Dense(10, 5, σ), second = Dense(5, 2))(randn32(10)) m = Chain(first = Dense(10, 5, σ), second = Dense(5, 2)) @test m[:first] == m[1] @test m[1:2] == m @@ -72,10 +72,10 @@ import Flux: activations @test_throws MethodError Dense(rand(5), rand(5), tanh) end @testset "dimensions" begin - @test length(Dense(10, 5)(randn(10))) == 5 - @test_throws DimensionMismatch Dense(10, 5)(randn(1)) - @test_throws MethodError Dense(10, 5)(1) # avoid broadcasting - @test_throws MethodError Dense(10, 5).(randn(10)) # avoid broadcasting + @test length(Dense(10 => 5)(randn32(10))) == 5 + @test_throws DimensionMismatch Dense(10 => 5)(randn32(1)) + @test_throws MethodError Dense(10 => 5)(1) # avoid broadcasting + @test_throws MethodError Dense(10 => 
5).(randn32(10)) # avoid broadcasting @test size(Dense(10, 5)(randn(10))) == (5,) @test size(Dense(10, 5)(randn(10,2))) == (5,2) @test size(Dense(10, 5)(randn(10,2,3))) == (5,2,3) @@ -333,7 +333,7 @@ import Flux: activations y = m(x) @test y isa Array{Float32, 3} @test size(y) == (embed_size, 3, 4) - x3 = onehotbatch(x, 1:1:vocab_size) + x3 = Flux.onehotbatch(x, 1:1:vocab_size) @test size(x3) == (vocab_size, 3, 4) y3 = m(x3) @test size(y3) == (embed_size, 3, 4) diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index 64051ac1fd..88cf5f078a 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -79,9 +79,8 @@ evalwgrad(f, x...) = pullback(f, x...)[1] end end - @test Dropout(0.5; active=:auto).active === nothing @test Dropout(0.5; active=true).active === true - @test_throws ArgumentError Dropout(0.5; active=:something_else) + @test_throws Exception Dropout(0.5; active=:something_else) end @testset "AlphaDropout" begin @@ -129,9 +128,8 @@ end end end - @test AlphaDropout(0.5; active=:auto).active === nothing @test AlphaDropout(0.5; active=true).active === true - @test_throws ArgumentError AlphaDropout(0.5; active=:something_else) + @test_throws Exception AlphaDropout(0.5; active=:something_else) end @testset "BatchNorm" begin @@ -225,9 +223,8 @@ end @test length(Flux.params(BatchNorm(10, affine=true))) == 2 @test length(Flux.params(BatchNorm(10, affine=false))) == 0 - @test BatchNorm(5; active=:auto).active === nothing @test BatchNorm(5; active=true).active === true - @test_throws ArgumentError BatchNorm(5; active=:something_else) + @test_throws Exception BatchNorm(5; active=:something_else) end @testset "InstanceNorm" begin @@ -360,9 +357,8 @@ end @test length(Flux.params(InstanceNorm(10, affine=true))) == 2 @test length(Flux.params(InstanceNorm(10, affine=false))) == 0 - @test InstanceNorm(5; active=:auto).active === nothing @test InstanceNorm(5; active=true).active === true - @test_throws ArgumentError InstanceNorm(5; active=:something_else) + @test_throws Exception InstanceNorm(5; active=:something_else) end @testset "LayerNorm" begin @@ -487,9 +483,8 @@ end @test BN(x) ≈ GN(x) end - @test GroupNorm(5, 5; active=:auto).active === nothing @test GroupNorm(5, 5; active=true).active === true - @test_throws ArgumentError GroupNorm(5, 5; active=:something_else) + @test_throws Exception GroupNorm(5, 5; active=:something_else) end @testset "second derivatives" begin From 5a09f69cfdbc00d360b1bd4d6a9249af6d34742a Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sun, 9 Apr 2023 07:55:02 -0400 Subject: [PATCH 08/10] Update test/layers/normalisation.jl Co-authored-by: Brian Chen --- test/layers/normalisation.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index 88cf5f078a..917ca20a17 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -29,7 +29,7 @@ evalwgrad(f, x...) = pullback(f, x...)[1] # Keyword active=false m2 = Dropout(0.9; active=false, rng_kwargs...) 
y2 = evalwgrad(m2, x) - @test count(a->a==0, y2) == 0 + @test count(iszero, y2) == 0 x = rand(Float32, 100) m = Chain(Dense(100,100), From 3720d9f77fb4c005110688e4f71494e9d79b652a Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sun, 9 Apr 2023 07:56:23 -0400 Subject: [PATCH 09/10] Update src/functor.jl --- src/functor.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/functor.jl b/src/functor.jl index accfd879b8..767e831dd0 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -29,7 +29,7 @@ Dropout(0.3, active=false) julia> trainmode!(d) # dropout is now always enabled Dropout(0.3, active=true) -julia> trainmode!(d, :auto) # back to default +julia> testmode!(d, :auto) # back to default Dropout(0.3) ``` """ From e2cef3dda0db1982518810347efd3b581a9a30ae Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Mon, 24 Apr 2023 20:07:13 -0400 Subject: [PATCH 10/10] extend docstrings & warnings --- src/functor.jl | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/functor.jl b/src/functor.jl index 767e831dd0..d0cb9c5d80 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -5,7 +5,7 @@ import Functors: Functors, @functor, functor, fmap, isleaf using SparseArrays: AbstractSparseArray """ - testmode!(m, [mode]) + testmode!(model, [mode]) -> model Set a layer, or all layers in a model, to test mode. This disables the effect of [`Dropout`](@ref) and @@ -36,7 +36,7 @@ Dropout(0.3) testmode!(m) = testmode!(m, true) """ - trainmode!(m) + trainmode!(model) -> model Set a layer, or all layers in a model, to training mode. Opposite to [`testmode!`](@ref), see further details there. @@ -50,17 +50,28 @@ trainmode!(m, ::Nothing) = testmode!(m, nothing) # why do we have so much API? This two-argument method is largely internal. It recurses into the `model`, and until a method like `testmode!(d::Dropout, inactive)` alters the activity of a layer. +Custom layers can support manual `testmode!` / `trainmode!` switching +by defining such a method. Possible values of `inactive` are: - `true` for testing, i.e. `active=false` - `false` for training, same as [`trainmode!`](@ref)`(m)` - `:auto` or `nothing` for Flux to detect training automatically. + +!!! compat + This method may be removed in a future breaking change, to separate + the user-facing `testmode!` from the internal recursion. """ function testmode!(m, mode) - if mode isa Symbol && mode !== :auto - throw(ArgumentError("testmode! accepts only the symbol :auto, got :$mode")) + inactive = if mode isa Symbol + mode === :auto || throw(ArgumentError("testmode! accepts only the symbol :auto, got :$mode")) + nothing + elseif mode isa Union{Bool,Nothing} + mode + else + throw(ArgumentError("testmode! does not accept $(repr(mode)) as the 2nd argument")) end - foreach(x -> testmode!(x, mode), trainable(m)) + foreach(x -> testmode!(x, inactive), trainable(m)) m end
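
Taken together, this series adds an `active` keyword to `Dropout`, `AlphaDropout`, `BatchNorm`, `InstanceNorm` and `GroupNorm`, prints the forced state in `show`, and tightens `testmode!` / `trainmode!` so that `:auto` (or `nothing`) restores automatic switching. A minimal usage sketch, assuming a Flux build with these patches applied; the states noted in the comments follow the behaviour implemented above, and the snippet is illustrative rather than a doctest:

```julia
using Flux

d = Dropout(0.4)       # active == nothing: dropout only applies inside gradient calls
testmode!(d)           # forces active = false; now shows as Dropout(0.4, active=false)
trainmode!(d)          # forces active = true;  now shows as Dropout(0.4, active=true)
testmode!(d, :auto)    # back to automatic mode; shows as Dropout(0.4) again

# The same state can be fixed at construction time via the new keyword:
m = Chain(Dense(2 => 3), Dropout(0.4; active=true))
y = m(ones(Float32, 2, 7))   # dropout is applied even outside a gradient call

# The keyword accepts only true, false or nothing; :auto is reserved for
# testmode!/trainmode!, and any other value raises an error (see the added tests).
```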