From 49d13030ab86a16586e7565af0007eb405480dee Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Fri, 31 Mar 2023 01:15:33 -0400 Subject: [PATCH 01/10] print the state of Dropout etc. --- src/functor.jl | 34 +++++++++++++++++++------- src/layers/normalise.jl | 54 ++++++++++++++++++++++++++--------------- 2 files changed, 59 insertions(+), 29 deletions(-) diff --git a/src/functor.jl b/src/functor.jl index e04a9b1fbd..9fc997d56b 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -7,32 +7,48 @@ using SparseArrays: AbstractSparseArray """ testmode!(m, mode = true) -Set a layer or model's test mode (see below). -Using `:auto` mode will treat any gradient computation as training. +Set a layer, or all layers in a model, to test mode. +This disables the effect of [`Dropout`](@ref), and similar layers. _Note_: if you manually set a model into test mode, you need to manually place it back into train mode during training phase. -Possible values include: -- `false` for training +Possible values of optional 2nd argument `mode` are: - `true` for testing -- `:auto` or `nothing` for Flux to detect the mode automatically +- `false` for training, same as [`trainmode!`](@ref) +- `:auto` or `nothing` for Flux to detect training automatically. + +# Example + +```jldoctest +julia> d = Dropout(0.3) +Dropout(0.3) + +julia> testmode!(d) # dropout is now always disabled +Dropout(0.3, active=false) + +julia> trainmode!(d) # dropout is now always enabled +Dropout(0.3, active=true) + +julia> trainmode!(d, :auto) # back to default +Dropout(0.3) +``` """ testmode!(m, mode = true) = (foreach(x -> testmode!(x, mode), trainable(m)); m) """ trainmode!(m, mode = true) -Set a layer of model's train mode (see below). -Symmetric to [`testmode!`](@ref) (i.e. `trainmode!(m, mode) == testmode!(m, !mode)`). +Set a layer, or all layers in a model, to training mode. +Opposite to [`testmode!`](@ref) (i.e. `trainmode!(m, mode) == testmode!(m, !mode)`). _Note_: if you manually set a model into train mode, you need to manually place it into test mode during testing phase. -Possible values include: +Possible values of optional 2nd argument `mode` are: - `true` for training - `false` for testing -- `:auto` or `nothing` for Flux to detect the mode automatically +- `:auto` or `nothing` for Flux to detect training automatically """ trainmode!(m, mode = true) = mode isa Bool ? testmode!(m, !mode) : testmode!(m, mode) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 9af9044c7b..e3f3d90f36 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -1,8 +1,13 @@ # Internal function, used only for layers defined in this file. _isactive(m, x) = isnothing(m.active) ? NNlib.within_gradient(x) : m.active +# Internal function, used only in this file. +_tidy_active(mode::Bool) = mode +_tidy_active(::Nothing) = nothing +_tidy_active(mode) = mode === :auto ? nothing : throw(ArgumentError("active = $(repr(mode)) is not accepted, must be true/false/nothing or :auto")) + """ - Dropout(p; [dims, rng]) + Dropout(p; [dims, rng, active]) Layer implementing [dropout](https://arxiv.org/abs/1207.0580) with the given probability. This is used as a regularisation, i.e. to reduce overfitting. @@ -12,7 +17,8 @@ or else scales it by `1 / (1 - p)`, using the [`NNlib.dropout`](@ref) function. While testing, it has no effect. By default the mode will switch automatically, but it can also -be controlled manually via [`Flux.testmode!`](@ref). 
+be controlled manually via [`Flux.testmode!`](@ref), +or by passing keyword `active=true` for training mode. By default every input is treated independently. With the `dims` keyword, instead it takes a random choice only along that dimension. @@ -36,7 +42,11 @@ julia> m(ones(2, 7)) # test mode, no effect 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 -julia> Flux.trainmode!(m); # equivalent to use within gradient +julia> Flux.trainmode!(m) # equivalent to use within gradient +Chain( + Dense(2 => 3), # 9 parameters + Dropout(0.4, active=true), +) julia> m(ones(2, 7)) 3×7 Matrix{Float64}: @@ -63,9 +73,9 @@ mutable struct Dropout{F<:Real,D,R<:AbstractRNG} end Dropout(p::Real, dims, active) = Dropout(p, dims, active, default_rng_value()) -function Dropout(p::Real; dims=:, rng = default_rng_value()) +function Dropout(p::Real; dims=:, active = nothing, rng = default_rng_value()) 0 ≤ p ≤ 1 || throw(ArgumentError("Dropout expects 0 ≤ p ≤ 1, got p = $p")) - Dropout(p, dims, nothing, rng) + Dropout(p, dims, _tidy_active(active), rng) end @functor Dropout @@ -74,16 +84,17 @@ trainable(a::Dropout) = (;) (a::Dropout)(x) = dropout(a.rng, x, a.p * _isactive(a, x); dims=a.dims) testmode!(m::Dropout, mode=true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) + (m.active = isnothing(_tidy_active(mode)) ? nothing : !mode; m) function Base.show(io::IO, d::Dropout) print(io, "Dropout(", d.p) - d.dims != (:) && print(io, ", dims = $(repr(d.dims))") + d.dims != (:) && print(io, ", dims=", d.dims) + d.active == nothing || print(io, ", active=", d.active) print(io, ")") end """ - AlphaDropout(p; rng = default_rng_value()) + AlphaDropout(p; [rng, active]) A dropout layer. Used in [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515). @@ -114,11 +125,11 @@ mutable struct AlphaDropout{F,R<:AbstractRNG} rng::R function AlphaDropout(p, active, rng) @assert 0 ≤ p ≤ 1 - new{typeof(p), typeof(rng)}(p, active, rng) + new{typeof(p), typeof(rng)}(p, _tidy_active(active), rng) end end AlphaDropout(p, active) = AlphaDropout(p, active, default_rng_value()) -AlphaDropout(p; rng = default_rng_value()) = AlphaDropout(p, nothing, rng) +AlphaDropout(p; rng = default_rng_value(), active=nothing) = AlphaDropout(p, active, rng) @functor AlphaDropout trainable(a::AlphaDropout) = (;) @@ -138,7 +149,7 @@ function (a::AlphaDropout)(x::AbstractArray{T}) where T end testmode!(m::AlphaDropout, mode=true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) + (m.active = isnothing(_tidy_active(mode)) ? nothing : !mode; m) """ LayerNorm(size..., λ=identity; affine=true, ϵ=1fe-5) @@ -310,7 +321,7 @@ end function BatchNorm(chs::Int, λ=identity; initβ=zeros32, initγ=ones32, - affine=true, track_stats=true, + affine=true, track_stats=true, active=nothing, ϵ=1f-5, momentum=0.1f0) β = affine ? initβ(chs) : nothing @@ -321,7 +332,7 @@ function BatchNorm(chs::Int, λ=identity; return BatchNorm(λ, β, γ, μ, σ², ϵ, momentum, affine, track_stats, - nothing, chs) + _tidy_active(active), chs) end @functor BatchNorm @@ -335,12 +346,13 @@ function (BN::BatchNorm)(x::AbstractArray{T,N}) where {T,N} end testmode!(m::BatchNorm, mode=true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) + (m.active = isnothing(_tidy_active(mode)) ? 
nothing : !mode; m) function Base.show(io::IO, l::BatchNorm) print(io, "BatchNorm($(l.chs)") (l.λ == identity) || print(io, ", $(l.λ)") hasaffine(l) || print(io, ", affine=false") + l.active == nothing || print(io, ", active=", l.active) print(io, ")") end @@ -399,7 +411,7 @@ end function InstanceNorm(chs::Int, λ=identity; initβ=zeros32, initγ=ones32, - affine=false, track_stats=false, + affine=false, track_stats=false, active=nothing, ϵ=1f-5, momentum=0.1f0) β = affine ? initβ(chs) : nothing @@ -410,7 +422,7 @@ function InstanceNorm(chs::Int, λ=identity; return InstanceNorm(λ, β, γ, μ, σ², ϵ, momentum, affine, track_stats, - nothing, chs) + _tidy_active(active), chs) end @functor InstanceNorm @@ -424,12 +436,13 @@ function (l::InstanceNorm)(x::AbstractArray{T,N}) where {T,N} end testmode!(m::InstanceNorm, mode=true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) + (m.active = isnothing(_tidy_active(mode)) ? nothing : !mode; m) function Base.show(io::IO, l::InstanceNorm) print(io, "InstanceNorm($(l.chs)") l.λ == identity || print(io, ", $(l.λ)") hasaffine(l) || print(io, ", affine=false") + l.active == nothing || print(io, ", active=", l.active) print(io, ")") end @@ -495,7 +508,7 @@ trainable(gn::GroupNorm) = hasaffine(gn) ? (β = gn.β, γ = gn.γ) : (;) function GroupNorm(chs::Int, G::Int, λ=identity; initβ=zeros32, initγ=ones32, - affine=true, track_stats=false, + affine=true, track_stats=false, active=nothing, ϵ=1f-5, momentum=0.1f0) if track_stats @@ -514,7 +527,7 @@ end μ, σ², ϵ, momentum, affine, track_stats, - nothing, chs) + _tidy_active(active), chs) end function (gn::GroupNorm)(x::AbstractArray) @@ -529,13 +542,14 @@ function (gn::GroupNorm)(x::AbstractArray) end testmode!(m::GroupNorm, mode = true) = - (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m) + (m.active = isnothing(_tidy_active(mode)) ? nothing : !mode; m) function Base.show(io::IO, l::GroupNorm) # print(io, "GroupNorm($(join(size(l.β), ", "))", ", ", l.G) print(io, "GroupNorm($(l.chs), $(l.G)") l.λ == identity || print(io, ", ", l.λ) hasaffine(l) || print(io, ", affine=false") + l.active == nothing || print(io, ", active=", l.active) print(io, ")") end From 061440ebfb530dec504431c4b0d9164cc8dcdf84 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Fri, 31 Mar 2023 17:46:03 -0400 Subject: [PATCH 02/10] add tests --- test/layers/normalisation.jl | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index e21c307901..64051ac1fd 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -26,6 +26,11 @@ evalwgrad(f, x...) = pullback(f, x...)[1] y = evalwgrad(m, x) @test count(a->a==0, y) > 50 + # Keyword active=false + m2 = Dropout(0.9; active=false, rng_kwargs...) + y2 = evalwgrad(m2, x) + @test count(a->a==0, y2) == 0 + x = rand(Float32, 100) m = Chain(Dense(100,100), Dropout(0.9; rng_kwargs...)) @@ -73,6 +78,10 @@ evalwgrad(f, x...) 
= pullback(f, x...)[1] @test cpu(m).rng === only(values(rng_kwargs)) end end + + @test Dropout(0.5; active=:auto).active === nothing + @test Dropout(0.5; active=true).active === true + @test_throws ArgumentError Dropout(0.5; active=:something_else) end @testset "AlphaDropout" begin @@ -119,6 +128,10 @@ end @test cpu(m).rng === only(values(rng_kwargs)) end end + + @test AlphaDropout(0.5; active=:auto).active === nothing + @test AlphaDropout(0.5; active=true).active === true + @test_throws ArgumentError AlphaDropout(0.5; active=:something_else) end @testset "BatchNorm" begin @@ -211,6 +224,10 @@ end @test length(Flux.params(BatchNorm(10))) == 2 @test length(Flux.params(BatchNorm(10, affine=true))) == 2 @test length(Flux.params(BatchNorm(10, affine=false))) == 0 + + @test BatchNorm(5; active=:auto).active === nothing + @test BatchNorm(5; active=true).active === true + @test_throws ArgumentError BatchNorm(5; active=:something_else) end @testset "InstanceNorm" begin @@ -342,6 +359,10 @@ end @test length(Flux.params(InstanceNorm(10))) == 0 @test length(Flux.params(InstanceNorm(10, affine=true))) == 2 @test length(Flux.params(InstanceNorm(10, affine=false))) == 0 + + @test InstanceNorm(5; active=:auto).active === nothing + @test InstanceNorm(5; active=true).active === true + @test_throws ArgumentError InstanceNorm(5; active=:something_else) end @testset "LayerNorm" begin @@ -465,6 +486,10 @@ end x = Float32.(reshape(collect(1:prod(sizes)), sizes)) @test BN(x) ≈ GN(x) end + + @test GroupNorm(5, 5; active=:auto).active === nothing + @test GroupNorm(5, 5; active=true).active === true + @test_throws ArgumentError GroupNorm(5, 5; active=:something_else) end @testset "second derivatives" begin From 1153f5794bed4ad3563dd503498008e4a2911a3c Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Fri, 31 Mar 2023 17:48:41 -0400 Subject: [PATCH 03/10] doc improvements --- src/functor.jl | 16 ++++++++-------- src/layers/normalise.jl | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/functor.jl b/src/functor.jl index 9fc997d56b..a3af614d41 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -5,7 +5,7 @@ import Functors: Functors, @functor, functor, fmap, isleaf using SparseArrays: AbstractSparseArray """ - testmode!(m, mode = true) + testmode!(m, inactive = true) Set a layer, or all layers in a model, to test mode. This disables the effect of [`Dropout`](@ref), and similar layers. @@ -13,9 +13,9 @@ This disables the effect of [`Dropout`](@ref), and similar layers. _Note_: if you manually set a model into test mode, you need to manually place it back into train mode during training phase. -Possible values of optional 2nd argument `mode` are: +Possible values of optional 2nd argument `inactive` are: - `true` for testing -- `false` for training, same as [`trainmode!`](@ref) +- `false` for training, same as [`trainmode!`](@ref)`(m)` - `:auto` or `nothing` for Flux to detect training automatically. # Example @@ -34,23 +34,23 @@ julia> trainmode!(d, :auto) # back to default Dropout(0.3) ``` """ -testmode!(m, mode = true) = (foreach(x -> testmode!(x, mode), trainable(m)); m) +testmode!(m, inactive = true) = (foreach(x -> testmode!(x, inactive), trainable(m)); m) """ - trainmode!(m, mode = true) + trainmode!(m, active = true) Set a layer, or all layers in a model, to training mode. -Opposite to [`testmode!`](@ref) (i.e. `trainmode!(m, mode) == testmode!(m, !mode)`). +Opposite to [`testmode!`](@ref) (i.e. `trainmode!(m, active) == testmode!(m, !active)`). 
_Note_: if you manually set a model into train mode, you need to manually place it into test mode during testing phase. -Possible values of optional 2nd argument `mode` are: +Possible values of optional 2nd argument `active` are: - `true` for training - `false` for testing - `:auto` or `nothing` for Flux to detect training automatically """ -trainmode!(m, mode = true) = mode isa Bool ? testmode!(m, !mode) : testmode!(m, mode) +trainmode!(m, active = true) = active isa Bool ? testmode!(m, !active) : testmode!(m, active) function params!(p::Params, x, seen = IdSet()) if x isa AbstractArray{<:Number} && Functors.isleaf(x) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index e3f3d90f36..7ca65b4ab9 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -268,7 +268,7 @@ ChainRulesCore.@non_differentiable _track_stats!(::Any...) """ BatchNorm(channels::Integer, λ=identity; initβ=zeros32, initγ=ones32, - affine = true, track_stats = true, + affine=true, track_stats=true, active=:auto, ϵ=1f-5, momentum= 0.1f0) [Batch Normalization](https://arxiv.org/abs/1502.03167) layer. From a22d7d00a0e8c49b552a7f32bc9aa5006c93906e Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sat, 1 Apr 2023 23:10:21 -0400 Subject: [PATCH 04/10] simpler scheme for testmode/trainmode --- docs/src/models/layers.md | 3 ++- src/deprecations.jl | 17 ++++++++++++++ src/functor.jl | 48 ++++++++++++++++++++++++--------------- 3 files changed, 49 insertions(+), 19 deletions(-) diff --git a/docs/src/models/layers.md b/docs/src/models/layers.md index 6f5f3978cb..93a58a87fd 100644 --- a/docs/src/models/layers.md +++ b/docs/src/models/layers.md @@ -145,6 +145,7 @@ Several normalisation layers behave differently under training and inference (te The functions `Flux.trainmode!` and `Flux.testmode!` let you manually specify which behaviour you want. When called on a model, they will place all layers within the model into the specified mode. ```@docs -Flux.testmode! +testmode!(::Any) +testmode!(::Any, ::Any) trainmode! ``` diff --git a/src/deprecations.jl b/src/deprecations.jl index e5bc552946..627e5bd5b8 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -187,6 +187,23 @@ function update!(opt::Optimise.AbstractOptimiser, ::Params, grads::Union{Tuple, """) end +""" + trainmode!(m, active) + +!!! warning + This two-argument method is deprecated. + +Possible values of `active` are: +- `true` for training, or +- `false` for testing, same as [`testmode!`](@ref)`(m)` +- `:auto` or `nothing` for Flux to detect training automatically. +""" +function trainmode!(m, active::Bool) + Base.depwarn("trainmode!(m, active::Bool) is deprecated", :trainmode) + testmode!(m, !active) +end + + # v0.14 deprecations # Enable these when 0.14 is released, and delete const ClipGrad = Optimise.ClipValue etc: diff --git a/src/functor.jl b/src/functor.jl index a3af614d41..8331ad8e59 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -5,18 +5,17 @@ import Functors: Functors, @functor, functor, fmap, isleaf using SparseArrays: AbstractSparseArray """ - testmode!(m, inactive = true) + testmode!(m, [mode]) Set a layer, or all layers in a model, to test mode. -This disables the effect of [`Dropout`](@ref), and similar layers. +This disables the effect of [`Dropout`](@ref) and +some other regularisation layers. -_Note_: if you manually set a model into test mode, you need to manually place -it back into train mode during training phase. 
+If you manually set a model into test mode, you need to manually place +it back into train mode during training phase, using [`trainmode!`](@ref). -Possible values of optional 2nd argument `inactive` are: -- `true` for testing -- `false` for training, same as [`trainmode!`](@ref)`(m)` -- `:auto` or `nothing` for Flux to detect training automatically. +There is an optional second argument, which takes a symbol `:auto` to +reset all layers back to the default automatic mode. # Example @@ -34,23 +33,36 @@ julia> trainmode!(d, :auto) # back to default Dropout(0.3) ``` """ -testmode!(m, inactive = true) = (foreach(x -> testmode!(x, inactive), trainable(m)); m) +testmode!(m) = testmode!(m, true) """ - trainmode!(m, active = true) + trainmode!(m) Set a layer, or all layers in a model, to training mode. -Opposite to [`testmode!`](@ref) (i.e. `trainmode!(m, active) == testmode!(m, !active)`). +Opposite to [`testmode!`](@ref), see further details there. +""" +trainmode!(m) = testmode!(m, false) +trainmode!(m, mode::Symbol) = testmode!(m, mode) +trainmode!(m, ::Nothing) = testmode!(m, nothing) # why do we have so much API? + +""" + testmode!(model, inactive) -_Note_: if you manually set a model into train mode, you need to manually place -it into test mode during testing phase. +This two-argument method is largely internal. It recurses into the `model`, +and until a method like `testmode!(d::Dropout, mode)` alters the activity of a layer. -Possible values of optional 2nd argument `active` are: -- `true` for training -- `false` for testing -- `:auto` or `nothing` for Flux to detect training automatically +Possible values of `inactive` are: +- `true` for testing, i.e. `active=false` +- `false` for training, same as [`trainmode!`](@ref)`(m)` +- `:auto` or `nothing` for Flux to detect training automatically. """ -trainmode!(m, active = true) = active isa Bool ? testmode!(m, !active) : testmode!(m, active) +function testmode!(m, mode) + if inactive isa Symbol && mode !== :auto + throw(ArgumentError("testmode! 
accepts only the symbol :auto, got :$mode")) + end + foreach(x -> testmode!(x, mode), trainable(m)) + m +end function params!(p::Params, x, seen = IdSet()) if x isa AbstractArray{<:Number} && Functors.isleaf(x) From a36cd594bee55b3489123d54971dab4a463254e6 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sat, 1 Apr 2023 23:14:20 -0400 Subject: [PATCH 05/10] simplify active keyword a bit --- src/layers/normalise.jl | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/layers/normalise.jl b/src/layers/normalise.jl index 7ca65b4ab9..3794576e76 100644 --- a/src/layers/normalise.jl +++ b/src/layers/normalise.jl @@ -73,9 +73,9 @@ mutable struct Dropout{F<:Real,D,R<:AbstractRNG} end Dropout(p::Real, dims, active) = Dropout(p, dims, active, default_rng_value()) -function Dropout(p::Real; dims=:, active = nothing, rng = default_rng_value()) +function Dropout(p::Real; dims=:, active::Union{Bool,Nothing} = nothing, rng = default_rng_value()) 0 ≤ p ≤ 1 || throw(ArgumentError("Dropout expects 0 ≤ p ≤ 1, got p = $p")) - Dropout(p, dims, _tidy_active(active), rng) + Dropout(p, dims, active, rng) end @functor Dropout @@ -123,13 +123,13 @@ mutable struct AlphaDropout{F,R<:AbstractRNG} p::F active::Union{Bool, Nothing} rng::R - function AlphaDropout(p, active, rng) - @assert 0 ≤ p ≤ 1 - new{typeof(p), typeof(rng)}(p, _tidy_active(active), rng) - end end + AlphaDropout(p, active) = AlphaDropout(p, active, default_rng_value()) -AlphaDropout(p; rng = default_rng_value(), active=nothing) = AlphaDropout(p, active, rng) +function AlphaDropout(p; rng = default_rng_value(), active::Union{Bool,Nothing} = nothing) + 0 ≤ p ≤ 1 || throw(ArgumentError("AlphaDropout expects 0 ≤ p ≤ 1, got p = $p")) + AlphaDropout(p, active, rng) +end @functor AlphaDropout trainable(a::AlphaDropout) = (;) @@ -268,7 +268,7 @@ ChainRulesCore.@non_differentiable _track_stats!(::Any...) """ BatchNorm(channels::Integer, λ=identity; initβ=zeros32, initγ=ones32, - affine=true, track_stats=true, active=:auto, + affine=true, track_stats=true, active=nothing, ϵ=1f-5, momentum= 0.1f0) [Batch Normalization](https://arxiv.org/abs/1502.03167) layer. @@ -321,7 +321,7 @@ end function BatchNorm(chs::Int, λ=identity; initβ=zeros32, initγ=ones32, - affine=true, track_stats=true, active=nothing, + affine=true, track_stats=true, active::Union{Bool,Nothing}=nothing, ϵ=1f-5, momentum=0.1f0) β = affine ? initβ(chs) : nothing @@ -332,7 +332,7 @@ function BatchNorm(chs::Int, λ=identity; return BatchNorm(λ, β, γ, μ, σ², ϵ, momentum, affine, track_stats, - _tidy_active(active), chs) + active, chs) end @functor BatchNorm @@ -411,7 +411,7 @@ end function InstanceNorm(chs::Int, λ=identity; initβ=zeros32, initγ=ones32, - affine=false, track_stats=false, active=nothing, + affine=false, track_stats=false, active::Union{Bool,Nothing}=nothing, ϵ=1f-5, momentum=0.1f0) β = affine ? initβ(chs) : nothing @@ -422,7 +422,7 @@ function InstanceNorm(chs::Int, λ=identity; return InstanceNorm(λ, β, γ, μ, σ², ϵ, momentum, affine, track_stats, - _tidy_active(active), chs) + active, chs) end @functor InstanceNorm @@ -508,7 +508,7 @@ trainable(gn::GroupNorm) = hasaffine(gn) ? 
(β = gn.β, γ = gn.γ) : (;) function GroupNorm(chs::Int, G::Int, λ=identity; initβ=zeros32, initγ=ones32, - affine=true, track_stats=false, active=nothing, + affine=true, track_stats=false, active::Union{Bool,Nothing}=nothing, ϵ=1f-5, momentum=0.1f0) if track_stats @@ -527,7 +527,7 @@ end μ, σ², ϵ, momentum, affine, track_stats, - _tidy_active(active), chs) + active, chs) end function (gn::GroupNorm)(x::AbstractArray) From aa38847274a3b43a7fb1caaac069d3a9dbeaf140 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sat, 1 Apr 2023 23:15:55 -0400 Subject: [PATCH 06/10] a bug --- src/functor.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/functor.jl b/src/functor.jl index 8331ad8e59..accfd879b8 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -49,7 +49,7 @@ trainmode!(m, ::Nothing) = testmode!(m, nothing) # why do we have so much API? testmode!(model, inactive) This two-argument method is largely internal. It recurses into the `model`, -and until a method like `testmode!(d::Dropout, mode)` alters the activity of a layer. +and until a method like `testmode!(d::Dropout, inactive)` alters the activity of a layer. Possible values of `inactive` are: - `true` for testing, i.e. `active=false` @@ -57,7 +57,7 @@ Possible values of `inactive` are: - `:auto` or `nothing` for Flux to detect training automatically. """ function testmode!(m, mode) - if inactive isa Symbol && mode !== :auto + if mode isa Symbol && mode !== :auto throw(ArgumentError("testmode! accepts only the symbol :auto, got :$mode")) end foreach(x -> testmode!(x, mode), trainable(m)) From fd59571ad5a20b07665c51fe2a90e899c685f905 Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Tue, 4 Apr 2023 11:26:17 -0400 Subject: [PATCH 07/10] fix tests --- test/layers/basic.jl | 18 +++++++++--------- test/layers/normalisation.jl | 15 +++++---------- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/test/layers/basic.jl b/test/layers/basic.jl index 45e4750a6f..e143af8cf1 100644 --- a/test/layers/basic.jl +++ b/test/layers/basic.jl @@ -1,5 +1,5 @@ using Test, Random -import Flux: activations +using Flux: activations @testset "basic" begin @testset "helpers" begin @@ -16,11 +16,11 @@ import Flux: activations end @testset "Chain" begin - @test_nowarn Chain(Dense(10, 5, σ), Dense(5, 2))(randn(10)) - @test_throws DimensionMismatch Chain(Dense(10, 5, σ),Dense(2, 1))(randn(10)) + @test_nowarn Chain(Dense(10, 5, σ), Dense(5, 2))(randn32(10)) + @test_throws DimensionMismatch Chain(Dense(10, 5, σ),Dense(2, 1))(randn32(10)) # numeric test should be put into testset of corresponding layer - @test_nowarn Chain(first = Dense(10, 5, σ), second = Dense(5, 2))(randn(10)) + @test_nowarn Chain(first = Dense(10, 5, σ), second = Dense(5, 2))(randn32(10)) m = Chain(first = Dense(10, 5, σ), second = Dense(5, 2)) @test m[:first] == m[1] @test m[1:2] == m @@ -72,10 +72,10 @@ import Flux: activations @test_throws MethodError Dense(rand(5), rand(5), tanh) end @testset "dimensions" begin - @test length(Dense(10, 5)(randn(10))) == 5 - @test_throws DimensionMismatch Dense(10, 5)(randn(1)) - @test_throws MethodError Dense(10, 5)(1) # avoid broadcasting - @test_throws MethodError Dense(10, 5).(randn(10)) # avoid broadcasting + @test length(Dense(10 => 5)(randn32(10))) == 5 + @test_throws DimensionMismatch Dense(10 => 5)(randn32(1)) + @test_throws MethodError Dense(10 => 5)(1) # avoid broadcasting + @test_throws MethodError Dense(10 => 
5).(randn32(10)) # avoid broadcasting @test size(Dense(10, 5)(randn(10))) == (5,) @test size(Dense(10, 5)(randn(10,2))) == (5,2) @test size(Dense(10, 5)(randn(10,2,3))) == (5,2,3) @@ -333,7 +333,7 @@ import Flux: activations y = m(x) @test y isa Array{Float32, 3} @test size(y) == (embed_size, 3, 4) - x3 = onehotbatch(x, 1:1:vocab_size) + x3 = Flux.onehotbatch(x, 1:1:vocab_size) @test size(x3) == (vocab_size, 3, 4) y3 = m(x3) @test size(y3) == (embed_size, 3, 4) diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index 64051ac1fd..88cf5f078a 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -79,9 +79,8 @@ evalwgrad(f, x...) = pullback(f, x...)[1] end end - @test Dropout(0.5; active=:auto).active === nothing @test Dropout(0.5; active=true).active === true - @test_throws ArgumentError Dropout(0.5; active=:something_else) + @test_throws Exception Dropout(0.5; active=:something_else) end @testset "AlphaDropout" begin @@ -129,9 +128,8 @@ end end end - @test AlphaDropout(0.5; active=:auto).active === nothing @test AlphaDropout(0.5; active=true).active === true - @test_throws ArgumentError AlphaDropout(0.5; active=:something_else) + @test_throws Exception AlphaDropout(0.5; active=:something_else) end @testset "BatchNorm" begin @@ -225,9 +223,8 @@ end @test length(Flux.params(BatchNorm(10, affine=true))) == 2 @test length(Flux.params(BatchNorm(10, affine=false))) == 0 - @test BatchNorm(5; active=:auto).active === nothing @test BatchNorm(5; active=true).active === true - @test_throws ArgumentError BatchNorm(5; active=:something_else) + @test_throws Exception BatchNorm(5; active=:something_else) end @testset "InstanceNorm" begin @@ -360,9 +357,8 @@ end @test length(Flux.params(InstanceNorm(10, affine=true))) == 2 @test length(Flux.params(InstanceNorm(10, affine=false))) == 0 - @test InstanceNorm(5; active=:auto).active === nothing @test InstanceNorm(5; active=true).active === true - @test_throws ArgumentError InstanceNorm(5; active=:something_else) + @test_throws Exception InstanceNorm(5; active=:something_else) end @testset "LayerNorm" begin @@ -487,9 +483,8 @@ end @test BN(x) ≈ GN(x) end - @test GroupNorm(5, 5; active=:auto).active === nothing @test GroupNorm(5, 5; active=true).active === true - @test_throws ArgumentError GroupNorm(5, 5; active=:something_else) + @test_throws Exception GroupNorm(5, 5; active=:something_else) end @testset "second derivatives" begin From 5a09f69cfdbc00d360b1bd4d6a9249af6d34742a Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sun, 9 Apr 2023 07:55:02 -0400 Subject: [PATCH 08/10] Update test/layers/normalisation.jl Co-authored-by: Brian Chen --- test/layers/normalisation.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index 88cf5f078a..917ca20a17 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -29,7 +29,7 @@ evalwgrad(f, x...) = pullback(f, x...)[1] # Keyword active=false m2 = Dropout(0.9; active=false, rng_kwargs...) 
y2 = evalwgrad(m2, x) - @test count(a->a==0, y2) == 0 + @test count(iszero, y2) == 0 x = rand(Float32, 100) m = Chain(Dense(100,100), From 3720d9f77fb4c005110688e4f71494e9d79b652a Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Sun, 9 Apr 2023 07:56:23 -0400 Subject: [PATCH 09/10] Update src/functor.jl --- src/functor.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/functor.jl b/src/functor.jl index accfd879b8..767e831dd0 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -29,7 +29,7 @@ Dropout(0.3, active=false) julia> trainmode!(d) # dropout is now always enabled Dropout(0.3, active=true) -julia> trainmode!(d, :auto) # back to default +julia> testmode!(d, :auto) # back to default Dropout(0.3) ``` """ From e2cef3dda0db1982518810347efd3b581a9a30ae Mon Sep 17 00:00:00 2001 From: Michael Abbott <32575566+mcabbott@users.noreply.github.com> Date: Mon, 24 Apr 2023 20:07:13 -0400 Subject: [PATCH 10/10] extend docstrings & warnings --- src/functor.jl | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/functor.jl b/src/functor.jl index 767e831dd0..d0cb9c5d80 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -5,7 +5,7 @@ import Functors: Functors, @functor, functor, fmap, isleaf using SparseArrays: AbstractSparseArray """ - testmode!(m, [mode]) + testmode!(model, [mode]) -> model Set a layer, or all layers in a model, to test mode. This disables the effect of [`Dropout`](@ref) and @@ -36,7 +36,7 @@ Dropout(0.3) testmode!(m) = testmode!(m, true) """ - trainmode!(m) + trainmode!(model) -> model Set a layer, or all layers in a model, to training mode. Opposite to [`testmode!`](@ref), see further details there. @@ -50,17 +50,28 @@ trainmode!(m, ::Nothing) = testmode!(m, nothing) # why do we have so much API? This two-argument method is largely internal. It recurses into the `model`, and until a method like `testmode!(d::Dropout, inactive)` alters the activity of a layer. +Custom layers can support manual `testmode!` / `trainmode!` switching +by defining such a method. Possible values of `inactive` are: - `true` for testing, i.e. `active=false` - `false` for training, same as [`trainmode!`](@ref)`(m)` - `:auto` or `nothing` for Flux to detect training automatically. + +!!! compat + This method may be removed in a future breaking change, to separate + the user-facing `testmode!` from the internal recursion. """ function testmode!(m, mode) - if mode isa Symbol && mode !== :auto - throw(ArgumentError("testmode! accepts only the symbol :auto, got :$mode")) + inactive = if mode isa Symbol + mode === :auto || throw(ArgumentError("testmode! accepts only the symbol :auto, got :$mode")) + nothing + elseif mode isa Union{Bool,Nothing} + mode + else + throw(ArgumentError("testmode! does not accept $(repr(mode)) as the 2nd argument")) end - foreach(x -> testmode!(x, mode), trainable(m)) + foreach(x -> testmode!(x, inactive), trainable(m)) m end
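
Taken together, this series adds an `active` keyword to `Dropout`, `AlphaDropout`, `BatchNorm`, `InstanceNorm` and `GroupNorm`, prints the forced state in `show`, and tightens `testmode!` / `trainmode!` so that `:auto` (or `nothing`) restores automatic switching. A minimal usage sketch, assuming a Flux build with these patches applied; the states noted in the comments follow the behaviour implemented above, and the snippet is illustrative rather than a doctest:

```julia
using Flux

d = Dropout(0.4)       # active == nothing: dropout only applies inside gradient calls
testmode!(d)           # forces active = false; now shows as Dropout(0.4, active=false)
trainmode!(d)          # forces active = true;  now shows as Dropout(0.4, active=true)
testmode!(d, :auto)    # back to automatic mode; shows as Dropout(0.4) again

# The same state can be fixed at construction time via the new keyword:
m = Chain(Dense(2 => 3), Dropout(0.4; active=true))
y = m(ones(Float32, 2, 7))   # dropout is applied even outside a gradient call

# The keyword accepts only true, false or nothing; :auto is reserved for
# testmode!/trainmode!, and any other value raises an error (see the added tests).
```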