From 1fedc0d80e4f06a0d1c2fb5145640a8b4796f407 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Sun, 17 Nov 2024 21:06:14 +0100 Subject: [PATCH] deprecation of params and Optimise (continued) (#2526) --- .buildkite/pipeline.yml | 3 + .gitignore | 1 + Project.toml | 2 +- ext/FluxEnzymeExt/FluxEnzymeExt.jl | 1 - src/Flux.jl | 99 +++++++---------- src/deprecations.jl | 171 +++++++++-------------------- src/functor.jl | 66 ++--------- src/layers/basic.jl | 11 +- src/layers/macro.jl | 26 ++--- src/layers/recurrent.jl | 18 +-- src/layers/upsample.jl | 3 + src/optimise/Optimise.jl | 106 ++++++++++++++++++ src/optimise/optimisers.jl | 5 - src/optimise/train.jl | 63 ----------- src/train.jl | 11 +- test/ext_cuda/cuda.jl | 6 +- test/ext_cuda/layers.jl | 5 +- test/ext_cuda/runtests.jl | 1 - test/ext_enzyme/enzyme.jl | 38 ++++--- test/layers/basic.jl | 82 +++++++------- test/layers/normalisation.jl | 2 +- test/layers/recurrent.jl | 11 ++ test/loading.jl | 28 ++--- test/outputsize.jl | 14 +-- test/runtests.jl | 2 +- test/train.jl | 27 +++-- test/utils.jl | 16 +-- 27 files changed, 374 insertions(+), 444 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 58d9ae5002..70bb7951a9 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -14,6 +14,7 @@ steps: env: FLUX_TEST_CUDA: "true" FLUX_TEST_CPU: "false" + FLUX_TEST_ENZYME: "false" timeout_in_minutes: 60 # - label: "GPU nightly" @@ -53,6 +54,7 @@ steps: env: FLUX_TEST_METAL: "true" FLUX_TEST_CPU: "false" + FLUX_TEST_ENZYME: "false" matrix: setup: julia: @@ -82,6 +84,7 @@ steps: JULIA_AMDGPU_DISABLE_ARTIFACTS: "1" FLUX_TEST_AMDGPU: "true" FLUX_TEST_CPU: "false" + FLUX_TEST_ENZYME: "false" JULIA_NUM_THREADS: 4 env: SECRET_CODECOV_TOKEN: "fAV/xwuaV0l5oaIYSAXRQIor8h7yHdlrpLUZFwNVnchn7rDk9UZoz0oORG9vlKLc1GK2HhaPRAy+fTkJ3GM/8Y0phHh3ANK8f5UsGm2DUTNsnf6u9izgnwnoRTcsWu+vSO0fyYrxBvBCoJwljL+yZbDFz3oE16DP7HPIzxfQagm+o/kMEszVuoUXhuLXXH0LxT6pXl214qjqs04HfMRmKIIiup48NB6fBLdhGlQz64MdMNHBfgDa/fafB7eNvn0X6pEOxysoy6bDQLUhKelOXgcDx1UsTo34Yiqr+QeJPAeKcO//PWurwQhPoUoHfLad2da9DN4uQk4YQLqAlcIuAA==;U2FsdGVkX1+mRXF2c9soCXT7DYymY3msM+vrpaifiTp8xA+gMpbQ0G63WY3tJ+6V/fJcVnxYoKZVXbjcg8fl4Q==" diff --git a/.gitignore b/.gitignore index 21bd9e6e68..c289756be2 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ LocalPreferences.toml .DS_Store docs/mymodel.bson prova.jl +benchmarks/ diff --git a/Project.toml b/Project.toml index 70673996d7..c0ec473d09 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "Flux" uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c" -version = "0.15-DEV" +version = "0.15.0-DEV" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" diff --git a/ext/FluxEnzymeExt/FluxEnzymeExt.jl b/ext/FluxEnzymeExt/FluxEnzymeExt.jl index 5ac7c2e577..1394b562b3 100644 --- a/ext/FluxEnzymeExt/FluxEnzymeExt.jl +++ b/ext/FluxEnzymeExt/FluxEnzymeExt.jl @@ -2,7 +2,6 @@ module FluxEnzymeExt using Flux import Flux.Train: train!, _rule_to_state -import Flux.Optimise import Optimisers import Enzyme using Enzyme: EnzymeRules, Active, Const, Duplicated, autodiff, ReverseWithPrimal diff --git a/src/Flux.jl b/src/Flux.jl index 2804803947..0ddea4a764 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -9,14 +9,14 @@ using MacroTools: @forward @reexport using NNlib using MLUtils -const stack = MLUtils.stack # now exported by Base -import Optimisers: Optimisers, trainable, destructure # before v0.13, Flux owned these functions -using Optimisers: freeze!, thaw!, adjust!, trainables + +using Optimisers: Optimisers, destructure, freeze!, 
thaw!, adjust!, trainables, update! +import Optimisers: trainable @reexport using Optimisers using Random: default_rng using Zygote, ChainRulesCore -using Zygote: Params, @adjoint, gradient, pullback +using Zygote: @adjoint, gradient, pullback using Zygote.ForwardDiff: value export gradient @@ -31,10 +31,6 @@ export gradient get_device_type, DeviceIterator - -# Pirate error to catch a common mistake. (Internal function `base` because overloading `update!` is more likely to give ambiguities.) -Optimisers.base(dx::Zygote.Grads) = error("Optimisers.jl cannot be used with Zygote.jl's implicit gradients, `Params` & `Grads`") - export Chain, Dense, Embedding, EmbeddingBag, Maxout, SkipConnection, Parallel, PairwiseFusion, RNNCell, LSTMCell, GRUCell, GRUv3Cell, @@ -55,12 +51,43 @@ export Chain, Dense, Embedding, EmbeddingBag, Bilinear, Scale, # utils outputsize, state, create_bias, @layer, + # from OneHotArrays.jl + onehot, onehotbatch, onecold, + # from Train + setup, train!, + # from Optimsers.jl + destructure, freeze!, thaw!, adjust!, trainables, update!, trainable, + # init + glorot_uniform, + glorot_normal, + kaiming_uniform, + kaiming_normal, + truncated_normal, + lecun_normal, + orthogonal, + sparse_init, + identity_init, + # Losses + binary_focal_loss, + binarycrossentropy, + crossentropy, + dice_coeff_loss, + focal_loss, + hinge_loss, + huber_loss, + kldivergence, + label_smoothing, + logitbinarycrossentropy, + logitcrossentropy, + mae, + mse, + msle, + poisson_loss, + siamese_contrastive_loss, + squared_hinge_loss, + tversky_loss, )) -include("optimise/Optimise.jl") -using .Optimise: Optimise -export ClipValue # this is const defined in deprecations, for ClipGrad - include("train.jl") using .Train using .Train: setup @@ -69,18 +96,6 @@ using Adapt, Functors, OneHotArrays include("utils.jl") include("functor.jl") -@compat(public, ( - # from OneHotArrays.jl - onehot, onehotbatch, onecold, - # from Functors.jl - functor, @functor, KeyPath, haskeypath, getkeypath, - # from Optimise/Train/Optimisers.jl - setup, update!, destructure, freeze!, adjust!, params, trainable, trainables -)) - -# Pirate error to catch a common mistake. -Functors.functor(::Type{<:MLUtils.DataLoader}, x) = error("`DataLoader` does not support Functors.jl, thus functions like `Flux.gpu` will not act on its contents.") - include("layers/show.jl") include("layers/macro.jl") @@ -97,8 +112,6 @@ include("loading.jl") include("outputsize.jl") export @autosize -include("deprecations.jl") - include("losses/Losses.jl") using .Losses @@ -110,38 +123,6 @@ include("distributed/backend.jl") include("distributed/public_api.jl") export MPIBackend, NCCLBackend, DistributedUtils -@compat(public, ( - # init - glorot_uniform, - glorot_normal, - kaiming_uniform, - kaiming_normal, - truncated_normal, - lecun_normal, - orthogonal, - sparse_init, - identity_init, - - # Losses - binary_focal_loss, - binarycrossentropy, - crossentropy, - dice_coeff_loss, - focal_loss, - hinge_loss, - huber_loss, - kldivergence, - label_smoothing, - logitbinarycrossentropy, - logitcrossentropy, - mae, - mse, - msle, - poisson_loss, - siamese_contrastive_loss, - squared_hinge_loss, - tversky_loss, -)) - +include("deprecations.jl") end # module diff --git a/src/deprecations.jl b/src/deprecations.jl index 57ea3bf72a..1b85345f5d 100644 --- a/src/deprecations.jl +++ b/src/deprecations.jl @@ -16,125 +16,8 @@ GRUCell(in::Integer, out::Integer; kw...) = GRUCell(in => out; kw...) GRUv3Cell(in::Integer, out::Integer; kw...) = GRUv3Cell(in => out; kw...) 
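The four forwarding methods above keep the old positional-size cell constructors working by translating them into the `in => out` Pair form that the rest of this patch standardises on. A minimal sketch of the two call styles (the layer sizes here are arbitrary placeholders):

```
using Flux

cell_old = LSTMCell(3, 5)    # positional sizes, forwarded by the methods above
cell_new = LSTMCell(3 => 5)  # preferred Pair syntax

layer = Dense(10 => 5, relu) # the same convention for other layers
```

The test changes later in the patch apply the same convention throughout, e.g. `Dense(10 => 5)` and `Flux.Bilinear((2, 2) => 3)`.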
-#= - # Valid method in Optimise, old implicit style, is: - train!(loss, ps::Params, data, opt::AbstractOptimiser; cb = () -> ()) - - # Valid methods in Train, new explict style, are: - train!(loss, model, data, opt) # preferred - train!(loss, model, data, opt::Optimisers.AbstractRule) # if you forget setup - - # Provide friendly errors for what happens if you mix these up: -=# -import .Optimise: train! - -train!(loss, ps::Params, data, opt; cb=nothing) = error( - """can't mix implict Params with explict state! - To use `Flux.params(m)` in `train!`, the 4th argument must be from the old `Flux.Optimise` sub-module. - But better to use the new explicit style, in which `m` itself is the 2nd argument. - """) - -train!(loss, ps::Params, data, opt::Optimisers.AbstractRule; cb=nothing) = error( - """can't mix implict Params with explict rule from Optimisers.jl - To use `Flux.params(m)` in `train!`, the 4th argument must be from the old `Flux.Optimise` sub-module. - But better to use the new explicit style, in which `m` itself is the 2nd argument. - """) - -train!(loss, model, data, opt::Optimise.AbstractOptimiser; cb=nothing) = - train!(loss, model, data, __old_to_new(opt); cb) - -# Next, to use the new `setup` with the still-exported old-style `Adam` etc: -import .Train: setup -setup(rule::Optimise.AbstractOptimiser, model) = setup(__old_to_new(rule), model) -# ... and allow accidental use of `Optimisers.setup` to do the same: -Optimisers.setup(rule::Optimise.AbstractOptimiser, model) = setup(__old_to_new(rule), model) - - -function __old_to_new(rule) - Base.depwarn("""Optimisers from Flux.Optimise module are deprecated. - Use optimisers from Optimisers.jl instead.""", :__old_to_new) - return _old_to_new(rule) -end - -for T in [:Descent, :Adam, :Momentum, :Nesterov, - :AdaGrad, :AdaMax, :AdaDelta, :AMSGrad, :NAdam, :RAdam, :OAdam, :AdaBelief, - # :InvDecay, :ExpDecay, - :SignDecay, - ] - @eval function _old_to_new(rule::Optimise.$T) - args = map(f -> getfield(rule, f), fieldnames(Optimisers.$T)) - Optimisers.$T(args...) - end -end -_old_to_new(rule::Optimise.Optimiser) = Optimisers.OptimiserChain(map(_old_to_new, rule.os)...) -# const OptimiserChain = Optimise.Optimiser # lets you use new name with implicit params too. -const Optimiser = Optimisers.OptimiserChain -_old_to_new(rule::Optimise.WeightDecay) = Optimisers.WeightDecay(rule.wd) # called lambda now -_old_to_new(rule::Optimise.ClipNorm) = Optimisers.ClipNorm(rule.thresh) # called omega, and there are more fields -_old_to_new(rule::Optimise.ClipValue) = Optimisers.ClipGrad(rule.thresh) # called delta now, and struct name differs -# const ClipGrad = Optimise.ClipValue -const ClipValue = Optimisers.ClipGrad -_old_to_new(rule::Optimise.RMSProp) = Optimisers.RMSProp(rule.eta, rule.rho, rule.epsilon) # RMSProp has no field centred - -_old_to_new(rule) = error("Flux.setup does not know how to translate this old-style implicit rule to a new-style Optimisers.jl explicit rule") - -# This allows you to mix and match, like Flux.setup(OptimiserChain(Optimisers.SignDecay(), Flux.Descent()), [1,2,3.]) -Optimisers.OptimiserChain(rules::Union{Optimisers.AbstractRule, Optimise.AbstractOptimiser}...) = - Optimisers.OptimiserChain(map(_old_to_new, rules)) -_old_to_new(rule::Optimisers.AbstractRule) = rule - -# Since `update!` should be called in a loop, it makes less sense to call `setup` for you if you forgot. -# But let's make sure that such uses give a helpful error: -import .Optimise: update! 
- -function update!(opt::Optimise.AbstractOptimiser, model, grad) - # This error method requires narrowing the main worker method of Flux.Optimise - # to accept only arrays. Remove if this causes problems! - # update!(opt::Flux.Optimise.AbstractOptimiser, x::AbstractArray, x̄) - error("""Invalid input to `update!`. - * For the implicit style, this needs `update!(::AbstractOptimiser, ::Params, ::Grads)` - * For the explicit style, `update!(state, model, grad)` needs `state = Flux.setup(opt, model)`. - """) -end - -# TODO this friendly error should go in Optimisers.jl. -# remove after https://github.com/FluxML/Optimisers.jl/pull/181 -function update!(opt::Optimisers.AbstractRule, model, grad) - error("""Invalid input to `update!`. - `update!(state, model, grad)` needs `state = Flux.setup(opt, model)`. - """) -end -function update!(opt::Optimisers.AbstractRule, model::Chain, grad::Tuple) - error("""Invalid input to `update!`. - `update!(state, model, grad)` needs `state = Flux.setup(opt, model)`. - """) -end - -# An easy error to make is to pass result of explicit gradient(...), not gradient(...)[1] -# Can't catch every case, but can catch many simple Flux models: - -function update!(opt, model::Chain, grads::Tuple) - # Zygote will make a NamedTuple{(:layers,)} for the gradient of Chain, Diffractor a Tangent - @warn """explicit `update!(opt, model, grad)` wants the gradient for the model alone, - not the whole tuple from `gradient(m -> loss(m, x, y), model)`. You probably want `grads[1]`.""" - update!(opt, model, grads[1]) -end - -function update!(opt::Optimise.AbstractOptimiser, model::Chain, grads::Tuple) # ambiguity - update!(opt, model, grads[1]) # calls error case "Invalid input" just above -end - -# One more easy error to catch is using explicit gradient with `params(m)`: +#### v0.14 deprecations ########################### -function update!(opt::Optimise.AbstractOptimiser, ::Params, grads::Union{Tuple, NamedTuple}) - error("""can't mix implicit Params with explicit gradients! - * For the implicit style, this needs `update(::AbstractOptimiser, ::Params, ::Grads)` with implicit gradient. - * For the explicit style, `update(state, model, grad)` needs the model itself, and `state = Flux.setup(opt, model)`. - """) -end - - -# v0.14 deprecations @deprecate default_rng_value() Random.default_rng() @@ -179,14 +62,14 @@ const FluxCUDAAdaptor = CUDADevice const FluxAMDGPUAdaptor = AMDGPUDevice const FluxMetalAdaptor = MetalDevice -# v0.15 deprecations +######## v0.15 deprecations ######################### -# Enable these when 0.15 is released, and delete const ClipGrad = Optimise.ClipValue etc: +# Enable these when 0.16 is released, and delete const ClipGrad = Optimise.ClipValue etc: # Base.@deprecate_binding Optimiser OptimiserChain # Base.@deprecate_binding ClipValue ClipGrad # train!(loss::Function, ps::Zygote.Params, data, opt) = throw(ArgumentError( -# """On Flux 0.15, `train!` no longer accepts implicit `Zygote.Params`. +# """On Flux 0.16, `train!` no longer accepts implicit `Zygote.Params`. # Instead of `train!(loss_xy, Flux.params(model), data, Adam())` # it now needs `opt = Flux.setup(Adam(), model); train!(loss_mxy, model, data, opt)` # where `loss_mxy` accepts the model as its first argument. @@ -197,3 +80,49 @@ function reset!(x) Base.depwarn("reset!(m) is deprecated. You can remove this call as it is no more needed.", :reset!) 
return x end + +function params!(p::Zygote.Params, x, seen = IdSet()) + if x isa AbstractArray{<:Number} && Functors.isleaf(x) + return push!(p, x) + elseif x in seen + nothing + else + _check_new_macro(x) # complains if you used @functor not @layer + push!(seen, x) + for child in trainable(x) + params!(p, child, seen) + end + end +end + +function params(m...) + Base.depwarn(""" + Flux.params(m...) is deprecated. Use `Flux.trainable(model)` for parameters' collection + and the explicit `gradient(m -> loss(m, x, y), model)` for gradient computation. + """, :params) + ps = Params() + params!(ps, m) + return ps +end + +# Allows caching of the parameters when params is called within gradient() to fix #2040. +# @non_differentiable params(m...) # https://github.com/FluxML/Flux.jl/pull/2054 +# That speeds up implicit use, and silently breaks explicit use. +# From @macroexpand Zygote.@non_differentiable params(m...) and https://github.com/FluxML/Zygote.jl/pull/1248 +Zygote._pullback(::Zygote.Context{true}, ::typeof(params), m...) = params(m), _ -> nothing + +include("optimise/Optimise.jl") ## deprecated Module + + +# TODO this friendly error should go in Optimisers.jl. +# remove after https://github.com/FluxML/Optimisers.jl/pull/181 +function Optimisers.update!(opt::Optimisers.AbstractRule, model, grad) + error("""Invalid input to `update!`. + `update!(state, model, grad)` needs `state = Flux.setup(opt, model)`. + """) +end +function Optimisers.update!(opt::Optimisers.AbstractRule, model::Chain, grad::Tuple) + error("""Invalid input to `update!`. + `update!(state, model, grad)` needs `state = Flux.setup(opt, model)`. + """) +end diff --git a/src/functor.jl b/src/functor.jl index b1c489b61e..e8c02b919f 100644 --- a/src/functor.jl +++ b/src/functor.jl @@ -75,64 +75,6 @@ function testmode!(m, mode) m end -function params!(p::Params, x, seen = IdSet()) - if x isa AbstractArray{<:Number} && Functors.isleaf(x) - return push!(p, x) - elseif x in seen - nothing - else - _check_new_macro(x) # complains if you used @functor not @layer - push!(seen, x) - for child in trainable(x) - params!(p, child, seen) - end - end -end - -""" - params(model) - params(layers...) - -Given a model or specific layers from a model, create a `Params` object pointing to its trainable parameters. - -This can be used with the `gradient` function, see the [training section of the manual](@ref man-training), or as input to the [`Flux.train!`](@ref Flux.train!) function. - -The behaviour of `params` on custom types can be customized using [`Functors.@functor`](@ref) or [`Flux.trainable`](@ref). - -# Examples -```jldoctest -julia> using Flux: params - -julia> params(Chain(Dense(ones(2,3)), softmax)) # unpacks Flux models -Params([[1.0 1.0 1.0; 1.0 1.0 1.0], [0.0, 0.0]]) - -julia> bn = BatchNorm(2, relu) -BatchNorm(2, relu) # 4 parameters, plus 4 non-trainable - -julia> params(bn) # only the trainable parameters -Params([Float32[0.0, 0.0], Float32[1.0, 1.0]]) - -julia> params([1, 2, 3], [4]) # one or more arrays of numbers -Params([[1, 2, 3], [4]]) - -julia> params([[1, 2, 3], [4]]) # unpacks array of arrays -Params([[1, 2, 3], [4]]) - -julia> params(1, [2 2], (alpha=[3,3,3], beta=Ref(4), gamma=sin)) # ignores scalars, unpacks NamedTuples -Params([[2 2], [3, 3, 3]]) -``` -""" -function params(m...) - ps = Params() - params!(ps, m) - return ps -end - -# Allows caching of the parameters when params is called within gradient() to fix #2040. -# @non_differentiable params(m...) 
# https://github.com/FluxML/Flux.jl/pull/2054 -# That speeds up implicit use, and silently breaks explicit use. -# From @macroexpand Zygote.@non_differentiable params(m...) and https://github.com/FluxML/Zygote.jl/pull/1248 -Zygote._pullback(::Zygote.Context{true}, ::typeof(params), m...) = params(m), _ -> nothing @@ -166,6 +108,10 @@ julia> m.bias """ cpu(x) = cpu_device()(x) +# TODO remove after https://github.com/LuxDL/Lux.jl/pull/1089 +ChainRulesCore.@non_differentiable cpu_device() + + # Remove when # https://github.com/JuliaPackaging/Preferences.jl/issues/39 # is resolved @@ -207,6 +153,10 @@ CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer} """ gpu(x) = gpu_device()(x) +# TODO remove after https://github.com/LuxDL/Lux.jl/pull/1089 +ChainRulesCore.@non_differentiable gpu_device() +ChainRulesCore.@non_differentiable gpu_device(::Any) + # Precision struct FluxEltypeAdaptor{T} end diff --git a/src/layers/basic.jl b/src/layers/basic.jl index 8aec354716..546f8f29ce 100644 --- a/src/layers/basic.jl +++ b/src/layers/basic.jl @@ -77,7 +77,16 @@ function _applychain(layers::AbstractVector, x) # type-unstable path, helps com for f in layers x = f(x) end - x + return x +end + +# An easy error to make is to pass result of explicit gradient(...), not gradient(...)[1] +# Can't catch every case, but can catch many simple Flux models: +function Optimisers.update!(opt, model::Chain, grads::Tuple) + # Zygote will make a NamedTuple{(:layers,)} for the gradient of Chain, Diffractor a Tangent + @warn """explicit `update!(opt, model, grad)` wants the gradient for the model alone, + not the whole tuple from `gradient(m -> loss(m, x, y), model)`. You probably want `grads[1]`.""" + return Optimisers.update!(opt, model, grads[1]) end Base.getindex(c::Chain, i::AbstractArray) = Chain(c.layers[i]) diff --git a/src/layers/macro.jl b/src/layers/macro.jl index 065774602a..9f9d0435ec 100644 --- a/src/layers/macro.jl +++ b/src/layers/macro.jl @@ -117,19 +117,19 @@ end _macro_functor(type, field::Union{Symbol,QuoteNode}) = _macro_functor(type, :(($field,))) # lets you forget a comma function _default_functor(::Type{T}, x) where {T} - if @generated - F = fieldnames(T) - args = map(sy -> :(getfield(x, $(QuoteNode(sy)))), F) - C = Base.typename(T).wrapper # constructor - # recon = VERSION > v"1.9-" ? :(Splat($C)) : :(Base.splat($C)) - recon = :(Base.splat($C)) - :((NamedTuple{$F}(($(args...),)), $recon)) - else - # Getting this parameterless type takes about 2μs, every time: - # spl = VERSION > v"1.9-" ? Splat : Base.splat - spl = Base.splat - namedtuple(x), spl(Base.typename(T).wrapper) - end + if @generated + F = fieldnames(T) + args = map(sy -> :(getfield(x, $(QuoteNode(sy)))), F) + C = Base.typename(T).wrapper # constructor + # recon = VERSION > v"1.9-" ? :(Splat($C)) : :(Base.splat($C)) + recon = :(Base.splat($C)) + :((NamedTuple{$F}(($(args...),)), $recon)) + else + # Getting this parameterless type takes about 2μs, every time: + # spl = VERSION > v"1.9-" ? Splat : Base.splat + spl = Base.splat + namedtuple(x), spl(Base.typename(T).wrapper) + end end function namedtuple(x::T) where T diff --git a/src/layers/recurrent.jl b/src/layers/recurrent.jl index 0834ce5b3f..20b7c9c5aa 100644 --- a/src/layers/recurrent.jl +++ b/src/layers/recurrent.jl @@ -176,9 +176,9 @@ function RNN((in, out)::Pair, σ = tanh; cell_kwargs...) 
return RNN(cell) end -(m::RNN)(x) = m(x, zeros_like(x, size(m.cell.Wh, 1))) +(m::RNN)(x::AbstractArray) = m(x, zeros_like(x, size(m.cell.Wh, 1))) -function (m::RNN)(x, h) +function (m::RNN)(x::AbstractArray, h) @assert ndims(x) == 2 || ndims(x) == 3 # [x] = [in, L] or [in, L, B] # [h] = [out] or [out, B] @@ -366,13 +366,13 @@ function LSTM((in, out)::Pair; cell_kwargs...) return LSTM(cell) end -function (m::LSTM)(x) - h = zeros_like(x, size(m.cell.Wh, 1)) +function (m::LSTM)(x::AbstractArray) + h = zeros_like(x, size(m.cell.Wh, 2)) c = zeros_like(h) return m(x, (h, c)) end -function (m::LSTM)(x, (h, c)) +function (m::LSTM)(x::AbstractArray, (h, c)) @assert ndims(x) == 2 || ndims(x) == 3 h′ = [] c′ = [] @@ -538,12 +538,12 @@ function GRU((in, out)::Pair; cell_kwargs...) return GRU(cell) end -function (m::GRU)(x) +function (m::GRU)(x::AbstractArray) h = zeros_like(x, size(m.cell.Wh, 2)) return m(x, h) end -function (m::GRU)(x, h) +function (m::GRU)(x::AbstractArray, h) @assert ndims(x) == 2 || ndims(x) == 3 h′ = [] # [x] = [in, L] or [in, L, B] @@ -676,12 +676,12 @@ function GRUv3((in, out)::Pair; cell_kwargs...) return GRUv3(cell) end -function (m::GRUv3)(x) +function (m::GRUv3)(x::AbstractArray) h = zeros_like(x, size(m.cell.Wh, 2)) return m(x, h) end -function (m::GRUv3)(x, h) +function (m::GRUv3)(x::AbstractArray, h) @assert ndims(x) == 2 || ndims(x) == 3 h′ = [] for x_t in eachslice(x, dims = 2) diff --git a/src/layers/upsample.jl b/src/layers/upsample.jl index c71a9acc8d..5f79b98fde 100644 --- a/src/layers/upsample.jl +++ b/src/layers/upsample.jl @@ -35,6 +35,9 @@ struct Upsample{mode, S, T} size::T end +Functors.@leaf Upsample # mark leaf since the constructor is not compatible with Functors + # by default but we don't need to recurse into it + function Upsample(mode::Symbol = :nearest; scale = nothing, size = nothing) mode in [:nearest, :bilinear, :trilinear] || throw(ArgumentError("mode=:$mode is not supported.")) diff --git a/src/optimise/Optimise.jl b/src/optimise/Optimise.jl index f637d83242..7683fa81f3 100644 --- a/src/optimise/Optimise.jl +++ b/src/optimise/Optimise.jl @@ -1,7 +1,21 @@ +# Deprecated since Flux v0.13 in favor of Optimisers.jl module Optimise using LinearAlgebra +using MacroTools: @forward + +using ProgressLogging: @progress, @withprogress, @logprogress +import Zygote: Params, gradient, withgradient + +using MacroTools: @forward + +using Optimisers: Optimisers +# Add methods to Optimisers.jl's function, so that there is just one Flux.update! +# for both explicit and implicit parameters. +import Optimisers: update! +import Flux: train!, Chain, setup + export train!, update!, Descent, Adam, Momentum, Nesterov, RMSProp, AdaGrad, AdaMax, AdaDelta, AMSGrad, NAdam, AdamW,RAdam, OAdam, AdaBelief, @@ -11,4 +25,96 @@ export train!, update!, include("optimisers.jl") include("train.jl") + +#= + # Valid method in Optimise, old implicit style, is: + train!(loss, ps::Params, data, opt::AbstractOptimiser; cb = () -> ()) + + # Valid methods in Train, new explict style, are: + train!(loss, model, data, opt) # preferred + train!(loss, model, data, opt::Optimisers.AbstractRule) # if you forget setup + + # Provide friendly errors for what happens if you mix these up: +=# + +train!(loss, ps::Params, data, opt; cb=nothing) = error( + """can't mix implict Params with explict state! + To use `Flux.params(m)` in `train!`, the 4th argument must be from the old `Flux.Optimise` sub-module. + But better to use the new explicit style, in which `m` itself is the 2nd argument. 
+ """) + +train!(loss, ps::Params, data, opt::Optimisers.AbstractRule; cb=nothing) = error( + """can't mix implict Params with explict rule from Optimisers.jl + To use `Flux.params(m)` in `train!`, the 4th argument must be from the old `Flux.Optimise` sub-module. + But better to use the new explicit style, in which `m` itself is the 2nd argument. + """) + +train!(loss, model, data, opt::AbstractOptimiser; cb=nothing) = + train!(loss, model, data, __old_to_new(opt); cb) + +# Next, to use the new `setup` with the still-exported old-style `Adam` etc: +setup(rule::AbstractOptimiser, model) = setup(__old_to_new(rule), model) +# ... and allow accidental use of `Optimisers.setup` to do the same: +Optimisers.setup(rule::AbstractOptimiser, model) = setup(__old_to_new(rule), model) + + +function __old_to_new(rule) + Base.depwarn("""Optimisers from Flux.Optimise module are deprecated. + Use optimisers from Optimisers.jl instead.""", :__old_to_new) + return _old_to_new(rule) +end + +for T in [:Descent, :Adam, :Momentum, :Nesterov, + :AdaGrad, :AdaMax, :AdaDelta, :AMSGrad, :NAdam, :RAdam, :OAdam, :AdaBelief, + # :InvDecay, :ExpDecay, + :SignDecay, + ] + @eval function _old_to_new(rule::$T) + args = map(f -> getfield(rule, f), fieldnames(Optimisers.$T)) + Optimisers.$T(args...) + end +end +_old_to_new(rule::Optimiser) = Optimisers.OptimiserChain(map(_old_to_new, rule.os)...) +const OptimiserChain = Optimiser # lets you use new name with implicit params too. +_old_to_new(rule::WeightDecay) = Optimisers.WeightDecay(rule.wd) # called lambda now +_old_to_new(rule::ClipNorm) = Optimisers.ClipNorm(rule.thresh) # called omega, and there are more fields +_old_to_new(rule::ClipValue) = Optimisers.ClipGrad(rule.thresh) # called delta now, and struct name differs +const ClipGrad = ClipValue +_old_to_new(rule::RMSProp) = Optimisers.RMSProp(rule.eta, rule.rho, rule.epsilon) # RMSProp has no field centred + +_old_to_new(rule) = error("Flux.setup does not know how to translate this old-style implicit rule to a new-style Optimisers.jl explicit rule") + +# This allows you to mix and match, like Flux.setup(OptimiserChain(Optimisers.SignDecay(), Flux.Descent()), [1,2,3.]) +Optimisers.OptimiserChain(rules::Union{Optimisers.AbstractRule, AbstractOptimiser}...) = + Optimisers.OptimiserChain(map(_old_to_new, rules)) +_old_to_new(rule::Optimisers.AbstractRule) = rule + +# Since `update!` should be called in a loop, it makes less sense to call `setup` for you if you forgot. +# But let's make sure that such uses give a helpful error: + +function update!(opt::AbstractOptimiser, model, grad) + # This error method requires narrowing the main worker method of Flux.Optimise + # to accept only arrays. Remove if this causes problems! + # update!(opt::Flux.Optimise.AbstractOptimiser, x::AbstractArray, x̄) + error("""Invalid input to `update!`. + * For the implicit style, this needs `update!(::AbstractOptimiser, ::Params, ::Grads)` + * For the explicit style, `update!(state, model, grad)` needs `state = Flux.setup(opt, model)`. + """) +end + +function update!(opt::AbstractOptimiser, model::Chain, grads::Tuple) # ambiguity + update!(opt, model, grads[1]) # calls error case "Invalid input" just above +end + +# One more easy error to catch is using explicit gradient with `params(m)`: + +function update!(opt::AbstractOptimiser, ::Params, grads::Union{Tuple, NamedTuple}) + error("""can't mix implicit Params with explicit gradients! + * For the implicit style, this needs `update(::AbstractOptimiser, ::Params, ::Grads)` with implicit gradient. 
+ * For the explicit style, `update(state, model, grad)` needs the model itself, and `state = Flux.setup(opt, model)`. + """) +end + +########## + end diff --git a/src/optimise/optimisers.jl b/src/optimise/optimisers.jl index 18f9d3ddae..8c6076f4bb 100644 --- a/src/optimise/optimisers.jl +++ b/src/optimise/optimisers.jl @@ -1,12 +1,7 @@ -using Flux -using MacroTools: @forward - abstract type AbstractOptimiser end const EPS = 1e-8 -# TODO: should use weak refs - """ Descent(η = 0.1) diff --git a/src/optimise/train.jl b/src/optimise/train.jl index 7bd3f9b277..52488107ef 100644 --- a/src/optimise/train.jl +++ b/src/optimise/train.jl @@ -1,24 +1,3 @@ -using ProgressLogging: @progress, @withprogress, @logprogress -import Zygote: Params, gradient, withgradient - -# Add methods to Optimisers.jl's function, so that there is just one Flux.update! -# for both explicit and implicit parameters. -import Optimisers.update! - -""" - update!(opt, p, g) - update!(opt, ps::Params, gs) - -Perform an update step of the parameters `ps` (or the single parameter `p`) -according to optimiser `opt::AbstractOptimiser` and the gradients `gs` (the gradient `g`). - -As a result, the parameters are mutated and the optimiser's internal state may change. -The gradient could be mutated as well. - -!!! compat "Deprecated" - This method for implicit `Params` (and `AbstractOptimiser`) will be removed from Flux 0.15. - The explicit method `update!(opt, model, grad)` from Optimisers.jl will remain. -""" function update!(opt::AbstractOptimiser, x::AbstractArray, x̄) x̄r = copyto!(similar(x̄), x̄) # Flux.Optimise assumes it can mutate the gradient. This is not # safe due to aliasing, nor guaranteed to be possible, e.g. Fill. @@ -41,48 +20,6 @@ runall(fs::AbstractVector) = () -> foreach(call, fs) batchmemaybe(x) = tuple(x) batchmemaybe(x::Tuple) = x -""" - train!(loss, pars::Params, data, opt::AbstractOptimiser; [cb]) - -Uses a `loss` function and training `data` to improve the -model's parameters according to a particular optimisation rule `opt`. - -!!! compat "Deprecated" - This method with implicit `Params` will be removed from Flux 0.15. - It should be replaced with the explicit method `train!(loss, model, data, opt)`. - -For each `d in data`, first the gradient of the `loss` is computed like this: -``` - gradient(() -> loss(d...), pars) # if d isa Tuple - gradient(() -> loss(d), pars) # otherwise -``` -Here `pars` is produced by calling `Flux.params` on your model. -(Or just on the layers you want to train, like `train!(loss, params(model[1:end-2]), data, opt)`.) -This is the "implicit" style of parameter handling. - -This gradient is then used by optimiser `opt` to update the parameters: -``` - update!(opt, pars, grads) -``` -The optimiser should be from the `Flux.Optimise` module (see [Optimisers](@ref)). -Different optimisers can be combined using [`Flux.Optimise.Optimiser`](@ref Flux.Optimiser). - -This training loop iterates through `data` once. -It will stop with a `DomainError` if the loss is `NaN` or infinite. - -You can use use `train!` inside a for loop to do this several times, or -use for instance `Itertools.ncycle` to make a longer `data` iterator. - -## Callbacks - -[Callbacks](@ref) are given with the keyword argument `cb`. -For example, this will print "training" every 10 seconds (using [`Flux.throttle`](@ref)): -``` - train!(loss, params, data, opt, cb = throttle(() -> println("training"), 10)) -``` - -Multiple callbacks can be passed to `cb` as array. 
-""" function train!(loss, ps::Params, data, opt::AbstractOptimiser; cb = () -> ()) cb = runall(cb) itrsz = Base.IteratorSize(typeof(data)) diff --git a/src/train.jl b/src/train.jl index d2cbbd40fa..7dd27e2269 100644 --- a/src/train.jl +++ b/src/train.jl @@ -3,13 +3,12 @@ module Train using LinearAlgebra using Optimisers: Optimisers using Functors: fmap, fmapstructure -using ..Flux: Flux # used only in docstring -import ..Flux.Optimise: train!, update! # during 0.13, we add methods to the old functions - -export setup, train! +using ..Flux: Flux using ProgressLogging: @progress, @withprogress, @logprogress -using Zygote: Zygote, Params +using Zygote: Zygote + +export setup, train! """ opt_state = setup(rule, model) @@ -49,7 +48,7 @@ function setup(rule::Optimisers.AbstractRule, model) Optimisers.maywrite(x) || error("""model must be fully mutable for `train!` to work, got `x::$(typeof(x))`. If `x .+= dx` is in fact ok, define `Optimisers.maywrite(::$(typeof(x))) = true`""") end - state + return state end """ diff --git a/test/ext_cuda/cuda.jl b/test/ext_cuda/cuda.jl index 066998f14c..fc5e2c7bde 100644 --- a/test/ext_cuda/cuda.jl +++ b/test/ext_cuda/cuda.jl @@ -111,7 +111,7 @@ end # This test should really not go through indirections and pull out Fills for efficiency # but we forcefully materialise. TODO: remove materialising CuArray here @test gradient(x -> sum(cpu(x)), ca)[1] isa CuArray # This involves FillArray, which should be GPU compatible - @test gradient(x -> sum(cpu(x)), ca')[1] isa CuArray + @test gradient(x -> sum(cpu(x)), ca')[1] isa AnyCuArray # Even more trivial: no movement @test gradient(x -> sum(abs, cpu(x)), a)[1] isa Matrix @@ -133,8 +133,8 @@ end # Scalar indexing of an array, needs OneElement to transfer to GPU # https://github.com/FluxML/Zygote.jl/issues/1005 - @test_broken gradient(x -> cpu(2 .* gpu(x))[1], Float32[1,2,3]) == ([2,0,0],) - @test_broken gradient(x -> cpu(gpu(x) * gpu(x))[1,2], Float32[1 2 3; 4 5 6; 7 8 9]) == ([2 6 8; 0 2 0; 0 3 0],) + @test gradient(x -> cpu(2 .* gpu(x))[1], Float32[1,2,3]) == ([2,0,0],) + @test gradient(x -> cpu(gpu(x) * gpu(x))[1,2], Float32[1 2 3; 4 5 6; 7 8 9]) == ([2 6 8; 0 2 0; 0 3 0],) end @testset "gpu(x) and cpu(x) on structured arrays" begin diff --git a/test/ext_cuda/layers.jl b/test/ext_cuda/layers.jl index cdb8f003e9..b7b456bcf1 100644 --- a/test/ext_cuda/layers.jl +++ b/test/ext_cuda/layers.jl @@ -6,11 +6,10 @@ # generic movement tests @testset "Basic GPU Movement" begin - @test gradient(x -> sum(gpu(x)), rand(3,3)) isa Tuple - @test gradient(x -> sum(cpu(x)), gpu(rand(3,3))) isa Tuple + @test gradient(x -> sum(gpu(x)), rand(Float32, 3, 3))[1] isa Matrix{Float32} + @test gradient(x -> sum(cpu(x)), gpu(rand(Float32, 3, 3)))[1] isa CuMatrix{Float32} end - const ACTIVATIONS = [identity, tanh] function gpu_gradtest(name::String, layers::Vector, x_cpu, args...; diff --git a/test/ext_cuda/runtests.jl b/test/ext_cuda/runtests.jl index be02409077..9f9de8a76a 100644 --- a/test/ext_cuda/runtests.jl +++ b/test/ext_cuda/runtests.jl @@ -32,4 +32,3 @@ end @testset "utils" begin include("utils.jl") end - diff --git a/test/ext_enzyme/enzyme.jl b/test/ext_enzyme/enzyme.jl index aa04150cf0..282d08911d 100644 --- a/test/ext_enzyme/enzyme.jl +++ b/test/ext_enzyme/enzyme.jl @@ -8,6 +8,7 @@ using FiniteDifferences function gradient_fd(f, x...) + f = f |> f64 x = [cpu(x) for x in x] ps_and_res = [x isa AbstractArray ? 
(x, identity) : Flux.destructure(x) for x in x] ps = [f64(x[1]) for x in ps_and_res] @@ -97,21 +98,16 @@ end end models_xs = [ - (Dense(2, 4), randn(Float32, 2), "Dense"), - (Chain(Dense(2, 4, relu), Dense(4, 3)), randn(Float32, 2), "Chain(Dense, Dense)"), - (f64(Chain(Dense(2, 4), Dense(4, 2))), randn(Float64, 2, 1), "f64(Chain(Dense, Dense))"), + (Dense(2=>4), randn(Float32, 2), "Dense"), + (Chain(Dense(2=>4, tanh), Dense(4=>3)), randn(Float32, 2), "Chain(Dense, Dense)"), + (f64(Chain(Dense(2=>4), Dense(4=>2))), randn(Float64, 2, 1), "f64(Chain(Dense, Dense))"), (Flux.Scale([1.0f0 2.0f0 3.0f0 4.0f0], true, abs2), randn(Float32, 2), "Flux.Scale"), (Conv((3, 3), 2 => 3), randn(Float32, 3, 3, 2, 1), "Conv"), - (Chain(Conv((3, 3), 2 => 3, relu), Conv((3, 3), 3 => 1, relu)), rand(Float32, 5, 5, 2, 1), "Chain(Conv, Conv)"), - (Chain(Conv((4, 4), 2 => 2, pad=SamePad()), MeanPool((5, 5), pad=SamePad())), rand(Float32, 5, 5, 2, 2), "Chain(Conv, MeanPool)"), + (Chain(Conv((3, 3), 2 => 3, ), Conv((3, 3), 3 => 1, tanh)), rand(Float32, 5, 5, 2, 1), "Chain(Conv, Conv)"), + # (Chain(Conv((4, 4), 2 => 2, pad=SamePad()), MeanPool((5, 5), pad=SamePad())), rand(Float32, 5, 5, 2, 2), "Chain(Conv, MeanPool)"), (Maxout(() -> Dense(5 => 4, tanh), 3), randn(Float32, 5, 1), "Maxout"), - (RNN(3 => 2), randn(Float32, 3, 2), "RNN"), - (Chain(RNN(3 => 4), RNN(4 => 3)), randn(Float32, 3, 2), "Chain(RNN, RNN)"), - (LSTM(3 => 5), randn(Float32, 3, 2), "LSTM"), - (Chain(LSTM(3 => 5), LSTM(5 => 3)), randn(Float32, 3, 2), "Chain(LSTM, LSTM)"), (SkipConnection(Dense(2 => 2), vcat), randn(Float32, 2, 3), "SkipConnection"), - (Flux.Bilinear((2, 2) => 3), randn(Float32, 2, 1), "Bilinear"), - (GRU(3 => 5), randn(Float32, 3, 10), "GRU"), + # (Flux.Bilinear((2, 2) => 3), randn(Float32, 2, 1), "Bilinear"), (ConvTranspose((3, 3), 3 => 2, stride=2), rand(Float32, 5, 5, 3, 1), "ConvTranspose"), ] @@ -131,12 +127,22 @@ end return sum(x) end + struct LSTMChain + rnn1 + rnn2 + end + function (m::LSTMChain)(x) + st = m.rnn1(x) + st = m.rnn2(st[1]) + return st[1] + end + models_xs = [ - (RNN(3 => 3), randn(Float32, 3, 2), "RNN"), - (LSTM(3 => 3), randn(Float32, 3, 2), "LSTM"), - # TESTS BELOW ARE BROKEN FOR ZYGOTE BUT CORRECT FOR ENZYME! 
- (Chain(RNN(3 => 5), RNN(5 => 3)), randn(Float32, 3, 2), "Chain(RNN, RNN)"), - (Chain(LSTM(3 => 5), LSTM(5 => 3)), randn(Float32, 3, 2), "Chain(LSTM, LSTM)"), + # (RNN(3 => 2), randn(Float32, 3, 2), "RNN"), + # (LSTM(3 => 5), randn(Float32, 3, 2), "LSTM"), + # (GRU(3 => 5), randn(Float32, 3, 10), "GRU"), + # (Chain(RNN(3 => 4), RNN(4 => 3)), randn(Float32, 3, 2), "Chain(RNN, RNN)"), + # (LSTMChain(LSTM(3 => 5), LSTM(5 => 3)), randn(Float32, 3, 2), "LSTMChain(LSTM, LSTM)"), ] for (model, x, name) in models_xs diff --git a/test/layers/basic.jl b/test/layers/basic.jl index c95c8c8288..666e3e761b 100644 --- a/test/layers/basic.jl +++ b/test/layers/basic.jl @@ -16,23 +16,23 @@ using Flux: activations end @testset "Chain" begin - @test_nowarn Chain(Dense(10, 5, σ), Dense(5, 2))(randn32(10)) - @test_throws DimensionMismatch Chain(Dense(10, 5, σ),Dense(2, 1))(randn32(10)) + @test_nowarn Chain(Dense(10 => 5, σ), Dense(5 => 2))(randn32(10)) + @test_throws DimensionMismatch Chain(Dense(10 => 5, σ),Dense(2 => 1))(randn32(10)) # numeric test should be put into testset of corresponding layer - @test_nowarn Chain(first = Dense(10, 5, σ), second = Dense(5, 2))(randn32(10)) - m = Chain(first = Dense(10, 5, σ), second = Dense(5, 2)) + @test_nowarn Chain(first = Dense(10 => 5, σ), second = Dense(5 => 2))(randn32(10)) + m = Chain(first = Dense(10 => 5, σ), second = Dense(5 => 2)) @test m[:first] == m[1] @test m[1:2] == m @test m == m @test m == fmap(identity, m) # does not forget names - @test_throws ArgumentError Chain(layers = Dense(10, 10), two = identity) # reserved name + @test_throws ArgumentError Chain(layers = Dense(10 => 10), two = identity) # reserved name - @test_nowarn Chain([Dense(10, 5, σ), Dense(5, 2)])(randn(Float32, 10)) # vector of layers + @test_nowarn Chain([Dense(10 => 5, σ), Dense(5 => 2)])(randn(Float32, 10)) # vector of layers - c = Chain(Dense(10, 5, σ), Dense(5, 2), Dense(2, 1, relu)) + c = Chain(Dense(10 => 5, σ), Dense(5 => 2), Dense(2 => 1, relu)) @test c[1] == c[begin] @test c[3] == c[end] @@ -40,7 +40,7 @@ using Flux: activations end @testset "Activations" begin - c = Chain(Dense(3,5,relu), Dense(5,1,relu)) + c = Chain(Dense(3 => 5, relu), Dense(5 => 1, relu)) X = Float32.([1.0; 1.0; 1.0]) @test_nowarn gradient(c -> Flux.activations(c, X)[2][1], c) @@ -51,8 +51,8 @@ using Flux: activations @testset "Dense" begin @testset "constructors" begin - @test size(Dense(10, 100).weight) == (100, 10) - @test size(Dense(10, 100).bias) == (100,) + @test size(Dense(10 => 100).weight) == (100, 10) + @test size(Dense(10 => 100).bias) == (100,) @test Dense(rand(100,10), rand(100)).σ == identity @test Dense(rand(100,10)).σ == identity @@ -62,12 +62,12 @@ using Flux: activations @test Dense(rand(Float16, 100,10), true).bias isa Vector{Float16} # creates matching type @test Dense(rand(Float16, 100,10), rand(100)).bias isa Vector{Float16} # converts to match - @test Dense(3,4; init=Base.randn, bias=true).bias isa Vector{Float64} - @test Dense(3,4; init=Base.randn, bias=[1,2,3,4]).bias isa Vector{Float64} + @test Dense(3 => 4; init=Base.randn, bias=true).bias isa Vector{Float64} + @test Dense(3 => 4; init=Base.randn, bias=[1,2,3,4]).bias isa Vector{Float64} - @test_throws MethodError Dense(10, 10.5) - @test_throws MethodError Dense(10, 10.5, tanh) - @test_throws DimensionMismatch Dense(3,4; bias=rand(5)) + @test_throws MethodError Dense(10 => 10.5) + @test_throws MethodError Dense(10 => 10.5, tanh) + @test_throws DimensionMismatch Dense(3 => 4; bias=rand(5)) @test_throws DimensionMismatch 
Dense(rand(4,3), rand(5)) @test_throws MethodError Dense(rand(5)) @test_throws MethodError Dense(rand(5), rand(5)) @@ -78,18 +78,18 @@ using Flux: activations @test_throws DimensionMismatch Dense(10 => 5)(randn32(1)) @test_throws MethodError Dense(10 => 5)(1) # avoid broadcasting @test_throws MethodError Dense(10 => 5).(randn32(10)) # avoid broadcasting - @test size(Dense(10, 5)(randn(10))) == (5,) - @test size(Dense(10, 5)(randn(10,2))) == (5,2) - @test size(Dense(10, 5)(randn(10,2,3))) == (5,2,3) - @test size(Dense(10, 5)(randn(10,2,3,4))) == (5,2,3,4) - @test_throws DimensionMismatch Dense(10, 5)(randn(11,2,3)) + @test size(Dense(10 => 5)(randn(10))) == (5,) + @test size(Dense(10 => 5)(randn(10,2))) == (5,2) + @test size(Dense(10 => 5)(randn(10,2,3))) == (5,2,3) + @test size(Dense(10 => 5)(randn(10,2,3,4))) == (5,2,3,4) + @test_throws DimensionMismatch Dense(10 => 5)(randn(11,2,3)) end @testset "zeros" begin - @test Dense(10, 1, identity, init = ones)(ones(10,1)) == 10*ones(1, 1) - @test Dense(10, 1, identity, init = ones)(ones(10,2)) == 10*ones(1, 2) - @test Dense(10, 2, identity, init = ones)(ones(10,1)) == 10*ones(2, 1) - @test Dense(10, 2, identity, init = ones)([ones(10,1) 2*ones(10,1)]) == [10 20; 10 20] - @test Dense(10, 2, identity, init = ones, bias = false)([ones(10,1) 2*ones(10,1)]) == [10 20; 10 20] + @test Dense(10 => 1, identity, init = ones)(ones(10,1)) == 10*ones(1, 1) + @test Dense(10 => 1, identity, init = ones)(ones(10,2)) == 10*ones(1, 2) + @test Dense(10 => 2, identity, init = ones)(ones(10,1)) == 10*ones(2, 1) + @test Dense(10 => 2, identity, init = ones)([ones(10,1) 2*ones(10,1)]) == [10 20; 10 20] + @test Dense(10 => 2, identity, init = ones, bias = false)([ones(10,1) 2*ones(10,1)]) == [10 20; 10 20] end @testset "type matching" begin d1 = Dense(2 => 3) @@ -159,7 +159,7 @@ using Flux: activations end @testset "trainables" begin - mo = Maxout(()->Dense(32, 64), 4) + mo = Maxout(()->Dense(32 => 64), 4) ps = Flux.trainables(mo) @test length(ps) == 8 #4 alts, each with weight and bias end @@ -173,13 +173,13 @@ using Flux: activations @testset "concat size" begin input = randn(10, 2) - @test size(SkipConnection(Dense(10,10), (a,b) -> cat(a, b, dims = 2))(input)) == (10,4) + @test size(SkipConnection(Dense(10 => 10), (a,b) -> cat(a, b, dims = 2))(input)) == (10,4) end end @testset "Bilinear" begin @testset "SkipConnection recombinator" begin - d = Dense(10, 10) + d = Dense(10 => 10) b = Flux.Bilinear(10, 10, 5) x = randn(Float32,10,9) sc = SkipConnection(d, b) @@ -231,14 +231,14 @@ using Flux: activations @testset "concat size" begin input = randn32(10, 2) - @test size(Parallel((a, b) -> cat(a, b; dims=2), Dense(10, 10), identity)(input)) == (10, 4) - @test size(Parallel(hcat, one = Dense(10, 10), two = identity)(input)) == (10, 4) + @test size(Parallel((a, b) -> cat(a, b; dims=2), Dense(10 => 10), identity)(input)) == (10, 4) + @test size(Parallel(hcat, one = Dense(10 => 10), two = identity)(input)) == (10, 4) end @testset "vararg input" begin inputs = randn32(10), randn32(5), randn32(4) - @test size(Parallel(+, Dense(10, 2), Dense(5, 2), Dense(4, 2))(inputs)) == (2,) - @test size(Parallel(+; a = Dense(10, 2), b = Dense(5, 2), c = Dense(4, 2))(inputs)) == (2,) + @test size(Parallel(+, Dense(10 => 2), Dense(5 => 2), Dense(4 => 2))(inputs)) == (2,) + @test size(Parallel(+; a = Dense(10 => 2), b = Dense(5 => 2), c = Dense(4 => 2))(inputs)) == (2,) @test_throws ArgumentError Parallel(+, sin, cos)(1,2,3) # wrong number of inputs @test Parallel(+, sin, cos)(pi/2) ≈ 1 # one 
input, several layers @test Parallel(/, abs)(3, -4) ≈ 3/4 # one layer, several inputs @@ -247,12 +247,12 @@ using Flux: activations end @testset "named access" begin - m = Parallel(hcat, one = Dense(10, 10), two = identity) + m = Parallel(hcat, one = Dense(10 => 10), two = identity) @test m[1] == m[:one] @test m[1:2] == m - @test_throws ArgumentError Parallel(hcat, layers = Dense(10, 10), two = identity) # reserved names - @test_throws ArgumentError Parallel(hcat, connection = Dense(10, 10), two = identity) + @test_throws ArgumentError Parallel(hcat, layers = Dense(10 => 10), two = identity) # reserved names + @test_throws ArgumentError Parallel(hcat, connection = Dense(10 => 10), two = identity) @test m == fmap(identity, m) # does not forget names @@ -427,7 +427,7 @@ using Flux: activations end @testset "second derivatives" begin - m1 = Chain(Dense(3,4,tanh; bias=false), Dense(4,2)) + m1 = Chain(Dense(3 => 4,tanh; bias=false), Dense(4 => 2)) @test Zygote.hessian_dual(sum∘m1, [1,2,3]) ≈ Zygote.hessian_reverse(sum∘m1, [1,2,3]) m1v = Chain([m1[1], m1[2]]) # vector of layers @@ -435,17 +435,17 @@ end @test Zygote.hessian_dual(sum∘m1v, [1,2,3]) ≈ Zygote.hessian_reverse(sum∘m1v, [1,2,3]) # NNlib's softmax gradient writes in-place - m2 = Chain(Dense(3,4,tanh), Dense(4,2), softmax) + m2 = Chain(Dense(3 => 4, tanh), Dense(4 => 2), softmax) @test_broken Zygote.hessian_dual(sum∘m2, [1,2,3]) ≈ Zygote.hessian_reverse(sum∘m2, [1,2,3]) # https://github.com/FluxML/NNlib.jl/issues/362 - m3 = Chain(Conv((3,), 2 => 3, relu), Dense(2,2)) + m3 = Chain(Conv((3,), 2 => 3, relu), Dense(2 => 2)) x3 = cat(Float32[1 2; 3 4; 5 6; 7 8]; dims=3) @test Zygote.hessian_dual(sum∘m3, x3) ≈ Zygote.hessian_reverse(sum∘m3, x3) end @testset "gradients of Chain{Vector}" begin - m1 = Chain(Dense(3,4,tanh; bias=false), Dense(4,2)) + m1 = Chain(Dense(3 => 4, tanh; bias=false), Dense(4 => 2)) m1v = Chain([m1[1], m1[2]]) @test sum(length, Flux.trainables(m1)) == sum(length, Flux.trainables(m1v)) @@ -465,14 +465,14 @@ end @testset "PairwiseFusion" begin x = (rand(1, 10), rand(30, 10)) - layer = PairwiseFusion(+, Dense(1, 30), Dense(30, 10)) + layer = PairwiseFusion(+, Dense(1 => 30), Dense(30 => 10)) y = layer(x) @test length(y) == 2 @test size(y[1]) == (30, 10) @test size(y[2]) == (10, 10) x = rand(1, 10) - layer = PairwiseFusion(.+, Dense(1, 10), Dense(10, 1)) + layer = PairwiseFusion(.+, Dense(1 => 10), Dense(10 => 1)) y = layer(x) @test length(y) == 2 @test size(y[1]) == (10, 10) diff --git a/test/layers/normalisation.jl b/test/layers/normalisation.jl index f678297eaa..b0c6584b84 100644 --- a/test/layers/normalisation.jl +++ b/test/layers/normalisation.jl @@ -32,7 +32,7 @@ evalwgrad(f, x...) 
= pullback(f, x...)[1] @test count(iszero, y2) == 0 x = rand(Float32, 100) - m = Chain(Dense(100,100), + m = Chain(Dense(100 => 100), Dropout(0.9; rng_kwargs...)) y = evalwgrad(m, x) @test count(a->a == 0, y) > 50 diff --git a/test/layers/recurrent.jl b/test/layers/recurrent.jl index 98e072cdb1..6da1f73ee9 100644 --- a/test/layers/recurrent.jl +++ b/test/layers/recurrent.jl @@ -159,6 +159,17 @@ end @test c isa Array{Float32, 2} @test size(c) == (4, 3) test_gradients(model, x, loss = (m, x) -> mean(m(x)[1])) + + lstm = model.lstm + h, c = lstm(x) + @test h isa Array{Float32, 2} + @test size(h) == (4, 3) + @test c isa Array{Float32, 2} + @test size(c) == (4, 3) + # no initial state same as zero initial state + h1, c1 = lstm(x, (zeros(Float32, 4), zeros(Float32, 4))) + @test h ≈ h1 + @test c ≈ c1 end @testset "GRUCell" begin diff --git a/test/loading.jl b/test/loading.jl index 06bc412d31..c4de6055f5 100644 --- a/test/loading.jl +++ b/test/loading.jl @@ -16,12 +16,12 @@ end @testset "loadmodel!(dst, src)" begin - m1 = Chain(Dense(10, 5), Dense(5, 2, relu)) - m2 = Chain(Dense(10, 5), Dense(5, 2)) - m3 = Chain(Conv((3, 3), 3 => 16), Dense(5, 2)) - m4 = Chain(Dense(10, 6), Dense(6, 2)) - m5 = Chain(Dense(10, 5), Parallel(+, Dense(Flux.ones32(2, 5), false), Dense(5, 2))) - m6 = Chain(Dense(10, 5), Parallel(+, Dense(5, 2), Dense(5, 2))) + m1 = Chain(Dense(10 => 5), Dense(5 => 2, relu)) + m2 = Chain(Dense(10 => 5), Dense(5 => 2)) + m3 = Chain(Conv((3, 3), 3 => 16), Dense(5 => 2)) + m4 = Chain(Dense(10 => 6), Dense(6 => 2)) + m5 = Chain(Dense(10 => 5), Parallel(+, Dense(Flux.ones32(2, 5), false), Dense(5 => 2))) + m6 = Chain(Dense(10 => 5), Parallel(+, Dense(5 => 2), Dense(5 => 2))) loadmodel!(m1, m2) # trainable parameters copy over @@ -73,7 +73,7 @@ end Dropout(0.2), x -> reshape(x, :, size(x, 4)), Dropout(0.2), - Dense(90, 10), + Dense(90 => 10), softmax) chain2 = Chain([Dropout(0.1), Conv((3, 3), 1 => 32, relu), @@ -88,7 +88,7 @@ end Dropout(0.1), x -> reshape(x, :, size(x, 4)), Dropout(0.1), - Dense(90, 10), + Dense(90 => 10), softmax]) chain2[3].μ .= 5f0 chain2[3].σ² .= 2f0 @@ -143,9 +143,9 @@ end @test_throws ErrorException loadmodel!(m1, m2) @testset "loadmodel! 
& filter" begin - m1 = Chain(Dense(10, 5), Dense(5, 2, relu)) - m2 = Chain(Dense(10, 5), Dropout(0.2), Dense(5, 2)) - m3 = Chain(Dense(10, 5), Dense(5, 2, relu)) + m1 = Chain(Dense(10 => 5), Dense(5 => 2, relu)) + m2 = Chain(Dense(10 => 5), Dropout(0.2), Dense(5 => 2)) + m3 = Chain(Dense(10 => 5), Dense(5 => 2, relu)) # this will not error cause Dropout is skipped loadmodel!(m1, m2; filter = x -> !(x isa Dropout)) @@ -191,8 +191,8 @@ end end @testset "state" begin - m1 = Chain(Dense(10, 5), Parallel(+, Dense(Flux.ones32(2, 5), false), Dense(5 => 2))) - m2 = Chain(Dense(10, 5), Parallel(+, Dense(Flux.zeros32(2, 5), Flux.ones32(2)), Dense(5 => 2))) + m1 = Chain(Dense(10 => 5), Parallel(+, Dense(Flux.ones32(2, 5), false), Dense(5 => 2))) + m2 = Chain(Dense(10 => 5), Parallel(+, Dense(Flux.zeros32(2, 5), Flux.ones32(2)), Dense(5 => 2))) s = Flux.state(m1) @test s isa NamedTuple @test fieldnames(typeof(s)) == (:layers,) @@ -217,7 +217,7 @@ end end @testset "track active state and batch norm params" begin - m3 = Chain(Dense(10, 5), Dropout(0.2), Dense(5, 2), BatchNorm(2)) + m3 = Chain(Dense(10 => 5), Dropout(0.2), Dense(5 => 2), BatchNorm(2)) trainmode!(m3) s = Flux.state(m3) @test s.layers[2].active == true diff --git a/test/outputsize.jl b/test/outputsize.jl index fe217c0fc9..083b415756 100644 --- a/test/outputsize.jl +++ b/test/outputsize.jl @@ -2,15 +2,15 @@ m = Chain(Conv((3, 3), 3 => 16), Conv((3, 3), 16 => 32)) @test outputsize(m, (10, 10, 3, 1)) == (6, 6, 32, 1) - m = Dense(10, 5) + m = Dense(10 => 5) @test_throws DimensionMismatch outputsize(m, (5, 2)) == (5, 1) @test outputsize(m, (10,); padbatch=true) == (5, 1) - m = Chain(Dense(10, 8, σ), Dense(8, 5), Dense(5, 2)) + m = Chain(Dense(10 => 8, σ), Dense(8 => 5), Dense(5 => 2)) @test outputsize(m, (10,); padbatch=true) == (2, 1) @test outputsize(m, (10, 30)) == (2, 30) - m = Chain(Dense(10, 8, σ), Dense(8, 4), Dense(5, 2)) + m = Chain(Dense(10 => 8, σ), Dense(8 => 4), Dense(5 => 2)) @test_throws DimensionMismatch outputsize(m, (10,)) m = Flux.Scale(10) @@ -25,7 +25,7 @@ m = Flux.unsqueeze(dims=3) @test outputsize(m, (5, 7, 13)) == (5, 7, 1, 13) - m = Flux.Bilinear(10, 10, 7) + m = Flux.Bilinear((10, 10) => 7) @test outputsize(m, (10,)) == (7,) @test outputsize(m, (10, 32)) == (7, 32) @@ -41,13 +41,13 @@ end @testset "multiple inputs" begin - m = Parallel(vcat, Dense(2, 4, relu), Dense(3, 6, relu)) + m = Parallel(vcat, Dense(2 => 4, relu), Dense(3 => 6, relu)) @test outputsize(m, (2,), (3,)) == (10,) @test outputsize(m, ((2,), (3,))) == (10,) @test outputsize(m, (2,), (3,); padbatch=true) == (10, 1) @test outputsize(m, (2,7), (3,7)) == (10, 7) - m = Chain(m, Dense(10, 13, tanh), softmax) + m = Chain(m, Dense(10 => 13, tanh), softmax) @test outputsize(m, (2,), (3,)) == (13,) @test outputsize(m, ((2,), (3,))) == (13,) @test outputsize(m, (2,), (3,); padbatch=true) == (13, 1) @@ -59,7 +59,7 @@ end leakyrelu, lisht, logcosh, logσ, mish, relu, relu6, rrelu, selu, σ, softplus, softshrink, softsign, swish, tanhshrink, trelu] - @test outputsize(Dense(10, 5, f), (10, 1)) == (5, 1) + @test outputsize(Dense(10 => 5, f), (10, 1)) == (5, 1) end end diff --git a/test/runtests.jl b/test/runtests.jl index 6f5a2e7d84..a4d72f292e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -15,7 +15,7 @@ using Functors: fmapstructure_with_path # ENV["FLUX_TEST_CPU"] = "false" # ENV["FLUX_TEST_DISTRIBUTED_MPI"] = "true" # ENV["FLUX_TEST_DISTRIBUTED_NCCL"] = "true" -ENV["FLUX_TEST_ENZYME"] = "false" # We temporarily disable Enzyme tests since they are failing +# 
ENV["FLUX_TEST_ENZYME"] = "false" include("test_utils.jl") # for test_gradients diff --git a/test/train.jl b/test/train.jl index a021b6f22a..4f75d247ab 100644 --- a/test/train.jl +++ b/test/train.jl @@ -16,7 +16,7 @@ for (trainfn!, name) in ((Flux.train!, "Zygote"), (train_enzyme!, "Enzyme")) continue end - @testset "Explicit Flux.train! with $name" begin + @testset "Flux.train! with $name" begin Random.seed!(84) w = randn(10, 10) w2 = randn(10, 10) # NB outside the inner @testset, else it will be exactly == w, as the RNG seed is reset. @@ -48,12 +48,13 @@ for (trainfn!, name) in ((Flux.train!, "Zygote"), (train_enzyme!, "Enzyme")) end for (trainfn!, name) in ((Flux.train!, "Zygote"), (train_enzyme!, "Enzyme")) - - if (name == "Enzyme" && get(ENV, "FLUX_TEST_ENZYME", "true") == "false") - continue - end + # TODO reinstate Enzyme + name == "Enzyme" && continue + # if (name == "Enzyme" && get(ENV, "FLUX_TEST_ENZYME", "true") == "false") + # continue + # end - @testset "Explicit Flux.train! features with $name" begin + @testset "Flux.train! features with $name" begin @testset "Stop on NaN" begin m1 = Dense(1 => 1) m1.weight .= 0 @@ -88,7 +89,7 @@ for (trainfn!, name) in ((Flux.train!, "Zygote"), (train_enzyme!, "Enzyme")) end end -@testset "Explicit Flux.update! features" begin +@testset "Flux.update! features" begin m = Chain(Dense(2=>3, tanh), Dense(3=>1), only) x = rand(Float32, 2) y1 = m(x) # before @@ -116,9 +117,11 @@ end for (trainfn!, name) in ((Flux.train!, "Zygote"), (train_enzyme!, "Enzyme")) - if (name == "Enzyme" && get(ENV, "FLUX_TEST_ENZYME", "true") == "false") - continue - end + # TODO reinstate Enzyme + name == "Enzyme" && continue + # if (name == "Enzyme" && get(ENV, "FLUX_TEST_ENZYME", "true") == "false") + # continue + # end @testset "L2 regularisation with $name" begin # New docs claim an exact equivalent. 
It's a bit long to put the example in there, @@ -167,7 +170,7 @@ end @testset "Flux.setup bugs" begin # https://github.com/FluxML/Flux.jl/issues/2144 @test Flux.setup(Flux.Adam(), Embedding(3 => 1)).weight isa Optimisers.Leaf - # Typo in 0.13.9's deprecation - @test Flux.setup(Flux.ClipValue(1), Dense(2 => 3)).weight.rule isa Optimisers.ClipGrad + + @test Flux.setup(Flux.ClipGrad(1), Dense(2 => 3)).weight.rule isa Optimisers.ClipGrad @test Flux.setup(Flux.ClipNorm(1), Dense(2 => 3)).weight.rule isa Optimisers.ClipNorm end diff --git a/test/utils.jl b/test/utils.jl index 6b0a16bcf3..8372e33b69 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -225,7 +225,7 @@ end @test identity_init(1, 2, 3, 3)[:, end, :, :] == zeros(Float32, 1, 3, 3) end @testset "Dense ID mapping" begin - l = Dense(3,3, init = identity_init) + l = Dense(3 => 3, init = identity_init) indata = reshape(collect(Float32, 1:9), 3, 3) @test l(indata) == indata @@ -449,7 +449,7 @@ end @test modules[5] === m2 @test modules[6] === m3 - mod_par = Flux.modules(Parallel(Flux.Bilinear(2,2,2,cbrt), Dense(2=>2,abs), Dense(2=>2,abs2))) + mod_par = Flux.modules(Parallel(Flux.Bilinear((2,2) => 2,cbrt), Dense(2=>2,abs), Dense(2=>2,abs2))) @test length(mod_par) == 5 mod_rnn = Flux.modules(Chain(Dense(2=>3), BatchNorm(3), LSTM(3=>4))) @@ -559,15 +559,15 @@ end dense::Dense dense2::Dense end - Flux.@functor TwoDenses + Flux.@layer TwoDenses function (m::TwoDenses)(x) out = m.dense(x) end model = TwoDenses( - Dense(3,1), - Dense(3,2) + Dense(3 => 1), + Dense(3 => 2) ) p, re = Flux.destructure(model) @@ -602,7 +602,7 @@ end Flux.@layer Model (m::Model)(x) = m.a(x) .+ m.b(x) - d = Dense(1, 1) + d = Dense(1 => 1) x = rand(Float32, 1, 1) # Sharing the parameters @@ -631,8 +631,8 @@ end data = rand(Float32, n_input, n_batch) model = Chain( - Dense(n_input, n_shared), - Split(Dense(n_shared, n_outputs[1]), Dense(n_shared, n_outputs[2])) + Dense(n_input => n_shared), + Split(Dense(n_shared => n_outputs[1]), Dense(n_shared => n_outputs[2])) ) pvec, re = Flux.destructure(model)
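Taken together, the changes above move user code from the implicit `Flux.params` / `Flux.Optimise` style to the explicit Optimisers.jl style: `params` now emits a deprecation warning, the `Optimise` module is included from `deprecations.jl`, and mixing the two styles in `train!` or `update!` hits the friendly error methods defined there. A minimal before/after sketch of the migration (the model, data and learning rate below are made-up placeholders, not part of the patch):

```
using Flux

model = Chain(Dense(2 => 3, tanh), Dense(3 => 1))
data  = [(randn(Float32, 2, 8), randn(Float32, 1, 8))]

# Old implicit style, now deprecated:
#   ps = Flux.params(model)
#   Flux.train!((x, y) -> Flux.mse(model(x), y), ps, data, Flux.Optimise.Adam())

# New explicit style: build the optimiser state once, then pass the model itself.
opt_state = Flux.setup(Adam(0.01), model)
Flux.train!(model, data, opt_state) do m, x, y
    Flux.mse(m(x), y)
end

# Or write the loop by hand with an explicit gradient and update!:
for (x, y) in data
    grads = Flux.gradient(m -> Flux.mse(m(x), y), model)
    Flux.update!(opt_state, model, grads[1])
end
```

Passing a rule directly to `update!` without calling `setup` first, or passing the whole `gradient(...)` tuple instead of `grads[1]`, now triggers the error and warning methods added in `src/deprecations.jl` and `src/layers/basic.jl`.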