
Commit

Merge pull request #591 from LuxDL/ap/fused_ops
Used New Fused Ops from LuxLib
avik-pal authored Apr 28, 2024
2 parents 36b362a + 6dfaa8b commit 51f2968
Showing 7 changed files with 39 additions and 63 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/Downstream.yml
@@ -50,7 +50,7 @@ jobs:
 # force it to use this PR's version of the package
 Pkg.develop(PackageSpec(path=".")) # resolver may fail with main deps
 Pkg.update()
-Pkg.test() # resolver may fail with test time deps
+Pkg.test(; coverage=true) # resolver may fail with test time deps
 catch err
 err isa Pkg.Resolve.ResolverError || rethrow()
 # If we can't resolve that means this is incompatible by SemVer and this is fine
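For reference, the only change in this hunk is that `Pkg.test` now runs with `coverage=true`, so the downstream job also collects coverage. A standalone sketch of the same downstream-test pattern, assembled from the lines shown above (it assumes you run it inside a downstream package's environment with this PR's checkout available at `.`; the surrounding workflow setup is not part of this diff):

using Pkg

try
    # force it to use this PR's version of the package
    Pkg.develop(PackageSpec(path="."))  # resolver may fail with main deps
    Pkg.update()
    Pkg.test(; coverage=true)           # resolver may fail with test time deps
catch err
    # A resolver error just means the downstream package is not yet compatible
    # with this version (by SemVer), which is acceptable for this check.
    err isa Pkg.Resolve.ResolverError || rethrow()
end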
4 changes: 2 additions & 2 deletions Project.toml
@@ -65,7 +65,7 @@ ADTypes = "0.2, 1"
 Adapt = "4"
 Aqua = "0.8.4"
 ArrayInterface = "7.9"
-CUDA = "5.2"
+CUDA = "5.3.2"
 ChainRulesCore = "1.23"
 ComponentArrays = "0.15.11"
 ConcreteStructs = "0.2.3"
@@ -84,7 +84,7 @@ LuxAMDGPU = "0.2.2"
 LuxCUDA = "0.3.2"
 LuxCore = "0.1.14"
 LuxDeviceUtils = "0.1.19"
-LuxLib = "0.3.11"
+LuxLib = "0.3.18"
 LuxTestUtils = "0.1.15"
 MLUtils = "0.4.3"
 MPI = "0.20.19"
4 changes: 4 additions & 0 deletions examples/NeuralODE/main.jl
@@ -171,8 +171,12 @@ function train(model_function; cpu::Bool=false, kwargs...)
 end

 train(NeuralODECompact)
+nothing #hide
+
 #-

 train(NeuralODE)
+nothing #hide
+
 # We can also change the sensealg and train the model! `GaussAdjoint` allows you to use
 # any arbitrary parameter structure and not just a flat vector (`ComponentArray`).
29 changes: 8 additions & 21 deletions src/layers/basic.jl
@@ -197,7 +197,7 @@ end
 function initialparameters(rng::AbstractRNG, d::Dense{use_bias}) where {use_bias}
 if use_bias
 return (weight=d.init_weight(rng, d.out_dims, d.in_dims),
-bias=d.init_bias(rng, d.out_dims, 1))
+bias=d.init_bias(rng, d.out_dims, 1)) #TODO: In v0.6 make it a vector
 else
 return (weight=d.init_weight(rng, d.out_dims, d.in_dims),)
 end
@@ -210,32 +210,19 @@ statelength(d::Dense) = 0

 outputsize(d::Dense) = (d.out_dims,)

-@inline function (d::Dense{false})(x::AbstractVecOrMat, ps, st::NamedTuple)
-return apply_activation(d.activation, ps.weight * x), st
+@inline function (d::Dense)(x::AbstractVector, ps, st::NamedTuple)
+return vec(first(d(reshape(x, :, 1), ps, st))), st
 end

-@inline function (d::Dense{false})(x::AbstractArray, ps, st::NamedTuple)
-x_reshaped = reshape(x, size(x, 1), :)
+@inline function (d::Dense)(x::AbstractMatrix, ps, st::NamedTuple)
 return (
-reshape(apply_activation(d.activation, ps.weight * x_reshaped),
-d.out_dims, size(x)[2:end]...),
+fused_dense_bias_activation(
+d.activation, ps.weight, x, _vec(_getproperty(ps, Val(:bias)))),
 st)
 end

-@inline function (d::Dense{true})(x::AbstractVector, ps, st::NamedTuple)
-return apply_bias_activation(d.activation, ps.weight * x, vec(ps.bias)), st
-end
-
-@inline function (d::Dense{true})(x::AbstractMatrix, ps, st::NamedTuple)
-return apply_bias_activation(d.activation, ps.weight * x, ps.bias), st
-end
-
-@inline function (d::Dense{true})(x::AbstractArray, ps, st::NamedTuple)
-x_reshaped = reshape(x, size(x, 1), :)
-return (
-reshape(apply_bias_activation(d.activation, ps.weight * x_reshaped, ps.bias),
-d.out_dims, size(x)[2:end]...),
-st)
+@inline function (d::Dense)(x::AbstractArray, ps, st::NamedTuple)
+return reshape(first(d(reshape(x, size(x, 1), :), ps, st)), :, size(x)[2:end]...), st
 end

 """
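In effect, the four `Dense` dispatches (with and without bias, vector/matrix/N-D input) collapse into one matrix code path that calls LuxLib's `fused_dense_bias_activation`, with the vector and N-D methods reshaping into it. A minimal sketch of what that fused call computes, assuming LuxLib (≥ 0.3.18, per the compat bump above) exports it with the signature used in the diff; the unfused expression is only an illustrative reference, not code from this commit:

using LuxLib, Random  # fused_dense_bias_activation comes from LuxLib

rng = Xoshiro(0)
W = randn(rng, Float32, 4, 3)   # weight, out_dims × in_dims
b = randn(rng, Float32, 4)      # bias as a vector (cf. the #TODO above)
x = randn(rng, Float32, 3, 16)  # a batch of 16 inputs
act = tanh

y_fused = fused_dense_bias_activation(act, W, x, b)  # single fused kernel
y_naive = act.(W * x .+ b)                            # unfused reference

y_fused ≈ y_naive  # expected to hold up to floating-point tolerance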
29 changes: 11 additions & 18 deletions src/layers/conv.jl
@@ -114,16 +114,12 @@ function parameterlength(c::Conv{N, use_bias}) where {N, use_bias}
 (use_bias ? c.out_chs : 0)
 end

-@inline function (c::Conv{N, false})(x::AbstractArray, ps, st::NamedTuple) where {N}
-cdims = DenseConvDims(
-x, ps.weight; stride=c.stride, padding=c.pad, dilation=c.dilation, groups=c.groups)
-return apply_activation(c.activation, _conv(x, ps.weight, cdims)), st
-end
-
-@inline function (c::Conv{N, true})(x::AbstractArray, ps, st::NamedTuple) where {N}
-cdims = DenseConvDims(
-x, ps.weight; stride=c.stride, padding=c.pad, dilation=c.dilation, groups=c.groups)
-return apply_bias_activation(c.activation, _conv(x, ps.weight, cdims), ps.bias), st
+@inline function (c::Conv)(x::AbstractArray, ps, st::NamedTuple)
+cdims = DenseConvDims(x, ps.weight; c.stride, padding=c.pad, c.dilation, c.groups)
+return (
+fused_conv_bias_activation(
+c.activation, ps.weight, x, _getproperty(ps, Val(:bias)), cdims),
+st)
 end

 function Base.show(io::IO, l::Conv)
@@ -617,16 +613,13 @@ function parameterlength(c::CrossCor{N, use_bias}) where {N, use_bias}
 return prod(c.kernel_size) * c.in_chs * c.out_chs + (use_bias ? c.out_chs : 0)
 end

-@inline function (c::CrossCor{N, false})(x::AbstractArray, ps, st::NamedTuple) where {N}
+@inline function (c::CrossCor)(x::AbstractArray, ps, st::NamedTuple)
 cdims = DenseConvDims(
 DenseConvDims(x, ps.weight; c.stride, padding=c.pad, c.dilation); F=true)
-return apply_activation(c.activation, _conv(x, ps.weight, cdims)), st
-end
-
-@inline function (c::CrossCor{N, true})(x::AbstractArray, ps, st::NamedTuple) where {N}
-cdims = DenseConvDims(
-DenseConvDims(x, ps.weight; c.stride, padding=c.pad, c.dilation); F=true)
-return apply_bias_activation(c.activation, _conv(x, ps.weight, cdims), ps.bias), st
+return (
+fused_conv_bias_activation(
+c.activation, ps.weight, x, _getproperty(ps, Val(:bias)), cdims),
+st)
 end

 function Base.show(io::IO, l::CrossCor)
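`Conv` and `CrossCor` get the same treatment: the `use_bias`-specialized methods merge into one that hands the (possibly absent) bias to LuxLib's `fused_conv_bias_activation`. A hedged sketch of the equivalence with placeholder shapes; the bias is given the trailing-singleton shape Lux stores for `Conv` (whether a plain vector is also accepted is not shown in this diff), and the unfused line uses NNlib's `conv` purely as a reference:

using LuxLib, NNlib, Random

rng = Xoshiro(0)
x = randn(rng, Float32, 16, 16, 3, 4)  # H × W × C_in × batch
W = randn(rng, Float32, 3, 3, 3, 8)    # kH × kW × C_in × C_out
b = randn(rng, Float32, 1, 1, 8, 1)    # bias, broadcastable over H, W, batch
cdims = DenseConvDims(x, W; stride=1, padding=1, dilation=1, groups=1)

y_fused = fused_conv_bias_activation(relu, W, x, b, cdims)  # fused path
y_naive = relu.(conv(x, W, cdims) .+ b)                     # unfused reference

y_fused ≈ y_naive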
24 changes: 12 additions & 12 deletions src/layers/normalize.jl
@@ -129,15 +129,15 @@ statelength(l::BatchNorm) = (_track_stats(l) ? 2 * l.chs : 0) + 1

 function (BN::BatchNorm)(x::AbstractArray, ps, st::NamedTuple)
 y, stats = batchnorm(x, _getproperty(ps, Val(:scale)), _getproperty(ps, Val(:bias)),
-_getproperty(st, Val(:running_mean)),
-_getproperty(st, Val(:running_var)); BN.momentum, BN.epsilon, st.training)
+_getproperty(st, Val(:running_mean)), _getproperty(st, Val(:running_var)),
+BN.activation; BN.momentum, BN.epsilon, st.training)

 if _track_stats(BN)
 @set! st.running_mean = stats.running_mean
 @set! st.running_var = stats.running_var
 end

-return apply_activation(BN.activation, y), st
+return y, st
 end

 function Base.show(io::IO, l::BatchNorm)
@@ -241,9 +241,9 @@ end
 parameterlength(l::GroupNorm) = _affine(l) ? (l.chs * 2) : 0

 function (GN::GroupNorm)(x::AbstractArray, ps, st::NamedTuple)
-y = groupnorm(x, _getproperty(ps, Val(:scale)),
-_getproperty(ps, Val(:bias)); GN.groups, GN.epsilon)
-return apply_activation(GN.activation, y), st
+y = groupnorm(x, _getproperty(ps, Val(:scale)), _getproperty(ps, Val(:bias)),
+GN.activation; GN.groups, GN.epsilon)
+return y, st
 end

 function Base.show(io::IO, l::GroupNorm)
@@ -355,9 +355,9 @@ initialstates(rng::AbstractRNG, l::InstanceNorm) = (; training=Val(true))
 parameterlength(l::InstanceNorm) = _affine(l) ? (l.chs * 2) : 0

 function (IN::InstanceNorm)(x::AbstractArray, ps, st::NamedTuple)
-y, stats = instancenorm(x, _getproperty(ps, Val(:scale)),
-_getproperty(ps, Val(:bias)); IN.epsilon, st.training)
-return apply_activation(IN.activation, y), st
+y, stats = instancenorm(x, _getproperty(ps, Val(:scale)), _getproperty(ps, Val(:bias)),
+IN.activation; IN.epsilon, st.training)
+return y, st
 end

 function Base.show(io::IO, l::InstanceNorm)
@@ -574,9 +574,9 @@ function initialparameters(rng::AbstractRNG, ln::LayerNorm)
 end

 function (l::LayerNorm)(x::AbstractArray, ps, st::NamedTuple)
-y = layernorm(
-x, _getproperty(ps, Val(:scale)), _getproperty(ps, Val(:bias)); l.dims, l.epsilon)
-return apply_activation(l.activation, y), st
+y = layernorm(x, _getproperty(ps, Val(:scale)),
+_getproperty(ps, Val(:bias)), l.activation; l.dims, l.epsilon)
+return y, st
 end

 function Base.show(io::IO, l::LayerNorm)
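Across `BatchNorm`, `GroupNorm`, `InstanceNorm`, and `LayerNorm` the pattern is identical: the layer's activation is now passed positionally into the LuxLib normalization call instead of being applied in a separate broadcast afterwards. A hedged sketch for `groupnorm` (shapes, `groups`, and `epsilon` are illustrative; `scale`/`bias` stand in for what the layer would pull out of `ps`):

using LuxLib, NNlib, Random

rng = Xoshiro(0)
x = randn(rng, Float32, 8, 8, 6, 4)  # 6 channels, batch of 4
scale = ones(Float32, 6)
bias = zeros(Float32, 6)

# Fused form from this PR: the activation goes into the normalization call.
y_fused = groupnorm(x, scale, bias, relu; groups=3, epsilon=1f-5)

# The removed two-step form, written against the same API via `identity`.
y_twostep = relu.(groupnorm(x, scale, bias, identity; groups=3, epsilon=1f-5))

y_fused ≈ y_twostep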
10 changes: 1 addition & 9 deletions src/utils.jl
@@ -29,7 +29,7 @@ _maybetuple_string(pad::Tuple) = all(==(pad[1]), pad) ? string(pad[1]) : string(
 # Padding
 struct SamePad end

-function _calc_padding(pad, k::NTuple{N}, dilation, stride) where {N}
+function _calc_padding(pad, ::NTuple{N}, dilation, stride) where {N}
 return _expand(Val(2 * N), pad)
 end

@@ -43,9 +43,6 @@ function _calc_padding(::SamePad, k::NTuple, dilation, stride)
 return Tuple(mapfoldl(i -> [cld(i, 2), fld(i, 2)], vcat, pad_amt))
 end

-# Getting typename
-get_typename(::T) where {T} = Base.typename(T).wrapper
-
 # RNN Utilities
 @inline _gate(h::Int, n::Int) = (1:h) .+ h * (n - 1)
 @inline _gate(x::AbstractVector, h::Int, n::Int) = view(x, _gate(h, n))
@@ -120,11 +117,6 @@ end

 # Backend Integration
 ## Convolution
-@inline _conv(x, weight, cdims) = conv(x, weight, cdims)
-@inline function _conv(x::SubArray{T, N, <:AbstractArray}, weight, cdims) where {T, N}
-return _conv(copy(x), weight, cdims)
-end
-
 @inline _conv_transpose(x, weight, cdims) = ∇conv_data(x, weight, cdims)
 @inline function _conv_transpose(
 x::SubArray{T, N, <:GPUArraysCore.AnyGPUArray}, weight, cdims) where {T, N}

3 comments on commit 51f2968

@avik-pal
Member Author


@JuliaRegistrator


Error while trying to register: Version 0.5.39 already exists

@github-actions
Contributor


Benchmark Results

Benchmark suite Current: 51f2968 Previous: 36b362a Ratio
Dense(2 => 2)/cpu/reverse/ReverseDiff (compiled)/(2, 128) 3861 ns 3674.375 ns 1.05
Dense(2 => 2)/cpu/reverse/Zygote/(2, 128) 7130 ns 5854.25 ns 1.22
Dense(2 => 2)/cpu/reverse/Tracker/(2, 128) 20018 ns 15508 ns 1.29
Dense(2 => 2)/cpu/reverse/ReverseDiff/(2, 128) 9644 ns 9975.333333333334 ns 0.97
Dense(2 => 2)/cpu/reverse/Flux/(2, 128) 8847.5 ns 8696 ns 1.02
Dense(2 => 2)/cpu/reverse/SimpleChains/(2, 128) 4458.25 ns 4494.625 ns 0.99
Dense(2 => 2)/cpu/forward/NamedTuple/(2, 128) 1121.198717948718 ns 2060.9 ns 0.54
Dense(2 => 2)/cpu/forward/ComponentArray/(2, 128) 1186.6060606060605 ns 1664.8521126760563 ns 0.71
Dense(2 => 2)/cpu/forward/Flux/(2, 128) 1793.1509433962265 ns 1815.6923076923076 ns 0.99
Dense(2 => 2)/cpu/forward/SimpleChains/(2, 128) 179.78284923928078 ns 179.37413073713492 ns 1.00
Dense(20 => 20)/cpu/reverse/ReverseDiff (compiled)/(20, 128) 17293 ns 17743 ns 0.97
Dense(20 => 20)/cpu/reverse/Zygote/(20, 128) 17333 ns 18735 ns 0.93
Dense(20 => 20)/cpu/reverse/Tracker/(20, 128) 36698 ns 35667 ns 1.03
Dense(20 => 20)/cpu/reverse/ReverseDiff/(20, 128) 28303 ns 28753 ns 0.98
Dense(20 => 20)/cpu/reverse/Flux/(20, 128) 19607 ns 19787 ns 0.99
Dense(20 => 20)/cpu/reverse/SimpleChains/(20, 128) 16842 ns 17562.5 ns 0.96
Dense(20 => 20)/cpu/forward/NamedTuple/(20, 128) 3847.125 ns 4920.571428571428 ns 0.78
Dense(20 => 20)/cpu/forward/ComponentArray/(20, 128) 3942.375 ns 5003.571428571428 ns 0.79
Dense(20 => 20)/cpu/forward/Flux/(20, 128) 4867.714285714285 ns 5028 ns 0.97
Dense(20 => 20)/cpu/forward/SimpleChains/(20, 128) 1653.1 ns 1651.1 ns 1.00
Conv((3, 3), 3 => 3)/cpu/reverse/ReverseDiff (compiled)/(64, 64, 3, 128) 49892211 ns 48926002 ns 1.02
Conv((3, 3), 3 => 3)/cpu/reverse/Zygote/(64, 64, 3, 128) 57726460.5 ns 108271301 ns 0.53
Conv((3, 3), 3 => 3)/cpu/reverse/Tracker/(64, 64, 3, 128) 99509688.5 ns 84036071.5 ns 1.18
Conv((3, 3), 3 => 3)/cpu/reverse/ReverseDiff/(64, 64, 3, 128) 106130667 ns 107192834 ns 0.99
Conv((3, 3), 3 => 3)/cpu/reverse/Flux/(64, 64, 3, 128) 105713647.5 ns 106869664 ns 0.99
Conv((3, 3), 3 => 3)/cpu/reverse/SimpleChains/(64, 64, 3, 128) 11648881 ns 11898560 ns 0.98
Conv((3, 3), 3 => 3)/cpu/forward/NamedTuple/(64, 64, 3, 128) 7036951.5 ns 18820810.5 ns 0.37
Conv((3, 3), 3 => 3)/cpu/forward/ComponentArray/(64, 64, 3, 128) 7057975 ns 18550564.5 ns 0.38
Conv((3, 3), 3 => 3)/cpu/forward/Flux/(64, 64, 3, 128) 18159101 ns 18693425 ns 0.97
Conv((3, 3), 3 => 3)/cpu/forward/SimpleChains/(64, 64, 3, 128) 6374599.5 ns 6446973 ns 0.99
vgg16/cpu/reverse/Zygote/(32, 32, 3, 1) 117682544 ns 106088743.5 ns 1.11
vgg16/cpu/reverse/Zygote/(32, 32, 3, 16) 730992081 ns 832416622 ns 0.88
vgg16/cpu/reverse/Zygote/(32, 32, 3, 64) 2850285111 ns 2984233767 ns 0.96
vgg16/cpu/reverse/Tracker/(32, 32, 3, 1) 164703848 ns 146290469 ns 1.13
vgg16/cpu/reverse/Tracker/(32, 32, 3, 16) 892438806 ns 1085323519.5 ns 0.82
vgg16/cpu/reverse/Tracker/(32, 32, 3, 64) 2801258575 ns 3036724601 ns 0.92
vgg16/cpu/reverse/Flux/(32, 32, 3, 1) 81992561 ns 90590491 ns 0.91
vgg16/cpu/reverse/Flux/(32, 32, 3, 16) 851035463 ns 733826110 ns 1.16
vgg16/cpu/reverse/Flux/(32, 32, 3, 64) 3068020617 ns 3075391905 ns 1.00
vgg16/cpu/forward/NamedTuple/(32, 32, 3, 1) 29174554.5 ns 29790496 ns 0.98
vgg16/cpu/forward/NamedTuple/(32, 32, 3, 16) 180762225 ns 212277593 ns 0.85
vgg16/cpu/forward/NamedTuple/(32, 32, 3, 64) 682516468 ns 781425925 ns 0.87
vgg16/cpu/forward/ComponentArray/(32, 32, 3, 1) 28838898 ns 30383532 ns 0.95
vgg16/cpu/forward/ComponentArray/(32, 32, 3, 16) 173516198 ns 197512172 ns 0.88
vgg16/cpu/forward/ComponentArray/(32, 32, 3, 64) 673443329 ns 778085435.5 ns 0.87
vgg16/cpu/forward/Flux/(32, 32, 3, 1) 22791681 ns 29280386 ns 0.78
vgg16/cpu/forward/Flux/(32, 32, 3, 16) 185722808.5 ns 188405608 ns 0.99
vgg16/cpu/forward/Flux/(32, 32, 3, 64) 708367926.5 ns 809432575 ns 0.88
Conv((3, 3), 64 => 64)/cpu/reverse/ReverseDiff (compiled)/(64, 64, 64, 128) 1095069819 ns 1147032763 ns 0.95
Conv((3, 3), 64 => 64)/cpu/reverse/Zygote/(64, 64, 64, 128) 1853949272 ns 1880482284 ns 0.99
Conv((3, 3), 64 => 64)/cpu/reverse/Tracker/(64, 64, 64, 128) 2135692150 ns 2148352018 ns 0.99
Conv((3, 3), 64 => 64)/cpu/reverse/ReverseDiff/(64, 64, 64, 128) 2328333728.5 ns 2539276579 ns 0.92
Conv((3, 3), 64 => 64)/cpu/reverse/Flux/(64, 64, 64, 128) 1806353666.5 ns 1864259381 ns 0.97
Conv((3, 3), 64 => 64)/cpu/forward/NamedTuple/(64, 64, 64, 128) 316924856 ns 358738282 ns 0.88
Conv((3, 3), 64 => 64)/cpu/forward/ComponentArray/(64, 64, 64, 128) 319541589.5 ns 405361001.5 ns 0.79
Conv((3, 3), 64 => 64)/cpu/forward/Flux/(64, 64, 64, 128) 366675180 ns 412981399.5 ns 0.89
Conv((3, 3), 1 => 1)/cpu/reverse/ReverseDiff (compiled)/(64, 64, 1, 128) 11925685.5 ns 12013680.5 ns 0.99
Conv((3, 3), 1 => 1)/cpu/reverse/Zygote/(64, 64, 1, 128) 17932328 ns 18334339 ns 0.98
Conv((3, 3), 1 => 1)/cpu/reverse/Tracker/(64, 64, 1, 128) 19128765 ns 19696096 ns 0.97
Conv((3, 3), 1 => 1)/cpu/reverse/ReverseDiff/(64, 64, 1, 128) 23843499.5 ns 24459171 ns 0.97
Conv((3, 3), 1 => 1)/cpu/reverse/Flux/(64, 64, 1, 128) 17990776.5 ns 18368123 ns 0.98
Conv((3, 3), 1 => 1)/cpu/reverse/SimpleChains/(64, 64, 1, 128) 1152412.5 ns 1168429 ns 0.99
Conv((3, 3), 1 => 1)/cpu/forward/NamedTuple/(64, 64, 1, 128) 2037817 ns 2120374.5 ns 0.96
Conv((3, 3), 1 => 1)/cpu/forward/ComponentArray/(64, 64, 1, 128) 2036956 ns 2133928 ns 0.95
Conv((3, 3), 1 => 1)/cpu/forward/Flux/(64, 64, 1, 128) 2053085 ns 2118672 ns 0.97
Conv((3, 3), 1 => 1)/cpu/forward/SimpleChains/(64, 64, 1, 128) 199553 ns 216398 ns 0.92
Dense(200 => 200)/cpu/reverse/ReverseDiff (compiled)/(200, 128) 294264.5 ns 309096.5 ns 0.95
Dense(200 => 200)/cpu/reverse/Zygote/(200, 128) 265987 ns 277497.5 ns 0.96
Dense(200 => 200)/cpu/reverse/Tracker/(200, 128) 360333 ns 374849 ns 0.96
Dense(200 => 200)/cpu/reverse/ReverseDiff/(200, 128) 404816 ns 418761 ns 0.97
Dense(200 => 200)/cpu/reverse/Flux/(200, 128) 272669 ns 279090 ns 0.98
Dense(200 => 200)/cpu/reverse/SimpleChains/(200, 128) 403614 ns 409864.5 ns 0.98
Dense(200 => 200)/cpu/forward/NamedTuple/(200, 128) 80691 ns 93404 ns 0.86
Dense(200 => 200)/cpu/forward/ComponentArray/(200, 128) 81091 ns 94686.5 ns 0.86
Dense(200 => 200)/cpu/forward/Flux/(200, 128) 86171 ns 89286 ns 0.97
Dense(200 => 200)/cpu/forward/SimpleChains/(200, 128) 104385 ns 104365 ns 1.00
Conv((3, 3), 16 => 16)/cpu/reverse/ReverseDiff (compiled)/(64, 64, 16, 128) 193514136 ns 193458437 ns 1.00
Conv((3, 3), 16 => 16)/cpu/reverse/Zygote/(64, 64, 16, 128) 326751148.5 ns 373472760.5 ns 0.87
Conv((3, 3), 16 => 16)/cpu/reverse/Tracker/(64, 64, 16, 128) 401460640 ns 404755227 ns 0.99
Conv((3, 3), 16 => 16)/cpu/reverse/ReverseDiff/(64, 64, 16, 128) 431980614 ns 454774364.5 ns 0.95
Conv((3, 3), 16 => 16)/cpu/reverse/Flux/(64, 64, 16, 128) 370530564 ns 372704909 ns 0.99
Conv((3, 3), 16 => 16)/cpu/reverse/SimpleChains/(64, 64, 16, 128) 335121405 ns 371496475.5 ns 0.90
Conv((3, 3), 16 => 16)/cpu/forward/NamedTuple/(64, 64, 16, 128) 43967310 ns 60350010 ns 0.73
Conv((3, 3), 16 => 16)/cpu/forward/ComponentArray/(64, 64, 16, 128) 43943682.5 ns 52074054 ns 0.84
Conv((3, 3), 16 => 16)/cpu/forward/Flux/(64, 64, 16, 128) 54701594 ns 51366141 ns 1.06
Conv((3, 3), 16 => 16)/cpu/forward/SimpleChains/(64, 64, 16, 128) 28158813 ns 28579992.5 ns 0.99
Dense(2000 => 2000)/cpu/reverse/ReverseDiff (compiled)/(2000, 128) 19527366 ns 20025770.5 ns 0.98
Dense(2000 => 2000)/cpu/reverse/Zygote/(2000, 128) 19511676 ns 19976154 ns 0.98
Dense(2000 => 2000)/cpu/reverse/Tracker/(2000, 128) 23170806 ns 24021852 ns 0.96
Dense(2000 => 2000)/cpu/reverse/ReverseDiff/(2000, 128) 24000771 ns 24619312 ns 0.97
Dense(2000 => 2000)/cpu/reverse/Flux/(2000, 128) 19543440.5 ns 19994222 ns 0.98
Dense(2000 => 2000)/cpu/forward/NamedTuple/(2000, 128) 6503799 ns 6681275.5 ns 0.97
Dense(2000 => 2000)/cpu/forward/ComponentArray/(2000, 128) 6489076.5 ns 6671001 ns 0.97
Dense(2000 => 2000)/cpu/forward/Flux/(2000, 128) 6483030.5 ns 6636669 ns 0.98

This comment was automatically generated by workflow using github-action-benchmark.
