Skip to content

Commit

Permalink
feat: add bf16 function (#1104)
Browse files Browse the repository at this point in the history
* feat: add bf16 function

* docs: add bf16 to docs
  • Loading branch information
avik-pal authored Nov 25, 2024
1 parent cd96335 commit 06eb507
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 5 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "Lux"
uuid = "b2108857-7c20-44ae-9111-449ecde12c47"
authors = ["Avik Pal <[email protected]> and contributors"]
version = "1.3.4"
version = "1.4.0"

[deps]
ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
Expand Down
1 change: 1 addition & 0 deletions docs/src/api/Lux/utilities.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ simply pass the parameters / states / arrays into one of the following functions
Lux.f16
Lux.f32
Lux.f64
Lux.bf16
```

## Element Type Matching
Expand Down
2 changes: 1 addition & 1 deletion src/Lux.jl
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ export BinaryCrossEntropyLoss, BinaryFocalLoss, CrossEntropyLoss, DiceCoeffLoss,
PoissonLoss, SiameseContrastiveLoss, SquaredHingeLoss
export GenericLossFunction

export f16, f32, f64
export f16, f32, f64, bf16
export match_eltype

export FromFluxAdaptor, FluxLayer
Expand Down
38 changes: 35 additions & 3 deletions src/helpers/eltype_conversion.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,46 @@ for (fname, ftype) in zip((:f16, :f32, :f64), (Float16, Float32, Float64))
$($fname)(m)
Converts the `eltype` of `m` *floating point* values to `$($ftype)`.
Recurses into structs marked with `Functors.@functor`.
To avoid recursion into structs mark them with `Functors.@leaf`.
"""
$(fname)(m) = (LuxEltypeAdaptor{$ftype}())(m)
$(fname)(m) = (LuxEltypeAdaptor{$(ftype)}())(m)
end
end

# `bf16` is only functional when the running Julia exposes `Core.BFloat16`. Otherwise a
# stub that raises an error is defined, so the name `bf16` exists in both cases.
@static if isdefined(Core, :BFloat16)
    bf16_docs = """
        bf16(m)

    Converts the `eltype` of `m` *floating point* values to `BFloat16`.
    To avoid recursion into structs mark them with `Functors.@leaf`.

    !!! warning

        `BFloat16s.jl` needs to be loaded before using this function.

    !!! tip "Support for `BFloat16`"

        Most Lux operations aren't optimized for `BFloat16` yet. Instead this is meant to be
        used together with `Reactant.@compile`.
    """

    bf16(m) = (LuxEltypeAdaptor{Core.BFloat16}())(m)
else
    bf16_docs = """
        bf16(m)

    !!! danger "Not Supported"

        Current Julia version does not support `BFloat16`. Use Julia 1.11 or newer.
    """

    # This branch is taken only when `Core.BFloat16` is missing, so the message must point
    # at Julia versions *older* than 1.11 (matching the docstring above), not "1.11+".
    bf16(_) = error("`bf16` is not supported on Julia versions older than 1.11")
end

# Attach whichever docstring was selected above to `bf16`.
@doc (bf16_docs) bf16

# Common incorrect usage
for f in (f16, f32, f64)
for f in (f16, f32, f64, bf16)
warn_msg = "$(f) is not meant to be broadcasted like `$(f).(x)` or `x .|> $(f)`, \
and this might give unexpected results and could lead to crashes. Directly \
use `$(f)` as `$(f)(x)` or `x |> $(f)` instead."
Expand Down

1 comment on commit 06eb507

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lux Benchmarks

Benchmark suite Current: 06eb507 Previous: cd96335 Ratio
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4375 ns 4375 ns 1
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 4333 ns 4208 ns 1.03
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 4875 ns 5042 ns 0.97
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 4291 ns 3833 ns 1.12
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 62852 ns 59750 ns 1.05
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 10250 ns 10229.5 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 10542 ns 10458 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10625 ns 11208 ns 0.95
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 10542 ns 10083.5 ns 1.05
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 442826.5 ns 421969 ns 1.05
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s) 1000 ns 1042 ns 0.96
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s) 1208 ns 1333 ns 0.91
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s) 1333 ns 1334 ns 1.00
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s) 1333 ns 1167 ns 1.14
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA 18476 ns 18218 ns 1.01
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 3895.5 ns 3791 ns 1.03
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 4167 ns 4125 ns 1.01
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 4042 ns 4375 ns 0.92
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 3959 ns 4083 ns 0.97
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA 113416 ns 110020 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57542 ns 55625 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46292 ns 46833 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46541 ns 46208 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83375 ns 81750 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 37589.5 ns 36958.5 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2024896 ns 2050166 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1835271 ns 2100334 ns 0.87
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2098250 ns 2073937.5 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2020667 ns 1993041 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 198299 ns 195385 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 143916 ns 143208 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 144479.5 ns 143958.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 145937.5 ns 146000 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 143417 ns 182375 ns 0.79
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 166429 ns 165528 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1117416 ns 1157292 ns 0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 995229 ns 1158062.5 ns 0.86
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1124542 ns 1107125 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1145250.5 ns 1113937.5 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 537731.5 ns 525805 ns 1.02
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 3792 ns 3542 ns 1.07
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 3542 ns 3959 ns 0.89
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 4333.5 ns 4458 ns 0.97
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 3667 ns 3458 ns 1.06
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 68177 ns 70267.5 ns 0.97
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9334 ns 8792 ns 1.06
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8583 ns 8667 ns 0.99
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9417 ns 9500 ns 0.99
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8854.5 ns 9333 ns 0.95
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 498056 ns 486148 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 16125 ns 15916 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 15813 ns 15208 ns 1.04
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 18791 ns 17458 ns 1.08
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 15167 ns 15750 ns 0.96
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 55225 ns 55035.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 215625 ns 214687.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 213104.5 ns 213875 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 214167 ns 214499.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 213188 ns 214020.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 275510 ns 271923 ns 1.01
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s) 500 ns 584 ns 0.86
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s) 750 ns 542 ns 1.38
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s) 875 ns 750 ns 1.17
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s) 667 ns 542 ns 1.23
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA 17577 ns 17550 ns 1.00
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 1500 ns 1667 ns 0.90
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 1667 ns 1625 ns 1.03
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 1542 ns 1875 ns 0.82
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 1500 ns 1583 ns 0.95
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA 103563 ns 102829 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7334 ns 7083 ns 1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5666 ns 6041 ns 0.94
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5958 ns 5917 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10333 ns 10000 ns 1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 23689 ns 23605 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 221979 ns 221000 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 229334 ns 229416.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 229417 ns 230875 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 214125 ns 252791.5 ns 0.85
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 169909 ns 168416.5 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s) 3875 ns 3958 ns 0.98
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s) 3916 ns 3875 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s) 3917 ns 3917 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s) 3917 ns 3916 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA 23381 ns 23282 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16667 ns 16625 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16750 ns 16875 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 17000 ns 16958 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16708 ns 16667 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA 162845 ns 160471 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 571542 ns 574250 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 574583 ns 576167 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 575500 ns 579895.5 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 575333 ns 573917 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA 113453 ns 113142 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 1419645.5 ns 1424041.5 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1428270.5 ns 1417292 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1425833 ns 1420500 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 1425291 ns 1425500 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA 212962.5 ns 209769 ns 1.02
lenet(28, 28, 1, 64)/forward/CPU/2 thread(s) 1086583 ns 1054333.5 ns 1.03
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s) 963625.5 ns 959917 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s) 1340708 ns 1343583.5 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s) 1274625 ns 1300896 ns 0.98
lenet(28, 28, 1, 64)/forward/GPU/CUDA 275533.5 ns 279273.5 ns 0.99
lenet(28, 28, 1, 64)/zygote/CPU/2 thread(s) 6003313 ns 5749437.5 ns 1.04
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s) 4543291 ns 4599687.5 ns 0.99
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s) 4950500 ns 4952395.5 ns 1.00
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s) 5760542 ns 5610084 ns 1.03
lenet(28, 28, 1, 64)/zygote/GPU/CUDA 1094293 ns 1087158.5 ns 1.01
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s) 500 ns 542 ns 0.92
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s) 542 ns 541 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s) 542 ns 542 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s) 541 ns 500 ns 1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA 23428 ns 23646 ns 0.99
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2084 ns 2125 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2125 ns 2084 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2209 ns 2208 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2083 ns 2083 ns 1
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA 170454.5 ns 173162 ns 0.98
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 4542 ns 4542 ns 1
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 4208 ns 4208 ns 1
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 4750 ns 5000 ns 0.95
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 4166 ns 4000 ns 1.04
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 66283 ns 64791.5 ns 1.02
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10895.5 ns 11208 ns 0.97
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 11375 ns 11291.5 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 12334 ns 12209 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 11084 ns 11292 ns 0.98
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 458465.5 ns 449166 ns 1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6667 ns 6792 ns 0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6541.5 ns 6833 ns 0.96
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8792 ns 8312.5 ns 1.06
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6000 ns 6250 ns 0.96
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 52738.5 ns 51887 ns 1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 17667 ns 16792 ns 1.05
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 17958 ns 16667 ns 1.08
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 17833 ns 17500 ns 1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 16875 ns 17875 ns 0.94
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 308559 ns 301591 ns 1.02
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 583 ns 583 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 625 ns 583 ns 1.07
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 666 ns 625 ns 1.07
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 542 ns 583 ns 0.93
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 32624 ns 32520 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 8750 ns 8625 ns 1.01
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 8833 ns 8416.5 ns 1.05
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9625 ns 9417 ns 1.02
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 8667 ns 8250 ns 1.05
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 159033.5 ns 159487 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s) 64667 ns 64959 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s) 64583 ns 64667 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s) 64458 ns 64416 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s) 64709 ns 64541 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA 111312 ns 110435.5 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 283500 ns 291250 ns 0.97
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 271791 ns 285125 ns 0.95
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 274167 ns 274833.5 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 287250 ns 279770.5 ns 1.03
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA 185765 ns 183913 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s) 3282083.5 ns 3222417 ns 1.02
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s) 3018667 ns 3060583 ns 0.99
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s) 3018083 ns 3017291.5 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s) 3955041 ns 4070708 ns 0.97
mlp7layer_bn(gelu)(32 x 256)/forward/GPU/CUDA 584692 ns 571448 ns 1.02
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s) 7658895.5 ns 7560916.5 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s) 7457750 ns 7434917 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s) 7453875 ns 7464958 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s) 8280228.5 ns 8157583.5 ns 1.02
mlp7layer_bn(gelu)(32 x 256)/zygote/GPU/CUDA 1363348 ns 1323265 ns 1.03
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s) 17573167 ns 17698375 ns 0.99
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s) 17536583 ns 17382541 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s) 17554104.5 ns 17917041 ns 0.98
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s) 14252687.5 ns 14113978.5 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 23479854 ns 24259667 ns 0.97
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 33441750 ns 33537791.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 37263874.5 ns 37485625 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 35385958 ns 34876854.5 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1857021 ns 1864963 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 189054666 ns 191699417 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 232216291.5 ns 233048750 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 192889813 ns 194089542 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 446068417 ns 434858250 ns 1.03
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 13856600 ns 13855629 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 287697167 ns 292275916 ns 0.98
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 333361166 ns 336958667 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 296288146 ns 297206917 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 358087917 ns 408837354 ns 0.88
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 21542 ns 22333 ns 0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 21917 ns 24521 ns 0.89
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 23833 ns 23666 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 23000 ns 22417 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 99298.5 ns 95962.5 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 103416 ns 104625 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 102959 ns 103334 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 105041 ns 104875 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 103708 ns 103250 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 518798 ns 503280 ns 1.03
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6000 ns 6417 ns 0.94
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 5750 ns 6250 ns 0.92
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 6708 ns 7250 ns 0.93
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6334 ns 6041 ns 1.05
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 69881 ns 67524 ns 1.03
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14833.5 ns 15250 ns 0.97
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14917 ns 15500 ns 0.96
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 16166 ns 16125 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 15333.5 ns 12875 ns 1.19
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 489724.5 ns 474310.5 ns 1.03
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 2971270.5 ns 2994417 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2063958 ns 2072458 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2272542 ns 2264416 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 4666750 ns 4512000 ns 1.03
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/GPU/CUDA 591884 ns 589406.5 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 23554624.5 ns 23917916 ns 0.98
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 18028687.5 ns 18038749.5 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 17852125 ns 17983750 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 36060875 ns 35261125 ns 1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2769683 ns 2768485.5 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 33424000 ns 33831646.5 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 27634645.5 ns 27630729 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 28456458 ns 28545541 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 41983541 ns 41340292 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 71959 ns 72833 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 75187.5 ns 73521 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 76708 ns 74958 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 74750 ns 83500 ns 0.90
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 105798 ns 102113 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 296145.5 ns 208042 ns 1.42
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 218625 ns 291208 ns 0.75
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 213708 ns 219875 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 318812.5 ns 217417 ns 1.47
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 566311.5 ns 550239 ns 1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 11792 ns 11916 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 12250 ns 12042 ns 1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 12750 ns 13000 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 12500 ns 11583 ns 1.08
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 73134 ns 70941.5 ns 1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 26042 ns 26541 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 27042 ns 26875 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 27875 ns 27833 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 27417 ns 26708 ns 1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 488772.5 ns 472589 ns 1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 11792 ns 12083 ns 0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 12625 ns 12917 ns 0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 14250 ns 13771 ns 1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 12750 ns 12334 ns 1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 54676 ns 52605 ns 1.04
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 25458 ns 25917 ns 0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 25334 ns 25625 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 27000 ns 25958 ns 1.04
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 26167 ns 26542 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 315843.5 ns 304518.5 ns 1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 179292 ns 179750 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 182084 ns 181375 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 182291.5 ns 182875 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 183250 ns 182083 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 58956 ns 57612 ns 1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 581896 ns 585375 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 588312.5 ns 582375 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 583583 ns 584291.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 587083.5 ns 585895.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 294154 ns 287910.5 ns 1.02
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 5625 ns 6396 ns 0.88
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6334 ns 6084 ns 1.04
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 6979.5 ns 7667 ns 0.91
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6520.5 ns 5542 ns 1.18
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 72600.5 ns 70404 ns 1.03
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 13916 ns 14458 ns 0.96
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14417 ns 14917 ns 0.97
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 15667 ns 16000 ns 0.98
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14895.5 ns 14416 ns 1.03
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 479621.5 ns 461584 ns 1.04
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s) 1206937.5 ns 1193604.5 ns 1.01
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s) 1245542 ns 1246000 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s) 1292542 ns 1273583.5 ns 1.01
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s) 1006417 ns 1016875 ns 0.99
batchedmm(512, Bsize=4)/forward/GPU/CUDA 299757 ns 301246 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s) 4169416.5 ns 4298583 ns 0.97
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s) 4414833 ns 4454937.5 ns 0.99
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s) 4586979 ns 4559833 ns 1.01
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s) 3897020.5 ns 3718125 ns 1.05
batchedmm(512, Bsize=4)/zygote/GPU/CUDA 1046888 ns 1052722 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1792 ns 1875 ns 0.96
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1834 ns 1834 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1875 ns 1833 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1875 ns 1834 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA 23582 ns 24315 ns 0.97
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 4833 ns 5000 ns 0.97
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4916 ns 4916 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 4917 ns 4917 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4875 ns 4916 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA 189156.5 ns 193381 ns 0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5792 ns 6270.5 ns 0.92
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5917 ns 6292 ns 0.94
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7375 ns 7042 ns 1.05
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5917 ns 5666 ns 1.04
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 56730.5 ns 56858.5 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10500 ns 11000 ns 0.95
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10833 ns 10584 ns 1.02
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 11542 ns 11292 ns 1.02
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10667 ns 10625 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 346270 ns 341133.5 ns 1.02
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s) 292 ns 416 ns 0.70
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s) 333 ns 334 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s) 333 ns 292 ns 1.14
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s) 375 ns 292 ns 1.28
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA 22862 ns 23459 ns 0.97
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2709 ns 2792 ns 0.97
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2750 ns 2708 ns 1.02
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 3042 ns 3000 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2750 ns 2750 ns 1
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA 160360.5 ns 163121 ns 0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 11000 ns 12209 ns 0.90
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 11833 ns 12250 ns 0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 13334 ns 15083 ns 0.88
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 11583 ns 11375 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 61359.5 ns 59412 ns 1.03
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 24541 ns 24792 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 24459 ns 24833 ns 0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 25542 ns 25042 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 24542 ns 25167 ns 0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 310594.5 ns 302787.5 ns 1.03
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4208 ns 4250 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4167 ns 4167 ns 1
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4209 ns 4208 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4250 ns 4167 ns 1.02
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA 24670 ns 25427.5 ns 0.97
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16250 ns 16041 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16083 ns 16250 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16291 ns 16125 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 15959 ns 16084 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA 204493 ns 203537 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5792 ns 5875 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 5791 ns 5834 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 5875 ns 5875 ns 1
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5833 ns 5875 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 33437 ns 34639 ns 0.97
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 20500 ns 21375 ns 0.96
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 21167 ns 21125 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 22250 ns 22125 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 20895.5 ns 23000 ns 0.91
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 177667.5 ns 181357 ns 0.98
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s) 419666.5 ns 404042 ns 1.04
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s) 386209 ns 390084 ns 0.99
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s) 478875 ns 483167 ns 0.99
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s) 109479 ns 103834 ns 1.05
batchedmm(16, Bsize=512)/forward/GPU/CUDA 67033 ns 67491 ns 0.99
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s) 909583.5 ns 913854 ns 1.00
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s) 973708.5 ns 961459 ns 1.01
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s) 1177021 ns 1201334 ns 0.98
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s) 462625 ns 448417 ns 1.03
batchedmm(16, Bsize=512)/zygote/GPU/CUDA 190401 ns 192152 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 81208 ns 80542 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 81791 ns 81500 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 82417 ns 79854.5 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 81291.5 ns 78813 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 193814.5 ns 193447.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1935667 ns 1946833 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1916333.5 ns 1932479 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1698562.5 ns 1920708 ns 0.88
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1930833.5 ns 1904937.5 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 412302 ns 402534 ns 1.02
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 333 ns 0.88
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s) 333 ns 333 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA 21830 ns 22000 ns 0.99
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 1792 ns 1833 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 1792 ns 1792 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 1834 ns 1834 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 1792 ns 1833 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA 176875.5 ns 169877.5 ns 1.04
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 6459 ns 7521 ns 0.86
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 6333 ns 7167 ns 0.88
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 8041 ns 7792 ns 1.03
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6875 ns 6500 ns 1.06
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 68185.5 ns 61779 ns 1.10
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9375 ns 9250 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9125 ns 9500 ns 0.96
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9417 ns 9042 ns 1.04
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9542 ns 9292 ns 1.03
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 336822.5 ns 314965 ns 1.07
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 120048500 ns 158324292 ns 0.76
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 173952417 ns 174385041 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 148044250 ns 148149145.5 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 104491521 ns 104978917 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5473411 ns 5475583 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 614607916.5 ns 673914521 ns 0.91
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 555101125 ns 556536500 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 456940500.5 ns 454282229 ns 1.01
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 767695250.5 ns 754352104 ns 1.02
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 34955825 ns 35161544.5 ns 0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 651124125 ns 703002500 ns 0.93
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 669116625 ns 668300021 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 578815791.5 ns 587968625 ns 0.98
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 742698833 ns 742489083 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 60083 ns 57833 ns 1.04
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 48000 ns 48000 ns 1
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46333 ns 47959 ns 0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 84917 ns 82333 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 38192 ns 38135 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1922875 ns 1945042 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1974250 ns 1994937.5 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1990062.5 ns 1978208 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1906625 ns 1862834 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 174155.5 ns 174772.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 267042 ns 267333 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 265667 ns 267521 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 269750 ns 268709 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 267791.5 ns 266959 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 147410.5 ns 138445.5 ns 1.06
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 592896 ns 605250 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 684208.5 ns 597333.5 ns 1.15
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 589520.5 ns 696500 ns 0.85
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 698666 ns 676042 ns 1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 813499.5 ns 740206.5 ns 1.10
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 2209791.5 ns 2204042 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 2209896 ns 2205084 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 2103125 ns 2220750 ns 0.95
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 2236250 ns 2219958 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 133936 ns 135150.5 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5510542 ns 5598583 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5555437.5 ns 5526083 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5498229 ns 5502958 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5548208 ns 5487708.5 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 892951 ns 792599 ns 1.13
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 637334 ns 660166 ns 0.97
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 646875 ns 643583 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 646625 ns 659417 ns 0.98
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 654875 ns 644542 ns 1.02
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA 46730 ns 47532 ns 0.98
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 1826417 ns 1795875 ns 1.02
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1722937.5 ns 1722291 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1719334 ns 1727709 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 2093625 ns 2095458 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA 221633 ns 227325 ns 0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 59125 ns 56375 ns 1.05
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46417 ns 46291 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 47000 ns 46625 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 84917 ns 82500 ns 1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 28496 ns 29417 ns 0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2036604.5 ns 2030542 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1836917 ns 2111062.5 ns 0.87
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2095792 ns 2091895.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2019479 ns 1996833 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 190943.5 ns 193004 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 13379729 ns 13382833 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 12436958 ns 12443000 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 12541125 ns 12480979 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 15244750 ns 15173917 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 515571.5 ns 517073 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 47364291.5 ns 47607083 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 41915083 ns 41883313 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 40768458 ns 40854417 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 59377021 ns 58509979 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2882313 ns 2896765.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 74658083.5 ns 97269708 ns 0.77
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 90949750 ns 68581771 ns 1.33
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 90367208 ns 90434166 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 99601541 ns 98826583 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 59459 ns 56833 ns 1.05
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 47417 ns 47417 ns 1
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 47333 ns 47291 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 84375 ns 80833 ns 1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 48174 ns 46888 ns 1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1935167 ns 1939104 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1966979 ns 2010459 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1974250 ns 1977312.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1911312.5 ns 1892292 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 197391.5 ns 192004 ns 1.03
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 333 ns 292 ns 1.14
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 375 ns 334 ns 1.12
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 333 ns 333 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 32694 ns 31834 ns 1.03
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6042 ns 6084 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6375 ns 6083 ns 1.05
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6583 ns 6709 ns 0.98
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6083 ns 6167 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 187739 ns 176223.5 ns 1.07
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s) 250 ns 292 ns 0.86
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s) 333 ns 292 ns 1.14
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA 32507 ns 31304 ns 1.04
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 2625 ns 2625 ns 1
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 2792 ns 2584 ns 1.08
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 2917 ns 2917 ns 1
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 2667 ns 2667 ns 1
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA 175489 ns 164663.5 ns 1.07
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 286852687.5 ns 324499000.5 ns 0.88
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 339813500 ns 340579375 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 313305624.5 ns 313389416.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 269143125 ns 273909208 ns 0.98
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 7114947 ns 7105361 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 994653583 ns 1052816166 ns 0.94
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 936368458 ns 943649000 ns 0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 837895375.5 ns 840615666.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 1177393750 ns 1152028667 ns 1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 34035408 ns 34095663 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 1316953312.5 ns 1721214458 ns 0.77
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 1689250042 ns 1359927020.5 ns 1.24
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 1683427084 ns 1606248000 ns 1.05
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 1672545042 ns 1668736833 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1454500 ns 1425375 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1407875 ns 1415542 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1409834 ns 1416520.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1414167 ns 1410375 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 128152 ns 127634 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5044833 ns 5060999.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4714312.5 ns 5059104 ns 0.93
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5026437.5 ns 5025375 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5051042 ns 5018125 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 686490.5 ns 596333 ns 1.15
vgg16(32, 32, 3, 32)/forward/CPU/2 thread(s) 172590354 ns 163798854 ns 1.05
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s) 124318041 ns 128369875 ns 0.97
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s) 123190833 ns 130888792 ns 0.94
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s) 165357625 ns 168698771 ns 0.98
vgg16(32, 32, 3, 32)/forward/GPU/CUDA 4891073 ns 5432122 ns 0.90
vgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s) 615854000 ns 630866750 ns 0.98
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s) 630625333 ns 635134916 ns 0.99
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s) 562103875 ns 554211625 ns 1.01
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s) 653647292 ns 648292583 ns 1.01
vgg16(32, 32, 3, 32)/zygote/GPU/CUDA 16015121 ns 16519965 ns 0.97
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s) 9006416.5 ns 9165854 ns 0.98
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s) 8896937.5 ns 8986459 ns 0.99
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s) 7913208 ns 7922833 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s) 9977125 ns 9756167 ns 1.02
batchedmm(512, Bsize=32)/forward/GPU/CUDA 1591890.5 ns 1610067 ns 0.99
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s) 35918500 ns 37032625 ns 0.97
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s) 36875416 ns 37212042 ns 0.99
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s) 33279603.5 ns 33438583 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s) 39552041.5 ns 37841958 ns 1.05
batchedmm(512, Bsize=32)/zygote/GPU/CUDA 6456562 ns 6473180 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s) 47709 ns 47479.5 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s) 47375 ns 47437.5 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s) 47500 ns 47667 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s) 47542 ns 47500 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA 19056.5 ns 18175 ns 1.05
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s) 50500 ns 50250 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s) 50250 ns 50625 ns 0.99
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s) 50750 ns 50542 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s) 50375 ns 50416.5 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA 263004.5 ns 243634.5 ns 1.08
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6500 ns 7229.5 ns 0.90
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6875 ns 6917 ns 0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 8250 ns 7834 ns 1.05
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 7000 ns 7292 ns 0.96
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 145281 ns 134228.5 ns 1.08
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9750 ns 10375 ns 0.94
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10250 ns 9458 ns 1.08
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10500 ns 10334 ns 1.02
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10042 ns 10250 ns 0.98
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 744130.5 ns 725024.5 ns 1.03
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5875 ns 6917 ns 0.85
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6125 ns 6292 ns 0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7375 ns 7417 ns 0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5750 ns 5937.5 ns 0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 151878 ns 158899 ns 0.96
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 13042 ns 13334 ns 0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 13250 ns 13083 ns 1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13250 ns 13958 ns 0.95
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 13542 ns 12875 ns 1.05
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 607325 ns 654550.5 ns 0.93
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 1042 ns 1083 ns 0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 1000 ns 1083 ns 0.92
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 1084 ns 1042 ns 1.04
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 1083 ns 1083 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 32597 ns 32302 ns 1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8000 ns 8000 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7833 ns 7958.5 ns 0.98
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8292 ns 8000 ns 1.04
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8375 ns 8250 ns 1.02
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 237194 ns 248668.5 ns 0.95
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 23167 ns 23334 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 23084 ns 23625 ns 0.98
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 23458 ns 23604.5 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 23292 ns 23334 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA 18583 ns 18197 ns 1.02
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 52500 ns 52375 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 54208.5 ns 52583 ns 1.03
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 53083 ns 52709 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 52667 ns 52291 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA 385105.5 ns 365195 ns 1.05
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1402916 ns 1409312.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1454958 ns 1395312.5 ns 1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1401792 ns 1395667 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1456375 ns 1399187.5 ns 1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 197023 ns 196466 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5029479.5 ns 5048625 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5009000 ns 5082916.5 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5018334 ns 5010208 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5048291.5 ns 5015083 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 723983 ns 697077 ns 1.04
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3027250 ns 3082583 ns 0.98
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2080437.5 ns 2075667 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2291437.5 ns 2279000 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 4926416 ns 4910958 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 586737 ns 586799 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 24385583.5 ns 24742792 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 18885375 ns 18899334 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 18817833 ns 18912125 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 37153959 ns 36606271 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2840867 ns 2884394 ns 0.98
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 34104021 ns 34600271 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 28291895.5 ns 28275125 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 28002792 ns 27978625 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 42422958 ns 41693583 ns 1.02
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s) 142464667 ns 146263625 ns 0.97
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s) 147805542 ns 148262792 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s) 127021875 ns 125521666 ns 1.01
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s) 173652229 ns 173208104.5 ns 1.00
batchedmm(512, Bsize=512)/forward/GPU/CUDA 22558116 ns 22564372 ns 1.00
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s) 1200697875 ns 948935833 ns 1.27
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s) 1864615833.5 ns 1199705645.5 ns 1.55
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s) 1647966021 ns 727524542 ns 2.27
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s) 686829458 ns 936153853.5 ns 0.73
batchedmm(512, Bsize=512)/zygote/GPU/CUDA 117772826 ns 115985315 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 75375 ns 74250 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 84209 ns 76209 ns 1.10
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 76458 ns 76042 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 80958 ns 72167 ns 1.12
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 312734.5 ns 331111.5 ns 0.94
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 204687.5 ns 282500 ns 0.72
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 278104 ns 191083.5 ns 1.46
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 192375 ns 280584 ns 0.69
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 283312.5 ns 291917 ns 0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1488616 ns 1500994.5 ns 0.99
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s) 35722250 ns 36314916.5 ns 0.98
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s) 36312354 ns 36531396 ns 0.99
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s) 32588937.5 ns 32439729.5 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s) 40883292 ns 40435354 ns 1.01
batchedmm(512, Bsize=128)/forward/GPU/CUDA 5836813 ns 5837859 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s) 149459958 ns 151857209 ns 0.98
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s) 153182708.5 ns 153888604 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s) 140187104 ns 135530208.5 ns 1.03
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s) 226961625.5 ns 283241209 ns 0.80
batchedmm(512, Bsize=128)/zygote/GPU/CUDA 34882818.5 ns 34859945 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 121271541.5 ns 159567375 ns 0.76
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 174726458 ns 174506458 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 147669333 ns 147925667 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 105646958 ns 104572437 ns 1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5477234.5 ns 5480695 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 471261458.5 ns 524085270.5 ns 0.90
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 465682583 ns 467380250 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 434340042 ns 437823166 ns 0.99
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 758899104.5 ns 737646542 ns 1.03
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 32272056.5 ns 32284174.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 709031375 ns 696105375 ns 1.02
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 654357417 ns 658106854.5 ns 0.99
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 581732375 ns 575346979 ns 1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 734152875 ns 729353375 ns 1.01
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s) 1246834 ns 1155874.5 ns 1.08
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s) 970729 ns 998792 ns 0.97
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s) 905979 ns 991542 ns 0.91
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s) 2088750 ns 2092625 ns 1.00
mlp7layer_bn(relu)(32 x 256)/forward/GPU/CUDA 584722.5 ns 579446 ns 1.01
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s) 3017521 ns 2931916.5 ns 1.03
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s) 2605541 ns 2619083.5 ns 0.99
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s) 2618042 ns 2626604.5 ns 1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s) 3762104 ns 3482417 ns 1.08
mlp7layer_bn(relu)(32 x 256)/zygote/GPU/CUDA 1908342 ns 1969877.5 ns 0.97
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s) 5812937.5 ns 5947625 ns 0.98
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s) 5782937.5 ns 5782625 ns 1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s) 5769333 ns 5801958.5 ns 0.99
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s) 2967958 ns 2880584 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7500 ns 7208 ns 1.04
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6250 ns 6083 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6125 ns 5959 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10375 ns 9959 ns 1.04
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 25756 ns 26024 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 212417 ns 212562.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 221208.5 ns 221083.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 220334 ns 221333 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 206375 ns 209292 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 307623 ns 302079.5 ns 1.02
vgg16(32, 32, 3, 64)/forward/CPU/2 thread(s) 310236833.5 ns 311414437.5 ns 1.00
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s) 228243416 ns 232931208 ns 0.98
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s) 199023750 ns 202032375 ns 0.99
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s) 307111500 ns 308462875 ns 1.00
vgg16(32, 32, 3, 64)/forward/GPU/CUDA 7677099 ns 7680461 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s) 1077070792 ns 1101691479.5 ns 0.98
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s) 909540270.5 ns 909424125 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s) 811121083 ns 804661000 ns 1.01
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s) 1177347271 ns 1153673416.5 ns 1.02
vgg16(32, 32, 3, 64)/zygote/GPU/CUDA 26401108 ns 26512167 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5625 ns 5833.5 ns 0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5833.5 ns 5833 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6667 ns 6270.5 ns 1.06
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5208 ns 5146 ns 1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 199489.5 ns 196235 ns 1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7208 ns 7542 ns 0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7334 ns 7125 ns 1.03
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7416 ns 7417 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7291 ns 7125 ns 1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 722768 ns 702510 ns 1.03
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 583 ns 625 ns 0.93
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 667 ns 584 ns 1.14
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 667 ns 625 ns 1.07
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 500 ns 583 ns 0.86
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 24721 ns 24654 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9000 ns 9333 ns 0.96
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9209 ns 8750 ns 1.05
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 9709 ns 9875 ns 0.98
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9416 ns 9292 ns 1.01
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 238784 ns 239874 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 353291.5 ns 352958 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 351750 ns 351875 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 352354.5 ns 351583.5 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 362833 ns 352208 ns 1.03
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA 21565 ns 21408 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 814792 ns 779709 ns 1.04
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 826041.5 ns 775541 ns 1.07
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 777875 ns 776062.5 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 829958 ns 817375 ns 1.02
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA 302093.5 ns 316328 ns 0.96
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s) 337583.5 ns 317708 ns 1.06
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s) 340250 ns 341667 ns 1.00
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s) 444208 ns 453354 ns 0.98
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s) 10812.5 ns 10875 ns 0.99
batchedmm(16, Bsize=32)/forward/GPU/CUDA 18424 ns 18691 ns 0.99
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s) 719166.5 ns 712145.5 ns 1.01
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s) 721917 ns 734917 ns 0.98
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s) 1006458 ns 1006834 ns 1.00
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s) 28250 ns 27250 ns 1.04
batchedmm(16, Bsize=32)/zygote/GPU/CUDA 299767.5 ns 293795 ns 1.02
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s) 379708.5 ns 359083.5 ns 1.06
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s) 349958 ns 350250 ns 1.00
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s) 436354 ns 442875 ns 0.99
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s) 30833 ns 30583 ns 1.01
batchedmm(16, Bsize=128)/forward/GPU/CUDA 23185.5 ns 22877.5 ns 1.01
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s) 737500 ns 736584 ns 1.00
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s) 772041.5 ns 783750 ns 0.99
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s) 1022146 ns 1041500 ns 0.98
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s) 101459 ns 105875 ns 0.96
batchedmm(16, Bsize=128)/zygote/GPU/CUDA 233048 ns 265090.5 ns 0.88
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s) 3458 ns 3666 ns 0.94
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s) 3625 ns 3667 ns 0.99
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s) 3625 ns 3667 ns 0.99
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s) 3625 ns 3750 ns 0.97
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA 18179 ns 17832 ns 1.02
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s) 4334 ns 4208 ns 1.03
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s) 4625 ns 4292 ns 1.08
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s) 4625 ns 4333 ns 1.07
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s) 4458 ns 4333 ns 1.03
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA 297309.5 ns 285935 ns 1.04
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 3833 ns 4209 ns 0.91
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 3708 ns 3875 ns 0.96
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 4208.5 ns 4291 ns 0.98
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4104 ns 3250 ns 1.26
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 236489 ns 226147.5 ns 1.05
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8458 ns 8541 ns 0.99
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8166 ns 8208.5 ns 0.99
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8792 ns 8833 ns 1.00
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8708 ns 8792 ns 0.99
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 1272633 ns 1241590 ns 1.03
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 208042 ns 203041 ns 1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 215895.5 ns 210833 ns 1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 211084 ns 213042 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 199667 ns 200208 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 35583 ns 35096 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 645812.5 ns 600458 ns 1.08
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 623291 ns 664687.5 ns 0.94
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 622916 ns 621125 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 638333 ns 587666 ns 1.09
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 366544.5 ns 364021 ns 1.01
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s) 1020979 ns 1006145.5 ns 1.01
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s) 1006020.5 ns 1034750 ns 0.97
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s) 957729 ns 960375 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s) 904000 ns 870666.5 ns 1.04
batchedmm(128, Bsize=128)/forward/GPU/CUDA 208984 ns 207603 ns 1.01
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s) 4550166.5 ns 4675520.5 ns 0.97
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s) 4713709 ns 4661500 ns 1.01
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s) 4462125 ns 4484166.5 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s) 5571625 ns 5182375 ns 1.08
batchedmm(128, Bsize=128)/zygote/GPU/CUDA 936095 ns 945582 ns 0.99
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 3708 ns 4167 ns 0.89
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 3708.5 ns 3458 ns 1.07
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 4292 ns 4416.5 ns 0.97
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3709 ns 3250 ns 1.14
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 245340.5 ns 242881.5 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7167 ns 7625 ns 0.94
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7375 ns 7125 ns 1.04
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7708 ns 7791 ns 0.99
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7209 ns 7166 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 1060150.5 ns 1049374 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1616083 ns 1641104.5 ns 0.98
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1153750 ns 1162041.5 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1337250 ns 1361146 ns 0.98
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2432374.5 ns 2337792 ns 1.04
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/GPU/CUDA 217163 ns 215237 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12337062.5 ns 12428417 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9522833 ns 9554417 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9266729 ns 9282166 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18081312 ns 18043958 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1948614 ns 1957521 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17355771 ns 17446729 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14388208.5 ns 14307562.5 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14348354 ns 14338292 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21196875 ns 21055500 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 88312.5 ns 90250 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 89271 ns 89750 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 91125 ns 92271 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 91625 ns 92625 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 126391 ns 126161 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2036875 ns 2045083 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2015416.5 ns 2029000 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1865791 ns 2032875 ns 0.92
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2043208 ns 2022667 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1072650 ns 1071170.5 ns 1.00
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s) 2813 ns 1645.5 ns 1.71
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s) 2791 ns 2375 ns 1.18
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s) 3375 ns 3708 ns 0.91
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s) 1833 ns 2875 ns 0.64
batchedmm(2, Bsize=4)/forward/GPU/CUDA 16578 ns 16073 ns 1.03
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s) 2542 ns 2792 ns 0.91
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s) 2625 ns 2708 ns 0.97
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s) 2875 ns 2916 ns 0.99
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s) 3000 ns 2792 ns 1.07
batchedmm(2, Bsize=4)/zygote/GPU/CUDA 199941.5 ns 196950.5 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7167 ns 7166 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5709 ns 6083 ns 0.94
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5833 ns 5958 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10250 ns 10042 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 34656 ns 33844 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 212709 ns 215354.5 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 220625 ns 220833.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 220541 ns 220875 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 220354 ns 209625.5 ns 1.05
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 356302 ns 351596 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3708 ns 3750 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3708 ns 3708 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3709 ns 3708 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3750 ns 3709 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA 22913 ns 22384 ns 1.02
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 14584 ns 14250 ns 1.02
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 14458 ns 14459 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 14500 ns 14334 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 14166.5 ns 14375 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA 486419 ns 522120.5 ns 0.93
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 92375 ns 140291 ns 0.66
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 93333.5 ns 91729.5 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 94916.5 ns 96250 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 95417 ns 94458 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 125841 ns 125465 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1915000 ns 1947916 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1914209 ns 1932104.5 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1928125.5 ns 1925000 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1940729 ns 1650916 ns 1.18
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1045000 ns 1016603 ns 1.03
lenet(28, 28, 1, 32)/forward/CPU/2 thread(s) 871959 ns 859167 ns 1.01
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s) 821041.5 ns 818395.5 ns 1.00
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s) 1216500 ns 1219500 ns 1.00
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s) 943271 ns 962834 ns 0.98
lenet(28, 28, 1, 32)/forward/GPU/CUDA 280426 ns 269546 ns 1.04
lenet(28, 28, 1, 32)/zygote/CPU/2 thread(s) 2729167 ns 2844645.5 ns 0.96
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s) 2498104 ns 2436375 ns 1.03
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s) 3340041 ns 3336375 ns 1.00
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s) 3427250 ns 3413042 ns 1.00
lenet(28, 28, 1, 32)/zygote/GPU/CUDA 1723859 ns 1630539.5 ns 1.06
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17229 ns 15708.5 ns 1.10
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 17875 ns 16000 ns 1.12
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 18792 ns 17166.5 ns 1.09
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 15375 ns 14958.5 ns 1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 190406.5 ns 143350.5 ns 1.33
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 228541 ns 217083.5 ns 1.05
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 220833.5 ns 215417 ns 1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 216521 ns 216916 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 228437.5 ns 227125 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 725866.5 ns 653459 ns 1.11
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 221145.5 ns 221959 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 222000 ns 221645.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 221291 ns 221395.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 221083.5 ns 221083 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 321034 ns 274733.5 ns 1.17
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 495084 ns 511042 ns 0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 496625 ns 495750 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 496729 ns 497042 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 507625 ns 508875 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1510358 ns 1471458 ns 1.03
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s) 3854 ns 4666.5 ns 0.83
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s) 3875 ns 4083 ns 0.95
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s) 5042 ns 5708 ns 0.88
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s) 4083 ns 3917 ns 1.04
batchedmm(16, Bsize=4)/forward/GPU/CUDA 17250 ns 16967 ns 1.02
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s) 7167 ns 7125 ns 1.01
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s) 6959 ns 7417 ns 0.94
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s) 7250 ns 7520.5 ns 0.96
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s) 7416 ns 7250 ns 1.02
batchedmm(16, Bsize=4)/zygote/GPU/CUDA 201503.5 ns 198610.5 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 20083 ns 18084 ns 1.11
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 16875 ns 18166.5 ns 0.93
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 19500 ns 18666.5 ns 1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 18167 ns 18395.5 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 232442 ns 148303 ns 1.57
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 224916 ns 214125 ns 1.05
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 212708 ns 212791.5 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 212416 ns 213042 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 248812.5 ns 219417 ns 1.13
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1078558.5 ns 1024505 ns 1.05
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 4333 ns 4625 ns 0.94
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 4250 ns 4333 ns 0.98
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 5209 ns 5708 ns 0.91
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 4417 ns 3708.5 ns 1.19
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 255475 ns 244514 ns 1.04
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10708 ns 10875 ns 0.98
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 10166 ns 10062.5 ns 1.01
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 10833 ns 11375 ns 0.95
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 11375 ns 10583 ns 1.07
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 1114054 ns 1099794 ns 1.01
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 3500 ns 4000 ns 0.88
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 3583.5 ns 3792 ns 0.95
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 4458 ns 4375 ns 1.02
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3917 ns 2750 ns 1.42
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 247293 ns 250198 ns 0.99
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7542 ns 7270.5 ns 1.04
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7250 ns 7459 ns 0.97
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8083 ns 7916 ns 1.02
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7791 ns 7500 ns 1.04
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 1128935 ns 1106505 ns 1.02
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 23544917 ns 24086812.5 ns 0.98
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 34700375 ns 34704750 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 37800604.5 ns 37376896 ns 1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 35322562.5 ns 34935000 ns 1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1834217 ns 1853477 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 183535208 ns 186942250 ns 0.98
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 159261916 ns 159685500 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 146891041.5 ns 146457125 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 419037250 ns 411532208 ns 1.02
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 16405198.5 ns 16500596 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 428624000 ns 434054666 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 254269584 ns 253740479 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 296570146 ns 299567770.5 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 493357917 ns 479705417 ns 1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 184479.5 ns 184375 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 184208 ns 183584 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 185416 ns 184333 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 184062.5 ns 185292 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 233424 ns 229399 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 585417 ns 594750 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 589833 ns 586209 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 586896 ns 586729.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 639166 ns 599250.5 ns 1.07
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1146333 ns 1121066.5 ns 1.02
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s) 3917708 ns 3936375 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s) 3921208 ns 4081937 ns 0.96
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s) 3581645.5 ns 3587479 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s) 4674709 ns 4565729.5 ns 1.02
batchedmm(128, Bsize=512)/forward/GPU/CUDA 538155 ns 538820 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s) 17548417 ns 18136458.5 ns 0.97
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s) 17792083 ns 17936750 ns 0.99
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s) 16472417 ns 16532771 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s) 21347458 ns 20226167 ns 1.06
batchedmm(128, Bsize=512)/zygote/GPU/CUDA 2621425 ns 2633099 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 542 ns 583 ns 0.93
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 625 ns 625 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 584 ns 625 ns 0.93
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 583 ns 583 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 33117 ns 31971 ns 1.04
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9291 ns 9375 ns 0.99
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9354.5 ns 9000 ns 1.04
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9792 ns 9500 ns 1.03
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9416 ns 9333 ns 1.01
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 269036 ns 265140 ns 1.01
vgg16(32, 32, 3, 128)/forward/CPU/2 thread(s) 503680250 ns 503989417 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s) 425402999.5 ns 431858541.5 ns 0.99
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s) 418147958 ns 427434834 ns 0.98
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s) 678706395.5 ns 592092708 ns 1.15
vgg16(32, 32, 3, 128)/forward/GPU/CUDA 12481919 ns 11928812 ns 1.05
vgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s) 1881496729.5 ns 1891189687.5 ns 0.99
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s) 1619255500 ns 1632073542 ns 0.99
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s) 1494277750 ns 1496948750 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s) 2234203604 ns 2217192312.5 ns 1.01
vgg16(32, 32, 3, 128)/zygote/GPU/CUDA 49122118.5 ns 49332313 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1536334 ns 1638750 ns 0.94
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1156271 ns 1179458 ns 0.98
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1380541 ns 1387875 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2362625 ns 2352479.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 217676 ns 214938 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12766416 ns 12852583.5 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9918708 ns 9964500 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9674833 ns 9669416.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18454708 ns 18345667 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 2051013 ns 2032751.5 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17738333 ns 17791875 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14710417 ns 14679354.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14604375 ns 14576209 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21451125 ns 21490021 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 26250 ns 26333 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 26208 ns 26250 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 26250 ns 26250 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 26250 ns 26250 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA 23581.5 ns 23891 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 67333 ns 66750 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 68000 ns 67791 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 67333 ns 67042 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 67042 ns 67042 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA 404754.5 ns 403092 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 205583 ns 203542 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 209125 ns 209834 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 209000 ns 210500 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 199041 ns 199500 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 26431.5 ns 26253 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 611478.5 ns 602666.5 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 633292 ns 670479 ns 0.94
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 670416 ns 621791 ns 1.08
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 611479 ns 633916 ns 0.96
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 353085.5 ns 351051 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 612333 ns 678375 ns 0.90
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 643520.5 ns 654937.5 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 644958 ns 646500 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 652334 ns 669916 ns 0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 132321 ns 131843 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2263750 ns 2326042 ns 0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2226645.5 ns 2262000 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2243875 ns 2145978.5 ns 1.05
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2302583 ns 2234542 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1253025 ns 1242552 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 19667 ns 17645.5 ns 1.11
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 16917 ns 17062.5 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 21500.5 ns 19125 ns 1.12
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 18208 ns 17875 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 145311.5 ns 146421.5 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 233042 ns 220959 ns 1.05
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 218770.5 ns 219500 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 262625 ns 220291 ns 1.19
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 230000 ns 235729 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1059070.5 ns 1091456.5 ns 0.97
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 583 ns 625 ns 0.93
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 667 ns 666 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 667 ns 708 ns 0.94
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 542 ns 583 ns 0.93
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 23551 ns 23721 ns 0.99
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9667 ns 10084 ns 0.96
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9583 ns 9791.5 ns 0.98
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 10583 ns 10041 ns 1.05
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9959 ns 9875 ns 1.01
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 258697 ns 261550.5 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5833 ns 6042 ns 0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5542 ns 5625 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6500 ns 6584 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 4958 ns 5125 ns 0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 231871 ns 234716 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6875 ns 7417 ns 0.93
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7125 ns 7208 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7792 ns 8041 ns 0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6833 ns 7334 ns 0.93
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 803650 ns 806215 ns 1.00
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 2166 ns 2229.5 ns 0.97
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1917 ns 2458 ns 0.78
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 2417 ns 2375 ns 1.02
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 2333 ns 2292 ns 1.02
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA 17852 ns 17855 ns 1.00
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 6417 ns 6542 ns 0.98
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 6458 ns 6667 ns 0.97
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 6916 ns 6834 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 6459 ns 6500 ns 0.99
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA 332798.5 ns 333301.5 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s) 749396 ns 755083 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s) 746625 ns 746333 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s) 749250 ns 749250 ns 1
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s) 751417 ns 750187.5 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA 21271 ns 21362 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s) 793042 ns 788958.5 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s) 792500 ns 772209 ns 1.03
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s) 775750 ns 787687.5 ns 0.98
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s) 797542 ns 791333 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA 296567 ns 298265 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7500 ns 7125 ns 1.05
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5542 ns 6041 ns 0.92
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6000 ns 5959 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10458 ns 10209 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 32600 ns 33317.5 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 221375 ns 226500 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 240270.5 ns 236063 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 257854 ns 228041 ns 1.13
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 222250 ns 255812.5 ns 0.87
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 360398 ns 363202 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 10104.5 ns 10542 ns 0.96
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 10334 ns 10334 ns 1
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 10916 ns 11208 ns 0.97
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 10229.5 ns 9833 ns 1.04
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 251149.5 ns 246668.5 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 24666 ns 24729.5 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 24312.5 ns 24666 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 25750 ns 25542 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 25562.5 ns 24625 ns 1.04
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 1138926 ns 1134784.5 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 106325125 ns 106546667 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 117472625 ns 118425312.5 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 120287229 ns 120189792 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 117860729 ns 117420708 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 2629206 ns 2655736 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 394161333.5 ns 394570417 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 365470000 ns 368931959 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 355300666 ns 424438979 ns 0.84
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 484349417 ns 482063875 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 15196205 ns 15246102 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 755446562.5 ns 945190750 ns 0.80
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 762235792 ns 580209500 ns 1.31
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 742589166.5 ns 744122999.5 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 957309125 ns 945148083 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7041 ns 7708 ns 0.91
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6875 ns 7250 ns 0.95
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8584 ns 8750 ns 0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6625 ns 6958.5 ns 0.95
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 243246.5 ns 238753.5 ns 1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14167 ns 14500 ns 0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14041.5 ns 13875 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14667 ns 14000 ns 1.05
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14020.5 ns 14333 ns 0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 1088970 ns 1093778.5 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5917 ns 6708 ns 0.88
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6270.5 ns 6125 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7292 ns 8208 ns 0.89
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5583 ns 5417 ns 1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 237671 ns 238599 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12458 ns 12833 ns 0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 12375 ns 12750 ns 0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 12916 ns 13166 ns 0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12292 ns 12666 ns 0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 799420 ns 799288.5 ns 1.00
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s) 5417 ns 5667 ns 0.96
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s) 5709 ns 6250 ns 0.91
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s) 6166 ns 6459 ns 0.95
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s) 5667 ns 5500 ns 1.03
batchedmm(2, Bsize=128)/forward/GPU/CUDA 17212 ns 17328 ns 0.99
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s) 15458 ns 15583 ns 0.99
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s) 15459 ns 15417 ns 1.00
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s) 15708 ns 15625 ns 1.01
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s) 15625 ns 15791 ns 0.99
batchedmm(2, Bsize=128)/zygote/GPU/CUDA 202604 ns 202450 ns 1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 333 ns 375 ns 0.89
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 416 ns 375 ns 1.11
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 292 ns 333 ns 0.88
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 23718 ns 23671 ns 1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6167 ns 6541 ns 0.94
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6334 ns 6459 ns 0.98
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6875 ns 6667 ns 1.03
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6291 ns 6312.5 ns 1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 241777 ns 241480.5 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 5917 ns 5875 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6042 ns 5834 ns 1.04
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 5959 ns 5917 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 5834 ns 5917 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 24949 ns 25115 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 20958 ns 21604.5 ns 0.97
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 21208 ns 21166 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 21625 ns 21750 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 21250 ns 21708.5 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 267126.5 ns 267689.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 143875 ns 186417 ns 0.77
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 143770.5 ns 144250 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 149333 ns 148916.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 144270.5 ns 187729 ns 0.77
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 169394.5 ns 167935.5 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1364229 ns 1375083.5 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1311708 ns 1321917 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1324520.5 ns 1326146 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1349667 ns 1322375 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1363355 ns 1358092 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 23458 ns 23000 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 22250 ns 25770.5 ns 0.86
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 25167 ns 24167 ns 1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 22584 ns 23834 ns 0.95
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 357496 ns 354989 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 186729 ns 130916 ns 1.43
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 175562.5 ns 188500 ns 0.93
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 180666.5 ns 127375 ns 1.42
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 165042 ns 176959 ns 0.93
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1496433 ns 1479622.5 ns 1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 292 ns 375 ns 0.78
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 416 ns 375 ns 1.11
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 333 ns 292 ns 1.14
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 23418 ns 23532 ns 1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6167 ns 6458 ns 0.95
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6333.5 ns 6333 ns 1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 7042 ns 6833 ns 1.03
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6459 ns 6458 ns 1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 259427.5 ns 258733.5 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 4708 ns 4833 ns 0.97
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4625 ns 4917 ns 0.94
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 5166 ns 5709 ns 0.90
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4896 ns 4167 ns 1.17
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 256096.5 ns 256891 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9917 ns 9916 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10042 ns 9958 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10667 ns 10584 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10750 ns 10167 ns 1.06
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 1366539 ns 1360812 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1583 ns 1667 ns 0.95
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1667 ns 1625 ns 1.03
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1625 ns 1584 ns 1.03
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1625 ns 1583 ns 1.03
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA 23036 ns 23180 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 5625 ns 5708 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 5833 ns 5667 ns 1.03
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 6041 ns 6000 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 5666 ns 5708 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA 276314 ns 276437 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 6734334 ns 6818791 ns 0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 6391625 ns 6367083 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 6537375 ns 6546291.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 7542292 ns 7662166 ns 0.98
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 216147 ns 215904 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 24173292 ns 24172500 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 21308875 ns 21282334 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 21052792 ns 21008479 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 29893541 ns 29757292 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 2120264 ns 2111780 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 37482583 ns 48853770.5 ns 0.77
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 45446437.5 ns 34383187.5 ns 1.32
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 45525834 ns 45683833.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 49665500 ns 49363417 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5916 ns 6479.5 ns 0.91
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5729.5 ns 6125 ns 0.94
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7166 ns 6833 ns 1.05
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5750 ns 5750 ns 1
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 236953 ns 238562.5 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8583 ns 8625 ns 1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8042 ns 8084 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8666 ns 8208 ns 1.06
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8500 ns 7917 ns 1.07
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 1066445 ns 1069949 ns 1.00
lenet(28, 28, 1, 128)/forward/CPU/2 thread(s) 1511791 ns 1541500 ns 0.98
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s) 1266750 ns 1273500 ns 0.99
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s) 1624771 ns 1639187 ns 0.99
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s) 2083583.5 ns 2161000 ns 0.96
lenet(28, 28, 1, 128)/forward/GPU/CUDA 272636.5 ns 276949 ns 0.98
lenet(28, 28, 1, 128)/zygote/CPU/2 thread(s) 7911542 ns 7986167 ns 0.99
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s) 6587125 ns 6543375.5 ns 1.01
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s) 7180959 ns 7167709 ns 1.00
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s) 10527750 ns 10462145.5 ns 1.01
lenet(28, 28, 1, 128)/zygote/GPU/CUDA 1860081 ns 1888924 ns 0.98
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s) 364792 ns 343084 ns 1.06
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s) 367208 ns 369208 ns 0.99
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s) 449270.5 ns 456437.5 ns 0.98
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s) 23917 ns 26417 ns 0.91
batchedmm(128, Bsize=4)/forward/GPU/CUDA 46266 ns 42517 ns 1.09
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s) 743187 ns 749479 ns 0.99
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s) 805084 ns 814979 ns 0.99
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s) 1059125 ns 1061458 ns 1.00
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s) 89959 ns 119729.5 ns 0.75
batchedmm(128, Bsize=4)/zygote/GPU/CUDA 310715.5 ns 307361.5 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s) 397333 ns 395625 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s) 288042 ns 288375 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s) 288209 ns 288167 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s) 750708 ns 749875 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA 43949 ns 44492 ns 0.99
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 673458 ns 646208 ns 1.04
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 531458 ns 533666 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 529250 ns 529000 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 974917 ns 974208 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA 189986 ns 191704 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 595125 ns 670000 ns 0.89
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 645125 ns 636958 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 661291.5 ns 641042 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 604083.5 ns 677625 ns 0.89
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 132185 ns 132879 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2499541.5 ns 2560042 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2451209 ns 2486124.5 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2456625 ns 2459583 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2529417 ns 2464667 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1282545 ns 1294427.5 ns 0.99
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s) 3333 ns 2459 ns 1.36
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s) 3708 ns 3208 ns 1.16
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s) 4125 ns 4500 ns 0.92
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s) 2708 ns 3354 ns 0.81
batchedmm(2, Bsize=32)/forward/GPU/CUDA 16211 ns 16581 ns 0.98
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s) 5292 ns 5541 ns 0.96
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s) 5250 ns 5500 ns 0.95
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s) 5667 ns 5583 ns 1.02
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s) 5625 ns 5541 ns 1.02
batchedmm(2, Bsize=32)/zygote/GPU/CUDA 197863.5 ns 200795 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1466875 ns 1458667 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1505417 ns 1501750 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1503125 ns 1499417 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1440875 ns 1438916 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 41133 ns 40877 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5168291.5 ns 5154042 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5273458 ns 5302542 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5291104 ns 5280125 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5023291 ns 4986917 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 197140 ns 198039.5 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3667 ns 3750 ns 0.98
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3708 ns 3667 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3667 ns 3708 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3709 ns 3667 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA 32935 ns 33533 ns 0.98
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 15042 ns 14875 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 15209 ns 15125 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 15334 ns 15417 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 15000 ns 15125 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA 373770 ns 380809 ns 0.98
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s) 71500 ns 71583 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s) 70750 ns 71458 ns 0.99
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s) 71125 ns 71166 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s) 71083 ns 70000 ns 1.02
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA 112823 ns 112938 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 320042 ns 327333 ns 0.98
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 315667 ns 333917 ns 0.95
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 318667 ns 320375 ns 0.99
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 324000 ns 318167 ns 1.02
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA 194736 ns 194303.5 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 1000 ns 1000 ns 1
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 1083 ns 1000 ns 1.08
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 1083 ns 1042 ns 1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 1000 ns 959 ns 1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 23415 ns 24404 ns 0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8208 ns 8166 ns 1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8167 ns 8083 ns 1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8375 ns 8250 ns 1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8042 ns 8000 ns 1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 263486.5 ns 265429.5 ns 0.99
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s) 505999.5 ns 503042 ns 1.01
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s) 497291.5 ns 488125 ns 1.02
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s) 560209 ns 565250 ns 0.99
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s) 217875 ns 215521 ns 1.01
batchedmm(128, Bsize=32)/forward/GPU/CUDA 129532 ns 129735 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s) 1384937.5 ns 1418125 ns 0.98
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s) 1454020.5 ns 1470041 ns 0.99
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s) 1746937.5 ns 1769041.5 ns 0.99
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s) 899021 ns 863062.5 ns 1.04
batchedmm(128, Bsize=32)/zygote/GPU/CUDA 276899 ns 275150 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 292 ns 375 ns 0.78
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 416 ns 0.90
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 292 ns 292 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 31419.5 ns 32275 ns 0.97
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6167 ns 6375 ns 0.97
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6333 ns 6375 ns 0.99
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6667 ns 6645.5 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6458.5 ns 6291.5 ns 1.03
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 263608 ns 266163 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1728312 ns 1767250 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1729000 ns 1723000 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1733417 ns 1726625 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1738250 ns 1769375 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 170018 ns 169706 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4369375 ns 4423667 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3963375 ns 4340375 ns 0.91
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4358208 ns 4364395.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4400041 ns 4356604.5 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1280531 ns 1259542.5 ns 1.02
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s) 6750 ns 6875 ns 0.98
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s) 6792 ns 6708 ns 1.01
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s) 6875 ns 9167 ns 0.75
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s) 6875 ns 9667 ns 0.71
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA 20701.5 ns 21299 ns 0.97
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 51792 ns 52542 ns 0.99
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 51208 ns 48458 ns 1.06
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 32833 ns 32834 ns 1.00
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 71875 ns 51708.5 ns 1.39
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA 222859 ns 295364.5 ns 0.75
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s) 17625 ns 17959 ns 0.98
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s) 17625 ns 18333 ns 0.96
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s) 18291 ns 18667 ns 0.98
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s) 17937.5 ns 17833.5 ns 1.01
batchedmm(2, Bsize=512)/forward/GPU/CUDA 18343 ns 18767 ns 0.98
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s) 53250 ns 53250 ns 1
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s) 53166 ns 53583 ns 0.99
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s) 53166 ns 53292 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s) 53792 ns 53500 ns 1.01
batchedmm(2, Bsize=512)/zygote/GPU/CUDA 340623.5 ns 337341 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s) 75709 ns 75666 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s) 74125 ns 75250 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s) 75291 ns 75208 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s) 75334 ns 74750 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA 47398 ns 46984 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 325250 ns 342791 ns 0.95
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 325333 ns 339042 ns 0.96
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 324750 ns 324833 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 340333 ns 325083 ns 1.05
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA 211070 ns 212927.5 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1491500 ns 1484000 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1531125 ns 1528916 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1529875 ns 1527041 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1465875 ns 1464042 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 51611 ns 52506 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5144459 ns 5172417 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5274708 ns 5313667 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5268229 ns 5251417 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5019729.5 ns 4985750 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 205600 ns 206884 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 28167 ns 28375 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 28167 ns 28250 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 28291 ns 28209 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 28333 ns 28208 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA 24406 ns 24921 ns 0.98
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 66541 ns 66458 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 66292 ns 66625 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 66375 ns 66291 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 66417 ns 66250 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA 516526.5 ns 525792 ns 0.98
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s) 1468729.5 ns 1326354 ns 1.11
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s) 1131000 ns 1132104 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s) 1119791.5 ns 1139166 ns 0.98
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s) 2241937.5 ns 2248604 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/forward/GPU/CUDA 581317 ns 583822.5 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s) 3109709 ns 3055395.5 ns 1.02
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s) 2104833 ns 2729979 ns 0.77
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s) 2739417 ns 2738333 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s) 3875250.5 ns 3816042 ns 1.02
mlp7layer_bn(tanh)(32 x 256)/zygote/GPU/CUDA 2085553.5 ns 2120607 ns 0.98
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s) 7940229.5 ns 8049792 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s) 7908458.5 ns 8097167 ns 0.98
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s) 7909729.5 ns 7911292 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s) 4901667 ns 4824937 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 81709 ns 82042 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 81979.5 ns 81875 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 83833 ns 82625 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 80541.5 ns 82125 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 193422.5 ns 194553 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2029687.5 ns 2055125 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2007750 ns 2001916.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2012750 ns 2021458 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2040271 ns 2014750 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 811844 ns 811167.5 ns 1.00

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.