-
Notifications
You must be signed in to change notification settings - Fork 62
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* feat: add bf16 function * docs: add bf16 to docs
- Loading branch information
Showing
4 changed files
with
38 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
name = "Lux" | ||
uuid = "b2108857-7c20-44ae-9111-449ecde12c47" | ||
authors = ["Avik Pal <[email protected]> and contributors"] | ||
version = "1.3.4" | ||
version = "1.4.0" | ||
|
||
[deps] | ||
ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
06eb507
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Lux Benchmarks
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
4375
ns4375
ns1
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
4333
ns4208
ns1.03
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
4875
ns5042
ns0.97
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
4291
ns3833
ns1.12
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
62852
ns59750
ns1.05
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
10250
ns10229.5
ns1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
10542
ns10458
ns1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
10625
ns11208
ns0.95
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
10542
ns10083.5
ns1.05
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
442826.5
ns421969
ns1.05
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s)
1000
ns1042
ns0.96
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s)
1208
ns1333
ns0.91
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s)
1333
ns1334
ns1.00
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s)
1333
ns1167
ns1.14
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA
18476
ns18218
ns1.01
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
3895.5
ns3791
ns1.03
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
4167
ns4125
ns1.01
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
4042
ns4375
ns0.92
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
3959
ns4083
ns0.97
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA
113416
ns110020
ns1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
57542
ns55625
ns1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
46292
ns46833
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
46541
ns46208
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
83375
ns81750
ns1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
37589.5
ns36958.5
ns1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2024896
ns2050166
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1835271
ns2100334
ns0.87
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2098250
ns2073937.5
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2020667
ns1993041
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
198299
ns195385
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
143916
ns143208
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
144479.5
ns143958.5
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
145937.5
ns146000
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
143417
ns182375
ns0.79
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
166429
ns165528
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1117416
ns1157292
ns0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
995229
ns1158062.5
ns0.86
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1124542
ns1107125
ns1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1145250.5
ns1113937.5
ns1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
537731.5
ns525805
ns1.02
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
3792
ns3542
ns1.07
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
3542
ns3959
ns0.89
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
4333.5
ns4458
ns0.97
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
3667
ns3458
ns1.06
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
68177
ns70267.5
ns0.97
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
9334
ns8792
ns1.06
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8583
ns8667
ns0.99
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
9417
ns9500
ns0.99
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8854.5
ns9333
ns0.95
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
498056
ns486148
ns1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
16125
ns15916
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
15813
ns15208
ns1.04
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
18791
ns17458
ns1.08
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
15167
ns15750
ns0.96
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
55225
ns55035.5
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
215625
ns214687.5
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
213104.5
ns213875
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
214167
ns214499.5
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
213188
ns214020.5
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
275510
ns271923
ns1.01
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s)
500
ns584
ns0.86
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s)
750
ns542
ns1.38
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s)
875
ns750
ns1.17
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s)
667
ns542
ns1.23
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA
17577
ns17550
ns1.00
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
1500
ns1667
ns0.90
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
1667
ns1625
ns1.03
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
1542
ns1875
ns0.82
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
1500
ns1583
ns0.95
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA
103563
ns102829
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7334
ns7083
ns1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
5666
ns6041
ns0.94
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
5958
ns5917
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10333
ns10000
ns1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
23689
ns23605
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
221979
ns221000
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
229334
ns229416.5
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
229417
ns230875
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
214125
ns252791.5
ns0.85
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
169909
ns168416.5
ns1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s)
3875
ns3958
ns0.98
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s)
3916
ns3875
ns1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s)
3917
ns3917
ns1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s)
3917
ns3916
ns1.00
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA
23381
ns23282
ns1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
16667
ns16625
ns1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
16750
ns16875
ns0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
17000
ns16958
ns1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
16708
ns16667
ns1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA
162845
ns160471
ns1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
571542
ns574250
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
574583
ns576167
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
575500
ns579895.5
ns0.99
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
575333
ns573917
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA
113453
ns113142
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
1419645.5
ns1424041.5
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
1428270.5
ns1417292
ns1.01
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
1425833
ns1420500
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
1425291
ns1425500
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA
212962.5
ns209769
ns1.02
lenet(28, 28, 1, 64)/forward/CPU/2 thread(s)
1086583
ns1054333.5
ns1.03
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s)
963625.5
ns959917
ns1.00
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s)
1340708
ns1343583.5
ns1.00
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s)
1274625
ns1300896
ns0.98
lenet(28, 28, 1, 64)/forward/GPU/CUDA
275533.5
ns279273.5
ns0.99
lenet(28, 28, 1, 64)/zygote/CPU/2 thread(s)
6003313
ns5749437.5
ns1.04
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s)
4543291
ns4599687.5
ns0.99
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s)
4950500
ns4952395.5
ns1.00
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s)
5760542
ns5610084
ns1.03
lenet(28, 28, 1, 64)/zygote/GPU/CUDA
1094293
ns1087158.5
ns1.01
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s)
500
ns542
ns0.92
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s)
542
ns541
ns1.00
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s)
542
ns542
ns1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s)
541
ns500
ns1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA
23428
ns23646
ns0.99
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
2084
ns2125
ns0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
2125
ns2084
ns1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
2209
ns2208
ns1.00
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
2083
ns2083
ns1
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA
170454.5
ns173162
ns0.98
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
4542
ns4542
ns1
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
4208
ns4208
ns1
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
4750
ns5000
ns0.95
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
4166
ns4000
ns1.04
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
66283
ns64791.5
ns1.02
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
10895.5
ns11208
ns0.97
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
11375
ns11291.5
ns1.01
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
12334
ns12209
ns1.01
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
11084
ns11292
ns0.98
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
458465.5
ns449166
ns1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
6667
ns6792
ns0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
6541.5
ns6833
ns0.96
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
8792
ns8312.5
ns1.06
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
6000
ns6250
ns0.96
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
52738.5
ns51887
ns1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
17667
ns16792
ns1.05
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
17958
ns16667
ns1.08
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
17833
ns17500
ns1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
16875
ns17875
ns0.94
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
308559
ns301591
ns1.02
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
583
ns583
ns1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
625
ns583
ns1.07
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
666
ns625
ns1.07
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
542
ns583
ns0.93
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
32624
ns32520
ns1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
8750
ns8625
ns1.01
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
8833
ns8416.5
ns1.05
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
9625
ns9417
ns1.02
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
8667
ns8250
ns1.05
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
159033.5
ns159487
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s)
64667
ns64959
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s)
64583
ns64667
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s)
64458
ns64416
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s)
64709
ns64541
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA
111312
ns110435.5
ns1.01
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s)
283500
ns291250
ns0.97
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s)
271791
ns285125
ns0.95
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s)
274167
ns274833.5
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s)
287250
ns279770.5
ns1.03
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA
185765
ns183913
ns1.01
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s)
3282083.5
ns3222417
ns1.02
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s)
3018667
ns3060583
ns0.99
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s)
3018083
ns3017291.5
ns1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s)
3955041
ns4070708
ns0.97
mlp7layer_bn(gelu)(32 x 256)/forward/GPU/CUDA
584692
ns571448
ns1.02
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s)
7658895.5
ns7560916.5
ns1.01
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s)
7457750
ns7434917
ns1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s)
7453875
ns7464958
ns1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s)
8280228.5
ns8157583.5
ns1.02
mlp7layer_bn(gelu)(32 x 256)/zygote/GPU/CUDA
1363348
ns1323265
ns1.03
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s)
17573167
ns17698375
ns0.99
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s)
17536583
ns17382541
ns1.01
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s)
17554104.5
ns17917041
ns0.98
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s)
14252687.5
ns14113978.5
ns1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
23479854
ns24259667
ns0.97
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
33441750
ns33537791.5
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
37263874.5
ns37485625
ns0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
35385958
ns34876854.5
ns1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/GPU/CUDA
1857021
ns1864963
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
189054666
ns191699417
ns0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
232216291.5
ns233048750
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
192889813
ns194089542
ns0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
446068417
ns434858250
ns1.03
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
13856600
ns13855629
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
287697167
ns292275916
ns0.98
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
333361166
ns336958667
ns0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
296288146
ns297206917
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
358087917
ns408837354
ns0.88
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
21542
ns22333
ns0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
21917
ns24521
ns0.89
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
23833
ns23666
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
23000
ns22417
ns1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
99298.5
ns95962.5
ns1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
103416
ns104625
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
102959
ns103334
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
105041
ns104875
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
103708
ns103250
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
518798
ns503280
ns1.03
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
6000
ns6417
ns0.94
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
5750
ns6250
ns0.92
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
6708
ns7250
ns0.93
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
6334
ns6041
ns1.05
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
69881
ns67524
ns1.03
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
14833.5
ns15250
ns0.97
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
14917
ns15500
ns0.96
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
16166
ns16125
ns1.00
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
15333.5
ns12875
ns1.19
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
489724.5
ns474310.5
ns1.03
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
2971270.5
ns2994417
ns0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
2063958
ns2072458
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
2272542
ns2264416
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
4666750
ns4512000
ns1.03
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/GPU/CUDA
591884
ns589406.5
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
23554624.5
ns23917916
ns0.98
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
18028687.5
ns18038749.5
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
17852125
ns17983750
ns0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
36060875
ns35261125
ns1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
2769683
ns2768485.5
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
33424000
ns33831646.5
ns0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
27634645.5
ns27630729
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
28456458
ns28545541
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
41983541
ns41340292
ns1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
71959
ns72833
ns0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
75187.5
ns73521
ns1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
76708
ns74958
ns1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
74750
ns83500
ns0.90
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
105798
ns102113
ns1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
296145.5
ns208042
ns1.42
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
218625
ns291208
ns0.75
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
213708
ns219875
ns0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
318812.5
ns217417
ns1.47
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
566311.5
ns550239
ns1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
11792
ns11916
ns0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
12250
ns12042
ns1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
12750
ns13000
ns0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
12500
ns11583
ns1.08
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
73134
ns70941.5
ns1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
26042
ns26541
ns0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
27042
ns26875
ns1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
27875
ns27833
ns1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
27417
ns26708
ns1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
488772.5
ns472589
ns1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
11792
ns12083
ns0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
12625
ns12917
ns0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
14250
ns13771
ns1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
12750
ns12334
ns1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
54676
ns52605
ns1.04
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
25458
ns25917
ns0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
25334
ns25625
ns0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
27000
ns25958
ns1.04
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
26167
ns26542
ns0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
315843.5
ns304518.5
ns1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
179292
ns179750
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
182084
ns181375
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
182291.5
ns182875
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
183250
ns182083
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
58956
ns57612
ns1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
581896
ns585375
ns0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
588312.5
ns582375
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
583583
ns584291.5
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
587083.5
ns585895.5
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
294154
ns287910.5
ns1.02
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
5625
ns6396
ns0.88
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
6334
ns6084
ns1.04
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
6979.5
ns7667
ns0.91
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
6520.5
ns5542
ns1.18
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
72600.5
ns70404
ns1.03
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
13916
ns14458
ns0.96
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
14417
ns14917
ns0.97
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
15667
ns16000
ns0.98
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
14895.5
ns14416
ns1.03
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
479621.5
ns461584
ns1.04
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s)
1206937.5
ns1193604.5
ns1.01
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s)
1245542
ns1246000
ns1.00
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s)
1292542
ns1273583.5
ns1.01
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s)
1006417
ns1016875
ns0.99
batchedmm(512, Bsize=4)/forward/GPU/CUDA
299757
ns301246
ns1.00
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s)
4169416.5
ns4298583
ns0.97
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s)
4414833
ns4454937.5
ns0.99
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s)
4586979
ns4559833
ns1.01
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s)
3897020.5
ns3718125
ns1.05
batchedmm(512, Bsize=4)/zygote/GPU/CUDA
1046888
ns1052722
ns0.99
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
1792
ns1875
ns0.96
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
1834
ns1834
ns1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
1875
ns1833
ns1.02
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
1875
ns1834
ns1.02
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA
23582
ns24315
ns0.97
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
4833
ns5000
ns0.97
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
4916
ns4916
ns1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
4917
ns4917
ns1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
4875
ns4916
ns0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA
189156.5
ns193381
ns0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
5792
ns6270.5
ns0.92
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
5917
ns6292
ns0.94
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
7375
ns7042
ns1.05
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
5917
ns5666
ns1.04
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
56730.5
ns56858.5
ns1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
10500
ns11000
ns0.95
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
10833
ns10584
ns1.02
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
11542
ns11292
ns1.02
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
10667
ns10625
ns1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
346270
ns341133.5
ns1.02
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s)
292
ns416
ns0.70
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s)
333
ns334
ns1.00
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s)
333
ns292
ns1.14
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s)
375
ns292
ns1.28
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA
22862
ns23459
ns0.97
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
2709
ns2792
ns0.97
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
2750
ns2708
ns1.02
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
3042
ns3000
ns1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
2750
ns2750
ns1
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA
160360.5
ns163121
ns0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
11000
ns12209
ns0.90
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
11833
ns12250
ns0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
13334
ns15083
ns0.88
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
11583
ns11375
ns1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
61359.5
ns59412
ns1.03
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
24541
ns24792
ns0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
24459
ns24833
ns0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
25542
ns25042
ns1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
24542
ns25167
ns0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
310594.5
ns302787.5
ns1.03
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s)
4208
ns4250
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s)
4167
ns4167
ns1
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s)
4209
ns4208
ns1.00
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s)
4250
ns4167
ns1.02
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA
24670
ns25427.5
ns0.97
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
16250
ns16041
ns1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
16083
ns16250
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
16291
ns16125
ns1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
15959
ns16084
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA
204493
ns203537
ns1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
5792
ns5875
ns0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
5791
ns5834
ns0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
5875
ns5875
ns1
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
5833
ns5875
ns0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
33437
ns34639
ns0.97
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
20500
ns21375
ns0.96
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
21167
ns21125
ns1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
22250
ns22125
ns1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
20895.5
ns23000
ns0.91
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
177667.5
ns181357
ns0.98
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s)
419666.5
ns404042
ns1.04
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s)
386209
ns390084
ns0.99
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s)
478875
ns483167
ns0.99
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s)
109479
ns103834
ns1.05
batchedmm(16, Bsize=512)/forward/GPU/CUDA
67033
ns67491
ns0.99
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s)
909583.5
ns913854
ns1.00
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s)
973708.5
ns961459
ns1.01
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s)
1177021
ns1201334
ns0.98
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s)
462625
ns448417
ns1.03
batchedmm(16, Bsize=512)/zygote/GPU/CUDA
190401
ns192152
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
81208
ns80542
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
81791
ns81500
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
82417
ns79854.5
ns1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
81291.5
ns78813
ns1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
193814.5
ns193447.5
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1935667
ns1946833
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1916333.5
ns1932479
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1698562.5
ns1920708
ns0.88
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1930833.5
ns1904937.5
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
412302
ns402534
ns1.02
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s)
292
ns292
ns1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s)
292
ns333
ns0.88
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s)
292
ns292
ns1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s)
333
ns333
ns1
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA
21830
ns22000
ns0.99
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s)
1792
ns1833
ns0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s)
1792
ns1792
ns1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s)
1834
ns1834
ns1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s)
1792
ns1833
ns0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA
176875.5
ns169877.5
ns1.04
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
6459
ns7521
ns0.86
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
6333
ns7167
ns0.88
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
8041
ns7792
ns1.03
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
6875
ns6500
ns1.06
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
68185.5
ns61779
ns1.10
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
9375
ns9250
ns1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
9125
ns9500
ns0.96
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
9417
ns9042
ns1.04
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
9542
ns9292
ns1.03
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
336822.5
ns314965
ns1.07
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
120048500
ns158324292
ns0.76
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
173952417
ns174385041
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
148044250
ns148149145.5
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
104491521
ns104978917
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/GPU/CUDA
5473411
ns5475583
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
614607916.5
ns673914521
ns0.91
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
555101125
ns556536500
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
456940500.5
ns454282229
ns1.01
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
767695250.5
ns754352104
ns1.02
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
34955825
ns35161544.5
ns0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
651124125
ns703002500
ns0.93
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
669116625
ns668300021
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
578815791.5
ns587968625
ns0.98
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
742698833
ns742489083
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
60083
ns57833
ns1.04
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
48000
ns48000
ns1
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
46333
ns47959
ns0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
84917
ns82333
ns1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
38192
ns38135
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1922875
ns1945042
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1974250
ns1994937.5
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1990062.5
ns1978208
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1906625
ns1862834
ns1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
174155.5
ns174772.5
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
267042
ns267333
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
265667
ns267521
ns0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
269750
ns268709
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
267791.5
ns266959
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
147410.5
ns138445.5
ns1.06
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
592896
ns605250
ns0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
684208.5
ns597333.5
ns1.15
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
589520.5
ns696500
ns0.85
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
698666
ns676042
ns1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
813499.5
ns740206.5
ns1.10
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
2209791.5
ns2204042
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
2209896
ns2205084
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
2103125
ns2220750
ns0.95
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
2236250
ns2219958
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
133936
ns135150.5
ns0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5510542
ns5598583
ns0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5555437.5
ns5526083
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5498229
ns5502958
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
5548208
ns5487708.5
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
892951
ns792599
ns1.13
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
637334
ns660166
ns0.97
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
646875
ns643583
ns1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
646625
ns659417
ns0.98
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
654875
ns644542
ns1.02
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA
46730
ns47532
ns0.98
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
1826417
ns1795875
ns1.02
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
1722937.5
ns1722291
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
1719334
ns1727709
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
2093625
ns2095458
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA
221633
ns227325
ns0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
59125
ns56375
ns1.05
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
46417
ns46291
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
47000
ns46625
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
84917
ns82500
ns1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
28496
ns29417
ns0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2036604.5
ns2030542
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1836917
ns2111062.5
ns0.87
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2095792
ns2091895.5
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2019479
ns1996833
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
190943.5
ns193004
ns0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
13379729
ns13382833
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
12436958
ns12443000
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
12541125
ns12480979
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
15244750
ns15173917
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/GPU/CUDA
515571.5
ns517073
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
47364291.5
ns47607083
ns0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
41915083
ns41883313
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
40768458
ns40854417
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
59377021
ns58509979
ns1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
2882313
ns2896765.5
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
74658083.5
ns97269708
ns0.77
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
90949750
ns68581771
ns1.33
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
90367208
ns90434166
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
99601541
ns98826583
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
59459
ns56833
ns1.05
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
47417
ns47417
ns1
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
47333
ns47291
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
84375
ns80833
ns1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
48174
ns46888
ns1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1935167
ns1939104
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1966979
ns2010459
ns0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1974250
ns1977312.5
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1911312.5
ns1892292
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
197391.5
ns192004
ns1.03
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
333
ns292
ns1.14
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
375
ns334
ns1.12
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
375
ns375
ns1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
333
ns333
ns1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
32694
ns31834
ns1.03
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
6042
ns6084
ns0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
6375
ns6083
ns1.05
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
6583
ns6709
ns0.98
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
6083
ns6167
ns0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
187739
ns176223.5
ns1.07
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s)
250
ns292
ns0.86
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s)
292
ns292
ns1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s)
292
ns292
ns1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s)
333
ns292
ns1.14
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA
32507
ns31304
ns1.04
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s)
2625
ns2625
ns1
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s)
2792
ns2584
ns1.08
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s)
2917
ns2917
ns1
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s)
2667
ns2667
ns1
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA
175489
ns164663.5
ns1.07
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
286852687.5
ns324499000.5
ns0.88
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
339813500
ns340579375
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
313305624.5
ns313389416.5
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
269143125
ns273909208
ns0.98
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/GPU/CUDA
7114947
ns7105361
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
994653583
ns1052816166
ns0.94
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
936368458
ns943649000
ns0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
837895375.5
ns840615666.5
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
1177393750
ns1152028667
ns1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
34035408
ns34095663
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
1316953312.5
ns1721214458
ns0.77
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
1689250042
ns1359927020.5
ns1.24
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
1683427084
ns1606248000
ns1.05
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
1672545042
ns1668736833
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1454500
ns1425375
ns1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1407875
ns1415542
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1409834
ns1416520.5
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1414167
ns1410375
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
128152
ns127634
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5044833
ns5060999.5
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
4714312.5
ns5059104
ns0.93
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5026437.5
ns5025375
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
5051042
ns5018125
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
686490.5
ns596333
ns1.15
vgg16(32, 32, 3, 32)/forward/CPU/2 thread(s)
172590354
ns163798854
ns1.05
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s)
124318041
ns128369875
ns0.97
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s)
123190833
ns130888792
ns0.94
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s)
165357625
ns168698771
ns0.98
vgg16(32, 32, 3, 32)/forward/GPU/CUDA
4891073
ns5432122
ns0.90
vgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s)
615854000
ns630866750
ns0.98
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s)
630625333
ns635134916
ns0.99
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s)
562103875
ns554211625
ns1.01
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s)
653647292
ns648292583
ns1.01
vgg16(32, 32, 3, 32)/zygote/GPU/CUDA
16015121
ns16519965
ns0.97
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s)
9006416.5
ns9165854
ns0.98
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s)
8896937.5
ns8986459
ns0.99
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s)
7913208
ns7922833
ns1.00
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s)
9977125
ns9756167
ns1.02
batchedmm(512, Bsize=32)/forward/GPU/CUDA
1591890.5
ns1610067
ns0.99
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s)
35918500
ns37032625
ns0.97
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s)
36875416
ns37212042
ns0.99
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s)
33279603.5
ns33438583
ns1.00
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s)
39552041.5
ns37841958
ns1.05
batchedmm(512, Bsize=32)/zygote/GPU/CUDA
6456562
ns6473180
ns1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s)
47709
ns47479.5
ns1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s)
47375
ns47437.5
ns1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s)
47500
ns47667
ns1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s)
47542
ns47500
ns1.00
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA
19056.5
ns18175
ns1.05
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s)
50500
ns50250
ns1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s)
50250
ns50625
ns0.99
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s)
50750
ns50542
ns1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s)
50375
ns50416.5
ns1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA
263004.5
ns243634.5
ns1.08
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
6500
ns7229.5
ns0.90
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
6875
ns6917
ns0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
8250
ns7834
ns1.05
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
7000
ns7292
ns0.96
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
145281
ns134228.5
ns1.08
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
9750
ns10375
ns0.94
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
10250
ns9458
ns1.08
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
10500
ns10334
ns1.02
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
10042
ns10250
ns0.98
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
744130.5
ns725024.5
ns1.03
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
5875
ns6917
ns0.85
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
6125
ns6292
ns0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
7375
ns7417
ns0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
5750
ns5937.5
ns0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
151878
ns158899
ns0.96
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
13042
ns13334
ns0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
13250
ns13083
ns1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
13250
ns13958
ns0.95
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
13542
ns12875
ns1.05
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
607325
ns654550.5
ns0.93
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
1042
ns1083
ns0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
1000
ns1083
ns0.92
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
1084
ns1042
ns1.04
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
1083
ns1083
ns1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
32597
ns32302
ns1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
8000
ns8000
ns1
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7833
ns7958.5
ns0.98
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
8292
ns8000
ns1.04
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
8375
ns8250
ns1.02
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
237194
ns248668.5
ns0.95
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
23167
ns23334
ns0.99
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
23084
ns23625
ns0.98
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
23458
ns23604.5
ns0.99
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
23292
ns23334
ns1.00
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA
18583
ns18197
ns1.02
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
52500
ns52375
ns1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
54208.5
ns52583
ns1.03
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
53083
ns52709
ns1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
52667
ns52291
ns1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA
385105.5
ns365195
ns1.05
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1402916
ns1409312.5
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1454958
ns1395312.5
ns1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1401792
ns1395667
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1456375
ns1399187.5
ns1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
197023
ns196466
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5029479.5
ns5048625
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5009000
ns5082916.5
ns0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5018334
ns5010208
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
5048291.5
ns5015083
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
723983
ns697077
ns1.04
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
3027250
ns3082583
ns0.98
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
2080437.5
ns2075667
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
2291437.5
ns2279000
ns1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
4926416
ns4910958
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/GPU/CUDA
586737
ns586799
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
24385583.5
ns24742792
ns0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
18885375
ns18899334
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
18817833
ns18912125
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
37153959
ns36606271
ns1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
2840867
ns2884394
ns0.98
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
34104021
ns34600271
ns0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
28291895.5
ns28275125
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
28002792
ns27978625
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
42422958
ns41693583
ns1.02
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s)
142464667
ns146263625
ns0.97
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s)
147805542
ns148262792
ns1.00
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s)
127021875
ns125521666
ns1.01
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s)
173652229
ns173208104.5
ns1.00
batchedmm(512, Bsize=512)/forward/GPU/CUDA
22558116
ns22564372
ns1.00
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s)
1200697875
ns948935833
ns1.27
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s)
1864615833.5
ns1199705645.5
ns1.55
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s)
1647966021
ns727524542
ns2.27
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s)
686829458
ns936153853.5
ns0.73
batchedmm(512, Bsize=512)/zygote/GPU/CUDA
117772826
ns115985315
ns1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
75375
ns74250
ns1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
84209
ns76209
ns1.10
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
76458
ns76042
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
80958
ns72167
ns1.12
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
312734.5
ns331111.5
ns0.94
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
204687.5
ns282500
ns0.72
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
278104
ns191083.5
ns1.46
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
192375
ns280584
ns0.69
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
283312.5
ns291917
ns0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1488616
ns1500994.5
ns0.99
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s)
35722250
ns36314916.5
ns0.98
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s)
36312354
ns36531396
ns0.99
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s)
32588937.5
ns32439729.5
ns1.00
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s)
40883292
ns40435354
ns1.01
batchedmm(512, Bsize=128)/forward/GPU/CUDA
5836813
ns5837859
ns1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s)
149459958
ns151857209
ns0.98
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s)
153182708.5
ns153888604
ns1.00
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s)
140187104
ns135530208.5
ns1.03
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s)
226961625.5
ns283241209
ns0.80
batchedmm(512, Bsize=128)/zygote/GPU/CUDA
34882818.5
ns34859945
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
121271541.5
ns159567375
ns0.76
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
174726458
ns174506458
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
147669333
ns147925667
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
105646958
ns104572437
ns1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/GPU/CUDA
5477234.5
ns5480695
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
471261458.5
ns524085270.5
ns0.90
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
465682583
ns467380250
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
434340042
ns437823166
ns0.99
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
758899104.5
ns737646542
ns1.03
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
32272056.5
ns32284174.5
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
709031375
ns696105375
ns1.02
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
654357417
ns658106854.5
ns0.99
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
581732375
ns575346979
ns1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
734152875
ns729353375
ns1.01
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s)
1246834
ns1155874.5
ns1.08
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s)
970729
ns998792
ns0.97
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s)
905979
ns991542
ns0.91
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s)
2088750
ns2092625
ns1.00
mlp7layer_bn(relu)(32 x 256)/forward/GPU/CUDA
584722.5
ns579446
ns1.01
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s)
3017521
ns2931916.5
ns1.03
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s)
2605541
ns2619083.5
ns0.99
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s)
2618042
ns2626604.5
ns1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s)
3762104
ns3482417
ns1.08
mlp7layer_bn(relu)(32 x 256)/zygote/GPU/CUDA
1908342
ns1969877.5
ns0.97
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s)
5812937.5
ns5947625
ns0.98
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s)
5782937.5
ns5782625
ns1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s)
5769333
ns5801958.5
ns0.99
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s)
2967958
ns2880584
ns1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7500
ns7208
ns1.04
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
6250
ns6083
ns1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
6125
ns5959
ns1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10375
ns9959
ns1.04
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
25756
ns26024
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
212417
ns212562.5
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
221208.5
ns221083.5
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
220334
ns221333
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
206375
ns209292
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
307623
ns302079.5
ns1.02
vgg16(32, 32, 3, 64)/forward/CPU/2 thread(s)
310236833.5
ns311414437.5
ns1.00
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s)
228243416
ns232931208
ns0.98
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s)
199023750
ns202032375
ns0.99
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s)
307111500
ns308462875
ns1.00
vgg16(32, 32, 3, 64)/forward/GPU/CUDA
7677099
ns7680461
ns1.00
vgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s)
1077070792
ns1101691479.5
ns0.98
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s)
909540270.5
ns909424125
ns1.00
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s)
811121083
ns804661000
ns1.01
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s)
1177347271
ns1153673416.5
ns1.02
vgg16(32, 32, 3, 64)/zygote/GPU/CUDA
26401108
ns26512167
ns1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
5625
ns5833.5
ns0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
5833.5
ns5833
ns1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
6667
ns6270.5
ns1.06
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
5208
ns5146
ns1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
199489.5
ns196235
ns1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7208
ns7542
ns0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7334
ns7125
ns1.03
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7416
ns7417
ns1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7291
ns7125
ns1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
722768
ns702510
ns1.03
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
583
ns625
ns0.93
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
667
ns584
ns1.14
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
667
ns625
ns1.07
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
500
ns583
ns0.86
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
24721
ns24654
ns1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
9000
ns9333
ns0.96
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
9209
ns8750
ns1.05
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
9709
ns9875
ns0.98
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
9416
ns9292
ns1.01
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
238784
ns239874
ns1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
353291.5
ns352958
ns1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
351750
ns351875
ns1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
352354.5
ns351583.5
ns1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
362833
ns352208
ns1.03
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA
21565
ns21408
ns1.01
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
814792
ns779709
ns1.04
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
826041.5
ns775541
ns1.07
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
777875
ns776062.5
ns1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
829958
ns817375
ns1.02
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA
302093.5
ns316328
ns0.96
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s)
337583.5
ns317708
ns1.06
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s)
340250
ns341667
ns1.00
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s)
444208
ns453354
ns0.98
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s)
10812.5
ns10875
ns0.99
batchedmm(16, Bsize=32)/forward/GPU/CUDA
18424
ns18691
ns0.99
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s)
719166.5
ns712145.5
ns1.01
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s)
721917
ns734917
ns0.98
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s)
1006458
ns1006834
ns1.00
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s)
28250
ns27250
ns1.04
batchedmm(16, Bsize=32)/zygote/GPU/CUDA
299767.5
ns293795
ns1.02
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s)
379708.5
ns359083.5
ns1.06
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s)
349958
ns350250
ns1.00
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s)
436354
ns442875
ns0.99
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s)
30833
ns30583
ns1.01
batchedmm(16, Bsize=128)/forward/GPU/CUDA
23185.5
ns22877.5
ns1.01
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s)
737500
ns736584
ns1.00
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s)
772041.5
ns783750
ns0.99
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s)
1022146
ns1041500
ns0.98
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s)
101459
ns105875
ns0.96
batchedmm(16, Bsize=128)/zygote/GPU/CUDA
233048
ns265090.5
ns0.88
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s)
3458
ns3666
ns0.94
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s)
3625
ns3667
ns0.99
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s)
3625
ns3667
ns0.99
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s)
3625
ns3750
ns0.97
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA
18179
ns17832
ns1.02
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s)
4334
ns4208
ns1.03
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s)
4625
ns4292
ns1.08
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s)
4625
ns4333
ns1.07
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s)
4458
ns4333
ns1.03
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA
297309.5
ns285935
ns1.04
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
3833
ns4209
ns0.91
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
3708
ns3875
ns0.96
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
4208.5
ns4291
ns0.98
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
4104
ns3250
ns1.26
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
236489
ns226147.5
ns1.05
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8458
ns8541
ns0.99
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8166
ns8208.5
ns0.99
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8792
ns8833
ns1.00
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8708
ns8792
ns0.99
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
1272633
ns1241590
ns1.03
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
208042
ns203041
ns1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
215895.5
ns210833
ns1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
211084
ns213042
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
199667
ns200208
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
35583
ns35096
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
645812.5
ns600458
ns1.08
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
623291
ns664687.5
ns0.94
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
622916
ns621125
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
638333
ns587666
ns1.09
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
366544.5
ns364021
ns1.01
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s)
1020979
ns1006145.5
ns1.01
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s)
1006020.5
ns1034750
ns0.97
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s)
957729
ns960375
ns1.00
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s)
904000
ns870666.5
ns1.04
batchedmm(128, Bsize=128)/forward/GPU/CUDA
208984
ns207603
ns1.01
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s)
4550166.5
ns4675520.5
ns0.97
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s)
4713709
ns4661500
ns1.01
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s)
4462125
ns4484166.5
ns1.00
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s)
5571625
ns5182375
ns1.08
batchedmm(128, Bsize=128)/zygote/GPU/CUDA
936095
ns945582
ns0.99
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
3708
ns4167
ns0.89
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
3708.5
ns3458
ns1.07
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
4292
ns4416.5
ns0.97
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
3709
ns3250
ns1.14
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
245340.5
ns242881.5
ns1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7167
ns7625
ns0.94
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7375
ns7125
ns1.04
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7708
ns7791
ns0.99
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7209
ns7166
ns1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
1060150.5
ns1049374
ns1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
1616083
ns1641104.5
ns0.98
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
1153750
ns1162041.5
ns0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
1337250
ns1361146
ns0.98
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
2432374.5
ns2337792
ns1.04
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/GPU/CUDA
217163
ns215237
ns1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
12337062.5
ns12428417
ns0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
9522833
ns9554417
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
9266729
ns9282166
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
18081312
ns18043958
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
1948614
ns1957521
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
17355771
ns17446729
ns0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
14388208.5
ns14307562.5
ns1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
14348354
ns14338292
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
21196875
ns21055500
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
88312.5
ns90250
ns0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
89271
ns89750
ns0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
91125
ns92271
ns0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
91625
ns92625
ns0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
126391
ns126161
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2036875
ns2045083
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2015416.5
ns2029000
ns0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1865791
ns2032875
ns0.92
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2043208
ns2022667
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1072650
ns1071170.5
ns1.00
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s)
2813
ns1645.5
ns1.71
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s)
2791
ns2375
ns1.18
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s)
3375
ns3708
ns0.91
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s)
1833
ns2875
ns0.64
batchedmm(2, Bsize=4)/forward/GPU/CUDA
16578
ns16073
ns1.03
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s)
2542
ns2792
ns0.91
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s)
2625
ns2708
ns0.97
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s)
2875
ns2916
ns0.99
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s)
3000
ns2792
ns1.07
batchedmm(2, Bsize=4)/zygote/GPU/CUDA
199941.5
ns196950.5
ns1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7167
ns7166
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
5709
ns6083
ns0.94
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
5833
ns5958
ns0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10250
ns10042
ns1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
34656
ns33844
ns1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
212709
ns215354.5
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
220625
ns220833.5
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
220541
ns220875
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
220354
ns209625.5
ns1.05
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
356302
ns351596
ns1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s)
3708
ns3750
ns0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s)
3708
ns3708
ns1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s)
3709
ns3708
ns1.00
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s)
3750
ns3709
ns1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA
22913
ns22384
ns1.02
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s)
14584
ns14250
ns1.02
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s)
14458
ns14459
ns1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s)
14500
ns14334
ns1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s)
14166.5
ns14375
ns0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA
486419
ns522120.5
ns0.93
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
92375
ns140291
ns0.66
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
93333.5
ns91729.5
ns1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
94916.5
ns96250
ns0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
95417
ns94458
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
125841
ns125465
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1915000
ns1947916
ns0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1914209
ns1932104.5
ns0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1928125.5
ns1925000
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1940729
ns1650916
ns1.18
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1045000
ns1016603
ns1.03
lenet(28, 28, 1, 32)/forward/CPU/2 thread(s)
871959
ns859167
ns1.01
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s)
821041.5
ns818395.5
ns1.00
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s)
1216500
ns1219500
ns1.00
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s)
943271
ns962834
ns0.98
lenet(28, 28, 1, 32)/forward/GPU/CUDA
280426
ns269546
ns1.04
lenet(28, 28, 1, 32)/zygote/CPU/2 thread(s)
2729167
ns2844645.5
ns0.96
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s)
2498104
ns2436375
ns1.03
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s)
3340041
ns3336375
ns1.00
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s)
3427250
ns3413042
ns1.00
lenet(28, 28, 1, 32)/zygote/GPU/CUDA
1723859
ns1630539.5
ns1.06
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
17229
ns15708.5
ns1.10
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
17875
ns16000
ns1.12
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
18792
ns17166.5
ns1.09
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
15375
ns14958.5
ns1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
190406.5
ns143350.5
ns1.33
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
228541
ns217083.5
ns1.05
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
220833.5
ns215417
ns1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
216521
ns216916
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
228437.5
ns227125
ns1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
725866.5
ns653459
ns1.11
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
221145.5
ns221959
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
222000
ns221645.5
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
221291
ns221395.5
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
221083.5
ns221083
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
321034
ns274733.5
ns1.17
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
495084
ns511042
ns0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
496625
ns495750
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
496729
ns497042
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
507625
ns508875
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1510358
ns1471458
ns1.03
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s)
3854
ns4666.5
ns0.83
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s)
3875
ns4083
ns0.95
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s)
5042
ns5708
ns0.88
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s)
4083
ns3917
ns1.04
batchedmm(16, Bsize=4)/forward/GPU/CUDA
17250
ns16967
ns1.02
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s)
7167
ns7125
ns1.01
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s)
6959
ns7417
ns0.94
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s)
7250
ns7520.5
ns0.96
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s)
7416
ns7250
ns1.02
batchedmm(16, Bsize=4)/zygote/GPU/CUDA
201503.5
ns198610.5
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
20083
ns18084
ns1.11
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
16875
ns18166.5
ns0.93
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
19500
ns18666.5
ns1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
18167
ns18395.5
ns0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
232442
ns148303
ns1.57
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
224916
ns214125
ns1.05
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
212708
ns212791.5
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
212416
ns213042
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
248812.5
ns219417
ns1.13
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1078558.5
ns1024505
ns1.05
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
4333
ns4625
ns0.94
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
4250
ns4333
ns0.98
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
5209
ns5708
ns0.91
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
4417
ns3708.5
ns1.19
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
255475
ns244514
ns1.04
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
10708
ns10875
ns0.98
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
10166
ns10062.5
ns1.01
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
10833
ns11375
ns0.95
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
11375
ns10583
ns1.07
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
1114054
ns1099794
ns1.01
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
3500
ns4000
ns0.88
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
3583.5
ns3792
ns0.95
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
4458
ns4375
ns1.02
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
3917
ns2750
ns1.42
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
247293
ns250198
ns0.99
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7542
ns7270.5
ns1.04
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7250
ns7459
ns0.97
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
8083
ns7916
ns1.02
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7791
ns7500
ns1.04
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
1128935
ns1106505
ns1.02
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
23544917
ns24086812.5
ns0.98
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
34700375
ns34704750
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
37800604.5
ns37376896
ns1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
35322562.5
ns34935000
ns1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/GPU/CUDA
1834217
ns1853477
ns0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
183535208
ns186942250
ns0.98
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
159261916
ns159685500
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
146891041.5
ns146457125
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
419037250
ns411532208
ns1.02
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
16405198.5
ns16500596
ns0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
428624000
ns434054666
ns0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
254269584
ns253740479
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
296570146
ns299567770.5
ns0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
493357917
ns479705417
ns1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
184479.5
ns184375
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
184208
ns183584
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
185416
ns184333
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
184062.5
ns185292
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
233424
ns229399
ns1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
585417
ns594750
ns0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
589833
ns586209
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
586896
ns586729.5
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
639166
ns599250.5
ns1.07
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1146333
ns1121066.5
ns1.02
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s)
3917708
ns3936375
ns1.00
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s)
3921208
ns4081937
ns0.96
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s)
3581645.5
ns3587479
ns1.00
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s)
4674709
ns4565729.5
ns1.02
batchedmm(128, Bsize=512)/forward/GPU/CUDA
538155
ns538820
ns1.00
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s)
17548417
ns18136458.5
ns0.97
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s)
17792083
ns17936750
ns0.99
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s)
16472417
ns16532771
ns1.00
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s)
21347458
ns20226167
ns1.06
batchedmm(128, Bsize=512)/zygote/GPU/CUDA
2621425
ns2633099
ns1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
542
ns583
ns0.93
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
625
ns625
ns1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
584
ns625
ns0.93
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
583
ns583
ns1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
33117
ns31971
ns1.04
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
9291
ns9375
ns0.99
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
9354.5
ns9000
ns1.04
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
9792
ns9500
ns1.03
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
9416
ns9333
ns1.01
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
269036
ns265140
ns1.01
vgg16(32, 32, 3, 128)/forward/CPU/2 thread(s)
503680250
ns503989417
ns1.00
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s)
425402999.5
ns431858541.5
ns0.99
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s)
418147958
ns427434834
ns0.98
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s)
678706395.5
ns592092708
ns1.15
vgg16(32, 32, 3, 128)/forward/GPU/CUDA
12481919
ns11928812
ns1.05
vgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s)
1881496729.5
ns1891189687.5
ns0.99
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s)
1619255500
ns1632073542
ns0.99
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s)
1494277750
ns1496948750
ns1.00
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s)
2234203604
ns2217192312.5
ns1.01
vgg16(32, 32, 3, 128)/zygote/GPU/CUDA
49122118.5
ns49332313
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
1536334
ns1638750
ns0.94
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
1156271
ns1179458
ns0.98
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
1380541
ns1387875
ns0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
2362625
ns2352479.5
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/GPU/CUDA
217676
ns214938
ns1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
12766416
ns12852583.5
ns0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
9918708
ns9964500
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
9674833
ns9669416.5
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
18454708
ns18345667
ns1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
2051013
ns2032751.5
ns1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
17738333
ns17791875
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
14710417
ns14679354.5
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
14604375
ns14576209
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
21451125
ns21490021
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
26250
ns26333
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
26208
ns26250
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
26250
ns26250
ns1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
26250
ns26250
ns1
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA
23581.5
ns23891
ns0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
67333
ns66750
ns1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
68000
ns67791
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
67333
ns67042
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
67042
ns67042
ns1
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA
404754.5
ns403092
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
205583
ns203542
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
209125
ns209834
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
209000
ns210500
ns0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
199041
ns199500
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
26431.5
ns26253
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
611478.5
ns602666.5
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
633292
ns670479
ns0.94
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
670416
ns621791
ns1.08
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
611479
ns633916
ns0.96
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
353085.5
ns351051
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
612333
ns678375
ns0.90
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
643520.5
ns654937.5
ns0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
644958
ns646500
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
652334
ns669916
ns0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
132321
ns131843
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2263750
ns2326042
ns0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2226645.5
ns2262000
ns0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2243875
ns2145978.5
ns1.05
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2302583
ns2234542
ns1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1253025
ns1242552
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
19667
ns17645.5
ns1.11
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
16917
ns17062.5
ns0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
21500.5
ns19125
ns1.12
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
18208
ns17875
ns1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
145311.5
ns146421.5
ns0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
233042
ns220959
ns1.05
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
218770.5
ns219500
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
262625
ns220291
ns1.19
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
230000
ns235729
ns0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1059070.5
ns1091456.5
ns0.97
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
583
ns625
ns0.93
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
667
ns666
ns1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
667
ns708
ns0.94
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
542
ns583
ns0.93
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
23551
ns23721
ns0.99
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
9667
ns10084
ns0.96
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
9583
ns9791.5
ns0.98
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
10583
ns10041
ns1.05
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
9959
ns9875
ns1.01
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
258697
ns261550.5
ns0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
5833
ns6042
ns0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
5542
ns5625
ns0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
6500
ns6584
ns0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
4958
ns5125
ns0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
231871
ns234716
ns0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
6875
ns7417
ns0.93
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7125
ns7208
ns0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7792
ns8041
ns0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
6833
ns7334
ns0.93
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
803650
ns806215
ns1.00
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
2166
ns2229.5
ns0.97
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
1917
ns2458
ns0.78
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
2417
ns2375
ns1.02
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
2333
ns2292
ns1.02
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA
17852
ns17855
ns1.00
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
6417
ns6542
ns0.98
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
6458
ns6667
ns0.97
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
6916
ns6834
ns1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
6459
ns6500
ns0.99
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA
332798.5
ns333301.5
ns1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s)
749396
ns755083
ns0.99
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s)
746625
ns746333
ns1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s)
749250
ns749250
ns1
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s)
751417
ns750187.5
ns1.00
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA
21271
ns21362
ns1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s)
793042
ns788958.5
ns1.01
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s)
792500
ns772209
ns1.03
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s)
775750
ns787687.5
ns0.98
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s)
797542
ns791333
ns1.01
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA
296567
ns298265
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7500
ns7125
ns1.05
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
5542
ns6041
ns0.92
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
6000
ns5959
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10458
ns10209
ns1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
32600
ns33317.5
ns0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
221375
ns226500
ns0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
240270.5
ns236063
ns1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
257854
ns228041
ns1.13
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
222250
ns255812.5
ns0.87
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
360398
ns363202
ns0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
10104.5
ns10542
ns0.96
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
10334
ns10334
ns1
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
10916
ns11208
ns0.97
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
10229.5
ns9833
ns1.04
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
251149.5
ns246668.5
ns1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
24666
ns24729.5
ns1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
24312.5
ns24666
ns0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
25750
ns25542
ns1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
25562.5
ns24625
ns1.04
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
1138926
ns1134784.5
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
106325125
ns106546667
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
117472625
ns118425312.5
ns0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
120287229
ns120189792
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
117860729
ns117420708
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/GPU/CUDA
2629206
ns2655736
ns0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
394161333.5
ns394570417
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
365470000
ns368931959
ns0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
355300666
ns424438979
ns0.84
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
484349417
ns482063875
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
15196205
ns15246102
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
755446562.5
ns945190750
ns0.80
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
762235792
ns580209500
ns1.31
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
742589166.5
ns744122999.5
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
957309125
ns945148083
ns1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
7041
ns7708
ns0.91
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
6875
ns7250
ns0.95
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
8584
ns8750
ns0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
6625
ns6958.5
ns0.95
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
243246.5
ns238753.5
ns1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
14167
ns14500
ns0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
14041.5
ns13875
ns1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
14667
ns14000
ns1.05
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
14020.5
ns14333
ns0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
1088970
ns1093778.5
ns1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
5917
ns6708
ns0.88
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
6270.5
ns6125
ns1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
7292
ns8208
ns0.89
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
5583
ns5417
ns1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
237671
ns238599
ns1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
12458
ns12833
ns0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
12375
ns12750
ns0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
12916
ns13166
ns0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
12292
ns12666
ns0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
799420
ns799288.5
ns1.00
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s)
5417
ns5667
ns0.96
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s)
5709
ns6250
ns0.91
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s)
6166
ns6459
ns0.95
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s)
5667
ns5500
ns1.03
batchedmm(2, Bsize=128)/forward/GPU/CUDA
17212
ns17328
ns0.99
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s)
15458
ns15583
ns0.99
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s)
15459
ns15417
ns1.00
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s)
15708
ns15625
ns1.01
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s)
15625
ns15791
ns0.99
batchedmm(2, Bsize=128)/zygote/GPU/CUDA
202604
ns202450
ns1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
333
ns375
ns0.89
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
375
ns375
ns1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
416
ns375
ns1.11
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
292
ns333
ns0.88
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
23718
ns23671
ns1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
6167
ns6541
ns0.94
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
6334
ns6459
ns0.98
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
6875
ns6667
ns1.03
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
6291
ns6312.5
ns1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
241777
ns241480.5
ns1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
5917
ns5875
ns1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
6042
ns5834
ns1.04
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
5959
ns5917
ns1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
5834
ns5917
ns0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
24949
ns25115
ns0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
20958
ns21604.5
ns0.97
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
21208
ns21166
ns1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
21625
ns21750
ns0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
21250
ns21708.5
ns0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
267126.5
ns267689.5
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
143875
ns186417
ns0.77
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
143770.5
ns144250
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
149333
ns148916.5
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
144270.5
ns187729
ns0.77
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
169394.5
ns167935.5
ns1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1364229
ns1375083.5
ns0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1311708
ns1321917
ns0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1324520.5
ns1326146
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1349667
ns1322375
ns1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1363355
ns1358092
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
23458
ns23000
ns1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
22250
ns25770.5
ns0.86
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
25167
ns24167
ns1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
22584
ns23834
ns0.95
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
357496
ns354989
ns1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
186729
ns130916
ns1.43
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
175562.5
ns188500
ns0.93
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
180666.5
ns127375
ns1.42
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
165042
ns176959
ns0.93
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1496433
ns1479622.5
ns1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
292
ns375
ns0.78
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
416
ns375
ns1.11
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
375
ns375
ns1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
333
ns292
ns1.14
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
23418
ns23532
ns1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
6167
ns6458
ns0.95
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
6333.5
ns6333
ns1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
7042
ns6833
ns1.03
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
6459
ns6458
ns1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
259427.5
ns258733.5
ns1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
4708
ns4833
ns0.97
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
4625
ns4917
ns0.94
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
5166
ns5709
ns0.90
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
4896
ns4167
ns1.17
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
256096.5
ns256891
ns1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
9917
ns9916
ns1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
10042
ns9958
ns1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
10667
ns10584
ns1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
10750
ns10167
ns1.06
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
1366539
ns1360812
ns1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
1583
ns1667
ns0.95
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
1667
ns1625
ns1.03
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
1625
ns1584
ns1.03
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
1625
ns1583
ns1.03
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA
23036
ns23180
ns0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
5625
ns5708
ns0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
5833
ns5667
ns1.03
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
6041
ns6000
ns1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
5666
ns5708
ns0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA
276314
ns276437
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
6734334
ns6818791
ns0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
6391625
ns6367083
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
6537375
ns6546291.5
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
7542292
ns7662166
ns0.98
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/GPU/CUDA
216147
ns215904
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
24173292
ns24172500
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
21308875
ns21282334
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
21052792
ns21008479
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
29893541
ns29757292
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
2120264
ns2111780
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
37482583
ns48853770.5
ns0.77
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
45446437.5
ns34383187.5
ns1.32
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
45525834
ns45683833.5
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
49665500
ns49363417
ns1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
5916
ns6479.5
ns0.91
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
5729.5
ns6125
ns0.94
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
7166
ns6833
ns1.05
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
5750
ns5750
ns1
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
236953
ns238562.5
ns0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8583
ns8625
ns1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8042
ns8084
ns0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8666
ns8208
ns1.06
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8500
ns7917
ns1.07
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
1066445
ns1069949
ns1.00
lenet(28, 28, 1, 128)/forward/CPU/2 thread(s)
1511791
ns1541500
ns0.98
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s)
1266750
ns1273500
ns0.99
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s)
1624771
ns1639187
ns0.99
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s)
2083583.5
ns2161000
ns0.96
lenet(28, 28, 1, 128)/forward/GPU/CUDA
272636.5
ns276949
ns0.98
lenet(28, 28, 1, 128)/zygote/CPU/2 thread(s)
7911542
ns7986167
ns0.99
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s)
6587125
ns6543375.5
ns1.01
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s)
7180959
ns7167709
ns1.00
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s)
10527750
ns10462145.5
ns1.01
lenet(28, 28, 1, 128)/zygote/GPU/CUDA
1860081
ns1888924
ns0.98
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s)
364792
ns343084
ns1.06
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s)
367208
ns369208
ns0.99
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s)
449270.5
ns456437.5
ns0.98
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s)
23917
ns26417
ns0.91
batchedmm(128, Bsize=4)/forward/GPU/CUDA
46266
ns42517
ns1.09
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s)
743187
ns749479
ns0.99
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s)
805084
ns814979
ns0.99
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s)
1059125
ns1061458
ns1.00
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s)
89959
ns119729.5
ns0.75
batchedmm(128, Bsize=4)/zygote/GPU/CUDA
310715.5
ns307361.5
ns1.01
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s)
397333
ns395625
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s)
288042
ns288375
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s)
288209
ns288167
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s)
750708
ns749875
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA
43949
ns44492
ns0.99
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s)
673458
ns646208
ns1.04
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s)
531458
ns533666
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s)
529250
ns529000
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s)
974917
ns974208
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA
189986
ns191704
ns0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
595125
ns670000
ns0.89
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
645125
ns636958
ns1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
661291.5
ns641042
ns1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
604083.5
ns677625
ns0.89
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
132185
ns132879
ns0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2499541.5
ns2560042
ns0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2451209
ns2486124.5
ns0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2456625
ns2459583
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2529417
ns2464667
ns1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1282545
ns1294427.5
ns0.99
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s)
3333
ns2459
ns1.36
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s)
3708
ns3208
ns1.16
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s)
4125
ns4500
ns0.92
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s)
2708
ns3354
ns0.81
batchedmm(2, Bsize=32)/forward/GPU/CUDA
16211
ns16581
ns0.98
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s)
5292
ns5541
ns0.96
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s)
5250
ns5500
ns0.95
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s)
5667
ns5583
ns1.02
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s)
5625
ns5541
ns1.02
batchedmm(2, Bsize=32)/zygote/GPU/CUDA
197863.5
ns200795
ns0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1466875
ns1458667
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1505417
ns1501750
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1503125
ns1499417
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1440875
ns1438916
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
41133
ns40877
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5168291.5
ns5154042
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5273458
ns5302542
ns0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5291104
ns5280125
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
5023291
ns4986917
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
197140
ns198039.5
ns1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s)
3667
ns3750
ns0.98
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s)
3708
ns3667
ns1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s)
3667
ns3708
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s)
3709
ns3667
ns1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA
32935
ns33533
ns0.98
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s)
15042
ns14875
ns1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s)
15209
ns15125
ns1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s)
15334
ns15417
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s)
15000
ns15125
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA
373770
ns380809
ns0.98
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s)
71500
ns71583
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s)
70750
ns71458
ns0.99
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s)
71125
ns71166
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s)
71083
ns70000
ns1.02
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA
112823
ns112938
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
320042
ns327333
ns0.98
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
315667
ns333917
ns0.95
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
318667
ns320375
ns0.99
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
324000
ns318167
ns1.02
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA
194736
ns194303.5
ns1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
1000
ns1000
ns1
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
1083
ns1000
ns1.08
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
1083
ns1042
ns1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
1000
ns959
ns1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
23415
ns24404
ns0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8208
ns8166
ns1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8167
ns8083
ns1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8375
ns8250
ns1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8042
ns8000
ns1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
263486.5
ns265429.5
ns0.99
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s)
505999.5
ns503042
ns1.01
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s)
497291.5
ns488125
ns1.02
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s)
560209
ns565250
ns0.99
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s)
217875
ns215521
ns1.01
batchedmm(128, Bsize=32)/forward/GPU/CUDA
129532
ns129735
ns1.00
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s)
1384937.5
ns1418125
ns0.98
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s)
1454020.5
ns1470041
ns0.99
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s)
1746937.5
ns1769041.5
ns0.99
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s)
899021
ns863062.5
ns1.04
batchedmm(128, Bsize=32)/zygote/GPU/CUDA
276899
ns275150
ns1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
292
ns375
ns0.78
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
375
ns375
ns1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
375
ns416
ns0.90
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
292
ns292
ns1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
31419.5
ns32275
ns0.97
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
6167
ns6375
ns0.97
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
6333
ns6375
ns0.99
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
6667
ns6645.5
ns1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
6458.5
ns6291.5
ns1.03
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
263608
ns266163
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1728312
ns1767250
ns0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1729000
ns1723000
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1733417
ns1726625
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1738250
ns1769375
ns0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
170018
ns169706
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
4369375
ns4423667
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
3963375
ns4340375
ns0.91
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
4358208
ns4364395.5
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4400041
ns4356604.5
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1280531
ns1259542.5
ns1.02
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s)
6750
ns6875
ns0.98
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s)
6792
ns6708
ns1.01
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s)
6875
ns9167
ns0.75
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s)
6875
ns9667
ns0.71
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA
20701.5
ns21299
ns0.97
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
51792
ns52542
ns0.99
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
51208
ns48458
ns1.06
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
32833
ns32834
ns1.00
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
71875
ns51708.5
ns1.39
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA
222859
ns295364.5
ns0.75
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s)
17625
ns17959
ns0.98
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s)
17625
ns18333
ns0.96
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s)
18291
ns18667
ns0.98
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s)
17937.5
ns17833.5
ns1.01
batchedmm(2, Bsize=512)/forward/GPU/CUDA
18343
ns18767
ns0.98
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s)
53250
ns53250
ns1
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s)
53166
ns53583
ns0.99
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s)
53166
ns53292
ns1.00
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s)
53792
ns53500
ns1.01
batchedmm(2, Bsize=512)/zygote/GPU/CUDA
340623.5
ns337341
ns1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s)
75709
ns75666
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s)
74125
ns75250
ns0.99
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s)
75291
ns75208
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s)
75334
ns74750
ns1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA
47398
ns46984
ns1.01
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
325250
ns342791
ns0.95
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
325333
ns339042
ns0.96
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
324750
ns324833
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
340333
ns325083
ns1.05
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA
211070
ns212927.5
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1491500
ns1484000
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1531125
ns1528916
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1529875
ns1527041
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1465875
ns1464042
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
51611
ns52506
ns0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5144459
ns5172417
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5274708
ns5313667
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5268229
ns5251417
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
5019729.5
ns4985750
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
205600
ns206884
ns0.99
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
28167
ns28375
ns0.99
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
28167
ns28250
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
28291
ns28209
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
28333
ns28208
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA
24406
ns24921
ns0.98
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
66541
ns66458
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
66292
ns66625
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
66375
ns66291
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
66417
ns66250
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA
516526.5
ns525792
ns0.98
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s)
1468729.5
ns1326354
ns1.11
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s)
1131000
ns1132104
ns1.00
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s)
1119791.5
ns1139166
ns0.98
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s)
2241937.5
ns2248604
ns1.00
mlp7layer_bn(tanh)(32 x 256)/forward/GPU/CUDA
581317
ns583822.5
ns1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s)
3109709
ns3055395.5
ns1.02
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s)
2104833
ns2729979
ns0.77
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s)
2739417
ns2738333
ns1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s)
3875250.5
ns3816042
ns1.02
mlp7layer_bn(tanh)(32 x 256)/zygote/GPU/CUDA
2085553.5
ns2120607
ns0.98
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s)
7940229.5
ns8049792
ns0.99
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s)
7908458.5
ns8097167
ns0.98
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s)
7909729.5
ns7911292
ns1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s)
4901667
ns4824937
ns1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
81709
ns82042
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
81979.5
ns81875
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
83833
ns82625
ns1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
80541.5
ns82125
ns0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
193422.5
ns194553
ns0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2029687.5
ns2055125
ns0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2007750
ns2001916.5
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2012750
ns2021458
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2040271
ns2014750
ns1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
811844
ns811167.5
ns1.00
This comment was automatically generated by workflow using github-action-benchmark.