-
Notifications
You must be signed in to change notification settings - Fork 62
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
docs: add CUDA.CURAND.default_rng() to docs (#1105)
- Loading branch information
Showing
1 changed file
with
1 addition
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
6f9f8d6
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Lux Benchmarks
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
3937.5
ns4375
ns0.90
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
4333
ns4333
ns1
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
4917
ns4875
ns1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
4042
ns4291
ns0.94
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
61383
ns62852
ns0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
10583
ns10250
ns1.03
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
10250
ns10542
ns0.97
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
10125
ns10625
ns0.95
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
10083
ns10542
ns0.96
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
431239
ns442826.5
ns0.97
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s)
1042
ns1000
ns1.04
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s)
1334
ns1208
ns1.10
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s)
1333
ns1333
ns1
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s)
1125
ns1333
ns0.84
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA
18191
ns18476
ns0.98
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
4333
ns3895.5
ns1.11
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
4250
ns4167
ns1.02
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
4291
ns4042
ns1.06
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
3834
ns3959
ns0.97
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA
110865.5
ns113416
ns0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
57709
ns57542
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
46667
ns46292
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
46208
ns46541
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
80291
ns83375
ns0.96
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
37897
ns37589.5
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2036958
ns2024896
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2083333.5
ns1835271
ns1.14
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1856125
ns2098250
ns0.88
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1994375
ns2020667
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
198201
ns198299
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
157792
ns143916
ns1.10
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
145875
ns144479.5
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
145583.5
ns145937.5
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
143729
ns143417
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
166222
ns166429
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1114145.5
ns1117416
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1128875
ns995229
ns1.13
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1024292
ns1124542
ns0.91
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1115833.5
ns1145250.5
ns0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
534915.5
ns537731.5
ns0.99
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
3584
ns3792
ns0.95
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
4208
ns3542
ns1.19
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
4000
ns4333.5
ns0.92
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
3583
ns3667
ns0.98
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
67978
ns68177
ns1.00
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
9750
ns9334
ns1.04
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
10459
ns8583
ns1.22
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8625
ns9417
ns0.92
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
9125
ns8854.5
ns1.03
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
495677
ns498056
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
15000
ns16125
ns0.93
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
18500
ns15813
ns1.17
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
16000
ns18791
ns0.85
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
14583
ns15167
ns0.96
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
55105
ns55225
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
213834
ns215625
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
215958
ns213104.5
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
213333
ns214167
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
214375
ns213188
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
276152.5
ns275510
ns1.00
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s)
541
ns500
ns1.08
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s)
792
ns750
ns1.06
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s)
792
ns875
ns0.91
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s)
541
ns667
ns0.81
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA
17241
ns17577
ns0.98
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
1541
ns1500
ns1.03
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
1708
ns1667
ns1.02
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
1791
ns1542
ns1.16
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
1542
ns1500
ns1.03
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA
102070.5
ns103563
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7208
ns7334
ns0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
5958
ns5666
ns1.05
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
5916
ns5958
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10042
ns10333
ns0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
23944
ns23689
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
221312.5
ns221979
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
229500
ns229334
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
228667
ns229417
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
218021
ns214125
ns1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
170367
ns169909
ns1.00
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s)
3958
ns3875
ns1.02
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s)
3917
ns3916
ns1.00
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s)
3917
ns3917
ns1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s)
3875
ns3917
ns0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA
23420
ns23381
ns1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
17000
ns16667
ns1.02
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
17084
ns16750
ns1.02
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
16916
ns17000
ns1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
16708
ns16708
ns1
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA
162884.5
ns162845
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
573416.5
ns571542
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
580333
ns574583
ns1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
568042
ns575500
ns0.99
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
569542
ns575333
ns0.99
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA
113416
ns113453
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
1418250
ns1419645.5
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
1429042
ns1428270.5
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
1420375
ns1425833
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
1437458
ns1425291
ns1.01
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA
212927.5
ns212962.5
ns1.00
lenet(28, 28, 1, 64)/forward/CPU/2 thread(s)
1086895.5
ns1086583
ns1.00
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s)
962854
ns963625.5
ns1.00
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s)
1344792
ns1340708
ns1.00
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s)
1286083
ns1274625
ns1.01
lenet(28, 28, 1, 64)/forward/GPU/CUDA
281106
ns275533.5
ns1.02
lenet(28, 28, 1, 64)/zygote/CPU/2 thread(s)
5908292
ns6003313
ns0.98
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s)
4600625
ns4543291
ns1.01
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s)
4927041.5
ns4950500
ns1.00
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s)
5714562.5
ns5760542
ns0.99
lenet(28, 28, 1, 64)/zygote/GPU/CUDA
1101975
ns1094293
ns1.01
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s)
542
ns500
ns1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s)
542
ns542
ns1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s)
542
ns542
ns1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s)
500
ns541
ns0.92
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA
23476
ns23428
ns1.00
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
2209
ns2084
ns1.06
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
2167
ns2125
ns1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
2167
ns2209
ns0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
2166
ns2083
ns1.04
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA
169515.5
ns170454.5
ns0.99
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
4333
ns4542
ns0.95
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
4500
ns4208
ns1.07
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
4791.5
ns4750
ns1.01
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
3791
ns4166
ns0.91
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
65149
ns66283
ns0.98
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
11584
ns10895.5
ns1.06
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
11333
ns11375
ns1.00
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
11667
ns12334
ns0.95
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
11333
ns11084
ns1.02
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
446339
ns458465.5
ns0.97
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
6916
ns6667
ns1.04
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
6917
ns6541.5
ns1.06
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
7479.5
ns8792
ns0.85
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
6042
ns6000
ns1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
51979.5
ns52738.5
ns0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
19125
ns17667
ns1.08
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
17458
ns17958
ns0.97
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
17792
ns17833
ns1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
17333
ns16875
ns1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
300938.5
ns308559
ns0.98
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
666
ns583
ns1.14
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
584
ns625
ns0.93
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
542
ns666
ns0.81
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
584
ns542
ns1.08
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
32053.5
ns32624
ns0.98
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
9250
ns8750
ns1.06
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
9042
ns8833
ns1.02
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
9375
ns9625
ns0.97
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
8917
ns8667
ns1.03
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
158152
ns159033.5
ns0.99
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s)
64333
ns64667
ns0.99
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s)
64625
ns64583
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s)
64458
ns64458
ns1
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s)
64375
ns64709
ns0.99
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA
111585
ns111312
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s)
287209
ns283500
ns1.01
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s)
277834
ns271791
ns1.02
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s)
280583
ns274167
ns1.02
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s)
281125
ns287250
ns0.98
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA
183928
ns185765
ns0.99
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s)
3298562.5
ns3282083.5
ns1.01
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s)
3083000
ns3018667
ns1.02
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s)
3028771
ns3018083
ns1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s)
4061625
ns3955041
ns1.03
mlp7layer_bn(gelu)(32 x 256)/forward/GPU/CUDA
577723.5
ns584692
ns0.99
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s)
7606291
ns7658895.5
ns0.99
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s)
7495375.5
ns7457750
ns1.01
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s)
7404541
ns7453875
ns0.99
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s)
8192541.5
ns8280228.5
ns0.99
mlp7layer_bn(gelu)(32 x 256)/zygote/GPU/CUDA
1371476
ns1363348
ns1.01
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s)
17505792
ns17573167
ns1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s)
17567291
ns17536583
ns1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s)
17475667
ns17554104.5
ns1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s)
14122958.5
ns14252687.5
ns0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
23660020.5
ns23479854
ns1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
34147791.5
ns33441750
ns1.02
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
37059937.5
ns37263874.5
ns0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
34985187.5
ns35385958
ns0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/GPU/CUDA
1854503
ns1857021
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
187449375
ns189054666
ns0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
233703458.5
ns232216291.5
ns1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
195671083
ns192889813
ns1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
433586291
ns446068417
ns0.97
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
13830860.5
ns13856600
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
288446709
ns287697167
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
337867791
ns333361166
ns1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
296978708
ns296288146
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
400413062.5
ns358087917
ns1.12
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
22084
ns21542
ns1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
24979.5
ns21917
ns1.14
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
23875
ns23833
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
21666
ns23000
ns0.94
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
98077
ns99298.5
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
102958
ns103416
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
104792
ns102959
ns1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
103812
ns105041
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
110292
ns103708
ns1.06
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
512479
ns518798
ns0.99
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
6125
ns6000
ns1.02
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
6500
ns5750
ns1.13
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
7062.5
ns6708
ns1.05
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
6083
ns6334
ns0.96
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
69253
ns69881
ns0.99
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
15166
ns14833.5
ns1.02
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
16145.5
ns14917
ns1.08
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
16208
ns16166
ns1.00
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
15083
ns15333.5
ns0.98
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
482969
ns489724.5
ns0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
3041271
ns2971270.5
ns1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
2067458.5
ns2063958
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
2297479.5
ns2272542
ns1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
4457375
ns4666750
ns0.96
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/GPU/CUDA
592674
ns591884
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
23527562.5
ns23554624.5
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
18050396
ns18028687.5
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
17902834
ns17852125
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
35496125
ns36060875
ns0.98
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
2768935.5
ns2769683
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
33385459
ns33424000
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
27540666
ns27634645.5
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
28658250
ns28456458
ns1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
41547354.5
ns41983541
ns0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
74875
ns71959
ns1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
74396
ns75187.5
ns0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
75500
ns76708
ns0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
74333
ns74750
ns0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
102653
ns105798
ns0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
291291.5
ns296145.5
ns0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
318417
ns218625
ns1.46
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
208187.5
ns213708
ns0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
290437.5
ns318812.5
ns0.91
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
545207.5
ns566311.5
ns0.96
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
11958
ns11792
ns1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
12145.5
ns12250
ns0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
14209
ns12750
ns1.11
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
11500
ns12500
ns0.92
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
70994
ns73134
ns0.97
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
27042
ns26042
ns1.04
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
26917
ns27042
ns1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
27625
ns27875
ns0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
26958
ns27417
ns0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
469447.5
ns488772.5
ns0.96
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
12708
ns11792
ns1.08
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
12708
ns12625
ns1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
14125
ns14250
ns0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
11958
ns12750
ns0.94
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
52810
ns54676
ns0.97
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
25958
ns25458
ns1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
26209
ns25334
ns1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
25958
ns27000
ns0.96
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
26875
ns26167
ns1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
301312
ns315843.5
ns0.95
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
179875
ns179292
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
181083
ns182084
ns0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
182500
ns182291.5
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
179666
ns183250
ns0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
56497
ns58956
ns0.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
582959
ns581896
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
588917
ns588312.5
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
585083
ns583583
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
590500
ns587083.5
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
286103
ns294154
ns0.97
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
6417
ns5625
ns1.14
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
7125
ns6334
ns1.12
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
8083
ns6979.5
ns1.16
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
5750
ns6520.5
ns0.88
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
70488.5
ns72600.5
ns0.97
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
14875
ns13916
ns1.07
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
14458
ns14417
ns1.00
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
15417
ns15667
ns0.98
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
14291
ns14895.5
ns0.96
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
457568
ns479621.5
ns0.95
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s)
1207438
ns1206937.5
ns1.00
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s)
1241417
ns1245542
ns1.00
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s)
1284208
ns1292542
ns0.99
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s)
997354.5
ns1006417
ns0.99
batchedmm(512, Bsize=4)/forward/GPU/CUDA
301394.5
ns299757
ns1.01
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s)
4107041.5
ns4169416.5
ns0.99
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s)
4414458
ns4414833
ns1.00
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s)
4959854.5
ns4586979
ns1.08
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s)
3696125
ns3897020.5
ns0.95
batchedmm(512, Bsize=4)/zygote/GPU/CUDA
1040815
ns1046888
ns0.99
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
1834
ns1792
ns1.02
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
1834
ns1834
ns1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
1875
ns1875
ns1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
1833
ns1875
ns0.98
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA
23635
ns23582
ns1.00
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
4959
ns4833
ns1.03
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
5041
ns4916
ns1.03
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
4958
ns4917
ns1.01
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
4958
ns4875
ns1.02
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA
185922
ns189156.5
ns0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
6250
ns5792
ns1.08
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
6625
ns5917
ns1.12
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
6334
ns7375
ns0.86
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
5666
ns5917
ns0.96
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
55102
ns56730.5
ns0.97
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
11084
ns10500
ns1.06
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
11834
ns10833
ns1.09
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
10791
ns11542
ns0.93
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
10875
ns10667
ns1.02
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
330730
ns346270
ns0.96
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s)
334
ns292
ns1.14
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s)
333
ns333
ns1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s)
333
ns333
ns1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s)
292
ns375
ns0.78
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA
22810
ns22862
ns1.00
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
3000
ns2709
ns1.11
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
3000
ns2750
ns1.09
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
2959
ns3042
ns0.97
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
2750
ns2750
ns1
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA
156803.5
ns160360.5
ns0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
11750
ns11000
ns1.07
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
11958
ns11833
ns1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
13292
ns13334
ns1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
11292
ns11583
ns0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
57953
ns61359.5
ns0.94
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
25291.5
ns24541
ns1.03
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
24917
ns24459
ns1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
25125
ns25542
ns0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
24542
ns24542
ns1
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
293802.5
ns310594.5
ns0.95
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s)
4167
ns4208
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s)
4209
ns4167
ns1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s)
4208
ns4209
ns1.00
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s)
4208
ns4250
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA
24619
ns24670
ns1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
16375
ns16250
ns1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
16292
ns16083
ns1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
16333
ns16291
ns1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
16250
ns15959
ns1.02
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA
195053
ns204493
ns0.95
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
5833
ns5792
ns1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
5834
ns5791
ns1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
5834
ns5875
ns0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
5875
ns5833
ns1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
33320
ns33437
ns1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
21209
ns20500
ns1.03
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
21125
ns21167
ns1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
21625
ns22250
ns0.97
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
20667
ns20895.5
ns0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
173685
ns177667.5
ns0.98
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s)
426708
ns419666.5
ns1.02
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s)
384958
ns386209
ns1.00
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s)
482062.5
ns478875
ns1.01
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s)
102708.5
ns109479
ns0.94
batchedmm(16, Bsize=512)/forward/GPU/CUDA
66966
ns67033
ns1.00
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s)
909146
ns909583.5
ns1.00
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s)
972729
ns973708.5
ns1.00
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s)
1175729
ns1177021
ns1.00
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s)
439917
ns462625
ns0.95
batchedmm(16, Bsize=512)/zygote/GPU/CUDA
190337.5
ns190401
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
80625
ns81208
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
81500
ns81791
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
81708
ns82417
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
80187.5
ns81291.5
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
193436
ns193814.5
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1902458
ns1935667
ns0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1931125
ns1916333.5
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1927562.5
ns1698562.5
ns1.13
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1906917
ns1930833.5
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
397725
ns412302
ns0.96
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s)
292
ns292
ns1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s)
333
ns292
ns1.14
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s)
292
ns292
ns1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s)
292
ns333
ns0.88
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA
22050
ns21830
ns1.01
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s)
1834
ns1792
ns1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s)
1834
ns1792
ns1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s)
1834
ns1834
ns1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s)
1833
ns1792
ns1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA
169534
ns176875.5
ns0.96
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
6875
ns6459
ns1.06
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
7146
ns6333
ns1.13
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
7667
ns8041
ns0.95
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
6500
ns6875
ns0.95
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
62608.5
ns68185.5
ns0.92
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
9542
ns9375
ns1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
9333
ns9125
ns1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
9083
ns9417
ns0.96
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
9458
ns9542
ns0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
313766.5
ns336822.5
ns0.93
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
118190208
ns120048500
ns0.98
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
174175750
ns173952417
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
147818500
ns148044250
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
107522750
ns104491521
ns1.03
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/GPU/CUDA
5476530
ns5473411
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
612187917
ns614607916.5
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
556303083
ns555101125
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
452274750
ns456940500.5
ns0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
757288396
ns767695250.5
ns0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
38234410
ns34955825
ns1.09
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
649126292
ns651124125
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
667267229
ns669116625
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
589618437.5
ns578815791.5
ns1.02
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
741758417
ns742698833
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
57583
ns60083
ns0.96
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
47375
ns48000
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
47833
ns46333
ns1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
82250
ns84917
ns0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
37784
ns38192
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1917978.5
ns1922875
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1995291.5
ns1974250
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1985646
ns1990062.5
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1843354
ns1906625
ns0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
172983
ns174155.5
ns0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
266084
ns267042
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
268750
ns265667
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
268209
ns269750
ns0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
267562
ns267791.5
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
132212
ns147410.5
ns0.90
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
650416.5
ns592896
ns1.10
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
674667
ns684208.5
ns0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
589437.5
ns589520.5
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
688771
ns698666
ns0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
730804
ns813499.5
ns0.90
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
2181417
ns2209791.5
ns0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
2196416.5
ns2209896
ns0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
2101104
ns2103125
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
2231125
ns2236250
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
133510.5
ns133936
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5502917
ns5510542
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5510333
ns5555437.5
ns0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5498521
ns5498229
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
5441417
ns5548208
ns0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
776428
ns892951
ns0.87
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
640333
ns637334
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
646083
ns646875
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
646875
ns646625
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
635334
ns654875
ns0.97
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA
47144
ns46730
ns1.01
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
1818833
ns1826417
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
1727958
ns1722937.5
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
1724083
ns1719334
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
2099750
ns2093625
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA
220116
ns221633
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
58500
ns59125
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
47083
ns46417
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
46458
ns47000
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
81500
ns84917
ns0.96
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
28922
ns28496
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2025729
ns2036604.5
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2106791.5
ns1836917
ns1.15
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2095000
ns2095792
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1998375
ns2019479
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
189080
ns190943.5
ns0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
13351000
ns13379729
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
12437395.5
ns12436958
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
12498666.5
ns12541125
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
14894375
ns15244750
ns0.98
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/GPU/CUDA
519065
ns515571.5
ns1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
47200625
ns47364291.5
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
41881708
ns41915083
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
40754334
ns40768458
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
58105083
ns59377021
ns0.98
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
2883161
ns2882313
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
96219125
ns74658083.5
ns1.29
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
91954062.5
ns90949750
ns1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
90758584
ns90367208
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
98984500
ns99601541
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
58708
ns59459
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
47000
ns47417
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
47291
ns47333
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
82000
ns84375
ns0.97
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
47821
ns48174
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1902250
ns1935167
ns0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1986000
ns1966979
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1978125
ns1974250
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1883042
ns1911312.5
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
194258.5
ns197391.5
ns0.98
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
417
ns333
ns1.25
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
375
ns375
ns1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
333
ns375
ns0.89
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
333
ns333
ns1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
32804.5
ns32694
ns1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
6792
ns6042
ns1.12
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
6625
ns6375
ns1.04
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
6708
ns6583
ns1.02
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
6166
ns6083
ns1.01
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
179592
ns187739
ns0.96
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s)
292
ns250
ns1.17
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s)
292
ns292
ns1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s)
292
ns292
ns1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s)
292
ns333
ns0.88
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA
32076
ns32507
ns0.99
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s)
2917
ns2625
ns1.11
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s)
2875
ns2792
ns1.03
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s)
2875
ns2917
ns0.99
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s)
2625
ns2667
ns0.98
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA
166744
ns175489
ns0.95
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
284237292
ns286852687.5
ns0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
339653916.5
ns339813500
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
313913791.5
ns313305624.5
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
272402875
ns269143125
ns1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/GPU/CUDA
7047786.5
ns7114947
ns0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
993594875
ns994653583
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
945283292
ns936368458
ns1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
835507124.5
ns837895375.5
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
1160037292
ns1177393750
ns0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
34045459
ns34035408
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
1668906166
ns1316953312.5
ns1.27
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
1694695167
ns1689250042
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
1627000917
ns1683427084
ns0.97
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
1703328625
ns1672545042
ns1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1418646
ns1454500
ns0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1413958
ns1407875
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1414875
ns1409834
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1411416
ns1414167
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
128242
ns128152
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5029312.5
ns5044833
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5037875
ns4714312.5
ns1.07
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5028146
ns5026437.5
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
5024417
ns5051042
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
552451.5
ns686490.5
ns0.80
vgg16(32, 32, 3, 32)/forward/CPU/2 thread(s)
170453833
ns172590354
ns0.99
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s)
127944542
ns124318041
ns1.03
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s)
129428958
ns123190833
ns1.05
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s)
164372666.5
ns165357625
ns0.99
vgg16(32, 32, 3, 32)/forward/GPU/CUDA
4859943
ns4891073
ns0.99
vgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s)
620949625
ns615854000
ns1.01
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s)
515114583
ns630625333
ns0.82
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s)
463124083
ns562103875
ns0.82
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s)
648066667
ns653647292
ns0.99
vgg16(32, 32, 3, 32)/zygote/GPU/CUDA
16797902
ns16015121
ns1.05
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s)
8927250
ns9006416.5
ns0.99
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s)
8950584
ns8896937.5
ns1.01
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s)
7917333
ns7913208
ns1.00
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s)
9753125
ns9977125
ns0.98
batchedmm(512, Bsize=32)/forward/GPU/CUDA
1591258
ns1591890.5
ns1.00
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s)
35919479
ns35918500
ns1.00
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s)
37210542
ns36875416
ns1.01
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s)
33517916.5
ns33279603.5
ns1.01
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s)
37573417
ns39552041.5
ns0.95
batchedmm(512, Bsize=32)/zygote/GPU/CUDA
6470424
ns6456562
ns1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s)
47417
ns47709
ns0.99
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s)
47583
ns47375
ns1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s)
47708
ns47500
ns1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s)
47417
ns47542
ns1.00
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA
18601
ns19056.5
ns0.98
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s)
50542
ns50500
ns1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s)
50458
ns50250
ns1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s)
50542
ns50750
ns1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s)
52916.5
ns50375
ns1.05
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA
206886.5
ns263004.5
ns0.79
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
7000
ns6500
ns1.08
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
7291
ns6875
ns1.06
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
7625
ns8250
ns0.92
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
7125
ns7000
ns1.02
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
89400.5
ns145281
ns0.62
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
10625
ns9750
ns1.09
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
10333.5
ns10250
ns1.01
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
10417
ns10500
ns0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
10479.5
ns10042
ns1.04
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
543240.5
ns744130.5
ns0.73
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
6000
ns5875
ns1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
6166
ns6125
ns1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
7083
ns7375
ns0.96
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
5666.5
ns5750
ns0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
121379.5
ns151878
ns0.80
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
13333
ns13042
ns1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
13000
ns13250
ns0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
13333
ns13250
ns1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
12687.5
ns13542
ns0.94
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
510219
ns607325
ns0.84
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
1125
ns1042
ns1.08
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
1084
ns1000
ns1.08
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
1084
ns1084
ns1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
1083
ns1083
ns1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
32431.5
ns32597
ns0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
8375
ns8000
ns1.05
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
8542
ns7833
ns1.09
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
8292
ns8292
ns1
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
8042
ns8375
ns0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
204899
ns237194
ns0.86
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
23208
ns23167
ns1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
23417
ns23084
ns1.01
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
23666
ns23458
ns1.01
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
23334
ns23292
ns1.00
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA
18285
ns18583
ns0.98
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
52625
ns52500
ns1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
52709
ns54208.5
ns0.97
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
53083
ns53083
ns1
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
52562.5
ns52667
ns1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA
286000
ns385105.5
ns0.74
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1398458
ns1402916
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1450667
ns1454958
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1398999.5
ns1401792
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1395750.5
ns1456375
ns0.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
196905
ns197023
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5011896
ns5029479.5
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5032187.5
ns5009000
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5012250
ns5018334
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
5002687.5
ns5048291.5
ns0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
598226
ns723983
ns0.83
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
3070875
ns3027250
ns1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
2072042
ns2080437.5
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
2289104.5
ns2291437.5
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
4773854
ns4926416
ns0.97
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/GPU/CUDA
584355
ns586737
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
24311583
ns24385583.5
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
18870583.5
ns18885375
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
19070166.5
ns18817833
ns1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
36514562.5
ns37153959
ns0.98
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
2861612.5
ns2840867
ns1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
34008958
ns34104021
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
28397792
ns28291895.5
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
27946625
ns28002792
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
41793708.5
ns42422958
ns0.99
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s)
144075292
ns142464667
ns1.01
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s)
147842750
ns147805542
ns1.00
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s)
126624187.5
ns127021875
ns1.00
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s)
172290146
ns173652229
ns0.99
batchedmm(512, Bsize=512)/forward/GPU/CUDA
22560426
ns22558116
ns1.00
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s)
1298569062.5
ns1200697875
ns1.08
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s)
886633209
ns1864615833.5
ns0.48
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s)
1199135125
ns1647966021
ns0.73
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s)
689233333
ns686829458
ns1.00
batchedmm(512, Bsize=512)/zygote/GPU/CUDA
117701235
ns117772826
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
73000
ns75375
ns0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
73292
ns84209
ns0.87
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
85645.5
ns76458
ns1.12
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
72583
ns80958
ns0.90
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
223969
ns312734.5
ns0.72
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
276062.5
ns204687.5
ns1.35
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
287625
ns278104
ns1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
282625
ns192375
ns1.47
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
190583
ns283312.5
ns0.67
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1155754
ns1488616
ns0.78
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s)
35424583
ns35722250
ns0.99
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s)
36355854
ns36312354
ns1.00
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s)
32516083.5
ns32588937.5
ns1.00
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s)
40329917
ns40883292
ns0.99
batchedmm(512, Bsize=128)/forward/GPU/CUDA
5847057
ns5836813
ns1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s)
144746000
ns149459958
ns0.97
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s)
153804708.5
ns153182708.5
ns1.00
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s)
140298187
ns140187104
ns1.00
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s)
283107125
ns226961625.5
ns1.25
batchedmm(512, Bsize=128)/zygote/GPU/CUDA
34865240
ns34882818.5
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
121095354
ns121271541.5
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
174763417
ns174726458
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
148056208
ns147669333
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
105211667
ns105646958
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/GPU/CUDA
5466322
ns5477234.5
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
468110062.5
ns471261458.5
ns0.99
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
466487917
ns465682583
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
437682625
ns434340042
ns1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
737562458
ns758899104.5
ns0.97
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
35152775
ns32272056.5
ns1.09
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
706128833.5
ns709031375
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
656179312
ns654357417
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
571296688
ns581732375
ns0.98
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
731578125
ns734152875
ns1.00
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s)
1324833
ns1246834
ns1.06
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s)
963417
ns970729
ns0.99
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s)
979125
ns905979
ns1.08
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s)
2064125
ns2088750
ns0.99
mlp7layer_bn(relu)(32 x 256)/forward/GPU/CUDA
573443.5
ns584722.5
ns0.98
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s)
2963875
ns3017521
ns0.98
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s)
2641084
ns2605541
ns1.01
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s)
2621249.5
ns2618042
ns1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s)
3522250
ns3762104
ns0.94
mlp7layer_bn(relu)(32 x 256)/zygote/GPU/CUDA
1659147
ns1908342
ns0.87
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s)
5792625
ns5812937.5
ns1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s)
5824583.5
ns5782937.5
ns1.01
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s)
5815083.5
ns5769333
ns1.01
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s)
2879416
ns2967958
ns0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7292
ns7500
ns0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
6333
ns6250
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
6250
ns6125
ns1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
9917
ns10375
ns0.96
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
25248
ns25756
ns0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
212708
ns212417
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
220666
ns221208.5
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
221208
ns220334
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
206375
ns206375
ns1
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
250623
ns307623
ns0.81
vgg16(32, 32, 3, 64)/forward/CPU/2 thread(s)
307616584
ns310236833.5
ns0.99
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s)
221441583
ns228243416
ns0.97
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s)
198752396
ns199023750
ns1.00
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s)
309471333
ns307111500
ns1.01
vgg16(32, 32, 3, 64)/forward/GPU/CUDA
7903869
ns7677099
ns1.03
vgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s)
1075422250
ns1077070792
ns1.00
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s)
906727646
ns909540270.5
ns1.00
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s)
801892167
ns811121083
ns0.99
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s)
1153514499.5
ns1177347271
ns0.98
vgg16(32, 32, 3, 64)/zygote/GPU/CUDA
26746953
ns26401108
ns1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
5791.5
ns5625
ns1.03
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
5917
ns5833.5
ns1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
6375
ns6667
ns0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
4875
ns5208
ns0.94
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
155781
ns199489.5
ns0.78
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7625
ns7208
ns1.06
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7334
ns7334
ns1
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7562.5
ns7416
ns1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7083
ns7291
ns0.97
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
649264
ns722768
ns0.90
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
625
ns583
ns1.07
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
542
ns667
ns0.81
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
666
ns667
ns1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
542
ns500
ns1.08
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
23898
ns24721
ns0.97
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
9333.5
ns9000
ns1.04
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
9542
ns9209
ns1.04
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
9833
ns9709
ns1.01
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
8792
ns9416
ns0.93
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
220286
ns238784
ns0.92
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
351479.5
ns353291.5
ns0.99
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
352042
ns351750
ns1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
353812.5
ns352354.5
ns1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
354687.5
ns362833
ns0.98
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA
21024
ns21565
ns0.97
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
811959
ns814792
ns1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
778625
ns826041.5
ns0.94
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
774625
ns777875
ns1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
821708
ns829958
ns0.99
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA
304830.5
ns302093.5
ns1.01
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s)
339000
ns337583.5
ns1.00
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s)
343083
ns340250
ns1.01
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s)
451041.5
ns444208
ns1.02
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s)
10583
ns10812.5
ns0.98
batchedmm(16, Bsize=32)/forward/GPU/CUDA
18316
ns18424
ns0.99
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s)
714000
ns719166.5
ns0.99
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s)
742750.5
ns721917
ns1.03
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s)
1003583
ns1006458
ns1.00
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s)
26375
ns28250
ns0.93
batchedmm(16, Bsize=32)/zygote/GPU/CUDA
291054.5
ns299767.5
ns0.97
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s)
384958.5
ns379708.5
ns1.01
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s)
348083
ns349958
ns0.99
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s)
444917
ns436354
ns1.02
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s)
30125
ns30833
ns0.98
batchedmm(16, Bsize=128)/forward/GPU/CUDA
23128
ns23185.5
ns1.00
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s)
738542
ns737500
ns1.00
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s)
791896
ns772041.5
ns1.03
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s)
1018521
ns1022146
ns1.00
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s)
105270.5
ns101459
ns1.04
batchedmm(16, Bsize=128)/zygote/GPU/CUDA
225989
ns233048
ns0.97
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s)
3708
ns3458
ns1.07
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s)
3708
ns3625
ns1.02
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s)
3792
ns3625
ns1.05
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s)
3542
ns3625
ns0.98
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA
17710.5
ns18179
ns0.97
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s)
4500
ns4334
ns1.04
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s)
4250
ns4625
ns0.92
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s)
4500
ns4625
ns0.97
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s)
4417
ns4458
ns0.99
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA
279830
ns297309.5
ns0.94
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
3666
ns3833
ns0.96
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
4125
ns3708
ns1.11
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
4500
ns4208.5
ns1.07
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
3708
ns4104
ns0.90
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
199332
ns236489
ns0.84
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8875
ns8458
ns1.05
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8500
ns8166
ns1.04
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8458
ns8792
ns0.96
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8500
ns8708
ns0.98
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
1228522.5
ns1272633
ns0.97
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
203645.5
ns208042
ns0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
210208
ns215895.5
ns0.97
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
210792
ns211084
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
201125
ns199667
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
34981
ns35583
ns0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
611520.5
ns645812.5
ns0.95
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
624479.5
ns623291
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
624000.5
ns622916
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
630624.5
ns638333
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
361212
ns366544.5
ns0.99
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s)
995583
ns1020979
ns0.98
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s)
1022646
ns1006020.5
ns1.02
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s)
952562
ns957729
ns0.99
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s)
869209
ns904000
ns0.96
batchedmm(128, Bsize=128)/forward/GPU/CUDA
207395.5
ns208984
ns0.99
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s)
4529458
ns4550166.5
ns1.00
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s)
4744750
ns4713709
ns1.01
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s)
4448625
ns4462125
ns1.00
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s)
5089542
ns5571625
ns0.91
batchedmm(128, Bsize=128)/zygote/GPU/CUDA
933469
ns936095
ns1.00
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
3666
ns3708
ns0.99
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
3375
ns3708.5
ns0.91
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
4167
ns4292
ns0.97
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
3209
ns3709
ns0.87
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
242210.5
ns245340.5
ns0.99
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7792
ns7167
ns1.09
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7167
ns7375
ns0.97
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7417
ns7708
ns0.96
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7208
ns7209
ns1.00
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
1046390.5
ns1060150.5
ns0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
1637500.5
ns1616083
ns1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
1186917
ns1153750
ns1.03
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
1336062.5
ns1337250
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
2468375
ns2432374.5
ns1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/GPU/CUDA
213930
ns217163
ns0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
12339333
ns12337062.5
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
9615979.5
ns9522833
ns1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
9254104
ns9266729
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
17996208
ns18081312
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
1954541
ns1948614
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
17361479
ns17355771
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
14427833
ns14388208.5
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
14271583.5
ns14348354
ns0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
21144917
ns21196875
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
88834
ns88312.5
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
91500
ns89271
ns1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
90834
ns91125
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
87625
ns91625
ns0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
125982
ns126391
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2019500
ns2036875
ns0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2042708
ns2015416.5
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2028042
ns1865791
ns1.09
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2025999.5
ns2043208
ns0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1063927.5
ns1072650
ns0.99
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s)
3541.5
ns2813
ns1.26
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s)
2333
ns2791
ns0.84
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s)
3584
ns3375
ns1.06
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s)
1500
ns1833
ns0.82
batchedmm(2, Bsize=4)/forward/GPU/CUDA
15780.5
ns16578
ns0.95
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s)
3000
ns2542
ns1.18
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s)
2958
ns2625
ns1.13
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s)
2709
ns2875
ns0.94
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s)
2875
ns3000
ns0.96
batchedmm(2, Bsize=4)/zygote/GPU/CUDA
195545.5
ns199941.5
ns0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7208
ns7167
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
6042
ns5709
ns1.06
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
6000
ns5833
ns1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10000
ns10250
ns0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
33801
ns34656
ns0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
221667
ns212709
ns1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
228625
ns220625
ns1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
221000
ns220541
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
206709
ns220354
ns0.94
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
347206.5
ns356302
ns0.97
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s)
3708
ns3708
ns1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s)
3709
ns3708
ns1.00
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s)
3708
ns3709
ns1.00
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s)
3750
ns3750
ns1
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA
22295
ns22913
ns0.97
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s)
14500
ns14584
ns0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s)
14459
ns14458
ns1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s)
14458
ns14500
ns1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s)
14417
ns14166.5
ns1.02
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA
485845.5
ns486419
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
92250
ns92375
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
94209
ns93333.5
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
95250
ns94916.5
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
91750
ns95417
ns0.96
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
125421
ns125841
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1929000
ns1915000
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1929917
ns1914209
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1922333
ns1928125.5
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1923646
ns1940729
ns0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
960449
ns1045000
ns0.92
lenet(28, 28, 1, 32)/forward/CPU/2 thread(s)
875291
ns871959
ns1.00
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s)
817687.5
ns821041.5
ns1.00
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s)
1220791.5
ns1216500
ns1.00
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s)
956708
ns943271
ns1.01
lenet(28, 28, 1, 32)/forward/GPU/CUDA
270219.5
ns280426
ns0.96
lenet(28, 28, 1, 32)/zygote/CPU/2 thread(s)
2786125
ns2729167
ns1.02
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s)
2476771
ns2498104
ns0.99
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s)
3326500
ns3340041
ns1.00
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s)
3277354
ns3427250
ns0.96
lenet(28, 28, 1, 32)/zygote/GPU/CUDA
1614761
ns1723859
ns0.94
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
16229
ns17229
ns0.94
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
18000
ns17875
ns1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
17084
ns18792
ns0.91
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
14895.5
ns15375
ns0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
142802.5
ns190406.5
ns0.75
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
222458
ns228541
ns0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
216625
ns220833.5
ns0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
216270.5
ns216521
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
225667
ns228437.5
ns0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
642849
ns725866.5
ns0.89
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
220666.5
ns221145.5
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
223437.5
ns222000
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
221291.5
ns221291
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
220208
ns221083.5
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
270694.5
ns321034
ns0.84
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
498104.5
ns495084
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
505958
ns496625
ns1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
498020.5
ns496729
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
500229
ns507625
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1376499
ns1510358
ns0.91
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s)
4000
ns3854
ns1.04
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s)
3667
ns3875
ns0.95
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s)
5875
ns5042
ns1.17
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s)
3666
ns4083
ns0.90
batchedmm(16, Bsize=4)/forward/GPU/CUDA
16958
ns17250
ns0.98
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s)
7333
ns7167
ns1.02
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s)
7333
ns6959
ns1.05
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s)
7125
ns7250
ns0.98
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s)
7542
ns7416
ns1.02
batchedmm(16, Bsize=4)/zygote/GPU/CUDA
195319
ns201503.5
ns0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
17333
ns20083
ns0.86
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
20291.5
ns16875
ns1.20
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
19354.5
ns19500
ns0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
16708.5
ns18167
ns0.92
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
146982.5
ns232442
ns0.63
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
214625
ns224916
ns0.95
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
212500
ns212708
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
212792
ns212416
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
221417
ns248812.5
ns0.89
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1020818
ns1078558.5
ns0.95
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
4125
ns4333
ns0.95
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
4500
ns4250
ns1.06
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
5291
ns5209
ns1.02
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
3542
ns4417
ns0.80
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
241162.5
ns255475
ns0.94
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
11000
ns10708
ns1.03
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
10459
ns10166
ns1.03
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
10916
ns10833
ns1.01
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
10125
ns11375
ns0.89
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
1056501
ns1114054
ns0.95
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
3333
ns3500
ns0.95
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
3875
ns3583.5
ns1.08
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
4250
ns4458
ns0.95
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
2938
ns3917
ns0.75
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
237567.5
ns247293
ns0.96
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7875
ns7542
ns1.04
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7709
ns7250
ns1.06
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7250
ns8083
ns0.90
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7458.5
ns7791
ns0.96
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
1070019
ns1128935
ns0.95
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
23347771
ns23544917
ns0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
35406500
ns34700375
ns1.02
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
37669583
ns37800604.5
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
34858666
ns35322562.5
ns0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/GPU/CUDA
1830001
ns1834217
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
183823166
ns183535208
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
159867750
ns159261916
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
146428479.5
ns146891041.5
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
410553708
ns419037250
ns0.98
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
16506890.5
ns16405198.5
ns1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
424862333.5
ns428624000
ns0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
253527416.5
ns254269584
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
295623854.5
ns296570146
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
480544667
ns493357917
ns0.97
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
182875
ns184479.5
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
185563
ns184208
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
184500
ns185416
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
182250
ns184062.5
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
218471
ns233424
ns0.94
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
633375
ns585417
ns1.08
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
596208
ns589833
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
587250
ns586896
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
590520.5
ns639166
ns0.92
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1067870
ns1146333
ns0.93
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s)
3926937.5
ns3917708
ns1.00
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s)
3941459
ns3921208
ns1.01
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s)
3667000
ns3581645.5
ns1.02
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s)
4544333.5
ns4674709
ns0.97
batchedmm(128, Bsize=512)/forward/GPU/CUDA
531767
ns538155
ns0.99
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s)
17389166
ns17548417
ns0.99
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s)
17947521
ns17792083
ns1.01
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s)
16390812
ns16472417
ns1.00
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s)
19902458.5
ns21347458
ns0.93
batchedmm(128, Bsize=512)/zygote/GPU/CUDA
2636393
ns2621425
ns1.01
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
625
ns542
ns1.15
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
584
ns625
ns0.93
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
625
ns584
ns1.07
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
542
ns583
ns0.93
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
32468
ns33117
ns0.98
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
9479.5
ns9291
ns1.02
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
9500
ns9354.5
ns1.02
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
9584
ns9792
ns0.98
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
8875
ns9416
ns0.94
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
263813
ns269036
ns0.98
vgg16(32, 32, 3, 128)/forward/CPU/2 thread(s)
498564458
ns503680250
ns0.99
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s)
426956020.5
ns425402999.5
ns1.00
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s)
423367333
ns418147958
ns1.01
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s)
596263958
ns678706395.5
ns0.88
vgg16(32, 32, 3, 128)/forward/GPU/CUDA
12482792
ns12481919
ns1.00
vgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s)
1875323562.5
ns1881496729.5
ns1.00
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s)
1628477375
ns1619255500
ns1.01
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s)
1492393083.5
ns1494277750
ns1.00
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s)
2205444916.5
ns2234203604
ns0.99
vgg16(32, 32, 3, 128)/zygote/GPU/CUDA
49302271
ns49122118.5
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
1639125
ns1536334
ns1.07
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
1202125
ns1156271
ns1.04
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
1357187.5
ns1380541
ns0.98
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
2457312
ns2362625
ns1.04
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/GPU/CUDA
213583.5
ns217676
ns0.98
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
12714125
ns12766416
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
9952750
ns9918708
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
9614459
ns9674833
ns0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
18361979
ns18454708
ns0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
2064490
ns2051013
ns1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
17715625
ns17738333
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
14737021
ns14710417
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
14521854
ns14604375
ns0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
21413792
ns21451125
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
26292
ns26250
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
26250
ns26208
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
26666
ns26250
ns1.02
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
26250
ns26250
ns1
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA
24074
ns23581.5
ns1.02
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
67542
ns67333
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
67625
ns68000
ns0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
68125
ns67333
ns1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
67042
ns67042
ns1
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA
400556.5
ns404754.5
ns0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
203812.5
ns205583
ns0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
210750
ns209125
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
209833
ns209000
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
199375
ns199041
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
27041
ns26431.5
ns1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
627333
ns611478.5
ns1.03
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
626584
ns633292
ns0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
622042
ns670416
ns0.93
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
580541
ns611479
ns0.95
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
355125.5
ns353085.5
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
640833
ns612333
ns1.05
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
653000
ns643520.5
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
599854
ns644958
ns0.93
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
599062.5
ns652334
ns0.92
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
132599.5
ns132321
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2247625
ns2263750
ns0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2173250
ns2226645.5
ns0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2242375
ns2243875
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2238250
ns2302583
ns0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1242951
ns1253025
ns0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
17459
ns19667
ns0.89
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
19917
ns16917
ns1.18
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
18958
ns21500.5
ns0.88
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
17500
ns18208
ns0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
146290
ns145311.5
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
227125
ns233042
ns0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
229687.5
ns218770.5
ns1.05
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
219959
ns262625
ns0.84
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
218709
ns230000
ns0.95
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1041939
ns1059070.5
ns0.98
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
625
ns583
ns1.07
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
625
ns667
ns0.94
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
666
ns667
ns1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
542
ns542
ns1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
24055
ns23551
ns1.02
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
10042
ns9667
ns1.04
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
9875
ns9583
ns1.03
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
10292
ns10583
ns0.97
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
9333
ns9959
ns0.94
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
260885.5
ns258697
ns1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
6208
ns5833
ns1.06
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
6042
ns5542
ns1.09
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
6416
ns6500
ns0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
5062.5
ns4958
ns1.02
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
223280.5
ns231871
ns0.96
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7625
ns6875
ns1.11
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7416
ns7125
ns1.04
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7459
ns7792
ns0.96
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
6875
ns6833
ns1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
794061
ns803650
ns0.99
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
2083
ns2166
ns0.96
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
2333
ns1917
ns1.22
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
2542
ns2417
ns1.05
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
2166
ns2333
ns0.93
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA
17949
ns17852
ns1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
6584
ns6417
ns1.03
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
6584
ns6458
ns1.02
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
6792
ns6916
ns0.98
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
6520.5
ns6459
ns1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA
335163
ns332798.5
ns1.01
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s)
746958.5
ns749396
ns1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s)
746875
ns746625
ns1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s)
749792
ns749250
ns1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s)
751729
ns751417
ns1.00
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA
21434
ns21271
ns1.01
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s)
819625
ns793042
ns1.03
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s)
791708
ns792500
ns1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s)
773145.5
ns775750
ns1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s)
790854
ns797542
ns0.99
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA
298785
ns296567
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7375
ns7500
ns0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
6000
ns5542
ns1.08
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
5958
ns6000
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10166
ns10458
ns0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
33922
ns32600
ns1.04
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
220854.5
ns221375
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
236854.5
ns240270.5
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
228083.5
ns257854
ns0.88
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
212875
ns222250
ns0.96
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
365652.5
ns360398
ns1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
10209
ns10104.5
ns1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
10417
ns10334
ns1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
10708
ns10916
ns0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
9541.5
ns10229.5
ns0.93
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
251155
ns251149.5
ns1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
24333
ns24666
ns0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
24584
ns24312.5
ns1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
24583
ns25750
ns0.95
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
24979
ns25562.5
ns0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
1135827
ns1138926
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
106024583.5
ns106325125
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
117903521
ns117472625
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
120396396
ns120287229
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
117544479
ns117860729
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/GPU/CUDA
2631384.5
ns2629206
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
385240458
ns394161333.5
ns0.98
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
368294084
ns365470000
ns1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
356727875
ns355300666
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
482802291
ns484349417
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
15255065.5
ns15196205
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
936146916.5
ns755446562.5
ns1.24
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
762770042
ns762235792
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
746849979.5
ns742589166.5
ns1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
945639875
ns957309125
ns0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
7541
ns7041
ns1.07
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
7250
ns6875
ns1.05
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
7583.5
ns8584
ns0.88
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
6479
ns6625
ns0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
243530.5
ns243246.5
ns1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
14458
ns14167
ns1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
13959
ns14041.5
ns0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
14750
ns14667
ns1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
13833
ns14020.5
ns0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
1088103
ns1088970
ns1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
6417
ns5917
ns1.08
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
6292
ns6270.5
ns1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
7166.5
ns7292
ns0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
5750
ns5583
ns1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
238108.5
ns237671
ns1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
13084
ns12458
ns1.05
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
12916
ns12375
ns1.04
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
12583
ns12916
ns0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
12166
ns12292
ns0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
798707
ns799420
ns1.00
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s)
5584
ns5417
ns1.03
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s)
5770.5
ns5709
ns1.01
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s)
6500
ns6166
ns1.05
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s)
7166.5
ns5667
ns1.26
batchedmm(2, Bsize=128)/forward/GPU/CUDA
17513
ns17212
ns1.02
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s)
15667
ns15458
ns1.01
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s)
15625
ns15459
ns1.01
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s)
15541
ns15708
ns0.99
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s)
15583
ns15625
ns1.00
batchedmm(2, Bsize=128)/zygote/GPU/CUDA
202130
ns202604
ns1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
416
ns333
ns1.25
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
333
ns375
ns0.89
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
417
ns416
ns1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
292
ns292
ns1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
23880
ns23718
ns1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
6584
ns6167
ns1.07
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
6458
ns6334
ns1.02
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
6583
ns6875
ns0.96
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
6187.5
ns6291
ns0.98
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
241952.5
ns241777
ns1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
5917
ns5917
ns1
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
5917
ns6042
ns0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
5959
ns5959
ns1
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
5834
ns5834
ns1
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
25052
ns24949
ns1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
21667
ns20958
ns1.03
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
21750
ns21208
ns1.03
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
21875
ns21625
ns1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
21167
ns21250
ns1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
267258
ns267126.5
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
144125
ns143875
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
144791
ns143770.5
ns1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
146500
ns149333
ns0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
143125
ns144270.5
ns0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
168261.5
ns169394.5
ns0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1324313
ns1364229
ns0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1331958
ns1311708
ns1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1325708
ns1324520.5
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1319666.5
ns1349667
ns0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1358754
ns1363355
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
24416.5
ns23458
ns1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
24834
ns22250
ns1.12
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
23375
ns25167
ns0.93
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
21292
ns22584
ns0.94
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
357948
ns357496
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
132395.5
ns186729
ns0.71
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
127354.5
ns175562.5
ns0.73
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
118459
ns180666.5
ns0.66
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
117395.5
ns165042
ns0.71
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1501059
ns1496433
ns1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
375
ns292
ns1.28
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
375
ns416
ns0.90
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
375
ns375
ns1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
292
ns333
ns0.88
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
23530
ns23418
ns1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
6875
ns6167
ns1.11
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
6666
ns6333.5
ns1.05
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
7125
ns7042
ns1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
6417
ns6459
ns0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
259579
ns259427.5
ns1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
4625
ns4708
ns0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
4708
ns4625
ns1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
5042
ns5166
ns0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
4084
ns4896
ns0.83
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
258332.5
ns256096.5
ns1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
10375
ns9917
ns1.05
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
10209
ns10042
ns1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
10209
ns10667
ns0.96
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
10166.5
ns10750
ns0.95
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
1363356
ns1366539
ns1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
1625
ns1583
ns1.03
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
1625
ns1667
ns0.97
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
1625
ns1625
ns1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
1583
ns1625
ns0.97
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA
23506
ns23036
ns1.02
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
5958
ns5625
ns1.06
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
6042
ns5833
ns1.04
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
6125
ns6041
ns1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
5667
ns5666
ns1.00
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA
277914
ns276314
ns1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
6791458.5
ns6734334
ns1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
6360916.5
ns6391625
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
6541917
ns6537375
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
7577625
ns7542292
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/GPU/CUDA
214916.5
ns216147
ns0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
24027042
ns24173292
ns0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
21266917
ns21308875
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
21002500
ns21052792
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
29759417
ns29893541
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
2132435.5
ns2120264
ns1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
48562041
ns37482583
ns1.30
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
45901125
ns45446437.5
ns1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
45588125
ns45525834
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
49263125
ns49665500
ns0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
6000
ns5916
ns1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
6167
ns5729.5
ns1.08
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
6625
ns7166
ns0.92
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
5334
ns5750
ns0.93
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
236967.5
ns236953
ns1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
9250
ns8583
ns1.08
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8458
ns8042
ns1.05
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8958
ns8666
ns1.03
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8250
ns8500
ns0.97
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
1058397
ns1066445
ns0.99
lenet(28, 28, 1, 128)/forward/CPU/2 thread(s)
1553000
ns1511791
ns1.03
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s)
1271333
ns1266750
ns1.00
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s)
1611667
ns1624771
ns0.99
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s)
2139521
ns2083583.5
ns1.03
lenet(28, 28, 1, 128)/forward/GPU/CUDA
272139
ns272636.5
ns1.00
lenet(28, 28, 1, 128)/zygote/CPU/2 thread(s)
7938708.5
ns7911542
ns1.00
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s)
6600938
ns6587125
ns1.00
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s)
7126750
ns7180959
ns0.99
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s)
10443521
ns10527750
ns0.99
lenet(28, 28, 1, 128)/zygote/GPU/CUDA
1846977
ns1860081
ns0.99
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s)
374500.5
ns364792
ns1.03
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s)
372770.5
ns367208
ns1.02
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s)
456750
ns449270.5
ns1.02
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s)
23000
ns23917
ns0.96
batchedmm(128, Bsize=4)/forward/GPU/CUDA
46393
ns46266
ns1.00
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s)
736917
ns743187
ns0.99
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s)
808083
ns805084
ns1.00
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s)
1057958
ns1059125
ns1.00
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s)
78020.5
ns89959
ns0.87
batchedmm(128, Bsize=4)/zygote/GPU/CUDA
308525
ns310715.5
ns0.99
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s)
397417
ns397333
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s)
287917
ns288042
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s)
288000
ns288209
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s)
753542
ns750708
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA
43767
ns43949
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s)
673583
ns673458
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s)
536166
ns531458
ns1.01
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s)
531917
ns529250
ns1.01
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s)
973208
ns974917
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA
188160
ns189986
ns0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
633500
ns595125
ns1.06
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
647250
ns645125
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
599709
ns661291.5
ns0.91
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
615666
ns604083.5
ns1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
131655.5
ns132185
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2457916
ns2499541.5
ns0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2396750
ns2451209
ns0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2458187.5
ns2456625
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2452625
ns2529417
ns0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1345493
ns1282545
ns1.05
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s)
3083
ns3333
ns0.92
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s)
2833
ns3708
ns0.76
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s)
4500
ns4125
ns1.09
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s)
2583
ns2708
ns0.95
batchedmm(2, Bsize=32)/forward/GPU/CUDA
16191
ns16211
ns1.00
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s)
5750
ns5292
ns1.09
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s)
5584
ns5250
ns1.06
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s)
5459
ns5667
ns0.96
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s)
5625
ns5625
ns1
batchedmm(2, Bsize=32)/zygote/GPU/CUDA
198160.5
ns197863.5
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1458292
ns1466875
ns0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1499833
ns1505417
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1499083
ns1503125
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1437417
ns1440875
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
40922
ns41133
ns0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5128625
ns5168291.5
ns0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5308187.5
ns5273458
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5301146
ns5291104
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4993250
ns5023291
ns0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
195601.5
ns197140
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s)
3709
ns3667
ns1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s)
3708
ns3708
ns1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s)
3708
ns3667
ns1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s)
3708
ns3709
ns1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA
33852
ns32935
ns1.03
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s)
15500
ns15042
ns1.03
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s)
15334
ns15209
ns1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s)
15334
ns15334
ns1
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s)
15166
ns15000
ns1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA
381247
ns373770
ns1.02
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s)
71292
ns71500
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s)
71416
ns70750
ns1.01
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s)
71167
ns71125
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s)
70916
ns71083
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA
113823.5
ns112823
ns1.01
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
317500
ns320042
ns0.99
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
321000
ns315667
ns1.02
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
319083
ns318667
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
318500
ns324000
ns0.98
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA
197369
ns194736
ns1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
1125
ns1000
ns1.13
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
1084
ns1083
ns1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
1084
ns1083
ns1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
959
ns1000
ns0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
24373
ns23415
ns1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8333.5
ns8208
ns1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8229.5
ns8167
ns1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8250
ns8375
ns0.99
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
7833
ns8042
ns0.97
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
265338.5
ns263486.5
ns1.01
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s)
511459
ns505999.5
ns1.01
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s)
488042
ns497291.5
ns0.98
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s)
567084
ns560209
ns1.01
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s)
220750
ns217875
ns1.01
batchedmm(128, Bsize=32)/forward/GPU/CUDA
129208
ns129532
ns1.00
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s)
1389625
ns1384937.5
ns1.00
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s)
1480250
ns1454020.5
ns1.02
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s)
1756312.5
ns1746937.5
ns1.01
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s)
865000
ns899021
ns0.96
batchedmm(128, Bsize=32)/zygote/GPU/CUDA
277406
ns276899
ns1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
416
ns292
ns1.42
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
375
ns375
ns1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
375
ns375
ns1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
292
ns292
ns1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
32170
ns31419.5
ns1.02
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
6875
ns6167
ns1.11
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
6625
ns6333
ns1.05
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
6458
ns6667
ns0.97
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
6020.5
ns6458.5
ns0.93
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
266374
ns263608
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1718417
ns1728312
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1721417
ns1729000
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1726125
ns1733417
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1719500
ns1738250
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
169010.5
ns170018
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
4367625
ns4369375
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
4399270.5
ns3963375
ns1.11
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
4374042
ns4358208
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4359438
ns4400041
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1258694
ns1280531
ns0.98
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s)
6500
ns6750
ns0.96
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s)
6625
ns6792
ns0.98
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s)
7208.5
ns6875
ns1.05
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s)
6542
ns6875
ns0.95
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA
20518
ns20701.5
ns0.99
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
32542
ns51792
ns0.63
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
52479.5
ns51208
ns1.02
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
52000
ns32833
ns1.58
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
32625
ns71875
ns0.45
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA
210236
ns222859
ns0.94
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s)
17542
ns17625
ns1.00
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s)
17917
ns17625
ns1.02
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s)
18708
ns18291
ns1.02
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s)
17375
ns17937.5
ns0.97
batchedmm(2, Bsize=512)/forward/GPU/CUDA
18845.5
ns18343
ns1.03
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s)
53750
ns53250
ns1.01
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s)
53208
ns53166
ns1.00
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s)
53250
ns53166
ns1.00
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s)
53500
ns53792
ns0.99
batchedmm(2, Bsize=512)/zygote/GPU/CUDA
344404.5
ns340623.5
ns1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s)
75292
ns75709
ns0.99
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s)
75500
ns74125
ns1.02
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s)
74833
ns75291
ns0.99
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s)
74959
ns75334
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA
47057
ns47398
ns0.99
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
323708
ns325250
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
338541
ns325333
ns1.04
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
326000
ns324750
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
325417
ns340333
ns0.96
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA
211393
ns211070
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1486000
ns1491500
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1527542
ns1531125
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1526208
ns1529875
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1463000
ns1465875
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
52398
ns51611
ns1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5120500
ns5144459
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5242958
ns5274708
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5297166.5
ns5268229
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4985916.5
ns5019729.5
ns0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
204362
ns205600
ns0.99
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
28250
ns28167
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
28250
ns28167
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
28167
ns28291
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
28167
ns28333
ns0.99
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA
25076
ns24406
ns1.03
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
66417
ns66541
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
66417
ns66292
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
66792
ns66375
ns1.01
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
66458
ns66417
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA
540264.5
ns516526.5
ns1.05
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s)
1467604.5
ns1468729.5
ns1.00
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s)
1148208
ns1131000
ns1.02
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s)
1073125
ns1119791.5
ns0.96
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s)
2179542
ns2241937.5
ns0.97
mlp7layer_bn(tanh)(32 x 256)/forward/GPU/CUDA
575331.5
ns581317
ns0.99
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s)
3075042
ns3109709
ns0.99
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s)
2748167
ns2104833
ns1.31
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s)
2727604
ns2739417
ns1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s)
3816646
ns3875250.5
ns0.98
mlp7layer_bn(tanh)(32 x 256)/zygote/GPU/CUDA
2066149
ns2085553.5
ns0.99
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s)
7917125
ns7940229.5
ns1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s)
7956750
ns7908458.5
ns1.01
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s)
7912958
ns7909729.5
ns1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s)
4824417
ns4901667
ns0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
81334
ns81709
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
82000
ns81979.5
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
81895.5
ns83833
ns0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
80250
ns80541.5
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
193566.5
ns193422.5
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2017375
ns2029687.5
ns0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2065916.5
ns2007750
ns1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2015625
ns2012750
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2021542
ns2040271
ns0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
803967
ns811844
ns0.99
This comment was automatically generated by workflow using github-action-benchmark.