Skip to content
This repository has been archived by the owner on Nov 4, 2024. It is now read-only.

Commit

Permalink
fix: allow zero-sized arrays in bias_activation
Browse files Browse the repository at this point in the history
  • Loading branch information
avik-pal committed Aug 20, 2024
1 parent 62eabb9 commit 17ac9a2
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 6 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "LuxLib"
uuid = "82251201-b29d-42c6-8e01-566dec8acb11"
authors = ["Avik Pal <[email protected]> and contributors"]
version = "0.3.46"
version = "0.3.47"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
Expand Down
15 changes: 10 additions & 5 deletions src/impl/bias_activation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -182,8 +182,9 @@ end
# Looped-array method of `bias_activation!`.
#
# Collapses every dimension of `y` and `x` that precedes the last two into a single
# leading dimension and forwards the resulting 3-D arrays to `bias_activation_cpu!`,
# together with `Traits.fuse_cpu_activation(σ)`, `σ` and `bias`.
#
# Arguments:
#   - `y`: rank-`N` array, passed first to `bias_activation_cpu!` (presumably the
#     output buffer — confirm against that kernel).
#   - `σ`: activation function.
#   - `x`: rank-`N` input array.
#   - `bias`: bias vector (presumably of length `size(x, N - 1)` — confirm).
#
# Returns `nothing`.
function bias_activation!(y::AbstractArray{<:Number, N}, ::LoopedArrayOp, σ::F,
        x::AbstractArray{<:Number, N}, bias::AbstractVector{<:Number}) where {F, N}
    # The leading extent is computed explicitly rather than requested with `:`,
    # because `reshape` cannot resolve a `:` dimension when another dimension has
    # length zero (zero-sized inputs would otherwise throw).
    bias_activation_cpu!(
        reshape(y, flattened_bias_dims(y), size(y, N - 1), size(y, N)),
        Traits.fuse_cpu_activation(σ),
        σ, reshape(x, flattened_bias_dims(x), size(x, N - 1), size(x, N)), bias)
    return
end

Expand Down Expand Up @@ -246,8 +247,8 @@ end

# Looped-array method of `bias_add!`.
#
# Reshapes `y` and `x` to 3-D arrays of size
# `(flattened_bias_dims(·), size(·, N - 1), size(·, N))` and forwards them, with
# `bias`, to `bias_add_loop!`.
#
# Arguments:
#   - `y`: rank-`N` array, passed first to `bias_add_loop!` (presumably the output
#     buffer — confirm against that kernel).
#   - `x`: rank-`N` input array.
#   - `bias`: bias vector (presumably of length `size(x, N - 1)` — confirm).
#
# Returns `nothing`.
function bias_add!(y::AbstractArray{<:Number, N}, ::LoopedArrayOp,
        x::AbstractArray{<:Number, N}, bias::AbstractVector{<:Number}) where {N}
    # Explicit leading extent instead of `:` so that zero-sized arrays reshape
    # without error; the kernel is invoked exactly once.
    bias_add_loop!(reshape(y, flattened_bias_dims(y), size(y, N - 1), size(y, N)),
        reshape(x, flattened_bias_dims(x), size(x, N - 1), size(x, N)), bias)
    return
end

Expand Down Expand Up @@ -294,8 +295,12 @@ end
# Looped-array method of `bias_activation_cached!!` selected by the `::True` flag.
#
# Reshapes `x` to a 3-D array, calls `bias_add_loop!` with that array as both the
# first and second argument (i.e. the bias add is applied to `x`'s own storage,
# since `reshape` shares data), restores the original shape, and returns both the
# activated result and the reshaped-back array.
#
# Arguments:
#   - `σ`: activation function, applied through `activation(σ, ·)`.
#   - `x`: rank-`N` input array; its memory is reused by the bias add.
#   - `bias`: `Optional` bias vector, forwarded unchanged to `bias_add_loop!`.
#
# Returns the tuple `(activation(σ, x′′), x′′)` where `x′′` has `size(x)`.
function bias_activation_cached!!(
        ::LoopedArrayOp, ::True, σ::F, x::AbstractArray{<:Number, N},
        bias::Optional{<:AbstractVector{<:Number}}) where {F, N}
    # Explicit leading extent instead of `:`: `reshape` cannot resolve `:` when
    # another dimension has length zero, which broke zero-sized inputs.
    x′ = reshape(x, flattened_bias_dims(x), size(x, N - 1), size(x, N))
    bias_add_loop!(x′, x′, bias)
    x′′ = reshape(x′, size(x))
    return activation(σ, x′′), x′′
end

# Product of the sizes of all dimensions of `x` except the last two.
# Yields `1` when there are no such dimensions (`N <= 2`).
function flattened_bias_dims(x::AbstractArray{T, N}) where {T, N}
    extent = 1
    for d in 1:(N - 2)
        extent *= size(x, d)
    end
    return extent
end

# Declare `flattened_bias_dims` non-differentiable for any argument list via
# `CRC.@non_differentiable` (CRC is presumably ChainRulesCore — confirm against the
# module's imports). The helper only computes an integer from array sizes.
CRC.@non_differentiable flattened_bias_dims(::Any...)
14 changes: 14 additions & 0 deletions test/common_ops/bias_act_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -88,3 +88,17 @@ end
z = bias_activation(identity, Tracker.param(x), b)
@test z isa Tracker.TrackedArray
end

@testitem "Bias Activation: Zero-sized Arrays" tags=[:other_ops] setup=[SharedTestSetup] begin
    # Every case has a zero-length trailing dimension, and the bias length (2)
    # equals the size of the second-to-last dimension of the input.
    @testset "$mode" for (mode, aType, ongpu) in MODES
        for (act, dims) in ((identity, (4, 3, 2, 0)), (relu, (2, 0)))
            x = aType(rand(Float32, dims...))
            b = aType(rand(Float32, 2))
            # Both the out-of-place and the `!!` variant must preserve the shape.
            @test size(bias_activation(act, x, b)) == dims
            @test size(bias_activation!!(act, x, b)) == dims
        end
    end
end

3 comments on commit 17ac9a2

@avik-pal
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/113527

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.3.47 -m "<description of version>" 17ac9a2c6256b0912556747e1c535e54a8add6c2
git push origin v0.3.47

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LuxLib Benchmarks

Benchmark suite Current: 17ac9a2 Previous: 62eabb9 Ratio
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4937.5 ns 5854 ns 0.84
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5666 ns 5333.5 ns 1.06
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 8042 ns 7937.5 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5687.5 ns 6187.5 ns 0.92
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 120909 ns 118032 ns 1.02
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 2455225 ns 2423568 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal 791750 ns 782792 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 413945 ns 415924 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 10000 ns 9708 ns 1.03
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 10250 ns 9729.5 ns 1.05
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9875 ns 10541 ns 0.94
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9584 ns 9709 ns 0.99
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 558079 ns 543351 ns 1.03
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 17669056 ns 18092128 ns 0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal 2765041 ns 2659875 ns 1.04
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 664078 ns 681837 ns 0.97
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s) 1375 ns 1500 ns 0.92
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s) 1500 ns 2917 ns 0.51
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s) 2083 ns 2208 ns 0.94
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s) 1667 ns 2958 ns 0.56
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA 21790 ns 21829 ns 1.00
bias_activation(32, act=relu)(32 x 128)/forward/GPU/oneAPI 1329237 ns 1310773 ns 1.01
bias_activation(32, act=relu)(32 x 128)/forward/GPU/Metal 216396 ns 241583 ns 0.90
bias_activation(32, act=relu)(32 x 128)/forward/GPU/AMDGPU 31411 ns 31380 ns 1.00
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 4229.5 ns 4313 ns 0.98
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 3666 ns 4541 ns 0.81
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 4166 ns 3875 ns 1.08
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 4208 ns 3833 ns 1.10
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA 149451 ns 145099 ns 1.03
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/oneAPI 9238119 ns 9229160 ns 1.00
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/Metal 1690125 ns 1544146 ns 1.09
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/AMDGPU 153327 ns 151422 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58500 ns 58000 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 39500 ns 46458 ns 0.85
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 47042 ns 46042 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83333 ns 82625 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 37308.5 ns 36359 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 568690.5 ns 644849 ns 0.88
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1066021 ns 1068687 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 81381 ns 80010.5 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2032541.5 ns 2025458 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2086500 ns 2080416.5 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2080042 ns 2072458.5 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1986125 ns 1991834 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 235686.5 ns 229025 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 7616963 ns 8450207 ns 0.90
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 7909459 ns 7471708 ns 1.06
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1203034 ns 1429005 ns 0.84
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 148292 ns 175729 ns 0.84
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 166416.5 ns 148708 ns 1.12
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 150375 ns 165458.5 ns 0.91
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 153437 ns 147208 ns 1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 165231.5 ns 166770 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 7240146 ns 7919821.5 ns 0.91
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1574250 ns 1556812.5 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 180947 ns 218943 ns 0.83
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1115708.5 ns 1113104 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1115583 ns 1103291.5 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1111895.5 ns 1131500 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1116604 ns 1112791.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 717544 ns 699462.5 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 33548293 ns 35512444 ns 0.94
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 5783062 ns 6463312 ns 0.89
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1033041 ns 932249.5 ns 1.11
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 4958 ns 4042 ns 1.23
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4334 ns 4250 ns 1.02
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 5667 ns 5625 ns 1.01
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4771 ns 5709 ns 0.84
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 95604 ns 91912 ns 1.04
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 5339458 ns 5255099 ns 1.02
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal 722292 ns 437917 ns 1.65
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 60161 ns 71131 ns 0.85
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8875 ns 8583 ns 1.03
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8542 ns 8542 ns 1
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8583 ns 9417 ns 0.91
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8541 ns 8708 ns 0.98
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 618298 ns 595424 ns 1.04
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 37406952 ns 35680059.5 ns 1.05
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal 6128688 ns 5662792 ns 1.08
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 393664 ns 389839 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17312.5 ns 17854.5 ns 0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 18834 ns 17916 ns 1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 20375 ns 20520.5 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 18291.5 ns 18083.5 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 67939.5 ns 65742 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 2980613.5 ns 2917446 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1353958 ns 1373187 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 76051 ns 74701 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 223334 ns 212334 ns 1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 211917 ns 217375 ns 0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 219896 ns 218646 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 212084 ns 211791 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 360648.5 ns 352483 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 14475566 ns 13596474 ns 1.06
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 5859625 ns 5745167 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 479156 ns 480345 ns 1.00
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s) 583.5 ns 708 ns 0.82
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s) 666 ns 666 ns 1
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s) 834 ns 1000 ns 0.83
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s) 708 ns 625 ns 1.13
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA 20628 ns 20944 ns 0.98
bias_activation(2, act=relu)(2 x 128)/forward/GPU/oneAPI 1259163 ns 1204727 ns 1.05
bias_activation(2, act=relu)(2 x 128)/forward/GPU/Metal 300000 ns 280583.5 ns 1.07
bias_activation(2, act=relu)(2 x 128)/forward/GPU/AMDGPU 32990 ns 32630 ns 1.01
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 1395.5 ns 1375 ns 1.01
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 1541 ns 1416 ns 1.09
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 1417 ns 1542 ns 0.92
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 1334 ns 1375 ns 0.97
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA 126083.5 ns 125999 ns 1.00
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/oneAPI 8839991 ns 8883264 ns 1.00
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/Metal 1618083 ns 1494917 ns 1.08
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/AMDGPU 126566.5 ns 137492 ns 0.92
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7375 ns 7333 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5375 ns 6125 ns 0.88
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6166 ns 6125 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 9917 ns 9959 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 23573 ns 23485 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1332882 ns 1300745.5 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 626500 ns 576646 ns 1.09
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 47160 ns 49320 ns 0.96
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 234375 ns 220500 ns 1.06
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 242458 ns 268854.5 ns 0.90
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 270958 ns 269333 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 251083 ns 254708 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 185731 ns 183065 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 32262788 ns 29898186.5 ns 1.08
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 9574500 ns 9134229.5 ns 1.05
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 623787 ns 618976 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4083 ns 4125 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4084 ns 4084 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4125 ns 4125 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4083 ns 4125 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA 23126 ns 22910 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/oneAPI 2060093 ns 1968950 ns 1.05
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/Metal 229375 ns 219458 ns 1.05
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/AMDGPU 48711 ns 50520 ns 0.96
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 17041 ns 16833 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16500 ns 16834 ns 0.98
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 17250 ns 16958 ns 1.02
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16833 ns 16875 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA 195934 ns 197795 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/oneAPI 9776499 ns 10645383 ns 0.92
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/Metal 972750 ns 930708 ns 1.05
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/AMDGPU 179972 ns 177772 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 509541.5 ns 509792 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 332334 ns 404834 ns 0.82
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 404875 ns 404666.5 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 865104.5 ns 864750 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA 113032 ns 113219.5 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/oneAPI 393314 ns 402480 ns 0.98
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/Metal 448145.5 ns 453521 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/AMDGPU 248713 ns 249427.5 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2319667 ns 2317166 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1752729.5 ns 2034833 ns 0.86
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 2031958 ns 2026750 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3283979.5 ns 3276667 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA 244203 ns 244909.5 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/oneAPI 10141578 ns 10934161 ns 0.93
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/Metal 2016625 ns 1922875 ns 1.05
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 763594 ns 761538 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 6042 ns 6729.5 ns 0.90
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6458 ns 6875 ns 0.94
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7708 ns 7458 ns 1.03
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6333 ns 6270.5 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 93025 ns 94787 ns 0.98
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 5860829 ns 5253207.5 ns 1.12
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal 901583 ns 752958 ns 1.20
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 62671 ns 60151 ns 1.04
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 11583 ns 10521 ns 1.10
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 10500 ns 11958 ns 0.88
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 11458 ns 11458 ns 1
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10979 ns 12062.5 ns 0.91
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 646277 ns 664213 ns 0.97
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 39475366 ns 39026819.5 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal 5976917 ns 5529917 ns 1.08
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 418444 ns 415644 ns 1.01
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s) 541 ns 500 ns 1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s) 541 ns 500 ns 1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s) 542 ns 542 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s) 500 ns 500 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA 23258 ns 23657 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/oneAPI 2210297.5 ns 2187739 ns 1.01
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/Metal 327292 ns 319854.5 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/AMDGPU 52080 ns 51140 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2083 ns 2125 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2125 ns 2125 ns 1
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2125 ns 2209 ns 0.96
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2084 ns 2125 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA 221917 ns 243631 ns 0.91
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/oneAPI 11090915.5 ns 11206966.5 ns 0.99
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/Metal 2054166 ns 1981375 ns 1.04
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/AMDGPU 182777 ns 178346.5 ns 1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 9062.5 ns 8500 ns 1.07
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 9792 ns 9354.5 ns 1.05
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 10167 ns 9875 ns 1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 8896 ns 8896 ns 1
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 106196.5 ns 116168 ns 0.91
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 3337522.5 ns 3094072 ns 1.08
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal 876291.5 ns 733250 ns 1.20
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 75871 ns 78331 ns 0.97
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 16854.5 ns 17562.5 ns 0.96
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 17624.5 ns 17916 ns 0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 18958 ns 18270.5 ns 1.04
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 17187.5 ns 17937.5 ns 0.96
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 603520 ns 635290 ns 0.95
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 17553358.5 ns 16856244 ns 1.04
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal 5108208 ns 4597917 ns 1.11
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 396274 ns 390934 ns 1.01
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 500 ns 500 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 500 ns 583 ns 0.86
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 625 ns 708 ns 0.88
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 583 ns 500 ns 1.17
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 35127 ns 35377 ns 0.99
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 1236195 ns 1189275 ns 1.04
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal 475271 ns 273875 ns 1.74
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 49441 ns 45830 ns 1.08
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9792 ns 10375 ns 0.94
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 10125 ns 9750 ns 1.04
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 10229.5 ns 10666.5 ns 0.96
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9334 ns 10125 ns 0.92
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 257318.5 ns 269773.5 ns 0.95
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 19179819.5 ns 18833230.5 ns 1.02
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal 5182437.5 ns 4684416.5 ns 1.11
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 380274 ns 378479 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s) 397083 ns 397125 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s) 215250 ns 288208 ns 0.75
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s) 287916 ns 287833 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s) 756041 ns 756000 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA 111427 ns 112391.5 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/oneAPI 329641 ns 327166 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/Metal 363792 ns 387167 ns 0.94
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/AMDGPU 78871 ns 78291 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1454375 ns 1454292 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 859500 ns 1135437.5 ns 0.76
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 1129916 ns 1134916.5 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2440417 ns 2439500 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA 209113 ns 208924 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/oneAPI 10754018 ns 10539379 ns 1.02
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/Metal 1658937.5 ns 1551334 ns 1.07
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/AMDGPU 328243 ns 324908 ns 1.01
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6916.5 ns 7312.5 ns 0.95
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7084 ns 6833 ns 1.04
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8188 ns 8166.5 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7042 ns 7250 ns 0.97
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 152190.5 ns 157256 ns 0.97
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 5924676 ns 5707911.5 ns 1.04
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal 764458 ns 708479.5 ns 1.08
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 60511 ns 70880 ns 0.85
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 16083.5 ns 14479 ns 1.11
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 15625 ns 15042 ns 1.04
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14167 ns 15208.5 ns 0.93
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14333 ns 14563 ns 0.98
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 1030700 ns 1058911 ns 0.97
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 41391929 ns 41408429 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal 6599291 ns 5845541 ns 1.13
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 440235 ns 427694 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 25292 ns 24854.5 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 29625 ns 28542 ns 1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 26500 ns 29541 ns 0.90
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 25291 ns 25375 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 228373.5 ns 227961 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7754403 ns 7745684.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1026771 ns 1037604 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 118522 ns 117166 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 146334 ns 147667 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 118854 ns 114041.5 ns 1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 148958 ns 149708.5 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 117125 ns 152584 ns 0.77
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1207252 ns 1197557 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 43560506 ns 42890857 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6191583 ns 5764959 ns 1.07
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 601256 ns 597616 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 73833 ns 76917 ns 0.96
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 78104.5 ns 76667 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 78021 ns 80458 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 77750 ns 77500 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 234865 ns 232480 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7715200.5 ns 7739750 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 534625 ns 524583 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 125536.5 ns 125411.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 305354 ns 301958 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 321166 ns 321188 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 295667 ns 307271.5 ns 0.96
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 304625 ns 297896 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1245639 ns 1235149.5 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 42472482.5 ns 40402340 ns 1.05
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6703875 ns 6347749.5 ns 1.06
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 703602.5 ns 702732 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 16812.5 ns 16416 ns 1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 16334 ns 17166 ns 0.95
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 17438 ns 17708 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 17083 ns 16417 ns 1.04
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 166179 ns 165184 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 5720112.5 ns 5708995 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal 615083 ns 664229.5 ns 0.93
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 240073 ns 239872 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 27771 ns 26125 ns 1.06
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 30354.5 ns 27958 ns 1.09
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 26791 ns 26792 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 26770.5 ns 27500 ns 0.97
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 1050438 ns 1039155 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 42382106 ns 40056686 ns 1.06
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal 6159041 ns 5713084 ns 1.08
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 717457 ns 706062.5 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 11125 ns 11624.5 ns 0.96
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 11624.5 ns 12250 ns 0.95
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 11958 ns 12771 ns 0.94
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 11229.5 ns 10959 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 139887 ns 138899 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 4251683 ns 3796355 ns 1.12
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal 817625 ns 788438 ns 1.04
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 244463 ns 243833 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 20917 ns 22166.5 ns 0.94
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 21833.5 ns 22250 ns 0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 22563 ns 22375 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 21416.5 ns 21958.5 ns 0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 755838.5 ns 748333 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 21707541 ns 21361866 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal 5465833 ns 5064604 ns 1.08
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 695787.5 ns 689278 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 69479 ns 63125 ns 1.10
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 66041 ns 63625 ns 1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 66229 ns 67479 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 67791.5 ns 63917 ns 1.06
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 119885 ns 117712.5 ns 1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3482594.5 ns 3707369 ns 0.94
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1370041.5 ns 1370062.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 239323 ns 238362 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 484042 ns 483270.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 465541 ns 451625 ns 1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 439354.5 ns 448416 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 437646 ns 449458 ns 0.97
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 558258 ns 554127 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 20869371 ns 20983063 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6275458.5 ns 6194854 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 737788 ns 715517 ns 1.03
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7166.5 ns 7271 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7792 ns 6958 ns 1.12
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8375 ns 9042 ns 0.93
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7292 ns 7166.5 ns 1.02
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 161904.5 ns 160659.5 ns 1.01
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 5642311 ns 5587356 ns 1.01
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal 456041 ns 442520.5 ns 1.03
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 61410 ns 59231 ns 1.04
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14708 ns 13521 ns 1.09
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 17500 ns 13500 ns 1.30
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14646 ns 15291.5 ns 0.96
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 15250 ns 15833 ns 0.96
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 1023653 ns 1017254.5 ns 1.01
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 39308326 ns 39424822 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal 6089229.5 ns 5535917 ns 1.10
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 412184 ns 406944 ns 1.01
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s) 6148208 ns 6145625 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s) 3227583 ns 6372750 ns 0.51
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s) 6378333 ns 6371166 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s) 11914959 ns 11907459 ns 1.00
batchedmm(512, Bsize=4)/forward/GPU/CUDA 301812 ns 351332.5 ns 0.86
batchedmm(512, Bsize=4)/forward/GPU/AMDGPU 296489 ns 295593 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s) 19106770.5 ns 19085062.5 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s) 11136250 ns 19924479 ns 0.56
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s) 19962416 ns 20021500 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s) 36542271 ns 36494374.5 ns 1.00
batchedmm(512, Bsize=4)/zygote/GPU/CUDA 1158703 ns 1097383 ns 1.06
batchedmm(512, Bsize=4)/zygote/GPU/AMDGPU 1169188 ns 1163817 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 958 ns 958 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 958 ns 1000 ns 0.96
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 959 ns 1000 ns 0.96
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 959 ns 958 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA 23501 ns 23661.5 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/oneAPI 2086104 ns 2047052 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/Metal 329667 ns 226209 ns 1.46
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/AMDGPU 216992 ns 214002 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 3709 ns 3667 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 3666 ns 3708 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 3750 ns 3750 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 3667 ns 3625 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA 297158.5 ns 298918 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/oneAPI 10890085 ns 11777281.5 ns 0.92
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/Metal 2191521 ns 2108708 ns 1.04
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 650431.5 ns 643757 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 8500 ns 8958 ns 0.95
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 9250.5 ns 8583 ns 1.08
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 9396 ns 9271 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 8125 ns 8250 ns 0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 136116.5 ns 134550.5 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 3625618.5 ns 3553980.5 ns 1.02
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal 819208 ns 721791 ns 1.13
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 67611 ns 67561 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 11250 ns 12042 ns 0.93
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 12667 ns 12500 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 11729 ns 12312.5 ns 0.95
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 11042 ns 11917 ns 0.93
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 721603 ns 711267.5 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 22100212 ns 22128749 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal 5441770.5 ns 4449541 ns 1.22
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 373594 ns 363664 ns 1.03
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s) 250 ns 250 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s) 291 ns 292 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s) 292 ns 250 ns 1.17
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA 22886 ns 22725.5 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/oneAPI 2122211.5 ns 2110212 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/Metal 331291 ns 216958 ns 1.53
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/AMDGPU 51921 ns 52851 ns 0.98
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 3000 ns 2875 ns 1.04
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2958 ns 2917 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 3042 ns 3125 ns 0.97
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2875 ns 2917 ns 0.99
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA 213807.5 ns 212530.5 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/oneAPI 9555854.5 ns 10303702 ns 0.93
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/Metal 1713479.5 ns 1562292 ns 1.10
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/AMDGPU 168911.5 ns 171862 ns 0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 11833 ns 12084 ns 0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 11896 ns 11458 ns 1.04
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 13000 ns 12750 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 12291 ns 11583 ns 1.06
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 137978.5 ns 136669.5 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 3346916 ns 3560577 ns 0.94
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal 900666.5 ns 853666 ns 1.06
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 239817.5 ns 243422.5 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 23042 ns 20646 ns 1.12
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 22104 ns 20792 ns 1.06
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 20458 ns 21812.5 ns 0.94
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 23917 ns 20854.5 ns 1.15
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 653027.5 ns 647113 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 21573566 ns 19242302 ns 1.12
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal 4833270.5 ns 4418416 ns 1.09
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 673012 ns 655347 ns 1.03
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4375 ns 4375 ns 1
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4375 ns 4375 ns 1
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4375 ns 4417 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4375 ns 4375 ns 1
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA 24516 ns 24882 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/oneAPI 2141686 ns 2124569.5 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/Metal 231666.5 ns 219041 ns 1.06
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/AMDGPU 54410 ns 52591 ns 1.03
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16750 ns 16625 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16167 ns 16750 ns 0.97
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16833 ns 16916 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16459 ns 16625 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA 353559.5 ns 352424 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/oneAPI 13538541 ns 12518270.5 ns 1.08
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/Metal 1092020.5 ns 1126416 ns 0.97
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/AMDGPU 216712 ns 214922.5 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 1959 ns 2000 ns 0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 1959 ns 2042 ns 0.96
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 2083 ns 2167 ns 0.96
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 2125 ns 1917 ns 1.11
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 35968 ns 36064 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 1179215 ns 1237482 ns 0.95
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal 444042 ns 276625 ns 1.61
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 208282 ns 207022 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 17958.5 ns 19104.5 ns 0.94
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 16958 ns 19250 ns 0.88
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 17333.5 ns 18437.5 ns 0.94
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 17646 ns 17687 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 305401 ns 302079 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 22626384 ns 20285887 ns 1.12
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal 5381292 ns 5279999.5 ns 1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 703887 ns 703572 ns 1.00
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s) 59250 ns 59084 ns 1.00
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s) 60625 ns 65312.5 ns 0.93
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s) 64167 ns 65833 ns 0.97
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s) 51291 ns 54042 ns 0.95
batchedmm(16, Bsize=512)/forward/GPU/CUDA 66533 ns 66307 ns 1.00
batchedmm(16, Bsize=512)/forward/GPU/AMDGPU 101811 ns 102391 ns 0.99
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s) 196208 ns 182750 ns 1.07
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s) 139333 ns 137479 ns 1.01
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s) 155270.5 ns 130291 ns 1.19
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s) 285354 ns 309103.5 ns 0.92
batchedmm(16, Bsize=512)/zygote/GPU/CUDA 231110.5 ns 230456.5 ns 1.00
batchedmm(16, Bsize=512)/zygote/GPU/AMDGPU 587041 ns 582421 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 82771 ns 83250 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 87959 ns 85417 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 85959 ns 84021 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 81812.5 ns 85333 ns 0.96
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 192437.5 ns 193872 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5643042.5 ns 5235813 ns 1.08
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 2001125 ns 2079791 ns 0.96
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 172856.5 ns 209407.5 ns 0.83
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1915166.5 ns 1884937.5 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1905625 ns 1903375 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1906791.5 ns 1896375 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1867521 ns 1907458 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 575411.5 ns 571543 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 26938211 ns 26257898 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9319437.5 ns 8902417 ns 1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1079271.5 ns 1079882 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s) 291 ns 291 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s) 291 ns 292 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s) 292 ns 291 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA 21855 ns 21813 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/oneAPI 2160069.5 ns 2084650 ns 1.04
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/Metal 370125 ns 320667 ns 1.15
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/AMDGPU 45340 ns 45201 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 1792 ns 1792 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 1833 ns 1875 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 1833 ns 1834 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 1792 ns 1791 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA 268250 ns 267247 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/oneAPI 10449469 ns 10318095 ns 1.01
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/Metal 1115417 ns 1507562.5 ns 0.74
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/AMDGPU 183362 ns 186822 ns 0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 8250 ns 9291 ns 0.89
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 11209 ns 10416 ns 1.08
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 9708 ns 10708.5 ns 0.91
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 9084 ns 8083 ns 1.12
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 135375 ns 133754 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 3475930 ns 3421507 ns 1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal 905833 ns 847208 ns 1.07
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 242893 ns 239032 ns 1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 12042 ns 9084 ns 1.33
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 11125 ns 9084 ns 1.22
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8833 ns 9125 ns 0.97
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 12083 ns 9167 ns 1.32
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 583740 ns 568427 ns 1.03
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 21716042.5 ns 19345703 ns 1.12
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal 4734458 ns 4300021 ns 1.10
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 652127 ns 631937 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58458 ns 58042 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 39584 ns 46584 ns 0.85
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 47104.5 ns 46458 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83084 ns 83125 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 39769 ns 39683 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1485296 ns 1371601 ns 1.08
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1151625 ns 1146041.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 78761 ns 77101 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1929833 ns 1921833 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1940687 ns 1952917 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1942312.5 ns 1960375 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1910500 ns 1841250 ns 1.04
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 236370 ns 234062 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 34305466 ns 32924888 ns 1.04
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11016708.5 ns 10966291.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1026191 ns 1020771 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 417125 ns 415291 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 417396 ns 417687.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 419312.5 ns 422375 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 416334 ns 418000 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 238661.5 ns 235407.5 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7635909 ns 7542978.5 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 553834 ns 531500 ns 1.04
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 288843 ns 288923 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 709000 ns 774208 ns 0.92
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 734313 ns 682542 ns 1.08
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 671250 ns 767312.5 ns 0.87
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 669791.5 ns 737938 ns 0.91
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1151563 ns 1139485 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 44436982 ns 44764339 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6696083 ns 6953229 ns 0.96
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 931160 ns 928979 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 3399479.5 ns 3441042 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 3363750 ns 3456937.5 ns 0.97
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 3425625 ns 3428771 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 3391083.5 ns 3464583 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 177139 ns 175145 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8299345 ns 8202705.5 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1423625 ns 1354666 ns 1.05
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 416864 ns 437135 ns 0.95
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 6186791 ns 6187166 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 6198687.5 ns 6181167 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 6090875 ns 6197500 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 6187875 ns 6196250 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1083853 ns 1070107 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 51969867.5 ns 52446997.5 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 8058500 ns 7250062.5 ns 1.11
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1565741.5 ns 1563197 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 471667 ns 472000 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 253791 ns 340625 ns 0.75
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 342583 ns 341708 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 902708 ns 902667 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA 46521 ns 46753 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/oneAPI 889304 ns 384887.5 ns 2.31
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/Metal 448250 ns 404791 ns 1.11
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/AMDGPU 251212 ns 251683 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2350667 ns 2320833 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1761583.5 ns 2040917 ns 0.86
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 2037792 ns 2031584 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3284625 ns 3282833 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA 258155.5 ns 255715 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/oneAPI 13627654 ns 8191343 ns 1.66
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/Metal 2294875 ns 2170729.5 ns 1.06
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 791358 ns 784548 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58292 ns 57500 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 39584 ns 46250 ns 0.86
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46542 ns 45834 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82833 ns 82875 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 27855 ns 27970.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1365660 ns 1021421 ns 1.34
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1156292 ns 1149062.5 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 77241 ns 74351 ns 1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2035459 ns 2033375 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2077875 ns 2085958 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2072875 ns 2091250 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1932083 ns 1948958.5 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 241361.5 ns 239148 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 37426143 ns 37589753 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11703125 ns 11610833 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1056652 ns 1049280.5 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58333 ns 57875 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 39458 ns 46667 ns 0.85
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46834 ns 46375 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83250 ns 82708 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 49658 ns 49341 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 784251.5 ns 807803 ns 0.97
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1110916 ns 1096916.5 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 75300.5 ns 73081 ns 1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1894292 ns 1916250 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1940666 ns 1967625 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1969937.5 ns 1972395.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1886875 ns 1891979.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 247040 ns 245040 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 17460173 ns 18113352 ns 0.96
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9839292 ns 9728333 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1051031 ns 928945 ns 1.13
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 250 ns 292 ns 0.86
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 292 ns 375 ns 0.78
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 292 ns 292 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 34603 ns 34818 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 1179890 ns 1198925.5 ns 0.98
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal 433687.5 ns 405563 ns 1.07
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 49160 ns 47730 ns 1.03
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6958 ns 7312 ns 0.95
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7250 ns 7416 ns 0.98
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7521 ns 7812.5 ns 0.96
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7208.5 ns 8333 ns 0.87
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 210766 ns 207888 ns 1.01
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 20576421.5 ns 20491372 ns 1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal 5193791.5 ns 4662542 ns 1.11
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 378014 ns 380574 ns 0.99
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s) 250 ns 250 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s) 250 ns 292 ns 0.86
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s) 291 ns 292 ns 1.00
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s) 250 ns 250 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA 32342 ns 32837 ns 0.98
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/oneAPI 1286880 ns 1211839 ns 1.06
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/Metal 261521 ns 251125 ns 1.04
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/AMDGPU 39160 ns 39501 ns 0.99
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 2666 ns 2667 ns 1.00
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 2667 ns 2875 ns 0.93
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 2834 ns 2959 ns 0.96
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 2625 ns 2917 ns 0.90
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA 202783.5 ns 202228 ns 1.00
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/oneAPI 7191833 ns 7788464.5 ns 0.92
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/Metal 969250 ns 942542 ns 1.03
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/AMDGPU 154716.5 ns 154372 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 457625 ns 429542 ns 1.07
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 453792 ns 473375 ns 0.96
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 426146 ns 426771 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 456125 ns 443791.5 ns 1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 142160 ns 140775 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5858214 ns 6055220 ns 0.97
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 2271875 ns 2980417 ns 0.76
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 326853 ns 374113.5 ns 0.87
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3802938 ns 3786687.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3809708 ns 3800125 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 3801896 ns 3802250 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3792625 ns 3800125.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 781504 ns 773073.5 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 32606612 ns 32060419 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11052792 ns 11414458 ns 0.97
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1495896 ns 1481216 ns 1.01
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s) 49881521 ns 49836458.5 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s) 26009250 ns 35531854 ns 0.73
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s) 35546334 ns 35532958 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s) 96980062.5 ns 96940104.5 ns 1.00
batchedmm(512, Bsize=32)/forward/GPU/CUDA 1600432 ns 1598677.5 ns 1.00
batchedmm(512, Bsize=32)/forward/GPU/AMDGPU 1012971 ns 1003070 ns 1.01
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s) 154537104 ns 154620438 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s) 88927125 ns 112348896.5 ns 0.79
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s) 112528667 ns 112268042 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s) 298524146 ns 297276875 ns 1.00
batchedmm(512, Bsize=32)/zygote/GPU/CUDA 6474447 ns 6507145 ns 0.99
batchedmm(512, Bsize=32)/zygote/GPU/AMDGPU 5518798 ns 5527328 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s) 19062.5 ns 19333.5 ns 0.99
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s) 15542 ns 18417 ns 0.84
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s) 17042 ns 17041.5 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s) 16021 ns 15895.5 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA 20743 ns 20523 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/oneAPI 1114798 ns 1185183 ns 0.94
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/Metal 252583 ns 216083 ns 1.17
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/AMDGPU 26040 ns 25770 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s) 10917 ns 10666.5 ns 1.02
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s) 7416 ns 9125 ns 0.81
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s) 9208 ns 9333 ns 0.99
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s) 17375 ns 17333 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA 296392 ns 294450.5 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/oneAPI 9784850 ns 10063127 ns 0.97
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/Metal 1636083.5 ns 1473458 ns 1.11
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/AMDGPU 155431 ns 152692 ns 1.02
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 8729 ns 9125 ns 0.96
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 9000 ns 9208 ns 0.98
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 9229.5 ns 9917 ns 0.93
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 8562.5 ns 7709 ns 1.11
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 139671.5 ns 138926.5 ns 1.01
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 3414992.5 ns 3463205 ns 0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal 799833 ns 776250 ns 1.03
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 242752 ns 242133 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9312.5 ns 9250 ns 1.01
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9416 ns 9792 ns 0.96
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10167 ns 10083 ns 1.01
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9250 ns 9333 ns 0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 704800.5 ns 694521 ns 1.01
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 23114743.5 ns 23124170 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal 5428520.5 ns 4958562 ns 1.09
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 674252.5 ns 652436.5 ns 1.03
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 9333 ns 9354 ns 1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 9709 ns 9521 ns 1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 10625 ns 10833 ns 0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 9250 ns 9333.5 ns 0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 136210.5 ns 134783 ns 1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 3600263.5 ns 3350897 ns 1.07
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal 947792 ns 835416 ns 1.13
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 69541 ns 74881 ns 0.93
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 13062.5 ns 13042 ns 1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 13542 ns 12917 ns 1.05
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13916.5 ns 14021 ns 0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 13000 ns 12708 ns 1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 647891 ns 641313 ns 1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 19445945.5 ns 19842694 ns 0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal 4788583 ns 4446583 ns 1.08
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 349204 ns 353113.5 ns 0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 500 ns 500 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 459 ns 542 ns 0.85
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 584 ns 583 ns 1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 500 ns 458 ns 1.09
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 34950 ns 35065 ns 1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 1182154 ns 1207795 ns 0.98
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal 441000 ns 272000 ns 1.62
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 208662 ns 208482 ns 1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7916 ns 8084 ns 0.98
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8000 ns 8583 ns 0.93
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8729.5 ns 9021 ns 0.97
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8417 ns 7999.5 ns 1.05
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 235567 ns 232978 ns 1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 23108838 ns 21485714 ns 1.08
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal 5655333.5 ns 4575750 ns 1.24
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 664097 ns 677037 ns 0.98
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 16375 ns 14000 ns 1.17
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 14604.5 ns 16833 ns 0.87
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 14708 ns 14708 ns 1
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 10459 ns 11084 ns 0.94
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA 21454 ns 22205 ns 0.97
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/oneAPI 1118960.5 ns 1162979 ns 0.96
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/Metal 214750 ns 202500 ns 1.06
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/AMDGPU 188482 ns 192482 ns 0.98
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 31708 ns 32208 ns 0.98
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 31875 ns 32458 ns 0.98
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 32146 ns 32500 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 31917 ns 32292 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA 314264 ns 312235 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/oneAPI 11477943.5 ns 11511065 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/Metal 1721916 ns 1679791 ns 1.03
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 610347 ns 604116 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 441229.5 ns 441708 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 445062.5 ns 448937.5 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 447666 ns 446250 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 446000 ns 480084 ns 0.93
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 194324 ns 196019 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 6172811 ns 5816450 ns 1.06
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 2129687.5 ns 2098250 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 356014 ns 375818.5 ns 0.95
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3806062.5 ns 3828583 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3830125 ns 3823145.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 3819020.5 ns 3821375 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3829625.5 ns 3827208 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 580459 ns 576589.5 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 27992292 ns 28049485 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10082833.5 ns 9440833.5 ns 1.07
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1390109 ns 1386199.5 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s) 833503354 ns 832497916.5 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s) 415838000 ns 542428167 ns 0.77
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s) 544434542 ns 543721583 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s) 1561715250 ns 1563747062.5 ns 1.00
batchedmm(512, Bsize=512)/forward/GPU/CUDA 22756243 ns 22544597 ns 1.01
batchedmm(512, Bsize=512)/forward/GPU/AMDGPU 14023836 ns 14054025 ns 1.00
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s) 2997704083 ns 3015248500 ns 0.99
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s) 1512242750 ns 1790623833 ns 0.84
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s) 2248995791 ns 2952821875 ns 0.76
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s) 5261167167 ns 5283328917 ns 1.00
batchedmm(512, Bsize=512)/zygote/GPU/CUDA 364718000 ns 308874000 ns 1.18
batchedmm(512, Bsize=512)/zygote/GPU/AMDGPU 87342499 ns 87902788 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 77833 ns 77188 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 76542 ns 76417 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 78708 ns 77021 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 76354.5 ns 77750 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 235898 ns 234575.5 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7929573 ns 7672129 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 551041.5 ns 527333 ns 1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 109786.5 ns 110611 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 282312.5 ns 260625 ns 1.08
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 251104 ns 274229 ns 0.92
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 197208 ns 272750.5 ns 0.72
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 192416 ns 235500 ns 0.82
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1133383 ns 1125486 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 45395412 ns 47620210 ns 0.95
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6595833 ns 6079250 ns 1.08
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 643627 ns 649537 ns 0.99
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s) 199406375 ns 199482541.5 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s) 104150500 ns 139345542 ns 0.75
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s) 139302333 ns 139029792 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s) 388728500 ns 392535083 ns 0.99
batchedmm(512, Bsize=128)/forward/GPU/CUDA 5827807.5 ns 5822474 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/AMDGPU 3416565 ns 3382050 ns 1.01
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s) 621451500.5 ns 619752416.5 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s) 353591958 ns 442441875 ns 0.80
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s) 438706083.5 ns 440256854 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s) 1195242542 ns 1198345083 ns 1.00
batchedmm(512, Bsize=128)/zygote/GPU/CUDA 26241215 ns 26648833.5 ns 0.98
batchedmm(512, Bsize=128)/zygote/GPU/AMDGPU 21717195 ns 21780965 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7250 ns 7209 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5292 ns 6083 ns 0.87
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6000 ns 6250 ns 0.96
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10042 ns 9875 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 27646 ns 27541 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1194589 ns 1291872 ns 0.92
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 620417 ns 587750 ns 1.06
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 50410 ns 47840 ns 1.05
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 213208 ns 214771 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 221104.5 ns 221145.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221854 ns 222354.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 216000 ns 207000 ns 1.04
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 239232 ns 238084 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 30973712 ns 32570629.5 ns 0.95
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 9004750 ns 9544417 ns 0.94
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 536025 ns 538400.5 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 8333.5 ns 9854 ns 0.85
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 10250 ns 8750 ns 1.17
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 9937.5 ns 10479.5 ns 0.95
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 9416 ns 8000 ns 1.18
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 133822.5 ns 133164.5 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 3586391 ns 3474874 ns 1.03
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal 904312 ns 869583 ns 1.04
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 72841 ns 72621 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7667 ns 7416.5 ns 1.03
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7917 ns 7875 ns 1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8167 ns 8416 ns 0.97
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7833 ns 7542 ns 1.04
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 581095 ns 569420 ns 1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 18441555 ns 20340438.5 ns 0.91
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal 4731020.5 ns 4250145.5 ns 1.11
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 326163 ns 321593.5 ns 1.01
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 459 ns 458 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 500 ns 500 ns 1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 584 ns 500 ns 1.17
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 500 ns 417 ns 1.20
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 26581 ns 26321 ns 1.01
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 1237995.5 ns 1240804 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal 473959 ns 300875 ns 1.58
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 49351 ns 48251 ns 1.02
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 10166 ns 10167 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 10334 ns 9791 ns 1.06
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 10416 ns 10875 ns 0.96
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9584 ns 9667 ns 0.99
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 272007 ns 269831 ns 1.01
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 23722074 ns 23376313 ns 1.01
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal 5995833.5 ns 5253375 ns 1.14
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 394569 ns 398219 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 107229.5 ns 107312.5 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 85749.5 ns 99312 ns 0.86
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 99417 ns 100395.5 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 146291 ns 146708 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA 24482 ns 24943 ns 0.98
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/oneAPI 1201696.5 ns 1222208 ns 0.98
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/Metal 274937.5 ns 258062 ns 1.07
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/AMDGPU 192342 ns 190671.5 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 478334 ns 477917 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 500041 ns 478000 ns 1.05
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 478375 ns 500667 ns 0.96
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 478708 ns 496666 ns 0.96
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA 255734 ns 253423.5 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/oneAPI 11849860 ns 11761727 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/Metal 2286625 ns 2149375 ns 1.06
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 624721 ns 618976 ns 1.01
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s) 4937.5 ns 5458 ns 0.90
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s) 7000 ns 6749.5 ns 1.04
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s) 7792 ns 6250 ns 1.25
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s) 4333 ns 6417 ns 0.68
batchedmm(16, Bsize=32)/forward/GPU/CUDA 16407 ns 16082.5 ns 1.02
batchedmm(16, Bsize=32)/forward/GPU/AMDGPU 78321 ns 79671 ns 0.98
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s) 11542 ns 11646 ns 0.99
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s) 9666.5 ns 11166.5 ns 0.87
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s) 10792 ns 11104 ns 0.97
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s) 16958 ns 16416 ns 1.03
batchedmm(16, Bsize=32)/zygote/GPU/CUDA 233195.5 ns 231483.5 ns 1.01
batchedmm(16, Bsize=32)/zygote/GPU/AMDGPU 378594 ns 373474 ns 1.01
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s) 39417 ns 39458 ns 1.00
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s) 50250 ns 47083 ns 1.07
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s) 51417 ns 52417 ns 0.98
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s) 13833 ns 13791 ns 1.00
batchedmm(16, Bsize=128)/forward/GPU/CUDA 19791.5 ns 19645 ns 1.01
batchedmm(16, Bsize=128)/forward/GPU/AMDGPU 85261 ns 88401 ns 0.96
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s) 51020.5 ns 36062.5 ns 1.41
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s) 28646.5 ns 30937.5 ns 0.93
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s) 31146.5 ns 31854.5 ns 0.98
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s) 64625 ns 57291 ns 1.13
batchedmm(16, Bsize=128)/zygote/GPU/CUDA 208902 ns 206595 ns 1.01
batchedmm(16, Bsize=128)/zygote/GPU/AMDGPU 415884.5 ns 420164 ns 0.99
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s) 1875 ns 1625 ns 1.15
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s) 1667 ns 1979.5 ns 0.84
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s) 2250 ns 2333 ns 0.96
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s) 1750 ns 1708 ns 1.02
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA 20332 ns 20532 ns 0.99
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/oneAPI 1179138 ns 1200494 ns 0.98
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/Metal 324854.5 ns 296854 ns 1.09
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/AMDGPU 28921 ns 33920 ns 0.85
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s) 2083 ns 2084 ns 1.00
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s) 2208 ns 2208 ns 1
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s) 2291 ns 2500 ns 0.92
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s) 2250 ns 2167 ns 1.04
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA 222981 ns 222308 ns 1.00
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/oneAPI 9345973.5 ns 9522565 ns 0.98
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/Metal 1764708 ns 1461292 ns 1.21
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/AMDGPU 139241 ns 138611 ns 1.00
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 4667 ns 3958.5 ns 1.18
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4750 ns 4771 ns 1.00
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 5750 ns 6083 ns 0.95
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4333 ns 4166.5 ns 1.04
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 161766.5 ns 160299 ns 1.01
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 5876374 ns 5753962 ns 1.02
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal 453291.5 ns 436958 ns 1.04
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 62650 ns 73001 ns 0.86
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8667 ns 8208 ns 1.06
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 7958 ns 8500 ns 0.94
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8250 ns 8958 ns 0.92
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8166 ns 8459 ns 0.97
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 958412 ns 947464 ns 1.01
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 39258406 ns 38499411 ns 1.02
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal 5932250.5 ns 5531166 ns 1.07
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 385774 ns 391074 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 57250 ns 56917 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 56916 ns 57917 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 58250 ns 58000 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 58667 ns 58500 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 37674 ns 37518 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1210249 ns 1241903 ns 0.97
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 380459 ns 555417 ns 0.68
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 208842 ns 218173 ns 0.96
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 449562.5 ns 447979 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 466895.5 ns 464396 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 465833 ns 473021 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 434708 ns 434125 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 276347 ns 273702 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 26892491 ns 28360190 ns 0.95
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8199750 ns 8213312.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 814928 ns 849179 ns 0.96
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s) 3302500 ns 3314750 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s) 1770792 ns 2333729 ns 0.76
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s) 2337291.5 ns 2339125 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s) 6303499.5 ns 6302291.5 ns 1.00
batchedmm(128, Bsize=128)/forward/GPU/CUDA 204292.5 ns 204384 ns 1.00
batchedmm(128, Bsize=128)/forward/GPU/AMDGPU 203467.5 ns 209662 ns 0.97
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s) 11464458 ns 11431208 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s) 6552083 ns 8359770.5 ns 0.78
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s) 8324666.5 ns 8320667 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s) 21058833.5 ns 21055375 ns 1.00
batchedmm(128, Bsize=128)/zygote/GPU/CUDA 741274.5 ns 741163 ns 1.00
batchedmm(128, Bsize=128)/zygote/GPU/AMDGPU 1081561 ns 1077386 ns 1.00
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4583 ns 6666 ns 0.69
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5667 ns 5042 ns 1.12
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6104 ns 5792 ns 1.05
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 4854.5 ns 4750 ns 1.02
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 156234.5 ns 153420 ns 1.02
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 5915211.5 ns 5572475 ns 1.06
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal 827584 ns 774395.5 ns 1.07
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 58490 ns 58011 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7750 ns 7083.5 ns 1.09
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7042 ns 7229.5 ns 0.97
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7416 ns 7625 ns 0.97
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6959 ns 7250 ns 0.96
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 812002.5 ns 801539 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 34732254.5 ns 36387767 ns 0.95
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal 5657917 ns 5231166.5 ns 1.08
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 382614 ns 377908.5 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 95791 ns 123729 ns 0.77
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 98000 ns 101000 ns 0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 125333 ns 104000 ns 1.21
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 98604 ns 126792 ns 0.78
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 158376.5 ns 156884 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 6145153.5 ns 6223875 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 2249500 ns 2963062.5 ns 0.76
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 189172 ns 208512 ns 0.91
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2001625 ns 2029666.5 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1968041.5 ns 2022354.5 ns 0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2021312.5 ns 1991125 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2030708.5 ns 1994708 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 779642 ns 768699 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 32168960 ns 31458817 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11090459 ns 10927541.5 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1124561.5 ns 1258693 ns 0.89
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s) 34541.5 ns 32625 ns 1.06
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s) 35875 ns 36833.5 ns 0.97
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s) 33958 ns 35792 ns 0.95
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s) 625 ns 500 ns 1.25
batchedmm(2, Bsize=4)/forward/GPU/CUDA 15484 ns 14999 ns 1.03
batchedmm(2, Bsize=4)/forward/GPU/AMDGPU 80681 ns 72471 ns 1.11
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s) 2667 ns 2583.5 ns 1.03
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s) 2791 ns 2917 ns 0.96
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s) 3000 ns 3041 ns 0.99
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s) 2250 ns 2125 ns 1.06
batchedmm(2, Bsize=4)/zygote/GPU/CUDA 148962 ns 147569 ns 1.01
batchedmm(2, Bsize=4)/zygote/GPU/AMDGPU 353673 ns 353134 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7291 ns 7208 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5292 ns 6000 ns 0.88
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6084 ns 6041 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10125 ns 10209 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 36617 ns 36526 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1195969 ns 1258443 ns 0.95
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 574854 ns 344041.5 ns 1.67
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 49650 ns 49210 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 213333.5 ns 212937 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 220792 ns 220562.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221208.5 ns 221542 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 214813 ns 206666 ns 1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 253557.5 ns 251971 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 26578838 ns 28011462 ns 0.95
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7960167 ns 7840500 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 522265 ns 585241 ns 0.89
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3917 ns 3958 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3917 ns 3917 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3917 ns 3917 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3917 ns 3958 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA 22029 ns 21871 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/oneAPI 2084023 ns 2206686 ns 0.94
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/Metal 250333 ns 242458 ns 1.03
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/AMDGPU 45980 ns 45741 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 14958 ns 14958 ns 1
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 14625 ns 15000 ns 0.97
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 14958 ns 15041 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 14875 ns 14959 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA 339194 ns 338565 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/oneAPI 11224100 ns 11553768.5 ns 0.97
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/Metal 1025166 ns 976583 ns 1.05
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/AMDGPU 196272 ns 200867 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 103041 ns 102500 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 125083 ns 100312.5 ns 1.25
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 132667 ns 108417 ns 1.22
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 100249.5 ns 121104.5 ns 0.83
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 160077 ns 151313 ns 1.06
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5649119.5 ns 6198682 ns 0.91
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 2853125 ns 2966354 ns 0.96
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 205222 ns 209207 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1923874.5 ns 1900250 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1935583 ns 1919459 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1923375 ns 1905125 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1927062.5 ns 1916250 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 765025 ns 757300 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 31727518 ns 32108317 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10829375 ns 11078458 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1233282 ns 1229222 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 18791 ns 18250 ns 1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 18729.5 ns 18687.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 20145.5 ns 20958 ns 0.96
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 19646 ns 18417 ns 1.07
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 123582.5 ns 123983 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3867602 ns 3759508.5 ns 1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1393500 ns 1408000 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 76250 ns 76771 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 215917 ns 216375 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 216688 ns 216583 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 219083 ns 225958.5 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 216250 ns 225688 ns 0.96
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 569648 ns 570521 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 20650113 ns 20158203.5 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6226521 ns 6272542 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 496345 ns 495115 ns 1.00
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s) 25312.5 ns 23938 ns 1.06
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s) 28312.5 ns 30688 ns 0.92
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s) 29041 ns 29062.5 ns 1.00
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s) 1458 ns 1250 ns 1.17
batchedmm(16, Bsize=4)/forward/GPU/CUDA 16184 ns 16428 ns 0.99
batchedmm(16, Bsize=4)/forward/GPU/AMDGPU 88291 ns 83121 ns 1.06
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s) 4875 ns 4646 ns 1.05
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s) 4895.5 ns 4896 ns 1.00
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s) 5437.5 ns 5000 ns 1.09
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s) 4875 ns 4958 ns 0.98
batchedmm(16, Bsize=4)/zygote/GPU/CUDA 227416 ns 227807 ns 1.00
batchedmm(16, Bsize=4)/zygote/GPU/AMDGPU 387604 ns 388114 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 305625 ns 305583 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 305812.5 ns 306229.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 309146 ns 306417 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 307792 ns 305458 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 259343.5 ns 261264.5 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7522169 ns 7704179 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 655771 ns 1023292 ns 0.64
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 277977.5 ns 277833 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 532041 ns 529834 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 530083 ns 542250 ns 0.98
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 538458 ns 540625 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 533250 ns 564458 ns 0.94
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1187558.5 ns 1187321 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 45262893 ns 41669413 ns 1.09
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6496375 ns 5875584 ns 1.11
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 870989 ns 878529 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 19792 ns 19458 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 21104 ns 20250 ns 1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 22312.5 ns 22125 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 20542 ns 20000 ns 1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 131573 ns 130701 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3585949 ns 3847882 ns 0.93
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 1498125 ns 1505125 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 75971 ns 82371 ns 0.92
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 214917 ns 211958 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 213083 ns 213833 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 213958 ns 221208 ns 0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 212500 ns 225958.5 ns 0.94
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 880154.5 ns 881389.5 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 24833949 ns 26325325.5 ns 0.94
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7325541 ns 7153208 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 546485 ns 544575 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 6334 ns 6917 ns 0.92
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 7458 ns 6542 ns 1.14
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 8083 ns 8000 ns 1.01
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6583 ns 6458 ns 1.02
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 157693 ns 156818.5 ns 1.01
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 5843649.5 ns 5948934 ns 0.98
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal 839500 ns 752520.5 ns 1.12
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 69580 ns 69091 ns 1.01
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 11041 ns 10125 ns 1.09
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9917 ns 9916 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 10729 ns 10542 ns 1.02
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10209 ns 10917 ns 0.94
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 890019.5 ns 888274 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 39536698 ns 40943719 ns 0.97
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal 5554084 ns 5136625 ns 1.08
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 391634 ns 391869 ns 1.00
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4667 ns 4500 ns 1.04
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5124.5 ns 5375 ns 0.95
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 5833 ns 6542 ns 0.89
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6458 ns 5167 ns 1.25
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 161059 ns 160006.5 ns 1.01
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 5609112 ns 5573705.5 ns 1.01
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal 822917 ns 760125 ns 1.08
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 62251 ns 70591 ns 0.88
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7500 ns 7542 ns 0.99
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7333 ns 7208 ns 1.02
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7708 ns 7792 ns 0.99
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7166 ns 7292 ns 0.98
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 835582 ns 834345 ns 1.00
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 41010539 ns 40626772 ns 1.01
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal 5986084 ns 5526750 ns 1.08
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 405494.5 ns 398254 ns 1.02
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s) 14490500 ns 14520542 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s) 7719208 ns 10131583 ns 0.76
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s) 10131041 ns 10128208 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s) 27827208 ns 27740417 ns 1.00
batchedmm(128, Bsize=512)/forward/GPU/CUDA 529747 ns 528864 ns 1.00
batchedmm(128, Bsize=512)/forward/GPU/AMDGPU 389754 ns 389824 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s) 46259291.5 ns 46258062.5 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s) 26496000 ns 33606750 ns 0.79
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s) 33451708 ns 33452875 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s) 85583541 ns 85111667 ns 1.01
batchedmm(128, Bsize=512)/zygote/GPU/CUDA 2650995 ns 2665492 ns 0.99
batchedmm(128, Bsize=512)/zygote/GPU/AMDGPU 3276734 ns 3283266 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 68250 ns 66291 ns 1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 68104.5 ns 68291.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 69312 ns 70667 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 66125 ns 68459 ns 0.97
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 134037 ns 136351 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3550202 ns 3622418.5 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 1521625 ns 1516937 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 232902 ns 229622 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 449854 ns 441083 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 440625 ns 442083 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 442209 ns 451271 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 440396 ns 448709 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 796931.5 ns 797999 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 26432216.5 ns 27539269 ns 0.96
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7473000 ns 7538041.5 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 813753.5 ns 798133.5 ns 1.02
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 625 ns 500 ns 1.25
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 542 ns 583 ns 0.93
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 625 ns 625 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 500 ns 500 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 31856 ns 32316 ns 0.99
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 1165311 ns 1191148 ns 0.98
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal 476979 ns 282250 ns 1.69
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 51801 ns 51341 ns 1.01
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10937.5 ns 10333 ns 1.06
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9542 ns 10145.5 ns 0.94
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 10312.5 ns 10750 ns 0.96
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10895.5 ns 9708 ns 1.12
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 298325 ns 298535 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 21243700 ns 22166302 ns 0.96
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal 5492937.5 ns 5112604 ns 1.07
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 381794 ns 389099 ns 0.98
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 9833 ns 9833 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 9834 ns 9875 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 9834 ns 9833 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 9792 ns 9792 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA 23467 ns 23326 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/oneAPI 2094657 ns 2165543 ns 0.97
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/Metal 227250 ns 221000 ns 1.03
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/AMDGPU 218063 ns 217222 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 46250 ns 46042 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 45750 ns 46083 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 46625 ns 46333 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 46625 ns 45958 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA 308147 ns 311769 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/oneAPI 11866227 ns 9346963.5 ns 1.27
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/Metal 981645.5 ns 1406750 ns 0.70
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 625806 ns 625172 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 56500 ns 56291 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 56333 ns 57084 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 57292 ns 57166 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 57958 ns 57833 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 28681 ns 29441 ns 0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1215113 ns 1309673.5 ns 0.93
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 679666.5 ns 525875 ns 1.29
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 206472 ns 205592 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 454625 ns 453959 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 465208 ns 497667 ns 0.93
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 467459 ns 473604.5 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 435834 ns 480063 ns 0.91
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 255741 ns 257462 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 31365693.5 ns 35120868.5 ns 0.89
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 9276416.5 ns 9345875 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 857403.5 ns 846349 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 647417 ns 642041.5 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 646792 ns 579916.5 ns 1.12
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 649354.5 ns 645666 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 663709 ns 646458 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 225589 ns 228679 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8440846.5 ns 8172629 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1395125 ns 1355750.5 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 235913 ns 253213 ns 0.93
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2227104.5 ns 2224417 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2251250 ns 2220292 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2225292 ns 2237020.5 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2242250 ns 2228625 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1068301.5 ns 1068960.5 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 49297045.5 ns 47850344 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 7711771 ns 7115750 ns 1.08
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1379184 ns 1248002 ns 1.11
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 22916 ns 20375 ns 1.12
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 20146 ns 20167 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 21833 ns 22708 ns 0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 20709 ns 22291.5 ns 0.93
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 127032 ns 126170.5 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3587279 ns 3616907 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 1515770.5 ns 1510500 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 84371 ns 83861 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 253853.5 ns 220209 ns 1.15
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 220458 ns 219167 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221000 ns 228104 ns 0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 219020.5 ns 262208.5 ns 0.84
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 840768 ns 842194.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 27055091.5 ns 27546993 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7691791.5 ns 7841187.5 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 560576 ns 570991 ns 0.98
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 584 ns 584 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 500 ns 583 ns 0.86
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 625 ns 708 ns 0.88
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 500 ns 541 ns 0.92
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 22755 ns 23369 ns 0.97
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 1200857 ns 1263372.5 ns 0.95
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal 466021 ns 433416.5 ns 1.08
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 50411 ns 52440 ns 0.96
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 11229.5 ns 10750 ns 1.04
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 10542 ns 10895.5 ns 0.97
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 10771 ns 10875 ns 0.99
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 10312 ns 10395.5 ns 0.99
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 277858 ns 277798 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 26122331 ns 25466793 ns 1.03
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal 6009125 ns 5692709 ns 1.06
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 412724 ns 417444.5 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 9209 ns 8209 ns 1.12
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 10125 ns 8750 ns 1.16
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 9750 ns 9958 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 8917 ns 8834 ns 1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 135766 ns 135599.5 ns 1.00
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 3551579 ns 3368218 ns 1.05
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal 904792 ns 871041 ns 1.04
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 67721 ns 73140 ns 0.93
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7750 ns 7334 ns 1.06
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7666 ns 8000 ns 0.96
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8312.5 ns 8208 ns 1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7500 ns 7667 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 551973.5 ns 551087 ns 1.00
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 17337125 ns 17674213 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal 4446687.5 ns 4122708 ns 1.08
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 336393 ns 329793.5 ns 1.02
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1437.5 ns 1458 ns 0.99
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1500 ns 1500 ns 1
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 2000.5 ns 1958 ns 1.02
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1354.5 ns 1604 ns 0.84
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA 21147 ns 21901 ns 0.97
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/oneAPI 1189038 ns 1190713 ns 1.00
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/Metal 311875 ns 295917 ns 1.05
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/AMDGPU 190276.5 ns 192142 ns 0.99
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 3333 ns 3250 ns 1.03
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 3333 ns 3542 ns 0.94
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 3458 ns 3667 ns 0.94
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 3333.5 ns 3208 ns 1.04
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA 241366 ns 238995.5 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/oneAPI 9747135 ns 10823565 ns 0.90
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/Metal 1889917 ns 1600541.5 ns 1.18
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 597216 ns 595486 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s) 148042 ns 149479 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s) 106084 ns 128917 ns 0.82
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s) 128375.5 ns 130000 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s) 225104 ns 233187.5 ns 0.97
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA 24502.5 ns 24863 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/oneAPI 1170307 ns 1175801 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/Metal 306333 ns 266083 ns 1.15
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/AMDGPU 36970 ns 37325.5 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s) 174999.5 ns 160187 ns 1.09
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s) 87125 ns 123542 ns 0.71
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s) 110792 ns 126354.5 ns 0.88
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s) 250729 ns 269771 ns 0.93
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA 240885.5 ns 242114 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/oneAPI 10729195 ns 10717525 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/Metal 2110083.5 ns 2011459 ns 1.05
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/AMDGPU 226383 ns 238832.5 ns 0.95
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7250 ns 7209 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5292 ns 6042 ns 0.88
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6084 ns 5917 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10083 ns 10083 ns 1
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 32889 ns 32666 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1323363 ns 1158312.5 ns 1.14
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 369062.5 ns 569812.5 ns 0.65
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 51151 ns 50740 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 223583 ns 220250 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 228584 ns 227542 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 228917 ns 236271 ns 0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 213604 ns 253792 ns 0.84
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 270279 ns 271669.5 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 26319175 ns 29231373 ns 0.90
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8277437.5 ns 8143771 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 534116 ns 535646 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 14833 ns 14958 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 15125 ns 14666 ns 1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 16500 ns 16250 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 15917 ns 15917 ns 1
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 157359.5 ns 156410 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 5880290 ns 5735978 ns 1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal 824458 ns 768020.5 ns 1.07
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 240222 ns 238883 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 23687 ns 22083 ns 1.07
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 23500 ns 22958 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 23854 ns 23375 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 23292 ns 23000 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 926538.5 ns 926946 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 39379136.5 ns 38775799 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal 5882625 ns 5549917 ns 1.06
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 690662 ns 693337 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 9812.5 ns 9438 ns 1.04
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 9542 ns 9875 ns 0.97
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 10583 ns 11000 ns 0.96
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 10167 ns 9333 ns 1.09
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 140467 ns 139616.5 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 3419289 ns 3454137.5 ns 0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal 821479 ns 723042 ns 1.14
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 71471 ns 76450 ns 0.93
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 13917 ns 13584 ns 1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 13166 ns 13792 ns 0.95
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14458 ns 14125 ns 1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 13583 ns 13500 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 766881 ns 764786.5 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 22531916 ns 20882662 ns 1.08
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal 5288584 ns 4491625 ns 1.18
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 372183.5 ns 374309 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 9459 ns 10458.5 ns 0.90
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 9542 ns 9375 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 10958 ns 11125 ns 0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 10333 ns 10021 ns 1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 138627.5 ns 138381 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 3561997 ns 3357197.5 ns 1.06
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal 927624.5 ns 840979.5 ns 1.10
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 72865.5 ns 75811 ns 0.96
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12583 ns 12083 ns 1.04
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 12646.5 ns 12833 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13083.5 ns 13292 ns 0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 11937.5 ns 12604 ns 0.95
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 624799.5 ns 621892.5 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 20341049.5 ns 18616134 ns 1.09
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal 4551375 ns 4378854 ns 1.04
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 348243 ns 351323.5 ns 0.99
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s) 31083.5 ns 27375 ns 1.14
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s) 32937.5 ns 32542 ns 1.01
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s) 31583 ns 32042 ns 0.99
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s) 2042 ns 1750 ns 1.17
batchedmm(2, Bsize=128)/forward/GPU/CUDA 16203 ns 16401 ns 0.99
batchedmm(2, Bsize=128)/forward/GPU/AMDGPU 73550 ns 74361 ns 0.99
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s) 5229.5 ns 5250 ns 1.00
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s) 5063 ns 5375 ns 0.94
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s) 5562.5 ns 5458 ns 1.02
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s) 6416 ns 6354.5 ns 1.01
batchedmm(2, Bsize=128)/zygote/GPU/CUDA 148737.5 ns 149643.5 ns 0.99
batchedmm(2, Bsize=128)/zygote/GPU/AMDGPU 374559 ns 372579 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 292 ns 250 ns 1.17
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 291 ns 375 ns 0.78
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 292 ns 250 ns 1.17
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 26129 ns 26282 ns 0.99
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 1175092.5 ns 1235843 ns 0.95
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal 467478.5 ns 429000 ns 1.09
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 48501 ns 48191 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 7209 ns 6875 ns 1.05
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 7042 ns 7333 ns 0.96
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 7708 ns 8250 ns 0.93
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 7458 ns 7250 ns 1.03
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 198167 ns 199861.5 ns 0.99
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 22361121 ns 23031425.5 ns 0.97
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal 6016959 ns 5383603.5 ns 1.12
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 396144 ns 399594 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 2042 ns 2000 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 1917 ns 2042 ns 0.94
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 2084 ns 2084 ns 1
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 1959 ns 1917 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 26961 ns 27232 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 1272300.5 ns 1267736 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal 473229.5 ns 288917 ns 1.64
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 211192 ns 210802 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 17312.5 ns 17312.5 ns 1
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 16979.5 ns 16750 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 17958 ns 18271 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 17291.5 ns 17792 ns 0.97
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 284214 ns 284556 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 25656179.5 ns 28374240 ns 0.90
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal 5834000 ns 5355833 ns 1.09
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 717577 ns 716677 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 188417 ns 152500 ns 1.24
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 169438 ns 151917 ns 1.12
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 149396 ns 152875 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 175916 ns 197875 ns 0.89
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 221937 ns 223454.5 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 7838028 ns 9157076.5 ns 0.86
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1550833 ns 1404125 ns 1.10
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 199412 ns 219493 ns 0.91
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1315271 ns 1335250 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1324083 ns 1317937.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1325000 ns 1295104.5 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1331833.5 ns 1330416.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 998483 ns 1005186 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 47666418 ns 47695710 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 6733584 ns 6492770.5 ns 1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1130086 ns 1013325.5 ns 1.12
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 27020.5 ns 24312.5 ns 1.11
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 24792 ns 25125 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 26416 ns 27125 ns 0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 24687.5 ns 25312.5 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 268327.5 ns 268978 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 8104229 ns 7881859 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 621687.5 ns 605042 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 117991 ns 121672 ns 0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 131333 ns 117958.5 ns 1.11
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 116958 ns 117458 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 125645.5 ns 176312.5 ns 0.71
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 127375 ns 125666.5 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1214493 ns 1219058 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 43264858 ns 45063058 ns 0.96
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6553167 ns 6043584 ns 1.08
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 601326 ns 618336 ns 0.97
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 375 ns 250 ns 1.50
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 250 ns 334 ns 0.75
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 334 ns 375 ns 0.89
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 292 ns 250 ns 1.17
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 22301 ns 23021 ns 0.97
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 1209248 ns 1205681 ns 1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal 447500 ns 416084 ns 1.08
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 51730.5 ns 51680 ns 1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 7541.5 ns 7250 ns 1.04
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 7167 ns 7500 ns 0.96
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 7792 ns 8250 ns 0.94
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 7416 ns 7041 ns 1.05
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 204142.5 ns 205357.5 ns 0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 25835569 ns 24678245 ns 1.05
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal 5695875 ns 5614708.5 ns 1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 401495 ns 401394 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5896 ns 5250 ns 1.12
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5708 ns 5917 ns 0.96
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6667 ns 6709 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 6208 ns 5250 ns 1.18
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 167740 ns 167889 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 5615184 ns 5730740.5 ns 0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal 488083 ns 442416 ns 1.10
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 238573 ns 237332 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10083.5 ns 9792 ns 1.03
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9709 ns 10291 ns 0.94
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9958.5 ns 10417 ns 0.96
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9708 ns 9979.5 ns 0.97
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 976109 ns 974688.5 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 41486841 ns 40869083 ns 1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal 6285500 ns 5716125 ns 1.10
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 679397 ns 681047 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 625 ns 667 ns 0.94
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 666 ns 667 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 667 ns 667 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 666 ns 666 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA 22844 ns 22997 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/oneAPI 1942240 ns 2184475 ns 0.89
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/Metal 335708 ns 222542 ns 1.51
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/AMDGPU 216152.5 ns 214122 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 4583 ns 4583 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4542 ns 4667 ns 0.97
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 4792 ns 4875 ns 0.98
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4584 ns 4584 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA 237762 ns 241286.5 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/oneAPI 10676201 ns 10599719 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/Metal 1793708 ns 1575875 ns 1.14
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 600666.5 ns 597171.5 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 9542 ns 8042 ns 1.19
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 8375 ns 8375 ns 1
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 9542 ns 9687.5 ns 0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 8375 ns 7583 ns 1.10
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 138258.5 ns 137743.5 ns 1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 3579498 ns 3699354 ns 0.97
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal 834417 ns 778937 ns 1.07
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 69561 ns 77070 ns 0.90
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8541 ns 8583.5 ns 1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8166 ns 8666 ns 0.94
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9083.5 ns 8958 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8209 ns 8520.5 ns 0.96
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 673050 ns 671848 ns 1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 22514145.5 ns 21090224 ns 1.07
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal 5316625 ns 4565417 ns 1.16
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 354703 ns 354384 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s) 125917 ns 127708 ns 0.99
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s) 96125 ns 129375 ns 0.74
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s) 130167 ns 130250 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s) 183437 ns 180687 ns 1.02
batchedmm(128, Bsize=4)/forward/GPU/CUDA 45933 ns 46493 ns 0.99
batchedmm(128, Bsize=4)/forward/GPU/AMDGPU 98581 ns 98931 ns 1.00
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s) 339916 ns 344083 ns 0.99
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s) 166583 ns 324375 ns 0.51
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s) 348854.5 ns 344583 ns 1.01
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s) 574020.5 ns 606833 ns 0.95
batchedmm(128, Bsize=4)/zygote/GPU/CUDA 207728 ns 208834 ns 0.99
batchedmm(128, Bsize=4)/zygote/GPU/AMDGPU 495960 ns 512935 ns 0.97
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s) 397708 ns 398042 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s) 215083 ns 288042 ns 0.75
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s) 288291 ns 288646 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s) 756250 ns 756209 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA 43863 ns 43829 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/oneAPI 1414380.5 ns 1374897 ns 1.03
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/Metal 508833 ns 409979 ns 1.24
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/AMDGPU 84981 ns 83555.5 ns 1.02
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1459874.5 ns 1451083 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 862000 ns 1136063 ns 0.76
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 1134791.5 ns 1133125 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2443958 ns 2442166.5 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA 264585.5 ns 265950 ns 0.99
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/oneAPI 11951229.5 ns 10929832 ns 1.09
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/Metal 1843542 ns 1780583 ns 1.04
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/AMDGPU 355253 ns 351723.5 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 614666 ns 638500 ns 0.96
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 586000 ns 643687 ns 0.91
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 645874.5 ns 647458 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 657000 ns 646208 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 222791 ns 223801.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8463661 ns 8161714 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1392125 ns 1357208 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 247582 ns 249462 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2443375 ns 2446416 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2464833.5 ns 2441562 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2434958 ns 2458583 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2451958 ns 2438604 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1084693 ns 1085043 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 52848514 ns 52292780.5 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9656375 ns 7220166 ns 1.34
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1475249.5 ns 1491070.5 ns 0.99
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s) 33979 ns 32062 ns 1.06
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s) 35146 ns 36083 ns 0.97
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s) 34541.5 ns 34292 ns 1.01
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s) 958 ns 750 ns 1.28
batchedmm(2, Bsize=32)/forward/GPU/CUDA 15785 ns 15707 ns 1.00
batchedmm(2, Bsize=32)/forward/GPU/AMDGPU 72911 ns 87960 ns 0.83
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s) 3166 ns 3042 ns 1.04
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s) 3208 ns 3417 ns 0.94
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s) 3459 ns 3458 ns 1.00
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s) 3166.5 ns 3020.5 ns 1.05
batchedmm(2, Bsize=32)/zygote/GPU/CUDA 147758 ns 148084.5 ns 1.00
batchedmm(2, Bsize=32)/zygote/GPU/AMDGPU 345553 ns 359633 ns 0.96
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 406875 ns 406833.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 401958 ns 408000 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 409250 ns 408292 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 421375 ns 420458 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 43841 ns 44216 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1372730.5 ns 1415070 ns 0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1170812 ns 1153458.5 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 242582.5 ns 240777 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3882208 ns 3864458 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3924041.5 ns 3999792 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 3998375 ns 3979541.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3776500 ns 3775854.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 250561 ns 252337 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 35883398 ns 38063076.5 ns 0.94
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11700333.5 ns 11500395.5 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1246592 ns 1237002 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3917 ns 3917 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3917 ns 3958 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3958 ns 3916 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3917 ns 3917 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA 34574 ns 34777 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/oneAPI 1219036 ns 1220867 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/Metal 264250 ns 174292 ns 1.52
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/AMDGPU 40720 ns 40930 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 15750 ns 15750 ns 1
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 15542 ns 16000 ns 0.97
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 15917 ns 16000 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 15792 ns 15708 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA 273311 ns 275461 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/oneAPI 8765176.5 ns 9012714 ns 0.97
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/Metal 885792 ns 864791 ns 1.02
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/AMDGPU 167912 ns 169182 ns 0.99
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s) 404125 ns 404416 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s) 220833 ns 295625 ns 0.75
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s) 295250 ns 295500 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s) 760375 ns 760500 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA 113355 ns 113590.5 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/oneAPI 999579 ns 1022527 ns 0.98
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/Metal 483500 ns 398708 ns 1.21
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/AMDGPU 90391 ns 88881 ns 1.02
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1480125 ns 1487187 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 886750 ns 1158000 ns 0.77
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 1160937.5 ns 1155479.5 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2466312.5 ns 2466208 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA 264186 ns 270328 ns 0.98
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/oneAPI 10729803 ns 10184609.5 ns 1.05
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/Metal 1873812.5 ns 1820375.5 ns 1.03
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/AMDGPU 357734 ns 358073 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 584 ns 500 ns 1.17
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 458 ns 583 ns 0.79
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 583 ns 542 ns 1.08
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 541 ns 500 ns 1.08
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 26163 ns 26734 ns 0.98
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 1282772 ns 1351582.5 ns 0.95
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal 465459 ns 291500 ns 1.60
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 210412 ns 212062 ns 0.99
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8604.5 ns 8229.5 ns 1.05
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8167 ns 8583 ns 0.95
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9125 ns 9541.5 ns 0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8750 ns 8250 ns 1.06
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 212800 ns 214078 ns 0.99
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 25355208.5 ns 25901884 ns 0.98
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal 5710375 ns 5489167 ns 1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 711177 ns 700618 ns 1.02
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s) 833479.5 ns 833854 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s) 471667 ns 622042 ns 0.76
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s) 618333 ns 622979 ns 0.99
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s) 1549979.5 ns 1540520.5 ns 1.01
batchedmm(128, Bsize=32)/forward/GPU/CUDA 129908.5 ns 129761.5 ns 1.00
batchedmm(128, Bsize=32)/forward/GPU/AMDGPU 169932 ns 232503 ns 0.73
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s) 2690812.5 ns 2695146 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s) 1528250 ns 1998709 ns 0.76
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s) 2007542 ns 2002084 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s) 4933833.5 ns 4935604.5 ns 1.00
batchedmm(128, Bsize=32)/zygote/GPU/CUDA 255516 ns 253692 ns 1.01
batchedmm(128, Bsize=32)/zygote/GPU/AMDGPU 874763.5 ns 775058 ns 1.13
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 375 ns 250 ns 1.50
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 250 ns 375 ns 0.67
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 292 ns 291 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 31620 ns 32085 ns 0.99
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 1200788 ns 1210183 ns 0.99
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal 434791 ns 398167 ns 1.09
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 49800 ns 48991 ns 1.02
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7646 ns 7125 ns 1.07
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7084 ns 7542 ns 0.94
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7875 ns 7959 ns 0.99
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7333 ns 7208 ns 1.02
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 227155.5 ns 228384.5 ns 0.99
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 22078020 ns 22179958 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal 4969834 ns 5066187 ns 0.98
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 366063.5 ns 371134 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 2419959 ns 2380250 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 2370750 ns 2375084 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 2383667 ns 2384771 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 2405250 ns 2400396 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 221771 ns 222871 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 7972987.5 ns 7920377 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1606125 ns 1468250 ns 1.09
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 359644 ns 333598.5 ns 1.08
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4630917 ns 4645416 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4535583 ns 4654896 ns 0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4657333 ns 4551000 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4651709 ns 4663375.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 989560.5 ns 991786.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 47401923 ns 49524120.5 ns 0.96
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 6807396 ns 6445334 ns 1.06
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1409064 ns 1420379 ns 0.99
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s) 15188 ns 7625 ns 1.99
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s) 6875 ns 6917 ns 0.99
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s) 7459 ns 7417 ns 1.01
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s) 9416.5 ns 6916.5 ns 1.36
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA 24119 ns 24247 ns 0.99
bias_activation(512, act=relu)(512 x 128)/forward/GPU/oneAPI 1200176 ns 1191973 ns 1.01
bias_activation(512, act=relu)(512 x 128)/forward/GPU/Metal 280270.5 ns 267833.5 ns 1.05
bias_activation(512, act=relu)(512 x 128)/forward/GPU/AMDGPU 34491 ns 39055.5 ns 0.88
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 67062.5 ns 48958.5 ns 1.37
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 45729.5 ns 52041.5 ns 0.88
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 47833 ns 50209 ns 0.95
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 48416 ns 70292 ns 0.69
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA 241118 ns 242503 ns 0.99
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/oneAPI 10961456 ns 10913747 ns 1.00
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/Metal 2256625 ns 2020458 ns 1.12
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/AMDGPU 244442 ns 238762 ns 1.02
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s) 22000 ns 21687.5 ns 1.01
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s) 24167 ns 26292 ns 0.92
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s) 24291.5 ns 24292 ns 1.00
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s) 5333.5 ns 7104.5 ns 0.75
batchedmm(2, Bsize=512)/forward/GPU/CUDA 17742 ns 18134 ns 0.98
batchedmm(2, Bsize=512)/forward/GPU/AMDGPU 91171 ns 86280 ns 1.06
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s) 12250 ns 12375 ns 0.99
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s) 9229 ns 10333 ns 0.89
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s) 10708.5 ns 10959 ns 0.98
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s) 17979.5 ns 18020.5 ns 1.00
batchedmm(2, Bsize=512)/zygote/GPU/CUDA 247367 ns 247945.5 ns 1.00
batchedmm(2, Bsize=512)/zygote/GPU/AMDGPU 394269 ns 390974 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s) 405958 ns 406000 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s) 223625 ns 296750 ns 0.75
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s) 296750 ns 296833 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s) 762959 ns 762416 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA 46786 ns 47301 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/oneAPI 1380803 ns 1385156 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/Metal 437125 ns 480417 ns 0.91
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/AMDGPU 92421 ns 90581 ns 1.02
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1485583 ns 1489959 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 892146 ns 1168167 ns 0.76
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 1165042 ns 1168541.5 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2472709 ns 2469208.5 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA 308920 ns 301442 ns 1.02
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/oneAPI 13822167 ns 12084462 ns 1.14
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/Metal 2073458 ns 2052625 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/AMDGPU 380064 ns 375484 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 435750 ns 433833 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 430312.5 ns 437292 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 438875 ns 437959 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 448792 ns 447542 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 54925.5 ns 55480 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1013411 ns 1017218 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1149375 ns 1108958.5 ns 1.04
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 238282 ns 238782 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3884333 ns 3902500 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3995458.5 ns 4024833 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4027188 ns 4023500 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3806541.5 ns 3799395.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 270795 ns 271271 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 30409622 ns 38442703 ns 0.79
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10301625 ns 10071250 ns 1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1244507 ns 1237173 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 8750 ns 8750 ns 1
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 6875 ns 7667 ns 0.90
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 7708 ns 7708 ns 1
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 12458 ns 12417 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA 24004 ns 24191 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/oneAPI 2037132.5 ns 2251764 ns 0.90
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/Metal 231583.5 ns 220958 ns 1.05
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/AMDGPU 219512 ns 217462 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 45125 ns 45042 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 44791 ns 45125 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 45166 ns 45459 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 45542 ns 45375 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA 364741 ns 366950 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/oneAPI 13229894.5 ns 13502470 ns 0.98
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/Metal 1791396 ns 1612375 ns 1.11
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 666126 ns 663217 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 85666 ns 82104 ns 1.04
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 82854.5 ns 82167 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 90541 ns 83916 ns 1.08
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 123042 ns 122312.5 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 190268 ns 189921 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 6129927 ns 6075927 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 2136500 ns 2073438 ns 1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 206862 ns 203742 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1990916 ns 2018750.5 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1994062.5 ns 2019062.5 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2022062.5 ns 1986604 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2019666 ns 1993875 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 579448 ns 579704 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 27636360 ns 30614260 ns 0.90
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9777083 ns 9357458.5 ns 1.04
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1101570 ns 1104701 ns 1.00

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.