Skip to content
This repository has been archived by the owner on Nov 4, 2024. It is now read-only.

Commit

Permalink
fix: broken qa tests
Browse files Browse the repository at this point in the history
  • Loading branch information
avik-pal committed Aug 21, 2024
1 parent e53df59 commit c185f04
Show file tree
Hide file tree
Showing 5 changed files with 15 additions and 15 deletions.
6 changes: 3 additions & 3 deletions src/api/layernorm.jl
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ Normalized Array of same size as `x`.
[1] Ba, Jimmy Lei, Jamie Ryan Kiros, and Geoffrey E. Hinton. "Layer normalization." arXiv
preprint arXiv:1607.06450 (2016).
"""
function layernorm(x::AbstractArray{xT}, scale::Optional{<:AbstractArray{scT}},
bias::Optional{<:AbstractArray{bT}}, σ::F=identity, dims=Colon(),
epsilon::Real=get_utils(:default_epsilon)(x)) where {F, xT, scT, bT}
function layernorm(x::AbstractArray{xT}, scale::Optional{<:AbstractArray},
bias::Optional{<:AbstractArray}, σ::F=identity, dims=Colon(),
epsilon::Real=get_utils(:default_epsilon)(x)) where {F, xT}
σ′ = get_impl(:select_fastest_activation)(σ, x, scale, bias)
return get_impl(:layernorm)(x, scale, bias, σ′, dims, epsilon)
end
8 changes: 4 additions & 4 deletions src/deprecations.jl
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,12 @@ import .API: batchnorm, groupnorm, instancenorm, layernorm, dropout,

## conv
@deprecate fused_conv_bias_activation(
σ::F, weight::AbstractArray{<:Number, N}, x::AbstractArray{<:Number, N},
b::AbstractArray{<:Number, N}, cdims::ConvDims) where {F, N} fused_conv_bias_activation(
σ, weight, x, _vec(b), cdims)
σ::F, weight::AbstractArray{<:Any, N}, x::AbstractArray{<:Any, N},
b::AbstractArray{<:Any, N}, cdims::ConvDims) where {F, N} fused_conv_bias_activation(
σ, weight, x, Utils.vec(b), cdims)

## Private API that was at a point being illegally used in Lux
@deprecate __∇conv_data(args...; kwargs...) Impl.∇conv_data(args...; kwargs...)

@deprecate __apply_bias_activation(σ::F, x, bias::AbstractArray) where {F} bias_activation(
σ, x, _vec(bias))
σ, x, Utils.vec(bias))
4 changes: 2 additions & 2 deletions src/impl/batchnorm.jl
Original file line number Diff line number Diff line change
Expand Up @@ -402,10 +402,10 @@ end

function ∇batchnorm_affine_normalize!(
∂x::AbstractArray{∂xT, 3}, ∂σ²::AbstractArray{∂σ²T, 3},
∂γ::Optional{<:AbstractArray{∂γT, 3}}, ::GPUBroadcastOp,
∂γ::Optional{<:AbstractArray{<:Any, 3}}, ::GPUBroadcastOp,
∂y::AbstractArray{∂yT, 3}, x::AbstractArray{xT, 3}, μ::AbstractVector,
σ²::AbstractVector, γ::Optional{<:AbstractVector}, ϵ::Real,
γ′::AbstractVector) where {∂xT, ∂σ²T, ∂γT, ∂yT, xT}
γ′::AbstractVector) where {∂xT, ∂σ²T, ∂yT, xT}
backend = KA.get_backend(∂x)
Utils.run_ka_kernel(
∇batchnorm_affine_normalize_kernel!, backend, nothing, size(∂x),
Expand Down
6 changes: 3 additions & 3 deletions src/impl/groupnorm.jl
Original file line number Diff line number Diff line change
Expand Up @@ -389,10 +389,10 @@ end

function ∇groupnorm_affine_normalize!(
∂x::AbstractArray{∂xT, 4}, ∂σ²::AbstractArray{∂σ²T, 4},
∂γ::Optional{<:AbstractArray{∂γT, 4}}, ::GPUBroadcastOp,
∂γ::Optional{<:AbstractArray{<:Any, 4}}, ::GPUBroadcastOp,
∂y::AbstractArray{∂yT, 4}, x::AbstractArray{xT, 4}, μ::AbstractArray{μT, 4},
σ²::AbstractArray{σ²T, 4}, γ::Optional{<:AbstractArray{γT, 4}},
ϵ::Real) where {∂xT, ∂σ²T, ∂γT, ∂yT, xT, μT, σ²T, γT}
σ²::AbstractArray{σ²T, 4}, γ::Optional{<:AbstractArray{<:Any, 4}},
ϵ::Real) where {∂xT, ∂σ²T, ∂yT, xT, μT, σ²T}
backend = KA.get_backend(∂x)
Utils.run_ka_kernel(
∇groupnorm_affine_normalize_kernel!, backend, nothing, size(∂x),
Expand Down
6 changes: 3 additions & 3 deletions src/impl/normalization.jl
Original file line number Diff line number Diff line change
Expand Up @@ -134,9 +134,9 @@ CRC.@non_differentiable get_norm_reshape_dims(::Any...)

# Entry Points
## LayerNorm
function layernorm(x::AbstractArray{xT, N}, γ::Optional{<:AbstractArray{γT, N}},
β::Optional{<:AbstractArray{βT, N}}, act::F,
dims, epsilon::Real) where {N, F, xT, γT, βT}
function layernorm(x::AbstractArray{xT, N}, γ::Optional{<:AbstractArray{<:Any, N}},
β::Optional{<:AbstractArray{<:Any, N}}, act::F,
dims, epsilon::Real) where {N, F, xT}
μ, σ² = mean_var(x; dims, corrected=false)
return affine_normalize(act, x, μ, σ², γ, β, epsilon)
end
Expand Down

3 comments on commit c185f04

@avik-pal
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/113551

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.3.48 -m "<description of version>" c185f04183d760b84d0dcfa2b49511255cd1e7dc
git push origin v0.3.48

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LuxLib Benchmarks

Benchmark suite Current: c185f04 Previous: 17ac9a2 Ratio
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 6083 ns 4937.5 ns 1.23
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5417 ns 5666 ns 0.96
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 8021 ns 8042 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6146 ns 5687.5 ns 1.08
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 120417 ns 120909 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal 812042 ns 791750 ns 1.03
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 424375 ns 413945 ns 1.03
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 10250 ns 10000 ns 1.02
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9917 ns 10250 ns 0.97
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10125 ns 9875 ns 1.03
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 11792 ns 9584 ns 1.23
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 556460 ns 558079 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal 2542833 ns 2765041 ns 0.92
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 686027 ns 664078 ns 1.03
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s) 1500 ns 1375 ns 1.09
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s) 2792 ns 1500 ns 1.86
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s) 1708.5 ns 2083 ns 0.82
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s) 1583 ns 1667 ns 0.95
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA 22218 ns 21790 ns 1.02
bias_activation(32, act=relu)(32 x 128)/forward/GPU/Metal 205792 ns 216396 ns 0.95
bias_activation(32, act=relu)(32 x 128)/forward/GPU/AMDGPU 29920 ns 31411 ns 0.95
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 3542 ns 4229.5 ns 0.84
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 4209 ns 3666 ns 1.15
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 4271 ns 4166 ns 1.03
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 4229 ns 4208 ns 1.00
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA 148035 ns 149451 ns 0.99
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/Metal 1621188 ns 1690125 ns 0.96
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/AMDGPU 151742 ns 153327 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58542 ns 58500 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46375 ns 39500 ns 1.17
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46584 ns 47042 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83708 ns 83333 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 37608 ns 37308.5 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1081917 ns 1066021 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 84866 ns 81381 ns 1.04
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2027833 ns 2032541.5 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2085458 ns 2086500 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2090292 ns 2080042 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1999000 ns 1986125 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 233327.5 ns 235686.5 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 7717583 ns 7909459 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1460226 ns 1203034 ns 1.21
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 145375 ns 148292 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 147458 ns 166416.5 ns 0.89
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 150584 ns 150375 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 170437.5 ns 153437 ns 1.11
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 166412 ns 165231.5 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1615604.5 ns 1574250 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 202872 ns 180947 ns 1.12
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1119083.5 ns 1115708.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1109000 ns 1115583 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1118458 ns 1111895.5 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1116145.5 ns 1116604 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 707978 ns 717544 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 5932000 ns 5783062 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1046946 ns 1033041 ns 1.01
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5104.5 ns 4958 ns 1.03
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4250 ns 4334 ns 0.98
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 5895.5 ns 5667 ns 1.04
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5624.5 ns 4771 ns 1.18
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 93783.5 ns 95604 ns 0.98
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal 721583.5 ns 722292 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 70761 ns 60161 ns 1.18
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8875 ns 8875 ns 1
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8792 ns 8542 ns 1.03
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9083 ns 8583 ns 1.06
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8750 ns 8541 ns 1.02
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 603451.5 ns 618298 ns 0.98
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal 6400917 ns 6128688 ns 1.04
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 388649.5 ns 393664 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 20083 ns 17312.5 ns 1.16
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 18812.5 ns 18834 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 20958 ns 20375 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 18000 ns 18291.5 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 68784 ns 67939.5 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1334334 ns 1353958 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 83861 ns 76051 ns 1.10
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 224229.5 ns 223334 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 219416 ns 211917 ns 1.04
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 219062.5 ns 219896 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 212958 ns 212084 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 360915.5 ns 360648.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 5929666 ns 5859625 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 478315 ns 479156 ns 1.00
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s) 625 ns 583.5 ns 1.07
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s) 709 ns 666 ns 1.06
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s) 1041 ns 834 ns 1.25
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s) 625 ns 708 ns 0.88
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA 21396 ns 20628 ns 1.04
bias_activation(2, act=relu)(2 x 128)/forward/GPU/Metal 303750 ns 300000 ns 1.01
bias_activation(2, act=relu)(2 x 128)/forward/GPU/AMDGPU 32981 ns 32990 ns 1.00
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 1458 ns 1395.5 ns 1.04
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 1459 ns 1541 ns 0.95
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 1542 ns 1417 ns 1.09
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 1542 ns 1334 ns 1.16
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA 127634 ns 126083.5 ns 1.01
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/Metal 1626542 ns 1618083 ns 1.01
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/AMDGPU 138112 ns 126566.5 ns 1.09
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7333 ns 7375 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6125 ns 5375 ns 1.14
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6125 ns 6166 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10333 ns 9917 ns 1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 24384 ns 23573 ns 1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 700271 ns 626500 ns 1.12
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 46841 ns 47160 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 221166 ns 234375 ns 0.94
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 238834 ns 242458 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 230666 ns 270958 ns 0.85
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 251250 ns 251083 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 193817 ns 185731 ns 1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8912375 ns 9574500 ns 0.93
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 653712 ns 623787 ns 1.05
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4125 ns 4083 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4125 ns 4084 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4125 ns 4125 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4083 ns 4083 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA 24189 ns 23126 ns 1.05
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/Metal 223791 ns 229375 ns 0.98
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/AMDGPU 49151 ns 48711 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16584 ns 17041 ns 0.97
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16917 ns 16500 ns 1.03
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 17042 ns 17250 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16750 ns 16833 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA 199158 ns 195934 ns 1.02
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/Metal 963270.5 ns 972750 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/AMDGPU 176322 ns 179972 ns 0.98
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 512792 ns 509541.5 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 404292 ns 332334 ns 1.22
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 404896 ns 404875 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 864583 ns 865104.5 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA 113852 ns 113032 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/Metal 448709 ns 448145.5 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/AMDGPU 250173 ns 248713 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2271145.5 ns 2319667 ns 0.98
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 2031292 ns 1752729.5 ns 1.16
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 2033750 ns 2031958 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3280292 ns 3283979.5 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA 247459 ns 244203 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/Metal 2065875 ns 2016625 ns 1.02
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 765823 ns 763594 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 7145.5 ns 6042 ns 1.18
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6958.5 ns 6458 ns 1.08
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 8541 ns 7708 ns 1.11
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6479.5 ns 6333 ns 1.02
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 93682.5 ns 93025 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal 806084 ns 901583 ns 0.89
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 68781 ns 62671 ns 1.10
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 11708.5 ns 11583 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 11875 ns 10500 ns 1.13
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 11000 ns 11458 ns 0.96
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12020.5 ns 10979 ns 1.09
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 642017 ns 646277 ns 0.99
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal 5707875 ns 5976917 ns 0.95
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 421135 ns 418444 ns 1.01
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s) 542 ns 541 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s) 500 ns 541 ns 0.92
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s) 542 ns 542 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s) 500 ns 500 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA 24054 ns 23258 ns 1.03
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/Metal 228333 ns 327292 ns 0.70
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/AMDGPU 54330 ns 52080 ns 1.04
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2125 ns 2083 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2084 ns 2125 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2167 ns 2125 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2083 ns 2084 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA 237805 ns 221917 ns 1.07
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/Metal 1998833 ns 2054166 ns 0.97
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/AMDGPU 190172 ns 182777 ns 1.04
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 9333.5 ns 9062.5 ns 1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 9104 ns 9792 ns 0.93
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 10521 ns 10167 ns 1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 8959 ns 8896 ns 1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 113550 ns 106196.5 ns 1.07
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal 875353.5 ns 876291.5 ns 1.00
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 78760 ns 75871 ns 1.04
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 16729.5 ns 16854.5 ns 0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 18250 ns 17624.5 ns 1.04
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 18104 ns 18958 ns 0.95
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 18458 ns 17187.5 ns 1.07
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 643636 ns 603520 ns 1.07
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal 5156541 ns 5108208 ns 1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 396545 ns 396274 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 500 ns 500 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 459 ns 500 ns 0.92
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 625 ns 625 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 500 ns 583 ns 0.86
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 35808 ns 35127 ns 1.02
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal 323000 ns 475271 ns 0.68
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 46571 ns 49441 ns 0.94
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10375 ns 9792 ns 1.06
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9791.5 ns 10125 ns 0.97
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 10375 ns 10229.5 ns 1.01
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10750 ns 9334 ns 1.15
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 262020 ns 257318.5 ns 1.02
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal 5294125 ns 5182437.5 ns 1.02
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 382009.5 ns 380274 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s) 399000 ns 397083 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s) 288125 ns 215250 ns 1.34
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s) 288292 ns 287916 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s) 755625 ns 756041 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA 113561 ns 111427 ns 1.02
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/Metal 367729.5 ns 363792 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/AMDGPU 77481 ns 78871 ns 0.98
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1393333 ns 1454375 ns 0.96
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 1136083.5 ns 859500 ns 1.32
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 1131458.5 ns 1129916 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2438041 ns 2440417 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA 212129 ns 209113 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/Metal 1596167 ns 1658937.5 ns 0.96
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/AMDGPU 329854 ns 328243 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7708 ns 6916.5 ns 1.11
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7458.5 ns 7084 ns 1.05
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 9000 ns 8188 ns 1.10
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7812 ns 7042 ns 1.11
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 159498.5 ns 152190.5 ns 1.05
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal 481750 ns 764458 ns 0.63
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 60340 ns 60511 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14667 ns 16083.5 ns 0.91
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 15437.5 ns 15625 ns 0.99
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 15479.5 ns 14167 ns 1.09
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14979 ns 14333 ns 1.05
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 1030852 ns 1030700 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal 6424458 ns 6599291 ns 0.97
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 435905 ns 440235 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 26958 ns 25292 ns 1.07
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 25209 ns 29625 ns 0.85
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 27208 ns 26500 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 24584 ns 25291 ns 0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 228128 ns 228373.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1045041.5 ns 1026771 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 120221 ns 118522 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 103791 ns 146334 ns 0.71
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 150833 ns 118854 ns 1.27
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 148187 ns 148958 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 116292 ns 117125 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1163495 ns 1207252 ns 0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6459417 ns 6191583 ns 1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 607082 ns 601256 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 76416 ns 73833 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 81020.5 ns 78104.5 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 85083 ns 78021 ns 1.09
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 79625 ns 77750 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 234622 ns 234865 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 628124.5 ns 534625 ns 1.17
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 127432 ns 125536.5 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 283166.5 ns 305354 ns 0.93
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 316541 ns 321166 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 302917 ns 295667 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 315041.5 ns 304625 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1204655 ns 1245639 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6660083 ns 6703875 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 700322.5 ns 703602.5 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 16708.5 ns 16812.5 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 17333 ns 16334 ns 1.06
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 17854.5 ns 17438 ns 1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 16479 ns 17083 ns 0.96
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 167006 ns 166179 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal 446708 ns 615083 ns 0.73
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 239982 ns 240073 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 26125 ns 27771 ns 0.94
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 26917 ns 30354.5 ns 0.89
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 27208 ns 26791 ns 1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 25333 ns 26770.5 ns 0.95
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 1047898 ns 1050438 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal 6661333.5 ns 6159041 ns 1.08
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 718328 ns 717457 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 11084 ns 11125 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 11625 ns 11624.5 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 13000 ns 11958 ns 1.09
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 11666.5 ns 11229.5 ns 1.04
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 141188 ns 139887 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal 897333.5 ns 817625 ns 1.10
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 243182.5 ns 244463 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 22145.5 ns 20917 ns 1.06
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 21875 ns 21833.5 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 22667 ns 22563 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 21792 ns 21416.5 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 756695 ns 755838.5 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal 5374500 ns 5465833 ns 0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 695018 ns 695787.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 63937.5 ns 69479 ns 0.92
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 63500 ns 66041 ns 0.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 66042 ns 66229 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 63666.5 ns 67791.5 ns 0.94
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 124307.5 ns 119885 ns 1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1367917 ns 1370041.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 241283 ns 239323 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 437854 ns 484042 ns 0.90
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 464833 ns 465541 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 474208 ns 439354.5 ns 1.08
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 437729.5 ns 437646 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 560487 ns 558258 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6247083 ns 6275458.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 733228 ns 737788 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7104.5 ns 7166.5 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7083 ns 7792 ns 0.91
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8334 ns 8375 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7604 ns 7292 ns 1.04
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 163142 ns 161904.5 ns 1.01
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal 463833.5 ns 456041 ns 1.02
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 68371 ns 61410 ns 1.11
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14542 ns 14708 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 15396 ns 17500 ns 0.88
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 15458.5 ns 14646 ns 1.06
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14750 ns 15250 ns 0.97
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 1022438 ns 1023653 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal 6461041 ns 6089229.5 ns 1.06
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 412334 ns 412184 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s) 6159375 ns 6148208 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s) 6372249.5 ns 3227583 ns 1.97
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s) 6374125 ns 6378333 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s) 11910167 ns 11914959 ns 1.00
batchedmm(512, Bsize=4)/forward/GPU/CUDA 302029 ns 301812 ns 1.00
batchedmm(512, Bsize=4)/forward/GPU/AMDGPU 302953 ns 296489 ns 1.02
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s) 19119687 ns 19106770.5 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s) 19945437.5 ns 11136250 ns 1.79
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s) 20008771 ns 19962416 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s) 36510208.5 ns 36542271 ns 1.00
batchedmm(512, Bsize=4)/zygote/GPU/CUDA 1019652 ns 1158703 ns 0.88
batchedmm(512, Bsize=4)/zygote/GPU/AMDGPU 1173152.5 ns 1169188 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1000 ns 958 ns 1.04
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 958 ns 958 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 959 ns 959 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 959 ns 959 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA 23843 ns 23501 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/Metal 335916 ns 329667 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/AMDGPU 215882 ns 216992 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 3625 ns 3709 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 3666 ns 3666 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 3750 ns 3750 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 3667 ns 3667 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA 300289 ns 297158.5 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/Metal 2148500 ns 2191521 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 644731.5 ns 650431.5 ns 0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 8334 ns 8500 ns 0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 8104 ns 9250.5 ns 0.88
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 9750 ns 9396 ns 1.04
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 8500 ns 8125 ns 1.05
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 137456 ns 136116.5 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal 796375 ns 819208 ns 0.97
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 68311 ns 67611 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 11666 ns 11250 ns 1.04
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 12083 ns 12667 ns 0.95
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 12583 ns 11729 ns 1.07
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 12750 ns 11042 ns 1.15
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 721292 ns 721603 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal 5345750 ns 5441770.5 ns 0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 373344 ns 373594 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s) 292 ns 250 ns 1.17
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s) 291 ns 291 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA 23235 ns 22886 ns 1.02
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/Metal 226791 ns 331291 ns 0.68
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/AMDGPU 51721 ns 51921 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2917 ns 3000 ns 0.97
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 3083 ns 2958 ns 1.04
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 3250 ns 3042 ns 1.07
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2834 ns 2875 ns 0.99
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA 216259.5 ns 213807.5 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/Metal 1692958 ns 1713479.5 ns 0.99
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/AMDGPU 161612 ns 168911.5 ns 0.96
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 11625 ns 11833 ns 0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 11229.5 ns 11896 ns 0.94
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 13250 ns 13000 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 12166 ns 12291 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 139967.5 ns 137978.5 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal 892584 ns 900666.5 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 243863 ns 239817.5 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 21083 ns 23042 ns 0.91
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 20396 ns 22104 ns 0.92
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 26062.5 ns 20458 ns 1.27
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 21604 ns 23917 ns 0.90
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 652418 ns 653027.5 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal 4821708.5 ns 4833270.5 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 672612 ns 673012 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4417 ns 4375 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4375 ns 4375 ns 1
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4416 ns 4375 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4333 ns 4375 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA 24831 ns 24516 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/Metal 223938 ns 231666.5 ns 0.97
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/AMDGPU 52890.5 ns 54410 ns 0.97
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16333.5 ns 16750 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16750 ns 16167 ns 1.04
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16583 ns 16833 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16625 ns 16459 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA 356581 ns 353559.5 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/Metal 1752937.5 ns 1092020.5 ns 1.61
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/AMDGPU 210052 ns 216712 ns 0.97
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 1958 ns 1959 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 1917 ns 1959 ns 0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 2166 ns 2083 ns 1.04
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 2084 ns 2125 ns 0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 36754.5 ns 35968 ns 1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal 299041 ns 444042 ns 0.67
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 208032 ns 208282 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 16958.5 ns 17958.5 ns 0.94
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 19042 ns 16958 ns 1.12
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 17458 ns 17333.5 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 18062.5 ns 17646 ns 1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 307642 ns 305401 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal 5677458.5 ns 5381292 ns 1.06
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 709468 ns 703887 ns 1.01
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s) 59125 ns 59250 ns 1.00
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s) 66208 ns 60625 ns 1.09
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s) 66083.5 ns 64167 ns 1.03
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s) 51334 ns 51291 ns 1.00
batchedmm(16, Bsize=512)/forward/GPU/CUDA 66592 ns 66533 ns 1.00
batchedmm(16, Bsize=512)/forward/GPU/AMDGPU 113701 ns 101811 ns 1.12
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s) 210458 ns 196208 ns 1.07
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s) 143000 ns 139333 ns 1.03
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s) 119583 ns 155270.5 ns 0.77
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s) 307688 ns 285354 ns 1.08
batchedmm(16, Bsize=512)/zygote/GPU/CUDA 234156 ns 231110.5 ns 1.01
batchedmm(16, Bsize=512)/zygote/GPU/AMDGPU 598956 ns 587041 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 123833.5 ns 82771 ns 1.50
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 123125 ns 87959 ns 1.40
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 86500 ns 85959 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82958 ns 81812.5 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 190129 ns 192437.5 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1825667 ns 2001125 ns 0.91
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 188412 ns 172856.5 ns 1.09
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1927375 ns 1915166.5 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1909416.5 ns 1905625 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1906875 ns 1906791.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1931021 ns 1867521 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 578778.5 ns 575411.5 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9303959 ns 9319437.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1081141.5 ns 1079271.5 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s) 291 ns 291 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s) 291 ns 291 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA 22349 ns 21855 ns 1.02
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/Metal 372291 ns 370125 ns 1.01
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/AMDGPU 45590 ns 45340 ns 1.01
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 1792 ns 1792 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 1792 ns 1833 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 1833 ns 1833 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 1792 ns 1792 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA 272164 ns 268250 ns 1.01
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/Metal 1469500 ns 1115417 ns 1.32
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/AMDGPU 187152 ns 183362 ns 1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 9250 ns 8250 ns 1.12
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 8708 ns 11209 ns 0.78
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 11166 ns 9708 ns 1.15
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 10459 ns 9084 ns 1.15
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 134628.5 ns 135375 ns 0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal 897749.5 ns 905833 ns 0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 241763 ns 242893 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 10125 ns 12042 ns 0.84
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8458.5 ns 11125 ns 0.76
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 14375 ns 8833 ns 1.63
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9542 ns 12083 ns 0.79
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 584537 ns 583740 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal 4632562 ns 4734458 ns 0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 645752 ns 652127 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58375 ns 58458 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46625 ns 39584 ns 1.18
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46708 ns 47104.5 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82000 ns 83084 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 40806 ns 39769 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1140854.5 ns 1151625 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 78371 ns 78761 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1934584 ns 1929833 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1981708 ns 1940687 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1989334 ns 1942312.5 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1899750 ns 1910500 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 239556 ns 236370 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11301583 ns 11016708.5 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1030691 ns 1026191 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 422125 ns 417125 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 417583 ns 417396 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 419750 ns 419312.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 416292 ns 416334 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 241184 ns 238661.5 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 546083 ns 553834 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 289943 ns 288843 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 752875.5 ns 709000 ns 1.06
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 755666 ns 734313 ns 1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 675729 ns 671250 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 760021 ns 669791.5 ns 1.13
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1151706 ns 1151563 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6939708 ns 6696083 ns 1.04
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 927380 ns 931160 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 3457437.5 ns 3399479.5 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 3437021 ns 3363750 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 3434709 ns 3425625 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 3439146 ns 3391083.5 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 201324 ns 177139 ns 1.14
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1424084 ns 1423625 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 412665 ns 416864 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 6238000 ns 6186791 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 6200250 ns 6198687.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 6194458 ns 6090875 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 6143770.5 ns 6187875 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1091727.5 ns 1083853 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 8063541.5 ns 8058500 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1569386 ns 1565741.5 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 473666 ns 471667 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 340792 ns 253791 ns 1.34
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 342166 ns 342583 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 905125 ns 902708 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA 46953 ns 46521 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/Metal 496959 ns 448250 ns 1.11
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/AMDGPU 251203 ns 251212 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2275334 ns 2350667 ns 0.97
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 2043625 ns 1761583.5 ns 1.16
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 2032437 ns 2037792 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3282416.5 ns 3284625 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA 283225 ns 258155.5 ns 1.10
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/Metal 2237145.5 ns 2294875 ns 0.97
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 791808 ns 791358 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57833 ns 58292 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 45958 ns 39584 ns 1.16
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46250 ns 46542 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82792 ns 82833 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 28918 ns 27855 ns 1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1145250 ns 1156292 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 78811 ns 77241 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2000229 ns 2035459 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2089833 ns 2077875 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2077250 ns 2072875 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1980437.5 ns 1932083 ns 1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 244212 ns 241361.5 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11407979 ns 11703125 ns 0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1055251 ns 1056652 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58000 ns 58333 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46250 ns 39458 ns 1.17
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46666 ns 46834 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83041 ns 83250 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 50656 ns 49658 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1123000 ns 1110916 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 73121 ns 75300.5 ns 0.97
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1903916 ns 1894292 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1902541 ns 1940666 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1978250 ns 1969937.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1902959 ns 1886875 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 251664 ns 247040 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9794437.5 ns 9839292 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 936124.5 ns 1051031 ns 0.89
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 333 ns 250 ns 1.33
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 292 ns 292 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 416 ns 375 ns 1.11
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 292 ns 292 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 35119.5 ns 34603 ns 1.01
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal 308104.5 ns 433687.5 ns 0.71
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 50550 ns 49160 ns 1.03
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7937.5 ns 6958 ns 1.14
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7625 ns 7250 ns 1.05
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7625 ns 7521 ns 1.01
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8167 ns 7208.5 ns 1.13
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 218323.5 ns 210766 ns 1.04
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal 4836354 ns 5193791.5 ns 0.93
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 381674 ns 378014 ns 1.01
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s) 291 ns 250 ns 1.16
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s) 250 ns 250 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 291 ns 1.00
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s) 250 ns 250 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA 33417 ns 32342 ns 1.03
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/Metal 259375 ns 261521 ns 0.99
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/AMDGPU 43851 ns 39160 ns 1.12
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 2792 ns 2666 ns 1.05
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 2875 ns 2667 ns 1.08
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 2916 ns 2834 ns 1.03
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 2667 ns 2625 ns 1.02
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA 205231.5 ns 202783.5 ns 1.01
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/Metal 1294875 ns 969250 ns 1.34
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/AMDGPU 166746 ns 154716.5 ns 1.08
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 437042 ns 457625 ns 0.96
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 422021 ns 453792 ns 0.93
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 424229 ns 426146 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 425834 ns 456125 ns 0.93
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 142985.5 ns 142160 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 2238375 ns 2271875 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 375684 ns 326853 ns 1.15
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3809770.5 ns 3802938 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3802375 ns 3809708 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 3804250 ns 3801896 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3793125 ns 3792625 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 782254 ns 781504 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11146187.5 ns 11052792 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1312364 ns 1495896 ns 0.88
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s) 49907416.5 ns 49881521 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s) 35559584 ns 26009250 ns 1.37
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s) 35529250 ns 35546334 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s) 96899084 ns 96980062.5 ns 1.00
batchedmm(512, Bsize=32)/forward/GPU/CUDA 1625871 ns 1600432 ns 1.02
batchedmm(512, Bsize=32)/forward/GPU/AMDGPU 1003290 ns 1012971 ns 0.99
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s) 154966354 ns 154537104 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s) 112363000 ns 88927125 ns 1.26
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s) 112555750 ns 112528667 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s) 296527604.5 ns 298524146 ns 0.99
batchedmm(512, Bsize=32)/zygote/GPU/CUDA 6450345 ns 6474447 ns 1.00
batchedmm(512, Bsize=32)/zygote/GPU/AMDGPU 5530212.5 ns 5518798 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s) 19374.5 ns 19062.5 ns 1.02
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s) 18750 ns 15542 ns 1.21
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s) 17353.5 ns 17042 ns 1.02
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s) 15188 ns 16021 ns 0.95
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA 20779 ns 20743 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/Metal 224333 ns 252583 ns 0.89
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/AMDGPU 26660 ns 26040 ns 1.02
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s) 10917 ns 10917 ns 1
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s) 8834 ns 7416 ns 1.19
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s) 9291 ns 9208 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s) 17291 ns 17375 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA 299343 ns 296392 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/Metal 1655375 ns 1636083.5 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/AMDGPU 155331 ns 155431 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 8312.5 ns 8729 ns 0.95
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 8459 ns 9000 ns 0.94
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 10895.5 ns 9229.5 ns 1.18
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 9312.5 ns 8562.5 ns 1.09
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 142637 ns 139671.5 ns 1.02
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal 798083 ns 799833 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 241143 ns 242752 ns 0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10333.5 ns 9312.5 ns 1.11
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9042 ns 9416 ns 0.96
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9583 ns 10167 ns 0.94
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8937.5 ns 9250 ns 0.97
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 705801.5 ns 704800.5 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal 5435917 ns 5428520.5 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 657647 ns 674252.5 ns 0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 9020.5 ns 9333 ns 0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 10229 ns 9709 ns 1.05
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 11250 ns 10625 ns 1.06
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 9792 ns 9250 ns 1.06
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 137059 ns 136210.5 ns 1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal 882166.5 ns 947792 ns 0.93
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 78120 ns 69541 ns 1.12
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 13020.5 ns 13062.5 ns 1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 12583.5 ns 13542 ns 0.93
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13583 ns 13916.5 ns 0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 13458 ns 13000 ns 1.04
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 651470.5 ns 647891 ns 1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal 4779312.5 ns 4788583 ns 1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 356033 ns 349204 ns 1.02
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 459 ns 500 ns 0.92
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 458 ns 459 ns 1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 625 ns 584 ns 1.07
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 459 ns 500 ns 0.92
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 35430 ns 34950 ns 1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal 385417 ns 441000 ns 0.87
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 210072 ns 208662 ns 1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8166 ns 7916 ns 1.03
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8000 ns 8000 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8937.5 ns 8729.5 ns 1.02
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8208 ns 8417 ns 0.98
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 238141 ns 235567 ns 1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal 5550500 ns 5655333.5 ns 0.98
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 670717 ns 664097 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 16417 ns 16375 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 16709 ns 14604.5 ns 1.14
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 15209 ns 14708 ns 1.03
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 10312.5 ns 10459 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA 21707 ns 21454 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/Metal 217458 ns 214750 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/AMDGPU 194532 ns 188482 ns 1.03
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 31854.5 ns 31708 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 32167 ns 31875 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 32250 ns 32146 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 32125 ns 31917 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA 316460 ns 314264 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/Metal 1889916 ns 1721916 ns 1.10
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 608847 ns 610347 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 450417 ns 441229.5 ns 1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 482813 ns 445062.5 ns 1.08
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 444604 ns 447666 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 440875 ns 446000 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 193879 ns 194324 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 2124500 ns 2129687.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 376794 ns 356014 ns 1.06
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3673458 ns 3806062.5 ns 0.97
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3802062.5 ns 3830125 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 3822709 ns 3819020.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3821333 ns 3829625.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 588897 ns 580459 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9577042 ns 10082833.5 ns 0.95
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1393435 ns 1390109 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s) 783185125 ns 833503354 ns 0.94
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s) 542907542 ns 415838000 ns 1.31
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s) 543132625 ns 544434542 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s) 1514951833.5 ns 1561715250 ns 0.97
batchedmm(512, Bsize=512)/forward/GPU/CUDA 22763713 ns 22756243 ns 1.00
batchedmm(512, Bsize=512)/forward/GPU/AMDGPU 14159478.5 ns 14023836 ns 1.01
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s) 2527739209 ns 2997704083 ns 0.84
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s) 1799023667 ns 1512242750 ns 1.19
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s) 1787795417 ns 2248995791 ns 0.79
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s) 4787274417 ns 5261167167 ns 0.91
batchedmm(512, Bsize=512)/zygote/GPU/CUDA 333649192 ns 364718000 ns 0.91
batchedmm(512, Bsize=512)/zygote/GPU/AMDGPU 88087394 ns 87342499 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 76666.5 ns 77833 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 79083 ns 76542 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 79375 ns 78708 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 78124.5 ns 76354.5 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 238895.5 ns 235898 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 542209 ns 551041.5 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 111271 ns 109786.5 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 277000 ns 282312.5 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 278895.5 ns 251104 ns 1.11
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 194979 ns 197208 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 259250 ns 192416 ns 1.35
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1134646.5 ns 1133383 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6160709 ns 6595833 ns 0.93
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 645127 ns 643627 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s) 199977437.5 ns 199406375 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s) 139216750 ns 104150500 ns 1.34
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s) 139454459 ns 139302333 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s) 389873250 ns 388728500 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/CUDA 5849131.5 ns 5827807.5 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/AMDGPU 3425810.5 ns 3416565 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s) 621409333 ns 621451500.5 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s) 440537375 ns 353591958 ns 1.25
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s) 440145604 ns 438706083.5 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s) 1186223625 ns 1195242542 ns 0.99
batchedmm(512, Bsize=128)/zygote/GPU/CUDA 26711378 ns 26241215 ns 1.02
batchedmm(512, Bsize=128)/zygote/GPU/AMDGPU 21741902 ns 21717195 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7291 ns 7250 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6084 ns 5292 ns 1.15
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6291 ns 6000 ns 1.05
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10292 ns 10042 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 28202.5 ns 27646 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 601583 ns 620417 ns 0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 48405.5 ns 50410 ns 0.96
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 220749.5 ns 213208 ns 1.04
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 222374.5 ns 221104.5 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 222542 ns 221854 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 217625 ns 216000 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 245623 ns 239232 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8971334 ns 9004750 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 543906 ns 536025 ns 1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 8145.5 ns 8333.5 ns 0.98
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 10083 ns 10250 ns 0.98
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 10833.5 ns 9937.5 ns 1.09
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 10000.5 ns 9416 ns 1.06
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 136003.5 ns 133822.5 ns 1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal 906833 ns 904312 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 72945.5 ns 72841 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7500 ns 7667 ns 0.98
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7209 ns 7917 ns 0.91
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8292 ns 8167 ns 1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7500 ns 7833 ns 0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 587405 ns 581095 ns 1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal 4757959 ns 4731020.5 ns 1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 326203 ns 326163 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 500 ns 459 ns 1.09
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 458 ns 500 ns 0.92
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 542 ns 584 ns 0.93
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 375 ns 500 ns 0.75
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 26999 ns 26581 ns 1.02
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal 493458.5 ns 473959 ns 1.04
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 49231 ns 49351 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9458 ns 10166 ns 0.93
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 10250 ns 10334 ns 0.99
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 10521.5 ns 10416 ns 1.01
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 10125 ns 9584 ns 1.06
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 275766.5 ns 272007 ns 1.01
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal 6076395.5 ns 5995833.5 ns 1.01
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 401444 ns 394569 ns 1.02
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 107104.5 ns 107229.5 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 99896 ns 85749.5 ns 1.16
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 101145.5 ns 99417 ns 1.02
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 146459 ns 146291 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA 24813 ns 24482 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/Metal 277416.5 ns 274937.5 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/AMDGPU 192192 ns 192342 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 479500 ns 478334 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 494084 ns 500041 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 478958 ns 478375 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 528667 ns 478708 ns 1.10
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA 258431 ns 255734 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/Metal 2276458 ns 2286625 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 624467 ns 624721 ns 1.00
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s) 5750.5 ns 4937.5 ns 1.16
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s) 6917 ns 7000 ns 0.99
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s) 6833.5 ns 7792 ns 0.88
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s) 4458 ns 4333 ns 1.03
batchedmm(16, Bsize=32)/forward/GPU/CUDA 18139 ns 16407 ns 1.11
batchedmm(16, Bsize=32)/forward/GPU/AMDGPU 73231 ns 78321 ns 0.94
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s) 11854 ns 11542 ns 1.03
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s) 10500.5 ns 9666.5 ns 1.09
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s) 11104.5 ns 10792 ns 1.03
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s) 17083 ns 16958 ns 1.01
batchedmm(16, Bsize=32)/zygote/GPU/CUDA 235890 ns 233195.5 ns 1.01
batchedmm(16, Bsize=32)/zygote/GPU/AMDGPU 372074 ns 378594 ns 0.98
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s) 38750 ns 39417 ns 0.98
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s) 51292 ns 50250 ns 1.02
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s) 52729.5 ns 51417 ns 1.03
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s) 15834 ns 13833 ns 1.14
batchedmm(16, Bsize=128)/forward/GPU/CUDA 20456 ns 19791.5 ns 1.03
batchedmm(16, Bsize=128)/forward/GPU/AMDGPU 87011 ns 85261 ns 1.02
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s) 36875 ns 51020.5 ns 0.72
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s) 34729 ns 28646.5 ns 1.21
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s) 32167 ns 31146.5 ns 1.03
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s) 57000 ns 64625 ns 0.88
batchedmm(16, Bsize=128)/zygote/GPU/CUDA 212876 ns 208902 ns 1.02
batchedmm(16, Bsize=128)/zygote/GPU/AMDGPU 418835 ns 415884.5 ns 1.01
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s) 1791 ns 1875 ns 0.96
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s) 1708 ns 1667 ns 1.02
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s) 2187.5 ns 2250 ns 0.97
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s) 1875 ns 1750 ns 1.07
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA 20570.5 ns 20332 ns 1.01
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/Metal 329917 ns 324854.5 ns 1.02
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/AMDGPU 31020 ns 28921 ns 1.07
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s) 2209 ns 2083 ns 1.06
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s) 2250 ns 2208 ns 1.02
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s) 2500 ns 2291 ns 1.09
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s) 2208 ns 2250 ns 0.98
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA 226270.5 ns 222981 ns 1.01
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/Metal 1683458.5 ns 1764708 ns 0.95
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/AMDGPU 142136.5 ns 139241 ns 1.02
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5042 ns 4667 ns 1.08
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4500 ns 4750 ns 0.95
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6208.5 ns 5750 ns 1.08
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4666.5 ns 4333 ns 1.08
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 163224.5 ns 161766.5 ns 1.01
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal 800792 ns 453291.5 ns 1.77
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 75611 ns 62650 ns 1.21
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8291 ns 8667 ns 0.96
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8209 ns 7958 ns 1.03
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8583 ns 8250 ns 1.04
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8250 ns 8166 ns 1.01
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 960930 ns 958412 ns 1.00
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal 5752708 ns 5932250.5 ns 0.97
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 398144 ns 385774 ns 1.03
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 56791 ns 57250 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 57459 ns 56916 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 57667 ns 58250 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 58208 ns 58667 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 38436 ns 37674 ns 1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 411813 ns 380459 ns 1.08
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 218852 ns 208842 ns 1.05
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 448812.5 ns 449562.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 499084 ns 466895.5 ns 1.07
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 465709 ns 465833 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 481396 ns 434708 ns 1.11
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 282356.5 ns 276347 ns 1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7964500 ns 8199750 ns 0.97
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 842729 ns 814928 ns 1.03
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s) 3322916 ns 3302500 ns 1.01
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s) 2338771 ns 1770792 ns 1.32
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s) 2339375 ns 2337291.5 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s) 6304166.5 ns 6303499.5 ns 1.00
batchedmm(128, Bsize=128)/forward/GPU/CUDA 204545 ns 204292.5 ns 1.00
batchedmm(128, Bsize=128)/forward/GPU/AMDGPU 202912 ns 203467.5 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s) 11552375 ns 11464458 ns 1.01
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s) 8313541.5 ns 6552083 ns 1.27
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s) 8336875 ns 8324666.5 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s) 21101437.5 ns 21058833.5 ns 1.00
batchedmm(128, Bsize=128)/zygote/GPU/CUDA 734673 ns 741274.5 ns 0.99
batchedmm(128, Bsize=128)/zygote/GPU/AMDGPU 1078791.5 ns 1081561 ns 1.00
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 6166 ns 4583 ns 1.35
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 4916.5 ns 5667 ns 0.87
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6541 ns 6104 ns 1.07
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 4875 ns 4854.5 ns 1.00
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 158133 ns 156234.5 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal 887167 ns 827584 ns 1.07
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 57035.5 ns 58490 ns 0.98
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7166 ns 7750 ns 0.92
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7209 ns 7042 ns 1.02
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7292 ns 7416 ns 0.98
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7083 ns 6959 ns 1.02
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 816855 ns 812002.5 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal 6166979.5 ns 5657917 ns 1.09
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 384744.5 ns 382614 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 123458 ns 95791 ns 1.29
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 131229 ns 98000 ns 1.34
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 100000 ns 125333 ns 0.80
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 94625 ns 98604 ns 0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 160516.5 ns 158376.5 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 2207458 ns 2249500 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 187112 ns 189172 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1964000 ns 2001625 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2023146 ns 1968041.5 ns 1.03
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2028667 ns 2021312.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2018916.5 ns 2030708.5 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 789517 ns 779642 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11417250 ns 11090459 ns 1.03
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1260093 ns 1124561.5 ns 1.12
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s) 33813 ns 34541.5 ns 0.98
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s) 36729 ns 35875 ns 1.02
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s) 34708.5 ns 33958 ns 1.02
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s) 667 ns 625 ns 1.07
batchedmm(2, Bsize=4)/forward/GPU/CUDA 15818 ns 15484 ns 1.02
batchedmm(2, Bsize=4)/forward/GPU/AMDGPU 82161 ns 80681 ns 1.02
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s) 2583 ns 2667 ns 0.97
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s) 2709 ns 2791 ns 0.97
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s) 2959 ns 3000 ns 0.99
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s) 2125 ns 2250 ns 0.94
batchedmm(2, Bsize=4)/zygote/GPU/CUDA 152979.5 ns 148962 ns 1.03
batchedmm(2, Bsize=4)/zygote/GPU/AMDGPU 352884 ns 353673 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7250 ns 7291 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6042 ns 5292 ns 1.14
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6125 ns 6084 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 9958 ns 10125 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 37656 ns 36617 ns 1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 431042 ns 574854 ns 0.75
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 49591 ns 49650 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 214000 ns 213333.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 232937.5 ns 220792 ns 1.06
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221834 ns 221208.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 232000 ns 214813 ns 1.08
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 258714 ns 253557.5 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7857271 ns 7960167 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 526085 ns 522265 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3958 ns 3917 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3917 ns 3917 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3958 ns 3917 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3958 ns 3917 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA 22767 ns 22029 ns 1.03
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/Metal 244500 ns 250333 ns 0.98
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/AMDGPU 47941 ns 45980 ns 1.04
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 14667 ns 14958 ns 0.98
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 15000 ns 14625 ns 1.03
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 14959 ns 14958 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 14959 ns 14875 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA 344878 ns 339194 ns 1.02
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/Metal 1074437.5 ns 1025166 ns 1.05
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/AMDGPU 201792 ns 196272 ns 1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 120021 ns 103041 ns 1.16
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 98958.5 ns 125083 ns 0.79
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 104666.5 ns 132667 ns 0.79
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 144250 ns 100249.5 ns 1.44
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 160419 ns 160077 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 2228291 ns 2853125 ns 0.78
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 170682 ns 205222 ns 0.83
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1891375 ns 1923874.5 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1833541.5 ns 1935583 ns 0.95
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1894375 ns 1923375 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1924667 ns 1927062.5 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 772105.5 ns 765025 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10866208 ns 10829375 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1240333 ns 1233282 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 20250 ns 18791 ns 1.08
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 18937.5 ns 18729.5 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 20542 ns 20145.5 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 20208 ns 19646 ns 1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 127944 ns 123582.5 ns 1.04
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1385750 ns 1393500 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 82111 ns 76250 ns 1.08
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 216708 ns 215917 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 255583 ns 216688 ns 1.18
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 218146 ns 219083 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 217458 ns 216250 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 580859 ns 569648 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6240292 ns 6226521 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 484605 ns 496345 ns 0.98
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s) 25687 ns 25312.5 ns 1.01
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s) 31687.5 ns 28312.5 ns 1.12
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s) 29145.5 ns 29041 ns 1.00
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s) 1541 ns 1458 ns 1.06
batchedmm(16, Bsize=4)/forward/GPU/CUDA 17059 ns 16184 ns 1.05
batchedmm(16, Bsize=4)/forward/GPU/AMDGPU 83471 ns 88291 ns 0.95
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s) 4896 ns 4875 ns 1.00
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s) 4687.5 ns 4895.5 ns 0.96
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s) 5208 ns 5437.5 ns 0.96
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s) 4708 ns 4875 ns 0.97
batchedmm(16, Bsize=4)/zygote/GPU/CUDA 231729 ns 227416 ns 1.02
batchedmm(16, Bsize=4)/zygote/GPU/AMDGPU 400815 ns 387604 ns 1.03
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 304916 ns 305625 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 307083.5 ns 305812.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 310250 ns 309146 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 307458 ns 307792 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 260954.5 ns 259343.5 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1003667 ns 655771 ns 1.53
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 282392 ns 277977.5 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 530417 ns 532041 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 536417 ns 530083 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 533416.5 ns 538458 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 540917 ns 533250 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1194615.5 ns 1187558.5 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6650583.5 ns 6496375 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 886938 ns 870989 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 19292 ns 19792 ns 0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 20437.5 ns 21104 ns 0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 21583 ns 22312.5 ns 0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 19250 ns 20542 ns 0.94
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 134679 ns 131573 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 1513292 ns 1498125 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 76825.5 ns 75971 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 215083 ns 214917 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 212625 ns 213083 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 215021 ns 213958 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 249312.5 ns 212500 ns 1.17
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 889532 ns 880154.5 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7210062.5 ns 7325541 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 554056 ns 546485 ns 1.01
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 6583 ns 6334 ns 1.04
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6937.5 ns 7458 ns 0.93
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 9208 ns 8083 ns 1.14
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6792 ns 6583 ns 1.03
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 160487 ns 157693 ns 1.02
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal 869792 ns 839500 ns 1.04
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 69890 ns 69580 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10000 ns 11041 ns 0.91
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9854.5 ns 9917 ns 0.99
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 10292 ns 10729 ns 0.96
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10375 ns 10209 ns 1.02
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 896806 ns 890019.5 ns 1.01
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal 5937375 ns 5554084 ns 1.07
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 398234 ns 391634 ns 1.02
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4125 ns 4667 ns 0.88
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5542 ns 5124.5 ns 1.08
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6750 ns 5833 ns 1.16
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 4750 ns 6458 ns 0.74
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 162556 ns 161059 ns 1.01
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal 844750 ns 822917 ns 1.03
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 62561 ns 62251 ns 1.00
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7500 ns 7500 ns 1
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7250 ns 7333 ns 0.99
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7667 ns 7708 ns 0.99
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7166 ns 7166 ns 1
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 844691 ns 835582 ns 1.01
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal 5794250.5 ns 5986084 ns 0.97
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 401898.5 ns 405494.5 ns 0.99
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s) 14528708 ns 14490500 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s) 10144083 ns 7719208 ns 1.31
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s) 10119791 ns 10131041 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s) 27783209 ns 27827208 ns 1.00
batchedmm(128, Bsize=512)/forward/GPU/CUDA 561716 ns 529747 ns 1.06
batchedmm(128, Bsize=512)/forward/GPU/AMDGPU 405538.5 ns 389754 ns 1.04
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s) 46624812 ns 46259291.5 ns 1.01
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s) 33411666.5 ns 26496000 ns 1.26
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s) 33562500 ns 33451708 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s) 85401583 ns 85583541 ns 1.00
batchedmm(128, Bsize=512)/zygote/GPU/CUDA 2800168 ns 2650995 ns 1.06
batchedmm(128, Bsize=512)/zygote/GPU/AMDGPU 3289235 ns 3276734 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 66500 ns 68250 ns 0.97
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 68375 ns 68104.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 68875 ns 69312 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 67250 ns 66125 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 138855.5 ns 134037 ns 1.04
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 1526666.5 ns 1521625 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 238492 ns 232902 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 444500.5 ns 449854 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 442146 ns 440625 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 441583 ns 442209 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 493750 ns 440396 ns 1.12
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 807637.5 ns 796931.5 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7704542 ns 7473000 ns 1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 803267.5 ns 813753.5 ns 0.99
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 542 ns 625 ns 0.87
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 625 ns 542 ns 1.15
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 666 ns 625 ns 1.07
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 500 ns 500 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 33435 ns 31856 ns 1.05
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal 422917 ns 476979 ns 0.89
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 52200 ns 51801 ns 1.01
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9458.5 ns 10937.5 ns 0.86
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 10771 ns 9542 ns 1.13
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 10416.5 ns 10312.5 ns 1.01
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9666.5 ns 10895.5 ns 0.89
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 303460.5 ns 298325 ns 1.02
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal 5666958.5 ns 5492937.5 ns 1.03
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 397994 ns 381794 ns 1.04
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 9875 ns 9833 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 9916 ns 9834 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 9792 ns 9834 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 9792 ns 9792 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA 24011 ns 23467 ns 1.02
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/Metal 225541 ns 227250 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/AMDGPU 218962 ns 218063 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 46000 ns 46250 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 46167 ns 45750 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 46416 ns 46625 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 46334 ns 46625 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA 315869 ns 308147 ns 1.03
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/Metal 1098270.5 ns 981645.5 ns 1.12
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 628475.5 ns 625806 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 56250 ns 56500 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 57208 ns 56333 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 57167 ns 57292 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 57833 ns 57958 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 29662 ns 28681 ns 1.03
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 616041 ns 679666.5 ns 0.91
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 218842 ns 206472 ns 1.06
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 450333 ns 454625 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 473958 ns 465208 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 468792 ns 467459 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 442709 ns 435834 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 260564.5 ns 255741 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 9323750 ns 9276416.5 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 849638 ns 857403.5 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 607437.5 ns 647417 ns 0.94
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 677167 ns 646792 ns 1.05
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 619062.5 ns 649354.5 ns 0.95
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 645083.5 ns 663709 ns 0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 227369 ns 225589 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1393791.5 ns 1395125 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 251853 ns 235913 ns 1.07
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2229542 ns 2227104.5 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2242667 ns 2251250 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2238417 ns 2225292 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2233500 ns 2242250 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1055691 ns 1068301.5 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 7106083 ns 7711771 ns 0.92
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1380353 ns 1379184 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 20396 ns 22916 ns 0.89
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 20625 ns 20146 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 21333.5 ns 21833 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 23708 ns 20709 ns 1.14
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 128483 ns 127032 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 1530250 ns 1515770.5 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 82281 ns 84371 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 219229.5 ns 253853.5 ns 0.86
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 223875 ns 220458 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221083 ns 221000 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 219917 ns 219020.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 851484 ns 840768 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7710292 ns 7691791.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 562290 ns 560576 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 500 ns 584 ns 0.86
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 584 ns 500 ns 1.17
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 625 ns 625 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 500 ns 500 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 23568 ns 22755 ns 1.04
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal 453729.5 ns 466021 ns 0.97
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 50170 ns 50411 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 10937.5 ns 11229.5 ns 0.97
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 10479 ns 10542 ns 0.99
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 11166 ns 10771 ns 1.04
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 10208 ns 10312 ns 0.99
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 278881.5 ns 277858 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal 6153209 ns 6009125 ns 1.02
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 418644 ns 412724 ns 1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 10416.5 ns 9209 ns 1.13
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 9500 ns 10125 ns 0.94
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 11000 ns 9750 ns 1.13
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 8958 ns 8917 ns 1.00
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 137213 ns 135766 ns 1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal 886500 ns 904792 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 74561 ns 67721 ns 1.10
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7458 ns 7750 ns 0.96
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7750 ns 7666 ns 1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8208 ns 8312.5 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7541 ns 7500 ns 1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 553485 ns 551973.5 ns 1.00
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal 4191417 ns 4446687.5 ns 0.94
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 340023 ns 336393 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1791.5 ns 1437.5 ns 1.25
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1625 ns 1500 ns 1.08
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 2083 ns 2000.5 ns 1.04
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1583 ns 1354.5 ns 1.17
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA 21340 ns 21147 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/Metal 310625 ns 311875 ns 1.00
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/AMDGPU 192401.5 ns 190276.5 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 3292 ns 3333 ns 0.99
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 3375 ns 3333 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 3583 ns 3458 ns 1.04
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 3375 ns 3333.5 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA 244685 ns 241366 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/Metal 1830688 ns 1889917 ns 0.97
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 598576 ns 597216 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s) 148667 ns 148042 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s) 128833 ns 106084 ns 1.21
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s) 129604 ns 128375.5 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s) 225042 ns 225104 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA 24647 ns 24502.5 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/Metal 278416 ns 306333 ns 0.91
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/AMDGPU 37400 ns 36970 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s) 143709 ns 174999.5 ns 0.82
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s) 124625 ns 87125 ns 1.43
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s) 110395.5 ns 110792 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s) 287812.5 ns 250729 ns 1.15
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA 242298 ns 240885.5 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/Metal 2059479 ns 2110083.5 ns 0.98
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/AMDGPU 238587 ns 226383 ns 1.05
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7125 ns 7250 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6000 ns 5292 ns 1.13
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6000 ns 6084 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10062.5 ns 10083 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 33200 ns 32889 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 358750 ns 369062.5 ns 0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 52880 ns 51151 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 220291 ns 223583 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 231500 ns 228584 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 229125 ns 228917 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 245229.5 ns 213604 ns 1.15
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 272719 ns 270279 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8345291.5 ns 8277437.5 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 536095 ns 534116 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 15417 ns 14833 ns 1.04
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 15167 ns 15125 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 17042 ns 16500 ns 1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 15375 ns 15917 ns 0.97
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 158597.5 ns 157359.5 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal 852042 ns 824458 ns 1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 242502 ns 240222 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 23937.5 ns 23687 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 24666 ns 23500 ns 1.05
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 23874.5 ns 23854 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 23291.5 ns 23292 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 931616 ns 926538.5 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal 5615896 ns 5882625 ns 0.95
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 698756 ns 690662 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 9958 ns 9812.5 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 10166 ns 9542 ns 1.07
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 12333 ns 10583 ns 1.17
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 9083 ns 10167 ns 0.89
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 141537 ns 140467 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal 805292 ns 821479 ns 0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 77251 ns 71471 ns 1.08
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14083 ns 13917 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14250 ns 13166 ns 1.08
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14208.5 ns 14458 ns 0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 13375 ns 13583 ns 0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 768706 ns 766881 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal 5278042 ns 5288584 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 378343 ns 372183.5 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 10104.5 ns 9459 ns 1.07
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 10000 ns 9542 ns 1.05
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 11354 ns 10958 ns 1.04
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 9791.5 ns 10333 ns 0.95
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 139922.5 ns 138627.5 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal 897959 ns 927624.5 ns 0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 77161 ns 72865.5 ns 1.06
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12333 ns 12583 ns 0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 12709 ns 12646.5 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13000 ns 13083.5 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12833.5 ns 11937.5 ns 1.08
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 626138 ns 624799.5 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal 4505687.5 ns 4551375 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 350573 ns 348243 ns 1.01
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s) 27729 ns 31083.5 ns 0.89
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s) 35375 ns 32937.5 ns 1.07
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s) 32291 ns 31583 ns 1.02
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s) 2041 ns 2042 ns 1.00
batchedmm(2, Bsize=128)/forward/GPU/CUDA 16815 ns 16203 ns 1.04
batchedmm(2, Bsize=128)/forward/GPU/AMDGPU 83101 ns 73550 ns 1.13
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s) 5291.5 ns 5229.5 ns 1.01
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s) 5146 ns 5063 ns 1.02
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s) 5209 ns 5562.5 ns 0.94
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s) 6229.5 ns 6416 ns 0.97
batchedmm(2, Bsize=128)/zygote/GPU/CUDA 151130 ns 148737.5 ns 1.02
batchedmm(2, Bsize=128)/zygote/GPU/AMDGPU 372413 ns 374559 ns 0.99
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 250 ns 292 ns 0.86
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 292 ns 291 ns 1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 250 ns 292 ns 0.86
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 26290 ns 26129 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal 357500 ns 467478.5 ns 0.76
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 48805.5 ns 48501 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 7354 ns 7209 ns 1.02
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 7250 ns 7042 ns 1.03
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8041 ns 7708 ns 1.04
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6979 ns 7458 ns 0.94
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 200306 ns 198167 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal 6097521 ns 6016959 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 397569 ns 396144 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 1958 ns 2042 ns 0.96
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 2041 ns 1917 ns 1.06
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 2084 ns 2084 ns 1
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 2000 ns 1959 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 27273 ns 26961 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal 493416.5 ns 473229.5 ns 1.04
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 209702 ns 211192 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 17541 ns 17312.5 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 18166.5 ns 16979.5 ns 1.07
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 17667 ns 17958 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 17666.5 ns 17291.5 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 285604 ns 284214 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal 6161167 ns 5834000 ns 1.06
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 724677 ns 717577 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 174417 ns 188417 ns 0.93
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 167583.5 ns 169438 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 151417 ns 149396 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 145583 ns 175916 ns 0.83
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 225867 ns 221937 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1429395.5 ns 1550833 ns 0.92
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 227572 ns 199412 ns 1.14
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1321729 ns 1315271 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1323417 ns 1324083 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1328313 ns 1325000 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1325750 ns 1331833.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1001329 ns 998483 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 6753917 ns 6733584 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1011639.5 ns 1130086 ns 0.90
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 24896.5 ns 27020.5 ns 0.92
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 25250 ns 24792 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 28000 ns 26416 ns 1.06
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 25542 ns 24687.5 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 271026.5 ns 268327.5 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 986750 ns 621687.5 ns 1.59
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 119521 ns 117991 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 117833 ns 131333 ns 0.90
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 120083 ns 116958 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 118375 ns 125645.5 ns 0.94
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 176875 ns 127375 ns 1.39
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1213900 ns 1214493 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6376312.5 ns 6553167 ns 0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 614965 ns 601326 ns 1.02
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 291 ns 375 ns 0.78
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 250 ns 1.50
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 334 ns 1.12
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 292 ns 292 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 23468 ns 22301 ns 1.05
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal 446666 ns 447500 ns 1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 49170 ns 51730.5 ns 0.95
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 7562.5 ns 7541.5 ns 1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 7584 ns 7167 ns 1.06
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8000 ns 7792 ns 1.03
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6875 ns 7416 ns 0.93
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 206004 ns 204142.5 ns 1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal 5961166 ns 5695875 ns 1.05
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 407824 ns 401495 ns 1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5812 ns 5896 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5937.5 ns 5708 ns 1.04
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7333 ns 6667 ns 1.10
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 6854.5 ns 6208 ns 1.10
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 167575 ns 167740 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal 672646 ns 488083 ns 1.38
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 240143 ns 238573 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9875 ns 10083.5 ns 0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9834 ns 9709 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10125 ns 9958.5 ns 1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9854 ns 9708 ns 1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 978859.5 ns 976109 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal 5692125.5 ns 6285500 ns 0.91
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 683826 ns 679397 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 708 ns 625 ns 1.13
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 667 ns 666 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 666 ns 667 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 625 ns 666 ns 0.94
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA 23025 ns 22844 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/Metal 214354 ns 335708 ns 0.64
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/AMDGPU 216952 ns 216152.5 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 4667 ns 4583 ns 1.02
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4667 ns 4542 ns 1.03
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 4958 ns 4792 ns 1.03
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4542 ns 4584 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA 242032 ns 237762 ns 1.02
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/Metal 1648667 ns 1793708 ns 0.92
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 606251 ns 600666.5 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 8750 ns 9542 ns 0.92
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 8500 ns 8375 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 9917 ns 9542 ns 1.04
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 8375 ns 8375 ns 1
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 139395 ns 138258.5 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal 800687.5 ns 834417 ns 0.96
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 77821 ns 69561 ns 1.12
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8625 ns 8541 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8625 ns 8166 ns 1.06
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8979.5 ns 9083.5 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8479 ns 8209 ns 1.03
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 674531.5 ns 673050 ns 1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal 4665667 ns 5316625 ns 0.88
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 358963 ns 354703 ns 1.01
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s) 126000 ns 125917 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s) 130375 ns 96125 ns 1.36
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s) 129416 ns 130167 ns 0.99
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s) 183687.5 ns 183437 ns 1.00
batchedmm(128, Bsize=4)/forward/GPU/CUDA 46315 ns 45933 ns 1.01
batchedmm(128, Bsize=4)/forward/GPU/AMDGPU 97061 ns 98581 ns 0.98
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s) 332208 ns 339916 ns 0.98
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s) 323917 ns 166583 ns 1.94
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s) 315709 ns 348854.5 ns 0.90
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s) 569000 ns 574020.5 ns 0.99
batchedmm(128, Bsize=4)/zygote/GPU/CUDA 209770 ns 207728 ns 1.01
batchedmm(128, Bsize=4)/zygote/GPU/AMDGPU 517105 ns 495960 ns 1.04
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s) 397958 ns 397708 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s) 288166 ns 215083 ns 1.34
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s) 288250 ns 288291 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s) 756041.5 ns 756250 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA 44247 ns 43863 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/Metal 421167 ns 508833 ns 0.83
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/AMDGPU 84151 ns 84981 ns 0.99
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1380646 ns 1459874.5 ns 0.95
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 1132937.5 ns 862000 ns 1.31
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 1131583.5 ns 1134791.5 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2441875 ns 2443958 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA 276054.5 ns 264585.5 ns 1.04
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/Metal 1744958 ns 1843542 ns 0.95
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/AMDGPU 354794 ns 355253 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 655000 ns 614666 ns 1.07
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 645458 ns 586000 ns 1.10
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 606125 ns 645874.5 ns 0.94
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 651333 ns 657000 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 211637.5 ns 222791 ns 0.95
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1332417 ns 1392125 ns 0.96
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 234477 ns 247582 ns 0.95
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2442417 ns 2443375 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2443729 ns 2464833.5 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2460479.5 ns 2434958 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2466125 ns 2451958 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1084419 ns 1084693 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9616354 ns 9656375 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1491474 ns 1475249.5 ns 1.01
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s) 32917 ns 33979 ns 0.97
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s) 35833 ns 35146 ns 1.02
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s) 35333 ns 34541.5 ns 1.02
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s) 958 ns 958 ns 1
batchedmm(2, Bsize=32)/forward/GPU/CUDA 16181 ns 15785 ns 1.03
batchedmm(2, Bsize=32)/forward/GPU/AMDGPU 81781 ns 72911 ns 1.12
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s) 3083 ns 3166 ns 0.97
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s) 3166 ns 3208 ns 0.99
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s) 3417 ns 3459 ns 0.99
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s) 3042 ns 3166.5 ns 0.96
batchedmm(2, Bsize=32)/zygote/GPU/CUDA 149907.5 ns 147758 ns 1.01
batchedmm(2, Bsize=32)/zygote/GPU/AMDGPU 345503 ns 345553 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 406833.5 ns 406875 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 408833 ns 401958 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 409208.5 ns 409250 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 420333 ns 421375 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 44137 ns 43841 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1179333.5 ns 1170812 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 242582 ns 242582.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3874541 ns 3882208 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3981625 ns 3924041.5 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 3995271 ns 3998375 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3778020.5 ns 3776500 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 254416 ns 250561 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 12000083 ns 11700333.5 ns 1.03
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1240627 ns 1246592 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3917 ns 3917 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3958 ns 3917 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3958 ns 3958 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3875 ns 3917 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA 35129 ns 34574 ns 1.02
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/Metal 181625 ns 264250 ns 0.69
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/AMDGPU 42720 ns 40720 ns 1.05
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 15500 ns 15750 ns 0.98
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 15708 ns 15542 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 16084 ns 15917 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 15834 ns 15792 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA 276415 ns 273311 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/Metal 889271 ns 885792 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/AMDGPU 176511 ns 167912 ns 1.05
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s) 404209 ns 404125 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s) 295395.5 ns 220833 ns 1.34
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s) 295625 ns 295250 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s) 760584 ns 760375 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA 113822.5 ns 113355 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/Metal 409229 ns 483500 ns 0.85
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/AMDGPU 92275.5 ns 90391 ns 1.02
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1418500 ns 1480125 ns 0.96
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 1143416 ns 886750 ns 1.29
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 1157042 ns 1160937.5 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2464062 ns 2466312.5 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA 252054 ns 264186 ns 0.95
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/Metal 1932667 ns 1873812.5 ns 1.03
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/AMDGPU 360264 ns 357734 ns 1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 458 ns 584 ns 0.78
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 542 ns 458 ns 1.18
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 583 ns 583 ns 1
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 500 ns 541 ns 0.92
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 26614 ns 26163 ns 1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal 362145.5 ns 465459 ns 0.78
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 209492 ns 210412 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8375 ns 8604.5 ns 0.97
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8542 ns 8167 ns 1.05
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9125 ns 9125 ns 1
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8208 ns 8750 ns 0.94
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 219325 ns 212800 ns 1.03
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal 6248208.5 ns 5710375 ns 1.09
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 707647 ns 711177 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s) 835021 ns 833479.5 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s) 618583 ns 471667 ns 1.31
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s) 620791 ns 618333 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s) 1547209 ns 1549979.5 ns 1.00
batchedmm(128, Bsize=32)/forward/GPU/CUDA 131693 ns 129908.5 ns 1.01
batchedmm(128, Bsize=32)/forward/GPU/AMDGPU 167721 ns 169932 ns 0.99
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s) 2699249.5 ns 2690812.5 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s) 2010542 ns 1528250 ns 1.32
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s) 2008750 ns 2007542 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s) 4923458 ns 4933833.5 ns 1.00
batchedmm(128, Bsize=32)/zygote/GPU/CUDA 254591.5 ns 255516 ns 1.00
batchedmm(128, Bsize=32)/zygote/GPU/AMDGPU 880209 ns 874763.5 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 333 ns 375 ns 0.89
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 375 ns 250 ns 1.50
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 291 ns 292 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 32661 ns 31620 ns 1.03
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal 283208 ns 434791 ns 0.65
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 49561 ns 49800 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7417 ns 7646 ns 0.97
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7417 ns 7084 ns 1.05
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7958 ns 7875 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7083 ns 7333 ns 0.97
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 230552 ns 227155.5 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal 5450104 ns 4969834 ns 1.10
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 374993 ns 366063.5 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 2388875 ns 2419959 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 2390042 ns 2370750 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 2387625 ns 2383667 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 2385041 ns 2405250 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 222782 ns 221771 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1608854 ns 1606125 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 336514 ns 359644 ns 0.94
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4653250 ns 4630917 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4641333 ns 4535583 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4667333 ns 4657333 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4656333 ns 4651709 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 986732 ns 989560.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 6571104 ns 6807396 ns 0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1423514 ns 1409064 ns 1.01
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s) 6875 ns 15188 ns 0.45
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s) 7396 ns 6875 ns 1.08
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s) 7583 ns 7459 ns 1.02
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s) 6958.5 ns 9416.5 ns 0.74
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA 24376 ns 24119 ns 1.01
bias_activation(512, act=relu)(512 x 128)/forward/GPU/Metal 275584 ns 280270.5 ns 0.98
bias_activation(512, act=relu)(512 x 128)/forward/GPU/AMDGPU 34810 ns 34491 ns 1.01
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 33521 ns 67062.5 ns 0.50
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 33500 ns 45729.5 ns 0.73
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 33583 ns 47833 ns 0.70
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 32667 ns 48416 ns 0.67
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA 243530.5 ns 241118 ns 1.01
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/Metal 2038145.5 ns 2256625 ns 0.90
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/AMDGPU 242918 ns 244442 ns 0.99
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s) 21625 ns 22000 ns 0.98
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s) 26250 ns 24167 ns 1.09
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s) 25209 ns 24291.5 ns 1.04
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s) 5167 ns 5333.5 ns 0.97
batchedmm(2, Bsize=512)/forward/GPU/CUDA 18282 ns 17742 ns 1.03
batchedmm(2, Bsize=512)/forward/GPU/AMDGPU 86261 ns 91171 ns 0.95
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s) 11875 ns 12250 ns 0.97
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s) 10417 ns 9229 ns 1.13
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s) 10833 ns 10708.5 ns 1.01
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s) 17792 ns 17979.5 ns 0.99
batchedmm(2, Bsize=512)/zygote/GPU/CUDA 249417 ns 247367 ns 1.01
batchedmm(2, Bsize=512)/zygote/GPU/AMDGPU 378534 ns 394269 ns 0.96
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s) 406250 ns 405958 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s) 297375 ns 223625 ns 1.33
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s) 296750 ns 296750 ns 1
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s) 762958 ns 762959 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA 47260 ns 46786 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/Metal 509104 ns 437125 ns 1.16
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/AMDGPU 89561 ns 92421 ns 0.97
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1445500 ns 1485583 ns 0.97
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 1166562.5 ns 892146 ns 1.31
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 1168167 ns 1165042 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2472542 ns 2472709 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA 314496 ns 308920 ns 1.02
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/Metal 2114437.5 ns 2073458 ns 1.02
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/AMDGPU 384754 ns 380064 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 434833.5 ns 435750 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 436583 ns 430312.5 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 436750 ns 438875 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 447625 ns 448792 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 55692 ns 54925.5 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1118708.5 ns 1149375 ns 0.97
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 238023 ns 238282 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3881625 ns 3884333 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4013979 ns 3995458.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4029083 ns 4027188 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3805271 ns 3806541.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 274092 ns 270795 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10308938 ns 10301625 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1240392 ns 1244507 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 8792 ns 8750 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 7666 ns 6875 ns 1.12
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 7709 ns 7708 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 12417 ns 12458 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA 24383 ns 24004 ns 1.02
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/Metal 228000 ns 231583.5 ns 0.98
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/AMDGPU 220382 ns 219512 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 44708 ns 45125 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 45250 ns 44791 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 45208 ns 45166 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 45209 ns 45542 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA 367981 ns 364741 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/Metal 1846667 ns 1791396 ns 1.03
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 666456 ns 666126 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 83167 ns 85666 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 83416 ns 82854.5 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 83917 ns 90541 ns 0.93
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 94583 ns 123042 ns 0.77
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 190250 ns 190268 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 2072000 ns 2136500 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 172412 ns 206862 ns 0.83
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1982729 ns 1990916 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2023063 ns 1994062.5 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2022000 ns 2022062.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2016958 ns 2019666 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 583620.5 ns 579448 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9865250 ns 9777083 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1098935.5 ns 1101570 ns 1.00

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.