From 7107ca52388ff2afac36708b2719133ca37bd25d Mon Sep 17 00:00:00 2001 From: github-action-benchmark Date: Sat, 31 Aug 2024 01:29:19 +0000 Subject: [PATCH] add LuxLib Benchmarks (julia) benchmark result for ef784ed12d473269f3fc4a4d3470d0f447b66ea7 --- benchmarks/data.js | 10682 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 10681 insertions(+), 1 deletion(-) diff --git a/benchmarks/data.js b/benchmarks/data.js index a2a4b237..b8f18838 100644 --- a/benchmarks/data.js +++ b/benchmarks/data.js @@ -1,5 +1,5 @@ window.BENCHMARK_DATA = { - "lastUpdate": 1725044872028, + "lastUpdate": 1725067759218, "repoUrl": "https://github.com/LuxDL/LuxLib.jl", "entries": { "LuxLib Benchmarks": [ @@ -177998,6 +177998,10686 @@ window.BENCHMARK_DATA = { "extra": "gctime=0\nmemory=78408\nallocs=1715\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" } ] + }, + { + "commit": { + "author": { + "email": "avikpal@mit.edu", + "name": "Avik Pal", + "username": "avik-pal" + }, + "committer": { + "email": "avik.pal.2017@gmail.com", + "name": "Avik Pal", + "username": "avik-pal" + }, + "distinct": true, + "id": "ef784ed12d473269f3fc4a4d3470d0f447b66ea7", + "message": "fix!: change the default layernorm dims", + "timestamp": "2024-08-30T17:45:25-04:00", + "tree_id": "844c36d6d5e22c34fe9e13ebfe6e693a6e4520c3", + "url": "https://github.com/LuxDL/LuxLib.jl/commit/ef784ed12d473269f3fc4a4d3470d0f447b66ea7" + }, + "date": 1725067758452, + "tool": "julia", + "benches": [ + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)", + "value": 7333, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)", + "value": 5917, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)", + "value": 8208, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)", + "value": 5708, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA", + "value": 116143, + "unit": "ns", + "extra": "gctime=0\nmemory=11984\nallocs=467\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI", + "value": 2715895, + "unit": "ns", + "extra": "gctime=0\nmemory=175480\nallocs=1669\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal", + "value": 831083, + "unit": "ns", + "extra": "gctime=0\nmemory=40528\nallocs=1247\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU", + "value": 416844, + "unit": "ns", + "extra": "gctime=0\nmemory=17664\nallocs=533\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 10083.5, + "unit": "ns", + "extra": "gctime=0\nmemory=15936\nallocs=102\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 9937.5, + "unit": "ns", + "extra": "gctime=0\nmemory=15936\nallocs=102\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 9958, + "unit": "ns", + "extra": "gctime=0\nmemory=15936\nallocs=102\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 10042, + "unit": "ns", + "extra": "gctime=0\nmemory=15936\nallocs=102\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA", + "value": 581893, + "unit": "ns", + "extra": "gctime=0\nmemory=74720\nallocs=2886\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI", + "value": 19527380, + "unit": "ns", + "extra": "gctime=0\nmemory=1106456\nallocs=9943\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal", + "value": 2362500, + "unit": "ns", + "extra": "gctime=0\nmemory=234000\nallocs=7640\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU", + "value": 717727, + "unit": "ns", + "extra": "gctime=0\nmemory=55216\nallocs=1183\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s)", + "value": 2854.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s)", + "value": 1584, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s)", + "value": 1875, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s)", + "value": 1896, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA", + "value": 21539, + "unit": "ns", + "extra": "gctime=0\nmemory=2384\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/forward/GPU/oneAPI", + "value": 1410479, + "unit": "ns", + "extra": "gctime=0\nmemory=34072\nallocs=305\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/forward/GPU/Metal", + "value": 205292, + "unit": "ns", + "extra": "gctime=0\nmemory=7160\nallocs=233\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/forward/GPU/AMDGPU", + "value": 31470, + "unit": "ns", + "extra": "gctime=0\nmemory=1728\nallocs=60\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s)", + "value": 4125, + "unit": "ns", + "extra": "gctime=0\nmemory=50496\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s)", + "value": 4250, + "unit": "ns", + "extra": "gctime=0\nmemory=50496\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s)", + "value": 4208, + "unit": "ns", + "extra": "gctime=0\nmemory=50496\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s)", + "value": 4458, + "unit": "ns", + "extra": "gctime=0\nmemory=50496\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA", + "value": 142465.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16840\nallocs=631\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/zygote/GPU/oneAPI", + "value": 8552824, + "unit": "ns", + "extra": "gctime=0\nmemory=279320\nallocs=2717\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/zygote/GPU/Metal", + "value": 1549833, + "unit": "ns", + "extra": "gctime=0\nmemory=56552\nallocs=1881\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/zygote/GPU/AMDGPU", + "value": 153971, + "unit": "ns", + "extra": "gctime=0\nmemory=16344\nallocs=385\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 56875, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 39792, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 39958, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 75750, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 36542, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=128\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI", + "value": 579454, + "unit": "ns", + "extra": "gctime=0\nmemory=46584\nallocs=558\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 993542, + "unit": "ns", + "extra": "gctime=0\nmemory=13760\nallocs=447\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 77736, + "unit": "ns", + "extra": "gctime=0\nmemory=6976\nallocs=201\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 2006708, + "unit": "ns", + "extra": "gctime=0\nmemory=7356096\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 2079042, + "unit": "ns", + "extra": "gctime=0\nmemory=7356096\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 2080833.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7356096\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 1949000, + "unit": "ns", + "extra": "gctime=0\nmemory=7356096\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 223214, + "unit": "ns", + "extra": "gctime=0\nmemory=24776\nallocs=789\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI", + "value": 8127347, + "unit": "ns", + "extra": "gctime=0\nmemory=872368\nallocs=8373\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 4622417, + "unit": "ns", + "extra": "gctime=0\nmemory=221304\nallocs=6777\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1216091, + "unit": "ns", + "extra": "gctime=0\nmemory=89176\nallocs=2209\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 183375, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 147916.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 151375, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 174500, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 165224, + "unit": "ns", + "extra": "gctime=0\nmemory=17904\nallocs=612\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI", + "value": 7960989, + "unit": "ns", + "extra": "gctime=0\nmemory=260008\nallocs=2473\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1465958.5, + "unit": "ns", + "extra": "gctime=0\nmemory=56176\nallocs=1803\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 188512, + "unit": "ns", + "extra": "gctime=0\nmemory=19312\nallocs=459\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 1096250, + "unit": "ns", + "extra": "gctime=0\nmemory=8401280\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 1109458, + "unit": "ns", + "extra": "gctime=0\nmemory=8401280\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 1118166, + "unit": "ns", + "extra": "gctime=0\nmemory=8401280\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 1066000, + "unit": "ns", + "extra": "gctime=0\nmemory=8401280\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 668613, + "unit": "ns", + "extra": "gctime=0\nmemory=84752\nallocs=3063\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI", + "value": 32461742, + "unit": "ns", + "extra": "gctime=0\nmemory=1261024\nallocs=11324\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 6110208, + "unit": "ns", + "extra": "gctime=0\nmemory=281688\nallocs=8738\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1022559, + "unit": "ns", + "extra": "gctime=0\nmemory=75904\nallocs=1588\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)", + "value": 4708.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)", + "value": 4708, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)", + "value": 5916.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)", + "value": 4854.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA", + "value": 88593, + "unit": "ns", + "extra": "gctime=0\nmemory=13104\nallocs=537\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI", + "value": 5444937, + "unit": "ns", + "extra": "gctime=0\nmemory=176792\nallocs=1671\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal", + "value": 509958, + "unit": "ns", + "extra": "gctime=0\nmemory=37584\nallocs=1260\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU", + "value": 70211, + "unit": "ns", + "extra": "gctime=0\nmemory=11200\nallocs=289\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 8979, + "unit": "ns", + "extra": "gctime=0\nmemory=17872\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 8792, + "unit": "ns", + "extra": "gctime=0\nmemory=17872\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 9125, + "unit": "ns", + "extra": "gctime=0\nmemory=17872\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 8625, + "unit": "ns", + "extra": "gctime=0\nmemory=17872\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA", + "value": 586344.5, + "unit": "ns", + "extra": "gctime=0\nmemory=76768\nallocs=3041\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI", + "value": 35774195.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1181288\nallocs=10712\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal", + "value": 5746667, + "unit": "ns", + "extra": "gctime=0\nmemory=245128\nallocs=8111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU", + "value": 413663, + "unit": "ns", + "extra": "gctime=0\nmemory=57264\nallocs=1232\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 19666.5, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 22708, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 20625, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 19375, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 65836, + "unit": "ns", + "extra": "gctime=0\nmemory=12544\nallocs=282\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI", + "value": 3046337, + "unit": "ns", + "extra": "gctime=0\nmemory=130312\nallocs=1357\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 1289000, + "unit": "ns", + "extra": "gctime=0\nmemory=34288\nallocs=1053\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 77471, + "unit": "ns", + "extra": "gctime=0\nmemory=16432\nallocs=320\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 219333, + "unit": "ns", + "extra": "gctime=0\nmemory=813456\nallocs=127\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 212625, + "unit": "ns", + "extra": "gctime=0\nmemory=813456\nallocs=127\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 219520.5, + "unit": "ns", + "extra": "gctime=0\nmemory=813456\nallocs=127\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 244792, + "unit": "ns", + "extra": "gctime=0\nmemory=813456\nallocs=127\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 345639, + "unit": "ns", + "extra": "gctime=0\nmemory=64176\nallocs=1466\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI", + "value": 13758011.5, + "unit": "ns", + "extra": "gctime=0\nmemory=650824\nallocs=5943\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 5581666, + "unit": "ns", + "extra": "gctime=0\nmemory=171040\nallocs=4725\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 479494, + "unit": "ns", + "extra": "gctime=0\nmemory=68848\nallocs=1194\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s)", + "value": 625, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s)", + "value": 625, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s)", + "value": 916.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s)", + "value": 625, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA", + "value": 20450, + "unit": "ns", + "extra": "gctime=0\nmemory=2384\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/forward/GPU/oneAPI", + "value": 1168609.5, + "unit": "ns", + "extra": "gctime=0\nmemory=34072\nallocs=305\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/forward/GPU/Metal", + "value": 289667, + "unit": "ns", + "extra": "gctime=0\nmemory=6880\nallocs=231\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/forward/GPU/AMDGPU", + "value": 32680.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1920\nallocs=72\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s)", + "value": 1417, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s)", + "value": 1458, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s)", + "value": 1583, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s)", + "value": 1416, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA", + "value": 122769, + "unit": "ns", + "extra": "gctime=0\nmemory=14616\nallocs=562\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/zygote/GPU/oneAPI", + "value": 8593089, + "unit": "ns", + "extra": "gctime=0\nmemory=241744\nallocs=2311\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/zygote/GPU/Metal", + "value": 1489000, + "unit": "ns", + "extra": "gctime=0\nmemory=54336\nallocs=1843\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/zygote/GPU/AMDGPU", + "value": 137421.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13560\nallocs=326\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 7417, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 5416, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 5459, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 9666, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 23763, + "unit": "ns", + "extra": "gctime=0\nmemory=2800\nallocs=95\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI", + "value": 1228445.5, + "unit": "ns", + "extra": "gctime=0\nmemory=48360\nallocs=569\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 556958.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16032\nallocs=533\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 49651, + "unit": "ns", + "extra": "gctime=0\nmemory=6592\nallocs=167\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 230291, + "unit": "ns", + "extra": "gctime=0\nmemory=932800\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 227812.5, + "unit": "ns", + "extra": "gctime=0\nmemory=932800\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 243042, + "unit": "ns", + "extra": "gctime=0\nmemory=932800\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 214959, + "unit": "ns", + "extra": "gctime=0\nmemory=932800\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 191151, + "unit": "ns", + "extra": "gctime=0\nmemory=23464\nallocs=725\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI", + "value": 30597537, + "unit": "ns", + "extra": "gctime=0\nmemory=1035344\nallocs=10127\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 8752042, + "unit": "ns", + "extra": "gctime=0\nmemory=257480\nallocs=8015\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 649896, + "unit": "ns", + "extra": "gctime=0\nmemory=80312\nallocs=1903\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s)", + "value": 4083, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s)", + "value": 4084, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s)", + "value": 4125, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s)", + "value": 3917, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA", + "value": 23201, + "unit": "ns", + "extra": "gctime=0\nmemory=960\nallocs=41\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/oneAPI", + "value": 2032029, + "unit": "ns", + "extra": "gctime=0\nmemory=32136\nallocs=279\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/Metal", + "value": 218500, + "unit": "ns", + "extra": "gctime=0\nmemory=7384\nallocs=263\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/AMDGPU", + "value": 48621, + "unit": "ns", + "extra": "gctime=0\nmemory=2000\nallocs=80\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s)", + "value": 16959, + "unit": "ns", + "extra": "gctime=0\nmemory=71920\nallocs=26\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s)", + "value": 16666, + "unit": "ns", + "extra": "gctime=0\nmemory=71920\nallocs=26\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s)", + "value": 16791, + "unit": "ns", + "extra": "gctime=0\nmemory=71920\nallocs=26\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s)", + "value": 16292, + "unit": "ns", + "extra": "gctime=0\nmemory=71920\nallocs=26\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA", + "value": 190718.5, + "unit": "ns", + "extra": "gctime=0\nmemory=15672\nallocs=581\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/oneAPI", + "value": 10591957, + "unit": "ns", + "extra": "gctime=0\nmemory=245040\nallocs=2361\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/Metal", + "value": 934584, + "unit": "ns", + "extra": "gctime=0\nmemory=49112\nallocs=1627\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/AMDGPU", + "value": 178742, + "unit": "ns", + "extra": "gctime=0\nmemory=13656\nallocs=339\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s)", + "value": 510458, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s)", + "value": 332125, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s)", + "value": 332125, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s)", + "value": 864042, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA", + "value": 112987, + "unit": "ns", + "extra": "gctime=0\nmemory=960\nallocs=41\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/oneAPI", + "value": 401373, + "unit": "ns", + "extra": "gctime=0\nmemory=32216\nallocs=284\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/Metal", + "value": 449417, + "unit": "ns", + "extra": "gctime=0\nmemory=7408\nallocs=264\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/AMDGPU", + "value": 248883, + "unit": "ns", + "extra": "gctime=0\nmemory=8240\nallocs=328\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)", + "value": 2259145.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2360672\nallocs=33\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)", + "value": 1754250, + "unit": "ns", + "extra": "gctime=0\nmemory=2360672\nallocs=33\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)", + "value": 1758041.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2360672\nallocs=33\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)", + "value": 3176667, + "unit": "ns", + "extra": "gctime=0\nmemory=2360672\nallocs=33\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA", + "value": 236712, + "unit": "ns", + "extra": "gctime=0\nmemory=18568\nallocs=713\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/oneAPI", + "value": 10389260, + "unit": "ns", + "extra": "gctime=0\nmemory=249664\nallocs=2442\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/Metal", + "value": 1913792, + "unit": "ns", + "extra": "gctime=0\nmemory=58840\nallocs=1949\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/AMDGPU", + "value": 758582.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16472\nallocs=502\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)", + "value": 6667, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)", + "value": 6500, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)", + "value": 7167, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)", + "value": 6791, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA", + "value": 90572, + "unit": "ns", + "extra": "gctime=0\nmemory=11984\nallocs=467\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI", + "value": 5543628, + "unit": "ns", + "extra": "gctime=0\nmemory=175480\nallocs=1669\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal", + "value": 764979.5, + "unit": "ns", + "extra": "gctime=0\nmemory=36856\nallocs=1231\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU", + "value": 60441, + "unit": "ns", + "extra": "gctime=0\nmemory=10928\nallocs=272\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 11333, + "unit": "ns", + "extra": "gctime=0\nmemory=51632\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 11667, + "unit": "ns", + "extra": "gctime=0\nmemory=51632\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 12333, + "unit": "ns", + "extra": "gctime=0\nmemory=51632\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 11979, + "unit": "ns", + "extra": "gctime=0\nmemory=51632\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA", + "value": 622876, + "unit": "ns", + "extra": "gctime=0\nmemory=73424\nallocs=2885\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI", + "value": 39629702, + "unit": "ns", + "extra": "gctime=0\nmemory=1141568\nallocs=10323\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal", + "value": 5428500, + "unit": "ns", + "extra": "gctime=0\nmemory=232000\nallocs=7543\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU", + "value": 416794, + "unit": "ns", + "extra": "gctime=0\nmemory=54800\nallocs=1192\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s)", + "value": 500, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s)", + "value": 541, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s)", + "value": 541, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s)", + "value": 500, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA", + "value": 22883, + "unit": "ns", + "extra": "gctime=0\nmemory=1056\nallocs=47\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/oneAPI", + "value": 2357089, + "unit": "ns", + "extra": "gctime=0\nmemory=34200\nallocs=309\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/Metal", + "value": 325792, + "unit": "ns", + "extra": "gctime=0\nmemory=7696\nallocs=277\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/AMDGPU", + "value": 53740, + "unit": "ns", + "extra": "gctime=0\nmemory=2032\nallocs=79\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s)", + "value": 2084, + "unit": "ns", + "extra": "gctime=0\nmemory=4880\nallocs=15\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s)", + "value": 2125, + "unit": "ns", + "extra": "gctime=0\nmemory=4880\nallocs=15\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s)", + "value": 2208, + "unit": "ns", + "extra": "gctime=0\nmemory=4880\nallocs=15\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s)", + "value": 2083, + "unit": "ns", + "extra": "gctime=0\nmemory=4880\nallocs=15\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA", + "value": 232858, + "unit": "ns", + "extra": "gctime=0\nmemory=15592\nallocs=578\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/oneAPI", + "value": 11265211, + "unit": "ns", + "extra": "gctime=0\nmemory=249664\nallocs=2448\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/Metal", + "value": 1976709, + "unit": "ns", + "extra": "gctime=0\nmemory=59120\nallocs=2034\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/AMDGPU", + "value": 188121, + "unit": "ns", + "extra": "gctime=0\nmemory=14520\nallocs=350\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)", + "value": 9416, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)", + "value": 9250, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)", + "value": 10646, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)", + "value": 9084, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA", + "value": 103949, + "unit": "ns", + "extra": "gctime=0\nmemory=12496\nallocs=308\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI", + "value": 3090177, + "unit": "ns", + "extra": "gctime=0\nmemory=130808\nallocs=1423\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal", + "value": 819708, + "unit": "ns", + "extra": "gctime=0\nmemory=35736\nallocs=1143\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU", + "value": 78351, + "unit": "ns", + "extra": "gctime=0\nmemory=15936\nallocs=339\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 18229, + "unit": "ns", + "extra": "gctime=0\nmemory=60944\nallocs=124\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 17188, + "unit": "ns", + "extra": "gctime=0\nmemory=60944\nallocs=124\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 19750, + "unit": "ns", + "extra": "gctime=0\nmemory=60944\nallocs=124\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 18020.5, + "unit": "ns", + "extra": "gctime=0\nmemory=60944\nallocs=124\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA", + "value": 603457.5, + "unit": "ns", + "extra": "gctime=0\nmemory=73576\nallocs=1707\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI", + "value": 17045555, + "unit": "ns", + "extra": "gctime=0\nmemory=768744\nallocs=7259\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal", + "value": 5336854, + "unit": "ns", + "extra": "gctime=0\nmemory=190760\nallocs=5370\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU", + "value": 389463, + "unit": "ns", + "extra": "gctime=0\nmemory=77912\nallocs=1304\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)", + "value": 500, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)", + "value": 541, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)", + "value": 584, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)", + "value": 541, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA", + "value": 34827, + "unit": "ns", + "extra": "gctime=0\nmemory=4848\nallocs=166\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI", + "value": 1217545, + "unit": "ns", + "extra": "gctime=0\nmemory=46520\nallocs=557\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal", + "value": 286084, + "unit": "ns", + "extra": "gctime=0\nmemory=13656\nallocs=447\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU", + "value": 46091, + "unit": "ns", + "extra": "gctime=0\nmemory=6592\nallocs=180\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 9458.5, + "unit": "ns", + "extra": "gctime=0\nmemory=40176\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 9625, + "unit": "ns", + "extra": "gctime=0\nmemory=40176\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 9770.5, + "unit": "ns", + "extra": "gctime=0\nmemory=40176\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 9270.5, + "unit": "ns", + "extra": "gctime=0\nmemory=40176\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA", + "value": 258732.5, + "unit": "ns", + "extra": "gctime=0\nmemory=23440\nallocs=736\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI", + "value": 19594533, + "unit": "ns", + "extra": "gctime=0\nmemory=667744\nallocs=6477\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal", + "value": 4570875, + "unit": "ns", + "extra": "gctime=0\nmemory=161712\nallocs=5015\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU", + "value": 380118.5, + "unit": "ns", + "extra": "gctime=0\nmemory=49824\nallocs=1256\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s)", + "value": 398875, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s)", + "value": 215334, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s)", + "value": 215250, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s)", + "value": 754917, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA", + "value": 111268, + "unit": "ns", + "extra": "gctime=0\nmemory=1168\nallocs=48\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/oneAPI", + "value": 328076.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3888\nallocs=73\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/Metal", + "value": 366250, + "unit": "ns", + "extra": "gctime=0\nmemory=2736\nallocs=96\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/AMDGPU", + "value": 76731, + "unit": "ns", + "extra": "gctime=0\nmemory=2032\nallocs=118\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s)", + "value": 1396375, + "unit": "ns", + "extra": "gctime=0\nmemory=1836144\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s)", + "value": 860417, + "unit": "ns", + "extra": "gctime=0\nmemory=1836144\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s)", + "value": 859416, + "unit": "ns", + "extra": "gctime=0\nmemory=1836144\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s)", + "value": 2350687.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1836144\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA", + "value": 202646, + "unit": "ns", + "extra": "gctime=0\nmemory=13808\nallocs=502\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/oneAPI", + "value": 10590242, + "unit": "ns", + "extra": "gctime=0\nmemory=184312\nallocs=1893\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/Metal", + "value": 1568000, + "unit": "ns", + "extra": "gctime=0\nmemory=45152\nallocs=1518\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/AMDGPU", + "value": 324833, + "unit": "ns", + "extra": "gctime=0\nmemory=12272\nallocs=338\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)", + "value": 7458.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)", + "value": 7500, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)", + "value": 8458.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)", + "value": 7770.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA", + "value": 144884.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13104\nallocs=537\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI", + "value": 5793206, + "unit": "ns", + "extra": "gctime=0\nmemory=176792\nallocs=1671\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal", + "value": 471062.5, + "unit": "ns", + "extra": "gctime=0\nmemory=38232\nallocs=1254\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU", + "value": 69551, + "unit": "ns", + "extra": "gctime=0\nmemory=11152\nallocs=286\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 16000, + "unit": "ns", + "extra": "gctime=0\nmemory=89840\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 14833.5, + "unit": "ns", + "extra": "gctime=0\nmemory=89840\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 15833.5, + "unit": "ns", + "extra": "gctime=0\nmemory=89840\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 16604.5, + "unit": "ns", + "extra": "gctime=0\nmemory=89840\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA", + "value": 958429, + "unit": "ns", + "extra": "gctime=0\nmemory=78992\nallocs=3110\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI", + "value": 41644802, + "unit": "ns", + "extra": "gctime=0\nmemory=1218864\nallocs=11118\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal", + "value": 5853416.5, + "unit": "ns", + "extra": "gctime=0\nmemory=249152\nallocs=8055\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU", + "value": 433574, + "unit": "ns", + "extra": "gctime=0\nmemory=60288\nallocs=1306\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 27334, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 25709, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 26792, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 26729, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 197253.5, + "unit": "ns", + "extra": "gctime=0\nmemory=17904\nallocs=612\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI", + "value": 7710776, + "unit": "ns", + "extra": "gctime=0\nmemory=259912\nallocs=2467\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 676708, + "unit": "ns", + "extra": "gctime=0\nmemory=48184\nallocs=1530\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 116041, + "unit": "ns", + "extra": "gctime=0\nmemory=18048\nallocs=380\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 117834, + "unit": "ns", + "extra": "gctime=0\nmemory=1061248\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 104000, + "unit": "ns", + "extra": "gctime=0\nmemory=1061248\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 146312.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1061248\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 149459, + "unit": "ns", + "extra": "gctime=0\nmemory=1061248\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 1064948, + "unit": "ns", + "extra": "gctime=0\nmemory=84752\nallocs=3063\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI", + "value": 43639958, + "unit": "ns", + "extra": "gctime=0\nmemory=1260720\nallocs=11305\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 5911979.5, + "unit": "ns", + "extra": "gctime=0\nmemory=257200\nallocs=7902\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 590945.5, + "unit": "ns", + "extra": "gctime=0\nmemory=71040\nallocs=1431\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 74584, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 81167, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 77813, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 97917, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 205745, + "unit": "ns", + "extra": "gctime=0\nmemory=19024\nallocs=682\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI", + "value": 7917430.5, + "unit": "ns", + "extra": "gctime=0\nmemory=262168\nallocs=2474\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 528979.5, + "unit": "ns", + "extra": "gctime=0\nmemory=49976\nallocs=1562\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 126636.5, + "unit": "ns", + "extra": "gctime=0\nmemory=18336\nallocs=398\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 217958.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2509296\nallocs=149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 304313, + "unit": "ns", + "extra": "gctime=0\nmemory=2509296\nallocs=149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 305500, + "unit": "ns", + "extra": "gctime=0\nmemory=2509296\nallocs=149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 283250.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2509296\nallocs=149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 1106076.5, + "unit": "ns", + "extra": "gctime=0\nmemory=91296\nallocs=3393\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI", + "value": 41493470, + "unit": "ns", + "extra": "gctime=0\nmemory=1406208\nallocs=12623\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 6699500, + "unit": "ns", + "extra": "gctime=0\nmemory=289192\nallocs=8880\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 699067, + "unit": "ns", + "extra": "gctime=0\nmemory=76992\nallocs=1586\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)", + "value": 16625, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)", + "value": 17125, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)", + "value": 19208, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)", + "value": 16750, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA", + "value": 144987.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13104\nallocs=537\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI", + "value": 5859256.5, + "unit": "ns", + "extra": "gctime=0\nmemory=176792\nallocs=1671\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal", + "value": 449500, + "unit": "ns", + "extra": "gctime=0\nmemory=38232\nallocs=1254\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU", + "value": 238782, + "unit": "ns", + "extra": "gctime=0\nmemory=13872\nallocs=456\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 25813, + "unit": "ns", + "extra": "gctime=0\nmemory=89840\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 27416.5, + "unit": "ns", + "extra": "gctime=0\nmemory=89840\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 27250, + "unit": "ns", + "extra": "gctime=0\nmemory=89840\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 26875, + "unit": "ns", + "extra": "gctime=0\nmemory=89840\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA", + "value": 961264, + "unit": "ns", + "extra": "gctime=0\nmemory=80048\nallocs=3154\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI", + "value": 40586738, + "unit": "ns", + "extra": "gctime=0\nmemory=1218864\nallocs=11118\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal", + "value": 6407542, + "unit": "ns", + "extra": "gctime=0\nmemory=249040\nallocs=8054\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU", + "value": 715236.5, + "unit": "ns", + "extra": "gctime=0\nmemory=60656\nallocs=1307\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)", + "value": 11125, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)", + "value": 11291, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)", + "value": 12916, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)", + "value": 10834, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA", + "value": 124632.5, + "unit": "ns", + "extra": "gctime=0\nmemory=12496\nallocs=308\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI", + "value": 3732232, + "unit": "ns", + "extra": "gctime=0\nmemory=130808\nallocs=1423\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal", + "value": 791625, + "unit": "ns", + "extra": "gctime=0\nmemory=35736\nallocs=1143\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU", + "value": 242172.5, + "unit": "ns", + "extra": "gctime=0\nmemory=22336\nallocs=601\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 21625, + "unit": "ns", + "extra": "gctime=0\nmemory=52368\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 22792, + "unit": "ns", + "extra": "gctime=0\nmemory=52368\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 22500, + "unit": "ns", + "extra": "gctime=0\nmemory=52368\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 21334, + "unit": "ns", + "extra": "gctime=0\nmemory=52368\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA", + "value": 691300, + "unit": "ns", + "extra": "gctime=0\nmemory=62104\nallocs=1758\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI", + "value": 22068771, + "unit": "ns", + "extra": "gctime=0\nmemory=760440\nallocs=7294\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal", + "value": 5402709, + "unit": "ns", + "extra": "gctime=0\nmemory=179872\nallocs=5391\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU", + "value": 690127, + "unit": "ns", + "extra": "gctime=0\nmemory=65192\nallocs=1298\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 65624.5, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 62791, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 66834, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 63541.5, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 105200.5, + "unit": "ns", + "extra": "gctime=0\nmemory=12544\nallocs=282\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI", + "value": 3496796, + "unit": "ns", + "extra": "gctime=0\nmemory=130312\nallocs=1357\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 1318312, + "unit": "ns", + "extra": "gctime=0\nmemory=34288\nallocs=1053\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 241393, + "unit": "ns", + "extra": "gctime=0\nmemory=20096\nallocs=549\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 447167, + "unit": "ns", + "extra": "gctime=0\nmemory=1064816\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 443500, + "unit": "ns", + "extra": "gctime=0\nmemory=1064816\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 437583, + "unit": "ns", + "extra": "gctime=0\nmemory=1064816\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 444750, + "unit": "ns", + "extra": "gctime=0\nmemory=1064816\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 512652.5, + "unit": "ns", + "extra": "gctime=0\nmemory=56264\nallocs=1599\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI", + "value": 20639374, + "unit": "ns", + "extra": "gctime=0\nmemory=677656\nallocs=6248\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 6123895.5, + "unit": "ns", + "extra": "gctime=0\nmemory=169656\nallocs=4968\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 714422, + "unit": "ns", + "extra": "gctime=0\nmemory=60632\nallocs=1301\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)", + "value": 7500, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)", + "value": 8229.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)", + "value": 9291, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)", + "value": 7208, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA", + "value": 145332.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13104\nallocs=537\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI", + "value": 5566753, + "unit": "ns", + "extra": "gctime=0\nmemory=176792\nallocs=1671\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal", + "value": 474458, + "unit": "ns", + "extra": "gctime=0\nmemory=38232\nallocs=1254\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU", + "value": 67851, + "unit": "ns", + "extra": "gctime=0\nmemory=11104\nallocs=283\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 15041.5, + "unit": "ns", + "extra": "gctime=0\nmemory=81136\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 15542, + "unit": "ns", + "extra": "gctime=0\nmemory=81136\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 14833, + "unit": "ns", + "extra": "gctime=0\nmemory=81136\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 15562.5, + "unit": "ns", + "extra": "gctime=0\nmemory=81136\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA", + "value": 932226.5, + "unit": "ns", + "extra": "gctime=0\nmemory=75840\nallocs=2979\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI", + "value": 39419089.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1154144\nallocs=10570\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal", + "value": 5510604.5, + "unit": "ns", + "extra": "gctime=0\nmemory=236048\nallocs=7645\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU", + "value": 410714, + "unit": "ns", + "extra": "gctime=0\nmemory=58080\nallocs=1246\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/forward/CPU/2 thread(s)", + "value": 6161292, + "unit": "ns", + "extra": "gctime=0\nmemory=4194368\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/forward/CPU/4 thread(s)", + "value": 3226208.5, + "unit": "ns", + "extra": "gctime=0\nmemory=4194368\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/forward/CPU/8 thread(s)", + "value": 3226959, + "unit": "ns", + "extra": "gctime=0\nmemory=4194368\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/forward/CPU/1 thread(s)", + "value": 11900167, + "unit": "ns", + "extra": "gctime=0\nmemory=4194368\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/forward/GPU/CUDA", + "value": 349491, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/forward/GPU/AMDGPU", + "value": 298868, + "unit": "ns", + "extra": "gctime=0\nmemory=4176\nallocs=213\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s)", + "value": 19104125, + "unit": "ns", + "extra": "gctime=0\nmemory=16778320\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s)", + "value": 11195792, + "unit": "ns", + "extra": "gctime=0\nmemory=16778320\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s)", + "value": 11137250, + "unit": "ns", + "extra": "gctime=0\nmemory=16778320\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s)", + "value": 36424541.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16778128\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/zygote/GPU/CUDA", + "value": 1007331, + "unit": "ns", + "extra": "gctime=0\nmemory=11824\nallocs=431\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/zygote/GPU/AMDGPU", + "value": 1151831, + "unit": "ns", + "extra": "gctime=0\nmemory=20288\nallocs=589\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s)", + "value": 959, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s)", + "value": 1041, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s)", + "value": 1000, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s)", + "value": 958, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA", + "value": 22964, + "unit": "ns", + "extra": "gctime=0\nmemory=1056\nallocs=47\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/oneAPI", + "value": 2111947, + "unit": "ns", + "extra": "gctime=0\nmemory=34200\nallocs=309\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/Metal", + "value": 223979, + "unit": "ns", + "extra": "gctime=0\nmemory=7696\nallocs=277\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/AMDGPU", + "value": 214012, + "unit": "ns", + "extra": "gctime=0\nmemory=8320\nallocs=330\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)", + "value": 3666, + "unit": "ns", + "extra": "gctime=0\nmemory=5776\nallocs=12\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)", + "value": 3708, + "unit": "ns", + "extra": "gctime=0\nmemory=5776\nallocs=12\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)", + "value": 3750, + "unit": "ns", + "extra": "gctime=0\nmemory=5776\nallocs=12\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)", + "value": 3667, + "unit": "ns", + "extra": "gctime=0\nmemory=5776\nallocs=12\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA", + "value": 276703, + "unit": "ns", + "extra": "gctime=0\nmemory=17960\nallocs=683\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/oneAPI", + "value": 11656420, + "unit": "ns", + "extra": "gctime=0\nmemory=280072\nallocs=2692\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/Metal", + "value": 2091333, + "unit": "ns", + "extra": "gctime=0\nmemory=65152\nallocs=2232\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/AMDGPU", + "value": 644996, + "unit": "ns", + "extra": "gctime=0\nmemory=17608\nallocs=497\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)", + "value": 8333.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)", + "value": 7875, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)", + "value": 9833.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)", + "value": 7479.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA", + "value": 120568, + "unit": "ns", + "extra": "gctime=0\nmemory=12368\nallocs=300\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI", + "value": 3606763.5, + "unit": "ns", + "extra": "gctime=0\nmemory=130792\nallocs=1422\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal", + "value": 809187.5, + "unit": "ns", + "extra": "gctime=0\nmemory=35440\nallocs=1140\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU", + "value": 68171, + "unit": "ns", + "extra": "gctime=0\nmemory=15952\nallocs=340\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 12208, + "unit": "ns", + "extra": "gctime=0\nmemory=34432\nallocs=124\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 12604.5, + "unit": "ns", + "extra": "gctime=0\nmemory=34432\nallocs=124\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 13479.5, + "unit": "ns", + "extra": "gctime=0\nmemory=34432\nallocs=124\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 12750, + "unit": "ns", + "extra": "gctime=0\nmemory=34432\nallocs=124\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA", + "value": 633540, + "unit": "ns", + "extra": "gctime=0\nmemory=71096\nallocs=1622\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI", + "value": 21850338, + "unit": "ns", + "extra": "gctime=0\nmemory=731136\nallocs=6851\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal", + "value": 5130459, + "unit": "ns", + "extra": "gctime=0\nmemory=188288\nallocs=5386\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU", + "value": 367624, + "unit": "ns", + "extra": "gctime=0\nmemory=74584\nallocs=1211\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s)", + "value": 291, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s)", + "value": 291, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA", + "value": 22295, + "unit": "ns", + "extra": "gctime=0\nmemory=960\nallocs=41\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/oneAPI", + "value": 2081894, + "unit": "ns", + "extra": "gctime=0\nmemory=32136\nallocs=279\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/Metal", + "value": 220958, + "unit": "ns", + "extra": "gctime=0\nmemory=7104\nallocs=261\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/AMDGPU", + "value": 51121, + "unit": "ns", + "extra": "gctime=0\nmemory=2192\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s)", + "value": 3167, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=26\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s)", + "value": 3041, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=26\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s)", + "value": 3208, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=26\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s)", + "value": 2833, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=26\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA", + "value": 199299.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13448\nallocs=512\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/oneAPI", + "value": 9139195, + "unit": "ns", + "extra": "gctime=0\nmemory=207448\nallocs=1954\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/Metal", + "value": 1575542, + "unit": "ns", + "extra": "gctime=0\nmemory=46784\nallocs=1608\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/AMDGPU", + "value": 159571, + "unit": "ns", + "extra": "gctime=0\nmemory=10760\nallocs=271\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)", + "value": 12791, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)", + "value": 11979.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)", + "value": 13750.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)", + "value": 11375, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA", + "value": 122487, + "unit": "ns", + "extra": "gctime=0\nmemory=11664\nallocs=282\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI", + "value": 3382384.5, + "unit": "ns", + "extra": "gctime=0\nmemory=125560\nallocs=1353\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal", + "value": 890333, + "unit": "ns", + "extra": "gctime=0\nmemory=33016\nallocs=1047\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU", + "value": 242632, + "unit": "ns", + "extra": "gctime=0\nmemory=17984\nallocs=492\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 20749.5, + "unit": "ns", + "extra": "gctime=0\nmemory=48912\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 22042, + "unit": "ns", + "extra": "gctime=0\nmemory=48912\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 25375, + "unit": "ns", + "extra": "gctime=0\nmemory=48912\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 20791, + "unit": "ns", + "extra": "gctime=0\nmemory=48912\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA", + "value": 587477.5, + "unit": "ns", + "extra": "gctime=0\nmemory=54152\nallocs=1599\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI", + "value": 21027738, + "unit": "ns", + "extra": "gctime=0\nmemory=664520\nallocs=6275\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal", + "value": 4822250, + "unit": "ns", + "extra": "gctime=0\nmemory=153416\nallocs=4612\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU", + "value": 661916, + "unit": "ns", + "extra": "gctime=0\nmemory=56312\nallocs=1199\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s)", + "value": 4375, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s)", + "value": 4417, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s)", + "value": 4375, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s)", + "value": 4333, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA", + "value": 23701, + "unit": "ns", + "extra": "gctime=0\nmemory=1056\nallocs=47\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/oneAPI", + "value": 2264988, + "unit": "ns", + "extra": "gctime=0\nmemory=34200\nallocs=309\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/Metal", + "value": 218416.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7976\nallocs=279\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/AMDGPU", + "value": 52681, + "unit": "ns", + "extra": "gctime=0\nmemory=2128\nallocs=85\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s)", + "value": 16125, + "unit": "ns", + "extra": "gctime=0\nmemory=71488\nallocs=15\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s)", + "value": 16417, + "unit": "ns", + "extra": "gctime=0\nmemory=71488\nallocs=15\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s)", + "value": 16458, + "unit": "ns", + "extra": "gctime=0\nmemory=71488\nallocs=15\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s)", + "value": 15958, + "unit": "ns", + "extra": "gctime=0\nmemory=71488\nallocs=15\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA", + "value": 324833, + "unit": "ns", + "extra": "gctime=0\nmemory=17816\nallocs=647\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/oneAPI", + "value": 13493351, + "unit": "ns", + "extra": "gctime=0\nmemory=287256\nallocs=2855\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/Metal", + "value": 1694208, + "unit": "ns", + "extra": "gctime=0\nmemory=61152\nallocs=2052\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/AMDGPU", + "value": 210642, + "unit": "ns", + "extra": "gctime=0\nmemory=17640\nallocs=430\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)", + "value": 1959, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)", + "value": 2125, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)", + "value": 2167, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)", + "value": 2125, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA", + "value": 35419, + "unit": "ns", + "extra": "gctime=0\nmemory=4848\nallocs=166\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI", + "value": 1182395, + "unit": "ns", + "extra": "gctime=0\nmemory=46520\nallocs=557\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal", + "value": 427917, + "unit": "ns", + "extra": "gctime=0\nmemory=13656\nallocs=447\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU", + "value": 207432, + "unit": "ns", + "extra": "gctime=0\nmemory=9600\nallocs=368\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 17687.5, + "unit": "ns", + "extra": "gctime=0\nmemory=48880\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 18375, + "unit": "ns", + "extra": "gctime=0\nmemory=48880\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 17750, + "unit": "ns", + "extra": "gctime=0\nmemory=48880\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 16708, + "unit": "ns", + "extra": "gctime=0\nmemory=48880\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA", + "value": 291778, + "unit": "ns", + "extra": "gctime=0\nmemory=27000\nallocs=902\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI", + "value": 21255993, + "unit": "ns", + "extra": "gctime=0\nmemory=705360\nallocs=6816\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal", + "value": 5606020.5, + "unit": "ns", + "extra": "gctime=0\nmemory=170920\nallocs=5275\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU", + "value": 705246, + "unit": "ns", + "extra": "gctime=0\nmemory=52408\nallocs=1364\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/forward/CPU/2 thread(s)", + "value": 58750, + "unit": "ns", + "extra": "gctime=0\nmemory=524352\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/forward/CPU/4 thread(s)", + "value": 60583, + "unit": "ns", + "extra": "gctime=0\nmemory=524352\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/forward/CPU/8 thread(s)", + "value": 61583, + "unit": "ns", + "extra": "gctime=0\nmemory=524352\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/forward/CPU/1 thread(s)", + "value": 51209, + "unit": "ns", + "extra": "gctime=0\nmemory=524352\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/forward/GPU/CUDA", + "value": 66445, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/forward/GPU/AMDGPU", + "value": 119521, + "unit": "ns", + "extra": "gctime=0\nmemory=14448\nallocs=81\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s)", + "value": 150750, + "unit": "ns", + "extra": "gctime=0\nmemory=2098256\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s)", + "value": 98146, + "unit": "ns", + "extra": "gctime=0\nmemory=2098256\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s)", + "value": 143812.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2098256\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s)", + "value": 315771, + "unit": "ns", + "extra": "gctime=0\nmemory=2098064\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/zygote/GPU/CUDA", + "value": 213433, + "unit": "ns", + "extra": "gctime=0\nmemory=11808\nallocs=430\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/zygote/GPU/AMDGPU", + "value": 590321, + "unit": "ns", + "extra": "gctime=0\nmemory=55344\nallocs=458\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 83709, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 81395.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 125750, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 140625, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 193292.5, + "unit": "ns", + "extra": "gctime=0\nmemory=12560\nallocs=283\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI", + "value": 5538392, + "unit": "ns", + "extra": "gctime=0\nmemory=210104\nallocs=2179\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1797333, + "unit": "ns", + "extra": "gctime=0\nmemory=50976\nallocs=1614\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 170472, + "unit": "ns", + "extra": "gctime=0\nmemory=24304\nallocs=518\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 1881771, + "unit": "ns", + "extra": "gctime=0\nmemory=6318480\nallocs=127\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 1919750.5, + "unit": "ns", + "extra": "gctime=0\nmemory=6318480\nallocs=127\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 1918125, + "unit": "ns", + "extra": "gctime=0\nmemory=6318480\nallocs=127\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 1879167, + "unit": "ns", + "extra": "gctime=0\nmemory=6318480\nallocs=127\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 525593, + "unit": "ns", + "extra": "gctime=0\nmemory=64208\nallocs=1468\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI", + "value": 27317629.5, + "unit": "ns", + "extra": "gctime=0\nmemory=808728\nallocs=7592\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 8569437.5, + "unit": "ns", + "extra": "gctime=0\nmemory=188160\nallocs=5313\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1085101, + "unit": "ns", + "extra": "gctime=0\nmemory=88368\nallocs=1707\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s)", + "value": 291, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA", + "value": 21292, + "unit": "ns", + "extra": "gctime=0\nmemory=1056\nallocs=47\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/oneAPI", + "value": 2252158.5, + "unit": "ns", + "extra": "gctime=0\nmemory=31944\nallocs=279\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/Metal", + "value": 322270.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7072\nallocs=261\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/AMDGPU", + "value": 45471, + "unit": "ns", + "extra": "gctime=0\nmemory=1840\nallocs=70\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s)", + "value": 1750, + "unit": "ns", + "extra": "gctime=0\nmemory=3584\nallocs=9\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s)", + "value": 1834, + "unit": "ns", + "extra": "gctime=0\nmemory=3584\nallocs=9\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s)", + "value": 1875, + "unit": "ns", + "extra": "gctime=0\nmemory=3584\nallocs=9\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s)", + "value": 1792, + "unit": "ns", + "extra": "gctime=0\nmemory=3584\nallocs=9\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA", + "value": 248680.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13200\nallocs=472\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/oneAPI", + "value": 9959591, + "unit": "ns", + "extra": "gctime=0\nmemory=214720\nallocs=2143\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/Metal", + "value": 1560459, + "unit": "ns", + "extra": "gctime=0\nmemory=52520\nallocs=1821\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/AMDGPU", + "value": 188106.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13344\nallocs=329\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)", + "value": 8542, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)", + "value": 9459, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)", + "value": 12000, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)", + "value": 10792, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA", + "value": 119738.5, + "unit": "ns", + "extra": "gctime=0\nmemory=11536\nallocs=274\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI", + "value": 3413977, + "unit": "ns", + "extra": "gctime=0\nmemory=125544\nallocs=1352\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal", + "value": 857250, + "unit": "ns", + "extra": "gctime=0\nmemory=32720\nallocs=1044\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU", + "value": 238532, + "unit": "ns", + "extra": "gctime=0\nmemory=18512\nallocs=525\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 9333, + "unit": "ns", + "extra": "gctime=0\nmemory=18896\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 9625, + "unit": "ns", + "extra": "gctime=0\nmemory=18896\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 13792, + "unit": "ns", + "extra": "gctime=0\nmemory=18896\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 9167, + "unit": "ns", + "extra": "gctime=0\nmemory=18896\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA", + "value": 521704, + "unit": "ns", + "extra": "gctime=0\nmemory=51672\nallocs=1514\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI", + "value": 19699849, + "unit": "ns", + "extra": "gctime=0\nmemory=626912\nallocs=5867\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal", + "value": 4144854.5, + "unit": "ns", + "extra": "gctime=0\nmemory=150864\nallocs=4623\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU", + "value": 637946, + "unit": "ns", + "extra": "gctime=0\nmemory=52968\nallocs=1105\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 57708, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 39833, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 39792, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 79459, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 39507, + "unit": "ns", + "extra": "gctime=0\nmemory=3536\nallocs=119\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI", + "value": 1379349, + "unit": "ns", + "extra": "gctime=0\nmemory=48376\nallocs=570\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1138499.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16032\nallocs=533\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 78371, + "unit": "ns", + "extra": "gctime=0\nmemory=6944\nallocs=189\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 1912125, + "unit": "ns", + "extra": "gctime=0\nmemory=6307920\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 1972417, + "unit": "ns", + "extra": "gctime=0\nmemory=6307920\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 1979624.5, + "unit": "ns", + "extra": "gctime=0\nmemory=6307920\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 1861187.5, + "unit": "ns", + "extra": "gctime=0\nmemory=6307920\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 219372, + "unit": "ns", + "extra": "gctime=0\nmemory=21872\nallocs=643\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI", + "value": 32210554.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1002656\nallocs=9879\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 11028542, + "unit": "ns", + "extra": "gctime=0\nmemory=251048\nallocs=7815\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1030149, + "unit": "ns", + "extra": "gctime=0\nmemory=82336\nallocs=1927\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 417333, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 419708.5, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 421708.5, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 425875, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 210015, + "unit": "ns", + "extra": "gctime=0\nmemory=19024\nallocs=682\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI", + "value": 7914644, + "unit": "ns", + "extra": "gctime=0\nmemory=262168\nallocs=2474\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 527459, + "unit": "ns", + "extra": "gctime=0\nmemory=49976\nallocs=1562\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 284863, + "unit": "ns", + "extra": "gctime=0\nmemory=20928\nallocs=560\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 775041, + "unit": "ns", + "extra": "gctime=0\nmemory=2509296\nallocs=149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 670458, + "unit": "ns", + "extra": "gctime=0\nmemory=2509296\nallocs=149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 683750, + "unit": "ns", + "extra": "gctime=0\nmemory=2509296\nallocs=149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 763729.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2509296\nallocs=149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 1043460, + "unit": "ns", + "extra": "gctime=0\nmemory=92352\nallocs=3437\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI", + "value": 45522633.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1406208\nallocs=12623\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 6362250.5, + "unit": "ns", + "extra": "gctime=0\nmemory=289720\nallocs=8895\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 912978, + "unit": "ns", + "extra": "gctime=0\nmemory=76992\nallocs=1568\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 3426500, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 3410000, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 3471708, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 3434270.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 169412, + "unit": "ns", + "extra": "gctime=0\nmemory=19024\nallocs=682\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI", + "value": 8998228.5, + "unit": "ns", + "extra": "gctime=0\nmemory=262264\nallocs=2480\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1365625, + "unit": "ns", + "extra": "gctime=0\nmemory=58128\nallocs=1840\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 408484, + "unit": "ns", + "extra": "gctime=0\nmemory=24816\nallocs=665\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 6191145.5, + "unit": "ns", + "extra": "gctime=0\nmemory=19998832\nallocs=151\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 6236250, + "unit": "ns", + "extra": "gctime=0\nmemory=19998832\nallocs=151\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 6228270.5, + "unit": "ns", + "extra": "gctime=0\nmemory=19998832\nallocs=151\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 6162854, + "unit": "ns", + "extra": "gctime=0\nmemory=19998832\nallocs=151\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 992035, + "unit": "ns", + "extra": "gctime=0\nmemory=92352\nallocs=3437\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI", + "value": 51997929, + "unit": "ns", + "extra": "gctime=0\nmemory=1406512\nallocs=12642\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 7373770.5, + "unit": "ns", + "extra": "gctime=0\nmemory=314816\nallocs=9747\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1560246, + "unit": "ns", + "extra": "gctime=0\nmemory=80304\nallocs=1628\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s)", + "value": 472958, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s)", + "value": 252500, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s)", + "value": 253375, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s)", + "value": 901750, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA", + "value": 46229, + "unit": "ns", + "extra": "gctime=0\nmemory=1056\nallocs=47\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/oneAPI", + "value": 870514, + "unit": "ns", + "extra": "gctime=0\nmemory=34280\nallocs=314\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/Metal", + "value": 411292, + "unit": "ns", + "extra": "gctime=0\nmemory=8016\nallocs=281\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/AMDGPU", + "value": 250933, + "unit": "ns", + "extra": "gctime=0\nmemory=8320\nallocs=330\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)", + "value": 2291000, + "unit": "ns", + "extra": "gctime=0\nmemory=2362080\nallocs=18\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)", + "value": 1759750, + "unit": "ns", + "extra": "gctime=0\nmemory=2362080\nallocs=18\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)", + "value": 1762041.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2362080\nallocs=18\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)", + "value": 3182917, + "unit": "ns", + "extra": "gctime=0\nmemory=2362080\nallocs=18\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA", + "value": 267543, + "unit": "ns", + "extra": "gctime=0\nmemory=20504\nallocs=772\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/oneAPI", + "value": 16140139.5, + "unit": "ns", + "extra": "gctime=0\nmemory=318000\nallocs=3120\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/Metal", + "value": 2142375.5, + "unit": "ns", + "extra": "gctime=0\nmemory=75576\nallocs=2533\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/AMDGPU", + "value": 786527.5, + "unit": "ns", + "extra": "gctime=0\nmemory=24952\nallocs=699\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 57708, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 39833, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 39500, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 79750, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 28137, + "unit": "ns", + "extra": "gctime=0\nmemory=2800\nallocs=95\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI", + "value": 1444160.5, + "unit": "ns", + "extra": "gctime=0\nmemory=48376\nallocs=570\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1123042, + "unit": "ns", + "extra": "gctime=0\nmemory=16032\nallocs=533\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 74041, + "unit": "ns", + "extra": "gctime=0\nmemory=7376\nallocs=216\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 2040354.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7356560\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 2077604, + "unit": "ns", + "extra": "gctime=0\nmemory=7356560\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 2096416.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7356560\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 1968958, + "unit": "ns", + "extra": "gctime=0\nmemory=7356560\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 232839, + "unit": "ns", + "extra": "gctime=0\nmemory=23464\nallocs=725\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI", + "value": 36152181, + "unit": "ns", + "extra": "gctime=0\nmemory=1035776\nallocs=10154\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 11654083.5, + "unit": "ns", + "extra": "gctime=0\nmemory=257480\nallocs=8015\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1052035.5, + "unit": "ns", + "extra": "gctime=0\nmemory=84456\nallocs=2015\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 57792, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 40083, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 39875, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 78292, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 48894, + "unit": "ns", + "extra": "gctime=0\nmemory=4448\nallocs=152\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI", + "value": 811232, + "unit": "ns", + "extra": "gctime=0\nmemory=46584\nallocs=558\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1076270.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13760\nallocs=447\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 77385.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7136\nallocs=211\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 1920875, + "unit": "ns", + "extra": "gctime=0\nmemory=6307456\nallocs=105\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 1953584, + "unit": "ns", + "extra": "gctime=0\nmemory=6307456\nallocs=105\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 1947208, + "unit": "ns", + "extra": "gctime=0\nmemory=6307456\nallocs=105\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 1862208, + "unit": "ns", + "extra": "gctime=0\nmemory=6307456\nallocs=105\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 236942, + "unit": "ns", + "extra": "gctime=0\nmemory=23184\nallocs=707\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI", + "value": 16978161.5, + "unit": "ns", + "extra": "gctime=0\nmemory=839248\nallocs=8098\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 9712334, + "unit": "ns", + "extra": "gctime=0\nmemory=212160\nallocs=6509\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 937454.5, + "unit": "ns", + "extra": "gctime=0\nmemory=80320\nallocs=1700\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)", + "value": 333, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)", + "value": 333, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)", + "value": 375, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)", + "value": 333, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA", + "value": 34472, + "unit": "ns", + "extra": "gctime=0\nmemory=4848\nallocs=166\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI", + "value": 1232134, + "unit": "ns", + "extra": "gctime=0\nmemory=46488\nallocs=555\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal", + "value": 274313, + "unit": "ns", + "extra": "gctime=0\nmemory=13344\nallocs=443\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU", + "value": 48590, + "unit": "ns", + "extra": "gctime=0\nmemory=6560\nallocs=178\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 6667, + "unit": "ns", + "extra": "gctime=0\nmemory=16512\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 7084, + "unit": "ns", + "extra": "gctime=0\nmemory=16512\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 7083, + "unit": "ns", + "extra": "gctime=0\nmemory=16512\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 7125, + "unit": "ns", + "extra": "gctime=0\nmemory=16512\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA", + "value": 208175.5, + "unit": "ns", + "extra": "gctime=0\nmemory=21216\nallocs=667\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI", + "value": 21273048.5, + "unit": "ns", + "extra": "gctime=0\nmemory=630104\nallocs=6067\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal", + "value": 4868583, + "unit": "ns", + "extra": "gctime=0\nmemory=159576\nallocs=5021\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU", + "value": 378833, + "unit": "ns", + "extra": "gctime=0\nmemory=46976\nallocs=1193\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s)", + "value": 291, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s)", + "value": 250, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA", + "value": 31850, + "unit": "ns", + "extra": "gctime=0\nmemory=1008\nallocs=38\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/oneAPI", + "value": 1289466, + "unit": "ns", + "extra": "gctime=0\nmemory=3808\nallocs=68\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/Metal", + "value": 251646, + "unit": "ns", + "extra": "gctime=0\nmemory=2496\nallocs=97\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/AMDGPU", + "value": 39190, + "unit": "ns", + "extra": "gctime=0\nmemory=1056\nallocs=57\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s)", + "value": 2833, + "unit": "ns", + "extra": "gctime=0\nmemory=4288\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s)", + "value": 2833, + "unit": "ns", + "extra": "gctime=0\nmemory=4288\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s)", + "value": 3417, + "unit": "ns", + "extra": "gctime=0\nmemory=4288\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s)", + "value": 4042, + "unit": "ns", + "extra": "gctime=0\nmemory=4288\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA", + "value": 184376, + "unit": "ns", + "extra": "gctime=0\nmemory=11104\nallocs=403\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/oneAPI", + "value": 8659489, + "unit": "ns", + "extra": "gctime=0\nmemory=146432\nallocs=1468\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/Metal", + "value": 932458, + "unit": "ns", + "extra": "gctime=0\nmemory=35560\nallocs=1230\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/AMDGPU", + "value": 151562, + "unit": "ns", + "extra": "gctime=0\nmemory=8960\nallocs=246\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 458125, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 428042, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 458937.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 454604.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 136899, + "unit": "ns", + "extra": "gctime=0\nmemory=13392\nallocs=309\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI", + "value": 5875893, + "unit": "ns", + "extra": "gctime=0\nmemory=215352\nallocs=2249\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 2797833.5, + "unit": "ns", + "extra": "gctime=0\nmemory=53696\nallocs=1710\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 386248.5, + "unit": "ns", + "extra": "gctime=0\nmemory=26400\nallocs=624\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 3807583, + "unit": "ns", + "extra": "gctime=0\nmemory=8408304\nallocs=123\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 3819354, + "unit": "ns", + "extra": "gctime=0\nmemory=8408304\nallocs=123\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 3823041, + "unit": "ns", + "extra": "gctime=0\nmemory=8408304\nallocs=123\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 3791563, + "unit": "ns", + "extra": "gctime=0\nmemory=8408304\nallocs=123\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 705185, + "unit": "ns", + "extra": "gctime=0\nmemory=69016\nallocs=1898\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI", + "value": 33229800, + "unit": "ns", + "extra": "gctime=0\nmemory=1009512\nallocs=9738\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 10648833, + "unit": "ns", + "extra": "gctime=0\nmemory=229784\nallocs=6908\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1313027.5, + "unit": "ns", + "extra": "gctime=0\nmemory=95880\nallocs=2041\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/forward/CPU/2 thread(s)", + "value": 49909083.5, + "unit": "ns", + "extra": "gctime=0\nmemory=33554496\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/forward/CPU/4 thread(s)", + "value": 25989542, + "unit": "ns", + "extra": "gctime=0\nmemory=33554496\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/forward/CPU/8 thread(s)", + "value": 26000792, + "unit": "ns", + "extra": "gctime=0\nmemory=33554496\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/forward/CPU/1 thread(s)", + "value": 97028334, + "unit": "ns", + "extra": "gctime=0\nmemory=33554496\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/forward/GPU/CUDA", + "value": 1599347, + "unit": "ns", + "extra": "gctime=0\nmemory=528\nallocs=24\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/forward/GPU/AMDGPU", + "value": 1007964, + "unit": "ns", + "extra": "gctime=0\nmemory=9328\nallocs=348\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s)", + "value": 154646916.5, + "unit": "ns", + "extra": "gctime=0\nmemory=134218832\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s)", + "value": 88926979.5, + "unit": "ns", + "extra": "gctime=0\nmemory=134218832\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s)", + "value": 89520791, + "unit": "ns", + "extra": "gctime=0\nmemory=134218832\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s)", + "value": 295691500, + "unit": "ns", + "extra": "gctime=0\nmemory=134218640\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/zygote/GPU/CUDA", + "value": 6481675, + "unit": "ns", + "extra": "gctime=0\nmemory=11840\nallocs=432\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/zygote/GPU/AMDGPU", + "value": 5572863, + "unit": "ns", + "extra": "gctime=0\nmemory=27072\nallocs=736\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s)", + "value": 19375, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s)", + "value": 14542, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s)", + "value": 13375, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s)", + "value": 15020.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA", + "value": 20415, + "unit": "ns", + "extra": "gctime=0\nmemory=2384\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/forward/GPU/oneAPI", + "value": 1158809, + "unit": "ns", + "extra": "gctime=0\nmemory=34072\nallocs=305\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/forward/GPU/Metal", + "value": 218854.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7176\nallocs=234\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/forward/GPU/AMDGPU", + "value": 26150, + "unit": "ns", + "extra": "gctime=0\nmemory=1712\nallocs=59\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s)", + "value": 11062, + "unit": "ns", + "extra": "gctime=0\nmemory=50496\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s)", + "value": 7812.5, + "unit": "ns", + "extra": "gctime=0\nmemory=50496\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s)", + "value": 8000, + "unit": "ns", + "extra": "gctime=0\nmemory=50496\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s)", + "value": 17270.5, + "unit": "ns", + "extra": "gctime=0\nmemory=50496\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA", + "value": 257691.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16840\nallocs=631\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/oneAPI", + "value": 9697921, + "unit": "ns", + "extra": "gctime=0\nmemory=279320\nallocs=2717\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/Metal", + "value": 1574333, + "unit": "ns", + "extra": "gctime=0\nmemory=55920\nallocs=1864\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/AMDGPU", + "value": 152551, + "unit": "ns", + "extra": "gctime=0\nmemory=16264\nallocs=380\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)", + "value": 8812.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)", + "value": 8729, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)", + "value": 10729, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)", + "value": 9042, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA", + "value": 115052, + "unit": "ns", + "extra": "gctime=0\nmemory=12368\nallocs=300\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI", + "value": 3630279.5, + "unit": "ns", + "extra": "gctime=0\nmemory=130792\nallocs=1422\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal", + "value": 828750, + "unit": "ns", + "extra": "gctime=0\nmemory=35440\nallocs=1140\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU", + "value": 240002, + "unit": "ns", + "extra": "gctime=0\nmemory=22368\nallocs=599\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 10209, + "unit": "ns", + "extra": "gctime=0\nmemory=22128\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 10958, + "unit": "ns", + "extra": "gctime=0\nmemory=22128\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 10395.5, + "unit": "ns", + "extra": "gctime=0\nmemory=22128\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 10020.5, + "unit": "ns", + "extra": "gctime=0\nmemory=22128\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA", + "value": 619046.5, + "unit": "ns", + "extra": "gctime=0\nmemory=59624\nallocs=1673\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI", + "value": 23101770, + "unit": "ns", + "extra": "gctime=0\nmemory=722832\nallocs=6886\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal", + "value": 5071833, + "unit": "ns", + "extra": "gctime=0\nmemory=177168\nallocs=5408\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU", + "value": 673066, + "unit": "ns", + "extra": "gctime=0\nmemory=61752\nallocs=1198\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)", + "value": 10125.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)", + "value": 10042, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)", + "value": 12333.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)", + "value": 10020.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA", + "value": 121219, + "unit": "ns", + "extra": "gctime=0\nmemory=11664\nallocs=282\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI", + "value": 3538452, + "unit": "ns", + "extra": "gctime=0\nmemory=125560\nallocs=1353\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal", + "value": 898750, + "unit": "ns", + "extra": "gctime=0\nmemory=33016\nallocs=1047\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU", + "value": 78111, + "unit": "ns", + "extra": "gctime=0\nmemory=15504\nallocs=337\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 13187.5, + "unit": "ns", + "extra": "gctime=0\nmemory=44512\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 16145.5, + "unit": "ns", + "extra": "gctime=0\nmemory=44512\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 15834, + "unit": "ns", + "extra": "gctime=0\nmemory=44512\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 15167, + "unit": "ns", + "extra": "gctime=0\nmemory=44512\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA", + "value": 581681, + "unit": "ns", + "extra": "gctime=0\nmemory=52312\nallocs=1515\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI", + "value": 18906270, + "unit": "ns", + "extra": "gctime=0\nmemory=659512\nallocs=6207\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal", + "value": 4652917, + "unit": "ns", + "extra": "gctime=0\nmemory=151056\nallocs=4562\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU", + "value": 361148.5, + "unit": "ns", + "extra": "gctime=0\nmemory=54808\nallocs=1115\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)", + "value": 458, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)", + "value": 584, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)", + "value": 625, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)", + "value": 542, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA", + "value": 34228, + "unit": "ns", + "extra": "gctime=0\nmemory=4848\nallocs=166\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI", + "value": 1202845.5, + "unit": "ns", + "extra": "gctime=0\nmemory=46488\nallocs=555\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal", + "value": 273833, + "unit": "ns", + "extra": "gctime=0\nmemory=13344\nallocs=443\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU", + "value": 210122, + "unit": "ns", + "extra": "gctime=0\nmemory=9712\nallocs=375\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 7292, + "unit": "ns", + "extra": "gctime=0\nmemory=17664\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 10250, + "unit": "ns", + "extra": "gctime=0\nmemory=17664\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 8687.5, + "unit": "ns", + "extra": "gctime=0\nmemory=17664\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 8208.5, + "unit": "ns", + "extra": "gctime=0\nmemory=17664\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA", + "value": 228645, + "unit": "ns", + "extra": "gctime=0\nmemory=24776\nallocs=833\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI", + "value": 23040318, + "unit": "ns", + "extra": "gctime=0\nmemory=667720\nallocs=6406\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal", + "value": 5564541.5, + "unit": "ns", + "extra": "gctime=0\nmemory=168400\nallocs=5288\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU", + "value": 682107, + "unit": "ns", + "extra": "gctime=0\nmemory=48360\nallocs=1226\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s)", + "value": 16541, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s)", + "value": 13667, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s)", + "value": 12208, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s)", + "value": 10166, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA", + "value": 22131, + "unit": "ns", + "extra": "gctime=0\nmemory=2384\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/forward/GPU/oneAPI", + "value": 1181307, + "unit": "ns", + "extra": "gctime=0\nmemory=34072\nallocs=305\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/forward/GPU/Metal", + "value": 208625, + "unit": "ns", + "extra": "gctime=0\nmemory=7176\nallocs=234\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/forward/GPU/AMDGPU", + "value": 193542, + "unit": "ns", + "extra": "gctime=0\nmemory=5248\nallocs=280\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)", + "value": 31917, + "unit": "ns", + "extra": "gctime=0\nmemory=67136\nallocs=14\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)", + "value": 32000, + "unit": "ns", + "extra": "gctime=0\nmemory=67136\nallocs=14\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)", + "value": 32333, + "unit": "ns", + "extra": "gctime=0\nmemory=67136\nallocs=14\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)", + "value": 31708, + "unit": "ns", + "extra": "gctime=0\nmemory=67136\nallocs=14\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA", + "value": 273430.5, + "unit": "ns", + "extra": "gctime=0\nmemory=19208\nallocs=736\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/oneAPI", + "value": 11641076, + "unit": "ns", + "extra": "gctime=0\nmemory=311984\nallocs=2991\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/Metal", + "value": 1721167, + "unit": "ns", + "extra": "gctime=0\nmemory=62696\nallocs=2079\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/AMDGPU", + "value": 603875, + "unit": "ns", + "extra": "gctime=0\nmemory=19656\nallocs=543\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 443417, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 439687.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 485917, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 460021, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 195147.5, + "unit": "ns", + "extra": "gctime=0\nmemory=12560\nallocs=283\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI", + "value": 5889014, + "unit": "ns", + "extra": "gctime=0\nmemory=210104\nallocs=2179\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1969208, + "unit": "ns", + "extra": "gctime=0\nmemory=50976\nallocs=1614\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 375948.5, + "unit": "ns", + "extra": "gctime=0\nmemory=30112\nallocs=739\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 3823541.5, + "unit": "ns", + "extra": "gctime=0\nmemory=8404848\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 3828833.5, + "unit": "ns", + "extra": "gctime=0\nmemory=8404848\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 3839500, + "unit": "ns", + "extra": "gctime=0\nmemory=8404848\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 3811708, + "unit": "ns", + "extra": "gctime=0\nmemory=8404848\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 537176, + "unit": "ns", + "extra": "gctime=0\nmemory=56296\nallocs=1601\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI", + "value": 28881412, + "unit": "ns", + "extra": "gctime=0\nmemory=835560\nallocs=7897\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 9161875, + "unit": "ns", + "extra": "gctime=0\nmemory=187416\nallocs=5573\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1386538.5, + "unit": "ns", + "extra": "gctime=0\nmemory=78600\nallocs=1717\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/forward/CPU/2 thread(s)", + "value": 783498542, + "unit": "ns", + "extra": "gctime=396417\nmemory=536870976\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/forward/CPU/4 thread(s)", + "value": 416342750, + "unit": "ns", + "extra": "gctime=406687.5\nmemory=536870976\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/forward/CPU/8 thread(s)", + "value": 417457250, + "unit": "ns", + "extra": "gctime=423750\nmemory=536870976\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/forward/CPU/1 thread(s)", + "value": 1521998333, + "unit": "ns", + "extra": "gctime=381416\nmemory=536870976\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/forward/GPU/CUDA", + "value": 22755687, + "unit": "ns", + "extra": "gctime=0\nmemory=528\nallocs=24\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/forward/GPU/AMDGPU", + "value": 14035069.5, + "unit": "ns", + "extra": "gctime=0\nmemory=20992\nallocs=348\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s)", + "value": 2525917708, + "unit": "ns", + "extra": "gctime=34605792\nmemory=2147484752\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s)", + "value": 1529494625, + "unit": "ns", + "extra": "gctime=38833334\nmemory=2147484752\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s)", + "value": 1522524250, + "unit": "ns", + "extra": "gctime=36779542\nmemory=2147484752\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s)", + "value": 4783169333, + "unit": "ns", + "extra": "gctime=30469208\nmemory=2147484560\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/zygote/GPU/CUDA", + "value": 378990704, + "unit": "ns", + "extra": "gctime=308149660\nmemory=12048\nallocs=437\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/zygote/GPU/AMDGPU", + "value": 89293520, + "unit": "ns", + "extra": "gctime=0\nmemory=65536\nallocs=806\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 79875, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 77417, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 79417, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 76584, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 209756, + "unit": "ns", + "extra": "gctime=0\nmemory=19024\nallocs=682\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI", + "value": 7877398.5, + "unit": "ns", + "extra": "gctime=0\nmemory=262168\nallocs=2474\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 534145.5, + "unit": "ns", + "extra": "gctime=0\nmemory=49976\nallocs=1562\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 110386, + "unit": "ns", + "extra": "gctime=0\nmemory=17824\nallocs=366\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 192958, + "unit": "ns", + "extra": "gctime=0\nmemory=2246800\nallocs=143\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 200479, + "unit": "ns", + "extra": "gctime=0\nmemory=2246800\nallocs=143\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 210146, + "unit": "ns", + "extra": "gctime=0\nmemory=2246800\nallocs=143\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 200666, + "unit": "ns", + "extra": "gctime=0\nmemory=2246800\nallocs=143\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 1037361, + "unit": "ns", + "extra": "gctime=0\nmemory=88272\nallocs=3262\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI", + "value": 44132920, + "unit": "ns", + "extra": "gctime=0\nmemory=1340752\nallocs=12085\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 6120979, + "unit": "ns", + "extra": "gctime=0\nmemory=275496\nallocs=8469\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 646276, + "unit": "ns", + "extra": "gctime=0\nmemory=75008\nallocs=1536\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/forward/CPU/2 thread(s)", + "value": 199891625, + "unit": "ns", + "extra": "gctime=0\nmemory=134217792\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/forward/CPU/4 thread(s)", + "value": 104029229, + "unit": "ns", + "extra": "gctime=0\nmemory=134217792\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/forward/CPU/8 thread(s)", + "value": 104078375, + "unit": "ns", + "extra": "gctime=0\nmemory=134217792\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/forward/CPU/1 thread(s)", + "value": 389069083, + "unit": "ns", + "extra": "gctime=0\nmemory=134217792\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/forward/GPU/CUDA", + "value": 5813154.5, + "unit": "ns", + "extra": "gctime=0\nmemory=528\nallocs=24\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/forward/GPU/AMDGPU", + "value": 3415713, + "unit": "ns", + "extra": "gctime=0\nmemory=11584\nallocs=348\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s)", + "value": 620198896, + "unit": "ns", + "extra": "gctime=2834042\nmemory=536872016\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s)", + "value": 353375979, + "unit": "ns", + "extra": "gctime=2844583.5\nmemory=536872016\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s)", + "value": 353541500, + "unit": "ns", + "extra": "gctime=2855416.5\nmemory=536872016\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s)", + "value": 1181436625, + "unit": "ns", + "extra": "gctime=2816125\nmemory=536871824\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/zygote/GPU/CUDA", + "value": 26635966, + "unit": "ns", + "extra": "gctime=0\nmemory=11840\nallocs=432\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/zygote/GPU/AMDGPU", + "value": 21931088, + "unit": "ns", + "extra": "gctime=0\nmemory=37312\nallocs=806\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 7250, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 5375, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 5417, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 9834, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 27550, + "unit": "ns", + "extra": "gctime=0\nmemory=3536\nallocs=119\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI", + "value": 1252191.5, + "unit": "ns", + "extra": "gctime=0\nmemory=48360\nallocs=569\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 514458, + "unit": "ns", + "extra": "gctime=0\nmemory=16032\nallocs=533\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 50410, + "unit": "ns", + "extra": "gctime=0\nmemory=6560\nallocs=165\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 253979, + "unit": "ns", + "extra": "gctime=0\nmemory=801664\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 221083, + "unit": "ns", + "extra": "gctime=0\nmemory=801664\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 223687, + "unit": "ns", + "extra": "gctime=0\nmemory=801664\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 206749.5, + "unit": "ns", + "extra": "gctime=0\nmemory=801664\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 223256, + "unit": "ns", + "extra": "gctime=0\nmemory=21872\nallocs=643\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI", + "value": 32181520.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1002224\nallocs=9852\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 8954479.5, + "unit": "ns", + "extra": "gctime=0\nmemory=250408\nallocs=7798\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 548940, + "unit": "ns", + "extra": "gctime=0\nmemory=77184\nallocs=1752\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)", + "value": 9562.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)", + "value": 9916.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)", + "value": 11334, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)", + "value": 8146.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA", + "value": 117820, + "unit": "ns", + "extra": "gctime=0\nmemory=11536\nallocs=274\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI", + "value": 3438688, + "unit": "ns", + "extra": "gctime=0\nmemory=125544\nallocs=1352\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal", + "value": 840083, + "unit": "ns", + "extra": "gctime=0\nmemory=32720\nallocs=1044\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU", + "value": 72920, + "unit": "ns", + "extra": "gctime=0\nmemory=15504\nallocs=337\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 7666.5, + "unit": "ns", + "extra": "gctime=0\nmemory=18224\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 10958, + "unit": "ns", + "extra": "gctime=0\nmemory=18224\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 9292, + "unit": "ns", + "extra": "gctime=0\nmemory=18224\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 7917, + "unit": "ns", + "extra": "gctime=0\nmemory=18224\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA", + "value": 524870, + "unit": "ns", + "extra": "gctime=0\nmemory=49832\nallocs=1430\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI", + "value": 19312212, + "unit": "ns", + "extra": "gctime=0\nmemory=621904\nallocs=5799\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal", + "value": 4524125, + "unit": "ns", + "extra": "gctime=0\nmemory=148704\nallocs=4570\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU", + "value": 322534, + "unit": "ns", + "extra": "gctime=0\nmemory=51576\nallocs=1028\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)", + "value": 500, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)", + "value": 709, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)", + "value": 750, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)", + "value": 542, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA", + "value": 26131, + "unit": "ns", + "extra": "gctime=0\nmemory=3808\nallocs=128\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI", + "value": 1259871, + "unit": "ns", + "extra": "gctime=0\nmemory=48312\nallocs=569\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal", + "value": 431000, + "unit": "ns", + "extra": "gctime=0\nmemory=15928\nallocs=533\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU", + "value": 48610, + "unit": "ns", + "extra": "gctime=0\nmemory=6832\nallocs=185\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 9583, + "unit": "ns", + "extra": "gctime=0\nmemory=40640\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 10208, + "unit": "ns", + "extra": "gctime=0\nmemory=40640\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 13291.5, + "unit": "ns", + "extra": "gctime=0\nmemory=40640\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 9291, + "unit": "ns", + "extra": "gctime=0\nmemory=40640\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA", + "value": 252396, + "unit": "ns", + "extra": "gctime=0\nmemory=22464\nallocs=672\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI", + "value": 23727040, + "unit": "ns", + "extra": "gctime=0\nmemory=754208\nallocs=7436\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal", + "value": 5562917, + "unit": "ns", + "extra": "gctime=0\nmemory=183792\nallocs=5742\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU", + "value": 401504, + "unit": "ns", + "extra": "gctime=0\nmemory=55936\nallocs=1390\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s)", + "value": 105916.5, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s)", + "value": 84792, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s)", + "value": 86084, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s)", + "value": 146500, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA", + "value": 25162, + "unit": "ns", + "extra": "gctime=0\nmemory=2384\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/forward/GPU/oneAPI", + "value": 1227978.5, + "unit": "ns", + "extra": "gctime=0\nmemory=34072\nallocs=305\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/forward/GPU/Metal", + "value": 260375, + "unit": "ns", + "extra": "gctime=0\nmemory=7184\nallocs=234\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/forward/GPU/AMDGPU", + "value": 191642, + "unit": "ns", + "extra": "gctime=0\nmemory=4144\nallocs=211\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)", + "value": 478125, + "unit": "ns", + "extra": "gctime=0\nmemory=1051456\nallocs=18\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)", + "value": 478584, + "unit": "ns", + "extra": "gctime=0\nmemory=1051456\nallocs=18\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)", + "value": 479208.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1051456\nallocs=18\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)", + "value": 478208, + "unit": "ns", + "extra": "gctime=0\nmemory=1051456\nallocs=18\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA", + "value": 232894, + "unit": "ns", + "extra": "gctime=0\nmemory=19208\nallocs=736\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/oneAPI", + "value": 11980194.5, + "unit": "ns", + "extra": "gctime=0\nmemory=312080\nallocs=2997\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/Metal", + "value": 2204709, + "unit": "ns", + "extra": "gctime=0\nmemory=70856\nallocs=2355\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/AMDGPU", + "value": 618666, + "unit": "ns", + "extra": "gctime=0\nmemory=20104\nallocs=571\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/forward/CPU/2 thread(s)", + "value": 6208, + "unit": "ns", + "extra": "gctime=0\nmemory=32832\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/forward/CPU/4 thread(s)", + "value": 7417, + "unit": "ns", + "extra": "gctime=0\nmemory=32832\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/forward/CPU/8 thread(s)", + "value": 7417, + "unit": "ns", + "extra": "gctime=0\nmemory=32832\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/forward/CPU/1 thread(s)", + "value": 6562.5, + "unit": "ns", + "extra": "gctime=0\nmemory=32832\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/forward/GPU/CUDA", + "value": 16916.5, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/forward/GPU/AMDGPU", + "value": 72841, + "unit": "ns", + "extra": "gctime=0\nmemory=2848\nallocs=85\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s)", + "value": 11625.5, + "unit": "ns", + "extra": "gctime=0\nmemory=132176\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s)", + "value": 10917, + "unit": "ns", + "extra": "gctime=0\nmemory=132176\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s)", + "value": 10875, + "unit": "ns", + "extra": "gctime=0\nmemory=132176\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s)", + "value": 16958, + "unit": "ns", + "extra": "gctime=0\nmemory=131984\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/zygote/GPU/CUDA", + "value": 213980.5, + "unit": "ns", + "extra": "gctime=0\nmemory=11808\nallocs=430\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/zygote/GPU/AMDGPU", + "value": 373474, + "unit": "ns", + "extra": "gctime=0\nmemory=17472\nallocs=411\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/forward/CPU/2 thread(s)", + "value": 38791, + "unit": "ns", + "extra": "gctime=0\nmemory=131136\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/forward/CPU/4 thread(s)", + "value": 50417, + "unit": "ns", + "extra": "gctime=0\nmemory=131136\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/forward/CPU/8 thread(s)", + "value": 50833, + "unit": "ns", + "extra": "gctime=0\nmemory=131136\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/forward/CPU/1 thread(s)", + "value": 13791.5, + "unit": "ns", + "extra": "gctime=0\nmemory=131136\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/forward/GPU/CUDA", + "value": 22049, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/forward/GPU/AMDGPU", + "value": 87401, + "unit": "ns", + "extra": "gctime=0\nmemory=4640\nallocs=56\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s)", + "value": 35937, + "unit": "ns", + "extra": "gctime=0\nmemory=525392\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s)", + "value": 28958, + "unit": "ns", + "extra": "gctime=0\nmemory=525392\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s)", + "value": 30000, + "unit": "ns", + "extra": "gctime=0\nmemory=525392\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s)", + "value": 57167, + "unit": "ns", + "extra": "gctime=0\nmemory=525200\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/zygote/GPU/CUDA", + "value": 192727, + "unit": "ns", + "extra": "gctime=0\nmemory=11808\nallocs=430\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/zygote/GPU/AMDGPU", + "value": 404644, + "unit": "ns", + "extra": "gctime=0\nmemory=24016\nallocs=397\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s)", + "value": 1833, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s)", + "value": 1917, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s)", + "value": 2250, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s)", + "value": 1708, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA", + "value": 20655, + "unit": "ns", + "extra": "gctime=0\nmemory=2384\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/forward/GPU/oneAPI", + "value": 1133358.5, + "unit": "ns", + "extra": "gctime=0\nmemory=34072\nallocs=305\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/forward/GPU/Metal", + "value": 299917, + "unit": "ns", + "extra": "gctime=0\nmemory=6896\nallocs=232\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/forward/GPU/AMDGPU", + "value": 28851, + "unit": "ns", + "extra": "gctime=0\nmemory=1536\nallocs=48\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s)", + "value": 2375, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s)", + "value": 2292, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s)", + "value": 2541, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s)", + "value": 2229, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA", + "value": 203780.5, + "unit": "ns", + "extra": "gctime=0\nmemory=14616\nallocs=562\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/oneAPI", + "value": 9388339, + "unit": "ns", + "extra": "gctime=0\nmemory=241744\nallocs=2311\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/Metal", + "value": 1600062, + "unit": "ns", + "extra": "gctime=0\nmemory=54448\nallocs=1844\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/AMDGPU", + "value": 138732, + "unit": "ns", + "extra": "gctime=0\nmemory=13544\nallocs=325\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)", + "value": 4833, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)", + "value": 4625.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)", + "value": 6416, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)", + "value": 6062.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA", + "value": 145657.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13104\nallocs=537\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI", + "value": 6033040.5, + "unit": "ns", + "extra": "gctime=0\nmemory=176792\nallocs=1671\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal", + "value": 683125, + "unit": "ns", + "extra": "gctime=0\nmemory=37632\nallocs=1263\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU", + "value": 72790, + "unit": "ns", + "extra": "gctime=0\nmemory=10944\nallocs=273\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 8208.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16720\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 9083, + "unit": "ns", + "extra": "gctime=0\nmemory=16720\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 8875, + "unit": "ns", + "extra": "gctime=0\nmemory=16720\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 8458, + "unit": "ns", + "extra": "gctime=0\nmemory=16720\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA", + "value": 877669.5, + "unit": "ns", + "extra": "gctime=0\nmemory=73616\nallocs=2910\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI", + "value": 37955172.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1116568\nallocs=10164\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal", + "value": 5604375, + "unit": "ns", + "extra": "gctime=0\nmemory=232120\nallocs=7694\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU", + "value": 390363, + "unit": "ns", + "extra": "gctime=0\nmemory=55248\nallocs=1184\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 56833, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 56958, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 56958, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 57916, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 37730, + "unit": "ns", + "extra": "gctime=0\nmemory=4448\nallocs=152\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI", + "value": 1286751, + "unit": "ns", + "extra": "gctime=0\nmemory=46568\nallocs=557\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 396312.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13760\nallocs=447\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 207312, + "unit": "ns", + "extra": "gctime=0\nmemory=9872\nallocs=382\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 456812.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1063696\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 469500, + "unit": "ns", + "extra": "gctime=0\nmemory=1063696\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 466417, + "unit": "ns", + "extra": "gctime=0\nmemory=1063696\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 433667, + "unit": "ns", + "extra": "gctime=0\nmemory=1063696\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 269341, + "unit": "ns", + "extra": "gctime=0\nmemory=26856\nallocs=873\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI", + "value": 26905233, + "unit": "ns", + "extra": "gctime=0\nmemory=876528\nallocs=8416\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 8032459, + "unit": "ns", + "extra": "gctime=0\nmemory=221464\nallocs=6774\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 801642.5, + "unit": "ns", + "extra": "gctime=0\nmemory=78280\nallocs=1666\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/forward/CPU/2 thread(s)", + "value": 3319708, + "unit": "ns", + "extra": "gctime=0\nmemory=8388672\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/forward/CPU/4 thread(s)", + "value": 1770417, + "unit": "ns", + "extra": "gctime=0\nmemory=8388672\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/forward/CPU/8 thread(s)", + "value": 1768875, + "unit": "ns", + "extra": "gctime=0\nmemory=8388672\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/forward/CPU/1 thread(s)", + "value": 6289750, + "unit": "ns", + "extra": "gctime=0\nmemory=8388672\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/forward/GPU/CUDA", + "value": 204342, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/forward/GPU/AMDGPU", + "value": 207902, + "unit": "ns", + "extra": "gctime=0\nmemory=6592\nallocs=178\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s)", + "value": 11527396, + "unit": "ns", + "extra": "gctime=0\nmemory=33555536\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s)", + "value": 6577354, + "unit": "ns", + "extra": "gctime=0\nmemory=33555536\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s)", + "value": 6605937.5, + "unit": "ns", + "extra": "gctime=0\nmemory=33555536\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s)", + "value": 21027166.5, + "unit": "ns", + "extra": "gctime=0\nmemory=33555344\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/zygote/GPU/CUDA", + "value": 736076, + "unit": "ns", + "extra": "gctime=0\nmemory=11808\nallocs=430\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/zygote/GPU/AMDGPU", + "value": 1060810, + "unit": "ns", + "extra": "gctime=0\nmemory=28432\nallocs=540\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)", + "value": 7042, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)", + "value": 5437, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)", + "value": 7875, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)", + "value": 5750, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA", + "value": 138120, + "unit": "ns", + "extra": "gctime=0\nmemory=11984\nallocs=467\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI", + "value": 5483981.5, + "unit": "ns", + "extra": "gctime=0\nmemory=175480\nallocs=1669\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal", + "value": 773875, + "unit": "ns", + "extra": "gctime=0\nmemory=36672\nallocs=1235\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU", + "value": 63911, + "unit": "ns", + "extra": "gctime=0\nmemory=10912\nallocs=271\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 7521, + "unit": "ns", + "extra": "gctime=0\nmemory=12560\nallocs=85\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 12312, + "unit": "ns", + "extra": "gctime=0\nmemory=12560\nallocs=85\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 9416, + "unit": "ns", + "extra": "gctime=0\nmemory=12560\nallocs=85\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 7375, + "unit": "ns", + "extra": "gctime=0\nmemory=12560\nallocs=85\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA", + "value": 753344, + "unit": "ns", + "extra": "gctime=0\nmemory=68080\nallocs=2685\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI", + "value": 33467647, + "unit": "ns", + "extra": "gctime=0\nmemory=1039304\nallocs=9369\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal", + "value": 5418021, + "unit": "ns", + "extra": "gctime=0\nmemory=216816\nallocs=7156\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU", + "value": 388693.5, + "unit": "ns", + "extra": "gctime=0\nmemory=49984\nallocs=1082\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 96375, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 94833, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 129333, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 98250, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 151062, + "unit": "ns", + "extra": "gctime=0\nmemory=13392\nallocs=309\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI", + "value": 5916315, + "unit": "ns", + "extra": "gctime=0\nmemory=215352\nallocs=2249\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 2030167, + "unit": "ns", + "extra": "gctime=0\nmemory=53696\nallocs=1710\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 186551, + "unit": "ns", + "extra": "gctime=0\nmemory=24640\nallocs=514\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 2026541.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7359616\nallocs=120\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 2017917, + "unit": "ns", + "extra": "gctime=0\nmemory=7359616\nallocs=120\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 2027917, + "unit": "ns", + "extra": "gctime=0\nmemory=7359616\nallocs=120\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 2021709, + "unit": "ns", + "extra": "gctime=0\nmemory=7359616\nallocs=120\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 708592.5, + "unit": "ns", + "extra": "gctime=0\nmemory=67176\nallocs=1814\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI", + "value": 32150977, + "unit": "ns", + "extra": "gctime=0\nmemory=1004504\nallocs=9670\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 10464750, + "unit": "ns", + "extra": "gctime=0\nmemory=228040\nallocs=6874\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1123261, + "unit": "ns", + "extra": "gctime=0\nmemory=92664\nallocs=1884\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/forward/CPU/2 thread(s)", + "value": 34709, + "unit": "ns", + "extra": "gctime=0\nmemory=128\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/forward/CPU/4 thread(s)", + "value": 33959, + "unit": "ns", + "extra": "gctime=0\nmemory=128\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/forward/CPU/8 thread(s)", + "value": 33292, + "unit": "ns", + "extra": "gctime=0\nmemory=128\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/forward/CPU/1 thread(s)", + "value": 583, + "unit": "ns", + "extra": "gctime=0\nmemory=128\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/forward/GPU/CUDA", + "value": 15533, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/forward/GPU/AMDGPU", + "value": 80771, + "unit": "ns", + "extra": "gctime=0\nmemory=2272\nallocs=94\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s)", + "value": 2542, + "unit": "ns", + "extra": "gctime=0\nmemory=1360\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s)", + "value": 3084, + "unit": "ns", + "extra": "gctime=0\nmemory=1360\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s)", + "value": 3500, + "unit": "ns", + "extra": "gctime=0\nmemory=1360\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s)", + "value": 2250, + "unit": "ns", + "extra": "gctime=0\nmemory=1168\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/zygote/GPU/CUDA", + "value": 137499, + "unit": "ns", + "extra": "gctime=0\nmemory=9456\nallocs=360\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/zygote/GPU/AMDGPU", + "value": 348703, + "unit": "ns", + "extra": "gctime=0\nmemory=11888\nallocs=326\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 7250, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 5416, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 5416, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 9541, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 36561, + "unit": "ns", + "extra": "gctime=0\nmemory=4448\nallocs=152\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI", + "value": 1263009, + "unit": "ns", + "extra": "gctime=0\nmemory=46568\nallocs=557\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 585833, + "unit": "ns", + "extra": "gctime=0\nmemory=13760\nallocs=447\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 49680, + "unit": "ns", + "extra": "gctime=0\nmemory=5952\nallocs=137\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 248625, + "unit": "ns", + "extra": "gctime=0\nmemory=801424\nallocs=105\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 220771, + "unit": "ns", + "extra": "gctime=0\nmemory=801424\nallocs=105\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 222750, + "unit": "ns", + "extra": "gctime=0\nmemory=801424\nallocs=105\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 206333, + "unit": "ns", + "extra": "gctime=0\nmemory=801424\nallocs=105\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 245692, + "unit": "ns", + "extra": "gctime=0\nmemory=23184\nallocs=707\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI", + "value": 26025502.5, + "unit": "ns", + "extra": "gctime=0\nmemory=838912\nallocs=8077\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 7845292, + "unit": "ns", + "extra": "gctime=0\nmemory=212160\nallocs=6509\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 529795, + "unit": "ns", + "extra": "gctime=0\nmemory=75120\nallocs=1522\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s)", + "value": 3917, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s)", + "value": 3958, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s)", + "value": 3917, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s)", + "value": 3916, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA", + "value": 21976, + "unit": "ns", + "extra": "gctime=0\nmemory=1056\nallocs=47\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/oneAPI", + "value": 2103022, + "unit": "ns", + "extra": "gctime=0\nmemory=31944\nallocs=279\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/Metal", + "value": 236000, + "unit": "ns", + "extra": "gctime=0\nmemory=7352\nallocs=263\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/AMDGPU", + "value": 47710, + "unit": "ns", + "extra": "gctime=0\nmemory=1776\nallocs=66\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s)", + "value": 14666, + "unit": "ns", + "extra": "gctime=0\nmemory=54640\nallocs=9\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s)", + "value": 14709, + "unit": "ns", + "extra": "gctime=0\nmemory=54640\nallocs=9\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s)", + "value": 14791, + "unit": "ns", + "extra": "gctime=0\nmemory=54640\nallocs=9\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s)", + "value": 14542, + "unit": "ns", + "extra": "gctime=0\nmemory=54640\nallocs=9\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA", + "value": 309715, + "unit": "ns", + "extra": "gctime=0\nmemory=15424\nallocs=541\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/oneAPI", + "value": 10728562, + "unit": "ns", + "extra": "gctime=0\nmemory=252312\nallocs=2550\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/Metal", + "value": 1017000, + "unit": "ns", + "extra": "gctime=0\nmemory=54272\nallocs=1837\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/AMDGPU", + "value": 203842, + "unit": "ns", + "extra": "gctime=0\nmemory=16144\nallocs=389\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 106833, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 101208, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 131666, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 140875, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 152503, + "unit": "ns", + "extra": "gctime=0\nmemory=13392\nallocs=309\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI", + "value": 5778842, + "unit": "ns", + "extra": "gctime=0\nmemory=215352\nallocs=2249\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 2123209, + "unit": "ns", + "extra": "gctime=0\nmemory=53696\nallocs=1710\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 208752, + "unit": "ns", + "extra": "gctime=0\nmemory=24448\nallocs=502\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 1922375, + "unit": "ns", + "extra": "gctime=0\nmemory=6310928\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 1930291.5, + "unit": "ns", + "extra": "gctime=0\nmemory=6310928\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 1900083.5, + "unit": "ns", + "extra": "gctime=0\nmemory=6310928\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 1903042, + "unit": "ns", + "extra": "gctime=0\nmemory=6310928\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 693606, + "unit": "ns", + "extra": "gctime=0\nmemory=65584\nallocs=1732\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI", + "value": 29982332, + "unit": "ns", + "extra": "gctime=0\nmemory=971336\nallocs=9400\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 10251250, + "unit": "ns", + "extra": "gctime=0\nmemory=220544\nallocs=6654\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1073896, + "unit": "ns", + "extra": "gctime=0\nmemory=93264\nallocs=1967\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 20416, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 19250, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 21209, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 18208, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 109595, + "unit": "ns", + "extra": "gctime=0\nmemory=12544\nallocs=282\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI", + "value": 3350251.5, + "unit": "ns", + "extra": "gctime=0\nmemory=130312\nallocs=1357\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 1333937.5, + "unit": "ns", + "extra": "gctime=0\nmemory=34288\nallocs=1053\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 82865.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16816\nallocs=344\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 225166.5, + "unit": "ns", + "extra": "gctime=0\nmemory=933632\nallocs=108\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 217875, + "unit": "ns", + "extra": "gctime=0\nmemory=933632\nallocs=108\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 217167, + "unit": "ns", + "extra": "gctime=0\nmemory=933632\nallocs=108\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 216229, + "unit": "ns", + "extra": "gctime=0\nmemory=933632\nallocs=108\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 519722, + "unit": "ns", + "extra": "gctime=0\nmemory=54424\nallocs=1515\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI", + "value": 20040625, + "unit": "ns", + "extra": "gctime=0\nmemory=672648\nallocs=6180\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 5978229, + "unit": "ns", + "extra": "gctime=0\nmemory=167336\nallocs=4921\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 493880, + "unit": "ns", + "extra": "gctime=0\nmemory=58408\nallocs=1172\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/forward/CPU/2 thread(s)", + "value": 25125, + "unit": "ns", + "extra": "gctime=0\nmemory=4352\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/forward/CPU/4 thread(s)", + "value": 28250, + "unit": "ns", + "extra": "gctime=0\nmemory=4352\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/forward/CPU/8 thread(s)", + "value": 26375, + "unit": "ns", + "extra": "gctime=0\nmemory=4352\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/forward/CPU/1 thread(s)", + "value": 1291, + "unit": "ns", + "extra": "gctime=0\nmemory=4352\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/forward/GPU/CUDA", + "value": 16162, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/forward/GPU/AMDGPU", + "value": 87991, + "unit": "ns", + "extra": "gctime=0\nmemory=2352\nallocs=99\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s)", + "value": 4416.5, + "unit": "ns", + "extra": "gctime=0\nmemory=18256\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s)", + "value": 4979, + "unit": "ns", + "extra": "gctime=0\nmemory=18256\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s)", + "value": 5166.5, + "unit": "ns", + "extra": "gctime=0\nmemory=18256\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s)", + "value": 4459, + "unit": "ns", + "extra": "gctime=0\nmemory=18064\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/zygote/GPU/CUDA", + "value": 207084, + "unit": "ns", + "extra": "gctime=0\nmemory=11808\nallocs=430\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/zygote/GPU/AMDGPU", + "value": 387704, + "unit": "ns", + "extra": "gctime=0\nmemory=15424\nallocs=418\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 307167, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 306625, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 308209, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 304833, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 231087.5, + "unit": "ns", + "extra": "gctime=0\nmemory=17904\nallocs=612\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI", + "value": 7762114, + "unit": "ns", + "extra": "gctime=0\nmemory=259912\nallocs=2467\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 1108291, + "unit": "ns", + "extra": "gctime=0\nmemory=48184\nallocs=1530\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 278113, + "unit": "ns", + "extra": "gctime=0\nmemory=24752\nallocs=661\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 530708, + "unit": "ns", + "extra": "gctime=0\nmemory=1321456\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 567750, + "unit": "ns", + "extra": "gctime=0\nmemory=1321456\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 532125, + "unit": "ns", + "extra": "gctime=0\nmemory=1321456\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 539333, + "unit": "ns", + "extra": "gctime=0\nmemory=1321456\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 1075601, + "unit": "ns", + "extra": "gctime=0\nmemory=86336\nallocs=3212\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI", + "value": 45020857, + "unit": "ns", + "extra": "gctime=0\nmemory=1323680\nallocs=11817\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 6232104.5, + "unit": "ns", + "extra": "gctime=0\nmemory=268928\nallocs=8303\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 881559, + "unit": "ns", + "extra": "gctime=0\nmemory=71008\nallocs=1467\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 20917, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 20124.5, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 23417, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 20375, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 114759.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13376\nallocs=308\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI", + "value": 3643976.5, + "unit": "ns", + "extra": "gctime=0\nmemory=135560\nallocs=1427\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 1445375, + "unit": "ns", + "extra": "gctime=0\nmemory=37008\nallocs=1149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 84891, + "unit": "ns", + "extra": "gctime=0\nmemory=17280\nallocs=348\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 217416, + "unit": "ns", + "extra": "gctime=0\nmemory=805680\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 214604.5, + "unit": "ns", + "extra": "gctime=0\nmemory=805680\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 213771, + "unit": "ns", + "extra": "gctime=0\nmemory=805680\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 213083, + "unit": "ns", + "extra": "gctime=0\nmemory=805680\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 744609, + "unit": "ns", + "extra": "gctime=0\nmemory=65552\nallocs=1730\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI", + "value": 24441546.5, + "unit": "ns", + "extra": "gctime=0\nmemory=813336\nallocs=7745\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 7247292, + "unit": "ns", + "extra": "gctime=0\nmemory=202848\nallocs=6053\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 541395, + "unit": "ns", + "extra": "gctime=0\nmemory=72976\nallocs=1406\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)", + "value": 7625, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)", + "value": 7042, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)", + "value": 8583, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)", + "value": 6500, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA", + "value": 141892.5, + "unit": "ns", + "extra": "gctime=0\nmemory=11984\nallocs=467\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI", + "value": 5536910, + "unit": "ns", + "extra": "gctime=0\nmemory=175480\nallocs=1669\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal", + "value": 779167, + "unit": "ns", + "extra": "gctime=0\nmemory=36872\nallocs=1232\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU", + "value": 71560, + "unit": "ns", + "extra": "gctime=0\nmemory=11056\nallocs=280\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 9667, + "unit": "ns", + "extra": "gctime=0\nmemory=42768\nallocs=85\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 10500, + "unit": "ns", + "extra": "gctime=0\nmemory=42768\nallocs=85\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 10563, + "unit": "ns", + "extra": "gctime=0\nmemory=42768\nallocs=85\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 10417, + "unit": "ns", + "extra": "gctime=0\nmemory=42768\nallocs=85\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA", + "value": 826962.5, + "unit": "ns", + "extra": "gctime=0\nmemory=70304\nallocs=2754\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI", + "value": 41129625, + "unit": "ns", + "extra": "gctime=0\nmemory=1076880\nallocs=9775\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal", + "value": 5329375, + "unit": "ns", + "extra": "gctime=0\nmemory=218944\nallocs=7134\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU", + "value": 403524, + "unit": "ns", + "extra": "gctime=0\nmemory=52848\nallocs=1146\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)", + "value": 6792, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)", + "value": 5417, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)", + "value": 7666, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)", + "value": 5292, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA", + "value": 144458, + "unit": "ns", + "extra": "gctime=0\nmemory=11984\nallocs=467\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI", + "value": 5612185, + "unit": "ns", + "extra": "gctime=0\nmemory=175480\nallocs=1669\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal", + "value": 779625, + "unit": "ns", + "extra": "gctime=0\nmemory=36672\nallocs=1235\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU", + "value": 70921, + "unit": "ns", + "extra": "gctime=0\nmemory=10896\nallocs=270\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 7750, + "unit": "ns", + "extra": "gctime=0\nmemory=13872\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 7833, + "unit": "ns", + "extra": "gctime=0\nmemory=13872\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 8042, + "unit": "ns", + "extra": "gctime=0\nmemory=13872\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 7791, + "unit": "ns", + "extra": "gctime=0\nmemory=13872\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA", + "value": 785083.5, + "unit": "ns", + "extra": "gctime=0\nmemory=71200\nallocs=2816\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI", + "value": 37989213, + "unit": "ns", + "extra": "gctime=0\nmemory=1103992\nallocs=9917\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal", + "value": 5671167, + "unit": "ns", + "extra": "gctime=0\nmemory=229472\nallocs=7571\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU", + "value": 406834, + "unit": "ns", + "extra": "gctime=0\nmemory=51920\nallocs=1127\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/forward/CPU/2 thread(s)", + "value": 14571334, + "unit": "ns", + "extra": "gctime=0\nmemory=33554496\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/forward/CPU/4 thread(s)", + "value": 7693396, + "unit": "ns", + "extra": "gctime=0\nmemory=33554496\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/forward/CPU/8 thread(s)", + "value": 7710583.5, + "unit": "ns", + "extra": "gctime=0\nmemory=33554496\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/forward/CPU/1 thread(s)", + "value": 27712708, + "unit": "ns", + "extra": "gctime=0\nmemory=33554496\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/forward/GPU/CUDA", + "value": 529950, + "unit": "ns", + "extra": "gctime=0\nmemory=528\nallocs=24\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/forward/GPU/AMDGPU", + "value": 395489, + "unit": "ns", + "extra": "gctime=0\nmemory=20992\nallocs=348\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s)", + "value": 46570437.5, + "unit": "ns", + "extra": "gctime=0\nmemory=134218832\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s)", + "value": 26539458, + "unit": "ns", + "extra": "gctime=0\nmemory=134218832\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s)", + "value": 26599292, + "unit": "ns", + "extra": "gctime=0\nmemory=134218832\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s)", + "value": 85335000, + "unit": "ns", + "extra": "gctime=0\nmemory=134218640\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/zygote/GPU/CUDA", + "value": 2646944, + "unit": "ns", + "extra": "gctime=0\nmemory=11840\nallocs=432\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/zygote/GPU/AMDGPU", + "value": 3276491, + "unit": "ns", + "extra": "gctime=0\nmemory=62064\nallocs=736\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 66708, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 67666, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 70959, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 66958, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 119346, + "unit": "ns", + "extra": "gctime=0\nmemory=13376\nallocs=308\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI", + "value": 3516464.5, + "unit": "ns", + "extra": "gctime=0\nmemory=135560\nallocs=1427\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 1432937.5, + "unit": "ns", + "extra": "gctime=0\nmemory=37008\nallocs=1149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 237133, + "unit": "ns", + "extra": "gctime=0\nmemory=20112\nallocs=525\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 451041.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1068048\nallocs=123\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 440292, + "unit": "ns", + "extra": "gctime=0\nmemory=1068048\nallocs=123\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 442417, + "unit": "ns", + "extra": "gctime=0\nmemory=1068048\nallocs=123\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 445917, + "unit": "ns", + "extra": "gctime=0\nmemory=1068048\nallocs=123\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 730490, + "unit": "ns", + "extra": "gctime=0\nmemory=68984\nallocs=1896\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI", + "value": 27188092.5, + "unit": "ns", + "extra": "gctime=0\nmemory=851512\nallocs=8083\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 7468750, + "unit": "ns", + "extra": "gctime=0\nmemory=212712\nallocs=6323\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 803378, + "unit": "ns", + "extra": "gctime=0\nmemory=76360\nallocs=1562\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)", + "value": 500, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)", + "value": 584, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)", + "value": 625, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)", + "value": 542, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA", + "value": 32498, + "unit": "ns", + "extra": "gctime=0\nmemory=4112\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI", + "value": 1224580, + "unit": "ns", + "extra": "gctime=0\nmemory=46520\nallocs=557\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal", + "value": 283917, + "unit": "ns", + "extra": "gctime=0\nmemory=13656\nallocs=447\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU", + "value": 49800, + "unit": "ns", + "extra": "gctime=0\nmemory=6560\nallocs=178\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 8458, + "unit": "ns", + "extra": "gctime=0\nmemory=44528\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 9750, + "unit": "ns", + "extra": "gctime=0\nmemory=44528\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 9312.5, + "unit": "ns", + "extra": "gctime=0\nmemory=44528\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 8542, + "unit": "ns", + "extra": "gctime=0\nmemory=44528\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA", + "value": 287352.5, + "unit": "ns", + "extra": "gctime=0\nmemory=25016\nallocs=818\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI", + "value": 21281949, + "unit": "ns", + "extra": "gctime=0\nmemory=700864\nallocs=6752\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal", + "value": 5535041, + "unit": "ns", + "extra": "gctime=0\nmemory=168744\nallocs=5230\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU", + "value": 386714, + "unit": "ns", + "extra": "gctime=0\nmemory=49960\nallocs=1219\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s)", + "value": 9792, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s)", + "value": 9834, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s)", + "value": 9875, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s)", + "value": 9792, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA", + "value": 23340, + "unit": "ns", + "extra": "gctime=0\nmemory=960\nallocs=41\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/oneAPI", + "value": 2116807, + "unit": "ns", + "extra": "gctime=0\nmemory=32136\nallocs=279\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/Metal", + "value": 220083, + "unit": "ns", + "extra": "gctime=0\nmemory=7384\nallocs=263\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/AMDGPU", + "value": 218562, + "unit": "ns", + "extra": "gctime=0\nmemory=8240\nallocs=328\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)", + "value": 45542, + "unit": "ns", + "extra": "gctime=0\nmemory=88640\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)", + "value": 45750, + "unit": "ns", + "extra": "gctime=0\nmemory=88640\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)", + "value": 46167, + "unit": "ns", + "extra": "gctime=0\nmemory=88640\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)", + "value": 45667, + "unit": "ns", + "extra": "gctime=0\nmemory=88640\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA", + "value": 289775.5, + "unit": "ns", + "extra": "gctime=0\nmemory=18088\nallocs=683\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/oneAPI", + "value": 10739963, + "unit": "ns", + "extra": "gctime=0\nmemory=249376\nallocs=2424\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/Metal", + "value": 960916, + "unit": "ns", + "extra": "gctime=0\nmemory=51536\nallocs=1689\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/AMDGPU", + "value": 626776, + "unit": "ns", + "extra": "gctime=0\nmemory=15160\nallocs=420\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 56667, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 56916, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 56834, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 57709, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 29069, + "unit": "ns", + "extra": "gctime=0\nmemory=3536\nallocs=119\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI", + "value": 1186389, + "unit": "ns", + "extra": "gctime=0\nmemory=48360\nallocs=569\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 441250, + "unit": "ns", + "extra": "gctime=0\nmemory=16032\nallocs=533\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 205872, + "unit": "ns", + "extra": "gctime=0\nmemory=9648\nallocs=358\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 461479, + "unit": "ns", + "extra": "gctime=0\nmemory=1063936\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 465209, + "unit": "ns", + "extra": "gctime=0\nmemory=1063936\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 465125, + "unit": "ns", + "extra": "gctime=0\nmemory=1063936\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 443500.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1063936\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 249346, + "unit": "ns", + "extra": "gctime=0\nmemory=25544\nallocs=809\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI", + "value": 31772620.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1039840\nallocs=10191\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 9164791, + "unit": "ns", + "extra": "gctime=0\nmemory=259760\nallocs=8066\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 856638, + "unit": "ns", + "extra": "gctime=0\nmemory=79816\nallocs=1863\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 641875, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 638437.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 648313, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 648229, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 205463, + "unit": "ns", + "extra": "gctime=0\nmemory=19024\nallocs=682\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI", + "value": 8583572, + "unit": "ns", + "extra": "gctime=0\nmemory=262264\nallocs=2480\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1350479.5, + "unit": "ns", + "extra": "gctime=0\nmemory=58128\nallocs=1840\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 256632, + "unit": "ns", + "extra": "gctime=0\nmemory=18560\nallocs=412\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 2249083, + "unit": "ns", + "extra": "gctime=0\nmemory=17901328\nallocs=145\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 2197292, + "unit": "ns", + "extra": "gctime=0\nmemory=17901328\nallocs=145\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 2231624.5, + "unit": "ns", + "extra": "gctime=0\nmemory=17901328\nallocs=145\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 2127521, + "unit": "ns", + "extra": "gctime=0\nmemory=17901328\nallocs=145\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 979615, + "unit": "ns", + "extra": "gctime=0\nmemory=88272\nallocs=3262\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI", + "value": 48530388.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1341056\nallocs=12104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 7008083, + "unit": "ns", + "extra": "gctime=0\nmemory=299952\nallocs=9303\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1383583, + "unit": "ns", + "extra": "gctime=0\nmemory=78336\nallocs=1597\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 20291, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 19958, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 23479, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 19875, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 114769, + "unit": "ns", + "extra": "gctime=0\nmemory=13376\nallocs=308\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI", + "value": 3457789, + "unit": "ns", + "extra": "gctime=0\nmemory=135560\nallocs=1427\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 1444500, + "unit": "ns", + "extra": "gctime=0\nmemory=37008\nallocs=1149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 82891, + "unit": "ns", + "extra": "gctime=0\nmemory=17120\nallocs=338\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 231729.5, + "unit": "ns", + "extra": "gctime=0\nmemory=936864\nallocs=120\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 219834, + "unit": "ns", + "extra": "gctime=0\nmemory=936864\nallocs=120\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 220458, + "unit": "ns", + "extra": "gctime=0\nmemory=936864\nallocs=120\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 251750, + "unit": "ns", + "extra": "gctime=0\nmemory=936864\nallocs=120\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 734829, + "unit": "ns", + "extra": "gctime=0\nmemory=67144\nallocs=1812\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI", + "value": 25797489, + "unit": "ns", + "extra": "gctime=0\nmemory=846504\nallocs=8015\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 7651583, + "unit": "ns", + "extra": "gctime=0\nmemory=210344\nallocs=6273\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 562615, + "unit": "ns", + "extra": "gctime=0\nmemory=74440\nallocs=1452\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)", + "value": 542, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)", + "value": 583, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)", + "value": 625, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)", + "value": 542, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA", + "value": 23642, + "unit": "ns", + "extra": "gctime=0\nmemory=3072\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI", + "value": 1220028, + "unit": "ns", + "extra": "gctime=0\nmemory=48312\nallocs=569\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal", + "value": 450854, + "unit": "ns", + "extra": "gctime=0\nmemory=15928\nallocs=533\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU", + "value": 49881, + "unit": "ns", + "extra": "gctime=0\nmemory=6848\nallocs=186\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 9020.5, + "unit": "ns", + "extra": "gctime=0\nmemory=44992\nallocs=89\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 10500, + "unit": "ns", + "extra": "gctime=0\nmemory=44992\nallocs=89\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 10708, + "unit": "ns", + "extra": "gctime=0\nmemory=44992\nallocs=89\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 9791, + "unit": "ns", + "extra": "gctime=0\nmemory=44992\nallocs=89\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA", + "value": 269830, + "unit": "ns", + "extra": "gctime=0\nmemory=24040\nallocs=754\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI", + "value": 25215014, + "unit": "ns", + "extra": "gctime=0\nmemory=787328\nallocs=7711\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal", + "value": 6045541, + "unit": "ns", + "extra": "gctime=0\nmemory=190792\nallocs=5955\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU", + "value": 434444.5, + "unit": "ns", + "extra": "gctime=0\nmemory=56920\nallocs=1407\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)", + "value": 8666, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)", + "value": 8542, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)", + "value": 11750, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)", + "value": 9875, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA", + "value": 122995, + "unit": "ns", + "extra": "gctime=0\nmemory=11536\nallocs=274\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI", + "value": 3279595, + "unit": "ns", + "extra": "gctime=0\nmemory=125544\nallocs=1352\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal", + "value": 841291, + "unit": "ns", + "extra": "gctime=0\nmemory=32720\nallocs=1044\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU", + "value": 73020.5, + "unit": "ns", + "extra": "gctime=0\nmemory=15600\nallocs=343\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 7688, + "unit": "ns", + "extra": "gctime=0\nmemory=17552\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 7958, + "unit": "ns", + "extra": "gctime=0\nmemory=17552\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 8354.5, + "unit": "ns", + "extra": "gctime=0\nmemory=17552\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 7750, + "unit": "ns", + "extra": "gctime=0\nmemory=17552\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA", + "value": 512178, + "unit": "ns", + "extra": "gctime=0\nmemory=48240\nallocs=1348\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI", + "value": 16877624.5, + "unit": "ns", + "extra": "gctime=0\nmemory=588736\nallocs=5529\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal", + "value": 4195083, + "unit": "ns", + "extra": "gctime=0\nmemory=141496\nallocs=4352\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU", + "value": 331043, + "unit": "ns", + "extra": "gctime=0\nmemory=50608\nallocs=1013\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s)", + "value": 1459, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s)", + "value": 1729.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s)", + "value": 2208, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s)", + "value": 1750, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA", + "value": 21937, + "unit": "ns", + "extra": "gctime=0\nmemory=2384\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/forward/GPU/oneAPI", + "value": 1190464, + "unit": "ns", + "extra": "gctime=0\nmemory=34072\nallocs=305\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/forward/GPU/Metal", + "value": 296791, + "unit": "ns", + "extra": "gctime=0\nmemory=6896\nallocs=232\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/forward/GPU/AMDGPU", + "value": 192412, + "unit": "ns", + "extra": "gctime=0\nmemory=4688\nallocs=245\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)", + "value": 3479.5, + "unit": "ns", + "extra": "gctime=0\nmemory=4800\nallocs=14\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)", + "value": 3375, + "unit": "ns", + "extra": "gctime=0\nmemory=4800\nallocs=14\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)", + "value": 3709, + "unit": "ns", + "extra": "gctime=0\nmemory=4800\nallocs=14\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)", + "value": 3333, + "unit": "ns", + "extra": "gctime=0\nmemory=4800\nallocs=14\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA", + "value": 223485, + "unit": "ns", + "extra": "gctime=0\nmemory=16984\nallocs=667\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/oneAPI", + "value": 10435568.5, + "unit": "ns", + "extra": "gctime=0\nmemory=274408\nallocs=2585\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/Metal", + "value": 1631146, + "unit": "ns", + "extra": "gctime=0\nmemory=60832\nallocs=2056\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/AMDGPU", + "value": 595155, + "unit": "ns", + "extra": "gctime=0\nmemory=15064\nallocs=371\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s)", + "value": 148812.5, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s)", + "value": 106124.5, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s)", + "value": 107083, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s)", + "value": 225625, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA", + "value": 24486, + "unit": "ns", + "extra": "gctime=0\nmemory=2384\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/forward/GPU/oneAPI", + "value": 1174332, + "unit": "ns", + "extra": "gctime=0\nmemory=34072\nallocs=305\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/forward/GPU/Metal", + "value": 268625, + "unit": "ns", + "extra": "gctime=0\nmemory=7184\nallocs=234\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/forward/GPU/AMDGPU", + "value": 38980, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s)", + "value": 160292, + "unit": "ns", + "extra": "gctime=0\nmemory=789264\nallocs=16\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s)", + "value": 87250, + "unit": "ns", + "extra": "gctime=0\nmemory=789264\nallocs=16\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s)", + "value": 87688, + "unit": "ns", + "extra": "gctime=0\nmemory=789264\nallocs=16\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s)", + "value": 287271, + "unit": "ns", + "extra": "gctime=0\nmemory=789264\nallocs=16\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA", + "value": 219345.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16840\nallocs=631\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/oneAPI", + "value": 10350217, + "unit": "ns", + "extra": "gctime=0\nmemory=279416\nallocs=2723\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/Metal", + "value": 2009687.5, + "unit": "ns", + "extra": "gctime=0\nmemory=64072\nallocs=2140\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/AMDGPU", + "value": 239337, + "unit": "ns", + "extra": "gctime=0\nmemory=16184\nallocs=375\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 7250, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 5333, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 5416, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 9750, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 33471, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=128\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI", + "value": 1227184, + "unit": "ns", + "extra": "gctime=0\nmemory=46568\nallocs=557\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 422000.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13760\nallocs=447\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 51490.5, + "unit": "ns", + "extra": "gctime=0\nmemory=6096\nallocs=146\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 226562.5, + "unit": "ns", + "extra": "gctime=0\nmemory=932560\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 264000, + "unit": "ns", + "extra": "gctime=0\nmemory=932560\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 228646, + "unit": "ns", + "extra": "gctime=0\nmemory=932560\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 212979, + "unit": "ns", + "extra": "gctime=0\nmemory=932560\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 266317.5, + "unit": "ns", + "extra": "gctime=0\nmemory=24776\nallocs=789\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI", + "value": 28527191.5, + "unit": "ns", + "extra": "gctime=0\nmemory=872032\nallocs=8352\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 8129708, + "unit": "ns", + "extra": "gctime=0\nmemory=219248\nallocs=6727\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 541245, + "unit": "ns", + "extra": "gctime=0\nmemory=75800\nallocs=1520\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)", + "value": 14708, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)", + "value": 15395.5, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)", + "value": 17042, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)", + "value": 15958, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA", + "value": 141525, + "unit": "ns", + "extra": "gctime=0\nmemory=11984\nallocs=467\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI", + "value": 5628552, + "unit": "ns", + "extra": "gctime=0\nmemory=175480\nallocs=1669\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal", + "value": 775375, + "unit": "ns", + "extra": "gctime=0\nmemory=36872\nallocs=1232\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU", + "value": 241442, + "unit": "ns", + "extra": "gctime=0\nmemory=17280\nallocs=531\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 24042, + "unit": "ns", + "extra": "gctime=0\nmemory=53696\nallocs=102\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 24333, + "unit": "ns", + "extra": "gctime=0\nmemory=53696\nallocs=102\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 24541.5, + "unit": "ns", + "extra": "gctime=0\nmemory=53696\nallocs=102\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 24208, + "unit": "ns", + "extra": "gctime=0\nmemory=53696\nallocs=102\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA", + "value": 876931, + "unit": "ns", + "extra": "gctime=0\nmemory=76944\nallocs=2955\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI", + "value": 41555847, + "unit": "ns", + "extra": "gctime=0\nmemory=1144032\nallocs=10349\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal", + "value": 5570333.5, + "unit": "ns", + "extra": "gctime=0\nmemory=234368\nallocs=7569\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU", + "value": 707371.5, + "unit": "ns", + "extra": "gctime=0\nmemory=57648\nallocs=1220\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)", + "value": 9875, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)", + "value": 9729, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)", + "value": 12208.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)", + "value": 9604, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA", + "value": 126928, + "unit": "ns", + "extra": "gctime=0\nmemory=12496\nallocs=308\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI", + "value": 3585071, + "unit": "ns", + "extra": "gctime=0\nmemory=130808\nallocs=1423\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal", + "value": 835167, + "unit": "ns", + "extra": "gctime=0\nmemory=35736\nallocs=1143\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU", + "value": 76301, + "unit": "ns", + "extra": "gctime=0\nmemory=16224\nallocs=357\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 13792, + "unit": "ns", + "extra": "gctime=0\nmemory=43568\nallocs=100\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 14583, + "unit": "ns", + "extra": "gctime=0\nmemory=43568\nallocs=100\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 14792, + "unit": "ns", + "extra": "gctime=0\nmemory=43568\nallocs=100\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 13708, + "unit": "ns", + "extra": "gctime=0\nmemory=43568\nallocs=100\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA", + "value": 675758, + "unit": "ns", + "extra": "gctime=0\nmemory=58672\nallocs=1592\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI", + "value": 21517947, + "unit": "ns", + "extra": "gctime=0\nmemory=722264\nallocs=6956\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal", + "value": 5218125.5, + "unit": "ns", + "extra": "gctime=0\nmemory=170152\nallocs=5129\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU", + "value": 378443.5, + "unit": "ns", + "extra": "gctime=0\nmemory=63152\nallocs=1226\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)", + "value": 9666, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)", + "value": 9729, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)", + "value": 12208, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)", + "value": 9875, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA", + "value": 125226, + "unit": "ns", + "extra": "gctime=0\nmemory=11664\nallocs=282\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI", + "value": 3323900, + "unit": "ns", + "extra": "gctime=0\nmemory=125560\nallocs=1353\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal", + "value": 834833, + "unit": "ns", + "extra": "gctime=0\nmemory=33016\nallocs=1047\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU", + "value": 75550, + "unit": "ns", + "extra": "gctime=0\nmemory=15680\nallocs=348\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 12750, + "unit": "ns", + "extra": "gctime=0\nmemory=40112\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 13250, + "unit": "ns", + "extra": "gctime=0\nmemory=40112\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 13667, + "unit": "ns", + "extra": "gctime=0\nmemory=40112\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 12625, + "unit": "ns", + "extra": "gctime=0\nmemory=40112\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA", + "value": 559037.5, + "unit": "ns", + "extra": "gctime=0\nmemory=50720\nallocs=1433\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI", + "value": 19446778, + "unit": "ns", + "extra": "gctime=0\nmemory=626344\nallocs=5937\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal", + "value": 4308917, + "unit": "ns", + "extra": "gctime=0\nmemory=143648\nallocs=4347\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU", + "value": 354594, + "unit": "ns", + "extra": "gctime=0\nmemory=53632\nallocs=1086\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/forward/CPU/2 thread(s)", + "value": 28833.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2304\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/forward/CPU/4 thread(s)", + "value": 31166.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2304\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/forward/CPU/8 thread(s)", + "value": 28791.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2304\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/forward/CPU/1 thread(s)", + "value": 1875, + "unit": "ns", + "extra": "gctime=0\nmemory=2304\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/forward/GPU/CUDA", + "value": 16504, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/forward/GPU/AMDGPU", + "value": 82571, + "unit": "ns", + "extra": "gctime=0\nmemory=5136\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s)", + "value": 5250, + "unit": "ns", + "extra": "gctime=0\nmemory=10064\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s)", + "value": 5000, + "unit": "ns", + "extra": "gctime=0\nmemory=10064\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s)", + "value": 5208, + "unit": "ns", + "extra": "gctime=0\nmemory=10064\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s)", + "value": 6375, + "unit": "ns", + "extra": "gctime=0\nmemory=9872\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/zygote/GPU/CUDA", + "value": 141085.5, + "unit": "ns", + "extra": "gctime=0\nmemory=9456\nallocs=360\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/zygote/GPU/AMDGPU", + "value": 374354, + "unit": "ns", + "extra": "gctime=0\nmemory=24176\nallocs=407\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)", + "value": 291, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)", + "value": 375, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)", + "value": 375, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA", + "value": 26191.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3808\nallocs=128\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI", + "value": 1233371, + "unit": "ns", + "extra": "gctime=0\nmemory=48280\nallocs=567\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal", + "value": 281500, + "unit": "ns", + "extra": "gctime=0\nmemory=15616\nallocs=529\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU", + "value": 48791, + "unit": "ns", + "extra": "gctime=0\nmemory=6736\nallocs=179\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 6458, + "unit": "ns", + "extra": "gctime=0\nmemory=16752\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 7167, + "unit": "ns", + "extra": "gctime=0\nmemory=16752\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 6812.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16752\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 6292, + "unit": "ns", + "extra": "gctime=0\nmemory=16752\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA", + "value": 189828.5, + "unit": "ns", + "extra": "gctime=0\nmemory=20240\nallocs=603\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI", + "value": 23358236.5, + "unit": "ns", + "extra": "gctime=0\nmemory=716568\nallocs=7026\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal", + "value": 5330375, + "unit": "ns", + "extra": "gctime=0\nmemory=181440\nallocs=5750\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU", + "value": 399294, + "unit": "ns", + "extra": "gctime=0\nmemory=52416\nallocs=1285\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)", + "value": 1917, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)", + "value": 2041, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)", + "value": 2083, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)", + "value": 2042, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA", + "value": 26849, + "unit": "ns", + "extra": "gctime=0\nmemory=3808\nallocs=128\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI", + "value": 1110283, + "unit": "ns", + "extra": "gctime=0\nmemory=48312\nallocs=569\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal", + "value": 295833, + "unit": "ns", + "extra": "gctime=0\nmemory=15928\nallocs=533\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU", + "value": 209202, + "unit": "ns", + "extra": "gctime=0\nmemory=12960\nallocs=430\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 16250, + "unit": "ns", + "extra": "gctime=0\nmemory=49344\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 16916, + "unit": "ns", + "extra": "gctime=0\nmemory=49344\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 17167, + "unit": "ns", + "extra": "gctime=0\nmemory=49344\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 16562.5, + "unit": "ns", + "extra": "gctime=0\nmemory=49344\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA", + "value": 276720, + "unit": "ns", + "extra": "gctime=0\nmemory=25864\nallocs=838\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI", + "value": 26169758, + "unit": "ns", + "extra": "gctime=0\nmemory=791824\nallocs=7775\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal", + "value": 5292750, + "unit": "ns", + "extra": "gctime=0\nmemory=193016\nallocs=6003\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU", + "value": 722757, + "unit": "ns", + "extra": "gctime=0\nmemory=58376\nallocs=1489\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 147834, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 148354, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 156167, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 171187.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 202935, + "unit": "ns", + "extra": "gctime=0\nmemory=17904\nallocs=612\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI", + "value": 7891917, + "unit": "ns", + "extra": "gctime=0\nmemory=260008\nallocs=2473\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1405270.5, + "unit": "ns", + "extra": "gctime=0\nmemory=56336\nallocs=1808\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 236467.5, + "unit": "ns", + "extra": "gctime=0\nmemory=18832\nallocs=429\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 1335021, + "unit": "ns", + "extra": "gctime=0\nmemory=10496496\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 1339313, + "unit": "ns", + "extra": "gctime=0\nmemory=10496496\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 1331979, + "unit": "ns", + "extra": "gctime=0\nmemory=10496496\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 1282125, + "unit": "ns", + "extra": "gctime=0\nmemory=10496496\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 919385.5, + "unit": "ns", + "extra": "gctime=0\nmemory=85280\nallocs=3168\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI", + "value": 45576933, + "unit": "ns", + "extra": "gctime=0\nmemory=1323984\nallocs=11836\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 6791917, + "unit": "ns", + "extra": "gctime=0\nmemory=294760\nallocs=9171\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1129796.5, + "unit": "ns", + "extra": "gctime=0\nmemory=74288\nallocs=1543\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 25667, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 26458, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 27834, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 24833, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 239791.5, + "unit": "ns", + "extra": "gctime=0\nmemory=17904\nallocs=612\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI", + "value": 7689078, + "unit": "ns", + "extra": "gctime=0\nmemory=259912\nallocs=2467\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 1067416, + "unit": "ns", + "extra": "gctime=0\nmemory=48184\nallocs=1530\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 119831, + "unit": "ns", + "extra": "gctime=0\nmemory=18384\nallocs=401\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 119312.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1321456\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 117958, + "unit": "ns", + "extra": "gctime=0\nmemory=1321456\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 118500, + "unit": "ns", + "extra": "gctime=0\nmemory=1321456\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 125458, + "unit": "ns", + "extra": "gctime=0\nmemory=1321456\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 1099473.5, + "unit": "ns", + "extra": "gctime=0\nmemory=85280\nallocs=3168\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI", + "value": 47920704, + "unit": "ns", + "extra": "gctime=0\nmemory=1323680\nallocs=11817\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 6180125, + "unit": "ns", + "extra": "gctime=0\nmemory=269024\nallocs=8303\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 611066, + "unit": "ns", + "extra": "gctime=0\nmemory=70720\nallocs=1467\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)", + "value": 250, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)", + "value": 334, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)", + "value": 375, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA", + "value": 23480, + "unit": "ns", + "extra": "gctime=0\nmemory=3072\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI", + "value": 1243279.5, + "unit": "ns", + "extra": "gctime=0\nmemory=48280\nallocs=567\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal", + "value": 307938, + "unit": "ns", + "extra": "gctime=0\nmemory=15616\nallocs=529\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU", + "value": 51311, + "unit": "ns", + "extra": "gctime=0\nmemory=6768\nallocs=181\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 6625, + "unit": "ns", + "extra": "gctime=0\nmemory=17328\nallocs=89\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 7250, + "unit": "ns", + "extra": "gctime=0\nmemory=17328\nallocs=89\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 7042, + "unit": "ns", + "extra": "gctime=0\nmemory=17328\nallocs=89\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 6812.5, + "unit": "ns", + "extra": "gctime=0\nmemory=17328\nallocs=89\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA", + "value": 207453, + "unit": "ns", + "extra": "gctime=0\nmemory=21816\nallocs=685\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI", + "value": 25423424, + "unit": "ns", + "extra": "gctime=0\nmemory=749688\nallocs=7301\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal", + "value": 5719458, + "unit": "ns", + "extra": "gctime=0\nmemory=188240\nallocs=5966\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU", + "value": 404894, + "unit": "ns", + "extra": "gctime=0\nmemory=54136\nallocs=1348\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)", + "value": 6312.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)", + "value": 6458.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)", + "value": 8042, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)", + "value": 6959, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA", + "value": 147343, + "unit": "ns", + "extra": "gctime=0\nmemory=13104\nallocs=537\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI", + "value": 6171482.5, + "unit": "ns", + "extra": "gctime=0\nmemory=176792\nallocs=1671\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal", + "value": 653375, + "unit": "ns", + "extra": "gctime=0\nmemory=37632\nallocs=1263\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU", + "value": 237442, + "unit": "ns", + "extra": "gctime=0\nmemory=17360\nallocs=532\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 9959, + "unit": "ns", + "extra": "gctime=0\nmemory=17872\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 10250, + "unit": "ns", + "extra": "gctime=0\nmemory=17872\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 10459, + "unit": "ns", + "extra": "gctime=0\nmemory=17872\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 10125, + "unit": "ns", + "extra": "gctime=0\nmemory=17872\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA", + "value": 911223.5, + "unit": "ns", + "extra": "gctime=0\nmemory=77824\nallocs=3085\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI", + "value": 42477102, + "unit": "ns", + "extra": "gctime=0\nmemory=1181288\nallocs=10712\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal", + "value": 5901250, + "unit": "ns", + "extra": "gctime=0\nmemory=244712\nallocs=8109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU", + "value": 688016.5, + "unit": "ns", + "extra": "gctime=0\nmemory=57536\nallocs=1227\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s)", + "value": 667, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s)", + "value": 708, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s)", + "value": 667, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s)", + "value": 667, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA", + "value": 22797, + "unit": "ns", + "extra": "gctime=0\nmemory=960\nallocs=41\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/oneAPI", + "value": 2025064.5, + "unit": "ns", + "extra": "gctime=0\nmemory=32136\nallocs=279\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/Metal", + "value": 216375, + "unit": "ns", + "extra": "gctime=0\nmemory=7104\nallocs=261\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/AMDGPU", + "value": 214432, + "unit": "ns", + "extra": "gctime=0\nmemory=8240\nallocs=328\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)", + "value": 4583, + "unit": "ns", + "extra": "gctime=0\nmemory=6608\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)", + "value": 4625, + "unit": "ns", + "extra": "gctime=0\nmemory=6608\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)", + "value": 4834, + "unit": "ns", + "extra": "gctime=0\nmemory=6608\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)", + "value": 4542, + "unit": "ns", + "extra": "gctime=0\nmemory=6608\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA", + "value": 229181.5, + "unit": "ns", + "extra": "gctime=0\nmemory=15864\nallocs=614\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/oneAPI", + "value": 9251999.5, + "unit": "ns", + "extra": "gctime=0\nmemory=211784\nallocs=2017\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/Metal", + "value": 1612209, + "unit": "ns", + "extra": "gctime=0\nmemory=48672\nallocs=1657\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/AMDGPU", + "value": 597305.5, + "unit": "ns", + "extra": "gctime=0\nmemory=12824\nallocs=389\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)", + "value": 8396, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)", + "value": 8312.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)", + "value": 10187, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)", + "value": 8459, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA", + "value": 125073.5, + "unit": "ns", + "extra": "gctime=0\nmemory=12368\nallocs=300\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI", + "value": 3450355, + "unit": "ns", + "extra": "gctime=0\nmemory=130792\nallocs=1422\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal", + "value": 784250, + "unit": "ns", + "extra": "gctime=0\nmemory=35440\nallocs=1140\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU", + "value": 77250.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16048\nallocs=346\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 8791.5, + "unit": "ns", + "extra": "gctime=0\nmemory=20784\nallocs=100\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 8750, + "unit": "ns", + "extra": "gctime=0\nmemory=20784\nallocs=100\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 9000, + "unit": "ns", + "extra": "gctime=0\nmemory=20784\nallocs=100\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 8584, + "unit": "ns", + "extra": "gctime=0\nmemory=20784\nallocs=100\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA", + "value": 601224, + "unit": "ns", + "extra": "gctime=0\nmemory=56192\nallocs=1507\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI", + "value": 20077995.5, + "unit": "ns", + "extra": "gctime=0\nmemory=684656\nallocs=6548\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal", + "value": 4881916, + "unit": "ns", + "extra": "gctime=0\nmemory=167800\nallocs=5137\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU", + "value": 363508.5, + "unit": "ns", + "extra": "gctime=0\nmemory=60208\nallocs=1157\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/forward/CPU/2 thread(s)", + "value": 125666.5, + "unit": "ns", + "extra": "gctime=0\nmemory=262208\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/forward/CPU/4 thread(s)", + "value": 95708, + "unit": "ns", + "extra": "gctime=0\nmemory=262208\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/forward/CPU/8 thread(s)", + "value": 96625, + "unit": "ns", + "extra": "gctime=0\nmemory=262208\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/forward/CPU/1 thread(s)", + "value": 183375, + "unit": "ns", + "extra": "gctime=0\nmemory=262208\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/forward/GPU/CUDA", + "value": 46327, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/forward/GPU/AMDGPU", + "value": 100006, + "unit": "ns", + "extra": "gctime=0\nmemory=2080\nallocs=82\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s)", + "value": 321292, + "unit": "ns", + "extra": "gctime=0\nmemory=1049680\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s)", + "value": 174833, + "unit": "ns", + "extra": "gctime=0\nmemory=1049680\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s)", + "value": 180333, + "unit": "ns", + "extra": "gctime=0\nmemory=1049680\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s)", + "value": 623645.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1049488\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/zygote/GPU/CUDA", + "value": 194032, + "unit": "ns", + "extra": "gctime=0\nmemory=11808\nallocs=430\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/zygote/GPU/AMDGPU", + "value": 498985, + "unit": "ns", + "extra": "gctime=0\nmemory=15136\nallocs=400\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s)", + "value": 398667, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s)", + "value": 215333, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s)", + "value": 215167, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s)", + "value": 754791.5, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA", + "value": 44215, + "unit": "ns", + "extra": "gctime=0\nmemory=1056\nallocs=47\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/oneAPI", + "value": 1378184.5, + "unit": "ns", + "extra": "gctime=0\nmemory=32024\nallocs=284\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/Metal", + "value": 407000, + "unit": "ns", + "extra": "gctime=0\nmemory=7392\nallocs=265\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/AMDGPU", + "value": 83621, + "unit": "ns", + "extra": "gctime=0\nmemory=2592\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s)", + "value": 1406146, + "unit": "ns", + "extra": "gctime=0\nmemory=1837680\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s)", + "value": 863833, + "unit": "ns", + "extra": "gctime=0\nmemory=1837680\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s)", + "value": 863312.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1837680\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s)", + "value": 2352083, + "unit": "ns", + "extra": "gctime=0\nmemory=1837680\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA", + "value": 255781, + "unit": "ns", + "extra": "gctime=0\nmemory=15744\nallocs=561\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/oneAPI", + "value": 10669273.5, + "unit": "ns", + "extra": "gctime=0\nmemory=252648\nallocs=2571\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/Metal", + "value": 1796146, + "unit": "ns", + "extra": "gctime=0\nmemory=62528\nallocs=2119\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/AMDGPU", + "value": 353138, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=432\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 645666, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 648625, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 657667, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 647292, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 207089, + "unit": "ns", + "extra": "gctime=0\nmemory=19024\nallocs=682\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI", + "value": 8128772, + "unit": "ns", + "extra": "gctime=0\nmemory=262264\nallocs=2480\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1351833, + "unit": "ns", + "extra": "gctime=0\nmemory=58128\nallocs=1840\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 252373, + "unit": "ns", + "extra": "gctime=0\nmemory=18384\nallocs=401\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 2448000, + "unit": "ns", + "extra": "gctime=0\nmemory=19998832\nallocs=151\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 2499458, + "unit": "ns", + "extra": "gctime=0\nmemory=19998832\nallocs=151\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 2385854, + "unit": "ns", + "extra": "gctime=0\nmemory=19998832\nallocs=151\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 2355625, + "unit": "ns", + "extra": "gctime=0\nmemory=19998832\nallocs=151\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 1012275, + "unit": "ns", + "extra": "gctime=0\nmemory=91296\nallocs=3393\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI", + "value": 56122847, + "unit": "ns", + "extra": "gctime=0\nmemory=1406512\nallocs=12642\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 7335833, + "unit": "ns", + "extra": "gctime=0\nmemory=314400\nallocs=9738\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1476514, + "unit": "ns", + "extra": "gctime=0\nmemory=80528\nallocs=1660\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/forward/CPU/2 thread(s)", + "value": 33687.5, + "unit": "ns", + "extra": "gctime=0\nmemory=576\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/forward/CPU/4 thread(s)", + "value": 34104, + "unit": "ns", + "extra": "gctime=0\nmemory=576\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/forward/CPU/8 thread(s)", + "value": 32458, + "unit": "ns", + "extra": "gctime=0\nmemory=576\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/forward/CPU/1 thread(s)", + "value": 792, + "unit": "ns", + "extra": "gctime=0\nmemory=576\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/forward/GPU/CUDA", + "value": 16059, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/forward/GPU/AMDGPU", + "value": 80971, + "unit": "ns", + "extra": "gctime=0\nmemory=2848\nallocs=85\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s)", + "value": 3084, + "unit": "ns", + "extra": "gctime=0\nmemory=3152\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s)", + "value": 3395.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3152\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s)", + "value": 3354.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3152\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s)", + "value": 3062.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2960\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/zygote/GPU/CUDA", + "value": 139347.5, + "unit": "ns", + "extra": "gctime=0\nmemory=9456\nallocs=360\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/zygote/GPU/AMDGPU", + "value": 345154, + "unit": "ns", + "extra": "gctime=0\nmemory=14224\nallocs=337\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 408583, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 403917, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 403667, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 419979.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 44907, + "unit": "ns", + "extra": "gctime=0\nmemory=3536\nallocs=119\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI", + "value": 1462363.5, + "unit": "ns", + "extra": "gctime=0\nmemory=48376\nallocs=570\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1166500, + "unit": "ns", + "extra": "gctime=0\nmemory=16032\nallocs=533\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 244452, + "unit": "ns", + "extra": "gctime=0\nmemory=13088\nallocs=431\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 3880750, + "unit": "ns", + "extra": "gctime=0\nmemory=8405200\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 3970167, + "unit": "ns", + "extra": "gctime=0\nmemory=8405200\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 3993125, + "unit": "ns", + "extra": "gctime=0\nmemory=8405200\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 3758375, + "unit": "ns", + "extra": "gctime=0\nmemory=8405200\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 248293, + "unit": "ns", + "extra": "gctime=0\nmemory=25544\nallocs=809\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI", + "value": 36955841.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1040272\nallocs=10218\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 11426625, + "unit": "ns", + "extra": "gctime=0\nmemory=259760\nallocs=8066\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1454889, + "unit": "ns", + "extra": "gctime=0\nmemory=87512\nallocs=2197\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s)", + "value": 3916, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s)", + "value": 3958, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s)", + "value": 3917, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s)", + "value": 3917, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA", + "value": 35113, + "unit": "ns", + "extra": "gctime=0\nmemory=1008\nallocs=38\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/oneAPI", + "value": 1228255, + "unit": "ns", + "extra": "gctime=0\nmemory=3808\nallocs=68\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/Metal", + "value": 178729.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2696\nallocs=94\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/AMDGPU", + "value": 42711, + "unit": "ns", + "extra": "gctime=0\nmemory=1136\nallocs=62\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s)", + "value": 15500, + "unit": "ns", + "extra": "gctime=0\nmemory=55216\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s)", + "value": 15584, + "unit": "ns", + "extra": "gctime=0\nmemory=55216\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s)", + "value": 15791, + "unit": "ns", + "extra": "gctime=0\nmemory=55216\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s)", + "value": 15417, + "unit": "ns", + "extra": "gctime=0\nmemory=55216\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA", + "value": 254288, + "unit": "ns", + "extra": "gctime=0\nmemory=13328\nallocs=472\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/oneAPI", + "value": 8768386.5, + "unit": "ns", + "extra": "gctime=0\nmemory=184024\nallocs=1875\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/Metal", + "value": 864709, + "unit": "ns", + "extra": "gctime=0\nmemory=37864\nallocs=1258\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/AMDGPU", + "value": 170181, + "unit": "ns", + "extra": "gctime=0\nmemory=11824\nallocs=310\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s)", + "value": 404416, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s)", + "value": 221167, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s)", + "value": 221020.5, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s)", + "value": 759541, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA", + "value": 113538, + "unit": "ns", + "extra": "gctime=0\nmemory=960\nallocs=41\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/oneAPI", + "value": 1039248, + "unit": "ns", + "extra": "gctime=0\nmemory=32216\nallocs=284\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/Metal", + "value": 389833, + "unit": "ns", + "extra": "gctime=0\nmemory=7424\nallocs=265\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/AMDGPU", + "value": 90451, + "unit": "ns", + "extra": "gctime=0\nmemory=2368\nallocs=103\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s)", + "value": 1435625, + "unit": "ns", + "extra": "gctime=0\nmemory=2098400\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s)", + "value": 888042, + "unit": "ns", + "extra": "gctime=0\nmemory=2098400\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s)", + "value": 888458, + "unit": "ns", + "extra": "gctime=0\nmemory=2098400\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s)", + "value": 2377500, + "unit": "ns", + "extra": "gctime=0\nmemory=2098400\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA", + "value": 242331, + "unit": "ns", + "extra": "gctime=0\nmemory=15992\nallocs=601\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/oneAPI", + "value": 12060240, + "unit": "ns", + "extra": "gctime=0\nmemory=245328\nallocs=2379\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/Metal", + "value": 1855229, + "unit": "ns", + "extra": "gctime=0\nmemory=56744\nallocs=1903\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/AMDGPU", + "value": 359323, + "unit": "ns", + "extra": "gctime=0\nmemory=15128\nallocs=431\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)", + "value": 500, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)", + "value": 584, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)", + "value": 583, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)", + "value": 500, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA", + "value": 26216, + "unit": "ns", + "extra": "gctime=0\nmemory=3808\nallocs=128\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI", + "value": 1225691.5, + "unit": "ns", + "extra": "gctime=0\nmemory=48280\nallocs=567\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal", + "value": 282250, + "unit": "ns", + "extra": "gctime=0\nmemory=15616\nallocs=529\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU", + "value": 210112, + "unit": "ns", + "extra": "gctime=0\nmemory=9984\nallocs=382\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 7375, + "unit": "ns", + "extra": "gctime=0\nmemory=17904\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 8083, + "unit": "ns", + "extra": "gctime=0\nmemory=17904\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 8042, + "unit": "ns", + "extra": "gctime=0\nmemory=17904\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 7667, + "unit": "ns", + "extra": "gctime=0\nmemory=17904\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA", + "value": 217974, + "unit": "ns", + "extra": "gctime=0\nmemory=23640\nallocs=769\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI", + "value": 26028308, + "unit": "ns", + "extra": "gctime=0\nmemory=754184\nallocs=7365\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal", + "value": 5930125, + "unit": "ns", + "extra": "gctime=0\nmemory=190264\nallocs=6017\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU", + "value": 689567, + "unit": "ns", + "extra": "gctime=0\nmemory=54920\nallocs=1381\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/forward/CPU/2 thread(s)", + "value": 833687.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2097216\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/forward/CPU/4 thread(s)", + "value": 471041, + "unit": "ns", + "extra": "gctime=0\nmemory=2097216\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/forward/CPU/8 thread(s)", + "value": 472604, + "unit": "ns", + "extra": "gctime=0\nmemory=2097216\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/forward/CPU/1 thread(s)", + "value": 1546041, + "unit": "ns", + "extra": "gctime=0\nmemory=2097216\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/forward/GPU/CUDA", + "value": 129608, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/forward/GPU/AMDGPU", + "value": 238367.5, + "unit": "ns", + "extra": "gctime=0\nmemory=4032\nallocs=159\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s)", + "value": 2691166.5, + "unit": "ns", + "extra": "gctime=0\nmemory=8389712\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s)", + "value": 1532750, + "unit": "ns", + "extra": "gctime=0\nmemory=8389712\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s)", + "value": 1538500, + "unit": "ns", + "extra": "gctime=0\nmemory=8389712\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s)", + "value": 4929959, + "unit": "ns", + "extra": "gctime=0\nmemory=8389520\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/zygote/GPU/CUDA", + "value": 259842.5, + "unit": "ns", + "extra": "gctime=0\nmemory=11808\nallocs=430\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/zygote/GPU/AMDGPU", + "value": 780703, + "unit": "ns", + "extra": "gctime=0\nmemory=21104\nallocs=489\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)", + "value": 375, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)", + "value": 375, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA", + "value": 32769, + "unit": "ns", + "extra": "gctime=0\nmemory=4112\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI", + "value": 1160784, + "unit": "ns", + "extra": "gctime=0\nmemory=46488\nallocs=555\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal", + "value": 331208, + "unit": "ns", + "extra": "gctime=0\nmemory=13344\nallocs=443\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU", + "value": 49181, + "unit": "ns", + "extra": "gctime=0\nmemory=6592\nallocs=180\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 6500, + "unit": "ns", + "extra": "gctime=0\nmemory=17088\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 7125, + "unit": "ns", + "extra": "gctime=0\nmemory=17088\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 6958, + "unit": "ns", + "extra": "gctime=0\nmemory=17088\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 6334, + "unit": "ns", + "extra": "gctime=0\nmemory=17088\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA", + "value": 226865.5, + "unit": "ns", + "extra": "gctime=0\nmemory=22792\nallocs=749\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI", + "value": 21894190, + "unit": "ns", + "extra": "gctime=0\nmemory=663224\nallocs=6342\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal", + "value": 5221416, + "unit": "ns", + "extra": "gctime=0\nmemory=166392\nallocs=5238\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU", + "value": 371788.5, + "unit": "ns", + "extra": "gctime=0\nmemory=47624\nallocs=1189\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 2379417, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 2400625, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 2409250, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 2398041, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 203690, + "unit": "ns", + "extra": "gctime=0\nmemory=17904\nallocs=612\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI", + "value": 8376500, + "unit": "ns", + "extra": "gctime=0\nmemory=260008\nallocs=2473\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1518417, + "unit": "ns", + "extra": "gctime=0\nmemory=56336\nallocs=1808\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 332883, + "unit": "ns", + "extra": "gctime=0\nmemory=19056\nallocs=443\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 4647292, + "unit": "ns", + "extra": "gctime=0\nmemory=10496496\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 4667750, + "unit": "ns", + "extra": "gctime=0\nmemory=10496496\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 4671396.5, + "unit": "ns", + "extra": "gctime=0\nmemory=10496496\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 4599667, + "unit": "ns", + "extra": "gctime=0\nmemory=10496496\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 908171, + "unit": "ns", + "extra": "gctime=0\nmemory=86336\nallocs=3212\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI", + "value": 46899993.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1323984\nallocs=11836\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 6509917, + "unit": "ns", + "extra": "gctime=0\nmemory=294680\nallocs=9172\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1411159, + "unit": "ns", + "extra": "gctime=0\nmemory=74736\nallocs=1553\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s)", + "value": 6958.5, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s)", + "value": 7292, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s)", + "value": 7167, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s)", + "value": 7084, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA", + "value": 23394, + "unit": "ns", + "extra": "gctime=0\nmemory=2384\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/forward/GPU/oneAPI", + "value": 1162678, + "unit": "ns", + "extra": "gctime=0\nmemory=34072\nallocs=305\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/forward/GPU/Metal", + "value": 255833, + "unit": "ns", + "extra": "gctime=0\nmemory=7184\nallocs=234\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/forward/GPU/AMDGPU", + "value": 33801, + "unit": "ns", + "extra": "gctime=0\nmemory=1392\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s)", + "value": 45625, + "unit": "ns", + "extra": "gctime=0\nmemory=789264\nallocs=16\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s)", + "value": 45770.5, + "unit": "ns", + "extra": "gctime=0\nmemory=789264\nallocs=16\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s)", + "value": 32792, + "unit": "ns", + "extra": "gctime=0\nmemory=789264\nallocs=16\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s)", + "value": 69958.5, + "unit": "ns", + "extra": "gctime=0\nmemory=789264\nallocs=16\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA", + "value": 217688.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16840\nallocs=631\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/zygote/GPU/oneAPI", + "value": 10382796, + "unit": "ns", + "extra": "gctime=0\nmemory=279416\nallocs=2723\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/zygote/GPU/Metal", + "value": 1969000, + "unit": "ns", + "extra": "gctime=0\nmemory=64072\nallocs=2140\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/zygote/GPU/AMDGPU", + "value": 241322, + "unit": "ns", + "extra": "gctime=0\nmemory=16040\nallocs=366\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/forward/CPU/2 thread(s)", + "value": 22687, + "unit": "ns", + "extra": "gctime=0\nmemory=8448\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/forward/CPU/4 thread(s)", + "value": 24041.5, + "unit": "ns", + "extra": "gctime=0\nmemory=8448\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/forward/CPU/8 thread(s)", + "value": 21396, + "unit": "ns", + "extra": "gctime=0\nmemory=8448\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/forward/CPU/1 thread(s)", + "value": 5125, + "unit": "ns", + "extra": "gctime=0\nmemory=8448\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/forward/GPU/CUDA", + "value": 16877, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/forward/GPU/AMDGPU", + "value": 85651, + "unit": "ns", + "extra": "gctime=0\nmemory=14784\nallocs=102\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s)", + "value": 12146, + "unit": "ns", + "extra": "gctime=0\nmemory=34640\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s)", + "value": 9250, + "unit": "ns", + "extra": "gctime=0\nmemory=34640\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s)", + "value": 9833, + "unit": "ns", + "extra": "gctime=0\nmemory=34640\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s)", + "value": 17938, + "unit": "ns", + "extra": "gctime=0\nmemory=34448\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/zygote/GPU/CUDA", + "value": 228534.5, + "unit": "ns", + "extra": "gctime=0\nmemory=11808\nallocs=430\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/zygote/GPU/AMDGPU", + "value": 377204, + "unit": "ns", + "extra": "gctime=0\nmemory=52624\nallocs=421\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s)", + "value": 406125, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s)", + "value": 223250, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s)", + "value": 223292, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s)", + "value": 762292, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA", + "value": 46452, + "unit": "ns", + "extra": "gctime=0\nmemory=1056\nallocs=47\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/oneAPI", + "value": 1317617, + "unit": "ns", + "extra": "gctime=0\nmemory=34280\nallocs=314\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/Metal", + "value": 475895.5, + "unit": "ns", + "extra": "gctime=0\nmemory=8016\nallocs=281\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/AMDGPU", + "value": 89131, + "unit": "ns", + "extra": "gctime=0\nmemory=2960\nallocs=137\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s)", + "value": 1435167, + "unit": "ns", + "extra": "gctime=0\nmemory=2100080\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s)", + "value": 894167, + "unit": "ns", + "extra": "gctime=0\nmemory=2100080\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s)", + "value": 891646, + "unit": "ns", + "extra": "gctime=0\nmemory=2100080\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s)", + "value": 2386375, + "unit": "ns", + "extra": "gctime=0\nmemory=2100080\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA", + "value": 290455, + "unit": "ns", + "extra": "gctime=0\nmemory=18136\nallocs=667\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/oneAPI", + "value": 12069732, + "unit": "ns", + "extra": "gctime=0\nmemory=287592\nallocs=2876\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/Metal", + "value": 2061646, + "unit": "ns", + "extra": "gctime=0\nmemory=69448\nallocs=2336\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/AMDGPU", + "value": 379284, + "unit": "ns", + "extra": "gctime=0\nmemory=17336\nallocs=411\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 434000, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 430459, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 430458, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 445250, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 55368.5, + "unit": "ns", + "extra": "gctime=0\nmemory=4448\nallocs=152\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI", + "value": 986333, + "unit": "ns", + "extra": "gctime=0\nmemory=46584\nallocs=558\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1085875, + "unit": "ns", + "extra": "gctime=0\nmemory=13760\nallocs=447\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 236333, + "unit": "ns", + "extra": "gctime=0\nmemory=12896\nallocs=429\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 3908125, + "unit": "ns", + "extra": "gctime=0\nmemory=8404736\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 4017563, + "unit": "ns", + "extra": "gctime=0\nmemory=8404736\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 4021125, + "unit": "ns", + "extra": "gctime=0\nmemory=8404736\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 3766291, + "unit": "ns", + "extra": "gctime=0\nmemory=8404736\nallocs=109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 266571, + "unit": "ns", + "extra": "gctime=0\nmemory=26856\nallocs=873\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI", + "value": 33066198, + "unit": "ns", + "extra": "gctime=0\nmemory=876864\nallocs=8437\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 10302187.5, + "unit": "ns", + "extra": "gctime=0\nmemory=222120\nallocs=6792\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1411043, + "unit": "ns", + "extra": "gctime=0\nmemory=83704\nallocs=1858\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s)", + "value": 8750, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s)", + "value": 6875, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s)", + "value": 6875, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s)", + "value": 12416, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA", + "value": 23724.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1056\nallocs=47\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/oneAPI", + "value": 2157256, + "unit": "ns", + "extra": "gctime=0\nmemory=34200\nallocs=309\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/Metal", + "value": 221187.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7976\nallocs=279\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/AMDGPU", + "value": 217062, + "unit": "ns", + "extra": "gctime=0\nmemory=8320\nallocs=330\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)", + "value": 44584, + "unit": "ns", + "extra": "gctime=0\nmemory=87936\nallocs=12\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)", + "value": 44750, + "unit": "ns", + "extra": "gctime=0\nmemory=87936\nallocs=12\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)", + "value": 45083, + "unit": "ns", + "extra": "gctime=0\nmemory=87936\nallocs=12\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)", + "value": 44584, + "unit": "ns", + "extra": "gctime=0\nmemory=87936\nallocs=12\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA", + "value": 340497, + "unit": "ns", + "extra": "gctime=0\nmemory=20184\nallocs=752\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/oneAPI", + "value": 14051481.5, + "unit": "ns", + "extra": "gctime=0\nmemory=317664\nallocs=3099\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/Metal", + "value": 1815145.5, + "unit": "ns", + "extra": "gctime=0\nmemory=67304\nallocs=2251\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/AMDGPU", + "value": 666171.5, + "unit": "ns", + "extra": "gctime=0\nmemory=19576\nallocs=505\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 86375, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 123937.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 91520.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 91729.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 189642, + "unit": "ns", + "extra": "gctime=0\nmemory=12560\nallocs=283\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI", + "value": 5719630.5, + "unit": "ns", + "extra": "gctime=0\nmemory=210104\nallocs=2179\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1969250, + "unit": "ns", + "extra": "gctime=0\nmemory=50976\nallocs=1614\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 185192, + "unit": "ns", + "extra": "gctime=0\nmemory=23632\nallocs=476\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 2022895.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7356160\nallocs=108\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 1989875, + "unit": "ns", + "extra": "gctime=0\nmemory=7356160\nallocs=108\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 2022250, + "unit": "ns", + "extra": "gctime=0\nmemory=7356160\nallocs=108\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 1932541.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7356160\nallocs=108\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 537392, + "unit": "ns", + "extra": "gctime=0\nmemory=54456\nallocs=1517\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI", + "value": 28338444, + "unit": "ns", + "extra": "gctime=0\nmemory=830552\nallocs=7829\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 8689459, + "unit": "ns", + "extra": "gctime=0\nmemory=184408\nallocs=5506\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1098841, + "unit": "ns", + "extra": "gctime=0\nmemory=78616\nallocs=1728\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + } + ] } ] }