From a05d770641ec372a3349b277517650b523bc7bd6 Mon Sep 17 00:00:00 2001
From: Loren Kuich
Date: Thu, 23 Jan 2020 15:12:20 -0800
Subject: [PATCH] Updated to lkuich/Barricuda 0.4.1

---
 Assets/Coach-ML/Barracuda/Barracuda.dll | Bin 201216 -> 0 bytes
 Assets/Coach-ML/Barracuda/Barracuda.dll.meta | 30 -
 .../Barracuda/{Resources.meta => Burst.meta} | 2 +-
 .../Coach-ML/Barracuda/Burst/BurstBLAS.asmdef | 11 +
 .../Barracuda/Burst/BurstBLAS.asmdef.meta | 7 +
 Assets/Coach-ML/Barracuda/Burst/BurstBLAS.cs | 111 +
 .../Barracuda/Burst/BurstBLAS.cs.meta | 11 +
 Assets/Coach-ML/Barracuda/Core.meta | 8 +
 .../Coach-ML/Barracuda/Core/AssemblyInfo.cs | 6 +
 .../Barracuda/Core/AssemblyInfo.cs.meta | 3 +
 Assets/Coach-ML/Barracuda/Core/Backends.meta | 8 +
 .../Core/Backends/BarracudaBackends.cs | 163 +
 .../Core/Backends/BarracudaBackends.cs.meta | 11 +
 .../Core/Backends/BarracudaBackendsFactory.cs | 181 +
 .../Backends/BarracudaBackendsFactory.cs.meta | 11 +
 .../Core/Backends/BarracudaCompute.cs | 1073 +++++
 .../Core/Backends/BarracudaCompute.cs.meta | 11 +
 .../Backends/BarracudaPrecompiledCompute.cs | 525 ++
 .../BarracudaPrecompiledCompute.cs.meta | 11 +
 .../Core/Backends/BarracudaReferenceCPU.cs | 1847 +++++++
 .../Backends/BarracudaReferenceCPU.cs.meta | 12 +
 .../Backends/BarracudaReferenceCompute.cs | 1456 ++++++
 .../BarracudaReferenceCompute.cs.meta | 11 +
 .../Core/Backends/BarracudaUnsafeArrayCPU.cs | 1969 ++++++++
 .../Backends/BarracudaUnsafeArrayCPU.cs.meta | 11 +
 .../Barracuda/Core/Backends/CompareOps.cs | 602 +++
 .../Core/Backends/CompareOps.cs.meta | 12 +
 .../Barracuda/Core/Backends/ComputeInfo.cs | 52 +
 .../Core/Backends/ComputeInfo.cs.meta | 3 +
 .../Core/Backends/ComputeShaderSingleton.cs | 51 +
 .../Backends/ComputeShaderSingleton.cs.meta | 12 +
 .../Barracuda/Core/Backends/GenericWorker.cs | 1096 +++++
 .../Core/Backends/GenericWorker.cs.meta | 12 +
 .../Barracuda/Core/Backends/MatrixUtils.cs | 220 +
 .../Core/Backends/MatrixUtils.cs.meta | 11 +
 .../Barracuda/Core/Backends/ModelAnalyzer.cs | 499 ++
 .../Core/Backends/ModelAnalyzer.cs.meta | 11 +
 .../Barracuda/Core/Backends/StatsOps.cs | 530 ++
 .../Barracuda/Core/Backends/StatsOps.cs.meta | 11 +
 .../Core/Backends/TensorAllocators.cs | 705 +++
 .../Core/Backends/TensorAllocators.cs.meta | 11 +
 .../Barracuda/Core/Backends/VerboseOps.cs | 545 +++
 .../Core/Backends/VerboseOps.cs.meta | 12 +
 .../Barracuda/Core/BackwardsCompatibility.cs | 162 +
 .../Core/BackwardsCompatibility.cs.meta | 11 +
 .../Coach-ML/Barracuda/Core/Barracuda.asmdef | 8 +
 .../Barracuda/Core/Barracuda.asmdef.meta | 7 +
 Assets/Coach-ML/Barracuda/Core/Barracuda.cs | 430 ++
 .../Coach-ML/Barracuda/Core/Barracuda.cs.meta | 12 +
 Assets/Coach-ML/Barracuda/Core/Internals.meta | 8 +
 .../Barracuda/Core/Internals/Debug.cs | 164 +
 .../Barracuda/Core/Internals/Debug.cs.meta | 3 +
 .../Barracuda/Core/Internals/NNModel.cs | 10 +
 .../Barracuda/Core/Internals/NNModel.cs.meta | 11 +
 .../Barracuda/Core/Internals/StringCache.cs | 90 +
 .../Core/Internals/StringCache.cs.meta | 11 +
 .../Barracuda/Core/Internals/TestSetLoader.cs | 278 ++
 .../Core/Internals/TestSetLoader.cs.meta | 12 +
 Assets/Coach-ML/Barracuda/Core/Model.cs | 317 ++
 Assets/Coach-ML/Barracuda/Core/Model.cs.meta | 12 +
 .../Coach-ML/Barracuda/Core/ModelBuilder.cs | 902 ++++
 .../Barracuda/Core/ModelBuilder.cs.meta | 3 +
 Assets/Coach-ML/Barracuda/Core/ModelLoader.cs | 265 +
 .../Barracuda/Core/ModelLoader.cs.meta | 12 +
 Assets/Coach-ML/Barracuda/Core/ModelWriter.cs | 146 +
 .../Barracuda/Core/ModelWriter.cs.meta | 11 +
 .../Barracuda/Core/PluginInterfaces.cs | 66 +
 .../Barracuda/Core/PluginInterfaces.cs.meta | 3 +
 Assets/Coach-ML/Barracuda/Core/Resources.meta | 9 +
 .../{ => Core}/Resources/Activation.compute | 44 +-
 .../Resources/Activation.compute.meta | 0
 .../Resources/BarracudaReferenceImpl.compute | 11 +-
 .../BarracudaReferenceImpl.compute.meta | 0
 .../{ => Core}/Resources/Broadcast.compute | 0
 .../Resources/Broadcast.compute.meta | 0
 .../{ => Core}/Resources/Conv.compute | 29 +-
 .../{ => Core}/Resources/Conv.compute.meta | 0
 .../{ => Core}/Resources/Dense.compute | 0
 .../{ => Core}/Resources/Dense.compute.meta | 0
 .../{ => Core}/Resources/DenseFP16.compute | 0
 .../Resources/DenseFP16.compute.meta | 0
 .../{ => Core}/Resources/Generic.compute | 322 +-
 .../{ => Core}/Resources/Generic.compute.meta | 0
 .../Resources/Pad.compute} | 7 +-
 .../Resources/Pad.compute.meta} | 0
 .../Barracuda/Core/Resources/Pool.compute | 260 +
 .../Resources/Pool.compute.meta} | 4 +-
 .../{ => Core}/Resources/Random.cginc | 0
 .../{ => Core}/Resources/Random.cginc.meta | 0
 .../{ => Core}/Resources/Tensor.cginc | 12 +-
 .../{ => Core}/Resources/Tensor.cginc.meta | 0
 Assets/Coach-ML/Barracuda/Core/Tensor.cs | 938 ++++
 Assets/Coach-ML/Barracuda/Core/Tensor.cs.meta | 12 +
 .../Barracuda/Core/TensorExtensions.cs | 484 ++
 .../Barracuda/Core/TensorExtensions.cs.meta | 12 +
 .../BarracudaEditor/Barracuda-editor.asmdef | 4 +-
 .../Editor/BarracudaEditor/HalfHelper.cs | 165 +
 .../Editor/BarracudaEditor/HalfHelper.cs.meta | 11 +
 .../BarracudaEditor/ONNXModelImporter.cs | 7 +
 .../Barracuda/Plugins/OSX/MacBLAS.asmdef | 6 +-
 .../Barracuda/Plugins/iOS/iOSBLAS.asmdef | 6 +-
 .../Barracuda/Resources/ConvOld.compute | 418 --
 .../Barracuda/Resources/Experimental.compute | 4284 -----------------
 .../Resources/Experimental.compute.meta | 9 -
 .../Barracuda/Resources/FastNV.compute | 188 -
 .../Barracuda/Resources/FastNV.compute.meta | 9 -
 .../Barracuda/Resources/TexConv.compute | 99 -
 .../Barracuda/Resources/TexConv.compute.meta | 9 -
 108 files changed, 16820 insertions(+), 5377 deletions(-)
 delete mode 100644 Assets/Coach-ML/Barracuda/Barracuda.dll
 delete mode 100644 Assets/Coach-ML/Barracuda/Barracuda.dll.meta
 rename Assets/Coach-ML/Barracuda/{Resources.meta => Burst.meta} (77%)
 create mode 100644 Assets/Coach-ML/Barracuda/Burst/BurstBLAS.asmdef
 create mode 100644 Assets/Coach-ML/Barracuda/Burst/BurstBLAS.asmdef.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Burst/BurstBLAS.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Burst/BurstBLAS.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/AssemblyInfo.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/AssemblyInfo.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/BarracudaBackends.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/BarracudaBackends.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/BarracudaBackendsFactory.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/BarracudaBackendsFactory.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/BarracudaCompute.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/BarracudaCompute.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/BarracudaPrecompiledCompute.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/BarracudaPrecompiledCompute.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/BarracudaReferenceCPU.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/BarracudaReferenceCPU.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/BarracudaReferenceCompute.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/BarracudaReferenceCompute.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/BarracudaUnsafeArrayCPU.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/BarracudaUnsafeArrayCPU.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/CompareOps.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/CompareOps.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/ComputeInfo.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/ComputeInfo.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/ComputeShaderSingleton.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/ComputeShaderSingleton.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/GenericWorker.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/GenericWorker.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/MatrixUtils.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/MatrixUtils.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/ModelAnalyzer.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/ModelAnalyzer.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/StatsOps.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/StatsOps.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/TensorAllocators.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/TensorAllocators.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/VerboseOps.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Backends/VerboseOps.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/BackwardsCompatibility.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/BackwardsCompatibility.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Barracuda.asmdef
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Barracuda.asmdef.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Barracuda.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Barracuda.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Internals.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Internals/Debug.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Internals/Debug.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Internals/NNModel.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Internals/NNModel.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Internals/StringCache.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Internals/StringCache.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Internals/TestSetLoader.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Internals/TestSetLoader.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Model.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Model.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/ModelBuilder.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/ModelBuilder.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/ModelLoader.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/ModelLoader.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/ModelWriter.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/ModelWriter.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/PluginInterfaces.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/PluginInterfaces.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Resources.meta
 rename Assets/Coach-ML/Barracuda/{ => Core}/Resources/Activation.compute (95%)
 rename Assets/Coach-ML/Barracuda/{ => Core}/Resources/Activation.compute.meta (100%)
 rename Assets/Coach-ML/Barracuda/{ => Core}/Resources/BarracudaReferenceImpl.compute (99%)
 rename Assets/Coach-ML/Barracuda/{ => Core}/Resources/BarracudaReferenceImpl.compute.meta (100%)
 rename Assets/Coach-ML/Barracuda/{ => Core}/Resources/Broadcast.compute (100%)
 rename Assets/Coach-ML/Barracuda/{ => Core}/Resources/Broadcast.compute.meta (100%)
 rename Assets/Coach-ML/Barracuda/{ => Core}/Resources/Conv.compute (94%)
 rename Assets/Coach-ML/Barracuda/{ => Core}/Resources/Conv.compute.meta (100%)
 rename Assets/Coach-ML/Barracuda/{ => Core}/Resources/Dense.compute (100%)
 rename Assets/Coach-ML/Barracuda/{ => Core}/Resources/Dense.compute.meta (100%)
 rename Assets/Coach-ML/Barracuda/{ => Core}/Resources/DenseFP16.compute (100%)
 rename Assets/Coach-ML/Barracuda/{ => Core}/Resources/DenseFP16.compute.meta (100%)
 rename Assets/Coach-ML/Barracuda/{ => Core}/Resources/Generic.compute (51%)
 rename Assets/Coach-ML/Barracuda/{ => Core}/Resources/Generic.compute.meta (100%)
 rename Assets/Coach-ML/Barracuda/{Resources/Padding.compute => Core/Resources/Pad.compute} (97%)
 rename Assets/Coach-ML/Barracuda/{Resources/Padding.compute.meta => Core/Resources/Pad.compute.meta} (100%)
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Resources/Pool.compute
 rename Assets/Coach-ML/Barracuda/{Resources/ConvOld.compute.meta => Core/Resources/Pool.compute.meta} (65%)
 rename Assets/Coach-ML/Barracuda/{ => Core}/Resources/Random.cginc (100%)
 rename Assets/Coach-ML/Barracuda/{ => Core}/Resources/Random.cginc.meta (100%)
 rename Assets/Coach-ML/Barracuda/{ => Core}/Resources/Tensor.cginc (94%)
 rename Assets/Coach-ML/Barracuda/{ => Core}/Resources/Tensor.cginc.meta (100%)
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Tensor.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/Tensor.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Core/TensorExtensions.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Core/TensorExtensions.cs.meta
 create mode 100644 Assets/Coach-ML/Barracuda/Plugins/Editor/BarracudaEditor/HalfHelper.cs
 create mode 100644 Assets/Coach-ML/Barracuda/Plugins/Editor/BarracudaEditor/HalfHelper.cs.meta
 delete mode 100644 Assets/Coach-ML/Barracuda/Resources/ConvOld.compute
 delete mode 100644 Assets/Coach-ML/Barracuda/Resources/Experimental.compute
 delete mode 100644 Assets/Coach-ML/Barracuda/Resources/Experimental.compute.meta
 delete mode 100644 Assets/Coach-ML/Barracuda/Resources/FastNV.compute
 delete mode 100644 Assets/Coach-ML/Barracuda/Resources/FastNV.compute.meta
 delete mode 100644 Assets/Coach-ML/Barracuda/Resources/TexConv.compute
 delete mode 100644 Assets/Coach-ML/Barracuda/Resources/TexConv.compute.meta

diff --git a/Assets/Coach-ML/Barracuda/Barracuda.dll b/Assets/Coach-ML/Barracuda/Barracuda.dll
deleted file mode 100644
index b3ede234effba720125cd426c0d83c3883c664fc..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 201216
[base85-encoded GIT binary patch payload for the deleted Barracuda.dll (201216 bytes) omitted: the encoded data is not human-readable and is garbled and truncated in this excerpt]
zQzR9R#*&LwDiQ8a4#bm@dX<6%#1r+Zvk4;F;KVe566uaLsO~s`a5d0CZEwu1=(>Sq zn98YbQpwJy!M?uXj>u4IFhPXgNUF0Zos`kc<*|5II99*C_bkkUaDR71luc$KV8|X} zr`OSfXXmw5S!V8%=%~?_g3U}O8ZeXE6d6cu*%3`fbhOn!voiY!gv|Z{DXl>^uImO| zDuV!Jk4IY~J`hQyhB3FBWEj^soGAf%U=Yt#^($4wO4YbhwJ&d8xl+}yQVpwA<0{pP z)%lF})j-o_uXXJz)jpg|MfxC%lGke05ba&9x(INtt1&ybgcAb|t@Z7~vR)$S0b$yp z1`=dy;2BS*H~?!9Xle^RmnY(b12la$NGfJ;L|eeXGPyzRh$ptTpbkkZtFslQJUcUo zvmMS6apyR3=Wyja)ppW6otb*0-CH`H&QYE9 zv4JgN6=v8Q?&}MyWTKl|t6Dm^)*PmOiS&?Whe~kK^}OhakEGQnT{KD0ipE7<*Pje; zj$~3{hf10H37i){CIXRVjImT?h@UKUROikHXavZ%IQV zj0qUPP$tP4%OkJ{qTLEZTWg!9ndvFA$%)l{LdyX6zjEtMRUgBe1|{Db9*!hdVH7&z zXGS6e>-tmCSW8cO1n?fTfux^;5OW zgQ_!WLN}n4E5HAx093cw!{4&cv0}3UUGq>FcC{ zj#BP=#M=^)Zi_@S3yaUXei)hkU};GYoldg;pKh?tB5ot=y#-O`=Ho<>FRm%%5t7vz^3hW&VVB$Go_Q%`#= zia0jd(vqL%(-idJDILt>**i;g+c z6y3Zzf>8ojOQJnKm_R_9pkO1B4prS-!Wi0^#vJ9Uk`mt-j&;LWk?n{W>tyXVf~26d z?C>(k5b?p!N87KC|QU+v`L!b zBsWnfQmdXMZIY}6nx;9E7MXe&o|8x-Ga{+WB?c{As!xP0b5x;_YMW?BuAm|?uet7x zVQ42M3I_7h($BJt=^+uA72T0cB3g`o?e7(qWE!?CHi(YpOgq9`BWhJ-hbXPqC@|y% z*AO&fI2KV!hAoY;cz=X+gL3vUWctypW%AlVXym>KPm~oXvWYbX!${b&M7+=9 z!Q$PHT7n%`xCCf`GrOgOXqyQ%_xH53ZK)a}P>?!2s0~h*@?J>9mS{J&G9=HmXmC;v z&@ghKmyOIbBMHbP6OvG#F&DnoiK1~^HW(AW_yDuh)Wi;!*Ca00Rt}0~fSJ%>mSjfy zU?Xh~V<+yA5TA`E3v_xS7tKlKB0{mlw6IELS60KYh05q^Jjre@jaX_T>B9T45yZZW z6E|*&bZDKMWG$yeO>g#nv)uq#Y?S%)F_r6nt@cV>XJuX z9%N}sqe`JRQpy(vf*SQS(rGY!5VOVIr5RxP}afB)taxw<1TrE!%PMQu`WR}Ui zY+;gH$e@H!QD+gxa>CK@35+C$VU=Oa2g{6kSS&fzlH><+;wP=bP>1!sVN&Uu)W4kr zDXL&bgS7h83ch`?uN~fu1T=bL1>gjx7OTxK_PWu zQDl5N;%l)aMiLekt+RqDQTh-kQ9gPQ>Nmruye>(LPAn3}Go#Av)j*=NB8e{88EU(J zLN)9}9VP#WH) z9iX;{($!b!yKH2b>8^cAHV$-hHgZyo$ZS#0Fl3A65^1r|7TF)^%|?bPwj~=q3HL(Q z2=|a@OK6XLwn!F$Y!Q%ytmJ5aHgaTWARE~hmTZx_-r1rY(a9PM3Cc$2-qr^++2SQv z2wJwtAg;*N7f-*z{5T9ft0g8+4TBe zbSbL~SuLP9$Y+Ul1EIC>Rq{4V(1nX@Iy;jb!|NJ{$iQ}J5#B4x)^mL#iaaQOB$al1 zFx(QX+E7$aeF}zY7u+B)6`=K^o1p;IeMR`hCtSbq3 z!^Z>tlM;+F*v=WWhqsdi^~L$Y!=PXjL?Rc`7$aFl>4+lUN(#mMTd^0W z(8_RX%VxDX7VgcoAy!3tk$6B3(OhR{M|I?kI9-!%I<|OpIIO~Q$c8QzijfDX?MIu|78TRT{YDl9va8`zsTNOsQFP!R#yJW+t~U zNRWdN+32k$9g?V3t5_%?BG!|qh=_`%NGw|&yFhsOg;DR21fc=$BjJKuTvP(fvhFKV zSv?{TILgpYG>vOjrLp>+3kLbrgk0d{wxH8s9csYd*kU8KqQhH-PdYAoq3i{5;FsA6 zjS?WzdQs*GX=2sp|4*b)Off)q{>m6lbg z8q6Ps)i92Yq98J-So%l@uw}sk>PtG?LQCMc=KZD;;ov^aEJmWV7u<44S&n4R9AA0?WgLup1B{ zt)#kIBmHUzfgu7dNhE~mBDi`0Afs35SOWHn+T0Iq)(a5doWf3I1Rw5UWhT`XLIixh zqzh?bYKr$^|IeMGJ?@&)MSeED^Q4o#w%8yMpz_oiaR9yj9gn2WK%zLpulSa6Qt~ zJk$-Ps_E_4nT10|2}rOkKG^SKh5lZ>S>}Mlski@-0TTOozY}A5!yHOO1kT`y*nLzs z`$ZhIc_0~uCs#%Ell&TI;GT)4%b>&Hb`d{pN|T}0I^tj$z;Uw0*V4Z^u6vDxsW=8s zH4Lc60XjR1cCQ{#*lv-TgAgyq@I;WWdG!uBIFehU0}5Ta-G#Pq1ua9w8)Kls*G3PT z`I{-gE<$wnfNH}ZTyk=-Z*PJ?crs2HuR)AoZY%&6T?aDUauUgy;3Vos(_uIJgeM=dD9D@w(X4t>URIsB;T#x$u%rv$%#7QF?(zHn3%bK=B%94!`70x|MJmgbAzog@IQn3z(HbyS} z`id54n#rsaWz2a0V~w&#c91V4p(v-;_zr|R;_}pm%26SmA?oclAN)Z4vEz^+?NlVB z4+>j)kOv(MDrjdbhEIXS-;Ki;p~jern^cdD?C^zD5#Fi^q|0%(71@b}D6^T)G{s2A zM#eA0xod*9jLmeIubkr-En!k0S zMY(i1$w#pcNy;Gm8jFTLQ#nekL)JHJ9D;S6$V&Q60#ugHb1|(m z=#r%@%kc!xP@N2^8u}E?A<{$&ivfc!aNX;qe$>aR=oN7iSUQOpeIkmHgV2m_Vw%y- z54f=4JjRjy0IuMGOl3rq&9F#-Uj_DH?;xqf9SEtTc}Im{4}vF2h(4wv| z78uHb-Jopy~R^$JWyCr0g0@ z4hz=B>Cq3eo9sxxq?c6Kf%Hmg*6kPE3!B{uXI(U@>W2Wr1ZacndfUVrSji6EKBEI3 zg{0!0dlHA|U6B})V?z+`jj&GurncTJCza)KNRz_0=ujk9pR}2H7Fg1o899{3)-6sA z%1&CD%-n@1acMEyulpdWWJ}HB!8s$EN_UB{9;$V?p_a5ZLQvJ%H414ZZyzO0%NuQM zMH;)Y0Y@-nzKHe82Z(IR&T6v5xV+nsD&dbi*>(b;2-c(a{tNR0%LwN$8!{O;dq8Kb{q=-XVcQd)RI(1c2qM}pg0`%zJ=Kgh+Z}GtMrZ`t-Vi3!?CycD7%zBOwB|kaU(yK`h4IViH(aEQ#vmDP{)Icq}Q<7YAq|=M8e^ zhwJPWI)QI$FrB0cG6P1MI}+MXC`IV|XGzjCaIIrn!>pq$5lUD`Glc7Pz)f<_Xo!JV 
zeX+Svl3b@*5s6^|px89yLxvldh}m1hrUOkJ7#L|K$w*<&XeEPYcf`?E1E`r5r^~fa zvFUVUq%sg)uA0T7GvHF)HVi7+0M%i!rTD7Z2w|p8jFP^o@1{!`>;uw4oi#hzP^_l* zb;A1V#qo~T$uNX%0vdyBf=RyjnS}S1UH-JIg}A%DvZ&nZ&6|k>>qVT~vpy2--GUT2 z{Q!Lgd54aMMT0n+pX5>HdFFa2j5`3LNOudvsfb+E)Y}P>bUFON5$v8ICP{fI zV?=IZOhIj8=Pc`jMZx6lv%+zC4~-o@1Gve>atL<<$tNAb)ThX;V_o4RD^6c$<;rA6 z%B?co6i;Q*J6N`msp~3+=rJyFGpjUm48o`cpBx3yk=^R93t){PhUR=`uDmhE%;Y#x zO1u(-vk969*=ehkw^8Z>cqjCh3XVQpQxkH+31VC@5!1|MI$Y?$!3111)@tL@Kr>vX z*9^AlZ0TS*5<#Jb)(X)TOC5%jTZd@&L&kdI{jp&_m2=VKEK^GR`B$(gai_`Zp$TzU z1iAV(I^-g6k)$Gwn8=;vVTyJ~QxwLwR>JML1a6&yv_|_7+f5tU)i4YZrat4q)^1u( zan{9;A$e>a5~4hYr zTHCq@W1(J2M1B^}&~C_Vsog_(47TGagtfaIeZN;hZq|j1YdbsPM8uuf6dyaeZZEHZ zxqu~!ASrIBavpmL@rjiLwRkujA{~M+*PAUHlj(1xdOd;l2)~F;wZl}`ZZ0~*npoW* zSrH#GEA@5*ZW^gLO+}1%3L6F-=cd^1peHdo zi!XE(WuFNm*PaNDwA>`LS6V0w2YKO14{EfIASCHblJ1QVgqF^ zx1`=HPb^DpxxNP(?c-7u)CZ<{gE@<>JfEq1)*4D=62zwFu3 zC8GDpE87j1^bzcHIry>_<95}$Q$ptKvfXqo)K~$tMr)T)`sk&lA2GJ>SZK(s*{*br zk(=Kd8OQPJj&(U_-DKSa*KoRHbqlgqL#K2PojWPnVOi6;U9Ro~Ztts`4$l@gzA8ol zI~72_$$+_>sU-C4C*n+p6$Hy78INtp=93U@`@z5vJjzs233@?cExHODVQf~mM>s$h zIW!?{xoG*J&2VP1@GobQaJ^;(jXlqqjIELIR(O9$a>KS6gxS)AE9RML(1BPGwzJA) za};JvCJ)}#cHM7S3oL;@?Ao?$ZXU{HM$c!G^Z_0sppYq^-e1A=$>gIm!7OCRh82U? zaP^j!buxK9+TuyJK9gd+LQ}(BrBwiHjWE$G7ZQq+B8P`>h{kVobO>XrUo2*mNG_Pb zex6k;c;?%nLZ!tj{gqImW7kA))+%^Gu$PiBSu%L@2~G>asX#I#asPsOT~}I!fSXqw z!Iu>h62`_!0_3J=^Hosx|GROVdbv;-f5Jx<#v0++ek__{E~hKrGt4MVauNdGqf^+r zOQ!fvJr>ABd?*U9KqNMdvtE9Ua5Zd6Irrrdc^;(@#W$nq5Dr_@$N&ehi-I?ChlKG$ zEeXmhUK13kCVBDx7cLGbw%p^pEyKXqh0hi08AO%yMMg5ivtiSRFN(d^~kAy<^?p_|LHjANO>8KZ*d2}5n}N@SY*CQ&hEKkwti_v#dSBf ziu-weC5(= zQ0rtTt`lFdsZ&Gv>qHAAq{`FdDd!>1pLk{?Jo^}VjEwIIE_n(VG)D{9w>p6bT;bF6akxjNk!wW#F5?dV|{5v)+$ zzB9RQ)F)0!t~MVv$1{;5UZ8&eInFy;9jzRXQj=zPx=wthXk`1T-nh((z9&hd);FN- zarDtvYYg8b7aE7-$mO*3x*WSyjBK8pU&eU=HQa(PbW&aMiim66RE-K=PFE_(_!suN#%iXlDb6~e73C64*2#?L4z-E;OL6G{@6QoR}@%E;(zgU4t>}&=z=zw9bj31x-qg zZ{glbfi5~}6>5Zo9cZwl#dLs|=4)EbroA_}UC-m-wQJ4#Y5+a1c7D!1MN8F1_!!nD zsuW6tgkk`df*}VXDMbj!Ss1K7bYnlLNzmfJQ~cZvC_>VU_)>gIl$t(_(7DL726;}$ zvsOQ6;kgNMi}5!XALO3{d}rXlTAp6Z0hC2k3ipcfU-(KP z2-1b5in8TY3ZZiZ&{9J^$OYeMoGL9}fEq}BOYy@~6s9_t;&WjXrh2Q>92H1CK*KMo zr01xA`ZR^wt$g_QFg{nQeCzSofZuXm4>3FwqF_y!41TWC~!5OWdwUg-0)_t1NkZ3_X{>_x*-hsRa3MaLew^H4LA;S252A}OQMqES zDmD1t^%>Ape!AEb*tg^OX3rkTaPlmg-Kc!@QA8#c{Q#s1qL+Buzmop3Tl_ zVvHVF>by>Tpc!|0p@+k~h&Di0XmL@eIg90S=bJSRn&l*B%le%NZ}sV%O@Gj z)W;+>ShP#nW~hxilPm= zSVB0T<=I-&%H`pB2ct(DP?AfMVmJXnq0J*v94Uq-ZqJs_GHwe#gTtxGnxgf56`q#8 zbF5rdfb`W4{QAQ(Zfig0R4Mv}o?_9(ksr-8%(L+`0BL(lmLAc?lA284Ob1S_Ao~vA zg212E0{W;w=^l|vyqbl$1Tg7!9B1J_trBFC(>yH&H&GJcqmmOW=O}$lDKMIKjztMe zFRO$#AqSclH0x<~vFfl4xluirL*uMd>fCA_BHMztin^3s;54HYYpk8C1MfN7?YLT2Re-t^$OX){is(zo+1k=@TeU!H>p1| zb8=FXjw3xlT(qgCo}|qMg~%@IVLFv9BTGC{ta@9xY#L+fO&Uw80i>*E9X|&#nj)1{ zK8=5RJqRBL2lU^P^tRR`7ov=m02y=XIefwi95HHX-}^0*Y5$_cKll2+spMlTQB+jBL_u`(#l5bC1>t=unE_BPZJt;42LdRdJh=~Amlc_=YU zOgmc8l{IE1Vqm{!Nkcjd&bBNuTKi>*S;=FWEsfZcYVychFGtHPt&mV^+Iw*{qb~;0 zd#LqEBYCN3X+=_%M~`#m0Lv=VG3koUng>a$WxcINYBgnK9x`qgS3M*HQ}g5)2MCSVv=pLO1r5a5t8*r6{PzB)HrL+ zNnh#Js1(xDEN#L(knk)Dk8$YrC`UB6+Ia^FG&P4BMt`DmX<4IX!MzM-bBmB_pvgI) zxy9fu<*7gwqa}fqDk-;->RH?)v{aG9!g7>Y%he{(EXAmu5BNE6QG4#?M;0>6{XhdC z6k7VQbTy07L86TB?b%ItD@kiXwYpTq-cjk3VG=kclUgAogz`I9*|a60#g>+RYdIBn z3iYe>l(Ptt-zYEo{btV^*qXa}rAuJuSmts)7n#}CBxqjXwIEVGAXiIz%Q zK;35@6fdqgx4}&AD{|7&$ZTRcRKDq$I@BXnOZ>FeIH9LImOw8NIxSD+YGB@g&SMFw zLD{mp>qX`fnUC(eP~VX$K{Rm@qVI~NpQuc!p)#ml%C`xBr@_F@-ojO=32mU5cbwC| zl83Hak<&I!8JF2MO`n)auR+c=RzH=vTdq@-xZ6$K)E1}JGlc>!tIek(KjQL~WVRSB zElF^`rJS}Ww6KuGA@(GR6@x)!%Z@Whqo~Fl^s{IS^2-k+HOVN|!7`V3 
zfM%JMlA7GGGw@3^GD6CSA9BkhgM~^br=2oZkt~8{ZPQT_RJ1jFNRQE+?!j-=s5#j- zJ}bdN8aM{Ci*l9uMH_!HiB+CcTjY&xv1PFGT*EBPCK2GoMuYOQj6P|PD#@G!;GA5F zK5+kc>jEwXsWNq%$|G%0`3pw_+9`9s0h_pb)L#`udGZXq;sHXiGqvkTj31Vl)tW42d*}02+6}b6_ zjI8NAaz{q?oHg*YkWu%h7nDRpGIxNMaj_O+x*jR0a8h{GeS=6ZxZ1f}s<0Zw)c;f^ zwq|xiW+(g;8{9Xtz5l|}LNhR6^h=4Vfb!*zICKgO`qWJaAV# zB~o6;I6Fof(kZIkoyMK3bAUp1p#u;#+0DBSxsNzg1t%RBs*~NE)b@;H0`Nts#%X+6 zT>ohOV#zVBZEm+>mfVplQpWioHC2|fPIuSYU4rl)^T^>`OV@uLBj1Iwv0U~v(q!Gy z#y1<)NHP zcR6jSa}P_`+O#~?N_TnW?oM}*yw5VHW-l{~_SZ7uL@%dLmb{Ji;go7>?1LnJ(!_Q|L4&xnnb(Q>9!+pU#_eOB?yMD1$6O z+Ecsht45W=eM`1UW~;H;*dBJPeky+U%!?zXhv$hdVf*_}e1E*TtYdD>38j6BmE zc4->jss)^O7(uF~9;37o)God>`D{h~GhBRVXD{`{|JUBx#>#P(cYJ2{WoGBjox3~N zH@1^qvmpsF!dyGaHF2ojB*YgIH3mZL6x1MU$A+XN_9d~wE)Lz5tky_LOIlhLwNyYa zt*Jn&t41Ur@&UD|v<)o*)T%$gwgLg?r_Z60oZ5f+!~Ivv_>O_@XkeJbYNG%LAU3I=7!|sg(yp8y`ov zKF)4PyH?kohe56N&9wRzta&KZTBYf$eRBok<8x!O)#5^ElyPqlAi`I>U<}uRt1RV>!@sAHj~7O9gfuREb) z-49V-e;>;QO&+DUV5RaGd5%C!ln^^;21zyC;Y=1`zuj;b&rk?{Eu@Sy@a@wbWf$zJ zy5b5S$>r3@a-XM8Ug1(n5U;tH{xKO+nyDP%yI7f4yhu&i%jCNjDwKBXEaO~ymtKxm3Xb~3dE4U%~GDlBhM3WZ>$SN=_rUFrW8Yg-SKv6 zor$HL)?He0OA&05u~B%=RX=;WPVc622+OU#9Cj;$1g?h*m3yzN4{H@ht9OI;Ks3?p zTyU#so6@AH7cY4CltWy~%k3;MOlRvtDb?`3`j^99c{s3^ic`*Bq|kF>6Pyaf7$ltw$Vw3D>svMC49u z?I~O1ver_&>~xmL_G)KWhF;^M*7E-)+|$|&cF|?3t?N@2VqG_kp=Zo{`gxbfRU!lw63~-6@=rOKEQB z%9OCw>Y7q}Ed`~i);zmXyUSHLcZCG4Qvi@ZMVqbdR zue3k@#XtJl3wJ+tL)YK%((nI{o+a^{L*I{DB)tV?Q_EhbG&@Cm&|1hI@%zo8H!tvh zxR8y$==YZL>#ivD7qVOYY-ihDnAc$5YkPh+`p2{xs=@OE3c?5q&kMq^71_&c_}RD@ zMJ+F&-17WQbAIGC{Y`@;iW(u)3L65XikeMPH%uud`727IFl=PwO;Not1g>!QfjHA8SW2fgvI9X5rG!O}Esg??|`XMVkvPS}onD>1LOng;#ZFa(^sD2{p? zgp^@5QkP=eYouwvABF*CifSE1vti4MgGhg|0b7OjPU5ghDI33=WqDq*Aq*)FqTViB0W52n_a*>tb)bNmj1N-Sz1+Ks zOn`RAG!LWX=t8C~WH{QmKFOiG;4o0(Mtl-_sLI+HATHp6m=k%SF4&%8#@mGUHhbjpG@8V2lTciwNq`YHqb84b88Y|^E`QOagRHVw@4Y>QpyxuF3^=lKm`D=m7t zu}Fp0=Azj<$QZ$qXBj#L==B_@(FGRrSxuJf=kv*G3mq!A_|N8xH#h?F1}avZDRmOh z(@)h9r$AtvyJL1rU+{K0blQRPxubel*bzbUy(wXyVx!VF$xV)un|atMH59d7+oXf} zLg>M6ygx57!J+|H(_R`Ps9@(GQ6vE^4qxKPbGnw;-z8l+iZFu2Rm3iaZP8Kn%$(eH z$S0uJ6~sVCNx;ItVyTn@--9aB>xm{=vz#H>Cq zQ^>g}(N0762`WduJ)ur6Qw4KA=X^n>@r2b8piCipyfw7Vn{+Oa=xXQ0mTujrSS7z! 
zML6jxw+&V`E9sN4V+q`xYzo_?1(rg?Q1$xI_$NgoIRsTwYcOyh;Iy2XWP;as8@Bo_ z{+3)HBa@sW&aXijvw>&Qz&dM&PL10NI&nia{5u8>8nK@u^=t~c+=j6jdOoM}q|)^{ zfRd&wm!K76b_7l?k5+r*-B(IE)Tl((dVM%cljG`w6dh#q6+CBnvHF24j+9JZxd#FwU0(49Y>aGs<={R*;fJX$+a7ZbyINumNDU zYy^f*dW)9h1Cd(bsq-SLD3FF7v5=5OolYn5TG}K=!?5nyR$?+oy>OQC1rJ@7Cf!gI zw7jcW2zzK@tOOc6ibYXY#xz z1ngwqDwx4eW^AwNl9=H;sMref{81ph@v>_`cwH~2781k zFbnF1MjXZ1`;CdIugvHL%Ch&HaS+A%mO}bo9M58G6#FsVV0gv0MooD$vou7^4ATut z%Dyp5zM$!LP1{(O4y>lwDy`l*FviEFYLPN9dIlVRXc}ZEv*ToA=W@k}CrrXvDvL|c z?fqObmn?VnW{x(wQjJ7{=Bug#Opr6)Q<|C_?Az^az&%T1(;(t7;-Ae_s_hYfm) zEE_+VjUR?{Ozg9BVQWsjrSl@F;x!*~y4SoqAr_3i+FjqnIfUs`Jl7Uar;HC;dIUd3 zvZjG&?e|9i5P3>_0I}+c9$T zbQ%lR+6W<#O1LUn|vC~*=Ha^8+ecWlZaX;*GR$-8{W<5?kQ}(_ko#e+# zk`~FJg&R3&Shmqdh9i1Axs#Bln_=+LGqC@EViuyh{wvB(QIU=QTW5mGkhsXH-uOv& zBO5=}1{FXc%TCdRHa?Y|f^7Zl6k&@bnhCMfIF7uMOpN|%D0PcZT{iyhfp|m(2ygTQ zikdNTkQAEc(Sgq(M&`qqF3?2Sv^X$|>}zDw3=TulUgX8o*K3iwm zdk2DkiuCTMa_%`xSiM)-8jHEWakd<2bB6!3{IB%J$AK*IACzYHtQhsvE?5~58I>&% zbGo5~j&s0@WyV1-+c27J*HC1~wF47RL)&k`D;&Pf;S3bz&j~cd)F6NVRCsj|WkZIM z77!a~tEYd6CLu~pC{ut#W&W6n*+k&n!zL4uBkA1}n_MzBo;3^5vUH$9N|Gct5rfd+ zpuzB8{l)w5+r0GRCz~%Ju$r&FO2X}Auh}I@%Fie=6xF_xK*ntVLYYjV@uYx0Ph*CeFtHT4C7O`g2-nx~Zzxb>Q!Q1VMkMoPv?o>20n zl3!KwYf3()Ko?P^r-&FE9O8!>Kx0Jk~h&RG84h{fW^B&NS5G5J;FZNo<%d*Ilk4?lEh+c5tX!97chJM_10cn4oce&W$1 zdyXDCah%`U+%|moi9>uxI$yuJ|Ja9*9Nly1=9>@hd|>BWZn)*9r6W6b-Ll~C54!s4 zIWX}<)O_DPUvla<+L-ojta36@qupC+AVmX|T05Y+caLPXJkjOQW#VR=lOD9wjw^GN zRP&OOmzDen$+#P|qZU{>kANZ52=kFKId{RuLFh<@i=nB>0(vus6le~;8-R@I3yT#Q z4h!bFX%ThFGrNlUlt#i_bm*LL$QKYfywad7dAFu~aWXem-JMUu+}9?a)NU%5l}WUu z(RpR}D!Eh1Z4fL8+8rT!TFG-to>lUUk|!GNxkO{j*0h1lm{rXI+!CQ!QF0tSDFeu2 zpv|JcZOhmiRX&>&6VYo8*%;(#dPez%lILxH-&OWoN-hZRIdbD`wWcC$AGG_dXvkc0 zLxs{dLG9A?Tx?;I3tUdJJjY=%12_E?Pg6b80M6MU6nB%kN16AkAN~PugNlchJgE7c zRBlw@Kb@-hsiJt+wvmalL^XU`$?vM$=aj5+)-wEAxJz$2^@H&g2BjGh&kaZ(3W`#S?L@19|b`ddCUzLr#wgrP_`-+ zQaf{Aqf&>C?E9y$j! zuhGIb4B3h)gj2Ec&;q{rlF^)XQMlGMf!e`i{g6r3(yQV?8P5!L$4PB?y#+E#uAWt~ zjcyPVCN7J~3cU3sLzF0)8xp0E)Hny{AdNbpS|;0#XIZXQV{N@05Z9lCB}HEi?2Kzh z%$p21w@oZGMS)07c7m%mml8`y3(Ti^7%-t=B3`fx!UfsavWAcqk zHbb`-CGB|IL`D}<=0(W_23M8>X9xu>BEJ$#if}UqNYJR?-KZNS zg=mv#fdLZyWrYVPREQ!cd^7@ngAq&=0ikK``C^Px-9&UTh!&L{xAY|Zl8w#)$dT}i zxRdT1J4eRFS*351*&)vnNZXk9Va*oQMg9#Nc@ESATWonUObWBYrLQYiR+?D3eY4kt z+WN~?ng}TU7G3RavX)6jUT=+-^a?V=QV!?5ChU|YGjs0*HgpS6Cpg#&7B6;?wsc!> zm+s2s9r4cX^dhBSLeJ)^1Jg*0O(zv5`BP7!V{`WCzT(1;u9#-Sb!wZ9m7YJA7F zqnoy}N}u3p3)Scxeta&9Qke}BIf1w)8+~8d$dhdOJ~y0P1|U~~8DbfePj()jF%Xz1 zzZyOUApoP$iA{V)QqyutM6H5Dye0D_;fa*E3U-RBqD~CJzXB=B0zzN0wQBPml}w1T zdpl5pXc$))K~6&nteC-yu1Uq&yw!1vm8^dbIt7%i({#lgpy%l>LK%X&NI;FwQ|XTL zH{9iUPF`xFH^Q=T_wJZap-ExH{H#kN6L8^nDwp#LObyT%uUPSw)FFFG!)O;@Bt@mT zylDxtI)l~-1zaZOnGk@nC)wHo!9OUo%3>yUDq}QF%g|_20KDt9N<}J!n$ckta_Wu2 z%D9$Uo%PDX8FnRXqj@1+&>3`ct2h@8>3N2`UaVai?9m|vRmN0R+Lw&N5Lai(J+iu~ zz?(Y}rBZUm5H-0frtiCLbWk$C)JJu$hC@85DeAfgMPCO9N94#ID$bB~K|$qR^W31% zd{xH*BjS`dqf*>Nk+ykNU;0U4D?A3kqJ@U1nQWANSfsJ*g}9Nh2LcgI$0BR|4C%7b zpJMKV;vDxj?Mh&& zql^aYZT5uAv9A{1MV3Hu{!5hM!7Iw3*;psCV*FoXJmaWR#wk?%JVab8dgI)LN=%|YZVQAf{ zpc&tqi;fg^3hr9nSuU(+mEaK}kgZsg?NH!rW>V#VzS*+m#==%JpQ4w(3;uCw? zA6%iIqcgg&9R`>$_!_jeTMb1$lRvXc+y^M7({!#f!_y!YI@_udo#dhNe_d;f2} z;}72Zy)SHe{Fy%!-1YIWUSA3O95+w)(3**;y>%Hczx|3C4%_s-jI-1!zSC(p+U@<8UJKiK;x zpZmlc|1doFGynABUqH&@_0P%=i&FLaof4eq@k@V??tS>-ef%V^XA#OHM-~r1%*V_1 zdQ~40<<6cWrS<=U`v3Xzy)FyiM8vqdew}QO@10s-FStATfA4J#?{0U#RYhKr{``U8HX(uh(GO~)AmyMt)_6!t#**hTTdQB7qBADq33{~P%i?%iGryjJ5r zqM`5i)7P=PZ~6212 0 ? 
1 : 0), 4); + fence.Complete(); + } +} + +//[BurstCompile] +struct UnsafeMatrixBlockMultiplyUnrolled8xhJob : IJobParallelFor +{ + [NativeDisableParallelForRestriction] [NativeDisableUnsafePtrRestriction] public unsafe float* A; + public int AN, AM; + [NativeDisableParallelForRestriction] [NativeDisableUnsafePtrRestriction] public unsafe float* B; + public int BN, BM; + [NativeDisableParallelForRestriction] [NativeDisableUnsafePtrRestriction] public unsafe float* C; + public int CN, CM; + public int bs; + public bool transposeA; + public bool transposeB; + + public void Execute(int colB) + { + unsafe + { + int sz = bs * bs * 4; + +; float* blockA = (float*)UnsafeUtility.Malloc(sz, 4, Allocator.TempJob); + float* blockB = (float*)UnsafeUtility.Malloc(sz, 4, Allocator.TempJob); + float* blockC = (float*)UnsafeUtility.Malloc(sz, 4, Allocator.TempJob); + + for (int rowA = 0; rowA < AN; rowA += bs) + { + //for (int colB = 0; colB < BM; colB += bs) + { + for (int l = 0; l < AM; l += bs) + { + + MatrixUtils.CopyBlockWithPadding(A, rowA, AN, l, AM, blockA, bs, transposeA); + MatrixUtils.CopyBlockWithPadding(B, l, BN, colB * bs, BM, blockB, bs, transposeB); + MatrixUtils.CopyBlockWithPadding(C, rowA, CN, colB * bs, CM, blockC, bs); + + MatrixUtils.MultiplyBlockUnroll8xhPadded(blockA, blockB, blockC, bs); + + MatrixUtils.CopyBlockWithPadding(blockC, C, rowA, CN, colB * bs, CM, bs); + } + } + } + + UnsafeUtility.Free(blockA, Allocator.TempJob); + UnsafeUtility.Free(blockB, Allocator.TempJob); + UnsafeUtility.Free(blockC, Allocator.TempJob); + } + } +} \ No newline at end of file diff --git a/Assets/Coach-ML/Barracuda/Burst/BurstBLAS.cs.meta b/Assets/Coach-ML/Barracuda/Burst/BurstBLAS.cs.meta new file mode 100644 index 0000000..fbe1bbd --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Burst/BurstBLAS.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: 5991aeeb69c95451aad913637fdf5036 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Core.meta b/Assets/Coach-ML/Barracuda/Core.meta new file mode 100644 index 0000000..f3b79c2 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core.meta @@ -0,0 +1,8 @@ +fileFormatVersion: 2 +guid: c29f3a5cd9b2b481a86711db870d89af +folderAsset: yes +DefaultImporter: + externalObjects: {} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Core/AssemblyInfo.cs b/Assets/Coach-ML/Barracuda/Core/AssemblyInfo.cs new file mode 100644 index 0000000..f0f2692 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/AssemblyInfo.cs @@ -0,0 +1,6 @@ +using System.Reflection; + +// DON'T EDIT +// Will be replaced by Tools/Build/build.py +[assembly: AssemblyVersion("0.4.0.0")] +[assembly: AssemblyFileVersion("0.4.0.0")] diff --git a/Assets/Coach-ML/Barracuda/Core/AssemblyInfo.cs.meta b/Assets/Coach-ML/Barracuda/Core/AssemblyInfo.cs.meta new file mode 100644 index 0000000..501d72b --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/AssemblyInfo.cs.meta @@ -0,0 +1,3 @@ +fileFormatVersion: 2 +guid: f7f9574517c146ada866c486dc392731 +timeCreated: 1533296387 \ No newline at end of file diff --git a/Assets/Coach-ML/Barracuda/Core/Backends.meta b/Assets/Coach-ML/Barracuda/Core/Backends.meta new file mode 100644 index 0000000..a00328e --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends.meta @@ -0,0 +1,8 @@ +fileFormatVersion: 2 +guid: 6ea9066885552475492543edc812867d +folderAsset: yes 
+DefaultImporter: + externalObjects: {} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaBackends.cs b/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaBackends.cs new file mode 100644 index 0000000..b2c27a1 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaBackends.cs @@ -0,0 +1,163 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using System.Linq; +using UnityEngine; + +namespace Barracuda { + +/// +/// Interfaces for backend implementers +/// see ModelBuilder.cs for detail on layers. +/// +public interface IOps +{ + Tensor MatMul(Tensor x, bool xTranspose, Tensor y, bool yTranspose);// @TODO: consider MatMulAdd instead + Tensor Dense(Tensor x, Tensor w, Tensor b); + Tensor Conv2D(Tensor x, Tensor k, Tensor b, int[] stride, int[] pad); + Tensor DepthwiseConv2D(Tensor x, Tensor k, Tensor b, int[] stride, int[] pad); + Tensor Conv2DTrans(Tensor x, Tensor k, Tensor b, int[] stride, int[] pad, int[] outputAdjustment); + Tensor Upsample2D(Tensor x, int[] size); + Tensor MaxPool2D(Tensor x, int[] pool, int[] stride, int[] pad); + Tensor AvgPool2D(Tensor x, int[] pool, int[] stride, int[] pad); + Tensor GlobalMaxPool2D(Tensor x); // @TODO: consider, if it should be just a special case of MaxPool2D with {pool=X.width/height, stride=1} + Tensor GlobalAvgPool2D(Tensor x); + Tensor GlobalAvgVariancePool2D(Tensor x); + Tensor Border2D(Tensor x, int[] pad, float borderValue); + Tensor Pad2DReflect(Tensor x, int[] pad); + Tensor Pad2DSymmetric(Tensor x, int[] pad); + Tensor Pad2DEdge(Tensor x, int[] pad); + + Tensor ScaleBias(Tensor x, Tensor s, Tensor b); + Tensor Normalization(Tensor x, Tensor s, Tensor b, int pool, int axis, float epsilon); + Tensor LRN(Tensor x, float alpha, float beta, float bias, int size); + Tensor Dropout(Tensor x, float alpha); + Tensor RandomNormal(TensorShape s, float mean, float scale, int seed); + Tensor RandomUniform(TensorShape s, float mean, float scale, int seed); + Tensor Multinomial(Tensor x, int count, int seed); + Tensor OneHot(Tensor x, int depth, float onValue, float offValue); + + Tensor Relu(Tensor x); + Tensor Softmax(Tensor x); + Tensor LogSoftmax(Tensor x); + Tensor Tanh(Tensor x); + Tensor Sigmoid(Tensor x); + Tensor Elu(Tensor x, float alpha); + Tensor Relu6(Tensor x); + Tensor LeakyRelu(Tensor x, float alpha); + Tensor Selu(Tensor x, float alpha, float gamma); + Tensor PRelu(Tensor x, Tensor alpha); + Tensor Swish(Tensor x); + Tensor Abs(Tensor x); + Tensor Neg(Tensor x); + Tensor Ceil(Tensor x); + Tensor Clip(Tensor x, float min, float max); + Tensor Floor(Tensor x); + + Tensor Reciprocal(Tensor x); + Tensor Pow(Tensor x, float alpha); + Tensor Exp(Tensor x); + Tensor Log(Tensor x); + Tensor Sqrt(Tensor x); + + Tensor Add(Tensor[] tensors); + Tensor Sub(Tensor[] tensors); + Tensor Mul(Tensor[] tensors); + Tensor Div(Tensor[] tensors); + Tensor Pow(Tensor[] tensors); + Tensor Min(Tensor[] tensors); + Tensor Max(Tensor[] tensors); + Tensor Mean(Tensor[] tensors); + + Tensor ReduceMax(Tensor x, int axis); + Tensor ReduceMean(Tensor x, int axis); + Tensor ReduceMin(Tensor x, int axis); + Tensor ReduceProd(Tensor x, int axis); + Tensor ReduceSum(Tensor x, int axis); + + Tensor Greater(Tensor a, Tensor b); + Tensor GreaterEqual(Tensor a, Tensor b); + Tensor Less(Tensor a, Tensor b); + Tensor LessEqual(Tensor a, Tensor b); + Tensor Equal(Tensor a, Tensor b); + Tensor LogicalOr(Tensor a, Tensor b); + Tensor LogicalAnd(Tensor a, Tensor b); + 
Tensor LogicalXor(Tensor a, Tensor b); + Tensor LogicalNot(Tensor x); + + Tensor Flatten(Tensor x); + Tensor Reshape(Tensor x, TensorShape shape); + Tensor Transpose(Tensor x); + + Tensor Concat(Tensor[] tensors, int axis); + Tensor StridedSlice(Tensor x, int[] starts, int[] ends, int[] stride); + Tensor Tile(Tensor x, int[] repeats); + + /// + /// Prepares tensor for use + /// + Tensor Prepare(Tensor x); + + /// + /// Waits for previously scheduled OP to complete + /// Tensor x is the destination of that OP + /// + void WaitForCompletion(Tensor x); + + /// + /// Reset internal allocator + /// + void ResetAllocator(bool keepCachedMemory = true); +} + +/// +/// Interfaces for model compiler +/// +public interface IModelCompiler +{ + void PrepareModel(Model model, IDictionary inputShapes); + void PreExecuteLayer(Layer layer, Tensor[] inputs); +} + +/// +/// Interfaces for variables +/// +public interface IVars : IDisposable +{ + void SetInput(string name, Tensor x); + void PrepareStorage(Model model, IOps optionalOpsToPrepareTensors = null, IDictionary optionalInputShapes = null); + Tensor[] GatherInputs(Layer forLayer); + void PrepareStorage(Layer forLayer); + void Store(Layer fromLayer, Tensor result); + Tensor PeekOutput(string name); + + ITensorAllocator GetAllocator(); +} + +/// +/// Interfaces for tensor allocator +/// +public interface ITensorAllocator : IDisposable +{ + Tensor Alloc(TensorShape shape); + Tensor Alloc(TensorShape shape, ITensorData buffer); + + // Repin() callback is called from the following Tensor methods: + // PinToDeviceAndUploadToIt(), PinToDeviceAndDownloadFromIt(), + // Unpin() and UnpinAndDisposeTensor() + void Repin(Tensor x, ITensorData newBuffer, ITensorData oldBuffer, bool disposeUnpinnedHint); + + // Cast() callback is called from the following Tensor methods: + // CastOnDevice() + void Cast(Tensor x, ITensorData newBuffer, ITensorData oldBuffer); + + // NOTE: Release() should be ready to handle edge-case situation when + // externally created new Tensor instance is passed with + // ITensorData (tensorOnDevice) that is already owned by the allocator + void Release(Tensor x, bool calledFromTensorDispose); + + void WaiveOwnership(Tensor x); + void Reset(bool keepCachedMemory); // end-of-frame +} + +} // namespace Barracuda diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaBackends.cs.meta b/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaBackends.cs.meta new file mode 100644 index 0000000..cb5b450 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaBackends.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: 67f00a1befd4144eca5685250d893f09 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaBackendsFactory.cs b/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaBackendsFactory.cs new file mode 100644 index 0000000..bd311b9 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaBackendsFactory.cs @@ -0,0 +1,181 @@ +using System.Collections.Generic; +using System.Linq; // ToList() +using UnityEngine; +using UnityEngine.Assertions; + +namespace Barracuda { + + +internal class BarracudaBackendsFactory +{ + public static WorkerFactory.Type ResolveAutoType(WorkerFactory.Type type) + { + if (type != WorkerFactory.Type.Auto) + return type; + return GetBestTypeForDevice(WorkerFactory.Device.Auto); + } + + public static 
WorkerFactory.Type GetBestTypeForDevice(WorkerFactory.Device device) + { + switch (device) + { + case WorkerFactory.Device.Auto: + case WorkerFactory.Device.GPU: + return WorkerFactory.Type.ComputePrecompiled; + default: + return WorkerFactory.Type.CSharp; + } + } + + public static WorkerFactory.Type ValidateType(WorkerFactory.Type type) + { + type = ResolveAutoType(type); + Assert.AreNotEqual(type, WorkerFactory.Type.Auto); + + if (WorkerFactory.IsType(type, WorkerFactory.Device.GPU) && !ComputeShaderSingleton.Instance.supported) + { + D.LogWarning( + $"SystemInfo.supportsComputeShaders: {SystemInfo.supportsComputeShaders}. Falling back to {WorkerFactory.Type.CSharp}"); + type = WorkerFactory.Type.CSharp; + } + + return type; + } + + public static IOps CreateOps(WorkerFactory.Type type, ITensorAllocator allocator, bool verbose) + { + switch(type) + { + case WorkerFactory.Type.ComputePrecompiled: + return new PrecompiledComputeOps(ComputeShaderSingleton.Instance.kernels, + ComputeShaderSingleton.Instance.referenceKernels, allocator, verbose); + + case WorkerFactory.Type.Compute: + return new ComputeOps(ComputeShaderSingleton.Instance.kernels, + ComputeShaderSingleton.Instance.referenceKernels, allocator, verbose); + + case WorkerFactory.Type.ComputeRef: + return new ReferenceComputeOps(ComputeShaderSingleton.Instance.referenceKernels, allocator); + + case WorkerFactory.Type.CSharp: + return new UnsafeArrayCPUOps(allocator); + + default: + return new ReferenceCPUOps(allocator); + } + } + + public static IWorker CreateWorker(WorkerFactory.Type type, Model model, string[] additionalOutputs = null, string[] trimOutputs = null, bool verbose = false) + { + return CreateWorker(type, model, additionalOutputs, trimOutputs, verbose, compareAgainstType:type); + } + + public static IWorker CreateWorker(WorkerFactory.Type type, Model model, string[] additionalOutputs, string[] trimOutputs, bool verbose, WorkerFactory.Type compareAgainstType) + { + type = ResolveAutoType(type); + compareAgainstType = ResolveAutoType(compareAgainstType); + Assert.AreNotEqual(type, WorkerFactory.Type.Auto); + Assert.AreNotEqual(compareAgainstType, WorkerFactory.Type.Auto); + + bool compare = type != compareAgainstType; + + if (WorkerFactory.IsType(type, WorkerFactory.Device.GPU) && !SystemInfo.supportsComputeShaders && !Application.isEditor) + { + D.LogWarning("Compute shaders are not supported on current platform. Falling back to CSharpFast."); + type = WorkerFactory.Type.CSharp; + } + + IVars vars; + if (WorkerFactory.IsType(type, WorkerFactory.Device.GPU) || WorkerFactory.IsType(compareAgainstType, WorkerFactory.Device.GPU)) + vars = new ComputeVarsWithSharedModel(); + else + vars = new DefaultVars(); + + ITensorAllocator allocator = vars.GetAllocator(); + + if (verbose) + D.Log($"Storage type: {vars.GetType()}. 
Allocator type: {allocator.GetType()}."); + + IOps ops = CreateOps(type, allocator, verbose); + + if (compare) + ops = new CompareOps(ops, + CreateOps(compareAgainstType, allocator, verbose)); + + if (verbose) + ops = new VerboseOps(ops); + + if (Application.isEditor) + ops = new StatsOps(ops); + + model = ValidateModel( + PatchModel(model, additionalOutputs, trimOutputs)); + + return new GenericWorker(model, ops, vars, verbose); + } + + public static Model PatchModel(Model model, string[] additionalOutputs, string[] trimOutputs = null) + { + bool trimModel = trimOutputs != null; + + if (trimOutputs != null) + { + foreach (var o in trimOutputs.Except(model.outputs)) + if (additionalOutputs == null || !additionalOutputs.Contains(o)) + D.LogWarning($"Output specified in trimOutputs was not found in the model: {o}"); + + var newModel = model.ShallowCopy(); + newModel.outputs = trimOutputs.Intersect(model.outputs).ToList(); + model = newModel; + } + + if (additionalOutputs != null) + { + foreach (var o in additionalOutputs.Except(model.layers.Select(l => l.name))) + D.LogWarning($"Layer specified in additionalOutputs was not found in the model: {o}"); + + // 'new' means that output name does not yet exist in model.outputs + // 'valid' means that output name matches one of the existing model.layer names + var newAndValidAdditionalOutputs = + additionalOutputs.Except(model.outputs).Intersect(model.layers.Select(l => l.name)); + + var newModel = model.ShallowCopy(); + newModel.outputs.AddRange(newAndValidAdditionalOutputs); + model = newModel; + } + + if (trimModel) + { + var newModel = model.ShallowCopy(); + var upstream = ModelAnalyzer.FindUpstreamLayers(model, newModel.outputs.ToArray()); + foreach (var l in model.layers) + if (!upstream.Contains(l)) + newModel.layers.Remove(l); + + model = newModel; + } + + return model; + } + + public static Model ValidateModel(Model model) + { + // validate, all model outputs are unique + // https://stackoverflow.com/questions/18547354/c-sharp-linq-find-duplicates-in-list + var duplicateOutputs = model.outputs.GroupBy(x => x) + .Where(g => g.Count() > 1) + .Select(y => y.Key); + foreach (var o in duplicateOutputs) + D.LogWarning($"Output is specified more than once in the model: {o}"); + + // validate, model contains no unconnected layers + var unconnectedOutputs = ModelAnalyzer.FindUnconnectedOutputs(model, model.outputs); + foreach (var o in unconnectedOutputs) + D.LogWarning($"Layer is specified as output, but is missing in the model: {o}"); + + return model; + } +} + + +} // namespace Barracuda diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaBackendsFactory.cs.meta b/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaBackendsFactory.cs.meta new file mode 100644 index 0000000..7a045f5 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaBackendsFactory.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: 355dc370391814b1c874848bb843b91c +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaCompute.cs b/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaCompute.cs new file mode 100644 index 0000000..81d1da5 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaCompute.cs @@ -0,0 +1,1073 @@ +using UnityEngine; +using UnityEngine.Assertions; +using UnityEngine.Profiling; +using System; +using System.Linq; +using 
System.Collections.Generic; + +/* +PERFORMANCE COMPARISON after the latest OPTIMIZATION pass +default @ be623ff20d72 VS compute-optimizations2 @ 13946c6c7e50 + +NOTES: +1) 33% in 1 batch cases and over 100% for 16 batch cases in most models +2) Most models saw boost with large batches due to "unrolling" of images over N,W,H dimensions in optimized Convolution kernel +3) INCEPTION saw large performance boost due to introduction of Convolution kernel that efficiently supports arbitrary input/output channel counts + +------------------------------------------------------------- +BASELINE: default @ be623ff20d72 +log comment: “Added Conv2d_L1Cache32 variant, removed extra check in the kernel, restored performance on older Radeons + Intel” + +VGG +@1 Exec #50: 95.2 ms, cpu: 1.0 ms, avg: 64.8 ms, result:OK +@16 Exec #8: 1108.1 ms, cpu: 1.2 ms, avg: 1112.6 ms, result:OK + +MOBILENET +@1 Exec #100: 37.9 ms, cpu: 7.9 ms, avg: 22.5 ms, result:OK +@16 Exec #32: 213.0 ms, cpu: 9.3 ms, avg: 216.3 ms, result:OK + +RES +@1 Exec #50: 42.4 ms, cpu: 7.0 ms, avg: 43.2 ms, result:OK +@16 Exec #15: 654.8 ms, cpu: 16.0 ms, avg: 682.6 ms, result:OK + +INCEPTION +@1 Exec #32: 86.8 ms, cpu: 21.8 ms, avg: 92.6 ms, result:OK +@16 Exec #8: 1344.2 ms, cpu: 26.4 ms, avg: 1349.7 ms, result:OK + + +PIX2PIX +@1 Exec #15: 279.0 ms, cpu: 2.5 ms, avg: 239.6 ms, result:OK +PIX2PIX_T +@1 Exec #32: 114.3 ms, cpu: 2.3 ms, avg: 117.2 ms, result:OK + + +------------------------------------------------------------- +OPTIMIZED: compute-optimizations2 @ 13946c6c7e50 +log comment: “Optimizations: added path that support arbitrary number of input and ouptut channels in Convolutions (toggled via STRICT_CHANNELS)” + +VGG +@1 Exec #50: 45.8 ms, cpu: 1.0 ms, avg: 46.5 ms, result:OK 39% +@16 Exec #16: 529.1 ms, cpu: 1.1 ms, avg: 539.6 ms, result:OK 106% + +MOBILENET +@1 Exec #100: 28.6 ms, cpu: 6.7 ms, avg: 16.8 ms, result:OK 33% +@16 Exec #48: 138.2 ms, cpu: 9.4 ms, avg: 116.4 ms, result:OK 85% + +RES +@1 Exec #50: 32.7 ms, cpu: 6.6 ms, avg: 33.6 ms, result:OK 28% +@16 Exec #31: 312.2 ms, cpu: 8.3 ms, avg: 319.4 ms, result:OK 113% + +INCEPTION +@1 Exec #50: 48.0 ms, cpu: 21.9 ms, avg: 55.2 ms, result:OK 67% +@16 Exec #32: 188.7 ms, cpu: 25.7 ms, avg: 198.4 ms, result:OK 580% + +PIX2PIX +@1 Exec #32: 152.2 ms, cpu: 2.6 ms, avg: 154.6 ms, result:OK 55% +PIX2PIX_T +@1 Exec #32: 123.1 ms, cpu: 2.4 ms, avg: 107.1 ms, result:OK 9.4% + + +*/ + +namespace Barracuda { + +public sealed class ComputeKernelLibrary +{ + static public int IDivC(int v, int div) + { + return (v + div - 1) / div; + } + + static public Entry[] Dense(TensorShape X, TensorShape W, TensorShape O, int type) + { + var h = O.flatHeight; + var w = O.flatWidth; + + return new[] { + new[] { // float16 + new Entry("DenseFP16Div2", + Int3(w / 2, h), BigO(X.flatWidth) + // @TODO: w % 2 == 0 + ), + }, + new[] { // float32 + new Entry("Dense_T8x8_R8x8", + Int3(w / 8, h / 8), BigO(X.flatWidth)/8, + StrictAnd(w % 64 == 0 && h % 64 == 0 && X.flatWidth % 64 == 0) + ), + new Entry("Dense_T16x16_R4x4", + Int3(w / 4, h / 4), BigO(X.flatWidth)/4, + StrictAnd(w % 64 == 0 && h % 64 == 0 && X.flatWidth % 64 == 0) + ), + new Entry("Dense_T8x8_R4x4", + Int3(w / 4, h / 4), BigO(X.flatWidth)/4, + StrictAnd(w % 32 == 0 && h % 32 == 0 && X.flatWidth % 32 == 0) + ), + // old + new Entry("DenseTiled64x64", + Int3(w / 4, h / 4), BigO(X.flatWidth)*1.33f/4, + StrictAnd(w % 4 == 0 && h % 4 == 0 + && X.flatWidth % 64 == 0 && ComputeInfo.supportsDense64x64) + ), + new Entry("DenseTiled32x32", + Int3(w / 2, h / 
2), BigO(X.flatWidth)*1.33f/2, + StrictAnd(w % 2 == 0 && h % 2 == 0 + && X.flatWidth % 32 == 0 && ComputeInfo.supportsDense32x32) + ), + new Entry("DenseTiled16x16", + Int3(w, h), BigO(X.flatWidth)*1.33f, + StrictAnd(X.flatWidth % 16 == 0) + // @TODO: relax Strict constraint, only And part should be necessary due to mask + ), + new Entry("Dense_L1Cached64", + Int3(w, h), BigO(X.flatWidth) + ), + + }, + } [type]; + } + + static public Entry[] Conv2D(TensorShape X, TensorShape K, TensorShape O, int[] stride, int[] pad) + { + var n = O.batch; + var h = O.height; + var w = O.width; + var k = K.kernelCount; + var c = X.channels; + + return new[] { + + new Entry("Conv2DKernel1x1_StrictC16K64_T16x16_R4x4", + Int3(IDivC(k, 4), IDivC(n*w*h, 4)), BigO(X.channels) * 0.8f / 4, + K.kernelWidth == 1 && K.kernelHeight == 1 && + stride[0] == 1 && stride[1] == 1 && + k % 64 == 0 && X.channels % 16 == 0 && + ComputeInfo.supportsComputeSharedMemory + ), + new Entry("Conv2DKernelKxK_StrictC16K64_T16x16_R4x4", + Int3(IDivC(k, 4), IDivC(n*w*h, 4)), BigO(X.channels) * 0.9f / 4, + k % 64 == 0 && X.channels % 16 == 0 && ComputeInfo.supportsComputeSharedMemory + ), + new Entry("Conv2DKernelKxK_T16x16_R4x4", + Int3(IDivC(k, 4), IDivC(n*w*h, 4)), BigO(X.channels) * 1.0f / 4, + k >= 16 && c >= 16 && ComputeInfo.supportsComputeSharedMemory + ), +// new Entry("Conv2DKernelKxK_T16x16_R4x4", +// Int3(IDivC(k, 4), IDivC(n*w*h, 4)), BigO(X.channels) * 1.1f / 4 +// ), + // old + new Entry("Conv2D_L1Cached64_RegisterBlock4x4", + Int3(K.kernelCount, w/4+1, h/4+1), BigO(O.batch * X.channels) * 1.1f / 4, + K.kernelCount % 64 == 0 && X.channels % 64 == 0 && ComputeInfo.supportsComputeSharedMemory + ), + new Entry("Conv2D_L1Cached32_RegisterBlock4x4", + Int3(K.kernelCount, w/4+1, h/4+1), BigO(O.batch * X.channels) / 3, + K.kernelCount % 32 == 0 && X.channels % 32 == 0 && ComputeInfo.supportsComputeSharedMemory + ), + new Entry("Conv2D_RegisterBlock4x2", + Int3(K.kernelCount, w/4, h/2), BigO(O.batch * X.channels) / 2, + StrictAnd( + w % 4 == 0 && h % 2 == 0) + ), + new Entry("Conv2D", + Int3(k, w, h), BigO(O.batch * X.channels) + ), + }; + } + + static public Entry[] DepthwiseConv2D(TensorShape X, TensorShape K, TensorShape O) + { + var h = O.height; + var w = O.width; + + return new[] { + + new Entry("DepthwiseConv2D", + Int3(K.kernelCount, w, h), BigO(O.batch * X.channels) + ), + }; + } + + static public Entry[] Conv2DTrans(TensorShape X, TensorShape K, TensorShape O) + { + return new[] { + new Entry("Conv2DTrans_KernelCached_K5x5_T16x16", + dispatch_: Int3(K.kernelCount, O.width, O.height), bigO_: BigO(O.batch * O.channels * X.channels) / 3, + valid_: (X.channels <= 256 && K.kernelHeight <= 5 && K.kernelWidth <= 5) + ), + new Entry("Conv2DTrans", + dispatch_: Int3(K.kernelCount, O.width, O.height), bigO_: BigO(O.batch * O.channels * X.channels) + ), + }; + } + + static public Entry[] Activation(TensorShape X, TensorShape O, string kernelName) + { + return + new[] { + new Entry(kernelName + "_FlatStrict", + dispatch_: Int3(O.length/2), + bigO_: 0.8f* BigO(1), + strictDims: StrictAnd(O.length % 128 == 0) + ), + new Entry(kernelName + "_Flat", + dispatch_: Int3(O.length), + bigO_: BigO(1) + ), + new Entry(kernelName + "_Loop", + dispatch_: Int3(O.length), + bigO_: BigO(2), + loopStride_: 256 + ) + }; + } + + static public Entry[] PRelu(TensorShape X, TensorShape O) + { + return new[] { + new Entry("PRelu_CNyx2", + Int3(O.channels, O.batch * O.height * O.width) + ), + new Entry("PRelu_Flat", + Int3(O.length) + ), + new 
Entry("PRelu_Loop", + Int3(O.length), BigO(2), 256 + ) + }; + } + + static public Entry[] Softmax(TensorShape X, TensorShape O) + { + return new[] { + new Entry("Softmax", + Int3(O.flatWidth, O.flatHeight) + ), + }; + } + + static public Entry[] LogSoftmax(TensorShape X, TensorShape O) + { + return new[] { + new Entry("LogSoftmax", + Int3(O.flatWidth, O.flatHeight) + ), + }; + } + + static public Entry[] ScaleBias(TensorShape X, TensorShape O) + { + return new[] { + new Entry("ScaleBias_CNyx2", + Int3(O.channels, O.batch * O.height * O.width) + ), + new Entry("ScaleBias_Flat", + Int3(O.length) + ), + new Entry("ScaleBias_Loop", + Int3(O.length), BigO(2), 256 + ) + }; + } + + static public Entry[] Upsample2D(TensorShape X, TensorShape O) + { + return new[] { + // NOTE: dispatched over X (not O) + new Entry("Upsample2D", + Int3(X.channels, X.width, X.height), BigO(X.batch) + ), + }; + } + + static public Entry[] Pool2D(TensorShape X, TensorShape O, string kernelName) + { + return new[] { + //new Entry(kernelName + "_16x4x4", + // Int3(O.channels, O.width, O.height), BigO(O.batch) + //), + new Entry(kernelName, + Int3(O.channels, O.width, O.height), BigO(O.batch) + ), + }; + } + + static public Entry[] GlobalPool2D(TensorShape X, TensorShape O, string kernelName) + { + return new[] { + new Entry(kernelName, + Int3(O.channels), BigO(O.batch) + ), + }; + } + + static public Entry[] Normalization(TensorShape X, TensorShape O) + { + return new[] { + new Entry("InstanceNorm", + Int3(O.channels), BigO(O.batch * O.width * O.height) + ), + }; + } + static public Entry[] NormalizationTail(TensorShape X, TensorShape O) + { + return new[] { + new Entry("InstanceNormTail_CNyx2", + Int3(O.channels, O.batch * O.height * O.width) + ), + new Entry("InstanceNormTail_Flat", + Int3(O.length) + ), + new Entry("InstanceNormTail_Loop", + Int3(O.length), BigO(2), 256 + ) + }; + } + + static public Entry[] Copy(TensorShape X, TensorShape O) + { + return new[] { + // NOTE: dispatched over X (not O) + new Entry("Copy", + Int3(X.channels, X.width, X.height), BigO(O.batch) + ), + }; + } + + static public Entry[] Padding(TensorShape X, TensorShape O, string kernelName) + { + return new[] { + new Entry(kernelName, + Int3(O.channels, O.width, O.height), BigO(O.batch) + ), + }; + } + + static public Entry[] Broadcast(TensorShape X, TensorShape O, string kernelName) + { + return new[] { + new Entry(kernelName, + Int3(O.channels, O.width, O.height), BigO(O.batch) + ), + }; + } + + static int[] Int3(int x, int y = 1, int z = 1) { return new[] { x, y, z }; } + static float BigO(int o) { return (float)o; } + public struct StrictDimensions { public bool valid; } + static StrictDimensions StrictAnd(bool valid_) { return new StrictDimensions { valid = valid_ }; } + static StrictDimensions Strict() { return new StrictDimensions { valid = true }; } + + public struct Entry + { + public readonly string name; + public readonly int[] dispatch; + public readonly float bigO; + public readonly bool valid; + public readonly bool strict; + public readonly uint loopStride; // > 0 indicates looping kernel + + public Entry(string name_, int[] dispatch_, float bigO_ = 1.0f, bool valid_ = true) + { + name = name_; + dispatch = dispatch_; + bigO = bigO_; + valid = valid_; + strict = false; + loopStride = 0; + } + + public Entry(string name_, int[] dispatch_, float bigO_, uint loopStride_) : + this(name_, dispatch_, bigO_) + { + loopStride = loopStride_; + } + + public Entry(string name_, int[] dispatch_, float bigO_, StrictDimensions strictDims) : 
+ this(name_, dispatch_, bigO_, strictDims.valid) + { + strict = true; + } + } +} + +public struct ComputeKernel +{ + readonly public ComputeFunc func; + readonly public int[] dispatch; + public ComputeShader shader { get { return func.shader; } } + + public ComputeKernel(ComputeFunc func_, int[] dispatch_) + { + func = func_; + dispatch = dispatch_; + } + + public void SetTensor(string name, TensorShape shape, ComputeBuffer buffer, Int64 dataOffset = 0) + { + func.SetTensor(name, shape, buffer, dataOffset); + } + public void SetTensor(ComputeFunc.TensorDecl tensorDecl, int dataPropId, TensorShape shape, ComputeBuffer buffer, Int64 dataOffset = 0) + { + func.SetTensor(tensorDecl, dataPropId, shape, buffer, dataOffset); + } + + public void SetTensorDecl(string name, TensorShape shape, Int64 dataOffset) + { + func.SetTensorDecl(name, shape, dataOffset); + } + public void SetTensorDecl(ComputeFunc.TensorDecl tensorDecl, TensorShape shape, Int64 dataOffset) + { + func.SetTensorDecl(tensorDecl, shape, dataOffset); + } + + public void SetTensorBuffer(string name, ComputeBuffer buffer) + { + func.SetTensorBuffer(name, buffer); + } + public void SetTensorBuffer(int propId, ComputeBuffer buffer) + { + func.SetTensorBuffer(propId, buffer); + } + + public void Dispatch() + { + func.Dispatch(dispatch); + } + + internal static long CalculateEntryScore(ComputeShader[] kernels, ComputeKernelLibrary.Entry entry, bool verbose) + { + const long InvalidEntry = long.MaxValue; + long work = InvalidEntry; + try + { + if (!entry.valid) + return InvalidEntry; + + // @TODO: @OPTIMIZE: cache threadGroupSize instead of creating ComputeFunc and querying every time + var fn = new ComputeFunc(kernels, entry.name); + + if (fn.threadGroupSizeX * fn.threadGroupSizeY * fn.threadGroupSizeZ > ComputeInfo.maxComputeWorkGroupSize) + return InvalidEntry; + + if (entry.strict) + { + if (entry.dispatch[0] % fn.threadGroupSizeX != 0 || + entry.dispatch[1] % fn.threadGroupSizeY != 0 || + entry.dispatch[2] % fn.threadGroupSizeZ != 0) + return InvalidEntry; + } + + var x = (long) ComputeFunc.IntDivCeil(entry.dispatch[0], (int) fn.threadGroupSizeX); + var y = (long) ComputeFunc.IntDivCeil(entry.dispatch[1], (int) fn.threadGroupSizeY); + var z = (long) ComputeFunc.IntDivCeil(entry.dispatch[2], (int) fn.threadGroupSizeZ); + + if (entry.loopStride == 0 && (x > 65535 || y > 65535 || z > 65535)) + { + if (verbose) + D.LogWarning($"Kernel {entry.name} dispatch arguments out of range (any [{x},{y},{z}] > 65535), skipping.."); + + return InvalidEntry; + } + + work = x * y * z; + + work *= (int) fn.threadGroupSize; + work = (long) (entry.bigO * work); + } + catch (ArgumentException) + { + if (verbose) + D.LogWarning($"Kernel processing failed, skipping {entry.name}"); + } + return work; + } + + public static ComputeKernel BestKernel(ComputeShader[] kernels, ComputeKernelLibrary.Entry[] entrees, bool verbose) + { + var bestEntry = entrees[0]; + var bestScore = long.MaxValue; + for (int i = 0; i < entrees.Length; i++) + { + var score = CalculateEntryScore(kernels, entrees[i], verbose); + if (score < bestScore) + { + bestEntry = entrees[i]; + bestScore = score; + } + } + + if (verbose) + D.Log(bestEntry.name); + + var func = new ComputeFunc(kernels, bestEntry.name); + + if (bestEntry.loopStride > 0) + { + int preferedDispatch = (int)bestEntry.loopStride * (int)func.threadGroupSizeX; + var kernel = new ComputeKernel(func, new int[] {preferedDispatch, 1, 1}); + kernel.shader.SetInt("_LoopStride", preferedDispatch); + return kernel; + } + else + { + 
return new ComputeKernel(func, bestEntry.dispatch); + } + } + +} + +public class ComputeOps : ReferenceComputeOps +{ + // --------------------------------------------------------------------------------- + private bool printKernels = false; + + // --------------------------------------------------------------------------------- + private ComputeShader[] m_Kernels; + private bool m_Verbose = false; + + public ComputeOps(ComputeShader[] kernels, ComputeShader referenceKernel, ITensorAllocator allocator = null, bool verbose = false) + : base(referenceKernel, allocator) + { + m_Kernels = kernels; + m_Verbose = verbose; + } + + // --------------------------------------------------------------------------------- + + protected ComputeKernel BestKernel(ComputeKernelLibrary.Entry[] entrees) + { + return ComputeKernel.BestKernel(m_Kernels, entrees, m_Verbose); + } + + // --------------------------------------------------------------------------------- + public override Tensor Dense(Tensor X, Tensor W, Tensor B) + { + Assert.IsTrue(W.dimensions <= 2); + Assert.AreEqual(B.flatWidth, B.length); + Assert.AreEqual(X.flatWidth, W.flatHeight); + + var O = NewTensor(X.flatHeight, W.flatWidth); + + var itemSize = 4; // @TODO: itemSizeInBytes == 2 | float16 + var fn = BestKernel(ComputeKernelLibrary.Dense(X.shape, W.shape, O.shape, itemSize >> 2)); + + if (printKernels) + Debug.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} * {W.shape}" ); + + fn.SetTensor("X", X.shape, Pin(X).buffer); + fn.SetTensor("O", O.shape, Pin(O).buffer); + fn.SetTensorDecl("W", W.shape, Pin(W).offset); + fn.SetTensorDecl("B", B.shape, Pin(B).offset); + Assert.AreEqual(Pin(W).buffer, Pin(B).buffer); + fn.SetTensorBuffer("WBK", Pin(W).buffer); + + fn.Dispatch(); + return O; + } + + public override Tensor Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad) + { + Assert.AreEqual(X.channels, K.kernelDepth); + Assert.AreEqual(K.kernelCount, B.flatWidth); + Assert.AreEqual(B.flatWidth, B.length); + Assert.AreEqual(stride.Length, 2); + Assert.AreEqual(pad.Length, 4); + + var O = NewTensor(X.shape.ApplyKernel(K.shape, stride, pad)); + var fn = BestKernel(ComputeKernelLibrary.Conv2D(X.shape, K.shape, O.shape, stride, pad)); + + if (printKernels) + Debug.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} # {K.shape} stride: {stride[0]},{stride[1]} pad:{pad[0]},{pad[1]}" ); + + fn.SetTensor("X", X.shape, Pin(X).buffer); + fn.SetTensor("O", O.shape, Pin(O).buffer); + fn.SetTensorDecl("K", K.shape, Pin(K).offset); + fn.SetTensorDecl("B", B.shape, Pin(B).offset); + Assert.AreEqual(Pin(K).buffer, Pin(B).buffer); + fn.SetTensorBuffer("WBK", Pin(K).buffer); + + fn.shader.SetInts("_Pad", pad); + fn.shader.SetInts("_Stride", stride); + + fn.Dispatch(); + return O; + } + + public override Tensor DepthwiseConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad) + { + if (K.kernelDepth != 1) + return base.DepthwiseConv2D(X, K, B, stride, pad); + + Assert.AreEqual(K.kernelDepth, 1); + Assert.AreEqual(K.kernelCount, X.channels); + Assert.AreEqual(K.kernelCount, B.flatWidth); + Assert.AreEqual(B.flatWidth, B.length); + Assert.AreEqual(stride.Length, 2); + Assert.AreEqual(pad.Length, 4); + + var O = NewTensor(X.shape.ApplyKernel(K.shape, stride, pad)); + var fn = BestKernel(ComputeKernelLibrary.DepthwiseConv2D(X.shape, K.shape, O.shape)); + + if (printKernels) + Debug.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} ∆ {K.shape} stride: {stride[0]},{stride[1]} pad:{pad[0]},{pad[1]}" ); + + fn.SetTensor("X", X.shape, Pin(X).buffer); + 
fn.SetTensor("O", O.shape, Pin(O).buffer); + fn.SetTensorDecl("K", K.shape, Pin(K).offset); + fn.SetTensorDecl("B", B.shape, Pin(B).offset); + Assert.AreEqual(Pin(K).buffer, Pin(B).buffer); + fn.SetTensorBuffer("WBK", Pin(K).buffer); + + fn.shader.SetInts("_Stride", stride); + fn.shader.SetInts("_Pad", pad); + + fn.Dispatch(); + return O; + } + + public override Tensor Conv2DTrans(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, int[] outputAdjustment) + { + Assert.AreEqual(X.channels, K.kernelDepth); + Assert.AreEqual(K.kernelCount, B.flatWidth); + Assert.AreEqual(B.flatWidth, B.length); + Assert.AreEqual(stride.Length, 2); + Assert.AreEqual(pad.Length, 4); + + var O = NewTensor(X.shape.ApplyKernelInverse(K.shape, stride, pad, outputAdjustment)); + var fn = BestKernel(ComputeKernelLibrary.Conv2DTrans(X.shape, K.shape, O.shape)); + + if (printKernels) + D.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} @ {K.shape} stride: {stride[0]},{stride[1]} pad:{pad[0]},{pad[1]}" ); + + pad = new int[] + { + K.kernelWidth - pad[0] - 1, K.kernelHeight - pad[1] - 1, + K.kernelWidth - pad[2] - 1, K.kernelHeight - pad[3] - 1 + }; + + fn.SetTensor("X", X.shape, Pin(X).buffer); + fn.SetTensor("O", O.shape, Pin(O).buffer); + fn.SetTensorDecl("K", K.shape, Pin(K).offset); + fn.SetTensorDecl("B", B.shape, Pin(B).offset); + Assert.AreEqual(Pin(K).buffer, Pin(B).buffer); + fn.SetTensorBuffer("WBK", Pin(K).buffer); + + fn.shader.SetInts("_Pad", pad); + fn.shader.SetInts("_Stride", stride); + + fn.Dispatch(); + + return O; + } + + public override Tensor Upsample2D(Tensor X, int[] size) + { + Assert.AreEqual(size.Length, 2); + + var O = NewTensor(X.batch, X.height*size[1], X.width*size[0], X.channels); + var fn = BestKernel(ComputeKernelLibrary.Upsample2D(X.shape, O.shape)); + + if (printKernels) + D.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} ^ size: {size[0]},{size[1]}" ); + + fn.SetTensor("X", X.shape, Pin(X).buffer); + fn.SetTensor("O", O.shape, Pin(O).buffer); + + fn.shader.SetInts("_Pool", size); + + fn.Dispatch(); + return O; + } + + protected override Tensor Pool2D(string kernelName, Tensor X, int[] pool, int[] stride, int[] pad) + { + Assert.AreEqual(pool.Length, 2); + Assert.AreEqual(stride.Length, 2); + + if (pad[0] == 0 && pad[1] == 0 && pad[2] == 0 && pad[3] == 0) + kernelName += "_NoPads"; + + var O = NewTensor(X.shape.ApplyPool(pool, stride, pad)); + var fn = BestKernel(ComputeKernelLibrary.Pool2D(X.shape, O.shape, kernelName)); + + if (printKernels) + D.Log($"{fn.func.kernelName}: {O.shape} = {X.shape} ^ pool: {pool[0]},{pool[1]} stride: {stride[0]},{stride[1]} pad:{pad[0]},{pad[1]}" ); + + fn.SetTensor("X", X.shape, Pin(X).buffer); + fn.SetTensor("O", O.shape, Pin(O).buffer); + + fn.shader.SetInts("_Pool", pool); + fn.shader.SetInts("_Stride", stride); + fn.shader.SetInts("_Pad", pad); + + fn.Dispatch(); + return O; + } + + public override Tensor GlobalMaxPool2D(Tensor X) + { + return GlobalPool2D("MaxPool2D", "GlobalMaxPool2D", X); + } + + public override Tensor GlobalAvgPool2D(Tensor X) + { + return GlobalPool2D("AvgPool2D", "GlobalAvgPool2D", X); + } + + public override Tensor GlobalAvgVariancePool2D(Tensor X) + { + var O = NewTensor(X.batch, 2, 1, X.channels); + var fn = BestKernel(ComputeKernelLibrary.GlobalPool2D(X.shape, O.shape, "GlobalAvgVariancePool2D")); + + fn.SetTensor("X", X.shape, Pin(X).buffer); + fn.SetTensor("O", O.shape, Pin(O).buffer); + + fn.Dispatch(); + return O; + } + + protected virtual Tensor GlobalPool2D(string smallKernelName, string globalKernelName, 
Tensor X) + { + // downsample with pyramid approach + while (X.height * X.width >= 256) + { + var pool = new [] {4, 4}; + var stride = pool; + var noPad = new[] {0, 0, 0, 0}; + + var lastLength = X.length; + X = Pool2D(smallKernelName, X, pool, stride, noPad); + Assert.IsTrue(X.length < lastLength); + } + + var O = NewTensor(X.batch, 1, 1, X.channels); + var fn = BestKernel(ComputeKernelLibrary.GlobalPool2D(X.shape, O.shape, globalKernelName)); + + fn.SetTensor("X", X.shape, Pin(X).buffer); + fn.SetTensor("O", O.shape, Pin(O).buffer); + + fn.Dispatch(); + return O; + } + + public override Tensor ScaleBias(Tensor X, Tensor S, Tensor B) + { + Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels); + Assert.AreEqual(B.length, B.channels); Assert.AreEqual(S.length, S.channels); + + var O = NewTensor(X.shape); + var fn = BestKernel(ComputeKernelLibrary.ScaleBias(X.shape, O.shape)); + + if (printKernels) + D.Log(fn.func.kernelName); + + fn.SetTensor("X", X.shape, Pin(X).buffer); + fn.SetTensor("O", O.shape, Pin(O).buffer); + fn.SetTensorDecl("W", S.shape, Pin(S).offset); + fn.SetTensorDecl("B", B.shape, Pin(B).offset); + Assert.AreEqual(Pin(S).buffer, Pin(B).buffer); + fn.SetTensorBuffer("WBK", Pin(S).buffer); + + fn.Dispatch(); + return O; + } + + public override Tensor Normalization(Tensor X, Tensor S, Tensor B, int pool, int axis, float epsilon) + { + if (axis != 3 && axis != -1) + throw new NotImplementedException(); + + if (pool <= 0) + pool = X.batch; + + if (pool > 1) + throw new NotImplementedException(); // @TODO: support other types of Normalization at test time + // Currently supported only pool=1 (InstanceNormalization) + + var meanVariance = GlobalAvgVariancePool2D(X); + + var O = NewTensor(X.shape); + var fn = BestKernel(ComputeKernelLibrary.NormalizationTail(X.shape, O.shape)); + fn.SetTensor("X", X.shape, Pin(X).buffer); + fn.SetTensor("O", O.shape, Pin(O).buffer); + fn.SetTensor("W", meanVariance.shape, Pin(meanVariance).buffer); + + fn.shader.SetFloat("_Epsilon", epsilon); + + fn.Dispatch(); + + return ScaleBias(O, S, B); + } + + protected override Tensor Activation(string kernelName, Tensor X, float alpha = 0f, float beta = 0f) + { + var O = NewTensor(X.shape); + var fn = BestKernel(ComputeKernelLibrary.Activation(X.shape, O.shape, kernelName)); + + if (printKernels) + D.Log(fn.func.kernelName); + + fn.SetTensor("X", X.shape, Pin(X).buffer); + fn.SetTensor("O", O.shape, Pin(O).buffer); + + fn.shader.SetFloat("_Alpha", alpha); + fn.shader.SetFloat("_Beta", beta); + + fn.Dispatch(); + return O; + } + + public override Tensor PRelu(Tensor X, Tensor S) + { + Assert.IsTrue((X.flatWidth == S.flatWidth) || (S.flatWidth == 1)); + + var O = NewTensor(X.shape); + var fn = BestKernel(ComputeKernelLibrary.PRelu(X.shape, O.shape)); + + if (printKernels) + D.Log(fn.func.kernelName); + + fn.SetTensor("X", X.shape, Pin(X).buffer); + fn.SetTensor("O", O.shape, Pin(O).buffer); + fn.SetTensor("W", S.shape, Pin(S).buffer); + + fn.Dispatch(); + return O; + } + + public override Tensor Softmax(Tensor X) + { + var O = NewTensor(X.shape); + var fn = BestKernel(ComputeKernelLibrary.Softmax(X.shape, O.shape)); + + fn.SetTensor("X", X.shape, Pin(X).buffer); + fn.SetTensor("O", O.shape, Pin(O).buffer); + + fn.Dispatch(); + return O; + } + + public override Tensor LogSoftmax(Tensor X) + { + var O = NewTensor(X.shape); + var fn = BestKernel(ComputeKernelLibrary.LogSoftmax(X.shape, O.shape)); + + fn.SetTensor("X", X.shape, Pin(X).buffer); + fn.SetTensor("O", O.shape, 
Pin(O).buffer); + + fn.Dispatch(); + return O; + } + + // @TODO: implement Dropout in terms of RandomUniform by preparing random values on CPU upfront and multiplying result on GPU later on + // public override Tensor Dropout(Tensor X, float alpha) + + private UnityEngine.Random.State[] m_RandomNormalSeed; + public override Tensor RandomNormal(TensorShape s, float mean, float scale, int seed) + { + var O = NewTensor(s); + + using (var seedOverride = new Seed(ref m_RandomNormalSeed, seed)) + { + var end = O.length; + for (int i = 0; i < end; ++i) + O[i] = Gaussian(mean, scale); + } + + return O; + } + + private UnityEngine.Random.State[] m_RandomUniformSeed; + public override Tensor RandomUniform(TensorShape s, float mean, float scale, int seed) + { + var O = NewTensor(s); + + using (var seedOverride = new Seed(ref m_RandomUniformSeed, seed)) + { + var end = O.length; + for (int i = 0; i < end; ++i) + O[i] = mean + scale * UnityEngine.Random.value; + } + + return O; + } + + + public override Tensor Concat(Tensor[] tensors, int axis) + { + var O = NewTensor(TensorExtensions.Concat(tensors.Select(t => t.shape).ToArray(), axis)); + + var offsets = new int[] { 0,0,0,0 }; + axis = O.shape.Axis(axis); + + foreach (var X in tensors) + { + var fn = BestKernel(ComputeKernelLibrary.Copy(X.shape, O.shape)); + + fn.SetTensor("X", X.shape, Pin(X).buffer); + fn.SetTensor("O", O.shape, Pin(O).buffer); + + fn.shader.SetInts("_Pad", offsets); + + fn.Dispatch(); + + offsets[axis] += X.shape[axis]; + } + + return O; + } + + public override Tensor ElementwiseWithBroadcast(string kernelName, Tensor[] tensors) + { + Assert.IsTrue(tensors.Length > 0); + + Tensor outputTensor1 = NewTensor(TensorExtensions.MaxShape(tensors)); + Tensor outputTensor2 = null; + if (tensors.Length > 2) + outputTensor2 = NewTensor(TensorExtensions.MaxShape(tensors)); + + var X = tensors[0]; + var fn = BestKernel(ComputeKernelLibrary.Broadcast(X.shape, outputTensor1.shape, kernelName)); + + Tensor O = null; + for (int t = 1; t < tensors.Length; ++t) + { + var B = tensors[t]; + O = (t%2 == 1)?outputTensor1:outputTensor2; + fn.SetTensor("X", X.shape, Pin(X).buffer); + fn.SetTensor("O", O.shape, Pin(O).buffer); + fn.SetTensor("B", B.shape, Pin(B).buffer, Pin(B).offset); + + fn.Dispatch(); + + X = O; + } + + return O; + } + + protected override Tensor ApplyPadding(Tensor X, int[] pad, string kernelName, float constant = 0.0f) + { + Assert.AreEqual(pad.Length, 4); + + var O = NewTensor(X.shape.ApplyBorder(pad)); + var fn = BestKernel(ComputeKernelLibrary.Padding(X.shape, O.shape, kernelName)); + + fn.SetTensor("X", X.shape, Pin(X).buffer); + fn.SetTensor("O", O.shape, Pin(O).buffer); + + fn.shader.SetInts("_Pad", pad); + fn.shader.SetInts("_Stride", X.shape.ToArray()); + + if (kernelName == "Border2D") + { + // NOTE: negative "pad" variable will crop X tensor + int croppedWidth = X.width - Math.Max(0, -pad[2]); + int croppedHeight = X.height - Math.Max(0, -pad[3]); + var croppedSize = new int[] { 0, 0, 0, 0 }; + croppedSize[0] = croppedWidth; + croppedSize[1] = croppedHeight; + + fn.shader.SetInts("_Pool", croppedSize); + fn.shader.SetFloat("_Beta", constant); + } + + fn.Dispatch(); + return O; + } + + public override Tensor LogicalNot(Tensor X) + { + var O = NewTensor(X.shape); + var fn = BestKernel(ComputeKernelLibrary.Activation(X.shape, O.shape, "LogicalNot")); + + fn.SetTensor("X", X.shape, Pin(X).buffer); + fn.SetTensor("O", O.shape, Pin(O).buffer); + + fn.Dispatch(); + return O; + } +} + +public class ComputeVarsWithSharedModel : 
DefaultVars +{ + private Dictionary m_ModelBuffers = new Dictionary(); + private Dictionary m_OffsetsIntoModelWeights = new Dictionary(); + + public override void Dispose() + { + base.Dispose(); + + foreach (var key in m_ModelBuffers.Keys) + m_ModelBuffers[key].Dispose(); + m_ModelBuffers.Clear(); + m_OffsetsIntoModelWeights.Clear(); + } + + protected override Tensor[] PrepareLayerInputTensors(Model model, Layer layer, IOps ops) + { + var tensorIndex = 0; + var tensors = new Tensor[layer.inputs.Length + layer.datasets.Length]; + + foreach (var name in layer.inputs) + { + var tensor = new Tensor(1, 1, 1, 1, m_StringCache.Lookup(layer.name, "_dummy_in", tensorIndex)); + tensors[tensorIndex++] = tensor; + } + + Int64 offsetIntoModelWeights = m_OffsetsIntoModelWeights.ContainsKey(layer.name) ? + m_OffsetsIntoModelWeights[layer.name]: 0; + ComputeBuffer buffer = m_ModelBuffers.ContainsKey(layer.name) ? m_ModelBuffers[layer.name] : null; + + if (buffer == null) + { + buffer = CreateComputeBufferForModelTensors(layer, out offsetIntoModelWeights); + if (buffer != null) + { + m_ModelBuffers[layer.name] = buffer; + m_OffsetsIntoModelWeights[layer.name] = offsetIntoModelWeights; + } + } + + foreach (var arg in layer.datasets) + { + Assert.IsNotNull(buffer); + var tensor = new Tensor(arg.shape, + new SharedComputeTensorData(buffer, arg.shape, (int)(arg.offset - offsetIntoModelWeights)), + m_StringCache.Lookup(layer.name, "_arg", tensorIndex)); + tensors[tensorIndex++] = tensor; + m_ModelTensors.Add(tensor); + } + + Assert.AreEqual(tensorIndex, tensors.Length); + return tensors; + } + + protected ComputeBuffer CreateComputeBufferForModelTensors(Layer layer, out Int64 offsetIntoModelWeights) + { + Int64 minOffset = layer.weights.LongLength; + Int64 maxOffset = 0; + foreach (var t in layer.datasets) + { + minOffset = Math.Min(minOffset, t.offset); + maxOffset = Math.Max(maxOffset, t.offset + t.length); + } + var length = Convert.ToInt32(maxOffset - minOffset); + if (length <= 0) + { + offsetIntoModelWeights = 0; + return null; + } + + var buffer = new ComputeBuffer(length, sizeof(float)); + // @WARN: looks like Unity ComputeBuffer.SetData API take "computeBufferStartIndex" and "length" arguments in floats, instead of buffer element size aka stride + // as would be expected per API documentation + // @TODO: bugreport documentation discrepancy! 
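// NOTE (illustrative sketch, not from the original sources): how the float-element convention
// described in the @WARN above plays out when slicing a sub-range of model weights into a
// ComputeBuffer. The names `modelWeights`, `sliceOffset` and `sliceLength` are hypothetical
// stand-ins for the layer.weights / minOffset / length values computed in this method.
//
//   float[] modelWeights = layer.weights;                      // full weight blob for this layer
//   int sliceOffset = 128;                                     // first float element to upload
//   int sliceLength = 256;                                     // number of float elements
//   var gpuSlice = new ComputeBuffer(sliceLength, sizeof(float));
//   // ComputeBuffer.SetData(Array, managedBufferStartIndex, computeBufferStartIndex, count):
//   // per the @WARN above, the start indices and count behave as float-element counts here,
//   // not as byte offsets derived from the buffer stride.
//   gpuSlice.SetData(modelWeights, sliceOffset, 0, sliceLength);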
+ offsetIntoModelWeights = minOffset; + buffer.SetData(layer.weights, Convert.ToInt32(offsetIntoModelWeights), 0, length); + return buffer; + } +} + +} // namespace Barracuda diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaCompute.cs.meta b/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaCompute.cs.meta new file mode 100644 index 0000000..4dec977 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaCompute.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: badd0d6a0383049eab2cb58e1d0d6fa9 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaPrecompiledCompute.cs b/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaPrecompiledCompute.cs new file mode 100644 index 0000000..fd9ec91 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaPrecompiledCompute.cs @@ -0,0 +1,525 @@ +using UnityEngine; +using UnityEngine.Assertions; +using UnityEngine.Profiling; +using System; +using System.Linq; +using System.Collections.Generic; + + +namespace Barracuda { + + +public class PrecompiledComputeOps : ComputeOps, IModelCompiler +{ + public PrecompiledComputeOps(ComputeShader[] kernels, ComputeShader referenceKernel, ITensorAllocator allocator = null, bool verbose = false) + : base(kernels, referenceKernel, allocator, verbose) + { + } + + // --------------------------------------------------------------------------------- + + static public ComputeFunc.TensorDecl _DeclX = ComputeFunc.GetTensorDecl("X"); + static public ComputeFunc.TensorDecl _DeclO = ComputeFunc.GetTensorDecl("O"); + static public ComputeFunc.TensorDecl _DeclW = ComputeFunc.GetTensorDecl("W"); + static public ComputeFunc.TensorDecl _DeclK = ComputeFunc.GetTensorDecl("K"); + static public ComputeFunc.TensorDecl _DeclB = ComputeFunc.GetTensorDecl("B"); + static public int _DataX = ComputeFunc.GetTensorData("X"); + static public int _DataO = ComputeFunc.GetTensorData("O"); + static public int _DataW = ComputeFunc.GetTensorData("W"); + static public int _DataK = ComputeFunc.GetTensorData("K"); + static public int _DataB = ComputeFunc.GetTensorData("B"); + static public int _DataWBK = ComputeFunc.GetTensorData("WBK"); + static public int _Stride = Shader.PropertyToID("_Stride"); + static public int _Pad = Shader.PropertyToID("_Pad"); + static public int _Pool = Shader.PropertyToID("_Pool"); + static public int _Alpha = Shader.PropertyToID("_Alpha"); + static public int _Beta = Shader.PropertyToID("_Beta"); + + struct CompiledLayer + { + public ComputeKernel kernel; + public TensorShape shape; + } + + private int m_CachedModelHash; + private Dictionary m_CompiledLayers = new Dictionary(); + private CompiledLayer m_Compiled; + + protected int CalcModelWithInputsHashCode(Model model, IDictionary inputShapes) + { + var hash = model.GetHashCode(); + foreach (var entry in inputShapes) + { + hash = (hash * 7) + entry.Key.GetHashCode(); + hash = (hash * 7) + entry.Value.GetHashCode(); + } + return hash; + } + + public virtual void PrepareModel(Model model, IDictionary inputShapes) + { + var modelHash = CalcModelWithInputsHashCode(model, inputShapes); + if (modelHash == m_CachedModelHash) + return; + + m_CachedModelHash = modelHash; + m_CompiledLayers.Clear(); + + IDictionary shapesByName; + ModelAnalyzer.ListTemporaryTensorShapes(model, inputShapes, out shapesByName); + + foreach (var l in model.layers) + { + if 
(m_CompiledLayers.ContainsKey(l)) + continue; // already compiled + + if (l.inputs.Length == 0) + continue; // don't need to compile layers without inputs, so far all of them are CPU only + + var X = shapesByName[l.inputs[0]]; + var O = shapesByName[l.name]; + + ComputeKernel kernel = new ComputeKernel(); + if (l.type == Layer.Type.Dense) + { + var itemSize = 4; // @TODO: itemSizeInBytes == 2 | float16 + kernel = BestKernel( + ComputeKernelLibrary.Dense(X, l.datasets[0].shape, O, itemSize >> 2)); + } + else if ( + l.type == Layer.Type.Conv2D) + { + Assert.IsNotNull(l.stride); + Assert.IsNotNull(l.pad); + kernel = BestKernel( + ComputeKernelLibrary.Conv2D(X, l.datasets[0].shape, O, l.stride, l.pad)); + } + else if ( + l.type == Layer.Type.DepthwiseConv2D) + { + kernel = BestKernel( + ComputeKernelLibrary.DepthwiseConv2D(X, l.datasets[0].shape, O)); + } + else if ( + l.type == Layer.Type.Conv2DTrans) + { + kernel = BestKernel( + ComputeKernelLibrary.Conv2DTrans(X, l.datasets[0].shape, O)); + } + else if ( + l.type == Layer.Type.Upsample2D) + { + kernel = BestKernel( + ComputeKernelLibrary.Upsample2D(X, O)); + } + else if ( + l.type == Layer.Type.MaxPool2D || + l.type == Layer.Type.AvgPool2D) + { + var kernelName = l.type.ToString(); + + Assert.IsNotNull(l.pool); + Assert.IsNotNull(l.stride); + Assert.IsNotNull(l.pad); + var pad = X.AdjustPadToPool(l.pool, l.stride, l.pad); + if (pad[0] == 0 && pad[1] == 0 && pad[2] == 0 && pad[3] == 0) + kernelName += "_NoPads"; + + kernel = BestKernel( + ComputeKernelLibrary.Pool2D(X, O, kernelName)); + } + // @TODO: reimplement GlobalPools, currently require different kernels for each pyramid step + //else if ( + // l.type == Layer.Type.GlobalMaxPool2D || + // l.type == Layer.Type.GlobalAvgPool2D) + //{ + // var kernelName = l.type.ToString(); + // kernel = BestKernel( + // ComputeKernelLibrary.GlobalPool2D(X, O, kernelName)); + //} + else if ( + l.type == Layer.Type.ScaleBias) + { + kernel = BestKernel( + ComputeKernelLibrary.ScaleBias(X, O)); + } + // @TODO: reimplement Normalization, which became a multi-kernel operation after optimizations + //else if ( + // l.type == Layer.Type.Normalization) + //{ + // kernel = BestKernel( + // ComputeKernelLibrary.Normalization(X, O)); + //} + else if ( + l.type == Layer.Type.Add || + l.type == Layer.Type.Sub || + l.type == Layer.Type.Mul || + l.type == Layer.Type.Div || + l.type == Layer.Type.Pow || + l.type == Layer.Type.Min || + l.type == Layer.Type.Max + // || l.type == Layer.Type.Mean @TODO: implement BroadcastMean + ) + { + var kernelName = "Broadcast" + l.type; + kernel = BestKernel( + ComputeKernelLibrary.Broadcast(X, O, kernelName)); + } + // @TODO: implement Concat, currently might require different kernel for each tensor + //else if ( + // l.type == Layer.Type.Concat) {} + // Activations + else if (l.type == Layer.Type.Activation) + { + if (l.activation == Layer.Activation.Softmax) + { + kernel = BestKernel( + ComputeKernelLibrary.Softmax(X, O)); + } else if (l.activation == Layer.Activation.LogSoftmax) + { + kernel = BestKernel( + ComputeKernelLibrary.LogSoftmax(X, O)); + } + else if (l.activation == Layer.Activation.PRelu) + { + kernel = BestKernel( + ComputeKernelLibrary.PRelu(X, O)); + } + else if (l.activation != Layer.Activation.None) + { + var kernelName = l.activation.ToString(); + kernel = BestKernel( + ComputeKernelLibrary.Activation(X, O, kernelName)); + } + } + + m_CompiledLayers.Add(l, new CompiledLayer { kernel = kernel, shape = O }); + } + } + + public virtual void PreExecuteLayer(Layer 
layer, Tensor[] inputs) + { + m_Compiled = new CompiledLayer(); + m_CompiledLayers.TryGetValue(layer, out m_Compiled); + } + + // --------------------------------------------------------------------------------- + + public override Tensor Dense(Tensor X, Tensor W, Tensor B) + { + if (m_Compiled.kernel.shader == null) + return base.Dense(X, W, B); + + Assert.IsTrue(W.dimensions <= 2); + Assert.AreEqual(B.flatWidth, B.length); + Assert.AreEqual(X.flatWidth, W.flatHeight); + + Assert.IsNotNull(m_Compiled.kernel.shader); + var O = NewTensor(m_Compiled.shape); + var fn = m_Compiled.kernel; + + fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); + fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O).buffer); + fn.SetTensorDecl(_DeclW, W.shape, Pin(W).offset); + fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset); + Assert.AreEqual(Pin(W).buffer, Pin(B).buffer); + fn.SetTensorBuffer(_DataWBK, Pin(W).buffer); + + fn.Dispatch(); + return O; + } + + public override Tensor Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad) + { + if (m_Compiled.kernel.shader == null) + return base.Conv2D(X, K, B, stride, pad); + + Assert.AreEqual(X.channels, K.kernelDepth); + Assert.AreEqual(K.kernelCount, B.flatWidth); + Assert.AreEqual(B.flatWidth, B.length); + Assert.AreEqual(stride.Length, 2); + Assert.AreEqual(pad.Length, 4); + + Assert.IsNotNull(m_Compiled.kernel.shader); + var O = NewTensor(m_Compiled.shape); + var fn = m_Compiled.kernel; + + fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); + fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O).buffer); + fn.SetTensorDecl(_DeclK, K.shape, Pin(K).offset); + fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset); + Assert.AreEqual(Pin(K).buffer, Pin(B).buffer); + fn.SetTensorBuffer(_DataWBK, Pin(K).buffer); + + fn.shader.SetInts(_Pad, pad); + fn.shader.SetInts(_Stride, stride); + + fn.Dispatch(); + return O; + } + + public override Tensor DepthwiseConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad) + { + if (K.kernelDepth != 1 || m_Compiled.kernel.shader == null) + return base.DepthwiseConv2D(X, K, B, stride, pad); + + Assert.AreEqual(K.kernelDepth, 1); + Assert.AreEqual(K.kernelCount, X.channels); + Assert.AreEqual(K.kernelCount, B.flatWidth); + Assert.AreEqual(B.flatWidth, B.length); + Assert.AreEqual(stride.Length, 2); + Assert.AreEqual(pad.Length, 4); + + Assert.IsNotNull(m_Compiled.kernel.shader); + var O = NewTensor(m_Compiled.shape); + var fn = m_Compiled.kernel; + + fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); + fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O).buffer); + fn.SetTensorDecl(_DeclK, K.shape, Pin(K).offset); + fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset); + Assert.AreEqual(Pin(K).buffer, Pin(B).buffer); + fn.SetTensorBuffer(_DataWBK, Pin(K).buffer); + + fn.shader.SetInts(_Pad, pad); + fn.shader.SetInts(_Stride, stride); + + fn.Dispatch(); + return O; + } + + public override Tensor Conv2DTrans(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, int[] outputAdjustment) + { + if (m_Compiled.kernel.shader == null) + return base.Conv2DTrans(X, K, B, stride, pad, outputAdjustment); + + Assert.AreEqual(X.channels, K.kernelDepth); + Assert.AreEqual(K.kernelCount, B.flatWidth); + Assert.AreEqual(B.flatWidth, B.length); + Assert.AreEqual(stride.Length, 2); + Assert.AreEqual(pad.Length, 4); + + Assert.IsNotNull(m_Compiled.kernel.shader); + var O = NewTensor(m_Compiled.shape); + var fn = m_Compiled.kernel; + + pad = new int[] + { + K.kernelWidth - pad[0] - 1, K.kernelHeight - pad[1] - 1, + K.kernelWidth - pad[2] - 1, K.kernelHeight - pad[3] 
- 1 + }; + + fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); + fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O).buffer); + fn.SetTensorDecl(_DeclK, K.shape, Pin(K).offset); + fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset); + Assert.AreEqual(Pin(K).buffer, Pin(B).buffer); + fn.SetTensorBuffer(_DataWBK, Pin(K).buffer); + + fn.shader.SetInts(_Pad, pad); + fn.shader.SetInts(_Stride, stride); + + fn.Dispatch(); + + return O; + } + + public override Tensor Upsample2D(Tensor X, int[] size) + { + if (m_Compiled.kernel.shader == null) + return base.Upsample2D(X, size); + + Assert.AreEqual(size.Length, 2); + + Assert.IsNotNull(m_Compiled.kernel.shader); + var O = NewTensor(m_Compiled.shape); + var fn = m_Compiled.kernel; + + fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); + fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O).buffer); + + fn.shader.SetInts(_Pool, size); + + fn.Dispatch(); + return O; + } + + protected override Tensor Pool2D(string kernelName, Tensor X, int[] pool, int[] stride, int[] pad) + { + if (m_Compiled.kernel.shader == null) + return base.Pool2D(kernelName, X, pool, stride, pad); + + Assert.AreEqual(pool.Length, 2); + Assert.AreEqual(stride.Length, 2); + + Assert.IsNotNull(m_Compiled.kernel.shader); + var O = NewTensor(m_Compiled.shape); + var fn = m_Compiled.kernel; + + fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); + fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O).buffer); + + fn.shader.SetInts(_Pool, pool); + fn.shader.SetInts(_Stride, stride); + fn.shader.SetInts(_Pad, pad); + + fn.Dispatch(); + return O; + } + + public override Tensor ScaleBias(Tensor X, Tensor S, Tensor B) + { + if (m_Compiled.kernel.shader == null) + return base.ScaleBias(X, S, B); + + Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels); + Assert.AreEqual(B.length, B.channels); Assert.AreEqual(S.length, S.channels); + + Assert.IsNotNull(m_Compiled.kernel.shader); + var O = NewTensor(m_Compiled.shape); + var fn = m_Compiled.kernel; + + fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); + fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O).buffer); + fn.SetTensorDecl(_DeclW, S.shape, Pin(S).offset); + fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset); + Assert.AreEqual(Pin(S).buffer, Pin(B).buffer); + fn.SetTensorBuffer(_DataWBK, Pin(S).buffer); + + fn.Dispatch(); + return O; + } + + // @TODO: reimplement Normalization, which became a multi-kernel operation after optimizations + /* + public override Tensor Normalization(Tensor X, Tensor S, Tensor B, int pool, int axis) + { + if (axis != 3 && axis != -1) + throw new NotImplementedException(); + + if (pool == 1 && X.batch != 1) + throw new NotImplementedException(); // @TODO: Instance Normalization with batch > 1 + + if (pool <= 0) + pool = X.batch; + + if (pool > 1) + throw new NotImplementedException(); // @TODO: support other types of Normalization at test time + // Currently supported only pool=1 (InstanceNormalization) + + + Assert.IsNotNull(m_Compiled.kernel.shader); + var O = NewTensor(m_Compiled.shape); + var fn = m_Compiled.kernel; + + fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); + fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O).buffer); + fn.SetTensorDecl(_DeclW, S.shape, Pin(S).offset); + fn.SetTensorDecl(_DeclB, B.shape, Pin(B).offset); + Assert.AreEqual(Pin(S).buffer, Pin(B).buffer); + fn.SetTensorBuffer(_DataWBK, Pin(S).buffer); + + fn.Dispatch(); + return O; + } + */ + + protected override Tensor Activation(string kernelName, Tensor X, float alpha = 0f, float beta = 0f) + { + if (m_Compiled.kernel.shader 
== null) + return base.Activation(kernelName, X, alpha, beta); + + Assert.IsNotNull(m_Compiled.kernel.shader); + var O = NewTensor(m_Compiled.shape); + var fn = m_Compiled.kernel; + + fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); + fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O).buffer); + + fn.shader.SetFloat(_Alpha, alpha); + fn.shader.SetFloat(_Beta, beta); + + fn.Dispatch(); + return O; + } + + public override Tensor PRelu(Tensor X, Tensor S) + { + if (m_Compiled.kernel.shader == null) + return base.PRelu(X, S); + + Assert.AreEqual(X.channels, S.channels); + Assert.AreEqual(S.length, S.channels); + + Assert.IsNotNull(m_Compiled.kernel.shader); + var O = NewTensor(m_Compiled.shape); + var fn = m_Compiled.kernel; + + fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); + fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O).buffer); + fn.SetTensor(_DeclW, _DataW, S.shape, Pin(S).buffer); + + fn.Dispatch(); + return O; + } + + public override Tensor Softmax(Tensor X) + { + if (m_Compiled.kernel.shader == null) + return base.Softmax(X); + + Assert.IsNotNull(m_Compiled.kernel.shader); + var O = NewTensor(m_Compiled.shape); + var fn = m_Compiled.kernel; + + fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); + fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O).buffer); + + fn.Dispatch(); + return O; + } + + public override Tensor LogSoftmax(Tensor X) + { + if (m_Compiled.kernel.shader == null) + return base.LogSoftmax(X); + + Assert.IsNotNull(m_Compiled.kernel.shader); + var O = NewTensor(m_Compiled.shape); + var fn = m_Compiled.kernel; + + fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); + fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O).buffer); + + fn.Dispatch(); + return O; + } + + public override Tensor ElementwiseWithBroadcast(string kernelName, Tensor[] tensors) + { + if (m_Compiled.kernel.shader == null) + return base.ElementwiseWithBroadcast(kernelName, tensors); + + Assert.IsNotNull(m_Compiled.kernel.shader); + var O = NewTensor(m_Compiled.shape); + var fn = m_Compiled.kernel; + + Assert.IsTrue(tensors.Length > 0); + var X = tensors[0]; + + for (int t = 1; t < tensors.Length; ++t) + { + var B = tensors[t]; + + fn.SetTensor(_DeclX, _DataX, X.shape, Pin(X).buffer); + fn.SetTensor(_DeclO, _DataO, O.shape, Pin(O).buffer); + fn.SetTensor(_DeclB, _DataB, B.shape, Pin(B).buffer, Pin(B).offset); + + fn.Dispatch(); + } + + return O; + } +} + +} // namespace Barracuda diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaPrecompiledCompute.cs.meta b/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaPrecompiledCompute.cs.meta new file mode 100644 index 0000000..a876162 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaPrecompiledCompute.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: 5fea18c74a3be4c7680b4ee28cbe1a86 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaReferenceCPU.cs b/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaReferenceCPU.cs new file mode 100644 index 0000000..68da15c --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaReferenceCPU.cs @@ -0,0 +1,1847 @@ +using UnityEngine; +using UnityEngine.Assertions; +using System; +using System.Linq; + +namespace Barracuda { + +public class ArrayTensorData : ITensorData +{ + protected float[] m_Array; + public float[] array { get { return m_Array; } } + + public ArrayTensorData(int count) + { + m_Array 
= new float[count]; + } + + public ArrayTensorData(TensorShape shape) : this(shape.length) + { + } + + ~ArrayTensorData() + { + Dispose(); + } + + public virtual void Dispose() + { + m_Array = null; + } + + public virtual void Reserve(int count) + { + if (count > m_Array.Length) + m_Array = new float[count]; + } + + public virtual void Upload(float[] data, int offset = 0, int count = -1) + { + Assert.IsTrue(offset >= 0); + if (count < 0) + count = data.Length - offset; + + if (m_Array == data && offset == 0) + { + Assert.IsTrue(count == data.Length); + return; + } + + Reserve(count); + + Array.Copy(data, offset, m_Array, 0, count); + } + + public virtual bool ScheduleAsyncDownload(int count) + { + return true; + } + + public virtual float[] Download(int count) + { + //;;D.logStackTraceEnabled = true; + //;;D.Log("Download ArrayTensorData " + count + " from " + m_Array.Length + " @ " + ToString()); + //;;D.logStackTraceEnabled = false; + + Assert.IsTrue(m_Array.Length >= count); + count = Math.Min(m_Array.Length, count); + + if (count <= m_Array.Length) + return m_Array; + + var dest = new float[count]; + Array.Copy(m_Array, 0, dest, 0, count); + return dest; + } + + public virtual float[] SharedAccess(out int offset) + { + offset = 0; + return m_Array; + } + + public virtual int GetMaxCount() + { + return m_Array.Length; + } + + public override string ToString() + { + return string.Format("(CPU array: {0} max: {1})", + GetHashCode(), m_Array.Length); + } +} + +public class SharedArrayTensorData : ITensorData +{ + protected float[] m_Array; + protected int m_Offset; + protected int m_Count; + + public float[] array { get { return m_Array; } } + public int offset { get { return m_Offset; } } + public int count { get { return m_Count; } } + + public SharedArrayTensorData(float[] data, int offset = 0, int count = -1) + { + Assert.IsTrue(offset >= 0); + if (count < 0) + count = data.Length - offset; + + m_Array = data; + m_Offset = offset; + Assert.IsTrue(count >= 0); + Assert.IsTrue(offset + count <= m_Array.Length); + m_Count = count; + } + + ~SharedArrayTensorData() + { + Dispose(); + } + + public virtual void Dispose() + { + } + + public virtual void Reserve(int count) + { + // currently always readonly + throw new InvalidOperationException("SharedArrayTensorData is readonly!"); + } + + public virtual void Upload(float[] data, int offset = 0, int count = -1) + { + // currently always readonly + throw new InvalidOperationException("SharedArrayTensorData is readonly!"); + } + + public virtual bool ScheduleAsyncDownload(int count) + { + return true; + } + + public virtual float[] Download(int count) + { + //;;D.logStackTraceEnabled = true; + //;;D.Log("Download SharedArrayTensorData " + count + " from " + m_Count + " @ " + ToString()); + //;;D.logStackTraceEnabled = false; + + Assert.IsTrue(m_Count >= count); + count = Math.Min(m_Count, count); + + var dest = new float[count]; + Array.Copy(m_Array, m_Offset, dest, 0, count); + return dest; + } + + public virtual float[] SharedAccess(out int offset) + { + offset = m_Offset; + return m_Array; + } + + public virtual int GetMaxCount() + { + return m_Array.Length - m_Offset; + } + + public override string ToString() + { + return string.Format("(CPU shared: {0} max: {1} offset: {2} count: {3})", + GetHashCode(), m_Array.Length, m_Offset, m_Count); + } +} + + +public class ReferenceCPUOps : IOps +{ + private ITensorAllocator m_Allocator; + + public ReferenceCPUOps(ITensorAllocator allocator = null) + { + if (allocator == null) + allocator = new 
TensorCachingAllocator(); + m_Allocator = allocator; + } + + protected Tensor NewTensor(TensorShape s, string name = "") + { + var tensor = m_Allocator.Alloc(s); + tensor.name = name; + return tensor; + } + + protected Tensor NewTensorLike(Tensor t) + { + return NewTensor(t.shape); + } + + protected Tensor NewTensor(int b, int ch, string name = "") + { + return NewTensor(new TensorShape(b, ch), name); + } + + protected Tensor NewTensor(int b, int h, int w, int ch, string name = "") + { + return NewTensor(new TensorShape(b, h, w, ch), name); + } + + public virtual void WaitForCompletion(Tensor x) + { + // do nothing on CPU + } + + public virtual void ResetAllocator(bool keepCachedMemory = true) + { + m_Allocator.Reset(keepCachedMemory); + } + + // --------------------------------------------------------------------------------- + + public virtual Tensor MatMul(Tensor X, bool xTranspose, Tensor Y, bool yTranspose) + { + Assert.IsTrue(X.dimensions <= 2); + Assert.IsTrue(Y.dimensions <= 2); + X = X.Flatten(); + Y = Y.Flatten(); + + if (xTranspose) + X = Transpose(X); + if (yTranspose) + Y = Transpose(Y); + + Assert.AreEqual(X.flatWidth, Y.flatHeight); + var O = NewTensor(X.flatHeight, Y.flatWidth); + + for (int y = 0; y < O.flatHeight; ++y) + for (int x = 0; x < O.flatWidth; ++x) + { + float v = 0; + for (int i = 0; i < X.flatWidth; ++i) + { + v += X[y, i] * Y[i, x]; + } + O[y, x] = v; + } + return O; + } + + public virtual Tensor Dense(Tensor X, Tensor W, Tensor B) + { + Assert.IsTrue(W.dimensions <= 2); + Assert.AreEqual(B.flatWidth, B.length); + Assert.AreEqual(X.flatWidth, W.flatHeight); + + var O = NewTensor(X.flatHeight, W.flatWidth); + + for (int y = 0; y < O.flatHeight; ++y) + for (int x = 0; x < O.flatWidth; ++x) + { + float v = B[x]; + for (int i = 0; i < X.flatWidth; ++i) + { + v += X[y, i] * W[i, x]; + } + O[y, x] = v; + } + return O; + } + + public virtual Tensor Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad) + { + Assert.AreEqual(X.channels, K.kernelDepth); + Assert.AreEqual(K.kernelCount, B.flatWidth); + Assert.AreEqual(B.flatWidth, B.length); + Assert.AreEqual(stride.Length, 2); + Assert.AreEqual(pad.Length, 4); + + var O = NewTensor(X.shape.ApplyKernel(K.shape, stride, pad)); + + for (var n = 0; n < O.batch; ++n) + for (var y = 0; y < O.height; ++y) + for (var x = 0; x < O.width; ++x) + for (var k = 0; k < K.kernelCount; ++k) + { + float v = B[k]; + for (int dy = 0; dy < K.kernelHeight; ++dy) + { + for (int dx = 0; dx < K.kernelWidth; ++dx) + { + int oy = y * stride[1] + dy - pad[1]; + int ox = x * stride[0] + dx - pad[0]; + + if (oy < 0) continue; + if (oy >= X.height) continue; + if (ox < 0) continue; + if (ox >= X.width) continue; + + for (var c = 0; c < X.channels; ++c) + { + float xv = X[n, oy, ox, c + //n * X.height * X.width * X.channels + + //oy * X.width * X.channels + + //ox * X.channels + + //c + + //X.offset + ]; + + float kv = K[dy, dx, c, k + //dy * K.height * K.width * K.channels + + //dx * K.width * K.channels + + //c * K.channels + + //k + + //K.offset + ]; + + v += xv * kv; + } + } + } + O[n, y, x, k + //n * O.height * O.width * O.channels + + //y * O.width * O.channels + + //x * O.channels + + //k + + //O.offset + ] = v; + } + return O; + } + + public virtual Tensor DepthwiseConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad) + { + if (K.kernelDepth != 1) + throw new NotImplementedException(); + + Assert.AreEqual(K.kernelDepth, 1); + Assert.AreEqual(K.kernelCount, X.channels); + Assert.AreEqual(K.kernelCount, B.flatWidth); + 
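// NOTE (illustrative sketch, assuming channel multiplier == 1 as enforced by the asserts here):
// depthwise convolution is a grouped convolution with groups == input channels, i.e. each input
// channel is filtered by its own kernelHeight x kernelWidth plane and written to the matching
// output channel. A hypothetical shape example under those assumptions:
//   X: [batch=1, height=5, width=5, channels=8]
//   K: [kernelHeight=3, kernelWidth=3, kernelDepth=1, kernelCount=8]   // one 3x3 filter per channel
//   B: [8]
//   stride = {1, 1}, pad = {1, 1, 1, 1}  ->  O: [1, 5, 5, 8]
// Per output element the loop nest below reduces to:
//   O[n, y, x, k] = B[k] + sum over (dy, dx) of
//                   X[n, y*stride[1] + dy - pad[1], x*stride[0] + dx - pad[0], k] * K[dy, dx, 0, k]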
Assert.AreEqual(B.flatWidth, B.length); + Assert.AreEqual(stride.Length, 2); + Assert.AreEqual(pad.Length, 4); + + // ONNX: (M x C/group x kH x kW) + // TF: [H, W, in_channels, channel_multiplier] + + // TF pseudocode: + // output[b, i, j, k * channel_multiplier + q] = + // sum_{di, dj} + // input [b, i + di, j + dj, k] * + // filter[di, dj, k, q] * + + var O = NewTensor(X.shape.ApplyKernel(K.shape, stride, pad)); + + for (var n = 0; n < O.batch; ++n) + for (var y = 0; y < O.height; ++y) + for (var x = 0; x < O.width; ++x) + for (var k = 0; k < K.kernelCount; ++k) + { + float v = B[k]; + for (int dy = 0; dy < K.kernelHeight; ++dy) + for (int dx = 0; dx < K.kernelWidth; ++dx) + { + int oy = y * stride[1] + dy - pad[1]; + int ox = x * stride[0] + dx - pad[0]; + + if (oy < 0) continue; + if (oy >= X.height) continue; + if (ox < 0) continue; + if (ox >= X.width) continue; + + float xv = X[n, oy, ox, k]; + float kv = K[dy, dx, 0, k]; + v += xv * kv; + } + O[n, y, x, k] = v; + } + return O; + } + + public virtual Tensor Conv2DTrans(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, int[] outputAdjustment) + { + Assert.AreEqual(X.channels, K.kernelDepth); + Assert.AreEqual(K.kernelCount, B.flatWidth); + Assert.AreEqual(B.flatWidth, B.length); + Assert.AreEqual(stride.Length, 2); + Assert.AreEqual(pad.Length, 4); + Assert.AreEqual(pad[0],pad[2]); + Assert.AreEqual(pad[1],pad[3]); + + var O = NewTensor(X.shape.ApplyKernelInverse(K.shape, stride, pad, outputAdjustment)); + int prePadW = K.kernelWidth - pad[0] - 1; + int prePadH = K.kernelHeight - pad[1] - 1; + int strideH = 1; + int strideW = 1; + + for (var n = 0; n < O.batch; ++n) + for (var y = 0; y < O.height; ++y) + for (var x = 0; x < O.width; ++x) + for (var k = 0; k < K.kernelCount; ++k) + { + float v = B[k]; + for (int dy = 0; dy < K.kernelHeight; dy += strideH) + for (int dx = 0; dx < K.kernelWidth; dx += strideW) + { + int readX = (x + dx - prePadW) / stride[0]; + int readY = (y + dy - prePadH) / stride[1]; + + if ((x + dx - prePadW) % stride[0] != 0) continue; + if ((y + dy - prePadH) % stride[0] != 0) continue; + if (readX < 0) continue; + if (readX >= X.width) continue; + if (readY < 0) continue; + if (readY >= X.height) continue; + + for (var c = 0; c < X.channels; ++c) + { + float xv = X[n, readY, readX, c]; + float kv = K[K.kernelHeight - 1 - dy, + K.kernelWidth - 1 - dx, c, k]; + v += xv * kv; + } + } + + O[n, y, x, k] = v; + } + return O; + } + + public virtual Tensor Upsample2D(Tensor X, int[] size) + { + Assert.AreEqual(size.Length, 2); + var O = NewTensor(X.batch, X.height*size[1], X.width*size[0], X.channels); + + for (int b = 0; b < O.batch; ++b) + for (int y = 0; y < O.height; ++y) + for (int x = 0; x < O.width; ++x) + for (int c = 0; c < O.channels; ++c) + { + int oy = y / size[1]; + int ox = x / size[0]; + float v = X[b, oy, ox, c + //b * X.height * X.width * X.channels + + //oy * X.width * X.channels + + //ox * X.channels + + //c + + //X.offset + ]; + + O[b, y, x, c + //b * O.height * O.width * O.channels + + //y * O.width * O.channels + + //x * O.channels + + //c + + //O.offset + ] = v; + } + return O; + } + + public virtual Tensor MaxPool2D(Tensor X, int[] pool, int[] stride, int[] pad) + { + Assert.AreEqual(pool.Length, 2); + Assert.AreEqual(stride.Length, 2); + Assert.AreEqual(pad.Length, 4); + + var O = NewTensor(X.shape.ApplyPool(pool, stride, pad)); + + for (int b = 0; b < O.batch; ++b) + for (int y = 0; y < O.height; ++y) + for (int x = 0; x < O.width; ++x) + for (int c = 0; c < O.channels; ++c) + { + float 
maxVal = float.MinValue; + for (int dy = 0; dy < pool[1]; ++dy) + for (int dx = 0; dx < pool[0]; ++dx) + { + int oy = y * stride[1] + dy - pad[1]; + int ox = x * stride[0] + dx - pad[0]; + + if (oy < 0) continue; + if (oy >= X.height) continue; + if (ox < 0) continue; + if (ox >= X.width) continue; + + float v = X[b, oy, ox, c + //b * X.height * X.width * X.channels + + //oy * X.width * X.channels + + //ox * X.channels + + //c + + //X.offset + ]; + maxVal = Mathf.Max(v, maxVal); + } + + O[b, y, x, c + //b * O.height * O.width * O.channels + + //y * O.width * O.channels + + //x * O.channels + + //c + + //O.offset + ] = maxVal; + } + return O; + } + + public virtual Tensor AvgPool2D(Tensor X, int[] pool, int[] stride, int[] pad) + { + Assert.AreEqual(pool.Length, 2); + Assert.AreEqual(stride.Length, 2); + Assert.AreEqual(pad.Length, 4); + + var O = NewTensor(X.shape.ApplyPool(pool, stride, pad)); + + for (int b = 0; b < O.batch; ++b) + for (int y = 0; y < O.height; ++y) + for (int x = 0; x < O.width; ++x) + for (int c = 0; c < O.channels; ++c) + { + float accum = 0.0f; + float counter = 0.0f; + for (int dy = 0; dy < pool[1]; ++dy) + for (int dx = 0; dx < pool[0]; ++dx) + { + int oy = y * stride[1] + dy - pad[1]; + int ox = x * stride[0] + dx - pad[0]; + + if (oy < 0) continue; + if (oy >= X.height) continue; + if (ox < 0) continue; + if (ox >= X.width) continue; + + float v = X[b, oy, ox, c + //b * X.height * X.width * X.channels + + //oy * X.width * X.channels + + //ox * X.channels + + //c + + //X.offset + ]; + accum += v; + ++counter; + } + + O[b, y, x, c + //b * O.height * O.width * O.channels + + //y * O.width * O.channels + + //x * O.channels + + //c + + //O.offset + ] = accum / counter; + } + return O; + } + + public virtual Tensor GlobalMaxPool2D(Tensor X) + { + var O = NewTensor(X.batch, 1, 1, X.channels); + + for (int b = 0; b < X.batch; ++b) + for (int c = 0; c < X.channels; ++c) + { + float maxVal = float.MinValue; + for (int y = 0; y < X.height; ++y) + for (int x = 0; x < X.width; ++x) + { + float v = X[b, y, x, c + //b * X.height * X.width * X.channels + + //y * X.width * X.channels + + //x * X.channels + + //c + + //X.offset + ]; + maxVal = Mathf.Max(v, maxVal); + } + + O[b, 0, 0, c + //b * O.channels + + //c + + //O.offset + ] = maxVal; + } + return O; + } + + public virtual Tensor GlobalAvgPool2D(Tensor X) + { + var O = NewTensor(X.batch, 1, 1, X.channels); + + for (int b = 0; b < X.batch; ++b) + for (int c = 0; c < X.channels; ++c) + { + float accum = 0.0f; + for (int y = 0; y < X.height; ++y) + for (int x = 0; x < X.width; ++x) + { + float v = X[b, y, x, c + //b * X.height * X.width * X.channels + + //y * X.width * X.channels + + //x * X.channels + + //c + + //X.offset + ]; + accum += v; + } + + O[b, 0, 0, c + //b * O.channels + + //c + + //O.offset + ] = accum / (X.width * X.height); + } + return O; + } + + public virtual Tensor GlobalAvgVariancePool2D(Tensor X) + { + var O = NewTensor(X.batch, 2, 1, X.channels); + + for (int b = 0; b < X.batch; ++b) + for (int c = 0; c < X.channels; ++c) + { + float mean = 0.0f; + float mean2 = 0.0f; + for (int y = 0; y < X.height; ++y) + for (int x = 0; x < X.width; ++x) + { + float v = X[b, y, x, c + //b * X.height * X.width * X.channels + + //y * X.width * X.channels + + //x * X.channels + + //c + + //X.offset + ]; + mean += v; + mean2 += v*v; + } + + mean /= (X.width * X.height); + mean2 /= (X.width * X.height); + + O[b, 0, 0, c + //b * O.channels + + //c + + //O.offset + ] = mean; + + O[b, 1, 0, c + //b * O.channels + + //c + + 
//O.offset + ] = mean2 - mean * mean; + } + return O; + } + + + private Tensor ApplyPadding(Tensor X, int[] pad, Func paddingOp) + { + Assert.AreEqual(pad.Length, 4); + + var O = NewTensor(X.shape.ApplyBorder(pad)); + + // NOTE: negative "pad" variable will crop X tensor + int croppedWidth = X.width - Math.Max(0, -pad[2]); + int croppedHeight = X.height - Math.Max(0, -pad[3]); + + for (int b = 0; b < O.batch; ++b) + for (int y = 0; y < O.height; ++y) + for (int x = 0; x < O.width; ++x) + { + int readX = x - pad[0]; + int readY = y - pad[1]; + + if (readX < 0 || readX >= croppedWidth || + readY < 0 || readY >= croppedHeight) + { + for (int c = 0; c < O.channels; ++c) + O[b, y, x, c] = paddingOp(X, b, readY, readX, c); + } + else + { + for (int c = 0; c < O.channels; ++c) + O[b, y, x, c] = X[b, readY, readX, c]; + } + } + + return O; + } + + public virtual Tensor Border2D(Tensor X, int[] pad, float value) + { + Func padOp = (tensor, b, h, w, c) => value; + return ApplyPadding(X, pad, padOp); + } + + private static void ClampHWToTensorShape(TensorShape shape, ref int height, ref int width) + { + width = Math.Max(width, 0); + height = Math.Max(height, 0); + width = Math.Min(width, shape.width - 1); + height = Math.Min(height, shape.height - 1); + } + + public virtual Tensor Pad2DReflect(Tensor X, int[] pad) + { + float GetReflectPadding(Tensor tensorX, int b, int readY, int readX, int c) + { + int lastXIndex = tensorX.shape.width - 1; + int lastYIndex = tensorX.shape.height - 1; + + if (readX < 0) + readX = -readX; + else if (readX > lastXIndex) + readX = lastXIndex - (readX - lastXIndex); + + if (readY < 0) + readY = -readY; + else if (readY > lastYIndex) + readY = lastYIndex - (readY - lastYIndex); + + ClampHWToTensorShape(tensorX.shape, ref readY, ref readX); + return tensorX[b,readY, readX,c]; + } + + + return ApplyPadding(X, pad, GetReflectPadding); + } + + public virtual Tensor Pad2DSymmetric(Tensor X, int[] pad) + { + float GetSymmetricPadding(Tensor tensorX, int b, int readY, int readX, int c) + { + int lastXIndex = tensorX.shape.width - 1; + int lastYIndex = tensorX.shape.height - 1; + + if (readX < 0) + readX = -readX - 1; + else if (readX > lastXIndex) + readX = lastXIndex - (readX - lastXIndex) + 1; + + if (readY < 0) + readY = -readY - 1; + else if (readY > lastYIndex) + readY = lastYIndex - (readY - lastYIndex) + 1; + + ClampHWToTensorShape(tensorX.shape, ref readY, ref readX); + return tensorX[b,readY, readX,c]; + } + + return ApplyPadding(X, pad, GetSymmetricPadding); + } + + public virtual Tensor Pad2DEdge(Tensor X, int[] pad) + { + float GetEdgePadding(Tensor tensorX, int b, int readY, int readX, int c) + { + ClampHWToTensorShape(tensorX.shape, ref readY, ref readX); + return tensorX[b,readY, readX,c]; + } + + + return ApplyPadding(X, pad, GetEdgePadding); + } + + public virtual Tensor ScaleBias(Tensor X, Tensor S, Tensor B) + { + Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels); + Assert.AreEqual(B.length, B.channels); Assert.AreEqual(S.length, S.channels); + + var O = NewTensorLike(X); + + for (int b = 0; b < X.batch; ++b) + for (int y = 0; y < X.height; ++y) + for (int x = 0; x < X.width; ++x) + for (int c = 0; c < X.channels; ++c) + { + float beta = B[0, 0, 0, c];//.array[c + B.offset]; + float gamma = S[0, 0, 0, c];//S.array[c + S.offset]; + + //var i = X.IndexWithOffset(b, y, x, c); + float v = X[b, y, x, c];//.array[i]; + O[b, y, x, c] = v * gamma + beta; + } + return O; + } + + public virtual Tensor LRN(Tensor X, float alpha, float 
beta, float bias, int size) + { + // https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf + throw new NotImplementedException(); + } + + public virtual Tensor Normalization(Tensor X, Tensor S, Tensor B, int pool, int axis, float epsilon) + { + Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels); + + if (axis != 3 && axis != -1) + throw new NotImplementedException(); + + // Special cases of Normalization: + // 1) Instance Normalization, if pool == 1 + // 2) Batch Normalization, if pool <= 0 + if (pool <= 0) + pool = X.batch; + + var O = NewTensorLike(X); + + var channels = X.channels; + var width = X.width; + var height = X.height; + + for (int subBatch = 0; subBatch < O.batch; subBatch += pool) + for (int c = 0; c < channels; ++c) + { + int bBegin = subBatch; + int bEnd = Math.Min(subBatch + pool, O.batch); + + float gamma = S[0, 0, 0, c];//.array[c + S.offset]; + float beta = B[0, 0, 0, c];//B.array[c + B.offset]; + + // calc mean + float accum = 0.0f; + for (int b = bBegin; b < bEnd; ++b) + for (int y = 0; y < height; ++y) + for (int x = 0; x < width; ++x) + { + float v = X[b, y, x, c + //b * X.height * X.width * X.channels + + //y * X.width * X.channels + + //x * X.channels + + //c + + //X.offset + ]; + accum += v; + } + float mean = accum / (float)(width * height); + + // calc variance + accum = 0.0f; + for (int b = bBegin; b < bEnd; ++b) + for (int y = 0; y < height; ++y) + for (int x = 0; x < width; ++x) + { + float v = X[b, y, x, c + //b * X.height * X.width * X.channels + + //y * X.width * X.channels + + //x * X.channels + + //c + + //X.offset + ]; + accum += (v - mean) * (v - mean); + } + float var = accum / (float)(width * height); + + // calc normalization factor + float invNormFactor = 1f / Mathf.Sqrt(var + epsilon); + + var scale = gamma * invNormFactor; + var bias = beta - gamma * mean * invNormFactor; + + // apply normalization + for (int b = bBegin; b < bEnd; ++b) + for (int y = 0; y < height; ++y) + for (int x = 0; x < width; ++x) + { + float v = X[b, y, x, c + //b * X.height * X.width * X.channels + + //y * X.width * X.channels + + //x * X.channels + + //c + + //X.offset + ]; + + v = v * scale + bias; + + O[b, y, x, c + //b * O.height * O.width * O.channels + + //y * O.width * O.channels + + //x * O.channels + + //c + + //O.offset + ] = v; + } + } + return O; + } + + protected float Bernoulli(float p) + { + return (UnityEngine.Random.value <= p) ? 
1f: 0f; + } + + protected float Gaussian(float mean, float stdDev) + { + float u, v, s; + do { + u = UnityEngine.Random.value * 2 - 1; + v = UnityEngine.Random.value * 2 - 1; + s = u * u + v * v; + } while (s >= 1 || s == 0); + float mul = Mathf.Sqrt(-2.0f * Mathf.Log(s) / s); + return mean + stdDev * u * mul; + } + + protected class Seed : IDisposable + { + UnityEngine.Random.State[] m_SeedStorage; + UnityEngine.Random.State m_EngineSeed; + public Seed(ref UnityEngine.Random.State[] storage, int initialSeed) + { + m_EngineSeed = UnityEngine.Random.state; + if (storage == null) + { + storage = new UnityEngine.Random.State[1]; + UnityEngine.Random.InitState(initialSeed); + storage[0] = UnityEngine.Random.state; + } + else + UnityEngine.Random.state = storage[0]; + m_SeedStorage = storage; + } + + public virtual void Dispose() + { + m_SeedStorage[0] = UnityEngine.Random.state; + UnityEngine.Random.state = m_EngineSeed; + } + } + + private UnityEngine.Random.State[] m_DropoutSeed; + public virtual Tensor Dropout(Tensor X, float alpha) + { + Assert.IsTrue(alpha >= 0f && alpha <= 1f); + var O = NewTensorLike(X); + + // Based on PyTorch Dropout implementation + // See: https://github.com/pytorch/pytorch/blob/master/torch/nn/_functions/dropout.py + + using (var seedOverride = new Seed(ref m_DropoutSeed, 1337)) + { + var end = X.length; + for (int i = 0; i < end; ++i) + { + float v = X[i]; + v *= Bernoulli(1f - alpha) / (1f - alpha); + O[i] = v; + } + } + return O; + } + + private UnityEngine.Random.State[] m_RandomNormalSeed; + public virtual Tensor RandomNormal(TensorShape s, float mean, float scale, int seed) + { + var O = NewTensor(s); + + using (var seedOverride = new Seed(ref m_RandomNormalSeed, seed)) + { + var end = O.length; + for (int i = 0; i < end; ++i) + O[i] = Gaussian(mean, scale); + } + + return O; + } + + private UnityEngine.Random.State[] m_RandomUniformSeed; + public virtual Tensor RandomUniform(TensorShape s, float mean, float scale, int seed) + { + var O = NewTensor(s); + + using (var seedOverride = new Seed(ref m_RandomUniformSeed, seed)) + { + var end = O.length; + for (int i = 0; i < end; ++i) + O[i] = mean + scale * UnityEngine.Random.value; + } + + return O; + } + + private UnityEngine.Random.State[] m_MultinomialSeed; + public virtual Tensor Multinomial(Tensor X, int count, int seed) + { + var O = NewTensor(X.flatHeight, count); + + // Tensorflow Multinomial for reference + // See: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/multinomial_op.cc + + using (var seedOverride = new Seed(ref m_MultinomialSeed, seed)) + { + for (int n = 0; n < X.flatHeight; ++n) + { + var maxLogP = Mathf.NegativeInfinity; + for (int i = 0; i < X.flatWidth; ++i) + maxLogP = Mathf.Max(X[n, i], maxLogP); + + float sumOfProbabilities = 0f; + for (int i = 0; i < X.flatWidth; ++i) + sumOfProbabilities += Mathf.Exp(X[n, i] - maxLogP); // NOTE: X contains log-probabilities + + for (int sample = 0; sample < count; ++sample) + { + float p = UnityEngine.Random.value * sumOfProbabilities; + + int i = 0; + float cumulativeP = 0f; + while (i < X.flatWidth && p > cumulativeP) + { + cumulativeP += Mathf.Exp(X[n, i] - maxLogP); + i++; + } + Assert.IsTrue(i > 0); + O[n, sample] = (float)(i - 1); + } + } + } + + return O; + } + + public virtual Tensor OneHot(Tensor X, int depth, float onValue, float offValue) + { + var O = NewTensor(X.flatHeight, 1, X.flatWidth, depth); + + for (int n = 0; n < X.flatHeight; ++n) + { + for (int j = 0; j < X.flatWidth; ++j) + { + int 
index = (int)X[n, j]; + for (int i = 0; i < depth; ++i) + { + float v = (i == index) ? onValue: offValue; + O[n, 0, j, i] = v; + } + } + } + return O; + } + + public virtual Tensor Relu(Tensor X) + { + var O = NewTensorLike(X); + + var end = X.length; + for (int i = 0; i < end; ++i) + { + float v = X[i]; + v = Mathf.Max(v, 0.0f); + O[i] = v; + } + return O; + } + + public virtual Tensor PRelu(Tensor X, Tensor S) + { + var O = NewTensorLike(X); + + var end = X.length; + for (int i = 0; i < end; ++i) + { + float v = X[i]; + float slope = S[i % S.length]; + + v = Mathf.Max(0.0f, v) + slope * Mathf.Min(0.0f, v); + O[i] = v; + } + + return O; + } + + public virtual Tensor Softmax(Tensor X) + { + var O = NewTensor(X.shape.Flatten()); + + //e_x = np.exp(X - X.max(axis=1, keepdims=True)) + //X = e_x / e_x.sum(axis=1, keepdims=True) + for (int y = 0; y < X.flatHeight; ++y) + { + float maxV = Mathf.NegativeInfinity; + for (int x = 0; x < X.channels; ++x) + { + float v = X[y, x + //b * X.channels + + //x + + //X.offset + ]; + + if (v > maxV) + maxV = v; + } + + float sum = 0.0f; + for (int x = 0; x < X.flatWidth; ++x) + { + float v = X[y, x + // y * X.channels + + // x + + // X.offset + ]; + sum += Mathf.Exp(v - maxV); + } + + for (int x = 0; x < X.flatWidth; ++x) + { + float v = X[y, x + //y * X.channels + + //x + + //X.offset + ]; + v = Mathf.Exp(v - maxV) / sum; + O[y, x + //y * O.width + + //x + + //O.offset + ] = v; + } + } + + return O; + } + + public virtual Tensor LogSoftmax(Tensor X) + { + var O = NewTensor(X.shape.Flatten()); + + //e_x = np.exp(X - X.max(axis=1, keepdims=True)) + //X = log( e_x / e_x.sum(axis=1, keepdims=True) ) + for (int y = 0; y < X.flatHeight; ++y) + { + float maxV = Mathf.NegativeInfinity; + for (int x = 0; x < X.channels; ++x) + { + float v = X[y, x + //b * X.channels + + //x + + //X.offset + ]; + + if (v > maxV) + maxV = v; + } + + float sum = 0.0f; + for (int x = 0; x < X.flatWidth; ++x) + { + float v = X[y, x + // y * X.channels + + // x + + // X.offset + ]; + sum += Mathf.Exp(v - maxV); + } + + for (int x = 0; x < X.flatWidth; ++x) + { + float v = X[y, x + //y * X.channels + + //x + + //X.offset + ]; + v = Mathf.Log( Mathf.Exp(v - maxV) / sum ); + O[y, x + //y * O.width + + //x + + //O.offset + ] = v; + } + } + + return O; + } + + public virtual Tensor Tanh(Tensor X) + { + // f(x) = tanh(x) = sinh(x) / cosh(x) = (exp(2*x) - 1) / (exp(2*x) + 1) + var O = NewTensorLike(X); + + var end = X.length; + for (int i = 0; i < end; ++i) + { + O[i] = MathfEx.tanh(X[i]); + } + return O; + } + + public virtual Tensor Sigmoid(Tensor X) + { + // f(x) = 1 / (1 + exp(-x)) + var O = NewTensorLike(X); + + var end = X.length; + for (int i = 0; i < end; ++i) + { + float v = X[i]; + v = 1f / (1f + Mathf.Exp(-v)); + O[i] = v; + } + return O; + } + + public virtual Tensor Relu6(Tensor X) + { + // f(x) = min(max(x, 0), 6) + // "Convolutional Deep Belief Networks on CIFAR-10", A Krizhevsky, 2010 + // http://www.cs.utoronto.ca/~kriz/conv-cifar10-aug2010.pdf + var O = NewTensorLike(X); + + var end = X.length; + for (int i = 0; i < end; ++i) + { + float v = X[i]; + v = Mathf.Min(Mathf.Max(0f, v), 6f); + O[i] = v; + } + return O; + } + + public virtual Tensor Elu(Tensor X, float alpha) + { + // f(x) = alpha * (exp(x) - 1.) 
for x < 0, f(x) = x for x >= 0 + // "Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)", DA Clevert, 2015 + // https://arxiv.org/abs/1511.07289 + var O = NewTensorLike(X); + + var end = X.length; + for (int i = 0; i < end; ++i) + { + float v = X[i]; + if (v <= 0) + v = alpha * (Mathf.Exp(v) - 1f); + O[i] = v; + } + return O; + } + + public virtual Tensor LeakyRelu(Tensor X, float alpha) + { + // f(x) = alpha * x for x < 0, f(x) = x for x >= 0. + // "Rectifier Nonlinearities Improve Neural Network Acoustic Models". AL Maas, 2013 + // http://web.stanford.edu/~awni/papers/relu_hybrid_icml2013_final.pdf + Assert.IsTrue(alpha <= 1); + var O = NewTensorLike(X); + + var end = X.length; + for (int i = 0; i < end; ++i) + { + float v = X[i]; + v = Mathf.Max(v, alpha * v); + // @TODO: doublecheck the following code + // from Theano impl + // https://github.com/Theano/theano/blob/d395439aec5a6ddde8ef5c266fd976412a5c5695/theano/tensor/nnet/nnet.py#L2209-L2251 + //float f1 = 0.5f * (1f + alpha) + //float f2 = 0.5f * (1f - alpha) + //v = f1 * v + f2 * Mathf.Abs(v); + O[i] = v; + } + return O; + } + + public virtual Tensor Selu(Tensor X, float alpha, float gamma) + { + // f(x) = gamma * (alpha * e^x - alpha) for x <= 0, f(x) = gamma * x for x > 0 + var O = NewTensorLike(X); + + var end = X.length; + for (int i = 0; i < end; ++i) + { + float v = X[i]; + if (v <= 0) + v = gamma * (alpha * Mathf.Exp(v) - alpha); + else + v = gamma * v; + O[i] = v; + } + return O; + } + + public virtual Tensor Swish(Tensor X) + { + // f(x) = sigmoid(x) * x = x / (1 + exp(-x)) + // "Searching for Activation Functions". P Ramachandran, 2017 + // https://arxiv.org/abs/1710.05941 + var O = NewTensorLike(X); + + var end = X.length; + for (int i = 0; i < end; ++i) + { + float v = X[i]; + v = v / (1f + Mathf.Exp(-v)); + O[i] = v; + } + return O; + } + + public virtual Tensor Abs(Tensor X) + { + var O = NewTensorLike(X); + + var end = X.length; + for (int i = 0; i < end; ++i) + { + float v = X[i]; + v = Mathf.Abs(v); + O[i] = v; + } + return O; + } + + public virtual Tensor Neg(Tensor X) + { + var O = NewTensorLike(X); + + var end = X.length; + for (int i = 0; i < end; ++i) + { + float v = X[i]; + v = -v; + O[i] = v; + } + return O; + } + + public virtual Tensor Ceil(Tensor X) + { + var O = NewTensorLike(X); + + var end = X.length; + for (int i = 0; i < end; ++i) + { + float v = X[i]; + v = Mathf.Ceil(v); + O[i] = v; + } + return O; + } + + public virtual Tensor Clip(Tensor X, float min, float max) + { + var O = NewTensorLike(X); + + var end = X.length; + for (int i = 0; i < end; ++i) + { + float v = X[i]; + v = Mathf.Clamp(v, min, max); + + O[i] = v; + } + return O; + } + + public virtual Tensor Floor(Tensor X) + { + var O = NewTensorLike(X); + + var end = X.length; + for (int i = 0; i < end; ++i) + { + float v = X[i]; + v = Mathf.Floor(v); + O[i] = v; + } + return O; + } + + public virtual Tensor Reciprocal(Tensor X) + { + var O = NewTensorLike(X); + + var end = X.length; + for (int i = 0; i < end; ++i) + { + float v = X[i]; + v = 1.0f / v; + O[i] = v; + } + return O; + } + + public virtual Tensor Pow(Tensor X, float alpha) + { + var O = NewTensorLike(X); + + var end = X.length; + for (int i = 0; i < end; ++i) + { + float v = X[i]; + v = Mathf.Pow(v, alpha); + O[i] = v; + } + return O; + } + + public virtual Tensor Exp(Tensor X) + { + var O = NewTensorLike(X); + + var end = X.length; + for (int i = 0; i < end; ++i) + { + float v = X[i]; + v = Mathf.Exp(v); + O[i] = v; + } + return O; + } + + public 
virtual Tensor Log(Tensor X) + { + var O = NewTensorLike(X); + + var end = X.length; + for (int i = 0; i < end; ++i) + { + float v = X[i]; + v = Mathf.Log(v); + O[i] = v; + } + return O; + } + + public virtual Tensor Sqrt(Tensor X) + { + var O = NewTensorLike(X); + + var end = X.length; + for (int i = 0; i < end; ++i) + { + float v = X[i]; + v = Mathf.Sqrt(v); + O[i] = v; + } + return O; + } + + public virtual Tensor Concat(Tensor[] tensors, int axis) + { + var concatShape = TensorExtensions.ConcatShapes(tensors, axis); + var O = NewTensor(concatShape); + + var srcIndices = new long[tensors.Length]; + for (int i = 0; i < tensors.Length; ++i) + srcIndices[i] = tensors[i].readonlyArrayOffset; + + // product of all tensor dimensions starting from axis + var copyBlockLengths = new long[tensors.Length]; + for (int i = 0; i < tensors.Length; ++i) + copyBlockLengths[i] = tensors[i].shape.ToArray().Skip(tensors[i].shape.Axis(axis)).Aggregate(1L, (a, b) => (long)a * (long)b); + + // copy tensor data interleaved into O + int intDstIndex = 0; + var dstArray = O.data.SharedAccess(out intDstIndex); + long dstIndex = intDstIndex; + long takes = concatShape.ToArray().Take(concatShape.Axis(axis)).Aggregate(1L, (a, b) => (long)a * (long)b); + for (int take = 0; take < takes; ++take) + for (int i = 0; i < tensors.Length; ++i) + { + var copyLength = copyBlockLengths[i]; + + Array.Copy(tensors[i].readonlyArray, srcIndices[i], // from + dstArray, dstIndex, copyLength); // to + + srcIndices[i] += copyLength; + dstIndex += copyLength; + } + + O.data.Upload(dstArray, 0); + return O; + } + + public virtual Tensor StridedSlice(Tensor X, int[] starts, int[] ends, int[] stride) + { + Assert.AreEqual(starts.Length, 4); + Assert.AreEqual(ends.Length, 4); + Assert.AreEqual(stride.Length, 4); + + var O = NewTensor(X.shape.ApplyStridedSlice(starts, ends, stride)); + + var startB = TensorExtensions.WrapIndex(starts[0], X.batch); + var startY = TensorExtensions.WrapIndex(starts[1], X.height); + var startX = TensorExtensions.WrapIndex(starts[2], X.width); + var startC = TensorExtensions.WrapIndex(starts[3], X.channels); + + for (int b = 0; b < O.batch; ++b) + for (int y = 0; y < O.height; ++y) + for (int x = 0; x < O.width; ++x) + for (int c = 0; c < O.channels; ++c) + O[b, y, x, c] = X[ + startB + b * stride[0], + startY + y * stride[1], + startX + x * stride[2], + startC + c * stride[3]]; + return O; + } + + public virtual Tensor Tile(Tensor X, int[] repeats) + { + Assert.AreEqual(repeats.Length, 4); + var O = NewTensor(X.shape.Scale(repeats)); + + for (int b = 0; b < O.batch; ++b) + for (int y = 0; y < O.height; ++y) + for (int x = 0; x < O.width; ++x) + for (int c = 0; c < O.channels; ++c) + O[b, y, x, c] = X[ + b % repeats[0], + y % repeats[1], + x % repeats[2], + c % repeats[3]]; + return O; + } + + private Tensor ApplyElementwiseWithBroadcast(Tensor[] tensors, Func operation) + { + var O = GetOutputTensorFromBroadcast(tensors); + var A = tensors[0]; + for (int t = 1; t < tensors.Length; ++t) + { + var B = tensors[t]; + for (int b = 0; b < O.shape.batch; ++b) + { + for (int h = 0; h < O.shape.height; ++h) + { + for (int w = 0; w < O.shape.width; ++w) + { + for (int c = 0; c < O.shape.channels; ++c) + { + var valueA = A[A.IndexWithBroadcast(b, h, w, c)]; + var valueB = B[B.IndexWithBroadcast(b, h, w, c)]; + O[O.Index(b, h, w, c)] = operation(valueA, valueB); + } + } + } + } + A = O; + } + return O; + } + + // O = tensors[0] + tensors[1] + ... 
+ tensors[N-1] + public virtual Tensor Add(Tensor[] tensors) + { + Func op = (a, b) => a + b; + return ApplyElementwiseWithBroadcast(tensors, op); + } + + // O = tensors[0] - tensors[1] - ... - tensors[N-1] + public virtual Tensor Sub(Tensor[] tensors) + { + Func op = (a, b) => a - b; + return ApplyElementwiseWithBroadcast(tensors, op); + } + + // O = tensors[0] * tensors[1] * ... * tensors[N-1] + public virtual Tensor Mul(Tensor[] tensors) + { + Func op = (a, b) => a * b; + return ApplyElementwiseWithBroadcast(tensors, op); + } + // O = tensors[0] / tensors[1] / ... / tensors[N-1] + public virtual Tensor Div(Tensor[] tensors) + { + Func op = (a, b) => a / b; + return ApplyElementwiseWithBroadcast(tensors, op); + } + + // O = tensors[0] ^ tensors[1] ^ ... ^ tensors[N-1] + public virtual Tensor Pow(Tensor[] tensors) + { + Func op = (a, b) => Mathf.Pow(a, b); + return ApplyElementwiseWithBroadcast(tensors, op); + } + + // O = min(tensors[0], tensors[1], ... , tensors[N-1]) + public virtual Tensor Min(Tensor[] tensors) + { + Func op = (a, b) => Mathf.Min(a, b); + return ApplyElementwiseWithBroadcast(tensors, op); + } + + // O = max(tensors[0], tensors[1], ... , tensors[N-1]) + public virtual Tensor Max(Tensor[] tensors) + { + Func op = (a, b) => Mathf.Max(a, b); + return ApplyElementwiseWithBroadcast(tensors, op); + } + + // O = (1/N) * (tensors[0] + tensors[1] + ... + tensors[N-1]) + public virtual Tensor Mean(Tensor[] tensors) + { + // accumulate + Func op = (a, b) => a + b; + var O = ApplyElementwiseWithBroadcast(tensors, op); + + // div by N + var invN = 1.0f / tensors.Length; + var end = O.length; + for (int i = 0; i < O.length; ++i) + { + float v = O[i]; + v *= invN; + O[i] = v; + } + return O; + } + + public virtual Tensor ReduceMin(Tensor X, int axis) + { + if (axis != 3 && axis != -1) + throw new NotImplementedException(); + + var O = NewTensor(X.shape.Reduce(axis)); + var n = X.channels; + + for (int b = 0; b < O.batch; ++b) + for (int y = 0; y < O.height; ++y) + for (int x = 0; x < O.width; ++x) + { + float acc = float.MaxValue; + for (int c = 0; c < n; ++c) + acc = Mathf.Min(acc, X[b, y, x, c]); + O[b, y, x, 0] = acc; + } + return O; + } + + public virtual Tensor ReduceMax(Tensor X, int axis) + { + if (axis != 3 && axis != -1) + throw new NotImplementedException(); + + var O = NewTensor(X.shape.Reduce(axis)); + var n = X.channels; + + for (int b = 0; b < O.batch; ++b) + for (int y = 0; y < O.height; ++y) + for (int x = 0; x < O.width; ++x) + { + float acc = float.MinValue; + for (int c = 0; c < n; ++c) + acc = Mathf.Max(acc, X[b, y, x, c]); + O[b, y, x, 0] = acc; + } + return O; + } + + public virtual Tensor ReduceSum(Tensor X, int axis) + { + if (axis != 3 && axis != -1) + throw new NotImplementedException(); + + var O = NewTensor(X.shape.Reduce(axis)); + var n = X.channels; + + for (int b = 0; b < O.batch; ++b) + for (int y = 0; y < O.height; ++y) + for (int x = 0; x < O.width; ++x) + { + float acc = 0.0f; + for (int c = 0; c < n; ++c) + acc += X[b, y, x, c]; + O[b, y, x, 0] = acc; + } + return O; + } + + public virtual Tensor ReduceMean(Tensor X, int axis) + { + if (axis != 3 && axis != -1) + throw new NotImplementedException(); + + var O = NewTensor(X.shape.Reduce(axis)); + var n = X.channels; + + for (int b = 0; b < O.batch; ++b) + for (int y = 0; y < O.height; ++y) + for (int x = 0; x < O.width; ++x) + { + float acc = 0.0f; + for (int c = 0; c < n; ++c) + acc += X[b, y, x, c]; + O[b, y, x, 0] = acc / n; + } + return O; + } + + public virtual Tensor ReduceProd(Tensor X, int 
axis) + { + if (axis != 3 && axis != -1) + throw new NotImplementedException(); + + var O = NewTensor(X.shape.Reduce(axis)); + var n = X.channels; + + for (int b = 0; b < O.batch; ++b) + for (int y = 0; y < O.height; ++y) + for (int x = 0; x < O.width; ++x) + { + float acc = 1.0f; + for (int c = 0; c < n; ++c) + acc *= X[b, y, x, c]; + O[b, y, x, 0] = acc; + } + return O; + } + + private Tensor GetOutputTensorFromBroadcast(Tensor[] tensors) + { + Assert.IsTrue(tensors.Length > 0); + + var O = NewTensor(TensorExtensions.MaxShape(tensors)); + foreach (var t in tensors) + { + Assert.IsTrue((t.batch == 1) || (t.batch == O.batch)); + Assert.IsTrue((t.height == 1) || (t.height == O.height)); + Assert.IsTrue((t.width == 1) || (t.width == O.width)); + Assert.IsTrue((t.channels == 1) || (t.channels == O.channels)); + } + + return O; + } + + private Tensor ApplyLogicalOperator(Tensor tensorA, Tensor tensorB, Func logicOp) + { + var O = GetOutputTensorFromBroadcast(new Tensor[] { tensorA, tensorB }); + for (int b = 0; b < O.shape.batch; ++b) + { + for (int h = 0; h < O.shape.height; ++h) + { + for (int w = 0; w < O.shape.width; ++w) + { + for (int c = 0; c < O.shape.channels; ++c) + { + var A = tensorA[tensorA.IndexWithBroadcast(b, h, w, c)]; + var B = tensorB[tensorB.IndexWithBroadcast(b, h, w, c)]; + O[O.Index(b,h,w,c)] = logicOp(A,B); + } + } + } + } + + return O; + } + + public virtual Tensor Greater(Tensor A, Tensor B) + { + Func logicOp = (a, b) => Convert.ToSingle(a > b); + return ApplyLogicalOperator(A, B, logicOp); + } + public virtual Tensor GreaterEqual(Tensor A, Tensor B) + { + Func logicOp = (a, b) => Convert.ToSingle(a >= b); + return ApplyLogicalOperator(A, B, logicOp); + } + public virtual Tensor Less(Tensor A, Tensor B) + { + Func logicOp = (a, b) => Convert.ToSingle(a < b); + return ApplyLogicalOperator(A, B, logicOp); + } + public virtual Tensor LessEqual(Tensor A, Tensor B) + { + Func logicOp = (a, b) => Convert.ToSingle(a <= b); + return ApplyLogicalOperator(A, B, logicOp); + } + public virtual Tensor Equal(Tensor A, Tensor B) + { + Func logicOp = (a, b) => Convert.ToSingle(a == b); + return ApplyLogicalOperator(A, B, logicOp); + } + public virtual Tensor LogicalOr(Tensor A, Tensor B) + { + Func logicOp = (a, b) => Convert.ToSingle( Convert.ToBoolean(a) || Convert.ToBoolean(b) ); + return ApplyLogicalOperator(A, B, logicOp); + } + public virtual Tensor LogicalAnd(Tensor A, Tensor B) + { + Func logicOp = (a, b) => Convert.ToSingle( Convert.ToBoolean(a) && Convert.ToBoolean(b) ); + return ApplyLogicalOperator(A, B, logicOp); + } + public virtual Tensor LogicalXor(Tensor A, Tensor B) + { + Func logicOp = (a, b) => Convert.ToSingle( Convert.ToBoolean(a) ^ Convert.ToBoolean(b) ); + return ApplyLogicalOperator(A, B, logicOp); + } + public virtual Tensor LogicalNot(Tensor X) + { + var O = NewTensorLike(X); + var end = O.length; + for (int i = 0; i < end; ++i) + O[i] = Convert.ToSingle( !Convert.ToBoolean(X[i]) ); + return O; + } + + public virtual Tensor Flatten(Tensor X) + { + return X.Flatten(); + } + + public virtual Tensor Reshape(Tensor X, TensorShape newShape) + { + return X.Reshape(newShape); + } + + public virtual Tensor Transpose(Tensor X) + { + Assert.IsTrue(X.dimensions <= 2); + X = X.Flatten(); + + var O = NewTensor(X.flatWidth, X.flatHeight); + + for (int y = 0; y < O.flatHeight; ++y) + for (int x = 0; x < O.flatWidth; ++x) + O[y, x] = X[x, y]; + + return O; + } + + public virtual Tensor Prepare(Tensor X) + { + X.PrepareCacheForAccess(); + return X; + } +} + + public class 
MathfEx + { + public static float tanh(float x) + { + // tanh = (exp(2*x) - 1) / (exp(2*x) + 1) + + // Constant taken from http://llvm.org/svn/llvm-project/libclc/trunk/generic/lib/math/tanh.cl + // const float large_threshold = 0x1.0a2b24p+3f; + const float LargeThreshold = 8.317766f; + + // See also: https://stackoverflow.com/questions/34835641/tanh-returning-nan-for-large-input + + // Handle edge-cases to prevent NaNs creeping in + if (x >= LargeThreshold || x <= -LargeThreshold) + return Mathf.Sign(x); + + float exp2 = Mathf.Exp(2f * x); + return (exp2 - 1f) / (exp2 + 1f); + } + } + +} // namespace Barracuda diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaReferenceCPU.cs.meta b/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaReferenceCPU.cs.meta new file mode 100644 index 0000000..d12ae3a --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaReferenceCPU.cs.meta @@ -0,0 +1,12 @@ +fileFormatVersion: 2 +guid: e7398940fb81d45ee8e648e0b0f467f2 +timeCreated: 1503433373 +licenseType: Pro +MonoImporter: + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaReferenceCompute.cs b/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaReferenceCompute.cs new file mode 100644 index 0000000..5ebb8b4 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaReferenceCompute.cs @@ -0,0 +1,1456 @@ +//#define DEBUG_TRACK_ALLOCATIONS + +using UnityEngine; +using UnityEngine.Rendering; +using UnityEngine.Experimental.Rendering; // AsyncGPUReadback +using UnityEngine.Assertions; +using UnityEngine.Profiling; +using System; +using System.Linq; +using System.Collections.Generic; +using System.Diagnostics; + + +namespace Barracuda { + +public class ComputeTensorData : ITensorData +{ + private bool m_DisposeBufferAfterUse; + private ComputeBuffer m_Buffer; + private TensorShape m_Shape; + private int m_Offset; + + public ComputeBuffer buffer { get { return m_Buffer; } } + public TensorShape shape { get { return m_Shape; } } + public int offset { get { return m_Offset; } } + public string name; + +#if DEBUG_TRACK_ALLOCATIONS + protected StackTrace m_AllocationTrace; +#endif + + public ComputeTensorData(TensorShape shape, string buffername, bool clearOnInit = true) + { + name = buffername; + m_Buffer = new ComputeBuffer(shape.length, sizeof(float)); + + // @TODO: consider zero initialization only for "debug" mode + if (clearOnInit) + { + float[] zeros = new float[shape.length]; + m_Buffer.SetData(zeros); + } + + m_Shape = shape; + m_Offset = 0; + + m_DisposeBufferAfterUse = true; + +#if DEBUG_TRACK_ALLOCATIONS + m_AllocationTrace = new System.Diagnostics.StackTrace(); +#endif + } + + protected ComputeTensorData(ComputeBuffer buffer, TensorShape shape, int offset, string buffername) + { + name = buffername; + m_Buffer = buffer; + m_Shape = shape; + m_Offset = offset; + + m_DisposeBufferAfterUse = false; + } + + ~ComputeTensorData() + { + if (m_Buffer == null) + return; + if (!m_DisposeBufferAfterUse) + return; + + D.LogWarning("Found undisposed " + ToString() + ". 
Disposing!"); + + Dispose(); + } + + public virtual void Dispose() + { + if (m_DisposeBufferAfterUse) + { + m_Buffer.Dispose(); + m_Buffer = null; + } + m_DisposeBufferAfterUse = false; + } + + public virtual void Reserve(int count) + { + if (m_Offset + count > GetMaxCount()) + throw new ArgumentException("ComputeTensorData buffer is too small to reserve " + count + " elements."); + } + + public virtual void Upload(float[] data, int offset = 0, int count = -1) + { + Assert.IsTrue(offset >= 0); + if (count < 0) + count = Math.Min(GetMaxCount(), data.Length) - offset; + Assert.IsTrue(offset + count <= data.Length); + + m_Buffer.SetData(data, offset, m_Offset, count); + #if UNITY_2018 + m_AsyncDownloadRequested = false; + #endif + } + + #if UNITY_2018 + private bool m_AsyncDownloadRequested = false; + private AsyncGPUReadbackRequest m_AsyncDownloadRequest; + public virtual bool ScheduleAsyncDownload(int count) + { + if (!SystemInfo.supportsAsyncGPUReadback) + return true; + + if (!m_AsyncDownloadRequested) + { + m_AsyncDownloadRequest = AsyncGPUReadback.Request(m_Buffer); + m_AsyncDownloadRequested = true; + } + else + m_AsyncDownloadRequest.Update(); + return m_AsyncDownloadRequest.done; + } + #else + public virtual bool ScheduleAsyncDownload(int count) + { + return true; + } + #endif + + public virtual float[] Download(int count) + { + //;;D.logStackTraceEnabled = true; + //;;Debug.Log("Download ComputeTensorData " + name + + GetMaxCount() + " " + count); + //;;D.logStackTraceEnabled = false; + + Profiler.BeginSample("Barracuda.DownloadDataFromGPU"); + Assert.IsTrue(GetMaxCount() >= count); + count = Math.Min(GetMaxCount(), count); + + #if UNITY_2018 + if (m_AsyncDownloadRequested) + { + m_AsyncDownloadRequested = false; + m_AsyncDownloadRequest.WaitForCompletion(); + Profiler.EndSample(); + + if (!m_AsyncDownloadRequest.hasError) + return m_AsyncDownloadRequest.GetData().ToArray(); + } + #endif + + var data = new float[count]; + m_Buffer.GetData(data, 0, m_Offset, count); + Profiler.EndSample(); + + return data; + } + + public virtual float[] SharedAccess(out int offset) + { + offset = m_Offset; + return Download(GetMaxCount()); + } + + public virtual int GetMaxCount() + { + return m_Buffer.count; + } + + public override string ToString() + { + string allocationSource = ""; + +#if DEBUG_TRACK_ALLOCATIONS + allocationSource += "\nSource:\n" + m_AllocationTrace; +#endif + + return string.Format("(GPU:{0}#{1} {2} buffer: {3} created at: {4})", + name, GetHashCode(), m_Shape, m_Buffer, allocationSource); + } +} + +public class SharedComputeTensorData : ComputeTensorData +{ + public SharedComputeTensorData(ComputeBuffer buffer, TensorShape shape, int offset = 0, string buffername = "") : base(buffer, shape, offset, buffername) {} +} + +public class TextureFormatUtils +{ + public static bool IsRedOnly(TextureFormat format) + { + return format == TextureFormat.R8 || + format == TextureFormat.R16 || + format == TextureFormat.RHalf || + format == TextureFormat.RFloat; + } + + public static bool IsRedOnly(RenderTextureFormat format) + { + return format == RenderTextureFormat.R8 || + format == RenderTextureFormat.R16 || + format == RenderTextureFormat.RHalf || + format == RenderTextureFormat.RFloat; + } + + public static bool IsRedGreen(TextureFormat format) + { + return format == TextureFormat.RG16 || + format == TextureFormat.RGHalf || + format == TextureFormat.RGFloat; + } + + public static bool IsRedGreen(RenderTextureFormat format) + { + return format == RenderTextureFormat.RG16 || + format == 
RenderTextureFormat.RGHalf || + format == RenderTextureFormat.RGFloat; + } + + public static bool IsAlphaOnly(Texture tex) + { + var tex2D = tex as Texture2D; + var texArr = tex as Texture2DArray; + var tex3D = tex as Texture3D; + if (tex2D != null) + return tex2D.format == TextureFormat.Alpha8; + else if (texArr != null) + return texArr.format == TextureFormat.Alpha8; + else if (tex3D != null) + return tex3D.format == TextureFormat.Alpha8; + else + return false; + } + + public static bool IsRedOnly(Texture tex) + { + var tex2D = tex as Texture2D; + var texArr = tex as Texture2DArray; + var tex3D = tex as Texture3D; + var rt = tex as RenderTexture; + + if (tex2D != null) + return IsRedOnly(tex2D.format); + else if (texArr != null) + return IsRedOnly(texArr.format); + else if (tex3D != null) + return IsRedOnly(tex3D.format); + else if (rt != null) + return IsRedOnly(rt.format); + else + return false; + } + + public static bool IsRedGreen(Texture tex) + { + var tex2D = tex as Texture2D; + var texArr = tex as Texture2DArray; + var tex3D = tex as Texture3D; + var rt = tex as RenderTexture; + + if (tex2D != null) + return IsRedGreen(tex2D.format); + else if (texArr != null) + return IsRedGreen(texArr.format); + else if (tex3D != null) + return IsRedGreen(tex3D.format); + else if (rt != null) + return IsRedGreen(rt.format); + else + return false; + } + + + public static Color FormatToChannelMask(Texture tex, int interpretPixelAsChannels) + { + switch (interpretPixelAsChannels) + { + case 1: + if (IsRedOnly(tex)) + return new Color(1,0,0,0); + if (IsAlphaOnly(tex)) + return new Color(0,0,0,1); + // TODO: known issue, doesn't handle RG textures properly + return new Color(0,0,0,0); // see specialCaseWhenChannelMaskIsEmptyStoresAverage + case 2: + return new Color(1,1,0,0); + case 3: + return new Color(1,1,1,0); + case 4: + default: + return new Color(1,1,1,1); + } + } + + public static Color FormatToChannelMask(Texture tex) + { + if (IsRedOnly(tex)) + return new Color(1,0,0,1); + if (IsRedGreen(tex)) + return new Color(1,1,0,1); + if (IsAlphaOnly(tex)) + return new Color(0,0,0,1); + return new Color(1,1,1,1); + } +} + +public class TextureAsTensorData : ITensorData +{ + public enum Flip + { + None, + Y, + } + + public enum InterpretDepthAs + { + Batch, + Channels, + } + + public enum InterpretColorAs + { + AverageMultipleChannels, + // TODO: PickFirstChannel, + } + + private TensorShape m_Shape; + private Texture[] m_Textures; + private int m_InterpretPixelAsChannels; + private InterpretDepthAs m_InterpretDepthAs; + private InterpretColorAs m_InterpretColorAs; + private Flip m_Flip; + + public TensorShape shape { get { return m_Shape; } } + public Texture[] textures { get { return m_Textures; } } + public int interpretPixelAsChannels { get { return m_InterpretPixelAsChannels; } } + public InterpretDepthAs interpretDepthAs { get { return m_InterpretDepthAs; } } + public InterpretColorAs interpretColorAs { get { return m_InterpretColorAs; } } + public Flip flip { get { return m_Flip; } } + + + public TextureAsTensorData(Texture[] textures, int interpretPixelAsChannels = 3, + Flip flip = Flip.Y, InterpretDepthAs depthAs = InterpretDepthAs.Batch, InterpretColorAs colorAs = InterpretColorAs.AverageMultipleChannels) + { + m_InterpretPixelAsChannels = interpretPixelAsChannels; + m_InterpretDepthAs = depthAs; + m_InterpretColorAs = colorAs; + m_Flip = flip; + + if (textures.Length < 1) + throw new ArgumentException("Textures array must be non empty"); + + var width = textures[0].width; + var height = 
textures[0].height; + + var totalDepth = 0; + foreach (var tex in textures) + { + if (tex.width != width || tex.height != height) + throw new ArgumentException("All textures must have the same width and height dimensions"); + + var tex2D = tex as Texture2D; + var texArr = tex as Texture2DArray; + var tex3D = tex as Texture3D; + var rt = tex as RenderTexture; + if (tex2D) + totalDepth += 1; + else if (texArr) + totalDepth += texArr.depth; + else if (tex3D) + totalDepth += tex3D.depth; + else if (rt) + totalDepth += rt.volumeDepth; + else + throw new InvalidOperationException("Unsupported texture type"); + } + m_Textures = textures; + + int batch = 1; + int channels = interpretPixelAsChannels; + if (m_InterpretDepthAs == InterpretDepthAs.Batch) + batch *= totalDepth; + else if (m_InterpretDepthAs == InterpretDepthAs.Channels) + channels *= totalDepth; + + m_Shape = new TensorShape(batch, height, width, channels); + } + + public TextureAsTensorData(Texture texture, int interpretPixelAsChannels = 3, + Flip flip = Flip.Y, InterpretDepthAs depthAs = InterpretDepthAs.Batch, InterpretColorAs colorAs = InterpretColorAs.AverageMultipleChannels) + : this(new [] { texture }, interpretPixelAsChannels, flip, depthAs, colorAs) {} + + public virtual void Reserve(int count) + { + // currently always readonly + throw new InvalidOperationException("TextureAsTensorData is readonly"); + } + + public virtual void Upload(float[] data, int offset = 0, int count = -1) + { + // currently always readonly + throw new InvalidOperationException("TextureAsTensorData is readonly"); + } + + static void ProcessLine(Color[] pixels, int srcOffset, int srcWidth, Color srcChannelMask, float[] dstArray, int dstOffset, Color dstChannelMask, int channelStride) + { + for (var x = 0; x < srcWidth; ++x) + { + var p = pixels[srcOffset + x]; + var dst = dstOffset; + if (dstChannelMask[0] > 0) dstArray[dst++] = p.r * srcChannelMask[0]; + if (dstChannelMask[1] > 0) dstArray[dst++] = p.g * srcChannelMask[1]; + if (dstChannelMask[2] > 0) dstArray[dst++] = p.b * srcChannelMask[2]; + if (dstChannelMask[3] > 0) dstArray[dst++] = p.a * srcChannelMask[3]; + var specialCaseWhenChannelMaskIsEmptyStoresAverage = (dst == dstOffset); + if (specialCaseWhenChannelMaskIsEmptyStoresAverage) + dstArray[dst++] = (p.r + p.g + p.b) / 3; + + dstOffset += channelStride; + } + } + + public virtual bool ScheduleAsyncDownload(int count) + { + return true; + } + + public virtual float[] Download(int count) + { + //;;D.logStackTraceEnabled = true; + //;;Debug.Log("Download TextureAsTensorData " + name + " " + count + " @ " + ToString()); + //;;D.logStackTraceEnabled = false; + + Assert.AreEqual(shape.length, count); + var data = new float[shape.length]; + int batch = 0; + var dstChannel = 0; + foreach (var tex in m_Textures) + { + var tex2D = tex as Texture2D; + var texArr = tex as Texture2DArray; + var tex3D = tex as Texture3D; + // Source channel mask is a workaround - since Unity API that does not adhere to DX/GL standard when reading from 1 channel textures! + var srcChannelMask = TextureFormatUtils.FormatToChannelMask(tex); + var dstChannelMask = TextureFormatUtils.FormatToChannelMask(tex, m_InterpretPixelAsChannels); + + if (tex2D) + { + var pixels = tex2D.GetPixels(0); + + for (var y = 0; y < tex.height; ++y) + { + var srcOffset = y * tex.width; + var dstY = (m_Flip == Flip.Y) ? 
tex.height - y - 1: y; + var dstOffset = shape.Index(batch, dstY, 0, dstChannel); + ProcessLine(pixels, srcOffset, tex.width, srcChannelMask, data, dstOffset, dstChannelMask, shape.channels); + } + + if (m_InterpretDepthAs == InterpretDepthAs.Batch) + batch += 1; + else if (m_InterpretDepthAs == InterpretDepthAs.Channels) + dstChannel += m_InterpretPixelAsChannels; + } + else if (texArr) + { + for (var z = 0; z < texArr.depth; ++z) + { + var pixels = texArr.GetPixels(z, 0); + + D.Log(dstChannel); + for (var y = 0; y < tex.height; ++y) + { + var srcOffset = y * tex.width; + var dstY = (m_Flip == Flip.Y) ? tex.height - y - 1: y; + var dstOffset = shape.Index(batch, dstY, 0, dstChannel); + ProcessLine(pixels, srcOffset, tex.width, srcChannelMask, data, dstOffset, dstChannelMask, shape.channels); + } + + if (m_InterpretDepthAs == InterpretDepthAs.Batch) + batch += 1; + else if (m_InterpretDepthAs == InterpretDepthAs.Channels) + dstChannel += m_InterpretPixelAsChannels; + } + } + else if (tex3D) + { + var pixels = tex3D.GetPixels(0); + for (var z = 0; z < tex3D.depth; ++z) + { + for (var y = 0; y < tex.height; ++y) + { + var srcOffset = z * tex.height + y * tex.width; + var dstY = (m_Flip == Flip.Y) ? tex.height - y - 1: y; + var dstOffset = shape.Index(batch, dstY, 0, dstChannel); + ProcessLine(pixels, srcOffset, tex.width, srcChannelMask, data, dstOffset, dstChannelMask, shape.channels); + } + + if (m_InterpretDepthAs == InterpretDepthAs.Batch) + batch += 1; + else if (m_InterpretDepthAs == InterpretDepthAs.Channels) + dstChannel += m_InterpretPixelAsChannels; + } + } + else + throw new InvalidOperationException("Unsupported texture type for automatic readback to CPU"); + } + + return data; + } + + public virtual float[] SharedAccess(out int offset) + { + offset = 0; + return Download(shape.length); + } + + public virtual int GetMaxCount() + { + return m_Shape.length; + } + + public virtual void Dispose() + { + } +} + +public class ReferenceComputeOps : ReferenceCPUOps +{ + private ComputeShader m_Kernels; + private float[] m_SyncBuffer = new float[1]; + + public ReferenceComputeOps(ComputeShader kernels, ITensorAllocator allocator = null) + : base(allocator) + { + m_Kernels = kernels; + } + + public ComputeTensorData Pin(Tensor X) + { + X.FlushCache(); + + var onDevice = X.tensorOnDevice as ComputeTensorData; + if (onDevice == null) + { + var asTexture = X.tensorOnDevice as TextureAsTensorData; + if (asTexture != null) + X.PinToDeviceAndDownloadFromIt(TextureToTensorData(asTexture, X.name)); + else + X.PinToDeviceAndUploadToIt(new ComputeTensorData(X.shape, X.name)); + } + + Assert.IsNotNull(X.tensorOnDevice as ComputeTensorData); + Assert.IsNotNull((X.tensorOnDevice as ComputeTensorData).buffer); + + return X.tensorOnDevice as ComputeTensorData; + } + + public override void WaitForCompletion(Tensor x) + { + var data = x.tensorOnDevice as ComputeTensorData; + + if (data != null) + { + data.buffer.GetData(m_SyncBuffer, 0, 0, 1); + } + } + + public void SetTensor(ComputeFunc fn, string name, Tensor X) + { + var XonDevice = Pin(X); + fn.SetTensor(name, X.shape, XonDevice.buffer, XonDevice.offset); + } + + public Tensor NewTensor(ComputeFunc fn, string name, TensorShape shape) + { + var o = NewTensor(shape, name); + fn.SetTensor(name, shape, Pin(o).buffer); + return o; + } + + public Tensor Dispatch(ComputeFunc fn, TensorShape outputShape, int workItemsX, int workItemsY, int workItemsZ, string outputName = "O") + { + var o = NewTensor(fn, outputName, outputShape); + fn.Dispatch(workItemsX, 
workItemsY, workItemsZ); + return o; + } + + // --------------------------------------------------------------------------------- + + protected ITensorData TextureToTensorData(TextureAsTensorData texData, string name) + { + var fn = new ComputeFunc(m_Kernels, "TextureToTensor"); + var tensorData = new ComputeTensorData(texData.shape, name, false); + + fn.SetTensor("O", texData.shape, tensorData.buffer); + fn.shader.SetBool("_FlipY", texData.flip == TextureAsTensorData.Flip.Y); + + var offsets = new int[] { 0,0,0,0 }; + foreach (var tex in texData.textures) + { + var texArr = tex as Texture2DArray; + var tex3D = tex as Texture3D; + var rt = tex as RenderTexture; + + var texDepth = 1; + if (texArr) + texDepth = texArr.depth; + else if (tex3D) + texDepth = tex3D.depth; + else if (rt) + texDepth = rt.volumeDepth; + + var srcChannelMask = TextureFormatUtils.FormatToChannelMask(tex, texData.interpretPixelAsChannels); + + fn.SetTexture("X", tex); + fn.shader.SetInts("_Pool", new int [] {tex.width, tex.height, 1, 1}); + fn.shader.SetInts("_Pad", offsets); + fn.shader.SetInts("_Stride", new [] {(int)srcChannelMask[0], (int)srcChannelMask[1], (int)srcChannelMask[2], (int)srcChannelMask[3] }); + + fn.Dispatch(texData.shape.width, texData.shape.height, texDepth); + + if (texData.interpretDepthAs == TextureAsTensorData.InterpretDepthAs.Batch) + offsets[0] += texDepth; + else if (texData.interpretDepthAs == TextureAsTensorData.InterpretDepthAs.Channels) + offsets[3] += texDepth * texData.interpretPixelAsChannels; + } + + return tensorData; + } + + public void TensorToRenderTexture(Tensor X, RenderTexture target, + int batch = 0, int fromChannel = 0, float scale = 1.0f, float bias = 0f) + { + if (!target.enableRandomWrite || !target.IsCreated()) + { + target.Release(); + target.enableRandomWrite = true; + target.Create(); + } + + var fn = new ComputeFunc(m_Kernels, "TensorToTexture"); + SetTensor(fn, "X", X); + fn.SetTexture("O", target); + fn.shader.SetFloat("_Alpha", scale); + fn.shader.SetFloat("_Beta", bias); + fn.shader.SetInts("_Pad", new int[] { batch, 0, 0, fromChannel }); + fn.shader.SetBool("_FlipY", true); + fn.Dispatch(target.width, target.height, 1); + } + + // --------------------------------------------------------------------------------- + + public override Tensor Dense(Tensor X, Tensor W, Tensor B) + { + Assert.IsTrue(W.dimensions <= 2); + Assert.AreEqual(B.flatWidth, B.length); + Assert.AreEqual(X.flatWidth, W.flatHeight); + + var O = new TensorShape(X.flatHeight, W.flatWidth); + + var fn = new ComputeFunc(m_Kernels, "Dense"); + + SetTensor(fn, "X", X); + SetTensor(fn, "W", W); + SetTensor(fn, "B", B); + + return Dispatch(fn, O, O.flatWidth, O.flatHeight, 1); + } + + static public int IDivC(int v, int div) + { + return (v + div - 1) / div; + } + + public override Tensor Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad) + { + Assert.AreEqual(X.channels, K.kernelDepth); + Assert.AreEqual(K.kernelCount, B.flatWidth); + Assert.AreEqual(B.flatWidth, B.length); + Assert.AreEqual(stride.Length, 2); + Assert.AreEqual(pad.Length, 4); + + var O = X.shape.ApplyKernel(K.shape, stride, pad); + + bool useWinograd = (K.width == 3) && (K.height == 3) && (stride[0] == 1); // only 3x3 kernel + useWinograd = useWinograd && (stride[0] == 1) && (pad[0] == 0) && (pad[0] == 0); // no support for padding and stride + if( useWinograd ) + { + var fnw = new ComputeFunc(m_Kernels, "Conv2DWinograd_2x2_3x3"); + SetTensor(fnw, "X", X); + SetTensor(fnw, "K", K); + SetTensor(fnw, "B", B); + 
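+ // @TODO: doublecheck the Winograd eligibility test above - it compares (pad[0] == 0) twice and may have been
+ // meant to verify that all four padding values are zero before taking this padding-free fast path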
fnw.shader.SetInts("_Stride", stride); + fnw.shader.SetInts("_Pad", pad); + + var ow = Dispatch(fnw, O, K.kernelCount, IDivC(O.width,2), IDivC(O.height,2)); + return ow; + } + + var fn = new ComputeFunc(m_Kernels, "Conv2D"); + + SetTensor(fn, "X", X); + SetTensor(fn, "K", K); + SetTensor(fn, "B", B); + fn.shader.SetInts("_Stride", stride); + fn.shader.SetInts("_Pad", pad); + + var o = Dispatch(fn, O, K.kernelCount, O.width, O.height); + return o; + } + + public override Tensor DepthwiseConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad) + { + if (K.kernelDepth != 1) + return base.DepthwiseConv2D(X, K, B, stride, pad); + + Assert.AreEqual(K.kernelDepth, 1); + Assert.AreEqual(K.kernelCount, X.channels); + Assert.AreEqual(K.kernelCount, B.flatWidth); + Assert.AreEqual(B.flatWidth, B.length); + Assert.AreEqual(stride.Length, 2); + Assert.AreEqual(pad.Length, 4); + + var O = X.shape.ApplyKernel(K.shape, stride, pad); + + var fn = new ComputeFunc(m_Kernels, "DepthwiseConv2D"); + + SetTensor(fn, "X", X); + SetTensor(fn, "K", K); + SetTensor(fn, "B", B); + fn.shader.SetInts("_Stride", stride); + fn.shader.SetInts("_Pad", pad); + + var o = Dispatch(fn, O, K.kernelCount, O.width, O.height); + return o; + } + + public override Tensor Conv2DTrans(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, int[] outputAdjustment) + { + Assert.AreEqual(X.channels, K.kernelDepth); + Assert.AreEqual(K.kernelCount, B.flatWidth); + Assert.AreEqual(B.flatWidth, B.length); + Assert.AreEqual(stride.Length, 2); + Assert.AreEqual(pad.Length, 4); + + var O = X.shape.ApplyKernelInverse(K.shape, stride, pad, outputAdjustment); + + // one pass version + pad = new int[] + { + K.kernelWidth - pad[0] - 1, K.kernelHeight - pad[1] - 1, + K.kernelWidth - pad[2] - 1, K.kernelHeight - pad[3] - 1 + }; + + var fn = new ComputeFunc(m_Kernels, "Conv2DTrans"); + + SetTensor(fn, "X", X); + SetTensor(fn, "K", K); + SetTensor(fn, "B", B); + fn.shader.SetInts("_Stride", stride); + fn.shader.SetInts("_Pad", pad); + + return Dispatch(fn, O, K.kernelCount, O.width, O.height); + } + + public override Tensor Upsample2D(Tensor X, int[] size) + { + Assert.AreEqual(size.Length, 2); + + var O = new TensorShape(X.batch, X.height*size[1], X.width*size[0], X.channels); + + var fn = new ComputeFunc(m_Kernels, "Upsample2D"); + + SetTensor(fn, "X", X); + + fn.shader.SetInts("_Pool", size); + + return Dispatch(fn, O, X.channels, X.width, X.height); + } + + protected virtual Tensor ApplyPadding(Tensor X, int[] pad, string kernelName, float constant = 0.0f) + { + Assert.AreEqual(pad.Length, 4); + + var O = X.shape.ApplyBorder(pad); + + var fn = new ComputeFunc(m_Kernels, kernelName); + + SetTensor(fn, "X", X); + + fn.shader.SetInts("_Pad", pad); + fn.shader.SetInts("_Stride", X.shape.ToArray()); + + if (kernelName == "Border2D") + { + // NOTE: negative "pad" variable will crop X tensor + int croppedWidth = X.width - Math.Max(0, -pad[2]); + int croppedHeight = X.height - Math.Max(0, -pad[3]); + var croppedSize = new int[] { 0, 0, 0, 0 }; + croppedSize[0] = croppedWidth; + croppedSize[1] = croppedHeight; + + fn.shader.SetInts("_Pool", croppedSize); + fn.shader.SetFloat("_Beta", constant); + } + + return Dispatch(fn, O, O.channels, O.width, O.height); + } + + public override Tensor Border2D(Tensor X, int[] pad, float constant) + { + return ApplyPadding(X, pad, "Border2D", constant); + } + + public override Tensor Pad2DEdge(Tensor X, int[] pad) + { + return ApplyPadding(X, pad, "Pad2DEdge"); + } + + public override Tensor Pad2DReflect(Tensor X, 
int[] pad) + { + return ApplyPadding(X, pad, "Pad2DReflect"); + } + + public override Tensor Pad2DSymmetric(Tensor X, int[] pad) + { + return ApplyPadding(X, pad, "Pad2DSymmetric"); + } + + protected virtual Tensor Pool2D(string kernelName, Tensor X, int[] pool, int[] stride, int[] pad) + { + Assert.AreEqual(pool.Length, 2); + Assert.AreEqual(stride.Length, 2); + + var O = X.shape.ApplyPool(pool, stride, pad); + + var fn = new ComputeFunc(m_Kernels, kernelName); + + SetTensor(fn, "X", X); + fn.shader.SetInts("_Pool", pool); + fn.shader.SetInts("_Stride", stride); + fn.shader.SetInts("_Pad", pad); + + return Dispatch(fn, O, O.channels, O.width, O.height); + } + + public override Tensor MaxPool2D(Tensor X, int[] pool, int[] stride, int[] pad) + { + return Pool2D("MaxPool2D", X, pool, stride, pad); + } + + public override Tensor AvgPool2D(Tensor X, int[] pool, int[] stride, int[] pad) + { + return Pool2D("AvgPool2D", X, pool, stride, pad); + } + + protected virtual Tensor GlobalPool2D(string kernelName, Tensor X) + { + var O = new TensorShape(X.batch, 1, 1, X.channels); + + var fn = new ComputeFunc(m_Kernels, kernelName); + + SetTensor(fn, "X", X); + + return Dispatch(fn, O, O.channels, 1, 1); + } + + public override Tensor GlobalMaxPool2D(Tensor X) + { + return GlobalPool2D("GlobalMaxPool2D", X); + } + + public override Tensor GlobalAvgPool2D(Tensor X) + { + return GlobalPool2D("GlobalAvgPool2D", X); + } + + public override Tensor GlobalAvgVariancePool2D(Tensor X) + { + var O = new TensorShape(X.batch, 2, 1, X.channels); + + var fn = new ComputeFunc(m_Kernels, "GlobalAvgVariancePool2D"); + + SetTensor(fn, "X", X); + + return Dispatch(fn, O, O.channels, 1, 1); + } + + public override Tensor ScaleBias(Tensor X, Tensor S, Tensor B) + { + Assert.AreEqual(X.channels, B.channels); Assert.AreEqual(X.channels, S.channels); + Assert.AreEqual(B.length, B.channels); Assert.AreEqual(S.length, S.channels); + + var O = X.shape; + var fn = new ComputeFunc(m_Kernels, "ScaleBias"); + + SetTensor(fn, "X", X); + SetTensor(fn, "W", S); + SetTensor(fn, "B", B); + + return Dispatch(fn, O, O.channels, O.width, O.height); + } + + public override Tensor Normalization(Tensor X, Tensor S, Tensor B, int pool, int axis, float epsilon) + { + if (axis != 3 && axis != -1) + return base.Normalization(X, S, B, pool, axis, epsilon); + + if (pool == 1 && X.batch != 1) + return base.Normalization(X, S, B, pool, axis, epsilon); // @TODO: Instance Normalization with batch > 1 + + if (pool <= 0) + pool = X.batch; + + var O = X.shape; + var fn = new ComputeFunc(m_Kernels, "InstanceNorm"); + fn.shader.SetFloat("_Epsilon", epsilon); + + SetTensor(fn, "X", X); + SetTensor(fn, "W", S); + SetTensor(fn, "B", B); + + return Dispatch(fn, O, O.channels, 1, 1); + } + + // @TODO: debug & fix + public override Tensor Dropout(Tensor X, float alpha) + { + Assert.IsTrue(alpha >= 0f && alpha <= 1f); + + var O = X.shape; + var fn = new ComputeFunc(m_Kernels, "Dropout"); + + SetTensor(fn, "X", X); + fn.shader.SetFloat("_Alpha", alpha); + fn.shader.SetFloat("_Seed", UnityEngine.Random.value); + + return Dispatch(fn, O, O.channels, O.width, O.height); + } + + protected virtual Tensor Activation(string kernelName, Tensor X, float alpha = 0f, float beta = 0f) + { + var O = X.shape; + var fn = new ComputeFunc(m_Kernels, kernelName); + + SetTensor(fn, "X", X); + fn.shader.SetFloat("_Alpha", alpha); + fn.shader.SetFloat("_Beta", beta); + + return Dispatch(fn, O, O.channels, O.width, O.height); + } + + public override Tensor Relu(Tensor X) + { + return 
Activation("Relu", X); + } + + public override Tensor Selu(Tensor X, float alpha, float gamma) + { + return Activation("Selu", X, alpha, gamma); + } + + public override Tensor Neg(Tensor X) + { + return Activation("Neg", X); + } + + public override Tensor Swish(Tensor X) + { + return Activation("Swish", X); + } + + public override Tensor Tanh(Tensor X) + { + return Activation("Tanh", X); + } + + public override Tensor Sigmoid(Tensor X) + { + return Activation("Sigmoid", X); + } + + public override Tensor Elu(Tensor X, float alpha) + { + return Activation("Elu", X, alpha); + } + + public override Tensor Relu6(Tensor X) + { + return Activation("Relu6", X); + } + + public override Tensor LeakyRelu(Tensor X, float alpha) + { + return Activation("LeakyRelu", X, alpha); + } + + public override Tensor PRelu(Tensor X, Tensor S) + { + Assert.IsTrue((X.flatWidth == S.flatWidth) || (S.flatWidth == 1)); + + var O = X.shape; + var fn = new ComputeFunc(m_Kernels, "PRelu"); + + SetTensor(fn, "X", X); + SetTensor(fn, "W", S); + + return Dispatch(fn, O, O.channels, O.width, O.height); + } + + public override Tensor Exp(Tensor X) + { + return Activation("Exp", X); + } + + public override Tensor Log(Tensor X) + { + return Activation("Log", X); + } + + public override Tensor Sqrt(Tensor X) + { + return Activation("Sqrt", X); + } + + public override Tensor Pow(Tensor X, float alpha) + { + return Activation("Pow", X, alpha); + } + + public override Tensor Clip(Tensor X, float min, float max) + { + return Activation("Clip", X, min, max); + } + + public override Tensor Softmax(Tensor X) + { + var O = X.shape.Flatten(); + + var fn = new ComputeFunc(m_Kernels, "Softmax"); + + SetTensor(fn, "X", X); + + return Dispatch(fn, O, O.flatWidth, O.flatHeight, 1); + } + + public override Tensor LogSoftmax(Tensor X) + { + var O = X.shape.Flatten(); + + var fn = new ComputeFunc(m_Kernels, "LogSoftmax"); + + SetTensor(fn, "X", X); + + return Dispatch(fn, O, O.flatWidth, O.flatHeight, 1); + } + + public override Tensor Concat(Tensor[] tensors, int axis) + { + if (axis != 3 && axis != -1) + return base.Concat(tensors, axis); + + foreach (var X in tensors) + if (X.shape.rank != 4) + return base.Concat(tensors, axis); + + var O = TensorExtensions.Concat(tensors.Select(t => t.shape).ToArray(), axis); + var offsets = new int[] { 0,0,0,0 }; + axis = O.Axis(axis); + + var fn = new ComputeFunc(m_Kernels, "Copy"); + var result = NewTensor(fn, "O", O); + foreach (var X in tensors) + { + SetTensor(fn, "X", X); + fn.shader.SetInts("_Pad", offsets); + fn.Dispatch(X.channels, X.width, X.height); + + offsets[axis] += X.shape[axis]; + } + return result; + } + + public virtual Tensor ElementwiseWithBroadcast(string kernelName, Tensor[] tensors) + { + var O = TensorExtensions.MaxShape(tensors); + + Assert.IsTrue(tensors.Length > 0); + var X = tensors[0]; + + var fn = new ComputeFunc(m_Kernels, kernelName); + for (int t = 1; t < tensors.Length; ++t) + { + var B = tensors[t]; + + SetTensor(fn, "X", X); + SetTensor(fn, "B", B); + X = Dispatch(fn, O, O.channels, O.width, O.height); + } + + return X; + } + + public override Tensor Add(Tensor[] tensors) + { + return ElementwiseWithBroadcast("BroadcastAdd", tensors); + } + + public override Tensor Sub(Tensor[] tensors) + { + return ElementwiseWithBroadcast("BroadcastSub", tensors); + } + + public override Tensor Mul(Tensor[] tensors) + { + return ElementwiseWithBroadcast("BroadcastMul", tensors); + } + + public override Tensor Div(Tensor[] tensors) + { + return ElementwiseWithBroadcast("BroadcastDiv", 
tensors); + } + + public override Tensor Pow(Tensor[] tensors) + { + return ElementwiseWithBroadcast("BroadcastPow", tensors); + } + + public override Tensor Min(Tensor[] tensors) + { + return ElementwiseWithBroadcast("BroadcastMin", tensors); + } + + public override Tensor Max(Tensor[] tensors) + { + return ElementwiseWithBroadcast("BroadcastMax", tensors); + } + + public override Tensor Greater(Tensor A, Tensor B) + { + return ElementwiseWithBroadcast("BroadcastGreater", new Tensor[] { A, B }); + } + + public override Tensor GreaterEqual(Tensor A, Tensor B) + { + return ElementwiseWithBroadcast("BroadcastGreaterEqual", new Tensor[] { A, B }); + } + + public override Tensor Less(Tensor A, Tensor B) + { + return ElementwiseWithBroadcast("BroadcastLess", new Tensor[] { A, B }); + } + + public override Tensor LessEqual(Tensor A, Tensor B) + { + return ElementwiseWithBroadcast("BroadcastLessEqual", new Tensor[] { A, B }); + } + + public override Tensor Equal(Tensor A, Tensor B) + { + return ElementwiseWithBroadcast("BroadcastEqual", new Tensor[] { A, B }); + } + + public override Tensor LogicalOr(Tensor A, Tensor B) + { + return ElementwiseWithBroadcast("BroadcastLogicalOr", new Tensor[] { A, B }); + } + + public override Tensor LogicalAnd(Tensor A, Tensor B) + { + return ElementwiseWithBroadcast("BroadcastLogicalAnd", new Tensor[] { A, B }); + } + + public override Tensor LogicalXor(Tensor A, Tensor B) + { + return ElementwiseWithBroadcast("BroadcastLogicalXor", new Tensor[] { A, B }); + } + + public override Tensor LogicalNot(Tensor X) + { + return Activation("LogicalNot", X); + } + + public virtual Tensor Reduce(string kernelName, Tensor X, int axis) + { + if (axis != 3 && axis != -1) + throw new NotImplementedException(); + + var O = X.shape.Reduce(axis); + Assert.AreEqual(O.channels, 1); + + var fn = new ComputeFunc(m_Kernels, kernelName); + SetTensor(fn, "X", X); + + return Dispatch(fn, O, O.width, O.height, 1); + } + + public override Tensor ReduceMin(Tensor X, int axis) + { + return Reduce("ReduceMin", X, axis); + } + + public override Tensor ReduceMax(Tensor X, int axis) + { + return Reduce("ReduceMax", X, axis); + } + + public override Tensor ReduceSum(Tensor X, int axis) + { + return Reduce("ReduceSum", X, axis); + } + + public override Tensor ReduceMean(Tensor X, int axis) + { + return Reduce("ReduceMean", X, axis); + } + + public override Tensor ReduceProd(Tensor X, int axis) + { + return Reduce("ReduceProd", X, axis); + } + + public override Tensor Prepare(Tensor X) + { + Pin(X); + return X; + } +} + +public struct ComputeFunc +{ + // dispatch dimension limitation coming from D3D11 + const uint SafeDispatchLimit = 65535; + + public struct TensorDecl + { + public int ShapeId { get; } + public int InfoId { get; } + + public TensorDecl(int shapeId, int infoId) + { + ShapeId = shapeId; + InfoId = infoId; + } + } + + readonly public ComputeShader shader; + readonly public string kernelName; + readonly public int kernelIndex; + readonly public uint threadGroupSizeX; + readonly public uint threadGroupSizeY; + readonly public uint threadGroupSizeZ; + public uint threadGroupSize { get { return threadGroupSizeX * threadGroupSizeY * threadGroupSizeZ; } } + + public int width { get { return (int)threadGroupSizeX; } } + public int height { get { return (int)threadGroupSizeY; } } + public int depth { get { return (int)threadGroupSizeZ; } } + + static public TensorDecl GetTensorDecl(string name) + { + var shapeId = Shader.PropertyToID(s_StringCache.Lookup(name, "declShape")); + var infoId = 
Shader.PropertyToID(s_StringCache.Lookup(name, "declInfo")); + return new TensorDecl(shapeId, infoId); + } + static public int GetTensorData(string name ) { return Shader.PropertyToID(s_StringCache.Lookup(name, "data")); } + + static private StringCache s_StringCache = new StringCache(); + + static private Texture2D s_DummyTexture2D; + static private Texture3D s_DummyTexture3D; + static private Texture2DArray s_DummyTexture2DArray; + + static private Texture2D dummyTexture2D { + get + { + if (s_DummyTexture2D == null) + s_DummyTexture2D = new Texture2D(8, 8); + return s_DummyTexture2D; + } + } + + static private Texture3D dummyTexture3D + { + get + { + if (s_DummyTexture3D == null) + s_DummyTexture3D = new Texture3D(8, 8, 1, TextureFormat.ARGB32, false); + return s_DummyTexture3D; + } + } + + static private Texture2DArray dummyTexture2DArray + { + get + { + if (s_DummyTexture2DArray == null) + s_DummyTexture2DArray = new Texture2DArray(8, 8, 1, TextureFormat.ARGB32, false); + return s_DummyTexture2DArray; + } + } + + // --------------------------------------------------------------------------------- + + public ComputeFunc(ComputeShader cs, string[] kns, int x, int y = 1, int z = 1) + : this(cs, FindBestKernelMatchingDimensions(new [] { cs }, kns, x, y, z)) + { + } + + public ComputeFunc(ComputeShader cs, string kn) + : this(new [] { cs }, kn) + { + } + + public ComputeFunc(ComputeShader[] cs, string[] kns, int x, int y = 1, int z = 1) + : this(cs, FindBestKernelMatchingDimensions(cs, kns, x, y, z)) + { + } + + public ComputeFunc(ComputeShader[] cs, string kn) + { + foreach (ComputeShader s in cs) + if (s != null && s.HasKernel(kn)) + { + shader = s; + kernelName = kn; + kernelIndex = shader.FindKernel(kernelName); + shader.GetKernelThreadGroupSizes(kernelIndex, out threadGroupSizeX, out threadGroupSizeY, out threadGroupSizeZ); + return; + } + throw new ArgumentException("Kernel " + kn + " is missing"); + } + + // --------------------------------------------------------------------------------- + + public void SetTensor(string name, TensorShape shape, ComputeBuffer buffer, Int64 dataOffset = 0) + { + SetTensorDecl(name, shape, dataOffset); + SetTensorBuffer(name, buffer); + } + public void SetTensor(ComputeFunc.TensorDecl tensorDecl, int dataPropId, TensorShape shape, ComputeBuffer buffer, Int64 dataOffset = 0) + { + SetTensorDecl(tensorDecl, shape, dataOffset); + SetTensorBuffer(dataPropId, buffer); + } + + public void SetTensor(string name, TensorShape shape, Texture texture, Int64 dataOffset = 0) + { + SetTensorDecl(name, shape, dataOffset); + SetTexture(name, texture); + } + + public void SetTensorDecl(string name, TensorShape shape, Int64 dataOffset) + { + ComputeFunc.TensorDecl tensorDecl = GetTensorDecl(name); + shader.SetInts(tensorDecl.ShapeId, shape.batch, shape.height, shape.width, shape.channels ); + shader.SetInts(tensorDecl.InfoId, (int)dataOffset, shape.length); + } + + // WARN: SetTensorDecl() is not multi-thread safe due to s_TensorDeclScratchpad usage + // However there is no plan to call SetTensorDecl() from multiple threads + // NOTE: s_TensorDeclScratchpad is used to avoid memory allocation + static private int[] s_tTensorDeclScratchpadShape = new int[4]; + static private int[] s_tTensorDeclScratchpadInfo = new int[2]; + public void SetTensorDecl(ComputeFunc.TensorDecl tensorDecl, TensorShape shape, Int64 dataOffset) + { + s_tTensorDeclScratchpadShape[0] = shape.batch; + s_tTensorDeclScratchpadShape[1] = shape.height; + s_tTensorDeclScratchpadShape[2] = shape.width; + 
s_tTensorDeclScratchpadShape[3] = shape.channels; + s_tTensorDeclScratchpadInfo[0] = (int)dataOffset; + s_tTensorDeclScratchpadInfo[1] = shape.length; + shader.SetInts(tensorDecl.ShapeId, s_tTensorDeclScratchpadShape); + shader.SetInts(tensorDecl.InfoId, s_tTensorDeclScratchpadInfo); + } + + public void SetTensorBuffer(string name, ComputeBuffer buffer) + { + shader.SetBuffer(kernelIndex, GetTensorData(name), buffer); + } + public void SetTensorBuffer(int propId, ComputeBuffer buffer) + { + shader.SetBuffer(kernelIndex, propId, buffer); + } + + public void SetTexture(string name, Texture tex) + { + // set dummy textures for slots that are not used - to make API validation layers happy + Texture tex2D = dummyTexture2D; + Texture tex2Darray = dummyTexture2DArray; + Texture tex3D = dummyTexture3D; + + if (tex.dimension == TextureDimension.Tex2D) + tex2D = tex; + else if (tex.dimension == TextureDimension.Tex2DArray) + tex2Darray = tex; + else if (tex.dimension == TextureDimension.Tex3D) + tex3D = tex; + else + throw new InvalidOperationException("Unsupported texture type"); + + shader.SetTexture(kernelIndex, name + "tex2D", tex2D); + shader.SetTexture(kernelIndex, name + "tex3D", tex3D); + shader.SetTexture(kernelIndex, name + "tex2DArray", tex2Darray); + } + + public void Dispatch(int[] workItems) + { + Assert.IsTrue(workItems.Length >= 3); + Dispatch(workItems[0], workItems[1], workItems[2]); + } + + public void Dispatch(int workItemsX, int workItemsY, int workItemsZ) + { + Profiler.BeginSample(kernelName); + var x = IntDivCeil(workItemsX, (int) threadGroupSizeX); + var y = IntDivCeil(workItemsY, (int) threadGroupSizeY); + var z = IntDivCeil(workItemsZ, (int) threadGroupSizeZ); + + // some GFX APIs / GPU hw/drivers have limitation of 65535 per dimension + if (x > SafeDispatchLimit || y > SafeDispatchLimit || z > SafeDispatchLimit) + D.LogWarning($"Exceeded safe compute dispatch group count limit per dimension [{x}, {y}, {z}] for {kernelName}"); + + shader.Dispatch(kernelIndex, x, y, z); + Profiler.EndSample(); + } + + // --------------------------------------------------------------------------------- + + static public int IntDivCeil(int v, int div) + { + return (v + div - 1) / div; + } + + static public string FindBestKernelMatchingDimensions(ComputeShader[] cs, string[] kns, int x, int y = 1, int z = 1) + { + Assert.IsTrue(kns.Length > 0); + + foreach (var kernelName in kns) + { + foreach (var shader in cs) + { + int kernelIndex = shader.FindKernel(kernelName); + uint threadGroupSizeX, threadGroupSizeY, threadGroupSizeZ; + shader.GetKernelThreadGroupSizes(kernelIndex, out threadGroupSizeX, out threadGroupSizeY, out threadGroupSizeZ); + + if (x % threadGroupSizeX == 0 && + y % threadGroupSizeY == 0 && + z % threadGroupSizeZ == 0) + return kernelName; + } + } + // pick the last one + return kns[kns.Length - 1]; + } +} + +} // namespace Barracuda diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaReferenceCompute.cs.meta b/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaReferenceCompute.cs.meta new file mode 100644 index 0000000..4fb005e --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaReferenceCompute.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: 3e48b2167ab1b453bb10a8fdac9dc531 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaUnsafeArrayCPU.cs 
b/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaUnsafeArrayCPU.cs new file mode 100644 index 0000000..47713e2 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaUnsafeArrayCPU.cs @@ -0,0 +1,1969 @@ +using UnityEngine; +using UnityEngine.Assertions; +using System; +using System.Runtime.InteropServices; +using System.Threading.Tasks; +using UnityEngine.Profiling; + +namespace Barracuda { + +public class UnsafeArrayTensorData : SharedArrayTensorData +{ + protected bool m_Readonly = false; + + // creates new array + public UnsafeArrayTensorData(int count) : base(new float[count]) + { + } + + // creates new array + public UnsafeArrayTensorData(TensorShape shape) : this(shape.length) + { + } + + // uses shared array + public UnsafeArrayTensorData(ArrayTensorData sharedArray) : base(sharedArray.array, 0, -1) + { + } + + // uses shared array + public UnsafeArrayTensorData(SharedArrayTensorData sharedArray) : base(sharedArray.array, sharedArray.offset, sharedArray.count) + { + m_Readonly = true; + } + + ~UnsafeArrayTensorData() + { + Dispose(); + } + + public override void Dispose() + { + m_Array = null; + m_Offset = m_Count = 0; + } + + public override void Reserve(int count) + { + if (m_Readonly) + { + base.Reserve(count); + return; + } + + if (count > m_Array.Length) + m_Array = new float[count]; + + m_Offset = 0; + m_Count = count; + } + + public override void Upload(float[] data, int offset = 0, int count = -1) + { + if (m_Readonly) + { + base.Upload(data, offset, count); + return; + } + + Assert.IsTrue(offset >= 0); + if (count < 0) + count = data.Length - offset; + + if (m_Array == data && m_Offset == offset && m_Count == count) + return; + + Reserve(count); + + Array.Copy(data, offset, m_Array, m_Offset, m_Count); + } + + public override float[] Download(int count) + { + //;;D.logStackTraceEnabled = true; + //;;D.Log("Download UnsafeArrayTensorData " + count + " from " + m_Count + " @ " + ToString()); + //;;D.logStackTraceEnabled = false; + + if (!m_Readonly && count <= m_Array.Length && m_Offset == 0) + return m_Array; + + return base.Download(count); + } + + public override string ToString() + { + return string.Format("(CPU unsafe: {0} length: {1} offset: {2} uploaded: {3})", + GetHashCode(), m_Array.Length, m_Offset, m_Count); + } +} + + +public class UnsafeArrayCPUOps : ReferenceCPUOps +{ + BLASPlugin blas = null; + + const int BlockSize = 32; + + internal InnerLoop m_InnerLoop = new InnerLoop(); + + + public UnsafeArrayCPUOps(ITensorAllocator allocator = null) + : base(allocator) + { + blas = BLASPluginFactory.CreateBLASPlugin(); + } + + public static UnsafeArrayTensorData Pin(Tensor X) + { + X.FlushCache(); + + var onDevice = X.tensorOnDevice as UnsafeArrayTensorData; + if (onDevice == null) + { + // try to adopt CPU arrays + var asSharedArray = X.tensorOnDevice as SharedArrayTensorData; + var asArray = X.tensorOnDevice as ArrayTensorData; + if (asSharedArray != null) X.CastOnDevice(new UnsafeArrayTensorData(asSharedArray)); // adopt unsafe array without copy + else if (asArray != null) X.CastOnDevice(new UnsafeArrayTensorData(asArray)); // adopt unsafe array without copy + else + X.PinToDeviceAndUploadToIt(new UnsafeArrayTensorData(X.shape)); // device is uncompatible, create new array and upload + } + + return X.tensorOnDevice as UnsafeArrayTensorData; + } + + // --------------------------------------------------------------------------------- + + // NOTE: Parallel.For with small number of work items results in varying and often worse performance + // As a 
workaround we will fallback to 'for' loop when number of work items is below heuristically determined threshold + private static void Parallel_For(long begin, long end, Action body) + { + if (end - begin > 2048) // threshold determined heuristically. If work items < threshold, then for loop is faster than Parallel.For() + Parallel.For(begin, end, body); + else + for(var n = begin; n < end; n++) + body(n); + } + + public override Tensor Neg(Tensor X) + { + // f(x) = -x + var O = NewTensorLike(X); + var end = X.length; + const int unrollSize = 4; + + unsafe + { + fixed (float* + xPtr = &Pin(X).array[Pin(X).offset], + oPtr = &Pin(O).array[Pin(O).offset]) + { + NegInnerLoop(end, unrollSize, xPtr, oPtr); + + // Remainder + for (int i = (end / unrollSize) * unrollSize; i < end; ++i) + { + oPtr[i] = -xPtr[i]; + } + } + } + + return O; + } + + private unsafe void NegInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) + { + Assert.AreEqual(unrollSize, 4); + + m_InnerLoop.SetState(unrollSize, xPtr, oPtr); + + Parallel_For(0L, length / unrollSize, m_InnerLoop.m_negInnerLoopDelegate); + } + + public override Tensor Relu(Tensor X) + { + // f(x) = max(x,0.0) + var O = NewTensorLike(X); + var end = X.length; + const int unrollSize = 4; + + unsafe + { + fixed (float* + xPtr = &Pin(X).array[Pin(X).offset], + oPtr = &Pin(O).array[Pin(O).offset]) + { + ReluInnerLoop(end, unrollSize, xPtr, oPtr); + + // Remainder + for (int i = (end / unrollSize) * unrollSize; i < end; ++i) + { + float v = xPtr[i]; + v = Mathf.Max(v, 0.0f); + oPtr[i] = v; + } + } + } + + return O; + } + + private unsafe void ReluInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) + { + Assert.AreEqual(unrollSize, 4); + + m_InnerLoop.SetState(unrollSize, xPtr, oPtr); + + Parallel_For(0L, length / unrollSize, m_InnerLoop.m_reluInnerLoopDelegate); + } + + public override Tensor Elu(Tensor X, float alpha) + { + // f(x) = alpha * (exp(x) - 1.) 
for x < 0, f(x) = x for x >= 0 + // "Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)", DA Clevert, 2015 + // https://arxiv.org/abs/1511.07289 + var O = NewTensorLike(X); + var end = X.length; + const int unrollSize = 4; + + unsafe + { + fixed (float* + xPtr = &Pin(X).array[Pin(X).offset], + oPtr = &Pin(O).array[Pin(O).offset]) + { + EluInnerLoop(end, unrollSize, xPtr, oPtr, alpha); + + // Remainder + for (int i = (end / unrollSize) * unrollSize; i < end; ++i) + { + float v = xPtr[i]; + if (v <= 0) + v = alpha * (Mathf.Exp(v) - 1f); + oPtr[i] = v; + } + } + } + + return O; + } + + private unsafe void EluInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr, float alpha) + { + Assert.AreEqual(unrollSize, 4); + + m_InnerLoop.SetState(unrollSize, xPtr, oPtr, alpha); + + Parallel_For(0L, length / unrollSize, m_InnerLoop.m_eluInnerLoopDelegate); + } + + + public override Tensor PRelu(Tensor X, Tensor S) + { + Assert.IsTrue((X.flatWidth == S.flatWidth) || (S.flatWidth == 1)); + + // f(x) = x for x >= 0, f(x) = slope*x for x <= 0 + var O = NewTensorLike(X); + var end = X.length; + const int unrollSize = 4; + + unsafe + { + fixed (float* + xPtr = &Pin(X).array[Pin(X).offset], + oPtr = &Pin(O).array[Pin(O).offset], + wPtr = &Pin(S).array[Pin(S).offset]) + { + PReluInnerLoop(end, unrollSize, xPtr, X.length, oPtr, wPtr, S.length); + + // Remainder + for (int i = (end / unrollSize) * unrollSize; i < end; ++i) + { + float v = xPtr[i]; + float slope = wPtr[i % S.length]; + v = Mathf.Max(0.0f, v) + slope * Mathf.Min(0.0f, v); + oPtr[i] = v; + } + } + } + + return O; + } + + private unsafe void PReluInnerLoop(int length, int unrollSize, float* xPtr, int xLen, float* oPtr, float* wPtr, int wLen) + { + Assert.AreEqual(unrollSize, 4); + + m_InnerLoop.SetState(unrollSize, oPtr, xPtr, xLen, wPtr, wLen); + + Parallel_For(0L, length / unrollSize, m_InnerLoop.m_preluInnerLoopDelegate); + } + + public override Tensor Sigmoid(Tensor X) + { + // f(x) = 1 / (1 + exp(-x)) + var O = NewTensorLike(X); + var end = X.length; + const int unrollSize = 4; + + unsafe + { + fixed (float* + xPtr = &Pin(X).array[Pin(X).offset], + oPtr = &Pin(O).array[Pin(O).offset]) + { + SigmoidInnerLoop(end, unrollSize, xPtr, oPtr); + + // Remainder + for (int i = (end / unrollSize) * unrollSize; i < end; ++i) + { + float v = xPtr[i]; + v = 1f / (1f + Mathf.Exp(-v)); + oPtr[i] = v; + } + } + } + + return O; + } + + private unsafe void SigmoidInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) + { + Assert.AreEqual(unrollSize, 4); + + m_InnerLoop.SetState(unrollSize, xPtr, oPtr); + + Parallel_For(0L, length / unrollSize, m_InnerLoop.m_sigmoidInnerLoopDelegate); + } + + public override Tensor Swish(Tensor X) + { + // f(x) = sigmoid(x) * x = x / (1 + exp(-x)) + // "Searching for Activation Functions". 
P Ramachandran, 2017 + // https://arxiv.org/abs/1710.05941 + + var O = NewTensorLike(X); + var end = X.length; + const int unrollSize = 4; + + unsafe + { + fixed (float* + xPtr = &Pin(X).array[Pin(X).offset], + oPtr = &Pin(O).array[Pin(O).offset]) + { + SwishInnerLoop(end, unrollSize, xPtr, oPtr); + + // Remainder + for (int i = (end / unrollSize) * unrollSize; i < end; ++i) + { + float v = xPtr[i]; + v = v / (1f + Mathf.Exp(-v)); + oPtr[i] = v; + } + } + } + + return O; + } + + private unsafe void SwishInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) + { + Assert.AreEqual(unrollSize, 4); + + m_InnerLoop.SetState(unrollSize, xPtr, oPtr); + + Parallel_For(0L, length / unrollSize, m_InnerLoop.m_swishInnerLoopDelegate); + } + + public override Tensor Exp(Tensor X) + { + var O = NewTensorLike(X); + var end = X.length; + const int unrollSize = 4; + + unsafe + { + fixed (float* + xPtr = &Pin(X).array[Pin(X).offset], + oPtr = &Pin(O).array[Pin(O).offset]) + { + ExpInnerLoop(end, unrollSize, xPtr, oPtr); + + // Remainder + for (int i = (end / unrollSize) * unrollSize; i < end; ++i) + { + float v = xPtr[i]; + v = Mathf.Exp(v); + oPtr[i] = v; + } + } + } + + return O; + } + + private unsafe void ExpInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) + { + Assert.AreEqual(unrollSize, 4); + + m_InnerLoop.SetState(unrollSize, xPtr, oPtr); + + Parallel_For(0L, length / unrollSize, m_InnerLoop.m_expInnerLoopDelegate); + } + + public override Tensor Sqrt(Tensor X) + { + var O = NewTensorLike(X); + var end = X.length; + const int unrollSize = 4; + + unsafe + { + fixed (float* + xPtr = &Pin(X).array[Pin(X).offset], + oPtr = &Pin(O).array[Pin(O).offset]) + { + SqrtInnerLoop(end, unrollSize, xPtr, oPtr); + + // Remainder + for (int i = (end / unrollSize) * unrollSize; i < end; ++i) + { + float v = xPtr[i]; + v = Mathf.Sqrt(v); + oPtr[i] = v; + } + } + } + + return O; + } + + private unsafe void SqrtInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) + { + Assert.AreEqual(unrollSize, 4); + + m_InnerLoop.SetState(unrollSize, xPtr, oPtr); + + Parallel_For(0L, length / unrollSize, m_InnerLoop.m_sqrtInnerLoopDelegate); + } + + public override Tensor Tanh(Tensor X) + { + var O = NewTensorLike(X); + var end = X.length; + const int unrollSize = 4; + + unsafe + { + fixed (float* + xPtr = &Pin(X).array[Pin(X).offset], + oPtr = &Pin(O).array[Pin(O).offset]) + { + TanhInnerLoop(end, unrollSize, xPtr, oPtr); + + // Remainder + for (int i = (end / unrollSize) * unrollSize; i < end; ++i) + { + float v = xPtr[i]; + v = MathfEx.tanh(v); + oPtr[i] = v; + } + } + } + + return O; + } + + private unsafe void TanhInnerLoop(int length, int unrollSize, float* xPtr, float* oPtr) + { + Assert.AreEqual(unrollSize, 4); + + m_InnerLoop.SetState(unrollSize, xPtr, oPtr); + + Parallel_For(0L, length / unrollSize, m_InnerLoop.m_tanhInnerLoopDelegate); + } + + private Tensor GetOutputTensorFromBroadcast(Tensor[] tensors) + { + Assert.IsTrue(tensors.Length > 0); + + var O = NewTensor(TensorExtensions.MaxShape(tensors)); + foreach (var t in tensors) + { + Assert.IsTrue((t.batch == 1) || (t.batch == O.batch)); + Assert.IsTrue((t.height == 1) || (t.height == O.height)); + Assert.IsTrue((t.width == 1) || (t.width == O.width)); + Assert.IsTrue((t.channels == 1) || (t.channels == O.channels)); + } + + return O; + } + + private bool CanUseModuloForBroadcasting(TensorShape o, TensorShape a) + { + if (o == a) + return true; + + int firstDimensionMismatchInMemoryOrder = -1; + for (int i=3; i > 0; --i) + { + if 
(o[i] != a[i]) + { + firstDimensionMismatchInMemoryOrder = i; + break; + } + } + + for (int i = firstDimensionMismatchInMemoryOrder; i > 0; --i) + { + if (a[i] != 1) + return false; + } + + return true; + } + + private bool CanUseModuloForBroadcasting(TensorShape o, TensorShape a, TensorShape b) + { + return CanUseModuloForBroadcasting(o,a) && CanUseModuloForBroadcasting(o,b); + } + + private Tensor ApplyElementwiseWithBroadcast(Tensor[] tensors, Func opRemainder, Action opInnerLoop, Action opInnerLoopNoBroadcast) + { + var O = GetOutputTensorFromBroadcast(tensors); + var A = tensors[0]; + + unsafe + { + fixed (float* + t0Ptr = &Pin(A).array[Pin(A).offset], + oPtr = &Pin(O).array[Pin(O).offset]) + { + float* aPtr = t0Ptr; + var aShape = A.shape; + + for (int t = 1; t < tensors.Length; ++t) + { + var B = tensors[t]; + fixed (float* bPtr = &Pin(B).array[Pin(B).offset]) + { + //Inner loop + const int unrollSize = 4; + m_InnerLoop.SetState(unrollSize, oPtr, aPtr, bPtr, O.shape, aShape, B.shape); + if (CanUseModuloForBroadcasting(O.shape, aShape, B.shape)) + Parallel_For(0L, O.length / unrollSize, opInnerLoopNoBroadcast); + else + Parallel_For(0L, O.length / unrollSize, opInnerLoop); + + + // Remainder + for (int i = (O.length / unrollSize) * unrollSize; i < O.length; ++i) + { + int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; + O.shape.GetPositionsFromIndex(i, ref b0, ref h0, ref w0, ref ch0); + oPtr[i] = opRemainder(aPtr[A.shape.IndexWithBroadcast(b0, h0, w0, ch0)], bPtr[B.shape.IndexWithBroadcast(b0, h0, w0, ch0)]); + } + } + + aPtr = oPtr; + aShape = O.shape; + } + } + } + + return O; + } + + public override Tensor Add(Tensor[] tensors) + { + return ApplyElementwiseWithBroadcast(tensors, m_InnerLoop.m_addOpDelegate, m_InnerLoop.m_addInnerLoopDelegate, m_InnerLoop.m_addInnerLoopDelegateNoBroadcast); + } + + public override Tensor Sub(Tensor[] tensors) + { + return ApplyElementwiseWithBroadcast(tensors, m_InnerLoop.m_subOpDelegate, m_InnerLoop.m_subInnerLoopDelegate, m_InnerLoop.m_subInnerLoopDelegateNoBroadcast); + } + + public override Tensor Mul(Tensor[] tensors) + { + return ApplyElementwiseWithBroadcast(tensors, m_InnerLoop.m_mulOpDelegate, m_InnerLoop.m_mulInnerLoopDelegate, m_InnerLoop.m_mulInnerLoopDelegateNoBroadcast); + } + + public override Tensor Div(Tensor[] tensors) + { + return ApplyElementwiseWithBroadcast(tensors, m_InnerLoop.m_divOpDelegate, m_InnerLoop.m_divInnerLoopDelegate, m_InnerLoop.m_divInnerLoopDelegateNoBroadcast); + } + + public override Tensor Min(Tensor[] tensors) + { + return ApplyElementwiseWithBroadcast(tensors, m_InnerLoop.m_minOpDelegate, m_InnerLoop.m_minInnerLoopDelegate, m_InnerLoop.m_minInnerLoopDelegateNoBroadcast); + } + + public override Tensor Max(Tensor[] tensors) + { + return ApplyElementwiseWithBroadcast(tensors, m_InnerLoop.m_maxOpDelegate, m_InnerLoop.m_maxInnerLoopDelegate, m_InnerLoop.m_maxInnerLoopDelegateNoBroadcast); + } + + public override Tensor Greater(Tensor A, Tensor B) + { + return ApplyLogicalOperator(A, B, m_InnerLoop.m_greaterOpDelegate, m_InnerLoop.m_greaterInnerLoopDelegate, m_InnerLoop.m_greaterInnerLoopDelegateNoBroadcast); + } + public override Tensor GreaterEqual(Tensor A, Tensor B) + { + return ApplyLogicalOperator(A, B, m_InnerLoop.m_greaterEqualOpDelegate, m_InnerLoop.m_greaterEqualInnerLoopDelegate, m_InnerLoop.m_greaterEqualInnerLoopDelegateNoBroadcast); + } + public override Tensor Less(Tensor A, Tensor B) + { + return ApplyLogicalOperator(A, B, m_InnerLoop.m_lessOpDelegate, m_InnerLoop.m_lessInnerLoopDelegate, 
m_InnerLoop.m_lessInnerLoopDelegateNoBroadcast); + } + public override Tensor LessEqual(Tensor A, Tensor B) + { + return ApplyLogicalOperator(A, B, m_InnerLoop.m_lessEqualOpDelegate, m_InnerLoop.m_lessEqualInnerLoopDelegate, m_InnerLoop.m_lessEqualInnerLoopDelegateNoBroadcast); + } + public override Tensor Equal(Tensor A, Tensor B) + { + return ApplyLogicalOperator(A, B, m_InnerLoop.m_equalOpDelegate, m_InnerLoop.m_equalInnerLoopDelegate, m_InnerLoop.m_equalInnerLoopDelegateNoBroadcast); + } + public override Tensor LogicalOr(Tensor A, Tensor B) + { + return ApplyLogicalOperator(A, B, m_InnerLoop.m_logicalOrOpDelegate, m_InnerLoop.m_logicalOrInnerLoopDelegate, m_InnerLoop.m_logicalOrInnerLoopDelegateNoBroadcast); + } + public override Tensor LogicalAnd(Tensor A, Tensor B) + { + return ApplyLogicalOperator(A, B, m_InnerLoop.m_logicalAndOpDelegate, m_InnerLoop.m_logicalAndInnerLoopDelegate, m_InnerLoop.m_logicalAndInnerLoopDelegateNoBroadcast); + } + public override Tensor LogicalXor(Tensor A, Tensor B) + { + return ApplyLogicalOperator(A, B, m_InnerLoop.m_logicalXorOpDelegate, m_InnerLoop.m_logicalXorInnerLoopDelegate, m_InnerLoop.m_logicalXorInnerLoopDelegateNoBroadcast); + } + + public override Tensor LogicalNot(Tensor X) + { + var O = NewTensorLike(X); + + unsafe + { + fixed (float* + xPtr = &Pin(X).array[Pin(X).offset], + oPtr = &Pin(O).array[Pin(O).offset]) + { + const int unrollSize = 4; + m_InnerLoop.SetState(unrollSize, xPtr, oPtr); + Parallel_For(0L, O.length / unrollSize, m_InnerLoop.m_logicaNotInnerLoopDelegate); + + // Remainder + for (int i = (O.length / unrollSize) * unrollSize; i < O.length; ++i) + oPtr[i] = Convert.ToSingle( !Convert.ToBoolean(xPtr[i]) ); + } + } + return O; + } + + private Tensor ApplyLogicalOperator(Tensor A, Tensor B, Func logicalOpRemainder, Action logicalOpInnerLoop, Action logicalOpInnerLoopNoBroadcast) + { + var O = GetOutputTensorFromBroadcast(new Tensor[] { A, B }); + + unsafe + { + fixed (float* + aPtr = &Pin(A).array[Pin(A).offset], + bPtr = &Pin(B).array[Pin(B).offset], + oPtr = &Pin(O).array[Pin(O).offset]) + { + const int unrollSize = 4; + m_InnerLoop.SetState(unrollSize, oPtr, aPtr, bPtr, O.shape, A.shape, B.shape); + if ((O.shape == A.shape) && (O.shape == B.shape)) + Parallel_For(0L, O.length / unrollSize, logicalOpInnerLoopNoBroadcast); + else + Parallel_For(0L, O.length / unrollSize, logicalOpInnerLoop); + + // Remainder + for (int i = (O.length / unrollSize) * unrollSize; i < O.length; ++i) + { + int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; + O.shape.GetPositionsFromIndex(i, ref b0, ref h0, ref w0, ref ch0); + oPtr[i] = logicalOpRemainder(aPtr[A.shape.IndexWithBroadcast(b0, h0, w0, ch0)], bPtr[B.shape.IndexWithBroadcast(b0, h0, w0, ch0)]); + } + } + } + + return O; + } + + public override Tensor MatMul(Tensor X, bool xTranspose, Tensor Y, bool yTranspose) + { + //var Z = base.MatMul(X, xTranspose, Y, yTranspose); + Assert.IsTrue(X.dimensions <= 2); + Assert.IsTrue(Y.dimensions <= 2); + + int xw = X.flatWidth, xh = X.flatHeight; + int yw = Y.flatWidth, yh = Y.flatHeight; + + if (xTranspose) + { + var tmp = xw; xw = xh; xh = tmp; + } + if (yTranspose) + { + var tmp = yw; yw = yh; yh = tmp; + } + + Assert.AreEqual(xw, yh); + var O = NewTensor(xh, yw); + + unsafe + { + fixed (float* + xPtr = &Pin(X).array[Pin(X).offset], + yPtr = &Pin(Y).array[Pin(Y).offset], + oPtr = &Pin(O).array[Pin(O).offset]) + { + // NOTE: assumes newly created Tensor data is initialized with 0 + + //D.Log(string.Format("===> X.b[{0}] x Y.w[{1}] * Y.h[{2}] x Y.w[{3}] = O.w[{4}] 
x O.h[{5}]", X.flatHeight, X.flatWidth, Y.flatHeight, Y.flatWidth, O.batch, O.width)); + blas.SGEMM(xPtr, X.flatHeight, X.flatWidth, + yPtr, Y.flatHeight, Y.flatWidth, + oPtr, O.flatHeight, O.flatWidth, 32, xTranspose, yTranspose); + } + } + + Profiler.EndSample (); + + //O.PrintDataPart(32, "O"); + //Z.PrintDataPart(32, "Z"); + //CompareOps.CheckSame(O, Z, "MatMul"); + + return O; + } + + public override Tensor Dense(Tensor X, Tensor W, Tensor B) + { + //D.Log(string.Format("X = {0}", X.shape)); + Assert.IsTrue(W.dimensions <= 2); + Assert.AreEqual(B.flatWidth, B.length); + Assert.AreEqual(X.flatWidth, W.flatHeight); + var O = NewTensor(X.flatHeight, W.flatWidth); + + unsafe + { + fixed (float* + xPtr = &Pin(X).array[Pin(X).offset], + wPtr = &Pin(W).array[Pin(W).offset], + bPtr = &Pin(B).array[Pin(B).offset], + oPtr = &Pin(O).array[Pin(O).offset]) + { + var BOffset = Pin(O).offset; + var BCount = Pin(B).count; + var Barray = Pin(O).array; + + for (int i = 0; i < O.batch; i++) + { + Marshal.Copy((IntPtr)bPtr, Barray, BOffset + i * BCount, BCount); + } + + //X.Print(); W.Print(); + blas.SGEMM(xPtr, X.flatHeight, X.flatWidth, wPtr, W.flatHeight, W.flatWidth, oPtr, O.batch, O.channels, 16); + } + } + + return O; + } + + public override Tensor MaxPool2D(Tensor X, int[] pool, int[] stride, int[] pad) + { + Assert.AreEqual(pool.Length, 2); + Assert.AreEqual(stride.Length, 2); + Assert.AreEqual(pad.Length, 4); + + var O = NewTensor(X.shape.ApplyPool(pool, stride, pad)); + + int xnMult = X.height * X.width * X.channels; + int xyMult = X.width * X.channels; + int xxMult = X.channels; + + int onMult = O.height * O.width * O.channels; + int oyMult = O.width * O.channels; + int oxMult = O.channels; + + int oBatch = O.batch; + int oHeight = O.height; + int oWidth = O.width; + int oChannels = O.channels; + int xHeight = X.height; + int xWidth = X.width; + + unsafe + { + fixed (float* + xPtr = &Pin(X).array[Pin(X).offset], + oPtr = &Pin(O).array[Pin(O).offset]) + { + MaxPool2DInnerLoop(pool, stride, pad, + xHeight, xWidth, xPtr, xnMult, xyMult, xxMult, + oBatch, oHeight, oWidth, oChannels, oPtr, onMult, oyMult, oxMult); + } + } + + return O; + } + + private static unsafe void MaxPool2DInnerLoop(int[] pool, int[] stride, int[] pad, + int xHeight, int xWidth, float* xPtr, int xnMult, int xyMult, int xxMult, + int oBatch, int oHeight, int oWidth, int oChannels, float* oPtr, int onMult, int oyMult, int oxMult) + { + Parallel.For(0, oBatch, n => + { + for (var y = 0; y < oHeight; ++y) + for (var x = 0; x < oWidth; ++x) + for (var c = 0; c < oChannels; ++c) + { + float maxVal = float.MinValue; + for (int dy = 0; dy < pool[1]; ++dy) + for (int dx = 0; dx < pool[0]; ++dx) + { + int oy = y * stride[1] + dy - pad[1]; + int ox = x * stride[0] + dx - pad[0]; + + if (oy < 0) continue; + if (oy >= xHeight) continue; + if (ox < 0) continue; + if (ox >= xWidth) continue; + + float v = xPtr[n * xnMult + oy * xyMult + ox * xxMult + c]; + maxVal = Mathf.Max(v, maxVal); + } + oPtr[n * onMult + y * oyMult + x * oxMult + c] = maxVal; + } + }); + } + + public override Tensor Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad) + { + Assert.AreEqual(X.channels, K.kernelDepth); + Assert.AreEqual(K.kernelCount, B.flatWidth); + Assert.AreEqual(B.flatWidth, B.length); + Assert.AreEqual(stride.Length, 2); + Assert.AreEqual(pad.Length, 4); + + var O = NewTensor(X.shape.ApplyKernel(K.shape, stride, pad)); + + int xnMult = X.height * X.width * X.channels; + int xyMult = X.width * X.channels; + int xxMult = X.channels; + + int 
kyMult = K.height * K.width * K.channels; + int kxMult = K.width * K.channels; + int kcMult = K.channels; + + int onMult = O.height * O.width * O.channels; + int oyMult = O.width * O.channels; + int oxMult = O.channels; + + int oBatch = O.batch; + int oHeight = O.height; + int oWidth = O.width; + int kKernelCount = K.kernelCount; + int kKernelHeight = K.kernelHeight; + int kKernelWidth = K.kernelWidth; + int xHeight = X.height; + int xWidth = X.width; + int xChannels = X.channels; + + unsafe + { + fixed (float* + xPtr = &Pin(X).array[Pin(X).offset], + kPtr = &Pin(K).array[Pin(K).offset], + bPtr = &Pin(B).array[Pin(B).offset], + oPtr = &Pin(O).array[Pin(O).offset]) + { + Conv2DInnerLoop(stride, pad, oBatch, oHeight, oWidth, kKernelCount, bPtr, kKernelHeight, kKernelWidth, + xHeight, xWidth, xChannels, xPtr, xnMult, xyMult, xxMult, kPtr, kyMult, kxMult, kcMult, oPtr, onMult, oyMult, oxMult); + } + } + + return O; + } + + private static unsafe void Conv2DInnerLoop(int[] stride, int[] pad, int oBatch, int oHeight, int oWidth, int kKernelCount, + float* bPtr, int kKernelHeight, int kKernelWidth, int xHeight, int xWidth, int xChannels, float* xPtr, + int xnMult, int xyMult, int xxMult, float* kPtr, int kyMult, int kxMult, int kcMult, float* oPtr, int onMult, + int oyMult, int oxMult) + { + Parallel.For(0, oBatch, n => + { + for (var y = 0; y < oHeight; ++y) + for (var x = 0; x < oWidth; ++x) + for (var k = 0; k < kKernelCount; ++k) + { + float v = bPtr[k]; + for (int dy = 0; dy < kKernelHeight; ++dy) + { + for (int dx = 0; dx < kKernelWidth; ++dx) + { + int oy = y * stride[1] + dy - pad[1]; + int ox = x * stride[0] + dx - pad[0]; + + if (oy < 0) continue; + if (oy >= xHeight) continue; + if (ox < 0) continue; + if (ox >= xWidth) continue; + + for (var c = 0; c < xChannels; ++c) + { + float xv = xPtr[n * xnMult + oy * xyMult + ox * xxMult + c]; + float kv = kPtr[dy * kyMult + dx * kxMult + c * kcMult + k]; + + v += xv * kv; + } + } + } + + oPtr[n * onMult + y * oyMult + x * oxMult + k] = v; + } + }); + } + + private Tensor ApplyPadding(Tensor X, int[] pad, float constant, Action paddingOp) + { + Assert.AreEqual(pad.Length, 4); + + var O = NewTensor(X.shape.ApplyBorder(pad)); + + int prePadX = Math.Max(0, pad[0]); + int prePadY = Math.Max(0, pad[1]); + int postPadX = Math.Max(0, pad[2]); + int postPadY = Math.Max(0, pad[3]); + + // NOTE: negative "pad" variable will crop X tensor + int preCropX = Math.Max(0, -pad[0]); + int preCropY = Math.Max(0, -pad[1]); + int postCropX = Math.Max(0, -pad[2]); + int postCropY = Math.Max(0, -pad[3]); + int croppedWidth = X.width - (preCropX + postCropX); + int croppedHeight = X.height - (preCropY + postCropY); + + unsafe + { + fixed (float* + xPtr = &Pin(X).array[Pin(X).offset], + oPtr = &Pin(O).array[Pin(O).offset]) + { + m_InnerLoop.SetState(oPtr, xPtr, O.shape, X.shape, constant, prePadX, prePadY); + + long numItemInARow = O.width * O.channels; + long numItemInABatch = O.height * numItemInARow; + + for (int b = 0; b < O.batch; ++b) + { + //PrePadY + long prepadOffset = numItemInABatch * b; + long numItemToPrepadInHeight = prePadY * O.width * O.channels; + Parallel_For(prepadOffset, prepadOffset + numItemToPrepadInHeight, paddingOp); + + //CenterY + Parallel.For(prePadY, croppedHeight + prePadY, y => + { + long offset = numItemInABatch * b + numItemInARow * y; + //PrePadX + long numItemToPrepadInWidth = prePadX * O.channels; + for (long n = offset; n < (offset + numItemToPrepadInWidth); ++n) + paddingOp(n); + offset += numItemToPrepadInWidth; + + //CenterX + 
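// Copy the uncropped middle of this output row from X into O with a single Buffer.BlockCopy (numFloatToCopy = O.channels * croppedWidth floats), instead of writing those elements one at a time. +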
int srcFloatOffset = X.Index(b, (int)y - prePadY, preCropX, 0) + Pin(X).offset; + int dstFloatOffset = O.Index(b, (int)y, prePadX, 0) + Pin(O).offset; + int numFloatToCopy = O.channels * croppedWidth; + Buffer.BlockCopy(Pin(X).array, srcFloatOffset * sizeof(float), Pin(O).array, dstFloatOffset * sizeof(float), numFloatToCopy * sizeof(float)); + offset += numFloatToCopy; + + //PostPadX + long numItemToPostInWidth = postPadX * O.channels; + for (long n = offset; n < (offset + numItemToPostInWidth); ++n) + paddingOp(n); + }); + + //PostPadY + long postpadOffset = prepadOffset + numItemToPrepadInHeight + numItemInARow * croppedHeight; + long numItemToPostpadInHeight = postPadY * O.width * O.channels; + Parallel_For(postpadOffset, postpadOffset + numItemToPostpadInHeight, paddingOp); + } + } + } + return O; + } + + public override Tensor Border2D(Tensor X, int[] pad, float constant) + { + return ApplyPadding(X, pad, constant, m_InnerLoop.m_border2DInnerLoopDelegate); + } + + public override Tensor Pad2DEdge(Tensor X, int[] pad) + { + return ApplyPadding(X, pad, 0.0f, m_InnerLoop.m_pad2DEdgeInnerLoopDelegate); + } + + public override Tensor Pad2DReflect(Tensor X, int[] pad) + { + return ApplyPadding(X, pad, 0.0f, m_InnerLoop.m_pad2DReflectInnerLoopDelegate); + } + + public override Tensor Pad2DSymmetric(Tensor X, int[] pad) + { + return ApplyPadding(X, pad, 0.0f, m_InnerLoop.m_pad2DSymmetricInnerLoopDelegate); + } + + public override Tensor Prepare(Tensor X) + { + Pin(X); + return X; + } +} + + internal unsafe class InnerLoop + { + private int unrollSize; + private float* oPtr; + private float* xPtr; + private int xLen; + private float* bPtr; + private int bLen; + private float alpha; + private int prePadX; + private int prePadY; + + private TensorShape oShape; + private TensorShape xShape; + private TensorShape bShape; + + public Action m_tanhInnerLoopDelegate; + public Action m_expInnerLoopDelegate; + public Action m_sqrtInnerLoopDelegate; + public Action m_swishInnerLoopDelegate; + public Action m_sigmoidInnerLoopDelegate; + public Action m_negInnerLoopDelegate; + public Action m_eluInnerLoopDelegate; + public Action m_reluInnerLoopDelegate; + public Action m_preluInnerLoopDelegate; + public Action m_maxInnerLoopDelegate; + public Action m_minInnerLoopDelegate; + public Action m_divInnerLoopDelegate; + public Action m_mulInnerLoopDelegate; + public Action m_subInnerLoopDelegate; + public Action m_addInnerLoopDelegate; + public Action m_greaterInnerLoopDelegate; + public Action m_greaterEqualInnerLoopDelegate; + public Action m_lessInnerLoopDelegate; + public Action m_lessEqualInnerLoopDelegate; + public Action m_equalInnerLoopDelegate; + public Action m_logicalAndInnerLoopDelegate; + public Action m_logicalOrInnerLoopDelegate; + public Action m_logicalXorInnerLoopDelegate; + public Action m_logicaNotInnerLoopDelegate; + public Action m_maxInnerLoopDelegateNoBroadcast; + public Action m_minInnerLoopDelegateNoBroadcast; + public Action m_divInnerLoopDelegateNoBroadcast; + public Action m_mulInnerLoopDelegateNoBroadcast; + public Action m_subInnerLoopDelegateNoBroadcast; + public Action m_addInnerLoopDelegateNoBroadcast; + public Action m_greaterInnerLoopDelegateNoBroadcast; + public Action m_greaterEqualInnerLoopDelegateNoBroadcast; + public Action m_lessInnerLoopDelegateNoBroadcast; + public Action m_lessEqualInnerLoopDelegateNoBroadcast; + public Action m_equalInnerLoopDelegateNoBroadcast; + public Action m_logicalAndInnerLoopDelegateNoBroadcast; + public Action 
m_logicalOrInnerLoopDelegateNoBroadcast; + public Action m_logicalXorInnerLoopDelegateNoBroadcast; + public Action m_border2DInnerLoopDelegate; + public Action m_pad2DReflectInnerLoopDelegate; + public Action m_pad2DSymmetricInnerLoopDelegate; + public Action m_pad2DEdgeInnerLoopDelegate; + + public Func m_maxOpDelegate; + public Func m_minOpDelegate; + public Func m_divOpDelegate; + public Func m_mulOpDelegate; + public Func m_subOpDelegate; + public Func m_addOpDelegate; + public Func m_greaterOpDelegate; + public Func m_greaterEqualOpDelegate; + public Func m_lessOpDelegate; + public Func m_lessEqualOpDelegate; + public Func m_equalOpDelegate; + public Func m_logicalAndOpDelegate; + public Func m_logicalOrOpDelegate; + public Func m_logicalXorOpDelegate; + public Func m_logicaNotOpDelegate; + + public InnerLoop() + { + //Store delegates to avoid GC allocation because of repeated cast from functions to delegate at runtime + m_tanhInnerLoopDelegate = TanhInnerLoop; + m_expInnerLoopDelegate = ExpInnerLoop; + m_sqrtInnerLoopDelegate = SqrtInnerLoop; + m_swishInnerLoopDelegate = SwishInnerLoop; + m_sigmoidInnerLoopDelegate = SigmoidInnerLoop; + m_negInnerLoopDelegate = NegInnerLoop; + m_eluInnerLoopDelegate = EluInnerLoop; + m_reluInnerLoopDelegate = ReluInnerLoop; + m_preluInnerLoopDelegate = PReluInnerLoop; + m_maxInnerLoopDelegate = MaxInnerLoop; + m_minInnerLoopDelegate = MinInnerLoop; + m_divInnerLoopDelegate = DivInnerLoop; + m_mulInnerLoopDelegate = MulInnerLoop; + m_subInnerLoopDelegate = SubInnerLoop; + m_addInnerLoopDelegate = AddInnerLoop; + m_greaterInnerLoopDelegate = GreaterInnerLoop; + m_greaterEqualInnerLoopDelegate = GreaterEqualInnerLoop; + m_lessInnerLoopDelegate = LessInnerLoop; + m_lessEqualInnerLoopDelegate = LessEqualInnerLoop; + m_equalInnerLoopDelegate = EqualInnerLoop; + m_logicalAndInnerLoopDelegate = LogicalAndInnerLoop; + m_logicalOrInnerLoopDelegate = LogicalOrInnerLoop; + m_logicalXorInnerLoopDelegate = LogicalXorInnerLoop; + m_logicaNotInnerLoopDelegate = LogicalNotInnerLoop; + m_maxInnerLoopDelegateNoBroadcast = MaxInnerLoopNoBroadcast; + m_minInnerLoopDelegateNoBroadcast = MinInnerLoopNoBroadcast; + m_divInnerLoopDelegateNoBroadcast = DivInnerLoopNoBroadcast; + m_mulInnerLoopDelegateNoBroadcast = MulInnerLoopNoBroadcast; + m_subInnerLoopDelegateNoBroadcast = SubInnerLoopNoBroadcast; + m_addInnerLoopDelegateNoBroadcast = AddInnerLoopNoBroadcast; + m_greaterInnerLoopDelegateNoBroadcast = GreaterInnerLoopNoBroadcast; + m_greaterEqualInnerLoopDelegateNoBroadcast = GreaterEqualInnerLoopNoBroadcast; + m_lessInnerLoopDelegateNoBroadcast = LessInnerLoopNoBroadcast; + m_lessEqualInnerLoopDelegateNoBroadcast = LessEqualInnerLoopNoBroadcast; + m_equalInnerLoopDelegateNoBroadcast = EqualInnerLoopNoBroadcast; + m_logicalAndInnerLoopDelegateNoBroadcast = LogicalAndInnerLoopNoBroadcast; + m_logicalOrInnerLoopDelegateNoBroadcast = LogicalOrInnerLoopNoBroadcast; + m_logicalXorInnerLoopDelegateNoBroadcast = LogicalXorInnerLoopNoBroadcast; + m_border2DInnerLoopDelegate = Border2DInnerLoop; + m_pad2DEdgeInnerLoopDelegate = Pad2DEdgeInnerLoop; + m_pad2DReflectInnerLoopDelegate = Pad2DReflectInnerLoop; + m_pad2DSymmetricInnerLoopDelegate = Pad2DSymmetricInnerLoop; + m_maxOpDelegate = Max; + m_minOpDelegate = Min; + m_divOpDelegate = Div; + m_mulOpDelegate = Mul; + m_subOpDelegate = Sub; + m_addOpDelegate = Add; + m_greaterOpDelegate = Greater; + m_greaterEqualOpDelegate = GreaterEqual; + m_lessOpDelegate = Less; + m_lessEqualOpDelegate = LessEqual; + m_equalOpDelegate = Equal; + 
m_logicalAndOpDelegate = LogicalAnd; + m_logicalOrOpDelegate = LogicalOr; + m_logicalXorOpDelegate = LogicalXor; + m_logicaNotOpDelegate = LogicalNot; + } + + public void SetState(int unrollSize, float* oPtr, float* xPtr, float* bPtr, TensorShape oShape, TensorShape xShape, TensorShape bShape) + { + this.unrollSize = unrollSize; + this.oPtr = oPtr; + this.oShape = oShape; + this.xPtr = xPtr; + this.xShape = xShape; + this.xLen = xShape.length; + this.bPtr = bPtr; + this.bShape = bShape; + this.bLen = bShape.length; + } + + public void SetState(int unrollSize, float* oPtr, float* xPtr, int xLen, float* bPtr, int bLen) + { + this.unrollSize = unrollSize; + this.oPtr = oPtr; + this.xPtr = xPtr; + this.xLen = xLen; + this.bPtr = bPtr; + this.bLen = bLen; + } + + public void SetState(int unrollSize, float* xPtr, float* oPtr) + { + this.unrollSize = unrollSize; + this.oPtr = oPtr; + this.xPtr = xPtr; + } + + public void SetState(int unrollSize, float* xPtr, float* oPtr, float* bPtr) + { + this.unrollSize = unrollSize; + this.oPtr = oPtr; + this.xPtr = xPtr; + this.bPtr = bPtr; + } + + public void SetState(int unrollSize, float* xPtr, float* oPtr, float alpha) + { + this.unrollSize = unrollSize; + this.oPtr = oPtr; + this.xPtr = xPtr; + this.alpha = alpha; + } + + public void SetState(float* oPtr, float* xPtr, TensorShape oShape, TensorShape xShape, float constant, int prePadX, int prePadY) + { + this.oPtr = oPtr; + this.xPtr = xPtr; + this.oShape = oShape; + this.xShape = xShape; + this.alpha = constant; + this.prePadX = prePadX; + this.prePadY = prePadY; + } + + private void NegInnerLoop(long n) + { + float* baseXPtr = xPtr + n * unrollSize; + float* baseOPtr = oPtr + n * unrollSize; + float v0 = baseXPtr[0]; + float v1 = baseXPtr[1]; + float v2 = baseXPtr[2]; + float v3 = baseXPtr[3]; + + v0 = -v0; + v1 = -v1; + v2 = -v2; + v3 = -v3; + + baseOPtr[0] = v0; + baseOPtr[1] = v1; + baseOPtr[2] = v2; + baseOPtr[3] = v3; + } + + private void ReluInnerLoop(long n) + { + float* baseXPtr = xPtr + n * unrollSize; + float* baseOPtr = oPtr + n * unrollSize; + float v0 = baseXPtr[0]; + float v1 = baseXPtr[1]; + float v2 = baseXPtr[2]; + float v3 = baseXPtr[3]; + + v0 = 0.5f * (v0 + Mathf.Abs(v0)); + v1 = 0.5f * (v1 + Mathf.Abs(v1)); + v2 = 0.5f * (v2 + Mathf.Abs(v2)); + v3 = 0.5f * (v3 + Mathf.Abs(v3)); + + baseOPtr[0] = v0; + baseOPtr[1] = v1; + baseOPtr[2] = v2; + baseOPtr[3] = v3; + } + + private void EluInnerLoop(long n) + { + float* baseXPtr = xPtr + n * unrollSize; + float* baseOPtr = oPtr + n * unrollSize; + float v0 = baseXPtr[0]; + float v1 = baseXPtr[1]; + float v2 = baseXPtr[2]; + float v3 = baseXPtr[3]; + + if (v0 <= 0) + v0 = alpha * (Mathf.Exp(v0) - 1f); + if (v1 <= 0) + v1 = alpha * (Mathf.Exp(v1) - 1f); + if (v2 <= 0) + v2 = alpha * (Mathf.Exp(v2) - 1f); + if (v3 <= 0) + v3 = alpha * (Mathf.Exp(v3) - 1f); + + baseOPtr[0] = v0; + baseOPtr[1] = v1; + baseOPtr[2] = v2; + baseOPtr[3] = v3; + } + + private void PReluInnerLoop(long n) + { + float* baseXPtr = xPtr + n * unrollSize; + float* baseOPtr = oPtr + n * unrollSize; + float* baseBPtr = bPtr + (n * unrollSize) % bLen; + float v0 = baseXPtr[0]; + float v1 = baseXPtr[1]; + float v2 = baseXPtr[2]; + float v3 = baseXPtr[3]; + + float s0 = baseBPtr[0 % bLen]; + float s1 = baseBPtr[1 % bLen]; + float s2 = baseBPtr[2 % bLen]; + float s3 = baseBPtr[3 % bLen]; + + if (v0 <= 0) + v0 = s0 * v0; + if (v1 <= 0) + v1 = s1 * v1; + if (v2 <= 0) + v2 = s2 * v2; + if (v3 <= 0) + v3 = s3 * v3; + + baseOPtr[0] = v0; + baseOPtr[1] = v1; + baseOPtr[2] = v2; + 
baseOPtr[3] = v3; + } + + private void SigmoidInnerLoop(long n) + { + float* baseXPtr = xPtr + n * unrollSize; + float* baseOPtr = oPtr + n * unrollSize; + float v0 = baseXPtr[0]; + float v1 = baseXPtr[1]; + float v2 = baseXPtr[2]; + float v3 = baseXPtr[3]; + + v0 = 1f / (1f + Mathf.Exp(-v0)); + v1 = 1f / (1f + Mathf.Exp(-v1)); + v2 = 1f / (1f + Mathf.Exp(-v2)); + v3 = 1f / (1f + Mathf.Exp(-v3)); + + baseOPtr[0] = v0; + baseOPtr[1] = v1; + baseOPtr[2] = v2; + baseOPtr[3] = v3; + } + + private void SwishInnerLoop(long n) + { + float* baseXPtr = xPtr + n * unrollSize; + float* baseOPtr = oPtr + n * unrollSize; + float v0 = baseXPtr[0]; + float v1 = baseXPtr[1]; + float v2 = baseXPtr[2]; + float v3 = baseXPtr[3]; + + v0 = v0 / (1f + Mathf.Exp(-v0)); + v1 = v1 / (1f + Mathf.Exp(-v1)); + v2 = v2 / (1f + Mathf.Exp(-v2)); + v3 = v3 / (1f + Mathf.Exp(-v3)); + + baseOPtr[0] = v0; + baseOPtr[1] = v1; + baseOPtr[2] = v2; + baseOPtr[3] = v3; + } + + private void ExpInnerLoop(long n) + { + float* baseXPtr = xPtr + n * unrollSize; + float* baseOPtr = oPtr + n * unrollSize; + float v0 = baseXPtr[0]; + float v1 = baseXPtr[1]; + float v2 = baseXPtr[2]; + float v3 = baseXPtr[3]; + + v0 = Mathf.Exp(v0); + v1 = Mathf.Exp(v1); + v2 = Mathf.Exp(v2); + v3 = Mathf.Exp(v3); + + baseOPtr[0] = v0; + baseOPtr[1] = v1; + baseOPtr[2] = v2; + baseOPtr[3] = v3; + } + + private void SqrtInnerLoop(long n) + { + float* baseXPtr = xPtr + n * unrollSize; + float* baseOPtr = oPtr + n * unrollSize; + float v0 = baseXPtr[0]; + float v1 = baseXPtr[1]; + float v2 = baseXPtr[2]; + float v3 = baseXPtr[3]; + + v0 = Mathf.Sqrt(v0); + v1 = Mathf.Sqrt(v1); + v2 = Mathf.Sqrt(v2); + v3 = Mathf.Sqrt(v3); + + baseOPtr[0] = v0; + baseOPtr[1] = v1; + baseOPtr[2] = v2; + baseOPtr[3] = v3; + } + + private void TanhInnerLoop(long n) + { + float* baseXPtr = xPtr + n * unrollSize; + float* baseOPtr = oPtr + n * unrollSize; + float v0 = baseXPtr[0]; + float v1 = baseXPtr[1]; + float v2 = baseXPtr[2]; + float v3 = baseXPtr[3]; + + v0 = MathfEx.tanh(v0); + v1 = MathfEx.tanh(v1); + v2 = MathfEx.tanh(v2); + v3 = MathfEx.tanh(v3); + + baseOPtr[0] = v0; + baseOPtr[1] = v1; + baseOPtr[2] = v2; + baseOPtr[3] = v3; + } + + private void AddInnerLoop(long n) + { + int i = (int)n * unrollSize; + + int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; + int b1 = 0, h1 = 0, w1 = 0, ch1 = 0; + int b2 = 0, h2 = 0, w2 = 0, ch2 = 0; + int b3 = 0, h3 = 0, w3 = 0, ch3 = 0; + oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0); + oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1); + oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2); + oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3); + + oPtr[i + 0] = xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)] + bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)]; + oPtr[i + 1] = xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)] + bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)]; + oPtr[i + 2] = xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)] + bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)]; + oPtr[i + 3] = xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)] + bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)]; + } + + private void SubInnerLoop(long n) + { + int i = (int)n * unrollSize; + + int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; + int b1 = 0, h1 = 0, w1 = 0, ch1 = 0; + int b2 = 0, h2 = 0, w2 = 0, ch2 = 0; + int b3 = 0, h3 = 0, w3 = 0, ch3 = 0; + oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0); + oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref 
ch1); + oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2); + oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3); + + oPtr[i + 0] = xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)] - bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)]; + oPtr[i + 1] = xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)] - bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)]; + oPtr[i + 2] = xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)] - bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)]; + oPtr[i + 3] = xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)] - bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)]; + } + + private void MulInnerLoop(long n) + { + int i = (int)n * unrollSize; + + int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; + int b1 = 0, h1 = 0, w1 = 0, ch1 = 0; + int b2 = 0, h2 = 0, w2 = 0, ch2 = 0; + int b3 = 0, h3 = 0, w3 = 0, ch3 = 0; + oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0); + oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1); + oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2); + oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3); + + oPtr[i + 0] = xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)] * bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)]; + oPtr[i + 1] = xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)] * bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)]; + oPtr[i + 2] = xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)] * bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)]; + oPtr[i + 3] = xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)] * bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)]; + } + + private void DivInnerLoop(long n) + { + int i = (int)n * unrollSize; + + int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; + int b1 = 0, h1 = 0, w1 = 0, ch1 = 0; + int b2 = 0, h2 = 0, w2 = 0, ch2 = 0; + int b3 = 0, h3 = 0, w3 = 0, ch3 = 0; + oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0); + oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1); + oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2); + oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3); + + oPtr[i + 0] = xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)] / bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)]; + oPtr[i + 1] = xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)] / bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)]; + oPtr[i + 2] = xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)] / bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)]; + oPtr[i + 3] = xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)] / bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)]; + } + + private void MinInnerLoop(long n) + { + int i = (int)n * unrollSize; + + int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; + int b1 = 0, h1 = 0, w1 = 0, ch1 = 0; + int b2 = 0, h2 = 0, w2 = 0, ch2 = 0; + int b3 = 0, h3 = 0, w3 = 0, ch3 = 0; + oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0); + oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1); + oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2); + oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3); + + oPtr[i + 0] = Mathf.Min( xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)] , bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)] ); + oPtr[i + 1] = Mathf.Min( xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)] , bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)] ); + oPtr[i + 2] = Mathf.Min( xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)] , bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)] ); + oPtr[i + 3] = Mathf.Min( 
xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)] , bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)] ); + } + + private void MaxInnerLoop(long n) + { + int i = (int)n * unrollSize; + + int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; + int b1 = 0, h1 = 0, w1 = 0, ch1 = 0; + int b2 = 0, h2 = 0, w2 = 0, ch2 = 0; + int b3 = 0, h3 = 0, w3 = 0, ch3 = 0; + oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0); + oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1); + oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2); + oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3); + + oPtr[i + 0] = Mathf.Max(xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)], bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)]); + oPtr[i + 1] = Mathf.Max(xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)], bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)]); + oPtr[i + 2] = Mathf.Max(xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)], bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)]); + oPtr[i + 3] = Mathf.Max(xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)], bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)]); + } + + private void GreaterInnerLoop(long n) + { + int i = (int)n * unrollSize; + + int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; + int b1 = 0, h1 = 0, w1 = 0, ch1 = 0; + int b2 = 0, h2 = 0, w2 = 0, ch2 = 0; + int b3 = 0, h3 = 0, w3 = 0, ch3 = 0; + oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0); + oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1); + oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2); + oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3); + + oPtr[i + 0] = (xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)] > bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)]) ? 1.0f : 0.0f; + oPtr[i + 1] = (xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)] > bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)]) ? 1.0f : 0.0f; + oPtr[i + 2] = (xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)] > bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)]) ? 1.0f : 0.0f; + oPtr[i + 3] = (xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)] > bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)]) ? 1.0f : 0.0f; + } + + private void GreaterEqualInnerLoop(long n) + { + int i = (int)n * unrollSize; + + int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; + int b1 = 0, h1 = 0, w1 = 0, ch1 = 0; + int b2 = 0, h2 = 0, w2 = 0, ch2 = 0; + int b3 = 0, h3 = 0, w3 = 0, ch3 = 0; + oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0); + oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1); + oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2); + oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3); + + oPtr[i + 0] = (xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)] >= bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)]) ? 1.0f : 0.0f; + oPtr[i + 1] = (xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)] >= bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)]) ? 1.0f : 0.0f; + oPtr[i + 2] = (xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)] >= bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)]) ? 1.0f : 0.0f; + oPtr[i + 3] = (xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)] >= bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)]) ? 
1.0f : 0.0f; + } + + private void LessInnerLoop(long n) + { + int i = (int)n * unrollSize; + + int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; + int b1 = 0, h1 = 0, w1 = 0, ch1 = 0; + int b2 = 0, h2 = 0, w2 = 0, ch2 = 0; + int b3 = 0, h3 = 0, w3 = 0, ch3 = 0; + oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0); + oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1); + oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2); + oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3); + + oPtr[i + 0] = (xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)] < bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)]) ? 1.0f : 0.0f; + oPtr[i + 1] = (xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)] < bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)]) ? 1.0f : 0.0f; + oPtr[i + 2] = (xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)] < bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)]) ? 1.0f : 0.0f; + oPtr[i + 3] = (xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)] < bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)]) ? 1.0f : 0.0f; + } + + private void LessEqualInnerLoop(long n) + { + int i = (int)n * unrollSize; + + int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; + int b1 = 0, h1 = 0, w1 = 0, ch1 = 0; + int b2 = 0, h2 = 0, w2 = 0, ch2 = 0; + int b3 = 0, h3 = 0, w3 = 0, ch3 = 0; + oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0); + oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1); + oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2); + oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3); + + oPtr[i + 0] = (xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)] <= bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)]) ? 1.0f : 0.0f; + oPtr[i + 1] = (xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)] <= bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)]) ? 1.0f : 0.0f; + oPtr[i + 2] = (xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)] <= bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)]) ? 1.0f : 0.0f; + oPtr[i + 3] = (xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)] <= bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)]) ? 1.0f : 0.0f; + } + + private void EqualInnerLoop(long n) + { + int i = (int)n * unrollSize; + + int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; + int b1 = 0, h1 = 0, w1 = 0, ch1 = 0; + int b2 = 0, h2 = 0, w2 = 0, ch2 = 0; + int b3 = 0, h3 = 0, w3 = 0, ch3 = 0; + oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0); + oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1); + oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2); + oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3); + + oPtr[i + 0] = (xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)] == bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)]) ? 1.0f : 0.0f; + oPtr[i + 1] = (xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)] == bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)]) ? 1.0f : 0.0f; + oPtr[i + 2] = (xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)] == bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)]) ? 1.0f : 0.0f; + oPtr[i + 3] = (xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)] == bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)]) ? 
1.0f : 0.0f; + } + + private void LogicalOrInnerLoop(long n) + { + int i = (int)n * unrollSize; + + int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; + int b1 = 0, h1 = 0, w1 = 0, ch1 = 0; + int b2 = 0, h2 = 0, w2 = 0, ch2 = 0; + int b3 = 0, h3 = 0, w3 = 0, ch3 = 0; + oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0); + oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1); + oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2); + oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3); + + oPtr[i + 0] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)]) || Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)])) ? 1.0f : 0.0f; + oPtr[i + 1] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)]) || Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)])) ? 1.0f : 0.0f; + oPtr[i + 2] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)]) || Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)])) ? 1.0f : 0.0f; + oPtr[i + 3] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)]) || Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)])) ? 1.0f : 0.0f; + } + + private void LogicalAndInnerLoop(long n) + { + int i = (int)n * unrollSize; + + int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; + int b1 = 0, h1 = 0, w1 = 0, ch1 = 0; + int b2 = 0, h2 = 0, w2 = 0, ch2 = 0; + int b3 = 0, h3 = 0, w3 = 0, ch3 = 0; + oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0); + oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1); + oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2); + oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3); + + oPtr[i + 0] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)]) && Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)])) ? 1.0f : 0.0f; + oPtr[i + 1] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)]) && Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)])) ? 1.0f : 0.0f; + oPtr[i + 2] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)]) && Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)])) ? 1.0f : 0.0f; + oPtr[i + 3] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)]) && Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)])) ? 1.0f : 0.0f; + } + + private void LogicalXorInnerLoop(long n) + { + int i = (int)n * unrollSize; + + int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; + int b1 = 0, h1 = 0, w1 = 0, ch1 = 0; + int b2 = 0, h2 = 0, w2 = 0, ch2 = 0; + int b3 = 0, h3 = 0, w3 = 0, ch3 = 0; + oShape.GetPositionsFromIndex(i + 0, ref b0, ref h0, ref w0, ref ch0); + oShape.GetPositionsFromIndex(i + 1, ref b1, ref h1, ref w1, ref ch1); + oShape.GetPositionsFromIndex(i + 2, ref b2, ref h2, ref w2, ref ch2); + oShape.GetPositionsFromIndex(i + 3, ref b3, ref h3, ref w3, ref ch3); + + oPtr[i + 0] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b0, h0, w0, ch0)]) ^ Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b0, h0, w0, ch0)])) ? 1.0f : 0.0f; + oPtr[i + 1] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b1, h1, w1, ch1)]) ^ Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b1, h1, w1, ch1)])) ? 1.0f : 0.0f; + oPtr[i + 2] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b2, h2, w2, ch2)]) ^ Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b2, h2, w2, ch2)])) ? 
1.0f : 0.0f; + oPtr[i + 3] = (Convert.ToBoolean(xPtr[xShape.IndexWithBroadcast(b3, h3, w3, ch3)]) ^ Convert.ToBoolean(bPtr[bShape.IndexWithBroadcast(b3, h3, w3, ch3)])) ? 1.0f : 0.0f; + } + + private void AddInnerLoopNoBroadcast(long n) + { + int i = (int)n * unrollSize; + + oPtr[i + 0] = xPtr[(i + 0) % xLen] + bPtr[(i + 0) % bLen]; + oPtr[i + 1] = xPtr[(i + 1) % xLen] + bPtr[(i + 1) % bLen]; + oPtr[i + 2] = xPtr[(i + 2) % xLen] + bPtr[(i + 2) % bLen]; + oPtr[i + 3] = xPtr[(i + 3) % xLen] + bPtr[(i + 3) % bLen]; + } + + private void SubInnerLoopNoBroadcast(long n) + { + int i = (int)n * unrollSize; + + oPtr[i + 0] = xPtr[(i + 0) % xLen] - bPtr[(i + 0) % bLen]; + oPtr[i + 1] = xPtr[(i + 1) % xLen] - bPtr[(i + 1) % bLen]; + oPtr[i + 2] = xPtr[(i + 2) % xLen] - bPtr[(i + 2) % bLen]; + oPtr[i + 3] = xPtr[(i + 3) % xLen] - bPtr[(i + 3) % bLen]; + } + + private void MulInnerLoopNoBroadcast(long n) + { + int i = (int)n * unrollSize; + + oPtr[i + 0] = xPtr[(i + 0) % xLen] * bPtr[(i + 0) % bLen]; + oPtr[i + 1] = xPtr[(i + 1) % xLen] * bPtr[(i + 1) % bLen]; + oPtr[i + 2] = xPtr[(i + 2) % xLen] * bPtr[(i + 2) % bLen]; + oPtr[i + 3] = xPtr[(i + 3) % xLen] * bPtr[(i + 3) % bLen]; + } + + private void DivInnerLoopNoBroadcast(long n) + { + int i = (int)n * unrollSize; + + oPtr[i + 0] = xPtr[(i + 0) % xLen] / bPtr[(i + 0) % bLen]; + oPtr[i + 1] = xPtr[(i + 1) % xLen] / bPtr[(i + 1) % bLen]; + oPtr[i + 2] = xPtr[(i + 2) % xLen] / bPtr[(i + 2) % bLen]; + oPtr[i + 3] = xPtr[(i + 3) % xLen] / bPtr[(i + 3) % bLen]; + } + + private void MinInnerLoopNoBroadcast(long n) + { + int i = (int)n * unrollSize; + + oPtr[i + 0] = Mathf.Min(xPtr[(i + 0) % xLen], bPtr[(i + 0) % bLen]); + oPtr[i + 1] = Mathf.Min(xPtr[(i + 1) % xLen], bPtr[(i + 1) % bLen]); + oPtr[i + 2] = Mathf.Min(xPtr[(i + 2) % xLen], bPtr[(i + 2) % bLen]); + oPtr[i + 3] = Mathf.Min(xPtr[(i + 3) % xLen], bPtr[(i + 3) % bLen]); + } + + private void MaxInnerLoopNoBroadcast(long n) + { + int i = (int)n * unrollSize; + + oPtr[i + 0] = Mathf.Max(xPtr[(i + 0) % xLen], bPtr[(i + 0) % bLen]); + oPtr[i + 1] = Mathf.Max(xPtr[(i + 1) % xLen], bPtr[(i + 1) % bLen]); + oPtr[i + 2] = Mathf.Max(xPtr[(i + 2) % xLen], bPtr[(i + 2) % bLen]); + oPtr[i + 3] = Mathf.Max(xPtr[(i + 3) % xLen], bPtr[(i + 3) % bLen]); + } + + private void GreaterInnerLoopNoBroadcast(long n) + { + int i = (int)n * unrollSize; + + oPtr[i + 0] = (xPtr[(i + 0) % xLen] > bPtr[(i + 0) % bLen]) ? 1.0f : 0.0f; + oPtr[i + 1] = (xPtr[(i + 1) % xLen] > bPtr[(i + 1) % bLen]) ? 1.0f : 0.0f; + oPtr[i + 2] = (xPtr[(i + 2) % xLen] > bPtr[(i + 2) % bLen]) ? 1.0f : 0.0f; + oPtr[i + 3] = (xPtr[(i + 3) % xLen] > bPtr[(i + 3) % bLen]) ? 1.0f : 0.0f; + } + + private void GreaterEqualInnerLoopNoBroadcast(long n) + { + int i = (int)n * unrollSize; + + oPtr[i + 0] = (xPtr[(i + 0) % xLen] >= bPtr[(i + 0) % bLen]) ? 1.0f : 0.0f; + oPtr[i + 1] = (xPtr[(i + 1) % xLen] >= bPtr[(i + 1) % bLen]) ? 1.0f : 0.0f; + oPtr[i + 2] = (xPtr[(i + 2) % xLen] >= bPtr[(i + 2) % bLen]) ? 1.0f : 0.0f; + oPtr[i + 3] = (xPtr[(i + 3) % xLen] >= bPtr[(i + 3) % bLen]) ? 1.0f : 0.0f; + } + + private void LessInnerLoopNoBroadcast(long n) + { + int i = (int)n * unrollSize; + + oPtr[i + 0] = (xPtr[(i + 0) % xLen] < bPtr[(i + 0) % bLen]) ? 1.0f : 0.0f; + oPtr[i + 1] = (xPtr[(i + 1) % xLen] < bPtr[(i + 1) % bLen]) ? 1.0f : 0.0f; + oPtr[i + 2] = (xPtr[(i + 2) % xLen] < bPtr[(i + 2) % bLen]) ? 1.0f : 0.0f; + oPtr[i + 3] = (xPtr[(i + 3) % xLen] < bPtr[(i + 3) % bLen]) ? 
1.0f : 0.0f; + } + + private void LessEqualInnerLoopNoBroadcast(long n) + { + int i = (int)n * unrollSize; + + oPtr[i + 0] = (xPtr[(i + 0) % xLen] <= bPtr[(i + 0) % bLen]) ? 1.0f : 0.0f; + oPtr[i + 1] = (xPtr[(i + 1) % xLen] <= bPtr[(i + 1) % bLen]) ? 1.0f : 0.0f; + oPtr[i + 2] = (xPtr[(i + 2) % xLen] <= bPtr[(i + 2) % bLen]) ? 1.0f : 0.0f; + oPtr[i + 3] = (xPtr[(i + 3) % xLen] <= bPtr[(i + 3) % bLen]) ? 1.0f : 0.0f; + } + + private void EqualInnerLoopNoBroadcast(long n) + { + int i = (int)n * unrollSize; + + oPtr[i + 0] = (xPtr[(i + 0) % xLen] == bPtr[(i + 0) % bLen]) ? 1.0f : 0.0f; + oPtr[i + 1] = (xPtr[(i + 1) % xLen] == bPtr[(i + 1) % bLen]) ? 1.0f : 0.0f; + oPtr[i + 2] = (xPtr[(i + 2) % xLen] == bPtr[(i + 2) % bLen]) ? 1.0f : 0.0f; + oPtr[i + 3] = (xPtr[(i + 3) % xLen] == bPtr[(i + 3) % bLen]) ? 1.0f : 0.0f; + } + + private void LogicalOrInnerLoopNoBroadcast(long n) + { + int i = (int)n * unrollSize; + + oPtr[i + 0] = (Convert.ToBoolean(xPtr[(i + 0) % xLen]) || Convert.ToBoolean(bPtr[(i + 0) % bLen])) ? 1.0f : 0.0f; + oPtr[i + 1] = (Convert.ToBoolean(xPtr[(i + 1) % xLen]) || Convert.ToBoolean(bPtr[(i + 1) % bLen])) ? 1.0f : 0.0f; + oPtr[i + 2] = (Convert.ToBoolean(xPtr[(i + 2) % xLen]) || Convert.ToBoolean(bPtr[(i + 2) % bLen])) ? 1.0f : 0.0f; + oPtr[i + 3] = (Convert.ToBoolean(xPtr[(i + 3) % xLen]) || Convert.ToBoolean(bPtr[(i + 3) % bLen])) ? 1.0f : 0.0f; + } + + private void LogicalAndInnerLoopNoBroadcast(long n) + { + int i = (int)n * unrollSize; + + oPtr[i + 0] = (Convert.ToBoolean(xPtr[(i + 0) % xLen]) && Convert.ToBoolean(bPtr[(i + 0) % bLen])) ? 1.0f : 0.0f; + oPtr[i + 1] = (Convert.ToBoolean(xPtr[(i + 1) % xLen]) && Convert.ToBoolean(bPtr[(i + 1) % bLen])) ? 1.0f : 0.0f; + oPtr[i + 2] = (Convert.ToBoolean(xPtr[(i + 2) % xLen]) && Convert.ToBoolean(bPtr[(i + 2) % bLen])) ? 1.0f : 0.0f; + oPtr[i + 3] = (Convert.ToBoolean(xPtr[(i + 3) % xLen]) && Convert.ToBoolean(bPtr[(i + 3) % bLen])) ? 1.0f : 0.0f; + } + + private void LogicalXorInnerLoopNoBroadcast(long n) + { + int i = (int)n * unrollSize; + + oPtr[i + 0] = (Convert.ToBoolean(xPtr[(i + 0) % xLen]) ^ Convert.ToBoolean(bPtr[(i + 0) % bLen])) ? 1.0f : 0.0f; + oPtr[i + 1] = (Convert.ToBoolean(xPtr[(i + 1) % xLen]) ^ Convert.ToBoolean(bPtr[(i + 1) % bLen])) ? 1.0f : 0.0f; + oPtr[i + 2] = (Convert.ToBoolean(xPtr[(i + 2) % xLen]) ^ Convert.ToBoolean(bPtr[(i + 2) % bLen])) ? 1.0f : 0.0f; + oPtr[i + 3] = (Convert.ToBoolean(xPtr[(i + 3) % xLen]) ^ Convert.ToBoolean(bPtr[(i + 3) % bLen])) ? 1.0f : 0.0f; + } + + private void LogicalNotInnerLoop(long n) + { + int i = (int)n * unrollSize; + + oPtr[i + 0] = Convert.ToBoolean(xPtr[i + 0]) ? 0.0f : 1.0f; + oPtr[i + 1] = Convert.ToBoolean(xPtr[i + 1]) ? 0.0f : 1.0f; + oPtr[i + 2] = Convert.ToBoolean(xPtr[i + 2]) ? 0.0f : 1.0f; + oPtr[i + 3] = Convert.ToBoolean(xPtr[i + 3]) ? 
0.0f : 1.0f; + } + + private static void ClampHWToTensorShape(TensorShape shape, ref int height, ref int width) + { + width = Math.Max(width, 0); + height = Math.Max(height, 0); + width = Math.Min(width, shape.width - 1); + height = Math.Min(height, shape.height - 1); + } + private void Border2DInnerLoop(long n) + { + int i = (int)n; + oPtr[i] = alpha; + } + private void Pad2DEdgeInnerLoop(long n) + { + int i = (int)n; + int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; + oShape.GetPositionsFromIndex(i, ref b0, ref h0, ref w0, ref ch0); + h0 -= prePadY; + w0 -= prePadX; + + ClampHWToTensorShape(xShape, ref h0, ref w0); + + oPtr[i] = xPtr[xShape.Index(b0, h0, w0, ch0)]; + } + + private void Pad2DReflectInnerLoop(long n) + { + int i = (int)n; + int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; + oShape.GetPositionsFromIndex(i, ref b0, ref h0, ref w0, ref ch0); + h0 -= prePadY; + w0 -= prePadX; + + int lastXIndex = xShape.width - 1; + int lastYIndex = xShape.height - 1; + + if (w0 < 0) + w0 = -w0; + else if (w0 > lastXIndex) + w0 = lastXIndex - (w0 - lastXIndex); + + if (h0 < 0) + h0 = -h0; + else if (h0 > lastYIndex) + h0 = lastYIndex - (h0 - lastYIndex); + + ClampHWToTensorShape(xShape, ref h0, ref w0); + + oPtr[i] = xPtr[xShape.Index(b0, h0, w0, ch0)]; + } + + private void Pad2DSymmetricInnerLoop(long n) + { + int i = (int)n; + int b0 = 0, h0 = 0, w0 = 0, ch0 = 0; + oShape.GetPositionsFromIndex(i, ref b0, ref h0, ref w0, ref ch0); + h0 -= prePadY; + w0 -= prePadX; + + int lastXIndex = xShape.width - 1; + int lastYIndex = xShape.height - 1; + + if (w0 < 0) + w0 = -w0 - 1; + else if (w0 > lastXIndex) + w0 = lastXIndex - (w0 - lastXIndex) + 1; + + if (h0 < 0) + h0 = -h0 - 1; + else if (h0 > lastYIndex) + h0 = lastYIndex - (h0 - lastYIndex) + 1; + + ClampHWToTensorShape(xShape, ref h0, ref w0); + + oPtr[i] = xPtr[xShape.Index(b0, h0, w0, ch0)]; + } + + private float Add(float a, float b) + { + return a + b; + } + private float Sub(float a, float b) + { + return a - b; + } + private float Mul(float a, float b) + { + return a * b; + } + private float Div(float a, float b) + { + return a / b; + } + private float Min(float a, float b) + { + return Mathf.Min(a, b); + } + private float Max(float a, float b) + { + return Mathf.Max(a, b); + } + private float Greater(float a, float b) + { + return Convert.ToSingle(a > b); + } + private float GreaterEqual(float a, float b) + { + return Convert.ToSingle(a >= b); + } + private float Less(float a, float b) + { + return Convert.ToSingle(a < b); + } + private float LessEqual(float a, float b) + { + return Convert.ToSingle(a <= b); + } + private float Equal(float a, float b) + { + return Convert.ToSingle(a == b); + } + private float LogicalOr(float a, float b) + { + return Convert.ToSingle(Convert.ToBoolean(a) || Convert.ToBoolean(b)); + } + private float LogicalAnd(float a, float b) + { + return Convert.ToSingle(Convert.ToBoolean(a) && Convert.ToBoolean(b)); + } + private float LogicalXor(float a, float b) + { + return Convert.ToSingle(Convert.ToBoolean(a) ^ Convert.ToBoolean(b)); + } + private float LogicalNot(float a) + { + return Convert.ToSingle(!Convert.ToBoolean(a)); + } + } + + +} // namespace Barracuda diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaUnsafeArrayCPU.cs.meta b/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaUnsafeArrayCPU.cs.meta new file mode 100644 index 0000000..01a107c --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/BarracudaUnsafeArrayCPU.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: c077f9591cc6d4804bc89b66a2a67c0d 
+MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/CompareOps.cs b/Assets/Coach-ML/Barracuda/Core/Backends/CompareOps.cs new file mode 100644 index 0000000..4333ca8 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/CompareOps.cs @@ -0,0 +1,602 @@ +namespace Barracuda { + + +public class CompareOps : IOps +{ + private IOps m_Ops1; + private IOps m_Ops2; + + public CompareOps(IOps ops1, IOps ops2) + { + m_Ops1 = ops1; + m_Ops2 = ops2; + } + + public virtual void WaitForCompletion(Tensor x) + { + m_Ops1.WaitForCompletion(x); + m_Ops2.WaitForCompletion(x); + } + + Tensor IOps.MatMul(Tensor X, bool xTranspose, Tensor W, bool wTranspose) + { + var Y = m_Ops1.MatMul(X, xTranspose, W, wTranspose); + var Z = m_Ops2.MatMul(X, xTranspose, W, wTranspose); + CheckSame(Y, Z, Layer.Type.MatMul); + return Y; + } + Tensor IOps.Dense(Tensor X, Tensor W, Tensor B) + { + var Y = m_Ops1.Dense(X, W, B); + var Z = m_Ops2.Dense(X, W, B); + CheckSame(Y, Z, Layer.Type.Dense); + return Y; + } + + Tensor IOps.Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad) + { + var Y = m_Ops1.Conv2D(X, K, B, stride, pad); + var Z = m_Ops2.Conv2D(X, K, B, stride, pad); + CheckSame(Y, Z, Layer.Type.Conv2D); + return Y; + } + Tensor IOps.DepthwiseConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad) + { + var Y = m_Ops1.DepthwiseConv2D(X, K, B, stride, pad); + var Z = m_Ops2.DepthwiseConv2D(X, K, B, stride, pad); + CheckSame(Y, Z, Layer.Type.DepthwiseConv2D); + return Y; + } + Tensor IOps.Conv2DTrans(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, int[] outputAdjustment) + { + var Y = m_Ops1.Conv2DTrans(X, K, B, stride, pad, outputAdjustment); + var Z = m_Ops2.Conv2DTrans(X, K, B, stride, pad, outputAdjustment); + CheckSame(Y, Z, Layer.Type.Conv2DTrans); + return Y; + } + Tensor IOps.Upsample2D(Tensor X, int[] size) + { + var Y = m_Ops1.Upsample2D(X, size); + var Z = m_Ops2.Upsample2D(X, size); + CheckSame(Y, Z, Layer.Type.Upsample2D); + return Y; + } + Tensor IOps.MaxPool2D(Tensor X, int[] pool, int[] stride, int[] pad) + { + var Y = m_Ops1.MaxPool2D(X, pool, stride, pad); + var Z = m_Ops2.MaxPool2D(X, pool, stride, pad); + CheckSame(Y, Z, Layer.Type.MaxPool2D); + return Y; + } + Tensor IOps.AvgPool2D(Tensor X, int[] pool, int[] stride, int[] pad) + { + var Y = m_Ops1.AvgPool2D(X, pool, stride, pad); + var Z = m_Ops2.AvgPool2D(X, pool, stride, pad); + CheckSame(Y, Z, Layer.Type.AvgPool2D); + return Y; + } + Tensor IOps.GlobalMaxPool2D(Tensor X) + { + var Y = m_Ops1.GlobalMaxPool2D(X); + var Z = m_Ops2.GlobalMaxPool2D(X); + CheckSame(Y, Z, Layer.Type.GlobalMaxPool2D); + return Y; + } + Tensor IOps.GlobalAvgPool2D(Tensor X) + { + var Y = m_Ops1.GlobalAvgPool2D(X); + var Z = m_Ops2.GlobalAvgPool2D(X); + CheckSame(Y, Z, Layer.Type.GlobalAvgPool2D); + return Y; + } + Tensor IOps.GlobalAvgVariancePool2D(Tensor X) + { + var Y = m_Ops1.GlobalAvgVariancePool2D(X); + var Z = m_Ops2.GlobalAvgVariancePool2D(X); + CheckSame(Y, Z, Layer.Type.GlobalAvgPool2D); + return Y; + } + Tensor IOps.Border2D(Tensor x, int[] pad, float value) + { + var Y = m_Ops1.Border2D(x, pad, value); + var Z = m_Ops2.Border2D(x, pad, value); + CheckSame(Y, Z, Layer.Type.Border2D); + return Y; + } + Tensor IOps.Pad2DReflect(Tensor x, int[] pad) + { + var Y = m_Ops1.Pad2DReflect(x, pad); + var Z = m_Ops2.Pad2DReflect(x, pad); + CheckSame(Y, Z, 
Layer.Type.Pad2DReflect); + return Y; + } + Tensor IOps.Pad2DSymmetric(Tensor x, int[] pad) + { + var Y = m_Ops1.Pad2DSymmetric(x, pad); + var Z = m_Ops2.Pad2DSymmetric(x, pad); + CheckSame(Y, Z, Layer.Type.Pad2DSymmetric); + return Y; + } + Tensor IOps.Pad2DEdge(Tensor x, int[] pad) + { + var Y = m_Ops1.Pad2DEdge(x, pad); + var Z = m_Ops2.Pad2DEdge(x, pad); + CheckSame(Y, Z, Layer.Type.Pad2DEdge); + return Y; + } + Tensor IOps.ScaleBias(Tensor X, Tensor S, Tensor B) + { + var Y = m_Ops1.ScaleBias(X, S, B); + var Z = m_Ops2.ScaleBias(X, S, B); + CheckSame(Y, Z, Layer.Type.ScaleBias); + return Y; + } + Tensor IOps.Normalization(Tensor X, Tensor S, Tensor B, int pool, int axis, float epsilon) + { + var Y = m_Ops1.Normalization(X, S, B, pool, axis, epsilon); + var Z = m_Ops2.Normalization(X, S, B, pool, axis, epsilon); + CheckSame(Y, Z, Layer.Type.Normalization); + return Y; + } + Tensor IOps.LRN(Tensor X, float alpha, float beta, float bias, int size) + { + var Y = m_Ops1.LRN(X, alpha, beta, bias, size); + var Z = m_Ops2.LRN(X, alpha, beta, bias, size); + CheckSame(Y, Z, Layer.Type.LRN); + return Y; + } + + Tensor IOps.Dropout(Tensor X, float alpha) + { + var Y = m_Ops1.Dropout(X, alpha); + var Z = m_Ops2.Dropout(X, alpha); + CheckSame(Y, Z, Layer.Type.Dropout); + return Y; + } + + Tensor IOps.RandomNormal(TensorShape s, float mean, float scale, int seed) + { + var Y = m_Ops1.RandomNormal(s, mean, scale, seed); + var Z = m_Ops2.RandomNormal(s, mean, scale, seed); + CheckSame(Y, Z, Layer.Type.RandomNormal); + return Y; + } + Tensor IOps.RandomUniform(TensorShape s, float mean, float scale, int seed) + { + var Y = m_Ops1.RandomUniform(s, mean, scale, seed); + var Z = m_Ops2.RandomUniform(s, mean, scale, seed); + CheckSame(Y, Z, Layer.Type.RandomUniform); + return Y; + } + Tensor IOps.Multinomial(Tensor X, int count, int seed) + { + var Y = m_Ops1.Multinomial(X, count, seed); + var Z = m_Ops2.Multinomial(X, count, seed); + CheckSame(Y, Z, Layer.Type.Multinomial); + return Y; + } + Tensor IOps.OneHot(Tensor X, int depth, float onValue, float offValue) + { + var Y = m_Ops1.OneHot(X, depth, onValue, offValue); + var Z = m_Ops2.OneHot(X, depth, onValue, offValue); + CheckSame(Y, Z, Layer.Type.OneHot); + return Y; + } + + Tensor IOps.Relu(Tensor X) + { + var Y = m_Ops1.Relu(X); + var Z = m_Ops2.Relu(X); + CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Relu); + return Y; + } + Tensor IOps.Softmax(Tensor X) + { + var Y = m_Ops1.Softmax(X); + var Z = m_Ops2.Softmax(X); + CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Softmax); + return Y; + } + Tensor IOps.LogSoftmax(Tensor X) + { + var Y = m_Ops1.LogSoftmax(X); + var Z = m_Ops2.LogSoftmax(X); + CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.LogSoftmax); + return Y; + } + Tensor IOps.Tanh(Tensor X) + { + var Y = m_Ops1.Tanh(X); + var Z = m_Ops2.Tanh(X); + CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Tanh); + return Y; + } + Tensor IOps.Sigmoid(Tensor X) + { + var Y = m_Ops1.Sigmoid(X); + var Z = m_Ops2.Sigmoid(X); + CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Sigmoid); + return Y; + } + Tensor IOps.Elu(Tensor X, float alpha) + { + var Y = m_Ops1.Elu(X, alpha); + var Z = m_Ops2.Elu(X, alpha); + CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Elu); + return Y; + } + Tensor IOps.Relu6(Tensor X) + { + var Y = m_Ops1.Relu6(X); + var Z = m_Ops2.Relu6(X); + CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Relu6); + return Y; + } + Tensor 
IOps.LeakyRelu(Tensor X, float alpha) + { + var Y = m_Ops1.LeakyRelu(X, alpha); + var Z = m_Ops2.LeakyRelu(X, alpha); + CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.LeakyRelu); + return Y; + } + Tensor IOps.Selu(Tensor X, float alpha, float gamma) + { + var Y = m_Ops1.Selu(X, alpha, gamma); + var Z = m_Ops2.Selu(X, alpha, gamma); + CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Selu); + return Y; + } + Tensor IOps.PRelu(Tensor X, Tensor S) + { + var Y = m_Ops1.PRelu(X, S); + var Z = m_Ops2.PRelu(X, S); + CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.PRelu); + return Y; + } + Tensor IOps.Swish(Tensor X) + { + var Y = m_Ops1.Swish(X); + var Z = m_Ops2.Swish(X); + CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Swish); + return Y; + } + + Tensor IOps.Abs(Tensor X) + { + var Y = m_Ops1.Abs(X); + var Z = m_Ops2.Abs(X); + CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Abs); + return Y; + } + Tensor IOps.Neg(Tensor X) + { + var Y = m_Ops1.Neg(X); + var Z = m_Ops2.Neg(X); + CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Neg); + return Y; + } + Tensor IOps.Ceil(Tensor X) + { + var Y = m_Ops1.Ceil(X); + var Z = m_Ops2.Ceil(X); + CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Ceil); + return Y; + } + Tensor IOps.Clip(Tensor X, float min, float max) + { + var Y = m_Ops1.Clip(X, min, max); + var Z = m_Ops2.Clip(X, min, max); + CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Clip); + return Y; + } + Tensor IOps.Floor(Tensor X) + { + var Y = m_Ops1.Floor(X); + var Z = m_Ops2.Floor(X); + CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Floor); + return Y; + } + + Tensor IOps.Reciprocal(Tensor X) + { + var Y = m_Ops1.Reciprocal(X); + var Z = m_Ops2.Reciprocal(X); + CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Reciprocal); + return Y; + } + Tensor IOps.Pow(Tensor X, float alpha) + { + var Y = m_Ops1.Pow(X, alpha); + var Z = m_Ops2.Pow(X, alpha); + CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Pow); + return Y; + } + Tensor IOps.Exp(Tensor X) + { + var Y = m_Ops1.Exp(X); + var Z = m_Ops2.Exp(X); + CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Exp); + return Y; + } + Tensor IOps.Log(Tensor X) + { + var Y = m_Ops1.Log(X); + var Z = m_Ops2.Log(X); + CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Log); + return Y; + } + Tensor IOps.Sqrt(Tensor X) + { + var Y = m_Ops1.Sqrt(X); + var Z = m_Ops2.Sqrt(X); + CheckSame(Y, Z, Layer.Type.Activation + " " + Layer.Activation.Sqrt); + return Y; + } + + Tensor IOps.Add(Tensor[] tensors) + { + var Y = m_Ops1.Add(tensors); + var Z = m_Ops2.Add(tensors); + CheckSame(Y, Z, Layer.Type.Add); + return Y; + } + Tensor IOps.Sub(Tensor[] tensors) + { + var Y = m_Ops1.Sub(tensors); + var Z = m_Ops2.Sub(tensors); + CheckSame(Y, Z, Layer.Type.Sub); + return Y; + } + Tensor IOps.Mul(Tensor[] tensors) + { + var Y = m_Ops1.Mul(tensors); + var Z = m_Ops2.Mul(tensors); + CheckSame(Y, Z, Layer.Type.Mul, tensors); + return Y; + } + Tensor IOps.Div(Tensor[] tensors) + { + var Y = m_Ops1.Div(tensors); + var Z = m_Ops2.Div(tensors); + CheckSame(Y, Z, Layer.Type.Div); + return Y; + } + Tensor IOps.Pow(Tensor[] tensors) + { + var Y = m_Ops1.Pow(tensors); + var Z = m_Ops2.Pow(tensors); + CheckSame(Y, Z, Layer.Type.Pow); + return Y; + } + Tensor IOps.Min(Tensor[] tensors) + { + var Y = m_Ops1.Min(tensors); + var Z = m_Ops2.Min(tensors); + CheckSame(Y, Z, Layer.Type.Min); + return Y; + } + 
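+        // Illustrative usage sketch: CompareOps wraps two IOps backends, runs every
+        // op on both, and reports divergences through CheckSame()/D.LogWarning.
+        // Only CompareOps, GenericWorker and DefaultVars below come from this patch;
+        // `fastOps`, `referenceOps`, `model` and `input` are hypothetical placeholders
+        // (e.g. a compute backend cross-checked against the reference CPU backend):
+        //
+        //   IOps ops   = new CompareOps(fastOps, referenceOps); // results come from the first backend
+        //   var worker = new GenericWorker(model, ops, new DefaultVars());
+        //   worker.Execute(input);                              // a warning is logged on any mismatch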
Tensor IOps.Max(Tensor[] tensors) + { + var Y = m_Ops1.Max(tensors); + var Z = m_Ops2.Max(tensors); + CheckSame(Y, Z, Layer.Type.Max); + return Y; + } + Tensor IOps.Mean(Tensor[] tensors) + { + var Y = m_Ops1.Mean(tensors); + var Z = m_Ops2.Mean(tensors); + CheckSame(Y, Z, Layer.Type.Mean); + return Y; + } + + Tensor IOps.ReduceMax(Tensor X, int axis) + { + var Y = m_Ops1.ReduceMax(X, axis); + var Z = m_Ops2.ReduceMax(X, axis); + CheckSame(Y, Z, Layer.Type.ReduceMax); + return Y; + } + Tensor IOps.ReduceMean(Tensor X, int axis) + { + var Y = m_Ops1.ReduceMean(X, axis); + var Z = m_Ops2.ReduceMean(X, axis); + CheckSame(Y, Z, Layer.Type.ReduceMean); + return Y; + } + Tensor IOps.ReduceMin(Tensor X, int axis) + { + var Y = m_Ops1.ReduceMin(X, axis); + var Z = m_Ops2.ReduceMin(X, axis); + CheckSame(Y, Z, Layer.Type.ReduceMin); + return Y; + } + Tensor IOps.ReduceProd(Tensor X, int axis) + { + var Y = m_Ops1.ReduceProd(X, axis); + var Z = m_Ops2.ReduceProd(X, axis); + CheckSame(Y, Z, Layer.Type.ReduceProd); + return Y; + } + Tensor IOps.ReduceSum(Tensor X, int axis) + { + var Y = m_Ops1.ReduceSum(X, axis); + var Z = m_Ops2.ReduceSum(X, axis); + CheckSame(Y, Z, Layer.Type.ReduceSum); + return Y; + } + + Tensor IOps.Greater(Tensor a, Tensor b) + { + var Y = m_Ops1.Greater(a, b); + var Z = m_Ops2.Greater(a, b); + CheckSame(Y, Z, Layer.Type.Greater); + return Y; + } + Tensor IOps.GreaterEqual(Tensor a, Tensor b) + { + var Y = m_Ops1.GreaterEqual(a, b); + var Z = m_Ops2.GreaterEqual(a, b); + CheckSame(Y, Z, Layer.Type.GreaterEqual); + return Y; + } + Tensor IOps.Less(Tensor a, Tensor b) + { + var Y = m_Ops1.Less(a, b); + var Z = m_Ops2.Less(a, b); + CheckSame(Y, Z, Layer.Type.Less); + return Y; + + } + Tensor IOps.LessEqual(Tensor a, Tensor b) + { + var Y = m_Ops1.LessEqual(a, b); + var Z = m_Ops2.LessEqual(a, b); + CheckSame(Y, Z, Layer.Type.LessEqual); + return Y; + } + Tensor IOps.Equal(Tensor a, Tensor b) + { + var Y = m_Ops1.Equal(a, b); + var Z = m_Ops2.Equal(a, b); + CheckSame(Y, Z, Layer.Type.Equal); + return Y; + } + Tensor IOps.LogicalOr(Tensor a, Tensor b) + { + var Y = m_Ops1.LogicalOr(a, b); + var Z = m_Ops2.LogicalOr(a, b); + CheckSame(Y, Z, Layer.Type.LogicalOr); + return Y; + } + Tensor IOps.LogicalAnd(Tensor a, Tensor b) + { + var Y = m_Ops1.LogicalAnd(a, b); + var Z = m_Ops2.LogicalAnd(a, b); + CheckSame(Y, Z, Layer.Type.LogicalAnd); + return Y; + } + Tensor IOps.LogicalXor(Tensor a, Tensor b) + { + var Y = m_Ops1.LogicalXor(a, b); + var Z = m_Ops2.LogicalXor(a, b); + CheckSame(Y, Z, Layer.Type.LogicalXor); + return Y; + } + Tensor IOps.LogicalNot(Tensor x) + { + var Y = m_Ops1.LogicalNot(x); + var Z = m_Ops2.LogicalNot(x); + CheckSame(Y, Z, Layer.Type.LogicalNot); + return Y; + } + + Tensor IOps.Flatten(Tensor X) + { + var Y = m_Ops1.Flatten(X); + var Z = m_Ops2.Flatten(X); + CheckSame(Y, Z, Layer.Type.Flatten); + return Y; + } + Tensor IOps.Reshape(Tensor X, TensorShape shape) + { + var Y = m_Ops1.Reshape(X, shape); + var Z = m_Ops2.Reshape(X, shape); + CheckSame(Y, Z, Layer.Type.Reshape); + return Y; + } + Tensor IOps.Transpose(Tensor X) + { + var Y = m_Ops1.Transpose(X); + var Z = m_Ops2.Transpose(X); + CheckSame(Y, Z, Layer.Type.Transpose); + return Y; + } + + Tensor IOps.Concat(Tensor[] tensors, int axis) + { + var Y = m_Ops1.Concat(tensors, axis); + var Z = m_Ops2.Concat(tensors, axis); + CheckSame(Y, Z, Layer.Type.Concat); + return Y; + } + Tensor IOps.StridedSlice(Tensor X, int[] starts, int[] ends, int[] strides) + { + var Y = m_Ops1.StridedSlice(X, starts, ends, 
strides); + var Z = m_Ops2.StridedSlice(X, starts, ends, strides); + CheckSame(Y, Z, Layer.Type.StridedSlice); + return Y; + } + Tensor IOps.Tile(Tensor X, int[] repeats) + { + var Y = m_Ops1.Tile(X, repeats); + var Z = m_Ops2.Tile(X, repeats); + CheckSame(Y, Z, Layer.Type.Tile); + return Y; + } + + Tensor IOps.Prepare(Tensor X) + { + var Y = m_Ops1.Prepare(X); + var Z = m_Ops2.Prepare(X); + CheckSame(Y, Z, "Prepare"); + return Y; + } + + void IOps.ResetAllocator(bool keepCachedMemory) + { + m_Ops1.ResetAllocator(keepCachedMemory); + m_Ops2.ResetAllocator(keepCachedMemory); + } + + // ----- + static public void CheckSame(Tensor X, Tensor Y, Layer.Type type, params Tensor[] inputs) + { + CheckSame(X, Y, type.ToString(), inputs); + } + + static public void CheckSame(Tensor X, Tensor Y, string opName, params Tensor[] inputs) + { + if (!X.Approximately(Y)) + { + D.LogWarning("Tensors not equal after " + opName + " max error: " + X.MaxDifference(Y)); + D.Log("First: " + X.shape); + D.Log("Second:" + Y.shape); + + X.PrintDataPart(X.channels * X.width * 2); + Y.PrintDataPart(Y.channels * Y.width * 2); + + for (var i = 0; i < inputs.Length; i++) + { + inputs[i].PrintDataPart(32, "input_" + i); + } + } + if (X.tensorOnDevice != Y.tensorOnDevice) + Y.Dispose(); + } + + static public void CheckApproximately(Tensor X, Tensor Y, int count, float epsilon, Layer.Type type) + { + CheckApproximately(X, Y, count, epsilon, type.ToString()); + } + + static public void CheckApproximately(Tensor X, Tensor Y, int count, float epsilon, string opName) + { + if (!X.Approximately(Y, epsilon, count)) + { + D.LogWarning("Tensors not equal after " + opName); + D.Log("First: " + X.shape); + D.Log("Second:" + Y.shape); + + if (count < 0) + count = X.channels * X.width * 2; + X.PrintDataPart(count); + Y.PrintDataPart(count); + } + if (X.tensorOnDevice != Y.tensorOnDevice) + Y.Dispose(); + } +} + + +} // namespace Barracuda diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/CompareOps.cs.meta b/Assets/Coach-ML/Barracuda/Core/Backends/CompareOps.cs.meta new file mode 100644 index 0000000..c28cf09 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/CompareOps.cs.meta @@ -0,0 +1,12 @@ +fileFormatVersion: 2 +guid: 3d3848101f7774555899e75a86641621 +timeCreated: 1506427659 +licenseType: Pro +MonoImporter: + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/ComputeInfo.cs b/Assets/Coach-ML/Barracuda/Core/Backends/ComputeInfo.cs new file mode 100644 index 0000000..b5ea29c --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/ComputeInfo.cs @@ -0,0 +1,52 @@ +using UnityEngine; +using UnityEngine.Rendering; + +namespace Barracuda +{ + public class ComputeInfo + { + public static bool supportsComputeSharedMemory = true; + public static bool supportsDense32x32 = true; + public static bool supportsDense64x64 = true; + public static bool supportsCompute = true; + public static uint maxComputeWorkGroupSize = 1024; + + static ComputeInfo() + { + supportsCompute = SystemInfo.supportsComputeShaders; + + // SystemInfo.maxComputeWorkGroupSize is incorrect + if (Application.platform == RuntimePlatform.Android) + { + maxComputeWorkGroupSize = (SystemInfo.graphicsDeviceType == GraphicsDeviceType.Vulkan) ? 256u : 128u; + + var gpuName = SystemInfo.graphicsDeviceName ?? ""; + var osName = SystemInfo.operatingSystem ?? 
""; + + // Known issue with Adreno Vulkan drivers on Android 8.x + if (gpuName.Contains("Adreno") && osName.StartsWith("Android OS 8") && + SystemInfo.graphicsDeviceType == GraphicsDeviceType.Vulkan) + maxComputeWorkGroupSize = 128u; + } + else if (Application.platform == RuntimePlatform.IPhonePlayer || Application.platform == RuntimePlatform.tvOS) + { + var gpuName = SystemInfo.graphicsDeviceName; + if (gpuName != null && gpuName.StartsWith("Apple A")) + { + int gpuNumber = 0, idx = "Apple A".Length; + while (idx < gpuName.Length && '0' <= gpuName[idx] && gpuName[idx] <= '9') + { + gpuNumber = gpuNumber * 10 + gpuName[idx++] - '0'; + } + + // TODO check on lower end iOS devices + maxComputeWorkGroupSize = (gpuNumber <= 10) ? 224u : 256u; + } + else + { + maxComputeWorkGroupSize = 256u; + } + } + } + } +} diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/ComputeInfo.cs.meta b/Assets/Coach-ML/Barracuda/Core/Backends/ComputeInfo.cs.meta new file mode 100644 index 0000000..765cd4f --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/ComputeInfo.cs.meta @@ -0,0 +1,3 @@ +fileFormatVersion: 2 +guid: 96aee99fc4154e2a991ac0edd6056c2b +timeCreated: 1558541124 \ No newline at end of file diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/ComputeShaderSingleton.cs b/Assets/Coach-ML/Barracuda/Core/Backends/ComputeShaderSingleton.cs new file mode 100644 index 0000000..5b9c951 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/ComputeShaderSingleton.cs @@ -0,0 +1,51 @@ +using System.Collections.Generic; +using UnityEngine; +using Barracuda; + +public sealed class ComputeShaderSingleton +{ + public readonly ComputeShader referenceKernels; + public readonly ComputeShader[] kernels; + + private static readonly ComputeShaderSingleton instance = new ComputeShaderSingleton (); + + private ComputeShaderSingleton () + { + referenceKernels = LoadIf(ComputeInfo.supportsCompute, "BarracudaReferenceImpl"); + + List kernelsList = new List(); + + LoadIf(ComputeInfo.supportsCompute, "Generic", kernelsList); + LoadIf(ComputeInfo.supportsCompute, "Activation", kernelsList); + LoadIf(ComputeInfo.supportsCompute, "Broadcast", kernelsList); + LoadIf(ComputeInfo.supportsCompute, "Pool", kernelsList); + LoadIf(ComputeInfo.supportsCompute, "Pad", kernelsList); + LoadIf(ComputeInfo.supportsCompute, "Dense", kernelsList); + LoadIf(ComputeInfo.supportsCompute, "DenseFP16", kernelsList); + LoadIf(ComputeInfo.supportsCompute, "Conv", kernelsList); + + kernels = kernelsList.ToArray(); + } + + public static ComputeShaderSingleton Instance { + get { return instance; } + } + + public static ComputeShader LoadIf(bool condition, string fileName) + { + if (condition) + return (ComputeShader)Resources.Load(fileName); + + return null; + } + + public static void LoadIf(bool condition, string fileName, List list) + { + ComputeShader shader = LoadIf(condition, fileName); + + if (shader) + list.Add(shader); + } + + public bool supported { get { return SystemInfo.supportsComputeShaders; } } +} diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/ComputeShaderSingleton.cs.meta b/Assets/Coach-ML/Barracuda/Core/Backends/ComputeShaderSingleton.cs.meta new file mode 100644 index 0000000..28eae9f --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/ComputeShaderSingleton.cs.meta @@ -0,0 +1,12 @@ +fileFormatVersion: 2 +guid: 815b6432da283415d87dabe9ef715cd9 +timeCreated: 1495620775 +licenseType: Pro +MonoImporter: + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + 
assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/GenericWorker.cs b/Assets/Coach-ML/Barracuda/Core/Backends/GenericWorker.cs new file mode 100644 index 0000000..a2ab4e0 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/GenericWorker.cs @@ -0,0 +1,1096 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using System.Linq; // ToArray() + +using UnityEngine; +using UnityEngine.Assertions; +using UnityEngine.Profiling; + +using System.Runtime.CompilerServices; + + +[assembly: InternalsVisibleTo("Unity.Barracuda.PerformanceTests")] +[assembly: InternalsVisibleTo("Unity.Barracuda.Tests")] + +namespace Barracuda +{ + + +public class GenericWorker : IWorker +{ + private Model m_Model; + private string m_DefaultInputName; + private string m_DefaultOutputName; + private Dictionary m_InputShapes = new Dictionary(); + private IOps m_Ops; + private IVars m_Vars; + private IModelCompiler m_ModelCompiler; + private bool m_RequestResetAllocator; + private bool m_Verbose; + private float m_Progress = 0f; + + private Tensor m_SyncTensor; + + // Heuristic size for a small tensor. Small tensors are more likely to be accessed on CPU, + // thus PeekOutput() for such small tensor will auto schedule non-blocking download from GPU/NPU to CPU + const int m_MaxFlatWidthThatAutoTriggersAsyncDownload = 1000; + + public GenericWorker(Model model, IOps ops, IVars vars, bool verbose = false) + { + m_Model = model; + m_DefaultInputName = ModelAnalyzer.GetDefaultInputName(model); + m_DefaultOutputName = ModelAnalyzer.GetDefaultOutputName(model); + m_Ops = ops; + m_Vars = vars; + m_ModelCompiler = ops as IModelCompiler; + m_Verbose = verbose; + + m_RequestResetAllocator = true; + } + + ~GenericWorker() + { + Dispose(); + } + + protected void ResetAllocatorIfRequested() + { + if (m_RequestResetAllocator) + m_Ops.ResetAllocator(); + m_RequestResetAllocator = false; + } + + public virtual void Dispose() + { + m_Vars?.Dispose(); + m_Ops?.ResetAllocator(false); // clear allocator's memory + m_InputShapes?.Clear(); + + m_Vars = null; + m_Ops = null; + m_InputShapes = null; + } + + public virtual void PrepareForInput(IDictionary inputShapes) + { + m_InputShapes.Clear(); + foreach (var input in inputShapes) + m_InputShapes.Add(input.Key, input.Value); + m_Vars.PrepareStorage(m_Model, m_Ops, m_InputShapes); + } + + public virtual void SetInput(string name, Tensor x) + { + ResetAllocatorIfRequested(); + m_Ops.Prepare(x); + m_Vars.SetInput(name, x); + + // if single input network, then we have enough information to prepare network for execution + if (m_Model.inputs.Count <= 1 && name == m_DefaultInputName) + PrepareForInput(new Dictionary { { name, x.shape } }); // @TODO: get rid of allocation + + m_InputShapes[name] = x.shape; + } + + public virtual void SetInput(Tensor x) + { + SetInput(m_DefaultInputName, x); + } + + public virtual void Execute(IDictionary inputs) + { + foreach (var entry in inputs) + SetInput(entry.Key, entry.Value); + Execute(); + } + + public virtual void Execute(Tensor input) + { + SetInput(input); + Execute(); + } + + public virtual void Execute() + { + var enumerator = ExecuteAsync(); + while (enumerator.MoveNext()) {}; + } + + public virtual IEnumerator ExecuteAsync(IDictionary inputs) + { + foreach (var entry in inputs) + SetInput(entry.Key, entry.Value); + return ExecuteAsync(); + } + + public virtual void WaitForCompletion() + { + m_Ops.WaitForCompletion(m_SyncTensor); + } + + internal Tensor 
WaitForCompletionAndReturnIntermediate() + { + WaitForCompletion(); + return m_SyncTensor; + } + + public virtual IEnumerator ExecuteAsync(Tensor input) + { + SetInput(input); + return ExecuteAsync(); + } + + public virtual float GetAsyncProgress() + { + return m_Progress; + } + + public virtual IEnumerator ExecuteAsync() + { + Profiler.BeginSample ("Barracuda.Execute"); + + ResetAllocatorIfRequested(); + m_Vars.PrepareStorage(m_Model, m_Ops, m_InputShapes); + + if (m_ModelCompiler != null) + m_ModelCompiler.PrepareModel(m_Model, m_InputShapes); + + int idx = 0; + foreach (var l in m_Model.layers) + { + idx++; + + m_Progress = idx / (float)m_Model.layers.Count; + + Profiler.BeginSample(l.name); + var inputs = m_Vars.GatherInputs(l); + + Tensor X = new Tensor(); + if(inputs.Length > 0) + X = inputs[0]; + + if (m_Verbose) + D.Log("Layer: " + l.type + ((l.type == Layer.Type.Activation) ? ("." + l.activation) : "") + " " + l.name ); + + m_Vars.PrepareStorage(l); + if (m_ModelCompiler != null) + m_ModelCompiler.PreExecuteLayer(l, inputs); + + // No operation, identity + if (l.type == Layer.Type.Nop) + { + Profiler.BeginSample ("Barracuda.Nop"); + X = X.ShallowCopy(); + } + // Load const + else if (l.type == Layer.Type.Load) + { + Profiler.BeginSample ("Barracuda.Load"); + } + // GEMM + else if (l.type == Layer.Type.Dense) + { + Assert.AreEqual(inputs.Length, 3); + Profiler.BeginSample ("Barracuda.Dense"); + X = m_Ops.Dense(X, inputs[1], inputs[2]); + } + // 2D + else if (l.type == Layer.Type.Conv2D) + { + Assert.AreEqual(inputs.Length, 3); + Profiler.BeginSample ("Barracuda.Conv2D"); + var pad = X.AdjustPadToKernel(inputs[1], l.stride, l.pad); + X = m_Ops.Conv2D(X, inputs[1], inputs[2], l.stride, pad); + } + else if (l.type == Layer.Type.DepthwiseConv2D) + { + Assert.AreEqual(inputs.Length, 3); + Profiler.BeginSample ("Barracuda.DepthwiseConv2D"); + var pad = X.AdjustPadToKernel(inputs[1], l.stride, l.pad); + X = m_Ops.DepthwiseConv2D(X, inputs[1], inputs[2], l.stride, pad); + } + else if (l.type == Layer.Type.Conv2DTrans) + { + Assert.AreEqual(inputs.Length, 3); + Profiler.BeginSample ("Barracuda.Conv2DTrans"); + // pool size is treated as output_adjustment aka output_padding here + var outputAdjustment = l.pool; + var pad = X.AdjustPadToKernel(inputs[1], l.stride, l.pad); + X = m_Ops.Conv2DTrans(X, inputs[1], inputs[2], l.stride, pad, outputAdjustment); + } + else if (l.type == Layer.Type.Upsample2D) + { + Profiler.BeginSample ("Barracuda.Upsample2D"); + // pool size is treated as upsample coefficient here + var size = l.pool; + X = m_Ops.Upsample2D(X, size); + } + else if (l.type == Layer.Type.MaxPool2D) + { + Profiler.BeginSample ("Barracuda.MaxPool2D"); + var pad = X.AdjustPadToPool(l.pool, l.stride, l.pad); + X = m_Ops.MaxPool2D(X, l.pool, l.stride, pad); + } + else if (l.type == Layer.Type.AvgPool2D) + { + Profiler.BeginSample ("Barracuda.AvgPool2D"); + var pad = X.AdjustPadToPool(l.pool, l.stride, l.pad); + X = m_Ops.AvgPool2D(X, l.pool, l.stride, pad); + } + else if (l.type == Layer.Type.GlobalMaxPool2D) + { + Profiler.BeginSample ("Barracuda.GlobalMaxPool2D"); + X = m_Ops.GlobalMaxPool2D(X); + } + else if (l.type == Layer.Type.GlobalAvgPool2D) + { + Profiler.BeginSample ("Barracuda.GlobalAvgPool2D"); + X = m_Ops.GlobalAvgPool2D(X); + } + else if (l.type == Layer.Type.Border2D) + { + Profiler.BeginSample ("Barracuda.Border2D"); + + Assert.IsNotNull(l.pad); + // NOTE: beta is used to retrieve fillin value + // because beta is 0 by default (while alpha is 1 by default) + // 0 value is 
more inline with zero padding + float fillValue = l.beta; + X = m_Ops.Border2D(X, l.pad, fillValue); + } + else if (l.type == Layer.Type.Pad2DReflect) + { + Profiler.BeginSample ("Barracuda.Pad2DReflect"); + + Assert.IsNotNull(l.pad); + X = m_Ops.Pad2DReflect(X, l.pad); + } + else if (l.type == Layer.Type.Pad2DSymmetric) + { + Profiler.BeginSample ("Barracuda.Pad2DSymmetric"); + + Assert.IsNotNull(l.pad); + X = m_Ops.Pad2DSymmetric(X, l.pad); + } + else if (l.type == Layer.Type.Pad2DEdge) + { + Profiler.BeginSample ("Barracuda.Pad2DEdge"); + + Assert.IsNotNull(l.pad); + X = m_Ops.Pad2DEdge(X, l.pad); + } + // 3D + else if (l.type == Layer.Type.Conv3D || + l.type == Layer.Type.Conv3DTrans || + l.type == Layer.Type.Upsample3D || + l.type == Layer.Type.MaxPool3D || + l.type == Layer.Type.AvgPool3D || + l.type == Layer.Type.GlobalMaxPool3D || + l.type == Layer.Type.GlobalAvgPool3D || + l.type == Layer.Type.Border3D) + { + throw new NotImplementedException("3D operations are not implemented yet!"); + } + else if (l.type == Layer.Type.ScaleBias) + { + Assert.AreEqual(inputs.Length, 3); + Profiler.BeginSample ("Barracuda.ScaleBias"); + X = m_Ops.ScaleBias(X, inputs[1], inputs[2]); + } + else if (l.type == Layer.Type.Normalization) + { + Assert.AreEqual(inputs.Length, 3); + Profiler.BeginSample ("Barracuda.Normalization"); + // @TODO: support other types of Normalization at test time. + // Currently supported only pool=1 (InstanceNormalization) + + // NOTE: beta is used to retrieve epsilon value + // because beta is 0 by default (while alpha is 1 by default) + // 0 value is more inline with very small epsilon + var epsilon = l.beta; + if (epsilon == 0) + epsilon = Mathf.Epsilon; // safety check to prevent division by zero + X = m_Ops.Normalization(X, inputs[1], inputs[2], 1, l.axis, epsilon); + } + else if (l.type == Layer.Type.LRN) + { + Profiler.BeginSample ("Barracuda.LRN"); + + Assert.IsNotNull(l.pool); + Assert.AreEqual(l.pool.Length, 1); + int count = l.pool[0]; + X = m_Ops.LRN(X, l.alpha, l.beta, 1.0f, count); // @TODO: bias + } + // Stochastic layers + else if (l.type == Layer.Type.Dropout) + { + Profiler.BeginSample ("Barracuda.Dropout"); + + X = m_Ops.Dropout(X, l.alpha); + } + else if (l.type == Layer.Type.RandomNormal) + { + Profiler.BeginSample ("Barracuda.RandomNormal"); + + Assert.IsNotNull(l.pool); + // pool size is treated as shape constant, if not empty + // otherwise shape of the previous tensor is used + var shape = X.shape; + if (l.pool.Length > 0) + shape = new TensorShape(l.pool); + + int seed = (l.pad.Length > 0) ? l.pad[0] : 1337; + float scale = l.alpha, mean = l.beta; + X = m_Ops.RandomNormal(shape, mean, scale, seed); + } + else if (l.type == Layer.Type.RandomUniform) + { + Profiler.BeginSample ("Barracuda.RandomUniform"); + + Assert.IsNotNull(l.pool); + // pool size is treated as shape constant, if not empty + // otherwise shape of the previous tensor is used + var shape = X.shape; + if (l.pool.Length > 0) + shape = new TensorShape(l.pool); + + int seed = (l.pad.Length > 0) ? l.pad[0] : 1337; + float scale = l.alpha, mean = l.beta; + X = m_Ops.RandomUniform(shape, mean, scale, seed); + } + else if (l.type == Layer.Type.Multinomial) + { + Profiler.BeginSample ("Barracuda.Multinomial"); + + Assert.IsNotNull(l.pool); + Assert.AreEqual(l.pool.Length, 1); + + int count = l.pool[0]; + int seed = (l.pad.Length > 0) ? 
l.pad[0] : 1337; + X = m_Ops.Multinomial(X, count, seed); + } + else if (l.type == Layer.Type.OneHot) + { + Profiler.BeginSample ("Barracuda.OneHot"); + + Assert.IsNotNull(l.pool); + Assert.AreEqual(l.pool.Length, 1); + int depth = l.pool[0]; + float on = l.alpha, off = l.beta; + X = m_Ops.OneHot(X, depth, on, off); + } + // Broadcast layers + else if (l.type == Layer.Type.Add) + { + Profiler.BeginSample ("Barracuda.Add"); + + X = m_Ops.Add(inputs); + } + else if (l.type == Layer.Type.Sub) + { + Profiler.BeginSample ("Barracuda.Sub"); + + X = m_Ops.Sub(inputs); + } + else if (l.type == Layer.Type.Mul) + { + Profiler.BeginSample ("Barracuda.Mul"); + + X = m_Ops.Mul(inputs); + } + else if (l.type == Layer.Type.Div) + { + Profiler.BeginSample ("Barracuda.Div"); + + X = m_Ops.Div(inputs); + } + else if (l.type == Layer.Type.Pow) + { + Profiler.BeginSample ("Barracuda.Pow"); + + X = m_Ops.Pow(inputs); + } + else if (l.type == Layer.Type.Min) + { + Profiler.BeginSample ("Barracuda.Min"); + + X = m_Ops.Min(inputs); + } + else if (l.type == Layer.Type.Max) + { + Profiler.BeginSample ("Barracuda.Max"); + + X = m_Ops.Max(inputs); + } + else if (l.type == Layer.Type.Mean) + { + Profiler.BeginSample ("Barracuda.Mean"); + + X = m_Ops.Mean(inputs); + } + // Reduction layers + else if (l.type == Layer.Type.ReduceMax) + { + Profiler.BeginSample ("Barracuda.ReduceMax"); + + X = m_Ops.ReduceMax(X, l.axis); + } + else if (l.type == Layer.Type.ReduceMean) + { + Profiler.BeginSample ("Barracuda.ReduceMean"); + + X = m_Ops.ReduceMean(X, l.axis); + } + else if (l.type == Layer.Type.ReduceMin) + { + Profiler.BeginSample ("Barracuda.ReduceMin"); + + X = m_Ops.ReduceMin(X, l.axis); + } + else if (l.type == Layer.Type.ReduceProd) + { + Profiler.BeginSample ("Barracuda.ReduceProd"); + + X = m_Ops.ReduceProd(X, l.axis); + } + else if (l.type == Layer.Type.ReduceSum) + { + Profiler.BeginSample ("Barracuda.ReduceSum"); + + X = m_Ops.ReduceSum(X, l.axis); + } + else if ( + l.type == Layer.Type.ReduceL1 || + l.type == Layer.Type.ReduceL2 || + l.type == Layer.Type.ReduceLogSum || + l.type == Layer.Type.ReduceLogSumExp || + l.type == Layer.Type.ReduceSumSquare) + { + throw new NotImplementedException("This reduction operation is not implemented yet!"); + } + // Logical operators with broadcast + else if (l.type == Layer.Type.Greater) + { + Assert.AreEqual(inputs.Length, 2); + Profiler.BeginSample ("Barracuda.Greater"); + X = m_Ops.Greater(X, inputs[1]); + } + else if (l.type == Layer.Type.GreaterEqual) + { + Assert.AreEqual(inputs.Length, 2); + Profiler.BeginSample("Barracuda.GreaterEqual"); + X = m_Ops.GreaterEqual(X, inputs[1]); + } + else if (l.type == Layer.Type.Less) + { + Assert.AreEqual(inputs.Length, 2); + Profiler.BeginSample("Barracuda.Less"); + X = m_Ops.Less(X, inputs[1]); + } + else if (l.type == Layer.Type.LessEqual) + { + Assert.AreEqual(inputs.Length, 2); + Profiler.BeginSample("Barracuda.LessEqual"); + X = m_Ops.LessEqual(X, inputs[1]); + } + else if (l.type == Layer.Type.Equal) + { + Assert.AreEqual(inputs.Length, 2); + Profiler.BeginSample("Barracuda.Equal"); + X = m_Ops.Equal(X, inputs[1]); + } + else if (l.type == Layer.Type.LogicalOr) + { + Assert.AreEqual(inputs.Length, 2); + Profiler.BeginSample("Barracuda.LogicalOr"); + X = m_Ops.LogicalOr(X, inputs[1]); + } + else if (l.type == Layer.Type.LogicalAnd) + { + Assert.AreEqual(inputs.Length, 2); + Profiler.BeginSample("Barracuda.LogicalAnd"); + X = m_Ops.LogicalAnd(X, inputs[1]); + } + else if (l.type == Layer.Type.LogicalXor) + { + 
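+                // As with the other logical/comparison layers in this chain, XOR is an
+                // elementwise op with broadcasting over two inputs; the backends encode
+                // boolean results as 0.0f / 1.0f floats.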
Assert.AreEqual(inputs.Length, 2); + Profiler.BeginSample("Barracuda.LogicalXor"); + X = m_Ops.LogicalXor(X, inputs[1]); + } + else if (l.type == Layer.Type.LogicalNot) + { + Profiler.BeginSample("Barracuda.LogicalNot"); + X = m_Ops.LogicalNot(X); + } + // Shape affecting layers + else if (l.type == Layer.Type.Flatten) + { + Profiler.BeginSample ("Barracuda.Flatten"); + X = m_Ops.Flatten(X); + } + else if (l.type == Layer.Type.Reshape) + { + Profiler.BeginSample ("Barracuda.Reshape"); + + // pool size is treated as reshape coefficient, if not empty + // otherwise shape of the 2nd input tensor is used + var size = l.pool; + + Assert.IsNotNull(size); + if (size.Length == 0 && inputs.Length > 1) + size = inputs[1].shape.ToArray(); + + var newShape = X.shape.Reshape(size); + X = m_Ops.Reshape(X, newShape); + } + else if (l.type == Layer.Type.Transpose) + { + Profiler.BeginSample ("Barracuda.Transpose"); + X = m_Ops.Transpose(X); + } + else if (l.type == Layer.Type.Squeeze || + l.type == Layer.Type.Unsqueeze) + { + throw new NotImplementedException(); + } + else if (l.type == Layer.Type.Concat) + { + Profiler.BeginSample ("Barracuda.Concat"); + + X = m_Ops.Concat(inputs, l.axis); + } + else if (l.type == Layer.Type.StridedSlice) + { + Profiler.BeginSample ("Barracuda.StridedSlice"); + + Assert.IsNotNull(l.pad); + Assert.IsNotNull(l.pool); + Assert.IsNotNull(l.stride); + X = m_Ops.StridedSlice(X, l.pad, l.pool, l.stride); + } + else if (l.type == Layer.Type.Tile) + { + throw new NotImplementedException(); + } + // Activations + else if (l.type == Layer.Type.Activation) + { + Profiler.BeginSample ("Barracuda.Activation"); + + if (l.activation == Layer.Activation.Relu) + { + X = m_Ops.Relu(X); + } + else if (l.activation == Layer.Activation.Softmax) + { + X = m_Ops.Softmax(X); + } + else if (l.activation == Layer.Activation.LogSoftmax) + { + X = m_Ops.LogSoftmax(X); + } + else if (l.activation == Layer.Activation.Tanh) + { + X = m_Ops.Tanh(X); + } + else if (l.activation == Layer.Activation.Sigmoid) + { + X = m_Ops.Sigmoid(X); + } + else if (l.activation == Layer.Activation.Relu6) + { + X = m_Ops.Relu6(X); + } + else if (l.activation == Layer.Activation.Elu) + { + X = m_Ops.Elu(X, l.alpha); + } + else if (l.activation == Layer.Activation.LeakyRelu) + { + X = m_Ops.LeakyRelu(X, l.alpha); + } + else if (l.activation == Layer.Activation.Selu) + { + X = m_Ops.Selu(X, l.alpha, l.beta); + } + else if (l.activation == Layer.Activation.Swish) + { + X = m_Ops.Swish(X); + } + else if (l.activation == Layer.Activation.PRelu) + { + Assert.AreEqual(inputs.Length, 2); + Profiler.BeginSample("Barracuda.PRelu"); + X = m_Ops.PRelu(X, inputs[1]); + } + else if ( + l.activation == Layer.Activation.Softplus || + l.activation == Layer.Activation.Softsign || + l.activation == Layer.Activation.Hardmax || + l.activation == Layer.Activation.HardSigmoid) + { + throw new NotImplementedException("This activation function is not implemented yet!"); + } + else if (l.activation == Layer.Activation.Abs) + { + X = m_Ops.Abs(X); + } + else if (l.activation == Layer.Activation.Neg) + { + X = m_Ops.Neg(X); + } + else if (l.activation == Layer.Activation.Ceil) + { + X = m_Ops.Ceil(X); + } + else if (l.activation == Layer.Activation.Clip) + { + X = m_Ops.Clip(X, l.alpha, l.beta); + } + else if (l.activation == Layer.Activation.Floor) + { + X = m_Ops.Floor(X); + } + else if (l.activation == Layer.Activation.Reciprocal) + { + X = m_Ops.Reciprocal(X); + } + else if (l.activation == Layer.Activation.Pow) + { + X = m_Ops.Pow(X, l.alpha); + 
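+                    // Note: this is the unary activation form pow(x, alpha); the
+                    // broadcastable Pow over several input tensors is handled earlier
+                    // by the Layer.Type.Pow branch via m_Ops.Pow(inputs).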
} + else if (l.activation == Layer.Activation.Exp) + { + X = m_Ops.Exp(X); + } + else if (l.activation == Layer.Activation.Log) + { + X = m_Ops.Log(X); + } + else if (l.activation == Layer.Activation.Sqrt) + { + X = m_Ops.Sqrt(X); + } + else if ((int)l.activation >= (int)Layer.Activation.Acos && + (int)l.activation <= (int)Layer.Activation.Tan) + { + throw new NotImplementedException("Trig functions are not implemented yet!"); + } + else + { + X = X.ShallowCopy(); + } + } + else + { + Profiler.BeginSample ("Barracuda.Dummy"); + Assert.AreEqual(l.activation, Layer.Activation.None); + } + + m_Vars.Store(l, X); + m_SyncTensor = X; + + // optype + Profiler.EndSample(); + + // layer.name + Profiler.EndSample(); + + yield return null; + } + + // request ResetAllocator before next Execute() starts + m_RequestResetAllocator = true; + Profiler.EndSample (); + + if (m_Verbose) + D.Log(m_Vars.GetAllocator()); + } + + public virtual Tensor PeekOutput() + { + Profiler.BeginSample("Barracuda.PeekOutput"); + var X = m_Vars.PeekOutput(m_DefaultOutputName); + + if (X.flatWidth <= + m_MaxFlatWidthThatAutoTriggersAsyncDownload) // tensor is small and most likely will be accessed on CPU, + X.PrepareCacheForAccess(false); // thus schedule non-blocking download from GPU/NPU to CPU + Profiler.EndSample(); + + return X; + } + + public virtual Tensor PeekOutput(string name) + { + Profiler.BeginSample("Barracuda.PeekOutput"); + var X = m_Vars.PeekOutput(name); + + if (X.flatWidth <= + m_MaxFlatWidthThatAutoTriggersAsyncDownload) // tensor is small and most likely will be accessed on CPU, + X.PrepareCacheForAccess(false); // thus schedule non-blocking download from GPU/NPU to CPU + Profiler.EndSample(); + + return X; + } + + public virtual string Summary() + { + return m_Vars.GetAllocator().ToString() + "\n" + m_Ops.ToString(); + } +} + + +public class GenericVars : IVars +{ + private Dictionary m_TensorsByName = new Dictionary(); + protected HashSet m_ModelTensors = new HashSet(); + protected Dictionary m_InputTensorsByLayer = new Dictionary(); + private Dictionary m_LayerNameToId = new Dictionary(); + private Dictionary m_LayerNameToKeepUntilId = new Dictionary(); + private Dictionary m_LayerIdToLayer = new Dictionary(); + protected StringCache m_StringCache = new StringCache(); + + public GenericVars() + { + } + + ~GenericVars() + { + Dispose(); + } + + public virtual void Dispose() + { + foreach (var t in m_ModelTensors) + t.Dispose(); + m_ModelTensors.Clear(); + } + + private ITensorAllocator m_Allocator = new DefaultTensorAllocator(); + public virtual ITensorAllocator GetAllocator() + { + return m_Allocator; + } + + protected bool ValidateGlobalInputs(Model model, IDictionary inputShapes) + { + bool valid = true; + foreach (var i in model.inputs) + { + if (m_TensorsByName.ContainsKey(i.name) || + (inputShapes != null && inputShapes.ContainsKey(i.name))) + continue; + + D.LogWarning("Global input is missing: " + i.name); + valid = false; + } + return valid; + } + + protected virtual Tensor[] PrepareLayerInputTensors(Model model, Layer layer, IOps ops) + { + int tensorIndex = 0; + var tensors = new Tensor[layer.inputs.Length + layer.datasets.Length]; + + foreach (var name in layer.inputs) + { + tensors[tensorIndex++] = new Tensor(1, 1, 1, 1, m_StringCache.Lookup(layer.name, "_dummy_in", tensorIndex)); + } + foreach (var arg in layer.datasets) + { + var tensor = new Tensor(arg.shape, new SharedArrayTensorData(layer.weights, (int)arg.offset, + (int)arg.shape.length), + m_StringCache.Lookup(layer.name, "_arg", 
tensorIndex)); + if (ops != null) + tensor = ops.Prepare(tensor); + m_ModelTensors.Add(tensor); + tensors[tensorIndex++] = tensor; + } + return tensors; + } + + public virtual void SetInput(string name, Tensor x) + { + m_TensorsByName[name] = x; + } + + public virtual void PrepareStorage(Model model, IOps ops, IDictionary inputShapes) + { + ValidateGlobalInputs(model, inputShapes); + + m_LayerNameToId.Clear(); + m_LayerNameToKeepUntilId.Clear(); + m_LayerIdToLayer.Clear(); + + for (var idx = 0; idx < model.layers.Count; idx++) + { + var forLayer = model.layers[idx]; + m_LayerIdToLayer[idx] = forLayer; + + // prepare input placeholders and argument tensors only once per layer + if (m_InputTensorsByLayer.ContainsKey(forLayer)) + continue; + + var tensors = PrepareLayerInputTensors(model, forLayer, ops); + m_InputTensorsByLayer.Add(forLayer, tensors); + } + + for (var i = 0; i < model.layers.Count; i++) + { + var layer = model.layers[i]; + m_LayerNameToId[layer.name] = i; + + for (var j = 0; j < layer.inputs.Length; j++) + { + m_LayerNameToKeepUntilId[layer.inputs[j]] = i; + } + } + + // outputs should always be preserved + foreach (var input in model.inputs) + { + m_LayerNameToKeepUntilId[input.name] = model.layers.Count; + } + + // outputs should always be preserved + foreach (var outname in model.outputs) + { + m_LayerNameToKeepUntilId[outname] = model.layers.Count; + } + + // memories should always be preserved + foreach (var mem in model.memories) + { + m_LayerNameToKeepUntilId[mem.input] = model.layers.Count; + m_LayerNameToKeepUntilId[mem.output] = model.layers.Count; + } + } + + public virtual Tensor[] GatherInputs(Layer forLayer) + { + var tensors = m_InputTensorsByLayer[forLayer]; + + // fill in input variables + int index = 0; + foreach (var name in forLayer.inputs) + tensors[index++] = PeekOutput(name); + + return tensors; + } + + public virtual void PrepareStorage(Layer forLayer) + { + // Current layer Id + var layerId = m_LayerNameToId[forLayer.name]; + + for (var idx = 0; idx < layerId; idx++) + { + var l = m_LayerIdToLayer[idx]; + var key = l.name; + + // Remove all allocated tensors for layer storage, but + // global constants might not exist in this dictionary, + // so lets just ignore them + if (m_TensorsByName.ContainsKey(key) && + m_LayerNameToKeepUntilId.ContainsKey(key) && + m_LayerNameToKeepUntilId[key] < layerId && + !m_ModelTensors.Contains(m_TensorsByName[key])) + { + m_TensorsByName[key].Dispose(); + m_TensorsByName.Remove(key); + } + } + } + + public virtual void Store(Layer fromLayer, Tensor result) + { + // assign debug name + result.name = m_StringCache.Lookup(fromLayer.name, "_out"); + + m_TensorsByName[fromLayer.name] = result; + } + + public virtual Tensor PeekOutput(string name) + { + if (!m_TensorsByName.ContainsKey(name)) + D.LogWarning("GenericVars missing variable: " + name); + + return m_TensorsByName[name]; + } +} + +public class GenericVarsWithReuse : GenericVars +{ + private Model m_CachedModel; + private bool m_LayerRequiresStorage = false; + private HashSet m_LayersWithStorage; + private Tensor m_Temporary; + private string m_TemporaryName = null; + + protected bool layerRequiresStorage { get { return m_LayerRequiresStorage; } } + protected Tensor temporary { get { return m_Temporary; } } + + protected void ReleaseTemporary() + { + m_TemporaryName = null; + if (m_Temporary == null) + return; + + if (m_Temporary.allocator != null) + m_Temporary.allocator.Release(m_Temporary, false); + else + m_Temporary.Dispose(); + m_Temporary = null; + } + + 
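+    // Reuse contract implemented by the overrides below (an illustrative sketch
+    // inferred from Store() and PeekOutput() in this class): layers flagged by
+    // ModelAnalyzer.FindLayersThatRequireStorage go through the base per-name
+    // dictionary, while every other layer shares a single cycling temporary.
+    //
+    //   vars.Store(layerA, t);         // layerA needs no storage -> t becomes the shared temporary
+    //   vars.PeekOutput(layerA.name);  // returns t without touching the dictionary
+    //   vars.Store(layerB, t2);        // if t2 is backed by a different buffer, t is released first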
public override void PrepareStorage(Model model, IOps ops, IDictionary inputShapes) + { + base.PrepareStorage(model, ops, inputShapes); + + ReleaseTemporary(); + + if (m_CachedModel != model) + m_LayersWithStorage = ModelAnalyzer.FindLayersThatRequireStorage(model); + m_CachedModel = model; + + Assert.AreEqual(m_Temporary, null); + } + + public override void PrepareStorage(Layer forLayer) + { + base.PrepareStorage(forLayer); + m_LayerRequiresStorage = m_LayersWithStorage.Contains(forLayer); + } + + public override void Store(Layer fromLayer, Tensor result) + { + if (result.tensorOnDevice != m_Temporary?.tensorOnDevice) + ReleaseTemporary(); + + if (layerRequiresStorage) + { + Assert.IsNotNull(result); + base.Store(fromLayer, result); + + m_Temporary = null; + m_TemporaryName = null; + } + else + { + Assert.IsTrue(m_Temporary == null || m_Temporary.tensorOnDevice == result.tensorOnDevice); + + // assign debug name + result.name = m_StringCache.Lookup(fromLayer.name, "_out"); + + m_Temporary = result; + m_TemporaryName = fromLayer.name; + } + } + + public override Tensor PeekOutput(string name) + { + if (m_TemporaryName == name) + { + Assert.IsNotNull(m_Temporary); + return m_Temporary; + } + return base.PeekOutput(name); + } +} + +public class GenericVarsWithPreallocation : GenericVarsWithReuse, ITensorAllocator +{ + private Model m_CachedModel; + + private DefaultTensorAllocator m_TemporaryAllocator = new DefaultTensorAllocator(); + private DefaultTensorAllocator m_StorageAllocator = new DefaultTensorAllocator(); + + public override void PrepareStorage(Model model, IOps ops, IDictionary inputShapes) + { + base.PrepareStorage(model, ops, inputShapes); + if (m_CachedModel != model) + { + // pre-allocate 2 buffers that can be cycled for temporaries + var allocator = m_TemporaryAllocator; + + var maxShape = ModelAnalyzer.FindLargestNecessaryTensorShape(model, inputShapes); + var alloc1 = allocator.Alloc(maxShape); + var alloc2 = allocator.Alloc(maxShape); + alloc1 = ops.Prepare(alloc1); + alloc2 = ops.Prepare(alloc2); + allocator.Release(alloc1, false); + allocator.Release(alloc2, false); + } + m_CachedModel = model; + } + + public override ITensorAllocator GetAllocator() + { + return this; + } + + public virtual Tensor Alloc(TensorShape shape) + { + if (layerRequiresStorage) + return m_StorageAllocator.Alloc(shape); + else + return m_TemporaryAllocator.Alloc(shape); + } + public virtual Tensor Alloc(TensorShape shape, ITensorData buffer) + { + if (layerRequiresStorage) + return m_StorageAllocator.Alloc(shape, buffer); + else + return m_TemporaryAllocator.Alloc(shape, buffer); + } + public virtual void Repin(Tensor x, ITensorData newBuffer, ITensorData oldBuffer, bool disposeUnpinnedHint) + { + x.allocator.Repin(x, newBuffer, oldBuffer, disposeUnpinnedHint); + } + public virtual void Cast(Tensor x, ITensorData newBuffer, ITensorData oldBuffer) + { + x.allocator.Cast(x, newBuffer, oldBuffer); + } + public virtual void Release(Tensor x, bool calledFromTensorDispose) + { + x.allocator.Release(x, calledFromTensorDispose); + } + public virtual void WaiveOwnership(Tensor x) + { + x.allocator.WaiveOwnership(x); + } + public virtual void Reset(bool keepCachedMemory) + { + m_TemporaryAllocator.Reset(keepCachedMemory); + m_StorageAllocator.Reset(keepCachedMemory); + } + + public long busyBytes + { get { + return m_TemporaryAllocator.busyBytes + m_StorageAllocator.busyBytes; + } } + public long freeBytes + { get { + return m_TemporaryAllocator.freeBytes + m_StorageAllocator.freeBytes; + } } + public long 
totalBytes + { get { + return m_TemporaryAllocator.totalBytes + m_StorageAllocator.totalBytes; + } } + public override string ToString() + { + return $"Total allocated: {totalBytes} busy: {busyBytes}"; + } +} + +//public class DefaultTensorAllocator : TensorOperatorNewAllocator {} +//public class DefaultTensorAllocator : TensorCachingByShapeAllocator {} +public class DefaultTensorAllocator : TensorCachingAllocator {} + +//public class DefaultVars : GenericVars {} +//public class DefaultVars : GenericVarsWithReuse {} +public class DefaultVars : GenericVarsWithPreallocation {} + + +} // namespace Barracuda diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/GenericWorker.cs.meta b/Assets/Coach-ML/Barracuda/Core/Backends/GenericWorker.cs.meta new file mode 100644 index 0000000..27226bb --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/GenericWorker.cs.meta @@ -0,0 +1,12 @@ +fileFormatVersion: 2 +guid: f7473266805a8439287433d3dac88945 +timeCreated: 1506427659 +licenseType: Pro +MonoImporter: + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/MatrixUtils.cs b/Assets/Coach-ML/Barracuda/Core/Backends/MatrixUtils.cs new file mode 100644 index 0000000..88ec72c --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/MatrixUtils.cs @@ -0,0 +1,220 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using System.Runtime.InteropServices; +using System.Threading.Tasks; +using Barracuda; +using UnityEngine; +using UnityEngine.Assertions; +using UnityEngine.Scripting; + +public class MatrixUtils +{ + public static unsafe void CopyBlockWithPadding(float* matrixIn, int row, int N, int col, int M, float[] blockOut, int bs, bool transpose = false) + { + Array.Clear(blockOut, 0, bs * bs); + + var rowFinal = Math.Min(row + bs, N); + var count = Math.Min(col + bs, M) - col; + + // @TODO: measure which one is better - sequential access over matrix memory or blockOut cache + if (transpose) + { + // sequential access over blockOut, strided over matrixIn + //for (var i = row; i < rowFinal; i++) + // for (var j = 0; j < count; ++j) + // blockOut[(i - row) * bs + j] = matrixIn[i + (col + j) * N]; + + // sequential access over matrixIn, strided over blockOut + for (var j = 0; j < count; ++j) + for (var i = row; i < rowFinal; i++) + blockOut[(i - row) * bs + j] = matrixIn[i + (col + j) * N]; + } + else + for (var i = row; i < rowFinal; i++) + { + //D.Log(string.Format("Copy[{3}] {0} -> {1} {2}", i * M + col, (i - row) * bs, count, i)); + Marshal.Copy((IntPtr)(matrixIn + i * M + col), blockOut, (i - row) * bs, count); + } + + } + + public static unsafe void ClearFloatArray(float* arr, float val, int count) + { + for (int i = 0; i < count; i++) + { + arr[i] = val; + } + } + + public static unsafe void CopyFloatArray(float* from, float* to, int count) + { + for (int i = 0; i < count; i++) + { + to[i] = from[i]; + } + } + + public static unsafe void CopyBlockWithPadding(float* matrixIn, int row, int N, int col, int M, float* blockOut, int bs, bool transpose = false) + { + ClearFloatArray(blockOut, 0, bs * bs); + + var rowFinal = Math.Min(row + bs, N); + var count = Math.Min(col + bs, M) - col; + + // @TODO: measure which one is better - sequential access over matrix memory or blockOut cache + if (transpose) + { + // sequential access over blockOut, strided over matrixIn + //for (var i = row; i < rowFinal; i++) + // for (var j = 0; j < 
count; ++j) + // blockOut[(i - row) * bs + j] = matrixIn[i + (col + j) * N]; + + // sequential access over matrixIn, strided over blockOut + for (var j = 0; j < count; ++j) + for (var i = row; i < rowFinal; i++) + blockOut[(i - row) * bs + j] = matrixIn[i + (col + j) * N]; + } + else + for (var i = row; i < rowFinal; i++) + { + //D.Log(string.Format("Copy[{3}] {0} -> {1} {2}", i * M + col, (i - row) * bs, count, i)); + CopyFloatArray(matrixIn + i * M + col, blockOut + (i - row) * bs, count); + } + + } + + public static unsafe void CopyBlockWithPadding(float[] blockOut, float* matrixIn, int row, int N, int col, int M, int bs) + { + var rowFinal = Math.Min(row + bs, N); + var count = Math.Min(col + bs, M) - col; + + for (var i = row; i < rowFinal; i++) + Marshal.Copy(blockOut, (i - row) * bs, (IntPtr)(matrixIn + i * M + col), count); + } + + public static unsafe void CopyBlockWithPadding(float* blockOut, float* matrixIn, int row, int N, int col, int M, int bs) + { + var rowFinal = Math.Min(row + bs, N); + var count = Math.Min(col + bs, M) - col; + + for (var i = row; i < rowFinal; i++) + CopyFloatArray(blockOut + (i - row) * bs, matrixIn + i * M + col, count); + } + + public static unsafe void MultiplyBlockUnroll8xhPadded(float* Ap, + float* Bp, + float* Cp, int bs) + { + for (int i = 0; i < bs; i++) + { + for (int j = 0; j < bs; j += 8) + { + int baseC = i * bs + j; + float sum0 = *(Cp + baseC); + float sum1 = *(Cp + baseC + 1); + float sum2 = *(Cp + baseC + 2); + float sum3 = *(Cp + baseC + 3); + float sum4 = *(Cp + baseC + 4); + float sum5 = *(Cp + baseC + 5); + float sum6 = *(Cp + baseC + 6); + float sum7 = *(Cp + baseC + 7); + + for (int l = 0; l < bs; l++) + { + float A = Ap[i * bs + l]; + int baseB = l * bs + j; + + sum0 += A * *(Bp + baseB); + sum1 += A * *(Bp + baseB + 1); + sum2 += A * *(Bp + baseB + 2); + sum3 += A * *(Bp + baseB + 3); + sum4 += A * *(Bp + baseB + 4); + sum5 += A * *(Bp + baseB + 5); + sum6 += A * *(Bp + baseB + 6); + sum7 += A * *(Bp + baseB + 7); + } + + *(Cp + baseC) = sum0; + *(Cp + baseC + 1) = sum1; + *(Cp + baseC + 2) = sum2; + *(Cp + baseC + 3) = sum3; + *(Cp + baseC + 4) = sum4; + *(Cp + baseC + 5) = sum5; + *(Cp + baseC + 6) = sum6; + *(Cp + baseC + 7) = sum7; + } + } + } + + public static unsafe void MultiplyBlockUnroll8xhParallelWithPadding(float* Ap, int AN, int AM, + float* Bp, int BN, int BM, + float* Cp, int CN, int CM, int bs, + bool transposeA = false, bool transposeB = false) + { + if (transposeA) + { + var tmp = AN; AN = AM; AM = tmp; + } + if (transposeB) + { + var tmp = BN; BN = BM; BM = tmp; + } + + int N = AN; + int M = AM; + int K = BM; + + { + Assert.IsTrue(bs >= 8, "Matrix Mul block size should be >= 8"); + + Parallel.For(0, (BM / bs) + (BM % bs > 0 ? 
1 : 0), colB => + { + float[] blockA = new float[bs * bs]; + float[] blockB = new float[bs * bs]; + float[] blockC = new float[bs * bs]; + + for (int rowA = 0; rowA < N; rowA += bs) + { + //for (int colB = 0; colB < BM; colB += bs) + { + for (int l = 0; l < AM; l += bs) + { + + CopyBlockWithPadding(Ap, rowA, AN, l, AM, blockA, bs, transposeA); + CopyBlockWithPadding(Bp, l, BN, colB * bs, BM, blockB, bs, transposeB); + CopyBlockWithPadding(Cp, rowA, CN, colB * bs, CM, blockC, bs); + + fixed (float* blockAp = blockA, blockBp = blockB, blockCp = blockC) + { + MatrixUtils.MultiplyBlockUnroll8xhPadded(blockAp, blockBp, blockCp, bs); + } + + CopyBlockWithPadding(blockC, Cp, rowA, CN, colB * bs, CM, bs); + } + } + } + }); + } + } +} + +namespace Barracuda +{ + [Preserve] + public class CSharpBLAS : BLASPlugin + { + public bool IsCurrentPlatformSupported() + { + return true; + } + + public unsafe void SGEMM(float* Ap, int AN, int AM, float* Bp, int BN, int BM, float* Cp, int CN, int CM, int bs, + bool transposeA = false, bool transposeB = false) + { + MatrixUtils.MultiplyBlockUnroll8xhParallelWithPadding(Ap, AN, AM, Bp, BN, BM, Cp, CN, CM, bs, + transposeA, transposeB); + } + } +} + diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/MatrixUtils.cs.meta b/Assets/Coach-ML/Barracuda/Core/Backends/MatrixUtils.cs.meta new file mode 100644 index 0000000..0c8ebab --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/MatrixUtils.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: bf04fe6d135714369af8cab2915b2735 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/ModelAnalyzer.cs b/Assets/Coach-ML/Barracuda/Core/Backends/ModelAnalyzer.cs new file mode 100644 index 0000000..559a427 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/ModelAnalyzer.cs @@ -0,0 +1,499 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using System.Linq; // ToArray(), ToDictionary() + +using UnityEngine; +using UnityEngine.Assertions; +using UnityEngine.Profiling; + +namespace Barracuda { + + +public class ModelAnalyzer +{ + static public string GetDefaultInputName(Model model) + { + if (model.inputs.Count == 1) + return model.inputs[0].name; + + var previousLayerNames = new HashSet(); + + // find first unconnected layer + foreach (var l in model.layers) + { + previousLayerNames.Add(l.name); + + bool layerDoesNotNeedInput = (l.type == Layer.Type.Load); + + if (layerDoesNotNeedInput) + continue; + + if (l.inputs.Length != 1) + continue; + + // treat layer as default input layer + // if-and-only-if layer has only 1 input AND is not connected to any previous layer + var inputName = l.inputs[0]; + if (!previousLayerNames.Contains(inputName)) + return inputName; + } + + return ""; + } + + static public string GetDefaultOutputName(Model model) + { + if (model.outputs.Count == 1) + return model.outputs[0]; + + if (model.layers.Count > 0) + { + var lastLayer = model.layers[model.layers.Count - 1]; + return lastLayer.name; + } + + return ""; + } + + static public TensorShape[] ListTemporaryTensorShapes(Model model, IDictionary inputShapes) + { + IDictionary shapesByName; + return ListTemporaryTensorShapes(model, inputShapes, out shapesByName); + } + + static public TensorShape[] ListTemporaryTensorShapes(Model model, IDictionary inputShapes, + out IDictionary shapesByName) + { + Profiler.BeginSample 
("Barracuda.ListTemporaryTensorShapes"); + var shapes = new List(); + shapesByName = new Dictionary(); + foreach (var entry in inputShapes) + shapesByName.Add(entry.Key, entry.Value); + + TensorShape X; + shapesByName.TryGetValue(GetDefaultInputName(model), out X); // default input + var O = X; + + foreach (var l in model.layers) + { + if (l.inputs.Length > 0 && shapesByName.ContainsKey(l.inputs[0])) + X = shapesByName[l.inputs[0]]; + else + X = O; // previous output is used, if-and-only-if layer has no explicit inputs + + if (l.type == Layer.Type.Dense) + { + Assert.IsNotNull(l.datasets); + var W = l.datasets[0].shape; + O = new TensorShape(X.flatHeight, W.flatWidth); + } + else if ( + l.type == Layer.Type.Conv2D || + l.type == Layer.Type.DepthwiseConv2D) + { + var K = l.datasets[0].shape; + + Assert.IsNotNull(l.stride); + Assert.IsNotNull(l.pad); + var pad = X.AdjustPadToKernel(K, l.stride, l.pad); + O = X.ApplyKernel(K, l.stride, pad); + } + else if ( + l.type == Layer.Type.Conv2DTrans) + { + var K = l.datasets[0].shape; + Assert.IsNotNull(l.stride); + Assert.IsNotNull(l.pad); + // pool size is treated as output_adjustment aka output_padding here + var outputAdjustment = l.pool; + var pad = X.AdjustPadToKernel(K, l.stride, l.pad); + O = X.ApplyKernelInverse(K, l.stride, pad, outputAdjustment); + } + else if ( + l.type == Layer.Type.Upsample2D) + { + // pool size is treated as upsample coefficient here + Assert.IsNotNull(l.pool); + Assert.AreEqual(l.pool.Length, 2); + O = new TensorShape(X.batch, X.height * l.pool[1], X.width * l.pool[0], X.channels); + } + else if ( + l.type == Layer.Type.MaxPool2D || + l.type == Layer.Type.AvgPool2D) + { + Assert.IsNotNull(l.pool); + Assert.IsNotNull(l.stride); + Assert.IsNotNull(l.pad); + var pad = X.AdjustPadToPool(l.pool, l.stride, l.pad); + O = X.ApplyPool(l.pool, l.stride, pad); + } + else if ( + l.type == Layer.Type.GlobalMaxPool2D || + l.type == Layer.Type.GlobalAvgPool2D) + { + O = new TensorShape(X.batch, 1, 1, X.channels); + } + else if ( + l.type == Layer.Type.Border2D || + l.type == Layer.Type.Pad2DReflect || + l.type == Layer.Type.Pad2DSymmetric || + l.type == Layer.Type.Pad2DEdge) + { + Assert.IsNotNull(l.pad); + O = X.ApplyBorder(l.pad); + } + else if ( + l.type == Layer.Type.Conv3D || + l.type == Layer.Type.Conv3DTrans || + l.type == Layer.Type.Upsample3D || + l.type == Layer.Type.MaxPool3D || + l.type == Layer.Type.AvgPool3D || + l.type == Layer.Type.GlobalMaxPool3D || + l.type == Layer.Type.GlobalAvgPool3D || + l.type == Layer.Type.Border3D) + { + throw new NotImplementedException(); + } + else if ( + l.type == Layer.Type.RandomNormal || + l.type == Layer.Type.RandomUniform) + { + Assert.IsNotNull(l.pool); + // pool size is treated as shape constant, if not empty + // otherwise shape of the previous tensor is used + if (l.pool.Length > 0) + O = new TensorShape(l.pool); + else + O = X; + } + else if ( + l.type == Layer.Type.Multinomial) + { + Assert.IsNotNull(l.pool); + Assert.AreEqual(l.pool.Length, 1); + O = new TensorShape(X.batch, l.pool[0]); + } + else if ( + l.type == Layer.Type.OneHot) + { + Assert.IsNotNull(l.pool); + Assert.AreEqual(l.pool.Length, 1); + int features = X.flatWidth; + int depth = l.pool[0]; + O = new TensorShape(X.batch, 1, features, depth); + } + else if ( + l.type == Layer.Type.Add || + l.type == Layer.Type.Sub || + l.type == Layer.Type.Mul || + l.type == Layer.Type.Div || + l.type == Layer.Type.Pow || + l.type == Layer.Type.Min || + l.type == Layer.Type.Max || + l.type == Layer.Type.Mean|| + l.type == 
Layer.Type.Greater || + l.type == Layer.Type.GreaterEqual || + l.type == Layer.Type.Less || + l.type == Layer.Type.LessEqual || + l.type == Layer.Type.Equal || + l.type == Layer.Type.LogicalOr || + l.type == Layer.Type.LogicalAnd || + l.type == Layer.Type.LogicalXor) + { + // gather shapes by names + var list = new List(l.inputs.Length); + foreach (var i in l.inputs) + { + if (shapesByName.ContainsKey(i)) + list.Add(shapesByName[i]); + } + + O = TensorExtensions.Max(list.ToArray()); + } + else if ( + l.type == Layer.Type.ReduceL1 || + l.type == Layer.Type.ReduceL2 || + l.type == Layer.Type.ReduceLogSum || + l.type == Layer.Type.ReduceLogSumExp || + l.type == Layer.Type.ReduceMax || + l.type == Layer.Type.ReduceMean || + l.type == Layer.Type.ReduceMin || + l.type == Layer.Type.ReduceProd || + l.type == Layer.Type.ReduceSum || + l.type == Layer.Type.ReduceSumSquare) + { + O = X.Reduce(l.axis); + } + else if ( + l.type == Layer.Type.Flatten) + { + O = X.Flatten(); + } + else if ( + l.type == Layer.Type.Reshape) + { + // pool size is treated as reshape coefficient, if not empty + // otherwise shape of the 2nd input tensor is used + var size = l.pool; + + Assert.IsNotNull(size); + if (size.Length == 0 && l.inputs.Length > 1) + size = shapesByName[l.inputs[1]].ToArray(); + + Assert.AreEqual(size.Length, 4); + // pool size is treated as reshape coefficient here + O = X.Reshape(size); + } + else if ( + l.type == Layer.Type.Transpose) + { + O = new TensorShape(X.flatWidth, X.flatHeight); + } + else if ( + l.type == Layer.Type.Squeeze || + l.type == Layer.Type.Unsqueeze) + { + throw new NotImplementedException(); + } + else if ( + l.type == Layer.Type.Concat) + { + // gather shapes by names + var list = new List(l.inputs.Length); + foreach (var i in l.inputs) + { + if (shapesByName.ContainsKey(i)) + list.Add(shapesByName[i]); + } + + O = TensorExtensions.Concat(list.ToArray(), l.axis); + } + else if ( + l.type == Layer.Type.StridedSlice) + { + Assert.IsNotNull(l.pad); + Assert.IsNotNull(l.pool); + Assert.IsNotNull(l.stride); + O = X.ApplyStridedSlice(l.pad, l.pool, l.stride); + } + else if ( + l.type == Layer.Type.Tile) + { + // pool size is treated as tiling coefficient here + Assert.IsNotNull(l.pool); + Assert.AreEqual(l.pool.Length, 4); + var scale = l.pool; + O = X.Scale(scale); + } + else if ( + l.type == Layer.Type.Load) + { + O = l.datasets[0].shape; + } + else if (// elementwise operations + l.type == Layer.Type.Nop || + l.type == Layer.Type.Activation || + l.type == Layer.Type.ScaleBias || + l.type == Layer.Type.Normalization || + l.type == Layer.Type.LRN || + l.type == Layer.Type.Dropout || + l.type == Layer.Type.LogicalNot || + l.activation == Layer.Activation.PRelu) + { + // works in place, keeps the same shape size + O = X; + } + else + { + throw new NotImplementedException(); + } + + shapes.Add(O); + shapesByName.Add(l.name, O); + } + + Profiler.EndSample(); + return shapes.ToArray(); + } + + static public bool TryGetOutputTensorShape(Model model, IDictionary inputShapes, string output, out TensorShape shape) + { + IDictionary shapesByName; + ListTemporaryTensorShapes(model, inputShapes, out shapesByName); + return shapesByName.TryGetValue(output, out shape); + } + + static public bool TryGetOutputTensorShape(Model model, string output, out TensorShape shape) + { + var inputShapes = new Dictionary(); + foreach (var i in model.inputs) + inputShapes.Add(i.name, new TensorShape(i.shape)); + return TryGetOutputTensorShape(model, inputShapes, output, out shape); + } + + static public 
HashSet FindLayersThatRequireStorage(Model model) + { + var allInputsExceptFromPreviousLayer = new HashSet(); + Layer prevLayer = null; + foreach (var layer in model.layers) + { + foreach (var input in layer.inputs) + if (prevLayer != null && input != prevLayer.name) + allInputsExceptFromPreviousLayer.Add(input); + prevLayer = layer; + } + + var allOutputs = new HashSet(); + foreach (var output in model.outputs) + allOutputs.Add(output); + foreach (var memory in model.memories) + allOutputs.Add(memory.output); + allOutputs.Add(GetDefaultOutputName(model)); + + var requireStorage = new HashSet(); + foreach (var layer in model.layers) + { + // loading constant tensor requires storage + if (layer.type == Layer.Type.Load) + requireStorage.Add(layer); + + // @TBD: implement safety check that ensures Nop never has input + // otherwise it has to be treated as Load operation + if (layer.type == Layer.Type.Nop) + requireStorage.Add(layer); + + if (allInputsExceptFromPreviousLayer.Contains(layer.name) || + allOutputs.Contains(layer.name)) + requireStorage.Add(layer); + } + + return requireStorage; + } + + /*static public HashSet FindUpstreamLayers(Model model, string[] outputs) + { + var layersByName = new Dictionary(); + foreach (var l in model.layers) + layersByName.Add(l.name, l); + + var connected = new HashSet(); + Func(), HashSet()> visitor = (layerNames, visitNext) => + { + foreach (var i in layerNames) + if (layersByName.ContainsKey(i)) + { + visitNext.Add(layersByName[i]); + connected.Add(layersByName[i]); + } + return visitNext; + }; + + var layersToVisit = visitor(outputs, new HashSet()); + while (layersToVisit.Count > 0) + { + var visitNext = new HashSet(); + foreach (var l in layersToVisit) + visitor(l.inputs, visitNext); + layersToVisit = visitNext; + } + return connected; + }*/ + + static public HashSet FindUpstreamLayers(Model model, string[] outputs) + { + // TODO: replace with var layersByName = model.layers.ToDictionary(i => i.name, i => i); + var layersByName = new Dictionary(); + foreach (var l in model.layers) + layersByName.Add(l.name, l); + + var connected = new HashSet(); + var layersToVisit = new HashSet(); + foreach (var o in outputs) + if (layersByName.ContainsKey(o)) + { + layersToVisit.Add(layersByName[o]); + connected.Add(layersByName[o]); + } + + while (layersToVisit.Count > 0) + { + var visitNext = new HashSet(); + foreach (var l in layersToVisit) + foreach (var i in l.inputs) + if (layersByName.ContainsKey(i)) + { + visitNext.Add(layersByName[i]); + connected.Add(layersByName[i]); + } + + layersToVisit = visitNext; + } + return connected; + } + + static public TensorShape FindLargestNecessaryTensorShape(Model model, IDictionary inputShapes) + { + Profiler.BeginSample ("Barracuda.FindLargestNecessaryTensorShape"); + + var shapes = ListTemporaryTensorShapes(model, inputShapes); + + var maxTensorShape = new TensorShape(1,1,1,1); + foreach (var X in shapes) + if (X.length > maxTensorShape.length) + maxTensorShape = X; + + Profiler.EndSample (); + + return maxTensorShape; + } + + static public TensorShape FindLargestArgumentTensorShape(Model model) + { + TensorShape maxTensorShape = new TensorShape(1,1,1,1); + foreach (var layer in model.layers) + foreach (var arg in layer.datasets) + if (arg.shape.length > maxTensorShape.length) + maxTensorShape = arg.shape; + + return maxTensorShape; + } + + static public string[] FindBrokenLinks(Model model) + { + var globalInputsByName = model.inputs.ToDictionary(i => i.name, i => true); + var layersByName = 
model.layers.ToDictionary(i => i.name, i => i); + var brokenLinks = new HashSet(); + + foreach (var layer in model.layers) + foreach (var i in layer.inputs) + if (!layersByName.ContainsKey(i) && !globalInputsByName.ContainsKey(i)) + brokenLinks.Add(i); + return brokenLinks.ToArray(); + } + + static public string[] FindUnconnectedInputs(Model model) + { + var unconnected = model.inputs.ToDictionary(i => i.name, i => true); + foreach (var layer in model.layers) + foreach (var i in layer.inputs) + unconnected.Remove(i); + return unconnected.Keys.ToArray(); + } + + static public string[] FindUnconnectedOutputs(Model model, List outputs) + { + var unconnected = outputs.ToDictionary(i => i, i => true); + foreach (var layer in model.layers) + unconnected.Remove(layer.name); + return unconnected.Keys.ToArray(); + } + + static public string[] FindUnconnectedOutputs(Model model) + { + return FindUnconnectedOutputs(model, model.outputs); + } +} + + +} // namespace Barracuda diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/ModelAnalyzer.cs.meta b/Assets/Coach-ML/Barracuda/Core/Backends/ModelAnalyzer.cs.meta new file mode 100644 index 0000000..eab91aa --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/ModelAnalyzer.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: 58838262534854657974303d5782ea38 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/StatsOps.cs b/Assets/Coach-ML/Barracuda/Core/Backends/StatsOps.cs new file mode 100644 index 0000000..aac56a4 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/StatsOps.cs @@ -0,0 +1,530 @@ +using System; +using UnityEngine; + +namespace Barracuda { + + +public class StatsOps : IOps +{ + class Transcendental + { + + // Table of approximate alu operation costs + // mul 1 + // rcp/mad 2 + // div/sqrt 10 + // log/exp 100 + // pow 200 + // see: https://www.sciencedirect.com/topics/computer-science/division-operation + // see: https://colfaxresearch.com/arithmetics-on-intels-sandy-bridge-and-westmere-cpus-not-all-flops-are-created-equal/ + + public const long Reciprocal = 2L; + public const long Div = 10L; + public const long Root = 10L; + public const long Exponent = 100L; + public const long Pow = 200L; + public const long Trigonometric = 200L; + } + + private IOps m_Ops; + private long m_Alu; + private long m_Mem; + + public StatsOps(IOps ops) + { + m_Ops = ops; + m_Alu = 0L; + m_Mem = 0L; + } + + public virtual void WaitForCompletion(Tensor x) + { + m_Ops.WaitForCompletion(x); + } + + Tensor IOps.MatMul(Tensor X, bool xTranspose, Tensor Y, bool yTranspose) + { + var O = m_Ops.MatMul(X, xTranspose, Y, yTranspose); + m_Alu += (long)X.flatHeight * (long)X.flatWidth * (long)Y.flatWidth * 2L; + m_Mem += (long)X.length + (long)Y.length + (long)O.length; + return O; + } + Tensor IOps.Dense(Tensor X, Tensor W, Tensor B) + { + var O = m_Ops.Dense(X, W, B); + m_Alu += (long)X.flatHeight * (long)X.flatWidth * (long)W.flatWidth * 2L; + m_Mem += (long)X.length + (long)W.length + (long)B.length + (long)O.length; + return O; + } + Tensor IOps.Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad) + { + var O = m_Ops.Conv2D(X, K, B, stride, pad); + long m = (long)O.batch * (long)O.width * (long)O.height; + long n = (long)X.channels; + long k = (long)K.kernelWidth * (long)K.kernelHeight * (long)K.channels; + m_Alu += m * n * k * 2L; + m_Mem += (long)X.length + 
(long)K.length + (long)B.length + (long)O.length; + return O; + } + Tensor IOps.DepthwiseConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad) + { + var O = m_Ops.DepthwiseConv2D(X, K, B, stride, pad); + long m = (long)O.batch * (long)O.width * (long)O.height; + long n = (long)X.channels; + long k = (long)K.kernelWidth * (long)K.kernelHeight; + m_Alu += m * n * k * 2L; + m_Mem += (long)X.length + (long)K.length + (long)B.length + (long)O.length; + return O; + } + Tensor IOps.Conv2DTrans(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, int[] outputAdjustment) + { + var O = m_Ops.Conv2DTrans(X, K, B, stride, pad, outputAdjustment); + long m = (long)O.batch * (long)O.width * (long)O.height; + long n = (long)X.channels; + long k = (long)(K.kernelWidth/stride[1]) * (long)(K.kernelHeight/stride[0]) * (long)K.channels; + m_Alu += m * n * k * 2L; + m_Mem += (long)X.length + (long)K.length + (long)B.length + (long)O.length; + return O; + } + Tensor IOps.Upsample2D(Tensor X, int[] size) + { + var O = m_Ops.Upsample2D(X, size); + m_Alu += (long)O.length; + m_Mem += (long)X.length + (long)O.length; + return O; + } + Tensor IOps.MaxPool2D(Tensor X, int[] pool, int[] stride, int[] pad) + { + var O = m_Ops.MaxPool2D(X, pool, stride, pad); + Reduce(X, O); + return O; + } + Tensor IOps.AvgPool2D(Tensor X, int[] pool, int[] stride, int[] pad) + { + var O = m_Ops.AvgPool2D(X, pool, stride, pad); + Reduce(X, O); + return O; + } + Tensor IOps.GlobalMaxPool2D(Tensor X) + { + var O = m_Ops.GlobalMaxPool2D(X); + Reduce(X, O); + return O; + } + Tensor IOps.GlobalAvgPool2D(Tensor X) + { + var O = m_Ops.GlobalAvgPool2D(X); + Reduce(X, O); + return O; + } + Tensor IOps.GlobalAvgVariancePool2D(Tensor X) + { + var O = m_Ops.GlobalAvgVariancePool2D(X); + m_Alu += (long)X.length * 2L + (long)O.length; + m_Mem += (long)X.length + (long)O.length; + return O; + } + + Tensor IOps.Border2D(Tensor X, int[] pad, float value) + { + var O = m_Ops.Border2D(X, pad, value); + m_Mem += (long)X.length + (long)O.length; + return O; + } + Tensor IOps.Pad2DReflect(Tensor X, int[] pad) + { + var O = m_Ops.Pad2DReflect(X, pad); + m_Mem += (long)X.length + (long)O.length; + return O; + } + Tensor IOps.Pad2DSymmetric(Tensor X, int[] pad) + { + var O = m_Ops.Pad2DSymmetric(X, pad); + m_Mem += (long)X.length + (long)O.length; + return O; + } + Tensor IOps.Pad2DEdge(Tensor X, int[] pad) + { + var O = m_Ops.Pad2DEdge(X, pad); + m_Mem += (long)X.length + (long)O.length; + return O; + } + + Tensor IOps.ScaleBias(Tensor X, Tensor S, Tensor B) + { + Elementwise(X, 2L); + return m_Ops.ScaleBias(X, S, B); + } + Tensor IOps.Normalization(Tensor X, Tensor S, Tensor B, int pool, int axis, float epsilon) + { + var O = m_Ops.Normalization(X, S, B, pool, axis, epsilon); + m_Alu += (long)X.length * 4L + (long)O.length * 2L; + m_Mem += (long)X.length + (long)O.length; + return O; + } + Tensor IOps.LRN(Tensor X, float alpha, float beta, float bias, int size) + { + var O = m_Ops.LRN(X, alpha, beta, bias, size); + // @TODO: not implemented + return O; + } + Tensor IOps.Dropout(Tensor X, float alpha) + { + Elementwise(X); + return m_Ops.Dropout(X, alpha); + } + Tensor IOps.RandomNormal(TensorShape s, float mean, float scale, int seed) + { + var O = m_Ops.RandomNormal(s, mean, scale, seed); + // @TODO: not implemented + return O; + } + Tensor IOps.RandomUniform(TensorShape s, float mean, float scale, int seed) + { + var O = m_Ops.RandomUniform(s, mean, scale, seed); + // @TODO: not implemented + return O; + } + Tensor IOps.Multinomial(Tensor X, int 
count, int seed) + { + var O = m_Ops.Multinomial(X, count, seed); + // @TODO: not implemented + return O; + } + Tensor IOps.OneHot(Tensor X, int depth, float onValue, float offValue) + { + var O = m_Ops.OneHot(X, depth, onValue, offValue); + // @TODO: not implemented + return O; + } + + Tensor IOps.Relu(Tensor X) + { + Elementwise(X); + return m_Ops.Relu(X); + } + Tensor IOps.Softmax(Tensor X) + { + Elementwise(X, Transcendental.Exponent); + return m_Ops.Softmax(X); + } + Tensor IOps.LogSoftmax(Tensor X) + { + Elementwise(X, Transcendental.Exponent); + return m_Ops.LogSoftmax(X); + } + Tensor IOps.Tanh(Tensor X) + { + Elementwise(X, Transcendental.Trigonometric); + return m_Ops.Tanh(X); + } + Tensor IOps.Sigmoid(Tensor X) + { + Elementwise(X, Transcendental.Trigonometric); + return m_Ops.Sigmoid(X); + } + Tensor IOps.Relu6(Tensor X) + { + Elementwise(X, 4L); + return m_Ops.Relu6(X); + } + Tensor IOps.Elu(Tensor X, float alpha) + { + Elementwise(X, Transcendental.Exponent); + return m_Ops.Elu(X, alpha); + } + Tensor IOps.LeakyRelu(Tensor X, float alpha) + { + Elementwise(X, 4L); + return m_Ops.LeakyRelu(X, alpha); + } + Tensor IOps.Selu(Tensor X, float alpha, float gamma) + { + Elementwise(X, Transcendental.Exponent); + return m_Ops.Selu(X, alpha, gamma); + } + Tensor IOps.PRelu(Tensor X, Tensor S) + { + Elementwise(X, 4L); + return m_Ops.PRelu(X, S); + } + Tensor IOps.Swish(Tensor X) + { + Elementwise(X, Transcendental.Trigonometric); + return m_Ops.Swish(X); + } + Tensor IOps.Abs(Tensor X) + { + Elementwise(X); + return m_Ops.Abs(X); + } + Tensor IOps.Neg(Tensor X) + { + Elementwise(X); + return m_Ops.Neg(X); + } + Tensor IOps.Ceil(Tensor X) + { + Elementwise(X); + return m_Ops.Ceil(X); + } + Tensor IOps.Clip(Tensor X, float min, float max) + { + Elementwise(X, 2L); + return m_Ops.Clip(X, min, max); + } + Tensor IOps.Floor(Tensor X) + { + Elementwise(X); + return m_Ops.Floor(X); + } + + Tensor IOps.Reciprocal(Tensor X) + { + Elementwise(X, Transcendental.Reciprocal); + return m_Ops.Reciprocal(X); + } + Tensor IOps.Pow(Tensor X, float alpha) + { + Elementwise(X, Transcendental.Pow); + return m_Ops.Pow(X, alpha); + } + Tensor IOps.Exp(Tensor X) + { + Elementwise(X, Transcendental.Exponent); + return m_Ops.Exp(X); + } + Tensor IOps.Log(Tensor X) + { + Elementwise(X, Transcendental.Exponent); + return m_Ops.Log(X); + } + Tensor IOps.Sqrt(Tensor X) + { + Elementwise(X, Transcendental.Root); + return m_Ops.Sqrt(X); + } + + Tensor IOps.Add(Tensor[] tensors) + { + var O = m_Ops.Add(tensors); + ElementwiseBroadcast(tensors, O); + return O; + } + Tensor IOps.Sub(Tensor[] tensors) + { + var O = m_Ops.Sub(tensors); + ElementwiseBroadcast(tensors, O); + return O; + } + Tensor IOps.Mul(Tensor[] tensors) + { + var O = m_Ops.Mul(tensors); + ElementwiseBroadcast(tensors, O); + return O; + } + Tensor IOps.Div(Tensor[] tensors) + { + var O = m_Ops.Div(tensors); + ElementwiseBroadcast(tensors, O, Transcendental.Div); + return O; + } + Tensor IOps.Pow(Tensor[] tensors) + { + var O = m_Ops.Pow(tensors); + ElementwiseBroadcast(tensors, O, Transcendental.Pow); + return O; + } + Tensor IOps.Min(Tensor[] tensors) + { + var O = m_Ops.Min(tensors); + ElementwiseBroadcast(tensors, O); + return O; + } + Tensor IOps.Max(Tensor[] tensors) + { + var O = m_Ops.Max(tensors); + ElementwiseBroadcast(tensors, O); + return O; + } + Tensor IOps.Mean(Tensor[] tensors) + { + var O = m_Ops.Mean(tensors); + ElementwiseBroadcast(tensors, O); + return O; + } + + Tensor IOps.ReduceMax(Tensor X, int axis) + { + var O = 
m_Ops.ReduceMax(X, axis); + Reduce(X, O); + return O; + } + Tensor IOps.ReduceMean(Tensor X, int axis) + { + var O = m_Ops.ReduceMean(X, axis); + Reduce(X, O); + return O; + } + Tensor IOps.ReduceMin(Tensor X, int axis) + { + var O = m_Ops.ReduceMin(X, axis); + Reduce(X, O); + return O; + } + Tensor IOps.ReduceProd(Tensor X, int axis) + { + var O = m_Ops.ReduceProd(X, axis); + Reduce(X, O); + return O; + } + Tensor IOps.ReduceSum(Tensor X, int axis) + { + var O = m_Ops.ReduceSum(X, axis); + Reduce(X, O); + return O; + } + + Tensor IOps.Greater(Tensor a, Tensor b) + { + var O = m_Ops.Greater(a, b); + Elementwise(O); + return O; + } + Tensor IOps.GreaterEqual(Tensor a, Tensor b) + { + var O = m_Ops.GreaterEqual(a, b); + Elementwise(O); + return O; + } + Tensor IOps.Less(Tensor a, Tensor b) + { + var O = m_Ops.Less(a, b); + Elementwise(O); + return O; + } + Tensor IOps.LessEqual(Tensor a, Tensor b) + { + var O = m_Ops.LessEqual(a, b); + Elementwise(O); + return O; + } + Tensor IOps.Equal(Tensor a, Tensor b) + { + var O = m_Ops.Equal(a, b); + Elementwise(O); + return O; + } + Tensor IOps.LogicalOr(Tensor a, Tensor b) + { + var O = m_Ops.LogicalOr(a, b); + Elementwise(O); + return O; + } + Tensor IOps.LogicalAnd(Tensor a, Tensor b) + { + var O = m_Ops.LogicalAnd(a, b); + Elementwise(O); + return O; + } + Tensor IOps.LogicalXor(Tensor a, Tensor b) + { + var O = m_Ops.LogicalXor(a, b); + Elementwise(O); + return O; + } + Tensor IOps.LogicalNot(Tensor x) + { + var O = m_Ops.LogicalNot(x); + Elementwise(O); + return O; + } + + Tensor IOps.Flatten(Tensor X) + { + return m_Ops.Flatten(X); + } + Tensor IOps.Reshape(Tensor X, TensorShape shape) + { + return m_Ops.Reshape(X, shape); + } + Tensor IOps.Transpose(Tensor X) + { + Elementwise(X); + return m_Ops.Transpose(X); + } + + Tensor IOps.Concat(Tensor[] tensors, int axis) + { + var O = m_Ops.Concat(tensors, axis); + Elementwise(O); + return O; + } + Tensor IOps.StridedSlice(Tensor X, int[] starts, int[] ends, int[] strides) + { + var O = m_Ops.StridedSlice(X, starts, ends, strides); + Elementwise(O); + return O; + } + Tensor IOps.Tile(Tensor X, int[] repeats) + { + var O = m_Ops.Tile(X, repeats); + Elementwise(O); + return O; + } + + Tensor IOps.Prepare(Tensor X) + { + return m_Ops.Prepare(X); + } + + void IOps.ResetAllocator(bool keepCachedMemory) + { + m_Ops.ResetAllocator(keepCachedMemory); + m_Alu = 0; + m_Mem = 0; + } + + public override string ToString() + { + string alu = m_Alu.ToString(); + if (m_Alu > 1e12) + alu = $"{(double)m_Alu / (1e12):###.0}T"; + else if (m_Alu > 1e9) + alu = $"{(double)m_Alu / (1e9):###.0}G"; + else if (m_Alu > 1e6) + alu = $"{(double)m_Alu / (1e6):###.0}M"; + + var mem4 = m_Mem * 4L; + string mem = mem4.ToString(); + if (mem4 > 1024*1024*1024) + mem = $"{(double)mem4 / (1024*1024*1024):###.0}Gb"; + else if (mem4 > 1024*1024) + mem = $"{(double)mem4 / (1024*1024):###.0}Mb"; + return $"ALU operations: {alu} bytes accessed: {mem}"; + } + + // ----- + protected void Elementwise(Tensor X, long aluOperationsPerElement = 1L) + { + m_Alu += (long)X.length * aluOperationsPerElement; + m_Mem += (long)X.length * 2L; + } + + protected void ElementwiseBroadcast(Tensor[] tensors, Tensor X, long aluOperationsPerElement = 1L) + { + m_Alu += (long)X.length * aluOperationsPerElement; + m_Mem += (long)X.length; + foreach (var t in tensors) + m_Mem += (long)t.length; + } + + protected void Reduce(Tensor X, Tensor O, long aluOperationsPerElement = 1L) + { + m_Alu += (long)X.length * aluOperationsPerElement; + m_Mem += (long)X.length + 
(long)O.length; + } +} + + +} // namespace Barracuda diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/StatsOps.cs.meta b/Assets/Coach-ML/Barracuda/Core/Backends/StatsOps.cs.meta new file mode 100644 index 0000000..6f4724a --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/StatsOps.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: 326d2411861b248059757b7e98e3a101 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/TensorAllocators.cs b/Assets/Coach-ML/Barracuda/Core/Backends/TensorAllocators.cs new file mode 100644 index 0000000..ff531c1 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/TensorAllocators.cs @@ -0,0 +1,705 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using System.Linq; // ToList() + +using UnityEngine; +using UnityEngine.Assertions; +using UnityEngine.Profiling; + +namespace Barracuda { + + +public class TensorOperatorNewAllocator : ITensorAllocator +{ + private List m_AllocatedTensors = new List(); + private HashSet m_AllocatedBuffers = new HashSet(); + + public TensorOperatorNewAllocator() + { + } + + ~TensorOperatorNewAllocator() + { + Dispose(); + } + + public virtual Tensor Alloc(TensorShape shape) + { + var newTensor = new Tensor(shape, this); + newTensor.name = "untitled"; + m_AllocatedTensors.Add(newTensor); + return newTensor; + } + + public virtual Tensor Alloc(TensorShape shape, ITensorData buffer) + { + var newTensor = new Tensor(shape, buffer, this); + newTensor.name = "untitled"; + m_AllocatedTensors.Add(newTensor); + m_AllocatedBuffers.Add(buffer); + return newTensor; + } + + public virtual void Release(Tensor tensor, bool calledFromTensorDispose) + { + } + + public virtual void Repin(Tensor tensor, ITensorData newBuffer, ITensorData oldBuffer, bool disposeUnpinnedHint) + { + if (newBuffer != null) + m_AllocatedBuffers.Add(newBuffer); + } + + public virtual void Cast(Tensor tensor, ITensorData newBuffer, ITensorData oldBuffer) + { + if (oldBuffer != null) + m_AllocatedBuffers.Remove(oldBuffer); + if (newBuffer != null) + m_AllocatedBuffers.Add(newBuffer); + } + + public virtual void Reset(bool keepCachedMemory) + { + Dispose(); + } + + public virtual void WaiveOwnership(Tensor tensor) + { + tensor.Unpin(); + m_AllocatedTensors.Remove(tensor); + m_AllocatedBuffers.Remove(tensor.tensorOnDevice); + } + + public virtual void Dispose() + { + foreach (var tensor in m_AllocatedTensors) + tensor.Dispose(); + foreach (var buf in m_AllocatedBuffers) + buf.Dispose(); + m_AllocatedTensors.Clear(); + m_AllocatedBuffers.Clear(); + } + + public long busyBytes + { get { + long bytes = 0; + foreach(var tensor in m_AllocatedTensors) + bytes += tensor.length * sizeof(float); + return bytes; + } } + public long freeBytes + { get { + return 0; + } } + public long totalBytes + { get { + return busyBytes + freeBytes; + } } + public override string ToString() + { + return "Total allocated: " + totalBytes; + } +} + +// @TODO: reduce code duplication between TensorCachingByShapeAllocator and TensorCachingAllocator +public class TensorCachingByShapeAllocator : ITensorAllocator +{ + struct Entry + { + public TensorShape shape; + public ITensorData buffer; + } + // multi-value Dictionary implemented via + // pair of m_FreeTensorByShape and m_FreeTensors + private Dictionary> m_FreeBufferByShape = new Dictionary>(); + private LinkedList m_FreeBuffers 
= new LinkedList(); + private Dictionary m_BusyTensors = new Dictionary(); + private Dictionary m_SharedBuffers = new Dictionary(); + + public TensorCachingByShapeAllocator() + { + } + + ~TensorCachingByShapeAllocator() + { + Dispose(); + } + + protected void AddRef(ITensorData buffer) + { + if (buffer == null) + return; + + var sharedBufferCount = 0; + m_SharedBuffers.TryGetValue(buffer, out sharedBufferCount); + m_SharedBuffers[buffer] = sharedBufferCount + 1; + } + + protected void DecRef(ITensorData buffer, Action onLastRef = null) + { + if (buffer == null) + return; + + Assert.IsTrue(m_SharedBuffers.ContainsKey(buffer)); + Assert.IsTrue(m_SharedBuffers[buffer] > 0); + if (--m_SharedBuffers[buffer] > 0) + return; + + m_SharedBuffers.Remove(buffer); + + if (onLastRef != null) + onLastRef(buffer); + } + + protected void AdoptFreeBuffer(TensorShape shape, ITensorData buffer) + { + // code below automatically covers handles edge-case (2) + // by adopting tensor's with the new ITensorData into m_FreeTensors/m_FreeTensorByShape + var newEntry = new Entry { shape = shape, buffer = buffer }; + LinkedListNode node; + if (m_FreeBufferByShape.TryGetValue(newEntry.shape, out node)) + { + m_FreeBuffers.AddAfter(node, newEntry); + } + else + { + var newNode = m_FreeBuffers.AddLast(newEntry); + m_FreeBufferByShape.Add(newEntry.shape, newNode); + } + } + + public virtual Tensor Alloc(TensorShape shape) + { + Profiler.BeginSample("Barracuda.ShapeAllocator.Alloc"); + var name = "untitled"; + + LinkedListNode node; + if (m_FreeBufferByShape.TryGetValue(shape, out node)) + { + Assert.AreEqual(node.Value.shape, shape); + + // advance dictionary to the next Tensor with the same shape, if available + if (node.Next != null && node.Next.Value.shape == shape) + m_FreeBufferByShape[shape] = node.Next; + else + m_FreeBufferByShape.Remove(shape); + + var buffer = node.Value.buffer; + buffer?.Reserve(shape.length); + + var tensor = new Tensor(shape, buffer, this); // @TODO: reuse Tensor instances + tensor.name = name; + + m_FreeBuffers.Remove(node); + m_BusyTensors.Add(tensor, buffer); + AddRef(buffer); + + Assert.AreEqual(tensor.shape, shape); + Profiler.EndSample(); + return tensor; + } + + var newTensor = new Tensor(shape, this); + newTensor.name = name; + m_BusyTensors.Add(newTensor, newTensor.tensorOnDevice); + AddRef(newTensor.tensorOnDevice); + + Profiler.EndSample(); + return newTensor; + } + + public virtual Tensor Alloc(TensorShape shape, ITensorData buffer) + { + Profiler.BeginSample("Barracuda.ShapeAllocator.Alloc"); + var name = "untitled"; + + var tensor = new Tensor(shape, buffer, this); // @TODO: reuse Tensor instances + tensor.name = name; + m_BusyTensors.Add(tensor, buffer); + AddRef(buffer); + + Profiler.EndSample(); + return tensor; + } + + public virtual void Release(Tensor tensor, bool calledFromTensorDispose) + { + Profiler.BeginSample("Barracuda.ShapeAllocator.Release"); + Assert.AreEqual(tensor.allocator, this); + + var unpinned = tensor.Invalidate(); // calls Repin(newBuffer=null) + + if (!m_BusyTensors.ContainsKey(tensor)) + { + if (unpinned == null) + return; + + foreach (var freeEntry in m_FreeBuffers) + if (freeEntry.buffer == unpinned) + return; + + // some operations can create new Tensor and reassign ITensorData to it + foreach (var busyEntry in m_BusyTensors) + if (busyEntry.Value == unpinned) + return; // we have at least another instance ITensorData in m_BusyTensors, nothing to realease + } + + Assert.IsTrue(m_BusyTensors.ContainsKey(tensor)); + m_BusyTensors.Remove(tensor); + 
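+ // Note: the buffer itself is recycled (or disposed) through the Repin()/DecRef() path that
+ // tensor.Invalidate() above triggered with a null replacement buffer; at this point the
+ // allocator has merely stopped tracking the Tensor instance in m_BusyTensors.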
Profiler.EndSample(); + } + + public virtual void Repin(Tensor tensor, ITensorData newBuffer, ITensorData oldBuffer, bool disposeUnpinnedHint) + { + if (newBuffer == oldBuffer) + return; + + Assert.AreEqual(tensor.allocator, this); + Assert.IsTrue(m_BusyTensors.ContainsKey(tensor)); + m_BusyTensors[tensor] = newBuffer; + + AddRef(newBuffer); + DecRef(oldBuffer, + (freeBuffer) => { + if (disposeUnpinnedHint) + freeBuffer.Dispose(); + else + AdoptFreeBuffer(tensor.shape, freeBuffer); + }); + } + + public virtual void Cast(Tensor tensor, ITensorData newBuffer, ITensorData oldBuffer) + { + if (newBuffer == oldBuffer) + return; + + Assert.AreEqual(tensor.allocator, this); + Assert.IsTrue(m_BusyTensors.ContainsKey(tensor)); + m_BusyTensors[tensor] = newBuffer; + + AddRef(newBuffer); + DecRef(oldBuffer); + } + + public virtual void Reset(bool keepCachedMemory) + { + Profiler.BeginSample("Barracuda.ShapeAllocator.Reset"); + + if (!keepCachedMemory) + Dispose(); + + foreach(var tensor in m_BusyTensors.Keys.ToList()) + Release(tensor, false); + + Assert.AreEqual(m_BusyTensors.Count, 0); + Assert.AreEqual(m_SharedBuffers.Count, 0); + + Profiler.EndSample(); + } + + public virtual void WaiveOwnership(Tensor tensor) + { + Assert.AreEqual(tensor.allocator, this); + Assert.IsTrue(m_BusyTensors.ContainsKey(tensor)); + m_BusyTensors.Remove(tensor); + + var buffer = tensor.tensorOnDevice; + if (buffer == null) + return; + + Profiler.BeginSample("Barracuda.ShapeAllocator.WaiveOwnership"); + + int sharedCount = 0; + m_SharedBuffers.TryGetValue(buffer, out sharedCount); + if (sharedCount > 1) + { + var patchBusyTensors = new List(); + foreach (var busyEntry in m_BusyTensors) + if (busyEntry.Value == buffer) + patchBusyTensors.Add(busyEntry.Key); + + Assert.AreEqual(sharedCount - 1, patchBusyTensors.Count); + + foreach (var busyTensor in patchBusyTensors) + { + Assert.AreEqual(m_BusyTensors[busyTensor], buffer); + + var unpinned = busyTensor.Unpin(false); + var newBuffer = busyTensor.tensorOnDevice; + Assert.IsTrue(unpinned == buffer); + Assert.IsTrue(newBuffer != buffer); + m_BusyTensors[busyTensor] = newBuffer; + AddRef(newBuffer); + } + } + + // Assert no references to tensor are left owned by allocator + Assert.IsTrue(m_SharedBuffers[buffer] == 1); + m_SharedBuffers.Remove(buffer); + foreach (var freeEntry in m_FreeBuffers) + { + Assert.IsTrue(freeEntry.buffer != buffer); + } + foreach(var busyEntry in m_BusyTensors) + { + Assert.IsTrue(busyEntry.Key != tensor); + Assert.IsTrue(busyEntry.Value != buffer); + } + + Profiler.EndSample(); + } + + public virtual void Dispose() + { + m_FreeBufferByShape.Clear(); + foreach(var tensor in m_BusyTensors.Keys.ToList()) + Release(tensor, false); + foreach (var entry in m_FreeBuffers) + entry.buffer?.Dispose(); + + m_BusyTensors.Clear(); + m_FreeBuffers.Clear(); + m_SharedBuffers.Clear(); + } + + public long busyBytes + { get { + long bytes = 0; + foreach(var tensor in m_BusyTensors.Keys) + bytes += tensor.length * sizeof(float); + return bytes; + } } + public long freeBytes + { get { + long bytes = 0; + foreach(var entry in m_FreeBuffers) + bytes += entry.shape.length * sizeof(float); + return bytes; + } } + public long totalBytes + { get { + return busyBytes + freeBytes; + } } + public override string ToString() + { + return "Total allocated: " + totalBytes + " busy: " + busyBytes; + } +} + +public class TensorCachingAllocator : ITensorAllocator +{ + struct Entry + { + public int size; + public ITensorData buffer; + public bool free; + } + // Sorted by size array of 
ITensorData + private List m_AllocatedBuffers = new List(); + private Dictionary m_BusyTensors = new Dictionary(); + private Dictionary m_SharedBuffers = new Dictionary(); + + private Action disposeAllocatedBufferDelegate; + private Action adoptFreeBufferDelegate; + + public TensorCachingAllocator() + { + disposeAllocatedBufferDelegate = DisposeAllocatedBuffer; + adoptFreeBufferDelegate = AdoptFreeBuffer; + } + + ~TensorCachingAllocator() + { + Dispose(); + } + + static protected int GetAllocationMaxCount(Tensor tensor) + { + return (tensor.tensorOnDevice != null) ? + tensor.tensorOnDevice.GetMaxCount(): + tensor.length; + } + + protected void AddRef(ITensorData buffer) + { + if (buffer == null) + return; + + var sharedBufferCount = 0; + m_SharedBuffers.TryGetValue(buffer, out sharedBufferCount); + m_SharedBuffers[buffer] = sharedBufferCount + 1; + } + + protected void DecRef(ITensorData buffer, Action onLastRef = null) + { + if (buffer == null) + return; + + Assert.IsTrue(m_SharedBuffers.ContainsKey(buffer)); + Assert.IsTrue(m_SharedBuffers[buffer] > 0); + if (--m_SharedBuffers[buffer] > 0) + return; + + m_SharedBuffers.Remove(buffer); + + if (onLastRef != null) + onLastRef(buffer); + } + + protected void AdoptFreeBuffer(ITensorData buffer) + { + // insert into the sorted array + var size = buffer.GetMaxCount(); + var newEntry = new Entry { size = size, buffer = buffer, free = true }; + bool found = false; + for (int i = 0; !found && i < m_AllocatedBuffers.Count; ++i) + { + var entry = m_AllocatedBuffers[i]; + if (buffer == entry.buffer) + { + Assert.IsTrue(!entry.free); + entry.free = true; + m_AllocatedBuffers[i] = entry; + Assert.IsTrue(m_AllocatedBuffers[i].free); + found = true; + } + if (size < entry.size) + { + m_AllocatedBuffers.Insert(i, newEntry); + Assert.IsTrue(m_AllocatedBuffers[i].size < m_AllocatedBuffers[i + 1].size); + found = true; + } + } + + if (!found) + m_AllocatedBuffers.Add(newEntry); + } + + protected void DisposeAllocatedBuffer(ITensorData buffer) + { + for (int i = m_AllocatedBuffers.Count - 1; i >= 0; i--) + if (m_AllocatedBuffers[i].buffer == buffer) + m_AllocatedBuffers.RemoveAt(i); + buffer.Dispose(); + } + + public virtual Tensor Alloc(TensorShape shape) + { + Profiler.BeginSample("Barracuda.SizeAllocator.Alloc"); + var name = "untitled"; + + for (int i = 0; i < m_AllocatedBuffers.Count; ++i) + { + var entry = m_AllocatedBuffers[i]; + if (entry.size >= shape.length && entry.free) + { + entry.free = false; + m_AllocatedBuffers[i] = entry; + + var buffer = entry.buffer; + buffer?.Reserve(shape.length); + + var tensor = new Tensor(shape, buffer, this); // @TODO: reuse Tensor instances + tensor.name = name; + + m_BusyTensors.Add(tensor, tensor.tensorOnDevice); + AddRef(tensor.tensorOnDevice); + + Profiler.EndSample(); + return tensor; + } + } + + + var newTensor = new Tensor(shape, this); + newTensor.name = name; + m_BusyTensors.Add(newTensor, newTensor.tensorOnDevice); + AddRef(newTensor.tensorOnDevice); + + Profiler.EndSample(); + return newTensor; + } + + public virtual Tensor Alloc(TensorShape shape, ITensorData buffer) + { + Profiler.BeginSample("Barracuda.SizeAllocator.Alloc"); + var name = "untitled"; + + var tensor = new Tensor(shape, buffer, this); // @TODO: reuse Tensor instances + tensor.name = name; + m_BusyTensors.Add(tensor, tensor.tensorOnDevice); + AddRef(tensor.tensorOnDevice); + + Profiler.EndSample(); + return tensor; + } + + public virtual void Release(Tensor tensor, bool calledFromTensorDispose) + { + 
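+ // Release() detaches the tensor from its device buffer via Invalidate(), which routes through
+ // Repin(newBuffer=null) and DecRef(); depending on the remaining reference count and the unpin
+ // hint, the buffer is kept by another tensor, marked free in m_AllocatedBuffers, or disposed.
+ // The rest of the method only stops tracking the Tensor instance, with early-outs for buffers
+ // that are already free or still shared.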
Profiler.BeginSample("Barracuda.SizeAllocator.Release"); + Assert.AreEqual(tensor.allocator, this); + + var unpinned = tensor.Invalidate(); // calls Repin(newBuffer=null) + + if (!m_BusyTensors.ContainsKey(tensor)) + { + if (unpinned == null) + return; + + foreach (var entry in m_AllocatedBuffers) + if (entry.buffer == unpinned && entry.free) + return; + + // some operations can create new Tensor and reassign ITensorData to it + foreach (var busyEntry in m_BusyTensors) + if (busyEntry.Value == unpinned) + return; // we have original ITensorData in m_BusyTensors, nothing to realease + } + + Assert.IsTrue(m_BusyTensors.ContainsKey(tensor)); + m_BusyTensors.Remove(tensor); + + Profiler.EndSample(); + } + + public virtual void Repin(Tensor tensor, ITensorData newBuffer, ITensorData oldBuffer, bool disposeUnpinnedHint) + { + if (newBuffer == oldBuffer) + return; + + Assert.AreEqual(tensor.allocator, this); + Assert.IsTrue(m_BusyTensors.ContainsKey(tensor)); + m_BusyTensors[tensor] = newBuffer; + + AddRef(newBuffer); + + if (disposeUnpinnedHint) + DecRef(oldBuffer, disposeAllocatedBufferDelegate); + else + DecRef(oldBuffer, adoptFreeBufferDelegate); + } + + public virtual void Cast(Tensor tensor, ITensorData newBuffer, ITensorData oldBuffer) + { + if (newBuffer == oldBuffer) + return; + + Assert.AreEqual(tensor.allocator, this); + Assert.IsTrue(m_BusyTensors.ContainsKey(tensor)); + m_BusyTensors[tensor] = newBuffer; + + AddRef(newBuffer); + DecRef(oldBuffer); + } + + public virtual void Reset(bool keepCachedMemory) + { + Profiler.BeginSample("Barracuda.SizeAllocator.Reset"); + + if (!keepCachedMemory) + Dispose(); + + foreach(var tensor in m_BusyTensors.Keys.ToList()) + Release(tensor, false); + + Assert.AreEqual(m_BusyTensors.Count, 0); + Assert.AreEqual(m_SharedBuffers.Count, 0); + + foreach(var buf in m_AllocatedBuffers) + Assert.IsTrue(buf.free); + + Profiler.EndSample(); + } + + public virtual void WaiveOwnership(Tensor tensor) + { + Assert.AreEqual(tensor.allocator, this); + Assert.IsTrue(m_BusyTensors.ContainsKey(tensor)); + m_BusyTensors.Remove(tensor); + + var buffer = tensor.tensorOnDevice; + if (buffer == null) + return; + + Profiler.BeginSample("Barracuda.SizeAllocator.WaiveOwnership"); + + int sharedCount = 0; + m_SharedBuffers.TryGetValue(buffer, out sharedCount); + if (sharedCount > 1) + { + var patchBusyTensors = new List(); + foreach (var busyEntry in m_BusyTensors) + if (busyEntry.Value == buffer) + patchBusyTensors.Add(busyEntry.Key); + + Assert.AreEqual(sharedCount - 1, patchBusyTensors.Count); + + foreach (var busyTensor in patchBusyTensors) + { + Assert.AreEqual(m_BusyTensors[busyTensor], buffer); + + var unpinned = busyTensor.Unpin(false); + var newBuffer = busyTensor.tensorOnDevice; + Assert.IsTrue(unpinned == buffer); + Assert.IsTrue(newBuffer != buffer); + m_BusyTensors[busyTensor] = newBuffer; + AddRef(newBuffer); + } + } + + // Assert no references to tensor are left owned by allocator + Assert.IsTrue(m_SharedBuffers[buffer] == 1); + m_SharedBuffers.Remove(buffer); + foreach (var freeEntry in m_AllocatedBuffers) + { + Assert.IsTrue(freeEntry.buffer != buffer); + } + foreach(var busyEntry in m_BusyTensors) + { + Assert.IsTrue(busyEntry.Key != tensor); + Assert.IsTrue(busyEntry.Value != buffer); + } + + Profiler.EndSample(); + } + + public virtual void Dispose() + { + foreach(var tensor in m_BusyTensors.Keys.ToList()) + Release(tensor, false); + foreach (var entry in m_AllocatedBuffers) + entry.buffer?.Dispose(); + + m_BusyTensors.Clear(); + m_AllocatedBuffers.Clear(); 
+ m_SharedBuffers.Clear(); + } + + public long busyBytes + { get { + long bytes = 0; + foreach(var tensor in m_BusyTensors.Keys) + bytes += GetAllocationMaxCount(tensor) * sizeof(float); + return bytes; + } } + public long freeBytes + { get { + long bytes = 0; + foreach(var entry in m_AllocatedBuffers) + if (entry.free) + bytes += entry.size * sizeof(float); + return bytes; + } } + public long totalBytes + { get { + return busyBytes + freeBytes; + } } + + public override string ToString() + { + return "Total allocated: " + totalBytes + " busy: " + busyBytes; + } +} + +} // namespace Barracuda diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/TensorAllocators.cs.meta b/Assets/Coach-ML/Barracuda/Core/Backends/TensorAllocators.cs.meta new file mode 100644 index 0000000..bfbd36c --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/TensorAllocators.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: 1c30b359da14d4b02a55e7c9806058f1 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/VerboseOps.cs b/Assets/Coach-ML/Barracuda/Core/Backends/VerboseOps.cs new file mode 100644 index 0000000..6c6769e --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/VerboseOps.cs @@ -0,0 +1,545 @@ +using System; +using UnityEngine; + +namespace Barracuda { + + +public class VerboseOps : IOps +{ + private IOps m_Ops; + private const string Prefix = "After "; + + public VerboseOps(IOps ops) + { + m_Ops = ops; + } + + public virtual void WaitForCompletion(Tensor x) + { + m_Ops.WaitForCompletion(x); + } + + Tensor IOps.MatMul(Tensor X, bool xTranspose, Tensor Y, bool yTranspose) + { + D.Log("(" + X.flatHeight + "," + X.flatWidth + ")" + (xTranspose?".T":"") + + " * (" + Y.flatHeight + "," + Y.flatWidth + ")"+ (yTranspose?".T":"")); + var O = m_Ops.MatMul(X, xTranspose, Y, yTranspose); + O.PrintDataPart(32, Prefix + "MatMul"); + return O; + } + Tensor IOps.Dense(Tensor X, Tensor W, Tensor B) + { + D.Log(X.shape + " * (" + W.flatHeight + "," + W.flatWidth + ") + (" + B.flatWidth + ")"); + var O = m_Ops.Dense(X, W, B); + O.PrintDataPart(32, Prefix + "Dense"); + return O; + } + Tensor IOps.Conv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad) + { + D.Log(X.shape + " # " + K.shape + " + (" + B.flatWidth + ")"); + var O = m_Ops.Conv2D(X, K, B, stride, pad); + O.PrintDataPart(32, Prefix + "Conv2D"); + return O; + } + Tensor IOps.DepthwiseConv2D(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad) + { + D.Log(X.shape + " ∆ " + K.shape + " + (" + B.flatWidth + ")"); + var O = m_Ops.DepthwiseConv2D(X, K, B, stride, pad); + O.PrintDataPart(32, Prefix + "DepthwiseConv2D"); + return O; + } + Tensor IOps.Conv2DTrans(Tensor X, Tensor K, Tensor B, int[] stride, int[] pad, int[] outputAdjustment) + { + D.Log(X.shape + " @ " + K.shape + " + (" + B.flatWidth + ")"); + var O = m_Ops.Conv2DTrans(X, K, B, stride, pad, outputAdjustment); + O.PrintDataPart(32, Prefix + "Conv2DTrans"); + return O; + } + Tensor IOps.Upsample2D(Tensor X, int[] size) + { + var O = m_Ops.Upsample2D(X, size); + D.Log(X.shape + " ^ " + O.shape); + O.PrintDataPart(32, Prefix + "Upsample2D"); + return O; + } + Tensor IOps.MaxPool2D(Tensor X, int[] pool, int[] stride, int[] pad) + { + var O = m_Ops.MaxPool2D(X, pool, stride, pad); + D.Log(X.shape + " > " + O.shape); + O.PrintDataPart(32, Prefix + "MaxPool2D"); + return O; + } + Tensor IOps.AvgPool2D(Tensor X, 
int[] pool, int[] stride, int[] pad) + { + var O = m_Ops.AvgPool2D(X, pool, stride, pad); + D.Log(X.shape + " ≥ " + O.shape); + O.PrintDataPart(32, Prefix + "AvgPool2D"); + return O; + } + Tensor IOps.GlobalMaxPool2D(Tensor X) + { + var O = m_Ops.GlobalMaxPool2D(X); + D.Log(X.shape + " >> " + O.shape); + O.PrintDataPart(32, Prefix + "GlobalMaxPool2D"); + return O; + } + Tensor IOps.GlobalAvgPool2D(Tensor X) + { + var O = m_Ops.GlobalAvgPool2D(X); + D.Log(X.shape + " ≥≥ " + O.shape); + O.PrintDataPart(32, Prefix + "GlobalAvgPool2D"); + return O; + } + Tensor IOps.GlobalAvgVariancePool2D(Tensor X) + { + var O = m_Ops.GlobalAvgVariancePool2D(X); + D.Log(X.shape + " ≥≥ " + O.shape); + O.PrintDataPart(32, Prefix + "GlobalAvgVariancePool2D"); + return O; + } + Tensor IOps.Border2D(Tensor X, int[] pad, float value) + { + D.Log($"{X.shape} ¶(border) value={value} pad=[{pad[0]},{pad[1]},{pad[2]},{pad[3]})"); + var O = m_Ops.Border2D(X, pad, value); + O.PrintDataPart(32, Prefix + "Border2D"); + return O; + } + Tensor IOps.Pad2DReflect(Tensor X, int[] pad) + { + D.Log($"{X.shape} ¶(reflect) pad=[{pad[0]},{pad[1]},{pad[2]},{pad[3]})"); + var O = m_Ops.Pad2DReflect(X, pad); + O.PrintDataPart(32, Prefix + "Pad2DReflect"); + return O; + } + Tensor IOps.Pad2DSymmetric(Tensor X, int[] pad) + { + D.Log($"{X.shape} ¶(symmetric) pad=[{pad[0]},{pad[1]},{pad[2]},{pad[3]})"); + var O = m_Ops.Pad2DSymmetric(X, pad); + O.PrintDataPart(32, Prefix + "Pad2DSymmetric"); + return O; + } + Tensor IOps.Pad2DEdge(Tensor X, int[] pad) + { + D.Log($"{X.shape} ¶(edge) pad=[{pad[0]},{pad[1]},{pad[2]},{pad[3]})"); + var O = m_Ops.Pad2DEdge(X, pad); + O.PrintDataPart(32, Prefix + "Pad2DEdge"); + return O; + } + + Tensor IOps.ScaleBias(Tensor X, Tensor S, Tensor B) + { + D.Log(X.shape + " * (" + S.channels + ") + (" + B.channels + ")"); + var O = m_Ops.ScaleBias(X, S, B); + O.PrintDataPart(32, Prefix + "ScaleBias"); + return O; + } + Tensor IOps.Normalization(Tensor X, Tensor S, Tensor B, int pool, int axis, float epsilon) + { + D.Log(X.shape + " ! " + (pool==1 ? 
"instance": "batch") + " axis=" + axis); + var O = m_Ops.Normalization(X, S, B, pool, axis, epsilon); + O.PrintDataPart(32, Prefix + "Normalization"); + return O; + } + Tensor IOps.LRN(Tensor X, float alpha, float beta, float bias, int size) + { + D.Log(X.shape + " LRN n=" + size + " a=" + alpha + " b=" + beta + " bias=" + bias); + var O = m_Ops.LRN(X, alpha, beta, bias, size); + O.PrintDataPart(32, Prefix + "LRN"); + return O; + } + Tensor IOps.Dropout(Tensor X, float alpha) + { + D.Log(X.shape + " a=" + alpha); + var O = m_Ops.Dropout(X, alpha); + O.PrintDataPart(32, Prefix + "Dropout"); + return O; + } + Tensor IOps.RandomNormal(TensorShape s, float mean, float scale, int seed) + { + D.Log(s + " N m=" + mean + " s=" + scale + " s=" + seed); + var O = m_Ops.RandomNormal(s, mean, scale, seed); + O.PrintDataPart(32, Prefix + "RandomNormal"); + return O; + } + Tensor IOps.RandomUniform(TensorShape s, float mean, float scale, int seed) + { + D.Log(s + " U m=" + mean + " s=" + scale + " s=" + seed); + var O = m_Ops.RandomUniform(s, mean, scale, seed); + O.PrintDataPart(32, Prefix + "RandomUniform"); + return O; + } + Tensor IOps.Multinomial(Tensor X, int count, int seed) + { + D.Log(X.shape + " M n=" + count + " s=" + seed); + var O = m_Ops.Multinomial(X, count, seed); + O.PrintDataPart(32, Prefix + "Multinomial"); + return O; + } + Tensor IOps.OneHot(Tensor X, int depth, float onValue, float offValue) + { + Debug.Log(X.shape + " Ω n=" + depth + " 1=" + onValue + " 0=" + offValue); + var O = m_Ops.OneHot(X, depth, onValue, offValue); + O.PrintDataPart(32, Prefix + "OneHot"); + return O; + } + + Tensor IOps.Relu(Tensor X) + { + D.Log(X.shape + " ()"); + var O = m_Ops.Relu(X); + O.PrintDataPart(32, Prefix + "Relu"); + return O; + } + Tensor IOps.Softmax(Tensor X) + { + D.Log(X.shape + " ()"); + var O = m_Ops.Softmax(X); + O.PrintDataPart(32, Prefix + "Softmax"); + return O; + } + Tensor IOps.LogSoftmax(Tensor X) + { + D.Log(X.shape + " ()"); + var O = m_Ops.LogSoftmax(X); + O.PrintDataPart(32, Prefix + "LogSoftmax"); + return O; + } + Tensor IOps.Tanh(Tensor X) + { + D.Log(X.shape + " ()"); + var O = m_Ops.Tanh(X); + O.PrintDataPart(32, Prefix + "Tanh"); + return O; + } + Tensor IOps.Sigmoid(Tensor X) + { + D.Log(X.shape + " ()"); + var O = m_Ops.Sigmoid(X); + O.PrintDataPart(32, Prefix + "Sigmoid"); + return O; + } + Tensor IOps.Relu6(Tensor X) + { + D.Log(X.shape + " ()"); + var O = m_Ops.Relu6(X); + O.PrintDataPart(32, Prefix + "Relu6"); + return O; + } + Tensor IOps.Elu(Tensor X, float alpha) + { + D.Log(X.shape + " () a=" + alpha); + var O = m_Ops.Elu(X, alpha); + O.PrintDataPart(32, Prefix + "Elu"); + return O; + } + Tensor IOps.LeakyRelu(Tensor X, float alpha) + { + D.Log(X.shape + " () a=" + alpha); + var O = m_Ops.LeakyRelu(X, alpha); + O.PrintDataPart(32, Prefix + "LeakyRelu"); + return O; + } + Tensor IOps.Selu(Tensor X, float alpha, float gamma) + { + D.Log(X.shape + " () a=" + alpha + " g=" + gamma); + var O = m_Ops.Selu(X, alpha, gamma); + O.PrintDataPart(32, Prefix + "Selu"); + return O; + } + Tensor IOps.PRelu(Tensor X, Tensor S) + { + D.Log(X.shape + " * (" + S.channels + ")"); + var O = m_Ops.PRelu(X, S); + O.PrintDataPart(32, Prefix + "PRelu"); + return O; + } + Tensor IOps.Swish(Tensor X) + { + D.Log(X.shape + " ()"); + var O = m_Ops.Swish(X); + O.PrintDataPart(32, Prefix + "Swish"); + return O; + } + Tensor IOps.Abs(Tensor X) + { + D.Log(X.shape + " ()"); + var O = m_Ops.Abs(X); + O.PrintDataPart(32, Prefix + "Abs"); + return O; + } + Tensor IOps.Neg(Tensor X) + { + 
D.Log(X.shape + " ()"); + var O = m_Ops.Neg(X); + O.PrintDataPart(32, Prefix + "Neg"); + return O; + } + Tensor IOps.Ceil(Tensor X) + { + D.Log(X.shape + " ()"); + var O = m_Ops.Ceil(X); + O.PrintDataPart(32, Prefix + "Ceil"); + return O; + } + Tensor IOps.Clip(Tensor X, float min, float max) + { + D.Log(X.shape + " () min=" + min + " max=" + max); + var O = m_Ops.Clip(X, min, max); + O.PrintDataPart(32, Prefix + "Clip"); + return O; + } + Tensor IOps.Floor(Tensor X) + { + D.Log(X.shape + " ()"); + var O = m_Ops.Floor(X); + O.PrintDataPart(32, Prefix + "Floor"); + return O; + } + + Tensor IOps.Reciprocal(Tensor X) + { + D.Log(X.shape + " ()"); + var O = m_Ops.Reciprocal(X); + O.PrintDataPart(32, Prefix + "Reciprocal"); + return O; + } + Tensor IOps.Pow(Tensor X, float alpha) + { + D.Log(X.shape + " () a=" + alpha); + var O = m_Ops.Pow(X, alpha); + O.PrintDataPart(32, Prefix + "Pow"); + return O; + } + Tensor IOps.Exp(Tensor X) + { + D.Log(X.shape + " ()"); + var O = m_Ops.Exp(X); + O.PrintDataPart(32, Prefix + "Exp"); + return O; + } + Tensor IOps.Log(Tensor X) + { + D.Log(X.shape + " ()"); + var O = m_Ops.Log(X); + O.PrintDataPart(32, Prefix + "Log"); + return O; + } + Tensor IOps.Sqrt(Tensor X) + { + D.Log(X.shape + " ()"); + var O = m_Ops.Sqrt(X); + O.PrintDataPart(32, Prefix + "Sqrt"); + return O; + } + + Tensor IOps.Add(Tensor[] tensors) + { + var O = m_Ops.Add(tensors); + D.Log("{" + tensors.Length + "} + " + O.shape); // @TODO: print input dimensions + O.PrintDataPart(32, Prefix + "Add"); + return O; + } + Tensor IOps.Sub(Tensor[] tensors) + { + var O = m_Ops.Sub(tensors); + D.Log("{" + tensors.Length + "} - " + O.shape); // @TODO: print input dimensions + O.PrintDataPart(32, Prefix + "Sub"); + return O; + } + Tensor IOps.Mul(Tensor[] tensors) + { + var O = m_Ops.Mul(tensors); + D.Log("{" + tensors.Length + "} * " + O.shape); // @TODO: print input dimensions + O.PrintDataPart(32, Prefix + "Mul"); + return O; + } + Tensor IOps.Div(Tensor[] tensors) + { + var O = m_Ops.Div(tensors); + D.Log("{" + tensors.Length + "} / " + O.shape); // @TODO: print input dimensions + O.PrintDataPart(32, Prefix + "Div"); + return O; + } + Tensor IOps.Pow(Tensor[] tensors) + { + var O = m_Ops.Pow(tensors); + D.Log("{" + tensors.Length + "} ^ " + O.shape); // @TODO: print input dimensions + O.PrintDataPart(32, Prefix + "Pow"); + return O; + } + Tensor IOps.Min(Tensor[] tensors) + { + var O = m_Ops.Min(tensors); + D.Log("{" + tensors.Length + "} < " + O.shape); // @TODO: print input dimensions + O.PrintDataPart(32, Prefix + "Min"); + return O; + } + Tensor IOps.Max(Tensor[] tensors) + { + var O = m_Ops.Max(tensors); + D.Log("{" + tensors.Length + "} > " + O.shape); // @TODO: print input dimensions + O.PrintDataPart(32, Prefix + "Max"); + return O; + } + Tensor IOps.Mean(Tensor[] tensors) + { + var O = m_Ops.Mean(tensors); + D.Log("{" + tensors.Length + "} ∑ " + O.shape); // @TODO: print input dimensions + O.PrintDataPart(32, Prefix + "Mean"); + return O; + } + + Tensor IOps.ReduceMax(Tensor X, int axis) + { + var O = m_Ops.ReduceMax(X, axis); + D.Log(X.shape + " .> " + O.shape); + O.PrintDataPart(32, Prefix + "ReduceMax"); + return O; + } + Tensor IOps.ReduceMean(Tensor X, int axis) + { + var O = m_Ops.ReduceMean(X, axis); + D.Log(X.shape + " .∑ " + O.shape); + O.PrintDataPart(32, Prefix + "ReduceMean"); + return O; + } + Tensor IOps.ReduceMin(Tensor X, int axis) + { + var O = m_Ops.ReduceMin(X, axis); + D.Log(X.shape + " .< " + O.shape); + O.PrintDataPart(32, Prefix + "ReduceMin"); + return O; + } + 
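+ // Editorial note: VerboseOps, like StatsOps, is a decorator over another IOps backend: every
+ // call logs the input/output shapes plus a short data preview via PrintDataPart(), then
+ // forwards to the wrapped m_Ops. A hedged usage sketch follows (it assumes the reference CPU
+ // backend is named ReferenceCPUOps and is constructible without arguments; x is a placeholder):
+ //   var stats = new StatsOps(new ReferenceCPUOps());
+ //   IOps ops = new VerboseOps(stats);
+ //   var y = ops.ReduceSum(x, axis: 1);   // logs "<X.shape> .+ <O.shape>" and a 32-value preview
+ //   D.Log(stats.ToString());             // e.g. "ALU operations: 12.0M bytes accessed: 3.4Mb"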
Tensor IOps.ReduceProd(Tensor X, int axis) + { + var O = m_Ops.ReduceProd(X, axis); + D.Log(X.shape + " .* " + O.shape); + O.PrintDataPart(32, Prefix + "ReduceProd"); + return O; + } + Tensor IOps.ReduceSum(Tensor X, int axis) + { + var O = m_Ops.ReduceSum(X, axis); + D.Log(X.shape + " .+ " + O.shape); + O.PrintDataPart(32, Prefix + "ReduceSum"); + return O; + } + Tensor IOps.Greater(Tensor a, Tensor b) + { + var O = m_Ops.Greater(a, b); + D.Log(a.shape + " > " + b.shape + " = " + O.shape); + O.PrintDataPart(32, Prefix + "Greater"); + return O; + } + Tensor IOps.GreaterEqual(Tensor a, Tensor b) + { + var O = m_Ops.GreaterEqual(a, b); + D.Log(a.shape + " >= " + b.shape + " = " + O.shape); + O.PrintDataPart(32, Prefix + "GreaterEqual"); + return O; + } + Tensor IOps.Less(Tensor a, Tensor b) + { + var O = m_Ops.Less(a, b); + D.Log(a.shape + " < " + b.shape + " = " + O.shape); + O.PrintDataPart(32, Prefix + "Less"); + return O; + } + Tensor IOps.LessEqual(Tensor a, Tensor b) + { + var O = m_Ops.LessEqual(a, b); + D.Log(a.shape + " <= " + b.shape + " = " + O.shape); + O.PrintDataPart(32, Prefix + "LessEqual"); + return O; + } + Tensor IOps.Equal(Tensor a, Tensor b) + { + var O = m_Ops.Equal(a, b); + D.Log(a.shape + " == " + b.shape + " = " + O.shape); + O.PrintDataPart(32, Prefix + "Equal"); + return O; + } + Tensor IOps.LogicalOr(Tensor a, Tensor b) + { + var O = m_Ops.LogicalOr(a, b); + D.Log(a.shape + " || " + b.shape + " = " + O.shape); + O.PrintDataPart(32, Prefix + "LogicalOr"); + return O; + } + Tensor IOps.LogicalAnd(Tensor a, Tensor b) + { + var O = m_Ops.LogicalAnd(a, b); + D.Log(a.shape + " && " + b.shape + " = " + O.shape); + O.PrintDataPart(32, Prefix + "LogicalAnd"); + return O; + } + Tensor IOps.LogicalXor(Tensor a, Tensor b) + { + var O = m_Ops.LogicalXor(a, b); + D.Log(a.shape + " ^ " + b.shape + " = " + O.shape); + O.PrintDataPart(32, Prefix + "LogicalXor"); + return O; + } + Tensor IOps.LogicalNot(Tensor x) + { + var O = m_Ops.LogicalNot(x); + D.Log("!(" + x.shape +" )"); + O.PrintDataPart(32, Prefix + "LogicalNot"); + return O; + } + + Tensor IOps.Flatten(Tensor X) + { + var O = m_Ops.Flatten(X); + D.Log(X.shape + " = " + O.shape); + return O; + } + Tensor IOps.Reshape(Tensor X, TensorShape shape) + { + var O = m_Ops.Reshape(X, shape); + D.Log(X.shape + " $ " + O.shape); + return O; + } + Tensor IOps.Transpose(Tensor X) + { + var O = m_Ops.Transpose(X); + D.Log(X.shape + " T " + O.shape); + return O; + } + + Tensor IOps.Concat(Tensor[] tensors, int axis) + { + var O = m_Ops.Concat(tensors, axis); + D.Log("{" + tensors.Length + "} # " + O.shape); // @TODO: print input dimensions + O.PrintDataPart(32, Prefix + "Concat"); + return O; + } + Tensor IOps.StridedSlice(Tensor X, int[] starts, int[] ends, int[] strides) + { + var O = m_Ops.StridedSlice(X, starts, ends, strides); + D.Log(X.shape + " | " + O.shape); + O.PrintDataPart(32, Prefix + "StridedSlice"); + return O; + } + Tensor IOps.Tile(Tensor X, int[] repeats) + { + var O = m_Ops.Tile(X, repeats); + D.Log(X.shape + " % " + O.shape); + O.PrintDataPart(32, Prefix + "Tile"); + return O; + } + + Tensor IOps.Prepare(Tensor X) + { + D.Log("!" 
+ X.shape); + return m_Ops.Prepare(X); + } + + void IOps.ResetAllocator(bool keepCachedMemory) + { + m_Ops.ResetAllocator(keepCachedMemory); + } +} + + +} // namespace Barracuda diff --git a/Assets/Coach-ML/Barracuda/Core/Backends/VerboseOps.cs.meta b/Assets/Coach-ML/Barracuda/Core/Backends/VerboseOps.cs.meta new file mode 100644 index 0000000..ac1bd3a --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Backends/VerboseOps.cs.meta @@ -0,0 +1,12 @@ +fileFormatVersion: 2 +guid: 652e588fca30240cf89d82db18ad71a8 +timeCreated: 1506427659 +licenseType: Pro +MonoImporter: + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Core/BackwardsCompatibility.cs b/Assets/Coach-ML/Barracuda/Core/BackwardsCompatibility.cs new file mode 100644 index 0000000..e4cd952 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/BackwardsCompatibility.cs @@ -0,0 +1,162 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using UnityEngine; +using UnityEngine.Assertions; + +namespace Barracuda { + +// @TODO: deprecate, left here only for backwards compatibility +public static class WorkerExtensions +{ + #region Inputs + /// + /// Specify single tensor value as the input for the network. + /// Useful when network has only one input and caller does not need to know input's name. + /// + public static void AddInput(this IWorker worker, Tensor x) + { + worker.SetInput(x); + } + /// + /// Specify tensor value for the named input of the network. + /// + public static void AddInput(this IWorker worker, string name, Tensor x) + { + worker.SetInput(name, x); + } + #endregion + + #region Outputs + /// + /// Returns a reference to tensor from the last layer of the network + /// Useful when network has only one output. + /// IMPORTANT: follow with TakeOwnership() call, if you want tensor to outlive worker or make tensor copy with DeepCopy() + /// see also WorkerExtensions.FetchAndTakeOwnership() + /// + public static Tensor Peek(this IWorker worker) + { + return worker.PeekOutput(); + } + /// + /// Returns a reference to tensor by name. + /// IMPORTANT: follow with TakeOwnership() call, if you want tensor to outlive worker or make tensor copy with DeepCopy() + /// see also WorkerExtensions.FetchAndTakeOwnership() + /// + public static Tensor Peek(this IWorker worker, string name) + { + return worker.PeekOutput(name); + } + #endregion + + + #region Blocking APIs + /// + /// Schedules network execution in one go and waits for result to be available. + /// Useful when network has only one input and caller does not need to know input's name. + /// + public static Tensor ExecuteAndWaitForCompletion(this IWorker worker, Tensor input) + { + worker.Execute(input); + return worker.Fetch(); + } + /// + /// Schedules network execution in one go and waits for result to be available. + /// + public static Tensor ExecuteAndWaitForCompletion(this IWorker worker, IDictionary inputs) + { + worker.Execute(inputs); + return worker.Fetch(); + } + #endregion + + #region Non-blocking APIs + /// + /// Returns first output tensor and takes ownership of memory to outlive worker. + /// Useful when network has only one output. + /// + public static Tensor FetchAndTakeOwnership(this IWorker worker) + { + var output = worker.Peek(); + output.TakeOwnership(); + return output; + + } + /// + /// Returns output tensor by name and takes ownership of memory to outlive worker. 
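+ /// A hedged usage sketch ("output" and the inputs dictionary are placeholders, not names from this patch):
+ ///   worker.Execute(inputs);
+ ///   var result = worker.FetchAndTakeOwnership("output"); // result stays valid after the worker is disposed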
+ /// + public static Tensor FetchAndTakeOwnership(this IWorker worker, string name) + { + var output = worker.Peek(name); + output.TakeOwnership(); + return output; + } + #endregion + + // @TODO: rename these APIs, Fetch() name kept for backwards compatibility + #region Backward compatiblity + /// + /// DEPRECATED: Use FetchAndTakeOwnership() instead. + /// This method is a blocking call while FetchAndTakeOwnership() is not. + /// + public static Tensor Fetch(this IWorker worker) + { + var output = worker.Peek(); + output.Unpin(); // unpin will readback to CPU and + // give allocator a chance to reuse allocated buffer + output.TakeOwnership(); + return output; + } + /// + /// DEPRECATED: Use FetchAndTakeOwnership() instead. + /// This method is a blocking call while FetchAndTakeOwnership() is not. + /// + public static Tensor Fetch(this IWorker worker, string name) + { + var output = worker.Peek(name); + output.Unpin(); // unpin will readback to CPU and + // give allocator a chance to reuse allocated buffer + output.TakeOwnership(); + return output; + } + #endregion +} + +// @TODO: deprecate, left here only for backwards compatibility +public class BarracudaWorkerFactory : WorkerFactory +{ + public enum Flags + { + Compute = Device.GPU, + CSharp = Device.CPU + } + + public static bool IsType(Type type, Flags flags) + { + return IsType(type, (Device)flags); + } +} + +// @TODO: make internal or remove completely. Left here for backwards compatibility. +public class BarracudaTextureUtils +{ + public static void TensorToRenderTexture(Tensor x, RenderTexture target, + int batch = 0, int fromChannel = 0, float scale = 1.0f, float bias = 0f) + { + new ReferenceComputeOps(ComputeShaderSingleton.Instance.referenceKernels).TensorToRenderTexture(x, target, batch, fromChannel, scale, bias); + } + + /// + /// Create a RenderTexture from a slice/batch of a tensor. 
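// Illustrative usage sketch (example only, not part of the patch): converting one batch slice
// of a network output into a RenderTexture with the helper above. `scale` and `bias` map the
// tensor values into a displayable [0,1] range; the defaults are spelled out for clarity.
using UnityEngine;
using Barracuda;

public static class TensorToTextureExample
{
    public static RenderTexture ToTexture(Tensor output)
    {
        // Allocates an output.width x output.height RenderTexture internally.
        return BarracudaTextureUtils.TensorToRenderTexture(output, batch: 0, fromChannel: 0, scale: 1.0f, bias: 0f);
    }
}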
+ /// + public static RenderTexture TensorToRenderTexture(Tensor x, + int batch = 0, int fromChannel = 0, float scale = 1.0f, float bias = 0f) + { + var target = new RenderTexture(x.width, x.height, 0); + TensorToRenderTexture(x, target, batch, fromChannel, scale, bias); + return target; + } +} + + +} // namespace Barracuda diff --git a/Assets/Coach-ML/Barracuda/Core/BackwardsCompatibility.cs.meta b/Assets/Coach-ML/Barracuda/Core/BackwardsCompatibility.cs.meta new file mode 100644 index 0000000..8b20162 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/BackwardsCompatibility.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: d8be23f67617e4158b42ccaa1fc437ea +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Core/Barracuda.asmdef b/Assets/Coach-ML/Barracuda/Core/Barracuda.asmdef new file mode 100644 index 0000000..4f20923 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Barracuda.asmdef @@ -0,0 +1,8 @@ +{ + "name": "Barracuda", + "references": [], + "optionalUnityReferences": [], + "includePlatforms": [], + "excludePlatforms": [], + "allowUnsafeCode": true +} \ No newline at end of file diff --git a/Assets/Coach-ML/Barracuda/Core/Barracuda.asmdef.meta b/Assets/Coach-ML/Barracuda/Core/Barracuda.asmdef.meta new file mode 100644 index 0000000..8d4fa06 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Barracuda.asmdef.meta @@ -0,0 +1,7 @@ +fileFormatVersion: 2 +guid: 5c2b5ba89f9e74e418232e154bc5cc7a +AssemblyDefinitionImporter: + externalObjects: {} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Core/Barracuda.cs b/Assets/Coach-ML/Barracuda/Core/Barracuda.cs new file mode 100644 index 0000000..90d7a9d --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Barracuda.cs @@ -0,0 +1,430 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using UnityEngine.Assertions; + +namespace Barracuda { + +/// +/// Public interface for Workers. A worker is able to schedule models execution for a given backend. +/// Use `WorkerFactory` to instantiate a worker. +/// +public interface IWorker : IDisposable +{ + #region Inputs + /// + /// Optional method to prepare network for particular input dimensions + /// + void PrepareForInput(IDictionary inputShapes); + /// + /// Specify single tensor value as the input for the network. + /// Useful when network has only one input and caller does not need to know input's name. + /// + void SetInput(Tensor x); + /// + /// Specify tensor value for the named input of the network. + /// + void SetInput(string name, Tensor x); + #endregion + + #region Schedule whole network + /// + /// Non-blocking API that schedules network execution in one go + /// Remark: This API will only be non-blocking for GPU inference. + /// + void Execute(); + /// + /// Non-blocking API that schedules network execution in one go, using the provider tensor as input. + /// Remark: This API will only be non-blocking for GPU inference. + /// Useful when network have only one input as input name is not needed. + /// + void Execute(Tensor input); + /// + /// Non-blocking API that schedules network execution in one go, using the provider tensor dictionary for inputs. + /// Remark: This API will only be non-blocking for GPU inference. 
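// Illustrative usage sketch (example only, not part of the patch): feeding a multi-input
// network by name. Generic parameters are elided in the flattened diff text above;
// IDictionary<string, Tensor> is assumed. The input names "image" and "mask" are placeholders.
using System.Collections.Generic;
using Barracuda;

public static class MultiInputExample
{
    public static void Run(IWorker worker, Tensor image, Tensor mask)
    {
        var inputs = new Dictionary<string, Tensor>
        {
            { "image", image },   // keys must match the names declared in Model.inputs
            { "mask",  mask  }
        };
        worker.Execute(inputs);             // non-blocking on GPU backends
        var result = worker.PeekOutput();   // reference only; take ownership or copy to keep it
        D.Log(result.shape);
    }
}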
+ /// + void Execute(IDictionary inputs); + #endregion + + #region Schedule one layer at a time + /// + /// Non-blocking API that schedules network execution one layer at the time. + /// Remark: This API will only be non-blocking for GPU inference. + /// Check GetAsyncProgress() for progress. + /// + IEnumerator ExecuteAsync(); + /// + /// Non-blocking API that schedules network execution one layer at the time, using the provider tensor as input. + /// Remark: This API will only be non-blocking for GPU inference. + /// Useful when network have only one input as input name is not needed. + /// Check GetAsyncProgress() for progress. + /// + IEnumerator ExecuteAsync(Tensor input); + /// + /// Non-blocking API that schedules network execution one layer at the time, using the provider tensor dictionary for inputs. + /// Remark: This API will only be non-blocking for GPU inference. + /// Check GetAsyncProgress() for progress. + /// + IEnumerator ExecuteAsync(IDictionary inputs); + /// + /// Wait for completion of part of the network that was scheduled via `ExecuteAsync()` + /// + void WaitForCompletion(); + /// + /// Progress of the scheduling, 0.0 = 0%, 1.0 = 100% + /// + float GetAsyncProgress(); + #endregion + + #region Outputs + /// + /// Returns a reference to tensor from the last layer of the network + /// Useful when network has only one output. + /// IMPORTANT: follow with TakeOwnership() call, if you want tensor to outlive worker or make tensor copy with DeepCopy() + /// see also WorkerExtensions.FetchAndTakeOwnership() + /// + Tensor PeekOutput(); + /// + /// Returns a reference to tensor by name. + /// IMPORTANT: follow with TakeOwnership() call, if you want tensor to outlive worker or make tensor copy with DeepCopy() + /// see also WorkerExtensions.FetchAndTakeOwnership() + /// + Tensor PeekOutput(string name); + #endregion + + /// + /// Returns a string summary after execution. + /// + string Summary(); +} + +/// +/// Interface for device dependent representation of Tensor data. +/// +public interface ITensorData : IDisposable +{ + /// + /// Reserve uninitialized memory. + /// + void Reserve(int count); + /// + /// Initialize with `data`. + /// `offset` is the offset where to start the copy in the `data` + /// `count` is the number of element to copy. If count is -1 (default) number of element will be (data.length - offset). + /// + void Upload(float[] data, int offset = 0, int count = -1); + /// + /// Schedule an asynchronous download from device memory. + /// `count` is the number of element to readback. + /// return `true` if the request was successfully schedule. + /// + bool ScheduleAsyncDownload(int count); + /// + /// Return a copy of the data. This is a blocking call. + /// `count` is the number of element to readback. + /// Prefer a call to ScheduleAsyncDownload() before. + /// + float[] Download(int count); + /// + /// Return a copy of the full shared tensorData, + /// and an offset where this tensorData data is starting. + /// Prefer a call to ScheduleAsyncDownload() before. + /// + float[] SharedAccess(out int offset); + /// + /// Return the maximum number of element this tensorData can contain. + /// + int GetMaxCount(); +} + +/// +/// Object that represent memory (recurrent state) between the executions of a given model. +/// +public class RecurrentState : IDisposable +{ + private int m_BatchSize = 1; + private Model m_Model; + private Tensor[] m_Memories; + + /// + /// Constructs recurrent state for a specific model + /// `model` is the associated model. 
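// Illustrative usage sketch (example only, not part of the patch): the intended call order for
// RecurrentState around one execution, matching the BeforeExecution/AfterExecution callbacks
// defined below. `worker`, `state` and `frameInput` are assumed to exist already.
using Barracuda;

public static class RecurrentExample
{
    public static Tensor Step(IWorker worker, RecurrentState state, Tensor frameInput)
    {
        state.BeforeExecution(worker);   // bind stored memories to their input names
        worker.SetInput(frameInput);     // regular (non-memory) input
        worker.Execute();
        state.AfterExecution(worker);    // capture updated memories for the next call
        return worker.PeekOutput();
    }
}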
+ /// `batchSize` has to match the batch dimension of the input tensor(s). + /// `grabFromInputs` optional dictionary of named tensors that can be used as a memory. If name of the tensor matches the memory, tensor will be removed from the dictionary and used as memory. + /// + public RecurrentState(Model model, int batchSize = 1, Dictionary grabFromInputs = null) + { + m_BatchSize = batchSize; + m_Model = model; + m_Memories = new Tensor[m_Model.memories.Count]; + + var index = 0; + foreach (var memory in m_Model.memories) + { + if (grabFromInputs != null && grabFromInputs.ContainsKey(memory.input)) + { + m_Memories[index++] = grabFromInputs[memory.input]; + grabFromInputs.Remove(memory.input); + } + else + { + Assert.AreEqual(memory.shape.batch, 1); + var shape = new TensorShape(memory.shape.batch * batchSize, memory.shape.height, memory.shape.width, memory.shape.channels); + m_Memories[index++] = new Tensor(shape); + } + } + } + + ~RecurrentState() + { + Dispose(); + } + + public virtual void Dispose() + { + if (m_Memories == null) + return; + + foreach (var x in m_Memories) + x.Dispose(); + + m_Memories = null; + } + + /// + /// Returns batch dimension used for the memories. + /// + public int GetBatchSize() + { + return m_BatchSize; + } + + /// + /// Internal callback called before the execution of the model. + /// This callback prepares model for the next iteration according to the memory. + /// + public void BeforeExecution(IWorker worker) + { + Assert.AreEqual(m_Model.memories.Count, m_Memories.Length); + + var index = 0; + foreach (var memory in m_Model.memories) + worker.SetInput(memory.input, m_Memories[index++]); + } + + /// + /// Internal callback called after execution of the model finished. + /// This callback stores results of the current iteration in the memory. + /// + public void AfterExecution(IWorker worker) + { + Assert.AreEqual(m_Model.memories.Count, m_Memories.Length); + + var index = 0; + foreach (var memory in m_Model.memories) + { + // @TODO: consider using PeekOutput()+DeepCopy() instead of Unpin() that happens inside Fetch() + var newTensor = worker.Fetch(memory.output); + Assert.IsTrue(newTensor.tensorOnDevice != m_Memories[index]); + m_Memories[index].Dispose(); + m_Memories[index] = newTensor; + index++; + } + } +} + +/// +/// Factory to create worker that executes specified model on a particular device (GPU, CPU, etc) using particular backend. +/// See `IWorker` for usage of the worker itself. +/// +public class WorkerFactory +{ + /// + /// Supported device type + /// + public enum Device + { + GPU = 1 << 8, + CPU = 1 << 9, + Auto = 1 << 15, + + // aliases + Compute = GPU, + CSharp = CPU, + } + + /// + /// Backend type + /// + public enum Type + { + Auto = 0 | Device.Auto, + + ComputePrecompiled = 0 | Device.GPU, + Compute = 1 | Device.GPU, + ComputeRef = 2 | Device.GPU, + + CSharp = 0 | Device.CPU, + CSharpRef = 1 | Device.CPU + } + + /// + /// Create a worker with explicitly specified backend `type` to execute the given `model`. + /// `type` is backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path. + /// `model` is the associated model. See ModelLoader.cs. + /// `additionalOutputs` are the additional outputs to track but not directly specified by the model. + /// `trimOutputs` are the outputs not discard even if they are specified by the model. + /// `verbose` will log scheduling of layers execution to the console. 
+ /// `compareAgainstType` if different than `type` model will be run on those two backend and the result of every layer will be compared, checking for divergence. Great for debugging, but very slow because of the sync needed. + /// + public static IWorker CreateWorker(Type type, Model model, string[] additionalOutputs, string[] trimOutputs, bool verbose, Type compareAgainstType) + { + return BarracudaBackendsFactory.CreateWorker(type, model, additionalOutputs, trimOutputs, verbose, compareAgainstType); + } + + /// + /// Create a worker with explicitly specified backend `type` to execute the given `model`. + /// `type` is backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path. + /// `model` is the associated model. See ModelLoader.cs. + /// `verbose` will log scheduling of layers execution to the console. + /// `compareAgainstType` if different than `type` model will be run on those two backend and the result of every layer will be compared, checking for divergence. Great for debugging, but very slow because of the sync needed. + /// + public static IWorker CreateWorker(Type type, Model model, bool verbose, Type compareAgainstType) + { + return CreateWorker(type, model, null, null, verbose, compareAgainstType); + } + + /// + /// Create a worker with explicitly specified backend `type` to execute the given `model`. + /// `type` is backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path. + /// `model` is the associated model. See ModelLoader.cs. + /// `verbose` will log scheduling of layers execution to the console. + /// + public static IWorker CreateWorker(Type type, Model model, bool verbose) + { + return CreateWorker(type, model, null, null, verbose, type); + } + + /// + /// Create a worker with explicitly specified backend `type` to execute the given `model`. + /// `type` is backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path. + /// `model` is the associated model. See ModelLoader.cs. + /// `additionalOutputs` are the additional outputs to track but not directly specified by the model. + /// `verbose` will log scheduling of layers execution to the console (default == false) + /// + public static IWorker CreateWorker(Type type, Model model, string[] additionalOutputs, bool verbose = false) + { + return CreateWorker(type, model, additionalOutputs, null, verbose, type); + } + + /// + /// Create a worker with explicitly specified backend `type` to execute the given `model`. + /// `type` is backend type to use. For example `WorkerFactory.Type.Compute` specifies the fast GPU path. + /// `model` is the associated model. See ModelLoader.cs. + /// `additionalOutputs` are the additional outputs to track but not directly specified by the model. + /// `trimOutputs` are the outputs not discard even if they are specified by the model. + /// `verbose` will log scheduling of layers execution to the console (default == false) + /// + public static IWorker CreateWorker(Type type, Model model, string[] additionalOutputs = null, string[] trimOutputs = null, bool verbose = false) + { + return CreateWorker(type, model, additionalOutputs, trimOutputs, verbose, type); + } + + /// + /// Create a worker that will execute `model` using the best backend that is available for a given `device` type. + /// `model` is the associated model. See ModelLoader.cs. + /// `device` is the preferred device for execution. For example `WorkerFactory.Device.GPU` specifies the fast GPU path. 
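// Illustrative usage sketch (example only, not part of the patch): letting the factory pick the
// best backend for the current platform. ModelLoader.Load(...) is assumed from ModelLoader.cs,
// which is added elsewhere in this patch; `modelAsset` would be assigned in the inspector.
using UnityEngine;
using Barracuda;

public class AutoDeviceExample : MonoBehaviour
{
    public NNModel modelAsset;
    private IWorker m_Worker;

    void Start()
    {
        Model model = ModelLoader.Load(modelAsset);
        // Device.Auto resolves to the fastest backend available (GPU compute when supported).
        m_Worker = WorkerFactory.CreateWorker(model, WorkerFactory.Device.Auto, verbose: false);
    }

    void OnDestroy()
    {
        m_Worker?.Dispose();
    }
}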
+ /// `verbose` will log scheduling of layers execution to the console. + /// + public static IWorker CreateWorker(Model model, Device device = Device.Auto, bool verbose = true) + { + var type = GetBestTypeForDevice(device); + return CreateWorker(type, model, null, null, verbose, type); + } + + /// + /// Create a worker that will execute `model` using the best backend that is available for a given `device` type. + /// `model` is the associated model. See ModelLoader.cs. + /// `additionalOutputs` are the additional outputs to track but not directly specified by the model. + /// `device` is the device type to run worker on. For example `WorkerFactory.Device.GPU` specifies the fast GPU path. + /// `verbose` will log scheduling of layers execution to the console (default == false) + /// + public static IWorker CreateWorker(Model model, string[] additionalOutputs, Device device = Device.Auto, bool verbose = false) + { + var type = GetBestTypeForDevice(device); + return CreateWorker(type, model, additionalOutputs, null, verbose, type); + } + + /// + /// Create a worker that will execute `model` using the best backend that is available for a given `device` type. + /// `model` is the associated model. See ModelLoader.cs. + /// `additionalOutputs` are the additional outputs to track but not directly specified by the model. + /// `trimOutputs` are the outputs not discard even if they are specified by the model. + /// `device` is the device type to run worker on. For example `WorkerFactory.Device.GPU` specifies the fast GPU path. + /// `verbose` will log scheduling of layers execution to the console (default == false) + /// + public static IWorker CreateWorker(Model model, string[] additionalOutputs = null, string[] trimOutputs = null, Device device = Device.Auto, bool verbose = false) + { + var type = GetBestTypeForDevice(device); + return CreateWorker(type, model, additionalOutputs, trimOutputs, verbose, type); + } + + /// + /// Create a worker using the reference CPU backend for the given `model`. + /// `model` is the associated model. See ModelLoader.cs. + /// `verbose` will log scheduling of layers execution to the console (default == false) + /// + public static IWorker CreateReferenceCPUWorker(Model model, bool verbose = false) + { + return CreateWorker(Type.CSharpRef, model, verbose); + } + + /// + /// Create a worker using the reference GPU backend for the given `model`. + /// `model` is the associated model. See ModelLoader.cs. + /// `verbose` will log scheduling of layers execution to the console (default == false) + /// + public static IWorker CreateReferenceComputeWorker(Model model, bool verbose = false) + { + return CreateWorker(Type.ComputeRef, model, verbose); + } + + /// + /// Create a worker using the precompiled GPU backend for the given `model`. + /// `model` is the associated model. See ModelLoader.cs. + /// `verbose` will log scheduling of layers execution to the console (default == false) + /// + public static IWorker CreateComputeWorker(Model model, bool verbose = false) + { + return CreateWorker(Type.ComputePrecompiled, model, verbose); + } + + /// + /// Check if a backend is of a given type. + /// For example: IsType(Type.CSharpRef, Device.GPU) == true + /// + public static bool IsType(Type type, Device device) + { + type = BarracudaBackendsFactory.ResolveAutoType(type); + Assert.AreNotEqual(type, Type.Auto); + return ((int)type & (int)device) == (int)device; + } + + /// + /// Returns the best backend type that can run on a `device` given the `model`. 
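// Illustrative usage sketch (example only, not part of the patch): resolving and validating a
// backend type explicitly instead of relying on Device.Auto.
using Barracuda;

public static class BackendSelectionExample
{
    public static WorkerFactory.Type PickGpuBackendOrFallback()
    {
        var type = WorkerFactory.GetBestTypeForDevice(WorkerFactory.Device.GPU);
        type = WorkerFactory.ValidateType(type);   // falls back if the preferred backend is unsupported
        D.Log("Backend: " + type + ", GPU: " + WorkerFactory.IsType(type, WorkerFactory.Device.GPU));
        return type;
    }
}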
+ /// + public static Type GetBestTypeForDevice(Device device) + { + return BarracudaBackendsFactory.GetBestTypeForDevice(device); + } + + /// + /// Validate if a backend of `type` is supported, otherwise return a fallback type. + /// + public static Type ValidateType(Type type) + { + return BarracudaBackendsFactory.ValidateType(type); + } +} + +} // namespace Barracuda diff --git a/Assets/Coach-ML/Barracuda/Core/Barracuda.cs.meta b/Assets/Coach-ML/Barracuda/Core/Barracuda.cs.meta new file mode 100644 index 0000000..da15bc9 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Barracuda.cs.meta @@ -0,0 +1,12 @@ +fileFormatVersion: 2 +guid: 9d9abde4165354254b69822280e8a22b +timeCreated: 1495554326 +licenseType: Pro +MonoImporter: + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Core/Internals.meta b/Assets/Coach-ML/Barracuda/Core/Internals.meta new file mode 100644 index 0000000..85cb86c --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Internals.meta @@ -0,0 +1,8 @@ +fileFormatVersion: 2 +guid: f589e2f9d03bd44a0a4a818a19b6d8db +folderAsset: yes +DefaultImporter: + externalObjects: {} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Core/Internals/Debug.cs b/Assets/Coach-ML/Barracuda/Core/Internals/Debug.cs new file mode 100644 index 0000000..bb18a1d --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Internals/Debug.cs @@ -0,0 +1,164 @@ +//#if !UNITY_XBOXONE +#define BARRACUDA_LOG_ENABLED +//#endif + +using UnityEngine; + +namespace Barracuda +{ + public class D + { + public static bool warningStackTraceEnabled = Application.isEditor; + public static bool errorStackTraceEnabled = true; + public static bool logStackTraceEnabled = false; + + public static bool warningEnabled = true; + public static bool errorEnabled = true; + public static bool logEnabled = Application.isEditor; + +#if BARRACUDA_LOG_ENABLED + public static void LogWarning(object message) + { + if (!warningEnabled) + return; + + if (!warningStackTraceEnabled) + { + var oldConfig = Application.GetStackTraceLogType(LogType.Warning); + Application.SetStackTraceLogType(LogType.Warning, StackTraceLogType.None); + UnityEngine.Debug.LogWarning(message); + Application.SetStackTraceLogType(LogType.Warning, oldConfig); + } + else + { + UnityEngine.Debug.LogWarning(message); + } + } + + public static void LogWarning(object message, Object context) + { + if (!warningEnabled) + return; + + if (!warningStackTraceEnabled) + { + var oldConfig = Application.GetStackTraceLogType(LogType.Warning); + Application.SetStackTraceLogType(LogType.Warning, StackTraceLogType.None); + UnityEngine.Debug.LogWarning(message, context); + Application.SetStackTraceLogType(LogType.Warning, oldConfig); + } + else + { + UnityEngine.Debug.LogWarning(message, context); + } + } + + public static void LogError(object message) + { + if (!errorEnabled) + return; + + if (!errorStackTraceEnabled) + { + var oldConfig = Application.GetStackTraceLogType(LogType.Warning); + Application.SetStackTraceLogType(LogType.Error, StackTraceLogType.None); + UnityEngine.Debug.LogError(message); + Application.SetStackTraceLogType(LogType.Error, oldConfig); + } + else + { + UnityEngine.Debug.LogError(message); + } + } + + public static void LogError(object message, Object context) + { + if (!errorEnabled) + return; + + if (!errorStackTraceEnabled) + { + var oldConfig = 
Application.GetStackTraceLogType(LogType.Warning); + Application.SetStackTraceLogType(LogType.Error, StackTraceLogType.None); + UnityEngine.Debug.LogError(message, context); + Application.SetStackTraceLogType(LogType.Error, oldConfig); + } + else + { + UnityEngine.Debug.LogError(message, context); + } + } + + public static void Log(object message) + { + if (!logEnabled) + return; + + if (!logStackTraceEnabled) + { + var oldConfig = Application.GetStackTraceLogType(LogType.Warning); + Application.SetStackTraceLogType(LogType.Log, StackTraceLogType.None); + UnityEngine.Debug.Log(message); + Application.SetStackTraceLogType(LogType.Log, oldConfig); + } + else + { + UnityEngine.Debug.Log(message); + } + } + + public static void Log(object message, Object context) + { + if (!logEnabled) + return; + + if (!logStackTraceEnabled) + { + var oldConfig = Application.GetStackTraceLogType(LogType.Warning); + Application.SetStackTraceLogType(LogType.Log, StackTraceLogType.None); + UnityEngine.Debug.Log(message, context); + Application.SetStackTraceLogType(LogType.Log, oldConfig); + } + else + { + UnityEngine.Debug.Log(message, context); + } + } +#else + public static void LogWarning(object message) + { + + } + + public static void LogWarning(object message, Object context) + { + + } + + public static void LogError(object message) + { + + } + + public static void LogError(object message, Object context) + { + + } + + public static void Log(object message) + { + + } + + public static void Log(object message, Object context) + { + + } +#endif + } + + internal class Debug : D + { + + } +} \ No newline at end of file diff --git a/Assets/Coach-ML/Barracuda/Core/Internals/Debug.cs.meta b/Assets/Coach-ML/Barracuda/Core/Internals/Debug.cs.meta new file mode 100644 index 0000000..94b1ccf --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Internals/Debug.cs.meta @@ -0,0 +1,3 @@ +fileFormatVersion: 2 +guid: bdcfe88795204e0799076d9c7cd8dd39 +timeCreated: 1534164090 \ No newline at end of file diff --git a/Assets/Coach-ML/Barracuda/Core/Internals/NNModel.cs b/Assets/Coach-ML/Barracuda/Core/Internals/NNModel.cs new file mode 100644 index 0000000..c305cf6 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Internals/NNModel.cs @@ -0,0 +1,10 @@ +using UnityEngine; + +namespace Barracuda +{ + public class NNModel : ScriptableObject + { + [HideInInspector] + public byte[] Value; + } +} diff --git a/Assets/Coach-ML/Barracuda/Core/Internals/NNModel.cs.meta b/Assets/Coach-ML/Barracuda/Core/Internals/NNModel.cs.meta new file mode 100644 index 0000000..9013f48 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Internals/NNModel.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: d56b7ac7bee314a29a9d00b13ccdb4f5 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Core/Internals/StringCache.cs b/Assets/Coach-ML/Barracuda/Core/Internals/StringCache.cs new file mode 100644 index 0000000..0cb0575 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Internals/StringCache.cs @@ -0,0 +1,90 @@ +using System; +using System.Collections; +using System.Collections.Generic; + +namespace Barracuda +{ + +internal struct StringStringPair : IEquatable +{ + public string a; + public string b; + + public bool Equals(StringStringPair other) + { + return string.Equals(a, other.a) && string.Equals(b, other.b); + } + + public override bool Equals(object obj) + { + if 
(ReferenceEquals(null, obj)) return false; + return obj is StringStringPair && Equals((StringStringPair) obj); + } + + public override int GetHashCode() + { + var hashCode = a.GetHashCode(); + hashCode ^= b.GetHashCode(); + return hashCode; + } +} + +internal struct StringStringLongTriplet : IEquatable +{ + public string a; + public string b; + public long c; + + public override int GetHashCode() + { + var hashCode = a.GetHashCode(); + hashCode ^= b.GetHashCode(); + hashCode ^= c.GetHashCode(); + return hashCode; + } + + public bool Equals(StringStringLongTriplet other) + { + return string.Equals(a, other.a) && string.Equals(b, other.b) && c == other.c; + } + + public override bool Equals(object obj) + { + if (ReferenceEquals(null, obj)) return false; + return obj is StringStringLongTriplet && Equals((StringStringLongTriplet) obj); + } +} + +public class StringCache +{ + private Dictionary m_CacheStringString = new Dictionary(); + private Dictionary m_CacheStringStringLong = new Dictionary(); + + public string Lookup(string a, string b) + { + var key = new StringStringPair {a = a ?? "", b = b ?? ""}; + + if (!m_CacheStringString.ContainsKey(key)) + m_CacheStringString[key] = a + b; + + return m_CacheStringString[key]; + } + + public string Lookup(string a, string b, long c) + { + var key = new StringStringLongTriplet {a = a ?? "", b = b ?? "", c = c}; + + if (!m_CacheStringStringLong.ContainsKey(key)) + m_CacheStringStringLong[key] = a + b + c; + + return m_CacheStringStringLong[key]; + } + + public void Clear() + { + m_CacheStringString.Clear(); + m_CacheStringStringLong.Clear(); + } +} + +} // namespace Barracuda \ No newline at end of file diff --git a/Assets/Coach-ML/Barracuda/Core/Internals/StringCache.cs.meta b/Assets/Coach-ML/Barracuda/Core/Internals/StringCache.cs.meta new file mode 100644 index 0000000..a55a8c2 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Internals/StringCache.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: 6728c68ead6e34aee8795c793b4e5070 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Core/Internals/TestSetLoader.cs b/Assets/Coach-ML/Barracuda/Core/Internals/TestSetLoader.cs new file mode 100644 index 0000000..b5f8e7e --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Internals/TestSetLoader.cs @@ -0,0 +1,278 @@ +using System; +using System.Collections.Generic; +using System.IO; + +using UnityEngine; + +namespace Barracuda { + + +public class TestSet +{ + private RawTestSet rawTestSet; + private JSONTestSet jsonTestSet; + + public TestSet(RawTestSet rawTestSet) + { + this.rawTestSet = rawTestSet; + } + + public TestSet(JSONTestSet jsonTestSet) + { + this.jsonTestSet = jsonTestSet; + } + + public TestSet() + { + } + + public bool SupportsNames() + { + if (rawTestSet != null) + return false; + + return true; + } + + public int GetOutputCount() + { + if (rawTestSet != null) + return 1; + + return jsonTestSet.outputs.Length; + } + + public float[] GetOutputData(int idx = 0) + { + if (rawTestSet != null) + return rawTestSet.labels; + + return jsonTestSet.outputs[idx].data; + } + + public string GetOutputName(int idx = 0) + { + if (rawTestSet != null) + return null; + + return jsonTestSet.outputs[idx].name; + } + + public int GetInputCount() + { + if (rawTestSet != null) + return 1; + + return jsonTestSet.inputs.Length; + } + + public string GetInputName(int idx = 0) + { + if (rawTestSet 
!= null) + return ""; + + return jsonTestSet.inputs[idx].name; + } + + public float[] GetInputData(int idx = 0) + { + if (rawTestSet != null) + return rawTestSet.input; + + return jsonTestSet.inputs[idx].data; + } + + public int[] GetInputShape(int idx = 0) + { + if (rawTestSet != null) + return new int[4] {1, 1, 1, rawTestSet.input.Length}; + + return new int[4] {Math.Max(jsonTestSet.inputs[idx].shape.batch, 1), + Math.Max(jsonTestSet.inputs[idx].shape.height, 1), + Math.Max(jsonTestSet.inputs[idx].shape.width, 1), + Math.Max(jsonTestSet.inputs[idx].shape.channels,1)}; + } + + public int[] GetOutputShape(int idx = 0) + { + if (rawTestSet != null) + return new int[4] {1, 1, 1, rawTestSet.labels.Length}; + + return new int[4] {Math.Max(jsonTestSet.outputs[idx].shape.batch, 1), + Math.Max(jsonTestSet.outputs[idx].shape.height, 1), + Math.Max(jsonTestSet.outputs[idx].shape.width, 1), + Math.Max(jsonTestSet.outputs[idx].shape.channels,1)}; + } + + public Dictionary GetInputsAsTensorDictionary(Dictionary inputs = null, int batchCount = -1, int fromBatch = 0) + { + if (rawTestSet != null) + throw new Exception("GetInputsAsTensorDictionary is not supported for RAW test suites"); + + if (inputs == null) + inputs = new Dictionary(); + + for (var i = 0; i < GetInputCount(); i++) + inputs[GetInputName(i)] = GetInputAsTensor(i, batchCount, fromBatch); + + return inputs; + } + + public Dictionary GetOutputsAsTensorDictionary(Dictionary outputs = null, int batchCount = -1, int fromBatch = 0) + { + if (rawTestSet != null) + throw new Exception("GetOutputsAsTensorDictionary is not supported for RAW test suites"); + + if (outputs == null) + outputs = new Dictionary(); + + for (var i = 0; i < GetInputCount(); i++) + outputs[GetOutputName(i)] = GetOutputAsTensor(i, batchCount, fromBatch); + + return outputs; + } + + public Tensor GetInputAsTensor(int idx = 0, int batchCount = -1, int fromBatch = 0) + { + if (rawTestSet != null) + throw new Exception("GetInputAsTensor is not supported for RAW test suites"); + + var shape = GetInputShape(idx); + var array = GetInputData(idx); + var maxBatchCount = array.Length / (shape[1] * shape[2] * shape[3]); + + fromBatch = Math.Min(fromBatch, maxBatchCount - 1); + if (batchCount < 0) + batchCount = maxBatchCount - fromBatch; + + // pad data with 0s, if test-set doesn't have enough batches: + // 1) new ArrayTensorData() will initialize to 0 + // 2) Upload will copy as much data as test-set has into ArrayTensorData + var tensorShape = new TensorShape(batchCount, shape[1], shape[2], shape[3]); + var data = new ArrayTensorData(tensorShape.length); + data.Upload(array, fromBatch * tensorShape.flatWidth, Math.Min(batchCount, maxBatchCount - fromBatch) * tensorShape.flatWidth); + + var res = new Tensor(tensorShape, data); + res.name = GetInputName(idx); + + return res; + } + + public Tensor GetOutputAsTensor(int idx = 0, int batchCount = -1, int fromBatch = 0) + { + if (rawTestSet != null) + throw new Exception("GetOutputAsTensor is not supported for RAW test suites"); + + var shape = GetOutputShape(idx); + var array = GetOutputData(idx); + var maxBatchCount = array.Length / (shape[1] * shape[2] * shape[3]); + + fromBatch = Math.Min(fromBatch, maxBatchCount - 1); + if (batchCount < 0) + batchCount = maxBatchCount - fromBatch; + batchCount = Math.Min(batchCount, maxBatchCount - fromBatch); + + var res = new Tensor(batchCount, shape[1], shape[2], shape[3], + new SharedArrayTensorData(array, fromBatch * shape[1] * shape[2] * shape[3])); + res.name = GetOutputName(idx); + + return 
res; + } +} + +public class RawTestSet +{ + public float[] input; + public float[] labels; +} + +[Serializable] +public class JSONTestSet +{ + public JSONTensor[] inputs; + public JSONTensor[] outputs; +} + + +[Serializable] +public class JSONTensorShape +{ + public int batch; + public int height; + public int width; + public int channels; +} + +[Serializable] +public class JSONTensor +{ + public string name; + public JSONTensorShape shape; + public string type; + public float[] data; +} + + +public class TestSetLoader +{ + public static TestSet Load(string filename) + { + if (filename.ToLower().EndsWith(".raw")) + return LoadRaw(filename); + + return LoadJSON(filename); + } + + public static TestSet LoadJSON(string filename) + { + string fullpath = Path.Combine(Application.streamingAssetsPath, "TestSet", filename); + + var json = File.ReadAllText(fullpath); + TestSet result = new TestSet(JsonUtility.FromJson(json)); + + return result; + } + + public static TestSet LoadRaw(string filename) + { + string fullpath = Path.Combine(Application.streamingAssetsPath, "TestSet", filename); + + using(BinaryReader file = Open(fullpath)) + { + + var rawTestSet = new RawTestSet(); + rawTestSet.input = LoadFloatArray(file); + rawTestSet.labels = LoadFloatArray(file); + return new TestSet(rawTestSet);; + } + } + + public static Texture LoadImage(string filename) + { + string fullpath = Path.Combine(Application.streamingAssetsPath, "TestSet", filename); + + var bytes = File.ReadAllBytes(fullpath); + var tex = new Texture2D(2, 2); + ImageConversion.LoadImage(tex, bytes, false); // LoadImage will auto-resize the texture dimensions + tex.wrapMode = TextureWrapMode.Clamp; + return tex; + } + + public static float[] LoadFloatArray(BinaryReader file) + { + Int64 dataLength = file.ReadInt64(); + float[] array = new float[dataLength]; + byte[] bytes = file.ReadBytes(Convert.ToInt32(dataLength * sizeof(float))); // @TODO: support larger than MaxInt32 data blocks + Buffer.BlockCopy(bytes, 0, array, 0, bytes.Length); + + return array; + } + + static BinaryReader Open(string filename) + { + return new BinaryReader(new FileStream(filename, FileMode.Open, FileAccess.Read)); + } +} + + +} // namespace Barracuda diff --git a/Assets/Coach-ML/Barracuda/Core/Internals/TestSetLoader.cs.meta b/Assets/Coach-ML/Barracuda/Core/Internals/TestSetLoader.cs.meta new file mode 100644 index 0000000..ee47724 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Internals/TestSetLoader.cs.meta @@ -0,0 +1,12 @@ +fileFormatVersion: 2 +guid: b60ed189056434a469534a5cfdd124ab +timeCreated: 1495576373 +licenseType: Pro +MonoImporter: + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Core/Model.cs b/Assets/Coach-ML/Barracuda/Core/Model.cs new file mode 100644 index 0000000..3e29efc --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Model.cs @@ -0,0 +1,317 @@ +using System; +using System.Linq; // Select +using System.Collections.Generic; + +namespace Barracuda { + +public class Layer +{ + public enum Type + { + Nop = 0, + Dense = 1, + MatMul = 2, + + Conv2D = 20, + DepthwiseConv2D = 21, + Conv2DTrans = 22, + Upsample2D = 23, + MaxPool2D = 25, + AvgPool2D = 26, + GlobalMaxPool2D = 27, + GlobalAvgPool2D = 28, + Border2D = 29, + + Conv3D = 30, // TODO: NOT IMPLEMENTED + Conv3DTrans = 32, // TODO: NOT IMPLEMENTED + Upsample3D = 33, // TODO: NOT IMPLEMENTED + MaxPool3D = 35, // TODO: NOT IMPLEMENTED + AvgPool3D = 36, // 
TODO: NOT IMPLEMENTED + GlobalMaxPool3D = 37, // TODO: NOT IMPLEMENTED + GlobalAvgPool3D = 38, // TODO: NOT IMPLEMENTED + Border3D = 39, // TODO: NOT IMPLEMENTED + + Activation = 50, + ScaleBias = 51, + Normalization = 52, + LRN = 53, // TODO: NOT IMPLEMENTED, https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf + + Dropout = 60, + RandomNormal = 64, + RandomUniform = 65, + Multinomial = 66, + OneHot = 67, + + Add = 100, + Sub = 101, + Mul = 102, + Div = 103, + Pow = 104, + Min = 110, + Max = 111, + Mean = 112, + + ReduceL1 = 120, // TODO: NOT IMPLEMENTED + ReduceL2 = 121, // TODO: NOT IMPLEMENTED + ReduceLogSum = 122, // TODO: NOT IMPLEMENTED + ReduceLogSumExp = 123, // TODO: NOT IMPLEMENTED + ReduceMax = 124, + ReduceMean = 125, + ReduceMin = 126, + ReduceProd = 127, + ReduceSum = 128, + ReduceSumSquare = 129, // TODO: NOT IMPLEMENTED + + Greater = 140, + GreaterEqual = 141, + Less = 142, + LessEqual = 143, + Equal = 144, + LogicalOr = 145, + LogicalAnd = 146, + LogicalNot = 147, + LogicalXor = 148, + + Pad2DReflect = 160, + Pad2DSymmetric = 161, + Pad2DEdge = 162, + + Flatten = 200, + Reshape = 201, + Transpose = 202, + Squeeze = 203, // TODO: NOT IMPLEMENTED + Unsqueeze = 204, // TODO: NOT IMPLEMENTED + + Concat = 210, + StridedSlice = 211, + Tile = 212, + + Load = 255 + } + + public enum Activation + { + None = 0, + Relu = 1, + Softmax = 2, + Tanh = 3, + Sigmoid = 4, + Elu = 5, + Relu6 = 6, + LeakyRelu = 7, + Selu = 8, + Swish = 9, + + LogSoftmax = 10, + Softplus = 11, // TODO: NOT IMPLEMENTED + Softsign = 12, // TODO: NOT IMPLEMENTED + + PRelu = 13, + + Hardmax = 20, // TODO: NOT IMPLEMENTED + HardSigmoid = 21, // TODO: NOT IMPLEMENTED + + Abs = 100, + Neg = 101, + Ceil = 102, + Clip = 103, + Floor = 104, + + Reciprocal = 110, + Sqrt = 111, + Pow = 112, + Exp = 113, + Log = 114, + + Acos = 200, // TODO: NOT IMPLEMENTED + Acosh = 201, // TODO: NOT IMPLEMENTED + Asin = 202, // TODO: NOT IMPLEMENTED + Asinh = 203, // TODO: NOT IMPLEMENTED + Atan = 204, // TODO: NOT IMPLEMENTED + Atanh = 205, // TODO: NOT IMPLEMENTED + Cos = 206, // TODO: NOT IMPLEMENTED + Cosh = 207, // TODO: NOT IMPLEMENTED + Sin = 208, // TODO: NOT IMPLEMENTED + Sinh = 209, // TODO: NOT IMPLEMENTED + Tan = 210 // TODO: NOT IMPLEMENTED + } + + public enum AutoPad + { + Valid = 0, + SameUpper = -1, + SameLower = -2, + } + + public struct DataSet + { + public string name; + public TensorShape shape; + public Int64 offset; + public Int32 itemSizeInBytes; + public Int32 length; + } + + public string name; + public Type type; + public Activation activation; + public Int32[] pad; + public Int32[] stride; + public Int32[] pool; + public Int32 axis; + public float alpha; + public float beta; + public string[] inputs; + + public DataSet[] datasets; + public float[] weights; + + private Layer(string layerName) + { + name = layerName; + type = Type.Nop; + activation = Activation.None; + pad = new int[0]; + stride = new int[0]; + pool = new int[0]; + axis = -1; + alpha = 1.0f; + beta = 0.0f; + inputs = new string[0]; + datasets = new DataSet[0]; + weights = new float[0]; + } + + public Layer(string layerName, Type layerType, Activation activationType = Activation.None) : this(layerName) + { + type = layerType; + activation = activationType; + } + + public Layer(string layerName, Activation activationType) : this(layerName) + { + type = Type.Activation; + activation = activationType; + } + + public override string ToString() + { + return ($"name:{name}, activation:{activation}, 
inputs:[{string.Join(",", inputs)}], " + + $"pad:[{string.Join(",", pad)}], stride:[{string.Join(",", stride)}], pool:[{string.Join(",", pool)}], " + + $"alpha:{alpha}, beta:{beta}, axis:{axis}, " + + $"consts:[{string.Join(", ", datasets.Select(x => $"{x.name} {x.shape}"))}]".Replace(name+"/","").Replace(name+" ","")). + Replace("activation:None, ", "").Replace("inputs:[], ", "").Replace("pad:[], ", ""). + Replace("stride:[], ", "").Replace("stride:[1,1], ", "").Replace("pool:[], ", ""). + Replace("alpha:1, ", "").Replace("beta:0, ", "").Replace("axis:-1, ", ""). + Replace("consts:[]", ""); + } +} + +public class Model +{ + public const int Version = 16; + + public struct Input + { + public string name; + public Int32[] shape; // input shape can contain -1 for unspecified dimensions + + public Input WithName(string name) + { + return new Input {name = name, shape = shape}; + } + } + + public struct Memory + { + public TensorShape shape; + public string input; + public string output; + } + + public List inputs = new List(); + public List outputs = new List(); + public List memories = new List(); + public List layers = new List(); + + #region Importer info + public string IrSource = "Script"; + public string IrVersion = "NA"; + public string ProducerName = "Script"; + public List Warnings { get; } = new List(); + public class ImporterWarning + { + + public string Message { get; } + public string LayerName { get; } + public ImporterWarning(string layer, string msg) + { + Message = msg; + LayerName = layer; + } + } + #endregion + + public Model ShallowCopy() + { + var model = new Model(); + model.inputs.AddRange(inputs); + model.outputs.AddRange(outputs); + model.memories.AddRange(memories); + model.layers.AddRange(layers); + + model.IrSource = IrSource; + model.IrVersion = IrVersion; + model.ProducerName = ProducerName; + model.Warnings.AddRange(Warnings); + return model; + } + + public override string ToString() + { + return $"inputs: [{string.Join(", ", inputs.Select(i => $"{i.name} ({string.Join(",", i.shape)})"))}], " + + $"memories: [{string.Join(", ", memories.Select(m => $"{m.input} {m.shape} {m.output}"))}], " + + $"outputs: [{string.Join(", ", outputs)}] " + + $"\n{layers.Count} layers, {layers.Sum(l => l.weights.Length)} weights: ...\n{string.Join("\n", layers.Select(i => $"{i.type} ({i})"))}"; + } +} + +public static class ModelMetadataExtensions +{ + static public Tensor GetTensorByName(this Model model, string name) + { + foreach (var l in model.layers) + foreach (var ds in l.datasets) + if (ds.name == name) + return new Tensor(ds.shape, + new SharedArrayTensorData(l.weights, (int)ds.offset, (int)ds.shape.length), ds.name); + + return null; + } + + static public TensorShape GetShapeByName(this Model model, string name) + { + foreach (var i in model.inputs) + if (i.name == name) + return new TensorShape(i.shape); + + TensorShape shape; + if (ModelAnalyzer.TryGetOutputTensorShape(model, name, out shape)) + return shape; + + foreach (var l in model.layers) + foreach (var ds in l.datasets) + if (ds.name == name) + return ds.shape; + + foreach (var mem in model.memories) + { + if (mem.input == name || mem.output == name) + return mem.shape; + } + + throw new System.Collections.Generic.KeyNotFoundException("Shape " + name + " not found!"); + } +} + +} // namespace Barracuda diff --git a/Assets/Coach-ML/Barracuda/Core/Model.cs.meta b/Assets/Coach-ML/Barracuda/Core/Model.cs.meta new file mode 100644 index 0000000..564b709 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Model.cs.meta 
@@ -0,0 +1,12 @@ +fileFormatVersion: 2 +guid: b26b24090eb094bbbad7577bab770b25 +timeCreated: 1506364243 +licenseType: Pro +MonoImporter: + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Core/ModelBuilder.cs b/Assets/Coach-ML/Barracuda/Core/ModelBuilder.cs new file mode 100644 index 0000000..b4699ba --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/ModelBuilder.cs @@ -0,0 +1,902 @@ +using System; +using System.Linq; + +namespace Barracuda +{ + public class ModelBuilder + { + private readonly Model m_Model; + public Model model { get { return m_Model; } } + + /// + /// Create a model builder helper to construct the underlying Model. + /// + public ModelBuilder(Model model) + { + m_Model = model; + } + + /// + /// Add an input to the model + /// + public Model.Input Input(string name, Int32[] shape) + { + m_Model.inputs.Add(new Model.Input {name = name, shape = shape}); + + return m_Model.inputs.Last(); + } + + /// + /// Add an input to the model + /// + public Model.Input Input(string name, Int32 batch, Int32 channels) + { + m_Model.inputs.Add(new Model.Input {name = name, shape = new []{batch, 1, 1, channels}}); + + return m_Model.inputs.Last(); + } + + /// + /// Add an input to the model + /// + public Model.Input Input(string name, Int32 batch, Int32 height, Int32 width, Int32 channels) + { + m_Model.inputs.Add(new Model.Input {name = name, shape = new []{batch, height, width, channels}}); + + return m_Model.inputs.Last(); + } + + /// + /// Add an output to the model + /// + public string Output(object input) + { + var name = ResolveInput(input); + if (!m_Model.outputs.Contains(name)) + m_Model.outputs.Add(name); + return name; + } + + private string ResolveInput(object input) + { + if (input == null) + return null; + + if (input is string) + return input as string; + + if (input is Layer) + return (input as Layer).name; + + if (input is Model.Input) + return ((Model.Input)input).name; + + throw new ArgumentException($"Unsupported input type: {input.GetType()}"); + } + + /// + /// Allow to load a tensor from constants. + /// + public Layer Const(string name, Tensor tensor, int insertionIndex = -1) + { + Layer layer = new Layer(name, Layer.Type.Load); + layer.datasets = new Layer.DataSet[1]; + layer.datasets[0].name = name; + layer.datasets[0].shape = tensor.shape; + layer.datasets[0].itemSizeInBytes = 4; + layer.datasets[0].length = tensor.shape.length; + layer.datasets[0].offset = 0; + layer.weights = new float[tensor.shape.length]; + tensor.readonlyArray.CopyTo(layer.weights, 0); + + if (insertionIndex < 0 || insertionIndex >= m_Model.layers.Count) + m_Model.layers.Add(layer); + else + m_Model.layers.Insert(insertionIndex, layer); + + return layer; + } + + /// + /// Apply per channel scale and bias. + /// Scale and bias should be tensors of shape [1,1,1, input.shape[C]] + /// + /// Output shape is same as input. 
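// Illustrative usage sketch (example only, not part of the patch): hand-building a tiny network
// with ModelBuilder, input -> Dense -> Relu -> Softmax -> output. The weight and bias tensors
// are assumed to be pre-filled elsewhere; -1 marks a dynamic batch dimension, as allowed by
// Model.Input.
using Barracuda;

public static class BuilderExample
{
    public static Model BuildClassifier(Tensor denseWeights, Tensor denseBias)
    {
        var builder = new ModelBuilder(new Model());
        var input = builder.Input("features", batch: -1, channels: 16);
        var dense = builder.Dense("dense1", input, denseWeights, denseBias);
        var relu  = builder.Relu("relu1", dense);
        var soft  = builder.Softmax("probabilities", relu);
        builder.Output(soft);
        return builder.model;
    }
}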
+ /// + public Layer ScaleBias(string name, object input, Tensor scale, Tensor bias) + { + Layer layer = new Layer(name,Layer.Type.ScaleBias); + layer.inputs = new [] {ResolveInput(input)}; + layer.datasets = new Layer.DataSet[2]; + layer.datasets[0].name = $"{name}/S"; + layer.datasets[0].shape = scale.shape; + layer.datasets[0].itemSizeInBytes = 4; + layer.datasets[0].length = scale.shape.length; + layer.datasets[0].offset = 0; + layer.datasets[1].name = $"{name}/B"; + layer.datasets[1].shape = bias.shape; + layer.datasets[1].itemSizeInBytes = 4; + layer.datasets[1].length = bias.shape.length; + layer.datasets[1].offset = scale.shape.length; + layer.weights = new float[scale.shape.length + bias.shape.length]; + + scale.readonlyArray.CopyTo(layer.weights, 0); + bias.readonlyArray.CopyTo(layer.weights, layer.datasets[1].offset); + + m_Model.layers.Add(layer); + + return layer; + } + + /// + /// Carries out instance normalization as described in the paper https://arxiv.org/abs/1607.08022 + /// y = scale * (x - mean) / sqrt(variance + epsilon) + bias, where mean and variance are computed per instance per channel. + /// Scale and bias should be tensors of shape [1,1,1, input.shape[C]] + /// + /// Output shape is same as input. + /// + public Layer Normalization(string name, object input, Tensor scale, Tensor bias, float epsilon = 1e-5f) + { + Layer layer = new Layer(name, Layer.Type.Normalization); + layer.inputs = new [] {ResolveInput(input)}; + layer.datasets = new Layer.DataSet[2]; + layer.datasets[0].name = $"{name}/S"; + layer.datasets[0].shape = scale.shape; + layer.datasets[0].itemSizeInBytes = 4; + layer.datasets[0].length = scale.shape.length; + layer.datasets[0].offset = 0; + layer.datasets[1].name = $"{name}/B"; + layer.datasets[1].shape = bias.shape; + layer.datasets[1].itemSizeInBytes = 4; + layer.datasets[1].length = bias.shape.length; + layer.datasets[1].offset = scale.shape.length; + layer.weights = new float[scale.shape.length + bias.shape.length]; + layer.beta = epsilon; + + scale.readonlyArray.CopyTo(layer.weights, 0); + bias.readonlyArray.CopyTo(layer.weights, layer.datasets[1].offset); + + m_Model.layers.Add(layer); + + return layer; + } + + /// + /// Apply a densely connected layer (aka general matrix multiplication or GEMM) + /// Bias should be a tensor with (batch == input.shape[H] * input.shape[W] * input.shape[C]) and only one other dimensions of size > 1 + /// Weight should be a tensor with (batch == 1) and (height * width * channels == bias.shape[B] * ) + /// + /// Output shape is [input.shape[B], 1, 1, Weight.shape[H]*Weight.shape[W]*Weight.shape[C]] + /// + public Layer Dense(string name, object input, Tensor weight, Tensor bias) + { + Layer layer = new Layer(name, Layer.Type.Dense); + layer.inputs = new [] {ResolveInput(input)}; + layer.datasets = new Layer.DataSet[2]; + layer.datasets[0].name = $"{name}/W"; + layer.datasets[0].shape = weight.shape; + layer.datasets[0].itemSizeInBytes = 4; + layer.datasets[0].length = weight.shape.length; + layer.datasets[0].offset = 0; + layer.datasets[1].name = $"{name}/B"; + layer.datasets[1].shape = bias.shape; + layer.datasets[1].itemSizeInBytes = 4; + layer.datasets[1].length = bias.shape.length; + layer.datasets[1].offset = weight.shape.length; + layer.weights = new float[weight.shape.length + bias.shape.length]; + + weight.readonlyArray.CopyTo(layer.weights, 0); + bias.readonlyArray.CopyTo(layer.weights, layer.datasets[1].offset); + + m_Model.layers.Add(layer); + + return layer; + } + + private Layer Conv(string name, 
Layer.Type convType, object input, Int32[] stride, Int32[] pad, Int32[] outputPad, Tensor kernel, Tensor bias) + { + Layer layer = new Layer(name, convType); + layer.pad = pad; + layer.stride = stride; + layer.pool = outputPad; + layer.inputs = new [] {ResolveInput(input)}; + layer.datasets = new Layer.DataSet[2]; + layer.datasets[0].name = $"{name}/K"; + layer.datasets[0].shape = kernel.shape; + layer.datasets[0].itemSizeInBytes = 4; + layer.datasets[0].length = kernel.shape.length; + layer.datasets[0].offset = 0; + layer.datasets[1].name = $"{name}/B"; + layer.datasets[1].shape = bias.shape; + layer.datasets[1].itemSizeInBytes = 4; + layer.datasets[1].length = bias.shape.length; + layer.datasets[1].offset = kernel.shape.length; + layer.weights = new float[kernel.shape.length + bias.shape.length]; + + kernel.readonlyArray.CopyTo(layer.weights, 0); + bias.readonlyArray.CopyTo(layer.weights, layer.datasets[1].offset); + + m_Model.layers.Add(layer); + + return layer; + } + + /// + /// Apply a spatial 2D convolution on H and W. + /// Stride should be of size 2 and format is [W, H]. + /// Pad should be of size 4 and format is [pre W, pre H, post W, post H]. + /// Kernel should be a tensor of shape [kernelHeight, kernelWidth, kernelDepth, kernelCount] + /// Bias should be a tensor with (batch == 1) and (height * width * channels == kernelCount) + /// + /// Output batch is same as input. + /// Output channel is kernel.shape[3]. + /// output.shape[H,W] = (input.shape[H,W] + pad[1,0] + pad[3,2] - kernel.shape[1,0]) / stride[1,0] + 1. + /// + public Layer Conv2D(string name, object input, Int32[] stride, Int32[] pad, Tensor kernel, Tensor bias) + { + return Conv(name, Layer.Type.Conv2D, input, stride, pad, new int[0], kernel, bias); + } + + /// + /// Apply a spatial 2D depthwise convolution on H and W. + /// Stride should be of size 2 and format is [W, H]. + /// Pad should be of size 4 and format is [pre W, pre H, post W, post H]. + /// Kernel should be a tensor of shape [kernelHeight, kernelWidth, kernelDepth, kernelCount] + /// Thus input must have a channel dimension of 1 + /// Bias should be a tensor with (batch == 1) and (height * width * channels == kernelCount) + /// + /// Output batch is same as input. + /// Output channel is kernel.shape[3]. + /// output.shape[H,W] = (input.shape[H,W] + pad[1,0] + pad[3,2] - kernel.shape[1,0]) / stride[1,0] + 1. + /// + public Layer DepthwiseConv2D(string name, object input, Int32[] stride, Int32[] pad, Tensor kernel, Tensor bias) + { + return Conv(name, Layer.Type.DepthwiseConv2D, input, stride, pad, new int[0], kernel, bias); + } + + /// + /// Apply a spatial 2D transposed convolution on H and W. + /// Stride should be of size 2 and format is [W, H]. + /// Pad should be of size 4 and format is [pre W, pre H, post W, post H]. + /// Kernel should be a tensor of rank 4 of dimensions [kernelHeight, kernelWidth, kernelDepth, kernelCount] + /// Bias should be a tensor with (batch == 1) and (height * width * channels == kernelCount) + /// OutputPad should be of length 0 or 2, format is [W, H]. + /// If OutputPad length is 0 it will be defaulted to: + /// OutputPad[W,H] = (input.shape[W,H] * stride[0,1] + pad[0,1] + pad[2,3] - [kernelWidth, kernelHeight]) % stride[0,1] + /// + /// Output batch is same as input. + /// Output channel is kernel.shape[3]. 
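// Illustrative worked example (example only, not part of the patch): the Conv2D/pooling output
// shape arithmetic documented above, checked on concrete numbers. pad = [preW, preH, postW, postH],
// stride = [W, H]:
//   32x32 input, 3x3 kernel, stride [1,1], pad [1,1,1,1]:  (32 + 1 + 1 - 3) / 1 + 1 = 32  ("same" size)
//   32x32 input, 3x3 kernel, stride [2,2], pad [0,0,0,0]:  (32 + 0 + 0 - 3) / 2 + 1 = 15  (integer division)
public static class ConvShapeExample
{
    public static int OutHeight(int inH, int kernelH, int[] stride, int[] pad)
    {
        return (inH + pad[1] + pad[3] - kernelH) / stride[1] + 1;
    }

    public static int OutWidth(int inW, int kernelW, int[] stride, int[] pad)
    {
        return (inW + pad[0] + pad[2] - kernelW) / stride[0] + 1;
    }
}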
+ /// output.shape[H,W] = (input.shape[H,W]-1) * stride[0,1] - (pad[1,0] + pad[3,2]) + [kernelWidth, kernelHeight] + OutputPad[W,H] + /// + public Layer Conv2DTrans(string name, object input, Int32[] stride, Int32[] pad, Int32[] outputPad, Tensor kernel, Tensor bias) + { + return Conv(name, Layer.Type.Conv2DTrans, input, stride, pad, outputPad, kernel, bias); + } + + private Layer Pool(Layer.Type type, string name, object input, Int32[] pool, Int32[] stride, Int32[] pad) + { + Layer layer = new Layer(name, type); + layer.pad = pad; + layer.stride = stride; + layer.pool = pool; + layer.inputs = new [] {ResolveInput(input)}; + + m_Model.layers.Add(layer); + + return layer; + } + + /// + /// Apply 'average' pooling by downscaling H and W dimension according to `pool`, `stride` and `pad`. + /// Pool and stride should be of size 2 and format is [W, H]. + /// Pad should be of size 4 and format is [pre W, pre H, post W, post H]. + /// + /// Output batch and channels dimensions the same as input. + /// output.shape[H,W] = (input.shape[H,W] + pad[1,0] + pad[3,2] - pool[1,0]) / stride[1,0] + 1. + /// + public Layer AvgPool2D(string name, object input, Int32[] pool, Int32[] stride, Int32[] pad) + { + return Pool(Layer.Type.AvgPool2D, name, input, pool, stride, pad); + } + + /// + /// Apply 'max' pooling by downscaling H and W dimension according to `pool`, `stride` and `pad`. + /// Pool and stride should be of size 2 and format is [W, H]. + /// Pad should be of size 4 and format is [pre W, pre H, post W, post H]. + /// + /// Output batch and channels dimensions the same as input. + /// output.shape[H,W] = (input.shape[H,W] + pad[1,0] + pad[3,2] - pool[1,0]) / stride[1,0] + 1. + /// + public Layer MaxPool2D(string name, object input, Int32[] pool, Int32[] stride, Int32[] pad) + { + return Pool(Layer.Type.MaxPool2D, name, input, pool, stride, pad); + } + + /// + /// Apply 'average' pooling by downscaling H and W dimension to [1,1] + /// + public Layer GlobalAvgPool2D(string name, object input) + { + return Pool(Layer.Type.GlobalAvgPool2D, name, input, new int[0], new int[0], new int[0]); + } + + /// + /// Apply 'max' pooling by downscaling H and W dimension to [1,1] + /// + public Layer GlobalMaxPool2D(string name, object input) + { + return Pool(Layer.Type.GlobalMaxPool2D, name, input, new int[0], new int[0], new int[0]); + } + + /// + /// Upsample the input tensor by scaling H and W by upsample[0] and upsample[1] respectively. + /// Upsampling is done using nearest neighbor. + /// + public Layer Upsample2D(string name, object input, Int32[] upsample) + { + Layer layer = new Layer(name, Layer.Type.Upsample2D); + layer.pool = upsample; + layer.inputs = new [] {ResolveInput(input)}; + + m_Model.layers.Add(layer); + + return layer; + } + + /// + /// Return a tensor of the requested shape. Input and output must contain the same number of elements. + /// + public Layer Reshape(string name, object input, int[] shape) + { + Layer layer = new Layer(name, Layer.Type.Reshape); + layer.pool = shape; + layer.inputs = new [] {ResolveInput(input)}; + + m_Model.layers.Add(layer); + + return layer; + } + + /// + /// Return a tensor of shape [input.Batch, input.Height * input.Width * input.Channels] + /// + public Layer Flatten(string name, object input) + { + Layer layer = new Layer(name, Layer.Type.Flatten); + layer.inputs = new [] {ResolveInput(input)}; + + m_Model.layers.Add(layer); + + return layer; + } + + /// + /// Concatenate a list of tensors into a single tensor. 
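// Illustrative usage sketch (example only, not part of the patch): concatenating two branches
// along the channel axis with ModelBuilder, per the Concat summary above. Axis 3 (channels in
// the B,H,W,C layout used throughout this patch) is an assumption; both branches must agree on
// every other dimension.
using Barracuda;

public static class ConcatExample
{
    public static Layer JoinBranches(ModelBuilder builder, Layer branchA, Layer branchB)
    {
        return builder.Concat("joined", new object[] { branchA, branchB }, axis: 3);
    }
}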
All input tensors must have the same shape, except for the axis to concatenate on. + /// + public Layer Concat(string name, object[] inputs, int axis) + { + Layer layer = new Layer(name, Layer.Type.Concat); + layer.axis = axis; + layer.inputs = inputs.Select(i => ResolveInput(i)).ToArray(); + + m_Model.layers.Add(layer); + + return layer; + } + + /// + /// Produces a slice of the input tensor along all axes. + /// The following rules apply: + /// begin=0, end=0, stride=1: copy the full range of elements from the given axis + /// begin=A, end=B, stride=1: copy the range [A, B) (excluding the Bth element) from the given axis + /// begin=A, end=B, stride=I: copy every Ith element in the range [A, B) from the given axis + /// begin=N, end=N, stride=0: shrink axis to a single Nth element + /// output.shape[*] = (ends[*] - starts[*]) / max(1, stride[*]) + /// + public Layer StridedSlice(string name, object input, int[] starts, int[] ends, int[] strides) + { + Layer layer = new Layer(name, Layer.Type.StridedSlice); + layer.inputs = new [] {ResolveInput(input)}; + layer.pad = starts; + layer.pool = ends; + layer.stride = strides; + + m_Model.layers.Add(layer); + + return layer; + } + + + private Layer Activation(Layer.Activation activation, string name, object input) + { + Layer layer = new Layer(name, activation); + layer.inputs = new [] {ResolveInput(input)}; + + m_Model.layers.Add(layer); + + return layer; + } + + /// + /// No-op layer + /// + public Layer Identity(string name, object input) + { + return Activation(Layer.Activation.None, name, input); + } + + + /// + /// Element-wise `Relu` activation function: f(x) = max(0, x) + /// + public Layer Relu(string name, object input) + { + return Activation(Layer.Activation.Relu, name, input); + } + + /// + /// Return the softmax (normalized exponential) values of the flatten HWC dimensions of the input. + /// Thus output will be of shape [input.Batch, input.Height * input.Width * input.Channels] + /// + public Layer Softmax(string name, object input) + { + return Activation(Layer.Activation.Softmax, name, input); + } + + /// + /// Return the logsoftmax (normalized exponential) values of the flatten HWC dimensions of the input. + /// Thus output will be of shape [input.Batch, input.Height * input.Width * input.Channels] + /// + public Layer LogSoftmax(string name, object input) + { + return Activation(Layer.Activation.LogSoftmax, name, input); + } + + /// + /// Element-wise `Sqrt` activation function + /// + public Layer Sqrt(string name, object input) + { + return Activation(Layer.Activation.Sqrt, name, input); + } + + /// + /// Element-wise `Tanh` activation function: f(x) = (1 - e^{-2x})/(1 + e^{-2x}) + /// + public Layer Tanh(string name, object input) + { + return Activation(Layer.Activation.Tanh, name, input); + } + + /// + /// Element-wise `Sigmoid` activation function: f(x) = 1/(1 + e^{-x}) + /// + public Layer Sigmoid(string name, object input) + { + return Activation(Layer.Activation.Sigmoid, name, input); + } + + /// + /// Element-wise `Elu` activation function: f(x) = x if x >= 0 else alpha*(e^x - 1) + /// alpha default is 1.0 + /// + public Layer Elu(string name, object input, float alpha = 1.0f) + { + var layer = Activation(Layer.Activation.Elu, name, input); + layer.alpha = alpha; + return layer; + } + + /// + /// Element-wise `Relu6` activation function. 
f(x) = min(max(x, 0), 6) + /// see http://www.cs.utoronto.ca/~kriz/conv-cifar10-aug2010.pdf + /// + public Layer Relu6(string name, object input) + { + return Activation(Layer.Activation.Relu6, name, input); + } + + /// + /// Element-wise `LeakyRelu` activation function: f(x) = x if x >= 0 else alpha * x + /// alpha default is 0.01 + /// + public Layer LeakyRelu(string name, object input, float alpha = 0.01f) + { + var layer = Activation(Layer.Activation.LeakyRelu, name, input); + layer.alpha = alpha; + return layer; + } + + /// + /// Element-wise `Selu` activation function: f(x) = gamma * x if x >= 0 else (alpha * e^x - alpha) + /// alpha default is 1.67326 + /// gamma default is 1.0507 + /// + public Layer Selu(string name, object input, float alpha = 1.67326f, float gamma = 1.0507f) + { + var layer = Activation(Layer.Activation.Selu, name, input); + layer.alpha = alpha; + layer.beta = gamma; + return layer; + } + + /// + /// Element-wise `PRelu` activation function: f(x) = x if x >= 0 else slope * x + /// + public Layer PRelu(string name, object input, object slope) + { + object[] inputs = new [] {input, slope}; + + Layer layer = new Layer(name, Layer.Activation.PRelu); + layer.inputs = inputs.Select(i => ResolveInput(i)).ToArray(); + + m_Model.layers.Add(layer); + + return layer; + } + + /// + /// Element-wise `Swish` activation function. f(x) = sigmoid(x) * x = x/(1 + e^{-x}) + /// see https://arxiv.org/abs/1710.05941 + /// + public Layer Swish(string name, object input) + { + return Activation(Layer.Activation.Swish, name, input); + } + + /// + // Element-wise `Clip` activation function f(x, xmin, xmax) = min(max(x, xmin), xmax) + /// + public Layer Clip(string name, object input, float min, float max) + { + var layer = Activation(Layer.Activation.Clip, name, input); + layer.alpha = min; + layer.beta = max; + + return layer; + } + + /// + /// Element-wise `Exp` activation function: f(x) = e^{x} + /// + public Layer Exp(string name, object input) + { + return Activation(Layer.Activation.Exp, name, input); + } + + /// + /// Element-wise `Log` activation function: f(x) = log(x) + /// + public Layer Log(string name, object input) + { + return Activation(Layer.Activation.Log, name, input); + } + + /// + /// Element-wise `Neg` activation function: f(x) = -x + /// + public Layer Neg(string name, object input) + { + return Activation(Layer.Activation.Neg, name, input); + } + + /// + /// Element-wise `Reciprocal` activation function: f(x) = 1/x + /// + public Layer Reciprocal(string name, object input) + { + return Activation(Layer.Activation.Reciprocal, name, input); + } + + + private Layer Broadcast(Layer.Type type, string name, object[] inputs) + { + Layer layer = new Layer(name, type); + layer.inputs = inputs.Select(i => ResolveInput(i)).ToArray(); + + m_Model.layers.Add(layer); + + return layer; + } + + /// + /// Element-wise `add` of each of the input tensors with multidimensional broadcasting support. + /// + public Layer Add(string name, object[] inputs) + { + return Broadcast(Layer.Type.Add, name, inputs); + } + + /// + /// Element-wise `sub` of each of the input tensors with multidimensional broadcasting support. + /// + public Layer Sub(string name, object[] inputs) + { + return Broadcast(Layer.Type.Sub, name, inputs); + } + + /// + /// Element-wise multiplication of each of the input tensors with multidimensional broadcasting support. 
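+ /// For example (editor's sketch, assuming `builder` is a ModelBuilder and "features" and "mask" name
+ /// previously added layers or model inputs):
+ ///   builder.Mul("masked", new object[] {"features", "mask"});
+ /// Shapes are broadcast against each other element-wise, so a [1,1,1,C] tensor can scale every spatial
+ /// location of a [N,H,W,C] tensor.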
+ /// + public Layer Mul(string name, object[] inputs) + { + return Broadcast(Layer.Type.Mul, name, inputs); + } + + /// + /// Element-wise division of each of the input tensors with multidimensional broadcasting support. + /// First element is divided by the 2nd, then result is divided by the third one and so on. + /// + public Layer Div(string name, object[] inputs) + { + return Broadcast(Layer.Type.Div, name, inputs); + } + + /// + /// Element-wise pow of each of the input tensors with multidimensional broadcasting support. + /// First element get raised to the pow of the 2nd, then result is raised to the pow of the third one and so on. + /// + public Layer Pow(string name, object[] inputs) + { + return Broadcast(Layer.Type.Pow, name, inputs); + } + + /// + /// Element-wise `min` of each of the input tensors with multidimensional broadcasting support. + /// + public Layer Min(string name, object[] inputs) + { + return Broadcast(Layer.Type.Min, name, inputs); + } + + /// + /// Element-wise `max` of each of the input tensors with multidimensional broadcasting support. + /// + public Layer Max(string name, object[] inputs) + { + return Broadcast(Layer.Type.Max, name, inputs); + } + + /// + /// Element-wise `mean` of each of the input tensors with multidimensional broadcasting support. + /// + public Layer Mean(string name, object[] inputs) + { + return Broadcast(Layer.Type.Mean, name, inputs); + } + + /// + /// Performs a `greater` logical operation elementwise on the input tensors with multidimensional broadcasting support. + /// Return 1.0 elementwise if condition is true 0.0 otherwise. + /// + public Layer Greater(string name, object input0, object input1) + { + return Broadcast(Layer.Type.Greater, name, new [] {input0, input1}); + } + + /// + /// Performs a `greaterEqual` logical operation elementwise on the input tensors with multidimensional broadcasting support. + /// Return 1.0 elementwise if condition is true 0.0 otherwise. + /// + public Layer GreaterEqual(string name, object input0, object input1) + { + return Broadcast(Layer.Type.GreaterEqual, name, new [] {input0, input1}); + } + + /// + /// Performs a `less` logical operation elementwise on the input tensors with multidimensional broadcasting support. + /// Return 1.0 elementwise if condition is true 0.0 otherwise. + /// + public Layer Less(string name, object input0, object input1) + { + return Broadcast(Layer.Type.Less, name, new [] {input0, input1}); + } + + /// + /// Performs a `less equal` logical operation elementwise on the input tensors with multidimensional broadcasting support. + /// Return 1.0 elementwise if condition is true 0.0 otherwise. + /// + public Layer LessEqual(string name, object input0, object input1) + { + return Broadcast(Layer.Type.LessEqual, name, new [] {input0, input1}); + } + + /// + /// Performs a `equal` logical operation elementwise on the input tensors with multidimensional broadcasting support. + /// Return 1.0 elementwise if condition is true 0.0 otherwise. + /// + public Layer Equal(string name, object input0, object input1) + { + return Broadcast(Layer.Type.Equal, name, new [] {input0, input1}); + } + + /// + /// Performs a `and` logical operation elementwise on the input tensors with multidimensional broadcasting support. + /// Return 1.0 elementwise if condition is true 0.0 otherwise. + /// Input is consider false if 0.0 elementwise true otherwise. 
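+ /// Editor's sketch (assuming `builder` is a ModelBuilder and "x", "low" and "high" are existing layer or input names):
+ ///   builder.GreaterEqual("ge", "x", "low");
+ ///   builder.LessEqual("le", "x", "high");
+ ///   builder.LogicalAnd("inRange", "ge", "le");
+ /// Each comparison emits 1.0/0.0 per element and LogicalAnd combines them under the 0.0-is-false convention above.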
+ /// + public Layer LogicalAnd(string name, object input0, object input1) + { + return Broadcast(Layer.Type.LogicalAnd, name, new [] {input0, input1}); + } + + /// + /// Performs a `or` logical operation elementwise on the input tensors with multidimensional broadcasting support. + /// Return 1.0 elementwise if condition is true 0.0 otherwise. + /// Input is consider false if 0.0 elementwise true otherwise. + /// + public Layer LogicalOr(string name, object input0, object input1) + { + return Broadcast(Layer.Type.LogicalOr, name, new [] {input0, input1}); + } + + /// + /// Performs a `xor` logical operation elementwise on the input tensors with multidimensional broadcasting support. + /// Return 1.0 elementwise if condition is true 0.0 otherwise. + /// Input is consider false if 0.0 elementwise true otherwise. + /// + public Layer LogicalXor(string name, object input0, object input1) + { + return Broadcast(Layer.Type.LogicalXor, name, new [] {input0, input1}); + } + + /// + /// Performs a `not` logical operation elementwise on the input tensor. + /// Return 1.0 elementwise if condition is true 0.0 otherwise. + /// Input is consider false if 0.0 elementwise true otherwise. + /// + public Layer LogicalNot(string name, object input) + { + Layer layer = new Layer(name, Layer.Type.LogicalNot); + layer.inputs = new[] { ResolveInput(input) }; + + m_Model.layers.Add(layer); + + return layer; + } + + private Layer Pad(Layer.Type type, string name, object input, Int32[] pad, float constantValue = 0.0f) + { + Layer layer = new Layer(name, type); + layer.inputs = new[] { ResolveInput(input) }; + layer.beta = constantValue; + layer.pad = pad; + + m_Model.layers.Add(layer); + + return layer; + } + /// + /// Pads H and W dimension with a given constant value (default to 0). + /// Pad should be of size 4 and format is [pre W, pre H, post W, post H]. + /// If pad contain negative values H and W dimensions will be cropped instead. + /// + /// For example a tensor of shape(1,2,3,1) + /// [1, 2, 3], + /// [4, 5, 6] + /// + /// With pad [2, 1, 2, 1] + /// + /// Result in a tensor of shape(1,4,7,1) + /// [0, 0, 0, 0, 0, 0, 0], + /// [0, 0, 1, 2, 3, 0, 0], + /// [0, 0, 4, 5, 6, 0, 0], + /// [0, 0, 0, 0, 0, 0, 0] + /// + public Layer Border2D(string name, object input, Int32[] pad, float constantValue = 0.0f) + { + return Pad(Layer.Type.Border2D, name, input, pad, constantValue); + } + + /// + /// Pads H and W dimension by repeating the edge values of the input. + /// Pad should be of size 4 and format is [pre W, pre H, post W, post H]. + /// + /// For example a tensor of shape(1,2,3,1): + /// [1, 2, 3], + /// [4, 5, 6] + /// + /// With pad [2, 1, 2, 1] + /// + /// Result in a tensor of shape(1,4,7,1) + /// [1, 1, 1, 2, 3, 3, 3], + /// [1, 1, 1, 2, 3, 3, 3], + /// [4, 4, 4, 5, 6, 6, 6], + /// [4, 4, 4, 5, 6, 6, 6] + /// + public Layer Pad2DEdge(string name, object input, Int32[] pad) + { + return Pad(Layer.Type.Pad2DEdge, name, input, pad); + } + + /// + /// Pads H and W dimension by mirroring on the first and last values along those axis. + /// Pad should be of size 4 and format is [pre W, pre H, post W, post H]. 
+ /// + /// For example a tensor of shape(1,2,3,1): + /// [1, 2, 3], + /// [4, 5, 6] + /// + /// With pad [2, 1, 2, 1] + /// + /// Result in a tensor of shape(1,4,7,1) + /// [6, 5, 4, 5, 6, 5, 4], + /// [3, 2, 1, 2, 3, 2, 1], + /// [6, 5, 4, 5, 6, 5, 4], + /// [3, 2, 1, 2, 3, 2, 1] + /// + public Layer Pad2DReflect(string name, object input, Int32[] pad) + { + return Pad(Layer.Type.Pad2DReflect, name, input, pad); + } + + /// + /// Pads H and W dimension with symmetric replication along those axis. + /// Pad should be of size 4 and format is [pre W, pre H, post W, post H]. + /// + /// For example a tensor of shape(1,2,3,1): + /// [1, 2, 3], + /// [4, 5, 6] + /// + /// With pad [2, 1, 2, 1] + /// + /// Result in a tensor of shape(1,4,7,1) + /// [2, 1, 1, 2, 3, 3, 2], + /// [2, 1, 1, 2, 3, 3, 2], + /// [5, 4, 4, 5, 6, 6, 5], + /// [5, 4, 4, 5, 6, 6, 5] + /// + public Layer Pad2Symmetric(string name, object input, Int32[] pad) + { + return Pad(Layer.Type.Pad2DSymmetric, name, input, pad); + } + + public Layer RandomNormal(string name, float mean, float scale, float seed, object input) + { + Layer layer = new Layer(name, Layer.Type.RandomNormal); + layer.inputs = new[] { ResolveInput(input) }; + layer.alpha = scale; + layer.beta = mean; + layer.pad = new int[1] {(int)seed}; + m_Model.layers.Add(layer); + + return layer; + } + + public Layer RandomNormal(string name, float mean, float scale, float seed, Int32[] shape) + { + Layer layer = new Layer(name, Layer.Type.RandomNormal); + layer.alpha = scale; + layer.beta = mean; + layer.pad = new int[1] {(int)seed}; + layer.pool = shape; + m_Model.layers.Add(layer); + + return layer; + } + + public Layer RandomUniform(string name, float min, float max, float seed, object input) + { + Layer layer = new Layer(name, Layer.Type.RandomUniform); + layer.inputs = new[] { ResolveInput(input) }; + layer.alpha = (max-min); + layer.beta = min; + layer.pad = new int[1] {(int)seed}; + m_Model.layers.Add(layer); + + return layer; + } + + public Layer RandomUniform(string name, float min, float max, float seed, Int32[] shape) + { + Layer layer = new Layer(name, Layer.Type.RandomUniform); + layer.alpha = (max-min); + layer.beta = min; + layer.pad = new int[1] {(int)seed}; + layer.pool = shape; + m_Model.layers.Add(layer); + + return layer; + } + } +} diff --git a/Assets/Coach-ML/Barracuda/Core/ModelBuilder.cs.meta b/Assets/Coach-ML/Barracuda/Core/ModelBuilder.cs.meta new file mode 100644 index 0000000..dc029c0 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/ModelBuilder.cs.meta @@ -0,0 +1,3 @@ +fileFormatVersion: 2 +guid: 19ceced96eb441539830855be9d99f12 +timeCreated: 1566476409 \ No newline at end of file diff --git a/Assets/Coach-ML/Barracuda/Core/ModelLoader.cs b/Assets/Coach-ML/Barracuda/Core/ModelLoader.cs new file mode 100644 index 0000000..67e3b7d --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/ModelLoader.cs @@ -0,0 +1,265 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Runtime.CompilerServices; +using UnityEngine; +using UnityEngine.Assertions; +using UnityEngine.Profiling; + +[assembly: InternalsVisibleTo("Unity.Barracuda.Tests")] + +namespace Barracuda { + +public static class ModelLoader +{ + /// + /// Return an object oriented representation (aka: `Model`) of a neural network from a binary representation of type `NNModel`. + /// By default details are logged to the console, set `verbose` to false to load silently. 
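+ /// Editor's sketch of typical usage (the asset and file names below are placeholders):
+ ///   var model = ModelLoader.Load(modelAsset, verbose: false);           // NNModel reference, e.g. assigned in the Inspector
+ ///   var other = ModelLoader.LoadFromStreamingAssets("model.bc", false); // raw .bc file shipped with the player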
+ /// + public static Model Load(NNModel model, bool verbose = true) + { + return Load(model.Value, verbose); + } + + /// + /// Return an object oriented representation (aka: `Model`) of a neural network from a `.bc` file from the the streaming asset folder. + /// By default details are logged to the console, set verbose to false to load silently. + /// + public static Model LoadFromStreamingAssets(string filename, bool verbose = true) + { + return Load(Path.Combine(Application.streamingAssetsPath, filename), verbose); + } + + /// + /// Return an object oriented representation (aka: `Model`) of a neural network from a `.bc` file. + /// By default details are logged to the console, set verbose to false to load silently. + /// + public static Model Load(string filepath, bool verbose = true) + { + return Load(Open(filepath), verbose); + } + + /// + /// Return an object oriented representation (aka: `Model`) of a neural network from a byte[] array. + /// By default details are logged to the console, set verbose to false to load silently. + /// + public static Model Load(byte[] stream, bool verbose = true) + { + return Load(Open(stream), verbose); + } + + #region Private and internal + + internal static Model Load(byte[] stream, bool verbose = true, bool applyPatching = true) + { + return Load(Open(stream), verbose, applyPatching); + } + + private static Model Load(BinaryReader fileReader, bool verbose = true, bool applyPatching = true) + { + using (BinaryReader file = fileReader) + { + Profiler.BeginSample("Barracuda.LoadLayers"); + + Model model = new Model(); + List layers = new List(); + + long version = file.ReadInt64() % 0xff; // magic + if (version != Model.Version) + throw new NotSupportedException($"Format version not supported: {version}"); + + var count = file.ReadInt32(); + model.inputs = new List(count); + for (var i = 0; i < count; ++i) + { + model.inputs.Add(new Model.Input {name = ReadString(file), shape = ReadInt32Array(file)}); + } + + model.outputs = ReadStringArray(file).ToList(); + + count = file.ReadInt32(); + model.memories = new List(count); + for (var m = 0; m < count; ++m) + { + model.memories.Add(new Model.Memory + { + shape = new TensorShape(ReadInt32Array(file)), + input = ReadString(file), + output = ReadString(file) + }); + } + + int numberOfLayers = file.ReadInt32(); + for (var l = 0; l < numberOfLayers; ++l) + { + var name = ReadString(file); + var type = (Layer.Type)file.ReadInt32(); + var activation = (Layer.Activation)file.ReadInt32(); + Layer layer = new Layer(name, type, activation); + ReadInt32Array(file); // dummy + ReadInt32Array(file); // dummy + layer.pad = ReadInt32Array(file); + layer.stride = ReadInt32Array(file); + layer.pool = ReadInt32Array(file); + layer.axis = file.ReadInt32(); + layer.alpha = file.ReadSingle(); + layer.beta = file.ReadSingle(); + ReadInt32Array(file); // dummy + + layer.inputs = ReadStringArray(file); + + layer.datasets = new Layer.DataSet[file.ReadInt32()]; + for (var i = 0; i < layer.datasets.Length; ++i) + { + layer.datasets[i].name = ReadString(file); + layer.datasets[i].shape = new TensorShape(ReadInt32Array(file)); + layer.datasets[i].offset = file.ReadInt64(); + layer.datasets[i].itemSizeInBytes = file.ReadInt32(); + layer.datasets[i].length = file.ReadInt32(); + } + + layers.Add(layer); + + if (verbose) + D.Log( + $"layer {l}, {layer.name} type: {layer.type} " + + $"{((layer.activation != Layer.Activation.None) ? 
$"activation {layer.activation} " : "")}" + + $"tensors: {layer.datasets.Length} inputs: {String.Join(",", layer.inputs)}"); + + if (verbose) + foreach (var t in layer.datasets) + D.Log($" Tensor: {t.shape} offset: {t.offset} len: {t.length}"); + + if(applyPatching) + PatchLayer(layers, layer); + } + model.layers = layers; + + Int64 dataLength = 0; + for (var l = 0; l < model.layers.Count; ++l) + for (var d = 0; d < model.layers[l].datasets.Length; ++d) + dataLength += model.layers[l].datasets[d].length; + + Profiler.EndSample(); + + Profiler.BeginSample("Barracuda.AllocModel"); + var sharedWeights = new float[dataLength]; + Profiler.EndSample(); + + Profiler.BeginSample("Barracuda.LoadModel"); + byte[] bytes = file.ReadBytes(Convert.ToInt32(dataLength * sizeof(float))); // @TODO: support larger than MaxInt32 data blocks + Buffer.BlockCopy(bytes, 0, sharedWeights, 0, bytes.Length); + Profiler.EndSample(); + + for (var l = 0; l < model.layers.Count; ++l) + model.layers[l].weights = sharedWeights; + + //Importer Reporting + try + { + model.IrSource = ReadString(file); + model.IrVersion = ReadString(file); + model.ProducerName = ReadString(file); + var numWarnings = file.ReadInt32(); + for (var i = 0; i < numWarnings; ++i) + { + model.Warnings.Add(new Model.ImporterWarning(ReadString(file), ReadString(file))); + } + } + catch (EndOfStreamException) + { + //Do nothing Importer Reporting data might not be present for backward compatibility reasons + } + + return model; + } + } + + private static void PatchLayer(List layers, Layer layer) + { + // Split Load so that each constant tensor gets its own layer + // for the sake of simplicity of the execution code + if (layer.type == Layer.Type.Load && + layer.datasets.Length > 1) + { + foreach (var t in layer.datasets) + { + Layer layerC = new Layer(t.name, Layer.Type.Load);// load using tensor name + layerC.inputs = layer.inputs; + layerC.datasets = new[] { t }; + + layers.Add(layerC); + } + + // patch original layer + layer.name = layer.name + "_nop"; + layer.type = Layer.Type.Nop; + } + + // Split Activation part into separate layer for the sake of simplicity of the execution code + // NOTE: Keras specific. 
Only Keras exporter packs both Dense/Conv and Activation into the same layer + // @TODO: move layer split directly into Keras exporter + if (layer.type != Layer.Type.Activation && + layer.activation != Layer.Activation.None) + { + var affineOutput = layer.name + "_tmp"; + + Layer layerA = new Layer(layer.name, layer.activation);// take the original layer name + layerA.inputs = new[] { affineOutput }; + + // patch original layer + layer.name = affineOutput; + layer.activation = Layer.Activation.None; + Assert.AreEqual(layers[layers.Count-1].name, layer.name); + Assert.AreEqual(layers[layers.Count-1].activation, layer.activation); + + layers.Add(layerA); + } + + // @TODO: Enable Dropout + // @TEMP: disabled runtime Dropout noise to get more predictable results for auto testing + if (layer.type == Layer.Type.Dropout) + { + layer.type = Layer.Type.Activation; + layer.activation = Layer.Activation.None; + } + } + + private static Int32[] ReadInt32Array(BinaryReader file) + { + var arr = new Int32[file.ReadInt32()]; + for (var i = 0; i < arr.Length; ++i) + arr[i] = file.ReadInt32(); + return arr; + } + + private static string ReadString(BinaryReader file) + { + var len = file.ReadInt32(); + var chars = file.ReadChars(len); + return new string(chars); + } + + private static string[] ReadStringArray(BinaryReader file) + { + var arr = new string[file.ReadInt32()]; + for (var i = 0; i < arr.Length; ++i) + arr[i] = ReadString(file); + return arr; + } + + private static BinaryReader Open(string filename) + { + return new BinaryReader(new FileStream(filename, FileMode.Open, FileAccess.Read)); + } + + private static BinaryReader Open(byte[] bytes) + { + return new BinaryReader(new MemoryStream(bytes, false)); + } + #endregion +} + + +} // namespace Barracuda diff --git a/Assets/Coach-ML/Barracuda/Core/ModelLoader.cs.meta b/Assets/Coach-ML/Barracuda/Core/ModelLoader.cs.meta new file mode 100644 index 0000000..8fe55d1 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/ModelLoader.cs.meta @@ -0,0 +1,12 @@ +fileFormatVersion: 2 +guid: ab890607c8319490aaa5d1dee1fc4069 +timeCreated: 1495569481 +licenseType: Pro +MonoImporter: + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Core/ModelWriter.cs b/Assets/Coach-ML/Barracuda/Core/ModelWriter.cs new file mode 100644 index 0000000..5eee2ae --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/ModelWriter.cs @@ -0,0 +1,146 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Threading; + +using UnityEngine; +using UnityEngine.Assertions; +using UnityEngine.Profiling; + +namespace Barracuda { + + + public class ModelWriter + { + public static void Save(string fileName, Model model, bool verbose = false) + { + BinaryWriter writer = new BinaryWriter(File.Open(fileName, FileMode.Create)); + Save(writer, model, verbose); + writer.Close(); + } + + public static void Save(BinaryWriter writer, Model model, bool verbose = false) + { + Profiler.BeginSample("Barracuda.ModelWriter.Save"); + + writer.Write((long)Model.Version); + + writer.Write(model.inputs.Count); + for (var i = 0; i < model.inputs.Count; ++i) + { + WriteString(writer, model.inputs[i].name); + WriteInt32Array(writer, model.inputs[i].shape); + } + WriteStringArray(writer, model.outputs); + + writer.Write(model.memories.Count); + for (var m = 0; m < model.memories.Count; ++m) + { + WriteInt32Array(writer, model.memories[m].shape.ToArray()); + 
WriteString(writer, model.memories[m].input); + WriteString(writer, model.memories[m].output); + } + + // Write layers + long offsetFromModelStartToLayer = 0; + writer.Write(model.layers.Count); + for (var l = 0; l < model.layers.Count; ++l) + { + Layer layer = model.layers[l]; + WriteString(writer, layer.name); + writer.Write((Int32)layer.type); + writer.Write((Int32)layer.activation); + writer.Write(0); //dummy 0 size array + writer.Write(0); //dummy 0 size array + WriteInt32Array(writer, layer.pad); + WriteInt32Array(writer, layer.stride); + WriteInt32Array(writer, layer.pool); + writer.Write(layer.axis); + writer.Write(layer.alpha); + writer.Write(layer.beta); + writer.Write(0); //dummy 0 size array + + WriteStringArray(writer, layer.inputs); + + long offsetFromLayerStart = 0; + writer.Write(layer.datasets.Length); + for (var i = 0; i < layer.datasets.Length; ++i) + { + WriteString(writer, layer.datasets[i].name); + WriteInt32Array(writer, layer.datasets[i].shape.ToArray()); + // Recalculate all offsets to be global inside the model + // this way weights can be stored in one block at the end of the file + Assert.AreEqual(offsetFromLayerStart, layer.datasets[i].offset - layer.datasets[0].offset); + writer.Write(offsetFromModelStartToLayer + offsetFromLayerStart); + writer.Write(layer.datasets[i].itemSizeInBytes); + writer.Write(layer.datasets[i].length); + offsetFromLayerStart += layer.datasets[i].length; + } + offsetFromModelStartToLayer += offsetFromLayerStart; + + if (verbose) + D.Log("layer " + l + ", " + layer.name + " type: " + layer.type.ToString() + + ((layer.activation != Layer.Activation.None) ? " activation " + layer.activation : "") + + " tensors: " + layer.datasets.Length + + " inputs: " + String.Join(",", layer.inputs)); + + if (verbose) + foreach (var t in layer.datasets) + D.Log(" Tensor: " + t.shape + " offset: " + t.offset + " len: " + t.length); + } + + // Write tensor data + for (var l = 0; l < model.layers.Count; ++l) + { + for (var d = 0; d < model.layers[l].datasets.Length; ++d) + { + byte[] dst = new byte[model.layers[l].datasets[d].length * sizeof(float)]; + Buffer.BlockCopy(model.layers[l].weights, (int)(model.layers[l].datasets[d].offset * sizeof(float)), dst, 0, dst.Length); + writer.Write(dst); + } + } + + WriteString(writer, model.IrSource); + WriteString(writer, model.IrVersion); + WriteString(writer, model.ProducerName); + int numWarnings = model.Warnings.Count; + writer.Write(numWarnings); + for (var i = 0; i < numWarnings; ++i) + { + WriteString(writer, model.Warnings[i].LayerName); + WriteString(writer, model.Warnings[i].Message); + } + + Profiler.EndSample(); + } + + + + static void WriteInt32Array(BinaryWriter writer, Int32[] arr) + { + writer.Write(arr.Length); + for (var i = 0; i < arr.Length; ++i) + writer.Write(arr[i]); + } + + static void WriteString(BinaryWriter writer, string str) + { + writer.Write(str.Length); + writer.Write(str.ToCharArray()); + } + + static void WriteStringArray(BinaryWriter writer, string[] strArray) + { + writer.Write(strArray.Length); + foreach(string str in strArray) + WriteString(writer, str); + } + + static void WriteStringArray(BinaryWriter writer, List strArray) + { + writer.Write(strArray.Count); + foreach(string str in strArray) + WriteString(writer, str); + } + } +} // namespace Barracuda diff --git a/Assets/Coach-ML/Barracuda/Core/ModelWriter.cs.meta b/Assets/Coach-ML/Barracuda/Core/ModelWriter.cs.meta new file mode 100644 index 0000000..63067f0 --- /dev/null +++ 
b/Assets/Coach-ML/Barracuda/Core/ModelWriter.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: 444f70d41cf065440a76d75c1a3d47e1 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Core/PluginInterfaces.cs b/Assets/Coach-ML/Barracuda/Core/PluginInterfaces.cs new file mode 100644 index 0000000..599b54e --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/PluginInterfaces.cs @@ -0,0 +1,66 @@ +using System; +using System.Collections.Generic; +using UnityEngine; + +namespace Barracuda +{ + public interface BLASPlugin + { + bool IsCurrentPlatformSupported(); + unsafe void SGEMM(float* Ap, int AN, int AM, + float* Bp, int BN, int BM, + float* Cp, int CN, int CM, int bs, + bool transposeA = false, bool transposeB = false); + } + + public class BLASPluginFactory + { + public static BLASPlugin CreateBLASPlugin() + { + BLASPlugin blas = null; + + // TODO make plugins discoverable via custom attributes + Stack plugins = new Stack(); + plugins.Push(typeof(CSharpBLAS).FullName); + plugins.Push("BurstBLAS"); + + if (Application.platform == RuntimePlatform.IPhonePlayer) + plugins.Push("iOSBLAS"); + else if (Application.platform == RuntimePlatform.OSXPlayer || Application.platform == RuntimePlatform.OSXEditor) + plugins.Push("MacBLAS"); + + while (plugins.Count > 0) + { + var candidate = plugins.Pop(); + D.Log($"Probing {candidate}"); + foreach (var assembly in AppDomain.CurrentDomain.GetAssemblies()) + { + var t = assembly.GetType(candidate); + if (t != null) + { + try + { + var inst = Activator.CreateInstance(t) as BLASPlugin; + + if (inst != null && inst.IsCurrentPlatformSupported()) + { + blas = inst; + } + } + catch (Exception e) + { + D.LogWarning($"Failed to load {t} with exception {e}"); + break; + } + } + } + + // Found working candidate + if (blas != null) + break; + } + + return blas; + } + } +} \ No newline at end of file diff --git a/Assets/Coach-ML/Barracuda/Core/PluginInterfaces.cs.meta b/Assets/Coach-ML/Barracuda/Core/PluginInterfaces.cs.meta new file mode 100644 index 0000000..2eb693a --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/PluginInterfaces.cs.meta @@ -0,0 +1,3 @@ +fileFormatVersion: 2 +guid: cb590b30d6c1477e9316410e67c4c568 +timeCreated: 1538563588 \ No newline at end of file diff --git a/Assets/Coach-ML/Barracuda/Core/Resources.meta b/Assets/Coach-ML/Barracuda/Core/Resources.meta new file mode 100644 index 0000000..36457c8 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Resources.meta @@ -0,0 +1,9 @@ +fileFormatVersion: 2 +guid: 5db9da9dbc7ab4c73bd6cec144c0f1bc +folderAsset: yes +timeCreated: 1506450657 +licenseType: Pro +DefaultImporter: + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Resources/Activation.compute b/Assets/Coach-ML/Barracuda/Core/Resources/Activation.compute similarity index 95% rename from Assets/Coach-ML/Barracuda/Resources/Activation.compute rename to Assets/Coach-ML/Barracuda/Core/Resources/Activation.compute index 30701e6..2207562 100644 --- a/Assets/Coach-ML/Barracuda/Resources/Activation.compute +++ b/Assets/Coach-ML/Barracuda/Core/Resources/Activation.compute @@ -1,34 +1,46 @@ #pragma kernel Neg_Flat +#pragma kernel Neg_FlatStrict #pragma kernel Neg_Loop #pragma kernel Reciprocal_Flat #pragma kernel Reciprocal_Loop #pragma kernel Relu_Flat +#pragma kernel Relu_FlatStrict #pragma kernel Relu_Loop #pragma kernel Relu6_Flat +#pragma kernel 
Relu6_FlatStrict #pragma kernel Relu6_Loop #pragma kernel PRelu_Flat #pragma kernel PRelu_Loop #pragma kernel Selu_Flat #pragma kernel Selu_Loop #pragma kernel Tanh_Flat +#pragma kernel Tanh_FlatStrict #pragma kernel Tanh_Loop #pragma kernel Swish_Flat +#pragma kernel Swish_FlatStrict #pragma kernel Swish_Loop #pragma kernel Sigmoid_Flat +#pragma kernel Sigmoid_FlatStrict #pragma kernel Sigmoid_Loop #pragma kernel Elu_Flat +#pragma kernel Elu_FlatStrict #pragma kernel Elu_Loop #pragma kernel LeakyRelu_Flat +#pragma kernel LeakyRelu_FlatStrict #pragma kernel LeakyRelu_Loop #pragma kernel Exp_Flat +#pragma kernel Exp_FlatStrict #pragma kernel Exp_Loop #pragma kernel Log_Flat +#pragma kernel Log_FlatStrict #pragma kernel Log_Loop #pragma kernel Sqrt_Flat #pragma kernel Sqrt_Loop #pragma kernel Pow_Flat +#pragma kernel Pow_FlatStrict #pragma kernel Pow_Loop #pragma kernel LogicalNot_Flat +#pragma kernel LogicalNot_FlatStrict #pragma kernel LogicalNot_Loop #pragma kernel Clip_Flat #pragma kernel Clip_Loop @@ -125,6 +137,23 @@ void name##_Flat (uint3 dispatchThreadID : SV_DispatchThreadID)\ O.Set(i, v);\ } +#define FLAT_ACTIVATION_STRICT(name, op_name) \ +void name##_FlatStrict (uint3 groupId : SV_GroupID, uint3 groupThreadId : SV_GroupThreadID)\ +{\ + DISPATCH_ARGS(O.length/2, 1, 1)\ + TENSOR_ARGS2(X, O);\ +\ + uint numThreadsPerTG = NUMTHREAD(512, 128, 64);\ + uint i1 = (groupId.x * 2 + 0) * numThreadsPerTG + groupThreadId.x;\ + uint i2 = (groupId.x * 2 + 1) * numThreadsPerTG + groupThreadId.x;\ + float v1 = X.Get(i1);\ + float v2 = X.Get(i2);\ + v1 = op_name (v1);\ + v2 = op_name (v2);\ + O.Set(i1, v1);\ + O.Set(i2, v2);\ +} + #define LOOP_ACTIVATION(name, op_name) \ void name##_Loop (uint3 dispatchThreadID : SV_DispatchThreadID)\ {\ @@ -146,6 +175,8 @@ void name##_Loop (uint3 dispatchThreadID : SV_DispatchThreadID)\ NUMTHREADS((512,1,1), (128,1,1), (64,1,1))\ FLAT_ACTIVATION(name, op_name)\ NUMTHREADS((512,1,1), (128,1,1), (64,1,1))\ +FLAT_ACTIVATION_STRICT(name, op_name)\ +NUMTHREADS((512,1,1), (128,1,1), (64,1,1))\ LOOP_ACTIVATION(name, op_name) float relu(float v) @@ -208,7 +239,7 @@ float tanh_safe(float x) return tanh(clamp(x,-16.0f,16.0f));//clamp to avoid NaNs for large values. 
} -float clip(float v) +float activation_clip(float v) { return clamp(v, _Alpha, _Beta); } @@ -229,7 +260,7 @@ ACTIVATION(Log, log) ACTIVATION(Sqrt, sqrt) ACTIVATION(Pow, signed_pow) ACTIVATION(LogicalNot, logical_not) -ACTIVATION(Clip, clip) +ACTIVATION(Clip, activation_clip) // ------------------- @@ -480,7 +511,7 @@ void Clip(uint3 dispatchThreadID : SV_DispatchThreadID) for (uint n = 0; n < X.batch; ++n) { float v = X.Get(n, y, x, c); - v = clip(v); + v = activation_clip(v); O.Set(n, y, x, c, v); } } @@ -1032,7 +1063,7 @@ void Clip_CNyx(uint3 dispatchThreadID : SV_DispatchThreadID) if (n >= X.batch) return; float v = X.Get(n, y, x, c); - v = clip(v); + v = activation_clip(v); O.Set(n, y, x, c, v); } @@ -1054,7 +1085,7 @@ void Clip_Nyxc(uint3 dispatchThreadID : SV_DispatchThreadID) if (n >= X.batch) return; float v = X.Get(n, y, x, c); - v = clip(v); + v = activation_clip(v); O.Set(n, y, x, c, v); } @@ -1107,7 +1138,8 @@ void LogSoftmax(uint3 dispatchThreadID : SV_DispatchThreadID) if (y >= O.GetFlatHeight()) return; float maxV = -FLT_MAX; - for (uint i = 0; i < X.GetFlatWidth(); ++i) + uint i; + for (i = 0; i < X.GetFlatWidth(); ++i) { float v = X.Get(y, i); if (v > maxV) diff --git a/Assets/Coach-ML/Barracuda/Resources/Activation.compute.meta b/Assets/Coach-ML/Barracuda/Core/Resources/Activation.compute.meta similarity index 100% rename from Assets/Coach-ML/Barracuda/Resources/Activation.compute.meta rename to Assets/Coach-ML/Barracuda/Core/Resources/Activation.compute.meta diff --git a/Assets/Coach-ML/Barracuda/Resources/BarracudaReferenceImpl.compute b/Assets/Coach-ML/Barracuda/Core/Resources/BarracudaReferenceImpl.compute similarity index 99% rename from Assets/Coach-ML/Barracuda/Resources/BarracudaReferenceImpl.compute rename to Assets/Coach-ML/Barracuda/Core/Resources/BarracudaReferenceImpl.compute index 33530ac..0da2662 100644 --- a/Assets/Coach-ML/Barracuda/Resources/BarracudaReferenceImpl.compute +++ b/Assets/Coach-ML/Barracuda/Core/Resources/BarracudaReferenceImpl.compute @@ -9,7 +9,7 @@ #pragma kernel AvgPool2D #pragma kernel GlobalMaxPool2D #pragma kernel GlobalAvgPool2D -#pragma kernal GlobalAvgVariancePool2D +#pragma kernel GlobalAvgVariancePool2D #pragma kernel ScaleBias #pragma kernel InstanceNorm #pragma kernel Dropout @@ -76,7 +76,6 @@ float _Alpha; float _Beta; float _Epsilon; float _Seed; - [numthreads(8,8,1)] void Dense(uint3 dispatchThreadID : SV_DispatchThreadID) { @@ -970,7 +969,8 @@ void LogSoftmax(uint3 dispatchThreadID : SV_DispatchThreadID) if (y >= O.GetFlatHeight()) return; float maxV = -FLT_MAX; - for (uint i = 0; i < X.GetFlatWidth(); ++i) + uint i; + for (i = 0; i < X.GetFlatWidth(); ++i) { float v = X.Get(y, i); if (v > maxV) @@ -1245,9 +1245,10 @@ void Conv2DWinograd_2x2_3x3(uint3 dispatchThreadID : SV_DispatchThreadID, uint3 for (uint c = 0; c < X.channels; ++c) { float4x4 d = 0; + int ix; // 16 loads per thread - for (int ix = 0; ix < 4; ix++) + for (ix = 0; ix < 4; ix++) { for (int iy = 0; iy < 4; iy++) { @@ -1258,7 +1259,7 @@ void Conv2DWinograd_2x2_3x3(uint3 dispatchThreadID : SV_DispatchThreadID, uint3 // transform kernel -- N.B: do this offline offline float3x3 g; - for (int ix = 0; ix < 3; ix++) + for (ix = 0; ix < 3; ix++) { for (int iy = 0; iy < 3; iy++) { diff --git a/Assets/Coach-ML/Barracuda/Resources/BarracudaReferenceImpl.compute.meta b/Assets/Coach-ML/Barracuda/Core/Resources/BarracudaReferenceImpl.compute.meta similarity index 100% rename from Assets/Coach-ML/Barracuda/Resources/BarracudaReferenceImpl.compute.meta rename to 
Assets/Coach-ML/Barracuda/Core/Resources/BarracudaReferenceImpl.compute.meta diff --git a/Assets/Coach-ML/Barracuda/Resources/Broadcast.compute b/Assets/Coach-ML/Barracuda/Core/Resources/Broadcast.compute similarity index 100% rename from Assets/Coach-ML/Barracuda/Resources/Broadcast.compute rename to Assets/Coach-ML/Barracuda/Core/Resources/Broadcast.compute diff --git a/Assets/Coach-ML/Barracuda/Resources/Broadcast.compute.meta b/Assets/Coach-ML/Barracuda/Core/Resources/Broadcast.compute.meta similarity index 100% rename from Assets/Coach-ML/Barracuda/Resources/Broadcast.compute.meta rename to Assets/Coach-ML/Barracuda/Core/Resources/Broadcast.compute.meta diff --git a/Assets/Coach-ML/Barracuda/Resources/Conv.compute b/Assets/Coach-ML/Barracuda/Core/Resources/Conv.compute similarity index 94% rename from Assets/Coach-ML/Barracuda/Resources/Conv.compute rename to Assets/Coach-ML/Barracuda/Core/Resources/Conv.compute index 427d59e..97171e8 100644 --- a/Assets/Coach-ML/Barracuda/Resources/Conv.compute +++ b/Assets/Coach-ML/Barracuda/Core/Resources/Conv.compute @@ -9,7 +9,12 @@ #pragma kernel DepthwiseConv2D #pragma kernel Conv2DTrans -#pragma kernel Conv2DTrans_KernelCached + +//Tested 2x2, 3x3 and 5x5 kernels with groupsize [8,8], [8,16], [16,16] and [16,32] (this one not in 5x5 as it does not fit in 32k) +//k=5x5 t=[16,16] fast consistently faster or equal to other configuration both on AMDVega and RTX2080 (tested with kernel size 2x2x32x32, input size 128x128x32) +//however this configuration is quite LDS bound performance profile might be very different on hardware without on chip LDS. This is especially true for smaller kernel +//as a lot of LDS will be reserved but not used, reducing the amount of cache used. +#pragma kernel Conv2DTrans_KernelCached_K5x5_T16x16 MAX_KERNEL_SIZE=5 GROUP_SIZE_X=16 GROUP_SIZE_Y=16 #include "Tensor.cginc" @@ -557,18 +562,16 @@ void Conv2DTrans(uint3 dispatchThreadID : SV_DispatchThreadID) } } -#undef GROUP_SIZE -#define GROUP_SIZE 16 -#undef MAX_KERNEL_SIZE -#define MAX_KERNEL_SIZE 5 -groupshared float Conv2DTrans_SharedKernel[MAX_KERNEL_SIZE][MAX_KERNEL_SIZE][GROUP_SIZE*GROUP_SIZE]; +#if defined(MAX_KERNEL_SIZE) && defined(GROUP_SIZE_X) && defined(GROUP_SIZE_Y) +#define CONV2DTRANS_NAME(KERNEL,TGX,TGY) Conv2DTrans_KernelCached_K##KERNEL##x##KERNEL##_T##TGX##x##TGY +groupshared float Conv2DTrans_SharedKernel[MAX_KERNEL_SIZE][MAX_KERNEL_SIZE][GROUP_SIZE_X*GROUP_SIZE_Y]; groupshared float Conv2DTrans_SharedBias; -[numthreads(1,GROUP_SIZE,GROUP_SIZE)] -void Conv2DTrans_KernelCached(uint3 dispatchThreadID : SV_DispatchThreadID, uint groupIndex: SV_GroupIndex) +[numthreads(1,GROUP_SIZE_X,GROUP_SIZE_Y)] +void CONV2DTRANS_NAME(MAX_KERNEL_SIZE, GROUP_SIZE_X,GROUP_SIZE_Y)(uint3 dispatchThreadID : SV_DispatchThreadID, uint groupIndex: SV_GroupIndex) { //Constraints: - // C <= GROUP_SIZE*GROUP_SIZE - // K <= 5x5 + // C <= GROUP_SIZE_X*GROUP_SIZE_Y + // K <= MAX_KERNEL_SIZExMAX_KERNEL_SIZE DISPATCH_ARGS(K.kernelCount, O.width, O.height); TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); @@ -581,8 +584,8 @@ void Conv2DTrans_KernelCached(uint3 dispatchThreadID : SV_DispatchThreadID, uint // a thread group = handle 1 feature in a GROUP_SIZExGROUP_SIZE x,y region, it loop other all batch, input channel count need to be <= GROUP_SIZE*GROUP_SIZE //LDS allocation - // we have 1 feature and up to GROUP_SIZE*GROUP_SIZE channels per thread group, batch all use the same kernels, - // thus LDS is [MAX_KERNEL_SIZE][MAX_KERNEL_SIZE][GROUP_SIZE*GROUP_SIZE] + // we have 1 feature and up to 
GROUP_SIZE_X*GROUP_SIZE_Y channels per thread group, batch all use the same kernels, + // thus LDS is [MAX_KERNEL_SIZE][MAX_KERNEL_SIZE][GROUP_SIZE_X*GROUP_SIZE_Y] //Loading to LDS // Each threads load a 2D kernel for a different channel into LDS @@ -632,3 +635,5 @@ void Conv2DTrans_KernelCached(uint3 dispatchThreadID : SV_DispatchThreadID, uint O.Set(n, y, x, k, acc); } } +#undef CONV2DTRANS_NAME +#endif //defined(MAX_KERNEL_SIZE) && defined(GROUP_SIZE_X) && defined(GROUP_SIZE_Y) diff --git a/Assets/Coach-ML/Barracuda/Resources/Conv.compute.meta b/Assets/Coach-ML/Barracuda/Core/Resources/Conv.compute.meta similarity index 100% rename from Assets/Coach-ML/Barracuda/Resources/Conv.compute.meta rename to Assets/Coach-ML/Barracuda/Core/Resources/Conv.compute.meta diff --git a/Assets/Coach-ML/Barracuda/Resources/Dense.compute b/Assets/Coach-ML/Barracuda/Core/Resources/Dense.compute similarity index 100% rename from Assets/Coach-ML/Barracuda/Resources/Dense.compute rename to Assets/Coach-ML/Barracuda/Core/Resources/Dense.compute diff --git a/Assets/Coach-ML/Barracuda/Resources/Dense.compute.meta b/Assets/Coach-ML/Barracuda/Core/Resources/Dense.compute.meta similarity index 100% rename from Assets/Coach-ML/Barracuda/Resources/Dense.compute.meta rename to Assets/Coach-ML/Barracuda/Core/Resources/Dense.compute.meta diff --git a/Assets/Coach-ML/Barracuda/Resources/DenseFP16.compute b/Assets/Coach-ML/Barracuda/Core/Resources/DenseFP16.compute similarity index 100% rename from Assets/Coach-ML/Barracuda/Resources/DenseFP16.compute rename to Assets/Coach-ML/Barracuda/Core/Resources/DenseFP16.compute diff --git a/Assets/Coach-ML/Barracuda/Resources/DenseFP16.compute.meta b/Assets/Coach-ML/Barracuda/Core/Resources/DenseFP16.compute.meta similarity index 100% rename from Assets/Coach-ML/Barracuda/Resources/DenseFP16.compute.meta rename to Assets/Coach-ML/Barracuda/Core/Resources/DenseFP16.compute.meta diff --git a/Assets/Coach-ML/Barracuda/Resources/Generic.compute b/Assets/Coach-ML/Barracuda/Core/Resources/Generic.compute similarity index 51% rename from Assets/Coach-ML/Barracuda/Resources/Generic.compute rename to Assets/Coach-ML/Barracuda/Core/Resources/Generic.compute index c0579a0..c26921f 100644 --- a/Assets/Coach-ML/Barracuda/Resources/Generic.compute +++ b/Assets/Coach-ML/Barracuda/Core/Resources/Generic.compute @@ -3,29 +3,13 @@ #pragma kernel ScaleBias_CNyx2 #pragma kernel ScaleBias_Flat #pragma kernel ScaleBias_Loop -#pragma kernel Upsample2D -#pragma kernel AvgPool2D -#pragma kernel MaxPool2D -#pragma kernel AvgPool2D_NoPads -#pragma kernel MaxPool2D_NoPads -//#pragma kernel MaxPool2D_Pool2x2_NoPads -#pragma kernel GlobalAvgPool2D -#pragma kernel GlobalAvgVariancePool2D #pragma kernel InstanceNorm #pragma kernel InstanceNormTail_CNyx2 #pragma kernel InstanceNormTail_Flat #pragma kernel InstanceNormTail_Loop +#pragma kernel Upsample2D #pragma kernel Copy -/* -ScaleBias_Flat+ScaleBias_CNyx2 (NEW) vs ScaleBias+ScaleBias_CNyx -Compute Precompiled - -MOBILENET@4 -<<= X.channels) return; - if (x >= X.width) return; - if (y >= X.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - float v = X.Get(n, y, x, c); - - for (uint dy = 0; dy < _Pool.y; ++dy) - for (uint dx = 0; dx < _Pool.x; ++dx) - { - uint oy = y * _Pool.y + dy; - uint ox = x * _Pool.x + dx; - O.Set(n, oy, ox, c, v); - } - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void MaxPool2D(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - DISPATCH_ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = 
dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float maxV = -FLT_MAX; - for (uint dy = 0; dy < _Pool.y; ++dy) - for (uint dx = 0; dx < _Pool.x; ++dx) - { - uint oy = y * _Stride.y + dy; - uint ox = x * _Stride.x + dx; - - bool mask = (oy >= _Pad.y) && (ox >= _Pad.x) && (oy - _Pad.y < X.height) && (ox - _Pad.x < X.width); - float v = (mask)? X.Get(n, oy - _Pad.y, ox - _Pad.x, c): -FLT_MAX; - - maxV = max(v, maxV); - } - - O.Set(n, y, x, c, maxV); - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void AvgPool2D(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - DISPATCH_ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float acc = 0; - float counter = 0; - for (uint dy = 0; dy < _Pool.y; ++dy) - for (uint dx = 0; dx < _Pool.x; ++dx) - { - uint oy = y * _Stride.y + dy; - uint ox = x * _Stride.x + dx; - - bool mask = (oy >= _Pad.y) && (ox >= _Pad.x) && (oy - _Pad.y < X.height) && (ox - _Pad.x < X.width); - acc += (mask)? X.Get(n, oy - _Pad.y, ox - _Pad.x, c): 0; - counter += (mask)? 1: 0; - } - - acc /= counter; - O.Set(n, y, x, c, acc); - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void MaxPool2D_NoPads(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - DISPATCH_ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float maxV = -FLT_MAX; - for (uint dy = 0; dy < _Pool[1]; ++dy) - for (uint dx = 0; dx < _Pool[0]; ++dx) - { - float v = X.Get(n, y * _Stride[1] + dy, x * _Stride[0] + dx, c); - maxV = max(v, maxV); - } - - O.Set(n, y, x, c, maxV); - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -void AvgPool2D_NoPads(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - DISPATCH_ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - float invPoolSize = 1.0f / (_Pool[0] * _Pool[1]); - for (uint n = 0; n < X.batch; ++n) - { - float v = 0; - for (uint dy = 0; dy < _Pool[1]; ++dy) - for (uint dx = 0; dx < _Pool[0]; ++dx) - v += X.Get(n, y * _Stride[1] + dy, x * _Stride[0] + dx, c) * invPoolSize; - - O.Set(n, y, x, c, v); - } -} - -NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) -//NUMTHREADS((16,4,4), (16,4,2), (16,2,2)) -void MaxPool2D_Pool2x2_NoPads(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - DISPATCH_ARGS(O.channels, O.width, O.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= O.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < X.batch; ++n) - { - float v0 = X.Get(n, y*2, x*2, c); - float v1 = X.Get(n, y*2+1, x*2, c); - float v2 = X.Get(n, y*2, x*2+1, c); - float v3 = X.Get(n, y*2+1, x*2+1, c); - float v = max(v0, max(v1, max(v2, v3))); - - O.Set(n, y, x, c, v); - } -} - -[numthreads(32,1,1)] -void GlobalAvgPool2D(uint3 dispatchThreadID : 
SV_DispatchThreadID) -{ - DISPATCH_ARGS(O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - if (c >= O.channels) return; - //ASSERT(X.batch == O.batch) - - for (uint n = 0; n < X.batch; ++n) - { - float v = 0; - for (uint y = 0; y < X.height; ++y) - for (uint x = 0; x < X.width; ++x) - v += X.Get(n, y, x, c); - - v /= (X.height * X.width); - O.Set(n, 0, 0, c, v); - } -} - - - -#undef GROUP_SIZE -#define GROUP_SIZE BARRACUDA_MAX_THREAD_COUNT - -groupshared float AvgVariancePool2D_SharedMean[1][GROUP_SIZE]; -groupshared float AvgVariancePool2D_SharedVariance[1][GROUP_SIZE]; - -[numthreads(1, GROUP_SIZE, 1)] -void GlobalAvgVariancePool2D(uint3 dispatchThreadID : SV_DispatchThreadID, uint3 groupThreadID : SV_GroupThreadID) -{ - DISPATCH_ARGS(O.channels, 1, 1); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - - for (uint n = 0; n < X.batch; ++n) - { - uint tid = groupThreadID.y; - uint q = (X.height * X.width) / GROUP_SIZE; - uint r = (X.height * X.width) % GROUP_SIZE; - - float mean = 0; - float mean2 = 0; - - for (int j = 0; j < q; j++) - { - float v = X.Get(n, tid + GROUP_SIZE * j, c); - - // https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf - // Squential addressing with none divergent branching - AvgVariancePool2D_SharedMean[groupThreadID.x][tid] = v; - AvgVariancePool2D_SharedVariance[groupThreadID.x][tid] = v * v; - - GroupMemoryBarrierWithGroupSync(); - - for (int s = GROUP_SIZE / 2; s > 0; s >>= 1) - { - if (tid < s) - { - AvgVariancePool2D_SharedMean[groupThreadID.x][tid] += AvgVariancePool2D_SharedMean[groupThreadID.x][tid + s]; - AvgVariancePool2D_SharedVariance[groupThreadID.x][tid] += AvgVariancePool2D_SharedVariance[groupThreadID.x][tid + s]; - } - GroupMemoryBarrierWithGroupSync(); - } - - mean = mean + AvgVariancePool2D_SharedMean[groupThreadID.x][0]; - mean2 = mean2 + AvgVariancePool2D_SharedVariance[groupThreadID.x][0]; - } - - - // N.B: you can reduce this part, but the extra logic wasn't worth it perf wise - if (tid == 0) - { - for (int j = 0; j < r; j++) - { - float v = X.Get(n, q * GROUP_SIZE + j, c); - mean += v; - mean2 += v * v; - } - mean /= (X.height * X.width); - mean2 /= (X.height * X.width); - - O.Set(n, 0, 0, c, mean); - O.Set(n, 1, 0, c, mean2 - mean * mean); - } - - } -} - - - [numthreads(64,1,1)] void InstanceNorm(uint3 dispatchThreadID : SV_DispatchThreadID) { @@ -555,6 +266,35 @@ void InstanceNormTail_CNyx2(uint3 dispatchThreadID : SV_DispatchThreadID) O.Set(i, v); } +NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) +void Upsample2D(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + // NOTE: dispatched over X (not O) + DISPATCH_ARGS(X.channels, X.width, X.height); + TENSOR_ARGS2(X, O); + + uint c = dispatchThreadID.x; + uint x = dispatchThreadID.y; + uint y = dispatchThreadID.z; + + if (c >= X.channels) return; + if (x >= X.width) return; + if (y >= X.height) return; + + for (uint n = 0; n < O.batch; ++n) + { + float v = X.Get(n, y, x, c); + + for (uint dy = 0; dy < _Pool.y; ++dy) + for (uint dx = 0; dx < _Pool.x; ++dx) + { + uint oy = y * _Pool.y + dy; + uint ox = x * _Pool.x + dx; + O.Set(n, oy, ox, c, v); + } + } +} + NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) void Copy(uint3 dispatchThreadID : SV_DispatchThreadID) { diff --git a/Assets/Coach-ML/Barracuda/Resources/Generic.compute.meta b/Assets/Coach-ML/Barracuda/Core/Resources/Generic.compute.meta similarity index 100% rename from Assets/Coach-ML/Barracuda/Resources/Generic.compute.meta rename to Assets/Coach-ML/Barracuda/Core/Resources/Generic.compute.meta diff --git 
a/Assets/Coach-ML/Barracuda/Resources/Padding.compute b/Assets/Coach-ML/Barracuda/Core/Resources/Pad.compute similarity index 97% rename from Assets/Coach-ML/Barracuda/Resources/Padding.compute rename to Assets/Coach-ML/Barracuda/Core/Resources/Pad.compute index 4121e0a..f1ba1e0 100644 --- a/Assets/Coach-ML/Barracuda/Resources/Padding.compute +++ b/Assets/Coach-ML/Barracuda/Core/Resources/Pad.compute @@ -37,8 +37,11 @@ void Border2D(uint3 dispatchThreadID : SV_DispatchThreadID) for (uint n = 0; n < O.batch; ++n) { - float v = X.Get(n, readY, readX, c); - v = (paddedTexel) ? _Beta : v; + float v = _Beta; + + if (!paddedTexel) + v = X.Get(n, readY, readX, c); + O.Set(n, y, x, c, v); } } diff --git a/Assets/Coach-ML/Barracuda/Resources/Padding.compute.meta b/Assets/Coach-ML/Barracuda/Core/Resources/Pad.compute.meta similarity index 100% rename from Assets/Coach-ML/Barracuda/Resources/Padding.compute.meta rename to Assets/Coach-ML/Barracuda/Core/Resources/Pad.compute.meta diff --git a/Assets/Coach-ML/Barracuda/Core/Resources/Pool.compute b/Assets/Coach-ML/Barracuda/Core/Resources/Pool.compute new file mode 100644 index 0000000..904a79c --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Resources/Pool.compute @@ -0,0 +1,260 @@ +#pragma kernel AvgPool2D +#pragma kernel MaxPool2D +#pragma kernel AvgPool2D_NoPads +#pragma kernel MaxPool2D_NoPads +//#pragma kernel MaxPool2D_Pool2x2_NoPads +#pragma kernel GlobalAvgPool2D +#pragma kernel GlobalAvgVariancePool2D + +#include "Tensor.cginc" + +TENSOR_DECL(X) +TENSOR_DECL(W) +TENSOR_DECL(B) +TENSOR_DECL(WBK) +TENSOR_DECL_RW(O) + +uint4 _Pool; +uint4 _Stride; +uint4 _Pad; + +NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) +void MaxPool2D(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + DISPATCH_ARGS(O.channels, O.width, O.height); + TENSOR_ARGS2(X, O); + + uint c = dispatchThreadID.x; + uint x = dispatchThreadID.y; + uint y = dispatchThreadID.z; + + if (c >= O.channels) return; + if (x >= O.width) return; + if (y >= O.height) return; + + for (uint n = 0; n < X.batch; ++n) + { + float maxV = -FLT_MAX; + for (uint dy = 0; dy < _Pool.y; ++dy) + for (uint dx = 0; dx < _Pool.x; ++dx) + { + uint oy = y * _Stride.y + dy; + uint ox = x * _Stride.x + dx; + + bool mask = (oy >= _Pad.y) && (ox >= _Pad.x) && (oy - _Pad.y < X.height) && (ox - _Pad.x < X.width); + float v = (mask)? X.Get(n, oy - _Pad.y, ox - _Pad.x, c): -FLT_MAX; + + maxV = max(v, maxV); + } + + O.Set(n, y, x, c, maxV); + } +} + +NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) +void AvgPool2D(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + DISPATCH_ARGS(O.channels, O.width, O.height); + TENSOR_ARGS2(X, O); + + uint c = dispatchThreadID.x; + uint x = dispatchThreadID.y; + uint y = dispatchThreadID.z; + + if (c >= O.channels) return; + if (x >= O.width) return; + if (y >= O.height) return; + + for (uint n = 0; n < X.batch; ++n) + { + float acc = 0; + float counter = 0; + for (uint dy = 0; dy < _Pool.y; ++dy) + for (uint dx = 0; dx < _Pool.x; ++dx) + { + uint oy = y * _Stride.y + dy; + uint ox = x * _Stride.x + dx; + + bool mask = (oy >= _Pad.y) && (ox >= _Pad.x) && (oy - _Pad.y < X.height) && (ox - _Pad.x < X.width); + acc += (mask)? X.Get(n, oy - _Pad.y, ox - _Pad.x, c): 0; + counter += (mask)? 
1: 0; + } + + acc /= counter; + O.Set(n, y, x, c, acc); + } +} + +NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) +void MaxPool2D_NoPads(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + DISPATCH_ARGS(O.channels, O.width, O.height); + TENSOR_ARGS2(X, O); + + uint c = dispatchThreadID.x; + uint x = dispatchThreadID.y; + uint y = dispatchThreadID.z; + + if (c >= O.channels) return; + if (x >= O.width) return; + if (y >= O.height) return; + + for (uint n = 0; n < X.batch; ++n) + { + float maxV = -FLT_MAX; + for (uint dy = 0; dy < _Pool[1]; ++dy) + for (uint dx = 0; dx < _Pool[0]; ++dx) + { + float v = X.Get(n, y * _Stride[1] + dy, x * _Stride[0] + dx, c); + maxV = max(v, maxV); + } + + O.Set(n, y, x, c, maxV); + } +} + +NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) +void AvgPool2D_NoPads(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + DISPATCH_ARGS(O.channels, O.width, O.height); + TENSOR_ARGS2(X, O); + + uint c = dispatchThreadID.x; + uint x = dispatchThreadID.y; + uint y = dispatchThreadID.z; + + if (c >= O.channels) return; + if (x >= O.width) return; + if (y >= O.height) return; + + float invPoolSize = 1.0f / (_Pool[0] * _Pool[1]); + for (uint n = 0; n < X.batch; ++n) + { + float v = 0; + for (uint dy = 0; dy < _Pool[1]; ++dy) + for (uint dx = 0; dx < _Pool[0]; ++dx) + v += X.Get(n, y * _Stride[1] + dy, x * _Stride[0] + dx, c) * invPoolSize; + + O.Set(n, y, x, c, v); + } +} + +NUMTHREADS((4,8,8), (4,8,4), (4,4,4)) +//NUMTHREADS((16,4,4), (16,4,2), (16,2,2)) +void MaxPool2D_Pool2x2_NoPads(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + DISPATCH_ARGS(O.channels, O.width, O.height); + TENSOR_ARGS2(X, O); + + uint c = dispatchThreadID.x; + uint x = dispatchThreadID.y; + uint y = dispatchThreadID.z; + + if (c >= O.channels) return; + if (x >= O.width) return; + if (y >= O.height) return; + + for (uint n = 0; n < X.batch; ++n) + { + float v0 = X.Get(n, y*2, x*2, c); + float v1 = X.Get(n, y*2+1, x*2, c); + float v2 = X.Get(n, y*2, x*2+1, c); + float v3 = X.Get(n, y*2+1, x*2+1, c); + float v = max(v0, max(v1, max(v2, v3))); + + O.Set(n, y, x, c, v); + } +} + +[numthreads(32,1,1)] +void GlobalAvgPool2D(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + DISPATCH_ARGS(O.channels, 1, 1); + TENSOR_ARGS2(X, O); + + uint c = dispatchThreadID.x; + if (c >= O.channels) return; + //ASSERT(X.batch == O.batch) + + for (uint n = 0; n < X.batch; ++n) + { + float v = 0; + for (uint y = 0; y < X.height; ++y) + for (uint x = 0; x < X.width; ++x) + v += X.Get(n, y, x, c); + + v /= (X.height * X.width); + O.Set(n, 0, 0, c, v); + } +} + + + +#undef GROUP_SIZE +#define GROUP_SIZE BARRACUDA_MAX_THREAD_COUNT + +groupshared float AvgVariancePool2D_SharedMean[1][GROUP_SIZE]; +groupshared float AvgVariancePool2D_SharedVariance[1][GROUP_SIZE]; + +[numthreads(1, GROUP_SIZE, 1)] +void GlobalAvgVariancePool2D(uint3 dispatchThreadID : SV_DispatchThreadID, uint3 groupThreadID : SV_GroupThreadID) +{ + DISPATCH_ARGS(O.channels, 1, 1); + TENSOR_ARGS2(X, O); + + uint c = dispatchThreadID.x; + + for (uint n = 0; n < X.batch; ++n) + { + uint tid = groupThreadID.y; + uint q = (X.height * X.width) / GROUP_SIZE; + uint r = (X.height * X.width) % GROUP_SIZE; + + float mean = 0; + float mean2 = 0; + + for (uint j = 0; j < q; j++) + { + float v = X.Get(n, tid + GROUP_SIZE * j, c); + + // https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf + // Squential addressing with none divergent branching + AvgVariancePool2D_SharedMean[groupThreadID.x][tid] = v; + AvgVariancePool2D_SharedVariance[groupThreadID.x][tid] = v * v; + + 
GroupMemoryBarrierWithGroupSync(); + + for (uint s = GROUP_SIZE / 2; s > 0; s >>= 1) + { + if (tid < s) + { + AvgVariancePool2D_SharedMean[groupThreadID.x][tid] += AvgVariancePool2D_SharedMean[groupThreadID.x][tid + s]; + AvgVariancePool2D_SharedVariance[groupThreadID.x][tid] += AvgVariancePool2D_SharedVariance[groupThreadID.x][tid + s]; + } + GroupMemoryBarrierWithGroupSync(); + } + + mean = mean + AvgVariancePool2D_SharedMean[groupThreadID.x][0]; + mean2 = mean2 + AvgVariancePool2D_SharedVariance[groupThreadID.x][0]; + } + + + // N.B: you can reduce this part, but the extra logic wasn't worth it perf wise + if (tid == 0) + { + for (uint j = 0; j < r; j++) + { + float v = X.Get(n, q * GROUP_SIZE + j, c); + mean += v; + mean2 += v * v; + } + mean /= (X.height * X.width); + mean2 /= (X.height * X.width); + + O.Set(n, 0, 0, c, mean); + O.Set(n, 1, 0, c, mean2 - mean * mean); + } + + } +} + diff --git a/Assets/Coach-ML/Barracuda/Resources/ConvOld.compute.meta b/Assets/Coach-ML/Barracuda/Core/Resources/Pool.compute.meta similarity index 65% rename from Assets/Coach-ML/Barracuda/Resources/ConvOld.compute.meta rename to Assets/Coach-ML/Barracuda/Core/Resources/Pool.compute.meta index dae45fc..cf1f8c1 100644 --- a/Assets/Coach-ML/Barracuda/Resources/ConvOld.compute.meta +++ b/Assets/Coach-ML/Barracuda/Core/Resources/Pool.compute.meta @@ -1,8 +1,8 @@ fileFormatVersion: 2 -guid: a89bb2d7cde74429c8475f7cd8bcdb01 +guid: 8330b6eef5e224882b4d4c769aeaef4a ComputeShaderImporter: externalObjects: {} - currentAPIMask: 0 + currentAPIMask: 65536 userData: assetBundleName: assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Resources/Random.cginc b/Assets/Coach-ML/Barracuda/Core/Resources/Random.cginc similarity index 100% rename from Assets/Coach-ML/Barracuda/Resources/Random.cginc rename to Assets/Coach-ML/Barracuda/Core/Resources/Random.cginc diff --git a/Assets/Coach-ML/Barracuda/Resources/Random.cginc.meta b/Assets/Coach-ML/Barracuda/Core/Resources/Random.cginc.meta similarity index 100% rename from Assets/Coach-ML/Barracuda/Resources/Random.cginc.meta rename to Assets/Coach-ML/Barracuda/Core/Resources/Random.cginc.meta diff --git a/Assets/Coach-ML/Barracuda/Resources/Tensor.cginc b/Assets/Coach-ML/Barracuda/Core/Resources/Tensor.cginc similarity index 94% rename from Assets/Coach-ML/Barracuda/Resources/Tensor.cginc rename to Assets/Coach-ML/Barracuda/Core/Resources/Tensor.cginc index 45eabec..e30fa67 100644 --- a/Assets/Coach-ML/Barracuda/Resources/Tensor.cginc +++ b/Assets/Coach-ML/Barracuda/Core/Resources/Tensor.cginc @@ -339,19 +339,19 @@ struct SharedTensor : Tensor } }; -#define TENSOR_DECL(X) uint4 X##decl[2]; StructuredBuffer X##data; -#define TENSOR_DECL_RW(X) uint4 X ## decl[2]; RWStructuredBuffer X ## data; +#define TENSOR_DECL(X) uint4 X##declShape; uint4 X##declInfo; StructuredBuffer X##data; +#define TENSOR_DECL_RW(X) uint4 X ## declShape; uint4 X ## declInfo; RWStructuredBuffer X ## data; -#define TENSOR_ARG(X) ReadonlyTensor X; X##.Init(X##decl[0], X##data); // readonly -#define TENSOR_MODEL(X) SharedTensor X; X##.Init(X##decl[0], X##decl[1], X##data); // RO w offset -#define TENSOR_ARG_RW(X) ReadWriteTensor X; X##.Init(X##decl[0], X##data); +#define TENSOR_ARG(X) ReadonlyTensor X; X##.Init(X##declShape, X##data); // readonly +#define TENSOR_MODEL(X) SharedTensor X; X##.Init(X##declShape, X##declInfo, X##data); // RO w offset +#define TENSOR_ARG_RW(X) ReadWriteTensor X; X##.Init(X##declShape, X##data); #define TENSOR_ARGS2(X, O) TENSOR_ARG(X); TENSOR_ARG_RW(O); #define 
TENSOR_ARGS3(X, A, O) TENSOR_ARG(X); TENSOR_MODEL(A); TENSOR_ARG_RW(O); #define TENSOR_ARGS4(X, A, B, O) TENSOR_ARG(X); TENSOR_MODEL(A); TENSOR_MODEL(B); TENSOR_ARG_RW(O); // shared model tensors -#define TENSOR_SHARED_MODEL(X, S) SharedTensor X; X##.Init(X##decl[0], X##decl[1], S##data); +#define TENSOR_SHARED_MODEL(X, S) SharedTensor X; X##.Init(X##declShape, X##declInfo, S##data); #define TENSOR_SHARED2_ARGS4(X, A, B, S, O) TENSOR_ARG(X); TENSOR_SHARED_MODEL(A, S); TENSOR_SHARED_MODEL(B, S); TENSOR_ARG_RW(O); diff --git a/Assets/Coach-ML/Barracuda/Resources/Tensor.cginc.meta b/Assets/Coach-ML/Barracuda/Core/Resources/Tensor.cginc.meta similarity index 100% rename from Assets/Coach-ML/Barracuda/Resources/Tensor.cginc.meta rename to Assets/Coach-ML/Barracuda/Core/Resources/Tensor.cginc.meta diff --git a/Assets/Coach-ML/Barracuda/Core/Tensor.cs b/Assets/Coach-ML/Barracuda/Core/Tensor.cs new file mode 100644 index 0000000..10f9547 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Tensor.cs @@ -0,0 +1,938 @@ +using UnityEngine.Assertions; +using System; + +namespace Barracuda { + +/// +/// TensorShape are immutable representation of a Tensor dimensions and rank. +/// At the moment a TensorShape is always of rank 4 and channels last ie B,H,W,C. +/// However an axis can be a size 1. For example a tensor without spatial information will be B,1,1,C +/// +[Serializable] +public struct TensorShape +{ + /// + /// Return the number of batch. + /// + public readonly int batch; + /// + /// Return the spatial height. + /// + public readonly int height; + /// + /// Return the spatial width. + /// + public readonly int width; + /// + /// Return the number of channels. + /// + public readonly int channels; + + #region Constructors + /// + /// Create a TensorShape of shape B,H,W,C. + /// + public TensorShape(int b, int h, int w, int ch) + { + batch = b > 0 ? b : 1; + height = h > 0 ? h : 1; + width = w > 0 ? w : 1; + channels = ch > 0 ? ch : 1; + } + /// + /// Create a TensorShape of shape B,1,1,C. + /// + public TensorShape(int b, int ch) + { + batch = b > 0 ? b : 1; + height = 1; + width = 1; + channels = ch > 0 ? ch : 1; + } + /// + /// Create a TensorShape of arbitrary shape. + /// `shape` parameter should be of length 4. + /// + public TensorShape(int[] shape) + : this(shape[0], shape[1], shape[2], shape[3]) + { + Assert.AreEqual(4, shape.Length); + } + #endregion + + #region Properties + /// + /// Kernel dimension ordering is [H,W,C,K] for efficiency purpose. + /// Return kernel height. + /// + public int kernelHeight { get { return batch; } } + /// + /// Kernel dimension ordering is [H,W,C,K] for efficiency purpose. + /// Return kernel width. + /// + public int kernelWidth { get { return height; } } + /// + /// Kernel dimension ordering is [H,W,C,K] for efficiency purpose. + /// Return kernel depth (aka the number of input channels of the associated operator). + /// + public int kernelDepth { get { return width; } } + /// + /// Kernel dimension ordering is [H,W,C,K] for efficiency purpose. + /// Return kernel count (aka the number of output channels of the associated operator). + /// + public int kernelCount { get { return channels; } } + /// + /// Return the number of batch. + /// + public int flatHeight { get { return batch; } } + /// + /// Return the H*W*C. + /// + public int flatWidth { get { return height * width * channels; } } + /// + /// Return the total number of elements represented by this shape. 
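+    /// (i.e. batch * height * width * channels)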
+ /// + public int length { get { return batch * height * width * channels; } } + /// + /// Always 4, look also at the `dimensions` property. + /// + public int rank { get { return 4; } } + /// + /// Return the count of non-unit dimension of this shape. + /// For example [B,1,1,C] dimensions is 2. + /// + public int dimensions { get { + return + (batch > 1 ? 1 : 0) + + (height > 1 ? 1 : 0) + + (width > 1 ? 1 : 0) + + (channels > 1 ? 1 : 0); + } + } + #endregion + + #region Helpers + /// + /// Allow to use negative axis to access tensorShape backward. + /// `axis` should be from -rank to rank (exclusive). + /// + public int Axis(int axis) + { + Assert.IsTrue(axis > -rank && axis < rank); + return axis >= 0 ? axis: rank + axis; + } + /// + /// Given an offset in memory return the dimensions indices of the element as [b,h,w,c]. + /// + public void GetPositionsFromIndex(int index, ref int b, ref int h, ref int w, ref int ch) + { + ch = index % channels; + w = (index / channels) % width; + h = (index / (channels * width)) % height; + b = (index / (channels * width * height)) % batch; + } + /// + /// Given an element dimensions indices [b,h,w,c] with broadcast support, return this element offset in memory. + /// + public int IndexWithBroadcast(int b, int h, int w, int ch) + { + b %= batch; + h %= height; + w %= width; + ch %= channels; + return Index(b, h, w, ch); + } + /// + /// Given an element dimensions indices [b,h,w,c] return this element offset in memory. + /// + public int Index(int b, int h, int w, int ch) + { + int index = + b * height * width * channels + + h * width * channels + + w * channels + + ch; + return index; + } + /// + /// Given an element dimensions indices [b,0,0,c] return this element offset in memory. + /// + public int Index(int b, int c) + { + int index = + b * height * width * channels + + c; + return index; + } + /// + /// Indexer to return a dimension of this tensorShape as [B,H,W,C] + /// Prefer this over ToArray() to avoid GC allocation/collection. + /// + public int this[int axis] + { + get + { + //switch case rather than `ToArray` to avoid GC allocation + switch(axis) + { + case 0: + return batch; + case 1: + return height; + case 2: + return width; + default: + return channels; + } + } + } + /// + /// Return an array representation of this tensorShape as [B,H,W,C] + /// Prefer tensorShape[x] to avoid GC allocation/collection. + /// + public int[] ToArray() + { + return new[] { batch, height, width, channels }; + } + + /// + /// Remove single-dimensional entries from the shape. + /// [b=4,h=1,w=1,c=128] => [b=1,h=1,w=4,c=128] + /// + public TensorShape Squeeze() + { + var dims = ToArray(); + + var squeezed = new[] { 1,1,1,1 }; + Assert.IsTrue(dims.Length == squeezed.Length); + var index = squeezed.Length; + foreach (var dim in dims) + if (dim > 1) + squeezed[--index] = dim; + return new TensorShape(squeezed); + } + + /// + /// Return a TensorShape of dimensions [B,1,1,H*W*C] + /// + public TensorShape Flatten() + { + return new TensorShape(batch, height * width * channels); + } + #endregion + + #region Comparison operators + public static bool operator ==(TensorShape a, TensorShape b) + { + return + a.batch == b.batch && + a.height == b.height && + a.width == b.width && + a.channels == b.channels; + } + + public static bool operator !=(TensorShape a, TensorShape b) + { + return !(a == b); + } + + public override bool Equals(Object obj) + { + // Check for null values and compare run-time types. 
+ if (obj == null || GetType() != obj.GetType()) + return false; + + return this == (TensorShape)obj; + } + + public override int GetHashCode() + { + return batch ^ height ^ width ^ channels; + } + #endregion + + public override string ToString() + { + return $"({batch}, {height}, {width}, {channels})"; + } +} + + +// @TODO: most likely Tensor should still be struct - that way passing Tensor as argument into IOps would be safer (no hidden state mods), and Flatten & Reshape could return modified Tensor +// ITensorData & Dispose mechanism should however allow Tensors to share the same ITensorData +public class Tensor : IDisposable +{ + private ITensorData m_TensorOnDevice; + private ITensorAllocator m_TensorAllocator; + private float[] m_Cache; + private bool m_CacheIsDirty; + + /// + /// Return this tensor name. + /// + public string name; + /// + /// Return this tensor allocator, see interface `ITensorAllocator`. + /// + public ITensorAllocator allocator { get { return m_TensorAllocator; } } + + #region Shape + /// + /// Return this tensor shape as [B,H,W,C]. + /// + public readonly TensorShape shape; + /// + /// Return the number of batch. + /// + public int batch { get { return shape.batch; } } + /// + /// Return the spatial height. + /// + public int height { get { return shape.height; } } + /// + /// Return the spatial width. + /// + public int width { get { return shape.width; } } + /// + /// Return the number of channels. + /// + public int channels { get { return shape.channels; } } + /// + /// Kernel dimension ordering is [H,W,C,K] for efficiency purpose. + /// Return kernel width. + /// + public int kernelWidth { get { return shape.kernelWidth; } } + /// + /// Kernel dimension ordering is [H,W,C,K] for efficiency purpose. + /// Return kernel height. + /// + public int kernelHeight { get { return shape.kernelHeight; } } + /// + /// Kernel dimension ordering is [H,W,C,K] for efficiency purpose. + /// Return kernel depth (aka the number of input channels of the associated operator). + /// + public int kernelDepth { get { return shape.kernelDepth; } } + /// + /// Kernel dimension ordering is [H,W,C,K] for efficiency purpose. + /// Return kernel count (aka the number of output channels of the associated operator). + /// + public int kernelCount { get { return shape.kernelCount; } } + /// + /// Return the number of batch. + /// + public int flatHeight { get { return shape.flatHeight; } } + /// + /// Return the H*W*C. + /// + public int flatWidth { get { return shape.flatWidth; } } + /// + /// Return the total number of elements in this tensor. + /// + public int length { get { return shape.length; } } + /// + /// Return the count of non-unit dimension of this tensor shape. + /// For example [B,1,1,C] dimensions is 2. + /// + public int dimensions { get { return shape.dimensions; } } + #endregion + + #region Constructors + /// + /// Create a Tensor from a shape `s`, an array of data `srcData` and an optional name `n` + /// `s` should be of size 4, order is [b,h,w,ch]. + /// `srcData` should be of size s[0]*s[1]*s[2]*s[3]. 
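+    /// For example (illustrative): new Tensor(new[] {1, 28, 28, 3}, pixelData) creates a single
+    /// 28x28 RGB image tensor, where `pixelData` is a hypothetical float[] of length 1*28*28*3.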
+ /// + public Tensor(int[] s, float[] srcData, string n = "") : this(new TensorShape(s), srcData, n) {} + /// + /// Create a Tensor of shape [b,h,w,ch], an array of data `srcData` and an optional name `n` + /// `srcData` should be of size b*h*w*ch + /// + public Tensor(int b, int h, int w, int ch, float[] srcData, string n = "") : this(new TensorShape(b, h, w, ch), srcData, n) {} + /// + /// Create a Tensor of shape [b,1,1,ch], an array of data `srcData` and an optional name `n` + /// `srcData` should be of size b*ch + /// + public Tensor(int b, int ch, float[] srcData, string n = "") : this(new TensorShape(b, ch), srcData, n) {} + /// + /// Create a Tensor of shape tensorShape `s`, an array of data `srcData` and an optional name `n` + /// `srcData` should be of size `s.length`. + /// + public Tensor(TensorShape s, float[] srcData, string n = "") + { + //;;UnityEngine.Debug.Log("Tensor::Tensor " + n + " " + s + " []-> " + srcData); + name = n; + shape = s; + m_TensorOnDevice = new ArrayTensorData(shape); + m_TensorOnDevice.Upload(srcData, 0, Math.Min(length, srcData.Length)); + m_TensorAllocator = null; + m_Cache = null; + m_CacheIsDirty = false; + } + /// + /// Create a Tensor from a shape `s`, an array of data `srcData` and an optional name `n` + /// `s` should be of size 4, order is [b,h,w,ch]. + /// `srcData` should be of size s[0]*s[1]*s[2]*s[3]. + /// + public Tensor(int[] s, float[][] srcData, string n = "") : this(new TensorShape(s), srcData, n) {} + /// + /// Create a Tensor of shape [b,h,w,ch], an array of data `srcData` and an optional name `n` + /// `srcData` should be of size b*h*w*ch + /// + public Tensor(int b, int h, int w, int ch, float[][] srcData, string n = "") : this(new TensorShape(b, h, w, ch), srcData, n) {} + /// + /// Create a Tensor of shape [b,1,1,ch], an array of data `srcData` and an optional name `n` + /// `srcData` should be of size b*ch + /// + public Tensor(int b, int ch, float[][] srcData, string n = "") : this(new TensorShape(b, ch), srcData, n) {} + /// + /// Create a Tensor of shape tensorShape `s`, an array of data `srcData` and an optional name `n` + /// `srcData` should be of size `s.length`. + /// + public Tensor(TensorShape s, float[][] srcData, string n = "") + { + //;;UnityEngine.Debug.Log("Tensor::Tensor " + n + " " + s + " [][]-> " + srcData); + name = n; + shape = s; + var arrayTensorData = new ArrayTensorData(shape); + for (var i = 0; i < Math.Min(flatHeight, srcData.Length); ++i) + { + var src = srcData[i]; + var dstOffset = i * flatWidth; + Array.Copy(src, 0, arrayTensorData.array, dstOffset, Math.Min(flatWidth, src.Length)); + } + m_TensorOnDevice = arrayTensorData; + m_TensorAllocator = null; + m_Cache = null; + m_CacheIsDirty = false; + } + /// + /// Create a Tensor from a texture, shape is [1, texture.height, texture.width, `channels=3`] + /// + public Tensor(UnityEngine.Texture srcTexture, int channels = 3, string n = "") : this(new [] { srcTexture }, channels, n) {} + /// + /// Create a Tensor from multiple texture, shape is [srcTextures.length, texture.height, texture.width, `channels=3`] + /// All textures must be of the same size and dimension. 
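+    /// For example (illustrative): new Tensor(frames, 3) yields a tensor of shape
+    /// [frames.Length, frameHeight, frameWidth, 3], where `frames` is a hypothetical Texture[]
+    /// of equally sized textures.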
+ /// + public Tensor(UnityEngine.Texture[] srcTextures, int channels = 3, string n = "") + { + name = n; + var tensorData = new TextureAsTensorData(srcTextures, channels); + //;;UnityEngine.Debug.Log("Tensor::Tensor " + n + " " + tensorData.shape + " [TEX] " + srcTextures); + shape = tensorData.shape; + Assert.IsTrue(tensorData.GetMaxCount() >= length); + m_TensorOnDevice = tensorData; + m_TensorAllocator = null; + m_Cache = null; + m_CacheIsDirty = false; + } + /// + /// Create a Tensor from a shape `s`, a ITensorData `d` and an optional name `n` + /// `s` should be of size 4, order is [b,h,w,ch]. + /// + public Tensor(int[] s, ITensorData d, string n = "") : this(new TensorShape(s), d, n) {} + /// + /// Create a Tensor of shape [b,h,w,ch], a ITensorData `d` and an optional name `n` + /// `srcData` should be of size b*h*w*ch + /// + public Tensor(int b, int h, int w, int ch, ITensorData d, string n = "") : this(new TensorShape(b, h, w, ch), d, n) {} + /// + /// Create a Tensor of shape [b,1,1,ch], a ITensorData `d` and an optional name `n` + /// `srcData` should be of size b*ch + /// + public Tensor(int b, int ch, ITensorData d, string n = "") : this(new TensorShape(b, ch), d, n) {} + /// + /// Create a Tensor of shape tensorShape `s`, a ITensorData `d` and an optional name `n` + /// + public Tensor(TensorShape s, ITensorData d, string n = "") + { + //;;UnityEngine.Debug.Log("Tensor::Tensor " + n + " " + s + " @ " + ((d != null) ? d.GetType().Name : "null")); + name = n; + shape = s; + m_TensorOnDevice = d; + m_TensorAllocator = null; + m_Cache = null; + m_CacheIsDirty = false; + } + /// + /// Create an uninitialized Tensor with a shape of [1,1,1,1]. + /// + public Tensor(string n = "") : this(new TensorShape(1,1,1,1), n) {} + /// + /// Create an uninitialized Tensor from a shape `s`. `s` should be of size 4, order is [b,h,w,ch] + /// + public Tensor(int[] s, string n = "") : this(new TensorShape(s), n) {} + /// + /// Create an uninitialized Tensor of shape [b,h,w,ch]. + /// + public Tensor(int b, int h, int w, int ch, string n = "") : this(new TensorShape(b, h, w, ch), n) {} + /// + /// Create an uninitialized Tensor of shape [b,1,1,ch]. + /// + public Tensor(int b, int ch, string n = "") : this(new TensorShape(b, ch), n) {} + /// + /// Create an uninitialized Tensor of shape tensorShape `s`. + /// + public Tensor(TensorShape s, string n = "") + { + //;;UnityEngine.Debug.Log("Tensor::Tensor " + n + " " + s); + name = n; + shape = s; + m_TensorOnDevice = null; + m_TensorAllocator = null; + m_Cache = null; + m_CacheIsDirty = false; + } + /// + /// Create a Tensor from a shape `s`, a ITensorData `d` and a ITensorAllocator `a` + /// `s` should be of size 4, order is [b,h,w,ch]. 
+ /// + public Tensor(int[] s, ITensorData d, ITensorAllocator a) : this(new TensorShape(s), d, a) {} + /// + /// Create a Tensor of shape [b,h,w,ch], a ITensorData `d` and a ITensorAllocator `a` + /// + public Tensor(int b, int h, int w, int ch, ITensorData d, ITensorAllocator a) : this(new TensorShape(b, h, w, ch), d, a) {} + /// + /// Create a Tensor of shape [b,1,1,ch], a ITensorData `d` and a ITensorAllocator `a` + /// `srcData` should be of size b*ch + /// + public Tensor(int b, int ch, ITensorData d, ITensorAllocator a) : this(new TensorShape(b, ch), d, a) {} + /// + /// Create a Tensor of shape tensorShape `s`, a ITensorData `d` and a ITensorAllocator `a` + /// + public Tensor(TensorShape s, ITensorData d, ITensorAllocator a) + { + //;;UnityEngine.Debug.Log("Tensor::Tensor " + s + " " + d + " " + a); + name = ""; + shape = s; + m_TensorOnDevice = d; + m_TensorAllocator = a; + m_Cache = null; + m_CacheIsDirty = false; + } + + /// + /// Create an uninitialized Tensor with a shape of [1,1,1,1] and ITensorAllocator `a` + /// + public Tensor(ITensorAllocator a) : this(new TensorShape(1,1,1,1), a) {} + /// + /// Create an uninitialized Tensor from a shape `s` and ITensorAllocator `a` + /// `s` should be of size 4, order is [b,h,w,ch]. + /// + /// + public Tensor(int[] s, ITensorAllocator a) : this(new TensorShape(s), a) {} + /// + /// Create an uninitialized Tensor of shape [b,h,w,ch] and ITensorAllocator `a`. + /// + public Tensor(int b, int h, int w, int ch, ITensorAllocator a) : this(new TensorShape(b, h, w, ch), a) {} + /// + /// Create an uninitialized Tensor of shape [b,1,1,ch] and ITensorAllocator `a`. + /// + public Tensor(int b, int ch, ITensorAllocator a) : this(new TensorShape(b, ch), a) {} + /// + /// Create an uninitialized Tensor of shape tensorShape `s` and ITensorAllocator `a`. + /// + public Tensor(TensorShape s, ITensorAllocator a) + { + //;;UnityEngine.Debug.Log("Tensor::Tensor " + n + " " + s + " " + a); + name = ""; + shape = s; + m_TensorOnDevice = null; + m_TensorAllocator = a; + m_Cache = null; + m_CacheIsDirty = false; + } + #endregion + + /// + /// Destructor will also dispose associated memories. + /// + ~Tensor() + { + Dispose(); + } + + /// + /// Allocate tensor on device if needed and update data. + /// By default cached copy of the data will be discarded when doing so, set `forceInvalidateCache` to false to keep the cache. + /// + public void PinToDeviceAndUploadToIt(ITensorData onDevice, bool forceInvalidateCache = true) + { + if (m_TensorOnDevice == onDevice && !m_CacheIsDirty) + return; + + PrepareCacheForAccess(); + PinToDevice(onDevice, disposeUnpinned: true); + + m_CacheIsDirty = true; + if (forceInvalidateCache) + UploadAndInvalidateCache(); + else + UploadIfDirty(); + } + + /// + /// Allocate tensor on device if needed and download data to cache. + /// See also `PrepareCacheForAccess()`. 
+ /// + public void PinToDeviceAndDownloadFromIt(ITensorData onDevice) + { + if (m_TensorOnDevice == onDevice && !m_CacheIsDirty) + return; + + UploadIfDirty(); + PinToDevice(onDevice, disposeUnpinned: true); + if (m_Cache != null) + PrepareCacheForAccess(); + } + + private void PinToDevice(ITensorData onDevice, bool disposeUnpinned = true) + { + Assert.IsTrue(onDevice?.GetMaxCount() >= length || onDevice == null); + + if (m_TensorAllocator != null) + m_TensorAllocator.Repin(this, onDevice, m_TensorOnDevice, disposeUnpinned); + else if (disposeUnpinned) + m_TensorOnDevice?.Dispose(); + + m_TensorOnDevice = onDevice; + } + + /// + /// Cast a tensorData to this tensor, transferring ownership of on tensorData device memory to this tensor. + /// + public void CastOnDevice(ITensorData onDevice) + { + if (m_TensorOnDevice == onDevice) + return; + + Assert.IsNotNull(onDevice); + Assert.IsNotNull(m_TensorOnDevice); + Assert.IsTrue(onDevice.GetMaxCount() >= length); + + if (m_TensorAllocator != null) + m_TensorAllocator.Cast(this, onDevice, m_TensorOnDevice); + + m_TensorOnDevice = onDevice; + } + + /// + /// Remove tensor from device, will first sync the cache with device data. + /// + public ITensorData Unpin(bool disposeUnpinned = true) + { + PrepareCacheForAccess(); + + ITensorData unpinned = (disposeUnpinned) ? null : m_TensorOnDevice; + PinToDevice(null, disposeUnpinned); + return unpinned; + } + + private bool m_Disposing = false; // to protect from infinite-loop. in case UnpinAndDisposeTensor() is called from Dispose() + /// + /// Remove tensor from device, and dispose it. + /// + public ITensorData UnpinAndDisposeTensor() + { + // NOTE: since this Tensor is going to be Disposed + // there is no need to populate cache with data from tensorOnDevice + // we can save on skipping PrepareCacheForAccess() call + ITensorData unpinned = m_TensorOnDevice; + PinToDevice(null, false); + if (!m_Disposing) + Dispose(); + return unpinned; + } + + private void UploadIfDirty() + { + if (m_CacheIsDirty && m_TensorOnDevice != null) + m_TensorOnDevice.Upload(m_Cache); + m_CacheIsDirty = false; + } + + private void UploadAndInvalidateCache() + { + UploadIfDirty(); + + // remove cache only, if pinned to device + // otherwise cache holds the only copy of the tensor data and we can not loose it + if (m_TensorOnDevice == null) + return; + + m_Cache = null; + m_CacheIsDirty = false; + } + + /// + /// Populate the cache with on device data. + /// Blocking read if `blocking` is true (default) + /// + public bool PrepareCacheForAccess(bool blocking = true) + { + // non-blocking, schedule download for later + if (!blocking && m_TensorOnDevice != null) + return m_TensorOnDevice.ScheduleAsyncDownload(length); + + // blocking, have to get data now! + if (m_Cache == null) + { + if (m_TensorOnDevice != null) + m_Cache = m_TensorOnDevice.Download(length); + else + m_Cache = new float[length]; + m_CacheIsDirty = false; + } + + return true; + } + + /// + /// Upload cache to device memory and delete it. 
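+    /// After flushing, the next element access repopulates the cache with a blocking download
+    /// (see `PrepareCacheForAccess()`); if the tensor is not pinned to a device, the cache is kept.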
+ /// + public void FlushCache() + { + UploadAndInvalidateCache(); + } + + // @TODO: choose approach to handle case when tensors after Flatten/Reshape are written into OR taken ownership of + // 1) owns data, copy on PrepareCacheForAccess() and PinForWrite() + // 2) always copy data in Flatten()/Reshape(), remove from Tensor interface + // 2) always copy data in Flatten()/Reshape(), implement ICloneable for GPU ITensorData + + /// + /// Create a flattened copy of the current Tensor ie of shape [B,1,1,H*W*CH] + /// + public Tensor Flatten() + { + var newShape = shape.Flatten(); + + Tensor copy; + if (m_TensorAllocator != null) + copy = m_TensorAllocator.Alloc(newShape, m_TensorOnDevice); + else + copy = new Tensor(newShape, m_TensorOnDevice); + + copy.name = $"flatten of {name}"; + copy.m_Cache = m_Cache; + copy.m_CacheIsDirty = m_CacheIsDirty; + return copy; + } + + /// + /// Create a reshaped copy of the current Tensor. + /// `newShape`.length must be equal to this.shape.length. + /// + public Tensor Reshape(TensorShape newShape) + { + Assert.AreEqual(shape.length, newShape.length); + Tensor copy; + if (m_TensorAllocator != null) + copy = m_TensorAllocator.Alloc(newShape, m_TensorOnDevice); + else + copy = new Tensor(newShape, m_TensorOnDevice); + + copy.name = $"reshape of {name}"; + copy.m_Cache = m_Cache; + copy.m_CacheIsDirty = m_CacheIsDirty; + return copy; + } + + /// + /// Create a copy of the current Tensor, sharing data storage with original tensor. + /// + public Tensor ShallowCopy() + { + Tensor copy; + if (m_TensorAllocator != null) + copy = m_TensorAllocator.Alloc(shape, m_TensorOnDevice); + else + copy = new Tensor(shape, m_TensorOnDevice); + + copy.name = $"copy of {name}"; + copy.m_Cache = m_Cache; + copy.m_CacheIsDirty = m_CacheIsDirty; + + return copy; + } + + /// + /// Create a copy of the current Tensor, actively syncing there data in a blocking way. + /// + public Tensor DeepCopy() + { + // @TODO: use Tensor allocator + var copy = new Tensor(shape, $"clone of {name}"); + if (m_TensorOnDevice is ICloneable) + { + UploadIfDirty(); + var copyOfTensorData = (m_TensorOnDevice as ICloneable).Clone() as ITensorData; + copy.PinToDeviceAndDownloadFromIt(copyOfTensorData); + } + else + { + PrepareCacheForAccess(); + copy.PrepareCacheForAccess(); + Array.Copy(m_Cache, 0, copy.m_Cache, 0, length); + } + + return copy; + } + + /// + /// Remove system reference to this tensor, caller assume ownership. + /// + public void TakeOwnership() + { + m_TensorAllocator?.WaiveOwnership(this); + m_TensorAllocator = null; + } + + /// Called from ITensorAllocator, puts Tensor in the ready for reuse state. + internal ITensorData Invalidate() + { + ITensorData unpinned = m_TensorOnDevice; + PinToDevice(null, false); + Assert.AreEqual(m_TensorOnDevice, null); + m_Cache = null; + m_CacheIsDirty = false; + m_TensorOnDevice = null; + m_TensorAllocator = null; + return unpinned; + } + + /// + /// Dispose Tensor and associated memories. + /// + public virtual void Dispose() + { + m_Disposing = true; + if (m_TensorAllocator != null) + { + m_TensorAllocator.Release(this, true); + } + else if (m_TensorOnDevice != null) + { + //;;UnityEngine.D.Log("DISPOSE " + name + " " + shape + " @ " + m_TensorOnDevice.GetType().Name); + m_TensorOnDevice.Dispose(); + } + + m_Cache = null; + m_CacheIsDirty = false; + m_TensorOnDevice = null; + m_TensorAllocator = null; + m_Disposing = false; + } + + + #region Render Texture + /// + /// Fill a RenderTexture with a slice/batch of a tensor. 
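+    /// For example (illustrative): X.ToRenderTexture(rt) writes batch 0, starting at channel 0,
+    /// into the hypothetical RenderTexture `rt`, applying `scale` and `bias` to the values.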
+ /// + public void ToRenderTexture(UnityEngine.RenderTexture target, int batch = 0, int fromChannel = 0, float scale = 1.0f, float bias = 0f) + { + BarracudaTextureUtils.TensorToRenderTexture(this, target, batch, fromChannel, scale, bias); + } + + /// + /// Create a new RenderTexture from a slice/batch of a tensor. + /// + public UnityEngine.RenderTexture ToRenderTexture(int batch = 0, int fromChannel = 0, float scale = 1.0f, float bias = 0f) + { + return BarracudaTextureUtils.TensorToRenderTexture(this, batch, fromChannel, scale, bias); + } + #endregion + + + #region Data access + /// + /// Allow to use negative axis to access tensorShape backward. + /// `axis` should be from -rank to rank (exclusive). + /// + public int Axis(int axis) + { + return shape.Axis(axis); + } + /// + /// Given an element dimensions indices [b,h,w,c] return this element offset in memory. + /// + public int Index(int b, int h, int w, int ch) + { + return shape.Index(b, h, w, ch); + } + /// + /// Given an element dimensions indices [b,h,w,c] with broadcast support, return this element offset in memory. + /// + public int IndexWithBroadcast(int b, int h, int w, int ch) + { + return shape.IndexWithBroadcast(b, h, w, ch); + } + /// + /// Given an element dimensions indices [b,0,0,c] return this element offset in memory. + /// + public int Index(int y, int x) + { + return shape.Index(y, x); + } + /// + /// Access element at offset `index` in this Tensor. + /// This will create a blocking read if cache is dirty. + /// + public float this[int index] + { + get { PrepareCacheForAccess(); return m_Cache[index]; } + set { PrepareCacheForAccess(); m_Cache[index] = value; m_CacheIsDirty = true; } + } + /// + /// Access element at index [b,0,0,ch] in this Tensor. + /// This will create a blocking read if cache is dirty. + /// + public float this[int b, int ch] + { + get { PrepareCacheForAccess(); return m_Cache[Index(b, ch)]; } + set { PrepareCacheForAccess(); m_Cache[Index(b, ch)] = value; m_CacheIsDirty = true; } + } + /// + /// Access element at index [b,h,w,ch] in this Tensor. + /// This will create a blocking read if cache is dirty. + /// + public float this[int b, int h, int w, int ch] + { + get { PrepareCacheForAccess(); return m_Cache[Index(b, h, w, ch)]; } + set { PrepareCacheForAccess(); m_Cache[Index(b, h, w, ch)] = value; m_CacheIsDirty = true; } + } + + // @TODO: implement via ITensorData.SharedAccess() + /// + /// Return the cached linear memory representation of this tensor data. + /// This will create a blocking read if cache is dirty. + /// see also `readonlyArrayOffset`. + /// IMPORTANT: This data should not be modified. + /// + public float[] readonlyArray { get { PrepareCacheForAccess(); return m_Cache; } } + // @TODO: implement via ITensorData.SharedAccess() + /// + /// Return the offset to use when accessing `readonlyArray` + /// Always 0 at the moment. 
+ /// + public int readonlyArrayOffset { get { return 0; } } + #endregion + + public ITensorData tensorOnDevice { get { return m_TensorOnDevice; } } + public ITensorData data + { + get + { + if (m_TensorOnDevice == null) + PinToDeviceAndUploadToIt(new ArrayTensorData(shape)); + return m_TensorOnDevice; + } + } + + public override string ToString() + { + return $"({name} {shape}, alloc: {m_TensorAllocator?.GetType()}, onDevice:{m_TensorOnDevice})"; + } + +} + +} // namespace Barracuda diff --git a/Assets/Coach-ML/Barracuda/Core/Tensor.cs.meta b/Assets/Coach-ML/Barracuda/Core/Tensor.cs.meta new file mode 100644 index 0000000..9bfd6bf --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/Tensor.cs.meta @@ -0,0 +1,12 @@ +fileFormatVersion: 2 +guid: 98a907db6ef714800aaf596877e02d38 +timeCreated: 1506363800 +licenseType: Pro +MonoImporter: + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Core/TensorExtensions.cs b/Assets/Coach-ML/Barracuda/Core/TensorExtensions.cs new file mode 100644 index 0000000..267de33 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/TensorExtensions.cs @@ -0,0 +1,484 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using System.Linq; // Enumerable.Range(), Enumerable.SequenceEqual() + +using UnityEngine; +using UnityEngine.Assertions; + +namespace Barracuda { + + +public static class TensorExtensions +{ + static public void TestInit(this Tensor X, int n = -1) + { + if (n < 0) + n = X.length; + n = Math.Min(n, X.length); + for (int i = 0; i < n; ++i) + X[i] = i; + } + + static public void TestInit2(this Tensor X, int n = -1) + { + if (n < 0) + n = X.length; + n = Math.Min(n, X.length); + for (int i = 0; i < n; ++i) + X[i] = 0.1f; + } + + static public void TestInitCos(this Tensor X, int n = -1) + { + if (n < 0) + n = X.length; + n = Math.Min(n, X.length); + for (int i = 0; i < n; ++i) + X[i] = Mathf.Cos(i); + } + + static public void Print(this Tensor X, string msg = "") + { + D.Log(msg + X.name + " " + X.shape); + } + + static public void PrintDataPart(this Tensor X, int size, string msg = "") + { + if (msg.Length > 0) + msg += " "; + for (int i = 0; i < X.length && i < size; ++i) + { + msg += X[i]; + msg += " "; + } + D.Log(msg); + } + + static public bool Equals(this Tensor X, Tensor Y) + { + if (X.batch != Y.batch || X.height != Y.height || X.width != Y.width || X.channels != Y.channels) + return false; + + if (X.length != Y.length) + return false; + + for (int i = 0; i < X.length; ++i) + { + if (X[i] != Y[i]) + return false; + } + + return true; + } + + static public bool Approximately(this Tensor X, Tensor Y, float epsilon = 1e-4f, int count = -1) + { + if (X.batch != Y.batch || X.height != Y.height || X.width != Y.width || X.channels != Y.channels) + return false; + + if (X.length != Y.length) + return false; + + if (count < 0) + count = X.length; + for (int i = 0; i < count; ++i) + { + if (Mathf.Abs(X[i] - Y[i]) > epsilon) + { + // @TODO: move logging into dedicated function + D.Log("First mismatch @ [" + i + "]: " + X[i] + " != " + Y[i]); + return false; + } + } + + return true; + } + + static public float MaxDifference(this Tensor X, Tensor Y) + { + float maxD = 0f; + for (int i = 0; i < X.length; ++i) + maxD = Mathf.Max(Mathf.Abs(X[i] - Y[i]), maxD); + return maxD; + } + + static public int[] ArgMax(this Tensor X) + { + int[] result = new int[X.batch]; + for (int b = 0; b < X.batch; ++b) + { + float 
maxV = Mathf.NegativeInfinity; + var i = 0; + for (int y = 0; y < X.height; ++y) + for (int x = 0; x < X.width; ++x) + for (int c = 0; c < X.channels; ++c, ++i) + { + var v = X[b, y, x, c]; + if (maxV >= v) + continue; + maxV = v; + result[b] = i; + } + } + return result; + } + + static public Tensor Reshape(this Tensor X, int[] size) + { + var newShape = X.shape.Reshape(size); + return X.Reshape(newShape); + } + + static public int[][] ArgSort(this Tensor X) + { + var count = X.height * X.width * X.channels; + var result = new List(); + + for (int n = 0; n < X.batch; ++n) + { + int[] indices = Enumerable.Range(0, count).ToArray(); + + var sliceOffset = n * count; + Array.Sort(indices, (a, b) => X[sliceOffset + a].CompareTo(X[sliceOffset + b])); + result.Add(indices); + } + return result.ToArray(); + } + + static public TensorShape Concat(TensorShape[] shapes, int axis) + { + if (shapes.Length == 0) + return new TensorShape(); + + // validate that off axis dimensions are equal + for (var i = 1; i < shapes.Length; ++i) + { + var a = shapes[0].ToArray(); + var b = shapes[i].ToArray(); + var aAxis = shapes[0].Axis(axis); + var bAxis = shapes[i].Axis(axis); + a[aAxis] = 0; b[bAxis] = 0; + if (!Enumerable.SequenceEqual(a, b)) + { + foreach (var s in shapes) + D.Log(s); + throw new ArgumentException("Off-axis dimensions must match"); + } + } + + var shape = shapes[0].ToArray(); + var dstAxis = shapes[0].Axis(axis); + for (var i = 1; i < shapes.Length; ++i) + shape[dstAxis] += shapes[i][axis]; + return new TensorShape(shape); + } + + static public TensorShape ConcatShapes(Tensor[] tensors, int axis) + { + return Concat(tensors.Select(t => t.shape).ToArray(), axis); + } + + static public TensorShape Max(TensorShape[] shapes) + { + Assert.IsTrue(shapes.Length > 0); + int batch = 0, height = 0, width = 0, channels = 0; + foreach (var s in shapes) + { + batch = Math.Max(s.batch, batch); + height = Math.Max(s.height, height); + width = Math.Max(s.width, width); + channels = Math.Max(s.channels, channels); + } + return new TensorShape(batch, height, width, channels); + } + + static public TensorShape MaxShape(Tensor[] tensors) + { + Assert.IsTrue(tensors.Length > 0); + int batch = 0, height = 0, width = 0, channels = 0; + foreach (var t in tensors) + { + batch = Math.Max(t.batch, batch); + height = Math.Max(t.height, height); + width = Math.Max(t.width, width); + channels = Math.Max(t.channels, channels); + } + return new TensorShape(batch, height, width, channels); + } + + static public TensorShape Scale(this TensorShape shape, TensorShape scale) + { + return new TensorShape( + shape.batch * scale.batch, + shape.height * scale.height, + shape.width * scale.width, + shape.channels * scale.channels); + } + + static public TensorShape Scale(this TensorShape shape, int[] scale) + { + Assert.AreEqual(scale.Length, 4); + return Scale(shape, new TensorShape(scale)); + } + + static public TensorShape Reduce(this TensorShape shape, int axis) + { + axis = shape.Axis(axis); + var newShapeArray = shape.ToArray(); + newShapeArray[axis] = 1; + return new TensorShape(newShapeArray); + } + + static public TensorShape Reshape(this TensorShape shape, int[] size) + { + Assert.AreEqual(size.Length, 4); + var newShapeArray = shape.ToArray(); + + // From: https://github.com/onnx/onnx/blob/master/docs/Operators.md#Reshape + // + // At most one dimension of the new shape can be -1. + // In this case, the value is inferred from the size of the tensor and the remaining dimensions. 
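+        // For example (illustrative): reshaping a [2,4,4,8] shape with size = {2, -1, 1, 8}
+        // infers the -1 dimension as 256 / (2*1*8) = 16, giving [2,16,1,8].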
+ // + // A dimension could also be 0, + // in which case the actual dimension value is unchanged (i.e. taken from the input tensor). + + var multipleOf = 1; + var unknownIndex = -1; + for (int q = 0; q < size.Length; ++q) + { + if (size[q] > 0) + { + multipleOf *= size[q]; + newShapeArray[q] = size[q]; + } + else if (size[q] == 0) + multipleOf *= newShapeArray[q]; + else if (unknownIndex == -1) + unknownIndex = q; + else + throw new ArgumentException("Can only specify one unknown dimension"); + } + + if (unknownIndex == -1) + { + // all dimensions are given + var newShape = new TensorShape(newShapeArray); + if (shape.length != newShape.length) + throw new ArgumentException("Cannot reshape array of size " + shape.length + + " into shape " + newShape); + return newShape; + } + + var solveForIndex = shape.length / multipleOf; + bool remainderLeft = shape.length % multipleOf != 0; + + if (remainderLeft) + throw new ArgumentException("Cannot reshape array of size " + shape.length + + " into shape with multiple of " + multipleOf + " elements"); + + newShapeArray[unknownIndex] = solveForIndex; + return new TensorShape(newShapeArray); + } + + static public TensorShape ApplyBorder(this TensorShape shape, int[] border) + { + return new TensorShape( + shape.batch, + (shape.height + (border[1]+border[3])), + (shape.width + (border[0]+border[2])), + shape.channels); + } + + static public int[] AdjustPadToKernel(this Tensor tensor, Tensor kernel, int[] stride, int[] pad) + { + return AdjustPadToKernel(tensor.shape, kernel.shape, stride, pad); + } + + static public int[] AdjustPadToKernel(this TensorShape shape, TensorShape kernel, int[] stride, int[] pad) + { + return AdjustPadToPool(shape, new int[] { kernel.kernelWidth, kernel.kernelHeight }, stride, pad); + } + + static public int[] AdjustPadToPool(this Tensor tensor, int[] pool, int[] stride, int[] pad) + { + return AdjustPadToPool(tensor.shape, pool, stride, pad); + } + + static public int[] AdjustPadToPool(this TensorShape shape, int[] pool, int[] stride, int[] pad) + { + // negative pad values mean auto_pad type is used + if (pad[0] >= 0) + return pad; + + var type = (Layer.AutoPad)pad[0]; + if (type == Layer.AutoPad.SameUpper || type == Layer.AutoPad.SameLower) + { + // Based on ONNX (AveragePool & MaxPool) + // https://github.com/onnx/onnx/blob/master/docs/Operators.md + // and TensorFlow docs: + // https://www.tensorflow.org/api_guides/python/nn#Notes_on_SAME_Convolution_Padding + + var widthModStride = shape.width % stride[0]; + var heightModStride = shape.height % stride[1]; + + if (widthModStride == 0) + widthModStride = stride[0]; + if (heightModStride == 0) + heightModStride = stride[1]; + + var padAlongWidth = Math.Max(pool[0] - widthModStride, 0); + var padAlongHeight = Math.Max(pool[1] - heightModStride, 0); + // Code above (based on TensorFlow docs) is equivalent to (based on ONNX docs): + // padAlongWidth = (Mathf.Ceil(shape.width/stride[0]) - 1) * stride[0] + pool[0] - shape.width; + // padAlongHeight = (Mathf.Ceil(shape.height/stride[1]) - 1) * stride[1] + pool[1] - shape.height; + + var widthSmall = padAlongWidth / 2; + var widthLarge = padAlongWidth - widthSmall; + var heightSmall = padAlongHeight / 2; + var heightLarge = padAlongHeight - heightSmall; + + // In case of odd number add the extra padding + // at the end for SAME_UPPER and at the beginning for SAME_LOWER + if (type == Layer.AutoPad.SameUpper) + return new [] { widthSmall, heightSmall, widthLarge, heightLarge }; + else + return new [] { widthLarge, 
heightLarge, widthSmall, heightSmall }; + } + else + throw new NotImplementedException("This padding type is not implemented yet!"); + } + + static public TensorShape ApplyPool(this TensorShape shape, int[] pool, int[] stride, int[] pad) + { + Assert.AreEqual(pool.Length, 2); + Assert.AreEqual(stride.Length, 2); + Assert.AreEqual(pad.Length, 4); + + // Based on ONNX (AveragePool & MaxPool) + // https://github.com/onnx/onnx/blob/master/docs/Operators.md + // Theano "Convolution arithmetic tutorial" + // http://deeplearning.net/software/theano/tutorial/conv_arithmetic.html#quick-reference + // and TensorFlow docs: + // https://www.tensorflow.org/api_guides/python/nn#Convolution + // https://www.tensorflow.org/api_guides/python/nn#Notes_on_SAME_Convolution_Padding + // + // output_size = (input_size + pad_left + pad_right - kernel_size) / stride + 1 + // + + return new TensorShape( + shape.batch, + (shape.height + (pad[1]+pad[3]) - pool[1]) / stride[1] + 1, + (shape.width + (pad[0]+pad[2]) - pool[0]) / stride[0] + 1, + shape.channels); + } + + static public TensorShape ApplyKernel(this TensorShape shape, TensorShape kernel, int[] stride, int[] pad) + { + shape = ApplyPool(shape, new int[] { kernel.kernelWidth, kernel.kernelHeight }, stride, pad); + return new TensorShape(shape.batch, shape.height, shape.width, kernel.kernelCount); + } + + static public TensorShape ApplyKernelInverse(this TensorShape shape, TensorShape kernel, int[] stride, int[] pad, int[] outputAdjustment) + { + Assert.AreEqual(stride.Length, 2); + Assert.AreEqual(pad.Length, 4); + + // Based on ONNX (ConvTranspose) + // https://github.com/onnx/onnx/blob/master/docs/Operators.md + // and Theano "Convolution arithmetic tutorial" + // http://deeplearning.net/software/theano/tutorial/conv_arithmetic.html#transposed-convolution-arithmetic + // + // Inverse of: + // output_size = (input_size + pad_left + pad_right - kernel_size) / stride + 1 + // Resulting in: + // output_size = (input_size - 1 ) * stride - (pad_left + pad_right) + kernel_size + output_adj + // output_adj = (input_size + (pad_left + pad_right) - kernel_size) % stride + // + if (outputAdjustment == null || outputAdjustment.Length == 0) + { + outputAdjustment = new int[] { + (shape.width + (pad[0]+pad[2]) - kernel.kernelWidth) % stride[0], + (shape.height + (pad[1]+pad[3]) - kernel.kernelHeight) % stride[1] + }; + } + return new TensorShape( + shape.batch, + (shape.height - 1) * stride[1] - (pad[1]+pad[3]) + kernel.kernelHeight + outputAdjustment[1], + (shape.width - 1) * stride[0] - (pad[0]+pad[2]) + kernel.kernelWidth + outputAdjustment[0], + kernel.kernelCount); + } + + static public int WrapIndex(int i, int length) + { + // allow index to be equal to length + // in order to enable iteration over [i,end) range + if (i >= length) + return length; + + // in C# modulo of negative is negative + // to emulate Python array behavior, we use: https://stackoverflow.com/questions/1082917/mod-of-negative-number-is-melting-my-brain/1082938 + var v = i % length; + return v < 0 ? 
(v + length): v; + } + + // TODO: implement negative strides + static public TensorShape ApplyStridedSlice(this TensorShape shape, int[] starts, int[] ends, int[] stride) + { + Assert.AreEqual(starts.Length, shape.rank); + Assert.AreEqual(ends.Length, shape.rank); + Assert.AreEqual(stride.Length, shape.rank); + + int[] counts = shape.ToArray(); + int[] sliced = shape.ToArray(); + Assert.AreEqual(counts.Length, shape.rank); + for (int i = 0; i < counts.Length; ++i) + { + // NOTE: begin=0, end=0, stride=1 <= full range from the existing axis + // begin=0, end=X, stride=1 <= full range from the existing axis, if X==last element on this axis + // begin=0, end=0, stride=0 <= new axis OR shrink axis to a single 1st element + // begin=N, end=N, stride=0 <= shrink axis to a single Nth element + + Assert.IsTrue(starts[i] < counts[i]); + if (starts[i] != ends[i]) + sliced[i] = WrapIndex(ends[i], counts[i]) - WrapIndex(starts[i], counts[i]); + else + sliced[i] = counts[i]; + if (stride[i] != 0 && stride[i] < counts[i]) + sliced[i] /= stride[i]; + else + sliced[i] = 1; + + if (sliced[i] < 0) + sliced[i] = counts[i] + sliced[i]; + + if (sliced[i] < 0) + sliced[i] = 0; + } + + return new TensorShape(sliced); + } + + static public ITensorData CreateFromTexture(Texture tex, TensorShape shape) + { + Assert.AreEqual(tex.width, shape.width); + Assert.AreEqual(tex.height, shape.height); + Assert.IsTrue(shape.channels < 4); + + // @TODO: implement proper GPU storage + var data = new ArrayTensorData(shape); + if (tex is Texture2D) + { + Texture2D tex2d = tex as Texture2D; + var pixels = tex2d.GetPixels(); + for (int i = 0; i < data.array.Length && i < pixels.Length * shape.channels; ++i) + data.array[i] = pixels[i / shape.channels][i % shape.channels]; + } + else + throw new NotImplementedException(); + + return data; + } +} + +} // namespace Barracuda diff --git a/Assets/Coach-ML/Barracuda/Core/TensorExtensions.cs.meta b/Assets/Coach-ML/Barracuda/Core/TensorExtensions.cs.meta new file mode 100644 index 0000000..a774bdd --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Core/TensorExtensions.cs.meta @@ -0,0 +1,12 @@ +fileFormatVersion: 2 +guid: 3fb6bb6c79a8e4887a615dbfc580e1cd +timeCreated: 1506363800 +licenseType: Pro +MonoImporter: + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Plugins/Editor/BarracudaEditor/Barracuda-editor.asmdef b/Assets/Coach-ML/Barracuda/Plugins/Editor/BarracudaEditor/Barracuda-editor.asmdef index b10599b..dc0faf2 100644 --- a/Assets/Coach-ML/Barracuda/Plugins/Editor/BarracudaEditor/Barracuda-editor.asmdef +++ b/Assets/Coach-ML/Barracuda/Plugins/Editor/BarracudaEditor/Barracuda-editor.asmdef @@ -1,8 +1,8 @@ { "name": "Barracuda-editor", - "references": [], + "references": ["Barracuda"], "includePlatforms": [ "Editor" ], "excludePlatforms": [] -} \ No newline at end of file +} diff --git a/Assets/Coach-ML/Barracuda/Plugins/Editor/BarracudaEditor/HalfHelper.cs b/Assets/Coach-ML/Barracuda/Plugins/Editor/BarracudaEditor/HalfHelper.cs new file mode 100644 index 0000000..985aace --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Plugins/Editor/BarracudaEditor/HalfHelper.cs @@ -0,0 +1,165 @@ +using System.Runtime.InteropServices; + +// Based on https://sourceforge.net/p/csharp-half/code/HEAD/tree/System.Half/HalfHelper.cs +namespace System +{ + /// + /// Helper class for Half conversions and some low level operations. 
+ /// This class is internally used in the Half class. + /// + /// + /// References: + /// - Fast Half Float Conversions, Jeroen van der Zijp, link: http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf + /// + internal static class HalfHelper + { + private static uint[] mantissaTable = GenerateMantissaTable(); + private static uint[] exponentTable = GenerateExponentTable(); + private static ushort[] offsetTable = GenerateOffsetTable(); + private static ushort[] baseTable = GenerateBaseTable(); + private static sbyte[] shiftTable = GenerateShiftTable(); + + // Transforms the subnormal representation to a normalized one. + private static uint ConvertMantissa(int i) + { + uint m = (uint)(i << 13); // Zero pad mantissa bits + uint e = 0; // Zero exponent + + // While not normalized + while ((m & 0x00800000) == 0) + { + e -= 0x00800000; // Decrement exponent (1<<23) + m <<= 1; // Shift mantissa + } + m &= unchecked((uint)~0x00800000); // Clear leading 1 bit + e += 0x38800000; // Adjust bias ((127-14)<<23) + return m | e; // Return combined number + } + + private static uint[] GenerateMantissaTable() + { + uint[] mantissaTable = new uint[2048]; + mantissaTable[0] = 0; + for (int i = 1; i < 1024; i++) + { + mantissaTable[i] = ConvertMantissa(i); + } + for (int i = 1024; i < 2048; i++) + { + mantissaTable[i] = (uint)(0x38000000 + ((i - 1024) << 13)); + } + + return mantissaTable; + } + private static uint[] GenerateExponentTable() + { + uint[] exponentTable = new uint[64]; + exponentTable[0] = 0; + for (int i = 1; i < 31; i++) + { + exponentTable[i] = (uint)(i << 23); + } + exponentTable[31] = 0x47800000; + exponentTable[32] = 0x80000000; + for (int i = 33; i < 63; i++) + { + exponentTable[i] = (uint)(0x80000000 + ((i - 32) << 23)); + } + exponentTable[63] = 0xc7800000; + + return exponentTable; + } + private static ushort[] GenerateOffsetTable() + { + ushort[] offsetTable = new ushort[64]; + offsetTable[0] = 0; + for (int i = 1; i < 32; i++) + { + offsetTable[i] = 1024; + } + offsetTable[32] = 0; + for (int i = 33; i < 64; i++) + { + offsetTable[i] = 1024; + } + + return offsetTable; + } + private static ushort[] GenerateBaseTable() + { + ushort[] baseTable = new ushort[512]; + for (int i = 0; i < 256; ++i) + { + sbyte e = (sbyte)(127 - i); + if (e > 24) + { // Very small numbers map to zero + baseTable[i | 0x000] = 0x0000; + baseTable[i | 0x100] = 0x8000; + } + else if (e > 14) + { // Small numbers map to denorms + baseTable[i | 0x000] = (ushort)(0x0400 >> (18 + e)); + baseTable[i | 0x100] = (ushort)((0x0400 >> (18 + e)) | 0x8000); + } + else if (e >= -15) + { // Normal numbers just lose precision + baseTable[i | 0x000] = (ushort)((15 - e) << 10); + baseTable[i | 0x100] = (ushort)(((15 - e) << 10) | 0x8000); + } + else if (e > -128) + { // Large numbers map to Infinity + baseTable[i | 0x000] = 0x7c00; + baseTable[i | 0x100] = 0xfc00; + } + else + { // Infinity and NaN's stay Infinity and NaN's + baseTable[i | 0x000] = 0x7c00; + baseTable[i | 0x100] = 0xfc00; + } + } + + return baseTable; + } + private static sbyte[] GenerateShiftTable() + { + sbyte[] shiftTable = new sbyte[512]; + for (int i = 0; i < 256; ++i) + { + sbyte e = (sbyte)(127 - i); + if (e > 24) + { // Very small numbers map to zero + shiftTable[i | 0x000] = 24; + shiftTable[i | 0x100] = 24; + } + else if (e > 14) + { // Small numbers map to denorms + shiftTable[i | 0x000] = (sbyte)(e - 1); + shiftTable[i | 0x100] = (sbyte)(e - 1); + } + else if (e >= -15) + { // Normal numbers just lose precision + shiftTable[i | 0x000] = 13; 
+ shiftTable[i | 0x100] = 13; + } + else if (e > -128) + { // Large numbers map to Infinity + shiftTable[i | 0x000] = 24; + shiftTable[i | 0x100] = 24; + } + else + { // Infinity and NaN's stay Infinity and NaN's + shiftTable[i | 0x000] = 13; + shiftTable[i | 0x100] = 13; + } + } + + return shiftTable; + } + + public static float HalfToSingle(ushort halfValue) + { + uint result = mantissaTable[offsetTable[halfValue >> 10] + (halfValue & 0x3ff)] + exponentTable[halfValue >> 10]; + return BitConverter.ToSingle(BitConverter.GetBytes(result), 0); + } + } +} diff --git a/Assets/Coach-ML/Barracuda/Plugins/Editor/BarracudaEditor/HalfHelper.cs.meta b/Assets/Coach-ML/Barracuda/Plugins/Editor/BarracudaEditor/HalfHelper.cs.meta new file mode 100644 index 0000000..6a44d24 --- /dev/null +++ b/Assets/Coach-ML/Barracuda/Plugins/Editor/BarracudaEditor/HalfHelper.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: 9f18f6e7d4eea41ceb83f1c74589e5ab +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Plugins/Editor/BarracudaEditor/ONNXModelImporter.cs b/Assets/Coach-ML/Barracuda/Plugins/Editor/BarracudaEditor/ONNXModelImporter.cs index afd527e..936c07e 100644 --- a/Assets/Coach-ML/Barracuda/Plugins/Editor/BarracudaEditor/ONNXModelImporter.cs +++ b/Assets/Coach-ML/Barracuda/Plugins/Editor/BarracudaEditor/ONNXModelImporter.cs @@ -907,6 +907,13 @@ private static ONNXTensor ReadTensor(TensorProto onnxTensor) Debug.Assert((sizeof(float) * shape.length) == onnxTensor.RawData.Length); Buffer.BlockCopy(byteArray, 0, data, 0, byteArray.Length); } + else if (onnxTensor.DataType == (int)TensorProto.Types.DataType.Float16) + { + var typedData = new UInt16[shape.length]; + Debug.Assert((sizeof(UInt16) * shape.length) == onnxTensor.RawData.Length); + Buffer.BlockCopy(byteArray, 0, typedData, 0, byteArray.Length); + data = typedData.Select(x => HalfHelper.HalfToSingle(x)).ToArray(); + } else if (onnxTensor.DataType == (int)TensorProto.Types.DataType.Int32) { var typedData = new int[shape.length]; diff --git a/Assets/Coach-ML/Barracuda/Plugins/OSX/MacBLAS.asmdef b/Assets/Coach-ML/Barracuda/Plugins/OSX/MacBLAS.asmdef index 9d6f291..85149cf 100644 --- a/Assets/Coach-ML/Barracuda/Plugins/OSX/MacBLAS.asmdef +++ b/Assets/Coach-ML/Barracuda/Plugins/OSX/MacBLAS.asmdef @@ -1,6 +1,8 @@ { "name": "MacBLAS", - "references": [], + "references": [ + "Barracuda" + ], "optionalUnityReferences": [], "includePlatforms": [ "Editor", @@ -8,4 +10,4 @@ ], "excludePlatforms": [], "allowUnsafeCode": true -} \ No newline at end of file +} diff --git a/Assets/Coach-ML/Barracuda/Plugins/iOS/iOSBLAS.asmdef b/Assets/Coach-ML/Barracuda/Plugins/iOS/iOSBLAS.asmdef index ba58166..e34fa24 100644 --- a/Assets/Coach-ML/Barracuda/Plugins/iOS/iOSBLAS.asmdef +++ b/Assets/Coach-ML/Barracuda/Plugins/iOS/iOSBLAS.asmdef @@ -1,6 +1,8 @@ { "name": "iOSBLAS", - "references": [], + "references": [ + "Barracuda" + ], "optionalUnityReferences": [], "includePlatforms": [ "Editor", @@ -8,4 +10,4 @@ ], "excludePlatforms": [], "allowUnsafeCode": true -} \ No newline at end of file +} diff --git a/Assets/Coach-ML/Barracuda/Resources/ConvOld.compute b/Assets/Coach-ML/Barracuda/Resources/ConvOld.compute deleted file mode 100644 index 81b5e4b..0000000 --- a/Assets/Coach-ML/Barracuda/Resources/ConvOld.compute +++ /dev/null @@ -1,418 +0,0 @@ -//#pragma kernel Conv2D_Kmod16_Nmod8_KNY -//#pragma kernel 
Conv2D_Cache_KCmod32_KNyx -//#pragma kernel Conv2D_Cache_KCmod32_KNyxDiv2 -// NOTE: DISABLED 64 version because as it is slower than 32 version on AMD GPU -//#pragma kernel Conv2D_Cache_KCmod64_KNyx - -#include "Tensor.cginc" - -TENSOR_DECL(X) -TENSOR_DECL(K) -TENSOR_DECL(B) -TENSOR_DECL(WBK) -TENSOR_DECL_RW(O) - -uint4 _Pad; -uint4 _Stride; - -NUMTHREADS((16,8,1), (16,8,1), (16,4,1)) -void Conv2D_Kmod16_Nmod8_KNY(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - DISPATCH_ARGS(K.channels, O.batch, O.height); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - uint k = dispatchThreadID.x; - uint n = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - for (uint x = 0; x < O.width; ++x) - { - float v = B.Get(k); - for (uint dy = 0; dy < K.GetKernelHeight(); ++dy) - { - for (uint dx = 0; dx < K.GetKernelWidth(); ++dx) - { - uint oy = y * _Stride.y + dy; - uint ox = x * _Stride.x + dx; - // @TODO: investigate - // WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler - if (oy < _Pad.y) continue; - if (oy - _Pad.w >= X.height) continue; - if (ox < _Pad.x) continue; - if (ox - _Pad.z >= X.width) continue; - - for (uint c = 0; c < X.channels; ++c) - { - v += X.Get(n, oy-_Pad.y, ox-_Pad.x, c) * K.Get(dy, dx, c, k); - } - } - } - O.Set(n, y, x, k, v); - } -} - -#undef CTILE -#define CTILE NUMTHREAD(16, 8, 8) -groupshared float Conv_Xcache[4][CTILE][CTILE]; -groupshared float Conv_Kcache[4][CTILE][CTILE]; -[numthreads(CTILE, CTILE, 1)] -void Conv2D_Cache_KCmod32_KNyx(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - DISPATCH_ARGS(K.kernelCount / 2, O.batch * O.height * O.width / 2, 1); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - #define X_ Conv_Xcache - #define K_ Conv_Kcache - - uint gx = groupThreadID.x; - uint gy = groupThreadID.y; - - uint k = CTILE * groupID.x + groupThreadID.x; - uint nyx = CTILE * groupID.y + groupThreadID.y; - - uint width = O.width; - uint height = O.height; - - uint x = nyx % width; - uint ny = nyx / width; - uint y = ny % height; - uint n = ny / height; - - float b0 = B.Get(k*2+0); - float b1 = B.Get(k*2+1); - float4 v = float4(b0, b1, - b0, b1); - - for (uint dy = 0; dy < K.GetKernelHeight(); ++dy) - { - for (uint dx = 0; dx < K.GetKernelWidth(); ++dx) - { - bool mask = true; - uint oy = y * _Stride.y + dy; - uint ox = x * _Stride.x + dx; - // @TODO: investigate - // WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler - if (oy < _Pad.y) mask = false; - if (oy - _Pad.w >= X.height) mask = false; - if (ox < _Pad.x) mask = false; - if (ox - _Pad.z >= X.width) mask = false; - - for (uint m = 0; m < X.channels/(CTILE*2); ++m) - { - float x0 = 0; - float x1 = 0; - float x2 = 0; - float x3 = 0; - - if (mask) - { - x0 = X.Get(n*2+0, oy-_Pad.y, ox-_Pad.x, (m*CTILE + gx)*2+0); - x1 = X.Get(n*2+0, oy-_Pad.y, ox-_Pad.x, (m*CTILE + gx)*2+1); - x2 = X.Get(n*2+1, oy-_Pad.y, ox-_Pad.x, (m*CTILE + gx)*2+0); - x3 = X.Get(n*2+1, oy-_Pad.y, ox-_Pad.x, (m*CTILE + gx)*2+1); - } - - float k0 = K.Get(dy, dx, (m*CTILE + gy)*2+0, k*2+0); - float k1 = K.Get(dy, dx, (m*CTILE + gy)*2+0, k*2+1); - float k2 = K.Get(dy, dx, (m*CTILE + gy)*2+1, k*2+0); - float k3 = K.Get(dy, dx, (m*CTILE + gy)*2+1, k*2+1); - - //X_[gy][gx] = float4(x0, x1, - // x2, x3); - //K_[gy][gx] = float4(k0, k1, - // k2, k3); - X_[0][gy][gx] = x0; - X_[1][gy][gx] = x1; - X_[2][gy][gx] = x2; - X_[3][gy][gx] = x3; - - K_[0][gy][gx] = k0; - K_[1][gy][gx] = k1; 
- K_[2][gy][gx] = k2; - K_[3][gy][gx] = k3; - - GroupMemoryBarrierWithGroupSync(); - - [unroll] - for (uint i = 0; i < CTILE; ++i) - { - float4 x = //X_[gy][i]; - float4( X_[0][gy][i], - X_[1][gy][i], - X_[2][gy][i], - X_[3][gy][i]); - float4 k = //K_[i][gx]; - float4( K_[0][i][gx], - K_[1][i][gx], - K_[2][i][gx], - K_[3][i][gx]); - - v.x = mad(k.x, x.x, v.x); - v.x = mad(k.z, x.y, v.x); - - v.y = mad(k.y, x.x, v.y); - v.y = mad(k.w, x.y, v.y); - - v.z = mad(k.x, x.z, v.z); - v.z = mad(k.z, x.w, v.z); - - v.w = mad(k.y, x.z, v.w); - v.w = mad(k.w, x.w, v.w); - - //v.x += k.x*x.x + k.z*x.y; - //v.y += k.y*x.x + k.w*x.y; - //v.z += k.x*x.z + k.z*x.w; - //v.w += k.y*x.z + k.w*x.w; - } - - GroupMemoryBarrierWithGroupSync(); - } - } - } - - O.Set(n*2+0, y, x, k*2+0, v.x); - O.Set(n*2+0, y, x, k*2+1, v.y); - O.Set(n*2+1, y, x, k*2+0, v.z); - O.Set(n*2+1, y, x, k*2+1, v.w); - - #undef X_ - #undef K_ -} - -#undef CTILE -//#define CTILE NUMTHREAD(16, 8, 8) -#define CTILE 16 -groupshared float Conv_Xcache2[4][CTILE][CTILE]; -groupshared float Conv_Kcache2[4][CTILE][CTILE]; -[numthreads(CTILE, CTILE, 1)] -void Conv2D_Cache_KCmod32_KNyxDiv2(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - DISPATCH_ARGS(K.kernelCount / 2, O.batch * O.height * O.width / 2, 1); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - #define X_ Conv_Xcache2 - #define K_ Conv_Kcache2 - - uint gx = groupThreadID.x; - uint gy = groupThreadID.y; - - uint k = CTILE * groupID.x + groupThreadID.x; - uint nyx = CTILE * groupID.y + groupThreadID.y; - - uint width = O.width / 2; - uint height = O.height; - - uint x = nyx % width; - uint ny = nyx / width; - uint y = ny % height; - uint n = ny / height; - - float b0 = B.Get(k*2+0); - float b1 = B.Get(k*2+1); - float4 v = float4(b0, b1, - b0, b1); - - bool mask = n < O.batch; - - for (uint dy = 0; dy < K.GetKernelHeight(); ++dy) - { - for (uint dx = 0; dx < K.GetKernelWidth(); ++dx) - { - // @TODO: investigate - // WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler - bool maskY = mask; - uint oy = y * _Stride.y + dy; - if (oy < _Pad.y) maskY = false; - if (oy - _Pad.w >= X.height) maskY = false; - - bool maskL = maskY; - uint oxL = (x*2+0) * _Stride.x + dx; - if (oxL < _Pad.x) maskL = false; - if (oxL - _Pad.z >= X.width) maskL = false; - - bool maskR = maskY; - uint oxR = (x*2+1) * _Stride.x + dx; - if (oxR < _Pad.x) maskR = false; - if (oxR - _Pad.z >= X.width) maskR = false; - - for (uint m = 0; m < X.channels/(CTILE*2); ++m) - { - if (maskL) - { - X_[0][gy][gx] = X.Get(n, oy-_Pad.y, oxL-_Pad.x, (m*CTILE + gx)*2+0); - X_[1][gy][gx] = X.Get(n, oy-_Pad.y, oxL-_Pad.x, (m*CTILE + gx)*2+1); - } - else - { - X_[0][gy][gx] = X_[1][gy][gx] = 0; - } - - if (maskR) - { - X_[2][gy][gx] = X.Get(n, oy-_Pad.y, oxR-_Pad.x, (m*CTILE + gx)*2+0); - X_[3][gy][gx] = X.Get(n, oy-_Pad.y, oxR-_Pad.x, (m*CTILE + gx)*2+1); - } - else - { - X_[2][gy][gx] = X_[3][gy][gx] = 0; - } - - - K_[0][gy][gx] = K.Get(dy, dx, (m*CTILE + gy)*2+0, k*2+0); - K_[1][gy][gx] = K.Get(dy, dx, (m*CTILE + gy)*2+0, k*2+1); - K_[2][gy][gx] = K.Get(dy, dx, (m*CTILE + gy)*2+1, k*2+0); - K_[3][gy][gx] = K.Get(dy, dx, (m*CTILE + gy)*2+1, k*2+1); - - GroupMemoryBarrierWithGroupSync(); - - [unroll] - for (uint i = 0; i < CTILE; ++i) - { - float4 x = - float4( X_[0][gy][i], - X_[1][gy][i], - X_[2][gy][i], - X_[3][gy][i]); - float4 k = - float4( K_[0][i][gx], - K_[1][i][gx], - K_[2][i][gx], - K_[3][i][gx]); - - v.x = mad(k.x, x.x, v.x); - v.x = 
mad(k.z, x.y, v.x); - - v.y = mad(k.y, x.x, v.y); - v.y = mad(k.w, x.y, v.y); - - v.z = mad(k.x, x.z, v.z); - v.z = mad(k.z, x.w, v.z); - - v.w = mad(k.y, x.z, v.w); - v.w = mad(k.w, x.w, v.w); - } - - GroupMemoryBarrierWithGroupSync(); - } - } - } - - O.Set(n, y, x*2+0, k*2+0, v.x); - O.Set(n, y, x*2+0, k*2+1, v.y); - if (mask && x*2+1 < O.width) - { - O.Set(n, y, x*2+1, k*2+0, v.z); - O.Set(n, y, x*2+1, k*2+1, v.w); - } - - #undef X_ - #undef K_ -} - - -#undef CTILE -//#define CTILE NUMTHREAD(16, 8, 8) -#define CTILE 16 -#define RTILE 4 -groupshared float Conv_XcacheR[RTILE*RTILE][CTILE*CTILE]; -groupshared float Conv_KcacheR[RTILE*RTILE][CTILE*CTILE]; -[numthreads(CTILE, CTILE, 1)] -void Conv2D_Cache_KCmod64_KNyx(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - DISPATCH_ARGS(K.kernelCount / 4, O.batch * O.height * O.width / 4, 1); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - #define X_ Conv_XcacheR - #define K_ Conv_KcacheR - - uint gx = groupThreadID.x; - uint gy = groupThreadID.y; - - uint k = CTILE * groupID.x + groupThreadID.x; - uint nyx = CTILE * groupID.y + groupThreadID.y; - - uint x = nyx % O.width; - uint ny = nyx / O.width; - uint y = ny % O.height; - uint n = ny / O.height; - - float v[RTILE][RTILE]; - for (uint xxxx = 0; xxxx < RTILE; ++xxxx) - { - float b = B.Get(k*RTILE+xxxx); - for (uint yyyy = 0; yyyy < RTILE; ++yyyy) - v[yyyy][xxxx] = b; - } - - for (uint dy = 0; dy < K.GetKernelHeight(); ++dy) - { - for (uint dx = 0; dx < K.GetKernelWidth(); ++dx) - { - bool mask = true; - uint oy = y * _Stride.y + dy; - uint ox = x * _Stride.x + dx; - // @TODO: investigate - // WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler - if (oy < _Pad.y) mask = false; - if (oy - _Pad.w >= X.height) mask = false; - if (ox < _Pad.x) mask = false; - if (ox - _Pad.z >= X.width) mask = false; - - for (uint m = 0; m < X.channels/(CTILE*RTILE); ++m) - { - for (uint yy = 0; yy < RTILE; ++yy) - for (uint xx = 0; xx < RTILE; ++xx) - { - if (mask) - X_[yy*RTILE+xx][gy*CTILE+gx] = X.Get(n*RTILE+yy, oy - _Pad.y, ox - _Pad.x, (m*CTILE + gx)*RTILE+xx); - else - X_[yy*RTILE+xx][gy*CTILE+gx] = 0; - K_[yy*RTILE+xx][gy*CTILE+gx] = K.Get(dy, dx, (m*CTILE + gy)*RTILE+yy, k*RTILE+xx); - } - - GroupMemoryBarrierWithGroupSync(); - - for (uint ii = 0; ii < CTILE; ++ii) - { - float x[RTILE][RTILE]; - float k[RTILE][RTILE]; - - [unroll] - for (uint yy = 0; yy < RTILE; ++yy) - { - [unroll] - for (uint xx = 0; xx < RTILE; ++xx) - { - x[yy][xx] = X_[yy*RTILE+xx][gy*CTILE+ii]; - k[yy][xx] = K_[yy*RTILE+xx][ii*CTILE+gx]; - } - } - - - [unroll] - for (uint yyy = 0; yyy < RTILE; ++yyy) - { - [unroll] - for (uint xxx = 0; xxx < RTILE; ++xxx) - { - [unroll] - for (uint i = 0; i < RTILE; ++i) - { - v[yyy][xxx] = mad(x[yyy][i], k[i][xxx], v[yyy][xxx]); - } - } - } - } - - GroupMemoryBarrierWithGroupSync(); - } - } - } - - for (uint yy = 0; yy < RTILE; ++yy) - for (uint xx = 0; xx < RTILE; ++xx) - O.Set(n*RTILE+yy, y, x, k*RTILE+xx, v[yy][xx]); - - #undef X_ - #undef K_ -} diff --git a/Assets/Coach-ML/Barracuda/Resources/Experimental.compute b/Assets/Coach-ML/Barracuda/Resources/Experimental.compute deleted file mode 100644 index 7685606..0000000 --- a/Assets/Coach-ML/Barracuda/Resources/Experimental.compute +++ /dev/null @@ -1,4284 +0,0 @@ -#if EXPERIMENTAL_KERNELS_ENABLED -/* -#pragma kernel Dense -#pragma kernel DenseTiled -#pragma kernel Dense10x16 -#pragma kernel DenseTiled32x32 -#pragma kernel DenseTiled64x64 -#pragma 
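Each thread of the Conv2D_Cache_KCmod32_* kernels above carries a 2x2 output micro-tile (two batch rows times two output channels) and updates it with mad() from a float4 of cached inputs and a float4 of cached weights. The helper below is a minimal restatement of that inner update in isolation; the packing convention in the comments is inferred from the surrounding code, and the function name is hypothetical.

// Sketch of the 2x2 micro-tile update used inside the Conv2D_Cache_* inner loop.
// Assumed packing (restated from the surrounding code, not part of the patch):
//   x = (X[n0,c0], X[n0,c1], X[n1,c0], X[n1,c1])
//   k = (K[c0,k0], K[c0,k1], K[c1,k0], K[c1,k1])
//   v = (O[n0,k0], O[n0,k1], O[n1,k0], O[n1,k1]) running accumulator
void Accumulate2x2(inout float4 v, float4 x, float4 k)
{
    v.x = mad(k.x, x.x, v.x);  v.x = mad(k.z, x.y, v.x);   // O[n0,k0] += X[n0,:] . K[:,k0]
    v.y = mad(k.y, x.x, v.y);  v.y = mad(k.w, x.y, v.y);   // O[n0,k1] += X[n0,:] . K[:,k1]
    v.z = mad(k.x, x.z, v.z);  v.z = mad(k.z, x.w, v.z);   // O[n1,k0] += X[n1,:] . K[:,k0]
    v.w = mad(k.y, x.z, v.w);  v.w = mad(k.w, x.w, v.w);   // O[n1,k1] += X[n1,:] . K[:,k1]
}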
kernel Dense64 -#pragma kernel Relu -#pragma kernel Relu256xV -#pragma kernel Relu16x16 -#pragma kernel ReluChannelsFirst16x2x16 -#pragma kernel Relu_Cmod16_CNyx -#pragma kernel Relu_Nyxc -#pragma kernel Softmax -#pragma kernel Softmax256x2 -#pragma kernel MaxPooling2D -#pragma kernel MaxPooling2D16x4x4 -*/ -/* -#pragma kernel Conv2D_Kernel3x3_32Channel -#pragma kernel Conv2D_Kernel3x3_1Channel -#pragma kernel Conv2D -//#pragma kernel Conv2DTiled16x16_Kernel3x3 -#pragma kernel Conv2DTiled14x14_Kernel3x3 -#pragma kernel Conv2DTiled13x13_Kernel3x3 -//#pragma kernel Conv2DTiled12x12_Kernel3x3 -#pragma kernel Fill - -#pragma kernel Conv2D_Kernel3x3_Kmod16_Cmod4_KN -#pragma kernel Conv2D_Kernel3x3_Kmod16_Cmod4_KNyx -//#pragma kernel Conv2D_Kernel3x3_Cache_KCmod32_KNyx -//#pragma kernel Conv2D_Kernel3x3_Cache_KCmod64_KNyx -*/ - - -// @TODO: BIAS and WEIGHTS have changed format -// BIAS (0,0,x,0) -> (0,0,0,x) --> (x) -// WEIGHTS (y,0,x,0) -> (y,0,0,x) --> (y,x) -// DENSE_OUT (y,0,x,0) -> (y,0,0,x) --> (y,x) - - -//#pragma kernel Conv2D_Kmod16_Nmod8_KNY -//#pragma kernel Conv2D_Kernel3x3_64 - -#define BOUNDS_CHECKS 0 - -RWStructuredBuffer Edata; - -struct Tensor -{ - uint batch, height, width, channels; - uint offset; - uint dataLength; - - uint Index(uint b, uint h, uint w, uint ch) - { - uint index = - b * height * width * channels + - h * width * channels + - w * channels + - ch; - return index + offset; - } - void Set(uint b, uint h, uint w, uint ch, float v, RWStructuredBuffer data) - { - data[Index(b,h,w,ch)] = v; - } - void Set(int b, uint h, uint w, uint ch, float v, RWStructuredBuffer data, int dataLength) - { - uint index = Index(b,h,w,ch); - #if BOUNDS_CHECKS - if (index < 0 || index >= dataLength) - { - InterlockedAdd(Edata[1], 1); - return; - } - #endif - - data[Index(b,h,w,ch)] = v; - } - - float Get(uint b, uint h, uint w, uint ch, StructuredBuffer data) - { - return data[Index(b,h,w,ch)]; - } - float Get(uint b, uint h, uint w, uint ch, StructuredBuffer data, int dataLength) - { - int index = Index(b,h,w,ch); - #if BOUNDS_CHECKS - if (index < 0 || index >= dataLength) - { - InterlockedAdd(Edata[0], 1); - return 0.0f; - } - #endif - - return data[Index(b,h,w,ch)]; - } -}; - -#define X ((Tensor)Xdecl) -int4 Xdecl[2]; -StructuredBuffer Xdata; - -#define O ((Tensor)Odecl) -int4 Odecl[2]; -RWStructuredBuffer Odata; - -#define W ((Tensor)Wdecl) -int4 Wdecl[2]; - -#define B ((Tensor)Bdecl) -int4 Bdecl[2]; - -#define K ((Tensor)Kdecl) -int4 Kdecl[2]; - -#define WBK ((Tensor)WBKdecl) -int4 WBKdecl[2]; -StructuredBuffer WBKdata; - -uint _FilterSize; -uint _Border; -uint _Offset; - -[numthreads(1,1,1)] -void Dense(uint3 groupID : SV_GroupID) -{ - uint b = groupID.y; - uint x = groupID.x; - float v = B.Get(0, 0, x, 0, WBKdata, WBK.dataLength); - for (uint i = 0; i < X.width; ++i) - v += X.Get(b, 0, i, 0, Xdata) * W.Get(0, i, x, 0, WBKdata, WBK.dataLength); - - O.Set(b, 0, x, 0, v, Odata, O.dataLength); -} - -[numthreads(10,16,1)] -void Dense10x16(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - uint x = 10*groupID.x + groupThreadID.x; - uint b = 16*groupID.y + groupThreadID.y; - float v = B.Get(0, 0, x, 0, WBKdata, WBK.dataLength); - - for (uint i = 0; i < X.width;) - { - // can unroll up to 16 because numthreads.y=16 - v += X.Get(b, 0, i, 0, Xdata) * W.Get(0, i, x, 0, WBKdata, WBK.dataLength); ++i; - v += X.Get(b, 0, i, 0, Xdata) * W.Get(0, i, x, 0, WBKdata, WBK.dataLength); ++i; - v += X.Get(b, 0, i, 0, Xdata) * W.Get(0, i, x, 0, WBKdata, WBK.dataLength); ++i; - v += 
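The Tensor struct above flattens (batch, height, width, channel) coordinates into a single NHWC buffer index and, when BOUNDS_CHECKS is enabled, counts out-of-range accesses in the Edata buffer. A compact sketch of the same addressing and check, with hypothetical names so it stays self-contained:

// Flat NHWC addressing as used by Tensor.Index above; EdataSketch is a
// hypothetical error-counter buffer standing in for Edata.
RWStructuredBuffer<uint> EdataSketch;     // [0] = bad reads, [1] = bad writes

uint FlatIndexNHWC(uint b, uint h, uint w, uint ch,
                   uint height, uint width, uint channels, uint offset)
{
    // batch-major layout, channels fastest-varying
    return (((b * height + h) * width + w) * channels + ch) + offset;
}

bool CheckBounds(uint index, uint dataLength, uint counterSlot)
{
    if (index >= dataLength)
    {
        InterlockedAdd(EdataSketch[counterSlot], 1);   // record the violation
        return false;
    }
    return true;
}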
X.Get(b, 0, i, 0, Xdata) * W.Get(0, i, x, 0, WBKdata, WBK.dataLength); ++i; - - v += X.Get(b, 0, i, 0, Xdata) * W.Get(0, i, x, 0, WBKdata, WBK.dataLength); ++i; - v += X.Get(b, 0, i, 0, Xdata) * W.Get(0, i, x, 0, WBKdata, WBK.dataLength); ++i; - v += X.Get(b, 0, i, 0, Xdata) * W.Get(0, i, x, 0, WBKdata, WBK.dataLength); ++i; - v += X.Get(b, 0, i, 0, Xdata) * W.Get(0, i, x, 0, WBKdata, WBK.dataLength); ++i; - } - O.Set(b, 0, x, 0, v, Odata); -} - - -#undef THREAD_COUNT -#define THREAD_COUNT 64 // ATM support only 8x8 - -#undef BLOCK_WIDTH -#define BLOCK_WIDTH 8 - -#undef LOAD_WIDTH -#define LOAD_WIDTH THREAD_COUNT - -#undef LOAD_DEPTH -#define LOAD_DEPTH BLOCK_WIDTH - -groupshared float Conv_KcacheR[LOAD_DEPTH][LOAD_WIDTH]; -groupshared float Conv_XcacheR[LOAD_DEPTH][LOAD_WIDTH]; -[numthreads(THREAD_COUNT, 1, 1)] -void Conv2D_Kernel3x3_64(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - #define X_ Conv_XcacheR - #define K_ Conv_KcacheR - - uint id = groupThreadID.x; - uint bx = groupID.x; - uint by = groupID.y; - - uint bbx = id % BLOCK_WIDTH; - uint bby = id / BLOCK_WIDTH; - - uint width = O.width; - uint height = O.height; - - // ASSERT(LOAD_WIDTH == THREAD_COUNT) - uint loadNYX = by*LOAD_WIDTH + id; // only works for 8x8 - uint loadX = loadNYX % width; - uint loadNY = loadNYX / width; - uint loadY = loadNY % height; - uint loadN = loadNY / height; - - float v[BLOCK_WIDTH][BLOCK_WIDTH]; - for (uint yy = 0; yy < BLOCK_WIDTH; ++yy) - for (uint xx = 0; xx < BLOCK_WIDTH; ++xx) - { - float bias = B.Get(0, 0, bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + xx, 0, WBKdata, WBK.dataLength); - v[yy][xx] = bias; - } - - for (uint dy = 0; dy < 3; ++dy) - { - bool mask = true; - - if (loadY+dy < _Offset) mask = false; - if (loadY+dy-_Offset >= X.height) mask = false; - - for (uint dx = 0; dx < 3; ++dx) - { - if (loadX+dx < _Offset) mask = false; - if (loadX+dx-_Offset >= X.width) mask = false; - - for (uint m = 0; m < X.channels/LOAD_DEPTH; ++m) - { - for (uint q = 0; q < LOAD_DEPTH; ++q) - { - if (mask) - X_[q][id] = X.Get(loadN, loadY+dy-_Offset, loadX+dx-_Offset, m*LOAD_DEPTH + q, Xdata); - else - X_[q][id] = 0; - K_[q][id] = K.Get(dy, dx, m*LOAD_DEPTH + q, bx*LOAD_WIDTH + id, WBKdata, WBK.dataLength); - } - - GroupMemoryBarrierWithGroupSync(); - - for (uint yyy = 0; yyy < BLOCK_WIDTH; ++yyy) - [unroll] for (uint xxx = 0; xxx < BLOCK_WIDTH; ++xxx) - [unroll] for (uint i = 0; i < LOAD_DEPTH; ++i) - { - v[yyy][xxx] += X_[i][bby*BLOCK_WIDTH + yyy] * K_[i][bbx*BLOCK_WIDTH + xxx]; - } - - GroupMemoryBarrierWithGroupSync(); - } - } - } - - for (uint yyy = 0; yyy < BLOCK_WIDTH; ++yyy) - for (uint xxx = 0; xxx < BLOCK_WIDTH; ++xxx) - { - //O.Set(by*LOAD_WIDTH + bby*BLOCK_WIDTH + yyy, y, x, bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + xxx, v[yyy][xxx], Odata); - uint saveNYX = by*LOAD_WIDTH + bby*BLOCK_WIDTH + yyy; - //uint saveNYX = by*LOAD_WIDTH + ((id>>3)<<3) + yyy; - uint saveX = saveNYX % width; - uint saveNY = saveNYX / width; - uint saveY = saveNY % height; - uint saveN = saveNY / height; - - uint saveK = bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + xxx; - O.Set(saveN, saveY, saveX, saveK, v[yyy][xxx], Odata); - } - - #undef X_ - #undef K_ -} - - -#undef THREAD_COUNT -#define THREAD_COUNT 64 // ATM support only 8x8 - -#undef BLOCK_WIDTH -#define BLOCK_WIDTH 8 - -#undef LOAD_WIDTH -#define LOAD_WIDTH THREAD_COUNT - -#undef LOAD_DEPTH -#define LOAD_DEPTH BLOCK_WIDTH - -#if 1 - -groupshared float DenseTiled_XcacheR[32][LOAD_WIDTH]; -groupshared float DenseTiled_WcacheR[LOAD_DEPTH][LOAD_WIDTH]; - 
-[numthreads(THREAD_COUNT, 1, 1)] -void Dense64(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - #define X_ DenseTiled_XcacheR - #define W_ DenseTiled_WcacheR - - uint id = groupThreadID.x; - uint bx = groupID.x; - uint by = groupID.y; - - uint bbx = id % BLOCK_WIDTH; - uint bby = id / BLOCK_WIDTH; - - float v[BLOCK_WIDTH][BLOCK_WIDTH]; - for (uint yy = 0; yy < BLOCK_WIDTH; ++yy) - for (uint xx = 0; xx < BLOCK_WIDTH; ++xx) - { - float bias = B.Get(0, 0, bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + xx, 0, WBKdata, WBK.dataLength); - v[yy][xx] = bias; - } - - for (uint m = 0; m < X.width/LOAD_DEPTH; ++m) - { - for (uint q = 0; q < LOAD_DEPTH; ++q) - { - X_[q][id] = X.Get(by*LOAD_WIDTH + id, 0, m*LOAD_DEPTH + q, 0, Xdata); - W_[q][id] = W.Get(0, m*LOAD_DEPTH + q, bx*LOAD_WIDTH + id, 0, WBKdata, WBK.dataLength); - } - - GroupMemoryBarrierWithGroupSync(); - - for (uint yyy = 0; yyy < BLOCK_WIDTH; ++yyy) - { - X_[yyy][id] = X.Get(by*LOAD_WIDTH + id, 0, m*LOAD_DEPTH + yyy, 0, Xdata); - [unroll] for (uint xxx = 0; xxx < BLOCK_WIDTH; ++xxx) - [unroll] for (uint i = 0; i < LOAD_DEPTH; ++i) - { - v[yyy][xxx] += X_[i][bby*BLOCK_WIDTH + yyy] * W_[i][bbx*BLOCK_WIDTH + xxx]; - } - } - - GroupMemoryBarrierWithGroupSync(); - } - - for (uint yyy = 0; yyy < BLOCK_WIDTH; ++yyy) - for (uint xxx = 0; xxx < BLOCK_WIDTH; ++xxx) - O.Set(by*LOAD_WIDTH + bby*BLOCK_WIDTH + yyy, 0, bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + xxx, 0, v[yyy][xxx], Odata); - - #undef X_ - #undef W_ -} - -#elif 1 -groupshared float DenseTiled_XcacheR[LOAD_DEPTH][LOAD_WIDTH]; -groupshared float DenseTiled_WcacheR[LOAD_DEPTH][LOAD_WIDTH]; - -[numthreads(THREAD_COUNT, 1, 1)] -void Dense64(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - #define X_ DenseTiled_XcacheR - #define W_ DenseTiled_WcacheR - - uint id = groupThreadID.x; - uint bx = groupID.x; - uint by = groupID.y; - - uint bbx = id % BLOCK_WIDTH; - uint bby = id / BLOCK_WIDTH; - - float v[BLOCK_WIDTH][BLOCK_WIDTH]; - for (uint yy = 0; yy < BLOCK_WIDTH; ++yy) - for (uint xx = 0; xx < BLOCK_WIDTH; ++xx) - { - float bias = B.Get(0, 0, bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + xx, 0, WBKdata, WBK.dataLength); - v[yy][xx] = bias; - } - - for (uint m = 0; m < X.width/LOAD_DEPTH; ++m) - { - for (uint q = 0; q < LOAD_DEPTH; ++q) - { - X_[q][id] = X.Get(by*LOAD_WIDTH + id, 0, m*LOAD_DEPTH + q, 0, Xdata); - W_[q][id] = W.Get(0, m*LOAD_DEPTH + q, bx*LOAD_WIDTH + id, 0, WBKdata, WBK.dataLength); - } - - GroupMemoryBarrierWithGroupSync(); - - for (uint yyy = 0; yyy < BLOCK_WIDTH; ++yyy) - [unroll] for (uint xxx = 0; xxx < BLOCK_WIDTH; ++xxx) - [unroll] for (uint i = 0; i < LOAD_DEPTH; ++i) - { - //v[yyy][xxx] += X_[i][bby*BLOCK_WIDTH + yyy] * W_[i][bbx*BLOCK_WIDTH + xxx]; - v[yyy][xxx] = mad(X_[i][bby*BLOCK_WIDTH + yyy], W_[i][bbx*BLOCK_WIDTH + xxx], v[yyy][xxx]); - } - - GroupMemoryBarrierWithGroupSync(); - } - - for (uint yyy = 0; yyy < BLOCK_WIDTH; ++yyy) - for (uint xxx = 0; xxx < BLOCK_WIDTH; ++xxx) - O.Set(by*LOAD_WIDTH + bby*BLOCK_WIDTH + yyy, 0, bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + xxx, 0, v[yyy][xxx], Odata); - - #undef X_ - #undef W_ -} - -#elif 1 - -// unroll array to help some "naive" compilers to map to regs -// could be easier to lay out zigzagging patterns -groupshared float DenseTiled_XcacheR[LOAD_DEPTH][LOAD_WIDTH]; -groupshared float DenseTiled_WcacheR[LOAD_DEPTH][LOAD_WIDTH]; - -[numthreads(THREAD_COUNT, 1, 1)] -void Dense64(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - #define X_ DenseTiled_XcacheR - #define W_ DenseTiled_WcacheR - - 
uint id = groupThreadID.x; - uint bx = groupID.x; - uint by = groupID.y; - - uint bbx = id % BLOCK_WIDTH; - uint bby = id / BLOCK_WIDTH; - - //float v[BLOCK_WIDTH][BLOCK_WIDTH]; - float - v00, v01, v02, v03, v04, v05, v06, v07, - v10, v11, v12, v13, v14, v15, v16, v17, - v20, v21, v22, v23, v24, v25, v26, v27, - v30, v31, v32, v33, v34, v35, v36, v37, - v40, v41, v42, v43, v44, v45, v46, v47, - v50, v51, v52, v53, v54, v55, v56, v57, - v60, v61, v62, v63, v64, v65, v66, v67, - v70, v71, v72, v73, v74, v75, v76, v77; - - float b0 = B.Get(0, 0, bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + 0, 0, WBKdata, WBK.dataLength); - float b1 = B.Get(0, 0, bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + 1, 0, WBKdata, WBK.dataLength); - float b2 = B.Get(0, 0, bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + 2, 0, WBKdata, WBK.dataLength); - float b3 = B.Get(0, 0, bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + 3, 0, WBKdata, WBK.dataLength); - float b4 = B.Get(0, 0, bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + 4, 0, WBKdata, WBK.dataLength); - float b5 = B.Get(0, 0, bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + 5, 0, WBKdata, WBK.dataLength); - float b6 = B.Get(0, 0, bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + 6, 0, WBKdata, WBK.dataLength); - float b7 = B.Get(0, 0, bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + 7, 0, WBKdata, WBK.dataLength); - - #define L_(y, x) v##y##x = b##x - L_(0,0); L_(0,1); L_(0,2); L_(0,3); L_(0,4); L_(0,5); L_(0,6); L_(0,7); - L_(1,0); L_(1,1); L_(1,2); L_(1,3); L_(1,4); L_(1,5); L_(1,6); L_(1,7); - L_(2,0); L_(2,1); L_(2,2); L_(2,3); L_(2,4); L_(2,5); L_(2,6); L_(2,7); - L_(3,0); L_(3,1); L_(3,2); L_(3,3); L_(3,4); L_(3,5); L_(3,6); L_(3,7); - L_(4,0); L_(4,1); L_(4,2); L_(4,3); L_(4,4); L_(4,5); L_(4,6); L_(4,7); - L_(5,0); L_(5,1); L_(5,2); L_(5,3); L_(5,4); L_(5,5); L_(5,6); L_(5,7); - L_(6,0); L_(6,1); L_(6,2); L_(6,3); L_(6,4); L_(6,5); L_(6,6); L_(6,7); - L_(7,0); L_(7,1); L_(7,2); L_(7,3); L_(7,4); L_(7,5); L_(7,6); L_(7,7); - #undef L_ - - for (uint m = 0; m < X.width/LOAD_DEPTH; ++m) - { - for (uint q = 0; q < LOAD_DEPTH; ++q) - { - X_[q][id] = X.Get(by*LOAD_WIDTH + id, 0, m*LOAD_DEPTH + q, 0, Xdata); - W_[q][id] = W.Get(0, m*LOAD_DEPTH + q, bx*LOAD_WIDTH + id, 0, WBKdata, WBK.dataLength); - } - - GroupMemoryBarrierWithGroupSync(); - - [unroll] for (uint i = 0; i < LOAD_DEPTH; ++i) - { - //v[yyy][xxx] += X_[i][bby*BLOCK_WIDTH + yyy] * W_[i][bbx*BLOCK_WIDTH + xxx]; - #define XW_(y, x) v##y##x += X_[i][bby*BLOCK_WIDTH + ##y] * W_[i][bbx*BLOCK_WIDTH + ##x] - XW_(0,0); XW_(0,1); XW_(0,2); XW_(0,3); XW_(0,4); XW_(0,5); XW_(0,6); XW_(0,7); - XW_(1,0); XW_(1,1); XW_(1,2); XW_(1,3); XW_(1,4); XW_(1,5); XW_(1,6); XW_(1,7); - XW_(2,0); XW_(2,1); XW_(2,2); XW_(2,3); XW_(2,4); XW_(2,5); XW_(2,6); XW_(2,7); - XW_(3,0); XW_(3,1); XW_(3,2); XW_(3,3); XW_(3,4); XW_(3,5); XW_(3,6); XW_(3,7); - XW_(4,0); XW_(4,1); XW_(4,2); XW_(4,3); XW_(4,4); XW_(4,5); XW_(4,6); XW_(4,7); - XW_(5,0); XW_(5,1); XW_(5,2); XW_(5,3); XW_(5,4); XW_(5,5); XW_(5,6); XW_(5,7); - XW_(6,0); XW_(6,1); XW_(6,2); XW_(6,3); XW_(6,4); XW_(6,5); XW_(6,6); XW_(6,7); - XW_(7,0); XW_(7,1); XW_(7,2); XW_(7,3); XW_(7,4); XW_(7,5); XW_(7,6); XW_(7,7); - #undef XW_ - } - - GroupMemoryBarrierWithGroupSync(); - } - - #define S_(a, b) O.Set(by*LOAD_WIDTH + bby*BLOCK_WIDTH + ##a, 0, bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + ##b, 0, v##a##b, Odata) - S_(0,0); S_(0,1); S_(0,2); S_(0,3); S_(0,4); S_(0,5); S_(0,6); S_(0,7); - S_(1,0); S_(1,1); S_(1,2); S_(1,3); S_(1,4); S_(1,5); S_(1,6); S_(1,7); - S_(2,0); S_(2,1); S_(2,2); S_(2,3); S_(2,4); S_(2,5); S_(2,6); S_(2,7); - S_(3,0); S_(3,1); S_(3,2); S_(3,3); S_(3,4); S_(3,5); S_(3,6); 
S_(3,7); - S_(4,0); S_(4,1); S_(4,2); S_(4,3); S_(4,4); S_(4,5); S_(4,6); S_(4,7); - S_(5,0); S_(5,1); S_(5,2); S_(5,3); S_(5,4); S_(5,5); S_(5,6); S_(5,7); - S_(6,0); S_(6,1); S_(6,2); S_(6,3); S_(6,4); S_(6,5); S_(6,6); S_(6,7); - S_(7,0); S_(7,1); S_(7,2); S_(7,3); S_(7,4); S_(7,5); S_(7,6); S_(7,7); - #undef S_ - - #undef X_ - #undef W_ -} - -#elif 1 - -groupshared float DenseTiled_XcacheR[2][LOAD_DEPTH][LOAD_WIDTH]; -groupshared float DenseTiled_WcacheR[2][LOAD_DEPTH][LOAD_WIDTH]; - -[numthreads(THREAD_COUNT, 1, 1)] -void Dense64(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - #define X_ DenseTiled_XcacheR - #define W_ DenseTiled_WcacheR - - uint id = groupThreadID.x; - uint bx = groupID.x; - uint by = groupID.y; - - uint bbx = id % BLOCK_WIDTH; - uint bby = id / BLOCK_WIDTH; - - float v[BLOCK_WIDTH][BLOCK_WIDTH]; - for (uint yy = 0; yy < BLOCK_WIDTH; ++yy) - [unroll] for (uint xx = 0; xx < BLOCK_WIDTH; ++xx) - { - float bias = B.Get(0, 0, bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + xx, 0, WBKdata, WBK.dataLength); - v[yy][xx] = bias; - } - - uint m = 0; - for (uint q = 0; q < LOAD_DEPTH; ++q) - { - X_[0][q][id] = X.Get(by*LOAD_WIDTH + id, 0, m*LOAD_DEPTH + q, 0, Xdata); - W_[0][q][id] = W.Get(0, m*LOAD_DEPTH + q, bx*LOAD_WIDTH + id, 0, WBKdata, WBK.dataLength); - } - GroupMemoryBarrierWithGroupSync(); - - ++m; - - for (; m < X.width/LOAD_DEPTH; ++m) - { - for (uint q = 0; q < LOAD_DEPTH; ++q) - { - X_[1][q][id] = X.Get(by*LOAD_WIDTH + id, 0, m*LOAD_DEPTH + q, 0, Xdata); - W_[1][q][id] = W.Get(0, m*LOAD_DEPTH + q, bx*LOAD_WIDTH + id, 0, WBKdata, WBK.dataLength); - } - - for (uint yyy = 0; yyy < BLOCK_WIDTH; ++yyy) - [unroll] for (uint xxx = 0; xxx < BLOCK_WIDTH; ++xxx) - [unroll] - for (uint i = 0; i < LOAD_DEPTH; ++i) - { - v[yyy][xxx] += X_[0][i][bby*BLOCK_WIDTH + yyy] * W_[0][i][bbx*BLOCK_WIDTH + xxx]; - } - - ++m; - GroupMemoryBarrierWithGroupSync(); - - if (m < X.width/LOAD_DEPTH) - { - for (uint q = 0; q < LOAD_DEPTH; ++q) - { - X_[0][q][id] = X.Get(by*LOAD_WIDTH + id, 0, m*LOAD_DEPTH + q, 0, Xdata); - W_[0][q][id] = W.Get(0, m*LOAD_DEPTH + q, bx*LOAD_WIDTH + id, 0, WBKdata, WBK.dataLength); - } - } - - for (uint yyy = 0; yyy < BLOCK_WIDTH; ++yyy) - [unroll] for (uint xxx = 0; xxx < BLOCK_WIDTH; ++xxx) - [unroll] - for (uint i = 0; i < LOAD_DEPTH; ++i) - { - v[yyy][xxx] += X_[1][i][bby*BLOCK_WIDTH + yyy] * W_[1][i][bbx*BLOCK_WIDTH + xxx]; - } - GroupMemoryBarrierWithGroupSync(); - } - - for (uint yyy = 0; yyy < BLOCK_WIDTH; ++yyy) - [unroll] for (uint xxx = 0; xxx < BLOCK_WIDTH; ++xxx) - O.Set(by*LOAD_WIDTH + bby*BLOCK_WIDTH + yyy, 0, bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + xxx, 0, v[yyy][xxx], Odata); - - #undef X_ - #undef W_ -} - -#else - -groupshared float DenseTiled_XcacheR[LOAD_DEPTH][LOAD_WIDTH]; -groupshared float DenseTiled_WcacheR[LOAD_DEPTH][LOAD_WIDTH]; - -[numthreads(THREAD_COUNT, 1, 1)] -void Dense64(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - #define X_ DenseTiled_XcacheR - #define W_ DenseTiled_WcacheR - - uint id = groupThreadID.x; - uint bx = groupID.x; - uint by = groupID.y; - - uint n = by * LOAD_WIDTH + id; - uint x = bx * LOAD_WIDTH + id; - - float v[LOAD_WIDTH]; - float bias = B.Get(0, 0, x, 0, WBKdata, WBK.dataLength); - [unroll] for (uint xx = 0; xx < LOAD_WIDTH; ++xx) - v[xx] = bias; - - for (uint m = 0; m < X.width/LOAD_DEPTH; ++m) - { - float ww[LOAD_DEPTH]; - for (uint q = 0; q < LOAD_DEPTH; ++q) - { - X_[q][id] = X.Get(n, 0, m*LOAD_DEPTH + q, 0, Xdata); - //W_[q][id] = W.Get(0, m*LOAD_DEPTH + q, x, 0, WBKdata, 
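The double-buffered Dense64 variant above prefetches the next tile into one groupshared buffer while the current tile is consumed from the other, with barriers guarding the swap points. The kernel below is a toy 1-D reduction that shows only that ping-pong structure; the buffer names, tile count, and the trivial accumulation are assumptions for illustration, not part of the patch.

// Ping-pong (double-buffered) groupshared loads over numTiles tiles of
// TILE_DB elements each; only the buffer-swap/barrier pattern matters here.
#define TILE_DB 64
groupshared float Cache[2][TILE_DB];

StructuredBuffer<float>   In;    // assumed input, numTiles * TILE_DB elements per group
RWStructuredBuffer<float> Out;   // assumed output, TILE_DB elements per group

[numthreads(TILE_DB, 1, 1)]
void ReduceDoubleBuffered(uint3 gtid : SV_GroupThreadID, uint3 gid : SV_GroupID)
{
    uint t = gtid.x;
    uint numTiles = 8;                       // assumed tile count for illustration
    float acc = 0;

    Cache[0][t] = In[0 * TILE_DB + t];       // prefetch the first tile
    GroupMemoryBarrierWithGroupSync();

    for (uint m = 1; m <= numTiles; ++m)
    {
        uint cur = (m - 1) & 1;
        uint nxt = m & 1;
        if (m < numTiles)                    // prefetch the next tile into the other buffer
            Cache[nxt][t] = In[m * TILE_DB + t];

        // consume the current buffer (reads another thread's element,
        // which is why the barriers are required)
        acc += Cache[cur][(TILE_DB - 1) - t];

        GroupMemoryBarrierWithGroupSync();   // next tile fully loaded, current fully read
    }
    Out[gid.x * TILE_DB + t] = acc;
}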
WBK.dataLength); - ww[q] = W.Get(0, m*LOAD_DEPTH + q, x, 0, WBKdata, WBK.dataLength); - } - - GroupMemoryBarrierWithGroupSync(); - - for (uint w = 0; w < LOAD_WIDTH; ++w) - { - [unroll] - for (uint i = 0; i < LOAD_DEPTH; ++i) - { - //v[w] += X_[i][w] * W_[i][id]; - v[w] += X_[i][w] * ww[i]; - } - } - - GroupMemoryBarrierWithGroupSync(); - } - - [unroll] for ( xx = 0; xx < LOAD_WIDTH; ++xx) - O.Set(by * LOAD_WIDTH + xx, 0, x, 0, v[xx], Odata); - - #undef X_ - #undef W_ -} -#endif - -#if 1 -#undef TILE_WIDTH -#define TILE_WIDTH 16 -groupshared float DenseTiled_Xcache64[16][TILE_WIDTH*TILE_WIDTH]; -groupshared float DenseTiled_Wcache64[16][TILE_WIDTH*TILE_WIDTH]; -[numthreads(TILE_WIDTH,TILE_WIDTH,1)] -void DenseTiled64x64(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - #define X_ DenseTiled_Xcache64 - #define W_ DenseTiled_Wcache64 - - uint tx = groupThreadID.x; - uint ty = groupThreadID.y; - uint x = groupID.x*TILE_WIDTH + tx; - uint n = groupID.y*TILE_WIDTH + ty; - - float b0 = B.Get(0, 0, x*4+0, 0, WBKdata, WBK.dataLength); - float b1 = B.Get(0, 0, x*4+1, 0, WBKdata, WBK.dataLength); - float b2 = B.Get(0, 0, x*4+2, 0, WBKdata, WBK.dataLength); - float b3 = B.Get(0, 0, x*4+3, 0, WBKdata, WBK.dataLength); - - float4 v0, v1, v2, v3; - v0 = v1 = v2 = v3 = float4(b0, b1, b2, b3); - - for (uint m = 0; m < X.width/(TILE_WIDTH*4); ++m) - { - for (uint yy = 0; yy < 4; ++yy) - for (uint xx = 0; xx < 4; ++xx) - { - X_[yy*4+xx][ty*TILE_WIDTH+tx] = X.Get(n*4+yy, 0, (m*TILE_WIDTH + tx)*4+xx, 0, Xdata); - W_[yy*4+xx][ty*TILE_WIDTH+tx] = W.Get(0, (m*TILE_WIDTH + ty)*4+yy, x*4+xx, 0, WBKdata, WBK.dataLength); - } - - GroupMemoryBarrierWithGroupSync(); - - //[unroll] - for (uint i = 0; i < TILE_WIDTH; ++i) - { - [unroll] - for (uint q = 0; q < 4; ++q) - { - float x0 = X_[0*4+q][ty*TILE_WIDTH+i]; - float x1 = X_[1*4+q][ty*TILE_WIDTH+i]; - float x2 = X_[2*4+q][ty*TILE_WIDTH+i]; - float x3 = X_[3*4+q][ty*TILE_WIDTH+i]; - - float w0 = W_[q*4+0][i*TILE_WIDTH+tx]; - float w1 = W_[q*4+1][i*TILE_WIDTH+tx]; - float w2 = W_[q*4+2][i*TILE_WIDTH+tx]; - float w3 = W_[q*4+3][i*TILE_WIDTH+tx]; - - v0.x = mad(x0, w0, v0.x); //-- - v1.x = mad(x1, w0, v1.x); - v2.x = mad(x2, w0, v2.x); - v3.x = mad(x3, w0, v3.x); - v0.y = mad(x0, w1, v0.y); //-- - v1.y = mad(x1, w1, v1.y); - v2.y = mad(x2, w1, v2.y); - v3.y = mad(x3, w1, v3.y); - v0.z = mad(x0, w2, v0.z); //-- - v1.z = mad(x1, w2, v1.z); - v2.z = mad(x2, w2, v2.z); - v3.z = mad(x3, w2, v3.z); - v0.w = mad(x0, w3, v0.w); //-- - v1.w = mad(x1, w3, v1.w); - v2.w = mad(x2, w3, v2.w); - v3.w = mad(x3, w3, v3.w); - } - - GroupMemoryBarrierWithGroupSync(); - } - } - - O.Set(n*4+0, 0, x*4+0, 0, v0.x, Odata); - O.Set(n*4+0, 0, x*4+1, 0, v0.y, Odata); - O.Set(n*4+0, 0, x*4+2, 0, v0.z, Odata); - O.Set(n*4+0, 0, x*4+3, 0, v0.w, Odata); - - O.Set(n*4+1, 0, x*4+0, 0, v1.x, Odata); - O.Set(n*4+1, 0, x*4+1, 0, v1.y, Odata); - O.Set(n*4+1, 0, x*4+2, 0, v1.z, Odata); - O.Set(n*4+1, 0, x*4+3, 0, v1.w, Odata); - - O.Set(n*4+2, 0, x*4+0, 0, v2.x, Odata); - O.Set(n*4+2, 0, x*4+1, 0, v2.y, Odata); - O.Set(n*4+2, 0, x*4+2, 0, v2.z, Odata); - O.Set(n*4+2, 0, x*4+3, 0, v2.w, Odata); - - O.Set(n*4+3, 0, x*4+0, 0, v3.x, Odata); - O.Set(n*4+3, 0, x*4+1, 0, v3.y, Odata); - O.Set(n*4+3, 0, x*4+2, 0, v3.z, Odata); - O.Set(n*4+3, 0, x*4+3, 0, v3.w, Odata); - - #undef X_ - #undef W_ -} - -#else - -#define TILE_WIDTH 16 -#define RTILE 4 -groupshared float DenseTiled_Xcache64[RTILE*RTILE][TILE_WIDTH*TILE_WIDTH]; -groupshared float 
DenseTiled_Wcache64[RTILE*RTILE][TILE_WIDTH*TILE_WIDTH]; -[numthreads(TILE_WIDTH,TILE_WIDTH,1)] -void DenseTiled64x64(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - #define X_ DenseTiled_Xcache64 - #define W_ DenseTiled_Wcache64 - - uint tx = groupThreadID.x; - uint ty = groupThreadID.y; - uint x = groupID.x*TILE_WIDTH + tx; - uint n = groupID.y*TILE_WIDTH + ty; - - float v[RTILE*RTILE]; - [unroll] for (uint xxxx = 0; xxxx < RTILE; ++xxxx) - { - float b = B.Get(0, 0, x*RTILE+xxxx, 0, WBKdata, WBK.dataLength); - [unroll] for (uint yyyy = 0; yyyy < RTILE; ++yyyy) - v[yyyy*RTILE+xxxx] = b; - } - - for (uint m = 0; m < X.width/(TILE_WIDTH*RTILE); ++m) - { - for (uint yy = 0; yy < RTILE; ++yy) - [unroll] for (uint xx = 0; xx < RTILE; ++xx) - { - X_[yy*RTILE+xx][ty*TILE_WIDTH+tx] = X.Get(n*RTILE+yy, 0, (m*TILE_WIDTH + tx)*RTILE+xx, 0, Xdata); - W_[yy*RTILE+xx][ty*TILE_WIDTH+tx] = W.Get(0, (m*TILE_WIDTH + ty)*RTILE+yy, x*RTILE+xx, 0, WBKdata, WBK.dataLength); - } - GroupMemoryBarrierWithGroupSync(); - - for (uint ii = 0; ii < TILE_WIDTH; ++ii) - { - [unroll] for (uint yy = 0; yy < RTILE; ++yy) - [unroll] for (uint xx = 0; xx < RTILE; ++xx) - [unroll] for (uint i = 0; i < RTILE; ++i) - { - float x = X_[yy*RTILE+i][ty*TILE_WIDTH+ii]; - float w = W_[i*RTILE+xx][ii*TILE_WIDTH+tx]; - v[yy*RTILE+xx] = mad(x, w, v[yy*RTILE+xx]); - } - - GroupMemoryBarrierWithGroupSync(); - } - } - - [unroll] for (uint yy = 0; yy < RTILE; ++yy) - [unroll] for (uint xx = 0; xx < RTILE; ++xx) - O.Set(n*RTILE+yy, 0, x*RTILE+xx, 0, v[yy*RTILE+xx], Odata); - - #undef X_ - #undef W_ -} - -#endif - -#undef TILE_WIDTH -#define TILE_WIDTH 16 // 32 crashes on MacBookPro/AMD -groupshared float DenseTiled_Xcache32[4][TILE_WIDTH][TILE_WIDTH]; -groupshared float DenseTiled_Wcache32[4][TILE_WIDTH][TILE_WIDTH]; -[numthreads(TILE_WIDTH,TILE_WIDTH,1)] -void DenseTiled32x32(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - #define X_ DenseTiled_Xcache32 - #define W_ DenseTiled_Wcache32 - - uint tx = groupThreadID.x; - uint ty = groupThreadID.y; - uint x = groupID.x*TILE_WIDTH + tx; - uint n = groupID.y*TILE_WIDTH + ty; - - float b0 = B.Get(0, 0, x*2+0, 0, WBKdata, WBK.dataLength); - float b1 = B.Get(0, 0, x*2+1, 0, WBKdata, WBK.dataLength); - float4 v = float4(b0, b1, - b0, b1); - - for (uint m = 0; m < X.width/(TILE_WIDTH*2);) - { - // @TODO: read in float2s - float x0 = X.Get(n*2+0, 0, m*TILE_WIDTH*2 + tx*2+0, 0, Xdata); - float x1 = X.Get(n*2+0, 0, m*TILE_WIDTH*2 + tx*2+1, 0, Xdata); - float x2 = X.Get(n*2+1, 0, m*TILE_WIDTH*2 + tx*2+0, 0, Xdata); - float x3 = X.Get(n*2+1, 0, m*TILE_WIDTH*2 + tx*2+1, 0, Xdata); - - float w0 = W.Get(0, m*TILE_WIDTH*2 + ty*2+0, x*2+0, 0, WBKdata, WBK.dataLength); - float w1 = W.Get(0, m*TILE_WIDTH*2 + ty*2+0, x*2+1, 0, WBKdata, WBK.dataLength); - float w2 = W.Get(0, m*TILE_WIDTH*2 + ty*2+1, x*2+0, 0, WBKdata, WBK.dataLength); - float w3 = W.Get(0, m*TILE_WIDTH*2 + ty*2+1, x*2+1, 0, WBKdata, WBK.dataLength); - - ++m; - - X_[0][ty][tx] = x0; - X_[1][ty][tx] = x1; - X_[2][ty][tx] = x2; - X_[3][ty][tx] = x3; - - W_[0][ty][tx] = w0; - W_[1][ty][tx] = w1; - W_[2][ty][tx] = w2; - W_[3][ty][tx] = w3; - - GroupMemoryBarrierWithGroupSync(); - - [unroll] - for (uint i = 0; i < TILE_WIDTH; ++i) - { - float4 x = //X_[ty][i]; - float4( X_[0][ty][i], - X_[1][ty][i], - X_[2][ty][i], - X_[3][ty][i]); - float4 w = //W_[i][tx]; - float4( W_[0][i][tx], - W_[1][i][tx], - W_[2][i][tx], - W_[3][i][tx]); - - v.x = mad(w.x, x.x, v.x); - v.y = mad(w.y, x.x, v.y); - v.z = mad(w.x, 
x.z, v.z); - v.w = mad(w.y, x.z, v.w); - - v.x = mad(w.z, x.y, v.x); - v.y = mad(w.w, x.y, v.y); - v.z = mad(w.z, x.w, v.z); - v.w = mad(w.w, x.w, v.w); - - //v.x += k.x*x.x + k.z*x.y; - //v.y += k.y*x.x + k.w*x.y; - //v.z += k.x*x.z + k.z*x.w; - //v.w += k.y*x.z + k.w*x.w; - } - - GroupMemoryBarrierWithGroupSync(); - } - - O.Set(n*2+0, 0, x*2+0, 0, v.x, Odata); - O.Set(n*2+0, 0, x*2+1, 0, v.y, Odata); - O.Set(n*2+1, 0, x*2+0, 0, v.z, Odata); - O.Set(n*2+1, 0, x*2+1, 0, v.w, Odata); - - #undef X_ - #undef W_ -} - -// sligtly faster on AMD (56ms vs 62ms) -#undef TILE_WIDTH -#define TILE_WIDTH 16 -//#define CACHE_ONLY_X -//#define TRANSPOSE_W -//#define TRANSPOSE_X -groupshared float DenseTiled_XcacheF[TILE_WIDTH][TILE_WIDTH]; -#if !defined(CACHE_ONLY_X) -groupshared float DenseTiled_WcacheF[TILE_WIDTH][TILE_WIDTH]; -#endif -[numthreads(TILE_WIDTH,TILE_WIDTH,1)] -void DenseTiled16x16_amd(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - #define X_ DenseTiled_XcacheF - #define W_ DenseTiled_WcacheF - - uint tx = groupThreadID.x; - uint ty = groupThreadID.y; - uint x = groupID.x*TILE_WIDTH + tx; - uint b = groupID.y*TILE_WIDTH + ty; - - float v = B.Get(0, 0, x, 0, WBKdata, WBK.dataLength); - - for (uint m = 0; m < X.width/TILE_WIDTH; ++m) - { - #if defined(TRANSPOSE_X) - X_[tx][ty] = X.Get(b, 0, m*TILE_WIDTH + tx, 0, Xdata); - #else - X_[ty][tx] = X.Get(b, 0, m*TILE_WIDTH + tx, 0, Xdata); - #endif - - #if defined(CACHE_ONLY_X) - float ww = WBKdata[wi]; - #else - #if defined(TRANSPOSE_W) - W_[tx][ty] = W.Get(0, m*TILE_WIDTH + ty, x, 0, WBKdata, WBK.dataLength); - #else - W_[ty][tx] = W.Get(0, m*TILE_WIDTH + ty, x, 0, WBKdata, WBK.dataLength); - #endif - #endif - GroupMemoryBarrierWithGroupSync(); - - //[unroll(groupthreads)] - [unroll] - for (uint i = 0; i < TILE_WIDTH; ++i) - { - #if defined(TRANSPOSE_X) - float x = X_[i][ty]; - #else - float x = X_[ty][i]; - #endif - - #if defined(CACHE_ONLY_X) - //float w = ww; - //if (i != TILE_WIDTH-1) { wi += W.width; ww = WBKdata[wi]; } - float w = W.Get(0, m*TILE_WIDTH + i, x, 0, WBKdata, WBK.dataLength); - #else - #if defined(TRANSPOSE_W) - float w = W_[tx][i]; - #else - float w = W_[i][tx]; - #endif - #endif - - v += x * w; - } - } - - O.Set(b, 0, x, 0, v, Odata); - - #undef X_ - #undef W_ -} - -#undef TILE_WIDTH -#define TILE_WIDTH 16 -groupshared float DenseTiled_Xcache[TILE_WIDTH][TILE_WIDTH]; -groupshared float DenseTiled_Wcache[TILE_WIDTH][TILE_WIDTH]; -[numthreads(TILE_WIDTH,TILE_WIDTH,1)] -void DenseTiled(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - #define X_ DenseTiled_Xcache - #define W_ DenseTiled_Wcache - - uint tx = groupThreadID.x; - uint ty = groupThreadID.y; - uint x = groupID.x*TILE_WIDTH + tx; - uint b = groupID.y*TILE_WIDTH + ty; - - bool mask = (x < O.width && b < O.batch); - - float v = B.Get(0, 0, x, 0, WBKdata, WBK.dataLength); - - for (uint m = 0; m < X.width/TILE_WIDTH; ++m) - { - if (mask) - { - X_[ty][tx] = X.Get(b, 0, m*TILE_WIDTH + tx, 0, Xdata); - W_[ty][tx] = W.Get(0, m*TILE_WIDTH + ty, x, 0, WBKdata, WBK.dataLength); - } - else - { - X_[ty][tx] = 0; - W_[ty][tx] = 0; - } - - GroupMemoryBarrierWithGroupSync(); - - [unroll] - for (uint i = 0; i < TILE_WIDTH; ++i) - { - v += X_[ty][i] * W_[i][tx]; - } - - GroupMemoryBarrierWithGroupSync(); - } - - if (mask) - O.Set(b, 0, x, 0, v, Odata); - - #undef X_ - #undef W_ -} - - -groupshared float DenseTiled_XcacheP[TILE_WIDTH][TILE_WIDTH]; -groupshared float DenseTiled_WcacheP[TILE_WIDTH][TILE_WIDTH]; -// Prefetch - seems to be 
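DenseTiled above is the classic shared-memory tiled dense layer: each 16x16 thread group stages a tile of activations and a tile of weights, synchronizes, accumulates the partial dot products, and repeats along the reduction dimension. A standalone sketch of that scheme, assuming flat row-major buffers, dimensions that are multiples of the tile size, and hypothetical resource names (the original reads through Tensor.Get over WBKdata):

#define TILE_D 16
StructuredBuffer<float>   Xin;    // assumed [batch, inFeatures], row-major
StructuredBuffer<float>   Win;    // assumed [inFeatures, outFeatures], row-major
StructuredBuffer<float>   Bin;    // assumed [outFeatures]
RWStructuredBuffer<float> Oout;   // assumed [batch, outFeatures]
uint _InFeatures, _OutFeatures;   // assumed multiples of TILE_D

groupshared float Xs[TILE_D][TILE_D];
groupshared float Ws[TILE_D][TILE_D];

[numthreads(TILE_D, TILE_D, 1)]
void DenseTiledSketch(uint3 gid : SV_GroupID, uint3 gtid : SV_GroupThreadID)
{
    uint tx = gtid.x, ty = gtid.y;
    uint x = gid.x * TILE_D + tx;        // output feature
    uint b = gid.y * TILE_D + ty;        // batch row

    float v = Bin[x];                    // start from the bias
    for (uint m = 0; m < _InFeatures / TILE_D; ++m)
    {
        // each thread stages one activation and one weight per tile
        Xs[ty][tx] = Xin[b * _InFeatures + (m * TILE_D + tx)];
        Ws[ty][tx] = Win[(m * TILE_D + ty) * _OutFeatures + x];
        GroupMemoryBarrierWithGroupSync();

        [unroll]
        for (uint i = 0; i < TILE_D; ++i)
            v = mad(Xs[ty][i], Ws[i][tx], v);

        GroupMemoryBarrierWithGroupSync();
    }
    Oout[b * _OutFeatures + x] = v;
}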
the same performance as DenseTiled16x16 without prefetch, has higher register pressure -[numthreads(TILE_WIDTH,TILE_WIDTH,1)] -void DenseTiledPrefetch16x16(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - #define X_ DenseTiled_XcacheP - #define W_ DenseTiled_WcacheP - - uint tx = groupThreadID.x; - uint ty = groupThreadID.y; - uint x = groupID.x*TILE_WIDTH + tx; - uint b = groupID.y*TILE_WIDTH + ty; - - float v = B.Get(0, 0, x, 0, WBKdata, WBK.dataLength); - - float Xregs[TILE_WIDTH][TILE_WIDTH]; - float Wregs[TILE_WIDTH][TILE_WIDTH]; - for (uint m = 0; m < X.width/TILE_WIDTH; ++m) - { - Xregs[ty][tx] = X.Get(b, 0, m*TILE_WIDTH + tx, 0, Xdata); - Wregs[ty][tx] = W.Get(0, m*TILE_WIDTH + ty, x, 0, WBKdata, WBK.dataLength); - GroupMemoryBarrierWithGroupSync(); - } - - for (m = 0; m < X.width/TILE_WIDTH; ++m) - { - X_[ty][tx] = Xregs[ty][tx]; - W_[ty][tx] = Wregs[ty][tx]; - - Xregs[ty][tx] = X.Get(b, 0, m*TILE_WIDTH + tx, 0, Xdata); - Wregs[ty][tx] = W.Get(0, m*TILE_WIDTH + ty, x, 0, WBKdata, WBK.dataLength); - - for (uint i = 0; i < TILE_WIDTH;) - { - // can unroll up to 16 because TILE_WIDTH=16 - v += X_[ty][i] * W_[i][tx]; ++i; - v += X_[ty][i] * W_[i][tx]; ++i; - v += X_[ty][i] * W_[i][tx]; ++i; - v += X_[ty][i] * W_[i][tx]; ++i; - - v += X_[ty][i] * W_[i][tx]; ++i; - v += X_[ty][i] * W_[i][tx]; ++i; - v += X_[ty][i] * W_[i][tx]; ++i; - v += X_[ty][i] * W_[i][tx]; ++i; - } - - GroupMemoryBarrierWithGroupSync(); - } - - O.Set(b, 0, x, 0, v, Odata); - #undef X_ - #undef W_ -} - -[numthreads(1,1,1)] -void Relu(uint3 groupID : SV_GroupID) -{ - uint x = groupID.x; - uint b = groupID.y; - uint c = groupID.z; - for (uint y = 0; y < X.height; ++y) - { - float v = X.Get(b, y, x, c, Xdata, X.dataLength); - v = 0.5f * (v + abs(v)); - O.Set(b, y, x, c, v, Odata, O.dataLength); - } -} - -[numthreads(16,16,1)] -void Relu_Cmod16_CNyx(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - uint c = 16*groupID.x + groupThreadID.x; - uint nyx = 16*groupID.y + groupThreadID.y; - - uint width = X.width; - uint height = X.height; - - uint x = nyx % width; - uint ny = nyx / width; - uint y = ny % height; - uint n = ny / height; - - float v = X.Get(n, y, x, c, Xdata, X.dataLength); - v = 0.5f * (v + abs(v)); - O.Set(n, y, x, c, v, Odata, O.dataLength); -} - -[numthreads(512,1,1)] -void Relu_Nyxc(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - uint nyxc = 512*groupID.x + groupThreadID.x; - - uint width = X.width; - uint height = X.height; - uint channels = X.channels; - - uint c = nyxc % channels; - uint nyx = nyxc / channels; - uint x = nyx % width; - uint ny = nyx / width; - uint y = ny % height; - uint n = ny / height; - - float v = X.Get(n, y, x, c, Xdata, X.dataLength); - v = 0.5f * (v + abs(v)); - O.Set(n, y, x, c, v, Odata, O.dataLength); -} - -[numthreads(16,16,1)] -void Relu16x16(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - uint x = 16*groupID.x + groupThreadID.x; - uint b = 16*groupID.y + groupThreadID.y; - uint c = groupID.z; - - for (uint y = 0; y < X.height; ++y) - { - float v = X.Get(b, y, x, c, Xdata, X.dataLength); - v = 0.5f * (v + abs(v)); - O.Set(b, y, x, c, v, Odata, O.dataLength); - } -} - -[numthreads(16,16,1)] -void Relu16x16_(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - uint x = 16*groupID.x + groupThreadID.x; - uint b = 16*groupID.y + groupThreadID.y; - - for (uint y = 0; y < X.height; ++y) - { - for (uint c = 0; c < X.channels; ++c) - { - float v = X.Get(b, y, x, c, 
Xdata, X.dataLength); - v = 0.5f * (v + abs(v)); - O.Set(b, y, x, c, v, Odata, O.dataLength); - } - } -} - - -// channels, width, batch -[numthreads(16,2,16)] -void ReluChannelsFirst16x2x16(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - uint c = 16*groupID.x + groupThreadID.x; - uint x = 2*groupID.y + groupThreadID.y; - uint b = 16*groupID.z + groupThreadID.z; - - for (uint y = 0; y < X.height; ++y) - { - float v = X.Get(b, y, x, c, Xdata, X.dataLength); - v = 0.5f * (v + abs(v)); - O.Set(b, y, x, c, v, Odata, O.dataLength); - } -} - -[numthreads(256,1,1)] -void Relu256xV(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - uint x = 256*groupID.x + groupThreadID.x; - uint b = groupID.y; - uint c = groupID.z; - - for (uint y = 0; y < X.height; ++y) - { - float v = 0; - for (uint b = 0; b < X.batch; ) - { - v = X.Get(b, y, x, c, Xdata, X.dataLength); - v = 0.5f * (v + abs(v)); - O.Set(b, y, x, c, v, Odata, O.dataLength); - ++b; - - v = X.Get(b, y, x, c, Xdata, X.dataLength); - v = 0.5f * (v + abs(v)); - O.Set(b, y, x, c, v, Odata, O.dataLength); - ++b; - - v = X.Get(b, y, x, c, Xdata, X.dataLength); - v = 0.5f * (v + abs(v)); - O.Set(b, y, x, c, v, Odata, O.dataLength); - ++b; - - v = X.Get(b, y, x, c, Xdata, X.dataLength); - v = 0.5f * (v + abs(v)); - O.Set(b, y, x, c, v, Odata, O.dataLength); - ++b; - } - } -} - - -#define FLT_MAX 3.402823466e+38F - -[numthreads(1,1,1)] -void Softmax(uint3 groupID : SV_GroupID) -{ - uint b = groupID.x; - uint x = groupID.y; - - float maxV = -FLT_MAX; - for (uint i = 0; i < X.width; ++i) - { - float v = X.Get(b, 0, i, 0, Xdata, X.dataLength); - if (v > maxV) - maxV = v; - } - - float sum = 0.0f; - for (i = 0; i < X.width; ++i) - { - float v = X.Get(b, 0, i, 0, Xdata, X.dataLength); - sum += exp(v - maxV); - } - - float v = X.Get(b, 0, x, 0, Xdata, X.dataLength); - v = exp(v - maxV) / sum; - O.Set(b, 0, x, 0, v, Odata, O.dataLength); -} - -[numthreads(256,2,1)] -void Softmax256x2(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - uint b = 256*groupID.x + groupThreadID.x; - uint x = 2*groupID.y + groupThreadID.y; - - float maxV = -FLT_MAX; - for (uint i = 0; i < X.width; ++i) - { - float v = X.Get(b, 0, i, 0, Xdata, X.dataLength); - if (v > maxV) - maxV = v; - } - - float sum = 0.0f; - for (i = 0; i < X.width; ++i) - { - float v = X.Get(b, 0, i, 0, Xdata, X.dataLength); - sum += exp(v - maxV); - } - - float v = X.Get(b, 0, x, 0, Xdata, X.dataLength); - v = exp(v - maxV) / sum; - O.Set(b, 0, x, 0, v, Odata, O.dataLength); -} - -[numthreads(1,1,1)] -void MaxPooling2D(uint3 groupID : SV_GroupID) -{ - uint c = groupID.x; - uint x = groupID.y; - uint y = groupID.z; - - for (uint b = 0; b < O.batch; ++b) - { - float v0 = X.Get(b, y*2, x*2, c, Xdata, X.dataLength); - float v1 = X.Get(b, y*2+1, x*2, c, Xdata, X.dataLength); - float v2 = X.Get(b, y*2, x*2+1, c, Xdata, X.dataLength); - float v3 = X.Get(b, y*2+1, x*2+1, c, Xdata, X.dataLength); - float v = max(v0, max(v1, max(v2, v3))); - O.Set(b, y, x, c, v, Odata, O.dataLength); - } -} - -[numthreads(16,4,4)] -void MaxPooling2D16x4x4(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - uint c = 16*groupID.x + groupThreadID.x; - uint x = 4*groupID.y + groupThreadID.y; - uint y = 4*groupID.z + groupThreadID.z; - - for (uint b = 0; b < O.batch; ++b) - { - float v0 = X.Get(b, y*2, x*2, c, Xdata, X.dataLength); - float v1 = X.Get(b, y*2+1, x*2, c, Xdata, X.dataLength); - float v2 = X.Get(b, y*2, x*2+1, c, Xdata, X.dataLength); - float 
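Two details worth noting in the kernels above: ReLU is written branch-free as 0.5*(v + abs(v)), which equals max(0, v), and Softmax subtracts the row maximum before exponentiating so exp() cannot overflow. A minimal sketch of both, with hypothetical buffer names and a row-major [rows, _Width] layout assumed:

StructuredBuffer<float>   Xdata2;   // assumed input,  [rows, _Width]
RWStructuredBuffer<float> Odata2;   // assumed output, [rows, _Width]
uint _Width;                        // assumed row length

// Branch-free ReLU: max(0, v) == 0.5 * (v + |v|).
[numthreads(64, 1, 1)]
void ReluSketch(uint3 id : SV_DispatchThreadID)
{
    float v = Xdata2[id.x];
    Odata2[id.x] = 0.5f * (v + abs(v));
}

// Softmax with max-subtraction for numerical stability; like the reference
// kernel, each thread recomputes the row max and sum for its own element.
#define FLT_MAX_SKETCH 3.402823466e+38F
[numthreads(1, 1, 1)]
void SoftmaxSketch(uint3 gid : SV_GroupID)
{
    uint row = gid.x;
    uint col = gid.y;

    float maxV = -FLT_MAX_SKETCH;
    for (uint i = 0; i < _Width; ++i)
        maxV = max(maxV, Xdata2[row * _Width + i]);

    float sum = 0.0f;
    for (uint j = 0; j < _Width; ++j)
        sum += exp(Xdata2[row * _Width + j] - maxV);

    float v = Xdata2[row * _Width + col];
    Odata2[row * _Width + col] = exp(v - maxV) / sum;
}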
v3 = X.Get(b, y*2+1, x*2+1, c, Xdata, X.dataLength); - float v = max(v0, max(v1, max(v2, v3))); - O.Set(b, y, x, c, v, Odata, O.dataLength); - } -} - -[numthreads(16,16,2)] -void Conv2D_Valid(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - uint k = 16*groupID.x + groupThreadID.x; - uint n = 16*groupID.y + groupThreadID.y; - uint y = 2*groupID.z + groupThreadID.z + _FilterSize; - - //for (int y = _FilterSize; y < X.height - _FilterSize; ++y) - { - for (uint x = _FilterSize; x < X.width - _FilterSize; ++x) - { - float v = B.Get(0, 0, k, 0, WBKdata, WBK.dataLength); - for (int i = -(int)_FilterSize; i < (int)_FilterSize + 1; ++i) - { - for (int j = -(int)_FilterSize; j < (int)_FilterSize + 1; ++j) - { - for (uint c = 0; c < X.channels; ++c) - { - v += X.Get(n, y+j, x+i, c, Xdata, X.dataLength) * K.Get(_FilterSize+j, _FilterSize+i, c, k, WBKdata, WBK.dataLength); - } - } - } - O.Set(n, y-_FilterSize, x-_FilterSize, k, v, Odata, O.dataLength); - } - } -} - -[numthreads(16,8,1)] -void Conv2D_Kmod16_Nmod8_KNY(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - uint k = 16*groupID.x + groupThreadID.x; - uint n = 8*groupID.y + groupThreadID.y; - uint y = 1*groupID.z + groupThreadID.z; - - //for (int y = _FilterSize; y < X.height - _FilterSize; ++y) - { - for (uint x = 0; x < X.width - _Border; ++x) - { - float v = B.Get(0, 0, k, 0, WBKdata, WBK.dataLength); - for (uint j = 0; j < 2*_FilterSize+1; ++j) - { - if (y+j < _Offset) continue; - if (y+j-_Offset >= X.height) continue; - - for (uint i = 0; i < 2*_FilterSize+1; ++i) - { - if (x+i < _Offset) continue; - if (x+i-_Offset >= X.width) continue; - - for (uint c = 0; c < X.channels; ++c) - { - v += X.Get(n, y+j-_Offset, x+i-_Offset, c, Xdata, X.dataLength) * K.Get(j, i, c, k, WBKdata, WBK.dataLength); - } - } - } - O.Set(n, y, x, k, v, Odata, O.dataLength); - } - } -} - -[numthreads(1,1,1)] -void Conv2D(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - uint k = 1*groupID.x + groupThreadID.x; - uint n = 1*groupID.y + groupThreadID.y; - uint y = 1*groupID.z + groupThreadID.z; - - //for (int y = _FilterSize; y < X.height - _FilterSize; ++y) - { - for (uint x = 0; x < X.width - _Border; ++x) - { - float v = B.Get(0, 0, k, 0, WBKdata, WBK.dataLength); - for (uint j = 0; j < 2*_FilterSize+1; ++j) - { - if (y+j < _Offset) continue; - if (y+j-_Offset >= X.height) continue; - - for (uint i = 0; i < 2*_FilterSize+1; ++i) - { - if (x+i < _Offset) continue; - if (x+i-_Offset >= X.width) continue; - - for (uint c = 0; c < X.channels; ++c) - { - v += X.Get(n, y+j-_Offset, x+i-_Offset, c, Xdata, X.dataLength) * K.Get(j, i, c, k, WBKdata, WBK.dataLength); - } - } - } - O.Set(n, y, x, k, v, Odata, O.dataLength); - } - } -} - -#if 0 - -#define MAX_TILE_WIDTH 16 -#define KERNEL_COUNT 4 -#define KERNEL_SIZE 3 -#define KERNEL_RADIUS 1 //(KERNEL_SIZE-1)/2 -groupshared float XCcache[MAX_TILE_WIDTH+KERNEL_SIZE-1][MAX_TILE_WIDTH+KERNEL_SIZE-1]; -groupshared float Kcache[KERNEL_SIZE][KERNEL_SIZE][KERNEL_COUNT]; - -#undef TILE_WIDTH -#define TILE_WIDTH 13 -[numthreads(TILE_WIDTH,TILE_WIDTH,KERNEL_COUNT)] -void Conv2DTiled14x14_Kernel3x3(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - uint tx = groupThreadID.x; - uint ty = groupThreadID.y; - uint tk = groupThreadID.z; - uint gx = groupID.x; - uint gy = groupID.y; - uint gk = groupID.z; - uint tileCornerX = gx*TILE_WIDTH; - uint tileCornerY = gy*TILE_WIDTH; - uint x = tileCornerX + tx; - uint y = tileCornerY + ty; - uint k = 
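MaxPooling2D and MaxPooling2D16x4x4 above reduce each non-overlapping 2x2 input window to its maximum. The sketch below restates that step over an assumed flat NHWC layout; buffer and constant names are placeholders for the Tensor accessors used in the patch.

StructuredBuffer<float>   PoolIn;    // assumed [N, H, W, C]
RWStructuredBuffer<float> PoolOut;   // assumed [N, H/2, W/2, C]
uint _N, _H, _W, _C;                 // assumed input dimensions

uint IdxNHWC(uint n, uint h, uint w, uint c, uint H, uint W, uint C)
{
    return ((n * H + h) * W + w) * C + c;
}

[numthreads(16, 4, 4)]
void MaxPool2x2Sketch(uint3 id : SV_DispatchThreadID)
{
    uint c = id.x, x = id.y, y = id.z;   // same (channel, x, y) decomposition as above
    for (uint n = 0; n < _N; ++n)
    {
        float v0 = PoolIn[IdxNHWC(n, y * 2,     x * 2,     c, _H, _W, _C)];
        float v1 = PoolIn[IdxNHWC(n, y * 2 + 1, x * 2,     c, _H, _W, _C)];
        float v2 = PoolIn[IdxNHWC(n, y * 2,     x * 2 + 1, c, _H, _W, _C)];
        float v3 = PoolIn[IdxNHWC(n, y * 2 + 1, x * 2 + 1, c, _H, _W, _C)];
        PoolOut[IdxNHWC(n, y, x, c, _H / 2, _W / 2, _C)] = max(v0, max(v1, max(v2, v3)));
    }
}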
gk*KERNEL_COUNT + tk; - uint idx = ty*TILE_WIDTH + tx; - - for (uint b = 0; b < X.batch; ++b) - { - float v = B.Get(0, 0, k, 0, WBKdata, WBK.dataLength); - for (uint c = 0; c < X.channels; ++c) - { - if (tk == 0) - XCcache[ty][tx] = X.Get(b, y, x, c, Xdata); - else if (tk == 1 && idx < TILE_WIDTH * 2) - { - uint yy = idx / 2; - uint xx = idx % 2 + TILE_WIDTH; - XCcache[yy][xx] = X.Get(b, tileCornerY+yy, tileCornerX+xx, c, Xdata); - } - else if (tk == 2 && idx < (TILE_WIDTH + 2) * 2) - { - uint yy = idx / (TILE_WIDTH + 2) + TILE_WIDTH; - uint xx = idx % (TILE_WIDTH + 2); - XCcache[yy][xx] = X.Get(b, tileCornerY+yy, tileCornerX+xx, c, Xdata); - } - if (tk == 3) - { - uint kk = idx / (KERNEL_SIZE * KERNEL_SIZE); - uint kyx = idx % (KERNEL_SIZE * KERNEL_SIZE); - if (kk < KERNEL_COUNT) - { - uint yy = kyx / KERNEL_SIZE; - uint xx = kyx % KERNEL_SIZE; - Kcache[yy][xx][kk] = K.Get(yy, xx, c, gk*KERNEL_COUNT+kk, WBKdata, WBK.dataLength); - } - } - GroupMemoryBarrierWithGroupSync(); - - for (int i = 0; i < KERNEL_SIZE; ++i) - { - for (int j = 0; j < KERNEL_SIZE; ++j) - { - v += XCcache[ty+j][tx+i] * Kcache[j][i][tk]; - } - } - } - O.Set(b, y, x, k, v, Odata, O.dataLength); - GroupMemoryBarrierWithGroupSync(); - } -} - -#undef TILE_WIDTH -#define TILE_WIDTH 12 -[numthreads(TILE_WIDTH,TILE_WIDTH,KERNEL_COUNT)] -void Conv2DTiled13x13_Kernel3x3(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - uint tx = groupThreadID.x; - uint ty = groupThreadID.y; - uint tk = groupThreadID.z; - uint gx = groupID.x; - uint gy = groupID.y; - uint gk = groupID.z; - uint tileCornerX = gx*TILE_WIDTH; - uint tileCornerY = gy*TILE_WIDTH; - uint x = tileCornerX + tx; - uint y = tileCornerY + ty; - uint k = gk*KERNEL_COUNT + tk; - uint idx = ty*TILE_WIDTH + tx; - - for (uint b = 0; b < X.batch; ++b) - { - float v = B.Get(0, 0, k, 0, WBKdata, WBK.dataLength); - for (uint c = 0; c < X.channels; ++c) - { - if (tk == 0) - XCcache[ty][tx] = X.Get(b, y, x, c, Xdata); - else if (tk == 1 && idx < TILE_WIDTH * 2) - { - uint yy = idx / 2; - uint xx = idx % 2 + TILE_WIDTH; - XCcache[yy][xx] = X.Get(b, tileCornerY+yy, tileCornerX+xx, c, Xdata); - } - else if (tk == 2 && idx < (TILE_WIDTH + 2) * 2) - { - uint yy = idx / (TILE_WIDTH + 2) + TILE_WIDTH; - uint xx = idx % (TILE_WIDTH + 2); - XCcache[yy][xx] = X.Get(b, tileCornerY+yy, tileCornerX+xx, c, Xdata); - } - if (tk == 3) - { - uint kk = idx / (KERNEL_SIZE * KERNEL_SIZE); - uint kyx = idx % (KERNEL_SIZE * KERNEL_SIZE); - if (kk < KERNEL_COUNT) - { - uint yy = kyx / KERNEL_SIZE; - uint xx = kyx % KERNEL_SIZE; - Kcache[yy][xx][kk] = K.Get(yy, xx, c, gk*KERNEL_COUNT+kk, WBKdata, WBK.dataLength); - } - } - GroupMemoryBarrierWithGroupSync(); - - for (int i = 0; i < KERNEL_SIZE; ++i) - { - for (int j = 0; j < KERNEL_SIZE; ++j) - { - v += XCcache[ty+j][tx+i] * Kcache[j][i][tk]; - } - } - } - O.Set(b, y, x, k, v, Odata, O.dataLength); - GroupMemoryBarrierWithGroupSync(); - } -} - -/* -#undef TILE_WIDTH -#define TILE_WIDTH 12 -[numthreads(TILE_WIDTH,TILE_WIDTH,KERNEL_COUNT)] -void Conv2DTiled12x12_Kernel3x3(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - uint tx = groupThreadID.x; - uint ty = groupThreadID.y; - uint tk = groupThreadID.z; - uint gx = groupID.x; - uint gy = groupID.y; - uint gk = groupID.z; - uint tileCornerX = gx*TILE_WIDTH; - uint tileCornerY = gy*TILE_WIDTH; - uint x = tileCornerX + tx; - uint y = tileCornerY + ty; - uint k = gk*KERNEL_COUNT + tk; - uint idx = ty*TILE_WIDTH + tx; - - for (uint b = 0; b < X.batch; ++b) - { - float 
v = B.Get(0, 0, k, 0, WBKdata, WBK.dataLength); - for (uint c = 0; c < X.channels; ++c) - { - if (gk == 0) - XCcache[ty][tx] = X.Get(b, y, x, c, Xdata); - else if (gk == 1 && idx < TILE_WIDTH * 2) - { - uint yy = idx / 2; - uint xx = idx % 2 + TILE_WIDTH; - XCcache[yy][xx] = X.Get(b, tileCornerY+yy, tileCornerX+xx, c, Xdata); - } - else if (gk == 2 && idx < (TILE_WIDTH + 2) * 2) - { - uint yy = idx / (TILE_WIDTH + 2) + TILE_WIDTH; - uint xx = idx % (TILE_WIDTH + 2); - XCcache[yy][xx] = X.Get(b, tileCornerY+yy, tileCornerX+xx, c, Xdata); - } - else if (gk == 3 && ty < KERNEL_SIZE && tx < KERNEL_SIZE) - Kcache[ty][tx][tk] = K.Get(ty, tx, c, k, WBKdata, WBK.dataLength); - GroupMemoryBarrierWithGroupSync(); - - for (int i = 0; i < KERNEL_SIZE; ++i) - { - for (int j = 0; j < KERNEL_SIZE; ++j) - { - v += XCcache[ty+j][tx+i] * Kcache[j][i][tk]; - } - } - } - O.Set(b, y-KERNEL_RADIUS, x-KERNEL_RADIUS, k, v, Odata, O.dataLength); - GroupMemoryBarrierWithGroupSync(); - } -} -*/ - -// %TODO: only supports up to 32 channels now -#undef KERNEL_COUNT -#undef CHANNEL_COUNT -#define KERNEL_COUNT 16 -#define CHANNEL_COUNT 32 -groupshared float K2cache[CHANNEL_COUNT][KERNEL_COUNT][9]; -[numthreads(KERNEL_COUNT,CHANNEL_COUNT,1)] -void Conv2D_Kernel3x3_32Channel_Valid(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - uint tk = groupThreadID.x; - uint k = KERNEL_COUNT*groupID.x + tk; - uint n = CHANNEL_COUNT*groupID.y + groupThreadID.y; - - for (uint q = 0; q < 9; ++q) - { - uint tc = n % CHANNEL_COUNT; - K2cache[tc][tk][q] = K.Get(q/3, q%3, tc, k, WBKdata, WBK.dataLength); - } - GroupMemoryBarrierWithGroupSync(); - - for (uint y = 0; y < X.height - _FilterSize*2; ++y) - { - for (uint x = 0; x < X.width - _FilterSize*2; ++x) - { - float v = B.Get(0, 0, k, 0, WBKdata, WBK.dataLength); - for (uint q = 0; q < 9; ++q) - for (uint c = 0; c < CHANNEL_COUNT; c += 4) - { - //K.Get(q/3, q%3, c, k, WBKdata, WBK.dataLength); - v += X.Get(n, y+q/3, x+q%3, c+0, Xdata, X.dataLength) * K2cache[c+0][tk][q]; - v += X.Get(n, y+q/3, x+q%3, c+1, Xdata, X.dataLength) * K2cache[c+1][tk][q]; - v += X.Get(n, y+q/3, x+q%3, c+2, Xdata, X.dataLength) * K2cache[c+2][tk][q]; - v += X.Get(n, y+q/3, x+q%3, c+3, Xdata, X.dataLength) * K2cache[c+3][tk][q]; - } - O.Set(n, y, x, k, v, Odata, O.dataLength); - } - } -} - -[numthreads(KERNEL_COUNT,CHANNEL_COUNT,1)] -void Conv2D_Kernel3x3_32Channel(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - uint tk = groupThreadID.x; - uint k = KERNEL_COUNT*groupID.x + tk; - uint n = CHANNEL_COUNT*groupID.y + groupThreadID.y; - - for (uint q = 0; q < 9; ++q) - { - uint tc = n % CHANNEL_COUNT; - K2cache[tc][tk][q] = K.Get(q/3, q%3, tc, k, WBKdata, WBK.dataLength); - } - GroupMemoryBarrierWithGroupSync(); - - for (uint y = 0; y < X.height - _Border; ++y) - { - for (uint x = 0; x < X.width - _Border; ++x) - { - float v = B.Get(0, 0, k, 0, WBKdata, WBK.dataLength); - for (uint dy = 0; dy < 3; ++dy) - { - if (y+dy < _Offset) continue; - if (y+dy-_Offset >= X.height) continue; - for (uint dx = 0; dx < 3; ++dx) - { - if (x+dx < _Offset) continue; - if (x+dx-_Offset >= X.width) continue; - - uint q = dy*3+dx; - for (uint c = 0; c < CHANNEL_COUNT; c += 4) - { - //K.Get(q/3, q%3, c, k, WBKdata, WBK.dataLength); - v += X.Get(n, y+dy-_Offset, x+dx-_Offset, c+0, Xdata, X.dataLength) * K2cache[c+0][tk][q]; - v += X.Get(n, y+dy-_Offset, x+dx-_Offset, c+1, Xdata, X.dataLength) * K2cache[c+1][tk][q]; - v += X.Get(n, y+dy-_Offset, x+dx-_Offset, c+2, Xdata, X.dataLength) * 
K2cache[c+2][tk][q]; - v += X.Get(n, y+dy-_Offset, x+dx-_Offset, c+3, Xdata, X.dataLength) * K2cache[c+3][tk][q]; - } - } - } - O.Set(n, y, x, k, v, Odata, O.dataLength); - } - } -} - -groupshared float X2cache[2][CHANNEL_COUNT][KERNEL_COUNT]; -[numthreads(KERNEL_COUNT,CHANNEL_COUNT,1)] -void Conv2D_Kernel3x3_32Channel_(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - uint tk = groupThreadID.x; - uint tn = groupThreadID.y; - uint k = KERNEL_COUNT*groupID.x + tk; - uint n = CHANNEL_COUNT*groupID.y + tn; - - for (uint q = 0; q < 9; ++q) - { - uint tc = n % CHANNEL_COUNT; - K2cache[q][tc][tk] = K.Get(q/3, q%3, tc, k, WBKdata, WBK.dataLength); - } - //GroupMemoryBarrierWithGroupSync(); <-- unnecessary, we have one inside the loop - - for (uint y = 0; y < X.height - _FilterSize*2; ++y) - { - for (uint x = 0; x < X.width - _FilterSize*2; ++x) - { - float v = B.Get(0, 0, k, 0, WBKdata, WBK.dataLength); - for (uint cBlock = 0; cBlock < CHANNEL_COUNT; cBlock += KERNEL_COUNT) - { - for (uint q = 0; q < 9; ++q) - { - uint tc = k % KERNEL_COUNT; - X2cache[q%2][tn][tc] = X.Get(n, y+q/3, x+q%3, cBlock+tc, Xdata, X.dataLength); - GroupMemoryBarrierWithGroupSync(); - - for (tc = 0; tc < KERNEL_COUNT; ++tc) - v += X2cache[q%2][tn][tc] * K2cache[q][cBlock+tc][tk]; - } - } - O.Set(n, y, x, k, v, Odata, O.dataLength); - } - } -} - -// 16x8 => 0.101 -// 32x4 => 0.114 -// 8x8 => 0.131 - -#define PARAM_X 16 -#define PARAM_Y 8 -[numthreads(PARAM_X, PARAM_Y, 1)] -void Conv2D_Kernel3x3_Kmod16_Cmod4_KN(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - uint k = PARAM_X * groupID.x + groupThreadID.x; - uint n = PARAM_Y * groupID.y + groupThreadID.y; - - for (uint y = 0; y < X.height - _Border; ++y) - { - for (uint x = 0; x < X.width - _Border; ++x) - { - float v = B.Get(0, 0, k, 0, WBKdata, WBK.dataLength); - for (uint dy = 0; dy < 3; ++dy) - { - if (y+dy < _Offset) continue; - if (y+dy-_Offset >= X.height) continue; - for (uint dx = 0; dx < 3; ++dx) - { - if (x+dx < _Offset) continue; - if (x+dx-_Offset >= X.width) continue; - - for (uint c = 0; c < X.channels; c += 4) - { - v += X.Get(n, y+dy-_Offset, x+dx-_Offset, c+0, Xdata, X.dataLength) * K.Get(dy, dx, c+0, k, WBKdata, WBK.dataLength); - v += X.Get(n, y+dy-_Offset, x+dx-_Offset, c+1, Xdata, X.dataLength) * K.Get(dy, dx, c+1, k, WBKdata, WBK.dataLength); - v += X.Get(n, y+dy-_Offset, x+dx-_Offset, c+2, Xdata, X.dataLength) * K.Get(dy, dx, c+2, k, WBKdata, WBK.dataLength); - v += X.Get(n, y+dy-_Offset, x+dx-_Offset, c+3, Xdata, X.dataLength) * K.Get(dy, dx, c+3, k, WBKdata, WBK.dataLength); - } - } - } - O.Set(n, y, x, k, v, Odata, O.dataLength); - } - } -} -#undef PARAM_X -#undef PARAM_Y -#define PARAM_X 16 -#define PARAM_Y 8 - -// 16x8 => 0.096 -// 8x8 => 0.117 -[numthreads(PARAM_X, PARAM_Y, 1)] -void Conv2D_Kernel3x3_Kmod16_Cmod4_KNyx(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - uint k = PARAM_X * groupID.x + groupThreadID.x; - uint nyx = PARAM_Y * groupID.y + groupThreadID.y; - - uint width = X.width - _Border; - uint height = X.height - _Border; - - uint x = nyx % width; - uint ny = nyx / width; - uint y = ny % height; - uint n = ny / height; - - //for (uint y = 0; y < X.height - _Border; ++y) - //{ - // for (uint x = 0; x < X.width - _Border; ++x) - // { - float v = B.Get(0, 0, k, 0, WBKdata, WBK.dataLength); - for (uint dy = 0; dy < 3; ++dy) - { - if (y+dy < _Offset) continue; - if (y+dy-_Offset >= X.height) continue; - for (uint dx = 0; dx < 3; ++dx) - { - if (x+dx < _Offset) 
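Conv2D_Kernel3x3_Kmod16_Cmod4_KN above handles padding by letting the 3x3 tap indices run over the padded range and skipping taps whose unpadded coordinate (y+dy-_Offset, x+dx-_Offset) falls outside the input. The kernel below is a simplified restatement of that loop structure with assumed flat NHWC/HWCK layouts and placeholder dimension names; it omits the channel unrolling-by-4 and shared-memory weight cache of the originals.

StructuredBuffer<float>   ConvX;   // assumed [N, H, W, C]
StructuredBuffer<float>   ConvK;   // assumed [3, 3, C, K]
StructuredBuffer<float>   ConvB;   // assumed [K]
RWStructuredBuffer<float> ConvO;   // assumed [N, Ho, Wo, K]
uint _Nn, _Hh, _Ww, _Cc, _Kk, _Ho, _Wo;   // assumed dimensions
uint _OffsetSketch;                       // assumed symmetric padding, e.g. 1 for "same"

[numthreads(16, 8, 1)]
void Conv3x3Sketch(uint3 id : SV_DispatchThreadID)
{
    uint k = id.x;        // output channel
    uint n = id.y;        // batch index
    for (uint y = 0; y < _Ho; ++y)
    for (uint x = 0; x < _Wo; ++x)
    {
        float v = ConvB[k];
        for (uint dy = 0; dy < 3; ++dy)
        {
            if (y + dy < _OffsetSketch) continue;          // tap above the input
            if (y + dy - _OffsetSketch >= _Hh) continue;   // tap below the input
            for (uint dx = 0; dx < 3; ++dx)
            {
                if (x + dx < _OffsetSketch) continue;      // tap left of the input
                if (x + dx - _OffsetSketch >= _Ww) continue;
                for (uint c = 0; c < _Cc; ++c)
                {
                    uint xi = ((n * _Hh + (y + dy - _OffsetSketch)) * _Ww + (x + dx - _OffsetSketch)) * _Cc + c;
                    uint ki = ((dy * 3 + dx) * _Cc + c) * _Kk + k;
                    v = mad(ConvX[xi], ConvK[ki], v);
                }
            }
        }
        ConvO[((n * _Ho + y) * _Wo + x) * _Kk + k] = v;
    }
}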
continue; - if (x+dx-_Offset >= X.width) continue; - - for (uint c = 0; c < X.channels; c += 4) - { - v += X.Get(n, y+dy-_Offset, x+dx-_Offset, c+0, Xdata, X.dataLength) * K.Get(dy, dx, c+0, k, WBKdata, WBK.dataLength); - v += X.Get(n, y+dy-_Offset, x+dx-_Offset, c+1, Xdata, X.dataLength) * K.Get(dy, dx, c+1, k, WBKdata, WBK.dataLength); - v += X.Get(n, y+dy-_Offset, x+dx-_Offset, c+2, Xdata, X.dataLength) * K.Get(dy, dx, c+2, k, WBKdata, WBK.dataLength); - v += X.Get(n, y+dy-_Offset, x+dx-_Offset, c+3, Xdata, X.dataLength) * K.Get(dy, dx, c+3, k, WBKdata, WBK.dataLength); - } - } - } - O.Set(n, y, x, k, v, Odata, O.dataLength); - // } - //} -} - -#undef CTILE -#define CTILE 16 - -#undef PARAM_X -#undef PARAM_Y -#define PARAM_X CTILE -#define PARAM_Y CTILE - -#define TYPE float - -groupshared TYPE Conv_XcacheT[CTILE][CTILE]; -groupshared TYPE Conv_KcacheT[CTILE][CTILE]; - -[numthreads(PARAM_X, PARAM_Y, 1)] -void Conv2D_Kernel3x3_Cache_KCmod16_KNyx_(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - #define X_ Conv_XcacheT - #define K_ Conv_KcacheT - - uint gx = groupThreadID.x; - uint gy = groupThreadID.y; - - uint k = PARAM_X * groupID.x + groupThreadID.x; - uint nyx = PARAM_Y * groupID.y + groupThreadID.y; - - uint width = X.width - _Border; - uint height = X.height - _Border; - - uint x = nyx % width; - uint ny = nyx / width; - uint y = ny % height; - uint n = ny / height; - - //half v = B.Get(0, 0, k, 0, WBKdata, WBK.dataLength); - TYPE v = WBKdata[k + B.offset]; - for (uint dy = 0; dy < 3; ++dy) - { - bool mask = true; - - if (y+dy < _Offset) mask = false; - if (y+dy-_Offset >= X.height) mask = false; - - for (uint dx = 0; dx < 3; ++dx) - { - if (x+dx < _Offset) mask = false; - if (x+dx-_Offset >= X.width) mask = false; - - int Xi = (( n * X.height + - y+dy-_Offset ) * X.width + - x+dx-_Offset ) * X.channels + - gx; - - int Ki = (( dy * K.height + - dx ) * K.width + - /*m*CTILE +*/ gy ) * K.channels + - k + K.offset; - - for (uint m = 0; m < X.channels/CTILE; ++m) - { - if (mask) - { - //X_[gy][gx] = X.Get(n, y+dy-_Offset, x+dx-_Offset, m*CTILE + gx, Xdata); - X_[gy][gx] = Xdata[Xi + m*CTILE]; - } - else - { - X_[gy][gx] = 0; - } - //K_[gy][gx] = K.Get(dy, dx, m*CTILE + gy, k, WBKdata, WBK.dataLength); - //K_[gy][gx] = WBKdata[(( - // dy * K.height + - // dx ) * K.width + - // m*CTILE + gy ) * K.channels + - // k + K.offset]; - //K_[gy][gx] = WBKdata[Ki + m*CTILE * K.channels]; - K_[gy][gx] = WBKdata[Ki + m*CTILE * K.channels]; - GroupMemoryBarrierWithGroupSync(); - - for (uint i = 0; i < CTILE;) - { - /* - // can unroll up to CTILE - half4 x4 = ((half4[CTILE][CTILE/4])(X_))[gy][i]; - half4 k4 = ((half4[CTILE][CTILE/4])(K_))[gx][i]; - - v += dot(x4, k4); ++i; - v += dot(x4, k4); ++i; - */ - - v += X_[gy][i] * K_[i][gx]; ++i; - v += X_[gy][i] * K_[i][gx]; ++i; - v += X_[gy][i] * K_[i][gx]; ++i; - v += X_[gy][i] * K_[i][gx]; ++i; - v += X_[gy][i] * K_[i][gx]; ++i; - v += X_[gy][i] * K_[i][gx]; ++i; - v += X_[gy][i] * K_[i][gx]; ++i; - v += X_[gy][i] * K_[i][gx]; ++i; - - } - } - } - } - //O.Set(n, y, x, k, v, Odata, O.dataLength); - Odata[(( - n * O.height + - y ) * O.width + - x ) * O.channels + - k] = v; - - #undef X_ - #undef K_ -} - -#undef CTILE -#define CTILE 16 -groupshared float Conv_XcacheA[4][CTILE][CTILE]; -groupshared float Conv_Kcache0[CTILE][CTILE]; -groupshared float Conv_Kcache1[CTILE][CTILE]; -groupshared float Conv_Kcache2[CTILE][CTILE]; -groupshared float Conv_Kcache3[CTILE][CTILE]; -[numthreads(CTILE, CTILE, 1)] -void 
Conv2D_Kernel3x3_Cache_KCmod32_KNyx____(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - #define X_ Conv_XcacheA - #define K_0 Conv_Kcache0 - #define K_1 Conv_Kcache1 - #define K_2 Conv_Kcache2 - #define K_3 Conv_Kcache3 - - - - uint gx = groupThreadID.x; - uint gy = groupThreadID.y; - - uint k = CTILE * groupID.x + groupThreadID.x; - uint nyx = CTILE * groupID.y + groupThreadID.y; - - uint width = X.width - _Border; - uint height = X.height - _Border; - - uint x = nyx % width; - uint ny = nyx / width; - uint y = ny % height; - uint n = ny / height; - - float b0 = B.Get(0, 0, k*2+0, 0, WBKdata, WBK.dataLength); - float b1 = B.Get(0, 0, k*2+1, 0, WBKdata, WBK.dataLength); - float4 v = float4(b0, b1, - b0, b1); - - for (uint dy = 0; dy < 3; ++dy) - { - bool mask = true; - - if (y+dy < _Offset) mask = false; - if (y+dy-_Offset >= X.height) mask = false; - - for (uint dx = 0; dx < 3; ++dx) - { - if (x+dx < _Offset) mask = false; - if (x+dx-_Offset >= X.width) mask = false; - - for (uint m = 0; m < X.channels/(CTILE*2); ++m) - { - float x0 = 0; - float x1 = 0; - float x2 = 0; - float x3 = 0; - - if (mask) - { - x0 = X.Get(n*2+0, y+dy-_Offset, x+dx-_Offset, (m*CTILE + gx)*2+0, Xdata); - x1 = X.Get(n*2+0, y+dy-_Offset, x+dx-_Offset, (m*CTILE + gx)*2+1, Xdata); - x2 = X.Get(n*2+1, y+dy-_Offset, x+dx-_Offset, (m*CTILE + gx)*2+0, Xdata); - x3 = X.Get(n*2+1, y+dy-_Offset, x+dx-_Offset, (m*CTILE + gx)*2+1, Xdata); - } - - float k0 = K.Get(dy, dx, (m*CTILE + gy)*2+0, k*2+0, WBKdata, WBK.dataLength); - float k1 = K.Get(dy, dx, (m*CTILE + gy)*2+0, k*2+1, WBKdata, WBK.dataLength); - float k2 = K.Get(dy, dx, (m*CTILE + gy)*2+1, k*2+0, WBKdata, WBK.dataLength); - float k3 = K.Get(dy, dx, (m*CTILE + gy)*2+1, k*2+1, WBKdata, WBK.dataLength); - - //X_[gy][gx] = float4(x0, x1, - // x2, x3); - //K_[gy][gx] = float4(k0, k1, - // k2, k3); - X_[0][gy][gx] = x0; - X_[1][gy][gx] = x1; - X_[2][gy][gx] = x2; - X_[3][gy][gx] = x3; - - K_0[gy][gx] = k0; - K_1[gy][gx] = k1; - K_2[gy][gx] = k2; - K_3[gy][gx] = k3; - - GroupMemoryBarrierWithGroupSync(); - - [unroll] - for (uint i = 0; i < CTILE; ++i) - { - float4 x = //X_[gy][i]; - float4( X_[0][gy][i], - X_[1][gy][i], - X_[2][gy][i], - X_[3][gy][i]); - //float4 k = //K_[i][gx]; - // float4( K_0[i][gx], - // K_1[i][gx], - // K_2[i][gx], - // K_3[i][gx]); - k0 = K_0[i][gx]; - k1 = K_1[i][gx]; - k2 = K_2[i][gx]; - k3 = K_3[i][gx]; - - v.x = mad(k0, x.x, v.x); - v.x = mad(k2, x.y, v.x); - - v.y = mad(k1, x.x, v.y); - v.y = mad(k2, x.y, v.y); - - v.z = mad(k0, x.z, v.z); - v.z = mad(k2, x.w, v.z); - - v.w = mad(k1, x.z, v.w); - v.w = mad(k3, x.w, v.w); - - //v.x += k.x*x.x + k.z*x.y; - //v.y += k.y*x.x + k.w*x.y; - //v.z += k.x*x.z + k.z*x.w; - //v.w += k.y*x.z + k.w*x.w; - } - - GroupMemoryBarrierWithGroupSync(); - } - } - } - - //Odata[nyx * O.channels + k] = v; - - /*Odata[(( - n * O.height + - y ) * O.width + - x ) * O.channels + - k] = v; - */ - - O.Set(n*2+0, y, x, k*2+0, v.x, Odata); - O.Set(n*2+0, y, x, k*2+1, v.y, Odata); - O.Set(n*2+1, y, x, k*2+0, v.z, Odata); - O.Set(n*2+1, y, x, k*2+1, v.w, Odata); - - #undef X_ - #undef K_ -} - - -#undef CTILE -#define CTILE 16 -groupshared float Conv_Xcache[4][CTILE][CTILE]; -groupshared float Conv_Kcache[4][CTILE][CTILE]; -[numthreads(CTILE, CTILE, 1)] -void Conv2D_Kernel3x3_Cache_KCmod32_KNyx(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - #define X_ Conv_Xcache - #define K_ Conv_Kcache - - uint gx = groupThreadID.x; - uint gy = groupThreadID.y; - - uint k = CTILE * groupID.x + 
groupThreadID.x; - uint nyx = CTILE * groupID.y + groupThreadID.y; - - uint width = X.width - _Border; - uint height = X.height - _Border; - - uint x = nyx % width; - uint ny = nyx / width; - uint y = ny % height; - uint n = ny / height; - - float b0 = B.Get(0, 0, k*2+0, 0, WBKdata, WBK.dataLength); - float b1 = B.Get(0, 0, k*2+1, 0, WBKdata, WBK.dataLength); - float4 v = float4(b0, b1, - b0, b1); - - for (uint dy = 0; dy < 3; ++dy) - { - bool mask = true; - - if (y+dy < _Offset) mask = false; - if (y+dy-_Offset >= X.height) mask = false; - - for (uint dx = 0; dx < 3; ++dx) - { - if (x+dx < _Offset) mask = false; - if (x+dx-_Offset >= X.width) mask = false; - - for (uint m = 0; m < X.channels/(CTILE*2); ++m) - { - float x0 = 0; - float x1 = 0; - float x2 = 0; - float x3 = 0; - - if (mask) - { - x0 = X.Get(n*2+0, y+dy-_Offset, x+dx-_Offset, (m*CTILE + gx)*2+0, Xdata); - x1 = X.Get(n*2+0, y+dy-_Offset, x+dx-_Offset, (m*CTILE + gx)*2+1, Xdata); - x2 = X.Get(n*2+1, y+dy-_Offset, x+dx-_Offset, (m*CTILE + gx)*2+0, Xdata); - x3 = X.Get(n*2+1, y+dy-_Offset, x+dx-_Offset, (m*CTILE + gx)*2+1, Xdata); - } - - float k0 = K.Get(dy, dx, (m*CTILE + gy)*2+0, k*2+0, WBKdata, WBK.dataLength); - float k1 = K.Get(dy, dx, (m*CTILE + gy)*2+0, k*2+1, WBKdata, WBK.dataLength); - float k2 = K.Get(dy, dx, (m*CTILE + gy)*2+1, k*2+0, WBKdata, WBK.dataLength); - float k3 = K.Get(dy, dx, (m*CTILE + gy)*2+1, k*2+1, WBKdata, WBK.dataLength); - - //X_[gy][gx] = float4(x0, x1, - // x2, x3); - //K_[gy][gx] = float4(k0, k1, - // k2, k3); - X_[0][gy][gx] = x0; - X_[1][gy][gx] = x1; - X_[2][gy][gx] = x2; - X_[3][gy][gx] = x3; - - K_[0][gy][gx] = k0; - K_[1][gy][gx] = k1; - K_[2][gy][gx] = k2; - K_[3][gy][gx] = k3; - - GroupMemoryBarrierWithGroupSync(); - - [unroll] - for (uint i = 0; i < CTILE; ++i) - { - float4 x = //X_[gy][i]; - float4( X_[0][gy][i], - X_[1][gy][i], - X_[2][gy][i], - X_[3][gy][i]); - float4 k = //K_[i][gx]; - float4( K_[0][i][gx], - K_[1][i][gx], - K_[2][i][gx], - K_[3][i][gx]); - - v.x = mad(k.x, x.x, v.x); - v.x = mad(k.z, x.y, v.x); - - v.y = mad(k.y, x.x, v.y); - v.y = mad(k.w, x.y, v.y); - - v.z = mad(k.x, x.z, v.z); - v.z = mad(k.z, x.w, v.z); - - v.w = mad(k.y, x.z, v.w); - v.w = mad(k.w, x.w, v.w); - - //v.x += k.x*x.x + k.z*x.y; - //v.y += k.y*x.x + k.w*x.y; - //v.z += k.x*x.z + k.z*x.w; - //v.w += k.y*x.z + k.w*x.w; - } - - GroupMemoryBarrierWithGroupSync(); - } - } - } - - //Odata[nyx * O.channels + k] = v; - - /*Odata[(( - n * O.height + - y ) * O.width + - x ) * O.channels + - k] = v; - */ - - O.Set(n*2+0, y, x, k*2+0, v.x, Odata); - O.Set(n*2+0, y, x, k*2+1, v.y, Odata); - O.Set(n*2+1, y, x, k*2+0, v.z, Odata); - O.Set(n*2+1, y, x, k*2+1, v.w, Odata); - - #undef X_ - #undef K_ -} - -#if 0 // ===================================================================================================== - -#undef CTILE -#define CTILE 16 -#define RTILE 4 -groupshared float Conv_XcacheR[RTILE*RTILE][CTILE*CTILE]; -groupshared float Conv_KcacheR[RTILE*RTILE][CTILE*CTILE]; -[numthreads(CTILE, CTILE, 1)] -void Conv2D_Kernel3x3_Cache_KCmod64_KNyx(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - #define X_ Conv_XcacheR - #define K_ Conv_KcacheR - - uint gx = groupThreadID.x; - uint gy = groupThreadID.y; - - uint k = CTILE * groupID.x + groupThreadID.x; - uint nyx = CTILE * groupID.y + groupThreadID.y; - - uint width = X.width - _Border; - uint height = X.height - _Border; - - uint x = nyx % width; - uint ny = nyx / width; - uint y = ny % height; - uint n = ny / height; - - float 
v[RTILE][RTILE]; - for (uint xxxx = 0; xxxx < RTILE; ++xxxx) - { - float b = B.Get(0, 0, k*RTILE+xxxx, 0, WBKdata, WBK.dataLength); - for (uint yyyy = 0; yyyy < RTILE; ++yyyy) - v[yyyy][xxxx] = b; - } - - for (uint dy = 0; dy < 3; ++dy) - { - bool mask = true; - - if (y+dy < _Offset) mask = false; - if (y+dy-_Offset >= X.height) mask = false; - - for (uint dx = 0; dx < 3; ++dx) - { - if (x+dx < _Offset) mask = false; - if (x+dx-_Offset >= X.width) mask = false; - - for (uint m = 0; m < X.channels/(CTILE*RTILE); ++m) - { - for (uint yy = 0; yy < RTILE; ++yy) - for (uint xx = 0; xx < RTILE; ++xx) - { - if (mask) - X_[yy*RTILE+xx][gy*CTILE+gx] = X.Get(n*RTILE+yy, y+dy-_Offset, x+dx-_Offset, (m*CTILE + gx)*RTILE+xx, Xdata); - else - X_[yy*RTILE+xx][gy*CTILE+gx] = 0; - K_[yy*RTILE+xx][gy*CTILE+gx] = K.Get(dy, dx, (m*CTILE + gy)*RTILE+yy, k*RTILE+xx, WBKdata, WBK.dataLength); - } - - GroupMemoryBarrierWithGroupSync(); - - for (uint ii = 0; ii < CTILE; ++ii) - { - float x[RTILE][RTILE]; - float k[RTILE][RTILE]; - - [unroll] - for (uint yy = 0; yy < RTILE; ++yy) - { - [unroll] - for (uint xx = 0; xx < RTILE; ++xx) - { - x[yy][xx] = X_[yy*RTILE+xx][gy*CTILE+ii]; - k[yy][xx] = K_[yy*RTILE+xx][ii*CTILE+gx]; - } - } - - - [unroll] - for (uint yyy = 0; yyy < RTILE; ++yyy) - { - [unroll] - for (uint xxx = 0; xxx < RTILE; ++xxx) - { - [unroll] - for (uint i = 0; i < RTILE; ++i) - { - v[yyy][xxx] = mad(x[yyy][i], k[i][xxx], v[yyy][xxx]); - } - } - } - } - - GroupMemoryBarrierWithGroupSync(); - } - } - } - - for (uint yy = 0; yy < RTILE; ++yy) - for (uint xx = 0; xx < RTILE; ++xx) - O.Set(n*RTILE+yy, y, x, k*RTILE+xx, v[yy][xx], Odata); - - #undef X_ - #undef K_ -} - -#elif 1 // ===================================================================================================== - -#undef CTILE -#define CTILE 16 -groupshared float2 Conv_KcacheR[8][CTILE*CTILE]; -groupshared float2 Conv_XcacheR[8][CTILE*CTILE]; -[numthreads(CTILE, CTILE, 1)] -void Conv2D_Kernel3x3_Cache_KCmod64_KNyx(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - #define X_ Conv_XcacheR - #define K_ Conv_KcacheR - - uint gx = groupThreadID.x; - uint gy = groupThreadID.y; - - uint k = CTILE * groupID.x + groupThreadID.x; - uint nyx = CTILE * groupID.y + groupThreadID.y; - - uint width = X.width - _Border; - uint height = X.height - _Border; - - uint x = nyx % width; - uint ny = nyx / width; - uint y = ny % height; - uint n = ny / height; - - float b0 = B.Get(0, 0, k*4+0, 0, WBKdata, WBK.dataLength); - float b1 = B.Get(0, 0, k*4+1, 0, WBKdata, WBK.dataLength); - float b2 = B.Get(0, 0, k*4+2, 0, WBKdata, WBK.dataLength); - float b3 = B.Get(0, 0, k*4+3, 0, WBKdata, WBK.dataLength); - - float4 v0, v1, v2, v3; - v0 = v1 = v2 = v3 = float4(b0, b1, b2, b3); - - for (uint dy = 0; dy < 3; ++dy) - { - bool mask = true; - - if (y+dy < _Offset) mask = false; - if (y+dy-_Offset >= X.height) mask = false; - - for (uint dx = 0; dx < 3; ++dx) - { - if (x+dx < _Offset) mask = false; - if (x+dx-_Offset >= X.width) mask = false; - - for (uint m = 0; m < X.channels/(CTILE*4); ++m) - { - for (uint yy = 0; yy < 4; ++yy) - for (uint xx = 0; xx < 2; ++xx) - { - // 111ms - if (mask) - { - X_[yy*2+xx][gy*CTILE+gx].x = X.Get(n*4+yy, y+dy-_Offset, x+dx-_Offset, (m*CTILE + gx)*4+xx*2+0, Xdata); - X_[yy*2+xx][gy*CTILE+gx].y = X.Get(n*4+yy, y+dy-_Offset, x+dx-_Offset, (m*CTILE + gx)*4+xx*2+1, Xdata); - } - else - { - X_[yy*2+xx][gy*CTILE+gx].x = 0; - X_[yy*2+xx][gy*CTILE+gx].y = 0; - } - - K_[yy*2+xx][gy*CTILE+gx].x = K.Get(dy, dx, (m*CTILE + 
gy)*4+yy, k*4+xx*2+0, WBKdata, WBK.dataLength); - K_[yy*2+xx][gy*CTILE+gx].y = K.Get(dy, dx, (m*CTILE + gy)*4+yy, k*4+xx*2+1, WBKdata, WBK.dataLength); - } - - GroupMemoryBarrierWithGroupSync(); - - for (uint i = 0; i < CTILE; ++i) - { - #if 1 // ---------------------------------------------------------- - - float2 x[8]; - float2 k[8]; - - // 109ms - // dcl_temps 29 - for (uint regs = 0; regs < 8; ++regs) - { - x[regs] = X_[regs][gy*CTILE+i]; - k[regs] = K_[regs][i*CTILE+gx]; - } - - for (uint q = 0; q < 4; ++q) - { - float - k0 = k[q*2+0].x, - k1 = k[q*2+0].y, - k2 = k[q*2+1].x, - k3 = k[q*2+1].y; - float - x0 = x[0+q/2].x, - x1 = x[2+q/2].x, - x2 = x[4+q/2].x, - x3 = x[6+q/2].x; - - v0.x = mad(x0, k0, v0.x); //-- - v1.x = mad(x1, k0, v1.x); - v2.x = mad(x2, k0, v2.x); - v3.x = mad(x3, k0, v3.x); - v0.y = mad(x0, k1, v0.y); //-- - v1.y = mad(x1, k1, v1.y); - v2.y = mad(x2, k1, v2.y); - v3.y = mad(x3, k1, v3.y); - v0.z = mad(x0, k2, v0.z); //-- - v1.z = mad(x1, k2, v1.z); - v2.z = mad(x2, k2, v2.z); - v3.z = mad(x3, k2, v3.z); - v0.w = mad(x0, k3, v0.w); //-- - v1.w = mad(x1, k3, v1.w); - v2.w = mad(x2, k3, v2.w); - v3.w = mad(x3, k3, v3.w); - - ++q; - - k0 = k[q*2+0].x; - k1 = k[q*2+0].y; - k2 = k[q*2+1].x; - k3 = k[q*2+1].y; - - x0 = x[0+q/2].y; - x1 = x[2+q/2].y; - x2 = x[4+q/2].y; - x3 = x[6+q/2].y; - - v0.x = mad(x0, k0, v0.x); //-- - v1.x = mad(x1, k0, v1.x); - v2.x = mad(x2, k0, v2.x); - v3.x = mad(x3, k0, v3.x); - v0.y = mad(x0, k1, v0.y); //-- - v1.y = mad(x1, k1, v1.y); - v2.y = mad(x2, k1, v2.y); - v3.y = mad(x3, k1, v3.y); - v0.z = mad(x0, k2, v0.z); //-- - v1.z = mad(x1, k2, v1.z); - v2.z = mad(x2, k2, v2.z); - v3.z = mad(x3, k2, v3.z); - v0.w = mad(x0, k3, v0.w); //-- - v1.w = mad(x1, k3, v1.w); - v2.w = mad(x2, k3, v2.w); - v3.w = mad(x3, k3, v3.w); - } - - #endif // ---------------------------------------------------------- - } - - GroupMemoryBarrierWithGroupSync(); - } - } - } - - #if 1 // ---------------------------------------------------------- - - // 117ms - O.Set(n*4+0, y, x, k*4+0, v0.x, Odata); - O.Set(n*4+0, y, x, k*4+1, v0.y, Odata); - O.Set(n*4+0, y, x, k*4+2, v0.z, Odata); - O.Set(n*4+0, y, x, k*4+3, v0.w, Odata); - - O.Set(n*4+1, y, x, k*4+0, v1.x, Odata); - O.Set(n*4+1, y, x, k*4+1, v1.y, Odata); - O.Set(n*4+1, y, x, k*4+2, v1.z, Odata); - O.Set(n*4+1, y, x, k*4+3, v1.w, Odata); - - O.Set(n*4+2, y, x, k*4+0, v2.x, Odata); - O.Set(n*4+2, y, x, k*4+1, v2.y, Odata); - O.Set(n*4+2, y, x, k*4+2, v2.z, Odata); - O.Set(n*4+2, y, x, k*4+3, v2.w, Odata); - - O.Set(n*4+3, y, x, k*4+0, v3.x, Odata); - O.Set(n*4+3, y, x, k*4+1, v3.y, Odata); - O.Set(n*4+3, y, x, k*4+2, v3.z, Odata); - O.Set(n*4+3, y, x, k*4+3, v3.w, Odata); - - #else // ---------------------------------------------------------- - - // 118ms - O.Set(n*4+0, y, x, k*4+0, v0.x, Odata); - O.Set(n*4+1, y, x, k*4+0, v1.x, Odata); - O.Set(n*4+2, y, x, k*4+0, v2.x, Odata); - O.Set(n*4+3, y, x, k*4+0, v3.x, Odata); - - O.Set(n*4+0, y, x, k*4+1, v0.y, Odata); - O.Set(n*4+1, y, x, k*4+1, v1.y, Odata); - O.Set(n*4+2, y, x, k*4+1, v2.y, Odata); - O.Set(n*4+3, y, x, k*4+1, v3.y, Odata); - - O.Set(n*4+0, y, x, k*4+2, v0.z, Odata); - O.Set(n*4+1, y, x, k*4+2, v1.z, Odata); - O.Set(n*4+2, y, x, k*4+2, v2.z, Odata); - O.Set(n*4+3, y, x, k*4+2, v3.z, Odata); - - O.Set(n*4+0, y, x, k*4+3, v0.w, Odata); - O.Set(n*4+1, y, x, k*4+3, v1.w, Odata); - O.Set(n*4+2, y, x, k*4+3, v2.w, Odata); - O.Set(n*4+3, y, x, k*4+3, v3.w, Odata); - - #endif // ---------------------------------------------------------- - - - #undef X_ - #undef K_ 
-} - -#elif 1 // ===================================================================================================== - -#undef CTILE -#define CTILE 16 -groupshared float Conv_KcacheR[16][CTILE*CTILE]; -groupshared float Conv_XcacheR[16][CTILE*CTILE]; -[numthreads(CTILE, CTILE, 1)] -void Conv2D_Kernel3x3_Cache_KCmod64_KNyx(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - #define X_ Conv_XcacheR - #define K_ Conv_KcacheR - - uint gx = groupThreadID.x; - uint gy = groupThreadID.y; - - uint k = CTILE * groupID.x + groupThreadID.x; - uint nyx = CTILE * groupID.y + groupThreadID.y; - - uint width = X.width - _Border; - uint height = X.height - _Border; - - uint x = nyx % width; - uint ny = nyx / width; - uint y = ny % height; - uint n = ny / height; - - float b0 = B.Get(0, 0, k*4+0, 0, WBKdata, WBK.dataLength); - float b1 = B.Get(0, 0, k*4+1, 0, WBKdata, WBK.dataLength); - float b2 = B.Get(0, 0, k*4+2, 0, WBKdata, WBK.dataLength); - float b3 = B.Get(0, 0, k*4+3, 0, WBKdata, WBK.dataLength); - - float4 v0, v1, v2, v3; - v0 = v1 = v2 = v3 = float4(b0, b1, b2, b3); - - for (uint dy = 0; dy < 3; ++dy) - { - bool mask = true; - - if (y+dy < _Offset) mask = false; - if (y+dy-_Offset >= X.height) mask = false; - - for (uint dx = 0; dx < 3; ++dx) - { - if (x+dx < _Offset) mask = false; - if (x+dx-_Offset >= X.width) mask = false; - - for (uint m = 0; m < X.channels/(CTILE*4); ++m) - { - for (uint yy = 0; yy < 4; ++yy) - for (uint xx = 0; xx < 4; ++xx) - { - #if 1 // ---------------------------------------------------------- - - // 111ms - if (mask) - X_[yy*4+xx][gy*CTILE+gx] = X.Get(n*4+yy, y+dy-_Offset, x+dx-_Offset, (m*CTILE + gx)*4+xx, Xdata); - else - X_[yy*4+xx][gy*CTILE+gx] = 0; - K_[yy*4+xx][gy*CTILE+gx] = K.Get(dy, dx, (m*CTILE + gy)*4+yy, k*4+xx, WBKdata, WBK.dataLength); - - #else // ---------------------------------------------------------- - - // 122ms - if (mask) - X_[yy*4+(gx%4)][gy*CTILE+xx*4+(gx/4)] = X.Get(n*4+yy, y+dy-_Offset, x+dx-_Offset, m*CTILE*4 + xx*CTILE + gx, Xdata); - else - X_[yy*4+(gx%4)][gy*CTILE+xx*4+(gx/4)] = 0; - K_[yy*4+(k%4)][gy*CTILE+xx*4+(gx/4)] = K.Get(dy, dx, (m*CTILE + gy)*4+yy, CTILE*groupID.x*4 + xx*CTILE + gx, WBKdata, WBK.dataLength); - - #endif // ---------------------------------------------------------- - } - - GroupMemoryBarrierWithGroupSync(); - - for (uint i = 0; i < CTILE; ++i) - { - - #if 0 // ---------------------------------------------------------- - - float x[16]; - float k[16]; - - k[0] = K_[0][i*CTILE+gx]; - x[0] = X_[0][gy*CTILE+i]; - x[4] = X_[4][gy*CTILE+i]; - x[8] = X_[8][gy*CTILE+i]; - x[12] = X_[12][gy*CTILE+i]; - - for (uint q = 0; q < 3; ++q) - { - k[q*4+1] = K_[q*4+1][i*CTILE+gx]; - v0.x = mad(x[0*4+q], k[q*4+0], v0.x); //-- - v1.x = mad(x[1*4+q], k[q*4+0], v1.x); - x[0*4+q+1] = X_[0*4+q+1][gy*CTILE+i]; - v2.x = mad(x[2*4+q], k[q*4+0], v2.x); - v3.x = mad(x[3*4+q], k[q*4+0], v3.x); - k[q*4+2] = K_[q*4+2][i*CTILE+gx]; - v0.y = mad(x[0*4+q], k[q*4+1], v0.y); //-- - v1.y = mad(x[1*4+q], k[q*4+1], v1.y); - x[1*4+q+1] = X_[1*4+q+1][gy*CTILE+i]; - v2.y = mad(x[2*4+q], k[q*4+1], v2.y); - v3.y = mad(x[3*4+q], k[q*4+1], v3.y); - k[q*4+3] = K_[q*4+3][i*CTILE+gx]; - v0.z = mad(x[0*4+q], k[q*4+2], v0.z); //-- - v1.z = mad(x[1*4+q], k[q*4+2], v1.z); - x[2*4+q+1] = X_[2*4+q+1][gy*CTILE+i]; - v2.z = mad(x[2*4+q], k[q*4+2], v2.z); - v3.z = mad(x[3*4+q], k[q*4+2], v3.z); - k[q*4+4] = K_[q*4+4][i*CTILE+gx]; - v0.w = mad(x[0*4+q], k[q*4+3], v0.w); //-- - v1.w = mad(x[1*4+q], k[q*4+3], v1.w); - x[3*4+q+1] = X_[3*4+q+1][gy*CTILE+i]; 
- v2.w = mad(x[2*4+q], k[q*4+3], v2.w); - v3.w = mad(x[3*4+q], k[q*4+3], v3.w); - } - { - k[q*4+1] = K_[q*4+1][i*CTILE+gx]; - v0.x = mad(x[0*4+q], k[q*4+0], v0.x); //-- - v1.x = mad(x[1*4+q], k[q*4+0], v1.x); - v2.x = mad(x[2*4+q], k[q*4+0], v2.x); - v3.x = mad(x[3*4+q], k[q*4+0], v3.x); - k[q*4+2] = K_[q*4+2][i*CTILE+gx]; - v0.y = mad(x[0*4+q], k[q*4+1], v0.y); //-- - v1.y = mad(x[1*4+q], k[q*4+1], v1.y); - v2.y = mad(x[2*4+q], k[q*4+1], v2.y); - v3.y = mad(x[3*4+q], k[q*4+1], v3.y); - k[q*4+3] = K_[q*4+3][i*CTILE+gx]; - v0.z = mad(x[0*4+q], k[q*4+2], v0.z); //-- - v1.z = mad(x[1*4+q], k[q*4+2], v1.z); - v2.z = mad(x[2*4+q], k[q*4+2], v2.z); - v3.z = mad(x[3*4+q], k[q*4+2], v3.z); - v0.w = mad(x[0*4+q], k[q*4+3], v0.w); //-- - v1.w = mad(x[1*4+q], k[q*4+3], v1.w); - v2.w = mad(x[2*4+q], k[q*4+3], v2.w); - v3.w = mad(x[3*4+q], k[q*4+3], v3.w); - } - - #elif 0 // ---------------------------------------------------------- - - //float x[4]; - //float k[4]; - - float k0 = K_[0*4+0][i*CTILE+gx]; - float x0 = X_[0*4+0][gy*CTILE+i]; - float x1 = X_[1*4+0][gy*CTILE+i]; - float x2 = X_[2*4+0][gy*CTILE+i]; - float x3 = X_[3*4+0][gy*CTILE+i]; - - float k1, k2, k3; - float x0p, x1p, x2p, x3p; - - uint q = 0; - //for (uint q = 0; q < 4;) - { - //x[regs] = X_[regs][gy*CTILE+i]; - - k1 = K_[q*4+1][i*CTILE+gx]; - v0.x = mad(x0, k0, v0.x); //-- - v1.x = mad(x1, k0, v1.x); - x0p = X_[0*4+q+1][gy*CTILE+i]; - v2.x = mad(x2, k0, v2.x); - v3.x = mad(x3, k0, v3.x); - - k2 = K_[q*4+2][i*CTILE+gx]; - v0.y = mad(x0, k1, v0.y); //-- - v1.y = mad(x1, k1, v1.y); - x1p = X_[1*4+q+1][gy*CTILE+i]; - v2.y = mad(x2, k1, v2.y); - v3.y = mad(x3, k1, v3.y); - - k3 = K_[q*4+3][i*CTILE+gx]; - v0.z = mad(x0, k2, v0.z); //-- - v1.z = mad(x1, k2, v1.z); - x2p = X_[2*4+q+1][gy*CTILE+i]; - v2.z = mad(x2, k2, v2.z); - v3.z = mad(x3, k2, v3.z); - - k0 = K_[q*4+4][i*CTILE+gx]; - v0.w = mad(x0, k3, v0.w); //-- - v1.w = mad(x1, k3, v1.w); - x3p = X_[3*4+q+1][gy*CTILE+i]; - v2.w = mad(x2, k3, v2.w); - v3.w = mad(x3, k3, v3.w); - - ++q; - - k1 = K_[q*4+1][i*CTILE+gx]; - v0.x = mad(x0p, k0, v0.x); //-- - v1.x = mad(x1p, k0, v1.x); - x0 = X_[0*4+q+1][gy*CTILE+i]; - v2.x = mad(x2p, k0, v2.x); - v3.x = mad(x3p, k0, v3.x); - - k2 = K_[q*4+2][i*CTILE+gx]; - v0.y = mad(x0p, k1, v0.y); //-- - v1.y = mad(x1p, k1, v1.y); - x1 = X_[1*4+q+1][gy*CTILE+i]; - v2.y = mad(x2p, k1, v2.y); - v3.y = mad(x3p, k1, v3.y); - - k3 = K_[q*4+3][i*CTILE+gx]; - v0.z = mad(x0p, k2, v0.z); //-- - v1.z = mad(x1p, k2, v1.z); - x2 = X_[2*4+q+1][gy*CTILE+i]; - v2.z = mad(x2p, k2, v2.z); - v3.z = mad(x3p, k2, v3.z); - - k0 = K_[q*4+4][i*CTILE+gx]; - v0.w = mad(x0p, k3, v0.w); //-- - v1.w = mad(x1p, k3, v1.w); - x3 = X_[3*4+q+1][gy*CTILE+i]; - v2.w = mad(x2p, k3, v2.w); - v3.w = mad(x3p, k3, v3.w); - - ++q; - - k1 = K_[q*4+1][i*CTILE+gx]; - v0.x = mad(x0, k0, v0.x); //-- - v1.x = mad(x1, k0, v1.x); - x0p = X_[0*4+q+1][gy*CTILE+i]; - v2.x = mad(x2, k0, v2.x); - v3.x = mad(x3, k0, v3.x); - - k2 = K_[q*4+2][i*CTILE+gx]; - v0.y = mad(x0, k1, v0.y); //-- - v1.y = mad(x1, k1, v1.y); - x1p = X_[1*4+q+1][gy*CTILE+i]; - v2.y = mad(x2, k1, v2.y); - v3.y = mad(x3, k1, v3.y); - - k3 = K_[q*4+3][i*CTILE+gx]; - v0.z = mad(x0, k2, v0.z); //-- - v1.z = mad(x1, k2, v1.z); - x2p = X_[2*4+q+1][gy*CTILE+i]; - v2.z = mad(x2, k2, v2.z); - v3.z = mad(x3, k2, v3.z); - - k0 = K_[q*4+4][i*CTILE+gx]; - v0.w = mad(x0, k3, v0.w); //-- - v1.w = mad(x1, k3, v1.w); - x3p = X_[3*4+q+1][gy*CTILE+i]; - v2.w = mad(x2, k3, v2.w); - v3.w = mad(x3, k3, v3.w); - - ++q; - - k1 = K_[q*4+1][i*CTILE+gx]; - v0.x = 
mad(x0p, k0, v0.x); //-- - v1.x = mad(x1p, k0, v1.x); - //x0p = X_[0*4+q][gy*CTILE+i]; - v2.x = mad(x2p, k0, v2.x); - v3.x = mad(x3p, k0, v3.x); - - k2 = K_[q*4+2][i*CTILE+gx]; - v0.y = mad(x0p, k1, v0.y); //-- - v1.y = mad(x1p, k1, v1.y); - //x1p = X_[1*4+q][gy*CTILE+i]; - v2.y = mad(x2p, k1, v2.y); - v3.y = mad(x3p, k1, v3.y); - - k3 = K_[q*4+3][i*CTILE+gx]; - v0.z = mad(x0p, k2, v0.z); //-- - v1.z = mad(x1p, k2, v1.z); - //x2p = X_[2*4+q][gy*CTILE+i]; - v2.z = mad(x2p, k2, v2.z); - v3.z = mad(x3p, k2, v3.z); - - //k0 = K_[(q+1)*4][i*CTILE+gx]; - v0.w = mad(x0p, k3, v0.w); //-- - v1.w = mad(x1p, k3, v1.w); - //x3p = X_[3*4+q][gy*CTILE+i]; - v2.w = mad(x2p, k3, v2.w); - v3.w = mad(x3p, k3, v3.w); - - ++q; - } - - - #elif 1 // ---------------------------------------------------------- - - float x[16]; - float k[16]; - - // 109ms - // dcl_temps 29 - for (uint regs = 0; regs < 16; ++regs) - { - x[regs] = X_[regs][gy*CTILE+i]; - k[regs] = K_[regs][i*CTILE+gx]; - } - - for (uint q = 0; q < 4; ++q) - { - v0.x = mad(x[0*4+q], k[q*4+0], v0.x); //-- - v1.x = mad(x[1*4+q], k[q*4+0], v1.x); - v2.x = mad(x[2*4+q], k[q*4+0], v2.x); - v3.x = mad(x[3*4+q], k[q*4+0], v3.x); - v0.y = mad(x[0*4+q], k[q*4+1], v0.y); //-- - v1.y = mad(x[1*4+q], k[q*4+1], v1.y); - v2.y = mad(x[2*4+q], k[q*4+1], v2.y); - v3.y = mad(x[3*4+q], k[q*4+1], v3.y); - v0.z = mad(x[0*4+q], k[q*4+2], v0.z); //-- - v1.z = mad(x[1*4+q], k[q*4+2], v1.z); - v2.z = mad(x[2*4+q], k[q*4+2], v2.z); - v3.z = mad(x[3*4+q], k[q*4+2], v3.z); - v0.w = mad(x[0*4+q], k[q*4+3], v0.w); //-- - v1.w = mad(x[1*4+q], k[q*4+3], v1.w); - v2.w = mad(x[2*4+q], k[q*4+3], v2.w); - v3.w = mad(x[3*4+q], k[q*4+3], v3.w); - } - - #elif 1 // ---------------------------------------------------------- - - // 111ms - // dcl_temps 34 - [unroll] - for (uint regs = 0; regs < 16; ++regs) - { - x[regs] = X_[regs][gy*CTILE+i]; - k[regs] = K_[regs][i*CTILE+gx]; - } - v0.x = mad(x[0*4+0], k[0*4+0], v0.x); //-- - v1.x = mad(x[1*4+0], k[0*4+0], v1.x); - v2.x = mad(x[2*4+0], k[0*4+0], v2.x); - v3.x = mad(x[3*4+0], k[0*4+0], v3.x); - v0.y = mad(x[0*4+0], k[0*4+1], v0.y); //-- - v1.y = mad(x[1*4+0], k[0*4+1], v1.y); - v2.y = mad(x[2*4+0], k[0*4+1], v2.y); - v3.y = mad(x[3*4+0], k[0*4+1], v3.y); - v0.z = mad(x[0*4+0], k[0*4+2], v0.z); //-- - v1.z = mad(x[1*4+0], k[0*4+2], v1.z); - v2.z = mad(x[2*4+0], k[0*4+2], v2.z); - v3.z = mad(x[3*4+0], k[0*4+2], v3.z); - v0.w = mad(x[0*4+0], k[0*4+3], v0.w); //-- - v1.w = mad(x[1*4+0], k[0*4+3], v1.w); - v2.w = mad(x[2*4+0], k[0*4+3], v2.w); - v3.w = mad(x[3*4+0], k[0*4+3], v3.w); - - v0.x = mad(x[0*4+1], k[1*4+0], v0.x); //-- - v1.x = mad(x[1*4+1], k[1*4+0], v1.x); - v2.x = mad(x[2*4+1], k[1*4+0], v2.x); - v3.x = mad(x[3*4+1], k[1*4+0], v3.x); - v0.y = mad(x[0*4+1], k[1*4+1], v0.y); //-- - v1.y = mad(x[1*4+1], k[1*4+1], v1.y); - v2.y = mad(x[2*4+1], k[1*4+1], v2.y); - v3.y = mad(x[3*4+1], k[1*4+1], v3.y); - v0.z = mad(x[0*4+1], k[1*4+2], v0.z); //-- - v1.z = mad(x[1*4+1], k[1*4+2], v1.z); - v2.z = mad(x[2*4+1], k[1*4+2], v2.z); - v3.z = mad(x[3*4+1], k[1*4+2], v3.z); - v0.w = mad(x[0*4+1], k[1*4+3], v0.w); //-- - v1.w = mad(x[1*4+1], k[1*4+3], v1.w); - v2.w = mad(x[2*4+1], k[1*4+3], v2.w); - v3.w = mad(x[3*4+1], k[1*4+3], v3.w); - - v0.x = mad(x[0*4+2], k[2*4+0], v0.x); //-- - v1.x = mad(x[1*4+2], k[2*4+0], v1.x); - v2.x = mad(x[2*4+2], k[2*4+0], v2.x); - v3.x = mad(x[3*4+2], k[2*4+0], v3.x); - v0.y = mad(x[0*4+2], k[2*4+1], v0.y); //-- - v1.y = mad(x[1*4+2], k[2*4+1], v1.y); - v2.y = mad(x[2*4+2], k[2*4+1], v2.y); - v3.y = mad(x[3*4+2], 
k[2*4+1], v3.y); - v0.z = mad(x[0*4+2], k[2*4+2], v0.z); //-- - v1.z = mad(x[1*4+2], k[2*4+2], v1.z); - v2.z = mad(x[2*4+2], k[2*4+2], v2.z); - v3.z = mad(x[3*4+2], k[2*4+2], v3.z); - v0.w = mad(x[0*4+2], k[2*4+3], v0.w); //-- - v1.w = mad(x[1*4+2], k[2*4+3], v1.w); - v2.w = mad(x[2*4+2], k[2*4+3], v2.w); - v3.w = mad(x[3*4+2], k[2*4+3], v3.w); - - v0.x = mad(x[0*4+3], k[3*4+0], v0.x); //-- - v1.x = mad(x[1*4+3], k[3*4+0], v1.x); - v2.x = mad(x[2*4+3], k[3*4+0], v2.x); - v3.x = mad(x[3*4+3], k[3*4+0], v3.x); - v0.y = mad(x[0*4+3], k[3*4+1], v0.y); //-- - v1.y = mad(x[1*4+3], k[3*4+1], v1.y); - v2.y = mad(x[2*4+3], k[3*4+1], v2.y); - v3.y = mad(x[3*4+3], k[3*4+1], v3.y); - v0.z = mad(x[0*4+3], k[3*4+2], v0.z); //-- - v1.z = mad(x[1*4+3], k[3*4+2], v1.z); - v2.z = mad(x[2*4+3], k[3*4+2], v2.z); - v3.z = mad(x[3*4+3], k[3*4+2], v3.z); - v0.w = mad(x[0*4+3], k[3*4+3], v0.w); //-- - v1.w = mad(x[1*4+3], k[3*4+3], v1.w); - v2.w = mad(x[2*4+3], k[3*4+3], v2.w); - v3.w = mad(x[3*4+3], k[3*4+3], v3.w); - - #else // ---------------------------------------------------------- - - // 115 ms, reg dependencies - // dcl_temps 32 - [unroll] - for (uint regs = 0; regs < 16; ++regs) - { - x[regs] = X_[regs][gy*CTILE+i]; - k[regs] = K_[regs][i*CTILE+gx]; - } - - v0.x = mad(x[0*4+0], k[0*4+0], v0.x); //-- - v0.x = mad(x[0*4+1], k[1*4+0], v0.x); - v0.x = mad(x[0*4+2], k[2*4+0], v0.x); - v0.x = mad(x[0*4+3], k[3*4+0], v0.x); - v0.y = mad(x[0*4+0], k[0*4+1], v0.y); //-- - v0.y = mad(x[0*4+1], k[1*4+1], v0.y); - v0.y = mad(x[0*4+2], k[2*4+1], v0.y); - v0.y = mad(x[0*4+3], k[3*4+1], v0.y); - v0.z = mad(x[0*4+0], k[0*4+2], v0.z); //-- - v0.z = mad(x[0*4+1], k[1*4+2], v0.z); - v0.z = mad(x[0*4+2], k[2*4+2], v0.z); - v0.z = mad(x[0*4+3], k[3*4+2], v0.z); - v0.w = mad(x[0*4+0], k[0*4+3], v0.w); //-- - v0.w = mad(x[0*4+1], k[1*4+3], v0.w); - v0.w = mad(x[0*4+2], k[2*4+3], v0.w); - v0.w = mad(x[0*4+3], k[3*4+3], v0.w); - - v1.x = mad(x[1*4+0], k[0*4+0], v1.x); //-- - v1.x = mad(x[1*4+1], k[1*4+0], v1.x); - v1.x = mad(x[1*4+2], k[2*4+0], v1.x); - v1.x = mad(x[1*4+3], k[3*4+0], v1.x); - v1.y = mad(x[1*4+0], k[0*4+1], v1.y); //-- - v1.y = mad(x[1*4+1], k[1*4+1], v1.y); - v1.y = mad(x[1*4+2], k[2*4+1], v1.y); - v1.y = mad(x[1*4+3], k[3*4+1], v1.y); - v1.z = mad(x[1*4+0], k[0*4+2], v1.z); //-- - v1.z = mad(x[1*4+1], k[1*4+2], v1.z); - v1.z = mad(x[1*4+2], k[2*4+2], v1.z); - v1.z = mad(x[1*4+3], k[3*4+2], v1.z); - v1.w = mad(x[1*4+0], k[0*4+3], v1.w); //-- - v1.w = mad(x[1*4+1], k[1*4+3], v1.w); - v1.w = mad(x[1*4+2], k[2*4+3], v1.w); - v1.w = mad(x[1*4+3], k[3*4+3], v1.w); - - v2.x = mad(x[2*4+0], k[0*4+0], v2.x); //-- - v2.x = mad(x[2*4+1], k[1*4+0], v2.x); - v2.x = mad(x[2*4+2], k[2*4+0], v2.x); - v2.x = mad(x[2*4+3], k[3*4+0], v2.x); - v2.y = mad(x[2*4+0], k[0*4+1], v2.y); //-- - v2.y = mad(x[2*4+1], k[1*4+1], v2.y); - v2.y = mad(x[2*4+2], k[2*4+1], v2.y); - v2.y = mad(x[2*4+3], k[3*4+1], v2.y); - v2.z = mad(x[2*4+0], k[0*4+2], v2.z); //-- - v2.z = mad(x[2*4+1], k[1*4+2], v2.z); - v2.z = mad(x[2*4+2], k[2*4+2], v2.z); - v2.z = mad(x[2*4+3], k[3*4+2], v2.z); - v2.w = mad(x[2*4+0], k[0*4+3], v2.w); //-- - v2.w = mad(x[2*4+1], k[1*4+3], v2.w); - v2.w = mad(x[2*4+2], k[2*4+3], v2.w); - v2.w = mad(x[2*4+3], k[3*4+3], v2.w); - - v3.x = mad(x[3*4+0], k[0*4+0], v3.x); //-- - v3.x = mad(x[3*4+1], k[1*4+0], v3.x); - v3.x = mad(x[3*4+2], k[2*4+0], v3.x); - v3.x = mad(x[3*4+3], k[3*4+0], v3.x); - v3.y = mad(x[3*4+0], k[0*4+1], v3.y); //-- - v3.y = mad(x[3*4+1], k[1*4+1], v3.y); - v3.y = mad(x[3*4+2], k[2*4+1], v3.y); - v3.y = 
mad(x[3*4+3], k[3*4+1], v3.y); - v3.z = mad(x[3*4+0], k[0*4+2], v3.z); //-- - v3.z = mad(x[3*4+1], k[1*4+2], v3.z); - v3.z = mad(x[3*4+2], k[2*4+2], v3.z); - v3.z = mad(x[3*4+3], k[3*4+2], v3.z); - v3.w = mad(x[3*4+0], k[0*4+3], v3.w); //-- - v3.w = mad(x[3*4+1], k[1*4+3], v3.w); - v3.w = mad(x[3*4+2], k[2*4+3], v3.w); - v3.w = mad(x[3*4+3], k[3*4+3], v3.w); - - #endif // ---------------------------------------------------------- - } - - GroupMemoryBarrierWithGroupSync(); - } - } - } - - #if 1 // ---------------------------------------------------------- - - // 117ms - O.Set(n*4+0, y, x, k*4+0, v0.x, Odata); - O.Set(n*4+0, y, x, k*4+1, v0.y, Odata); - O.Set(n*4+0, y, x, k*4+2, v0.z, Odata); - O.Set(n*4+0, y, x, k*4+3, v0.w, Odata); - - O.Set(n*4+1, y, x, k*4+0, v1.x, Odata); - O.Set(n*4+1, y, x, k*4+1, v1.y, Odata); - O.Set(n*4+1, y, x, k*4+2, v1.z, Odata); - O.Set(n*4+1, y, x, k*4+3, v1.w, Odata); - - O.Set(n*4+2, y, x, k*4+0, v2.x, Odata); - O.Set(n*4+2, y, x, k*4+1, v2.y, Odata); - O.Set(n*4+2, y, x, k*4+2, v2.z, Odata); - O.Set(n*4+2, y, x, k*4+3, v2.w, Odata); - - O.Set(n*4+3, y, x, k*4+0, v3.x, Odata); - O.Set(n*4+3, y, x, k*4+1, v3.y, Odata); - O.Set(n*4+3, y, x, k*4+2, v3.z, Odata); - O.Set(n*4+3, y, x, k*4+3, v3.w, Odata); - - #else // ---------------------------------------------------------- - - // 118ms - O.Set(n*4+0, y, x, k*4+0, v0.x, Odata); - O.Set(n*4+1, y, x, k*4+0, v1.x, Odata); - O.Set(n*4+2, y, x, k*4+0, v2.x, Odata); - O.Set(n*4+3, y, x, k*4+0, v3.x, Odata); - - O.Set(n*4+0, y, x, k*4+1, v0.y, Odata); - O.Set(n*4+1, y, x, k*4+1, v1.y, Odata); - O.Set(n*4+2, y, x, k*4+1, v2.y, Odata); - O.Set(n*4+3, y, x, k*4+1, v3.y, Odata); - - O.Set(n*4+0, y, x, k*4+2, v0.z, Odata); - O.Set(n*4+1, y, x, k*4+2, v1.z, Odata); - O.Set(n*4+2, y, x, k*4+2, v2.z, Odata); - O.Set(n*4+3, y, x, k*4+2, v3.z, Odata); - - O.Set(n*4+0, y, x, k*4+3, v0.w, Odata); - O.Set(n*4+1, y, x, k*4+3, v1.w, Odata); - O.Set(n*4+2, y, x, k*4+3, v2.w, Odata); - O.Set(n*4+3, y, x, k*4+3, v3.w, Odata); - - #endif // ---------------------------------------------------------- - - - #undef X_ - #undef K_ -} - -#else // ===================================================================================================== - -#undef CTILE -#define CTILE 16 -#define RTILE 4 -groupshared float Conv_XcacheR[RTILE*RTILE][CTILE*CTILE]; -groupshared float Conv_KcacheR[RTILE*RTILE][CTILE*CTILE]; -[numthreads(CTILE, CTILE, 1)] -void Conv2D_Kernel3x3_Cache_KCmod64_KNyx(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - #define X_ Conv_XcacheR - #define K_ Conv_KcacheR - - uint gx = groupThreadID.x; - uint gy = groupThreadID.y; - - uint k = CTILE * groupID.x + groupThreadID.x; - uint nyx = CTILE * groupID.y + groupThreadID.y; - - uint width = X.width - _Border; - uint height = X.height - _Border; - - uint x = nyx % width; - uint ny = nyx / width; - uint y = ny % height; - uint n = ny / height; - - float v[RTILE*RTILE]; - for (uint xxxx = 0; xxxx < RTILE; ++xxxx) - { - float b = B.Get(0, 0, k*RTILE+xxxx, 0, WBKdata, WBK.dataLength); - for (uint yyyy = 0; yyyy < RTILE; ++yyyy) - v[yyyy*RTILE+xxxx] = b; - } - - for (uint dy = 0; dy < 3; ++dy) - { - bool mask = true; - - if (y+dy < _Offset) mask = false; - if (y+dy-_Offset >= X.height) mask = false; - - for (uint dx = 0; dx < 3; ++dx) - { - if (x+dx < _Offset) mask = false; - if (x+dx-_Offset >= X.width) mask = false; - - for (uint m = 0; m < X.channels/(CTILE*RTILE); ++m) - { - - for (uint yy = 0; yy < RTILE; ++yy) - for (uint xx = 0; xx < RTILE; ++xx) - { - 
if (mask) - X_[yy*RTILE+xx][gy*CTILE+gx] = X.Get(n*RTILE+yy, y+dy-_Offset, x+dx-_Offset, (m*CTILE + gx)*RTILE+xx, Xdata); - else - X_[yy*RTILE+xx][gy*CTILE+gx] = 0; - K_[yy*RTILE+xx][gy*CTILE+gx] = K.Get(dy, dx, (m*CTILE + gy)*RTILE+yy, k*RTILE+xx, WBKdata, WBK.dataLength); - } - - GroupMemoryBarrierWithGroupSync(); - - for (uint ii = 0; ii < CTILE; ++ii) - { - float x[RTILE*RTILE]; - float k[RTILE*RTILE]; - - [unroll] - for (uint iii = 0; iii < RTILE*RTILE; ++iii) - { - x[iii] = X_[iii][gy*CTILE+ii]; - k[iii] = K_[iii][ii*CTILE+gx]; - } - - [unroll] - for (uint r = 0; r < RTILE*RTILE; ++r) - { - [unroll] - for (uint i = 0; i < RTILE; ++i) - { - uint xxx = r % RTILE; - v[r] = mad(x[r], k[i*RTILE+xxx], v[r]); - - //v[yyy][xxx] += x[yyy][i] * k[i][xxx]; - } - } - - } - - GroupMemoryBarrierWithGroupSync(); - } - } - } - - for (uint yy = 0; yy < RTILE; ++yy) - for (uint xx = 0; xx < RTILE; ++xx) - O.Set(n*RTILE+yy, y, x, k*RTILE+xx, v[yy*RTILE+xx], Odata); - - #undef X_ - #undef K_ -} -#endif - -[numthreads(CTILE, CTILE, 1)] -void Conv2D_Kernel3x3_Cache_KCmod16_KNyx_TEMPLATE(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - #define X_ Conv_XcacheT - #define K_ Conv_KcacheT - - uint gx = groupThreadID.x; - uint gy = groupThreadID.y; - - uint k = CTILE * groupID.x + groupThreadID.x; - uint nyx = CTILE * groupID.y + groupThreadID.y; - - uint width = X.width - _Border; - uint height = X.height - _Border; - - uint x = nyx % width; - uint ny = nyx / width; - uint y = ny % height; - uint n = ny / height; - - float v = B.Get(0, 0, k, 0, WBKdata, WBK.dataLength); - for (uint dy = 0; dy < 3; ++dy) - { - bool mask = true; - - if (y+dy < _Offset) mask = false; - if (y+dy-_Offset >= X.height) mask = false; - - for (uint dx = 0; dx < 3; ++dx) - { - if (x+dx < _Offset) mask = false; - if (x+dx-_Offset >= X.width) mask = false; - - //for (uint m = 0; m < (9*128)/CTILE; ++m) - for (uint m = 0; m < X.channels/CTILE; ++m) - { - if (mask) - X_[gy][gx] = X.Get(n, y+dy-_Offset, x+dx-_Offset, m*CTILE + gx, Xdata); - else - X_[gy][gx] = 0; - K_[gy][gx] = K.Get(dy, dx, m*CTILE + gy, k, WBKdata, WBK.dataLength); - GroupMemoryBarrierWithGroupSync(); - - [unroll] - for (uint i = 0; i < CTILE; ++i) - { - float x = X_[gy][i]; - float k =.25;// K_[i][gx]; - v += x * k; - } - } - } - } - - //Odata[nyx * O.channels + k] = v; - - Odata[(( - n * O.height + - y ) * O.width + - x ) * O.channels + - k] = v; - - #undef X_ - #undef K_ -} -// %TODO: only supports up to 51 kernels (51 = 16*16*2/(9kernel+1bias)) for now. Add a loop to handle more! 
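// The Conv2D_Kernel3x3_Cache_* variants above all follow the same idea: express the
// 3x3 convolution as a matrix multiply, stage a CTILE-wide block of input channels
// and kernel weights in groupshared memory, synchronize, and let every thread
// accumulate its own dot product before moving on to the next channel block.
// The sketch below shows only that core pattern, under simplifying assumptions:
// flat NHWC / HWCK buffers, channel counts divisible by the tile size, and a
// pre-padded input (the _Offset/_Border masking done by the kernels above is omitted).
// All names here (Xflat, Wflat, Bflat, Oflat, _XShape, _OShape) are illustrative
// placeholders, not the Tensor helpers used elsewhere in this file.

#define TILE 16

StructuredBuffer<float>   Xflat;   // input activations, flat NHWC layout (assumed)
StructuredBuffer<float>   Wflat;   // 3x3 weights, flat HWCK layout (assumed)
StructuredBuffer<float>   Bflat;   // per-output-channel bias (assumed)
RWStructuredBuffer<float> Oflat;   // output activations, flat NHWC layout (assumed)

uint4 _XShape;                     // (batch, height, width, channels) of X (assumed)
uint4 _OShape;                     // (batch, height, width, channels) of O (assumed)

groupshared float Xtile[TILE][TILE];
groupshared float Wtile[TILE][TILE];

[numthreads(TILE, TILE, 1)]
void Conv2D_3x3_TileCacheSketch(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
    uint k   = TILE * groupID.x + groupThreadID.x;   // output channel handled by this thread
    uint nyx = TILE * groupID.y + groupThreadID.y;   // flattened (batch, y, x) output position

    uint x = nyx % _OShape.z;
    uint y = (nyx / _OShape.z) % _OShape.y;
    uint n = nyx / (_OShape.z * _OShape.y);

    float acc = Bflat[k];
    for (uint dy = 0; dy < 3; ++dy)
        for (uint dx = 0; dx < 3; ++dx)
            for (uint c0 = 0; c0 < _XShape.w; c0 += TILE)   // walk input channels one tile at a time
            {
                // Each thread stages one activation and one weight into groupshared memory.
                Xtile[groupThreadID.y][groupThreadID.x] =
                    Xflat[((n * _XShape.y + y + dy) * _XShape.z + x + dx) * _XShape.w + c0 + groupThreadID.x];
                Wtile[groupThreadID.y][groupThreadID.x] =
                    Wflat[((dy * 3 + dx) * _XShape.w + c0 + groupThreadID.y) * _OShape.w + k];
                GroupMemoryBarrierWithGroupSync();

                // Tile-local dot product: this thread's row of X against its column of W.
                for (uint i = 0; i < TILE; ++i)
                    acc = mad(Xtile[groupThreadID.y][i], Wtile[i][groupThreadID.x], acc);
                GroupMemoryBarrierWithGroupSync();
            }

    Oflat[((n * _OShape.y + y) * _OShape.z + x) * _OShape.w + k] = acc;
}
#undef TILE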
-/* -groupshared float K1cache[KERNEL_SIZE][KERNEL_SIZE][32]; -groupshared float B1cache[32]; -[numthreads(16,16,2)] -void Conv2D_Kernel3x3_1Channel(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - uint k = 16*groupID.x + groupThreadID.x; - uint n = 16*groupID.y + groupThreadID.y; - uint y = 2*groupID.z + groupThreadID.z + _FilterSize; - - uint idx = 16*16*groupThreadID.z + 16*groupThreadID.y + groupThreadID.x; - if (idx < 9 * K.channels) - { - uint kx = idx / K.channels; - uint kk = idx % K.channels; - K1cache[kx/3][kx%3][kk] = K.Get(kx/3, kx%3, 0, kk, WBKdata, WBK.dataLength); - } - else if (idx < 10 * K.channels) - { - uint kk = idx % K.channels; - B1cache[kk] = B.Get(0, 0, kk, 0, WBKdata, WBK.dataLength); - } - GroupMemoryBarrierWithGroupSync(); - - for (uint x = _FilterSize; x < X.width - _FilterSize; ++x) - { - float v = B1cache[k];//B.Get(0, 0, k, 0, WBKdata, WBK.dataLength); - for (int i = -_FilterSize; i < _FilterSize + 1; ++i) - { - for (int j = -_FilterSize; j < _FilterSize + 1; ++j) - { - v += X.Get(n, y+j, x+i, 0, Xdata, X.dataLength) * K1cache[_FilterSize+j][_FilterSize+i][k]; - } - } - O.Set(n, y-_FilterSize, x-_FilterSize, k, v, Odata, O.dataLength); - } -} -*/ - -groupshared float K1cache[32][9]; -[numthreads(32,16,1)] -void Conv2D_Kernel3x3_1Channel(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - uint tk = groupThreadID.x; - uint k = 32*groupID.x + tk; - uint n = 16*groupID.y + groupThreadID.y; - - //for (uint q = 0; q < 9; ++q) - { - uint q = n % 9; - K1cache[tk][q] = K.Get(q/3, q%3, 0, k, WBKdata, WBK.dataLength); - } - GroupMemoryBarrierWithGroupSync(); - - for (uint y = 0; y < X.height - _FilterSize*2; ++y) - { - for (uint x = 0; x < X.width - _FilterSize*2; ++x) - { - float v = B.Get(0, 0, k, 0, WBKdata, WBK.dataLength); - //for (uint q = 0; q < 9; ++q) - // v += X.Get(n, y+q/3, x+q%3, 0, Xdata, X.dataLength) * K1cache[tk][q]; - v += X.Get(n, y+0, x+0, 0, Xdata, X.dataLength) * K1cache[tk][0]; - v += X.Get(n, y+0, x+1, 0, Xdata, X.dataLength) * K1cache[tk][1]; - v += X.Get(n, y+0, x+2, 0, Xdata, X.dataLength) * K1cache[tk][2]; - - v += X.Get(n, y+1, x+0, 0, Xdata, X.dataLength) * K1cache[tk][3]; - v += X.Get(n, y+1, x+1, 0, Xdata, X.dataLength) * K1cache[tk][4]; - v += X.Get(n, y+1, x+2, 0, Xdata, X.dataLength) * K1cache[tk][5]; - - v += X.Get(n, y+2, x+0, 0, Xdata, X.dataLength) * K1cache[tk][6]; - v += X.Get(n, y+2, x+1, 0, Xdata, X.dataLength) * K1cache[tk][7]; - v += X.Get(n, y+2, x+2, 0, Xdata, X.dataLength) * K1cache[tk][8]; - - O.Set(n, y, x, k, v, Odata, O.dataLength); - } - } -} - -float fillValue; - -[numthreads(1,1,1)] -void Fill(uint3 groupID : SV_GroupID) -{ - uint b = groupID.x; - uint h = groupID.y; - uint w = groupID.z; - for (uint ch = 0; ch < O.channels; ++ch) - O.Set(b, h, w, ch+1, fillValue, Odata, O.dataLength); -} -#endif - - -/* -Cbufferconsts{ - uint n; - uint dispatchDim_x;}; -#define groupDim_x 512 -groupshared float Accumulate_sharedMem[groupDim_x * channels]; -[numthreads(groupDim_x, 1, 1)] -void Accumulate(uint tid: SV_GroupIndex, uint3 groupIdx: groupID) -{ - #define sharedMem Reduce_sharedMem - unsigned int i = groupIdx.x * (groupDim_x * 2) + tid; - unsigned int dispatchSize = (groupDim_x * 2) * dispatchDim_x; - sharedMem[tid] = 0; - do { - sharedMem[tid] += g_idata[i] + g_idata[i+groupDim_x]; - i += dispatchSize; - } while (i < n); - GroupMemoryBarrierWithGroupSync(); - - if (groupDim_x >= 256) - { - if (tid < 128) { sharedMem[tid] += sharedMem[tid + 128 * channels]; } - 
GroupMemoryBarrierWithGroupSync(); - } - - if (groupDim_x >= 128) - { - if (tid < 64) { sharedMem[tid] += sharedMem[tid + 64]; } - GroupMemoryBarrierWithGroupSync(); - } - - if (tid < 32) - { - if (groupDim_x >= 64) sharedMem[tid] += sharedMem[tid + 32* channels]; - if (groupDim_x >= 32) sharedMem[tid] += sharedMem[tid + 16* channels]; - if (groupDim_x >= 16) sharedMem[tid] += sharedMem[tid + 8* channels]; - if (groupDim_x >= 8) sharedMem[tid] += sharedMem[tid + 4* channels]; - if (groupDim_x >= 4) sharedMem[tid] += sharedMem[tid + 2* channels]; - if (groupDim_x >= 2) sharedMem[tid] += sharedMem[tid + 1* channels]; - } - - if (tid == 0) g_odata[groupIdx.x] = sharedMem[0]; - - #undef sharedMem -} -*/ - /* -// Could do to reduce across NxN patch fitting within a group, HW <= HW / N -// Repeat, until HW == 1 - -// Alternatively reduce across Y axis, then X - -#undef MAX_CHANNELS -#define MAX_CHANNELS 2048 -groupshared float GlobalAvgPool2D_AccumulatorPerChannel[MAX_CHANNELS]; -[numthreads(4,8,8)] -void GlobalAvgPool2D(uint3 dispatchThreadID : SV_DispatchThreadID, uint threadID : SV_ThreadID) -{ - // NOTE: dispatched over X (not O) - DISPATCH_ARGS(X.channels, X.width, X.height); - TENSOR_ARGS2(X, O); - - uint c = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (c >= X.channels || c >= MAX_CHANNELS) return; - if (x >= X.width) return; - if (y >= X.height) return; - - // Accumulate - for (uint n = 0; n < X.batch; ++n) - { - // Clear accumulator - // @TODO: ThreadID - //uint threadID = groupThreadID.x * 4 + groupThreadID.y * 8 + groupThreadID.z * 8; - if (threadID < MAX_CHANNELS) - GlobalAvgPool2D_AccumulatorPerChannel[threadID] = 0; - GroupMemoryBarrierWithGroupSync(); - - GlobalAvgPool2D_AccumulatorPerChannel[c] += X.Get(n, y, x, c); - // @TODO: atomicAdd? 
- - GroupMemoryBarrierWithGroupSync(); - if (threadID < MAX_CHANNELS) - { - float v = GlobalAvgPool2D_AccumulatorPerChannel[threadID]; - O.Set(n, 0, 0, c, v / (X.width * X.height)); - } - } -}*/ - - -[numthreads(64,2,2)] -void Conv2D_Reg2x2(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - DISPATCH_ARGS(K.kernelCount, O.width, O.height); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - uint k = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (k >= K.channels) return; - if (x*2 >= O.width) return; - if (y*2 >= O.height) return; - - uint2 leftCorner = _Pad.xy; - uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy; - for (uint n = 0; n < O.batch; ++n) - { - float4 acc = B.Get(k); - for (uint dy = 0; dy < K.GetKernelHeight(); ++dy) - { - for (uint dx = 0; dx < K.GetKernelWidth(); ++dx) - { - uint2 pos0 = uint2(x*2+0, y*2+0) * _Stride.xy + uint2(dx, dy); - uint2 pos1 = uint2(x*2+1, y*2+0) * _Stride.xy + uint2(dx, dy); - uint2 pos2 = uint2(x*2+0, y*2+1) * _Stride.xy + uint2(dx, dy); - uint2 pos3 = uint2(x*2+1, y*2+1) * _Stride.xy + uint2(dx, dy); - - for (uint c = 0; c < X.channels; ++c) - { - if (all(pos0 >= leftCorner) && all(pos0 < rightCorner)) - acc.x = fastfma(X.Get(n, pos0 - leftCorner, c), K.Get(dy, dx, c, k), acc.x); - if (all(pos1 >= leftCorner) && all(pos1 < rightCorner)) - acc.y = fastfma(X.Get(n, pos1 - leftCorner, c), K.Get(dy, dx, c, k), acc.y); - if (all(pos2 >= leftCorner) && all(pos2 < rightCorner)) - acc.z = fastfma(X.Get(n, pos2 - leftCorner, c), K.Get(dy, dx, c, k), acc.z); - if (all(pos3 >= leftCorner) && all(pos3 < rightCorner)) - acc.w = fastfma(X.Get(n, pos3 - leftCorner, c), K.Get(dy, dx, c, k), acc.w); - } - } - } - - O.Set(n, y*2+0, x*2+0, k, acc.x); - O.Set(n, y*2+0, x*2+1, k, acc.y); - O.Set(n, y*2+1, x*2+0, k, acc.z); - O.Set(n, y*2+1, x*2+1, k, acc.w); - } -} - -#define SIZE 2 -[numthreads(64, 2, 2)] -void Conv2D_Reg_Loop(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - DISPATCH_ARGS(K.kernelCount, O.width, O.height); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - uint k = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (k >= K.channels) return; - if (x*SIZE >= O.width) return; - if (y*SIZE >= O.height) return; - - uint2 leftCorner = _Pad.xy; - uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy; - for (uint n = 0; n < O.batch; ++n) - { - float acc[SIZE*SIZE]; - [unroll] - for (uint q = 0; q < SIZE*SIZE; ++q) - acc[q] = B.Get(k); - for (uint dy = 0; dy < K.GetKernelHeight(); ++dy) - { - for (uint dx = 0; dx < K.GetKernelWidth(); ++dx) - { - uint2 pos[SIZE*SIZE]; - [unroll] - for (uint q = 0; q < SIZE*SIZE; ++q) - pos[q] = uint2(x*SIZE+(q%SIZE), y*SIZE+(q/SIZE)) * _Stride.xy + uint2(dx, dy); - - // @TODO: investigate - // WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler - - for (uint c = 0; c < X.channels; ++c) - [unroll] - for (q = 0; q < SIZE*SIZE; ++q) - if (all(pos[q] >= leftCorner) && all(pos[q] < rightCorner)) - acc[q] = fastfma(X.Get(n, pos[q] - leftCorner, c), K.Get(dy, dx, c, k), acc[q]); - } - } - - [unroll] - for (q = 0; q < SIZE*SIZE; ++q) - O.Set(n, y*SIZE+(q/SIZE), x*SIZE+(q%SIZE), k, acc[q]); - } -} - -NUMTHREADS((16,4,4), (8,4,4), (16,2,2)) -//[numthreads(64, 1, 1)] -void Conv2D_safe(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - DISPATCH_ARGS(K.kernelCount, O.width, O.height); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - uint k = dispatchThreadID.x; - uint x = dispatchThreadID.y; 
- uint y = dispatchThreadID.z; - - if (k >= K.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - float acc = B.Get(k); - for (uint dy = 0; dy < K.GetKernelHeight(); ++dy) - { - for (uint dx = 0; dx < K.GetKernelWidth(); ++dx) - { - uint2 pos = uint2(x, y) * _Stride.xy + uint2(dx, dy); - - for (uint c = 0; c < X.channels; ++c) - acc = fastfma(X.SafeGet(n, pos, c, _Pad.xy), K.Get(dy, dx, c, k), acc); - } - } - - O.Set(n, y, x, k, acc); - } -} - - -#undef L1CACHESIZE -#define L1CACHESIZE 32 -groupshared float Conv2D_L1Cached32_X[L1CACHESIZE]; -[numthreads(L1CACHESIZE, 1, 1)] -void Conv2D_L1Cached32(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - DISPATCH_ARGS(K.kernelCount, O.width, O.height); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - #define X_ Conv2D_L1Cached32_X - - uint k = L1CACHESIZE * groupID.x + groupThreadID.x; - uint x = groupID.y; - uint y = groupID.z; - - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - float acc = B.SafeGet(k); - for (uint dy = 0; dy < K.GetKernelHeight(); ++dy) - { - for (uint dx = 0; dx < K.GetKernelWidth(); ++dx) - { - uint2 pos = uint2(x,y) * _Stride.xy + uint2(dx,dy); - - for (uint c = 0; c < X.channels; c += L1CACHESIZE) - { - // Cache X - X_[groupThreadID.x] = X.SafeGet(n, pos, c + groupThreadID.x, _Pad.xy); - GroupMemoryBarrierWithGroupSync(); - - // X * K - if (k < K.channels) - { - for (uint dc = 0; dc < L1CACHESIZE; ++dc) - acc = fastfma(X_[dc], K.Get(dy, dx, c + dc, k), acc); - } - GroupMemoryBarrierWithGroupSync(); - } - } - } - - O.Set(n, y, x, k, acc); - } - - #undef X_ -} - -#undef L1CACHESIZE -#define L1CACHESIZE 64 -groupshared float Conv2D_L1Cached64_X[L1CACHESIZE]; -[numthreads(L1CACHESIZE, 1, 1)] -void Conv2D_L1Cached64(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - DISPATCH_ARGS(K.kernelCount, O.width, O.height); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - #define X_ Conv2D_L1Cached64_X - - uint k = L1CACHESIZE * groupID.x + groupThreadID.x; - uint x = groupID.y; - uint y = groupID.z; - - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - float acc = B.SafeGet(k); - for (uint dy = 0; dy < K.GetKernelHeight(); ++dy) - { - for (uint dx = 0; dx < K.GetKernelWidth(); ++dx) - { - uint2 pos = uint2(x,y) * _Stride.xy + uint2(dx,dy); - for (uint c = 0; c < X.channels; c += L1CACHESIZE) - { - // Cache X - X_[groupThreadID.x] = X.SafeGet(n, pos, c + groupThreadID.x, _Pad.xy); - GroupMemoryBarrierWithGroupSync(); - - // X * K - if (k < K.channels) - { - for (uint dc = 0; dc < L1CACHESIZE; ++dc) - acc = fastfma(X_[dc], K.Get(dy, dx, c + dc, k), acc); - } - GroupMemoryBarrierWithGroupSync(); - } - } - } - - O.Set(n, y, x, k, acc); - } - - #undef X_ -} - - -#undef SIZE -#define SIZE 2 -[numthreads(64, 2, 2)] -void Conv2D_Reg_Loop_safe(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - DISPATCH_ARGS(K.kernelCount, O.width, O.height); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - uint k = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (k >= K.channels) return; - if (x*SIZE >= O.width) return; - if (y*SIZE >= O.height) return; - - uint2 leftCorner = _Pad.xy; - uint2 rightCorner = uint2(X.width, X.height) + _Pad.xy; - for (uint n = 0; n < O.batch; ++n) - { - float acc[SIZE*SIZE]; - [unroll] - for (uint q = 0; q < SIZE*SIZE; ++q) - acc[q] = B.Get(k); - for (uint dy = 0; dy < K.GetKernelHeight(); 
++dy) - { - for (uint dx = 0; dx < K.GetKernelWidth(); ++dx) - { - uint2 pos[SIZE*SIZE]; - [unroll] - for (uint q = 0; q < SIZE*SIZE; ++q) - pos[q] = uint2(x*SIZE+(q%SIZE), y*SIZE+(q/SIZE)) * _Stride.xy + uint2(dx, dy); - - // @TODO: investigate - // WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler - - for (uint c = 0; c < X.channels; ++c) - [unroll] - for (q = 0; q < SIZE*SIZE; ++q) - acc[q] = fastfma(X.SafeGet(n, pos[q], c, _Pad.xy), K.Get(dy, dx, c, k), acc[q]); - } - } - - [unroll] - for (q = 0; q < SIZE*SIZE; ++q) - O.Set(n, y*SIZE+(q/SIZE), x*SIZE+(q%SIZE), k, acc[q]); - } -} - - -#undef L1CACHESIZE -#define L1CACHESIZE 64 -#undef SIZE -#define SIZE 2 -groupshared float Conv2D_L1Cached64_Reg_Loop2x2_X[SIZE*SIZE][L1CACHESIZE]; -[numthreads(L1CACHESIZE, 1, 1)] -void Conv2D_L1Cached64_Reg_Loop2x2(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - DISPATCH_ARGS(K.kernelCount, O.width, O.height); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - #define X_ Conv2D_L1Cached64_Reg_Loop2x2_X - - uint k = L1CACHESIZE * groupID.x + groupThreadID.x; - uint x = groupID.y; - uint y = groupID.z; - - // need all threads to load channels, thus will do late check against kernel count - if (x*SIZE >= O.width) return; - if (y*SIZE >= O.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - float acc[SIZE*SIZE]; - [unroll] - for (uint q = 0; q < SIZE*SIZE; ++q) - acc[q] = B.SafeGet(k); - - for (uint dy = 0; dy < K.GetKernelHeight(); ++dy) - { - for (uint dx = 0; dx < K.GetKernelWidth(); ++dx) - { - uint2 pos[SIZE*SIZE]; - [unroll] - for (uint q = 0; q < SIZE*SIZE; ++q) - pos[q] = uint2(x*SIZE+(q%SIZE), y*SIZE+(q/SIZE)) * _Stride.xy + uint2(dx, dy); - - for (uint c = 0; c < X.channels; c += L1CACHESIZE) - { - // Cache X - uint dc = groupThreadID.x; - [unroll] - for (q = 0; q < SIZE*SIZE; ++q) - X_[q][dc] = X.SafeGet(n, pos[q], c + dc, _Pad.xy); - GroupMemoryBarrierWithGroupSync(); - - // X * K - if (k < K.channels) // need all threads to load channels, thus late check against kernel count - { - uint kIndex = K.Index(dy, dx, c, k); - for (dc = 0; dc < L1CACHESIZE; ++dc) - { - for (q = 0; q < SIZE*SIZE; ++q) - acc[q] = fastfma(X_[q][dc], K.data[kIndex], acc[q]); //K.Get(dy, dx, c + dc, k); - kIndex += K.channels; - } - } - GroupMemoryBarrierWithGroupSync(); - } - } - } - - if (k < K.channels) // need all threads to load channels, thus late check against kernel count - [unroll] - for (q = 0; q < SIZE*SIZE; ++q) - O.Set(n, y*SIZE+(q/SIZE), x*SIZE+(q%SIZE), k, acc[q]); - } - - #undef X_ -} - - -#undef L1CACHESIZE -#define L1CACHESIZE 64 -#undef SIZE -#define SIZE 4 -groupshared float Conv2D_L1Cached64_Reg_Loop_X[SIZE*SIZE][L1CACHESIZE]; -[numthreads(L1CACHESIZE, 1, 1)] -void Conv2D_L1Cached64_Reg_Loop(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - DISPATCH_ARGS(K.kernelCount, O.width, O.height); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - #define X_ Conv2D_L1Cached64_Reg_Loop_X - - uint k = L1CACHESIZE * groupID.x + groupThreadID.x; - uint x = groupID.y; - uint y = groupID.z; - - // need all threads to load channels, thus will do late check against kernel count - if (x*SIZE >= O.width) return; - if (y*SIZE >= O.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - float acc[SIZE*SIZE]; - [unroll] - for (uint q = 0; q < SIZE*SIZE; ++q) - acc[q] = B.SafeGet(k); - - for (uint dy = 0; dy < K.GetKernelHeight(); ++dy) - { - for (uint dx = 0; dx < K.GetKernelWidth(); ++dx) - { - 
uint2 pos[SIZE*SIZE]; - [unroll] - for (uint q = 0; q < SIZE*SIZE; ++q) - pos[q] = uint2(x*SIZE+(q%SIZE), y*SIZE+(q/SIZE)) * _Stride.xy + uint2(dx, dy); - - for (uint c = 0; c < X.channels; c += L1CACHESIZE) - { - // Cache X - uint dc = groupThreadID.x; - [unroll] - for (q = 0; q < SIZE*SIZE; ++q) - X_[q][dc] = X.SafeGet(n, pos[q], c + dc, _Pad.xy); - GroupMemoryBarrierWithGroupSync(); - - // X * K - if (k < K.channels) // need all threads to load channels, thus late check against kernel count - { - uint kIndex = K.Index(dy, dx, c, k); - for (dc = 0; dc < L1CACHESIZE; ++dc) - { - for (q = 0; q < SIZE*SIZE; ++q) - acc[q] = fastfma(X_[q][dc], K.data[kIndex], acc[q]);//K.Get(dy, dx, c + dc, k); - kIndex += K.channels; - } - } - GroupMemoryBarrierWithGroupSync(); - } - } - } - - if (k < K.channels) // need all threads to load channels, thus late check against kernel count - [unroll] - for (q = 0; q < SIZE*SIZE; ++q) - O.Set(n, y*SIZE+(q/SIZE), x*SIZE+(q%SIZE), k, acc[q]); - } - - #undef X_ -} - - -#undef L1CACHESIZE -#define L1CACHESIZE 64 -#define SIZE_W 4 -#define SIZE_H 2 -groupshared float Conv2D_L1Cached64_Reg_Loop_safe__X[SIZE_H*SIZE_W][L1CACHESIZE]; -[numthreads(L1CACHESIZE, 1, 1)] -void Conv2D_L1Cached64_Reg_Loop_safe_(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - DISPATCH_ARGS(K.kernelCount, O.width, O.height); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - #define X_ Conv2D_L1Cached64_Reg_Loop_safe__X - - uint k = L1CACHESIZE * groupID.x + groupThreadID.x; - uint x = groupID.y; - uint y = groupID.z; - - // need all threads to load channels, thus will do late check against kernel count - if (x*SIZE_W >= O.width) return; - if (y*SIZE_H >= O.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - float acc[SIZE_H*SIZE_W]; - [unroll] - for (uint q = 0; q < SIZE_H*SIZE_W; ++q) - acc[q] = B.SafeGet(k); - - for (uint dy = 0; dy < K.GetKernelHeight(); ++dy) - { - for (uint dx = 0; dx < K.GetKernelWidth(); ++dx) - { - uint2 pos[SIZE_H*SIZE_W]; - [unroll] - for (uint q = 0; q < SIZE_H*SIZE_W; ++q) - pos[q] = uint2(x*SIZE_W+(q%SIZE_W), y*SIZE_H+(q/SIZE_W)) * _Stride.xy + uint2(dx, dy); - - for (uint c = 0; c < X.channels; c += L1CACHESIZE) - { - // Cache X - uint dc = groupThreadID.x; - [unroll] - for (q = 0; q < SIZE_H*SIZE_W; ++q) - X_[q][dc] = X.SafeGet(n, pos[q], c + dc, _Pad.xy); - GroupMemoryBarrierWithGroupSync(); - - // X * K - if (k < K.channels) // need all threads to load channels, thus late check against kernel count - { - uint kIndex = K.Index(dy, dx, c, k); - for (dc = 0; dc < L1CACHESIZE; ++dc) - { - [unroll] - for (q = 0; q < SIZE_H*SIZE_W; ++q) - acc[q] = fastfma(X_[q][dc], K.data[kIndex], acc[q]); - kIndex += K.channels; - } - } - GroupMemoryBarrierWithGroupSync(); - } - } - } - - if (k < K.channels) // need all threads to load channels, thus late check against kernel count - [unroll] - for (q = 0; q < SIZE_H*SIZE_W; ++q) - { - uint ox = x*SIZE_W+(q%SIZE_W); - uint oy = y*SIZE_H+(q/SIZE_W); - if (ox < O.width && oy < O.height) - O.Set(n, oy, ox, k, acc[q]); - } - } - - #undef X_ -} -#undef SIZE_H -#undef SIZE_W - - -/* -#undef L1CACHESIZE -#define L1CACHESIZE 32 -#define SIZE_W 4 -#define SIZE_H 2 -groupshared float Conv2D_L1Cached64_Reg_Loop_safe__X[SIZE_H*SIZE_W][L1CACHESIZE]; -[numthreads(L1CACHESIZE, SIZE_W, SIZE_H)] -void Conv2D_L1Cached64_Reg_Loop_safe_(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - DISPATCH_ARGS(K.kernelCount, O.width, O.height); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - #define X_ 
Conv2D_L1Cached64_Reg_Loop_safe__X - - uint k = L1CACHESIZE * groupID.x + groupThreadID.x; - uint x = SIZE_W * groupID.y + groupThreadID.y; - uint y = SIZE_H * groupID.z + groupThreadID.z; - - // need all threads to load channels, thus will do late check against kernel count - //if (x*SIZE_W >= O.width) return; - //if (y*SIZE_H >= O.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - float acc[SIZE_H*SIZE_W]; - [unroll] - for (uint q = 0; q < SIZE_H*SIZE_W; ++q) - acc[q] = B.SafeGet(k); - - for (uint dy = 0; dy < K.GetKernelHeight(); ++dy) - { - for (uint dx = 0; dx < K.GetKernelWidth(); ++dx) - { - //uint2 pos[SIZE_H*SIZE_W]; - //[unroll] - //for (uint q = 0; q < SIZE_H*SIZE_W; ++q) - // pos[q] = uint2(x*SIZE_W+(q%SIZE_W), y*SIZE_H+(q/SIZE_W)) * _Stride.xy + uint2(dx, dy); - - for (uint c = 0; c < X.channels; c += L1CACHESIZE) - { - // Cache X - uint dc = groupThreadID.x; - uint gx = groupThreadID.y; - uint gy = groupThreadID.z; - //[unroll] - //for (q = 0; q < SIZE_H*SIZE_W; ++q) - //{ - uint2 pos = uint2(x*SIZE_W+gx, y*SIZE_H+gy) * _Stride.xy + uint2(dx, dy); - X_[SIZE_W*gy+gx][dc] = X.SafeGet(n, pos, c + dc, _Pad.xy); - //} - GroupMemoryBarrierWithGroupSync(); - - // X * K - if (k < K.channels && - x*SIZE_W < O.width && - y*SIZE_H < O.height) // need all threads to load channels, thus late check against kernel count - { - uint kIndex = K.Index(dy, dx, c, k); - for (dc = 0; dc < L1CACHESIZE; ++dc) - { - [unroll] - for (q = 0; q < SIZE_H*SIZE_W; ++q) - acc[q] += X_[q][dc] * K.data[kIndex];//K.Get(dy, dx, c + dc, k); - kIndex += K.channels; - } - } - GroupMemoryBarrierWithGroupSync(); - } - } - } - - if (k < K.channels) // need all threads to load channels, thus late check against kernel count - [unroll] - for (q = 0; q < SIZE_H*SIZE_W; ++q) - { - uint ox = x*SIZE_W+(q%SIZE_W); - uint oy = y*SIZE_H+(q/SIZE_W); - if (ox < O.width && oy < O.height) - O.Set(n, oy, ox, k, acc[q]); - } - } - - #undef X_ -} -#undef SIZE_H -#undef SIZE_W -*/ - -/* -#undef L1CACHESIZE -#define L1CACHESIZE 64 -groupshared float Conv2D_RegCached_X[4][L1CACHESIZE]; -[numthreads(L1CACHESIZE, 1, 1)] -void Conv2D_RegCached(uint3 dispatchThreadID : SV_DispatchThreadID, uint3 groupThreadID : SV_GroupThreadID) -{ - DISPATCH_ARGS(K.kernelCount, O.width, O.height); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - #define X_ Conv2D_RegCached_X - - uint k = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (x*2 >= O.width) return; - if (y*2 >= O.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - float4 acc = B.SafeGet(k); - for (uint dy = 0; dy < K.GetKernelHeight(); ++dy) - { - for (uint dx = 0; dx < K.GetKernelWidth(); ++dx) - { - uint2 pos0 = uint2(x*2+0,y*2+0) * _Stride + uint2(dx,dy); - uint2 pos1 = uint2(x*2+1,y*2+0) * _Stride + uint2(dx,dy); - uint2 pos2 = uint2(x*2+0,y*2+1) * _Stride + uint2(dx,dy); - uint2 pos3 = uint2(x*2+1,y*2+1) * _Stride + uint2(dx,dy); - - // Cache X - uint c_ = groupThreadID.x; - if (c_ < X.channels) - { - X_[0][c_] = X.SafeGet(n, pos0, c_, _Pad.xy); - X_[1][c_] = X.SafeGet(n, pos1, c_, _Pad.xy); - X_[2][c_] = X.SafeGet(n, pos2, c_, _Pad.xy); - X_[3][c_] = X.SafeGet(n, pos3, c_, _Pad.xy); - } - GroupMemoryBarrierWithGroupSync(); - - // X * K - if (k < K.channels) - for (uint c = 0; c < X.channels; ++c) - { - acc.x += X_[0][c] * K.Get(dy, dx, c, k); - acc.y += X_[1][c] * K.Get(dy, dx, c, k); - acc.z += X_[2][c] * K.Get(dy, dx, c, k); - acc.w += X_[3][c] * K.Get(dy, dx, c, k); - } - GroupMemoryBarrierWithGroupSync(); - } - } - - O.Set(n, 
y*2+0, x*2+0, k, acc.x); - O.Set(n, y*2+0, x*2+1, k, acc.y); - O.Set(n, y*2+1, x*2+0, k, acc.z); - O.Set(n, y*2+1, x*2+1, k, acc.w); - } -} -*/ - -/* -[numthreads(16,4,4)] -void Conv2DTrans(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - DISPATCH_ARGS(K.kernelCount, O.width, O.height); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - uint k = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (k >= K.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - uint2 strideMask = _Stride.xy - 1; - - for (uint n = 0; n < O.batch; ++n) - { - float acc = B.Get(k); - for (uint dy = 0; dy < K.GetKernelHeight(); dy += _Stride.y) - { - for (uint dx = 0; dx < K.GetKernelWidth(); dx += _Stride.x) - { - uint dxShifted = dx + (x&strideMask.x); - uint dyShifted = dy + (y&strideMask.y); - - uint xx = x + dxShifted; - uint yy = y + dyShifted; - - uint oy = (yy - _Pad.y) / _Stride.y; - uint ox = (xx - _Pad.x) / _Stride.x; - - bool mask = xx >= _Pad.x && yy >= _Pad.y && ox < X.width && oy < X.height; - if (!mask) continue; - - // [unroll] - crashes metal compiler - for (uint c = 0; c < X.channels; ++c) - { - acc += X.Get(n, oy, ox, c) * K.Get( K.GetKernelHeight() - 1 - dyShifted, - K.GetKernelWidth() - 1 - dxShifted, c, k); - } - } - } - - O.Set(n, y, x, k, acc); - } -} -*/ - - - -#undef SIZE -#define SIZE 4 -[numthreads(16, 4, 4)] -void Conv2DTrans_Reg_Loop_safe(uint3 dispatchThreadID : SV_DispatchThreadID) -{ - DISPATCH_ARGS(K.kernelCount, O.width, O.height); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - uint k = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (k >= K.channels) return; - if (x*SIZE >= O.width) return; - if (y*SIZE >= O.height) return; - - uint2 strideMask = _Stride.xy - 1; - - uint2 pad = _Pad.xy / _Stride.xy; - for (uint n = 0; n < O.batch; ++n) - { - float acc[SIZE*SIZE]; - [unroll] - for (uint q = 0; q < SIZE*SIZE; ++q) - acc[q] = B.Get(k); - - for (uint dy = 0; dy < K.GetKernelHeight(); dy += _Stride.y) - { - for (uint dx = 0; dx < K.GetKernelWidth(); dx += _Stride.x) - { - uint2 kernelPos[SIZE*SIZE]; - uint2 pos[SIZE*SIZE]; - - [unroll] - for (uint q = 0; q < SIZE*SIZE; ++q) - { - uint2 xy = uint2(x*SIZE+(q%SIZE), y*SIZE+(q/SIZE)); - kernelPos[q] = uint2(dx, dy) + (xy & strideMask); - pos[q] = (xy + kernelPos[q]) / _Stride.xy; - - // transpose - kernelPos[q] = uint2(K.GetKernelWidth(), K.GetKernelHeight()) - 1 - kernelPos[q]; - } - - for (uint c = 0; c < X.channels; ++c) - [unroll] - for (q = 0; q < SIZE*SIZE; ++q) - acc[q] = fastfma(X.SafeGet(n, pos[q], c, pad.xy), K.Get(kernelPos[q].y, kernelPos[q].x, c, k), acc[q]); - //acc[q] += X.SafeGet(n, pos[q], c, pad.xy) * K.Get(kernelPos[q].y, kernelPos[q].x, c, k); - } - } - - [unroll] - for (q = 0; q < SIZE*SIZE; ++q) - O.Set(n, y*SIZE+(q/SIZE), x*SIZE+(q%SIZE), k, acc[q]); - } -} - - - -#undef L1CACHESIZE -#define L1CACHESIZE 64 -#define SIZE_W 4 -#define SIZE_H 2 -groupshared float Conv2DTrans_L1Cached64_Reg_Loop_safe__X[SIZE_H*SIZE_W][L1CACHESIZE]; -[numthreads(L1CACHESIZE, 1, 1)] -void Conv2DTrans_L1Cached64_Reg_Loop_safe_(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - DISPATCH_ARGS(K.kernelCount, O.width, O.height); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - #define X_ Conv2DTrans_L1Cached64_Reg_Loop_safe__X - - uint k = L1CACHESIZE * groupID.x + groupThreadID.x; - uint x = groupID.y; - uint y = groupID.z; - - // need all threads to load channels, thus will do late check against kernel count - if (x*SIZE_W >= 
O.width) return; - if (y*SIZE_H >= O.height) return; - - uint2 strideMask = _Stride.xy - 1; - uint2 pad = _Pad.xy / _Stride.xy; - - for (uint n = 0; n < O.batch; ++n) - { - float acc[SIZE_H*SIZE_W]; - [unroll] - for (uint q = 0; q < SIZE_H*SIZE_W; ++q) - acc[q] = B.SafeGet(k); - - for (uint dy = 0; dy < K.GetKernelHeight(); dy += _Stride.y) - { - for (uint dx = 0; dx < K.GetKernelWidth(); dx += _Stride.x) - { - uint2 kernelPos[SIZE_H*SIZE_W]; - uint2 pos[SIZE_H*SIZE_W]; - - [unroll] - for (uint q = 0; q < SIZE_H*SIZE_W; ++q) - { - uint2 xy = uint2(x*SIZE_W+(q%SIZE_W), y*SIZE_H+(q/SIZE_W)); - kernelPos[q] = uint2(dx, dy) + (xy & strideMask); - pos[q] = (xy + kernelPos[q]) / _Stride.xy; - - // transpose - kernelPos[q] = uint2(K.GetKernelWidth(), K.GetKernelHeight()) - 1 - kernelPos[q]; - } - - for (uint c = 0; c < X.channels; c += L1CACHESIZE) - { - // Cache X - uint dc = groupThreadID.x; - [unroll] - for (q = 0; q < SIZE_H*SIZE_W; ++q) - X_[q][dc] = X.SafeGet(n, pos[q], c + dc, pad.xy); - GroupMemoryBarrierWithGroupSync(); - - // X * K - if (k < K.channels) // need all threads to load channels, thus late check against kernel count - { - for (dc = 0; dc < L1CACHESIZE; ++dc) - { - [unroll] - for (q = 0; q < SIZE_H*SIZE_W; ++q) - acc[q] = fastfma(X_[q][dc], K.Get(kernelPos[q].y, kernelPos[q].x, c + dc, k), acc[q]); - //acc[q] += X_[q][dc] * K.Get(kernelPos[q].y, kernelPos[q].x, c + dc, k); - } - } - GroupMemoryBarrierWithGroupSync(); - } - } - } - - if (k < K.channels) // need all threads to load channels, thus late check against kernel count - [unroll] - for (q = 0; q < SIZE_H*SIZE_W; ++q) - { - uint ox = x*SIZE_W+(q%SIZE_W); - uint oy = y*SIZE_H+(q/SIZE_W); - if (ox < O.width && oy < O.height) - O.Set(n, oy, ox, k, acc[q]); - } - } - - #undef X_ -} -#undef SIZE_H -#undef SIZE_W - - -/* -#undef L1CACHESIZE -#define L1CACHESIZE 64 -groupshared float Conv2DTrans_L1Cached64_Reg_Loop_safe_X[L1CACHESIZE]; -[numthreads(L1CACHESIZE, 1, 1)] -void Conv2DTrans_L1Cached64_Reg_Loop_safe(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - DISPATCH_ARGS(K.kernelCount, X.width, X.height); - TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - #define X_ Conv2DTrans_L1Cached64_Reg_Loop_safe_X - - uint k = L1CACHESIZE * groupID.x + groupThreadID.x; - uint x = groupID.y; - uint y = groupID.z; - - // need all threads to load channels, thus will do late check against kernel count - if (x >= X.width) return; - if (y >= X.height) return; - - uint2 pad = _Pad.xy / _Stride.xy; - - for (uint n = 0; n < O.batch; ++n) - { - for (uint sy = 0; sy < _Stride.y; ++sy) - { - for (uint sx = 0; sx < _Stride.x; ++sx) - { - float acc = B.SafeGet(k); - - for (uint dy = sy; dy < K.GetKernelHeight(); dy += _Stride.y) - { - for (uint dx = sx; dx < K.GetKernelWidth(); dx += _Stride.x) - { - uint2 pos = uint2(x, y) + uint2(sx + dx, sy + dy) / _Stride.xy; - - for (uint c = 0; c < X.channels; c += L1CACHESIZE) - { - // Cache X - uint dc = groupThreadID.x; - X_[dc] = X.SafeGet(n, pos, c + dc, pad); - GroupMemoryBarrierWithGroupSync(); - - // X * K - if (k < K.channels) // need all threads to load channels, thus late check against kernel count - { - for (dc = 0; dc < L1CACHESIZE; ++dc) - { - acc = fastfma( X_[dc], - K.Get( K.GetKernelHeight() - 1 - dy, - K.GetKernelWidth() - 1 - dx, c + dc, k), - acc); - } - } - GroupMemoryBarrierWithGroupSync(); - } - } - } - - uint oy = y * _Stride.y + sy; - uint ox = x * _Stride.x + sx; - if (oy < O.height && ox < O.width && k < K.channels) - O.Set(n, oy, ox, k, acc); - } - } - } - - #undef 
X_ -} -*/ -#endif - - diff --git a/Assets/Coach-ML/Barracuda/Resources/Experimental.compute.meta b/Assets/Coach-ML/Barracuda/Resources/Experimental.compute.meta deleted file mode 100644 index 49e7b42..0000000 --- a/Assets/Coach-ML/Barracuda/Resources/Experimental.compute.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 299ca130202014274b506123e830c52d -timeCreated: 1506672486 -licenseType: Pro -ComputeShaderImporter: - currentAPIMask: 196608 - userData: - assetBundleName: - assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Resources/FastNV.compute b/Assets/Coach-ML/Barracuda/Resources/FastNV.compute deleted file mode 100644 index 00f077e..0000000 --- a/Assets/Coach-ML/Barracuda/Resources/FastNV.compute +++ /dev/null @@ -1,188 +0,0 @@ -//#pragma kernel Dense64 -//#pragma kernel Conv2D_Kernel3x3_64 - -#include "Tensor.cginc" - -TENSOR_DECL(X) -TENSOR_DECL(W) -TENSOR_DECL(K) -TENSOR_DECL(B) -TENSOR_DECL(WBK) -TENSOR_DECL_RW(O) - -uint4 _Pad; -uint4 _Stride; - -#undef THREAD_COUNT -#define THREAD_COUNT 64 // ATM support only 8x8 - -#undef BLOCK_WIDTH -#define BLOCK_WIDTH 8 - -#undef LOAD_WIDTH -#define LOAD_WIDTH THREAD_COUNT - -#undef LOAD_DEPTH -#define LOAD_DEPTH BLOCK_WIDTH - -groupshared float DenseTiled_XcacheR[LOAD_DEPTH][LOAD_WIDTH]; -groupshared float DenseTiled_WcacheR[LOAD_DEPTH][LOAD_WIDTH]; - -[numthreads(THREAD_COUNT, 1, 1)] -void Dense64(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - // @TODO: DISPATCH_ARGS(...) - TENSOR_SHARED2_ARGS4(X, W, B, WBK, O); - - #define X_ DenseTiled_XcacheR - #define W_ DenseTiled_WcacheR - - uint id = groupThreadID.x; - uint bx = groupID.x; - uint by = groupID.y; - - uint bbx = id % BLOCK_WIDTH; - uint bby = id / BLOCK_WIDTH; - - float v[BLOCK_WIDTH][BLOCK_WIDTH]; - for (uint yy = 0; yy < BLOCK_WIDTH; ++yy) - for (uint xx = 0; xx < BLOCK_WIDTH; ++xx) - { - float bias = B.Get(bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + xx); - v[yy][xx] = bias; - } - - for (uint m = 0; m < X.GetFlatWidth()/LOAD_DEPTH; ++m) - { - for (uint q = 0; q < LOAD_DEPTH; ++q) - { - X_[q][id] = X.Get(by*LOAD_WIDTH + id, m*LOAD_DEPTH + q); - W_[q][id] = W.Get(m*LOAD_DEPTH + q, bx*LOAD_WIDTH + id); - } - - GroupMemoryBarrierWithGroupSync(); - - for (uint yyy = 0; yyy < BLOCK_WIDTH; ++yyy) - [unroll] for (uint xxx = 0; xxx < BLOCK_WIDTH; ++xxx) - [unroll] for (uint i = 0; i < LOAD_DEPTH; ++i) - { - v[yyy][xxx] = mad(X_[i][bby*BLOCK_WIDTH + yyy], W_[i][bbx*BLOCK_WIDTH + xxx], v[yyy][xxx]); - } - - GroupMemoryBarrierWithGroupSync(); - } - - for (uint yyy = 0; yyy < BLOCK_WIDTH; ++yyy) - for (uint xxx = 0; xxx < BLOCK_WIDTH; ++xxx) - O.Set(by*LOAD_WIDTH + bby*BLOCK_WIDTH + yyy, bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + xxx, v[yyy][xxx]); - - #undef X_ - #undef W_ -} - - -#undef THREAD_COUNT -#define THREAD_COUNT 64 // ATM support only 8x8 - -#undef BLOCK_WIDTH -#define BLOCK_WIDTH 8 - -#undef LOAD_WIDTH -#define LOAD_WIDTH THREAD_COUNT - -#undef LOAD_DEPTH -#define LOAD_DEPTH BLOCK_WIDTH - -groupshared float Conv_KcacheR[LOAD_DEPTH][LOAD_WIDTH]; -groupshared float Conv_XcacheR[LOAD_DEPTH][LOAD_WIDTH]; -[numthreads(THREAD_COUNT, 1, 1)] -void Conv2D_Kernel3x3_64(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID) -{ - // @TODO: DISPATCH_ARGS(...) 
- TENSOR_SHARED2_ARGS4(X, K, B, WBK, O); - - #define X_ Conv_XcacheR - #define K_ Conv_KcacheR - - uint id = groupThreadID.x; - uint bx = groupID.x; - uint by = groupID.y; - - uint bbx = id % BLOCK_WIDTH; - uint bby = id / BLOCK_WIDTH; - - uint width = O.width; - uint height = O.height; - - // ASSERT(LOAD_WIDTH == THREAD_COUNT) - uint loadNYX = by*LOAD_WIDTH + id; // only works for 8x8 - uint loadX = loadNYX % width; - uint loadNY = loadNYX / width; - uint loadY = loadNY % height; - uint loadN = loadNY / height; - - // @TODO: validate that _Stride works, added the following 2 lines without testing - loadX *= _Stride.x; - loadY *= _Stride.y; - - float v[BLOCK_WIDTH][BLOCK_WIDTH]; - [unroll] for (uint yy = 0; yy < BLOCK_WIDTH; ++yy) - [unroll] for (uint xx = 0; xx < BLOCK_WIDTH; ++xx) - { - float bias = B.Get(bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + xx); - v[yy][xx] = bias; - } - - for (uint dy = 0; dy < 3; ++dy) - { - bool mask = true; - - if (loadY+dy < _Pad.y) mask = false; - if (loadY+dy - _Pad.w >= X.height) mask = false; - - for (uint dx = 0; dx < 3; ++dx) - { - if (loadX+dx < _Pad.x) mask = false; - if (loadX+dx - _Pad.z >= X.width) mask = false; - - for (uint m = 0; m < X.channels/LOAD_DEPTH; ++m) - { - for (uint q = 0; q < LOAD_DEPTH; ++q) - { - if (mask) - X_[q][id] = X.Get(loadN, loadY+dy-_Pad.y, loadX+dx-_Pad.x, m*LOAD_DEPTH + q); - else - X_[q][id] = 0; - K_[q][id] = K.Get(dy, dx, m*LOAD_DEPTH + q, bx*LOAD_WIDTH + id); - } - - GroupMemoryBarrierWithGroupSync(); - - for (uint yyy = 0; yyy < BLOCK_WIDTH; ++yyy) - [unroll] for (uint xxx = 0; xxx < BLOCK_WIDTH; ++xxx) - [unroll] for (uint i = 0; i < LOAD_DEPTH; ++i) - { - v[yyy][xxx] += X_[i][bby*BLOCK_WIDTH + yyy] * K_[i][bbx*BLOCK_WIDTH + xxx]; - } - - GroupMemoryBarrierWithGroupSync(); - } - } - } - - [unroll] for (uint yyy = 0; yyy < BLOCK_WIDTH; ++yyy) - [unroll] for (uint xxx = 0; xxx < BLOCK_WIDTH; ++xxx) - { - uint saveNYX = by*LOAD_WIDTH + bby*BLOCK_WIDTH + yyy; - uint saveX = saveNYX % width; - uint saveNY = saveNYX / width; - uint saveY = saveNY % height; - uint saveN = saveNY / height; - - uint saveK = bx*LOAD_WIDTH + bbx*BLOCK_WIDTH + xxx; - O.Set(saveN, saveY, saveX, saveK, v[yyy][xxx]); - } - - #undef X_ - #undef K_ -} diff --git a/Assets/Coach-ML/Barracuda/Resources/FastNV.compute.meta b/Assets/Coach-ML/Barracuda/Resources/FastNV.compute.meta deleted file mode 100644 index 91a8425..0000000 --- a/Assets/Coach-ML/Barracuda/Resources/FastNV.compute.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: c7c673db45e6845d5abaed4ed5ef42e1 -timeCreated: 1507294253 -licenseType: Pro -ComputeShaderImporter: - currentAPIMask: 196608 - userData: - assetBundleName: - assetBundleVariant: diff --git a/Assets/Coach-ML/Barracuda/Resources/TexConv.compute b/Assets/Coach-ML/Barracuda/Resources/TexConv.compute deleted file mode 100644 index e3174b1..0000000 --- a/Assets/Coach-ML/Barracuda/Resources/TexConv.compute +++ /dev/null @@ -1,99 +0,0 @@ -#pragma kernel TexConv2D - -#include "Tensor.cginc" - -TENSOR_DECL(X) -TENSOR_DECL(K) -TENSOR_DECL(B) -TENSOR_DECL(WBK) -TENSOR_DECL_RW(O) - -uint4 _Pad; -uint4 _Stride; - -struct TextureAsTensor : Tensor -{ - Texture2D tex; - SamplerState smp; - - Texture2DArray texArray; - SamplerState smpArray; - - void Init(uint4 nhwc, Texture2D tex_, SamplerState sampler_, Texture2DArray texArray_, SamplerState samplerArray_) - { - Tensor::Init(nhwc); - tex = tex_; - smp = sampler_; - texArray = texArray_; - smpArray = samplerArray_; - } - - float4 Get(uint b, uint y, uint x) - { - float3 loc = 
float3((float)x / (float)width, (float)y / (float)height, b); - if (batch > 1) - return texArray.SampleLevel(smpArray, loc, 0); - else - return tex.SampleLevel(smp, loc.xy, 0); - } -}; - -#define TENSOR_SHARED2_ARGS3(A, B, S, O) TENSOR_SHARED_ARG(A, S); TENSOR_SHARED_ARG(B, S); TENSOR_ARG_RW(O); -Texture2DArray Xtex2DArray; -Texture2D Xtex2D; -SamplerState samplerXtex2D { Filter = MIN_MAG_LINEAR_MIP_POINT; AddressU = Clamp; AddressV = Clamp; }; -SamplerState samplerXtex2DArray { Filter = MIN_MAG_LINEAR_MIP_POINT; AddressU = Clamp; AddressV = Clamp; }; - -#define MAX_CHANNELS 4 - -NUMTHREADS((16,4,4), (16,4,2), (16,2,2)) -void TexConv2D(uint3 dispatchThreadID : SV_DispatchThreadID) -{ -// @TODO: currently it fails to compile, needs to be investigated -#if 0 - DISPATCH_ARGS(K.kernelCount, O.width, O.height); - TextureAsTensor X; X.Init(Xdecl[0], Xtex2D, samplerXtex2D, Xtex2DArray, samplerXtex2DArray); - - TENSOR_SHARED_ARG(K, WBK); - TENSOR_SHARED_ARG(B, WBK); - TENSOR_ARG_RW(O); - - // ASSERT(X.channels <= MAX_CHANNELS) - - uint k = dispatchThreadID.x; - uint x = dispatchThreadID.y; - uint y = dispatchThreadID.z; - - if (k >= K.channels) return; - if (x >= O.width) return; - if (y >= O.height) return; - - for (uint n = 0; n < O.batch; ++n) - { - float acc = B.Get(k); - for (uint dy = 0; dy < K.GetKernelHeight(); ++dy) - { - for (uint dx = 0; dx < K.GetKernelWidth(); ++dx) - { - uint oy = y * _Stride.y + dy; - uint ox = x * _Stride.x + dx; - - // @TODO: investigate - // WARNING: had to move both y check into the loop (as opposed to checking y in parent loop) - due to potential bug in Metal compiler - if (oy < _Pad.y) continue; - if (oy - _Pad.w >= X.height) continue; - if (ox < _Pad.x) continue; - if (ox - _Pad.z >= X.width) continue; - - float4 in4channels = X.Get(n, oy - _Pad.y, ox - _Pad.x); - for (uint c = 0; c < X.channels && c < MAX_CHANNELS; ++c) - { - acc += in4channels[c] * K.Get(dy, dx, c, k); - } - } - } - - O.Set(n, y, x, k, acc); - } -#endif -} diff --git a/Assets/Coach-ML/Barracuda/Resources/TexConv.compute.meta b/Assets/Coach-ML/Barracuda/Resources/TexConv.compute.meta deleted file mode 100644 index 38baaf9..0000000 --- a/Assets/Coach-ML/Barracuda/Resources/TexConv.compute.meta +++ /dev/null @@ -1,9 +0,0 @@ -fileFormatVersion: 2 -guid: 85d38d76f835143f797bca1481285596 -timeCreated: 1507637303 -licenseType: Pro -ComputeShaderImporter: - currentAPIMask: 196608 - userData: - assetBundleName: - assetBundleVariant:
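
The Conv2D_L1Cached* and Conv2DTrans_L1Cached* kernels deleted above all reduce to one pattern: every thread in a group cooperatively stages a block of input channels into groupshared memory, synchronizes, and only the threads that map to a valid output channel (k < K.channels) accumulate against the cached block before the next barrier. A minimal sketch of that pattern follows, without the register blocking (SIZE_W/SIZE_H) of the originals; it assumes the same Tensor.cginc helpers the deleted kernels rely on (SafeGet, Index, Set, fastfma and the TENSOR_*/DISPATCH_ARGS macros), and the kernel name and CACHE constant are illustrative, not part of the original sources.

#pragma kernel Conv2D_L1CacheSketch
#include "Tensor.cginc"

TENSOR_DECL(X)
TENSOR_DECL(K)
TENSOR_DECL(B)
TENSOR_DECL(WBK)
TENSOR_DECL_RW(O)

uint4 _Pad;
uint4 _Stride;

// Minimal sketch of the groupshared-cache convolution pattern used by the
// deleted Conv2D_L1Cached* kernels: one output pixel per thread group,
// CACHE input channels staged per iteration, one output channel per thread.
#define CACHE 64
groupshared float Conv2D_Sketch_X[CACHE];

[numthreads(CACHE, 1, 1)]
void Conv2D_L1CacheSketch(uint3 groupID : SV_GroupID, uint3 groupThreadID : SV_GroupThreadID)
{
    DISPATCH_ARGS(K.kernelCount, O.width, O.height);
    TENSOR_SHARED2_ARGS4(X, K, B, WBK, O);

    uint k = CACHE * groupID.x + groupThreadID.x;   // output channel for this thread
    uint x = groupID.y;                             // output column for this group
    uint y = groupID.z;                             // output row for this group

    // Whole-group early out only; the per-thread k check is deferred below.
    if (x >= O.width || y >= O.height) return;

    for (uint n = 0; n < O.batch; ++n)
    {
        float acc = B.SafeGet(k);

        for (uint dy = 0; dy < K.GetKernelHeight(); ++dy)
            for (uint dx = 0; dx < K.GetKernelWidth(); ++dx)
            {
                uint2 pos = uint2(x, y) * _Stride.xy + uint2(dx, dy);

                for (uint c = 0; c < X.channels; c += CACHE)
                {
                    // All CACHE threads cooperate to stage one channel each,
                    // including threads whose k is past the last kernel.
                    uint dc = groupThreadID.x;
                    Conv2D_Sketch_X[dc] = X.SafeGet(n, pos, c + dc, _Pad.xy);
                    GroupMemoryBarrierWithGroupSync();

                    // Only threads mapping to a real output channel accumulate.
                    if (k < K.channels)
                    {
                        uint kIndex = K.Index(dy, dx, c, k);
                        for (dc = 0; dc < CACHE; ++dc)
                        {
                            acc = fastfma(Conv2D_Sketch_X[dc], K.data[kIndex], acc);
                            kIndex += K.channels;   // next input channel, same k
                        }
                    }
                    GroupMemoryBarrierWithGroupSync();
                }
            }

        if (k < K.channels)
            O.Set(n, y, x, k, acc);
    }
}

Deferring the k < K.channels check until after the cooperative load is what keeps the barrier pattern legal: every thread in the group must reach both GroupMemoryBarrierWithGroupSync calls, even threads whose output channel falls past the end of the kernel tensor, which is exactly what the "need all threads to load channels, thus late check against kernel count" comments in the deleted kernels refer to.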