From dbd18936cbbf9bc7a9b394a595c54b973864b356 Mon Sep 17 00:00:00 2001
From: Sergio0694
Date: Sat, 23 Dec 2017 22:41:56 +0100
Subject: [PATCH 01/30] Added InceptionInfo struct

---
 .../APIs/Structs/InceptionInfo.cs             | 108 ++++++++++++++++++
 .../APIs/Structs/ConvolutionInfo.cs           |  23 ++--
 NeuralNetwork.NET/APIs/Structs/PoolingInfo.cs |  33 +++---
 NeuralNetwork.NET/APIs/Structs/Tensor.cs      |   8 +-
 NeuralNetwork.NET/APIs/Structs/TensorInfo.cs  |   6 +-
 5 files changed, 137 insertions(+), 41 deletions(-)
 create mode 100644 NeuralNetwork.NET.Cuda/APIs/Structs/InceptionInfo.cs

diff --git a/NeuralNetwork.NET.Cuda/APIs/Structs/InceptionInfo.cs b/NeuralNetwork.NET.Cuda/APIs/Structs/InceptionInfo.cs
new file mode 100644
index 0000000..5ddcd9f
--- /dev/null
+++ b/NeuralNetwork.NET.Cuda/APIs/Structs/InceptionInfo.cs
@@ -0,0 +1,108 @@
+using JetBrains.Annotations;
+using NeuralNetworkNET.APIs.Enums;
+using Newtonsoft.Json;
+using System;
+using System.Runtime.CompilerServices;
+
+namespace NeuralNetworkNET.APIs.Structs
+{
+    ///
+    /// A struct containing all the info on an inception module
+    ///
+    [JsonObject(MemberSerialization.Fields)]
+    public readonly struct InceptionInfo : IEquatable
+    {
+        #region Fields
+
+        ///
+        /// The number of 1x1 convolution kernels used in the first step of the forward pass
+        ///
+        public readonly int Primary1x1ConvolutionKernels;
+
+        ///
+        /// The number of 3x3 convolution kernels
+        ///
+        public readonly int Secondary3x3ConvolutionKernels;
+
+        ///
+        /// The number of 5x5 convolution kernels
+        ///
+        public readonly int Secondary5x5ConvolutionKernels;
+
+        ///
+        /// The kind of pooling operation performed on the layer
+        ///
+        public readonly PoolingMode Pooling;
+
+        ///
+        /// The number of 1x1 convolution kernels after the pooling operation
+        ///
+        public readonly int Chained1x1AfterPoolingConvolutionKernels;
+
+        #endregion
+
+        #region Constructors
+
+        // Internal constructor
+        private InceptionInfo(int _1x1Kernels, int _3x3Kernels, int _5x5Kernels, PoolingMode poolingMode, int _1x1SecondaryKernels)
+        {
+            Primary1x1ConvolutionKernels = _1x1Kernels >= 1 ? _1x1Kernels : throw new ArgumentOutOfRangeException(nameof(_1x1Kernels), "The number of 1x1 kernels must be at least 1");
+            Secondary3x3ConvolutionKernels = _3x3Kernels >= 1 ? _3x3Kernels : throw new ArgumentOutOfRangeException(nameof(_3x3Kernels), "The number of 3x3 kernels must be at least 1");
+            Secondary5x5ConvolutionKernels = _5x5Kernels >= 1 ? _5x5Kernels : throw new ArgumentOutOfRangeException(nameof(_5x5Kernels), "The number of 5x5 kernels must be at least 1");
+            Chained1x1AfterPoolingConvolutionKernels = _1x1SecondaryKernels >= 1 ? 
_1x1SecondaryKernels : throw new ArgumentOutOfRangeException(nameof(_1x1SecondaryKernels), "The number of secondary 1x1 kernels must be at least 1"); + Pooling = poolingMode; + } + + /// + /// Creates a new inception layer description with the input parameters + /// + /// The number of 1x1 primary convolution kernels + /// The number of 3x3 convolution kernels + /// The number of 5x5 convolution kernels + /// The pooling mode for the pooling channel + /// The number of secondary 1x1 convolution kernels + [PublicAPI] + [Pure] + public static InceptionInfo New( + int _1x1Kernels, int _3x3Kernels, int _5x5Kernels, + PoolingMode poolingMode, int _1x1SecondaryKernels) + => new InceptionInfo(_1x1Kernels, _3x3Kernels, _5x5Kernels, poolingMode, _1x1SecondaryKernels); + + #endregion + + #region Equality + + /// + public bool Equals(InceptionInfo other) => this == other; + + /// + public override bool Equals(object obj) => obj is InceptionInfo info ? this == info : false; + + /// + public override int GetHashCode() + { + int hash = 17; + unchecked + { + hash = hash * 31 + Primary1x1ConvolutionKernels; + hash = hash * 31 + Chained1x1AfterPoolingConvolutionKernels; + hash = hash * 31 + Secondary3x3ConvolutionKernels; + hash = hash * 31 + Secondary5x5ConvolutionKernels; + hash = hash * 31 + (int)Pooling; + } + return hash; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool operator ==(in InceptionInfo a, in InceptionInfo b) => a.Primary1x1ConvolutionKernels == b.Primary1x1ConvolutionKernels && + a.Chained1x1AfterPoolingConvolutionKernels == b.Chained1x1AfterPoolingConvolutionKernels && + a.Secondary3x3ConvolutionKernels == b.Secondary3x3ConvolutionKernels && + a.Secondary5x5ConvolutionKernels == b.Secondary5x5ConvolutionKernels && + a.Pooling == b.Pooling; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool operator !=(in InceptionInfo a, in InceptionInfo b) => !(a == b); + + #endregion + } +} diff --git a/NeuralNetwork.NET/APIs/Structs/ConvolutionInfo.cs b/NeuralNetwork.NET/APIs/Structs/ConvolutionInfo.cs index 00ac1b7..49a34f8 100644 --- a/NeuralNetwork.NET/APIs/Structs/ConvolutionInfo.cs +++ b/NeuralNetwork.NET/APIs/Structs/ConvolutionInfo.cs @@ -13,27 +13,27 @@ namespace NeuralNetworkNET.APIs.Structs public readonly struct ConvolutionInfo : IEquatable { /// - /// Gets the current convolution mode for the layer + /// The current convolution mode for the layer /// public readonly ConvolutionMode Mode; /// - /// Gets the optional vertical padding for the convolution operation + /// The optional vertical padding for the convolution operation /// public readonly int VerticalPadding; /// - /// Gets the optional horizontal padding for the convolution operation + /// The optional horizontal padding for the convolution operation /// public readonly int HorizontalPadding; /// - /// Gets the vertical stride length while sliding the receptive window over the input + /// The vertical stride length while sliding the receptive window over the input /// public readonly int VerticalStride; /// - /// Gets the horizontal stride length while sliding the receptive window over the input + /// The horizontal stride length while sliding the receptive window over the input /// public readonly int HorizontalStride; @@ -45,16 +45,11 @@ private ConvolutionInfo( int verticalPadding, int horizontalPadding, int verticalStride, int horizontalStride) { - if (verticalPadding < 0) throw new ArgumentOutOfRangeException(nameof(verticalPadding), "The vertical padding must be greater 
than or equal to 0"); - if (horizontalPadding < 0) throw new ArgumentOutOfRangeException(nameof(horizontalPadding), "The horizontal padding must be greater than or equal to 0"); - if (verticalStride < 1) throw new ArgumentOutOfRangeException(nameof(verticalStride), "The vertical stride must be at least equal to 1"); - if (horizontalStride < 1) throw new ArgumentOutOfRangeException(nameof(horizontalStride), "The horizontal stride must be at least equal to 1"); - + VerticalPadding = verticalPadding >= 0 ? verticalPadding : throw new ArgumentOutOfRangeException(nameof(verticalPadding), "The vertical padding must be greater than or equal to 0"); + HorizontalPadding = horizontalPadding >= 0 ? horizontalPadding : throw new ArgumentOutOfRangeException(nameof(horizontalPadding), "The horizontal padding must be greater than or equal to 0"); + VerticalStride = verticalStride >= 1 ? verticalStride : throw new ArgumentOutOfRangeException(nameof(verticalStride), "The vertical stride must be at least equal to 1"); + HorizontalStride = horizontalStride >= 1 ? horizontalStride : throw new ArgumentOutOfRangeException(nameof(horizontalStride), "The horizontal stride must be at least equal to 1"); Mode = mode; - VerticalPadding = verticalPadding; - HorizontalPadding = horizontalPadding; - VerticalStride = verticalStride; - HorizontalStride = horizontalStride; } /// diff --git a/NeuralNetwork.NET/APIs/Structs/PoolingInfo.cs b/NeuralNetwork.NET/APIs/Structs/PoolingInfo.cs index 7185d35..ea2166a 100644 --- a/NeuralNetwork.NET/APIs/Structs/PoolingInfo.cs +++ b/NeuralNetwork.NET/APIs/Structs/PoolingInfo.cs @@ -13,37 +13,37 @@ namespace NeuralNetworkNET.APIs.Structs public readonly struct PoolingInfo : IEquatable { /// - /// Gets the current pooling mode for the layer + /// The current pooling mode for the layer /// public readonly PoolingMode Mode; /// - /// Gets the height of each input local receptive field + /// The height of each input local receptive field /// public readonly int WindowHeight; /// - /// Gets the width of each input local receptive field + /// The width of each input local receptive field /// public readonly int WindowWidth; /// - /// Gets the optional vertical padding for the pooling operation + /// The optional vertical padding for the pooling operation /// public readonly int VerticalPadding; /// - /// Gets the optional horizontal padding for the pooling operation + /// The optional horizontal padding for the pooling operation /// public readonly int HorizontalPadding; /// - /// Gets the vertical stride length while sliding the receptive window over the input + /// The vertical stride length while sliding the receptive window over the input /// public readonly int VerticalStride; /// - /// Gets the horizontal stride length while sliding the receptive window over the input + /// The horizontal stride length while sliding the receptive window over the input /// public readonly int HorizontalStride; @@ -55,20 +55,13 @@ private PoolingInfo( int verticalPadding, int horizontalPadding, int verticalStride, int horizontalStride) { - if (windowHeight <= 0) throw new ArgumentOutOfRangeException(nameof(windowHeight), "The window height must be at least equal to 1"); - if (windowWidth <= 0) throw new ArgumentOutOfRangeException(nameof(windowWidth), "The window width must be at least equal to 1"); - if (verticalPadding < 0) throw new ArgumentOutOfRangeException(nameof(verticalPadding), "The vertical padding must be greater than or equal to 0"); - if (horizontalPadding < 0) throw new 
ArgumentOutOfRangeException(nameof(horizontalPadding), "The horizontal padding must be greater than or equal to 0");
-            if (verticalStride < 1) throw new ArgumentOutOfRangeException(nameof(verticalStride), "The vertical stride must be at least equal to 1");
-            if (horizontalStride < 1) throw new ArgumentOutOfRangeException(nameof(horizontalStride), "The horizontal stride must be at least equal to 1");
-
+            WindowHeight = windowHeight > 0 ? windowHeight : throw new ArgumentOutOfRangeException(nameof(windowHeight), "The window height must be at least equal to 1");
+            WindowWidth = windowWidth > 0 ? windowWidth : throw new ArgumentOutOfRangeException(nameof(windowWidth), "The window width must be at least equal to 1");
+            VerticalPadding = verticalPadding >= 0 ? verticalPadding : throw new ArgumentOutOfRangeException(nameof(verticalPadding), "The vertical padding must be greater than or equal to 0");
+            HorizontalPadding = horizontalPadding >= 0 ? horizontalPadding : throw new ArgumentOutOfRangeException(nameof(horizontalPadding), "The horizontal padding must be greater than or equal to 0");
+            VerticalStride = verticalStride >= 1 ? verticalStride : throw new ArgumentOutOfRangeException(nameof(verticalStride), "The vertical stride must be at least equal to 1");
+            HorizontalStride = horizontalStride >= 1 ? horizontalStride : throw new ArgumentOutOfRangeException(nameof(horizontalStride), "The horizontal stride must be at least equal to 1");
             Mode = mode;
-            WindowHeight = windowHeight;
-            WindowWidth = windowWidth;
-            VerticalPadding = verticalPadding;
-            HorizontalPadding = horizontalPadding;
-            VerticalStride = verticalStride;
-            HorizontalStride = horizontalStride;
         }

         ///
diff --git a/NeuralNetwork.NET/APIs/Structs/Tensor.cs b/NeuralNetwork.NET/APIs/Structs/Tensor.cs
index cf57a6d..9252f95 100644
--- a/NeuralNetwork.NET/APIs/Structs/Tensor.cs
+++ b/NeuralNetwork.NET/APIs/Structs/Tensor.cs
@@ -17,22 +17,22 @@ namespace NeuralNetworkNET.APIs.Structs
     public readonly struct Tensor
     {
         ///
-        /// Gets the IntPtr value to the allocated memory
+        /// The IntPtr value to the allocated memory
         ///
         public readonly IntPtr Ptr;

         ///
-        /// Gets the number of entities (rows) in the current Tensor
+        /// The number of entities (rows) in the current Tensor
         ///
         public readonly int Entities;

         ///
-        /// Gets the size of each entity in the current Tensor
+        /// The size of each entity in the current Tensor
         ///
         public readonly int Length;

         ///
-        /// Gets the total size (the number of values) in the current Tensor
+        /// The total size (the number of values) in the current Tensor
         ///
         public int Size => Entities * Length;

diff --git a/NeuralNetwork.NET/APIs/Structs/TensorInfo.cs b/NeuralNetwork.NET/APIs/Structs/TensorInfo.cs
index d774cf3..b938fed 100644
--- a/NeuralNetwork.NET/APIs/Structs/TensorInfo.cs
+++ b/NeuralNetwork.NET/APIs/Structs/TensorInfo.cs
@@ -14,19 +14,19 @@ namespace NeuralNetworkNET.APIs.Structs
     public readonly struct TensorInfo : IEquatable
     {
         ///
-        /// Gets the height of each 2D slice
+        /// The height of each 2D slice
         ///
         [JsonProperty(nameof(Height), Order = 1)]
         public readonly int Height;

         ///
-        /// Gets the width of each 2D slice
+        /// The width of each 2D slice
         ///
         [JsonProperty(nameof(Width), Order = 2)]
         public readonly int Width;

         ///
-        /// Gets the number of channels for the tensor description
+        /// The number of channels for the tensor description
         ///
         [JsonProperty(nameof(Channels), Order = 3)]
         public readonly int Channels;
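(An aside, not part of the patch: a minimal usage sketch for the new struct. The kernel counts and the PoolingMode.Max member name are illustrative assumptions.)

    // The four pipelines are concatenated depth-wise, so this module would
    // produce 16 + 32 + 8 + 4 = 60 output channels per spatial position
    InceptionInfo info = InceptionInfo.New(16, 32, 8, PoolingMode.Max, 4);

From fd7f9ec9e3cafc26c5e9d082848f3e92e09d3952 Mon Sep 17 00:00:00 2001
From: Sergio0694
Date: Sun, 24 Dec 2017 01:00:36 +0100
Subject: [PATCH 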
02/30] Temp move to APIs folder --- .../CuDnnNetworkLayers.cs => APIs/CuDnnNetworkLayers_Move.cs} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename NeuralNetwork.NET.Cuda/{APIS/CuDnnNetworkLayers.cs => APIs/CuDnnNetworkLayers_Move.cs} (100%) diff --git a/NeuralNetwork.NET.Cuda/APIS/CuDnnNetworkLayers.cs b/NeuralNetwork.NET.Cuda/APIs/CuDnnNetworkLayers_Move.cs similarity index 100% rename from NeuralNetwork.NET.Cuda/APIS/CuDnnNetworkLayers.cs rename to NeuralNetwork.NET.Cuda/APIs/CuDnnNetworkLayers_Move.cs From 2cc487929c4c8d808c84f5da0063c85dbef76060 Mon Sep 17 00:00:00 2001 From: Sergio0694 Date: Sun, 24 Dec 2017 01:01:02 +0100 Subject: [PATCH 03/30] Name switched back to original (path fix) --- .../APIs/{CuDnnNetworkLayers_Move.cs => CuDnnNetworkLayers.cs} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename NeuralNetwork.NET.Cuda/APIs/{CuDnnNetworkLayers_Move.cs => CuDnnNetworkLayers.cs} (100%) diff --git a/NeuralNetwork.NET.Cuda/APIs/CuDnnNetworkLayers_Move.cs b/NeuralNetwork.NET.Cuda/APIs/CuDnnNetworkLayers.cs similarity index 100% rename from NeuralNetwork.NET.Cuda/APIs/CuDnnNetworkLayers_Move.cs rename to NeuralNetwork.NET.Cuda/APIs/CuDnnNetworkLayers.cs From ed56e1ff803230dd9c2c160734e0be35e21aa750 Mon Sep 17 00:00:00 2001 From: Sergio0694 Date: Sun, 24 Dec 2017 01:28:58 +0100 Subject: [PATCH 04/30] Initial inception layer structure added --- .../APIs/Structs/InceptionInfo.cs | 2 +- .../Layers/CuDnnConvolutionalLayer.cs | 4 +- .../Layers/CuDnnInceptionLayer.cs | 174 ++++++++++++++++++ NeuralNetwork.NET/APIs/Enums/LayerType.cs | 26 ++- 4 files changed, 201 insertions(+), 5 deletions(-) create mode 100644 NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs diff --git a/NeuralNetwork.NET.Cuda/APIs/Structs/InceptionInfo.cs b/NeuralNetwork.NET.Cuda/APIs/Structs/InceptionInfo.cs index 5ddcd9f..27da1ab 100644 --- a/NeuralNetwork.NET.Cuda/APIs/Structs/InceptionInfo.cs +++ b/NeuralNetwork.NET.Cuda/APIs/Structs/InceptionInfo.cs @@ -59,7 +59,7 @@ private InceptionInfo(int _1x1Kernels, int _3x3Kernels, int _5x5Kernels, Pooling /// The number of 1x1 primary convolution kernels /// The number of 3x3 convolution kernels /// The number of 5x5 convolution kernels - /// The pooling mode for the pooling channel + /// The pooling mode for the pooling pipeline /// The number of secondary 1x1 convolution kernels [PublicAPI] [Pure] diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnConvolutionalLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnConvolutionalLayer.cs index 4aa0982..ff09ac5 100644 --- a/NeuralNetwork.NET.Cuda/Layers/CuDnnConvolutionalLayer.cs +++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnConvolutionalLayer.cs @@ -47,9 +47,7 @@ internal sealed class CuDnnConvolutionalLayer : ConvolutionalLayer [NotNull] private readonly Dnn DnnInstance = DnnService.Instance; - /// - /// Sets the cuDNN fields that will be used during future forward/backwards operations - /// + // cuDNN fields setup private void SetupCuDnnInfo() { ConvolutionDescription.Set2D(OperationInfo.VerticalPadding, OperationInfo.HorizontalPadding, OperationInfo.VerticalStride, OperationInfo.HorizontalStride, 1, 1, (Alea.cuDNN.ConvolutionMode)OperationInfo.Mode); diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs new file mode 100644 index 0000000..b271567 --- /dev/null +++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs @@ -0,0 +1,174 @@ +using Alea.cuDNN; +using JetBrains.Annotations; +using NeuralNetworkNET.APIs.Enums; +using 
NeuralNetworkNET.APIs.Interfaces; +using NeuralNetworkNET.APIs.Structs; +using NeuralNetworkNET.Cuda.Services; +using NeuralNetworkNET.Networks.Activations; +using NeuralNetworkNET.Networks.Activations.Delegates; +using NeuralNetworkNET.Networks.Implementations.Layers.Abstract; +using System; +using System.Runtime.CompilerServices; + +namespace NeuralNetworkNET.Cuda.Layers +{ + /// + /// A simplified inception module, with 4 pipelines combining 1x1 convolution, 1x1 + 3x3, 1x1 + 5x5 and pooling + 1x1 + /// + internal sealed class CuDnnInceptionLayer : WeightedLayerBase + { + #region Parameters + + /// + public override LayerType LayerType { get; } = LayerType.Inception; + + private readonly InceptionInfo _OperationInfo; + + /// + /// Gets the info on the inception parameters used by the layer + /// + public ref readonly InceptionInfo OperationInfo + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => ref _OperationInfo; + } + + #endregion + + #region cuDNN fields + + // The NCHW tensor info for the layer inputs + [NotNull] + private readonly TensorDescriptor InputDescription = new TensorDescriptor(); + + #region 1x1 convolution + + // The NCHW info for the 1x1 convolution weights + [NotNull] + private readonly FilterDescriptor _1x1FilterDescription = new FilterDescriptor(); + + // The info on the 1x1 convolution bias (one value per output channel) + [NotNull] + private readonly TensorDescriptor _1x1BiasDescription = new TensorDescriptor(); + + // The first 1x1 convolution info + [NotNull] + private readonly ConvolutionDescriptor _1x1ConvolutionDescription = new ConvolutionDescriptor(); + + // The NCHW tensor info for the outputs of the first 1x1 convolution + [NotNull] + private readonly TensorDescriptor _1x1OutputDescription = new TensorDescriptor(); + + #endregion + + #region 3x3 secondary convolution + + // The NCHW info for the 3x3 convolution weights + [NotNull] + private readonly FilterDescriptor _3x3FilterDescription = new FilterDescriptor(); + + // The info on the 3x3 convolution bias (one value per output channel) + [NotNull] + private readonly TensorDescriptor _3x3BiasDescription = new TensorDescriptor(); + + // The first 3x3 convolution info + [NotNull] + private readonly ConvolutionDescriptor _3x3ConvolutionDescription = new ConvolutionDescriptor(); + + // The NCHW tensor info for the outputs of the 3x3 convolution + [NotNull] + private readonly TensorDescriptor _3x3OutputDescription = new TensorDescriptor(); + + #endregion + + #region 5x5 secondary convolution + + // The NCHW info for the 5x5 convolution weights + [NotNull] + private readonly FilterDescriptor _5x5FilterDescription = new FilterDescriptor(); + + // The info on the 5x5 convolution bias (one value per output channel) + [NotNull] + private readonly TensorDescriptor _5x5BiasDescription = new TensorDescriptor(); + + // The first 5x5 convolution info + [NotNull] + private readonly ConvolutionDescriptor _5x5ConvolutionDescription = new ConvolutionDescriptor(); + + // The NCHW tensor info for the outputs of the 5x5 convolution + [NotNull] + private readonly TensorDescriptor _5x5OutputDescription = new TensorDescriptor(); + + #endregion + + #region Pooling pipeline + + // The descriptor for the pooling operation performed by the layer + [NotNull] + private readonly PoolingDescriptor PoolingDescription = new PoolingDescriptor(); + + // The NCHW tensor info for the pooling outputs + [NotNull] + private readonly TensorDescriptor PoolingOutputDescription = new TensorDescriptor(); + + // The NCHW info for the 
secondary 1x1 convolution weights
+        [NotNull]
+        private readonly FilterDescriptor Secondary1x1FilterDescription = new FilterDescriptor();
+
+        // The info on the secondary 1x1 convolution bias (one value per output channel)
+        [NotNull]
+        private readonly TensorDescriptor Secondary1x1BiasDescription = new TensorDescriptor();
+
+        // The first secondary 1x1 convolution info
+        [NotNull]
+        private readonly ConvolutionDescriptor Secondary1x1ConvolutionDescription = new ConvolutionDescriptor();
+
+        // The info on the secondary 1x1 convolution outputs
+        [NotNull]
+        private readonly TensorDescriptor Secondary1x1OutputDescription = new TensorDescriptor();
+
+        #endregion
+
+        ///
+        /// Gets the instance for the current layer
+        ///
+        [NotNull]
+        private readonly Dnn DnnInstance = DnnService.Instance;
+
+        // cuDNN fields setup
+        private void SetupCuDnnInfo()
+        {
+
+        }
+
+        #endregion
+
+        protected CuDnnInceptionLayer(in TensorInfo input, in TensorInfo output, [NotNull] float[] w, [NotNull] float[] b, ActivationFunctionType activation) : base(input, output, w, b, activation)
+        {
+        }
+
+        #region Implementation
+
+        public override void Forward(in Tensor x, out Tensor z, out Tensor a)
+        {
+            throw new NotImplementedException();
+        }
+
+        public override void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFunction activationPrime)
+        {
+            throw new NotImplementedException();
+        }
+
+        public override void ComputeGradient(in Tensor a, in Tensor delta, out Tensor dJdw, out Tensor dJdb)
+        {
+            throw new NotImplementedException();
+        }
+
+        #endregion
+
+        public override INetworkLayer Clone()
+        {
+            throw new NotImplementedException();
+        }
+    }
+}
diff --git a/NeuralNetwork.NET/APIs/Enums/LayerType.cs b/NeuralNetwork.NET/APIs/Enums/LayerType.cs
index 498e059..4406841 100644
--- a/NeuralNetwork.NET/APIs/Enums/LayerType.cs
+++ b/NeuralNetwork.NET/APIs/Enums/LayerType.cs
@@ -5,10 +5,34 @@
     ///
     public enum LayerType : byte
     {
+        ///
+        /// A fully connected layer, mapping n inputs to m outputs
+        ///
         FullyConnected,
+
+        ///
+        /// A convolutional layer, which keeps spatial information on the input volume
+        ///
         Convolutional,
+
+        ///
+        /// A pooling layer, useful to reduce the size of the input data volume
+        ///
         Pooling,
+
+        ///
+        /// A fully connected output layer, with an arbitrary activation and cost function
+        ///
         Output,
-        Softmax
+
+        ///
+        /// A softmax layer, with the softmax activation and log-likelihood cost function
+        ///
+        Softmax,
+
+        ///
+        /// An inception module, combining different kinds of convolution with a pooling operation
+        ///
+        Inception
     }
 }
\ No newline at end of file

From 9c9a79c2baada46c8f39b5056528fc559c964d5c Mon Sep 17 00:00:00 2001
From: Sergio0694
Date: Sun, 24 Dec 2017 01:50:19 +0100
Subject: [PATCH 05/30] Added inception cuDNN base initialization

---
 .../Layers/CuDnnInceptionLayer.cs | 23 ++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs
index b271567..2b65f99 100644
--- a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs
+++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs
@@ -138,7 +138,28 @@ public ref readonly InceptionInfo OperationInfo
         // cuDNN fields setup
         private void SetupCuDnnInfo()
         {
-
+            // First 1x1 convolution
+            _1x1ConvolutionDescription.Set2D(0, 0, 1, 1, 1, 1, Alea.cuDNN.ConvolutionMode.CROSS_CORRELATION);
+            _1x1FilterDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, _OperationInfo.Primary1x1ConvolutionKernels, 
InputInfo.Channels, 1, 1);
+            _1x1BiasDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, 1, _OperationInfo.Primary1x1ConvolutionKernels, 1, 1);
+
+            // 3x3 convolution
+            _3x3ConvolutionDescription.Set2D(1, 1, 1, 1, 1, 1, Alea.cuDNN.ConvolutionMode.CROSS_CORRELATION); // 1-padding to keep size
+            _3x3FilterDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, _OperationInfo.Secondary3x3ConvolutionKernels, _OperationInfo.Primary1x1ConvolutionKernels, 3, 3);
+            _3x3BiasDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, 1, _OperationInfo.Secondary3x3ConvolutionKernels, 1, 1);
+
+            // 5x5 convolution
+            _5x5ConvolutionDescription.Set2D(2, 2, 1, 1, 1, 1, Alea.cuDNN.ConvolutionMode.CROSS_CORRELATION);
+            _5x5FilterDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, _OperationInfo.Secondary5x5ConvolutionKernels, _OperationInfo.Primary1x1ConvolutionKernels, 5, 5);
+            _5x5BiasDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, 1, _OperationInfo.Secondary5x5ConvolutionKernels, 1, 1);
+
+            // Pooling
+            PoolingDescription.Set2D(Alea.cuDNN.PoolingMode.AVERAGE_COUNT_EXCLUDE_PADDING, NanPropagation.PROPAGATE_NAN, 3, 3, 1, 1, 1, 1);
+
+            // Secondary 1x1 convolution
+            Secondary1x1ConvolutionDescription.Set2D(0, 0, 1, 1, 1, 1, Alea.cuDNN.ConvolutionMode.CROSS_CORRELATION);
+            Secondary1x1FilterDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, InputInfo.Channels, _OperationInfo.Chained1x1AfterPoolingConvolutionKernels, 1, 1);
+            Secondary1x1BiasDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, 1, _OperationInfo.Chained1x1AfterPoolingConvolutionKernels, 1, 1);
         }

         #endregion
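(An aside, not part of the patch: the paddings picked in SetupCuDnnInfo above follow from the usual convolution output size relation, out = (in + 2 * padding - kernel) / stride + 1, so with stride 1 a 3x3 kernel needs padding 1 and a 5x5 kernel needs padding 2 to preserve the input resolution. A minimal check, with hypothetical names:)

    // Spatial output size of a convolution along one dimension
    static int ConvOutputSize(int input, int kernel, int padding, int stride)
        => (input + 2 * padding - kernel) / stride + 1;

    // ConvOutputSize(28, 3, 1, 1) == 28 and ConvOutputSize(28, 5, 2, 1) == 28

From 9bc9e809968476d04c30e5f29fce918796107bee Mon Sep 17 00:00:00 2001
From: Sergio0694
Date: Sun, 24 Dec 2017 02:10:51 +0100
Subject: [PATCH 06/30] Added inception layer weights initialization, minor changes

---
 .../APIs/Structs/InceptionInfo.cs     |  0
 .../Layers/ConvolutionalLayer.cs      |  2 +-
 .../Layers/FullyConnectedLayer.cs     |  2 +-
 .../Layers/Helpers/WeightsProvider.cs | 59 ++++++++++++++-----
 .../CuDnnLayersTest.cs                | 34 +++++------
 .../SerializationTest.cs              |  6 +-
 6 files changed, 64 insertions(+), 39 deletions(-)
 rename {NeuralNetwork.NET.Cuda => NeuralNetwork.NET}/APIs/Structs/InceptionInfo.cs (100%)

diff --git a/NeuralNetwork.NET.Cuda/APIs/Structs/InceptionInfo.cs b/NeuralNetwork.NET/APIs/Structs/InceptionInfo.cs
similarity index 100%
rename from NeuralNetwork.NET.Cuda/APIs/Structs/InceptionInfo.cs
rename to NeuralNetwork.NET/APIs/Structs/InceptionInfo.cs
diff --git a/NeuralNetwork.NET/Networks/Implementations/Layers/ConvolutionalLayer.cs b/NeuralNetwork.NET/Networks/Implementations/Layers/ConvolutionalLayer.cs
index 0be71cf..922cf50 100644
--- a/NeuralNetwork.NET/Networks/Implementations/Layers/ConvolutionalLayer.cs
+++ b/NeuralNetwork.NET/Networks/Implementations/Layers/ConvolutionalLayer.cs
@@ -59,7 +59,7 @@ public ref readonly TensorInfo KernelInfo
     public ConvolutionalLayer(in TensorInfo input, in ConvolutionInfo operation, (int X, int Y) kernelSize, int kernels, ActivationFunctionType activation, BiasInitializationMode biasMode)
         : base(input, new TensorInfo(input.Height - kernelSize.X + 1, input.Width - kernelSize.Y + 1, kernels),
-              WeightsProvider.NewConvolutionalKernels(input.Channels, kernelSize.X, kernelSize.Y, kernels),
+              WeightsProvider.NewConvolutionalKernels(input, kernelSize.X, kernelSize.Y, kernels),
               WeightsProvider.NewBiases(kernels, biasMode), activation)
     {
         _OperationInfo = operation;
diff --git 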
a/NeuralNetwork.NET/Networks/Implementations/Layers/FullyConnectedLayer.cs b/NeuralNetwork.NET/Networks/Implementations/Layers/FullyConnectedLayer.cs index 9086c89..59bff25 100644 --- a/NeuralNetwork.NET/Networks/Implementations/Layers/FullyConnectedLayer.cs +++ b/NeuralNetwork.NET/Networks/Implementations/Layers/FullyConnectedLayer.cs @@ -22,7 +22,7 @@ internal class FullyConnectedLayer : WeightedLayerBase public FullyConnectedLayer(in TensorInfo input, int neurons, ActivationFunctionType activation, WeightsInitializationMode weightsMode, BiasInitializationMode biasMode) : base(input, TensorInfo.CreateLinear(neurons), - WeightsProvider.NewFullyConnectedWeights(input.Size, neurons, weightsMode), + WeightsProvider.NewFullyConnectedWeights(input, neurons, weightsMode), WeightsProvider.NewBiases(neurons, biasMode), activation) { } public FullyConnectedLayer(in TensorInfo input, int neurons, [NotNull] float[] weights, [NotNull] float[] biases, ActivationFunctionType activation) diff --git a/NeuralNetwork.NET/Networks/Implementations/Layers/Helpers/WeightsProvider.cs b/NeuralNetwork.NET/Networks/Implementations/Layers/Helpers/WeightsProvider.cs index 87ddb9c..72a6722 100644 --- a/NeuralNetwork.NET/Networks/Implementations/Layers/Helpers/WeightsProvider.cs +++ b/NeuralNetwork.NET/Networks/Implementations/Layers/Helpers/WeightsProvider.cs @@ -13,35 +13,34 @@ namespace NeuralNetworkNET.Networks.Implementations.Layers.Helpers internal static class WeightsProvider { /// - /// Creates a weight matrix for a fully connected layer + /// Creates a weights vector for a fully connected layer /// - /// The input neurons + /// The layer inputs /// The output neurons /// The initialization mode for the weights [Pure, NotNull] - public static unsafe float[] NewFullyConnectedWeights(int inputs, int outputs, WeightsInitializationMode mode) + public static unsafe float[] NewFullyConnectedWeights(in TensorInfo input, int outputs, WeightsInitializationMode mode) { - if (inputs <= 0 || outputs <= 0) throw new ArgumentOutOfRangeException("The inputs and outputs must be positive numbers"); - float[] weights = new float[inputs * outputs]; + float[] weights = new float[input.Size * outputs]; fixed (float* pw = weights) { - Tensor.Reshape(pw, inputs, outputs, out Tensor wTensor); + Tensor.Reshape(pw, input.Size, outputs, out Tensor wTensor); switch (mode) { case WeightsInitializationMode.LeCunUniform: - KerasWeightsProvider.FillWithLeCunUniform(wTensor, inputs); + KerasWeightsProvider.FillWithLeCunUniform(wTensor, input.Size); break; case WeightsInitializationMode.GlorotNormal: - KerasWeightsProvider.FillWithGlorotNormal(wTensor, inputs, outputs); + KerasWeightsProvider.FillWithGlorotNormal(wTensor, input.Size, outputs); break; case WeightsInitializationMode.GlorotUniform: - KerasWeightsProvider.FillWithGlorotUniform(wTensor, inputs, outputs); + KerasWeightsProvider.FillWithGlorotUniform(wTensor, input.Size, outputs); break; case WeightsInitializationMode.HeEtAlNormal: - KerasWeightsProvider.FillWithHeEtAlNormal(wTensor, inputs); + KerasWeightsProvider.FillWithHeEtAlNormal(wTensor, input.Size); break; case WeightsInitializationMode.HeEtAlUniform: - KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, inputs); + KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, input.Size); break; default: throw new ArgumentOutOfRangeException(nameof(mode), "Unsupported weights initialization mode"); } @@ -50,21 +49,49 @@ public static unsafe float[] NewFullyConnectedWeights(int inputs, int outputs, W } /// - /// Creates a weight matrix 
for a convolutional layer
+        /// Creates a weights vector for a convolutional layer
         ///
-        /// The depth of the input volume
+        /// The layer inputs
         /// The height of each kernel
         /// The width of each kernel
         /// The number of kernels in the layer
         [Pure, NotNull]
-        public static unsafe float[] NewConvolutionalKernels(int inputDepth, int kernelsHeight, int kernelsWidth, int kernels)
+        public static unsafe float[] NewConvolutionalKernels(in TensorInfo input, int kernelsHeight, int kernelsWidth, int kernels)
         {
             if (kernels <= 0) throw new ArgumentOutOfRangeException(nameof(kernels), "The number of kernels must be positive");
-            float[] weights = new float[kernels * kernelsHeight * kernelsWidth * inputDepth];
+            float[] weights = new float[kernels * kernelsHeight * kernelsWidth * input.Channels];
             fixed (float* pw = weights)
             {
                 Tensor.Reshape(pw, 1, weights.Length, out Tensor wTensor);
-                KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, inputDepth * kernelsHeight * kernelsWidth);
+                KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, input.Channels * kernelsHeight * kernelsWidth);
             }
             return weights;
         }
+
+        ///
+        /// Creates a new mixed weights vector for an inception layer
+        ///
+        /// The layer inputs
+        /// The info on the target inception layer
+        [Pure, NotNull]
+        public static unsafe float[] NewInceptionWeights(in TensorInfo input, in InceptionInfo info)
+        {
+            int
+                _1x1Length = input.Channels * info.Primary1x1ConvolutionKernels,
+                _3x3Length = 3 * 3 * info.Primary1x1ConvolutionKernels * info.Secondary3x3ConvolutionKernels,
+                _5x5Length = 5 * 5 * info.Primary1x1ConvolutionKernels * info.Secondary5x5ConvolutionKernels,
+                secondary1x1Length = input.Channels * info.Chained1x1AfterPoolingConvolutionKernels;
+            float[] weights = new float[_1x1Length + _3x3Length + _5x5Length + secondary1x1Length];
+            fixed (float* pw = weights)
+            {
+                Tensor.Reshape(pw, 1, _1x1Length, out Tensor wTensor);
+                KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, input.Channels);
+                Tensor.Reshape(pw + _1x1Length, 1, _3x3Length, out wTensor);
+                KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, 3 * 3 * info.Primary1x1ConvolutionKernels);
+                Tensor.Reshape(pw + _1x1Length + _3x3Length, 1, _5x5Length, out wTensor);
+                KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, 5 * 5 * info.Primary1x1ConvolutionKernels);
+                Tensor.Reshape(pw + _1x1Length + _3x3Length + _5x5Length, 1, secondary1x1Length, out wTensor);
+                KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, input.Channels);
+            }
+            return weights;
+        }
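(An aside, not part of the patch: a worked size check for the mixed weights layout built above, with made-up kernel counts.)

    // input = 32 channels, 1x1 = 16, 3x3 = 32, 5x5 = 8, secondary 1x1 = 4:
    //   _1x1Length         = 32 * 16         =   512
    //   _3x3Length         = 3 * 3 * 16 * 32 =  4608
    //   _5x5Length         = 5 * 5 * 16 * 8  =  3200
    //   secondary1x1Length = 32 * 4          =   128
    // weights.Length = 512 + 4608 + 3200 + 128 = 12448 floats, stored in that order

diff --git a/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnLayersTest.cs b/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnLayersTest.cs
index 0b7aed2..790a697 100644
--- a/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnLayersTest.cs
+++ b/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnLayersTest.cs
@@ -73,7 +73,7 @@ private static unsafe void TestGradient(WeightedLayerBase cpu, WeightedLayerBase
     [TestMethod]
     public void 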
FullyConnectedForward() public void FullyConnectedBackward() { float[,] - delta_1 = WeightsProvider.NewFullyConnectedWeights(400, 127, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 127), - z = WeightsProvider.NewFullyConnectedWeights(400, 250, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 250); + delta_1 = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(400), 127, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 127), + z = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(400), 250, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 250); FullyConnectedLayer cpu = new FullyConnectedLayer(TensorInfo.CreateLinear(250), 127, ActivationFunctionType.LeCunTanh, WeightsInitializationMode.GlorotNormal, BiasInitializationMode.Gaussian), gpu = new CuDnnFullyConnectedLayer(cpu.InputInfo, cpu.OutputInfo.Size, cpu.Weights, cpu.Biases, cpu.ActivationFunctionType); @@ -96,8 +96,8 @@ public void FullyConnectedBackward() public void FullyConnectedGradient() { float[,] - x = WeightsProvider.NewFullyConnectedWeights(400, 250, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 250), - delta = WeightsProvider.NewFullyConnectedWeights(400, 127, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 127); + x = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(400), 250, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 250), + delta = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(400), 127, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 127); FullyConnectedLayer cpu = new FullyConnectedLayer(TensorInfo.CreateLinear(250), 127, ActivationFunctionType.LeCunTanh, WeightsInitializationMode.GlorotNormal, BiasInitializationMode.Gaussian), gpu = new CuDnnFullyConnectedLayer(cpu.InputInfo, cpu.OutputInfo.Size, cpu.Weights, cpu.Biases, cpu.ActivationFunctionType); @@ -111,7 +111,7 @@ public void FullyConnectedGradient() [TestMethod] public void SoftmaxForward() { - float[,] x = WeightsProvider.NewFullyConnectedWeights(400, 250, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 250); + float[,] x = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(400), 250, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 250); SoftmaxLayer cpu = new SoftmaxLayer(TensorInfo.CreateLinear(250), 127, WeightsInitializationMode.GlorotNormal, BiasInitializationMode.Gaussian), gpu = new CuDnnSoftmaxLayer(cpu.InputInfo, cpu.OutputInfo.Size, cpu.Weights, cpu.Biases); @@ -122,8 +122,8 @@ public void SoftmaxForward() public void SoftmaxBackward() { float[,] - delta_1 = WeightsProvider.NewFullyConnectedWeights(400, 127, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 127), - z = WeightsProvider.NewFullyConnectedWeights(400, 250, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 250); + delta_1 = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(400), 127, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 127), + z = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(400), 250, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 250); SoftmaxLayer cpu = new SoftmaxLayer(TensorInfo.CreateLinear(250), 127, WeightsInitializationMode.GlorotNormal, BiasInitializationMode.Gaussian), gpu = new CuDnnSoftmaxLayer(cpu.InputInfo, cpu.OutputInfo.Size, cpu.Weights, cpu.Biases); @@ -134,8 +134,8 @@ public void SoftmaxBackward() public void SoftmaxGradient() { float[,] - a = WeightsProvider.NewFullyConnectedWeights(400, 250, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 250), - delta = 
WeightsProvider.NewFullyConnectedWeights(400, 127, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 127); + a = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(400), 250, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 250), + delta = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(400), 127, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 127); SoftmaxLayer cpu = new SoftmaxLayer(TensorInfo.CreateLinear(250), 127, WeightsInitializationMode.GlorotNormal, BiasInitializationMode.Gaussian), gpu = new CuDnnSoftmaxLayer(cpu.InputInfo, cpu.OutputInfo.Size, cpu.Weights, cpu.Biases); @@ -146,7 +146,7 @@ public void SoftmaxGradient() public unsafe void SoftmaxBackwardOutput() { float[,] - x = WeightsProvider.NewFullyConnectedWeights(400, 250, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 250), + x = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(400), 250, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 250), y = new float[400, 127]; for (int i = 0; i < 400; i++) y[i, ThreadSafeRandom.NextInt(max: 127)] = 1; @@ -175,7 +175,7 @@ public unsafe void SoftmaxBackwardOutput() [TestMethod] public void ConvolutionForward() { - float[,] x = WeightsProvider.NewFullyConnectedWeights(127, 58 * 58 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(127, 58 * 58 * 3); + float[,] x = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(127), 58 * 58 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(127, 58 * 58 * 3); ConvolutionalLayer cpu = new ConvolutionalLayer(new TensorInfo(58, 58, 3), ConvolutionInfo.Default, (5, 5), 20, ActivationFunctionType.LeakyReLU, BiasInitializationMode.Gaussian), gpu = new CuDnnConvolutionalLayer(cpu.InputInfo, ConvolutionInfo.Default, cpu.KernelInfo, cpu.OutputInfo, cpu.Weights, cpu.Biases, cpu.ActivationFunctionType); @@ -186,8 +186,8 @@ public void ConvolutionForward() public unsafe void ConvolutionBackward() { float[,] - delta_1 = WeightsProvider.NewFullyConnectedWeights(127, 54 * 54 * 20, WeightsInitializationMode.GlorotNormal).AsMatrix(127, 54 * 54 * 20), - z = WeightsProvider.NewFullyConnectedWeights(127, 58 * 58 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(127, 58 * 58 * 3); + delta_1 = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(127), 54 * 54 * 20, WeightsInitializationMode.GlorotNormal).AsMatrix(127, 54 * 54 * 20), + z = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(127), 58 * 58 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(127, 58 * 58 * 3); ConvolutionalLayer cpu = new ConvolutionalLayer(new TensorInfo(58, 58, 3), ConvolutionInfo.Default, (5, 5), 20, ActivationFunctionType.LeCunTanh, BiasInitializationMode.Gaussian), gpu = new CuDnnConvolutionalLayer(cpu.InputInfo, ConvolutionInfo.Default, cpu.KernelInfo, cpu.OutputInfo, cpu.Weights, cpu.Biases, ActivationFunctionType.LeCunTanh); @@ -228,7 +228,7 @@ public void ConvolutionGradient() [TestMethod] public void PoolingForward() { - float[,] x = WeightsProvider.NewFullyConnectedWeights(400, 58 * 58 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 58 * 58 * 3); + float[,] x = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(400), 58 * 58 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 58 * 58 * 3); PoolingLayer cpu = new PoolingLayer(new TensorInfo(58, 58, 3), PoolingInfo.Default, ActivationFunctionType.LeakyReLU), gpu = new CuDnnPoolingLayer(cpu.InputInfo, PoolingInfo.Default, ActivationFunctionType.LeakyReLU); @@ -239,8 +239,8 @@ 
public void PoolingForward() public void PoolingBackward() { float[,] - delta_1 = WeightsProvider.NewFullyConnectedWeights(400, 29 * 29 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 29 * 29 * 3), - z = WeightsProvider.NewFullyConnectedWeights(400, 58 * 58 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 58 * 58 * 3); + delta_1 = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(400), 29 * 29 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 29 * 29 * 3), + z = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(400), 58 * 58 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 58 * 58 * 3); PoolingLayer cpu = new PoolingLayer(new TensorInfo(58, 58, 3), PoolingInfo.Default, ActivationFunctionType.LeakyReLU), gpu = new CuDnnPoolingLayer(cpu.InputInfo, PoolingInfo.Default, ActivationFunctionType.LeakyReLU); diff --git a/Unit/NeuralNetwork.NET.Unit/SerializationTest.cs b/Unit/NeuralNetwork.NET.Unit/SerializationTest.cs index 4ae03d4..f646e86 100644 --- a/Unit/NeuralNetwork.NET.Unit/SerializationTest.cs +++ b/Unit/NeuralNetwork.NET.Unit/SerializationTest.cs @@ -1,6 +1,4 @@ -using System; -using System.IO; -using System.Linq; +using System.IO; using Microsoft.VisualStudio.TestTools.UnitTesting; using NeuralNetworkNET.APIs; using NeuralNetworkNET.APIs.Enums; @@ -50,7 +48,7 @@ public void StreamSerialize() { using (MemoryStream stream = new MemoryStream()) { - float[] w = WeightsProvider.NewFullyConnectedWeights(784, 30, WeightsInitializationMode.GlorotNormal); + float[] w = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(784), 30, WeightsInitializationMode.GlorotNormal); stream.WriteShuffled(w); Assert.IsTrue(stream.Position == sizeof(float) * w.Length); stream.Seek(0, SeekOrigin.Begin); From de8a9e7c6e0acc458071869bd31a97aad043e3d6 Mon Sep 17 00:00:00 2001 From: Sergio0694 Date: Sun, 24 Dec 2017 12:47:20 +0100 Subject: [PATCH 07/30] Inception constructors and Clone method implemented --- .../Layers/CuDnnInceptionLayer.cs | 22 ++++++++++++++----- .../APIs/Structs/InceptionInfo.cs | 12 +++++++++- NeuralNetwork.NET/APIs/Structs/TensorInfo.cs | 11 ++++++++-- 3 files changed, 37 insertions(+), 8 deletions(-) diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs index 2b65f99..5cd550b 100644 --- a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs +++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs @@ -7,6 +7,7 @@ using NeuralNetworkNET.Networks.Activations; using NeuralNetworkNET.Networks.Activations.Delegates; using NeuralNetworkNET.Networks.Implementations.Layers.Abstract; +using NeuralNetworkNET.Networks.Implementations.Layers.Helpers; using System; using System.Runtime.CompilerServices; @@ -164,8 +165,21 @@ private void SetupCuDnnInfo() #endregion - protected CuDnnInceptionLayer(in TensorInfo input, in TensorInfo output, [NotNull] float[] w, [NotNull] float[] b, ActivationFunctionType activation) : base(input, output, w, b, activation) + internal CuDnnInceptionLayer(in TensorInfo input, in InceptionInfo info, BiasInitializationMode biasMode = BiasInitializationMode.Zero) + : base(input, new TensorInfo(input.Height, input.Width, info.OutputChannels), + WeightsProvider.NewInceptionWeights(input, info), + WeightsProvider.NewBiases(info.OutputChannels, biasMode), + ActivationFunctionType.ReLU) { + _OperationInfo = info; + SetupCuDnnInfo(); + } + + internal CuDnnInceptionLayer(in TensorInfo input, in InceptionInfo info, [NotNull] float[] w, 
[NotNull] float[] b) + : base(input, new TensorInfo(input.Height, input.Width, info.OutputChannels), w, b, ActivationFunctionType.ReLU) + { + _OperationInfo = info; + SetupCuDnnInfo(); } #region Implementation @@ -187,9 +201,7 @@ public override void ComputeGradient(in Tensor a, in Tensor delta, out Tensor dJ #endregion - public override INetworkLayer Clone() - { - throw new NotImplementedException(); - } + /// + public override INetworkLayer Clone() => new CuDnnInceptionLayer(InputInfo, OperationInfo, Weights, Biases); } } diff --git a/NeuralNetwork.NET/APIs/Structs/InceptionInfo.cs b/NeuralNetwork.NET/APIs/Structs/InceptionInfo.cs index 27da1ab..676b86a 100644 --- a/NeuralNetwork.NET/APIs/Structs/InceptionInfo.cs +++ b/NeuralNetwork.NET/APIs/Structs/InceptionInfo.cs @@ -12,7 +12,7 @@ namespace NeuralNetworkNET.APIs.Structs [JsonObject(MemberSerialization.Fields)] public readonly struct InceptionInfo : IEquatable { - #region Fields + #region Fields and properties /// /// The number of 1x1 convolution kernels used in the first step of the forward pass @@ -39,6 +39,16 @@ namespace NeuralNetworkNET.APIs.Structs /// public readonly int Chained1x1AfterPoolingConvolutionKernels; + /// + /// Gets the number of output channels after the depth concatenation + /// + public int OutputChannels + { + [Pure] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => Primary1x1ConvolutionKernels + Secondary3x3ConvolutionKernels + Secondary5x5ConvolutionKernels + Chained1x1AfterPoolingConvolutionKernels; + } + #endregion #region Constructors diff --git a/NeuralNetwork.NET/APIs/Structs/TensorInfo.cs b/NeuralNetwork.NET/APIs/Structs/TensorInfo.cs index b938fed..0853a92 100644 --- a/NeuralNetwork.NET/APIs/Structs/TensorInfo.cs +++ b/NeuralNetwork.NET/APIs/Structs/TensorInfo.cs @@ -13,6 +13,8 @@ namespace NeuralNetworkNET.APIs.Structs [DebuggerDisplay("Height: {Height}, Width: {Width}, Channels: {Channels}, Size: {Size}")] public readonly struct TensorInfo : IEquatable { + #region Fields and parameters + /// /// The height of each 2D slice /// @@ -52,13 +54,16 @@ public int SliceSize get => Height * Width; } + #endregion + + #region Constructors + internal TensorInfo(int height, int width, int channels) { if (height * width <= 0) throw new ArgumentException("The height and width of the kernels must be positive values"); - if (channels < 1) throw new ArgumentOutOfRangeException(nameof(channels), "The number of channels must be at least equal to 1"); Height = height; Width = width; - Channels = channels; + Channels = channels >= 1 ? 
channels : throw new ArgumentOutOfRangeException(nameof(channels), "The number of channels must be at least equal to 1");
         }

         ///
@@ -87,6 +92,8 @@ internal TensorInfo(int height, int width, int channels)
         [Pure]
         public static TensorInfo CreateLinear(int size) => new TensorInfo(1, 1, size);

+        #endregion
+
         #region Equality

         ///

From f41a4a8efebce75c7abaeb5e8651bace90be96d5 Mon Sep 17 00:00:00 2001
From: Sergio0694
Date: Sun, 24 Dec 2017 15:18:16 +0100
Subject: [PATCH 08/30] Inception layer forward method implemented (WIP)

---
 .../Extensions/GpuExtensions.cs   |  18 +++
 .../Layers/CuDnnInceptionLayer.cs | 111 ++++++++++++++++--
 2 files changed, 122 insertions(+), 7 deletions(-)

diff --git a/NeuralNetwork.NET.Cuda/Extensions/GpuExtensions.cs b/NeuralNetwork.NET.Cuda/Extensions/GpuExtensions.cs
index 3426b0a..6576b2b 100644
--- a/NeuralNetwork.NET.Cuda/Extensions/GpuExtensions.cs
+++ b/NeuralNetwork.NET.Cuda/Extensions/GpuExtensions.cs
@@ -54,6 +54,24 @@ public static void CopyToHost([NotNull] this DeviceMemory<float> source, int n,
             source.CopyTo(result);
         }

+        ///
+        /// Copies the source data into the target Tensor, splitting each individual entry into its own row
+        ///
+        /// The source memory area with the concatenated data for each entry
+        /// The destination Tensor that will store the data
+        /// The column offset for the data for each entry
+        /// The number of values to copy for each entry
+        public static unsafe void CopyToRows([NotNull] this DeviceMemory<float> source, in Tensor destination, int offset, int length)
+        {
+            if (source.Length / length != destination.Entities) throw new ArgumentOutOfRangeException(nameof(length), "The input length doesn't match the given arguments");
+            if (destination.Length - offset < length) throw new ArgumentOutOfRangeException(nameof(offset), "The input offset isn't valid");
+            CUDAInterop.cudaError_enum result = CUDAInterop.cudaError_enum.CUDA_SUCCESS;
+            for (int i = 0; i < destination.Entities; i++)
+                result |= CUDAInterop.cuMemcpy(new IntPtr((float*)destination + i * destination.Length + offset), source.Handle + i * length, new IntPtr(sizeof(float) * length));
+            if (result != CUDAInterop.cudaError_enum.CUDA_SUCCESS)
+                throw new InvalidOperationException($"Failed to copy the source data on the given destination, [CUDA ERROR] {result}");
+        }
+
         #endregion

         ///
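(A quick usage sketch for the new CopyToRows extension, not part of the patch; the shapes and the Gpu.Default instance are illustrative assumptions.)

    // Hypothetical example: 10 samples, 4 values per sample from one pipeline,
    // copied into columns 8..12 of a 12-wide concatenated tensor
    Tensor.New(10, 12, out Tensor destination);
    using (DeviceMemory<float> source = Gpu.Default.AllocateDevice<float>(10 * 4))
    {
        // ... a kernel fills source with 10 * 4 values ...
        source.CopyToRows(destination, 8, 4);
    }

diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs
index 5cd550b..0392e33 100644
--- a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs
+++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs
@@ -1,8 +1,10 @@
-using Alea.cuDNN;
+using Alea;
+using Alea.cuDNN;
 using JetBrains.Annotations;
 using NeuralNetworkNET.APIs.Enums;
 using NeuralNetworkNET.APIs.Structs;
+using NeuralNetworkNET.Cuda.Extensions;
 using NeuralNetworkNET.Cuda.Services;
@@ -120,16 +122,16 @@ public ref readonly InceptionInfo OperationInfo
         [NotNull]
         private readonly TensorDescriptor Secondary1x1BiasDescription = new TensorDescriptor();

-        // The first secondary 1x1 convolution info
-        [NotNull]
-        private readonly ConvolutionDescriptor Secondary1x1ConvolutionDescription = new ConvolutionDescriptor();
-
         // The info on the secondary 1x1 convolution outputs
         [NotNull]
         private readonly TensorDescriptor Secondary1x1OutputDescription = new TensorDescriptor();

         #endregion

+        // The shared ReLU activation description for the current layer
+        [NotNull]
+        private readonly 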
ActivationDescriptor ActivationDescription = new ActivationDescriptor(); + /// /// Gets the instance for the current layer /// @@ -158,9 +160,11 @@ private void SetupCuDnnInfo() PoolingDescription.Set2D(Alea.cuDNN.PoolingMode.AVERAGE_COUNT_EXCLUDE_PADDING, NanPropagation.PROPAGATE_NAN, 3, 3, 1, 1, 1, 1); // Secondary 1x1 convolution - Secondary1x1ConvolutionDescription.Set2D(0, 0, 1, 1, 1, 1, Alea.cuDNN.ConvolutionMode.CROSS_CORRELATION); Secondary1x1FilterDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, InputInfo.Channels, _OperationInfo.Chained1x1AfterPoolingConvolutionKernels, 1, 1); Secondary1x1BiasDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, 1, _OperationInfo.Chained1x1AfterPoolingConvolutionKernels, 1, 1); + + // Activation + ActivationDescription.Set(ActivationMode.RELU, NanPropagation.PROPAGATE_NAN, 0); } #endregion @@ -186,7 +190,100 @@ internal CuDnnInceptionLayer(in TensorInfo input, in InceptionInfo info, [NotNul public override void Forward(in Tensor x, out Tensor z, out Tensor a) { - throw new NotImplementedException(); + Tensor.New(x.Entities, OutputInfo.Size, out z); + Tensor.New(x.Entities, OutputInfo.Size, out a); + using (DeviceMemory + w_gpu = DnnInstance.Gpu.AllocateDevice(Weights), + b_gpu = DnnInstance.Gpu.AllocateDevice(Biases)) + { + using (DeviceMemory _1x1Output_gpu = DnnInstance.Gpu.AllocateDevice(x.Entities * InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels)) + { + // First 1x1 convolution + using (DeviceMemory x_gpu = DnnInstance.Gpu.AllocateDevice(x)) + { + // Descriptors setup and first 1x1 convolution + InputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, InputInfo.Channels, InputInfo.Height, InputInfo.Width); + _1x1OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OperationInfo.Primary1x1ConvolutionKernels, InputInfo.Height, InputInfo.Width); + DnnInstance.GetConvolutionForwardAlgorithm(InputDescription, _1x1FilterDescription, _1x1ConvolutionDescription, _1x1OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionFwdAlgo algorithm); + DnnInstance.GetConvolutionForwardWorkspaceSize(InputDescription, _1x1FilterDescription, _1x1ConvolutionDescription, _1x1OutputDescription, algorithm, out IntPtr size); + using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) + { + DnnInstance.ConvolutionForward(1, InputDescription, x_gpu.Ptr, _1x1FilterDescription, w_gpu.Ptr, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _1x1OutputDescription, _1x1Output_gpu.Ptr); + } + DnnInstance.AddTensor(1, _1x1BiasDescription, b_gpu.Ptr, 1, _1x1OutputDescription, _1x1Output_gpu.Ptr); + _1x1Output_gpu.CopyToRows(z, 0, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels); + + // 1x1 convolution activation + DnnInstance.ActivationForward(ActivationDescription, 1, _1x1OutputDescription, _1x1Output_gpu.Ptr, 0, _1x1OutputDescription, _1x1Output_gpu.Ptr); + _1x1Output_gpu.CopyToRows(a, 0, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels); + } + + // 3x3 convolution + using (DeviceMemory _3x3Output_gpu = DnnInstance.Gpu.AllocateDevice(x.Entities * InputInfo.SliceSize * OperationInfo.Secondary3x3ConvolutionKernels)) + { + _3x3OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OperationInfo.Secondary3x3ConvolutionKernels, InputInfo.Height, InputInfo.Width); + DnnInstance.GetConvolutionForwardAlgorithm(_1x1OutputDescription, _3x3FilterDescription, 
_3x3ConvolutionDescription, _3x3OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionFwdAlgo algorithm); + DnnInstance.GetConvolutionForwardWorkspaceSize(_1x1OutputDescription, _3x3FilterDescription, _3x3ConvolutionDescription, _3x3OutputDescription, algorithm, out IntPtr size); + using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) + { + DnnInstance.ConvolutionForward(1, _1x1OutputDescription, _1x1Output_gpu.Ptr, _3x3FilterDescription, w_gpu.Ptr + InputInfo.Channels * OperationInfo.Primary1x1ConvolutionKernels, _3x3ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _3x3OutputDescription, _3x3Output_gpu.Ptr); + } + DnnInstance.AddTensor(1, _3x3BiasDescription, b_gpu.Ptr + OperationInfo.Primary1x1ConvolutionKernels, 1, _3x3OutputDescription, _3x3Output_gpu.Ptr); + _3x3Output_gpu.CopyToRows(z, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels, InputInfo.SliceSize * OperationInfo.Secondary3x3ConvolutionKernels); + + // 3x3 convolution activation + DnnInstance.ActivationForward(ActivationDescription, 1, _3x3OutputDescription, _3x3Output_gpu.Ptr, 0, _3x3OutputDescription, _3x3Output_gpu.Ptr); + _3x3Output_gpu.CopyToRows(a, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels, InputInfo.SliceSize * OperationInfo.Secondary3x3ConvolutionKernels); + } + + // 5x5 convolution + using (DeviceMemory _5x5Output_gpu = DnnInstance.Gpu.AllocateDevice(x.Entities * InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels)) + { + _5x5OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OperationInfo.Secondary5x5ConvolutionKernels, InputInfo.Height, InputInfo.Width); + DnnInstance.GetConvolutionForwardAlgorithm(_1x1OutputDescription, _5x5FilterDescription, _5x5ConvolutionDescription, _5x5OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionFwdAlgo algorithm); + DnnInstance.GetConvolutionForwardWorkspaceSize(_1x1OutputDescription, _5x5FilterDescription, _5x5ConvolutionDescription, _5x5OutputDescription, algorithm, out IntPtr size); + using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) + { + DnnInstance.ConvolutionForward(1, _1x1OutputDescription, _1x1Output_gpu.Ptr, _5x5FilterDescription, w_gpu.Ptr + InputInfo.Channels * OperationInfo.Primary1x1ConvolutionKernels + 3 * 3 * OperationInfo.Primary1x1ConvolutionKernels * OperationInfo.Secondary3x3ConvolutionKernels, _5x5ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _5x5OutputDescription, _5x5Output_gpu.Ptr); + } + DnnInstance.AddTensor(1, _5x5BiasDescription, b_gpu.Ptr + OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels, 1, _5x5OutputDescription, _5x5Output_gpu.Ptr); + _5x5Output_gpu.CopyToRows(z, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels); + + // 3x3 convolution activation + DnnInstance.ActivationForward(ActivationDescription, 1, _5x5OutputDescription, _5x5Output_gpu.Ptr, 0, _5x5OutputDescription, _5x5Output_gpu.Ptr); + _5x5Output_gpu.CopyToRows(a, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels); + } + } + + // Pooling pipeline + PoolingOutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, InputInfo.Channels, 
InputInfo.Height, InputInfo.Width); + using (DeviceMemory y_gpu = DnnInstance.Gpu.AllocateDevice(x.Size)) + { + // Pooling + using (DeviceMemory x_gpu = DnnInstance.Gpu.AllocateDevice(x)) + { + DnnInstance.PoolingForward(PoolingDescription, 1, InputDescription, x_gpu.Ptr, 0, InputDescription, y_gpu.Ptr); + } + + // 1x1 convolution + using (DeviceMemory _1x1Output_gpu = DnnInstance.Gpu.AllocateDevice(x.Entities * InputInfo.SliceSize * OperationInfo.Chained1x1AfterPoolingConvolutionKernels)) + { + _1x1OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OperationInfo.Chained1x1AfterPoolingConvolutionKernels, InputInfo.Height, InputInfo.Width); + DnnInstance.GetConvolutionForwardAlgorithm(InputDescription, Secondary1x1FilterDescription, _1x1ConvolutionDescription, Secondary1x1OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionFwdAlgo algorithm); + DnnInstance.GetConvolutionForwardWorkspaceSize(InputDescription, Secondary1x1FilterDescription, _1x1ConvolutionDescription, Secondary1x1OutputDescription, algorithm, out IntPtr size); + using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) + { + DnnInstance.ConvolutionForward(1, InputDescription, y_gpu.Ptr, Secondary1x1FilterDescription, w_gpu.Ptr + InputInfo.Channels * OperationInfo.Primary1x1ConvolutionKernels + 3 * 3 * OperationInfo.Primary1x1ConvolutionKernels * OperationInfo.Secondary3x3ConvolutionKernels + 5 * 5 * OperationInfo.Primary1x1ConvolutionKernels * OperationInfo.Secondary5x5ConvolutionKernels, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, Secondary1x1OutputDescription, _1x1Output_gpu.Ptr); + } + DnnInstance.AddTensor(1, Secondary1x1BiasDescription, b_gpu.Ptr + OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Secondary5x5ConvolutionKernels, 1, Secondary1x1OutputDescription, _1x1Output_gpu.Ptr); + _1x1Output_gpu.CopyToRows(z, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Secondary5x5ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Chained1x1AfterPoolingConvolutionKernels); + + // 1x1 convolution activation + DnnInstance.ActivationForward(ActivationDescription, 1, Secondary1x1OutputDescription, _1x1Output_gpu.Ptr, 0, Secondary1x1OutputDescription, _1x1Output_gpu.Ptr); + _1x1Output_gpu.CopyToRows(a, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Secondary5x5ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Chained1x1AfterPoolingConvolutionKernels); + } + } + } } public override void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFunction activationPrime) From 127e241a844f8f4dae01cb9e90d4aecad37768f8 Mon Sep 17 00:00:00 2001 From: Sergio0694 Date: Sun, 24 Dec 2017 15:48:45 +0100 Subject: [PATCH 09/30] Minor code improvements to the CuDnn layers --- .../Layers/CuDnnConvolutionalLayer.cs | 90 +++++++++---------- .../Layers/CuDnnFullyConnectedLayer.cs | 42 ++++----- .../Layers/CuDnnSoftmaxLayer.cs | 16 ++-- 3 files changed, 64 insertions(+), 84 deletions(-) diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnConvolutionalLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnConvolutionalLayer.cs index 4aa0982..df31119 100644 --- a/NeuralNetwork.NET.Cuda/Layers/CuDnnConvolutionalLayer.cs +++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnConvolutionalLayer.cs @@ -74,71 +74,63 @@ public CuDnnConvolutionalLayer( 
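
The theme of this patch: the CuDnn layers used to pin their managed weights array with fixed and wrap it in a reshaped Tensor purely to hand a pointer to AllocateDevice. Since AllocateDevice can upload a flat float[] directly and the filter descriptor already carries the NCHW shape, the pin-and-reshape step is dead weight. Condensed, the change in each layer is (a sketch, not the literal hunk):

    // Before: pin + reshape, only to copy the same floats to the device
    fixed (float* pw = Weights)
    {
        Tensor.Reshape(pw, OutputInfo.Channels, KernelInfo.Size, out Tensor wTensor);
        using (DeviceMemory<float> w_gpu = DnnInstance.Gpu.AllocateDevice(wTensor)) { /* ... */ }
    }

    // After: upload the flat array as-is
    using (DeviceMemory<float> w_gpu = DnnInstance.Gpu.AllocateDevice(Weights)) { /* ... */ }
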
#region Implementation /// - public override unsafe void Forward(in Tensor x, out Tensor z, out Tensor a) + public override void Forward(in Tensor x, out Tensor z, out Tensor a) { - fixed (float* pw = Weights) + using (DeviceMemory z_gpu = DnnInstance.Gpu.AllocateDevice(x.Entities * OutputInfo.Size)) { - Tensor.Reshape(pw, OutputInfo.Channels, KernelInfo.Size, out Tensor wTensor); - using (DeviceMemory z_gpu = DnnInstance.Gpu.AllocateDevice(x.Entities * OutputInfo.Size)) + // Tensors info setup + InputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, InputInfo.Channels, InputInfo.Height, InputInfo.Width); + OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OutputInfo.Channels, OutputInfo.Height, OutputInfo.Width); + + // Forward convolution + DnnInstance.GetConvolutionForwardAlgorithm(InputDescription, FilterDescription, ConvolutionDescription, OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionFwdAlgo algorithm); + DnnInstance.GetConvolutionForwardWorkspaceSize(InputDescription, FilterDescription, ConvolutionDescription, OutputDescription, algorithm, out IntPtr size); + using (DeviceMemory + x_gpu = DnnInstance.Gpu.AllocateDevice(x), + w_gpu = DnnInstance.Gpu.AllocateDevice(Weights)) + using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) { - // Tensors info setup - InputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, InputInfo.Channels, InputInfo.Height, InputInfo.Width); - OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OutputInfo.Channels, OutputInfo.Height, OutputInfo.Width); - - // Forward convolution - DnnInstance.GetConvolutionForwardAlgorithm(InputDescription, FilterDescription, ConvolutionDescription, OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionFwdAlgo algorithm); - DnnInstance.GetConvolutionForwardWorkspaceSize(InputDescription, FilterDescription, ConvolutionDescription, OutputDescription, algorithm, out IntPtr size); - using (DeviceMemory - x_gpu = DnnInstance.Gpu.AllocateDevice(x), - w_gpu = DnnInstance.Gpu.AllocateDevice(wTensor)) - using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) - { - DnnInstance.ConvolutionForward(1, InputDescription, x_gpu.Ptr, FilterDescription, w_gpu.Ptr, ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, OutputDescription, z_gpu.Ptr); - } + DnnInstance.ConvolutionForward(1, InputDescription, x_gpu.Ptr, FilterDescription, w_gpu.Ptr, ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, OutputDescription, z_gpu.Ptr); + } - // Biases - using (DeviceMemory b_gpu = DnnInstance.Gpu.AllocateDevice(Biases)) - { - DnnInstance.AddTensor(1, BiasDescription, b_gpu.Ptr, 1, OutputDescription, z_gpu.Ptr); - } - z_gpu.CopyToHost(x.Entities, OutputInfo.Size, out z); + // Biases + using (DeviceMemory b_gpu = DnnInstance.Gpu.AllocateDevice(Biases)) + { + DnnInstance.AddTensor(1, BiasDescription, b_gpu.Ptr, 1, OutputDescription, z_gpu.Ptr); + } + z_gpu.CopyToHost(x.Entities, OutputInfo.Size, out z); - // Activation - if (ActivationFunctionType == ActivationFunctionType.Identity) z.Duplicate(out a); - else - { - DnnInstance.ActivationForward(z.Entities, z.Length, z_gpu.Ptr, z_gpu.Ptr, ActivationFunctions.Activation); - z_gpu.CopyToHost(z.Entities, z.Length, out a); - } + // Activation + if (ActivationFunctionType == ActivationFunctionType.Identity) z.Duplicate(out a); + else + { + 
DnnInstance.ActivationForward(z.Entities, z.Length, z_gpu.Ptr, z_gpu.Ptr, ActivationFunctions.Activation); + z_gpu.CopyToHost(z.Entities, z.Length, out a); } } } /// - public override unsafe void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFunction activationPrime) + public override void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFunction activationPrime) { - fixed (float* pw = Weights) + using (DeviceMemory delta_gpu = DnnInstance.Gpu.AllocateDevice(z.Size)) { - Tensor.Reshape(pw, OutputInfo.Channels, KernelInfo.Size, out Tensor wTensor); + // Convolution DnnInstance.GetConvolutionBackwardDataAlgorithm(FilterDescription, OutputDescription, ConvolutionDescription, InputDescription, ConvolutionBwdDataPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionBwdDataAlgo algorithm); DnnInstance.GetConvolutionBackwardDataWorkspaceSize(FilterDescription, OutputDescription, ConvolutionDescription, InputDescription, algorithm, out IntPtr size); - using (DeviceMemory delta_gpu = DnnInstance.Gpu.AllocateDevice(z.Size)) + using (DeviceMemory + delta_1_gpu = DnnInstance.Gpu.AllocateDevice(delta_1), + w_gpu = DnnInstance.Gpu.AllocateDevice(Weights)) + using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) { - // Backwards convolution - using (DeviceMemory - delta_1_gpu = DnnInstance.Gpu.AllocateDevice(delta_1), - w_gpu = DnnInstance.Gpu.AllocateDevice(wTensor)) - using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) - { - DnnInstance.ConvolutionBackwardData(1, FilterDescription, w_gpu.Ptr, OutputDescription, delta_1_gpu.Ptr, ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, InputDescription, delta_gpu.Ptr); - } + DnnInstance.ConvolutionBackwardData(1, FilterDescription, w_gpu.Ptr, OutputDescription, delta_1_gpu.Ptr, ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, InputDescription, delta_gpu.Ptr); + } - // Activation - using (DeviceMemory z_gpu = DnnInstance.Gpu.AllocateDevice(z)) - { - DnnInstance.ActivationBackward(z.Entities, z.Length, z_gpu.Ptr, delta_gpu.Ptr, activationPrime); - z_gpu.CopyTo(z); - } + // Activation + using (DeviceMemory z_gpu = DnnInstance.Gpu.AllocateDevice(z)) + { + DnnInstance.ActivationBackward(z.Entities, z.Length, z_gpu.Ptr, delta_gpu.Ptr, activationPrime); + z_gpu.CopyTo(z); } } } diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnFullyConnectedLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnFullyConnectedLayer.cs index 87685d3..f1c587c 100644 --- a/NeuralNetwork.NET.Cuda/Layers/CuDnnFullyConnectedLayer.cs +++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnFullyConnectedLayer.cs @@ -30,39 +30,31 @@ public CuDnnFullyConnectedLayer(in TensorInfo input, int neurons, [NotNull] floa #region Implementation /// - public override unsafe void Forward(in Tensor x, out Tensor z, out Tensor a) + public override void Forward(in Tensor x, out Tensor z, out Tensor a) { - fixed (float* pw = Weights) + using (DeviceMemory + x_gpu = DnnInstance.Gpu.AllocateDevice(x), + w_gpu = DnnInstance.Gpu.AllocateDevice(Weights), + y_gpu = DnnInstance.Gpu.AllocateDevice(x.Entities * OutputInfo.Size), + b_gpu = DnnInstance.Gpu.AllocateDevice(Biases)) { - Tensor.Reshape(pw, InputInfo.Size, OutputInfo.Size, out Tensor wTensor); - using (DeviceMemory - x_gpu = DnnInstance.Gpu.AllocateDevice(x), - w_gpu = DnnInstance.Gpu.AllocateDevice(wTensor), - y_gpu = DnnInstance.Gpu.AllocateDevice(x.Entities * OutputInfo.Size), - b_gpu = DnnInstance.Gpu.AllocateDevice(Biases)) - { - DnnInstance.FullyConnectedForward(x.Entities, x.Length, 
OutputInfo.Size, x_gpu.Ptr, w_gpu.Ptr, b_gpu.Ptr, y_gpu.Ptr); - y_gpu.CopyToHost(x.Entities, OutputInfo.Size, out z); - DnnInstance.ActivationForward(z.Entities, z.Length, y_gpu.Ptr, y_gpu.Ptr, ActivationFunctions.Activation); - y_gpu.CopyToHost(z.Entities, z.Length, out a); - } + DnnInstance.FullyConnectedForward(x.Entities, x.Length, OutputInfo.Size, x_gpu.Ptr, w_gpu.Ptr, b_gpu.Ptr, y_gpu.Ptr); + y_gpu.CopyToHost(x.Entities, OutputInfo.Size, out z); + DnnInstance.ActivationForward(z.Entities, z.Length, y_gpu.Ptr, y_gpu.Ptr, ActivationFunctions.Activation); + y_gpu.CopyToHost(z.Entities, z.Length, out a); } } /// - public override unsafe void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFunction activationPrime) + public override void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFunction activationPrime) { - fixed (float* pw = Weights) + using (DeviceMemory + delta_1_gpu = DnnInstance.Gpu.AllocateDevice(delta_1), + w_gpu = DnnInstance.Gpu.AllocateDevice(Weights), + z_gpu = DnnInstance.Gpu.AllocateDevice(z)) { - Tensor.Reshape(pw, InputInfo.Size, OutputInfo.Size, out Tensor wTensor); - using (DeviceMemory - delta_1_gpu = DnnInstance.Gpu.AllocateDevice(delta_1), - w_gpu = DnnInstance.Gpu.AllocateDevice(wTensor), - z_gpu = DnnInstance.Gpu.AllocateDevice(z)) - { - DnnInstance.FullyConnectedBackwardData(z.Entities, InputInfo.Size, OutputInfo.Size, z_gpu.Ptr, delta_1_gpu.Ptr, w_gpu.Ptr, activationPrime); - z_gpu.CopyTo(z); - } + DnnInstance.FullyConnectedBackwardData(z.Entities, InputInfo.Size, OutputInfo.Size, z_gpu.Ptr, delta_1_gpu.Ptr, w_gpu.Ptr, activationPrime); + z_gpu.CopyTo(z); } } diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnSoftmaxLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnSoftmaxLayer.cs index abc7f07..be8a0cd 100644 --- a/NeuralNetwork.NET.Cuda/Layers/CuDnnSoftmaxLayer.cs +++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnSoftmaxLayer.cs @@ -42,17 +42,13 @@ public override unsafe void Forward(in Tensor x, out Tensor z, out Tensor a) using (DeviceMemory z_gpu = DnnInstance.Gpu.AllocateDevice(x.Entities * OutputInfo.Size)) { // Linear pass - fixed (float* pw = Weights) + using (DeviceMemory + x_gpu = DnnInstance.Gpu.AllocateDevice(x), + w_gpu = DnnInstance.Gpu.AllocateDevice(Weights), + b_gpu = DnnInstance.Gpu.AllocateDevice(Biases)) { - Tensor.Reshape(pw, InputInfo.Size, OutputInfo.Size, out Tensor wTensor); - using (DeviceMemory - x_gpu = DnnInstance.Gpu.AllocateDevice(x), - w_gpu = DnnInstance.Gpu.AllocateDevice(wTensor), - b_gpu = DnnInstance.Gpu.AllocateDevice(Biases)) - { - DnnInstance.FullyConnectedForward(x.Entities, x.Length, OutputInfo.Size, x_gpu.Ptr, w_gpu.Ptr, b_gpu.Ptr, z_gpu.Ptr); - z_gpu.CopyToHost(x.Entities, OutputInfo.Size, out z); - } + DnnInstance.FullyConnectedForward(x.Entities, x.Length, OutputInfo.Size, x_gpu.Ptr, w_gpu.Ptr, b_gpu.Ptr, z_gpu.Ptr); + z_gpu.CopyToHost(x.Entities, OutputInfo.Size, out z); } // Activation From c16c249690380e34197c931db063b72ea6ec65c5 Mon Sep 17 00:00:00 2001 From: Sergio0694 Date: Sun, 24 Dec 2017 17:01:40 +0100 Subject: [PATCH 10/30] Minor bug fixes --- NeuralNetwork.NET.Cuda/Extensions/GpuExtensions.cs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/NeuralNetwork.NET.Cuda/Extensions/GpuExtensions.cs b/NeuralNetwork.NET.Cuda/Extensions/GpuExtensions.cs index 6576b2b..79723f2 100644 --- a/NeuralNetwork.NET.Cuda/Extensions/GpuExtensions.cs +++ b/NeuralNetwork.NET.Cuda/Extensions/GpuExtensions.cs @@ -66,8 +66,14 @@ public static unsafe void CopyToRows([NotNull] this DeviceMemory source, if 
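
The hunk below replaces the broken single-destination copy with a properly strided loop. To make the stride arithmetic concrete, a worked example with small, illustrative numbers:

    // Worked example (illustrative values): destination with Entities = 2, Length = 6,
    // copying per-row chunks of length = 2 at offset = 3
    //   bytes     = sizeof(float) * 2 =  8   -> bytes copied per row
    //   lineBytes = sizeof(float) * 6 = 24   -> destination stride from one row to the next
    //   row 0: source floats [0..1] -> destination floats [3..4]
    //   row 1: source floats [2..3] -> destination floats [9..10]
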
(source.Length / length != destination.Entities) throw new ArgumentOutOfRangeException(nameof(length), "The input length doesn't match the given arguments"); if (destination.Length - offset < length) throw new ArgumentOutOfRangeException(nameof(offset), "The input offset isn't valid"); CUDAInterop.cudaError_enum result = CUDAInterop.cudaError_enum.CUDA_SUCCESS; + int + bytes = sizeof(float) * length, // Bytes to copy for each row + lineBytes = sizeof(float) * destination.Length; // Bytes to skip for each entry to jump to the line below at the same offset + IntPtr + start = destination.Ptr + sizeof(float) * offset, // Initial destination offset + size = new IntPtr(bytes); for (int i = 0; i < destination.Entities; i++) - result |= CUDAInterop.cuMemcpy(new IntPtr((float*)destination + offset), source.Handle + i * destination.Length, new IntPtr(sizeof(float) * length)); + result |= CUDAInterop.cuMemcpy(start + i * lineBytes, source.Handle + i * bytes, size); if (result != CUDAInterop.cudaError_enum.CUDA_SUCCESS) throw new InvalidOperationException($"Failed to copy the source data on the given destination, [CUDA ERROR] {result}"); } From 6a82479855107d41a645178793e2bf6b8cecefb0 Mon Sep 17 00:00:00 2001 From: Sergio0694 Date: Sun, 24 Dec 2017 18:10:12 +0100 Subject: [PATCH 11/30] Minor improvements to the Tensor struct --- NeuralNetwork.NET/APIs/Structs/Tensor.cs | 27 ++++++++++++++++++- .../Networks/Implementations/NeuralNetwork.cs | 4 +-- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/NeuralNetwork.NET/APIs/Structs/Tensor.cs b/NeuralNetwork.NET/APIs/Structs/Tensor.cs index 9252f95..dee5e5e 100644 --- a/NeuralNetwork.NET/APIs/Structs/Tensor.cs +++ b/NeuralNetwork.NET/APIs/Structs/Tensor.cs @@ -34,7 +34,22 @@ public readonly struct Tensor /// /// The total size (the number of values) in the current /// - public int Size => Entities * Length; + public int Size + { + [Pure] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => Entities * Length; + } + + /// + /// Gets whether or not the current instance points to a null, unallocated memory area + /// + public bool Null + { + [Pure] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => Ptr == IntPtr.Zero; + } #region Initialization @@ -192,6 +207,16 @@ public float[] ToArray() [MethodImpl(MethodImplOptions.AggressiveInlining)] public void Free() => Marshal.FreeHGlobal(Ptr); + /// + /// Frees the memory associated with the current instance, if needed + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void TryFree() + { + if (Ptr != IntPtr.Zero) + Marshal.FreeHGlobal(Ptr); + } + // Implicit pointer conversion [MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe implicit operator float*(in Tensor tensor) => (float*)tensor.Ptr.ToPointer(); diff --git a/NeuralNetwork.NET/Networks/Implementations/NeuralNetwork.cs b/NeuralNetwork.NET/Networks/Implementations/NeuralNetwork.cs index 493282f..6707591 100644 --- a/NeuralNetwork.NET/Networks/Implementations/NeuralNetwork.cs +++ b/NeuralNetwork.NET/Networks/Implementations/NeuralNetwork.cs @@ -248,7 +248,7 @@ internal unsafe void Backpropagate(in TrainingBatch batch, float dropout, [NotNu * Multiply the previous delta with the transposed weights of the following layer * Compute d(l), the Hadamard product of z'(l) and delta(l + 1) * W(l + 1)T */ _Layers[l + 1].Backpropagate(*deltas[l + 1], zList[l], _Layers[l].ActivationFunctions.ActivationPrime); - if (dropoutMasks[l].Ptr != IntPtr.Zero) zList[l].InPlaceHadamardProduct(dropoutMasks[l]); + if 
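
The new Null flag and TryFree pair up to make cleanup paths tolerant of tensors that were never allocated, as the dropout-mask handling in this hunk shows. A minimal usage sketch (values illustrative):

    Tensor.New(1, 4, out Tensor t);   // allocated: t.Null == false
    Tensor empty = default;           // no backing memory: empty.Null == true
    t.TryFree();                      // releases the allocation
    empty.TryFree();                  // skipped by the Ptr check, no work done
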
(!dropoutMasks[l].Null) zList[l].InPlaceHadamardProduct(dropoutMasks[l]); deltas[l] = zList + l; } @@ -285,7 +285,7 @@ internal unsafe void Backpropagate(in TrainingBatch batch, float dropout, [NotNu { zList[i].Free(); aList[i].Free(); - if (dropoutMasks[i].Ptr != IntPtr.Zero) dropoutMasks[i].Free(); + dropoutMasks[i].TryFree(); } zList[_Layers.Length - 1].Free(); aList[_Layers.Length - 1].Free(); From 657c277464d1c9740da7f5c8f9b32a4fac5f586c Mon Sep 17 00:00:00 2001 From: Sergio0694 Date: Mon, 25 Dec 2017 12:46:54 +0100 Subject: [PATCH 12/30] InceptionInfo struct improved, minor changes --- .../Layers/CuDnnInceptionLayer.cs | 59 +++++++++++++++---- .../APIs/Structs/InceptionInfo.cs | 44 +++++++++++--- .../Layers/Helpers/WeightsProvider.cs | 22 ++++--- 3 files changed, 99 insertions(+), 26 deletions(-) diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs index 0392e33..5875030 100644 --- a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs +++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs @@ -64,6 +64,22 @@ public ref readonly InceptionInfo OperationInfo #endregion + #region 3x3 reduce 1x1 convolution + + // The NCHW info for the 3x3 reduce 1x1 convolution weights + [NotNull] + private readonly FilterDescriptor _3x3Reduce1x1FilterDescription = new FilterDescriptor(); + + // The info on the 3x3 reduce 1x1 convolution bias (one value per output channel) + [NotNull] + private readonly TensorDescriptor _3x3Reduce1x1BiasDescription = new TensorDescriptor(); + + // The NCHW tensor info for the outputs of the 3x3 reduce 1x1 convolution + [NotNull] + private readonly TensorDescriptor _3x3Reduce1x1OutputDescription = new TensorDescriptor(); + + #endregion + #region 3x3 secondary convolution // The NCHW info for the 3x3 convolution weights @@ -84,6 +100,22 @@ public ref readonly InceptionInfo OperationInfo #endregion + #region 5x5 reduce 1x1 convolution + + // The NCHW info for the 5x5 reduce 1x1 convolution weights + [NotNull] + private readonly FilterDescriptor _5x5Reduce1x1FilterDescription = new FilterDescriptor(); + + // The info on the 5x5 reduce 1x1 convolution bias (one value per output channel) + [NotNull] + private readonly TensorDescriptor _5x5Reduce1x1BiasDescription = new TensorDescriptor(); + + // The NCHW tensor info for the outputs of the 5x5 reduce 1x1 convolution + [NotNull] + private readonly TensorDescriptor _5x5Reduce1x1OutputDescription = new TensorDescriptor(); + + #endregion + #region 5x5 secondary convolution // The NCHW info for the 5x5 convolution weights @@ -142,26 +174,33 @@ public ref readonly InceptionInfo OperationInfo private void SetupCuDnnInfo() { // First 1x1 convolution - _1x1ConvolutionDescription.Set2D(0, 0, 1, 1, 1, 1, Alea.cuDNN.ConvolutionMode.CROSS_CORRELATION); _1x1FilterDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, _OperationInfo.Primary1x1ConvolutionKernels, InputInfo.Channels, 1, 1); _1x1BiasDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, 1, _OperationInfo.Primary1x1ConvolutionKernels, 1, 1); + // 3x3 reduce 1x1 convolution + _3x3Reduce1x1FilterDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, _OperationInfo.Primary3x3Reduce1x1ConvolutionKernels, InputInfo.Channels, 1, 1); + _3x3Reduce1x1BiasDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, 1, _OperationInfo.Primary3x3Reduce1x1ConvolutionKernels, 1, 1); + // 3x3 convolution _3x3ConvolutionDescription.Set2D(1, 1, 1, 1, 1, 1, 
Alea.cuDNN.ConvolutionMode.CROSS_CORRELATION); // 1-padding to keep size - _3x3FilterDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, _OperationInfo.Secondary3x3ConvolutionKernels, _OperationInfo.Primary1x1ConvolutionKernels, 3, 3); + _3x3FilterDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, _OperationInfo.Secondary3x3ConvolutionKernels, _OperationInfo.Primary3x3Reduce1x1ConvolutionKernels, 3, 3); _3x3BiasDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, 1, _OperationInfo.Secondary3x3ConvolutionKernels, 1, 1); + // 5x5 reduce 1x1 convolution + _5x5Reduce1x1FilterDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, _OperationInfo.Primary5x5Reduce1x1ConvolutionKernels, InputInfo.Channels, 1, 1); + _5x5Reduce1x1BiasDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, 1, _OperationInfo.Primary5x5Reduce1x1ConvolutionKernels, 1, 1); + // 5x5 convolution _5x5ConvolutionDescription.Set2D(2, 2, 1, 1, 1, 1, Alea.cuDNN.ConvolutionMode.CROSS_CORRELATION); - _5x5FilterDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, _OperationInfo.Secondary5x5ConvolutionKernels, _OperationInfo.Primary1x1ConvolutionKernels, 5, 5); + _5x5FilterDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, _OperationInfo.Secondary5x5ConvolutionKernels, _OperationInfo.Primary5x5Reduce1x1ConvolutionKernels, 5, 5); _5x5BiasDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, 1, _OperationInfo.Secondary5x5ConvolutionKernels, 1, 1); // Pooling PoolingDescription.Set2D(Alea.cuDNN.PoolingMode.AVERAGE_COUNT_EXCLUDE_PADDING, NanPropagation.PROPAGATE_NAN, 3, 3, 1, 1, 1, 1); // Secondary 1x1 convolution - Secondary1x1FilterDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, InputInfo.Channels, _OperationInfo.Chained1x1AfterPoolingConvolutionKernels, 1, 1); - Secondary1x1BiasDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, 1, _OperationInfo.Chained1x1AfterPoolingConvolutionKernels, 1, 1); + Secondary1x1FilterDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, InputInfo.Channels, _OperationInfo.Secondary1x1AfterPoolingConvolutionKernels, 1, 1); + Secondary1x1BiasDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, 1, _OperationInfo.Secondary1x1AfterPoolingConvolutionKernels, 1, 1); // Activation ActivationDescription.Set(ActivationMode.RELU, NanPropagation.PROPAGATE_NAN, 0); @@ -172,7 +211,7 @@ private void SetupCuDnnInfo() internal CuDnnInceptionLayer(in TensorInfo input, in InceptionInfo info, BiasInitializationMode biasMode = BiasInitializationMode.Zero) : base(input, new TensorInfo(input.Height, input.Width, info.OutputChannels), WeightsProvider.NewInceptionWeights(input, info), - WeightsProvider.NewBiases(info.OutputChannels, biasMode), + WeightsProvider.NewBiases(info.ConvolutionKernels, biasMode), ActivationFunctionType.ReLU) { _OperationInfo = info; @@ -266,9 +305,9 @@ public override void Forward(in Tensor x, out Tensor z, out Tensor a) } // 1x1 convolution - using (DeviceMemory _1x1Output_gpu = DnnInstance.Gpu.AllocateDevice(x.Entities * InputInfo.SliceSize * OperationInfo.Chained1x1AfterPoolingConvolutionKernels)) + using (DeviceMemory _1x1Output_gpu = DnnInstance.Gpu.AllocateDevice(x.Entities * InputInfo.SliceSize)) // TODO { - _1x1OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OperationInfo.Chained1x1AfterPoolingConvolutionKernels, InputInfo.Height, InputInfo.Width); + _1x1OutputDescription.Set4D(DataType.FLOAT, 
TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, -1, InputInfo.Height, InputInfo.Width); DnnInstance.GetConvolutionForwardAlgorithm(InputDescription, Secondary1x1FilterDescription, _1x1ConvolutionDescription, Secondary1x1OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionFwdAlgo algorithm); DnnInstance.GetConvolutionForwardWorkspaceSize(InputDescription, Secondary1x1FilterDescription, _1x1ConvolutionDescription, Secondary1x1OutputDescription, algorithm, out IntPtr size); using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) @@ -276,11 +315,11 @@ public override void Forward(in Tensor x, out Tensor z, out Tensor a) DnnInstance.ConvolutionForward(1, InputDescription, y_gpu.Ptr, Secondary1x1FilterDescription, w_gpu.Ptr + InputInfo.Channels * OperationInfo.Primary1x1ConvolutionKernels + 3 * 3 * OperationInfo.Primary1x1ConvolutionKernels * OperationInfo.Secondary3x3ConvolutionKernels + 5 * 5 * OperationInfo.Primary1x1ConvolutionKernels * OperationInfo.Secondary5x5ConvolutionKernels, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, Secondary1x1OutputDescription, _1x1Output_gpu.Ptr); } DnnInstance.AddTensor(1, Secondary1x1BiasDescription, b_gpu.Ptr + OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Secondary5x5ConvolutionKernels, 1, Secondary1x1OutputDescription, _1x1Output_gpu.Ptr); - _1x1Output_gpu.CopyToRows(z, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Secondary5x5ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Chained1x1AfterPoolingConvolutionKernels); + _1x1Output_gpu.CopyToRows(z, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Secondary5x5ConvolutionKernels), InputInfo.SliceSize); // TODO // 1x1 convolution activation DnnInstance.ActivationForward(ActivationDescription, 1, Secondary1x1OutputDescription, _1x1Output_gpu.Ptr, 0, Secondary1x1OutputDescription, _1x1Output_gpu.Ptr); - _1x1Output_gpu.CopyToRows(a, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Secondary5x5ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Chained1x1AfterPoolingConvolutionKernels); + _1x1Output_gpu.CopyToRows(a, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Secondary5x5ConvolutionKernels), InputInfo.SliceSize); // TODO } } } diff --git a/NeuralNetwork.NET/APIs/Structs/InceptionInfo.cs b/NeuralNetwork.NET/APIs/Structs/InceptionInfo.cs index 676b86a..2a69aab 100644 --- a/NeuralNetwork.NET/APIs/Structs/InceptionInfo.cs +++ b/NeuralNetwork.NET/APIs/Structs/InceptionInfo.cs @@ -19,11 +19,21 @@ namespace NeuralNetworkNET.APIs.Structs /// public readonly int Primary1x1ConvolutionKernels; + /// + /// The number of 1x1 convolution kernels before the 3x3 convolution + /// + public readonly int Primary3x3Reduce1x1ConvolutionKernels; + /// /// The number of 3x3 convolution kernels /// public readonly int Secondary3x3ConvolutionKernels; + /// + /// The number of 1x1 convolution kernels before the 5x5 convolution + /// + public readonly int Primary5x5Reduce1x1ConvolutionKernels; + /// /// The number of 5x5 convolution kernels /// @@ -37,7 +47,7 @@ namespace NeuralNetworkNET.APIs.Structs /// /// The number of 1x1 convolution kernels after the pooling operation 
/// - public readonly int Chained1x1AfterPoolingConvolutionKernels; + public readonly int Secondary1x1AfterPoolingConvolutionKernels; /// /// Gets the number of output channels after the depth concatenation /// @@ -46,7 +56,17 @@ public int OutputChannels { [Pure] [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => Primary1x1ConvolutionKernels + Secondary3x3ConvolutionKernels + Secondary5x5ConvolutionKernels + Chained1x1AfterPoolingConvolutionKernels; + get => Primary1x1ConvolutionKernels + Secondary3x3ConvolutionKernels + Secondary5x5ConvolutionKernels + Secondary1x1AfterPoolingConvolutionKernels; + } + + /// + /// Gets the total number of convolution kernels for the current instance + /// + public int ConvolutionKernels + { + [Pure] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => Primary1x1ConvolutionKernels + Primary3x3Reduce1x1ConvolutionKernels + Secondary3x3ConvolutionKernels + Primary5x5Reduce1x1ConvolutionKernels + Secondary5x5ConvolutionKernels + Secondary1x1AfterPoolingConvolutionKernels; } #endregion @@ -54,12 +74,14 @@ public int OutputChannels #region Constructors // Internal constructor - private InceptionInfo(int _1x1Kernels, int _3x3Kernels, int _5x5Kernels, PoolingMode poolingMode, int _1x1SecondaryKernels) + private InceptionInfo(int _1x1Kernels, int _3x3Reduce1x1Kernels, int _3x3Kernels, int _5x5Reduce1x1Kernels, int _5x5Kernels, PoolingMode poolingMode, int _1x1SecondaryKernels) { Primary1x1ConvolutionKernels = _1x1Kernels >= 1 ? _1x1Kernels : throw new ArgumentOutOfRangeException(nameof(_1x1Kernels), "The number of 1x1 kernels must be at least 1"); + Primary3x3Reduce1x1ConvolutionKernels = _3x3Reduce1x1Kernels >= 1 ? _3x3Reduce1x1Kernels : throw new ArgumentOutOfRangeException(nameof(_3x3Reduce1x1Kernels), "The number of 3x3 reduction 1x1 kernels must be at least 1"); Secondary3x3ConvolutionKernels = _3x3Kernels >= 1 ? _3x3Kernels : throw new ArgumentOutOfRangeException(nameof(_3x3Kernels), "The number of 3x3 kernels must be at least 1"); + Primary5x5Reduce1x1ConvolutionKernels = _5x5Reduce1x1Kernels >= 1 ? _5x5Reduce1x1Kernels : throw new ArgumentOutOfRangeException(nameof(_5x5Reduce1x1Kernels), "The number of 5x5 reduction 1x1 kernels must be at least 1"); Secondary5x5ConvolutionKernels = _5x5Kernels >= 1 ? _5x5Kernels : throw new ArgumentOutOfRangeException(nameof(_5x5Kernels), "The number of 5x5 kernels must be at least 1"); - Chained1x1AfterPoolingConvolutionKernels = _1x1SecondaryKernels >= 1 ? _1x1SecondaryKernels : throw new ArgumentOutOfRangeException(nameof(_1x1SecondaryKernels), "The number of secondary 1x1 kernels must be at least 1"); + Secondary1x1AfterPoolingConvolutionKernels = _1x1SecondaryKernels >= 1 ? 
_1x1SecondaryKernels : throw new ArgumentOutOfRangeException(nameof(_1x1SecondaryKernels), "The number of secondary 1x1 kernels must be at least 1"); Pooling = poolingMode; } @@ -67,16 +89,18 @@ private InceptionInfo(int _1x1Kernels, int _3x3Kernels, int _5x5Kernels, Pooling /// Creates a new inception layer description with the input parameters /// /// The number of 1x1 primary convolution kernels + /// The number of 3x3 reduction 1x1 kernels /// The number of 3x3 convolution kernels + /// The number of 5x5 reduction 1x1 kernels /// The number of 5x5 convolution kernels /// The pooling mode for the pooling pipeline /// The number of secondary 1x1 convolution kernels [PublicAPI] [Pure] public static InceptionInfo New( - int _1x1Kernels, int _3x3Kernels, int _5x5Kernels, + int _1x1Kernels, int _3x3Reduce1x1Kernels, int _3x3Kernels, int _5x5Reduce1x1Kernels, int _5x5Kernels, PoolingMode poolingMode, int _1x1SecondaryKernels) - => new InceptionInfo(_1x1Kernels, _3x3Kernels, _5x5Kernels, poolingMode, _1x1SecondaryKernels); + => new InceptionInfo(_1x1Kernels, _3x3Reduce1x1Kernels, _3x3Kernels, _5x5Reduce1x1Kernels, _5x5Kernels, poolingMode, _1x1SecondaryKernels); #endregion @@ -95,9 +119,11 @@ public override int GetHashCode() unchecked { hash = hash * 31 + Primary1x1ConvolutionKernels; - hash = hash * 31 + Chained1x1AfterPoolingConvolutionKernels; + hash = hash * 31 + Primary3x3Reduce1x1ConvolutionKernels; hash = hash * 31 + Secondary3x3ConvolutionKernels; + hash = hash * 31 + Primary5x5Reduce1x1ConvolutionKernels; hash = hash * 31 + Secondary5x5ConvolutionKernels; + hash = hash * 31 + Secondary1x1AfterPoolingConvolutionKernels; hash = hash * 31 + (int)Pooling; } return hash; @@ -105,9 +131,11 @@ public override int GetHashCode() [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool operator ==(in InceptionInfo a, in InceptionInfo b) => a.Primary1x1ConvolutionKernels == b.Primary1x1ConvolutionKernels && - a.Chained1x1AfterPoolingConvolutionKernels == b.Chained1x1AfterPoolingConvolutionKernels && + a.Primary3x3Reduce1x1ConvolutionKernels == b.Primary3x3Reduce1x1ConvolutionKernels && a.Secondary3x3ConvolutionKernels == b.Secondary3x3ConvolutionKernels && + a.Primary5x5Reduce1x1ConvolutionKernels == b.Primary5x5Reduce1x1ConvolutionKernels && a.Secondary5x5ConvolutionKernels == b.Secondary5x5ConvolutionKernels && + a.Secondary1x1AfterPoolingConvolutionKernels == b.Secondary1x1AfterPoolingConvolutionKernels && a.Pooling == b.Pooling; [MethodImpl(MethodImplOptions.AggressiveInlining)] diff --git a/NeuralNetwork.NET/Networks/Implementations/Layers/Helpers/WeightsProvider.cs b/NeuralNetwork.NET/Networks/Implementations/Layers/Helpers/WeightsProvider.cs index 72a6722..572ef96 100644 --- a/NeuralNetwork.NET/Networks/Implementations/Layers/Helpers/WeightsProvider.cs +++ b/NeuralNetwork.NET/Networks/Implementations/Layers/Helpers/WeightsProvider.cs @@ -78,19 +78,25 @@ public static unsafe float[] NewInceptionWeights(in TensorInfo input, in Incepti { int _1x1Length = input.Channels * info.Primary1x1ConvolutionKernels, - _3x3Length = 3 * 3 * info.Primary1x1ConvolutionKernels * info.Secondary3x3ConvolutionKernels, - _5x5Length = 5 * 5 * info.Primary1x1ConvolutionKernels * info.Secondary5x5ConvolutionKernels, - secondary1x1Length = input.Channels * info.Chained1x1AfterPoolingConvolutionKernels; + _3x3Reduce1x1Length = input.Channels * info.Primary3x3Reduce1x1ConvolutionKernels, + _3x3Length = 3 * 3 * info.Primary3x3Reduce1x1ConvolutionKernels * info.Secondary3x3ConvolutionKernels, + 
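
With the two reduction parameters in place, a full GoogLeNet-style module can be described in one call. A sketch using the kernel counts of the paper's inception (3a) block; the PoolingMode member name is an assumption:

    // inception (3a): 64 1x1, 96 -> 128 3x3, 16 -> 32 5x5, pool -> 32 1x1
    InceptionInfo info = InceptionInfo.New(
        _1x1Kernels: 64,
        _3x3Reduce1x1Kernels: 96, _3x3Kernels: 128,
        _5x5Reduce1x1Kernels: 16, _5x5Kernels: 32,
        poolingMode: PoolingMode.AverageExcludingPadding, // assumed member name
        _1x1SecondaryKernels: 32);
    // info.OutputChannels     == 64 + 128 + 32 + 32 == 256
    // info.ConvolutionKernels == 64 + 96 + 128 + 16 + 32 + 32 == 368
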
_5x5Reduce1x1Length = input.Channels * info.Primary5x5Reduce1x1ConvolutionKernels, + _5x5Length = 5 * 5 * info.Primary5x5Reduce1x1ConvolutionKernels * info.Secondary5x5ConvolutionKernels, + secondary1x1Length = input.Channels * info.Secondary1x1AfterPoolingConvolutionKernels; - float[] weights = new float[_1x1Length + _3x3Length + _5x5Length + secondary1x1Length]; + float[] weights = new float[_1x1Length + _3x3Reduce1x1Length + _3x3Length + _5x5Reduce1x1Length + _5x5Length + secondary1x1Length]; fixed (float* pw = weights) { Tensor.Reshape(pw, 1, _1x1Length, out Tensor wTensor); KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, input.Channels); - Tensor.Reshape(pw + _1x1Length, 1, _3x3Length, out wTensor); - KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, 3 * 3 * info.Primary1x1ConvolutionKernels); - Tensor.Reshape(pw + _1x1Length + _3x3Length, 1, _5x5Length, out wTensor); - KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, 5 * 5 * info.Primary1x1ConvolutionKernels); - Tensor.Reshape(pw + _1x1Length + _3x3Length + _5x5Length, 1, secondary1x1Length, out wTensor); + Tensor.Reshape(pw + _1x1Length, 1, _3x3Reduce1x1Length, out wTensor); + KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, input.Channels); + Tensor.Reshape(pw + _1x1Length + _3x3Reduce1x1Length, 1, _3x3Length, out wTensor); + KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, 3 * 3 * info.Primary3x3Reduce1x1ConvolutionKernels); + Tensor.Reshape(pw + _1x1Length + _3x3Reduce1x1Length + _3x3Length, 1, _5x5Reduce1x1Length, out wTensor); + KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, input.Channels); + Tensor.Reshape(pw + _1x1Length + _3x3Reduce1x1Length + _3x3Length + _5x5Reduce1x1Length, 1, _5x5Length, out wTensor); + KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, 5 * 5 * info.Primary5x5Reduce1x1ConvolutionKernels); + Tensor.Reshape(pw + _1x1Length + _3x3Reduce1x1Length + _3x3Length + _5x5Reduce1x1Length + _5x5Length, 1, secondary1x1Length, out wTensor); KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, input.Channels); } return weights; From 801b4caa40dce467c18e201c717dd10e378e528c Mon Sep 17 00:00:00 2001 From: Sergio0694 Date: Mon, 25 Dec 2017 17:11:19 +0100 Subject: [PATCH 13/30] Inception layer forward method implemented --- .../Layers/CuDnnInceptionLayer.cs | 179 +++++++++++++----- 1 file changed, 130 insertions(+), 49 deletions(-) diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs index 5875030..714bfb0 100644 --- a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs +++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs @@ -36,6 +36,53 @@ public ref readonly InceptionInfo OperationInfo get => ref _OperationInfo; } + #endregion + + #region Weights info + + // 1x1 convolution weights on first pipeline + private int _1x1Weights + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => InputInfo.Channels * OperationInfo.Primary1x1ConvolutionKernels; + } + + // 1x1 convolution weights on 3x3 pipeline + private int _3x3Reduce1x1Weights + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => InputInfo.Channels * OperationInfo.Primary3x3Reduce1x1ConvolutionKernels; + } + + // 3x3 convolution weights + private int _3x3Weights + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => 3 * 3 * OperationInfo.Primary3x3Reduce1x1ConvolutionKernels * OperationInfo.Secondary3x3ConvolutionKernels; + } + + // 1x1 convolution weights on 5x5 pipeline + private int _5x5Reduce1x1Weights + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => InputInfo.Channels * OperationInfo.Primary5x5Reduce1x1ConvolutionKernels; + } + + // 5x5 convolution 
weights + private int _5x5Weights + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => 5 * 5 * OperationInfo.Primary5x5Reduce1x1ConvolutionKernels * OperationInfo.Secondary5x5ConvolutionKernels; + } + + // 1x1 convolution weights on pooling pipeline + private int Secondary1x1Weights + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => InputInfo.Channels * OperationInfo.Secondary1x1AfterPoolingConvolutionKernels; + } + + #endregion #region cuDNN fields @@ -227,6 +274,7 @@ internal CuDnnInceptionLayer(in TensorInfo input, in InceptionInfo info, [NotNul #region Implementation + /// public override void Forward(in Tensor x, out Tensor z, out Tensor a) { Tensor.New(x.Entities, OutputInfo.Size, out z); @@ -235,63 +283,94 @@ public override void Forward(in Tensor x, out Tensor z, out Tensor a) w_gpu = DnnInstance.Gpu.AllocateDevice(Weights), b_gpu = DnnInstance.Gpu.AllocateDevice(Biases)) { - using (DeviceMemory _1x1Output_gpu = DnnInstance.Gpu.AllocateDevice(x.Entities * InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels)) + // Pointers + deviceptr pw_gpu = w_gpu.Ptr, pb_gpu = b_gpu.Ptr; + + // First 1x1 convolution + using (DeviceMemory y_gpu = DnnInstance.Gpu.AllocateDevice(x.Entities * InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels)) { - // First 1x1 convolution + // Descriptors setup and first 1x1 convolution + InputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, InputInfo.Channels, InputInfo.Height, InputInfo.Width); + _1x1OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OperationInfo.Primary1x1ConvolutionKernels, InputInfo.Height, InputInfo.Width); + DnnInstance.GetConvolutionForwardAlgorithm(InputDescription, _1x1FilterDescription, _1x1ConvolutionDescription, _1x1OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionFwdAlgo algorithm); + DnnInstance.GetConvolutionForwardWorkspaceSize(InputDescription, _1x1FilterDescription, _1x1ConvolutionDescription, _1x1OutputDescription, algorithm, out IntPtr size); using (DeviceMemory x_gpu = DnnInstance.Gpu.AllocateDevice(x)) + using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) { - // Descriptors setup and first 1x1 convolution - InputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, InputInfo.Channels, InputInfo.Height, InputInfo.Width); - _1x1OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OperationInfo.Primary1x1ConvolutionKernels, InputInfo.Height, InputInfo.Width); - DnnInstance.GetConvolutionForwardAlgorithm(InputDescription, _1x1FilterDescription, _1x1ConvolutionDescription, _1x1OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionFwdAlgo algorithm); - DnnInstance.GetConvolutionForwardWorkspaceSize(InputDescription, _1x1FilterDescription, _1x1ConvolutionDescription, _1x1OutputDescription, algorithm, out IntPtr size); - using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) - { - DnnInstance.ConvolutionForward(1, InputDescription, x_gpu.Ptr, _1x1FilterDescription, w_gpu.Ptr, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _1x1OutputDescription, _1x1Output_gpu.Ptr); - } - DnnInstance.AddTensor(1, _1x1BiasDescription, b_gpu.Ptr, 1, _1x1OutputDescription, _1x1Output_gpu.Ptr); - _1x1Output_gpu.CopyToRows(z, 0, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels); + DnnInstance.ConvolutionForward(1, InputDescription, 
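
These section sizes drive the pointer walking in the forward pass below: pw_gpu and pb_gpu start at the first 1x1 filter group and each ConvolutionForward/AddTensor call advances them past the group it consumes. The resulting layout of the flat buffers, as a sketch with illustrative local names:

    // Float offsets of each filter group inside the flat Weights buffer
    int w1x1       = 0;
    int w3x3Reduce = w1x1 + _1x1Weights;
    int w3x3       = w3x3Reduce + _3x3Reduce1x1Weights;
    int w5x5Reduce = w3x3 + _3x3Weights;
    int w5x5       = w5x5Reduce + _5x5Reduce1x1Weights;
    int wPool1x1   = w5x5 + _5x5Weights;
    // Biases follow the same group order, one float per kernel in each group
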
x_gpu.Ptr, _1x1FilterDescription, pw_gpu, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _1x1OutputDescription, y_gpu.Ptr); + } + DnnInstance.AddTensor(1, _1x1BiasDescription, pb_gpu, 1, _1x1OutputDescription, y_gpu.Ptr); + y_gpu.CopyToRows(z, 0, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels); - // 1x1 convolution activation - DnnInstance.ActivationForward(ActivationDescription, 1, _1x1OutputDescription, _1x1Output_gpu.Ptr, 0, _1x1OutputDescription, _1x1Output_gpu.Ptr); - _1x1Output_gpu.CopyToRows(a, 0, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels); + // 1x1 convolution activation + DnnInstance.ActivationForward(ActivationDescription, 1, _1x1OutputDescription, y_gpu.Ptr, 0, _1x1OutputDescription, y_gpu.Ptr); + y_gpu.CopyToRows(a, 0, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels); + } + + // 1x1 + 3x3 convolution + using (DeviceMemory + y1x1_gpu = DnnInstance.Gpu.AllocateDevice(x.Entities * InputInfo.SliceSize * OperationInfo.Primary3x3Reduce1x1ConvolutionKernels), + y_gpu = DnnInstance.Gpu.AllocateDevice(x.Entities * InputInfo.SliceSize * OperationInfo.Secondary3x3ConvolutionKernels)) + { + // 1x1 convolution + _3x3Reduce1x1OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OperationInfo.Primary3x3Reduce1x1ConvolutionKernels, InputInfo.Height, InputInfo.Width); + DnnInstance.GetConvolutionForwardAlgorithm(InputDescription, _3x3Reduce1x1FilterDescription, _1x1ConvolutionDescription, _3x3Reduce1x1OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionFwdAlgo algorithm); + DnnInstance.GetConvolutionForwardWorkspaceSize(InputDescription, _3x3Reduce1x1FilterDescription, _1x1ConvolutionDescription, _3x3Reduce1x1OutputDescription, algorithm, out IntPtr size); + using (DeviceMemory x_gpu = DnnInstance.Gpu.AllocateDevice(x)) + using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) + { + DnnInstance.ConvolutionForward(1, InputDescription, x_gpu.Ptr, _3x3Reduce1x1FilterDescription, pw_gpu += _1x1Weights, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _3x3Reduce1x1OutputDescription, y1x1_gpu.Ptr); } + DnnInstance.AddTensor(1, _3x3Reduce1x1BiasDescription, pb_gpu += OperationInfo.Primary1x1ConvolutionKernels, 1, _3x3Reduce1x1OutputDescription, y1x1_gpu.Ptr); + DnnInstance.ActivationForward(ActivationDescription, 1, _3x3Reduce1x1OutputDescription, y1x1_gpu.Ptr, 0, _3x3Reduce1x1OutputDescription, y1x1_gpu.Ptr); // 3x3 convolution - using (DeviceMemory _3x3Output_gpu = DnnInstance.Gpu.AllocateDevice(x.Entities * InputInfo.SliceSize * OperationInfo.Secondary3x3ConvolutionKernels)) + _3x3OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OperationInfo.Secondary3x3ConvolutionKernels, InputInfo.Height, InputInfo.Width); + DnnInstance.GetConvolutionForwardAlgorithm(_3x3Reduce1x1OutputDescription, _3x3FilterDescription, _3x3ConvolutionDescription, _3x3OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out algorithm); + DnnInstance.GetConvolutionForwardWorkspaceSize(_3x3Reduce1x1OutputDescription, _3x3FilterDescription, _3x3ConvolutionDescription, _3x3OutputDescription, algorithm, out size); + using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) { - _3x3OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OperationInfo.Secondary3x3ConvolutionKernels, InputInfo.Height, InputInfo.Width); - 
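
The reduce step is what keeps the wide kernels affordable: the 1x1 stage shrinks the channel count before the 3x3 and 5x5 filters see the data. With the inception (3a) counts from the earlier sketch and a 192-channel input:

    // Weight counts for the 5x5 pipeline, direct vs reduced:
    //   direct:  5 * 5 * 192 * 32           = 153,600
    //   reduced: 192 * 16 + 5 * 5 * 16 * 32 =  15,872
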
DnnInstance.GetConvolutionForwardAlgorithm(_1x1OutputDescription, _3x3FilterDescription, _3x3ConvolutionDescription, _3x3OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionFwdAlgo algorithm); - DnnInstance.GetConvolutionForwardWorkspaceSize(_1x1OutputDescription, _3x3FilterDescription, _3x3ConvolutionDescription, _3x3OutputDescription, algorithm, out IntPtr size); - using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) - { - DnnInstance.ConvolutionForward(1, _1x1OutputDescription, _1x1Output_gpu.Ptr, _3x3FilterDescription, w_gpu.Ptr + InputInfo.Channels * OperationInfo.Primary1x1ConvolutionKernels, _3x3ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _3x3OutputDescription, _3x3Output_gpu.Ptr); - } - DnnInstance.AddTensor(1, _3x3BiasDescription, b_gpu.Ptr + OperationInfo.Primary1x1ConvolutionKernels, 1, _3x3OutputDescription, _3x3Output_gpu.Ptr); - _3x3Output_gpu.CopyToRows(z, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels, InputInfo.SliceSize * OperationInfo.Secondary3x3ConvolutionKernels); + DnnInstance.ConvolutionForward(1, _3x3Reduce1x1OutputDescription, y1x1_gpu.Ptr, _3x3FilterDescription, pw_gpu += _3x3Reduce1x1Weights, _3x3ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _3x3OutputDescription, y_gpu.Ptr); + } + DnnInstance.AddTensor(1, _3x3BiasDescription, pb_gpu += OperationInfo.Primary3x3Reduce1x1ConvolutionKernels, 1, _3x3OutputDescription, y_gpu.Ptr); + y_gpu.CopyToRows(z, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels, InputInfo.SliceSize * OperationInfo.Secondary3x3ConvolutionKernels); + + // Activation + DnnInstance.ActivationForward(ActivationDescription, 1, _3x3OutputDescription, y_gpu.Ptr, 0, _3x3OutputDescription, y_gpu.Ptr); + y_gpu.CopyToRows(a, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels, InputInfo.SliceSize * OperationInfo.Secondary3x3ConvolutionKernels); + } - // 3x3 convolution activation - DnnInstance.ActivationForward(ActivationDescription, 1, _3x3OutputDescription, _3x3Output_gpu.Ptr, 0, _3x3OutputDescription, _3x3Output_gpu.Ptr); - _3x3Output_gpu.CopyToRows(a, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels, InputInfo.SliceSize * OperationInfo.Secondary3x3ConvolutionKernels); + // 1x1 + 5x5 convolution + using (DeviceMemory + y1x1_gpu = DnnInstance.Gpu.AllocateDevice(x.Entities * InputInfo.SliceSize * OperationInfo.Primary5x5Reduce1x1ConvolutionKernels), + y_gpu = DnnInstance.Gpu.AllocateDevice(x.Entities * InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels)) + { + // 1x1 convolution + _5x5Reduce1x1OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OperationInfo.Primary5x5Reduce1x1ConvolutionKernels, InputInfo.Height, InputInfo.Width); + DnnInstance.GetConvolutionForwardAlgorithm(InputDescription, _5x5Reduce1x1FilterDescription, _1x1ConvolutionDescription, _5x5Reduce1x1OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionFwdAlgo algorithm); + DnnInstance.GetConvolutionForwardWorkspaceSize(InputDescription, _5x5Reduce1x1FilterDescription, _1x1ConvolutionDescription, _5x5Reduce1x1OutputDescription, algorithm, out IntPtr size); + using (DeviceMemory x_gpu = DnnInstance.Gpu.AllocateDevice(x)) + using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) + { + DnnInstance.ConvolutionForward(1, InputDescription, x_gpu.Ptr, _5x5Reduce1x1FilterDescription, pw_gpu += _3x3Weights, _5x5ConvolutionDescription, algorithm, 
workspace_gpu.Ptr, size, 0, _5x5Reduce1x1OutputDescription, y1x1_gpu.Ptr); } + DnnInstance.AddTensor(1, _5x5Reduce1x1BiasDescription, pb_gpu += OperationInfo.Secondary3x3ConvolutionKernels, 1, _5x5Reduce1x1OutputDescription, y1x1_gpu.Ptr); + DnnInstance.ActivationForward(ActivationDescription, 1, _5x5Reduce1x1OutputDescription, y1x1_gpu.Ptr, 0, _5x5Reduce1x1OutputDescription, y1x1_gpu.Ptr); // 5x5 convolution - using (DeviceMemory _5x5Output_gpu = DnnInstance.Gpu.AllocateDevice(x.Entities * InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels)) + _5x5OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OperationInfo.Secondary5x5ConvolutionKernels, InputInfo.Height, InputInfo.Width); + DnnInstance.GetConvolutionForwardAlgorithm(_5x5Reduce1x1OutputDescription, _5x5FilterDescription, _5x5ConvolutionDescription, _5x5OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out algorithm); + DnnInstance.GetConvolutionForwardWorkspaceSize(_5x5Reduce1x1OutputDescription, _5x5FilterDescription, _5x5ConvolutionDescription, _5x5OutputDescription, algorithm, out size); + using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) { - _5x5OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OperationInfo.Secondary5x5ConvolutionKernels, InputInfo.Height, InputInfo.Width); - DnnInstance.GetConvolutionForwardAlgorithm(_1x1OutputDescription, _5x5FilterDescription, _5x5ConvolutionDescription, _5x5OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionFwdAlgo algorithm); - DnnInstance.GetConvolutionForwardWorkspaceSize(_1x1OutputDescription, _5x5FilterDescription, _5x5ConvolutionDescription, _5x5OutputDescription, algorithm, out IntPtr size); - using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) - { - DnnInstance.ConvolutionForward(1, _1x1OutputDescription, _1x1Output_gpu.Ptr, _5x5FilterDescription, w_gpu.Ptr + InputInfo.Channels * OperationInfo.Primary1x1ConvolutionKernels + 3 * 3 * OperationInfo.Primary1x1ConvolutionKernels * OperationInfo.Secondary3x3ConvolutionKernels, _5x5ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _5x5OutputDescription, _5x5Output_gpu.Ptr); - } - DnnInstance.AddTensor(1, _5x5BiasDescription, b_gpu.Ptr + OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels, 1, _5x5OutputDescription, _5x5Output_gpu.Ptr); - _5x5Output_gpu.CopyToRows(z, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels); - - // 3x3 convolution activation - DnnInstance.ActivationForward(ActivationDescription, 1, _5x5OutputDescription, _5x5Output_gpu.Ptr, 0, _5x5OutputDescription, _5x5Output_gpu.Ptr); - _5x5Output_gpu.CopyToRows(a, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels); + DnnInstance.ConvolutionForward(1, _5x5Reduce1x1OutputDescription, y1x1_gpu.Ptr, _5x5FilterDescription, pw_gpu += _5x5Reduce1x1Weights, _5x5ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _5x5OutputDescription, y_gpu.Ptr); } + DnnInstance.AddTensor(1, _5x5BiasDescription, pb_gpu += OperationInfo.Primary5x5Reduce1x1ConvolutionKernels, 1, _5x5OutputDescription, y_gpu.Ptr); + y_gpu.CopyToRows(z, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + 
OperationInfo.Secondary3x3ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels); + + // Activation + DnnInstance.ActivationForward(ActivationDescription, 1, _3x3OutputDescription, y_gpu.Ptr, 0, _3x3OutputDescription, y_gpu.Ptr); + y_gpu.CopyToRows(a, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels); } // Pooling pipeline @@ -305,31 +384,33 @@ public override void Forward(in Tensor x, out Tensor z, out Tensor a) } // 1x1 convolution - using (DeviceMemory _1x1Output_gpu = DnnInstance.Gpu.AllocateDevice(x.Entities * InputInfo.SliceSize)) // TODO + using (DeviceMemory _1x1Output_gpu = DnnInstance.Gpu.AllocateDevice(x.Entities * InputInfo.SliceSize * OperationInfo.Secondary1x1AfterPoolingConvolutionKernels)) { - _1x1OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, -1, InputInfo.Height, InputInfo.Width); + Secondary1x1OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, InputInfo.Channels, InputInfo.Height, InputInfo.Width); DnnInstance.GetConvolutionForwardAlgorithm(InputDescription, Secondary1x1FilterDescription, _1x1ConvolutionDescription, Secondary1x1OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionFwdAlgo algorithm); DnnInstance.GetConvolutionForwardWorkspaceSize(InputDescription, Secondary1x1FilterDescription, _1x1ConvolutionDescription, Secondary1x1OutputDescription, algorithm, out IntPtr size); using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) { - DnnInstance.ConvolutionForward(1, InputDescription, y_gpu.Ptr, Secondary1x1FilterDescription, w_gpu.Ptr + InputInfo.Channels * OperationInfo.Primary1x1ConvolutionKernels + 3 * 3 * OperationInfo.Primary1x1ConvolutionKernels * OperationInfo.Secondary3x3ConvolutionKernels + 5 * 5 * OperationInfo.Primary1x1ConvolutionKernels * OperationInfo.Secondary5x5ConvolutionKernels, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, Secondary1x1OutputDescription, _1x1Output_gpu.Ptr); + DnnInstance.ConvolutionForward(1, InputDescription, y_gpu.Ptr, Secondary1x1FilterDescription, pw_gpu += _5x5Weights, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, Secondary1x1OutputDescription, _1x1Output_gpu.Ptr); } - DnnInstance.AddTensor(1, Secondary1x1BiasDescription, b_gpu.Ptr + OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Secondary5x5ConvolutionKernels, 1, Secondary1x1OutputDescription, _1x1Output_gpu.Ptr); - _1x1Output_gpu.CopyToRows(z, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Secondary5x5ConvolutionKernels), InputInfo.SliceSize); // TODO + DnnInstance.AddTensor(1, Secondary1x1BiasDescription, pb_gpu += OperationInfo.Secondary5x5ConvolutionKernels, 1, Secondary1x1OutputDescription, _1x1Output_gpu.Ptr); + _1x1Output_gpu.CopyToRows(z, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Secondary5x5ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary1x1AfterPoolingConvolutionKernels); // 1x1 convolution activation DnnInstance.ActivationForward(ActivationDescription, 1, Secondary1x1OutputDescription, _1x1Output_gpu.Ptr, 0, Secondary1x1OutputDescription, _1x1Output_gpu.Ptr); - _1x1Output_gpu.CopyToRows(a, 
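
All four pipelines write disjoint column bands of the same output rows, which is what produces the depth concatenation. Per sample, the band offsets used by the CopyToRows calls are (a sketch, illustrative locals):

    int slice      = InputInfo.SliceSize;                                // height * width per channel
    int offset1x1  = 0;                                                  // first 1x1 band
    int offset3x3  = slice * OperationInfo.Primary1x1ConvolutionKernels; // 3x3 band
    int offset5x5  = offset3x3 + slice * OperationInfo.Secondary3x3ConvolutionKernels;
    int offsetPool = offset5x5 + slice * OperationInfo.Secondary5x5ConvolutionKernels;
    // offsetPool + slice * OperationInfo.Secondary1x1AfterPoolingConvolutionKernels == OutputInfo.Size
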
InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Secondary5x5ConvolutionKernels), InputInfo.SliceSize); // TODO + _1x1Output_gpu.CopyToRows(a, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Secondary5x5ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary1x1AfterPoolingConvolutionKernels); } } } } + /// public override void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFunction activationPrime) { throw new NotImplementedException(); } + /// public override void ComputeGradient(in Tensor a, in Tensor delta, out Tensor dJdw, out Tensor dJdb) { throw new NotImplementedException(); From 7e6366bbda65f29c71ed5815410a3d131b60228c Mon Sep 17 00:00:00 2001 From: Sergio0694 Date: Tue, 26 Dec 2017 00:13:35 +0100 Subject: [PATCH 14/30] Inception backpropagation 90% completed --- .../Layers/CuDnnInceptionLayer.cs | 188 ++++++++++++++++-- 1 file changed, 172 insertions(+), 16 deletions(-) diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs index 714bfb0..009b273 100644 --- a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs +++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs @@ -18,7 +18,7 @@ namespace NeuralNetworkNET.Cuda.Layers /// /// A simplified inception module, with 4 pipelines combining 1x1 convolution, 1x1 + 3x3, 1x1 + 5x5 and pooling + 1x1 /// - internal sealed class CuDnnInceptionLayer : WeightedLayerBase + internal sealed class CuDnnInceptionLayer : WeightedLayerBase, IDisposable { #region Parameters @@ -38,7 +38,7 @@ public ref readonly InceptionInfo OperationInfo #endregion - #region Weights info + #region Private fields and parameters // 1x1 convolution weights on first pipeline private int _1x1Weights @@ -82,6 +82,32 @@ private int Secondary1x1Weights get => InputInfo.Channels * OperationInfo.Secondary1x1AfterPoolingConvolutionKernels; } + // 3x3 reduction 1x1 convolution activity + private Tensor _3x3Reduce1x1Z; + + // 3x3 reduction 1x1 convolution activation + private Tensor _3x3Reduce1x1A; + + // 3x3 reduction 1x1 convolution output delta + private Tensor _3x3Reduce1x1Delta; + + // 5x5 reduction 1x1 convolution activity + private Tensor _5x5Reduce1x1Z; + + // 5x5 reduction 1x1 convolution activation + private Tensor _5x5Reduce1x1A; + + // 5x5 reduction 1x1 convolution output delta + private Tensor _5x5Reduce1x1Delta; + + // Pooling output activity + private Tensor PoolingZ; + + // Pooling output activation + private Tensor PoolingA; + + // Pooling output delta + private Tensor PoolingDelta; #endregion @@ -207,10 +233,6 @@ private int Secondary1x1Weights #endregion - // The shared ReLU activation description for the current layer - [NotNull] - private readonly ActivationDescriptor ActivationDescription = new ActivationDescriptor(); - /// /// Gets the instance for the current layer /// @@ -248,9 +270,6 @@ private void SetupCuDnnInfo() // Secondary 1x1 convolution Secondary1x1FilterDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, InputInfo.Channels, _OperationInfo.Secondary1x1AfterPoolingConvolutionKernels, 1, 1); Secondary1x1BiasDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, 1, _OperationInfo.Secondary1x1AfterPoolingConvolutionKernels, 1, 1); - - // Activation - ActivationDescription.Set(ActivationMode.RELU, NanPropagation.PROPAGATE_NAN, 0); } #endregion @@ -303,7 +322,7 @@ public override 
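
Backpropagating through the module needs the activity and activation of the inner reduce convolutions and of the pooling stage, which the shared layer infrastructure never sees, so the hunks below cache them in fields on every forward pass, freeing the previous buffers first. The new IDisposable declaration implies a matching cleanup; the patch cuts off before showing it, so the following is only an assumed sketch:

    // Assumed Dispose shape: release the cached per-pass tensors (not shown in this patch)
    public void Dispose()
    {
        _3x3Reduce1x1Z.TryFree(); _3x3Reduce1x1A.TryFree(); _3x3Reduce1x1Delta.TryFree();
        _5x5Reduce1x1Z.TryFree(); _5x5Reduce1x1A.TryFree(); _5x5Reduce1x1Delta.TryFree();
        PoolingZ.TryFree(); PoolingA.TryFree(); PoolingDelta.TryFree();
    }
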
void Forward(in Tensor x, out Tensor z, out Tensor a) y_gpu.CopyToRows(z, 0, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels); // 1x1 convolution activation - DnnInstance.ActivationForward(ActivationDescription, 1, _1x1OutputDescription, y_gpu.Ptr, 0, _1x1OutputDescription, y_gpu.Ptr); + DnnInstance.ActivationForward(x.Entities, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels, y_gpu.Ptr, y_gpu.Ptr, ActivationFunctions.Activation); y_gpu.CopyToRows(a, 0, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels); } @@ -322,7 +341,11 @@ public override void Forward(in Tensor x, out Tensor z, out Tensor a) DnnInstance.ConvolutionForward(1, InputDescription, x_gpu.Ptr, _3x3Reduce1x1FilterDescription, pw_gpu += _1x1Weights, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _3x3Reduce1x1OutputDescription, y1x1_gpu.Ptr); } DnnInstance.AddTensor(1, _3x3Reduce1x1BiasDescription, pb_gpu += OperationInfo.Primary1x1ConvolutionKernels, 1, _3x3Reduce1x1OutputDescription, y1x1_gpu.Ptr); - DnnInstance.ActivationForward(ActivationDescription, 1, _3x3Reduce1x1OutputDescription, y1x1_gpu.Ptr, 0, _3x3Reduce1x1OutputDescription, y1x1_gpu.Ptr); + _3x3Reduce1x1Z.TryFree(); + y1x1_gpu.CopyToHost(x.Entities, InputInfo.SliceSize * OperationInfo.Primary3x3Reduce1x1ConvolutionKernels, out _3x3Reduce1x1Z); + DnnInstance.ActivationForward(x.Entities, InputInfo.SliceSize * OperationInfo.Primary3x3Reduce1x1ConvolutionKernels, y1x1_gpu.Ptr, y1x1_gpu.Ptr, ActivationFunctions.Activation); + _3x3Reduce1x1A.TryFree(); + y1x1_gpu.CopyToHost(x.Entities, InputInfo.SliceSize * OperationInfo.Primary3x3Reduce1x1ConvolutionKernels, out _3x3Reduce1x1A); // 3x3 convolution _3x3OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OperationInfo.Secondary3x3ConvolutionKernels, InputInfo.Height, InputInfo.Width); @@ -336,7 +359,7 @@ public override void Forward(in Tensor x, out Tensor z, out Tensor a) y_gpu.CopyToRows(z, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels, InputInfo.SliceSize * OperationInfo.Secondary3x3ConvolutionKernels); // Activation - DnnInstance.ActivationForward(ActivationDescription, 1, _3x3OutputDescription, y_gpu.Ptr, 0, _3x3OutputDescription, y_gpu.Ptr); + DnnInstance.ActivationForward(x.Entities, InputInfo.SliceSize * OperationInfo.Secondary3x3ConvolutionKernels, y_gpu.Ptr, y_gpu.Ptr, ActivationFunctions.Activation); y_gpu.CopyToRows(a, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels, InputInfo.SliceSize * OperationInfo.Secondary3x3ConvolutionKernels); } @@ -355,7 +378,11 @@ public override void Forward(in Tensor x, out Tensor z, out Tensor a) DnnInstance.ConvolutionForward(1, InputDescription, x_gpu.Ptr, _5x5Reduce1x1FilterDescription, pw_gpu += _3x3Weights, _5x5ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _5x5Reduce1x1OutputDescription, y1x1_gpu.Ptr); } DnnInstance.AddTensor(1, _5x5Reduce1x1BiasDescription, pb_gpu += OperationInfo.Secondary3x3ConvolutionKernels, 1, _5x5Reduce1x1OutputDescription, y1x1_gpu.Ptr); - DnnInstance.ActivationForward(ActivationDescription, 1, _5x5Reduce1x1OutputDescription, y1x1_gpu.Ptr, 0, _5x5Reduce1x1OutputDescription, y1x1_gpu.Ptr); + _5x5Reduce1x1Z.TryFree(); + y1x1_gpu.CopyToHost(x.Entities, InputInfo.SliceSize * OperationInfo.Primary5x5Reduce1x1ConvolutionKernels, out _5x5Reduce1x1Z); + DnnInstance.ActivationForward(x.Entities, InputInfo.SliceSize * OperationInfo.Primary5x5Reduce1x1ConvolutionKernels, y1x1_gpu.Ptr, y1x1_gpu.Ptr, 
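// In-place software activation: the pre-activation tensor was staged back to host as _5x5Reduce1x1Z just above, so the backward pass can evaluate the activation derivative without recomputing the reduction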
ActivationFunctions.Activation); + _5x5Reduce1x1A.TryFree(); + y1x1_gpu.CopyToHost(x.Entities, InputInfo.SliceSize * OperationInfo.Primary5x5Reduce1x1ConvolutionKernels, out _5x5Reduce1x1A); // 5x5 convolution _5x5OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OperationInfo.Secondary5x5ConvolutionKernels, InputInfo.Height, InputInfo.Width); @@ -369,7 +396,7 @@ public override void Forward(in Tensor x, out Tensor z, out Tensor a) y_gpu.CopyToRows(z, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels); // Activation - DnnInstance.ActivationForward(ActivationDescription, 1, _3x3OutputDescription, y_gpu.Ptr, 0, _3x3OutputDescription, y_gpu.Ptr); + DnnInstance.ActivationForward(x.Entities, InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels, y_gpu.Ptr, y_gpu.Ptr, ActivationFunctions.Activation); y_gpu.CopyToRows(a, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels); } @@ -382,6 +409,11 @@ public override void Forward(in Tensor x, out Tensor z, out Tensor a) { DnnInstance.PoolingForward(PoolingDescription, 1, InputDescription, x_gpu.Ptr, 0, InputDescription, y_gpu.Ptr); } + PoolingZ.TryFree(); + y_gpu.CopyToHost(x.Entities, InputInfo.Size, out PoolingZ); + DnnInstance.ActivationForward(x.Entities, x.Length, y_gpu.Ptr, y_gpu.Ptr, ActivationFunctions.Activation); + PoolingA.TryFree(); + y_gpu.CopyToHost(x.Entities, InputInfo.Size, out PoolingA); // 1x1 convolution using (DeviceMemory _1x1Output_gpu = DnnInstance.Gpu.AllocateDevice(x.Entities * InputInfo.SliceSize * OperationInfo.Secondary1x1AfterPoolingConvolutionKernels)) @@ -397,7 +429,7 @@ public override void Forward(in Tensor x, out Tensor z, out Tensor a) _1x1Output_gpu.CopyToRows(z, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Secondary5x5ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary1x1AfterPoolingConvolutionKernels); // 1x1 convolution activation - DnnInstance.ActivationForward(ActivationDescription, 1, Secondary1x1OutputDescription, _1x1Output_gpu.Ptr, 0, Secondary1x1OutputDescription, _1x1Output_gpu.Ptr); + DnnInstance.ActivationForward(x.Entities, InputInfo.SliceSize * OperationInfo.Secondary1x1AfterPoolingConvolutionKernels, _1x1Output_gpu.Ptr, _1x1Output_gpu.Ptr, ActivationFunctions.Activation); _1x1Output_gpu.CopyToRows(a, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Secondary5x5ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary1x1AfterPoolingConvolutionKernels); } } @@ -407,7 +439,104 @@ public override void Forward(in Tensor x, out Tensor z, out Tensor a) /// public override void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFunction activationPrime) { - throw new NotImplementedException(); + using (DeviceMemory + dx_gpu = DnnInstance.Gpu.AllocateDevice(z.Size), + dy_gpu = DnnInstance.Gpu.AllocateDevice(delta_1), + w_gpu = DnnInstance.Gpu.AllocateDevice(Weights)) + { + // Pointers + deviceptr pdy_gpu = dy_gpu.Ptr; // TODO: load rows + + // First 1x1 convolution + DnnInstance.GetConvolutionBackwardDataAlgorithm(_1x1FilterDescription, _1x1OutputDescription, _1x1ConvolutionDescription, InputDescription, 
ConvolutionBwdDataPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionBwdDataAlgo algorithm); + DnnInstance.GetConvolutionBackwardDataWorkspaceSize(_1x1FilterDescription, _1x1OutputDescription, _1x1ConvolutionDescription, InputDescription, algorithm, out IntPtr size); + using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) + { + DnnInstance.ConvolutionBackwardData(1, _1x1FilterDescription, w_gpu.Ptr, _1x1OutputDescription, dy_gpu.Ptr, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, InputDescription, dx_gpu.Ptr); + } + + // 1x1 + 3x3 convolution + using (DeviceMemory _3x3Reduce1x1z_gpu = DnnInstance.Gpu.AllocateDevice(_3x3Reduce1x1Z)) + { + // 3x3 backward + DnnInstance.GetConvolutionBackwardDataAlgorithm(_3x3FilterDescription, _3x3OutputDescription, _3x3ConvolutionDescription, _3x3Reduce1x1OutputDescription, ConvolutionBwdDataPreference.PREFER_FASTEST, IntPtr.Zero, out algorithm); + DnnInstance.GetConvolutionBackwardDataWorkspaceSize(_3x3FilterDescription, _3x3OutputDescription, _3x3ConvolutionDescription, _3x3Reduce1x1OutputDescription, algorithm, out size); + using (DeviceMemory _3x3Reduce1x1dx_gpu = DnnInstance.Gpu.AllocateDevice(_3x3Reduce1x1Z.Size)) + using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) + { + deviceptr p3x3Weights_gpu = w_gpu.Ptr + _1x1Weights + _3x3Reduce1x1Weights; + DnnInstance.ConvolutionBackwardData(1, _3x3FilterDescription, p3x3Weights_gpu, _3x3OutputDescription, pdy_gpu += InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels, _3x3ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _3x3Reduce1x1OutputDescription, _3x3Reduce1x1dx_gpu.Ptr); + DnnInstance.ActivationBackward(_3x3Reduce1x1Z.Entities, _3x3Reduce1x1Z.Length, _3x3Reduce1x1z_gpu.Ptr, _3x3Reduce1x1dx_gpu.Ptr, ActivationFunctions.ActivationPrime); + _3x3Reduce1x1Delta.TryFree(); + _3x3Reduce1x1z_gpu.CopyToHost(_3x3Reduce1x1Z.Entities, _3x3Reduce1x1Z.Length, out _3x3Reduce1x1Delta); + } + + // 3x3 reduce 1x1 backward + DnnInstance.GetConvolutionBackwardDataAlgorithm(_3x3Reduce1x1FilterDescription, _3x3Reduce1x1OutputDescription, _1x1ConvolutionDescription, InputDescription, ConvolutionBwdDataPreference.PREFER_FASTEST, IntPtr.Zero, out algorithm); + DnnInstance.GetConvolutionBackwardDataWorkspaceSize(_3x3Reduce1x1FilterDescription, _3x3Reduce1x1OutputDescription, _1x1ConvolutionDescription, InputDescription, algorithm, out size); + using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) + { + deviceptr p3x3Reduce1x1Weights_gpu = w_gpu.Ptr + _1x1Weights; + DnnInstance.ConvolutionBackwardData(1, _3x3Reduce1x1FilterDescription, p3x3Reduce1x1Weights_gpu, _3x3Reduce1x1OutputDescription, _3x3Reduce1x1z_gpu.Ptr, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 1, InputDescription, dx_gpu.Ptr); + } + } + + // 1x1 + 5x5 convolution + using (DeviceMemory _5x5Reduce1x1z_gpu = DnnInstance.Gpu.AllocateDevice(_5x5Reduce1x1Z)) + { + // 5x5 backward + DnnInstance.GetConvolutionBackwardDataAlgorithm(_5x5FilterDescription, _5x5OutputDescription, _5x5ConvolutionDescription, _5x5Reduce1x1OutputDescription, ConvolutionBwdDataPreference.PREFER_FASTEST, IntPtr.Zero, out algorithm); + DnnInstance.GetConvolutionBackwardDataWorkspaceSize(_5x5FilterDescription, _5x5OutputDescription, _5x5ConvolutionDescription, _5x5Reduce1x1OutputDescription, algorithm, out size); + using (DeviceMemory _5x5Reduce1x1dx_gpu = DnnInstance.Gpu.AllocateDevice(_5x5Reduce1x1Z.Size)) + using (DeviceMemory workspace_gpu = 
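// Scratch workspace sized by the backward-data query above for the algorithm cuDNN selected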
DnnInstance.Gpu.AllocateDevice(size)) + { + deviceptr p5x5Weights_gpu = w_gpu.Ptr + _1x1Weights + _3x3Reduce1x1Weights + _3x3Weights + _5x5Reduce1x1Weights; + DnnInstance.ConvolutionBackwardData(1, _5x5FilterDescription, p5x5Weights_gpu, _5x5OutputDescription, pdy_gpu += InputInfo.SliceSize * OperationInfo.Secondary3x3ConvolutionKernels, _5x5ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _5x5Reduce1x1OutputDescription, _5x5Reduce1x1dx_gpu.Ptr); + DnnInstance.ActivationBackward(_5x5Reduce1x1Z.Entities, _5x5Reduce1x1Z.Length, _5x5Reduce1x1z_gpu.Ptr, _5x5Reduce1x1dx_gpu.Ptr, ActivationFunctions.ActivationPrime); + _5x5Reduce1x1Delta.TryFree(); + _5x5Reduce1x1z_gpu.CopyToHost(_5x5Reduce1x1Z.Entities, _5x5Reduce1x1Z.Length, out _5x5Reduce1x1Delta); + } + + // 5x5 reduce 1x1 backward + DnnInstance.GetConvolutionBackwardDataAlgorithm(_5x5Reduce1x1FilterDescription, _5x5Reduce1x1OutputDescription, _1x1ConvolutionDescription, InputDescription, ConvolutionBwdDataPreference.PREFER_FASTEST, IntPtr.Zero, out algorithm); + DnnInstance.GetConvolutionBackwardDataWorkspaceSize(_5x5Reduce1x1FilterDescription, _5x5Reduce1x1OutputDescription, _1x1ConvolutionDescription, InputDescription, algorithm, out size); + using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) + { + deviceptr p5x5Reduce1x1Weights_gpu = w_gpu.Ptr + _1x1Weights + _3x3Reduce1x1Weights + _3x3Weights; + DnnInstance.ConvolutionBackwardData(1, _5x5Reduce1x1FilterDescription, p5x5Reduce1x1Weights_gpu, _5x5Reduce1x1OutputDescription, _5x5Reduce1x1z_gpu.Ptr, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 1, InputDescription, dx_gpu.Ptr); + } + } + + // Pooling + using (DeviceMemory poolDy_gpu = DnnInstance.Gpu.AllocateDevice(PoolingZ)) + { + // 1x1 backward + DnnInstance.GetConvolutionBackwardDataAlgorithm(Secondary1x1FilterDescription, Secondary1x1OutputDescription, _1x1ConvolutionDescription, PoolingOutputDescription, ConvolutionBwdDataPreference.PREFER_FASTEST, IntPtr.Zero, out algorithm); + DnnInstance.GetConvolutionBackwardDataWorkspaceSize(Secondary1x1FilterDescription, Secondary1x1OutputDescription, _1x1ConvolutionDescription, PoolingOutputDescription, algorithm, out size); + using (DeviceMemory poolDx_gpu = DnnInstance.Gpu.AllocateDevice(PoolingZ.Size)) + using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) + { + deviceptr p1x1PoolingWeights_gpu = w_gpu.Ptr + _1x1Weights + _3x3Reduce1x1Weights + _3x3Weights + _5x5Reduce1x1Weights + _5x5Weights; + DnnInstance.ConvolutionBackwardData(1, Secondary1x1FilterDescription, p1x1PoolingWeights_gpu, Secondary1x1OutputDescription, pdy_gpu, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, PoolingOutputDescription, poolDx_gpu.Ptr); + DnnInstance.ActivationBackward(PoolingZ.Entities, PoolingZ.Length, poolDy_gpu.Ptr, poolDx_gpu.Ptr, ActivationFunctions.ActivationPrime); + PoolingDelta.TryFree(); + poolDy_gpu.CopyToHost(PoolingDelta.Entities, PoolingDelta.Length, out PoolingDelta); + } + + // Pooling backward + using (DeviceMemory poolZ_gpu = DnnInstance.Gpu.AllocateDevice(PoolingZ)) + { + DnnInstance.PoolingBackward(PoolingDescription, 1, PoolingOutputDescription, poolZ_gpu.Ptr, PoolingOutputDescription, poolDy_gpu.Ptr, InputDescription, default, 1, InputDescription, dx_gpu.Ptr); + } + } + + // Activation backward + using (DeviceMemory z_gpu = DnnInstance.Gpu.AllocateDevice(z)) + { + DnnInstance.ActivationBackward(z.Entities, z.Length, z_gpu.Ptr, dx_gpu.Ptr, activationPrime); + z_gpu.CopyTo(z); + } + } } /// @@ 
-420,5 +549,32 @@ public override void ComputeGradient(in Tensor a, in Tensor delta, out Tensor dJ /// public override INetworkLayer Clone() => new CuDnnInceptionLayer(InputInfo, OperationInfo, Weights, Biases); + + #region IDisposable + + ~CuDnnInceptionLayer() => Dispose(); + + /// + void IDisposable.Dispose() + { + GC.SuppressFinalize(this); + Dispose(); + } + + // Private Dispose method + private void Dispose() + { + _3x3Reduce1x1Z.TryFree(); + _3x3Reduce1x1A.TryFree(); + _3x3Reduce1x1Delta.TryFree(); + _5x5Reduce1x1Z.TryFree(); + _5x5Reduce1x1A.TryFree(); + _5x5Reduce1x1Delta.TryFree(); + PoolingZ.TryFree(); + PoolingA.TryFree(); + PoolingDelta.TryFree(); + } + + #endregion } } From d1de1abd52205f6bd3e77931d8608a33d0f37956 Mon Sep 17 00:00:00 2001 From: Sergio0694 Date: Tue, 26 Dec 2017 01:01:28 +0100 Subject: [PATCH 15/30] GPU copy to rows and rows allocation methods improved --- .../Extensions/GpuExtensions.cs | 60 +++++++++++++++---- .../GpuExtensionsTest.cs | 55 +++++++++++++++++ 2 files changed, 105 insertions(+), 10 deletions(-) create mode 100644 Unit/NeuralNetwork.NET.Cuda.Unit/GpuExtensionsTest.cs diff --git a/NeuralNetwork.NET.Cuda/Extensions/GpuExtensions.cs b/NeuralNetwork.NET.Cuda/Extensions/GpuExtensions.cs index 79723f2..6adb6cb 100644 --- a/NeuralNetwork.NET.Cuda/Extensions/GpuExtensions.cs +++ b/NeuralNetwork.NET.Cuda/Extensions/GpuExtensions.cs @@ -63,21 +63,61 @@ public static void CopyToHost([NotNull] this DeviceMemory source, int n, /// The number of values to copy for each entry public static unsafe void CopyToRows([NotNull] this DeviceMemory source, in Tensor destination, int offset, int length) { + // Checks if (source.Length / length != destination.Entities) throw new ArgumentOutOfRangeException(nameof(length), "The input length doesn't match the given arguments"); - if (destination.Length - offset > length) throw new ArgumentOutOfRangeException(nameof(offset), "The input offset isn't valid"); - CUDAInterop.cudaError_enum result = CUDAInterop.cudaError_enum.CUDA_SUCCESS; - int - bytes = sizeof(float) * length, // Bytes to copy for each row - lineBytes = sizeof(float) * destination.Length; // Bytes to skip for each entry to jump to the line below at the same offset - IntPtr - start = destination.Ptr + sizeof(float) * offset, // Initial destination offset - size = new IntPtr(bytes); - for (int i = 0; i < destination.Entities; i++) - result |= CUDAInterop.cuMemcpy(start + i * bytes, source.Handle + i * lineBytes, size); + if (destination.Length - offset < length) throw new ArgumentOutOfRangeException(nameof(offset), "The input offset isn't valid"); + + // Memory copy + CUDAInterop.CUDA_MEMCPY2D_st* ptSt = stackalloc CUDAInterop.CUDA_MEMCPY2D_st[1]; + ptSt[0] = new CUDAInterop.CUDA_MEMCPY2D_st + { + srcMemoryType = CUDAInterop.CUmemorytype_enum.CU_MEMORYTYPE_DEVICE, + srcDevice = source.Handle, + srcPitch = new IntPtr(sizeof(float) * length), + dstMemoryType = CUDAInterop.CUmemorytype_enum.CU_MEMORYTYPE_HOST, + dstHost = destination.Ptr + sizeof(float) * offset, + dstPitch = new IntPtr(sizeof(float) * destination.Length), + WidthInBytes = new IntPtr(sizeof(float) * length), + Height = new IntPtr(destination.Entities) + }; + CUDAInterop.cudaError_enum result = CUDAInterop.cuMemcpy2D(ptSt); if (result != CUDAInterop.cudaError_enum.CUDA_SUCCESS) throw new InvalidOperationException($"Failed to copy the source data on the given destination, [CUDA ERROR] {result}"); } + /// + /// Allocates a memory area on device memory, reading the target values at a given offset from 
the input + /// + /// The device to use + /// The source with the data to copy + /// The column offset for the data to read from each row + /// + [MustUseReturnValue, NotNull] + public static unsafe DeviceMemory AllocateDevice([NotNull] this Gpu gpu, in Tensor source, int offset, int length) + { + // Checks + if (source.Length - offset < length) throw new ArgumentOutOfRangeException(nameof(offset), "The input offset isn't valid"); + + // Memory copy + DeviceMemory result_gpu = gpu.AllocateDevice(source.Entities * length); + CUDAInterop.CUDA_MEMCPY2D_st* ptSt = stackalloc CUDAInterop.CUDA_MEMCPY2D_st[1]; + ptSt[0] = new CUDAInterop.CUDA_MEMCPY2D_st + { + srcMemoryType = CUDAInterop.CUmemorytype_enum.CU_MEMORYTYPE_HOST, + srcHost = source.Ptr + sizeof(float) * offset, + srcPitch = new IntPtr(sizeof(float) * source.Length), + dstMemoryType = CUDAInterop.CUmemorytype_enum.CU_MEMORYTYPE_DEVICE, + dstDevice = result_gpu.Handle, + dstPitch = new IntPtr(sizeof(float) * length), + WidthInBytes = new IntPtr(sizeof(float) * length), + Height = new IntPtr(source.Entities) + }; + CUDAInterop.cudaError_enum result = CUDAInterop.cuMemcpy2D(ptSt); + return result == CUDAInterop.cudaError_enum.CUDA_SUCCESS + ? result_gpu + : throw new InvalidOperationException($"Failed to copy the source data on the given destination, [CUDA ERROR] {result}"); + } + #endregion /// diff --git a/Unit/NeuralNetwork.NET.Cuda.Unit/GpuExtensionsTest.cs b/Unit/NeuralNetwork.NET.Cuda.Unit/GpuExtensionsTest.cs new file mode 100644 index 0000000..6676490 --- /dev/null +++ b/Unit/NeuralNetwork.NET.Cuda.Unit/GpuExtensionsTest.cs @@ -0,0 +1,55 @@ +using Alea; +using Microsoft.VisualStudio.TestTools.UnitTesting; +using NeuralNetworkNET.APIs.Structs; +using NeuralNetworkNET.Cuda.Extensions; +using NeuralNetworkNET.Extensions; + +namespace NeuralNetworkNET.Cuda.Unit +{ + /// + /// Test class for the cuDNN GPU extension methods + /// + [TestClass] + [TestCategory(nameof(GpuExtensionsTest))] + public class GpuExtensionsTest + { + [TestMethod] + public void CopyToRows() + { + float[] test = {1,2,3,4,5,6,7,8,9}; + Tensor.NewZeroed(3, 10, out Tensor tensor); + Gpu gpu = Gpu.Default; + using (DeviceMemory m_gpu = gpu.AllocateDevice(test)) + { + m_gpu.CopyToRows(tensor, 5, 3); + } + float[,] expected = + { + { 0, 0, 0, 0, 0, 1, 2, 3, 0, 0 }, + { 0, 0, 0, 0, 0, 4, 5, 6, 0, 0 }, + { 0, 0, 0, 0, 0, 7, 8, 9, 0, 0 } + }; + Assert.IsTrue(tensor.ToArray2D().ContentEquals(expected)); + } + + [TestMethod] + public void AllocateDeviceRows() + { + float[,] source = + { + { 0, 0, 0, 0, 0, 1, 2, 3, 0, 0 }, + { 0, 0, 0, 0, 0, 4, 5, 6, 0, 0 }, + { 0, 0, 0, 0, 0, 7, 8, 9, 0, 0 } + }; + Tensor.From(source, out Tensor tensor); + Gpu gpu = Gpu.Default; + using (DeviceMemory m_gpu = gpu.AllocateDevice(tensor, 5, 3)) + { + float[] + copy = Gpu.CopyToHost(m_gpu), + expected = { 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + Assert.IsTrue(copy.ContentEquals(expected)); + } + } + } +} From 07af79e57fd7cb910db56eb0ca412c88ad01655f Mon Sep 17 00:00:00 2001 From: Sergio0694 Date: Tue, 26 Dec 2017 01:13:56 +0100 Subject: [PATCH 16/30] Inception layer delta loading fixed --- .../Layers/CuDnnInceptionLayer.cs | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs index 009b273..ef714cb 100644 --- a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs +++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs @@ -441,15 +441,13 @@ public override void 
Backpropagate(in Tensor delta_1, in Tensor z, ActivationFun { using (DeviceMemory dx_gpu = DnnInstance.Gpu.AllocateDevice(z.Size), - dy_gpu = DnnInstance.Gpu.AllocateDevice(delta_1), w_gpu = DnnInstance.Gpu.AllocateDevice(Weights)) { - // Pointers - deviceptr pdy_gpu = dy_gpu.Ptr; // TODO: load rows // First 1x1 convolution DnnInstance.GetConvolutionBackwardDataAlgorithm(_1x1FilterDescription, _1x1OutputDescription, _1x1ConvolutionDescription, InputDescription, ConvolutionBwdDataPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionBwdDataAlgo algorithm); DnnInstance.GetConvolutionBackwardDataWorkspaceSize(_1x1FilterDescription, _1x1OutputDescription, _1x1ConvolutionDescription, InputDescription, algorithm, out IntPtr size); + using (DeviceMemory dy_gpu = DnnInstance.Gpu.AllocateDevice(delta_1, 0, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels)) using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) { DnnInstance.ConvolutionBackwardData(1, _1x1FilterDescription, w_gpu.Ptr, _1x1OutputDescription, dy_gpu.Ptr, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, InputDescription, dx_gpu.Ptr); @@ -461,11 +459,13 @@ public override void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFun // 3x3 backward DnnInstance.GetConvolutionBackwardDataAlgorithm(_3x3FilterDescription, _3x3OutputDescription, _3x3ConvolutionDescription, _3x3Reduce1x1OutputDescription, ConvolutionBwdDataPreference.PREFER_FASTEST, IntPtr.Zero, out algorithm); DnnInstance.GetConvolutionBackwardDataWorkspaceSize(_3x3FilterDescription, _3x3OutputDescription, _3x3ConvolutionDescription, _3x3Reduce1x1OutputDescription, algorithm, out size); - using (DeviceMemory _3x3Reduce1x1dx_gpu = DnnInstance.Gpu.AllocateDevice(_3x3Reduce1x1Z.Size)) + using (DeviceMemory + dy_gpu = DnnInstance.Gpu.AllocateDevice(delta_1, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels, InputInfo.SliceSize * OperationInfo.Secondary3x3ConvolutionKernels), + _3x3Reduce1x1dx_gpu = DnnInstance.Gpu.AllocateDevice(_3x3Reduce1x1Z.Size)) using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) { deviceptr p3x3Weights_gpu = w_gpu.Ptr + _1x1Weights + _3x3Reduce1x1Weights; - DnnInstance.ConvolutionBackwardData(1, _3x3FilterDescription, p3x3Weights_gpu, _3x3OutputDescription, pdy_gpu += InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels, _3x3ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _3x3Reduce1x1OutputDescription, _3x3Reduce1x1dx_gpu.Ptr); + DnnInstance.ConvolutionBackwardData(1, _3x3FilterDescription, p3x3Weights_gpu, _3x3OutputDescription, dy_gpu.Ptr, _3x3ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _3x3Reduce1x1OutputDescription, _3x3Reduce1x1dx_gpu.Ptr); DnnInstance.ActivationBackward(_3x3Reduce1x1Z.Entities, _3x3Reduce1x1Z.Length, _3x3Reduce1x1z_gpu.Ptr, _3x3Reduce1x1dx_gpu.Ptr, ActivationFunctions.ActivationPrime); _3x3Reduce1x1Delta.TryFree(); _3x3Reduce1x1z_gpu.CopyToHost(_3x3Reduce1x1Z.Entities, _3x3Reduce1x1Z.Length, out _3x3Reduce1x1Delta); @@ -487,11 +487,13 @@ public override void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFun // 5x5 backward DnnInstance.GetConvolutionBackwardDataAlgorithm(_5x5FilterDescription, _5x5OutputDescription, _5x5ConvolutionDescription, _5x5Reduce1x1OutputDescription, ConvolutionBwdDataPreference.PREFER_FASTEST, IntPtr.Zero, out algorithm); DnnInstance.GetConvolutionBackwardDataWorkspaceSize(_5x5FilterDescription, _5x5OutputDescription, _5x5ConvolutionDescription, 
_5x5Reduce1x1OutputDescription, algorithm, out size); - using (DeviceMemory _5x5Reduce1x1dx_gpu = DnnInstance.Gpu.AllocateDevice(_5x5Reduce1x1Z.Size)) + using (DeviceMemory + dy_gpu = DnnInstance.Gpu.AllocateDevice(delta_1, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels), + _5x5Reduce1x1dx_gpu = DnnInstance.Gpu.AllocateDevice(_5x5Reduce1x1Z.Size)) using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) { deviceptr p5x5Weights_gpu = w_gpu.Ptr + _1x1Weights + _3x3Reduce1x1Weights + _3x3Weights + _5x5Reduce1x1Weights; - DnnInstance.ConvolutionBackwardData(1, _5x5FilterDescription, p5x5Weights_gpu, _5x5OutputDescription, pdy_gpu += InputInfo.SliceSize * OperationInfo.Secondary3x3ConvolutionKernels, _5x5ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _5x5Reduce1x1OutputDescription, _5x5Reduce1x1dx_gpu.Ptr); + DnnInstance.ConvolutionBackwardData(1, _5x5FilterDescription, p5x5Weights_gpu, _5x5OutputDescription, dy_gpu.Ptr, _5x5ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _5x5Reduce1x1OutputDescription, _5x5Reduce1x1dx_gpu.Ptr); DnnInstance.ActivationBackward(_5x5Reduce1x1Z.Entities, _5x5Reduce1x1Z.Length, _5x5Reduce1x1z_gpu.Ptr, _5x5Reduce1x1dx_gpu.Ptr, ActivationFunctions.ActivationPrime); _5x5Reduce1x1Delta.TryFree(); _5x5Reduce1x1z_gpu.CopyToHost(_5x5Reduce1x1Z.Entities, _5x5Reduce1x1Z.Length, out _5x5Reduce1x1Delta); @@ -513,11 +515,13 @@ public override void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFun // 1x1 backward DnnInstance.GetConvolutionBackwardDataAlgorithm(Secondary1x1FilterDescription, Secondary1x1OutputDescription, _1x1ConvolutionDescription, PoolingOutputDescription, ConvolutionBwdDataPreference.PREFER_FASTEST, IntPtr.Zero, out algorithm); DnnInstance.GetConvolutionBackwardDataWorkspaceSize(Secondary1x1FilterDescription, Secondary1x1OutputDescription, _1x1ConvolutionDescription, PoolingOutputDescription, algorithm, out size); - using (DeviceMemory poolDx_gpu = DnnInstance.Gpu.AllocateDevice(PoolingZ.Size)) + using (DeviceMemory + dy_gpu = DnnInstance.Gpu.AllocateDevice(delta_1, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Secondary5x5ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary1x1AfterPoolingConvolutionKernels), + poolDx_gpu = DnnInstance.Gpu.AllocateDevice(PoolingZ.Size)) using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) { deviceptr p1x1PoolingWeights_gpu = w_gpu.Ptr + _1x1Weights + _3x3Reduce1x1Weights + _3x3Weights + _5x5Reduce1x1Weights + _5x5Weights; - DnnInstance.ConvolutionBackwardData(1, Secondary1x1FilterDescription, p1x1PoolingWeights_gpu, Secondary1x1OutputDescription, pdy_gpu, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, PoolingOutputDescription, poolDx_gpu.Ptr); + DnnInstance.ConvolutionBackwardData(1, Secondary1x1FilterDescription, p1x1PoolingWeights_gpu, Secondary1x1OutputDescription, dy_gpu.Ptr, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, PoolingOutputDescription, poolDx_gpu.Ptr); DnnInstance.ActivationBackward(PoolingZ.Entities, PoolingZ.Length, poolDy_gpu.Ptr, poolDx_gpu.Ptr, ActivationFunctions.ActivationPrime); PoolingDelta.TryFree(); poolDy_gpu.CopyToHost(PoolingDelta.Entities, PoolingDelta.Length, out PoolingDelta); @@ -526,7 +530,7 @@ public override void Backpropagate(in Tensor delta_1, 
in Tensor z, ActivationFun // Pooling backward using (DeviceMemory poolZ_gpu = DnnInstance.Gpu.AllocateDevice(PoolingZ)) { - DnnInstance.PoolingBackward(PoolingDescription, 1, PoolingOutputDescription, poolZ_gpu.Ptr, PoolingOutputDescription, poolDy_gpu.Ptr, InputDescription, default, 1, InputDescription, dx_gpu.Ptr); + DnnInstance.PoolingBackward(PoolingDescription, 1, PoolingOutputDescription, poolZ_gpu.Ptr, PoolingOutputDescription, poolDy_gpu.Ptr, InputDescription, default, 1, InputDescription, dx_gpu.Ptr); // TODO: finish pooling backward } } From c6a84a69e5492e70b081f0a0a009dacc91270c26 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 26 Dec 2017 17:41:27 +0100 Subject: [PATCH 17/30] Inception layer implementation finished (WIP) --- .../Extensions/GpuExtensions.cs | 86 ++++----- .../Layers/CuDnnInceptionLayer.cs | 176 ++++++++++++++++-- .../GpuExtensionsTest.cs | 2 +- 3 files changed, 200 insertions(+), 64 deletions(-) diff --git a/NeuralNetwork.NET.Cuda/Extensions/GpuExtensions.cs b/NeuralNetwork.NET.Cuda/Extensions/GpuExtensions.cs index 6adb6cb..03d7ead 100644 --- a/NeuralNetwork.NET.Cuda/Extensions/GpuExtensions.cs +++ b/NeuralNetwork.NET.Cuda/Extensions/GpuExtensions.cs @@ -27,6 +27,39 @@ public static DeviceMemory AllocateDevice([NotNull] this Gpu gpu, in Tens : throw new InvalidOperationException($"Failed to copy the source data on the target GPU device, [CUDA ERROR] {result}"); } + /// + /// Allocates a memory area on device memory, reading the target values at a given offset from the input + /// + /// The device to use + /// The source with the data to copy + /// The column offset for the data to read from each row + /// + [MustUseReturnValue, NotNull] + public static unsafe DeviceMemory AllocateDevice([NotNull] this Gpu gpu, in Tensor source, int offset, int length) + { + // Checks + if (source.Length - offset < length) throw new ArgumentOutOfRangeException(nameof(offset), "The input offset isn't valid"); + + // Memory copy + DeviceMemory result_gpu = gpu.AllocateDevice(source.Entities * length); + CUDAInterop.CUDA_MEMCPY2D_st* ptSt = stackalloc CUDAInterop.CUDA_MEMCPY2D_st[1]; + ptSt[0] = new CUDAInterop.CUDA_MEMCPY2D_st + { + srcMemoryType = CUDAInterop.CUmemorytype_enum.CU_MEMORYTYPE_HOST, + srcHost = source.Ptr + sizeof(float) * offset, + srcPitch = new IntPtr(sizeof(float) * source.Length), + dstMemoryType = CUDAInterop.CUmemorytype_enum.CU_MEMORYTYPE_DEVICE, + dstDevice = result_gpu.Handle, + dstPitch = new IntPtr(sizeof(float) * length), + WidthInBytes = new IntPtr(sizeof(float) * length), + Height = new IntPtr(source.Entities) + }; + CUDAInterop.cudaError_enum result = CUDAInterop.cuMemcpy2D(ptSt); + return result == CUDAInterop.cudaError_enum.CUDA_SUCCESS + ? 
result_gpu + : throw new InvalidOperationException($"Failed to copy the source data on the given destination, [CUDA ERROR] {result}"); + } + /// /// Copies the contents of the input instance to the target host memory area /// @@ -40,20 +73,6 @@ public static void CopyTo([NotNull] this DeviceMemory source, in Tensor d throw new InvalidOperationException($"Failed to copy the source data on the given destination, [CUDA ERROR] {result}"); } - /// - /// Copies the contents of the input to a new memory area on the unmanaged heap - /// - /// The source memory to copy - /// The height of the input memory area - /// The width of the input memory area - /// The resulting matrix - [MustUseReturnValue] - public static void CopyToHost([NotNull] this DeviceMemory source, int n, int chw, out Tensor result) - { - Tensor.New(n, chw, out result); - source.CopyTo(result); - } - /// /// Copies the source data into the target , splitting each individual entry into its own row /// @@ -61,7 +80,7 @@ public static void CopyToHost([NotNull] this DeviceMemory source, int n, /// The destination that will store the data /// The column offset for the data for each entry /// The number of values to copy for each entry - public static unsafe void CopyToRows([NotNull] this DeviceMemory source, in Tensor destination, int offset, int length) + public static unsafe void CopyTo([NotNull] this DeviceMemory source, in Tensor destination, int offset, int length) { // Checks if (source.Length / length != destination.Entities) throw new ArgumentOutOfRangeException(nameof(length), "The input length doesn't match the given arguments"); @@ -86,36 +105,17 @@ public static unsafe void CopyToRows([NotNull] this DeviceMemory source, } /// - /// Allocates a memory area on device memory, reading the target values at a given offset from the input + /// Copies the contents of the input to a new memory area on the unmanaged heap /// - /// The device to use - /// The source with the data to copy - /// The column offset for the data to read from each row - /// - [MustUseReturnValue, NotNull] - public static unsafe DeviceMemory AllocateDevice([NotNull] this Gpu gpu, in Tensor source, int offset, int length) + /// The source memory to copy + /// The height of the input memory area + /// The width of the input memory area + /// The resulting matrix + [MustUseReturnValue] + public static void CopyToHost([NotNull] this DeviceMemory source, int n, int chw, out Tensor result) { - // Checks - if (source.Length - offset < length) throw new ArgumentOutOfRangeException(nameof(offset), "The input offset isn't valid"); - - // Memory copy - DeviceMemory result_gpu = gpu.AllocateDevice(source.Entities * length); - CUDAInterop.CUDA_MEMCPY2D_st* ptSt = stackalloc CUDAInterop.CUDA_MEMCPY2D_st[1]; - ptSt[0] = new CUDAInterop.CUDA_MEMCPY2D_st - { - srcMemoryType = CUDAInterop.CUmemorytype_enum.CU_MEMORYTYPE_HOST, - srcHost = source.Ptr + sizeof(float) * offset, - srcPitch = new IntPtr(sizeof(float) * source.Length), - dstMemoryType = CUDAInterop.CUmemorytype_enum.CU_MEMORYTYPE_DEVICE, - dstDevice = result_gpu.Handle, - dstPitch = new IntPtr(sizeof(float) * length), - WidthInBytes = new IntPtr(sizeof(float) * length), - Height = new IntPtr(source.Entities) - }; - CUDAInterop.cudaError_enum result = CUDAInterop.cuMemcpy2D(ptSt); - return result == CUDAInterop.cudaError_enum.CUDA_SUCCESS - ? 
result_gpu - : throw new InvalidOperationException($"Failed to copy the source data on the given destination, [CUDA ERROR] {result}"); + Tensor.New(n, chw, out result); + source.CopyTo(result); } #endregion diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs index ef714cb..08221df 100644 --- a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs +++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs @@ -82,6 +82,9 @@ private int Secondary1x1Weights get => InputInfo.Channels * OperationInfo.Secondary1x1AfterPoolingConvolutionKernels; } + // A copy of the forward layer inputs + private Tensor _Inputs; + // 3x3 reduction 1x1 convolution activity private Tensor _3x3Reduce1x1Z; @@ -104,10 +107,10 @@ private int Secondary1x1Weights private Tensor PoolingZ; // Pooling output activation - private Tensor PoolingA; + private Tensor _PoolingA; // Pooling output delta - private Tensor PoolingDelta; + private Tensor _PoolingDelta; #endregion @@ -296,6 +299,8 @@ internal CuDnnInceptionLayer(in TensorInfo input, in InceptionInfo info, [NotNul /// public override void Forward(in Tensor x, out Tensor z, out Tensor a) { + _Inputs.TryFree(); + x.Duplicate(out _Inputs); Tensor.New(x.Entities, OutputInfo.Size, out z); Tensor.New(x.Entities, OutputInfo.Size, out a); using (DeviceMemory @@ -319,11 +324,11 @@ public override void Forward(in Tensor x, out Tensor z, out Tensor a) DnnInstance.ConvolutionForward(1, InputDescription, x_gpu.Ptr, _1x1FilterDescription, pw_gpu, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _1x1OutputDescription, y_gpu.Ptr); } DnnInstance.AddTensor(1, _1x1BiasDescription, pb_gpu, 1, _1x1OutputDescription, y_gpu.Ptr); - y_gpu.CopyToRows(z, 0, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels); + y_gpu.CopyTo(z, 0, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels); // 1x1 convolution activation DnnInstance.ActivationForward(x.Entities, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels, y_gpu.Ptr, y_gpu.Ptr, ActivationFunctions.Activation); - y_gpu.CopyToRows(a, 0, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels); + y_gpu.CopyTo(a, 0, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels); } // 1x1 + 3x3 convolution @@ -356,11 +361,11 @@ public override void Forward(in Tensor x, out Tensor z, out Tensor a) DnnInstance.ConvolutionForward(1, _3x3Reduce1x1OutputDescription, y1x1_gpu.Ptr, _3x3FilterDescription, pw_gpu += _3x3Reduce1x1Weights, _3x3ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _3x3OutputDescription, y_gpu.Ptr); } DnnInstance.AddTensor(1, _3x3BiasDescription, pb_gpu += OperationInfo.Primary3x3Reduce1x1ConvolutionKernels, 1, _3x3OutputDescription, y_gpu.Ptr); - y_gpu.CopyToRows(z, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels, InputInfo.SliceSize * OperationInfo.Secondary3x3ConvolutionKernels); + y_gpu.CopyTo(z, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels, InputInfo.SliceSize * OperationInfo.Secondary3x3ConvolutionKernels); // Activation DnnInstance.ActivationForward(x.Entities, InputInfo.SliceSize * OperationInfo.Secondary3x3ConvolutionKernels, y_gpu.Ptr, y_gpu.Ptr, ActivationFunctions.Activation); - y_gpu.CopyToRows(a, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels, InputInfo.SliceSize * OperationInfo.Secondary3x3ConvolutionKernels); + y_gpu.CopyTo(a, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels, InputInfo.SliceSize * 
OperationInfo.Secondary3x3ConvolutionKernels); } // 1x1 + 5x5 convolution @@ -393,11 +398,11 @@ public override void Forward(in Tensor x, out Tensor z, out Tensor a) DnnInstance.ConvolutionForward(1, _5x5Reduce1x1OutputDescription, y1x1_gpu.Ptr, _5x5FilterDescription, pw_gpu += _5x5Reduce1x1Weights, _5x5ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _5x5OutputDescription, y_gpu.Ptr); } DnnInstance.AddTensor(1, _5x5BiasDescription, pb_gpu += OperationInfo.Primary5x5Reduce1x1ConvolutionKernels, 1, _5x5OutputDescription, y_gpu.Ptr); - y_gpu.CopyToRows(z, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels); + y_gpu.CopyTo(z, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels); // Activation DnnInstance.ActivationForward(x.Entities, InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels, y_gpu.Ptr, y_gpu.Ptr, ActivationFunctions.Activation); - y_gpu.CopyToRows(a, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels); + y_gpu.CopyTo(a, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels); } // Pooling pipeline @@ -412,8 +417,8 @@ public override void Forward(in Tensor x, out Tensor z, out Tensor a) PoolingZ.TryFree(); y_gpu.CopyToHost(x.Entities, InputInfo.Size, out PoolingZ); DnnInstance.ActivationForward(x.Entities, x.Length, y_gpu.Ptr, y_gpu.Ptr, ActivationFunctions.Activation); - PoolingA.TryFree(); - y_gpu.CopyToHost(x.Entities, InputInfo.Size, out PoolingA); + _PoolingA.TryFree(); + y_gpu.CopyToHost(x.Entities, InputInfo.Size, out _PoolingA); // 1x1 convolution using (DeviceMemory _1x1Output_gpu = DnnInstance.Gpu.AllocateDevice(x.Entities * InputInfo.SliceSize * OperationInfo.Secondary1x1AfterPoolingConvolutionKernels)) @@ -426,11 +431,11 @@ public override void Forward(in Tensor x, out Tensor z, out Tensor a) DnnInstance.ConvolutionForward(1, InputDescription, y_gpu.Ptr, Secondary1x1FilterDescription, pw_gpu += _5x5Weights, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, Secondary1x1OutputDescription, _1x1Output_gpu.Ptr); } DnnInstance.AddTensor(1, Secondary1x1BiasDescription, pb_gpu += OperationInfo.Secondary5x5ConvolutionKernels, 1, Secondary1x1OutputDescription, _1x1Output_gpu.Ptr); - _1x1Output_gpu.CopyToRows(z, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Secondary5x5ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary1x1AfterPoolingConvolutionKernels); + _1x1Output_gpu.CopyTo(z, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Secondary5x5ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary1x1AfterPoolingConvolutionKernels); // 1x1 convolution activation DnnInstance.ActivationForward(x.Entities, InputInfo.SliceSize * OperationInfo.Secondary1x1AfterPoolingConvolutionKernels, _1x1Output_gpu.Ptr, _1x1Output_gpu.Ptr, ActivationFunctions.Activation); - _1x1Output_gpu.CopyToRows(a, InputInfo.SliceSize * 
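// Offset past the columns already written by the 1x1, 3x3 and 5x5 pipelines, so the pooling branch fills the last slot of the depth concatenation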
(OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Secondary5x5ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary1x1AfterPoolingConvolutionKernels); + _1x1Output_gpu.CopyTo(a, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Secondary5x5ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary1x1AfterPoolingConvolutionKernels); } } } @@ -443,7 +448,6 @@ public override void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFun dx_gpu = DnnInstance.Gpu.AllocateDevice(z.Size), w_gpu = DnnInstance.Gpu.AllocateDevice(Weights)) { - // First 1x1 convolution DnnInstance.GetConvolutionBackwardDataAlgorithm(_1x1FilterDescription, _1x1OutputDescription, _1x1ConvolutionDescription, InputDescription, ConvolutionBwdDataPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionBwdDataAlgo algorithm); DnnInstance.GetConvolutionBackwardDataWorkspaceSize(_1x1FilterDescription, _1x1OutputDescription, _1x1ConvolutionDescription, InputDescription, algorithm, out IntPtr size); @@ -523,14 +527,16 @@ public override void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFun deviceptr p1x1PoolingWeights_gpu = w_gpu.Ptr + _1x1Weights + _3x3Reduce1x1Weights + _3x3Weights + _5x5Reduce1x1Weights + _5x5Weights; DnnInstance.ConvolutionBackwardData(1, Secondary1x1FilterDescription, p1x1PoolingWeights_gpu, Secondary1x1OutputDescription, dy_gpu.Ptr, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, PoolingOutputDescription, poolDx_gpu.Ptr); DnnInstance.ActivationBackward(PoolingZ.Entities, PoolingZ.Length, poolDy_gpu.Ptr, poolDx_gpu.Ptr, ActivationFunctions.ActivationPrime); - PoolingDelta.TryFree(); - poolDy_gpu.CopyToHost(PoolingDelta.Entities, PoolingDelta.Length, out PoolingDelta); + _PoolingDelta.TryFree(); + poolDy_gpu.CopyToHost(_PoolingDelta.Entities, _PoolingDelta.Length, out _PoolingDelta); } // Pooling backward - using (DeviceMemory poolZ_gpu = DnnInstance.Gpu.AllocateDevice(PoolingZ)) + using (DeviceMemory + x_gpu = DnnInstance.Gpu.AllocateDevice(_Inputs), + poolZ_gpu = DnnInstance.Gpu.AllocateDevice(PoolingZ)) { - DnnInstance.PoolingBackward(PoolingDescription, 1, PoolingOutputDescription, poolZ_gpu.Ptr, PoolingOutputDescription, poolDy_gpu.Ptr, InputDescription, default, 1, InputDescription, dx_gpu.Ptr); // TODO: finish pooling backward + DnnInstance.PoolingBackward(PoolingDescription, 1, PoolingOutputDescription, poolZ_gpu.Ptr, PoolingOutputDescription, poolDy_gpu.Ptr, InputDescription, x_gpu.Ptr, 1, InputDescription, dx_gpu.Ptr); // TODO: finish pooling backward } } @@ -546,7 +552,136 @@ public override void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFun /// public override void ComputeGradient(in Tensor a, in Tensor delta, out Tensor dJdw, out Tensor dJdb) { - throw new NotImplementedException(); + Tensor.New(1, Weights.Length, out dJdw); + Tensor.New(1, Biases.Length, out dJdb); + using (DeviceMemory a_gpu = DnnInstance.Gpu.AllocateDevice(a)) + { + // 1x1 weights + using (DeviceMemory dy1x1_gpu = DnnInstance.Gpu.AllocateDevice(delta, 0, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels)) + { + DnnInstance.GetConvolutionBackwardFilterAlgorithm(InputDescription, _1x1OutputDescription, _1x1ConvolutionDescription, _1x1FilterDescription, ConvolutionBwdFilterPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionBwdFilterAlgo algorithm); + 
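// Each pipeline below repeats the same pattern: query the fastest backward-filter algorithm, size its workspace, compute dw with ConvolutionBackwardFilter and db with ConvolutionBackwardBias, then copy both into the flat dJdw and dJdb tensors at a running offset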
DnnInstance.GetConvolutionBackwardFilterWorkspaceSize(InputDescription, _1x1OutputDescription, _1x1ConvolutionDescription, _1x1FilterDescription, algorithm, out IntPtr size); + using (DeviceMemory dw_gpu = DnnInstance.Gpu.AllocateDevice(_1x1Weights)) + using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) + { + DnnInstance.ConvolutionBackwardFilter(1, InputDescription, a_gpu.Ptr, _1x1OutputDescription, dy1x1_gpu.Ptr, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _1x1FilterDescription, dw_gpu.Ptr); + dw_gpu.CopyTo(dJdw, 0, _1x1Weights); + } + + // 1x1 bias + using (DeviceMemory db_gpu = DnnInstance.Gpu.AllocateDevice(OperationInfo.Primary1x1ConvolutionKernels)) + { + DnnInstance.ConvolutionBackwardBias(1, _1x1OutputDescription, dy1x1_gpu.Ptr, 0, _1x1BiasDescription, db_gpu.Ptr); + db_gpu.CopyTo(dJdb, 0, OperationInfo.Primary1x1ConvolutionKernels); + } + } + + // 3x3 reduce 1x1 weights + using (DeviceMemory dy3x3Reduce1x1_gpu = DnnInstance.Gpu.AllocateDevice(_3x3Reduce1x1A)) + { + DnnInstance.GetConvolutionBackwardFilterAlgorithm(InputDescription, _3x3Reduce1x1OutputDescription, _1x1ConvolutionDescription, _3x3Reduce1x1FilterDescription, ConvolutionBwdFilterPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionBwdFilterAlgo algorithm); + DnnInstance.GetConvolutionBackwardFilterWorkspaceSize(InputDescription, _3x3Reduce1x1OutputDescription, _1x1ConvolutionDescription, _3x3Reduce1x1FilterDescription, algorithm, out IntPtr size); + using (DeviceMemory dw_gpu = DnnInstance.Gpu.AllocateDevice(_3x3Reduce1x1Weights)) + using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) + { + DnnInstance.ConvolutionBackwardFilter(1, InputDescription, a_gpu.Ptr, _3x3Reduce1x1OutputDescription, dy3x3Reduce1x1_gpu.Ptr, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _3x3Reduce1x1FilterDescription, dw_gpu.Ptr); + dw_gpu.CopyTo(dJdw, _1x1Weights, _3x3Reduce1x1Weights); + } + + // 3x3 reduce 1x1 bias + using (DeviceMemory db_gpu = DnnInstance.Gpu.AllocateDevice(OperationInfo.Primary3x3Reduce1x1ConvolutionKernels)) + { + DnnInstance.ConvolutionBackwardBias(1, _3x3Reduce1x1OutputDescription, dy3x3Reduce1x1_gpu.Ptr, 0, _3x3Reduce1x1BiasDescription, db_gpu.Ptr); + db_gpu.CopyTo(dJdb, OperationInfo.Primary1x1ConvolutionKernels, OperationInfo.Primary3x3Reduce1x1ConvolutionKernels); + } + } + + // 5x5 reduce 1x1 weights + using (DeviceMemory dy5x5Reduce1x1_gpu = DnnInstance.Gpu.AllocateDevice(_5x5Reduce1x1A)) + { + DnnInstance.GetConvolutionBackwardFilterAlgorithm(InputDescription, _5x5Reduce1x1OutputDescription, _1x1ConvolutionDescription, _5x5Reduce1x1FilterDescription, ConvolutionBwdFilterPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionBwdFilterAlgo algorithm); + DnnInstance.GetConvolutionBackwardFilterWorkspaceSize(InputDescription, _5x5Reduce1x1OutputDescription, _1x1ConvolutionDescription, _5x5Reduce1x1FilterDescription, algorithm, out IntPtr size); + using (DeviceMemory dw_gpu = DnnInstance.Gpu.AllocateDevice(_5x5Reduce1x1Weights)) + using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) + { + DnnInstance.ConvolutionBackwardFilter(1, InputDescription, a_gpu.Ptr, _5x5Reduce1x1OutputDescription, dy5x5Reduce1x1_gpu.Ptr, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _5x5Reduce1x1FilterDescription, dw_gpu.Ptr); + dw_gpu.CopyTo(dJdw, _1x1Weights + _3x3Reduce1x1Weights + _3x3Weights, _5x5Reduce1x1Weights); + } + + // 5x5 reduce 1x1 bias + using (DeviceMemory db_gpu = 
DnnInstance.Gpu.AllocateDevice(OperationInfo.Primary5x5Reduce1x1ConvolutionKernels)) + { + DnnInstance.ConvolutionBackwardBias(1, _5x5Reduce1x1OutputDescription, dy5x5Reduce1x1_gpu.Ptr, 0, _5x5Reduce1x1BiasDescription, db_gpu.Ptr); + db_gpu.CopyTo(dJdb, OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Primary3x3Reduce1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels, OperationInfo.Primary5x5Reduce1x1ConvolutionKernels); + } + } + } + + // 3x3 weights + using (DeviceMemory dy3x3_gpu = DnnInstance.Gpu.AllocateDevice(delta, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels, InputInfo.SliceSize * OperationInfo.Secondary3x3ConvolutionKernels)) + { + DnnInstance.GetConvolutionBackwardFilterAlgorithm(_3x3Reduce1x1OutputDescription, _3x3OutputDescription, _3x3ConvolutionDescription, _3x3FilterDescription, ConvolutionBwdFilterPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionBwdFilterAlgo algorithm); + DnnInstance.GetConvolutionBackwardFilterWorkspaceSize(_3x3Reduce1x1OutputDescription, _3x3OutputDescription, _3x3ConvolutionDescription, _3x3FilterDescription, algorithm, out IntPtr size); + using (DeviceMemory + a3x3Reduce1x1_gpu = DnnInstance.Gpu.AllocateDevice(_3x3Reduce1x1A), + dw_gpu = DnnInstance.Gpu.AllocateDevice(_3x3Weights)) + using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) + { + DnnInstance.ConvolutionBackwardFilter(1, _3x3Reduce1x1OutputDescription, a3x3Reduce1x1_gpu.Ptr, _3x3OutputDescription, dy3x3_gpu.Ptr, _3x3ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _3x3FilterDescription, dw_gpu.Ptr); + dw_gpu.CopyTo(dJdw, _1x1Weights + _3x3Reduce1x1Weights, _3x3Weights); + } + + // 3x3 bias + using (DeviceMemory db_gpu = DnnInstance.Gpu.AllocateDevice(OperationInfo.Secondary3x3ConvolutionKernels)) + { + DnnInstance.ConvolutionBackwardBias(1, _3x3OutputDescription, dy3x3_gpu.Ptr, 0, _3x3BiasDescription, db_gpu.Ptr); + db_gpu.CopyTo(dJdb, OperationInfo.Primary1x1ConvolutionKernels, OperationInfo.Secondary3x3ConvolutionKernels); + } + } + + // 5x5 weights + using (DeviceMemory dy5x5_gpu = DnnInstance.Gpu.AllocateDevice(delta, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels)) + { + DnnInstance.GetConvolutionBackwardFilterAlgorithm(_5x5Reduce1x1OutputDescription, _5x5OutputDescription, _5x5ConvolutionDescription, _5x5FilterDescription, ConvolutionBwdFilterPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionBwdFilterAlgo algorithm); + DnnInstance.GetConvolutionBackwardFilterWorkspaceSize(_5x5Reduce1x1OutputDescription, _5x5OutputDescription, _5x5ConvolutionDescription, _5x5FilterDescription, algorithm, out IntPtr size); + using (DeviceMemory + a5x5Reduce1x1_gpu = DnnInstance.Gpu.AllocateDevice(_5x5Reduce1x1A), + dw_gpu = DnnInstance.Gpu.AllocateDevice(_5x5Weights)) + using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) + { + DnnInstance.ConvolutionBackwardFilter(1, _5x5Reduce1x1OutputDescription, a5x5Reduce1x1_gpu.Ptr, _5x5OutputDescription, dy5x5_gpu.Ptr, _5x5ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _5x5FilterDescription, dw_gpu.Ptr); + dw_gpu.CopyTo(dJdw, _1x1Weights + _3x3Reduce1x1Weights + _3x3Weights + _5x5Reduce1x1Weights, _5x5Weights); + } + + // 5x5 bias + using (DeviceMemory db_gpu = DnnInstance.Gpu.AllocateDevice(OperationInfo.Secondary5x5ConvolutionKernels)) + { + DnnInstance.ConvolutionBackwardBias(1, 
_5x5OutputDescription, dy5x5_gpu.Ptr, 0, _5x5BiasDescription, db_gpu.Ptr); + db_gpu.CopyTo(dJdb, OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Primary3x3Reduce1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Primary5x5Reduce1x1ConvolutionKernels, OperationInfo.Secondary5x5ConvolutionKernels); + } + } + + // Pooling 1x1 convolution + using (DeviceMemory dy1x1Pool_gpu = DnnInstance.Gpu.AllocateDevice(delta, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Secondary5x5ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary1x1AfterPoolingConvolutionKernels)) + { + DnnInstance.GetConvolutionBackwardFilterAlgorithm(PoolingOutputDescription, Secondary1x1OutputDescription, _1x1ConvolutionDescription, Secondary1x1FilterDescription, ConvolutionBwdFilterPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionBwdFilterAlgo algorithm); + DnnInstance.GetConvolutionBackwardFilterWorkspaceSize(PoolingOutputDescription, Secondary1x1OutputDescription, _1x1ConvolutionDescription, Secondary1x1FilterDescription, algorithm, out IntPtr size); + using (DeviceMemory + aPool_gpu = DnnInstance.Gpu.AllocateDevice(_PoolingA), + dw_gpu = DnnInstance.Gpu.AllocateDevice(Secondary1x1Weights)) + using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) + { + DnnInstance.ConvolutionBackwardFilter(1, PoolingOutputDescription, aPool_gpu.Ptr, Secondary1x1OutputDescription, dy1x1Pool_gpu.Ptr, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, Secondary1x1FilterDescription, dw_gpu.Ptr); + dw_gpu.CopyTo(dJdw, _1x1Weights + _3x3Reduce1x1Weights + _3x3Weights + _5x5Reduce1x1Weights + _5x5Weights, Secondary1x1Weights); + } + + // Pooling 1x1 bias + using (DeviceMemory db_gpu = DnnInstance.Gpu.AllocateDevice(OperationInfo.Secondary1x1AfterPoolingConvolutionKernels)) + { + DnnInstance.ConvolutionBackwardBias(1, PoolingOutputDescription, dy1x1Pool_gpu.Ptr, 0, Secondary1x1BiasDescription, db_gpu.Ptr); + db_gpu.CopyTo(dJdb, OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Primary3x3Reduce1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Primary5x5Reduce1x1ConvolutionKernels + OperationInfo.Secondary5x5ConvolutionKernels, OperationInfo.Secondary1x1AfterPoolingConvolutionKernels); + } + } } #endregion @@ -568,6 +703,7 @@ void IDisposable.Dispose() // Private Dispose method private void Dispose() { + _Inputs.TryFree(); _3x3Reduce1x1Z.TryFree(); _3x3Reduce1x1A.TryFree(); _3x3Reduce1x1Delta.TryFree(); @@ -575,8 +711,8 @@ private void Dispose() _5x5Reduce1x1A.TryFree(); _5x5Reduce1x1Delta.TryFree(); PoolingZ.TryFree(); - PoolingA.TryFree(); - PoolingDelta.TryFree(); + _PoolingA.TryFree(); + _PoolingDelta.TryFree(); } #endregion diff --git a/Unit/NeuralNetwork.NET.Cuda.Unit/GpuExtensionsTest.cs b/Unit/NeuralNetwork.NET.Cuda.Unit/GpuExtensionsTest.cs index 6676490..9531146 100644 --- a/Unit/NeuralNetwork.NET.Cuda.Unit/GpuExtensionsTest.cs +++ b/Unit/NeuralNetwork.NET.Cuda.Unit/GpuExtensionsTest.cs @@ -21,7 +21,7 @@ public void CopyToRows() Gpu gpu = Gpu.Default; using (DeviceMemory m_gpu = gpu.AllocateDevice(test)) { - m_gpu.CopyToRows(tensor, 5, 3); + m_gpu.CopyTo(tensor, 5, 3); } float[,] expected = { From ceae801ab24c58d24e4b0bdb783de25019b8b390 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 26 Dec 2017 17:53:49 +0100 Subject: [PATCH 18/30] Inception layer public API and serialization methods added --- 
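A note for reviewers: the new factory mirrors the other CuDnnNetworkLayers entries, so an inception module slots into a network definition like any other layer. A minimal sketch with a hypothetical BuildInceptionModule helper (the input and info values are assumed to come from the caller, since the exact InceptionInfo.New argument list changed earlier in this series):

// Hypothetical helper, relying only on the factory added below; bias initialization keeps its default
public static INetworkLayer BuildInceptionModule(in TensorInfo input, in InceptionInfo info)
    => CuDnnNetworkLayers.Inception(input, info);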
.../APIs/CuDnnNetworkLayers.cs | 13 ++++++++ .../APIs/CuDnnNetworkLayersDeserializer.cs | 1 + .../Layers/CuDnnInceptionLayer.cs | 30 +++++++++++++++++++ 3 files changed, 44 insertions(+) diff --git a/NeuralNetwork.NET.Cuda/APIs/CuDnnNetworkLayers.cs b/NeuralNetwork.NET.Cuda/APIs/CuDnnNetworkLayers.cs index 13c6ab2..1b6f1d6 100644 --- a/NeuralNetwork.NET.Cuda/APIs/CuDnnNetworkLayers.cs +++ b/NeuralNetwork.NET.Cuda/APIs/CuDnnNetworkLayers.cs @@ -67,5 +67,18 @@ public static INetworkLayer Convolutional( [PublicAPI] [Pure, NotNull] public static INetworkLayer Pooling(in TensorInfo input, in PoolingInfo info, ActivationFunctionType activation) => new CuDnnPoolingLayer(input, info, activation); + + /// + /// Creates a new inception layer with the given input and features + /// + /// The input volume to process + /// The info on the operations to execute inside the layer + /// Indicates the desired initialization mode to use for the layer bias values + [PublicAPI] + [Pure, NotNull] + public static INetworkLayer Inception( + in TensorInfo input, in InceptionInfo info, + BiasInitializationMode biasMode = BiasInitializationMode.Zero) + => new CuDnnInceptionLayer(input, info, biasMode); } } \ No newline at end of file diff --git a/NeuralNetwork.NET.Cuda/APIs/CuDnnNetworkLayersDeserializer.cs b/NeuralNetwork.NET.Cuda/APIs/CuDnnNetworkLayersDeserializer.cs index dfafc6a..fda4c29 100644 --- a/NeuralNetwork.NET.Cuda/APIs/CuDnnNetworkLayersDeserializer.cs +++ b/NeuralNetwork.NET.Cuda/APIs/CuDnnNetworkLayersDeserializer.cs @@ -31,6 +31,7 @@ private static INetworkLayer Deserialize([NotNull] Stream stream, LayerType type case LayerType.Convolutional: return CuDnnConvolutionalLayer.Deserialize(stream); case LayerType.Pooling: return CuDnnPoolingLayer.Deserialize(stream); case LayerType.Softmax: return CuDnnSoftmaxLayer.Deserialize(stream); + case LayerType.Inception: return CuDnnInceptionLayer.Deserialize(stream); default: return null; } } diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs index 08221df..3b60619 100644 --- a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs +++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs @@ -6,6 +6,7 @@ using NeuralNetworkNET.APIs.Structs; using NeuralNetworkNET.Cuda.Extensions; using NeuralNetworkNET.Cuda.Services; +using NeuralNetworkNET.Extensions; using NeuralNetworkNET.Networks.Activations; using NeuralNetworkNET.Networks.Activations.Delegates; using NeuralNetworkNET.Networks.Implementations.Layers.Abstract; @@ -686,9 +687,38 @@ public override void ComputeGradient(in Tensor a, in Tensor delta, out Tensor dJ #endregion + #region Misc + /// public override INetworkLayer Clone() => new CuDnnInceptionLayer(InputInfo, OperationInfo, Weights, Biases); + /// + public override void Serialize(System.IO.Stream stream) + { + base.Serialize(stream); + stream.Write(OperationInfo); + } + + /// + /// Tries to deserialize a new from the input + /// + /// The input to use to read the layer data + [MustUseReturnValue, CanBeNull] + public static INetworkLayer Deserialize([NotNull] System.IO.Stream stream) + { + if (!stream.TryRead(out TensorInfo input)) return null; + if (!stream.TryRead(out _)) return null; + if (!stream.TryRead(out _)) return null; + if (!stream.TryRead(out int wLength)) return null; + float[] weights = stream.ReadUnshuffled(wLength); + if (!stream.TryRead(out int bLength)) return null; + float[] biases = stream.ReadUnshuffled(bLength); + if (!stream.TryRead(out InceptionInfo info)) 
return null; + return new CuDnnInceptionLayer(input, info, weights, biases); + } + + #endregion + #region IDisposable ~CuDnnInceptionLayer() => Dispose(); From 393d01a73df9c2920fca4c679acfdc0b7c7d1001 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 26 Dec 2017 19:02:59 +0100 Subject: [PATCH 19/30] Minor fixes to the inception layer (WIP) --- NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs index 3b60619..7394aaf 100644 --- a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs +++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs @@ -241,12 +241,13 @@ private int Secondary1x1Weights /// Gets the instance for the current layer /// [NotNull] - private readonly Dnn DnnInstance = DnnService.Instance; + private readonly Dnn DnnInstance = null; // cuDNN fields setup private void SetupCuDnnInfo() { // First 1x1 convolution + _1x1ConvolutionDescription.Set2D(0, 0, 1, 1, 1, 1, Alea.cuDNN.ConvolutionMode.CROSS_CORRELATION); _1x1FilterDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, _OperationInfo.Primary1x1ConvolutionKernels, InputInfo.Channels, 1, 1); _1x1BiasDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, 1, _OperationInfo.Primary1x1ConvolutionKernels, 1, 1); @@ -272,7 +273,7 @@ private void SetupCuDnnInfo() PoolingDescription.Set2D(Alea.cuDNN.PoolingMode.AVERAGE_COUNT_EXCLUDE_PADDING, NanPropagation.PROPAGATE_NAN, 3, 3, 1, 1, 1, 1); // Secondary 1x1 convolution - Secondary1x1FilterDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, InputInfo.Channels, _OperationInfo.Secondary1x1AfterPoolingConvolutionKernels, 1, 1); + Secondary1x1FilterDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, _OperationInfo.Secondary1x1AfterPoolingConvolutionKernels, InputInfo.Channels, 1, 1); Secondary1x1BiasDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, 1, _OperationInfo.Secondary1x1AfterPoolingConvolutionKernels, 1, 1); } @@ -424,7 +425,7 @@ public override void Forward(in Tensor x, out Tensor z, out Tensor a) // 1x1 convolution using (DeviceMemory _1x1Output_gpu = DnnInstance.Gpu.AllocateDevice(x.Entities * InputInfo.SliceSize * OperationInfo.Secondary1x1AfterPoolingConvolutionKernels)) { - Secondary1x1OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, InputInfo.Channels, InputInfo.Height, InputInfo.Width); + Secondary1x1OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OperationInfo.Secondary1x1AfterPoolingConvolutionKernels, InputInfo.Height, InputInfo.Width); DnnInstance.GetConvolutionForwardAlgorithm(InputDescription, Secondary1x1FilterDescription, _1x1ConvolutionDescription, Secondary1x1OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionFwdAlgo algorithm); DnnInstance.GetConvolutionForwardWorkspaceSize(InputDescription, Secondary1x1FilterDescription, _1x1ConvolutionDescription, Secondary1x1OutputDescription, algorithm, out IntPtr size); using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) From 84502d3f363945459572eb654dd34dba689816ab Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 26 Dec 2017 19:06:03 +0100 Subject: [PATCH 20/30] Ooops! 
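
Restores the DnnInstance field to DnnService.Instance after the previous WIP commit accidentally left it assigned to null.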
--- NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs index 7394aaf..345edc2 100644 --- a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs +++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs @@ -241,7 +241,7 @@ private int Secondary1x1Weights /// Gets the instance for the current layer /// [NotNull] - private readonly Dnn DnnInstance = null; + private readonly Dnn DnnInstance = DnnService.Instance; // cuDNN fields setup private void SetupCuDnnInfo() From bfe4a04f8d3d6f961000d3d4a50a2d6cf779583a Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 26 Dec 2017 19:31:45 +0100 Subject: [PATCH 21/30] More fixes to the inception layer (WIP) --- .../Layers/CuDnnInceptionLayer.cs | 10 +++++----- .../Layers/Helpers/WeightsProvider.cs | 14 +++++++++++++- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs index 345edc2..c3a12b8 100644 --- a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs +++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs @@ -516,7 +516,7 @@ public override void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFun } // Pooling - using (DeviceMemory poolDy_gpu = DnnInstance.Gpu.AllocateDevice(PoolingZ)) + using (DeviceMemory pooldy_gpu = DnnInstance.Gpu.AllocateDevice(PoolingZ)) { // 1x1 backward DnnInstance.GetConvolutionBackwardDataAlgorithm(Secondary1x1FilterDescription, Secondary1x1OutputDescription, _1x1ConvolutionDescription, PoolingOutputDescription, ConvolutionBwdDataPreference.PREFER_FASTEST, IntPtr.Zero, out algorithm); @@ -528,9 +528,9 @@ public override void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFun { deviceptr p1x1PoolingWeights_gpu = w_gpu.Ptr + _1x1Weights + _3x3Reduce1x1Weights + _3x3Weights + _5x5Reduce1x1Weights + _5x5Weights; DnnInstance.ConvolutionBackwardData(1, Secondary1x1FilterDescription, p1x1PoolingWeights_gpu, Secondary1x1OutputDescription, dy_gpu.Ptr, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, PoolingOutputDescription, poolDx_gpu.Ptr); - DnnInstance.ActivationBackward(PoolingZ.Entities, PoolingZ.Length, poolDy_gpu.Ptr, poolDx_gpu.Ptr, ActivationFunctions.ActivationPrime); + DnnInstance.ActivationBackward(PoolingZ.Entities, PoolingZ.Length, pooldy_gpu.Ptr, poolDx_gpu.Ptr, ActivationFunctions.ActivationPrime); _PoolingDelta.TryFree(); - poolDy_gpu.CopyToHost(_PoolingDelta.Entities, _PoolingDelta.Length, out _PoolingDelta); + pooldy_gpu.CopyToHost(PoolingZ.Entities, PoolingZ.Length, out _PoolingDelta); } // Pooling backward @@ -538,7 +538,7 @@ public override void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFun x_gpu = DnnInstance.Gpu.AllocateDevice(_Inputs), poolZ_gpu = DnnInstance.Gpu.AllocateDevice(PoolingZ)) { - DnnInstance.PoolingBackward(PoolingDescription, 1, PoolingOutputDescription, poolZ_gpu.Ptr, PoolingOutputDescription, poolDy_gpu.Ptr, InputDescription, x_gpu.Ptr, 1, InputDescription, dx_gpu.Ptr); // TODO: finish pooling backward + DnnInstance.PoolingBackward(PoolingDescription, 1, PoolingOutputDescription, poolZ_gpu.Ptr, PoolingOutputDescription, pooldy_gpu.Ptr, InputDescription, x_gpu.Ptr, 1, InputDescription, dx_gpu.Ptr); // TODO: finish pooling backward } } @@ -680,7 +680,7 @@ public override void ComputeGradient(in Tensor a, in Tensor delta, out Tensor dJ // Pooling 
1x1 bias using (DeviceMemory db_gpu = DnnInstance.Gpu.AllocateDevice(OperationInfo.Secondary1x1AfterPoolingConvolutionKernels)) { - DnnInstance.ConvolutionBackwardBias(1, PoolingOutputDescription, dy1x1Pool_gpu.Ptr, 0, Secondary1x1BiasDescription, db_gpu.Ptr); + DnnInstance.ConvolutionBackwardBias(1, Secondary1x1OutputDescription, dy1x1Pool_gpu.Ptr, 0, Secondary1x1BiasDescription, db_gpu.Ptr); db_gpu.CopyTo(dJdb, OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Primary3x3Reduce1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Primary5x5Reduce1x1ConvolutionKernels + OperationInfo.Secondary5x5ConvolutionKernels, OperationInfo.Secondary1x1AfterPoolingConvolutionKernels); } } diff --git a/NeuralNetwork.NET/Networks/Implementations/Layers/Helpers/WeightsProvider.cs b/NeuralNetwork.NET/Networks/Implementations/Layers/Helpers/WeightsProvider.cs index 572ef96..23cd6be 100644 --- a/NeuralNetwork.NET/Networks/Implementations/Layers/Helpers/WeightsProvider.cs +++ b/NeuralNetwork.NET/Networks/Implementations/Layers/Helpers/WeightsProvider.cs @@ -76,6 +76,7 @@ public static unsafe float[] NewConvolutionalKernels(in TensorInfo input, int ke [Pure, NotNull] public static unsafe float[] NewInceptionWeights(in TensorInfo input, in InceptionInfo info) { + // Setup int _1x1Length = input.Channels * info.Primary1x1ConvolutionKernels, _3x3Reduce1x1Length = input.Channels * info.Primary3x3Reduce1x1ConvolutionKernels, @@ -83,19 +84,30 @@ public static unsafe float[] NewInceptionWeights(in TensorInfo input, in Incepti _5x5Reduce1x1Length = input.Channels * info.Primary5x5Reduce1x1ConvolutionKernels, _5x5Length = 5 * 5 * info.Primary5x5Reduce1x1ConvolutionKernels * info.Secondary5x5ConvolutionKernels, secondary1x1Length = input.Channels * info.Secondary1x1AfterPoolingConvolutionKernels; - float[] weights = new float[_1x1Length + _3x3Length + _5x5Length + secondary1x1Length]; + float[] weights = new float[_1x1Length + _3x3Reduce1x1Length + _3x3Length + _5x5Reduce1x1Length + _5x5Length + secondary1x1Length]; fixed (float* pw = weights) { + // 1x1 Tensor.Reshape(pw, 1, _1x1Length, out Tensor wTensor); KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, input.Channels); + + // 3x3 reduce 1x1 Tensor.Reshape(pw + _1x1Length, 1, _3x3Reduce1x1Length, out wTensor); KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, input.Channels); + + // 3x3 Tensor.Reshape(pw + _1x1Length + _3x3Reduce1x1Length, 1, _3x3Length, out wTensor); KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, 3 * 3 * info.Primary3x3Reduce1x1ConvolutionKernels); + + // 5x5 reduce 1x1 Tensor.Reshape(pw + _1x1Length + _3x3Reduce1x1Length + _3x3Length, 1, _5x5Reduce1x1Length, out wTensor); KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, input.Channels); + + // 5x5 Tensor.Reshape(pw + _1x1Length + _3x3Reduce1x1Length + _3x3Length + _5x5Reduce1x1Length, 1, _5x5Length, out wTensor); KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, 5 * 5 * info.Primary5x5Reduce1x1ConvolutionKernels); + + // Pool 1x1 Tensor.Reshape(pw + _1x1Length + _3x3Reduce1x1Length + _3x3Length + _5x5Reduce1x1Length + _5x5Length, 1, secondary1x1Length, out wTensor); KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, input.Channels); } From 096994ad605bca5aab0d2f92fc2860d38e330de8 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 26 Dec 2017 23:34:57 +0100 Subject: [PATCH 22/30] Fixed convolution output size --- NeuralNetwork.NET/APIs/Structs/ConvolutionInfo.cs | 15 +++++++++++++++ .../Implementations/Layers/ConvolutionalLayer.cs | 2 +- 2 
files changed, 16 insertions(+), 1 deletion(-) diff --git a/NeuralNetwork.NET/APIs/Structs/ConvolutionInfo.cs b/NeuralNetwork.NET/APIs/Structs/ConvolutionInfo.cs index 49a34f8..cb8c7ad 100644 --- a/NeuralNetwork.NET/APIs/Structs/ConvolutionInfo.cs +++ b/NeuralNetwork.NET/APIs/Structs/ConvolutionInfo.cs @@ -75,6 +75,21 @@ public static ConvolutionInfo New( #endregion + /// + /// Calculates the output size after applying a convolution operation to the input tensor + /// + /// The info on the input tensor + /// The size of the convolution kernels + /// The number of convolution kernels to be used + [Pure] + internal TensorInfo GetForwardOutputTensorInfo(in TensorInfo input, (int X, int Y) field, int kernels) + { + int + h = (input.Height - field.X + 2 * VerticalPadding) / VerticalStride + 1, + w = (input.Width - field.Y + 2 * HorizontalPadding) / HorizontalStride + 1; + return new TensorInfo(h, w, kernels); + } + #region Equality /// diff --git a/NeuralNetwork.NET/Networks/Implementations/Layers/ConvolutionalLayer.cs b/NeuralNetwork.NET/Networks/Implementations/Layers/ConvolutionalLayer.cs index 922cf50..4f28de0 100644 --- a/NeuralNetwork.NET/Networks/Implementations/Layers/ConvolutionalLayer.cs +++ b/NeuralNetwork.NET/Networks/Implementations/Layers/ConvolutionalLayer.cs @@ -58,7 +58,7 @@ public ref readonly TensorInfo KernelInfo #endregion public ConvolutionalLayer(in TensorInfo input, in ConvolutionInfo operation, (int X, int Y) kernelSize, int kernels, ActivationFunctionType activation, BiasInitializationMode biasMode) - : base(input, new TensorInfo(input.Height - kernelSize.X + 1, input.Width - kernelSize.Y + 1, kernels), + : base(input, operation.GetForwardOutputTensorInfo(input, kernelSize, kernels), WeightsProvider.NewConvolutionalKernels(input, kernelSize.X, kernelSize.Y, kernels), WeightsProvider.NewBiases(kernels, biasMode), activation) { From fa6c36d874f6a77cf2642702c7d5676d9ab80747 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Tue, 26 Dec 2017 23:35:34 +0100 Subject: [PATCH 23/30] Added initial inception layer tests --- .../Layers/CuDnnInceptionLayer.cs | 2 +- .../CuDnnLayersTest.cs | 77 +++++++++++++++++++ 2 files changed, 78 insertions(+), 1 deletion(-) diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs index c3a12b8..a49c6e5 100644 --- a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs +++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs @@ -406,7 +406,7 @@ public override void Forward(in Tensor x, out Tensor z, out Tensor a) DnnInstance.ActivationForward(x.Entities, InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels, y_gpu.Ptr, y_gpu.Ptr, ActivationFunctions.Activation); y_gpu.CopyTo(a, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels); } - + // Pooling pipeline PoolingOutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, InputInfo.Channels, InputInfo.Height, InputInfo.Width); using (DeviceMemory y_gpu = DnnInstance.Gpu.AllocateDevice(x.Size)) diff --git a/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnLayersTest.cs b/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnLayersTest.cs index 790a697..dd473b5 100644 --- a/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnLayersTest.cs +++ b/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnLayersTest.cs @@ -8,6 +8,8 @@ using NeuralNetworkNET.Networks.Implementations.Layers; using 
NeuralNetworkNET.Networks.Implementations.Layers.Abstract; using NeuralNetworkNET.Networks.Implementations.Layers.Helpers; +using System; +using System.Runtime.CompilerServices; namespace NeuralNetworkNET.Cuda.Unit { @@ -248,5 +250,80 @@ public void PoolingBackward() } #endregion + + #region Inception + + [TestMethod] + public unsafe void InceptionForward1x1() + { + float[,] x = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(10), 32 * 32 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(10, 32 * 32 * 3); + CuDnnConvolutionalLayer conv = new CuDnnConvolutionalLayer(TensorInfo.CreateForRgbImage(32, 32), ConvolutionInfo.New(ConvolutionMode.CrossCorrelation), (1, 1), 10, ActivationFunctionType.ReLU, BiasInitializationMode.Gaussian); + CuDnnInceptionLayer inception = new CuDnnInceptionLayer(conv.InputInfo, InceptionInfo.New(10, 10, 10, 10, 10, PoolingMode.Max, 10)); + Buffer.BlockCopy(conv.Weights, 0, inception.Weights, 0, sizeof(float) * conv.Weights.Length); + Buffer.BlockCopy(conv.Biases, 0, inception.Biases, 0, sizeof(float) * conv.Biases.Length); + fixed (float* px = x) + { + // Forward + Z + Tensor.Reshape(px, x.GetLength(0), x.GetLength(1), out Tensor xTensor); + conv.Forward(xTensor, out Tensor zConv, out Tensor aConv); + inception.Forward(xTensor, out Tensor zInc, out Tensor aInc); + Tensor.New(zConv.Entities, zConv.Length, out Tensor reshaped); + float* pzInc = (float*)zInc.Ptr.ToPointer(), preshaped = (float*)reshaped.Ptr.ToPointer(); + for (int i = 0; i < zConv.Entities; i++) + Buffer.MemoryCopy(pzInc + i * zInc.Length, preshaped + i * zConv.Length, sizeof(float) * zConv.Length, sizeof(float) * zConv.Length); + Assert.IsTrue(reshaped.ContentEquals(zConv)); + + // A + float* paInc = (float*)aInc.Ptr.ToPointer(); + for (int i = 0; i < aConv.Entities; i++) + Buffer.MemoryCopy(paInc + i * aInc.Length, preshaped + i * aConv.Length, sizeof(float) * aConv.Length, sizeof(float) * aConv.Length); + Assert.IsTrue(reshaped.ContentEquals(aConv)); + zConv.Free(); + aConv.Free(); + zInc.Free(); + aInc.Free(); + reshaped.Free(); + } + } + + [TestMethod] + public unsafe void InceptionForward3x3Pipeline() + { + float[,] x = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(10), 32 * 32 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(10, 32 * 32 * 3); + CuDnnConvolutionalLayer + conv1 = new CuDnnConvolutionalLayer(TensorInfo.CreateForRgbImage(32, 32), ConvolutionInfo.New(ConvolutionMode.CrossCorrelation), (1, 1), 10, ActivationFunctionType.ReLU, BiasInitializationMode.Gaussian), + conv2 = new CuDnnConvolutionalLayer(conv1.OutputInfo, ConvolutionInfo.New(ConvolutionMode.CrossCorrelation, 1, 1), (3, 3), 10, ActivationFunctionType.ReLU, BiasInitializationMode.Gaussian); + CuDnnInceptionLayer inception = new CuDnnInceptionLayer(TensorInfo.CreateForRgbImage(32, 32), InceptionInfo.New(10, 10, 10, 10, 10, PoolingMode.Max, 10)); + fixed (float* pw = inception.Weights) + Unsafe.InitBlock(pw, 0, (uint)(sizeof(float) * inception.Weights.Length)); + Buffer.BlockCopy(conv1.Weights, 0, inception.Weights, sizeof(float) * 3 * 10, sizeof(float) * conv1.Weights.Length); + Buffer.BlockCopy(conv2.Weights, 0, inception.Weights, sizeof(float) * 3 * 10 + sizeof(float) * conv1.Weights.Length, sizeof(float) * conv2.Weights.Length); + Buffer.BlockCopy(conv1.Biases, 0, inception.Biases, sizeof(float) * 10, sizeof(float) * conv1.Biases.Length); + Buffer.BlockCopy(conv2.Biases, 0, inception.Biases, sizeof(float) * 20, sizeof(float) * conv2.Biases.Length); + fixed (float* px = x) + { + 
Tensor.Reshape(px, x.GetLength(0), x.GetLength(1), out Tensor xTensor); + conv1.Forward(xTensor, out Tensor zTemp, out Tensor aTemp); + zTemp.Free(); + conv2.Forward(aTemp, out Tensor zConv, out Tensor aConv); + inception.Forward(xTensor, out Tensor zInc, out Tensor aInc); + Tensor.New(zConv.Entities, zConv.Length, out Tensor reshaped); + float* pzInc = (float*)zInc.Ptr.ToPointer() + 32 * 32 * 10, preshaped = (float*)reshaped.Ptr.ToPointer(); + for (int i = 0; i < zConv.Entities; i++) + Buffer.MemoryCopy(pzInc + i * zInc.Length, preshaped + i * zConv.Length, sizeof(float) * zConv.Length, sizeof(float) * zConv.Length); + Assert.IsTrue(reshaped.ContentEquals(zConv)); + zConv.Free(); + zInc.Free(); + float* paInc = (float*)aInc.Ptr.ToPointer() + 32 * 32 * 10; + for (int i = 0; i < aConv.Entities; i++) + Buffer.MemoryCopy(paInc + i * aInc.Length, preshaped + i * aConv.Length, sizeof(float) * aConv.Length, sizeof(float) * aConv.Length); + Assert.IsTrue(reshaped.ContentEquals(aConv)); + aConv.Free(); + aInc.Free(); + reshaped.Free(); + } + } + + #endregion } } From b3e136c225057a76d4658ed0b230c0a132909bf1 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Wed, 27 Dec 2017 18:22:48 +0100 Subject: [PATCH 24/30] Minor bug fixes, inception 5x5 test added --- .../Layers/CuDnnInceptionLayer.cs | 6 +-- .../CuDnnLayersTest.cs | 39 +++++++++++++++++++ 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs index a49c6e5..e336740 100644 --- a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs +++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs @@ -382,7 +382,7 @@ public override void Forward(in Tensor x, out Tensor z, out Tensor a) using (DeviceMemory x_gpu = DnnInstance.Gpu.AllocateDevice(x)) using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) { - DnnInstance.ConvolutionForward(1, InputDescription, x_gpu.Ptr, _5x5Reduce1x1FilterDescription, pw_gpu += _3x3Weights, _5x5ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _5x5Reduce1x1OutputDescription, y1x1_gpu.Ptr); + DnnInstance.ConvolutionForward(1, InputDescription, x_gpu.Ptr, _5x5Reduce1x1FilterDescription, pw_gpu += _3x3Weights, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _5x5Reduce1x1OutputDescription, y1x1_gpu.Ptr); } DnnInstance.AddTensor(1, _5x5Reduce1x1BiasDescription, pb_gpu += OperationInfo.Secondary3x3ConvolutionKernels, 1, _5x5Reduce1x1OutputDescription, y1x1_gpu.Ptr); _5x5Reduce1x1Z.TryFree(); @@ -397,7 +397,7 @@ public override void Forward(in Tensor x, out Tensor z, out Tensor a) DnnInstance.GetConvolutionForwardWorkspaceSize(_5x5Reduce1x1OutputDescription, _5x5FilterDescription, _5x5ConvolutionDescription, _5x5OutputDescription, algorithm, out size); using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) { - DnnInstance.ConvolutionForward(1, _5x5Reduce1x1OutputDescription, y1x1_gpu.Ptr, _5x5FilterDescription, pw_gpu += _5x5Reduce1x1Weights, _5x5ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _5x5OutputDescription, y_gpu.Ptr); + DnnInstance.ConvolutionForward(1, _5x5Reduce1x1OutputDescription, y1x1_gpu.Ptr, _5x5FilterDescription, pw_gpu += _5x5Reduce1x1Weights, _5x5ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _5x5OutputDescription, y_gpu.Ptr); } DnnInstance.AddTensor(1, _5x5BiasDescription, pb_gpu += OperationInfo.Primary5x5Reduce1x1ConvolutionKernels, 1, _5x5OutputDescription, y_gpu.Ptr); y_gpu.CopyTo(z, 
InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels); @@ -406,7 +406,7 @@ public override void Forward(in Tensor x, out Tensor z, out Tensor a) DnnInstance.ActivationForward(x.Entities, InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels, y_gpu.Ptr, y_gpu.Ptr, ActivationFunctions.Activation); y_gpu.CopyTo(a, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels); } - + // Pooling pipeline PoolingOutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, InputInfo.Channels, InputInfo.Height, InputInfo.Width); using (DeviceMemory y_gpu = DnnInstance.Gpu.AllocateDevice(x.Size)) diff --git a/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnLayersTest.cs b/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnLayersTest.cs index dd473b5..d7e4132 100644 --- a/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnLayersTest.cs +++ b/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnLayersTest.cs @@ -324,6 +324,45 @@ public unsafe void InceptionForward3x3Pipeline() } } + [TestMethod] + public unsafe void InceptionForward5x5Pipeline() + { + float[,] x = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(10), 12 * 12 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(10, 12 * 12 * 3); + CuDnnConvolutionalLayer + conv1 = new CuDnnConvolutionalLayer(TensorInfo.CreateForRgbImage(12, 12), ConvolutionInfo.New(ConvolutionMode.CrossCorrelation), (1, 1), 10, ActivationFunctionType.ReLU, BiasInitializationMode.Gaussian), + conv2 = new CuDnnConvolutionalLayer(conv1.OutputInfo, ConvolutionInfo.New(ConvolutionMode.CrossCorrelation, 2, 2), (5, 5), 10, ActivationFunctionType.ReLU, BiasInitializationMode.Gaussian); + CuDnnInceptionLayer inception = new CuDnnInceptionLayer(TensorInfo.CreateForRgbImage(12, 12), InceptionInfo.New(3, 2, 2, 10, 10, PoolingMode.Max, 2)); + fixed (float* pw = inception.Weights) + Unsafe.InitBlock(pw, 0, (uint)(sizeof(float) * inception.Weights.Length)); + Buffer.BlockCopy(conv1.Weights, 0, inception.Weights, sizeof(float) * (3 * 3 + 3 * 2 + 3 * 3 * 2 * 2), sizeof(float) * conv1.Weights.Length); + Buffer.BlockCopy(conv2.Weights, 0, inception.Weights, sizeof(float) * (3 * 3 + 3 * 2 + 3 * 3 * 2 * 2 + conv1.Weights.Length), sizeof(float) * conv2.Weights.Length); + Buffer.BlockCopy(conv1.Biases, 0, inception.Biases, sizeof(float) * (3 + 2 + 2), sizeof(float) * conv1.Biases.Length); + Buffer.BlockCopy(conv2.Biases, 0, inception.Biases, sizeof(float) * (3 + 2 + 2 + 10), sizeof(float) * conv2.Biases.Length); + fixed (float* px = x) + { + Tensor.Reshape(px, x.GetLength(0), x.GetLength(1), out Tensor xTensor); + conv1.Forward(xTensor, out Tensor zTemp, out Tensor aTemp); + zTemp.Free(); + conv2.Forward(aTemp, out Tensor zConv, out Tensor aConv); + inception.Forward(xTensor, out Tensor zInc, out Tensor aInc); + Tensor.New(zConv.Entities, zConv.Length, out Tensor reshaped); + float* pzInc = (float*)zInc.Ptr.ToPointer() + 12 * 12 * (3 + 2), preshaped = (float*)reshaped.Ptr.ToPointer(); + for (int i = 0; i < zConv.Entities; i++) + Buffer.MemoryCopy(pzInc + i * zInc.Length, preshaped + i * zConv.Length, sizeof(float) * zConv.Length, sizeof(float) * zConv.Length); + Assert.IsTrue(reshaped.ContentEquals(zConv)); + aTemp.Free(); + zConv.Free(); + zInc.Free(); + float* paInc = (float*)aInc.Ptr.ToPointer() + 12 * 12 * (3 + 2); + for (int i = 0; i < 
aConv.Entities; i++) + Buffer.MemoryCopy(paInc + i * aInc.Length, preshaped + i * aConv.Length, sizeof(float) * aConv.Length, sizeof(float) * aConv.Length); + Assert.IsTrue(reshaped.ContentEquals(aConv)); + aConv.Free(); + aInc.Free(); + reshaped.Free(); + } + } + #endregion } } From c4a3966b79e25619b3cce69a0b45ad4754e399a0 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Wed, 27 Dec 2017 19:03:36 +0100 Subject: [PATCH 25/30] Added inception layer pooling test, minor bug fixes --- .../Layers/CuDnnInceptionLayer.cs | 24 ++++++------- .../Layers/CuDnnPoolingLayer.cs | 2 +- NeuralNetwork.NET/APIs/Structs/PoolingInfo.cs | 13 +++++++ .../Implementations/Layers/PoolingLayer.cs | 5 +-- .../CuDnnLayersTest.cs | 36 +++++++++++++++++++ 5 files changed, 63 insertions(+), 17 deletions(-) diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs index e336740..4999b46 100644 --- a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs +++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs @@ -105,7 +105,7 @@ private int Secondary1x1Weights private Tensor _5x5Reduce1x1Delta; // Pooling output activity - private Tensor PoolingZ; + private Tensor _PoolingZ; // Pooling output activation private Tensor _PoolingA; @@ -270,8 +270,8 @@ private void SetupCuDnnInfo() _5x5BiasDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, 1, _OperationInfo.Secondary5x5ConvolutionKernels, 1, 1); // Pooling - PoolingDescription.Set2D(Alea.cuDNN.PoolingMode.AVERAGE_COUNT_EXCLUDE_PADDING, NanPropagation.PROPAGATE_NAN, 3, 3, 1, 1, 1, 1); - + PoolingDescription.Set2D((Alea.cuDNN.PoolingMode)OperationInfo.Pooling, NanPropagation.PROPAGATE_NAN, 3, 3, 1, 1, 1, 1); + // Secondary 1x1 convolution Secondary1x1FilterDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, _OperationInfo.Secondary1x1AfterPoolingConvolutionKernels, InputInfo.Channels, 1, 1); Secondary1x1BiasDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, 1, _OperationInfo.Secondary1x1AfterPoolingConvolutionKernels, 1, 1); @@ -406,7 +406,7 @@ public override void Forward(in Tensor x, out Tensor z, out Tensor a) DnnInstance.ActivationForward(x.Entities, InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels, y_gpu.Ptr, y_gpu.Ptr, ActivationFunctions.Activation); y_gpu.CopyTo(a, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels); } - + // Pooling pipeline PoolingOutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, InputInfo.Channels, InputInfo.Height, InputInfo.Width); using (DeviceMemory y_gpu = DnnInstance.Gpu.AllocateDevice(x.Size)) @@ -416,8 +416,8 @@ public override void Forward(in Tensor x, out Tensor z, out Tensor a) { DnnInstance.PoolingForward(PoolingDescription, 1, InputDescription, x_gpu.Ptr, 0, InputDescription, y_gpu.Ptr); } - PoolingZ.TryFree(); - y_gpu.CopyToHost(x.Entities, InputInfo.Size, out PoolingZ); + _PoolingZ.TryFree(); + y_gpu.CopyToHost(x.Entities, InputInfo.Size, out _PoolingZ); DnnInstance.ActivationForward(x.Entities, x.Length, y_gpu.Ptr, y_gpu.Ptr, ActivationFunctions.Activation); _PoolingA.TryFree(); y_gpu.CopyToHost(x.Entities, InputInfo.Size, out _PoolingA); @@ -516,27 +516,27 @@ public override void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFun } // Pooling - using (DeviceMemory pooldy_gpu = DnnInstance.Gpu.AllocateDevice(PoolingZ)) + using 
(DeviceMemory pooldy_gpu = DnnInstance.Gpu.AllocateDevice(_PoolingZ)) { // 1x1 backward DnnInstance.GetConvolutionBackwardDataAlgorithm(Secondary1x1FilterDescription, Secondary1x1OutputDescription, _1x1ConvolutionDescription, PoolingOutputDescription, ConvolutionBwdDataPreference.PREFER_FASTEST, IntPtr.Zero, out algorithm); DnnInstance.GetConvolutionBackwardDataWorkspaceSize(Secondary1x1FilterDescription, Secondary1x1OutputDescription, _1x1ConvolutionDescription, PoolingOutputDescription, algorithm, out size); using (DeviceMemory dy_gpu = DnnInstance.Gpu.AllocateDevice(delta_1, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Secondary5x5ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary1x1AfterPoolingConvolutionKernels), - poolDx_gpu = DnnInstance.Gpu.AllocateDevice(PoolingZ.Size)) + poolDx_gpu = DnnInstance.Gpu.AllocateDevice(_PoolingZ.Size)) using (DeviceMemory workspace_gpu = DnnInstance.Gpu.AllocateDevice(size)) { deviceptr p1x1PoolingWeights_gpu = w_gpu.Ptr + _1x1Weights + _3x3Reduce1x1Weights + _3x3Weights + _5x5Reduce1x1Weights + _5x5Weights; DnnInstance.ConvolutionBackwardData(1, Secondary1x1FilterDescription, p1x1PoolingWeights_gpu, Secondary1x1OutputDescription, dy_gpu.Ptr, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, PoolingOutputDescription, poolDx_gpu.Ptr); - DnnInstance.ActivationBackward(PoolingZ.Entities, PoolingZ.Length, pooldy_gpu.Ptr, poolDx_gpu.Ptr, ActivationFunctions.ActivationPrime); + DnnInstance.ActivationBackward(_PoolingZ.Entities, _PoolingZ.Length, pooldy_gpu.Ptr, poolDx_gpu.Ptr, ActivationFunctions.ActivationPrime); _PoolingDelta.TryFree(); - pooldy_gpu.CopyToHost(PoolingZ.Entities, PoolingZ.Length, out _PoolingDelta); + pooldy_gpu.CopyToHost(_PoolingZ.Entities, _PoolingZ.Length, out _PoolingDelta); } // Pooling backward using (DeviceMemory x_gpu = DnnInstance.Gpu.AllocateDevice(_Inputs), - poolZ_gpu = DnnInstance.Gpu.AllocateDevice(PoolingZ)) + poolZ_gpu = DnnInstance.Gpu.AllocateDevice(_PoolingZ)) { DnnInstance.PoolingBackward(PoolingDescription, 1, PoolingOutputDescription, poolZ_gpu.Ptr, PoolingOutputDescription, pooldy_gpu.Ptr, InputDescription, x_gpu.Ptr, 1, InputDescription, dx_gpu.Ptr); // TODO: finish pooling backward } @@ -741,7 +741,7 @@ private void Dispose() _5x5Reduce1x1Z.TryFree(); _5x5Reduce1x1A.TryFree(); _5x5Reduce1x1Delta.TryFree(); - PoolingZ.TryFree(); + _PoolingZ.TryFree(); _PoolingA.TryFree(); _PoolingDelta.TryFree(); } diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnPoolingLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnPoolingLayer.cs index 0d5aced..0c5c4be 100644 --- a/NeuralNetwork.NET.Cuda/Layers/CuDnnPoolingLayer.cs +++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnPoolingLayer.cs @@ -14,7 +14,7 @@ namespace NeuralNetworkNET.Cuda.Layers { /// - /// A pooling layer running on cuDNN, with a 2x2 window and a stride of 2 + /// A pooling layer running on cuDNN, with a custom pooling mode /// [JsonObject(MemberSerialization.OptIn)] internal sealed class CuDnnPoolingLayer : PoolingLayer diff --git a/NeuralNetwork.NET/APIs/Structs/PoolingInfo.cs b/NeuralNetwork.NET/APIs/Structs/PoolingInfo.cs index ea2166a..e497dd1 100644 --- a/NeuralNetwork.NET/APIs/Structs/PoolingInfo.cs +++ b/NeuralNetwork.NET/APIs/Structs/PoolingInfo.cs @@ -89,6 +89,19 @@ public static PoolingInfo New( #endregion + /// + /// Calculates the output size after applying a pooling operation to the input tensor + /// + /// The info on the input tensor + [Pure] + 
internal TensorInfo GetForwardOutputTensorInfo(in TensorInfo input) + { + int + h = (input.Height - WindowHeight + 2 * VerticalPadding) / VerticalStride + 1, + w = (input.Width - WindowWidth + 2 * HorizontalPadding) / HorizontalStride + 1; + return new TensorInfo(h, w, input.Channels); + } + #region Equality /// diff --git a/NeuralNetwork.NET/Networks/Implementations/Layers/PoolingLayer.cs b/NeuralNetwork.NET/Networks/Implementations/Layers/PoolingLayer.cs index 6cba6bc..2b6c4ab 100644 --- a/NeuralNetwork.NET/Networks/Implementations/Layers/PoolingLayer.cs +++ b/NeuralNetwork.NET/Networks/Implementations/Layers/PoolingLayer.cs @@ -34,10 +34,7 @@ public ref readonly PoolingInfo OperationInfo } public PoolingLayer(in TensorInfo input, in PoolingInfo operation, ActivationFunctionType activation) - : base(input, new TensorInfo( - input.Height / 2 + (input.Height % 2 == 0 ? 0 : 1), - input.Width / 2 + (input.Width % 2 == 0 ? 0 : 1), - input.Channels), activation) + : base(input, operation.GetForwardOutputTensorInfo(input), activation) => _OperationInfo = operation; /// diff --git a/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnLayersTest.cs b/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnLayersTest.cs index d7e4132..a661ad8 100644 --- a/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnLayersTest.cs +++ b/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnLayersTest.cs @@ -363,6 +363,42 @@ public unsafe void InceptionForward5x5Pipeline() } } + [TestMethod] + public unsafe void InceptionForwardPoolPipeline() + { + float[,] x = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(10), 12 * 12 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(10, 12 * 12 * 3); + CuDnnPoolingLayer pool = new CuDnnPoolingLayer(TensorInfo.CreateForRgbImage(12, 12), PoolingInfo.New(PoolingMode.Max, 3, 3, 1, 1, 1, 1), ActivationFunctionType.ReLU); + CuDnnConvolutionalLayer conv = new CuDnnConvolutionalLayer(pool.OutputInfo, ConvolutionInfo.New(ConvolutionMode.CrossCorrelation), (1, 1), 10, ActivationFunctionType.ReLU, BiasInitializationMode.Gaussian); + CuDnnInceptionLayer inception = new CuDnnInceptionLayer(TensorInfo.CreateForRgbImage(12, 12), InceptionInfo.New(3, 2, 2, 2, 2, PoolingMode.Max, 10)); + fixed (float* pw = inception.Weights) + Unsafe.InitBlock(pw, 0, (uint)(sizeof(float) * inception.Weights.Length)); + Buffer.BlockCopy(conv.Weights, 0, inception.Weights, sizeof(float) * (3 * 3 + 3 * 2 + 3 * 3 * 2 * 2 + 3 * 2 + 5 * 5 * 2 * 2), sizeof(float) * conv.Weights.Length); + Buffer.BlockCopy(conv.Biases, 0, inception.Biases, sizeof(float) * (3 + 2 + 2 + 2 + 2), sizeof(float) * conv.Biases.Length); + fixed (float* px = x) + { + Tensor.Reshape(px, x.GetLength(0), x.GetLength(1), out Tensor xTensor); + pool.Forward(xTensor, out Tensor zTemp, out Tensor aTemp); + conv.Forward(aTemp, out Tensor zConv, out Tensor aConv); + inception.Forward(xTensor, out Tensor zInc, out Tensor aInc); + Tensor.New(zConv.Entities, zConv.Length, out Tensor reshaped); + float* pzInc = (float*)zInc.Ptr.ToPointer() + 12 * 12 * (3 + 2 + 2), preshaped = (float*)reshaped.Ptr.ToPointer(); + for (int i = 0; i < zConv.Entities; i++) + Buffer.MemoryCopy(pzInc + i * zInc.Length, preshaped + i * zConv.Length, sizeof(float) * zConv.Length, sizeof(float) * zConv.Length); + Assert.IsTrue(reshaped.ContentEquals(zConv)); + zTemp.Free(); + aTemp.Free(); + zConv.Free(); + zInc.Free(); + float* paInc = (float*)aInc.Ptr.ToPointer() + 12 * 12 * (3 + 2 + 2); + for (int i = 0; i < aConv.Entities; i++) + Buffer.MemoryCopy(paInc + i * aInc.Length, preshaped + i * aConv.Length, 
sizeof(float) * aConv.Length, sizeof(float) * aConv.Length); + Assert.IsTrue(reshaped.ContentEquals(aConv)); + aConv.Free(); + aInc.Free(); + reshaped.Free(); + } + } + #endregion } } From ffa603ebeb960a9b62597f828c09c530a1addacf Mon Sep 17 00:00:00 2001 From: Sergio0694 Date: Fri, 29 Dec 2017 11:07:53 +0100 Subject: [PATCH 26/30] Minor layer tweaks --- .../Layers/CuDnnConvolutionalLayer.cs | 2 +- .../Layers/CuDnnFullyConnectedLayer.cs | 2 +- NeuralNetwork.NET/APIs/Structs/Tensor.cs | 15 ++++++++++++++- .../Implementations/Layers/ConvolutionalLayer.cs | 3 ++- .../Implementations/Layers/FullyConnectedLayer.cs | 3 ++- 5 files changed, 20 insertions(+), 5 deletions(-) diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnConvolutionalLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnConvolutionalLayer.cs index a6133cb..db5feb5 100644 --- a/NeuralNetwork.NET.Cuda/Layers/CuDnnConvolutionalLayer.cs +++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnConvolutionalLayer.cs @@ -149,7 +149,7 @@ public override void ComputeGradient(in Tensor a, in Tensor delta, out Tensor dJ { DnnInstance.ConvolutionBackwardFilter(1, InputDescription, a_gpu.Ptr, OutputDescription, delta_gpu.Ptr, ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, FilterDescription, w_gpu.Ptr); } - w_gpu.CopyToHost(Kernels, KernelInfo.Size, out dJdw); + w_gpu.CopyToHost(1, Weights.Length, out dJdw); } // Bias diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnFullyConnectedLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnFullyConnectedLayer.cs index f1c587c..c21c554 100644 --- a/NeuralNetwork.NET.Cuda/Layers/CuDnnFullyConnectedLayer.cs +++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnFullyConnectedLayer.cs @@ -67,7 +67,7 @@ public override void ComputeGradient(in Tensor a, in Tensor delta, out Tensor dJ w_gpu = DnnInstance.Gpu.AllocateDevice(a.Length * delta.Length)) { DnnInstance.FullyConnectedBackwardFilter(a.Entities, a.Length, delta.Length, a_gpu.Ptr, delta_gpu.Ptr, w_gpu.Ptr); - w_gpu.CopyToHost(a.Length, delta.Length, out dJdw); + w_gpu.CopyToHost(1, Weights.Length, out dJdw); } delta.CompressVertically(out dJdb); // Doing this on CPU is generally faster than launching the kernels } diff --git a/NeuralNetwork.NET/APIs/Structs/Tensor.cs b/NeuralNetwork.NET/APIs/Structs/Tensor.cs index dee5e5e..cbc8732 100644 --- a/NeuralNetwork.NET/APIs/Structs/Tensor.cs +++ b/NeuralNetwork.NET/APIs/Structs/Tensor.cs @@ -201,6 +201,19 @@ public float[] ToArray() #endregion + /// + /// Creates a new instance by wrapping the current memory area + /// + /// The height of the final matrix + /// The width of the final matrix + /// The resulting instance + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Reshape(int n, int chw, out Tensor tensor) + { + if (n * chw != Size) throw new ArgumentException("Invalid input resized shape"); + tensor = new Tensor(Ptr, n, chw); + } + /// /// Frees the memory associated with the current instance /// @@ -258,7 +271,7 @@ unsafe float[] ExtractRow(int i) // Spawn the sequence int max = MaximumItemsCount / obj.Length, - up = max.Min(MaximumRowsCount).Max(1); + up = max.Min(MaximumRowsCount).Max(1).Min(obj.Entities); for (int i = 0; i < up; i++) yield return ExtractRow(i); } diff --git a/NeuralNetwork.NET/Networks/Implementations/Layers/ConvolutionalLayer.cs b/NeuralNetwork.NET/Networks/Implementations/Layers/ConvolutionalLayer.cs index 4f28de0..476ac33 100644 --- a/NeuralNetwork.NET/Networks/Implementations/Layers/ConvolutionalLayer.cs +++ b/NeuralNetwork.NET/Networks/Implementations/Layers/ConvolutionalLayer.cs @@ -107,7 
+107,8 @@ public override unsafe void Backpropagate(in Tensor delta_1, in Tensor z, Activa public override void ComputeGradient(in Tensor a, in Tensor delta, out Tensor dJdw, out Tensor dJdb) { a.Rotate180(InputInfo.Channels, out Tensor a180); - a180.ConvoluteGradient(InputInfo, delta, OutputInfo, out dJdw); + a180.ConvoluteGradient(InputInfo, delta, OutputInfo, out Tensor dJdwM); + dJdwM.Reshape(1, Weights.Length, out dJdw); a180.Free(); delta.CompressVertically(OutputInfo.Channels, out dJdb); } diff --git a/NeuralNetwork.NET/Networks/Implementations/Layers/FullyConnectedLayer.cs b/NeuralNetwork.NET/Networks/Implementations/Layers/FullyConnectedLayer.cs index 59bff25..6fb8cc6 100644 --- a/NeuralNetwork.NET/Networks/Implementations/Layers/FullyConnectedLayer.cs +++ b/NeuralNetwork.NET/Networks/Implementations/Layers/FullyConnectedLayer.cs @@ -59,7 +59,8 @@ public override unsafe void Backpropagate(in Tensor delta_1, in Tensor z, Activa public override void ComputeGradient(in Tensor a, in Tensor delta, out Tensor dJdw, out Tensor dJdb) { a.Transpose(out Tensor at); - at.Multiply(delta, out dJdw); + at.Multiply(delta, out Tensor dJdwM); + dJdwM.Reshape(1, Weights.Length, out dJdw); at.Free(); delta.CompressVertically(out dJdb); } From 2ef571a88f4f9e47165bae5256e9b7030384008c Mon Sep 17 00:00:00 2001 From: Sergio0694 Date: Fri, 29 Dec 2017 15:01:11 +0100 Subject: [PATCH 27/30] ContentEquals method improved with relative threshold --- .../Extensions/MatrixExtensions.cs | 21 +++++++++++-------- .../Extensions/MiscExtensions.cs | 18 +++++++++++++--- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/NeuralNetwork.NET/Extensions/MatrixExtensions.cs b/NeuralNetwork.NET/Extensions/MatrixExtensions.cs index 069ee72..148d897 100644 --- a/NeuralNetwork.NET/Extensions/MatrixExtensions.cs +++ b/NeuralNetwork.NET/Extensions/MatrixExtensions.cs @@ -706,8 +706,9 @@ public static unsafe float[] BlockCopy([NotNull] this float[] v) /// /// The first to test /// The second to test - /// The comparison threshold + /// The absolute comparison threshold + /// The relative comparison threshold public static unsafe bool ContentEquals(in this Tensor m, in Tensor o, float absolute = 1e-6f, float relative = 1e-6f) { if (m.Ptr == IntPtr.Zero && o.Ptr == IntPtr.Zero) return true; if (m.Ptr == IntPtr.Zero || o.Ptr == IntPtr.Zero) return false; @@ -715,7 +716,7 @@ public static unsafe bool ContentEquals(in this Tensor m, in Tensor o, float del float* pm = m, po = o; int items = m.Size; for (int i = 0; i < items; i++) - if (!pm[i].EqualsWithDelta(po[i], delta)) return false; + if (!pm[i].EqualsWithDelta(po[i], absolute, relative)) return false; return true; } @@ -724,8 +725,9 @@ public static unsafe bool ContentEquals(in this Tensor m, in Tensor o, float del /// /// The first matrix to test /// The second matrix to test - /// The comparison threshold + /// The absolute comparison threshold + /// The relative comparison threshold public static bool ContentEquals([CanBeNull] this float[,] m, [CanBeNull] float[,] o, float absolute = 1e-6f, float relative = 1e-6f) { if (m == null && o == null) return true; if (m == null || o == null) return false; @@ -733,7 +735,7 @@ public static bool ContentEquals([CanBeNull] this float[,] m, [CanBeNull] float[ m.GetLength(1) != o.GetLength(1)) return false; for (int i = 0; i
< m.GetLength(0); i++) for (int j = 0; j < m.GetLength(1); j++) - if (!m[i, j].EqualsWithDelta(o[i, j], delta)) return false; + if (!m[i, j].EqualsWithDelta(o[i, j], absolute, relative)) return false; return true; } @@ -742,14 +744,15 @@ public static bool ContentEquals([CanBeNull] this float[,] m, [CanBeNull] float[ /// /// The first vector to test /// The second vector to test - /// The comparison threshold + /// The absolute comparison threshold + /// The relative comparison threshold public static bool ContentEquals([CanBeNull] this float[] v, [CanBeNull] float[] o, float absolute = 1e-6f, float relative = 1e-6f) { if (v == null && o == null) return true; if (v == null || o == null) return false; if (v.Length != o.Length) return false; for (int i = 0; i < v.Length; i++) - if (!v[i].EqualsWithDelta(o[i], delta)) return false; + if (!v[i].EqualsWithDelta(o[i], absolute, relative)) return false; return true; } diff --git a/NeuralNetwork.NET/Extensions/MiscExtensions.cs b/NeuralNetwork.NET/Extensions/MiscExtensions.cs index 914dfde..d3b2c76 100644 --- a/NeuralNetwork.NET/Extensions/MiscExtensions.cs +++ b/NeuralNetwork.NET/Extensions/MiscExtensions.cs @@ -32,6 +32,15 @@ public static TOut To([NotNull] this TIn item) where TOut : class, TI [MethodImpl(MethodImplOptions.AggressiveInlining)] public static int Max(this int a, int b) => a >= b ? a : b; + /// + /// Returns the maximum value between two numbers + /// + /// The first number + /// The second number + [Pure] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static float Max(this float a, float b) => a >= b ? a : b; + /// /// Returns the minimum value between two numbers /// @@ -54,16 +63,19 @@ public static TOut To([NotNull] this TIn item) where TOut : class, TI /// /// The first value /// The second value - /// The comparison threshold + /// The absolute comparison threshold + /// The relative comparison threshold [Pure] - public static bool EqualsWithDelta(this float value, float other, float delta = 1e-6f) + public static bool EqualsWithDelta(this float value, float other, float absolute = 1e-6f, float relative = 1e-6f) { if (float.IsNaN(value) ^ float.IsNaN(other)) return false; if (float.IsNaN(value) && float.IsNaN(other)) return true; if (float.IsInfinity(value) ^ float.IsInfinity(other)) return false; if (float.IsPositiveInfinity(value) && float.IsPositiveInfinity(other)) return true; if (float.IsNegativeInfinity(value) && float.IsNegativeInfinity(other)) return true; - return (value - other).Abs() < delta; + float abs = (value - other).Abs(); + if (abs < absolute) return true; + return abs <= absolute.Max(relative * value.Abs().Max(other.Abs())); } /// From 968eba607997e5de297a7514588cb3276c127b1e Mon Sep 17 00:00:00 2001 From: Sergio0694 Date: Fri, 29 Dec 2017 15:01:26 +0100 Subject: [PATCH 28/30] Inception layer bug fixes, more tests added --- .../Layers/CuDnnInceptionLayer.cs | 10 +- .../CuDnnInceptionLayerTest.cs | 258 ++++++++++++++++++ .../CuDnnLayersTest.cs | 152 ----------- 3 files changed, 263 insertions(+), 157 deletions(-) create mode 100644 Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnInceptionLayerTest.cs diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs index 4999b46..27d7200 100644 --- a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs +++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs @@ -579,7 +579,7 @@ public
override void ComputeGradient(in Tensor a, in Tensor delta, out Tensor dJ } // 3x3 reduce 1x1 weights - using (DeviceMemory dy3x3Reduce1x1_gpu = DnnInstance.Gpu.AllocateDevice(_3x3Reduce1x1A)) + using (DeviceMemory dy3x3Reduce1x1_gpu = DnnInstance.Gpu.AllocateDevice(_3x3Reduce1x1Delta)) { DnnInstance.GetConvolutionBackwardFilterAlgorithm(InputDescription, _3x3Reduce1x1OutputDescription, _1x1ConvolutionDescription, _3x3Reduce1x1FilterDescription, ConvolutionBwdFilterPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionBwdFilterAlgo algorithm); DnnInstance.GetConvolutionBackwardFilterWorkspaceSize(InputDescription, _3x3Reduce1x1OutputDescription, _1x1ConvolutionDescription, _3x3Reduce1x1FilterDescription, algorithm, out IntPtr size); @@ -599,7 +599,7 @@ public override void ComputeGradient(in Tensor a, in Tensor delta, out Tensor dJ } // 5x5 reduce 1x1 weights - using (DeviceMemory dy5x5Reduce1x1_gpu = DnnInstance.Gpu.AllocateDevice(_5x5Reduce1x1A)) + using (DeviceMemory dy5x5Reduce1x1_gpu = DnnInstance.Gpu.AllocateDevice(_5x5Reduce1x1Delta)) { DnnInstance.GetConvolutionBackwardFilterAlgorithm(InputDescription, _5x5Reduce1x1OutputDescription, _1x1ConvolutionDescription, _5x5Reduce1x1FilterDescription, ConvolutionBwdFilterPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionBwdFilterAlgo algorithm); DnnInstance.GetConvolutionBackwardFilterWorkspaceSize(InputDescription, _5x5Reduce1x1OutputDescription, _1x1ConvolutionDescription, _5x5Reduce1x1FilterDescription, algorithm, out IntPtr size); @@ -610,7 +610,7 @@ public override void ComputeGradient(in Tensor a, in Tensor delta, out Tensor dJ dw_gpu.CopyTo(dJdw, _1x1Weights + _3x3Reduce1x1Weights + _3x3Weights, _5x5Reduce1x1Weights); } - // 3x3 reduce 1x1 bias + // 5x5 reduce 1x1 bias using (DeviceMemory db_gpu = DnnInstance.Gpu.AllocateDevice(OperationInfo.Primary5x5Reduce1x1ConvolutionKernels)) { DnnInstance.ConvolutionBackwardBias(1, _5x5Reduce1x1OutputDescription, dy5x5Reduce1x1_gpu.Ptr, 0, _5x5Reduce1x1BiasDescription, db_gpu.Ptr); @@ -637,7 +637,7 @@ public override void ComputeGradient(in Tensor a, in Tensor delta, out Tensor dJ using (DeviceMemory db_gpu = DnnInstance.Gpu.AllocateDevice(OperationInfo.Secondary3x3ConvolutionKernels)) { DnnInstance.ConvolutionBackwardBias(1, _3x3OutputDescription, dy3x3_gpu.Ptr, 0, _3x3BiasDescription, db_gpu.Ptr); - db_gpu.CopyTo(dJdb, OperationInfo.Primary1x1ConvolutionKernels, OperationInfo.Secondary3x3ConvolutionKernels); + db_gpu.CopyTo(dJdb, OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Primary3x3Reduce1x1ConvolutionKernels, OperationInfo.Secondary3x3ConvolutionKernels); } } @@ -655,7 +655,7 @@ public override void ComputeGradient(in Tensor a, in Tensor delta, out Tensor dJ dw_gpu.CopyTo(dJdw, _1x1Weights + _3x3Reduce1x1Weights + _3x3Weights + _5x5Reduce1x1Weights, _5x5Weights); } - // 3x3 bias + // 5x5 bias using (DeviceMemory db_gpu = DnnInstance.Gpu.AllocateDevice(OperationInfo.Secondary5x5ConvolutionKernels)) { DnnInstance.ConvolutionBackwardBias(1, _5x5OutputDescription, dy5x5_gpu.Ptr, 0, _5x5BiasDescription, db_gpu.Ptr); diff --git a/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnInceptionLayerTest.cs b/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnInceptionLayerTest.cs new file mode 100644 index 0000000..e12a674 --- /dev/null +++ b/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnInceptionLayerTest.cs @@ -0,0 +1,258 @@ +using Microsoft.VisualStudio.TestTools.UnitTesting; +using NeuralNetworkNET.APIs.Enums; +using NeuralNetworkNET.APIs.Structs; +using NeuralNetworkNET.Cuda.Layers; +using 
NeuralNetworkNET.Extensions; +using NeuralNetworkNET.Networks.Activations; +using NeuralNetworkNET.Networks.Implementations.Layers.Helpers; +using System; +using System.Runtime.CompilerServices; + +namespace NeuralNetworkNET.Cuda.Unit +{ + /// + /// Test class for the cuDNN inception layer + /// + [TestClass] + [TestCategory(nameof(CuDnnInceptionLayerTest))] + public class CuDnnInceptionLayerTest + { + [TestMethod] + public unsafe void Inception1x1() + { + float[,] x = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(10), 32 * 32 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(10, 32 * 32 * 3); + CuDnnConvolutionalLayer conv = new CuDnnConvolutionalLayer(TensorInfo.CreateForRgbImage(32, 32), ConvolutionInfo.New(ConvolutionMode.CrossCorrelation), (1, 1), 10, ActivationFunctionType.ReLU, BiasInitializationMode.Gaussian); + CuDnnInceptionLayer inception = new CuDnnInceptionLayer(conv.InputInfo, InceptionInfo.New(10, 10, 10, 10, 10, PoolingMode.Max, 10)); + fixed (float* pw = inception.Weights) + Unsafe.InitBlock(pw, 0, (uint)(sizeof(float) * inception.Weights.Length)); + Buffer.BlockCopy(conv.Weights, 0, inception.Weights, 0, sizeof(float) * conv.Weights.Length); + Buffer.BlockCopy(conv.Biases, 0, inception.Biases, 0, sizeof(float) * conv.Biases.Length); + fixed (float* px = x) + { + // Forward + Z + Tensor.Reshape(px, x.GetLength(0), x.GetLength(1), out Tensor xTensor); + conv.Forward(xTensor, out Tensor zConv, out Tensor aConv); + inception.Forward(xTensor, out Tensor zInc, out Tensor aInc); + Tensor.New(zConv.Entities, zConv.Length, out Tensor reshaped); + float* pzInc = (float*)zInc.Ptr.ToPointer(), preshaped = (float*)reshaped.Ptr.ToPointer(); + for (int i = 0; i < zConv.Entities; i++) + Buffer.MemoryCopy(pzInc + i * zInc.Length, preshaped + i * zConv.Length, sizeof(float) * zConv.Length, sizeof(float) * zConv.Length); + Assert.IsTrue(reshaped.ContentEquals(zConv)); + + // A + float* paInc = (float*)aInc.Ptr.ToPointer(); + for (int i = 0; i < aConv.Entities; i++) + Buffer.MemoryCopy(paInc + i * aInc.Length, preshaped + i * aConv.Length, sizeof(float) * aConv.Length, sizeof(float) * aConv.Length); + Assert.IsTrue(reshaped.ContentEquals(aConv)); + + // Backpropagate + Tensor.New(xTensor.Entities, xTensor.Length, out Tensor z1); + KerasWeightsProvider.FillWithHeEtAlUniform(z1, 10); + z1.Duplicate(out Tensor z2); + conv.Backpropagate(aConv, z1, ActivationFunctions.ReLUPrime); + inception.Backpropagate(aInc, z2, ActivationFunctions.ReLUPrime); + Assert.IsTrue(z1.ContentEquals(z2)); + + // Gradient + Tensor.New(xTensor.Entities, xTensor.Length, out Tensor a); + KerasWeightsProvider.FillWithHeEtAlUniform(a, 10); + conv.ComputeGradient(a, aConv, out Tensor dJdwConv, out Tensor dJdbConv); + inception.ComputeGradient(a, aInc, out Tensor dJdwInc, out Tensor dJdbInc); + Tensor.New(1, dJdwConv.Length, out Tensor dJdwInc0); + Buffer.MemoryCopy((float*)dJdwInc.Ptr.ToPointer(), (float*)dJdwInc0.Ptr.ToPointer(), sizeof(float) * dJdwInc0.Size, sizeof(float) * dJdwInc0.Size); + Tensor.New(1, dJdbConv.Length, out Tensor dJdbInc0); + Buffer.MemoryCopy((float*)dJdbInc.Ptr.ToPointer(), (float*)dJdbInc0.Ptr.ToPointer(), sizeof(float) * dJdbInc0.Size, sizeof(float) * dJdbInc0.Size); + Assert.IsTrue(dJdwConv.ContentEquals(dJdwInc0, 1e-5f)); + Assert.IsTrue(dJdbConv.ContentEquals(dJdbInc0, 1e-5f)); + + // Cleanup + dJdwConv.Free(); + dJdbConv.Free(); + dJdwInc.Free(); + dJdbInc.Free(); + dJdwInc0.Free(); + dJdbInc0.Free(); + z1.Free(); + z2.Free(); + zConv.Free(); + aConv.Free(); + 
zInc.Free(); + aInc.Free(); + reshaped.Free(); + } + } + + [TestMethod] + public unsafe void Inception3x3Pipeline() + { + float[,] x = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(10), 32 * 32 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(10, 32 * 32 * 3); + CuDnnConvolutionalLayer + conv1 = new CuDnnConvolutionalLayer(TensorInfo.CreateForRgbImage(32, 32), ConvolutionInfo.New(ConvolutionMode.CrossCorrelation), (1, 1), 10, ActivationFunctionType.ReLU, BiasInitializationMode.Gaussian), + conv2 = new CuDnnConvolutionalLayer(conv1.OutputInfo, ConvolutionInfo.New(ConvolutionMode.CrossCorrelation, 1, 1), (3, 3), 10, ActivationFunctionType.ReLU, BiasInitializationMode.Gaussian); + CuDnnInceptionLayer inception = new CuDnnInceptionLayer(TensorInfo.CreateForRgbImage(32, 32), InceptionInfo.New(10, 10, 10, 10, 10, PoolingMode.Max, 10)); + fixed (float* pw = inception.Weights) + Unsafe.InitBlock(pw, 0, (uint)(sizeof(float) * inception.Weights.Length)); + Buffer.BlockCopy(conv1.Weights, 0, inception.Weights, sizeof(float) * 3 * 10, sizeof(float) * conv1.Weights.Length); + Buffer.BlockCopy(conv2.Weights, 0, inception.Weights, sizeof(float) * 3 * 10 + sizeof(float) * conv1.Weights.Length, sizeof(float) * conv2.Weights.Length); + Buffer.BlockCopy(conv1.Biases, 0, inception.Biases, sizeof(float) * 10, sizeof(float) * conv1.Biases.Length); + Buffer.BlockCopy(conv2.Biases, 0, inception.Biases, sizeof(float) * 20, sizeof(float) * conv2.Biases.Length); + fixed (float* px = x) + { + // Forward + Z + Tensor.Reshape(px, x.GetLength(0), x.GetLength(1), out Tensor xTensor); + conv1.Forward(xTensor, out Tensor zTemp, out Tensor aTemp); + conv2.Forward(aTemp, out Tensor zConv, out Tensor aConv); + inception.Forward(xTensor, out Tensor zInc, out Tensor aInc); + Tensor.New(zConv.Entities, zConv.Length, out Tensor reshaped); + float* pzInc = (float*)zInc.Ptr.ToPointer() + 32 * 32 * 10, preshaped = (float*)reshaped.Ptr.ToPointer(); + for (int i = 0; i < zConv.Entities; i++) + Buffer.MemoryCopy(pzInc + i * zInc.Length, preshaped + i * zConv.Length, sizeof(float) * zConv.Length, sizeof(float) * zConv.Length); + Assert.IsTrue(reshaped.ContentEquals(zConv)); + + // A + float* paInc = (float*)aInc.Ptr.ToPointer() + 32 * 32 * 10; + for (int i = 0; i < aConv.Entities; i++) + Buffer.MemoryCopy(paInc + i * aInc.Length, preshaped + i * aConv.Length, sizeof(float) * aConv.Length, sizeof(float) * aConv.Length); + Assert.IsTrue(reshaped.ContentEquals(aConv)); + + // Backpropagation + Tensor.New(xTensor.Entities, xTensor.Length, out Tensor z1); + KerasWeightsProvider.FillWithHeEtAlUniform(z1, 10); + z1.Duplicate(out Tensor z2); + conv2.Backpropagate(aConv, zTemp, conv1.ActivationFunctions.ActivationPrime); + conv1.Backpropagate(zTemp, z1, ActivationFunctions.ReLUPrime); + inception.Backpropagate(aInc, z2, ActivationFunctions.ReLUPrime); + Assert.IsTrue(z1.ContentEquals(z2)); + + // Gradient + Tensor.New(xTensor.Entities, xTensor.Length, out Tensor a); + KerasWeightsProvider.FillWithHeEtAlUniform(a, 10); + conv1.ComputeGradient(a, zTemp, out Tensor dJdwConv1, out Tensor dJdbConv1); + conv2.ComputeGradient(aTemp, aConv, out Tensor dJdwConv2, out Tensor dJdbConv2); + inception.ComputeGradient(a, aInc, out Tensor dJdwInc, out Tensor dJdbInc); + Tensor.Reshape((float*)dJdwInc.Ptr.ToPointer() + 30, 1, dJdwConv1.Size, out Tensor dJdwInc0); + Tensor.Reshape((float*)dJdbInc.Ptr.ToPointer() + 10, 1, dJdbConv1.Size, out Tensor dJdbInc0); + Assert.IsTrue(dJdwConv1.ContentEquals(dJdwInc0, 1e-5f)); + 
Assert.IsTrue(dJdbConv1.ContentEquals(dJdbInc0, 1e-5f)); + Tensor.Reshape((float*)dJdwInc.Ptr.ToPointer() + 30 + dJdwConv1.Size, 1, dJdwConv2.Size, out Tensor dJdwInc1); + Tensor.Reshape((float*)dJdbInc.Ptr.ToPointer() + 20, 1, dJdbConv2.Size, out Tensor dJdbInc1); + Assert.IsTrue(dJdwConv2.ContentEquals(dJdwInc1, 1e-5f)); + Assert.IsTrue(dJdbConv2.ContentEquals(dJdbInc1, 1e-5f)); + + // Cleanup + z1.Free(); + z2.Free(); + zTemp.Free(); + zConv.Free(); + zInc.Free(); + aConv.Free(); + aInc.Free(); + reshaped.Free(); + } + } + + [TestMethod] + public unsafe void Inception5x5Pipeline() + { + float[,] x = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(10), 12 * 12 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(10, 12 * 12 * 3); + CuDnnConvolutionalLayer + conv1 = new CuDnnConvolutionalLayer(TensorInfo.CreateForRgbImage(12, 12), ConvolutionInfo.New(ConvolutionMode.CrossCorrelation), (1, 1), 10, ActivationFunctionType.ReLU, BiasInitializationMode.Gaussian), + conv2 = new CuDnnConvolutionalLayer(conv1.OutputInfo, ConvolutionInfo.New(ConvolutionMode.CrossCorrelation, 2, 2), (5, 5), 10, ActivationFunctionType.ReLU, BiasInitializationMode.Gaussian); + CuDnnInceptionLayer inception = new CuDnnInceptionLayer(TensorInfo.CreateForRgbImage(12, 12), InceptionInfo.New(3, 2, 2, 10, 10, PoolingMode.Max, 2)); + fixed (float* pw = inception.Weights) + Unsafe.InitBlock(pw, 0, (uint)(sizeof(float) * inception.Weights.Length)); + Buffer.BlockCopy(conv1.Weights, 0, inception.Weights, sizeof(float) * (3 * 3 + 3 * 2 + 3 * 3 * 2 * 2), sizeof(float) * conv1.Weights.Length); + Buffer.BlockCopy(conv2.Weights, 0, inception.Weights, sizeof(float) * (3 * 3 + 3 * 2 + 3 * 3 * 2 * 2 + conv1.Weights.Length), sizeof(float) * conv2.Weights.Length); + Buffer.BlockCopy(conv1.Biases, 0, inception.Biases, sizeof(float) * (3 + 2 + 2), sizeof(float) * conv1.Biases.Length); + Buffer.BlockCopy(conv2.Biases, 0, inception.Biases, sizeof(float) * (3 + 2 + 2 + 10), sizeof(float) * conv2.Biases.Length); + fixed (float* px = x) + { + // Forward + Z + Tensor.Reshape(px, x.GetLength(0), x.GetLength(1), out Tensor xTensor); + conv1.Forward(xTensor, out Tensor zTemp, out Tensor aTemp); + conv2.Forward(aTemp, out Tensor zConv, out Tensor aConv); + inception.Forward(xTensor, out Tensor zInc, out Tensor aInc); + Tensor.New(zConv.Entities, zConv.Length, out Tensor reshaped); + float* pzInc = (float*)zInc.Ptr.ToPointer() + 12 * 12 * (3 + 2), preshaped = (float*)reshaped.Ptr.ToPointer(); + for (int i = 0; i < zConv.Entities; i++) + Buffer.MemoryCopy(pzInc + i * zInc.Length, preshaped + i * zConv.Length, sizeof(float) * zConv.Length, sizeof(float) * zConv.Length); + Assert.IsTrue(reshaped.ContentEquals(zConv)); + + // A + float* paInc = (float*)aInc.Ptr.ToPointer() + 12 * 12 * (3 + 2); + for (int i = 0; i < aConv.Entities; i++) + Buffer.MemoryCopy(paInc + i * aInc.Length, preshaped + i * aConv.Length, sizeof(float) * aConv.Length, sizeof(float) * aConv.Length); + Assert.IsTrue(reshaped.ContentEquals(aConv)); + + // Backpropagation + Tensor.New(xTensor.Entities, xTensor.Length, out Tensor z1); + KerasWeightsProvider.FillWithHeEtAlUniform(z1, 10); + z1.Duplicate(out Tensor z2); + conv2.Backpropagate(aConv, zTemp, conv1.ActivationFunctions.ActivationPrime); + conv1.Backpropagate(zTemp, z1, ActivationFunctions.ReLUPrime); + inception.Backpropagate(aInc, z2, ActivationFunctions.ReLUPrime); + Assert.IsTrue(z1.ContentEquals(z2)); + + // Gradient + Tensor.New(xTensor.Entities, xTensor.Length, out Tensor a); +
KerasWeightsProvider.FillWithHeEtAlUniform(a, 10); + conv1.ComputeGradient(a, zTemp, out Tensor dJdwConv1, out Tensor dJdbConv1); + conv2.ComputeGradient(aTemp, aConv, out Tensor dJdwConv2, out Tensor dJdbConv2); + inception.ComputeGradient(a, aInc, out Tensor dJdwInc, out Tensor dJdbInc); + Tensor.Reshape((float*)dJdwInc.Ptr.ToPointer() + (3 * 3 + 3 * 2 + 3 * 3 * 2 * 2), 1, dJdwConv1.Size, out Tensor dJdwInc0); + Tensor.Reshape((float*)dJdbInc.Ptr.ToPointer() + 7, 1, dJdbConv1.Size, out Tensor dJdbInc0); + Assert.IsTrue(dJdwConv1.ContentEquals(dJdwInc0, 1e-5f)); + Assert.IsTrue(dJdbConv1.ContentEquals(dJdbInc0, 1e-5f)); + Tensor.Reshape((float*)dJdwInc.Ptr.ToPointer() + (3 * 3 + 3 * 2 + 3 * 3 * 2 * 2) + dJdwConv1.Size, 1, dJdwConv2.Size, out Tensor dJdwInc1); + Tensor.Reshape((float*)dJdbInc.Ptr.ToPointer() + 17, 1, dJdbConv2.Size, out Tensor dJdbInc1); + Assert.IsTrue(dJdwConv2.ContentEquals(dJdwInc1, 1e-5f)); + Assert.IsTrue(dJdbConv2.ContentEquals(dJdbInc1, 1e-5f)); + + // Cleanup + zTemp.Free(); + aTemp.Free(); + zConv.Free(); + zInc.Free(); + aConv.Free(); + aInc.Free(); + reshaped.Free(); + } + } + + [TestMethod] + public unsafe void InceptionPoolPipeline() + { + float[,] x = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(10), 12 * 12 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(10, 12 * 12 * 3); + CuDnnPoolingLayer pool = new CuDnnPoolingLayer(TensorInfo.CreateForRgbImage(12, 12), PoolingInfo.New(PoolingMode.Max, 3, 3, 1, 1, 1, 1), ActivationFunctionType.ReLU); + CuDnnConvolutionalLayer conv = new CuDnnConvolutionalLayer(pool.OutputInfo, ConvolutionInfo.New(ConvolutionMode.CrossCorrelation), (1, 1), 10, ActivationFunctionType.ReLU, BiasInitializationMode.Gaussian); + CuDnnInceptionLayer inception = new CuDnnInceptionLayer(TensorInfo.CreateForRgbImage(12, 12), InceptionInfo.New(3, 2, 2, 2, 2, PoolingMode.Max, 10)); + fixed (float* pw = inception.Weights) + Unsafe.InitBlock(pw, 0, (uint)(sizeof(float) * inception.Weights.Length)); + Buffer.BlockCopy(conv.Weights, 0, inception.Weights, sizeof(float) * (3 * 3 + 3 * 2 + 3 * 3 * 2 * 2 + 3 * 2 + 5 * 5 * 2 * 2), sizeof(float) * conv.Weights.Length); + Buffer.BlockCopy(conv.Biases, 0, inception.Biases, sizeof(float) * (3 + 2 + 2 + 2 + 2), sizeof(float) * conv.Biases.Length); + fixed (float* px = x) + { + Tensor.Reshape(px, x.GetLength(0), x.GetLength(1), out Tensor xTensor); + pool.Forward(xTensor, out Tensor zTemp, out Tensor aTemp); + conv.Forward(aTemp, out Tensor zConv, out Tensor aConv); + inception.Forward(xTensor, out Tensor zInc, out Tensor aInc); + Tensor.New(zConv.Entities, zConv.Length, out Tensor reshaped); + float* pzInc = (float*)zInc.Ptr.ToPointer() + 12 * 12 * (3 + 2 + 2), preshaped = (float*)reshaped.Ptr.ToPointer(); + for (int i = 0; i < zConv.Entities; i++) + Buffer.MemoryCopy(pzInc + i * zInc.Length, preshaped + i * zConv.Length, sizeof(float) * zConv.Length, sizeof(float) * zConv.Length); + Assert.IsTrue(reshaped.ContentEquals(zConv)); + zTemp.Free(); + aTemp.Free(); + zConv.Free(); + zInc.Free(); + float* paInc = (float*)aInc.Ptr.ToPointer() + 12 * 12 * (3 + 2 + 2); + for (int i = 0; i < aConv.Entities; i++) + Buffer.MemoryCopy(paInc + i * aInc.Length, preshaped + i * aConv.Length, sizeof(float) * aConv.Length, sizeof(float) * aConv.Length); + Assert.IsTrue(reshaped.ContentEquals(aConv)); + aConv.Free(); + aInc.Free(); + reshaped.Free(); + } + } + } +} diff --git a/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnLayersTest.cs b/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnLayersTest.cs index a661ad8..790a697 
100644 --- a/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnLayersTest.cs +++ b/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnLayersTest.cs @@ -8,8 +8,6 @@ using NeuralNetworkNET.Networks.Implementations.Layers; using NeuralNetworkNET.Networks.Implementations.Layers.Abstract; using NeuralNetworkNET.Networks.Implementations.Layers.Helpers; -using System; -using System.Runtime.CompilerServices; namespace NeuralNetworkNET.Cuda.Unit { @@ -250,155 +248,5 @@ public void PoolingBackward() } #endregion - - #region Inception - - [TestMethod] - public unsafe void InceptionForward1x1() - { - float[,] x = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(10), 32 * 32 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(10, 32 * 32 * 3); - CuDnnConvolutionalLayer conv = new CuDnnConvolutionalLayer(TensorInfo.CreateForRgbImage(32, 32), ConvolutionInfo.New(ConvolutionMode.CrossCorrelation), (1, 1), 10, ActivationFunctionType.ReLU, BiasInitializationMode.Gaussian); - CuDnnInceptionLayer inception = new CuDnnInceptionLayer(conv.InputInfo, InceptionInfo.New(10, 10, 10, 10, 10, PoolingMode.Max, 10)); - Buffer.BlockCopy(conv.Weights, 0, inception.Weights, 0, sizeof(float) * conv.Weights.Length); - Buffer.BlockCopy(conv.Biases, 0, inception.Biases, 0, sizeof(float) * conv.Biases.Length); - fixed (float* px = x) - { - // Forward + Z - Tensor.Reshape(px, x.GetLength(0), x.GetLength(1), out Tensor xTensor); - conv.Forward(xTensor, out Tensor zConv, out Tensor aConv); - inception.Forward(xTensor, out Tensor zInc, out Tensor aInc); - Tensor.New(zConv.Entities, zConv.Length, out Tensor reshaped); - float* pzInc = (float*)zInc.Ptr.ToPointer(), preshaped = (float*)reshaped.Ptr.ToPointer(); - for (int i = 0; i < zConv.Entities; i++) - Buffer.MemoryCopy(pzInc + i * zInc.Length, preshaped + i * zConv.Length, sizeof(float) * zConv.Length, sizeof(float) * zConv.Length); - Assert.IsTrue(reshaped.ContentEquals(zConv)); - - // A - float* paInc = (float*)aInc.Ptr.ToPointer(); - for (int i = 0; i < aConv.Entities; i++) - Buffer.MemoryCopy(paInc + i * aInc.Length, preshaped + i * aConv.Length, sizeof(float) * aConv.Length, sizeof(float) * aConv.Length); - Assert.IsTrue(reshaped.ContentEquals(aConv)); - zConv.Free(); - aConv.Free(); - zInc.Free(); - aInc.Free(); - reshaped.Free(); - } - } - - [TestMethod] - public unsafe void InceptionForward3x3Pipeline() - { - float[,] x = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(10), 32 * 32 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(10, 32 * 32 * 3); - CuDnnConvolutionalLayer - conv1 = new CuDnnConvolutionalLayer(TensorInfo.CreateForRgbImage(32, 32), ConvolutionInfo.New(ConvolutionMode.CrossCorrelation), (1, 1), 10, ActivationFunctionType.ReLU, BiasInitializationMode.Gaussian), - conv2 = new CuDnnConvolutionalLayer(conv1.OutputInfo, ConvolutionInfo.New(ConvolutionMode.CrossCorrelation, 1, 1), (3, 3), 10, ActivationFunctionType.ReLU, BiasInitializationMode.Gaussian); - CuDnnInceptionLayer inception = new CuDnnInceptionLayer(TensorInfo.CreateForRgbImage(32, 32), InceptionInfo.New(10, 10, 10, 10, 10, PoolingMode.Max, 10)); - fixed (float* pw = inception.Weights) - Unsafe.InitBlock(pw, 0, (uint)(sizeof(float) * inception.Weights.Length)); - Buffer.BlockCopy(conv1.Weights, 0, inception.Weights, sizeof(float) * 3 * 10, sizeof(float) * conv1.Weights.Length); - Buffer.BlockCopy(conv2.Weights, 0, inception.Weights, sizeof(float) * 3 * 10 + sizeof(float) * conv1.Weights.Length, sizeof(float) * conv2.Weights.Length); - Buffer.BlockCopy(conv1.Biases, 0, inception.Biases, 
sizeof(float) * 10, sizeof(float) * conv1.Biases.Length); - Buffer.BlockCopy(conv2.Biases, 0, inception.Biases, sizeof(float) * 20, sizeof(float) * conv2.Biases.Length); - fixed (float* px = x) - { - Tensor.Reshape(px, x.GetLength(0), x.GetLength(1), out Tensor xTensor); - conv1.Forward(xTensor, out Tensor zTemp, out Tensor aTemp); - zTemp.Free(); - conv2.Forward(aTemp, out Tensor zConv, out Tensor aConv); - inception.Forward(xTensor, out Tensor zInc, out Tensor aInc); - Tensor.New(zConv.Entities, zConv.Length, out Tensor reshaped); - float* pzInc = (float*)zInc.Ptr.ToPointer() + 32 * 32 * 10, preshaped = (float*)reshaped.Ptr.ToPointer(); - for (int i = 0; i < zConv.Entities; i++) - Buffer.MemoryCopy(pzInc + i * zInc.Length, preshaped + i * zConv.Length, sizeof(float) * zConv.Length, sizeof(float) * zConv.Length); - Assert.IsTrue(reshaped.ContentEquals(zConv)); - zConv.Free(); - zInc.Free(); - float* paInc = (float*)aInc.Ptr.ToPointer() + 32 * 32 * 10; - for (int i = 0; i < aConv.Entities; i++) - Buffer.MemoryCopy(paInc + i * aInc.Length, preshaped + i * aConv.Length, sizeof(float) * aConv.Length, sizeof(float) * aConv.Length); - Assert.IsTrue(reshaped.ContentEquals(aConv)); - aConv.Free(); - aInc.Free(); - reshaped.Free(); - } - } - - [TestMethod] - public unsafe void InceptionForward5x5Pipeline() - { - float[,] x = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(10), 12 * 12 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(10, 12 * 12 * 3); - CuDnnConvolutionalLayer - conv1 = new CuDnnConvolutionalLayer(TensorInfo.CreateForRgbImage(12, 12), ConvolutionInfo.New(ConvolutionMode.CrossCorrelation), (1, 1), 10, ActivationFunctionType.ReLU, BiasInitializationMode.Gaussian), - conv2 = new CuDnnConvolutionalLayer(conv1.OutputInfo, ConvolutionInfo.New(ConvolutionMode.CrossCorrelation, 2, 2), (5, 5), 10, ActivationFunctionType.ReLU, BiasInitializationMode.Gaussian); - CuDnnInceptionLayer inception = new CuDnnInceptionLayer(TensorInfo.CreateForRgbImage(12, 12), InceptionInfo.New(3, 2, 2, 10, 10, PoolingMode.Max, 2)); - fixed (float* pw = inception.Weights) - Unsafe.InitBlock(pw, 0, (uint)(sizeof(float) * inception.Weights.Length)); - Buffer.BlockCopy(conv1.Weights, 0, inception.Weights, sizeof(float) * (3 * 3 + 3 * 2 + 3 * 3 * 2 * 2), sizeof(float) * conv1.Weights.Length); - Buffer.BlockCopy(conv2.Weights, 0, inception.Weights, sizeof(float) * (3 * 3 + 3 * 2 + 3 * 3 * 2 * 2 + conv1.Weights.Length), sizeof(float) * conv2.Weights.Length); - Buffer.BlockCopy(conv1.Biases, 0, inception.Biases, sizeof(float) * (3 + 2 + 2), sizeof(float) * conv1.Biases.Length); - Buffer.BlockCopy(conv2.Biases, 0, inception.Biases, sizeof(float) * (3 + 2 + 2 + 10), sizeof(float) * conv2.Biases.Length); - fixed (float* px = x) - { - Tensor.Reshape(px, x.GetLength(0), x.GetLength(1), out Tensor xTensor); - conv1.Forward(xTensor, out Tensor zTemp, out Tensor aTemp); - zTemp.Free(); - conv2.Forward(aTemp, out Tensor zConv, out Tensor aConv); - inception.Forward(xTensor, out Tensor zInc, out Tensor aInc); - Tensor.New(zConv.Entities, zConv.Length, out Tensor reshaped); - float* pzInc = (float*)zInc.Ptr.ToPointer() + 12 * 12 * (3 + 2), preshaped = (float*)reshaped.Ptr.ToPointer(); - for (int i = 0; i < zConv.Entities; i++) - Buffer.MemoryCopy(pzInc + i * zInc.Length, preshaped + i * zConv.Length, sizeof(float) * zConv.Length, sizeof(float) * zConv.Length); - Assert.IsTrue(reshaped.ContentEquals(zConv)); - aTemp.Free(); - zConv.Free(); - zInc.Free(); - float* paInc = (float*)aInc.Ptr.ToPointer() + 12 * 12 * 
(3 + 2); - for (int i = 0; i < aConv.Entities; i++) - Buffer.MemoryCopy(paInc + i * aInc.Length, preshaped + i * aConv.Length, sizeof(float) * aConv.Length, sizeof(float) * aConv.Length); - Assert.IsTrue(reshaped.ContentEquals(aConv)); - aConv.Free(); - aInc.Free(); - reshaped.Free(); - } - } - - [TestMethod] - public unsafe void InceptionForwardPoolPipeline() - { - float[,] x = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(10), 12 * 12 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(10, 12 * 12 * 3); - CuDnnPoolingLayer pool = new CuDnnPoolingLayer(TensorInfo.CreateForRgbImage(12, 12), PoolingInfo.New(PoolingMode.Max, 3, 3, 1, 1, 1, 1), ActivationFunctionType.ReLU); - CuDnnConvolutionalLayer conv = new CuDnnConvolutionalLayer(pool.OutputInfo, ConvolutionInfo.New(ConvolutionMode.CrossCorrelation), (1, 1), 10, ActivationFunctionType.ReLU, BiasInitializationMode.Gaussian); - CuDnnInceptionLayer inception = new CuDnnInceptionLayer(TensorInfo.CreateForRgbImage(12, 12), InceptionInfo.New(3, 2, 2, 2, 2, PoolingMode.Max, 10)); - fixed (float* pw = inception.Weights) - Unsafe.InitBlock(pw, 0, (uint)(sizeof(float) * inception.Weights.Length)); - Buffer.BlockCopy(conv.Weights, 0, inception.Weights, sizeof(float) * (3 * 3 + 3 * 2 + 3 * 3 * 2 * 2 + 3 * 2 + 5 * 5 * 2 * 2), sizeof(float) * conv.Weights.Length); - Buffer.BlockCopy(conv.Biases, 0, inception.Biases, sizeof(float) * (3 + 2 + 2 + 2 + 2), sizeof(float) * conv.Biases.Length); - fixed (float* px = x) - { - Tensor.Reshape(px, x.GetLength(0), x.GetLength(1), out Tensor xTensor); - pool.Forward(xTensor, out Tensor zTemp, out Tensor aTemp); - conv.Forward(aTemp, out Tensor zConv, out Tensor aConv); - inception.Forward(xTensor, out Tensor zInc, out Tensor aInc); - Tensor.New(zConv.Entities, zConv.Length, out Tensor reshaped); - float* pzInc = (float*)zInc.Ptr.ToPointer() + 12 * 12 * (3 + 2 + 2), preshaped = (float*)reshaped.Ptr.ToPointer(); - for (int i = 0; i < zConv.Entities; i++) - Buffer.MemoryCopy(pzInc + i * zInc.Length, preshaped + i * zConv.Length, sizeof(float) * zConv.Length, sizeof(float) * zConv.Length); - Assert.IsTrue(reshaped.ContentEquals(zConv)); - zTemp.Free(); - aTemp.Free(); - zConv.Free(); - zInc.Free(); - float* paInc = (float*)aInc.Ptr.ToPointer() + 12 * 12 * (3 + 2 + 2); - for (int i = 0; i < aConv.Entities; i++) - Buffer.MemoryCopy(paInc + i * aInc.Length, preshaped + i * aConv.Length, sizeof(float) * aConv.Length, sizeof(float) * aConv.Length); - Assert.IsTrue(reshaped.ContentEquals(aConv)); - aConv.Free(); - aInc.Free(); - reshaped.Free(); - } - } - - #endregion } } From fb738d119d742ef701fc7ecb82331e328f3531b0 Mon Sep 17 00:00:00 2001 From: Sergio0694 Date: Fri, 29 Dec 2017 20:47:42 +0100 Subject: [PATCH 29/30] CuDnnPoolingLayer backpropagation switched to cuDNN --- .../Layers/CuDnnPoolingLayer.cs | 58 ++++++++++++++++++- .../CuDnnLayersTest.cs | 36 ++++++++++-- 2 files changed, 86 insertions(+), 8 deletions(-) diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnPoolingLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnPoolingLayer.cs index 0c5c4be..3cca675 100644 --- a/NeuralNetwork.NET.Cuda/Layers/CuDnnPoolingLayer.cs +++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnPoolingLayer.cs @@ -1,4 +1,5 @@ -using Alea; +using System; +using Alea; using Alea.cuDNN; using NeuralNetworkNET.Extensions; using NeuralNetworkNET.Cuda.Extensions; @@ -17,7 +18,7 @@ namespace NeuralNetworkNET.Cuda.Layers /// A pooling layer running on cuDNN, with a custom pooling mode /// [JsonObject(MemberSerialization.OptIn)] - internal sealed 
class CuDnnPoolingLayer : PoolingLayer + internal sealed class CuDnnPoolingLayer : PoolingLayer, IDisposable { #region cuDNN fields @@ -41,6 +42,16 @@ internal sealed class CuDnnPoolingLayer : PoolingLayer #endregion + #region Fields + + // A copy of the layer inputs + private Tensor _X; + + // A copy of the layer output activity + private Tensor _Z; + + #endregion + public CuDnnPoolingLayer(in TensorInfo input, in PoolingInfo operation, ActivationFunctionType activation) : base(input, operation, activation) { PoolingDescription.Set2D((PoolingMode)operation.Mode, NanPropagation.PROPAGATE_NAN, operation.WindowHeight, operation.WindowWidth, operation.VerticalPadding, operation.HorizontalPadding, operation.VerticalStride, operation.HorizontalStride); @@ -49,6 +60,8 @@ public CuDnnPoolingLayer(in TensorInfo input, in PoolingInfo operation, Activati /// public override void Forward(in Tensor x, out Tensor z, out Tensor a) { + _X.TryFree(); + x.Duplicate(out _X); using (DeviceMemory x_gpu = DnnInstance.Gpu.AllocateDevice(x), z_gpu = DnnInstance.Gpu.AllocateDevice(x.Entities * OutputInfo.Size)) @@ -58,6 +71,8 @@ public override void Forward(in Tensor x, out Tensor z, out Tensor a) OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OutputInfo.Channels, OutputInfo.Height, OutputInfo.Width); DnnInstance.PoolingForward(PoolingDescription, 1, InputDescription, x_gpu.Ptr, 0, OutputDescription, z_gpu.Ptr); z_gpu.CopyToHost(x.Entities, OutputInfo.Size, out z); + _Z.TryFree(); + z.Duplicate(out _Z); // Activation DnnInstance.ActivationForward(z.Entities, z.Length, z_gpu.Ptr, z_gpu.Ptr, ActivationFunctions.Activation); @@ -66,7 +81,24 @@ public override void Forward(in Tensor x, out Tensor z, out Tensor a) } /// - public override void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFunction activationPrime) => z.UpscalePool2x2(delta_1, InputInfo.Channels); + public override void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFunction activationPrime) + { + using (DeviceMemory dx_gpu = DnnInstance.Gpu.AllocateDevice(z.Size)) + { + using (DeviceMemory + x_gpu = DnnInstance.Gpu.AllocateDevice(_X), + y_gpu = DnnInstance.Gpu.AllocateDevice(_Z), + dy_gpu = DnnInstance.Gpu.AllocateDevice(delta_1)) + { + DnnInstance.PoolingBackward(PoolingDescription, 1, OutputDescription, y_gpu.Ptr, OutputDescription, dy_gpu.Ptr, InputDescription, x_gpu.Ptr, 0, InputDescription, dx_gpu.Ptr); + } + using (DeviceMemory z_gpu = DnnInstance.Gpu.AllocateDevice(z)) + { + DnnInstance.ActivationBackward(z.Entities, z.Length, z_gpu.Ptr, dx_gpu.Ptr, activationPrime); + z_gpu.CopyTo(z); + } + } + } /// public override INetworkLayer Clone() => new CuDnnPoolingLayer(InputInfo, OperationInfo, ActivationFunctionType); @@ -84,5 +116,25 @@ public override void Forward(in Tensor x, out Tensor z, out Tensor a) if (!stream.TryRead(out PoolingInfo operation)) return null; return new CuDnnPoolingLayer(input, operation, activation); } + + #region IDisposable + + ~CuDnnPoolingLayer() => Dispose(); + + /// + void IDisposable.Dispose() + { + GC.SuppressFinalize(this); + Dispose(); + } + + // Private Dispose method + private void Dispose() + { + _X.TryFree(); + _Z.TryFree(); + } + + #endregion } } \ No newline at end of file diff --git a/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnLayersTest.cs b/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnLayersTest.cs index 790a697..87e53d4 100644 --- a/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnLayersTest.cs +++ b/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnLayersTest.cs @@ -236,15 +236,41 @@ 
public void PoolingForward() } [TestMethod] - public void PoolingBackward() + public unsafe void PoolingBackward() { - float[,] - delta_1 = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(400), 29 * 29 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 29 * 29 * 3), - z = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(400), 58 * 58 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 58 * 58 * 3); + // Setup + Tensor.New(400, 58 * 58 * 3, out Tensor x); + KerasWeightsProvider.FillWithHeEtAlUniform(x, 10); PoolingLayer cpu = new PoolingLayer(new TensorInfo(58, 58, 3), PoolingInfo.Default, ActivationFunctionType.LeakyReLU), gpu = new CuDnnPoolingLayer(cpu.InputInfo, PoolingInfo.Default, ActivationFunctionType.LeakyReLU); - TestBackward(cpu, gpu, delta_1, z); + gpu.Forward(x, out Tensor z, out Tensor a); + a.Free(); + x.Duplicate(out Tensor x2); + Tensor.New(z.Entities, z.Length, out Tensor delta); + KerasWeightsProvider.FillWithHeEtAlUniform(delta, 10); + + // Backward + cpu.Backpropagate(delta, x, ActivationFunctions.LeakyReLUPrime); + gpu.Backpropagate(delta, x2, ActivationFunctions.LeakyReLUPrime); + bool valid = true; + float* px = (float*)x.Ptr.ToPointer(), px2 = (float*)x2.Ptr.ToPointer(); + int count = 0; + for (int i = 0; i < x.Size; i++) + { + if (px[i].EqualsWithDelta(px2[i], 1e-5f)) continue; + if (px[i].EqualsWithDelta(px2[i] * 100f, 1e-5f)) count++; // The cuDNN pooling backward method occasionally returns a value scaled by 0.01 for unclear reasons (fewer than 2% of the entries) + else + { + valid = false; + break; + } + } + Assert.IsTrue(valid && count * 100f / x.Size < 2); + x.Free(); + x2.Free(); + z.Free(); + delta.Free(); } #endregion From b84d99172534dcf0c57c6c41b4c094ee6b62fdc5 Mon Sep 17 00:00:00 2001 From: Sergio0694 Date: Fri, 29 Dec 2017 21:57:57 +0100 Subject: [PATCH 30/30] Inception layer pool gradient test added --- .../CuDnnInceptionLayerTest.cs | 30 ++++++++++++++++--- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnInceptionLayerTest.cs b/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnInceptionLayerTest.cs index e12a674..66ebcd5 100644 --- a/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnInceptionLayerTest.cs +++ b/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnInceptionLayerTest.cs @@ -232,6 +232,7 @@ public unsafe void InceptionPoolPipeline() Buffer.BlockCopy(conv.Biases, 0, inception.Biases, sizeof(float) * (3 + 2 + 2 + 2 + 2), sizeof(float) * conv.Biases.Length); fixed (float* px = x) { + // Forward + Z Tensor.Reshape(px, x.GetLength(0), x.GetLength(1), out Tensor xTensor); pool.Forward(xTensor, out Tensor zTemp, out Tensor aTemp); conv.Forward(aTemp, out Tensor zConv, out Tensor aConv); @@ -241,14 +242,35 @@ public unsafe void InceptionPoolPipeline() for (int i = 0; i < zConv.Entities; i++) Buffer.MemoryCopy(pzInc + i * zInc.Length, preshaped + i * zConv.Length, sizeof(float) * zConv.Length, sizeof(float) * zConv.Length); Assert.IsTrue(reshaped.ContentEquals(zConv)); - zTemp.Free(); - aTemp.Free(); - zConv.Free(); - zInc.Free(); + + // A float* paInc = (float*)aInc.Ptr.ToPointer() + 12 * 12 * (3 + 2 + 2); for (int i = 0; i < aConv.Entities; i++) Buffer.MemoryCopy(paInc + i * aInc.Length, preshaped + i * aConv.Length, sizeof(float) * aConv.Length, sizeof(float) * aConv.Length); Assert.IsTrue(reshaped.ContentEquals(aConv)); + + // Backpropagation + Tensor.New(xTensor.Entities, xTensor.Length, out Tensor z1); + KerasWeightsProvider.FillWithHeEtAlUniform(z1, 10); + z1.Duplicate(out Tensor z2);
+ conv.Backpropagate(aConv, zTemp, pool.ActivationFunctions.ActivationPrime); + pool.Backpropagate(zTemp, z1, ActivationFunctions.ReLUPrime); + inception.Backpropagate(aInc, z2, ActivationFunctions.ReLUPrime); + Assert.IsTrue(z1.ContentEquals(z2)); + + // Gradient + conv.ComputeGradient(aTemp, aConv, out Tensor dJdwConv, out Tensor dJdbConv); + inception.ComputeGradient(xTensor, aInc, out Tensor dJdwInc, out Tensor dJdbInc); + Tensor.Reshape((float*)dJdwInc.Ptr.ToPointer() + (3 * 3 + 3 * 2 + 3 * 3 * 2 * 2 + 3 * 2 + 5 * 5 * 2 * 2), 1, dJdwConv.Size, out Tensor dJdwInc0); + Tensor.Reshape((float*)dJdbInc.Ptr.ToPointer() + 11, 1, dJdbConv.Size, out Tensor dJdbInc0); + Assert.IsTrue(dJdwConv.ContentEquals(dJdwInc0, 1e-5f)); + Assert.IsTrue(dJdbConv.ContentEquals(dJdbInc0, 1e-5f)); + + // Cleanup + zTemp.Free(); + aTemp.Free(); + zConv.Free(); + zInc.Free(); aConv.Free(); aInc.Free(); reshaped.Free();
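A note on the Buffer.BlockCopy offsets used throughout the inception tests above: the arithmetic is consistent with CuDnnInceptionLayer storing its parameters contiguously in branch order (primary 1x1, 1x1 reduction feeding the 3x3, 3x3, 1x1 reduction feeding the 5x5, 5x5, then the chained 1x1 after pooling), with InceptionInfo.New taking the kernel counts in that same order. The helper below is a hypothetical sketch, not part of the library; it reproduces those float offsets from the kernel counts, and the branch layout it assumes is inferred from the tests rather than from the layer's source.

    // Hypothetical helper: weight/bias float offsets of each inception branch,
    // assuming parameters are stored contiguously in branch order as the
    // Buffer.BlockCopy calls in the tests above imply.
    public static class InceptionOffsets
    {
        // branch: 0 = primary 1x1, 1 = 3x3 reduction 1x1, 2 = 3x3,
        //         3 = 5x5 reduction 1x1, 4 = 5x5, 5 = chained 1x1 after pooling
        public static (int Weights, int Biases) Get(
            int inputChannels, int primary1x1, int reduce3x3, int secondary3x3,
            int reduce5x5, int secondary5x5, int branch)
        {
            int[] weights =
            {
                inputChannels * primary1x1,       // primary 1x1 kernels
                inputChannels * reduce3x3,        // 1x1 reduction feeding the 3x3
                3 * 3 * reduce3x3 * secondary3x3, // 3x3 kernels
                inputChannels * reduce5x5,        // 1x1 reduction feeding the 5x5
                5 * 5 * reduce5x5 * secondary5x5  // 5x5 kernels
            };
            int[] biases = { primary1x1, reduce3x3, secondary3x3, reduce5x5, secondary5x5 };
            int w = 0, b = 0;
            for (int i = 0; i < branch; i++)
            {
                w += weights[i];
                b += biases[i];
            }
            return (w, b);
        }
    }

For the Inception5x5Pipeline setup (3 input channels, InceptionInfo.New(3, 2, 2, 10, 10, PoolingMode.Max, 2)), Get(3, 3, 2, 2, 10, 10, branch: 3) returns (3 * 3 + 3 * 2 + 3 * 3 * 2 * 2, 3 + 2 + 2) = (51, 7), matching the copy targets for conv1; with the InceptionPoolPipeline counts (3, 2, 2, 2, 2), branch 5 gives (157, 11), matching both the weight copy and the + 11 bias offset in the gradient check.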
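On the cuDNN-backed CuDnnPoolingLayer.Backpropagate introduced in patch 29: the pooling backward primitive needs the forward input and the pooled output in addition to the incoming delta, which is why Forward now caches copies of both in _X and _Z. Below is an annotated copy of the PoolingBackward invocation from the diff; the argument roles follow the cudnnPoolingBackward convention (pool, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx), with alpha scaling the computed gradient and beta scaling whatever dx already holds.

    DnnInstance.PoolingBackward(
        PoolingDescription,               // pooling window, stride and padding
        1,                                // alpha: scale for the computed gradient
        OutputDescription, y_gpu.Ptr,     // y: pooled activity cached in _Z
        OutputDescription, dy_gpu.Ptr,    // dy: incoming error delta (delta_1)
        InputDescription, x_gpu.Ptr,      // x: forward input cached in _X
        0,                                // beta: 0 overwrites dx instead of accumulating
        InputDescription, dx_gpu.Ptr);    // dx: gradient with respect to the input

Caching the two tensors on the managed side trades memory for correctness; the new IDisposable implementation releases both copies when the layer is disposed or finalized.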
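The elementwise comparison loop in the reworked PoolingBackward test could also be lifted into a small reusable helper. The sketch below is a hypothetical refactoring under the same assumptions as the test (the EqualsWithDelta float extension and the Tensor layout used above): entries must match within 1e-5, and the occasional 0.01-scaled values returned by the cuDNN backward pass are tolerated as long as they stay under the 2% threshold the test asserts.

    // Hypothetical refactoring of the PoolingBackward comparison loop:
    // a and b are equal when entries match within 1e-5, allowing a small
    // fraction (< 2%) of entries that cuDNN returns scaled by 0.01.
    private static unsafe bool EqualsWithScaleTolerance(in Tensor a, in Tensor b)
    {
        float* pa = (float*)a.Ptr.ToPointer(), pb = (float*)b.Ptr.ToPointer();
        int scaled = 0;
        for (int i = 0; i < a.Size; i++)
        {
            if (pa[i].EqualsWithDelta(pb[i], 1e-5f)) continue;        // regular match
            if (pa[i].EqualsWithDelta(pb[i] * 100f, 1e-5f)) scaled++; // 0.01-scaled entry
            else return false;                                        // genuine mismatch
        }
        return scaled * 100f / a.Size < 2;                            // under the 2% cap
    }

With this in place, the final check in the test reduces to Assert.IsTrue(EqualsWithScaleTolerance(x, x2)).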