diff --git a/NeuralNetwork.NET.Cuda/APIS/CuDnnNetworkLayers.cs b/NeuralNetwork.NET.Cuda/APIs/CuDnnNetworkLayers.cs
similarity index 85%
rename from NeuralNetwork.NET.Cuda/APIS/CuDnnNetworkLayers.cs
rename to NeuralNetwork.NET.Cuda/APIs/CuDnnNetworkLayers.cs
index 13c6ab2..1b6f1d6 100644
--- a/NeuralNetwork.NET.Cuda/APIS/CuDnnNetworkLayers.cs
+++ b/NeuralNetwork.NET.Cuda/APIs/CuDnnNetworkLayers.cs
@@ -67,5 +67,18 @@ public static INetworkLayer Convolutional(
         [PublicAPI]
         [Pure, NotNull]
         public static INetworkLayer Pooling(in TensorInfo input, in PoolingInfo info, ActivationFunctionType activation) => new CuDnnPoolingLayer(input, info, activation);
+
+        /// <summary>
+        /// Creates a new inception layer with the given input and features
+        /// </summary>
+        /// <param name="input">The input volume to process</param>
+        /// <param name="info">The info on the operations to execute inside the layer</param>
+        /// <param name="biasMode">Indicates the desired initialization mode to use for the layer bias values</param>
+        [PublicAPI]
+        [Pure, NotNull]
+        public static INetworkLayer Inception(
+            in TensorInfo input, in InceptionInfo info,
+            BiasInitializationMode biasMode = BiasInitializationMode.Zero)
+            => new CuDnnInceptionLayer(input, info, biasMode);
     }
 }
\ No newline at end of file
diff --git a/NeuralNetwork.NET.Cuda/APIs/CuDnnNetworkLayersDeserializer.cs b/NeuralNetwork.NET.Cuda/APIs/CuDnnNetworkLayersDeserializer.cs
index dfafc6a..fda4c29 100644
--- a/NeuralNetwork.NET.Cuda/APIs/CuDnnNetworkLayersDeserializer.cs
+++ b/NeuralNetwork.NET.Cuda/APIs/CuDnnNetworkLayersDeserializer.cs
@@ -31,6 +31,7 @@ private static INetworkLayer Deserialize([NotNull] Stream stream, LayerType type
                 case LayerType.Convolutional: return CuDnnConvolutionalLayer.Deserialize(stream);
                 case LayerType.Pooling: return CuDnnPoolingLayer.Deserialize(stream);
                 case LayerType.Softmax: return CuDnnSoftmaxLayer.Deserialize(stream);
+                case LayerType.Inception: return CuDnnInceptionLayer.Deserialize(stream);
                 default: return null;
             }
         } 
diff --git a/NeuralNetwork.NET.Cuda/Extensions/GpuExtensions.cs b/NeuralNetwork.NET.Cuda/Extensions/GpuExtensions.cs
index 3426b0a..03d7ead 100644
--- a/NeuralNetwork.NET.Cuda/Extensions/GpuExtensions.cs
+++ b/NeuralNetwork.NET.Cuda/Extensions/GpuExtensions.cs
@@ -27,6 +27,39 @@ public static DeviceMemory<float> AllocateDevice([NotNull] this Gpu gpu, in Tens
                 : throw new InvalidOperationException($"Failed to copy the source data on the target GPU device, [CUDA ERROR] {result}");
         }
 
+        /// <summary>
+        /// Allocates a memory area on device memory, reading the target values at a given offset from the input <see cref="Tensor"/>
+        /// </summary>
+        /// <param name="gpu">The <see cref="Gpu"/> device to use</param>
+        /// <param name="source">The source <see cref="Tensor"/> with the data to copy</param>
+        /// <param name="offset">The column offset for the data to read from each row</param>
+        /// <param name="length"></param>
+        [MustUseReturnValue, NotNull]
+        public static unsafe DeviceMemory<float> AllocateDevice([NotNull] this Gpu gpu, in Tensor source, int offset, int length)
+        {
+            // Checks
+            if (source.Length - offset < length) throw new ArgumentOutOfRangeException(nameof(offset), "The input offset isn't valid");
+
+            // Memory copy
+            DeviceMemory<float> result_gpu = gpu.AllocateDevice<float>(source.Entities * length);
+            CUDAInterop.CUDA_MEMCPY2D_st* ptSt = stackalloc CUDAInterop.CUDA_MEMCPY2D_st[1];
+            ptSt[0] = new CUDAInterop.CUDA_MEMCPY2D_st
+            {
+                srcMemoryType = CUDAInterop.CUmemorytype_enum.CU_MEMORYTYPE_HOST,
+                srcHost = source.Ptr + sizeof(float) * offset,
+                srcPitch = new IntPtr(sizeof(float) * source.Length),
+                dstMemoryType = CUDAInterop.CUmemorytype_enum.CU_MEMORYTYPE_DEVICE,
+                dstDevice = result_gpu.Handle,
+                dstPitch = new IntPtr(sizeof(float) * length),
+                WidthInBytes = new IntPtr(sizeof(float) * length),
+                Height = new IntPtr(source.Entities)
+            };
+            CUDAInterop.cudaError_enum result = CUDAInterop.cuMemcpy2D(ptSt);
+            return result == CUDAInterop.cudaError_enum.CUDA_SUCCESS
+                ? result_gpu
+                : throw new InvalidOperationException($"Failed to copy the source data on the given destination, [CUDA ERROR] {result}");
+        }
+
         /// <summary>
         /// Copies the contents of the input <see cref="DeviceMemory{T}"/> instance to the target host memory area
         /// </summary>
@@ -40,6 +73,37 @@ public static void CopyTo([NotNull] this DeviceMemory<float> source, in Tensor d
                 throw new InvalidOperationException($"Failed to copy the source data on the given destination, [CUDA ERROR] {result}");
         }
 
+        /// <summary>
+        /// Copies the source data into the target <see cref="Tensor"/>, splitting each individual entry into its own row
+        /// </summary>
+        /// <param name="source">The source memory area with the concatenated data for each entry</param>
+        /// <param name="destination">The destination <see cref="Tensor"/> that will store the data</param>
+        /// <param name="offset">The column offset for the data for each entry</param>
+        /// <param name="length">The number of values to copy for each entry</param>
+        public static unsafe void CopyTo([NotNull] this DeviceMemory<float> source, in Tensor destination, int offset, int length)
+        {
+            // Checks
+            if (source.Length / length != destination.Entities) throw new ArgumentOutOfRangeException(nameof(length), "The input length doesn't match the given arguments");
+            if (destination.Length - offset < length) throw new ArgumentOutOfRangeException(nameof(offset), "The input offset isn't valid");
+
+            // Memory copy
+            CUDAInterop.CUDA_MEMCPY2D_st* ptSt = stackalloc CUDAInterop.CUDA_MEMCPY2D_st[1];
+            ptSt[0] = new CUDAInterop.CUDA_MEMCPY2D_st
+            {
+                srcMemoryType = CUDAInterop.CUmemorytype_enum.CU_MEMORYTYPE_DEVICE,
+                srcDevice = source.Handle,
+                srcPitch = new IntPtr(sizeof(float) * length),
+                dstMemoryType = CUDAInterop.CUmemorytype_enum.CU_MEMORYTYPE_HOST,
+                dstHost = destination.Ptr + sizeof(float) * offset,
+                dstPitch = new IntPtr(sizeof(float) * destination.Length),
+                WidthInBytes = new IntPtr(sizeof(float) * length),
+                Height = new IntPtr(destination.Entities)
+            };
+            CUDAInterop.cudaError_enum result = CUDAInterop.cuMemcpy2D(ptSt);
+            if (result != CUDAInterop.cudaError_enum.CUDA_SUCCESS)
+                throw new InvalidOperationException($"Failed to copy the source data on the given destination, [CUDA ERROR] {result}");
+        }
+
         /// <summary>
         /// Copies the contents of the input <see cref="DeviceMemory{T}"/> to a new memory area on the unmanaged heap
         /// </summary>
diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnConvolutionalLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnConvolutionalLayer.cs
index 4aa0982..db5feb5 100644
--- a/NeuralNetwork.NET.Cuda/Layers/CuDnnConvolutionalLayer.cs
+++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnConvolutionalLayer.cs
@@ -47,9 +47,7 @@ internal sealed class CuDnnConvolutionalLayer : ConvolutionalLayer
         [NotNull]
         private readonly Dnn DnnInstance = DnnService.Instance;
 
-        /// <summary>
-        /// Sets the cuDNN fields that will be used during future forward/backwards operations
-        /// </summary>
+        // cuDNN fields setup
         private void SetupCuDnnInfo()
         {
             ConvolutionDescription.Set2D(OperationInfo.VerticalPadding, OperationInfo.HorizontalPadding, OperationInfo.VerticalStride, OperationInfo.HorizontalStride, 1, 1, (Alea.cuDNN.ConvolutionMode)OperationInfo.Mode);
@@ -74,71 +72,63 @@ public CuDnnConvolutionalLayer(
         #region Implementation
 
         /// <inheritdoc/>
-        public override unsafe void Forward(in Tensor x, out Tensor z, out Tensor a)
+        public override void Forward(in Tensor x, out Tensor z, out Tensor a)
         {
-            fixed (float* pw = Weights)
+            using (DeviceMemory<float> z_gpu = DnnInstance.Gpu.AllocateDevice<float>(x.Entities * OutputInfo.Size))
             {
-                Tensor.Reshape(pw, OutputInfo.Channels, KernelInfo.Size, out Tensor wTensor);
-                using (DeviceMemory<float> z_gpu = DnnInstance.Gpu.AllocateDevice<float>(x.Entities * OutputInfo.Size))
+                // Tensors info setup
+                InputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, InputInfo.Channels, InputInfo.Height, InputInfo.Width);
+                OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OutputInfo.Channels, OutputInfo.Height, OutputInfo.Width);
+
+                // Forward convolution
+                DnnInstance.GetConvolutionForwardAlgorithm(InputDescription, FilterDescription, ConvolutionDescription, OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionFwdAlgo algorithm);
+                DnnInstance.GetConvolutionForwardWorkspaceSize(InputDescription, FilterDescription, ConvolutionDescription, OutputDescription, algorithm, out IntPtr size);
+                using (DeviceMemory<float>
+                    x_gpu = DnnInstance.Gpu.AllocateDevice(x),
+                    w_gpu = DnnInstance.Gpu.AllocateDevice(Weights))
+                using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
                 {
-                    // Tensors info setup
-                    InputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, InputInfo.Channels, InputInfo.Height, InputInfo.Width);
-                    OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OutputInfo.Channels, OutputInfo.Height, OutputInfo.Width);
-
-                    // Forward convolution
-                    DnnInstance.GetConvolutionForwardAlgorithm(InputDescription, FilterDescription, ConvolutionDescription, OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionFwdAlgo algorithm);
-                    DnnInstance.GetConvolutionForwardWorkspaceSize(InputDescription, FilterDescription, ConvolutionDescription, OutputDescription, algorithm, out IntPtr size);
-                    using (DeviceMemory<float>
-                        x_gpu = DnnInstance.Gpu.AllocateDevice(x),
-                        w_gpu = DnnInstance.Gpu.AllocateDevice(wTensor))
-                    using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
-                    {
-                        DnnInstance.ConvolutionForward(1, InputDescription, x_gpu.Ptr, FilterDescription, w_gpu.Ptr, ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, OutputDescription, z_gpu.Ptr);
-                    }
+                    DnnInstance.ConvolutionForward(1, InputDescription, x_gpu.Ptr, FilterDescription, w_gpu.Ptr, ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, OutputDescription, z_gpu.Ptr);
+                }
 
-                    // Biases
-                    using (DeviceMemory<float> b_gpu = DnnInstance.Gpu.AllocateDevice(Biases))
-                    {
-                        DnnInstance.AddTensor(1, BiasDescription, b_gpu.Ptr, 1, OutputDescription, z_gpu.Ptr);
-                    }
-                    z_gpu.CopyToHost(x.Entities, OutputInfo.Size, out z);
+                // Biases
+                using (DeviceMemory<float> b_gpu = DnnInstance.Gpu.AllocateDevice(Biases))
+                {
+                    DnnInstance.AddTensor(1, BiasDescription, b_gpu.Ptr, 1, OutputDescription, z_gpu.Ptr);
+                }
+                z_gpu.CopyToHost(x.Entities, OutputInfo.Size, out z);
 
-                    // Activation
-                    if (ActivationFunctionType == ActivationFunctionType.Identity) z.Duplicate(out a);
-                    else
-                    {
-                        DnnInstance.ActivationForward(z.Entities, z.Length, z_gpu.Ptr, z_gpu.Ptr, ActivationFunctions.Activation);
-                        z_gpu.CopyToHost(z.Entities, z.Length, out a);
-                    }
+                // Activation
+                if (ActivationFunctionType == ActivationFunctionType.Identity) z.Duplicate(out a);
+                else
+                {
+                    DnnInstance.ActivationForward(z.Entities, z.Length, z_gpu.Ptr, z_gpu.Ptr, ActivationFunctions.Activation);
+                    z_gpu.CopyToHost(z.Entities, z.Length, out a);
                 }
             }
         }
 
         /// <inheritdoc/>
-        public override unsafe void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFunction activationPrime)
+        public override void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFunction activationPrime)
         {
-            fixed (float* pw = Weights)
+            using (DeviceMemory<float> delta_gpu = DnnInstance.Gpu.AllocateDevice<float>(z.Size))
             {
-                Tensor.Reshape(pw, OutputInfo.Channels, KernelInfo.Size, out Tensor wTensor);
+                // Convolution
                 DnnInstance.GetConvolutionBackwardDataAlgorithm(FilterDescription, OutputDescription, ConvolutionDescription, InputDescription, ConvolutionBwdDataPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionBwdDataAlgo algorithm);
                 DnnInstance.GetConvolutionBackwardDataWorkspaceSize(FilterDescription, OutputDescription, ConvolutionDescription, InputDescription, algorithm, out IntPtr size);
-                using (DeviceMemory<float> delta_gpu = DnnInstance.Gpu.AllocateDevice<float>(z.Size))
+                using (DeviceMemory<float>
+                    delta_1_gpu = DnnInstance.Gpu.AllocateDevice(delta_1),
+                    w_gpu = DnnInstance.Gpu.AllocateDevice(Weights))
+                using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
                 {
-                    // Backwards convolution
-                    using (DeviceMemory<float>
-                        delta_1_gpu = DnnInstance.Gpu.AllocateDevice(delta_1),
-                        w_gpu = DnnInstance.Gpu.AllocateDevice(wTensor))
-                    using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
-                    {
-                        DnnInstance.ConvolutionBackwardData(1, FilterDescription, w_gpu.Ptr, OutputDescription, delta_1_gpu.Ptr, ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, InputDescription, delta_gpu.Ptr);
-                    }
+                    DnnInstance.ConvolutionBackwardData(1, FilterDescription, w_gpu.Ptr, OutputDescription, delta_1_gpu.Ptr, ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, InputDescription, delta_gpu.Ptr);
+                }
 
-                    // Activation
-                    using (DeviceMemory<float> z_gpu = DnnInstance.Gpu.AllocateDevice(z))
-                    {
-                        DnnInstance.ActivationBackward(z.Entities, z.Length, z_gpu.Ptr, delta_gpu.Ptr, activationPrime);
-                        z_gpu.CopyTo(z);
-                    }
+                // Activation
+                using (DeviceMemory<float> z_gpu = DnnInstance.Gpu.AllocateDevice(z))
+                {
+                    DnnInstance.ActivationBackward(z.Entities, z.Length, z_gpu.Ptr, delta_gpu.Ptr, activationPrime);
+                    z_gpu.CopyTo(z);
                 }
             }
         }
@@ -159,7 +149,7 @@ public override void ComputeGradient(in Tensor a, in Tensor delta, out Tensor dJ
                     {
                         DnnInstance.ConvolutionBackwardFilter(1, InputDescription, a_gpu.Ptr, OutputDescription, delta_gpu.Ptr, ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, FilterDescription, w_gpu.Ptr);
                     }
-                    w_gpu.CopyToHost(Kernels, KernelInfo.Size, out dJdw);
+                    w_gpu.CopyToHost(1, Weights.Length, out dJdw);
                 }
 
                 // Bias
diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnFullyConnectedLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnFullyConnectedLayer.cs
index 87685d3..c21c554 100644
--- a/NeuralNetwork.NET.Cuda/Layers/CuDnnFullyConnectedLayer.cs
+++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnFullyConnectedLayer.cs
@@ -30,39 +30,31 @@ public CuDnnFullyConnectedLayer(in TensorInfo input, int neurons, [NotNull] floa
         #region Implementation
 
         /// <inheritdoc/>
-        public override unsafe void Forward(in Tensor x, out Tensor z, out Tensor a)
+        public override void Forward(in Tensor x, out Tensor z, out Tensor a)
         {
-            fixed (float* pw = Weights)
+            using (DeviceMemory<float>
+                x_gpu = DnnInstance.Gpu.AllocateDevice(x),
+                w_gpu = DnnInstance.Gpu.AllocateDevice(Weights),
+                y_gpu = DnnInstance.Gpu.AllocateDevice<float>(x.Entities * OutputInfo.Size),
+                b_gpu = DnnInstance.Gpu.AllocateDevice(Biases))
             {
-                Tensor.Reshape(pw, InputInfo.Size, OutputInfo.Size, out Tensor wTensor);
-                using (DeviceMemory<float>
-                    x_gpu = DnnInstance.Gpu.AllocateDevice(x),
-                    w_gpu = DnnInstance.Gpu.AllocateDevice(wTensor),
-                    y_gpu = DnnInstance.Gpu.AllocateDevice<float>(x.Entities * OutputInfo.Size),
-                    b_gpu = DnnInstance.Gpu.AllocateDevice(Biases))
-                {
-                    DnnInstance.FullyConnectedForward(x.Entities, x.Length, OutputInfo.Size, x_gpu.Ptr, w_gpu.Ptr, b_gpu.Ptr, y_gpu.Ptr);
-                    y_gpu.CopyToHost(x.Entities, OutputInfo.Size, out z);
-                    DnnInstance.ActivationForward(z.Entities, z.Length, y_gpu.Ptr, y_gpu.Ptr, ActivationFunctions.Activation);
-                    y_gpu.CopyToHost(z.Entities, z.Length, out a);
-                }
+                DnnInstance.FullyConnectedForward(x.Entities, x.Length, OutputInfo.Size, x_gpu.Ptr, w_gpu.Ptr, b_gpu.Ptr, y_gpu.Ptr);
+                y_gpu.CopyToHost(x.Entities, OutputInfo.Size, out z);
+                DnnInstance.ActivationForward(z.Entities, z.Length, y_gpu.Ptr, y_gpu.Ptr, ActivationFunctions.Activation);
+                y_gpu.CopyToHost(z.Entities, z.Length, out a);
             }
         }
 
         /// <inheritdoc/>
-        public override unsafe void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFunction activationPrime)
+        public override void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFunction activationPrime)
         {
-            fixed (float* pw = Weights)
+            using (DeviceMemory<float>
+                delta_1_gpu = DnnInstance.Gpu.AllocateDevice(delta_1),
+                w_gpu = DnnInstance.Gpu.AllocateDevice(Weights),
+                z_gpu = DnnInstance.Gpu.AllocateDevice(z))
             {
-                Tensor.Reshape(pw, InputInfo.Size, OutputInfo.Size, out Tensor wTensor);
-                using (DeviceMemory<float>
-                    delta_1_gpu = DnnInstance.Gpu.AllocateDevice(delta_1),
-                    w_gpu = DnnInstance.Gpu.AllocateDevice(wTensor),
-                    z_gpu = DnnInstance.Gpu.AllocateDevice(z))
-                {
-                    DnnInstance.FullyConnectedBackwardData(z.Entities, InputInfo.Size, OutputInfo.Size, z_gpu.Ptr, delta_1_gpu.Ptr, w_gpu.Ptr, activationPrime);
-                    z_gpu.CopyTo(z);
-                }
+                DnnInstance.FullyConnectedBackwardData(z.Entities, InputInfo.Size, OutputInfo.Size, z_gpu.Ptr, delta_1_gpu.Ptr, w_gpu.Ptr, activationPrime);
+                z_gpu.CopyTo(z);
             }
         }
 
@@ -75,7 +67,7 @@ public override void ComputeGradient(in Tensor a, in Tensor delta, out Tensor dJ
                 w_gpu = DnnInstance.Gpu.AllocateDevice<float>(a.Length * delta.Length))
             {
                 DnnInstance.FullyConnectedBackwardFilter(a.Entities, a.Length, delta.Length, a_gpu.Ptr, delta_gpu.Ptr, w_gpu.Ptr);
-                w_gpu.CopyToHost(a.Length, delta.Length, out dJdw);
+                w_gpu.CopyToHost(1, Weights.Length, out dJdw);
             }
             delta.CompressVertically(out dJdb); // Doing this on CPU is generally faster than launching the kernels
         }
diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs
new file mode 100644
index 0000000..27d7200
--- /dev/null
+++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnInceptionLayer.cs
@@ -0,0 +1,751 @@
+﻿using Alea;
+using Alea.cuDNN;
+using JetBrains.Annotations;
+using NeuralNetworkNET.APIs.Enums;
+using NeuralNetworkNET.APIs.Interfaces;
+using NeuralNetworkNET.APIs.Structs;
+using NeuralNetworkNET.Cuda.Extensions;
+using NeuralNetworkNET.Cuda.Services;
+using NeuralNetworkNET.Extensions;
+using NeuralNetworkNET.Networks.Activations;
+using NeuralNetworkNET.Networks.Activations.Delegates;
+using NeuralNetworkNET.Networks.Implementations.Layers.Abstract;
+using NeuralNetworkNET.Networks.Implementations.Layers.Helpers;
+using System;
+using System.Runtime.CompilerServices;
+
+namespace NeuralNetworkNET.Cuda.Layers
+{
+    /// <summary>
+    /// A simplified inception module, with 4 pipelines combining 1x1 convolution, 1x1 + 3x3, 1x1 + 5x5 and pooling + 1x1
+    /// </summary>
+    internal sealed class CuDnnInceptionLayer : WeightedLayerBase, IDisposable
+    {
+        #region Parameters
+
+        /// <sinheritdoc/>
+        public override LayerType LayerType { get; } = LayerType.Inception;
+
+        private readonly InceptionInfo _OperationInfo;
+
+        /// <summary>
+        /// Gets the info on the inception parameters used by the layer
+        /// </summary>    
+        public ref readonly InceptionInfo OperationInfo
+        {
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            get => ref _OperationInfo;
+        }
+
+        #endregion
+
+        #region Private fields and parameters
+
+        // 1x1 convolution weights on first pipeline
+        private int _1x1Weights
+        {
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            get => InputInfo.Channels * OperationInfo.Primary1x1ConvolutionKernels;
+        }
+
+        // 1x1 convolution weights on 3x3 pipeline
+        private int _3x3Reduce1x1Weights
+        {
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            get => InputInfo.Channels * OperationInfo.Primary3x3Reduce1x1ConvolutionKernels;
+        }
+
+        // 3x3 convolution weights
+        private int _3x3Weights
+        {
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            get => 3 * 3 * OperationInfo.Primary3x3Reduce1x1ConvolutionKernels * OperationInfo.Secondary3x3ConvolutionKernels;
+        }
+
+        // 1x1 convolution weights on 5x5 pipeline
+        private int _5x5Reduce1x1Weights
+        {
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            get => InputInfo.Channels * OperationInfo.Primary5x5Reduce1x1ConvolutionKernels;
+        }
+
+        // 5x5 convolution weights
+        private int _5x5Weights
+        {
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            get => 5 * 5 * OperationInfo.Primary5x5Reduce1x1ConvolutionKernels * OperationInfo.Secondary5x5ConvolutionKernels;
+        }
+
+        // 1x1 convolution weights on pooling pipeline
+        private int Secondary1x1Weights
+        {
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            get => InputInfo.Channels * OperationInfo.Secondary1x1AfterPoolingConvolutionKernels;
+        }
+
+        // A copy of the forward layer inputs
+        private Tensor _Inputs;
+
+        // 3x3 reduction 1x1 convolution activity
+        private Tensor _3x3Reduce1x1Z;
+
+        // 3x3 reduction 1x1 convolution activation
+        private Tensor _3x3Reduce1x1A;
+
+        // 3x3 reduction 1x1 convolution output delta
+        private Tensor _3x3Reduce1x1Delta;
+
+        // 5x5 reduction 1x1 convolution activity
+        private Tensor _5x5Reduce1x1Z;
+
+        // 5x5 reduction 1x1 convolution activation
+        private Tensor _5x5Reduce1x1A;
+
+        // 5x5 reduction 1x1 convolution output delta
+        private Tensor _5x5Reduce1x1Delta;
+
+        // Pooling output activity
+        private Tensor _PoolingZ;
+
+        // Pooling output activation
+        private Tensor _PoolingA;
+
+        // Pooling output delta
+        private Tensor _PoolingDelta;
+
+        #endregion
+
+        #region cuDNN fields
+
+        // The NCHW tensor info for the layer inputs
+        [NotNull]
+        private readonly TensorDescriptor InputDescription = new TensorDescriptor();
+
+        #region 1x1 convolution
+
+        // The NCHW info for the 1x1 convolution weights
+        [NotNull]
+        private readonly FilterDescriptor _1x1FilterDescription = new FilterDescriptor();
+
+         // The info on the 1x1 convolution bias (one value per output channel)
+        [NotNull]
+        private readonly TensorDescriptor _1x1BiasDescription = new TensorDescriptor();
+
+        // The first 1x1 convolution info
+        [NotNull]
+        private readonly ConvolutionDescriptor _1x1ConvolutionDescription = new ConvolutionDescriptor();
+
+        // The NCHW tensor info for the outputs of the first 1x1 convolution
+        [NotNull]
+        private readonly TensorDescriptor _1x1OutputDescription = new TensorDescriptor();
+
+        #endregion
+
+        #region 3x3 reduce 1x1 convolution
+
+        // The NCHW info for the 3x3 reduce 1x1 convolution weights
+        [NotNull]
+        private readonly FilterDescriptor _3x3Reduce1x1FilterDescription = new FilterDescriptor();
+
+         // The info on the 3x3 reduce 1x1 convolution bias (one value per output channel)
+        [NotNull]
+        private readonly TensorDescriptor _3x3Reduce1x1BiasDescription = new TensorDescriptor();
+
+        // The NCHW tensor info for the outputs of the 3x3 reduce 1x1 convolution
+        [NotNull]
+        private readonly TensorDescriptor _3x3Reduce1x1OutputDescription = new TensorDescriptor();
+
+        #endregion
+
+        #region 3x3 secondary convolution
+
+        // The NCHW info for the 3x3 convolution weights
+        [NotNull]
+        private readonly FilterDescriptor _3x3FilterDescription = new FilterDescriptor();
+
+         // The info on the 3x3 convolution bias (one value per output channel)
+        [NotNull]
+        private readonly TensorDescriptor _3x3BiasDescription = new TensorDescriptor();
+
+        // The first 3x3 convolution info
+        [NotNull]
+        private readonly ConvolutionDescriptor _3x3ConvolutionDescription = new ConvolutionDescriptor();
+
+        // The NCHW tensor info for the outputs of the 3x3 convolution
+        [NotNull]
+        private readonly TensorDescriptor _3x3OutputDescription = new TensorDescriptor();
+
+        #endregion
+
+        #region 3x3 reduce 1x1 convolution
+
+        // The NCHW info for the 5x5 reduce 1x1 convolution weights
+        [NotNull]
+        private readonly FilterDescriptor _5x5Reduce1x1FilterDescription = new FilterDescriptor();
+
+         // The info on the 5x5 reduce 1x1 convolution bias (one value per output channel)
+        [NotNull]
+        private readonly TensorDescriptor _5x5Reduce1x1BiasDescription = new TensorDescriptor();
+
+        // The NCHW tensor info for the outputs of the 5x5 reduce 1x1 convolution
+        [NotNull]
+        private readonly TensorDescriptor _5x5Reduce1x1OutputDescription = new TensorDescriptor();
+
+        #endregion
+
+        #region 5x5 secondary convolution
+
+        // The NCHW info for the 5x5 convolution weights
+        [NotNull]
+        private readonly FilterDescriptor _5x5FilterDescription = new FilterDescriptor();
+
+         // The info on the 5x5 convolution bias (one value per output channel)
+        [NotNull]
+        private readonly TensorDescriptor _5x5BiasDescription = new TensorDescriptor();
+
+        // The first 5x5 convolution info
+        [NotNull]
+        private readonly ConvolutionDescriptor _5x5ConvolutionDescription = new ConvolutionDescriptor();
+
+        // The NCHW tensor info for the outputs of the 5x5 convolution
+        [NotNull]
+        private readonly TensorDescriptor _5x5OutputDescription = new TensorDescriptor();
+
+        #endregion
+
+        #region Pooling pipeline
+
+        // The descriptor for the pooling operation performed by the layer
+        [NotNull]
+        private readonly PoolingDescriptor PoolingDescription = new PoolingDescriptor();
+
+        // The NCHW tensor info for the pooling outputs
+        [NotNull]
+        private readonly TensorDescriptor PoolingOutputDescription = new TensorDescriptor();
+
+        // The NCHW info for the secondary 1x1 convolution weights
+        [NotNull]
+        private readonly FilterDescriptor Secondary1x1FilterDescription = new FilterDescriptor();
+
+        // The info on the secondary 1x1 convolution bias (one value per output channel)
+        [NotNull]
+        private readonly TensorDescriptor Secondary1x1BiasDescription = new TensorDescriptor();
+
+        // The info on the secondary 1x1 convolution outputs
+        [NotNull]
+        private readonly TensorDescriptor Secondary1x1OutputDescription = new TensorDescriptor();
+
+        #endregion
+
+        /// <summary>
+        /// Gets the <see cref="Dnn"/> instance for the current layer
+        /// </summary>
+        [NotNull]
+        private readonly Dnn DnnInstance = DnnService.Instance;
+
+        // cuDNN fields setup
+        private void SetupCuDnnInfo()
+        {
+            // First 1x1 convolution
+            _1x1ConvolutionDescription.Set2D(0, 0, 1, 1, 1, 1, Alea.cuDNN.ConvolutionMode.CROSS_CORRELATION);
+            _1x1FilterDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, _OperationInfo.Primary1x1ConvolutionKernels, InputInfo.Channels, 1, 1);
+            _1x1BiasDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, 1, _OperationInfo.Primary1x1ConvolutionKernels, 1, 1);
+
+            // 3x3 reduce 1x1 convolution
+            _3x3Reduce1x1FilterDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, _OperationInfo.Primary3x3Reduce1x1ConvolutionKernels, InputInfo.Channels, 1, 1);
+            _3x3Reduce1x1BiasDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, 1, _OperationInfo.Primary3x3Reduce1x1ConvolutionKernels, 1, 1);
+
+            // 3x3 convolution
+            _3x3ConvolutionDescription.Set2D(1, 1, 1, 1, 1, 1, Alea.cuDNN.ConvolutionMode.CROSS_CORRELATION); // 1-padding to keep size
+            _3x3FilterDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, _OperationInfo.Secondary3x3ConvolutionKernels, _OperationInfo.Primary3x3Reduce1x1ConvolutionKernels, 3, 3);
+            _3x3BiasDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, 1, _OperationInfo.Secondary3x3ConvolutionKernels, 1, 1);
+
+            // 5x5 reduce 1x1 convolution
+            _5x5Reduce1x1FilterDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, _OperationInfo.Primary5x5Reduce1x1ConvolutionKernels, InputInfo.Channels, 1, 1);
+            _5x5Reduce1x1BiasDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, 1, _OperationInfo.Primary5x5Reduce1x1ConvolutionKernels, 1, 1);
+
+            // 5x5 convolution
+            _5x5ConvolutionDescription.Set2D(2, 2, 1, 1, 1, 1, Alea.cuDNN.ConvolutionMode.CROSS_CORRELATION);
+            _5x5FilterDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, _OperationInfo.Secondary5x5ConvolutionKernels, _OperationInfo.Primary5x5Reduce1x1ConvolutionKernels, 5, 5);
+            _5x5BiasDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, 1, _OperationInfo.Secondary5x5ConvolutionKernels, 1, 1);
+
+            // Pooling
+            PoolingDescription.Set2D((Alea.cuDNN.PoolingMode)OperationInfo.Pooling, NanPropagation.PROPAGATE_NAN, 3, 3, 1, 1, 1, 1);
+
+            // Secondary 1x1 convolution
+            Secondary1x1FilterDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, _OperationInfo.Secondary1x1AfterPoolingConvolutionKernels, InputInfo.Channels, 1, 1);
+            Secondary1x1BiasDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, 1, _OperationInfo.Secondary1x1AfterPoolingConvolutionKernels, 1, 1);
+        }
+
+        #endregion
+
+        internal CuDnnInceptionLayer(in TensorInfo input, in InceptionInfo info, BiasInitializationMode biasMode = BiasInitializationMode.Zero)
+            : base(input, new TensorInfo(input.Height, input.Width, info.OutputChannels),
+                  WeightsProvider.NewInceptionWeights(input, info),
+                  WeightsProvider.NewBiases(info.ConvolutionKernels, biasMode),
+                  ActivationFunctionType.ReLU)
+        {
+            _OperationInfo = info;
+            SetupCuDnnInfo();
+        }
+
+        internal CuDnnInceptionLayer(in TensorInfo input, in InceptionInfo info, [NotNull] float[] w, [NotNull] float[] b) 
+            : base(input, new TensorInfo(input.Height, input.Width, info.OutputChannels), w, b, ActivationFunctionType.ReLU)
+        {
+            _OperationInfo = info;
+            SetupCuDnnInfo();
+        }
+
+        #region Implementation
+
+        /// <inheritdoc/>
+        public override void Forward(in Tensor x, out Tensor z, out Tensor a)
+        {
+            _Inputs.TryFree();
+            x.Duplicate(out _Inputs);
+            Tensor.New(x.Entities, OutputInfo.Size, out z);
+            Tensor.New(x.Entities, OutputInfo.Size, out a);
+            using (DeviceMemory<float>
+                    w_gpu = DnnInstance.Gpu.AllocateDevice(Weights),
+                    b_gpu = DnnInstance.Gpu.AllocateDevice(Biases))
+            {
+                // Pointers
+                deviceptr<float> pw_gpu = w_gpu.Ptr, pb_gpu = b_gpu.Ptr;
+
+                // First 1x1 convolution
+                using (DeviceMemory<float> y_gpu = DnnInstance.Gpu.AllocateDevice<float>(x.Entities * InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels))
+                {
+                    // Descriptors setup and first 1x1 convolution
+                    InputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, InputInfo.Channels, InputInfo.Height, InputInfo.Width);
+                    _1x1OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OperationInfo.Primary1x1ConvolutionKernels, InputInfo.Height, InputInfo.Width);
+                    DnnInstance.GetConvolutionForwardAlgorithm(InputDescription, _1x1FilterDescription, _1x1ConvolutionDescription, _1x1OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionFwdAlgo algorithm);
+                    DnnInstance.GetConvolutionForwardWorkspaceSize(InputDescription, _1x1FilterDescription, _1x1ConvolutionDescription, _1x1OutputDescription, algorithm, out IntPtr size);
+                    using (DeviceMemory<float> x_gpu = DnnInstance.Gpu.AllocateDevice(x))
+                    using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
+                    {
+                        DnnInstance.ConvolutionForward(1, InputDescription, x_gpu.Ptr, _1x1FilterDescription, pw_gpu, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _1x1OutputDescription, y_gpu.Ptr);                            
+                    }
+                    DnnInstance.AddTensor(1, _1x1BiasDescription, pb_gpu, 1, _1x1OutputDescription, y_gpu.Ptr);
+                    y_gpu.CopyTo(z, 0, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels);
+
+                    // 1x1 convolution activation
+                    DnnInstance.ActivationForward(x.Entities, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels, y_gpu.Ptr, y_gpu.Ptr, ActivationFunctions.Activation);
+                    y_gpu.CopyTo(a, 0, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels);
+                }
+
+                // 1x1 + 3x3 convolution
+                using (DeviceMemory<float> 
+                    y1x1_gpu = DnnInstance.Gpu.AllocateDevice<float>(x.Entities * InputInfo.SliceSize * OperationInfo.Primary3x3Reduce1x1ConvolutionKernels),
+                    y_gpu = DnnInstance.Gpu.AllocateDevice<float>(x.Entities * InputInfo.SliceSize * OperationInfo.Secondary3x3ConvolutionKernels))
+                {
+                    // 1x1 convolution
+                    _3x3Reduce1x1OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OperationInfo.Primary3x3Reduce1x1ConvolutionKernels, InputInfo.Height, InputInfo.Width);
+                    DnnInstance.GetConvolutionForwardAlgorithm(InputDescription, _3x3Reduce1x1FilterDescription, _1x1ConvolutionDescription, _3x3Reduce1x1OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionFwdAlgo algorithm);
+                    DnnInstance.GetConvolutionForwardWorkspaceSize(InputDescription, _3x3Reduce1x1FilterDescription, _1x1ConvolutionDescription, _3x3Reduce1x1OutputDescription, algorithm, out IntPtr size);
+                    using (DeviceMemory<float> x_gpu = DnnInstance.Gpu.AllocateDevice(x))
+                    using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
+                    {
+                        DnnInstance.ConvolutionForward(1, InputDescription, x_gpu.Ptr, _3x3Reduce1x1FilterDescription, pw_gpu += _1x1Weights, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _3x3Reduce1x1OutputDescription, y1x1_gpu.Ptr);                            
+                    }
+                    DnnInstance.AddTensor(1, _3x3Reduce1x1BiasDescription, pb_gpu += OperationInfo.Primary1x1ConvolutionKernels, 1, _3x3Reduce1x1OutputDescription, y1x1_gpu.Ptr);
+                    _3x3Reduce1x1Z.TryFree();
+                    y1x1_gpu.CopyToHost(x.Entities, InputInfo.SliceSize * OperationInfo.Primary3x3Reduce1x1ConvolutionKernels, out _3x3Reduce1x1Z);
+                    DnnInstance.ActivationForward(x.Entities, InputInfo.SliceSize * OperationInfo.Primary3x3Reduce1x1ConvolutionKernels, y1x1_gpu.Ptr, y1x1_gpu.Ptr, ActivationFunctions.Activation);
+                    _3x3Reduce1x1A.TryFree();
+                    y1x1_gpu.CopyToHost(x.Entities, InputInfo.SliceSize * OperationInfo.Primary3x3Reduce1x1ConvolutionKernels, out _3x3Reduce1x1A);
+
+                    // 3x3 convolution
+                    _3x3OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OperationInfo.Secondary3x3ConvolutionKernels, InputInfo.Height, InputInfo.Width);
+                    DnnInstance.GetConvolutionForwardAlgorithm(_3x3Reduce1x1OutputDescription, _3x3FilterDescription, _3x3ConvolutionDescription, _3x3OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out algorithm);
+                    DnnInstance.GetConvolutionForwardWorkspaceSize(_3x3Reduce1x1OutputDescription, _3x3FilterDescription, _3x3ConvolutionDescription, _3x3OutputDescription, algorithm, out size);
+                    using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
+                    {
+                        DnnInstance.ConvolutionForward(1, _3x3Reduce1x1OutputDescription, y1x1_gpu.Ptr, _3x3FilterDescription, pw_gpu += _3x3Reduce1x1Weights, _3x3ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _3x3OutputDescription, y_gpu.Ptr);      
+                    }
+                    DnnInstance.AddTensor(1, _3x3BiasDescription, pb_gpu += OperationInfo.Primary3x3Reduce1x1ConvolutionKernels, 1, _3x3OutputDescription, y_gpu.Ptr);
+                    y_gpu.CopyTo(z, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels, InputInfo.SliceSize * OperationInfo.Secondary3x3ConvolutionKernels);
+
+                    // Activation
+                    DnnInstance.ActivationForward(x.Entities, InputInfo.SliceSize * OperationInfo.Secondary3x3ConvolutionKernels, y_gpu.Ptr, y_gpu.Ptr, ActivationFunctions.Activation);
+                    y_gpu.CopyTo(a, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels, InputInfo.SliceSize * OperationInfo.Secondary3x3ConvolutionKernels);
+                }
+
+                // 1x1 + 5x5 convolution
+                using (DeviceMemory<float> 
+                    y1x1_gpu = DnnInstance.Gpu.AllocateDevice<float>(x.Entities * InputInfo.SliceSize * OperationInfo.Primary5x5Reduce1x1ConvolutionKernels),
+                    y_gpu = DnnInstance.Gpu.AllocateDevice<float>(x.Entities * InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels))
+                {
+                    // 1x1 convolution
+                    _5x5Reduce1x1OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OperationInfo.Primary5x5Reduce1x1ConvolutionKernels, InputInfo.Height, InputInfo.Width);
+                    DnnInstance.GetConvolutionForwardAlgorithm(InputDescription, _5x5Reduce1x1FilterDescription, _1x1ConvolutionDescription, _5x5Reduce1x1OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionFwdAlgo algorithm);
+                    DnnInstance.GetConvolutionForwardWorkspaceSize(InputDescription, _5x5Reduce1x1FilterDescription, _1x1ConvolutionDescription, _5x5Reduce1x1OutputDescription, algorithm, out IntPtr size);
+                    using (DeviceMemory<float> x_gpu = DnnInstance.Gpu.AllocateDevice(x))
+                    using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
+                    {
+                        DnnInstance.ConvolutionForward(1, InputDescription, x_gpu.Ptr, _5x5Reduce1x1FilterDescription, pw_gpu += _3x3Weights, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _5x5Reduce1x1OutputDescription, y1x1_gpu.Ptr);                            
+                    }
+                    DnnInstance.AddTensor(1, _5x5Reduce1x1BiasDescription, pb_gpu += OperationInfo.Secondary3x3ConvolutionKernels, 1, _5x5Reduce1x1OutputDescription, y1x1_gpu.Ptr);
+                    _5x5Reduce1x1Z.TryFree();
+                    y1x1_gpu.CopyToHost(x.Entities, InputInfo.SliceSize * OperationInfo.Primary5x5Reduce1x1ConvolutionKernels, out _5x5Reduce1x1Z);
+                    DnnInstance.ActivationForward(x.Entities, InputInfo.SliceSize * OperationInfo.Primary5x5Reduce1x1ConvolutionKernels, y1x1_gpu.Ptr, y1x1_gpu.Ptr, ActivationFunctions.Activation);
+                    _5x5Reduce1x1A.TryFree();
+                    y1x1_gpu.CopyToHost(x.Entities, InputInfo.SliceSize * OperationInfo.Primary5x5Reduce1x1ConvolutionKernels, out _5x5Reduce1x1A);
+
+                    // 5x5 convolution
+                    _5x5OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OperationInfo.Secondary5x5ConvolutionKernels, InputInfo.Height, InputInfo.Width);
+                    DnnInstance.GetConvolutionForwardAlgorithm(_5x5Reduce1x1OutputDescription, _5x5FilterDescription, _5x5ConvolutionDescription, _5x5OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out algorithm);
+                    DnnInstance.GetConvolutionForwardWorkspaceSize(_5x5Reduce1x1OutputDescription, _5x5FilterDescription, _5x5ConvolutionDescription, _5x5OutputDescription, algorithm, out size);
+                    using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
+                    {
+                        DnnInstance.ConvolutionForward(1, _5x5Reduce1x1OutputDescription, y1x1_gpu.Ptr, _5x5FilterDescription, pw_gpu += _5x5Reduce1x1Weights, _5x5ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _5x5OutputDescription, y_gpu.Ptr);
+                    }
+                    DnnInstance.AddTensor(1, _5x5BiasDescription, pb_gpu += OperationInfo.Primary5x5Reduce1x1ConvolutionKernels, 1, _5x5OutputDescription, y_gpu.Ptr);
+                    y_gpu.CopyTo(z, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels);
+
+                    // Activation
+                    DnnInstance.ActivationForward(x.Entities, InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels, y_gpu.Ptr, y_gpu.Ptr, ActivationFunctions.Activation);
+                    y_gpu.CopyTo(a, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels);
+                }
+
+                // Pooling pipeline
+                PoolingOutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, InputInfo.Channels, InputInfo.Height, InputInfo.Width);
+                using (DeviceMemory<float> y_gpu = DnnInstance.Gpu.AllocateDevice<float>(x.Size))
+                {
+                    // Pooling
+                    using (DeviceMemory<float> x_gpu = DnnInstance.Gpu.AllocateDevice(x))
+                    {
+                        DnnInstance.PoolingForward(PoolingDescription, 1, InputDescription, x_gpu.Ptr, 0, InputDescription, y_gpu.Ptr);
+                    }
+                    _PoolingZ.TryFree();
+                    y_gpu.CopyToHost(x.Entities, InputInfo.Size, out _PoolingZ);
+                    DnnInstance.ActivationForward(x.Entities, x.Length, y_gpu.Ptr, y_gpu.Ptr, ActivationFunctions.Activation);
+                    _PoolingA.TryFree();
+                    y_gpu.CopyToHost(x.Entities, InputInfo.Size, out _PoolingA);
+
+                    // 1x1 convolution
+                    using (DeviceMemory<float> _1x1Output_gpu = DnnInstance.Gpu.AllocateDevice<float>(x.Entities * InputInfo.SliceSize * OperationInfo.Secondary1x1AfterPoolingConvolutionKernels))
+                    {
+                        Secondary1x1OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OperationInfo.Secondary1x1AfterPoolingConvolutionKernels, InputInfo.Height, InputInfo.Width);
+                        DnnInstance.GetConvolutionForwardAlgorithm(InputDescription, Secondary1x1FilterDescription, _1x1ConvolutionDescription, Secondary1x1OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionFwdAlgo algorithm);
+                        DnnInstance.GetConvolutionForwardWorkspaceSize(InputDescription, Secondary1x1FilterDescription, _1x1ConvolutionDescription, Secondary1x1OutputDescription, algorithm, out IntPtr size);
+                        using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
+                        {
+                            DnnInstance.ConvolutionForward(1, InputDescription, y_gpu.Ptr, Secondary1x1FilterDescription, pw_gpu += _5x5Weights, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, Secondary1x1OutputDescription, _1x1Output_gpu.Ptr);                            
+                        }
+                        DnnInstance.AddTensor(1, Secondary1x1BiasDescription, pb_gpu += OperationInfo.Secondary5x5ConvolutionKernels, 1, Secondary1x1OutputDescription, _1x1Output_gpu.Ptr);
+                        _1x1Output_gpu.CopyTo(z, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Secondary5x5ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary1x1AfterPoolingConvolutionKernels);
+
+                        // 1x1 convolution activation
+                        DnnInstance.ActivationForward(x.Entities, InputInfo.SliceSize * OperationInfo.Secondary1x1AfterPoolingConvolutionKernels, _1x1Output_gpu.Ptr, _1x1Output_gpu.Ptr, ActivationFunctions.Activation);
+                        _1x1Output_gpu.CopyTo(a, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Secondary5x5ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary1x1AfterPoolingConvolutionKernels);
+                    }
+                }
+            }
+        }
+
+        /// <inheritdoc/>
+        public override void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFunction activationPrime)
+        {
+            using (DeviceMemory<float> 
+                dx_gpu = DnnInstance.Gpu.AllocateDevice<float>(z.Size),
+                w_gpu = DnnInstance.Gpu.AllocateDevice(Weights))
+            {
+                // First 1x1 convolution
+                DnnInstance.GetConvolutionBackwardDataAlgorithm(_1x1FilterDescription, _1x1OutputDescription, _1x1ConvolutionDescription, InputDescription, ConvolutionBwdDataPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionBwdDataAlgo algorithm);
+                DnnInstance.GetConvolutionBackwardDataWorkspaceSize(_1x1FilterDescription, _1x1OutputDescription, _1x1ConvolutionDescription, InputDescription, algorithm, out IntPtr size);
+                using (DeviceMemory<float> dy_gpu = DnnInstance.Gpu.AllocateDevice(delta_1, 0, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels))
+                using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
+                {
+                    DnnInstance.ConvolutionBackwardData(1, _1x1FilterDescription, w_gpu.Ptr, _1x1OutputDescription, dy_gpu.Ptr, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, InputDescription, dx_gpu.Ptr);
+                }
+
+                // 1x1 + 3x3 convolution
+                using (DeviceMemory<float> _3x3Reduce1x1z_gpu = DnnInstance.Gpu.AllocateDevice(_3x3Reduce1x1Z))
+                {
+                    // 3x3 backward
+                    DnnInstance.GetConvolutionBackwardDataAlgorithm(_3x3FilterDescription, _3x3OutputDescription, _3x3ConvolutionDescription, _3x3Reduce1x1OutputDescription, ConvolutionBwdDataPreference.PREFER_FASTEST, IntPtr.Zero, out algorithm);
+                    DnnInstance.GetConvolutionBackwardDataWorkspaceSize(_3x3FilterDescription, _3x3OutputDescription, _3x3ConvolutionDescription, _3x3Reduce1x1OutputDescription, algorithm, out size);
+                    using (DeviceMemory<float> 
+                        dy_gpu = DnnInstance.Gpu.AllocateDevice(delta_1, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels, InputInfo.SliceSize * OperationInfo.Secondary3x3ConvolutionKernels),
+                        _3x3Reduce1x1dx_gpu = DnnInstance.Gpu.AllocateDevice<float>(_3x3Reduce1x1Z.Size))
+                    using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
+                    {
+                        deviceptr<float> p3x3Weights_gpu = w_gpu.Ptr + _1x1Weights + _3x3Reduce1x1Weights;
+                        DnnInstance.ConvolutionBackwardData(1, _3x3FilterDescription, p3x3Weights_gpu, _3x3OutputDescription, dy_gpu.Ptr, _3x3ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _3x3Reduce1x1OutputDescription, _3x3Reduce1x1dx_gpu.Ptr);
+                        DnnInstance.ActivationBackward(_3x3Reduce1x1Z.Entities, _3x3Reduce1x1Z.Length, _3x3Reduce1x1z_gpu.Ptr, _3x3Reduce1x1dx_gpu.Ptr, ActivationFunctions.ActivationPrime);
+                        _3x3Reduce1x1Delta.TryFree();
+                        _3x3Reduce1x1z_gpu.CopyToHost(_3x3Reduce1x1Z.Entities, _3x3Reduce1x1Z.Length, out _3x3Reduce1x1Delta);
+                    }
+
+                    // 3x3 reduce 1x1 backward
+                    DnnInstance.GetConvolutionBackwardDataAlgorithm(_3x3Reduce1x1FilterDescription, _3x3Reduce1x1OutputDescription, _1x1ConvolutionDescription, InputDescription, ConvolutionBwdDataPreference.PREFER_FASTEST, IntPtr.Zero, out algorithm);
+                    DnnInstance.GetConvolutionBackwardDataWorkspaceSize(_3x3Reduce1x1FilterDescription, _3x3Reduce1x1OutputDescription, _1x1ConvolutionDescription, InputDescription, algorithm, out size);
+                    using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
+                    {
+                        deviceptr<float> p3x3Reduce1x1Weights_gpu = w_gpu.Ptr + _1x1Weights;
+                        DnnInstance.ConvolutionBackwardData(1, _3x3Reduce1x1FilterDescription, p3x3Reduce1x1Weights_gpu, _3x3Reduce1x1OutputDescription, _3x3Reduce1x1z_gpu.Ptr, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 1, InputDescription, dx_gpu.Ptr);
+                    }
+                }
+
+                // 1x1 + 5x5 convolution
+                using (DeviceMemory<float> _5x5Reduce1x1z_gpu = DnnInstance.Gpu.AllocateDevice(_5x5Reduce1x1Z))
+                {
+                    // 5x5 backward
+                    DnnInstance.GetConvolutionBackwardDataAlgorithm(_5x5FilterDescription, _5x5OutputDescription, _5x5ConvolutionDescription, _5x5Reduce1x1OutputDescription, ConvolutionBwdDataPreference.PREFER_FASTEST, IntPtr.Zero, out algorithm);
+                    DnnInstance.GetConvolutionBackwardDataWorkspaceSize(_5x5FilterDescription, _5x5OutputDescription, _5x5ConvolutionDescription, _5x5Reduce1x1OutputDescription, algorithm, out size);
+                    using (DeviceMemory<float> 
+                        dy_gpu = DnnInstance.Gpu.AllocateDevice(delta_1, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels),
+                        _5x5Reduce1x1dx_gpu = DnnInstance.Gpu.AllocateDevice<float>(_5x5Reduce1x1Z.Size))
+                    using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
+                    {
+                        deviceptr<float> p5x5Weights_gpu = w_gpu.Ptr + _1x1Weights + _3x3Reduce1x1Weights + _3x3Weights + _5x5Reduce1x1Weights;
+                        DnnInstance.ConvolutionBackwardData(1, _5x5FilterDescription, p5x5Weights_gpu, _5x5OutputDescription, dy_gpu.Ptr, _5x5ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _5x5Reduce1x1OutputDescription, _5x5Reduce1x1dx_gpu.Ptr);
+                        DnnInstance.ActivationBackward(_5x5Reduce1x1Z.Entities, _5x5Reduce1x1Z.Length, _5x5Reduce1x1z_gpu.Ptr, _5x5Reduce1x1dx_gpu.Ptr, ActivationFunctions.ActivationPrime);
+                        _5x5Reduce1x1Delta.TryFree();
+                        _5x5Reduce1x1z_gpu.CopyToHost(_5x5Reduce1x1Z.Entities, _5x5Reduce1x1Z.Length, out _5x5Reduce1x1Delta);
+                    }
+
+                    // 5x5 reduce 1x1 backward
+                    DnnInstance.GetConvolutionBackwardDataAlgorithm(_5x5Reduce1x1FilterDescription, _5x5Reduce1x1OutputDescription, _1x1ConvolutionDescription, InputDescription, ConvolutionBwdDataPreference.PREFER_FASTEST, IntPtr.Zero, out algorithm);
+                    DnnInstance.GetConvolutionBackwardDataWorkspaceSize(_5x5Reduce1x1FilterDescription, _5x5Reduce1x1OutputDescription, _1x1ConvolutionDescription, InputDescription, algorithm, out size);
+                    using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
+                    {
+                        deviceptr<float> p5x5Reduce1x1Weights_gpu = w_gpu.Ptr + _1x1Weights + _3x3Reduce1x1Weights + _3x3Weights;
+                        DnnInstance.ConvolutionBackwardData(1, _5x5Reduce1x1FilterDescription, p5x5Reduce1x1Weights_gpu, _5x5Reduce1x1OutputDescription, _5x5Reduce1x1z_gpu.Ptr, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 1, InputDescription, dx_gpu.Ptr);
+                    }
+                }
+
+                // Pooling
+                using (DeviceMemory<float> pooldy_gpu = DnnInstance.Gpu.AllocateDevice(_PoolingZ))
+                {
+                    // 1x1 backward
+                    DnnInstance.GetConvolutionBackwardDataAlgorithm(Secondary1x1FilterDescription, Secondary1x1OutputDescription, _1x1ConvolutionDescription, PoolingOutputDescription, ConvolutionBwdDataPreference.PREFER_FASTEST, IntPtr.Zero, out algorithm);
+                    DnnInstance.GetConvolutionBackwardDataWorkspaceSize(Secondary1x1FilterDescription, Secondary1x1OutputDescription, _1x1ConvolutionDescription, PoolingOutputDescription, algorithm, out size);
+                    using (DeviceMemory<float> 
+                        dy_gpu = DnnInstance.Gpu.AllocateDevice(delta_1, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Secondary5x5ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary1x1AfterPoolingConvolutionKernels),
+                        poolDx_gpu = DnnInstance.Gpu.AllocateDevice<float>(_PoolingZ.Size))
+                    using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
+                    {
+                        deviceptr<float> p1x1PoolingWeights_gpu = w_gpu.Ptr + _1x1Weights + _3x3Reduce1x1Weights + _3x3Weights + _5x5Reduce1x1Weights + _5x5Weights;
+                        DnnInstance.ConvolutionBackwardData(1, Secondary1x1FilterDescription, p1x1PoolingWeights_gpu, Secondary1x1OutputDescription, dy_gpu.Ptr, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, PoolingOutputDescription, poolDx_gpu.Ptr);
+                        DnnInstance.ActivationBackward(_PoolingZ.Entities, _PoolingZ.Length, pooldy_gpu.Ptr, poolDx_gpu.Ptr, ActivationFunctions.ActivationPrime);
+                        _PoolingDelta.TryFree();
+                        pooldy_gpu.CopyToHost(_PoolingZ.Entities, _PoolingZ.Length, out _PoolingDelta);
+                    }
+
+                    // Pooling backward
+                    using (DeviceMemory<float> 
+                        x_gpu = DnnInstance.Gpu.AllocateDevice(_Inputs),
+                        poolZ_gpu = DnnInstance.Gpu.AllocateDevice(_PoolingZ))
+                    {
+                        DnnInstance.PoolingBackward(PoolingDescription, 1, PoolingOutputDescription, poolZ_gpu.Ptr, PoolingOutputDescription, pooldy_gpu.Ptr, InputDescription, x_gpu.Ptr, 1, InputDescription, dx_gpu.Ptr); // TODO: finish pooling backward
+                    }
+                }
+
+                // Activation backward
+                using (DeviceMemory<float> z_gpu = DnnInstance.Gpu.AllocateDevice(z))
+                {
+                    DnnInstance.ActivationBackward(z.Entities, z.Length, z_gpu.Ptr, dx_gpu.Ptr, activationPrime);
+                    z_gpu.CopyTo(z);
+                }
+            }
+        }
+
+        /// <inheritdoc/>
+        public override void ComputeGradient(in Tensor a, in Tensor delta, out Tensor dJdw, out Tensor dJdb)
+        {
+            Tensor.New(1, Weights.Length, out dJdw);
+            Tensor.New(1, Biases.Length, out dJdb);
+            using (DeviceMemory<float> a_gpu = DnnInstance.Gpu.AllocateDevice(a))
+            {
+                // 1x1 weights
+                using (DeviceMemory<float> dy1x1_gpu = DnnInstance.Gpu.AllocateDevice(delta, 0, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels))
+                {
+                    DnnInstance.GetConvolutionBackwardFilterAlgorithm(InputDescription, _1x1OutputDescription, _1x1ConvolutionDescription, _1x1FilterDescription, ConvolutionBwdFilterPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionBwdFilterAlgo algorithm);
+                    DnnInstance.GetConvolutionBackwardFilterWorkspaceSize(InputDescription, _1x1OutputDescription, _1x1ConvolutionDescription, _1x1FilterDescription, algorithm, out IntPtr size);
+                    using (DeviceMemory<float> dw_gpu = DnnInstance.Gpu.AllocateDevice<float>(_1x1Weights))
+                    using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
+                    {
+                        DnnInstance.ConvolutionBackwardFilter(1, InputDescription, a_gpu.Ptr, _1x1OutputDescription, dy1x1_gpu.Ptr, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _1x1FilterDescription, dw_gpu.Ptr);
+                        dw_gpu.CopyTo(dJdw, 0, _1x1Weights);
+                    }
+
+                    // 1x1 bias
+                    using (DeviceMemory<float> db_gpu = DnnInstance.Gpu.AllocateDevice<float>(OperationInfo.Primary1x1ConvolutionKernels))
+                    {
+                        DnnInstance.ConvolutionBackwardBias(1, _1x1OutputDescription, dy1x1_gpu.Ptr, 0, _1x1BiasDescription, db_gpu.Ptr);
+                        db_gpu.CopyTo(dJdb, 0, OperationInfo.Primary1x1ConvolutionKernels);
+                    }
+                }
+
+                // 3x3 reduce 1x1 weights
+                using (DeviceMemory<float> dy3x3Reduce1x1_gpu = DnnInstance.Gpu.AllocateDevice(_3x3Reduce1x1Delta))
+                {
+                    DnnInstance.GetConvolutionBackwardFilterAlgorithm(InputDescription, _3x3Reduce1x1OutputDescription, _1x1ConvolutionDescription, _3x3Reduce1x1FilterDescription, ConvolutionBwdFilterPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionBwdFilterAlgo algorithm);
+                    DnnInstance.GetConvolutionBackwardFilterWorkspaceSize(InputDescription, _3x3Reduce1x1OutputDescription, _1x1ConvolutionDescription, _3x3Reduce1x1FilterDescription, algorithm, out IntPtr size);
+                    using (DeviceMemory<float> dw_gpu = DnnInstance.Gpu.AllocateDevice<float>(_3x3Reduce1x1Weights))
+                    using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
+                    {
+                        DnnInstance.ConvolutionBackwardFilter(1, InputDescription, a_gpu.Ptr, _3x3Reduce1x1OutputDescription, dy3x3Reduce1x1_gpu.Ptr, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _3x3Reduce1x1FilterDescription, dw_gpu.Ptr);
+                        dw_gpu.CopyTo(dJdw, _1x1Weights, _3x3Reduce1x1Weights);
+                    }
+
+                    // 3x3 reduce 1x1 bias
+                    using (DeviceMemory<float> db_gpu = DnnInstance.Gpu.AllocateDevice<float>(OperationInfo.Primary3x3Reduce1x1ConvolutionKernels))
+                    {
+                        DnnInstance.ConvolutionBackwardBias(1, _3x3Reduce1x1OutputDescription, dy3x3Reduce1x1_gpu.Ptr, 0, _3x3Reduce1x1BiasDescription, db_gpu.Ptr);
+                        db_gpu.CopyTo(dJdb, OperationInfo.Primary1x1ConvolutionKernels, OperationInfo.Primary3x3Reduce1x1ConvolutionKernels);
+                    }
+                }
+
+                // 5x5 reduce 1x1 weights
+                using (DeviceMemory<float> dy5x5Reduce1x1_gpu = DnnInstance.Gpu.AllocateDevice(_5x5Reduce1x1Delta))
+                {
+                    DnnInstance.GetConvolutionBackwardFilterAlgorithm(InputDescription, _5x5Reduce1x1OutputDescription, _1x1ConvolutionDescription, _5x5Reduce1x1FilterDescription, ConvolutionBwdFilterPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionBwdFilterAlgo algorithm);
+                    DnnInstance.GetConvolutionBackwardFilterWorkspaceSize(InputDescription, _5x5Reduce1x1OutputDescription, _1x1ConvolutionDescription, _5x5Reduce1x1FilterDescription, algorithm, out IntPtr size);
+                    using (DeviceMemory<float> dw_gpu = DnnInstance.Gpu.AllocateDevice<float>(_5x5Reduce1x1Weights))
+                    using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
+                    {
+                        DnnInstance.ConvolutionBackwardFilter(1, InputDescription, a_gpu.Ptr, _5x5Reduce1x1OutputDescription, dy5x5Reduce1x1_gpu.Ptr, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _5x5Reduce1x1FilterDescription, dw_gpu.Ptr);
+                        dw_gpu.CopyTo(dJdw, _1x1Weights + _3x3Reduce1x1Weights + _3x3Weights, _5x5Reduce1x1Weights);
+                    }
+
+                    // 5x5 reduce 1x1 bias
+                    using (DeviceMemory<float> db_gpu = DnnInstance.Gpu.AllocateDevice<float>(OperationInfo.Primary5x5Reduce1x1ConvolutionKernels))
+                    {
+                        DnnInstance.ConvolutionBackwardBias(1, _5x5Reduce1x1OutputDescription, dy5x5Reduce1x1_gpu.Ptr, 0, _5x5Reduce1x1BiasDescription, db_gpu.Ptr);
+                        db_gpu.CopyTo(dJdb, OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Primary3x3Reduce1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels, OperationInfo.Primary5x5Reduce1x1ConvolutionKernels);
+                    }
+                }
+            }
+
+            // 3x3 weights
+            using (DeviceMemory<float> dy3x3_gpu = DnnInstance.Gpu.AllocateDevice(delta, InputInfo.SliceSize * OperationInfo.Primary1x1ConvolutionKernels, InputInfo.SliceSize * OperationInfo.Secondary3x3ConvolutionKernels))
+            {
+                DnnInstance.GetConvolutionBackwardFilterAlgorithm(_3x3Reduce1x1OutputDescription, _3x3OutputDescription, _3x3ConvolutionDescription, _3x3FilterDescription, ConvolutionBwdFilterPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionBwdFilterAlgo algorithm);
+                DnnInstance.GetConvolutionBackwardFilterWorkspaceSize(_3x3Reduce1x1OutputDescription, _3x3OutputDescription, _3x3ConvolutionDescription, _3x3FilterDescription, algorithm, out IntPtr size);
+                using (DeviceMemory<float>
+                    a3x3Reduce1x1_gpu = DnnInstance.Gpu.AllocateDevice(_3x3Reduce1x1A),
+                    dw_gpu = DnnInstance.Gpu.AllocateDevice<float>(_3x3Weights))
+                using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
+                {
+                    DnnInstance.ConvolutionBackwardFilter(1, _3x3Reduce1x1OutputDescription, a3x3Reduce1x1_gpu.Ptr, _3x3OutputDescription, dy3x3_gpu.Ptr, _3x3ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _3x3FilterDescription, dw_gpu.Ptr);
+                    dw_gpu.CopyTo(dJdw, _1x1Weights + _3x3Reduce1x1Weights, _3x3Weights);
+                }
+
+                // 3x3 bias
+                using (DeviceMemory<float> db_gpu = DnnInstance.Gpu.AllocateDevice<float>(OperationInfo.Secondary3x3ConvolutionKernels))
+                {
+                    DnnInstance.ConvolutionBackwardBias(1, _3x3OutputDescription, dy3x3_gpu.Ptr, 0, _3x3BiasDescription, db_gpu.Ptr);
+                    db_gpu.CopyTo(dJdb, OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Primary3x3Reduce1x1ConvolutionKernels, OperationInfo.Secondary3x3ConvolutionKernels);
+                }
+            }
+
+            // 5x5 weights
+            using (DeviceMemory<float> dy5x5_gpu = DnnInstance.Gpu.AllocateDevice(delta, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary5x5ConvolutionKernels))
+            {
+                DnnInstance.GetConvolutionBackwardFilterAlgorithm(_5x5Reduce1x1OutputDescription, _5x5OutputDescription, _5x5ConvolutionDescription, _5x5FilterDescription, ConvolutionBwdFilterPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionBwdFilterAlgo algorithm);
+                DnnInstance.GetConvolutionBackwardFilterWorkspaceSize(_5x5Reduce1x1OutputDescription, _5x5OutputDescription, _5x5ConvolutionDescription, _5x5FilterDescription, algorithm, out IntPtr size);
+                using (DeviceMemory<float>
+                    a5x5Reduce1x1_gpu = DnnInstance.Gpu.AllocateDevice(_5x5Reduce1x1A),
+                    dw_gpu = DnnInstance.Gpu.AllocateDevice<float>(_5x5Weights))
+                using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
+                {
+                    DnnInstance.ConvolutionBackwardFilter(1, _5x5Reduce1x1OutputDescription, a5x5Reduce1x1_gpu.Ptr, _5x5OutputDescription, dy5x5_gpu.Ptr, _5x5ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, _5x5FilterDescription, dw_gpu.Ptr);
+                    dw_gpu.CopyTo(dJdw, _1x1Weights + _3x3Reduce1x1Weights + _3x3Weights + _5x5Reduce1x1Weights, _5x5Weights);
+                }
+
+                // 5x5 bias
+                using (DeviceMemory<float> db_gpu = DnnInstance.Gpu.AllocateDevice<float>(OperationInfo.Secondary5x5ConvolutionKernels))
+                {
+                    DnnInstance.ConvolutionBackwardBias(1, _5x5OutputDescription, dy5x5_gpu.Ptr, 0, _5x5BiasDescription, db_gpu.Ptr);
+                    db_gpu.CopyTo(dJdb, OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Primary3x3Reduce1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Primary5x5Reduce1x1ConvolutionKernels, OperationInfo.Secondary5x5ConvolutionKernels);
+                }
+            }
+
+            // Pooling 1x1 convolution
+            using (DeviceMemory<float> dy1x1Pool_gpu = DnnInstance.Gpu.AllocateDevice(delta, InputInfo.SliceSize * (OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Secondary5x5ConvolutionKernels), InputInfo.SliceSize * OperationInfo.Secondary1x1AfterPoolingConvolutionKernels))
+            {
+                DnnInstance.GetConvolutionBackwardFilterAlgorithm(PoolingOutputDescription, Secondary1x1OutputDescription, _1x1ConvolutionDescription, Secondary1x1FilterDescription, ConvolutionBwdFilterPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionBwdFilterAlgo algorithm);
+                DnnInstance.GetConvolutionBackwardFilterWorkspaceSize(PoolingOutputDescription, Secondary1x1OutputDescription, _1x1ConvolutionDescription, Secondary1x1FilterDescription, algorithm, out IntPtr size);
+                using (DeviceMemory<float>
+                    aPool_gpu = DnnInstance.Gpu.AllocateDevice(_PoolingA),
+                    dw_gpu = DnnInstance.Gpu.AllocateDevice<float>(Secondary1x1Weights))
+                using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
+                {
+                    DnnInstance.ConvolutionBackwardFilter(1, PoolingOutputDescription, aPool_gpu.Ptr, Secondary1x1OutputDescription, dy1x1Pool_gpu.Ptr, _1x1ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, Secondary1x1FilterDescription, dw_gpu.Ptr);
+                    dw_gpu.CopyTo(dJdw, _1x1Weights + _3x3Reduce1x1Weights + _3x3Weights + _5x5Reduce1x1Weights + _5x5Weights, Secondary1x1Weights);
+                }
+
+                // Pooling 1x1 bias
+                using (DeviceMemory<float> db_gpu = DnnInstance.Gpu.AllocateDevice<float>(OperationInfo.Secondary1x1AfterPoolingConvolutionKernels))
+                {
+                    DnnInstance.ConvolutionBackwardBias(1, Secondary1x1OutputDescription, dy1x1Pool_gpu.Ptr, 0, Secondary1x1BiasDescription, db_gpu.Ptr);
+                    db_gpu.CopyTo(dJdb, OperationInfo.Primary1x1ConvolutionKernels + OperationInfo.Primary3x3Reduce1x1ConvolutionKernels + OperationInfo.Secondary3x3ConvolutionKernels + OperationInfo.Primary5x5Reduce1x1ConvolutionKernels + OperationInfo.Secondary5x5ConvolutionKernels, OperationInfo.Secondary1x1AfterPoolingConvolutionKernels);
+                }
+            }
+        }
+
+        #endregion
+
+        #region Misc
+
+        /// <inheritdoc/>
+        public override INetworkLayer Clone() => new CuDnnInceptionLayer(InputInfo, OperationInfo, Weights, Biases);
+
+        /// <inheritdoc/>
+        public override void Serialize(System.IO.Stream stream)
+        {
+            base.Serialize(stream);
+            stream.Write(OperationInfo);
+        }
+
+        /// <summary>
+        /// Tries to deserialize a new <see cref="CuDnnInceptionLayer"/> from the input <see cref="System.IO.Stream"/>
+        /// </summary>
+        /// <param name="stream">The input <see cref="System.IO.Stream"/> to use to read the layer data</param>
+        [MustUseReturnValue, CanBeNull]
+        public static INetworkLayer Deserialize([NotNull] System.IO.Stream stream)
+        {
+            if (!stream.TryRead(out TensorInfo input)) return null;
+            if (!stream.TryRead<TensorInfo>(out _)) return null;
+            if (!stream.TryRead<ActivationFunctionType>(out _)) return null;
+            if (!stream.TryRead(out int wLength)) return null;
+            float[] weights = stream.ReadUnshuffled(wLength);
+            if (!stream.TryRead(out int bLength)) return null;
+            float[] biases = stream.ReadUnshuffled(bLength);
+            if (!stream.TryRead(out InceptionInfo info)) return null;
+            return new CuDnnInceptionLayer(input, info, weights, biases);
+        }
+
+        #endregion
+
+        #region IDisposable
+
+        ~CuDnnInceptionLayer() => Dispose();
+
+        /// <inheritdoc/>
+        void IDisposable.Dispose()
+        {
+            GC.SuppressFinalize(this);
+            Dispose();
+        }
+
+        // Private Dispose method
+        private void Dispose()
+        {
+            _Inputs.TryFree();
+            _3x3Reduce1x1Z.TryFree();
+            _3x3Reduce1x1A.TryFree();
+            _3x3Reduce1x1Delta.TryFree();
+            _5x5Reduce1x1Z.TryFree();
+            _5x5Reduce1x1A.TryFree();
+            _5x5Reduce1x1Delta.TryFree();
+            _PoolingZ.TryFree();
+            _PoolingA.TryFree();
+            _PoolingDelta.TryFree();
+        }
+
+        #endregion
+    }
+}
diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnPoolingLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnPoolingLayer.cs
index 0d5aced..3cca675 100644
--- a/NeuralNetwork.NET.Cuda/Layers/CuDnnPoolingLayer.cs
+++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnPoolingLayer.cs
@@ -1,4 +1,5 @@
-﻿using Alea;
+﻿using System;
+using Alea;
 using Alea.cuDNN;
 using NeuralNetworkNET.Extensions;
 using NeuralNetworkNET.Cuda.Extensions;
@@ -14,10 +15,10 @@
 namespace NeuralNetworkNET.Cuda.Layers
 {
     /// <summary>
-    /// A pooling layer running on cuDNN, with a 2x2 window and a stride of 2
+    /// A pooling layer running on cuDNN, with a custom pooling mode
     /// </summary>
     [JsonObject(MemberSerialization.OptIn)]
-    internal sealed class CuDnnPoolingLayer : PoolingLayer
+    internal sealed class CuDnnPoolingLayer : PoolingLayer, IDisposable
     {
         #region cuDNN fields
 
@@ -41,6 +42,16 @@ internal sealed class CuDnnPoolingLayer : PoolingLayer
 
         #endregion
 
+        #region Fields
+
+        // A copy of the layer inputs
+        private Tensor _X;
+
+        // A copy of the layer output activity
+        private Tensor _Z;
+
+        #endregion
+
         public CuDnnPoolingLayer(in TensorInfo input, in PoolingInfo operation, ActivationFunctionType activation) : base(input, operation, activation)
         {
             PoolingDescription.Set2D((PoolingMode)operation.Mode, NanPropagation.PROPAGATE_NAN, operation.WindowHeight, operation.WindowWidth, operation.VerticalPadding, operation.HorizontalPadding, operation.VerticalStride, operation.HorizontalStride);
@@ -49,6 +60,8 @@ public CuDnnPoolingLayer(in TensorInfo input, in PoolingInfo operation, Activati
         /// <inheritdoc/>
         public override void Forward(in Tensor x, out Tensor z, out Tensor a)
         {
+            _X.TryFree();
+            x.Duplicate(out _X);
             using (DeviceMemory<float>
                 x_gpu = DnnInstance.Gpu.AllocateDevice(x),
                 z_gpu = DnnInstance.Gpu.AllocateDevice<float>(x.Entities * OutputInfo.Size))
@@ -58,6 +71,8 @@ public override void Forward(in Tensor x, out Tensor z, out Tensor a)
                 OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OutputInfo.Channels, OutputInfo.Height, OutputInfo.Width);
                 DnnInstance.PoolingForward(PoolingDescription, 1, InputDescription, x_gpu.Ptr, 0, OutputDescription, z_gpu.Ptr);
                 z_gpu.CopyToHost(x.Entities, OutputInfo.Size, out z);
+                _Z.TryFree();
+                z.Duplicate(out _Z);
 
                 // Activation
                 DnnInstance.ActivationForward(z.Entities, z.Length, z_gpu.Ptr, z_gpu.Ptr, ActivationFunctions.Activation);
@@ -66,7 +81,24 @@ public override void Forward(in Tensor x, out Tensor z, out Tensor a)
         }
 
         /// <inheritdoc/>
-        public override void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFunction activationPrime) => z.UpscalePool2x2(delta_1, InputInfo.Channels);
+        public override void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFunction activationPrime)
+        {
+            using (DeviceMemory<float> dx_gpu = DnnInstance.Gpu.AllocateDevice<float>(z.Size))
+            {
+                using (DeviceMemory<float>
+                    x_gpu = DnnInstance.Gpu.AllocateDevice(_X),
+                    y_gpu = DnnInstance.Gpu.AllocateDevice(_Z),
+                    dy_gpu = DnnInstance.Gpu.AllocateDevice(delta_1))
+                {
+                    DnnInstance.PoolingBackward(PoolingDescription, 1, OutputDescription, y_gpu.Ptr, OutputDescription, dy_gpu.Ptr, InputDescription, x_gpu.Ptr, 0, InputDescription, dx_gpu.Ptr);
+                }
+                using (DeviceMemory<float> z_gpu = DnnInstance.Gpu.AllocateDevice(z))
+                {
+                    DnnInstance.ActivationBackward(z.Entities, z.Length, z_gpu.Ptr, dx_gpu.Ptr, activationPrime);
+                    z_gpu.CopyTo(z);
+                }
+            }
+        }
 
         /// <inheritdoc/>
         public override INetworkLayer Clone() => new CuDnnPoolingLayer(InputInfo, OperationInfo, ActivationFunctionType);
@@ -84,5 +116,25 @@ public override void Forward(in Tensor x, out Tensor z, out Tensor a)
             if (!stream.TryRead(out PoolingInfo operation)) return null;
             return new CuDnnPoolingLayer(input, operation, activation);
         }
+
+        #region IDisposable
+
+        ~CuDnnPoolingLayer() => Dispose();
+
+        /// <inheritdoc/>
+        void IDisposable.Dispose()
+        {
+            GC.SuppressFinalize(this);
+            Dispose();
+        }
+
+        // Private Dispose method
+        private void Dispose()
+        {
+            _X.TryFree();
+            _Z.TryFree();
+        }
+
+        #endregion
     }
 }
\ No newline at end of file
diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnSoftmaxLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnSoftmaxLayer.cs
index abc7f07..be8a0cd 100644
--- a/NeuralNetwork.NET.Cuda/Layers/CuDnnSoftmaxLayer.cs
+++ b/NeuralNetwork.NET.Cuda/Layers/CuDnnSoftmaxLayer.cs
@@ -42,17 +42,13 @@ public override unsafe void Forward(in Tensor x, out Tensor z, out Tensor a)
             using (DeviceMemory<float> z_gpu = DnnInstance.Gpu.AllocateDevice<float>(x.Entities * OutputInfo.Size))
             {
                 // Linear pass
-                fixed (float* pw = Weights)
+                using (DeviceMemory<float>
+                    x_gpu = DnnInstance.Gpu.AllocateDevice(x),
+                    w_gpu = DnnInstance.Gpu.AllocateDevice(Weights),
+                    b_gpu = DnnInstance.Gpu.AllocateDevice(Biases))
                 {
-                    Tensor.Reshape(pw, InputInfo.Size, OutputInfo.Size, out Tensor wTensor);
-                    using (DeviceMemory<float>
-                        x_gpu = DnnInstance.Gpu.AllocateDevice(x),
-                        w_gpu = DnnInstance.Gpu.AllocateDevice(wTensor),
-                        b_gpu = DnnInstance.Gpu.AllocateDevice(Biases))
-                    {
-                        DnnInstance.FullyConnectedForward(x.Entities, x.Length, OutputInfo.Size, x_gpu.Ptr, w_gpu.Ptr, b_gpu.Ptr, z_gpu.Ptr);
-                        z_gpu.CopyToHost(x.Entities, OutputInfo.Size, out z);
-                    }
+                    DnnInstance.FullyConnectedForward(x.Entities, x.Length, OutputInfo.Size, x_gpu.Ptr, w_gpu.Ptr, b_gpu.Ptr, z_gpu.Ptr);
+                    z_gpu.CopyToHost(x.Entities, OutputInfo.Size, out z);
                 }
 
                 // Activation
diff --git a/NeuralNetwork.NET/APIs/Enums/LayerType.cs b/NeuralNetwork.NET/APIs/Enums/LayerType.cs
index 498e059..4406841 100644
--- a/NeuralNetwork.NET/APIs/Enums/LayerType.cs
+++ b/NeuralNetwork.NET/APIs/Enums/LayerType.cs
@@ -5,10 +5,34 @@
     /// </summary>
     public enum LayerType : byte
     {
+        /// <summary>
+        /// A fully connected layer, mapping n inputs to m outputs
+        /// </summary>
         FullyConnected,
+
+        /// <summary>
+        /// A convolutional layer, which keeps spatial information on the input volume
+        /// </summary>
         Convolutional,
+
+        /// <summary>
+        /// A pooling layer, useful to reduce the size of the input data volume
+        /// </summary>
         Pooling,
+
+        /// <summary>
+        /// A fully connected output layer, with an arbitrary activation and cost function
+        /// </summary>
         Output,
-        Softmax
+
+        /// <summary>
+        /// A softmax layer, with the softmax activation and log-likelyhood cost function
+        /// </summary>
+        Softmax,
+
+        /// <summary>
+        /// An inception module, combining different kinds of convolution with a pooling operation
+        /// </summary>
+        Inception
     }
 }
\ No newline at end of file
diff --git a/NeuralNetwork.NET/APIs/Structs/ConvolutionInfo.cs b/NeuralNetwork.NET/APIs/Structs/ConvolutionInfo.cs
index 00ac1b7..cb8c7ad 100644
--- a/NeuralNetwork.NET/APIs/Structs/ConvolutionInfo.cs
+++ b/NeuralNetwork.NET/APIs/Structs/ConvolutionInfo.cs
@@ -13,27 +13,27 @@ namespace NeuralNetworkNET.APIs.Structs
     public readonly struct ConvolutionInfo : IEquatable<ConvolutionInfo>
     {
         /// <summary>
-        /// Gets the current convolution mode for the layer
+        /// The current convolution mode for the layer
         /// </summary>
         public readonly ConvolutionMode Mode;
 
         /// <summary>
-        /// Gets the optional vertical padding for the convolution operation
+        /// The optional vertical padding for the convolution operation
         /// </summary>
         public readonly int VerticalPadding;
 
         /// <summary>
-        /// Gets the optional horizontal padding for the convolution operation
+        /// The optional horizontal padding for the convolution operation
         /// </summary>
         public readonly int HorizontalPadding;
 
         /// <summary>
-        /// Gets the vertical stride length while sliding the receptive window over the input
+        /// The vertical stride length while sliding the receptive window over the input
         /// </summary>
         public readonly int VerticalStride;
 
         /// <summary>
-        /// Gets the horizontal stride length while sliding the receptive window over the input
+        /// The horizontal stride length while sliding the receptive window over the input
         /// </summary>
         public readonly int HorizontalStride;
 
@@ -45,16 +45,11 @@ private ConvolutionInfo(
             int verticalPadding, int horizontalPadding,
             int verticalStride, int horizontalStride)
         {
-            if (verticalPadding < 0) throw new ArgumentOutOfRangeException(nameof(verticalPadding), "The vertical padding must be greater than or equal to 0");
-            if (horizontalPadding < 0) throw new ArgumentOutOfRangeException(nameof(horizontalPadding), "The horizontal padding must be greater than or equal to 0");
-            if (verticalStride < 1) throw new ArgumentOutOfRangeException(nameof(verticalStride), "The vertical stride must be at least equal to 1");
-            if (horizontalStride < 1) throw new ArgumentOutOfRangeException(nameof(horizontalStride), "The horizontal stride must be at least equal to 1");
-
+            VerticalPadding = verticalPadding >= 0 ? verticalPadding : throw new ArgumentOutOfRangeException(nameof(verticalPadding), "The vertical padding must be greater than or equal to 0");
+            HorizontalPadding = horizontalPadding >= 0 ? horizontalPadding : throw new ArgumentOutOfRangeException(nameof(horizontalPadding), "The horizontal padding must be greater than or equal to 0");
+            VerticalStride = verticalStride >= 1 ? verticalStride : throw new ArgumentOutOfRangeException(nameof(verticalStride), "The vertical stride must be at least equal to 1");
+            HorizontalStride = horizontalStride >= 1 ? horizontalStride : throw new ArgumentOutOfRangeException(nameof(horizontalStride), "The horizontal stride must be at least equal to 1");
             Mode = mode;
-            VerticalPadding = verticalPadding;
-            HorizontalPadding = horizontalPadding;
-            VerticalStride = verticalStride;
-            HorizontalStride = horizontalStride;
         }
 
         /// <summary>
@@ -80,6 +75,21 @@ public static ConvolutionInfo New(
 
         #endregion
 
+        /// <summary>
+        /// Calculates the output size after applying a convolution operation to the input tensor
+        /// </summary>
+        /// <param name="input">The info on the input tensor</param>
+        /// <param name="field">The size of the convolution kernels</param>
+        /// <param name="kernels">The number of convolution kernels to be used</param>
+        [Pure]
+        internal TensorInfo GetForwardOutputTensorInfo(in TensorInfo input, (int X, int Y) field, int kernels)
+        {
+            int
+                h = (input.Height - field.X + 2 * VerticalPadding) / VerticalStride + 1,
+                w = (input.Width - field.Y + 2 * HorizontalPadding) / HorizontalStride + 1;
+            return new TensorInfo(h, w, kernels);
+        }
+
         #region Equality
 
         /// <inheritdoc/>
diff --git a/NeuralNetwork.NET/APIs/Structs/InceptionInfo.cs b/NeuralNetwork.NET/APIs/Structs/InceptionInfo.cs
new file mode 100644
index 0000000..2a69aab
--- /dev/null
+++ b/NeuralNetwork.NET/APIs/Structs/InceptionInfo.cs
@@ -0,0 +1,146 @@
+﻿using JetBrains.Annotations;
+using NeuralNetworkNET.APIs.Enums;
+using Newtonsoft.Json;
+using System;
+using System.Runtime.CompilerServices;
+
+namespace NeuralNetworkNET.APIs.Structs
+{
+    /// <summary>
+    /// A <see cref="struct"/> containing all the info on an inception module
+    /// </summary>
+    [JsonObject(MemberSerialization.Fields)]
+    public readonly struct InceptionInfo : IEquatable<InceptionInfo>
+    {
+        #region Fields and properties
+
+        /// <summary>
+        /// The number of 1x1 convolution kernels used in the first step of the forward pass
+        /// </summary>
+        public readonly int Primary1x1ConvolutionKernels;
+
+        /// <summary>
+        /// The number of 1x1 convolution kernels before the 3x3 convolution
+        /// </summary>
+        public readonly int Primary3x3Reduce1x1ConvolutionKernels;
+
+        /// <summary>
+        /// The number of 3x3 convolution kernels
+        /// </summary>
+        public readonly int Secondary3x3ConvolutionKernels;
+
+        /// <summary>
+        /// The number of 1x1 convolution kernels before the 5x5 convolution
+        /// </summary>
+        public readonly int Primary5x5Reduce1x1ConvolutionKernels;
+
+        /// <summary>
+        /// The number of 5x5 convolution kernels
+        /// </summary>
+        public readonly int Secondary5x5ConvolutionKernels;
+
+        /// <summary>
+        /// The kind of pooling operation performed on the layer
+        /// </summary>
+        public readonly PoolingMode Pooling;
+
+        /// <summary>
+        /// The number of 1x1 convolution kernels after the pooling operation
+        /// </summary>
+        public readonly int Secondary1x1AfterPoolingConvolutionKernels;
+
+        /// <summary>
+        /// Gets the number of output channels after the depth concatenation
+        /// </summary>
+        public int OutputChannels
+        {
+            [Pure]
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            get => Primary1x1ConvolutionKernels + Secondary3x3ConvolutionKernels + Secondary5x5ConvolutionKernels + Secondary1x1AfterPoolingConvolutionKernels;
+        }
+
+        /// <summary>
+        /// Gets the total number of convolution kernels for the current instance
+        /// </summary>
+        public int ConvolutionKernels
+        {
+            [Pure]
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            get => Primary1x1ConvolutionKernels + Primary3x3Reduce1x1ConvolutionKernels + Secondary3x3ConvolutionKernels + Primary5x5Reduce1x1ConvolutionKernels + Secondary5x5ConvolutionKernels + Secondary1x1AfterPoolingConvolutionKernels;
+        }
+
+        #endregion
+
+        #region Constructors
+
+        // Internal constructor
+        private InceptionInfo(int _1x1Kernels, int _3x3Reduce1x1Kernels, int _3x3Kernels, int _5x5Reduce1x1Kernels, int _5x5Kernels, PoolingMode poolingMode, int _1x1SecondaryKernels)
+        {
+            Primary1x1ConvolutionKernels = _1x1Kernels >= 1 ? _1x1Kernels : throw new ArgumentOutOfRangeException(nameof(_1x1Kernels), "The number of 1x1 kernels must be at least 1");
+            Primary3x3Reduce1x1ConvolutionKernels = _3x3Reduce1x1Kernels >= 1 ? _3x3Reduce1x1Kernels : throw new ArgumentOutOfRangeException(nameof(_3x3Reduce1x1Kernels), "The number of 3x3 reduction 1x1 kernels must be at least 1");
+            Secondary3x3ConvolutionKernels = _3x3Kernels >= 1 ? _3x3Kernels : throw new ArgumentOutOfRangeException(nameof(_3x3Kernels), "The number of 3x3 kernels must be at least 1");
+            Primary5x5Reduce1x1ConvolutionKernels = _5x5Reduce1x1Kernels >= 1 ? _5x5Reduce1x1Kernels : throw new ArgumentOutOfRangeException(nameof(_3x3Kernels), "The number of 5x5 reduction 1x1 kernels must be at least 1");
+            Secondary5x5ConvolutionKernels = _5x5Kernels >= 1 ? _5x5Kernels : throw new ArgumentOutOfRangeException(nameof(_5x5Kernels), "The number of 5x5 kernels must be at least 1");
+            Secondary1x1AfterPoolingConvolutionKernels = _1x1SecondaryKernels >= 1 ? _1x1SecondaryKernels : throw new ArgumentOutOfRangeException(nameof(_1x1SecondaryKernels), "The number of secondary 1x1 kernels must be at least 1");
+            Pooling = poolingMode;
+        }
+
+        /// <summary>
+        /// Creates a new inception layer description with the input parameters
+        /// </summary>
+        /// <param name="_1x1Kernels">The number of 1x1 primary convolution kernels</param>
+        /// <param name="_3x3Reduce1x1Kernels">The number of 3x3 reduction 1x1 kernels</param>
+        /// <param name="_3x3Kernels">The number of 3x3 convolution kernels</param>
+        /// <param name="_5x5Reduce1x1Kernels">The number of 5x5 reduction 1x1 kernels</param>
+        /// <param name="_5x5Kernels">The number of 5x5 convolution kernels</param>
+        /// <param name="poolingMode">The pooling mode for the pooling pipeline</param>
+        /// <param name="_1x1SecondaryKernels">The number of secondary 1x1 convolution kernels</param>
+        [PublicAPI]
+        [Pure]
+        public static InceptionInfo New(
+            int _1x1Kernels, int _3x3Reduce1x1Kernels, int _3x3Kernels, int _5x5Reduce1x1Kernels, int _5x5Kernels, 
+            PoolingMode poolingMode, int _1x1SecondaryKernels)
+            => new InceptionInfo(_1x1Kernels, _3x3Reduce1x1Kernels, _3x3Kernels, _5x5Reduce1x1Kernels, _5x5Kernels, poolingMode, _1x1SecondaryKernels);
+
+        #endregion
+
+        #region Equality
+
+        /// <inheritdoc/>
+        public bool Equals(InceptionInfo other) => this == other;
+
+        /// <inheritdoc/>
+        public override bool Equals(object obj) => obj is InceptionInfo info ? this == info : false;
+
+        /// <inheritdoc/>
+        public override int GetHashCode()
+        {
+            int hash = 17;
+            unchecked
+            {
+                hash = hash * 31 + Primary1x1ConvolutionKernels;
+                hash = hash * 31 + Primary3x3Reduce1x1ConvolutionKernels;
+                hash = hash * 31 + Secondary3x3ConvolutionKernels;
+                hash = hash * 31 + Primary5x5Reduce1x1ConvolutionKernels;
+                hash = hash * 31 + Secondary5x5ConvolutionKernels;
+                hash = hash * 31 + Secondary1x1AfterPoolingConvolutionKernels;
+                hash = hash * 31 + (int)Pooling;
+            }
+            return hash;
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static bool operator ==(in InceptionInfo a, in InceptionInfo b) => a.Primary1x1ConvolutionKernels == b.Primary1x1ConvolutionKernels &&
+                                                                                  a.Primary3x3Reduce1x1ConvolutionKernels == b.Primary3x3Reduce1x1ConvolutionKernels && 
+                                                                                  a.Secondary3x3ConvolutionKernels == b.Secondary3x3ConvolutionKernels &&
+                                                                                  a.Primary5x5Reduce1x1ConvolutionKernels == b.Primary5x5Reduce1x1ConvolutionKernels &&
+                                                                                  a.Secondary5x5ConvolutionKernels == b.Secondary5x5ConvolutionKernels && 
+                                                                                  a.Secondary1x1AfterPoolingConvolutionKernels == b.Secondary1x1AfterPoolingConvolutionKernels &&
+                                                                                  a.Pooling == b.Pooling;
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static bool operator !=(in InceptionInfo a, in InceptionInfo b) => !(a == b);
+
+        #endregion
+    }
+}
diff --git a/NeuralNetwork.NET/APIs/Structs/PoolingInfo.cs b/NeuralNetwork.NET/APIs/Structs/PoolingInfo.cs
index 7185d35..e497dd1 100644
--- a/NeuralNetwork.NET/APIs/Structs/PoolingInfo.cs
+++ b/NeuralNetwork.NET/APIs/Structs/PoolingInfo.cs
@@ -13,37 +13,37 @@ namespace NeuralNetworkNET.APIs.Structs
     public readonly struct PoolingInfo : IEquatable<PoolingInfo>
     {
         /// <summary>
-        /// Gets the current pooling mode for the layer
+        /// The current pooling mode for the layer
         /// </summary>
         public readonly PoolingMode Mode;
 
         /// <summary>
-        /// Gets the height of each input local receptive field
+        /// The height of each input local receptive field
         /// </summary>
         public readonly int WindowHeight;
 
         /// <summary>
-        /// Gets the width of each input local receptive field
+        /// The width of each input local receptive field
         /// </summary>
         public readonly int WindowWidth;
 
         /// <summary>
-        /// Gets the optional vertical padding for the pooling operation
+        /// The optional vertical padding for the pooling operation
         /// </summary>
         public readonly int VerticalPadding;
 
         /// <summary>
-        /// Gets the optional horizontal padding for the pooling operation
+        /// The optional horizontal padding for the pooling operation
         /// </summary>
         public readonly int HorizontalPadding;
 
         /// <summary>
-        /// Gets the vertical stride length while sliding the receptive window over the input
+        /// The vertical stride length while sliding the receptive window over the input
         /// </summary>
         public readonly int VerticalStride;
 
         /// <summary>
-        /// Gets the horizontal stride length while sliding the receptive window over the input
+        /// The horizontal stride length while sliding the receptive window over the input
         /// </summary>
         public readonly int HorizontalStride;
 
@@ -55,20 +55,13 @@ private PoolingInfo(
             int verticalPadding, int horizontalPadding,
             int verticalStride, int horizontalStride)
         {
-            if (windowHeight <= 0) throw new ArgumentOutOfRangeException(nameof(windowHeight), "The window height must be at least equal to 1");
-            if (windowWidth <= 0) throw new ArgumentOutOfRangeException(nameof(windowWidth), "The window width must be at least equal to 1");
-            if (verticalPadding < 0) throw new ArgumentOutOfRangeException(nameof(verticalPadding), "The vertical padding must be greater than or equal to 0");
-            if (horizontalPadding < 0) throw new ArgumentOutOfRangeException(nameof(horizontalPadding), "The horizontal padding must be greater than or equal to 0");
-            if (verticalStride < 1) throw new ArgumentOutOfRangeException(nameof(verticalStride), "The vertical stride must be at least equal to 1");
-            if (horizontalStride < 1) throw new ArgumentOutOfRangeException(nameof(horizontalStride), "The horizontal stride must be at least equal to 1");
-
+            WindowHeight = windowHeight > 0 ? windowHeight : throw new ArgumentOutOfRangeException(nameof(windowHeight), "The window height must be at least equal to 1");
+            WindowWidth = windowWidth > 0 ? windowWidth : throw new ArgumentOutOfRangeException(nameof(windowWidth), "The window width must be at least equal to 1");
+            VerticalPadding = verticalPadding >= 0 ? verticalPadding : throw new ArgumentOutOfRangeException(nameof(verticalPadding), "The vertical padding must be greater than or equal to 0");
+            HorizontalPadding = horizontalPadding >= 0 ? horizontalPadding : throw new ArgumentOutOfRangeException(nameof(horizontalPadding), "The horizontal padding must be greater than or equal to 0");
+            VerticalStride = verticalStride >= 1 ? verticalStride : throw new ArgumentOutOfRangeException(nameof(verticalStride), "The vertical stride must be at least equal to 1");
+            HorizontalStride = horizontalStride >= 1 ? horizontalStride : throw new ArgumentOutOfRangeException(nameof(horizontalStride), "The horizontal stride must be at least equal to 1");
             Mode = mode;
-            WindowHeight = windowHeight;
-            WindowWidth = windowWidth;
-            VerticalPadding = verticalPadding;
-            HorizontalPadding = horizontalPadding;
-            VerticalStride = verticalStride;
-            HorizontalStride = horizontalStride;
         }
 
         /// <summary>
@@ -96,6 +89,19 @@ public static PoolingInfo New(
 
         #endregion
 
+        /// <summary>
+        /// Calculates the output size after applying a pooling operation to the input tensor
+        /// </summary>
+        /// <param name="input">The info on the input tensor</param>
+        [Pure]
+        internal TensorInfo GetForwardOutputTensorInfo(in TensorInfo input)
+        {
+            int
+                h = (input.Height - WindowHeight + 2 * VerticalPadding) / VerticalStride + 1,
+                w = (input.Width - WindowWidth + 2 * HorizontalPadding) / HorizontalStride + 1;
+            return new TensorInfo(h, w, input.Channels);
+        }
+
         #region Equality
 
         /// <inheritdoc/>
diff --git a/NeuralNetwork.NET/APIs/Structs/Tensor.cs b/NeuralNetwork.NET/APIs/Structs/Tensor.cs
index cf57a6d..cbc8732 100644
--- a/NeuralNetwork.NET/APIs/Structs/Tensor.cs
+++ b/NeuralNetwork.NET/APIs/Structs/Tensor.cs
@@ -17,24 +17,39 @@ namespace NeuralNetworkNET.APIs.Structs
     public readonly struct Tensor
     {
         /// <summary>
-        /// Gets the <see cref="IntPtr"/> value to the allocated memory
+        /// The <see cref="IntPtr"/> value to the allocated memory
         /// </summary>
         public readonly IntPtr Ptr;
 
         /// <summary>
-        /// Gets the number of entities (rows) in the current <see cref="Tensor"/>
+        /// The number of entities (rows) in the current <see cref="Tensor"/>
         /// </summary>
         public readonly int Entities;
 
         /// <summary>
-        /// Gets the size of each entity in the current <see cref="Tensor"/>
+        /// The size of each entity in the current <see cref="Tensor"/>
         /// </summary>
         public readonly int Length;
 
         /// <summary>
-        /// Gets the total size (the number of <see cref="float"/> values) in the current <see cref="Tensor"/>
+        /// The total size (the number of <see cref="float"/> values) in the current <see cref="Tensor"/>
         /// </summary>
-        public int Size => Entities * Length;
+        public int Size
+        {
+            [Pure]
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            get => Entities * Length;
+        }
+
+        /// <summary>
+        /// Gets whether or not the current instance is linked to an allocated memory area
+        /// </summary>
+        public bool Null
+        {
+            [Pure]
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            get => Ptr == IntPtr.Zero;
+        }
 
         #region Initialization
 
@@ -186,12 +201,35 @@ public float[] ToArray()
 
         #endregion
 
+        /// <summary>
+        /// Creates a new instance by wrapping the current memory area
+        /// </summary>
+        /// <param name="n">The height of the final matrix</param>
+        /// <param name="chw">The width of the final matrix</param>
+        /// <param name="tensor">The resulting instance</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public void Reshape(int n, int chw, out Tensor tensor)
+        {
+            if (n * chw != Size) throw new ArgumentException("Invalid input resized shape");
+            tensor = new Tensor(Ptr, n, chw);
+        }
+
         /// <summary>
         /// Frees the memory associated with the current instance
         /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public void Free() => Marshal.FreeHGlobal(Ptr);
 
+        /// <summary>
+        /// Frees the memory associated with the current instance, if needed
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public void TryFree()
+        {
+            if (Ptr != IntPtr.Zero)
+                Marshal.FreeHGlobal(Ptr);
+        }
+
         // Implicit pointer conversion
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static unsafe implicit operator float*(in Tensor tensor) => (float*)tensor.Ptr.ToPointer();
@@ -233,7 +271,7 @@ unsafe float[] ExtractRow(int i)
                     // Spawn the sequence
                     int
                         max = MaximumItemsCount / obj.Length,
-                        up = max.Min(MaximumRowsCount).Max(1);
+                        up = max.Min(MaximumRowsCount).Max(1).Min(obj.Entities);
                     for (int i = 0; i < up; i++)
                         yield return ExtractRow(i);
                 }
diff --git a/NeuralNetwork.NET/APIs/Structs/TensorInfo.cs b/NeuralNetwork.NET/APIs/Structs/TensorInfo.cs
index d774cf3..0853a92 100644
--- a/NeuralNetwork.NET/APIs/Structs/TensorInfo.cs
+++ b/NeuralNetwork.NET/APIs/Structs/TensorInfo.cs
@@ -13,20 +13,22 @@ namespace NeuralNetworkNET.APIs.Structs
     [DebuggerDisplay("Height: {Height}, Width: {Width}, Channels: {Channels}, Size: {Size}")]
     public readonly struct TensorInfo : IEquatable<TensorInfo>
     {
+        #region Fields and parameters
+
         /// <summary>
-        /// Gets the height of each 2D slice
+        /// The height of each 2D slice
         /// </summary>
         [JsonProperty(nameof(Height), Order = 1)]
         public readonly int Height;
 
         /// <summary>
-        /// Gets the width of each 2D slice
+        /// The width of each 2D slice
         /// </summary>
         [JsonProperty(nameof(Width), Order = 2)]
         public readonly int Width;
 
         /// <summary>
-        /// Gets the number of channels for the tensor description
+        /// The number of channels for the tensor description
         /// </summary>
         [JsonProperty(nameof(Channels), Order = 3)]
         public readonly int Channels;
@@ -52,13 +54,16 @@ public int SliceSize
             get => Height * Width;
         }
 
+        #endregion
+
+        #region Constructors
+
         internal TensorInfo(int height, int width, int channels)
         {
             if (height * width <= 0) throw new ArgumentException("The height and width of the kernels must be positive values");
-            if (channels < 1) throw new ArgumentOutOfRangeException(nameof(channels), "The number of channels must be at least equal to 1");
             Height = height;
             Width = width;
-            Channels = channels;
+            Channels = channels >= 1 ? channels :  throw new ArgumentOutOfRangeException(nameof(channels), "The number of channels must be at least equal to 1");
         }
 
         /// <summary>
@@ -87,6 +92,8 @@ internal TensorInfo(int height, int width, int channels)
         [Pure]
         public static TensorInfo CreateLinear(int size) => new TensorInfo(1, 1, size);
 
+        #endregion
+
         #region Equality
 
         /// <inheritdoc/>
diff --git a/NeuralNetwork.NET/Extensions/MatrixExtensions.cs b/NeuralNetwork.NET/Extensions/MatrixExtensions.cs
index 069ee72..148d897 100644
--- a/NeuralNetwork.NET/Extensions/MatrixExtensions.cs
+++ b/NeuralNetwork.NET/Extensions/MatrixExtensions.cs
@@ -706,8 +706,9 @@ public static unsafe float[] BlockCopy([NotNull] this float[] v)
         /// </summary>
         /// <param name="m">The first <see cref="Tensor"/> to test</param>
         /// <param name="o">The second <see cref="Tensor"/> to test</param>
-        /// <param name="delta">The comparison threshold</param>
-        public static unsafe bool ContentEquals(in this Tensor m, in Tensor o, float delta = 1e-6f)
+        /// <param name="absolute">The relative comparison threshold</param>
+        /// <param name="relative">The relative comparison threshold</param>
+        public static unsafe bool ContentEquals(in this Tensor m, in Tensor o,float absolute = 1e-6f, float relative = 1e-6f)
         {
             if (m.Ptr == IntPtr.Zero && o.Ptr == IntPtr.Zero) return true;
             if (m.Ptr == IntPtr.Zero || o.Ptr == IntPtr.Zero) return false;
@@ -715,7 +716,7 @@ public static unsafe bool ContentEquals(in this Tensor m, in Tensor o, float del
             float* pm = m, po = o;
             int items = m.Size;
             for (int i = 0; i < items; i++)
-                if (!pm[i].EqualsWithDelta(po[i], delta)) return false;
+                if (!pm[i].EqualsWithDelta(po[i], absolute, relative)) return false;
             return true;
         }
 
@@ -724,8 +725,9 @@ public static unsafe bool ContentEquals(in this Tensor m, in Tensor o, float del
         /// </summary>
         /// <param name="m">The first matrix to test</param>
         /// <param name="o">The second matrix to test</param>
-        /// <param name="delta">The comparison threshold</param>
-        public static bool ContentEquals([CanBeNull] this float[,] m, [CanBeNull] float[,] o, float delta = 1e-6f)
+        /// <param name="absolute">The relative comparison threshold</param>
+        /// <param name="relative">The relative comparison threshold</param>
+        public static bool ContentEquals([CanBeNull] this float[,] m, [CanBeNull] float[,] o, float absolute = 1e-6f, float relative = 1e-6f)
         {
             if (m == null && o == null) return true;
             if (m == null || o == null) return false;
@@ -733,7 +735,7 @@ public static bool ContentEquals([CanBeNull] this float[,] m, [CanBeNull] float[
                 m.GetLength(1) != o.GetLength(1)) return false;
             for (int i = 0; i < m.GetLength(0); i++)
                 for (int j = 0; j < m.GetLength(1); j++)
-                    if (!m[i, j].EqualsWithDelta(o[i, j], delta)) return false;
+                    if (!m[i, j].EqualsWithDelta(o[i, j], absolute, relative)) return false;
             return true;
         }
 
@@ -742,14 +744,15 @@ public static bool ContentEquals([CanBeNull] this float[,] m, [CanBeNull] float[
         /// </summary>
         /// <param name="v">The first vector to test</param>
         /// <param name="o">The second vector to test</param>
-        /// <param name="delta">The comparison threshold</param>
-        public static bool ContentEquals([CanBeNull] this float[] v, [CanBeNull] float[] o, float delta = 1e-6f)
+        /// <param name="absolute">The relative comparison threshold</param>
+        /// <param name="relative">The relative comparison threshold</param>
+        public static bool ContentEquals([CanBeNull] this float[] v, [CanBeNull] float[] o, float absolute = 1e-6f, float relative = 1e-6f)
         {
             if (v == null && o == null) return true;
             if (v == null || o == null) return false;
             if (v.Length != o.Length) return false;
             for (int i = 0; i < v.Length; i++)
-                if (!v[i].EqualsWithDelta(o[i], delta)) return false;
+                if (!v[i].EqualsWithDelta(o[i], absolute, relative)) return false;
             return true;
         }
 
diff --git a/NeuralNetwork.NET/Extensions/MiscExtensions.cs b/NeuralNetwork.NET/Extensions/MiscExtensions.cs
index 914dfde..d3b2c76 100644
--- a/NeuralNetwork.NET/Extensions/MiscExtensions.cs
+++ b/NeuralNetwork.NET/Extensions/MiscExtensions.cs
@@ -32,6 +32,15 @@ public static TOut To<TIn, TOut>([NotNull] this TIn item) where TOut : class, TI
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static int Max(this int a, int b) => a >= b ? a : b;
 
+        /// <summary>
+        /// Returns the maximum value between two numbers
+        /// </summary>
+        /// <param name="a">The first number</param>
+        /// <param name="b">The second number</param>
+        [Pure]
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static float Max(this float a, float b) => a >= b ? a : b;
+
         /// <summary>
         /// Returns the minimum value between two numbers
         /// </summary>
@@ -54,16 +63,19 @@ public static TOut To<TIn, TOut>([NotNull] this TIn item) where TOut : class, TI
         /// </summary>
         /// <param name="value">The first value</param>
         /// <param name="other">The second value</param>
-        /// <param name="delta">The comparison threshold</param>
+        /// <param name="absolute">The relative comparison threshold</param>
+        /// <param name="relative">The relative comparison threshold</param>
         [Pure]
-        public static bool EqualsWithDelta(this float value, float other, float delta = 1e-6f)
+        public static bool EqualsWithDelta(this float value, float other, float absolute = 1e-6f, float relative = 1e-6f)
         {
             if (float.IsNaN(value) ^ float.IsNaN(other)) return false;
             if (float.IsNaN(value) && float.IsNaN(other)) return true;
             if (float.IsInfinity(value) ^ float.IsInfinity(other)) return false;
             if (float.IsPositiveInfinity(value) && float.IsPositiveInfinity(other)) return true;
             if (float.IsNegativeInfinity(value) && float.IsNegativeInfinity(other)) return true;
-            return (value - other).Abs() < delta;
+            float abs = (value - other).Abs();
+            if (abs < absolute) return true;
+            return abs <= absolute.Max(relative * value.Abs().Max(other.Abs()));
         }
 
         /// <summary>
diff --git a/NeuralNetwork.NET/Networks/Implementations/Layers/ConvolutionalLayer.cs b/NeuralNetwork.NET/Networks/Implementations/Layers/ConvolutionalLayer.cs
index 0be71cf..476ac33 100644
--- a/NeuralNetwork.NET/Networks/Implementations/Layers/ConvolutionalLayer.cs
+++ b/NeuralNetwork.NET/Networks/Implementations/Layers/ConvolutionalLayer.cs
@@ -58,8 +58,8 @@ public ref readonly TensorInfo KernelInfo
         #endregion
 
         public ConvolutionalLayer(in TensorInfo input, in ConvolutionInfo operation, (int X, int Y) kernelSize, int kernels, ActivationFunctionType activation, BiasInitializationMode biasMode)
-            : base(input, new TensorInfo(input.Height - kernelSize.X + 1, input.Width - kernelSize.Y + 1, kernels),
-                  WeightsProvider.NewConvolutionalKernels(input.Channels, kernelSize.X, kernelSize.Y, kernels),
+            : base(input, operation.GetForwardOutputTensorInfo(input, kernelSize, kernels),
+                  WeightsProvider.NewConvolutionalKernels(input, kernelSize.X, kernelSize.Y, kernels),
                   WeightsProvider.NewBiases(kernels, biasMode), activation)
         {
             _OperationInfo = operation;
@@ -107,7 +107,8 @@ public override unsafe void Backpropagate(in Tensor delta_1, in Tensor z, Activa
         public override void ComputeGradient(in Tensor a, in Tensor delta, out Tensor dJdw, out Tensor dJdb)
         {
             a.Rotate180(InputInfo.Channels, out Tensor a180);
-            a180.ConvoluteGradient(InputInfo, delta, OutputInfo, out dJdw);
+            a180.ConvoluteGradient(InputInfo, delta, OutputInfo, out Tensor dJdwM);
+            dJdwM.Reshape(1, Weights.Length, out dJdw);
             a180.Free();
             delta.CompressVertically(OutputInfo.Channels, out dJdb);
         }
diff --git a/NeuralNetwork.NET/Networks/Implementations/Layers/FullyConnectedLayer.cs b/NeuralNetwork.NET/Networks/Implementations/Layers/FullyConnectedLayer.cs
index 9086c89..6fb8cc6 100644
--- a/NeuralNetwork.NET/Networks/Implementations/Layers/FullyConnectedLayer.cs
+++ b/NeuralNetwork.NET/Networks/Implementations/Layers/FullyConnectedLayer.cs
@@ -22,7 +22,7 @@ internal class FullyConnectedLayer : WeightedLayerBase
 
         public FullyConnectedLayer(in TensorInfo input, int neurons, ActivationFunctionType activation, WeightsInitializationMode weightsMode, BiasInitializationMode biasMode)
             : base(input, TensorInfo.CreateLinear(neurons),
-                  WeightsProvider.NewFullyConnectedWeights(input.Size, neurons, weightsMode),
+                  WeightsProvider.NewFullyConnectedWeights(input, neurons, weightsMode),
                   WeightsProvider.NewBiases(neurons, biasMode), activation) { }
 
         public FullyConnectedLayer(in TensorInfo input, int neurons, [NotNull] float[] weights, [NotNull] float[] biases, ActivationFunctionType activation)
@@ -59,7 +59,8 @@ public override unsafe void Backpropagate(in Tensor delta_1, in Tensor z, Activa
         public override void ComputeGradient(in Tensor a, in Tensor delta, out Tensor dJdw, out Tensor dJdb)
         {
             a.Transpose(out Tensor at);
-            at.Multiply(delta, out dJdw);
+            at.Multiply(delta, out Tensor dJdwM);
+            dJdwM.Reshape(1, Weights.Length, out dJdw);
             at.Free();
             delta.CompressVertically(out dJdb);
         }
diff --git a/NeuralNetwork.NET/Networks/Implementations/Layers/Helpers/WeightsProvider.cs b/NeuralNetwork.NET/Networks/Implementations/Layers/Helpers/WeightsProvider.cs
index 87ddb9c..23cd6be 100644
--- a/NeuralNetwork.NET/Networks/Implementations/Layers/Helpers/WeightsProvider.cs
+++ b/NeuralNetwork.NET/Networks/Implementations/Layers/Helpers/WeightsProvider.cs
@@ -13,35 +13,34 @@ namespace NeuralNetworkNET.Networks.Implementations.Layers.Helpers
     internal static class WeightsProvider
     {
         /// <summary>
-        /// Creates a weight matrix for a fully connected layer
+        /// Creates a weights vector for a fully connected layer
         /// </summary>
-        /// <param name="inputs">The input neurons</param>
+        /// <param name="inputs">The layer inputs</param>
         /// <param name="outputs">The output neurons</param>
         /// <param name="mode">The initialization mode for the weights</param>
         [Pure, NotNull]
-        public static unsafe float[] NewFullyConnectedWeights(int inputs, int outputs, WeightsInitializationMode mode)
+        public static unsafe float[] NewFullyConnectedWeights(in TensorInfo input, int outputs, WeightsInitializationMode mode)
         {
-            if (inputs <= 0 || outputs <= 0) throw new ArgumentOutOfRangeException("The inputs and outputs must be positive numbers");
-            float[] weights = new float[inputs * outputs];
+            float[] weights = new float[input.Size * outputs];
             fixed (float* pw = weights)
             {
-                Tensor.Reshape(pw, inputs, outputs, out Tensor wTensor);
+                Tensor.Reshape(pw, input.Size, outputs, out Tensor wTensor);
                 switch (mode)
                 {
                     case WeightsInitializationMode.LeCunUniform:
-                        KerasWeightsProvider.FillWithLeCunUniform(wTensor, inputs);
+                        KerasWeightsProvider.FillWithLeCunUniform(wTensor, input.Size);
                         break;
                     case WeightsInitializationMode.GlorotNormal:
-                        KerasWeightsProvider.FillWithGlorotNormal(wTensor, inputs, outputs);
+                        KerasWeightsProvider.FillWithGlorotNormal(wTensor, input.Size, outputs);
                         break;
                     case WeightsInitializationMode.GlorotUniform:
-                        KerasWeightsProvider.FillWithGlorotUniform(wTensor, inputs, outputs);
+                        KerasWeightsProvider.FillWithGlorotUniform(wTensor, input.Size, outputs);
                         break;
                     case WeightsInitializationMode.HeEtAlNormal:
-                        KerasWeightsProvider.FillWithHeEtAlNormal(wTensor, inputs);
+                        KerasWeightsProvider.FillWithHeEtAlNormal(wTensor, input.Size);
                         break;
                     case WeightsInitializationMode.HeEtAlUniform:
-                        KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, inputs);
+                        KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, input.Size);
                         break;
                     default: throw new ArgumentOutOfRangeException(nameof(mode), "Unsupported weights initialization mode");
                 }
@@ -50,21 +49,67 @@ public static unsafe float[] NewFullyConnectedWeights(int inputs, int outputs, W
         }
 
         /// <summary>
-        /// Creates a weight matrix for a convolutional layer
+        /// Creates a weights vector for a convolutional layer
         /// </summary>
-        /// <param name="inputDepth">The depth of the input volume</param>
+        /// <param name="input">The layer inputs</param>
         /// <param name="kernelsHeight">The height of each kernel</param>
         /// <param name="kernelsWidth">The width of each kernel</param>
         /// <param name="kernels">The number of kernels in the layer</param>
         [Pure, NotNull]
-        public static unsafe float[] NewConvolutionalKernels(int inputDepth, int kernelsHeight, int kernelsWidth, int kernels)
+        public static unsafe float[] NewConvolutionalKernels(in TensorInfo input, int kernelsHeight, int kernelsWidth, int kernels)
         {
             if (kernels <= 0) throw new ArgumentOutOfRangeException(nameof(kernels), "The number of kernels must be positive");
-            float[] weights = new float[kernels * kernelsHeight * kernelsWidth * inputDepth];
+            float[] weights = new float[kernels * kernelsHeight * kernelsWidth * input.Channels];
             fixed (float* pw = weights)
             {
                 Tensor.Reshape(pw, 1, weights.Length, out Tensor wTensor);
-                KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, inputDepth * kernelsHeight * kernelsWidth);
+                KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, input.Channels * kernelsHeight * kernelsWidth);
+            }
+            return weights;
+        }
+
+        /// <summary>
+        /// Creates a new mixed weights vector for an inception layer
+        /// </summary>
+        /// <param name="input">The layer inputs</param>
+        /// <param name="info">The info on the target inception layer</param>
+        [Pure, NotNull]
+        public static unsafe float[] NewInceptionWeights(in TensorInfo input, in InceptionInfo info)
+        {
+            // Setup
+            int
+                _1x1Length = input.Channels * info.Primary1x1ConvolutionKernels,
+                _3x3Reduce1x1Length = input.Channels * info.Primary3x3Reduce1x1ConvolutionKernels,
+                _3x3Length = 3 * 3 * info.Primary3x3Reduce1x1ConvolutionKernels * info.Secondary3x3ConvolutionKernels,
+                _5x5Reduce1x1Length = input.Channels * info.Primary5x5Reduce1x1ConvolutionKernels,
+                _5x5Length = 5 * 5 * info.Primary5x5Reduce1x1ConvolutionKernels * info.Secondary5x5ConvolutionKernels,
+                secondary1x1Length = input.Channels * info.Secondary1x1AfterPoolingConvolutionKernels;
+            float[] weights = new float[_1x1Length + _3x3Reduce1x1Length + _3x3Length + _5x5Reduce1x1Length + _5x5Length + secondary1x1Length];
+            fixed (float* pw = weights)
+            {
+                // 1x1
+                Tensor.Reshape(pw, 1, _1x1Length, out Tensor wTensor);
+                KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, input.Channels);
+
+                // 3x3 reduce 1x1
+                Tensor.Reshape(pw + _1x1Length, 1, _3x3Reduce1x1Length, out wTensor);
+                KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, input.Channels);
+
+                // 3x3
+                Tensor.Reshape(pw + _1x1Length + _3x3Reduce1x1Length, 1, _3x3Length, out wTensor);
+                KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, 3 * 3 * info.Primary3x3Reduce1x1ConvolutionKernels);
+
+                // 5x5 reduce 1x1
+                Tensor.Reshape(pw + _1x1Length + _3x3Reduce1x1Length + _3x3Length, 1, _5x5Reduce1x1Length, out wTensor);
+                KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, input.Channels);
+
+                // 5x5
+                Tensor.Reshape(pw + _1x1Length + _3x3Reduce1x1Length + _3x3Length + _5x5Reduce1x1Length, 1, _5x5Length, out wTensor);
+                KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, 5 * 5 * info.Primary5x5Reduce1x1ConvolutionKernels);
+
+                // Pool 1x1
+                Tensor.Reshape(pw + _1x1Length + _3x3Reduce1x1Length + _3x3Length + _5x5Reduce1x1Length + _5x5Length, 1, secondary1x1Length, out wTensor);
+                KerasWeightsProvider.FillWithHeEtAlUniform(wTensor, input.Channels);
             }
             return weights;
         }
diff --git a/NeuralNetwork.NET/Networks/Implementations/Layers/PoolingLayer.cs b/NeuralNetwork.NET/Networks/Implementations/Layers/PoolingLayer.cs
index 6cba6bc..2b6c4ab 100644
--- a/NeuralNetwork.NET/Networks/Implementations/Layers/PoolingLayer.cs
+++ b/NeuralNetwork.NET/Networks/Implementations/Layers/PoolingLayer.cs
@@ -34,10 +34,7 @@ public ref readonly PoolingInfo OperationInfo
         }
 
         public PoolingLayer(in TensorInfo input, in PoolingInfo operation, ActivationFunctionType activation)
-            : base(input, new TensorInfo(
-                input.Height / 2 + (input.Height % 2 == 0 ? 0 : 1),
-                input.Width / 2 + (input.Width % 2 == 0 ? 0 : 1),
-                input.Channels), activation)
+            : base(input, operation.GetForwardOutputTensorInfo(input), activation)
             => _OperationInfo = operation;
 
         /// <inheritdoc/>
diff --git a/NeuralNetwork.NET/Networks/Implementations/NeuralNetwork.cs b/NeuralNetwork.NET/Networks/Implementations/NeuralNetwork.cs
index 493282f..6707591 100644
--- a/NeuralNetwork.NET/Networks/Implementations/NeuralNetwork.cs
+++ b/NeuralNetwork.NET/Networks/Implementations/NeuralNetwork.cs
@@ -248,7 +248,7 @@ internal unsafe void Backpropagate(in TrainingBatch batch, float dropout, [NotNu
                      * Multiply the previous delta with the transposed weights of the following layer
                      * Compute d(l), the Hadamard product of z'(l) and delta(l + 1) * W(l + 1)T */
                     _Layers[l + 1].Backpropagate(*deltas[l + 1], zList[l], _Layers[l].ActivationFunctions.ActivationPrime);
-                    if (dropoutMasks[l].Ptr != IntPtr.Zero) zList[l].InPlaceHadamardProduct(dropoutMasks[l]);
+                    if (!dropoutMasks[l].Null) zList[l].InPlaceHadamardProduct(dropoutMasks[l]);
                     deltas[l] = zList + l;
                 }
 
@@ -285,7 +285,7 @@ internal unsafe void Backpropagate(in TrainingBatch batch, float dropout, [NotNu
                 {
                     zList[i].Free();
                     aList[i].Free();
-                    if (dropoutMasks[i].Ptr != IntPtr.Zero) dropoutMasks[i].Free();
+                    dropoutMasks[i].TryFree();
                 }
                 zList[_Layers.Length - 1].Free();
                 aList[_Layers.Length - 1].Free();
diff --git a/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnInceptionLayerTest.cs b/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnInceptionLayerTest.cs
new file mode 100644
index 0000000..66ebcd5
--- /dev/null
+++ b/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnInceptionLayerTest.cs
@@ -0,0 +1,280 @@
+﻿using Microsoft.VisualStudio.TestTools.UnitTesting;
+using NeuralNetworkNET.APIs.Enums;
+using NeuralNetworkNET.APIs.Structs;
+using NeuralNetworkNET.Cuda.Layers;
+using NeuralNetworkNET.Extensions;
+using NeuralNetworkNET.Networks.Activations;
+using NeuralNetworkNET.Networks.Implementations.Layers.Helpers;
+using System;
+using System.Runtime.CompilerServices;
+
+namespace NeuralNetworkNET.Cuda.Unit
+{
+    /// <summary>
+    /// Test class for the cuDNN inception layer
+    /// </summary>
+    [TestClass]
+    [TestCategory(nameof(CuDnnInceptionLayerTest))]
+    public class CuDnnInceptionLayerTest
+    {
+        [TestMethod]
+        public unsafe void Inception1x1()
+        {
+            float[,] x = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(10), 32 * 32 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(10, 32 * 32 * 3);
+            CuDnnConvolutionalLayer conv = new CuDnnConvolutionalLayer(TensorInfo.CreateForRgbImage(32, 32), ConvolutionInfo.New(ConvolutionMode.CrossCorrelation), (1, 1), 10, ActivationFunctionType.ReLU, BiasInitializationMode.Gaussian);
+            CuDnnInceptionLayer inception = new CuDnnInceptionLayer(conv.InputInfo, InceptionInfo.New(10, 10, 10, 10, 10, PoolingMode.Max, 10));
+            fixed (float* pw = inception.Weights)
+                Unsafe.InitBlock(pw, 0, (uint)(sizeof(float) * inception.Weights.Length));
+            Buffer.BlockCopy(conv.Weights, 0, inception.Weights, 0, sizeof(float) * conv.Weights.Length);
+            Buffer.BlockCopy(conv.Biases, 0, inception.Biases, 0, sizeof(float) * conv.Biases.Length);
+            fixed (float* px = x)
+            {
+                // Forward + Z
+                Tensor.Reshape(px, x.GetLength(0), x.GetLength(1), out Tensor xTensor);
+                conv.Forward(xTensor, out Tensor zConv, out Tensor aConv);
+                inception.Forward(xTensor, out Tensor zInc, out Tensor aInc);
+                Tensor.New(zConv.Entities, zConv.Length, out Tensor reshaped);
+                float* pzInc = (float*)zInc.Ptr.ToPointer(), preshaped = (float*)reshaped.Ptr.ToPointer();
+                for (int i = 0; i < zConv.Entities; i++)
+                    Buffer.MemoryCopy(pzInc + i * zInc.Length, preshaped + i * zConv.Length, sizeof(float) * zConv.Length, sizeof(float) * zConv.Length);
+                Assert.IsTrue(reshaped.ContentEquals(zConv));
+
+                // A
+                float* paInc = (float*)aInc.Ptr.ToPointer();
+                for (int i = 0; i < aConv.Entities; i++)
+                    Buffer.MemoryCopy(paInc + i * aInc.Length, preshaped + i * aConv.Length, sizeof(float) * aConv.Length, sizeof(float) * aConv.Length);
+                Assert.IsTrue(reshaped.ContentEquals(aConv));
+
+                // Backpropagate
+                Tensor.New(xTensor.Entities, xTensor.Length, out Tensor z1);
+                KerasWeightsProvider.FillWithHeEtAlUniform(z1, 10);
+                z1.Duplicate(out Tensor z2);
+                conv.Backpropagate(aConv, z1, ActivationFunctions.ReLUPrime);
+                inception.Backpropagate(aInc, z2, ActivationFunctions.ReLUPrime);
+                Assert.IsTrue(z1.ContentEquals(z2));
+
+                // Gradient
+                Tensor.New(xTensor.Entities, xTensor.Length, out Tensor a);
+                KerasWeightsProvider.FillWithHeEtAlUniform(a, 10);
+                conv.ComputeGradient(a, aConv, out Tensor dJdwConv, out Tensor dJdbConv);
+                inception.ComputeGradient(a, aInc, out Tensor dJdwInc, out Tensor dJdbInc);
+                Tensor.New(1, dJdwConv.Length, out Tensor dJdwInc0);
+                Buffer.MemoryCopy((float*)dJdwInc.Ptr.ToPointer(), (float*)dJdwInc0.Ptr.ToPointer(), sizeof(float) * dJdwInc0.Size, sizeof(float) * dJdwInc0.Size);
+                Tensor.New(1, dJdbConv.Length, out Tensor dJdbInc0);
+                Buffer.MemoryCopy((float*)dJdbInc.Ptr.ToPointer(), (float*)dJdbInc0.Ptr.ToPointer(), sizeof(float) * dJdbInc0.Size, sizeof(float) * dJdbInc0.Size);
+                Assert.IsTrue(dJdwConv.ContentEquals(dJdwInc0, 1e-5f));
+                Assert.IsTrue(dJdbConv.ContentEquals(dJdbInc0, 1e-5f));
+
+                // Cleanup
+                dJdwConv.Free();
+                dJdbConv.Free();
+                dJdwInc.Free();
+                dJdbInc.Free();
+                dJdwInc0.Free();
+                dJdbInc0.Free();
+                z1.Free();
+                z2.Free();
+                zConv.Free();
+                aConv.Free();
+                zInc.Free();
+                aInc.Free();
+                reshaped.Free();
+            }
+        }
+
+        [TestMethod]
+        public unsafe void Inception3x3Pipeline()
+        {
+            float[,] x = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(10), 32 * 32 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(10, 32 * 32 * 3);
+            CuDnnConvolutionalLayer
+                conv1 = new CuDnnConvolutionalLayer(TensorInfo.CreateForRgbImage(32, 32), ConvolutionInfo.New(ConvolutionMode.CrossCorrelation), (1, 1), 10, ActivationFunctionType.ReLU, BiasInitializationMode.Gaussian),
+                conv2 = new CuDnnConvolutionalLayer(conv1.OutputInfo, ConvolutionInfo.New(ConvolutionMode.CrossCorrelation, 1, 1), (3, 3), 10, ActivationFunctionType.ReLU, BiasInitializationMode.Gaussian);
+            CuDnnInceptionLayer inception = new CuDnnInceptionLayer(TensorInfo.CreateForRgbImage(32, 32), InceptionInfo.New(10, 10, 10, 10, 10, PoolingMode.Max, 10));
+            fixed (float* pw = inception.Weights)
+                Unsafe.InitBlock(pw, 0, (uint)(sizeof(float) * inception.Weights.Length));
+            Buffer.BlockCopy(conv1.Weights, 0, inception.Weights, sizeof(float) * 3 * 10, sizeof(float) * conv1.Weights.Length);
+            Buffer.BlockCopy(conv2.Weights, 0, inception.Weights, sizeof(float) * 3 * 10 + sizeof(float) * conv1.Weights.Length, sizeof(float) * conv2.Weights.Length);
+            Buffer.BlockCopy(conv1.Biases, 0, inception.Biases, sizeof(float) * 10, sizeof(float) * conv1.Biases.Length);
+            Buffer.BlockCopy(conv2.Biases, 0, inception.Biases, sizeof(float) * 20, sizeof(float) * conv2.Biases.Length);
+            fixed (float* px = x)
+            {
+                // Forward + Z
+                Tensor.Reshape(px, x.GetLength(0), x.GetLength(1), out Tensor xTensor);
+                conv1.Forward(xTensor, out Tensor zTemp, out Tensor aTemp);
+                conv2.Forward(aTemp, out Tensor zConv, out Tensor aConv);
+                inception.Forward(xTensor, out Tensor zInc, out Tensor aInc);
+                Tensor.New(zConv.Entities, zConv.Length, out Tensor reshaped);
+                float* pzInc = (float*)zInc.Ptr.ToPointer() + 32 * 32 * 10, preshaped = (float*)reshaped.Ptr.ToPointer();
+                for (int i = 0; i < zConv.Entities; i++)
+                    Buffer.MemoryCopy(pzInc + i * zInc.Length, preshaped + i * zConv.Length, sizeof(float) * zConv.Length, sizeof(float) * zConv.Length);
+                Assert.IsTrue(reshaped.ContentEquals(zConv));
+                
+                // A
+                float* paInc = (float*)aInc.Ptr.ToPointer() + 32 * 32 * 10;
+                for (int i = 0; i < aConv.Entities; i++)
+                    Buffer.MemoryCopy(paInc + i * aInc.Length, preshaped + i * aConv.Length, sizeof(float) * aConv.Length, sizeof(float) * aConv.Length);
+                Assert.IsTrue(reshaped.ContentEquals(aConv));
+
+                // Backpropagation
+                Tensor.New(xTensor.Entities, xTensor.Length, out Tensor z1);
+                KerasWeightsProvider.FillWithHeEtAlUniform(z1, 10);
+                z1.Duplicate(out Tensor z2);
+                conv2.Backpropagate(aConv, zTemp, conv1.ActivationFunctions.ActivationPrime);
+                conv1.Backpropagate(zTemp, z1, ActivationFunctions.ReLUPrime);
+                inception.Backpropagate(aInc, z2, ActivationFunctions.ReLUPrime);
+                Assert.IsTrue(z1.ContentEquals(z2));
+
+                // Gradient
+                Tensor.New(xTensor.Entities, xTensor.Length, out Tensor a);
+                KerasWeightsProvider.FillWithHeEtAlUniform(a, 10);
+                conv1.ComputeGradient(a, zTemp, out Tensor dJdwConv1, out Tensor dJdbConv1);
+                conv2.ComputeGradient(aTemp, aConv, out Tensor dJdwConv2, out Tensor dJdbConv2);
+                inception.ComputeGradient(a, aInc, out Tensor dJdwInc, out Tensor dJdbInc);
+                Tensor.Reshape((float*)dJdwInc.Ptr.ToPointer() + 30, 1, dJdwConv1.Size, out Tensor dJdwInc0);
+                Tensor.Reshape((float*)dJdbInc.Ptr.ToPointer() + 10, 1, dJdbConv1.Size, out Tensor dJdbInc0);
+                Assert.IsTrue(dJdwConv1.ContentEquals(dJdwInc0, 1e-5f));
+                Assert.IsTrue(dJdbConv1.ContentEquals(dJdbInc0, 1e-5f));
+                Tensor.Reshape((float*)dJdwInc.Ptr.ToPointer() + 30 + dJdwConv1.Size, 1, dJdwConv2.Size, out Tensor dJdwInc1);
+                Tensor.Reshape((float*)dJdbInc.Ptr.ToPointer() + 20, 1, dJdbConv2.Size, out Tensor dJdbInc1);
+                Assert.IsTrue(dJdwConv2.ContentEquals(dJdwInc1, 1e-5f));
+                Assert.IsTrue(dJdbConv2.ContentEquals(dJdbInc1, 1e-5f));
+
+                // Cleanup
+                z1.Free();
+                z2.Free();
+                zTemp.Free();
+                zConv.Free();
+                zInc.Free();
+                aConv.Free();
+                aInc.Free();
+                reshaped.Free();
+            }
+        }
+
+        [TestMethod]
+        public unsafe void Inception5x5Pipeline()
+        {
+            float[,] x = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(10), 12 * 12 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(10, 12 * 12 * 3);
+            CuDnnConvolutionalLayer
+                conv1 = new CuDnnConvolutionalLayer(TensorInfo.CreateForRgbImage(12, 12), ConvolutionInfo.New(ConvolutionMode.CrossCorrelation), (1, 1), 10, ActivationFunctionType.ReLU, BiasInitializationMode.Gaussian),
+                conv2 = new CuDnnConvolutionalLayer(conv1.OutputInfo, ConvolutionInfo.New(ConvolutionMode.CrossCorrelation, 2, 2), (5, 5), 10, ActivationFunctionType.ReLU, BiasInitializationMode.Gaussian);
+            CuDnnInceptionLayer inception = new CuDnnInceptionLayer(TensorInfo.CreateForRgbImage(12, 12), InceptionInfo.New(3, 2, 2, 10, 10, PoolingMode.Max, 2));
+            fixed (float* pw = inception.Weights)
+                Unsafe.InitBlock(pw, 0, (uint)(sizeof(float) * inception.Weights.Length));
+            Buffer.BlockCopy(conv1.Weights, 0, inception.Weights, sizeof(float) * (3 * 3 + 3 * 2 + 3 * 3 * 2 * 2), sizeof(float) * conv1.Weights.Length);
+            Buffer.BlockCopy(conv2.Weights, 0, inception.Weights, sizeof(float) * (3 * 3 + 3 * 2 + 3 * 3 * 2 * 2 + conv1.Weights.Length), sizeof(float) * conv2.Weights.Length);
+            Buffer.BlockCopy(conv1.Biases, 0, inception.Biases, sizeof(float) * (3 + 2 + 2), sizeof(float) * conv1.Biases.Length);
+            Buffer.BlockCopy(conv2.Biases, 0, inception.Biases, sizeof(float) * (3 + 2 + 2 + 10), sizeof(float) * conv2.Biases.Length);
+            fixed (float* px = x)
+            {
+                // Forwaard + Z
+                Tensor.Reshape(px, x.GetLength(0), x.GetLength(1), out Tensor xTensor);
+                conv1.Forward(xTensor, out Tensor zTemp, out Tensor aTemp);
+                conv2.Forward(aTemp, out Tensor zConv, out Tensor aConv);
+                inception.Forward(xTensor, out Tensor zInc, out Tensor aInc);
+                Tensor.New(zConv.Entities, zConv.Length, out Tensor reshaped);
+                float* pzInc = (float*)zInc.Ptr.ToPointer() + 12 * 12 * (3 + 2), preshaped = (float*)reshaped.Ptr.ToPointer();
+                for (int i = 0; i < zConv.Entities; i++)
+                    Buffer.MemoryCopy(pzInc + i * zInc.Length, preshaped + i * zConv.Length, sizeof(float) * zConv.Length, sizeof(float) * zConv.Length);
+                Assert.IsTrue(reshaped.ContentEquals(zConv));
+                
+                // A
+                float* paInc = (float*)aInc.Ptr.ToPointer() + 12 * 12 * (3 + 2);
+                for (int i = 0; i < aConv.Entities; i++)
+                    Buffer.MemoryCopy(paInc + i * aInc.Length, preshaped + i * aConv.Length, sizeof(float) * aConv.Length, sizeof(float) * aConv.Length);
+                Assert.IsTrue(reshaped.ContentEquals(aConv));
+
+                // Backpropagation
+                Tensor.New(xTensor.Entities, xTensor.Length, out Tensor z1);
+                KerasWeightsProvider.FillWithHeEtAlUniform(z1, 10);
+                z1.Duplicate(out Tensor z2);
+                conv2.Backpropagate(aConv, zTemp, conv1.ActivationFunctions.ActivationPrime);
+                conv1.Backpropagate(zTemp, z1, ActivationFunctions.ReLUPrime);
+                inception.Backpropagate(aInc, z2, ActivationFunctions.ReLUPrime);
+                Assert.IsTrue(z1.ContentEquals(z2));
+
+                // Gradient
+                Tensor.New(xTensor.Entities, xTensor.Length, out Tensor a);
+                KerasWeightsProvider.FillWithHeEtAlUniform(a, 10);
+                conv1.ComputeGradient(a, zTemp, out Tensor dJdwConv1, out Tensor dJdbConv1);
+                conv2.ComputeGradient(aTemp, aConv, out Tensor dJdwConv2, out Tensor dJdbConv2);
+                inception.ComputeGradient(a, aInc, out Tensor dJdwInc, out Tensor dJdbInc);
+                Tensor.Reshape((float*)dJdwInc.Ptr.ToPointer() + (3 * 3 + 3 * 2 + 3 * 3 * 2 * 2), 1, dJdwConv1.Size, out Tensor dJdwInc0);
+                Tensor.Reshape((float*)dJdbInc.Ptr.ToPointer() + 7, 1, dJdbConv1.Size, out Tensor dJdbInc0);
+                Assert.IsTrue(dJdwConv1.ContentEquals(dJdwInc0, 1e-5f));
+                Assert.IsTrue(dJdbConv1.ContentEquals(dJdbInc0, 1e-5f));
+                Tensor.Reshape((float*)dJdwInc.Ptr.ToPointer() + (3 * 3 + 3 * 2 + 3 * 3 * 2 * 2) + dJdwConv1.Size, 1, dJdwConv2.Size, out Tensor dJdwInc1);
+                Tensor.Reshape((float*)dJdbInc.Ptr.ToPointer() + 17, 1, dJdbConv2.Size, out Tensor dJdbInc1);
+                Assert.IsTrue(dJdwConv2.ContentEquals(dJdwInc1, 1e-5f));
+                Assert.IsTrue(dJdbConv2.ContentEquals(dJdbInc1, 1e-5f));
+
+                // Cleanup
+                zTemp.Free();
+                aTemp.Free();
+                zConv.Free();
+                zInc.Free();
+                aConv.Free();
+                aInc.Free();
+                reshaped.Free();
+            }
+        }
+
+        [TestMethod]
+        public unsafe void InceptionPoolPipeline()
+        {
+            float[,] x = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(10), 12 * 12 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(10, 12 * 12 * 3);
+            CuDnnPoolingLayer pool = new CuDnnPoolingLayer(TensorInfo.CreateForRgbImage(12, 12), PoolingInfo.New(PoolingMode.Max, 3, 3, 1, 1, 1, 1), ActivationFunctionType.ReLU);
+            CuDnnConvolutionalLayer conv = new CuDnnConvolutionalLayer(pool.OutputInfo, ConvolutionInfo.New(ConvolutionMode.CrossCorrelation), (1, 1), 10, ActivationFunctionType.ReLU, BiasInitializationMode.Gaussian);
+            CuDnnInceptionLayer inception = new CuDnnInceptionLayer(TensorInfo.CreateForRgbImage(12, 12), InceptionInfo.New(3, 2, 2, 2, 2, PoolingMode.Max, 10));
+            fixed (float* pw = inception.Weights)
+                Unsafe.InitBlock(pw, 0, (uint)(sizeof(float) * inception.Weights.Length));
+            Buffer.BlockCopy(conv.Weights, 0, inception.Weights, sizeof(float) * (3 * 3 + 3 * 2 + 3 * 3 * 2 * 2 + 3 * 2 + 5 * 5 * 2 * 2), sizeof(float) * conv.Weights.Length);
+            Buffer.BlockCopy(conv.Biases, 0, inception.Biases, sizeof(float) * (3 + 2 + 2 + 2 + 2), sizeof(float) * conv.Biases.Length);
+            fixed (float* px = x)
+            {
+                // Forward + Z
+                Tensor.Reshape(px, x.GetLength(0), x.GetLength(1), out Tensor xTensor);
+                pool.Forward(xTensor, out Tensor zTemp, out Tensor aTemp);
+                conv.Forward(aTemp, out Tensor zConv, out Tensor aConv);
+                inception.Forward(xTensor, out Tensor zInc, out Tensor aInc);
+                Tensor.New(zConv.Entities, zConv.Length, out Tensor reshaped);
+                float* pzInc = (float*)zInc.Ptr.ToPointer() + 12 * 12 * (3 + 2 + 2), preshaped = (float*)reshaped.Ptr.ToPointer();
+                for (int i = 0; i < zConv.Entities; i++)
+                    Buffer.MemoryCopy(pzInc + i * zInc.Length, preshaped + i * zConv.Length, sizeof(float) * zConv.Length, sizeof(float) * zConv.Length);
+                Assert.IsTrue(reshaped.ContentEquals(zConv));
+               
+                // A
+                float* paInc = (float*)aInc.Ptr.ToPointer() + 12 * 12 * (3 + 2 + 2);
+                for (int i = 0; i < aConv.Entities; i++)
+                    Buffer.MemoryCopy(paInc + i * aInc.Length, preshaped + i * aConv.Length, sizeof(float) * aConv.Length, sizeof(float) * aConv.Length);
+                Assert.IsTrue(reshaped.ContentEquals(aConv));
+
+                // Backpropagation
+                Tensor.New(xTensor.Entities, xTensor.Length, out Tensor z1);
+                KerasWeightsProvider.FillWithHeEtAlUniform(z1, 10);
+                z1.Duplicate(out Tensor z2);
+                conv.Backpropagate(aConv, zTemp, pool.ActivationFunctions.ActivationPrime);
+                pool.Backpropagate(zTemp, z1, ActivationFunctions.ReLUPrime);
+                inception.Backpropagate(aInc, z2, ActivationFunctions.ReLUPrime);
+                Assert.IsTrue(z1.ContentEquals(z2));
+
+                // Gradient
+                conv.ComputeGradient(aTemp, aConv, out Tensor dJdwConv, out Tensor dJdbConv);
+                inception.ComputeGradient(xTensor, aInc, out Tensor dJdwInc, out Tensor dJdbInc);
+                Tensor.Reshape((float*)dJdwInc.Ptr.ToPointer() + (3 * 3 + 3 * 2 + 3 * 3 * 2 * 2 + 3 * 2 + 5 * 5 * 2 * 2), 1, dJdwConv.Size, out Tensor dJdwInc0);
+                Tensor.Reshape((float*)dJdbInc.Ptr.ToPointer() + 11, 1, dJdbConv.Size, out Tensor dJdbInc0);
+                Assert.IsTrue(dJdwConv.ContentEquals(dJdwInc0, 1e-5f));
+                Assert.IsTrue(dJdbConv.ContentEquals(dJdbInc0, 1e-5f));
+
+                // Cleanup
+                zTemp.Free();
+                aTemp.Free();
+                zConv.Free();
+                zInc.Free();
+                aConv.Free();
+                aInc.Free();
+                reshaped.Free();
+            }
+        }
+    }
+}
diff --git a/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnLayersTest.cs b/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnLayersTest.cs
index 0b7aed2..87e53d4 100644
--- a/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnLayersTest.cs
+++ b/Unit/NeuralNetwork.NET.Cuda.Unit/CuDnnLayersTest.cs
@@ -73,7 +73,7 @@ private static unsafe void TestGradient(WeightedLayerBase cpu, WeightedLayerBase
         [TestMethod]
         public void FullyConnectedForward()
         {
-            float[,] x = WeightsProvider.NewFullyConnectedWeights(400, 250, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 250);
+            float[,] x = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(400), 250, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 250);
             FullyConnectedLayer
                 cpu = new FullyConnectedLayer(TensorInfo.CreateLinear(250), 127, ActivationFunctionType.LeCunTanh, WeightsInitializationMode.GlorotNormal, BiasInitializationMode.Gaussian),
                 gpu = new CuDnnFullyConnectedLayer(cpu.InputInfo, cpu.OutputInfo.Size, cpu.Weights, cpu.Biases, cpu.ActivationFunctionType);
@@ -84,8 +84,8 @@ public void FullyConnectedForward()
         public void FullyConnectedBackward()
         {
             float[,]
-                delta_1 = WeightsProvider.NewFullyConnectedWeights(400, 127, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 127),
-                z = WeightsProvider.NewFullyConnectedWeights(400, 250, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 250);
+                delta_1 = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(400), 127, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 127),
+                z = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(400), 250, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 250);
             FullyConnectedLayer
                 cpu = new FullyConnectedLayer(TensorInfo.CreateLinear(250), 127, ActivationFunctionType.LeCunTanh, WeightsInitializationMode.GlorotNormal, BiasInitializationMode.Gaussian),
                 gpu = new CuDnnFullyConnectedLayer(cpu.InputInfo, cpu.OutputInfo.Size, cpu.Weights, cpu.Biases, cpu.ActivationFunctionType);
@@ -96,8 +96,8 @@ public void FullyConnectedBackward()
         public void FullyConnectedGradient()
         {
             float[,]
-                x = WeightsProvider.NewFullyConnectedWeights(400, 250, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 250),
-                delta = WeightsProvider.NewFullyConnectedWeights(400, 127, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 127);
+                x = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(400), 250, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 250),
+                delta = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(400), 127, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 127);
             FullyConnectedLayer
                 cpu = new FullyConnectedLayer(TensorInfo.CreateLinear(250), 127, ActivationFunctionType.LeCunTanh, WeightsInitializationMode.GlorotNormal, BiasInitializationMode.Gaussian),
                 gpu = new CuDnnFullyConnectedLayer(cpu.InputInfo, cpu.OutputInfo.Size, cpu.Weights, cpu.Biases, cpu.ActivationFunctionType);
@@ -111,7 +111,7 @@ public void FullyConnectedGradient()
         [TestMethod]
         public void SoftmaxForward()
         {
-            float[,] x = WeightsProvider.NewFullyConnectedWeights(400, 250, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 250);
+            float[,] x = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(400), 250, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 250);
             SoftmaxLayer
                 cpu = new SoftmaxLayer(TensorInfo.CreateLinear(250), 127, WeightsInitializationMode.GlorotNormal, BiasInitializationMode.Gaussian),
                 gpu = new CuDnnSoftmaxLayer(cpu.InputInfo, cpu.OutputInfo.Size, cpu.Weights, cpu.Biases);
@@ -122,8 +122,8 @@ public void SoftmaxForward()
         public void SoftmaxBackward()
         {
             float[,]
-                delta_1 = WeightsProvider.NewFullyConnectedWeights(400, 127, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 127),
-                z = WeightsProvider.NewFullyConnectedWeights(400, 250, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 250);
+                delta_1 = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(400), 127, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 127),
+                z = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(400), 250, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 250);
             SoftmaxLayer
                 cpu = new SoftmaxLayer(TensorInfo.CreateLinear(250), 127, WeightsInitializationMode.GlorotNormal, BiasInitializationMode.Gaussian),
                 gpu = new CuDnnSoftmaxLayer(cpu.InputInfo, cpu.OutputInfo.Size, cpu.Weights, cpu.Biases);
@@ -134,8 +134,8 @@ public void SoftmaxBackward()
         public void SoftmaxGradient()
         {
             float[,]
-                a = WeightsProvider.NewFullyConnectedWeights(400, 250, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 250),
-                delta = WeightsProvider.NewFullyConnectedWeights(400, 127, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 127);
+                a = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(400), 250, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 250),
+                delta = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(400), 127, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 127);
             SoftmaxLayer
                 cpu = new SoftmaxLayer(TensorInfo.CreateLinear(250), 127, WeightsInitializationMode.GlorotNormal, BiasInitializationMode.Gaussian),
                 gpu = new CuDnnSoftmaxLayer(cpu.InputInfo, cpu.OutputInfo.Size, cpu.Weights, cpu.Biases);
@@ -146,7 +146,7 @@ public void SoftmaxGradient()
         public unsafe void SoftmaxBackwardOutput()
         {
             float[,]
-                x = WeightsProvider.NewFullyConnectedWeights(400, 250, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 250),
+                x = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(400), 250, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 250),
                 y = new float[400, 127];
             for (int i = 0; i < 400; i++)
                 y[i, ThreadSafeRandom.NextInt(max: 127)] = 1;
@@ -175,7 +175,7 @@ public unsafe void SoftmaxBackwardOutput()
         [TestMethod]
         public void ConvolutionForward()
         {
-            float[,] x = WeightsProvider.NewFullyConnectedWeights(127, 58 * 58 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(127, 58 * 58 * 3);
+            float[,] x = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(127), 58 * 58 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(127, 58 * 58 * 3);
             ConvolutionalLayer
                 cpu = new ConvolutionalLayer(new TensorInfo(58, 58, 3), ConvolutionInfo.Default, (5, 5), 20, ActivationFunctionType.LeakyReLU, BiasInitializationMode.Gaussian),
                 gpu = new CuDnnConvolutionalLayer(cpu.InputInfo, ConvolutionInfo.Default, cpu.KernelInfo, cpu.OutputInfo, cpu.Weights, cpu.Biases, cpu.ActivationFunctionType);
@@ -186,8 +186,8 @@ public void ConvolutionForward()
         public unsafe void ConvolutionBackward()
         {
             float[,]
-                delta_1 = WeightsProvider.NewFullyConnectedWeights(127, 54 * 54 * 20, WeightsInitializationMode.GlorotNormal).AsMatrix(127, 54 * 54 * 20),
-                z = WeightsProvider.NewFullyConnectedWeights(127, 58 * 58 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(127, 58 * 58 * 3);
+                delta_1 = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(127), 54 * 54 * 20, WeightsInitializationMode.GlorotNormal).AsMatrix(127, 54 * 54 * 20),
+                z = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(127), 58 * 58 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(127, 58 * 58 * 3);
             ConvolutionalLayer
                 cpu = new ConvolutionalLayer(new TensorInfo(58, 58, 3), ConvolutionInfo.Default, (5, 5), 20, ActivationFunctionType.LeCunTanh, BiasInitializationMode.Gaussian),
                 gpu = new CuDnnConvolutionalLayer(cpu.InputInfo, ConvolutionInfo.Default, cpu.KernelInfo, cpu.OutputInfo, cpu.Weights, cpu.Biases, ActivationFunctionType.LeCunTanh);
@@ -228,7 +228,7 @@ public void ConvolutionGradient()
         [TestMethod]
         public void PoolingForward()
         {
-            float[,] x = WeightsProvider.NewFullyConnectedWeights(400, 58 * 58 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 58 * 58 * 3);
+            float[,] x = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(400), 58 * 58 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 58 * 58 * 3);
             PoolingLayer
                 cpu = new PoolingLayer(new TensorInfo(58, 58, 3), PoolingInfo.Default, ActivationFunctionType.LeakyReLU),
                 gpu = new CuDnnPoolingLayer(cpu.InputInfo, PoolingInfo.Default, ActivationFunctionType.LeakyReLU);
@@ -236,15 +236,41 @@ public void PoolingForward()
         }
 
         [TestMethod]
-        public void PoolingBackward()
+        public unsafe void PoolingBackward()
         {
-            float[,]
-                delta_1 = WeightsProvider.NewFullyConnectedWeights(400, 29 * 29 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 29 * 29 * 3),
-                z = WeightsProvider.NewFullyConnectedWeights(400, 58 * 58 * 3, WeightsInitializationMode.GlorotNormal).AsMatrix(400, 58 * 58 * 3);
+            // Setup
+            Tensor.New(400, 58 * 58 * 3, out Tensor x);
+            KerasWeightsProvider.FillWithHeEtAlUniform(x, 10);
             PoolingLayer
                 cpu = new PoolingLayer(new TensorInfo(58, 58, 3), PoolingInfo.Default, ActivationFunctionType.LeakyReLU),
                 gpu = new CuDnnPoolingLayer(cpu.InputInfo, PoolingInfo.Default, ActivationFunctionType.LeakyReLU);
-            TestBackward(cpu, gpu, delta_1, z);
+            gpu.Forward(x, out Tensor z, out Tensor a);
+            a.Free();
+            x.Duplicate(out Tensor x2);
+            Tensor.New(z.Entities, z.Length, out Tensor delta);
+            KerasWeightsProvider.FillWithHeEtAlUniform(delta, 10);
+
+            // Backward
+            cpu.Backpropagate(delta, x, ActivationFunctions.LeakyReLUPrime);
+            gpu.Backpropagate(delta, x2, ActivationFunctions.LeakyReLUPrime);
+            bool valid = true;
+            float* px = (float*)x.Ptr.ToPointer(), px2 = (float*)x2.Ptr.ToPointer();
+            int count = 0;
+            for (int i = 0; i < x.Size; i++)
+            {
+                if (px[i].EqualsWithDelta(px2[i], 1e-5f)) continue;
+                if (px[i].EqualsWithDelta(px2[i] * 100f, 1e-5f)) count++;   // The cuDNN pooling backwards method returns a value scaled by 0.01 from time to time for some reason (less than 2% anyways)
+                else
+                {
+                    valid = false;
+                    break;
+                }
+            }
+            Assert.IsTrue(valid && count * 100f / x.Size < 2);
+            x.Free();
+            x2.Free();
+            z.Free();
+            delta.Free();
         }
 
         #endregion
diff --git a/Unit/NeuralNetwork.NET.Cuda.Unit/GpuExtensionsTest.cs b/Unit/NeuralNetwork.NET.Cuda.Unit/GpuExtensionsTest.cs
new file mode 100644
index 0000000..9531146
--- /dev/null
+++ b/Unit/NeuralNetwork.NET.Cuda.Unit/GpuExtensionsTest.cs
@@ -0,0 +1,55 @@
+﻿using Alea;
+using Microsoft.VisualStudio.TestTools.UnitTesting;
+using NeuralNetworkNET.APIs.Structs;
+using NeuralNetworkNET.Cuda.Extensions;
+using NeuralNetworkNET.Extensions;
+
+namespace NeuralNetworkNET.Cuda.Unit
+{
+    /// <summary>
+    /// Test class for the cuDNN GPU extension methods
+    /// </summary>
+    [TestClass]
+    [TestCategory(nameof(GpuExtensionsTest))]
+    public class GpuExtensionsTest
+    {
+        [TestMethod]
+        public void CopyToRows()
+        {
+            float[] test = {1,2,3,4,5,6,7,8,9};
+            Tensor.NewZeroed(3, 10, out Tensor tensor);
+            Gpu gpu = Gpu.Default;
+            using (DeviceMemory<float> m_gpu = gpu.AllocateDevice(test))
+            {
+                m_gpu.CopyTo(tensor, 5, 3);
+            }
+            float[,] expected =
+            {
+                { 0, 0, 0, 0, 0, 1, 2, 3, 0, 0 },
+                { 0, 0, 0, 0, 0, 4, 5, 6, 0, 0 },
+                { 0, 0, 0, 0, 0, 7, 8, 9, 0, 0 }
+            };
+            Assert.IsTrue(tensor.ToArray2D().ContentEquals(expected));
+        }
+
+        [TestMethod]
+        public void AllocateDeviceRows()
+        {
+            float[,] source =
+            {
+                { 0, 0, 0, 0, 0, 1, 2, 3, 0, 0 },
+                { 0, 0, 0, 0, 0, 4, 5, 6, 0, 0 },
+                { 0, 0, 0, 0, 0, 7, 8, 9, 0, 0 }
+            };
+            Tensor.From(source, out Tensor tensor);
+            Gpu gpu = Gpu.Default;
+            using (DeviceMemory<float> m_gpu = gpu.AllocateDevice(tensor, 5, 3))
+            {
+                float[]
+                    copy = Gpu.CopyToHost(m_gpu),
+                    expected = { 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+                Assert.IsTrue(copy.ContentEquals(expected));
+            }
+        }
+    }
+}
diff --git a/Unit/NeuralNetwork.NET.Unit/SerializationTest.cs b/Unit/NeuralNetwork.NET.Unit/SerializationTest.cs
index 4ae03d4..f646e86 100644
--- a/Unit/NeuralNetwork.NET.Unit/SerializationTest.cs
+++ b/Unit/NeuralNetwork.NET.Unit/SerializationTest.cs
@@ -1,6 +1,4 @@
-﻿using System;
-using System.IO;
-using System.Linq;
+﻿using System.IO;
 using Microsoft.VisualStudio.TestTools.UnitTesting;
 using NeuralNetworkNET.APIs;
 using NeuralNetworkNET.APIs.Enums;
@@ -50,7 +48,7 @@ public void StreamSerialize()
         {
             using (MemoryStream stream = new MemoryStream())
             {
-                float[] w = WeightsProvider.NewFullyConnectedWeights(784, 30, WeightsInitializationMode.GlorotNormal);
+                float[] w = WeightsProvider.NewFullyConnectedWeights(TensorInfo.CreateLinear(784), 30, WeightsInitializationMode.GlorotNormal);
                 stream.WriteShuffled(w);
                 Assert.IsTrue(stream.Position == sizeof(float) * w.Length);
                 stream.Seek(0, SeekOrigin.Begin);