Sergio0694 · Sergio0694 · Dec 29, 2017 · Dec 23, 2017 · Dec 24, 2017 · Dec 24, 2017
diff --git a/...twork.NET.Cuda/APIS/CuDnnNetworkLayers.cs → ...twork.NET.Cuda/APIs/CuDnnNetworkLayers.cs b/...twork.NET.Cuda/APIS/CuDnnNetworkLayers.cs → ...twork.NET.Cuda/APIs/CuDnnNetworkLayers.cs
@@ -67,5 +67,18 @@ public static INetworkLayer Convolutional(
         [PublicAPI]
         [Pure, NotNull]
         public static INetworkLayer Pooling(in TensorInfo input, in PoolingInfo info, ActivationFunctionType activation) => new CuDnnPoolingLayer(input, info, activation);
+
+        /// <summary>
+        /// Builds an inception layer that runs on cuDNN, for the given input volume and inception settings
+        /// </summary>
+        /// <param name="input">The description of the input volume the layer will process</param>
+        /// <param name="info">The description of the operations to perform inside the inception layer</param>
+        /// <param name="biasMode">The initialization mode to use for the bias values of the layer (zero by default)</param>
+        /// <returns>A new <see cref="CuDnnInceptionLayer"/> instance, exposed through the <see cref="INetworkLayer"/> interface</returns>
+        [PublicAPI, Pure, NotNull]
+        public static INetworkLayer Inception(in TensorInfo input, in InceptionInfo info, BiasInitializationMode biasMode = BiasInitializationMode.Zero)
+        {
+            return new CuDnnInceptionLayer(input, info, biasMode);
+        }
     }
 }
diff --git a/NeuralNetwork.NET.Cuda/APIs/CuDnnNetworkLayersDeserializer.cs b/NeuralNetwork.NET.Cuda/APIs/CuDnnNetworkLayersDeserializer.cs
@@ -31,6 +31,7 @@ private static INetworkLayer Deserialize([NotNull] Stream stream, LayerType type
                 case LayerType.Convolutional: return CuDnnConvolutionalLayer.Deserialize(stream);
                 case LayerType.Pooling: return CuDnnPoolingLayer.Deserialize(stream);
                 case LayerType.Softmax: return CuDnnSoftmaxLayer.Deserialize(stream);
+                case LayerType.Inception: return CuDnnInceptionLayer.Deserialize(stream);
                 default: return null;
             }
         } 

diff --git a/NeuralNetwork.NET.Cuda/Extensions/GpuExtensions.cs b/NeuralNetwork.NET.Cuda/Extensions/GpuExtensions.cs
@@ -27,6 +27,39 @@ public static DeviceMemory<float> AllocateDevice([NotNull] this Gpu gpu, in Tens
                 : throw new InvalidOperationException($"Failed to copy the source data on the target GPU device, [CUDA ERROR] {result}");
         }
 
+        /// <summary>
+        /// Allocates a memory area on device memory and fills it with a column slice of the input <see cref="Tensor"/>, reading <paramref name="length"/> values from each row starting at <paramref name="offset"/>
+        /// </summary>
+        /// <param name="gpu">The <see cref="Gpu"/> device to use</param>
+        /// <param name="source">The source <see cref="Tensor"/> with the data to copy</param>
+        /// <param name="offset">The column offset for the data to read from each row</param>
+        /// <param name="length">The number of values to read from each row</param>
+        /// <returns>A device buffer with <paramref name="length"/> values for each of the entities in <paramref name="source"/></returns>
+        /// <exception cref="ArgumentOutOfRangeException">The offset or the length are negative, or they exceed the length of the source rows</exception>
+        /// <exception cref="InvalidOperationException">The CUDA memory copy didn't complete successfully</exception>
+        [MustUseReturnValue, NotNull]
+        public static unsafe DeviceMemory<float> AllocateDevice([NotNull] this Gpu gpu, in Tensor source, int offset, int length)
+        {
+            if (length < 0) throw new ArgumentOutOfRangeException(nameof(length), "The input length isn't valid");
+            if (offset < 0 || source.Length - offset < length) throw new ArgumentOutOfRangeException(nameof(offset), "The input offset isn't valid"); // A negative offset would pass the range test on its own
+            DeviceMemory<float> result_gpu = gpu.AllocateDevice<float>(source.Entities * length);
+            CUDAInterop.CUDA_MEMCPY2D_st copy = new CUDAInterop.CUDA_MEMCPY2D_st
+            {
+                srcMemoryType = CUDAInterop.CUmemorytype_enum.CU_MEMORYTYPE_HOST,
+                srcHost = source.Ptr + sizeof(float) * offset, // First value to read in the first row
+                srcPitch = new IntPtr(sizeof(float) * source.Length), // The source rows keep their full length
+                dstMemoryType = CUDAInterop.CUmemorytype_enum.CU_MEMORYTYPE_DEVICE,
+                dstDevice = result_gpu.Handle,
+                dstPitch = new IntPtr(sizeof(float) * length), // The destination rows only contain the copied values
+                WidthInBytes = new IntPtr(sizeof(float) * length),
+                Height = new IntPtr(source.Entities)
+            };
+            CUDAInterop.cudaError_enum result = CUDAInterop.cuMemcpy2D(&copy);
+            if (result == CUDAInterop.cudaError_enum.CUDA_SUCCESS) return result_gpu;
+            result_gpu.Dispose(); // The buffer is never handed to the caller on failure, so it's released here to avoid a leak
+            throw new InvalidOperationException($"Failed to copy the source data on the given destination, [CUDA ERROR] {result}");
+        }
+
         /// <summary>
         /// Copies the contents of the input <see cref="DeviceMemory{T}"/> instance to the target host memory area
         /// </summary>
@@ -40,6 +73,37 @@ public static void CopyTo([NotNull] this DeviceMemory<float> source, in Tensor d
                 throw new InvalidOperationException($"Failed to copy the source data on the given destination, [CUDA ERROR] {result}");
         }
 
+        /// <summary>
+        /// Copies the source data into the target <see cref="Tensor"/>, writing the values of each individual entry into its own row, starting at the given column offset
+        /// </summary>
+        /// <param name="source">The source memory area with the concatenated data for each entry</param>
+        /// <param name="destination">The destination <see cref="Tensor"/> that will store the data</param>
+        /// <param name="offset">The column offset for the data for each entry</param>
+        /// <param name="length">The number of values to copy for each entry</param>
+        /// <exception cref="ArgumentOutOfRangeException">The length isn't positive or doesn't match the source size, or the offset is negative or doesn't fit in the destination rows</exception>
+        /// <exception cref="InvalidOperationException">The CUDA memory copy didn't complete successfully</exception>
+        public static unsafe void CopyTo([NotNull] this DeviceMemory<float> source, in Tensor destination, int offset, int length)
+        {
+            // Checks (the length is tested first, as a zero value would cause a division by zero in the size comparison)
+            if (length <= 0 || source.Length / length != destination.Entities) throw new ArgumentOutOfRangeException(nameof(length), "The input length doesn't match the given arguments");
+            if (offset < 0 || destination.Length - offset < length) throw new ArgumentOutOfRangeException(nameof(offset), "The input offset isn't valid");
+            // Memory copy: the source rows only contain the values to copy, the destination rows keep their full length
+            CUDAInterop.CUDA_MEMCPY2D_st copy = new CUDAInterop.CUDA_MEMCPY2D_st
+            {
+                srcMemoryType = CUDAInterop.CUmemorytype_enum.CU_MEMORYTYPE_DEVICE,
+                srcDevice = source.Handle,
+                srcPitch = new IntPtr(sizeof(float) * length),
+                dstMemoryType = CUDAInterop.CUmemorytype_enum.CU_MEMORYTYPE_HOST,
+                dstHost = destination.Ptr + sizeof(float) * offset, // First value to overwrite in the first row
+                dstPitch = new IntPtr(sizeof(float) * destination.Length),
+                WidthInBytes = new IntPtr(sizeof(float) * length),
+                Height = new IntPtr(destination.Entities)
+            };
+            CUDAInterop.cudaError_enum result = CUDAInterop.cuMemcpy2D(&copy);
+            if (result != CUDAInterop.cudaError_enum.CUDA_SUCCESS)
+                throw new InvalidOperationException($"Failed to copy the source data on the given destination, [CUDA ERROR] {result}");
+        }
+
         /// <summary>
         /// Copies the contents of the input <see cref="DeviceMemory{T}"/> to a new memory area on the unmanaged heap
         /// </summary>

diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnConvolutionalLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnConvolutionalLayer.cs
@@ -47,9 +47,7 @@ internal sealed class CuDnnConvolutionalLayer : ConvolutionalLayer
         [NotNull]
         private readonly Dnn DnnInstance = DnnService.Instance;
 
-        /// <summary>
-        /// Sets the cuDNN fields that will be used during future forward/backwards operations
-        /// </summary>
+        // cuDNN fields setup
         private void SetupCuDnnInfo()
         {
             ConvolutionDescription.Set2D(OperationInfo.VerticalPadding, OperationInfo.HorizontalPadding, OperationInfo.VerticalStride, OperationInfo.HorizontalStride, 1, 1, (Alea.cuDNN.ConvolutionMode)OperationInfo.Mode);
@@ -74,71 +72,63 @@ public CuDnnConvolutionalLayer(
         #region Implementation
 
         /// <inheritdoc/>
-        public override unsafe void Forward(in Tensor x, out Tensor z, out Tensor a)
+        public override void Forward(in Tensor x, out Tensor z, out Tensor a)
         {
-            fixed (float* pw = Weights)
+            using (DeviceMemory<float> z_gpu = DnnInstance.Gpu.AllocateDevice<float>(x.Entities * OutputInfo.Size)) // Receives the convolution output, then is reused by the bias and activation steps below
             {
-                Tensor.Reshape(pw, OutputInfo.Channels, KernelInfo.Size, out Tensor wTensor);
-                using (DeviceMemory<float> z_gpu = DnnInstance.Gpu.AllocateDevice<float>(x.Entities * OutputInfo.Size))
+                // Tensors info setup: both descriptors are set here with the number of entities in the current input
+                InputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, InputInfo.Channels, InputInfo.Height, InputInfo.Width);
+                OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OutputInfo.Channels, OutputInfo.Height, OutputInfo.Width);
+
+                // Forward convolution: requests an algorithm with the PREFER_FASTEST option, then the workspace size that algorithm needs
+                DnnInstance.GetConvolutionForwardAlgorithm(InputDescription, FilterDescription, ConvolutionDescription, OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionFwdAlgo algorithm);
+                DnnInstance.GetConvolutionForwardWorkspaceSize(InputDescription, FilterDescription, ConvolutionDescription, OutputDescription, algorithm, out IntPtr size);
+                using (DeviceMemory<float>
+                    x_gpu = DnnInstance.Gpu.AllocateDevice(x),
+                    w_gpu = DnnInstance.Gpu.AllocateDevice(Weights))
+                using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
                 {
-                    // Tensors info setup
-                    InputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, InputInfo.Channels, InputInfo.Height, InputInfo.Width);
-                    OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OutputInfo.Channels, OutputInfo.Height, OutputInfo.Width);
-
-                    // Forward convolution
-                    DnnInstance.GetConvolutionForwardAlgorithm(InputDescription, FilterDescription, ConvolutionDescription, OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionFwdAlgo algorithm);
-                    DnnInstance.GetConvolutionForwardWorkspaceSize(InputDescription, FilterDescription, ConvolutionDescription, OutputDescription, algorithm, out IntPtr size);
-                    using (DeviceMemory<float>
-                        x_gpu = DnnInstance.Gpu.AllocateDevice(x),
-                        w_gpu = DnnInstance.Gpu.AllocateDevice(wTensor))
-                    using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
-                    {
-                        DnnInstance.ConvolutionForward(1, InputDescription, x_gpu.Ptr, FilterDescription, w_gpu.Ptr, ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, OutputDescription, z_gpu.Ptr);
-                    }
+                    DnnInstance.ConvolutionForward(1, InputDescription, x_gpu.Ptr, FilterDescription, w_gpu.Ptr, ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, OutputDescription, z_gpu.Ptr);
+                }
 
-                    // Biases
-                    using (DeviceMemory<float> b_gpu = DnnInstance.Gpu.AllocateDevice(Biases))
-                    {
-                        DnnInstance.AddTensor(1, BiasDescription, b_gpu.Ptr, 1, OutputDescription, z_gpu.Ptr);
-                    }
-                    z_gpu.CopyToHost(x.Entities, OutputInfo.Size, out z);
+                // Biases: AddTensor receives the bias buffer and z_gpu, presumably accumulating the biases into z_gpu in place (TODO confirm)
+                using (DeviceMemory<float> b_gpu = DnnInstance.Gpu.AllocateDevice(Biases))
+                {
+                    DnnInstance.AddTensor(1, BiasDescription, b_gpu.Ptr, 1, OutputDescription, z_gpu.Ptr);
+                }
+                z_gpu.CopyToHost(x.Entities, OutputInfo.Size, out z); // z is read back here, before the activation below is run with z_gpu as both of its buffers
 
-                    // Activation
-                    if (ActivationFunctionType == ActivationFunctionType.Identity) z.Duplicate(out a);
-                    else
-                    {
-                        DnnInstance.ActivationForward(z.Entities, z.Length, z_gpu.Ptr, z_gpu.Ptr, ActivationFunctions.Activation);
-                        z_gpu.CopyToHost(z.Entities, z.Length, out a);
-                    }
+                // Activation: with the identity function a is produced from z through Duplicate, otherwise it's computed on the device and read back
+                if (ActivationFunctionType == ActivationFunctionType.Identity) z.Duplicate(out a);
+                else
+                {
+                    DnnInstance.ActivationForward(z.Entities, z.Length, z_gpu.Ptr, z_gpu.Ptr, ActivationFunctions.Activation);
+                    z_gpu.CopyToHost(z.Entities, z.Length, out a);
                 }
             }
         }
 
         /// <inheritdoc/>
-        public override unsafe void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFunction activationPrime)
+        public override void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFunction activationPrime)
         {
-            fixed (float* pw = Weights)
+            using (DeviceMemory<float> delta_gpu = DnnInstance.Gpu.AllocateDevice<float>(z.Size)) // Receives the result of the backward data convolution
             {
-                Tensor.Reshape(pw, OutputInfo.Channels, KernelInfo.Size, out Tensor wTensor);
+                // Convolution: requests a backward data algorithm with PREFER_FASTEST and its workspace size (NOTE(review): the descriptors aren't set here, presumably a previous Forward call did it, confirm)
                 DnnInstance.GetConvolutionBackwardDataAlgorithm(FilterDescription, OutputDescription, ConvolutionDescription, InputDescription, ConvolutionBwdDataPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionBwdDataAlgo algorithm);
                 DnnInstance.GetConvolutionBackwardDataWorkspaceSize(FilterDescription, OutputDescription, ConvolutionDescription, InputDescription, algorithm, out IntPtr size);
-                using (DeviceMemory<float> delta_gpu = DnnInstance.Gpu.AllocateDevice<float>(z.Size))
+                using (DeviceMemory<float>
+                    delta_1_gpu = DnnInstance.Gpu.AllocateDevice(delta_1),
+                    w_gpu = DnnInstance.Gpu.AllocateDevice(Weights))
+                using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
                 {
-                    // Backwards convolution
-                    using (DeviceMemory<float>
-                        delta_1_gpu = DnnInstance.Gpu.AllocateDevice(delta_1),
-                        w_gpu = DnnInstance.Gpu.AllocateDevice(wTensor))
-                    using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
-                    {
-                        DnnInstance.ConvolutionBackwardData(1, FilterDescription, w_gpu.Ptr, OutputDescription, delta_1_gpu.Ptr, ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, InputDescription, delta_gpu.Ptr);
-                    }
+                    DnnInstance.ConvolutionBackwardData(1, FilterDescription, w_gpu.Ptr, OutputDescription, delta_1_gpu.Ptr, ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, InputDescription, delta_gpu.Ptr);
+                }
 
-                    // Activation
-                    using (DeviceMemory<float> z_gpu = DnnInstance.Gpu.AllocateDevice(z))
-                    {
-                        DnnInstance.ActivationBackward(z.Entities, z.Length, z_gpu.Ptr, delta_gpu.Ptr, activationPrime);
-                        z_gpu.CopyTo(z);
-                    }
+                // Activation: z is uploaded, passed to ActivationBackward along with delta_gpu, then the contents of z_gpu are copied back over z
+                using (DeviceMemory<float> z_gpu = DnnInstance.Gpu.AllocateDevice(z))
+                {
+                    DnnInstance.ActivationBackward(z.Entities, z.Length, z_gpu.Ptr, delta_gpu.Ptr, activationPrime);
+                    z_gpu.CopyTo(z);
                 }
             }
         }
@@ -159,7 +149,7 @@ public override void ComputeGradient(in Tensor a, in Tensor delta, out Tensor dJ
                     {
                         DnnInstance.ConvolutionBackwardFilter(1, InputDescription, a_gpu.Ptr, OutputDescription, delta_gpu.Ptr, ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, FilterDescription, w_gpu.Ptr);
                     }
-                    w_gpu.CopyToHost(Kernels, KernelInfo.Size, out dJdw);
+                    w_gpu.CopyToHost(1, Weights.Length, out dJdw);
                 }
 
                 // Bias

diff --git a/NeuralNetwork.NET.Cuda/Layers/CuDnnFullyConnectedLayer.cs b/NeuralNetwork.NET.Cuda/Layers/CuDnnFullyConnectedLayer.cs
@@ -30,39 +30,31 @@ public CuDnnFullyConnectedLayer(in TensorInfo input, int neurons, [NotNull] floa
         #region Implementation
 
         /// <inheritdoc/>
-        public override unsafe void Forward(in Tensor x, out Tensor z, out Tensor a)
+        public override void Forward(in Tensor x, out Tensor z, out Tensor a)
         {
-            fixed (float* pw = Weights)
+            using (DeviceMemory<float>
+                x_gpu = DnnInstance.Gpu.AllocateDevice(x),
+                w_gpu = DnnInstance.Gpu.AllocateDevice(Weights),
+                y_gpu = DnnInstance.Gpu.AllocateDevice<float>(x.Entities * OutputInfo.Size), // Output buffer, also passed to the activation below as both of its buffers
+                b_gpu = DnnInstance.Gpu.AllocateDevice(Biases))
             {
-                Tensor.Reshape(pw, InputInfo.Size, OutputInfo.Size, out Tensor wTensor);
-                using (DeviceMemory<float>
-                    x_gpu = DnnInstance.Gpu.AllocateDevice(x),
-                    w_gpu = DnnInstance.Gpu.AllocateDevice(wTensor),
-                    y_gpu = DnnInstance.Gpu.AllocateDevice<float>(x.Entities * OutputInfo.Size),
-                    b_gpu = DnnInstance.Gpu.AllocateDevice(Biases))
-                {
-                    DnnInstance.FullyConnectedForward(x.Entities, x.Length, OutputInfo.Size, x_gpu.Ptr, w_gpu.Ptr, b_gpu.Ptr, y_gpu.Ptr);
-                    y_gpu.CopyToHost(x.Entities, OutputInfo.Size, out z);
-                    DnnInstance.ActivationForward(z.Entities, z.Length, y_gpu.Ptr, y_gpu.Ptr, ActivationFunctions.Activation);
-                    y_gpu.CopyToHost(z.Entities, z.Length, out a);
-                }
+                DnnInstance.FullyConnectedForward(x.Entities, x.Length, OutputInfo.Size, x_gpu.Ptr, w_gpu.Ptr, b_gpu.Ptr, y_gpu.Ptr);
+                y_gpu.CopyToHost(x.Entities, OutputInfo.Size, out z); // z is read back before the activation is run on y_gpu
+                DnnInstance.ActivationForward(z.Entities, z.Length, y_gpu.Ptr, y_gpu.Ptr, ActivationFunctions.Activation);
+                y_gpu.CopyToHost(z.Entities, z.Length, out a); // a is read back from the same device buffer, with the same size as z
             }
         }
 
         /// <inheritdoc/>
-        public override unsafe void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFunction activationPrime)
+        public override void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFunction activationPrime)
         {
-            fixed (float* pw = Weights)
+            using (DeviceMemory<float>
+                delta_1_gpu = DnnInstance.Gpu.AllocateDevice(delta_1),
+                w_gpu = DnnInstance.Gpu.AllocateDevice(Weights),
+                z_gpu = DnnInstance.Gpu.AllocateDevice(z))
             {
-                Tensor.Reshape(pw, InputInfo.Size, OutputInfo.Size, out Tensor wTensor);
-                using (DeviceMemory<float>
-                    delta_1_gpu = DnnInstance.Gpu.AllocateDevice(delta_1),
-                    w_gpu = DnnInstance.Gpu.AllocateDevice(wTensor),
-                    z_gpu = DnnInstance.Gpu.AllocateDevice(z))
-                {
-                    DnnInstance.FullyConnectedBackwardData(z.Entities, InputInfo.Size, OutputInfo.Size, z_gpu.Ptr, delta_1_gpu.Ptr, w_gpu.Ptr, activationPrime);
-                    z_gpu.CopyTo(z);
-                }
+                DnnInstance.FullyConnectedBackwardData(z.Entities, InputInfo.Size, OutputInfo.Size, z_gpu.Ptr, delta_1_gpu.Ptr, w_gpu.Ptr, activationPrime); // NOTE(review): presumably leaves its result in z_gpu, given the copy below (confirm)
+                z_gpu.CopyTo(z); // Overwrites the host tensor z with the contents of z_gpu
             }
         }
 
@@ -75,7 +67,7 @@ public override void ComputeGradient(in Tensor a, in Tensor delta, out Tensor dJ
                 w_gpu = DnnInstance.Gpu.AllocateDevice<float>(a.Length * delta.Length))
             {
                 DnnInstance.FullyConnectedBackwardFilter(a.Entities, a.Length, delta.Length, a_gpu.Ptr, delta_gpu.Ptr, w_gpu.Ptr);
-                w_gpu.CopyToHost(a.Length, delta.Length, out dJdw);
+                w_gpu.CopyToHost(1, Weights.Length, out dJdw);
             }
             delta.CompressVertically(out dJdb); // Doing this on CPU is generally faster than launching the kernels
         }