Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature inception layer #48

Merged
merged 32 commits into from
Dec 29, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
dbd1893
Added InceptionInfo struct
Sergio0694 Dec 23, 2017
fd7f9ec
Temp move to APIs folder
Sergio0694 Dec 24, 2017
2cc4879
Name switched back to original (path fix)
Sergio0694 Dec 24, 2017
efd16e4
Merge pull request #46 from Sergio0694/tweak_case-path
Sergio0694 Dec 24, 2017
ed56e1f
Initial inception layer structure added
Sergio0694 Dec 24, 2017
9c9a79c
Added inception cuDNN base initialization
Sergio0694 Dec 24, 2017
9bc9e80
Added inception layer weights initialization, minor changes
Sergio0694 Dec 24, 2017
de8a9e7
Inception constructors and Clone method implemented
Sergio0694 Dec 24, 2017
f41a4a8
Inception layer forward method implemented (WIP)
Sergio0694 Dec 24, 2017
127e241
Minor code improvements to the CuDnn layers
Sergio0694 Dec 24, 2017
612ba59
Merge pull request #47 from Sergio0694/dev
Sergio0694 Dec 24, 2017
c16c249
Minor bug fixes
Sergio0694 Dec 24, 2017
6a82479
Minor improvements to the Tensor struct
Sergio0694 Dec 24, 2017
657c277
InceptionInfo struct improved, minor changes
Sergio0694 Dec 25, 2017
801b4ca
Inception layer forward method implemented
Sergio0694 Dec 25, 2017
7e6366b
Inception backpropagation 90% completed
Sergio0694 Dec 25, 2017
d1de1ab
GPU copy to rows and rows allocation methods improved
Sergio0694 Dec 26, 2017
07af79e
Inception layer delta loading fixed
Sergio0694 Dec 26, 2017
c6a84a6
Inception layer implementation finished (WIP)
Sergio0694 Dec 26, 2017
ceae801
Inception layer public API and serialization methods added
Sergio0694 Dec 26, 2017
393d01a
Minor fixes to the inception layer (WIP)
Sergio0694 Dec 26, 2017
84502d3
Ooops!
Sergio0694 Dec 26, 2017
bfe4a04
More fixes to the inception layer (WIP)
Sergio0694 Dec 26, 2017
096994a
Fixed convolution output size
Sergio0694 Dec 26, 2017
fa6c36d
Added initial inception layer tests
Sergio0694 Dec 26, 2017
b3e136c
Minor bug fixes, inception 5x5 test added
Sergio0694 Dec 27, 2017
c4a3966
Added inception layer pooling test, minor bug fixes
Sergio0694 Dec 27, 2017
ffa603e
Minor layer tweaks
Sergio0694 Dec 29, 2017
2ef571a
ContentEquals method improved with relative threshold
Sergio0694 Dec 29, 2017
968eba6
Inception layer bug fixes, more tests added
Sergio0694 Dec 29, 2017
fb738d1
CuDnnPoolingLayer backpropagation switched to cuDNN
Sergio0694 Dec 29, 2017
b84d991
Inception layer pool gradient test added
Sergio0694 Dec 29, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -67,5 +67,18 @@ public static INetworkLayer Convolutional(
[PublicAPI]
[Pure, NotNull]
public static INetworkLayer Pooling(in TensorInfo input, in PoolingInfo info, ActivationFunctionType activation) => new CuDnnPoolingLayer(input, info, activation);

/// <summary>
/// Builds an inception layer that processes the given input volume with the requested inner operations
/// </summary>
/// <param name="input">The input volume to process</param>
/// <param name="info">The info on the operations to execute inside the layer</param>
/// <param name="biasMode">Indicates the desired initialization mode to use for the layer bias values</param>
/// <returns>A new <see cref="CuDnnInceptionLayer"/> instance, exposed through the <see cref="INetworkLayer"/> interface</returns>
[PublicAPI]
[Pure, NotNull]
public static INetworkLayer Inception(
    in TensorInfo input, in InceptionInfo info,
    BiasInitializationMode biasMode = BiasInitializationMode.Zero)
{
    // The three arguments are forwarded unchanged to the layer constructor
    return new CuDnnInceptionLayer(input, info, biasMode);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ private static INetworkLayer Deserialize([NotNull] Stream stream, LayerType type
case LayerType.Convolutional: return CuDnnConvolutionalLayer.Deserialize(stream);
case LayerType.Pooling: return CuDnnPoolingLayer.Deserialize(stream);
case LayerType.Softmax: return CuDnnSoftmaxLayer.Deserialize(stream);
case LayerType.Inception: return CuDnnInceptionLayer.Deserialize(stream);
default: return null;
}
}
Expand Down
64 changes: 64 additions & 0 deletions NeuralNetwork.NET.Cuda/Extensions/GpuExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,39 @@ public static DeviceMemory<float> AllocateDevice([NotNull] this Gpu gpu, in Tens
: throw new InvalidOperationException($"Failed to copy the source data on the target GPU device, [CUDA ERROR] {result}");
}

/// <summary>
/// Allocates a memory area on device memory, reading the target values at a given offset from the input <see cref="Tensor"/>
/// </summary>
/// <param name="gpu">The <see cref="Gpu"/> device to use</param>
/// <param name="source">The source <see cref="Tensor"/> with the data to copy</param>
/// <param name="offset">The column offset for the data to read from each row</param>
/// <param name="length">The number of values to read from each row, starting at <paramref name="offset"/></param>
/// <returns>A device buffer with <c>source.Entities * length</c> values, where the slices read from each row are stored contiguously</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="offset"/> is negative, <paramref name="length"/> is not positive,
/// or the requested slice exceeds the length of a row in <paramref name="source"/>
/// </exception>
/// <exception cref="InvalidOperationException">The CUDA 2D memory copy reported an error</exception>
[MustUseReturnValue, NotNull]
public static unsafe DeviceMemory<float> AllocateDevice([NotNull] this Gpu gpu, in Tensor source, int offset, int length)
{
    // Checks: a negative offset would move the host pointer before the start of the source buffer
    if (offset < 0) throw new ArgumentOutOfRangeException(nameof(offset), "The input offset isn't valid");
    if (length <= 0) throw new ArgumentOutOfRangeException(nameof(length), "The input length isn't valid");
    if (source.Length - offset < length) throw new ArgumentOutOfRangeException(nameof(offset), "The input offset isn't valid");

    // Memory copy
    DeviceMemory<float> result_gpu = gpu.AllocateDevice<float>(source.Entities * length);
    CUDAInterop.CUDA_MEMCPY2D_st* ptSt = stackalloc CUDAInterop.CUDA_MEMCPY2D_st[1];
    ptSt[0] = new CUDAInterop.CUDA_MEMCPY2D_st
    {
        // Source: host rows are source.Length floats apart, reading starts offset floats into each row
        srcMemoryType = CUDAInterop.CUmemorytype_enum.CU_MEMORYTYPE_HOST,
        srcHost = source.Ptr + sizeof(float) * offset,
        srcPitch = new IntPtr(sizeof(float) * source.Length),

        // Destination: device rows are tightly packed, each one exactly length floats wide
        dstMemoryType = CUDAInterop.CUmemorytype_enum.CU_MEMORYTYPE_DEVICE,
        dstDevice = result_gpu.Handle,
        dstPitch = new IntPtr(sizeof(float) * length),

        // One row of length floats for each entity in the source tensor
        WidthInBytes = new IntPtr(sizeof(float) * length),
        Height = new IntPtr(source.Entities)
    };
    CUDAInterop.cudaError_enum result = CUDAInterop.cuMemcpy2D(ptSt);
    if (result == CUDAInterop.cudaError_enum.CUDA_SUCCESS) return result_gpu;

    // The buffer is never handed to the caller on failure, so release it here to avoid leaking device memory
    result_gpu.Dispose();
    throw new InvalidOperationException($"Failed to copy the source data on the given destination, [CUDA ERROR] {result}");
}

/// <summary>
/// Copies the contents of the input <see cref="DeviceMemory{T}"/> instance to the target host memory area
/// </summary>
Expand All @@ -40,6 +73,37 @@ public static void CopyTo([NotNull] this DeviceMemory<float> source, in Tensor d
throw new InvalidOperationException($"Failed to copy the source data on the given destination, [CUDA ERROR] {result}");
}

/// <summary>
/// Copies the source data into the target <see cref="Tensor"/>, splitting each individual entry into its own row
/// </summary>
/// <param name="source">The source memory area with the concatenated data for each entry</param>
/// <param name="destination">The destination <see cref="Tensor"/> that will store the data</param>
/// <param name="offset">The column offset for the data for each entry</param>
/// <param name="length">The number of values to copy for each entry</param>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="length"/> is not positive, <paramref name="source"/> doesn't contain exactly
/// <paramref name="length"/> values for each row in <paramref name="destination"/>, <paramref name="offset"/> is negative,
/// or the target slice exceeds the length of a row in <paramref name="destination"/>
/// </exception>
/// <exception cref="InvalidOperationException">The CUDA 2D memory copy reported an error</exception>
public static unsafe void CopyTo([NotNull] this DeviceMemory<float> source, in Tensor destination, int offset, int length)
{
    // Checks: validate length first, and compare the sizes with a product instead of a truncating division,
    // so that a zero length or a source size that isn't a multiple of length are both rejected explicitly
    if (length <= 0) throw new ArgumentOutOfRangeException(nameof(length), "The input length doesn't match the given arguments");
    if (source.Length != (long)destination.Entities * length) throw new ArgumentOutOfRangeException(nameof(length), "The input length doesn't match the given arguments");

    // A negative offset would make the copy write before the start of the destination buffer
    if (offset < 0) throw new ArgumentOutOfRangeException(nameof(offset), "The input offset isn't valid");
    if (destination.Length - offset < length) throw new ArgumentOutOfRangeException(nameof(offset), "The input offset isn't valid");

    // Memory copy
    CUDAInterop.CUDA_MEMCPY2D_st* ptSt = stackalloc CUDAInterop.CUDA_MEMCPY2D_st[1];
    ptSt[0] = new CUDAInterop.CUDA_MEMCPY2D_st
    {
        // Source: device rows are tightly packed, each one exactly length floats wide
        srcMemoryType = CUDAInterop.CUmemorytype_enum.CU_MEMORYTYPE_DEVICE,
        srcDevice = source.Handle,
        srcPitch = new IntPtr(sizeof(float) * length),

        // Destination: host rows are destination.Length floats apart, writing starts offset floats into each row
        dstMemoryType = CUDAInterop.CUmemorytype_enum.CU_MEMORYTYPE_HOST,
        dstHost = destination.Ptr + sizeof(float) * offset,
        dstPitch = new IntPtr(sizeof(float) * destination.Length),

        // One row of length floats for each entity in the destination tensor
        WidthInBytes = new IntPtr(sizeof(float) * length),
        Height = new IntPtr(destination.Entities)
    };
    CUDAInterop.cudaError_enum result = CUDAInterop.cuMemcpy2D(ptSt);
    if (result != CUDAInterop.cudaError_enum.CUDA_SUCCESS)
        throw new InvalidOperationException($"Failed to copy the source data on the given destination, [CUDA ERROR] {result}");
}

/// <summary>
/// Copies the contents of the input <see cref="DeviceMemory{T}"/> to a new memory area on the unmanaged heap
/// </summary>
Expand Down
96 changes: 43 additions & 53 deletions NeuralNetwork.NET.Cuda/Layers/CuDnnConvolutionalLayer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,7 @@ internal sealed class CuDnnConvolutionalLayer : ConvolutionalLayer
[NotNull]
private readonly Dnn DnnInstance = DnnService.Instance;

/// <summary>
/// Sets the cuDNN fields that will be used during future forward/backwards operations
/// </summary>
// cuDNN fields setup
private void SetupCuDnnInfo()
{
ConvolutionDescription.Set2D(OperationInfo.VerticalPadding, OperationInfo.HorizontalPadding, OperationInfo.VerticalStride, OperationInfo.HorizontalStride, 1, 1, (Alea.cuDNN.ConvolutionMode)OperationInfo.Mode);
Expand All @@ -74,71 +72,63 @@ public CuDnnConvolutionalLayer(
#region Implementation

/// <inheritdoc/>
public override unsafe void Forward(in Tensor x, out Tensor z, out Tensor a)
public override void Forward(in Tensor x, out Tensor z, out Tensor a)
{
fixed (float* pw = Weights)
using (DeviceMemory<float> z_gpu = DnnInstance.Gpu.AllocateDevice<float>(x.Entities * OutputInfo.Size))
{
Tensor.Reshape(pw, OutputInfo.Channels, KernelInfo.Size, out Tensor wTensor);
using (DeviceMemory<float> z_gpu = DnnInstance.Gpu.AllocateDevice<float>(x.Entities * OutputInfo.Size))
// Tensors info setup
InputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, InputInfo.Channels, InputInfo.Height, InputInfo.Width);
OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OutputInfo.Channels, OutputInfo.Height, OutputInfo.Width);

// Forward convolution
DnnInstance.GetConvolutionForwardAlgorithm(InputDescription, FilterDescription, ConvolutionDescription, OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionFwdAlgo algorithm);
DnnInstance.GetConvolutionForwardWorkspaceSize(InputDescription, FilterDescription, ConvolutionDescription, OutputDescription, algorithm, out IntPtr size);
using (DeviceMemory<float>
x_gpu = DnnInstance.Gpu.AllocateDevice(x),
w_gpu = DnnInstance.Gpu.AllocateDevice(Weights))
using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
{
// Tensors info setup
InputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, InputInfo.Channels, InputInfo.Height, InputInfo.Width);
OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OutputInfo.Channels, OutputInfo.Height, OutputInfo.Width);

// Forward convolution
DnnInstance.GetConvolutionForwardAlgorithm(InputDescription, FilterDescription, ConvolutionDescription, OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionFwdAlgo algorithm);
DnnInstance.GetConvolutionForwardWorkspaceSize(InputDescription, FilterDescription, ConvolutionDescription, OutputDescription, algorithm, out IntPtr size);
using (DeviceMemory<float>
x_gpu = DnnInstance.Gpu.AllocateDevice(x),
w_gpu = DnnInstance.Gpu.AllocateDevice(wTensor))
using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
{
DnnInstance.ConvolutionForward(1, InputDescription, x_gpu.Ptr, FilterDescription, w_gpu.Ptr, ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, OutputDescription, z_gpu.Ptr);
}
DnnInstance.ConvolutionForward(1, InputDescription, x_gpu.Ptr, FilterDescription, w_gpu.Ptr, ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, OutputDescription, z_gpu.Ptr);
}

// Biases
using (DeviceMemory<float> b_gpu = DnnInstance.Gpu.AllocateDevice(Biases))
{
DnnInstance.AddTensor(1, BiasDescription, b_gpu.Ptr, 1, OutputDescription, z_gpu.Ptr);
}
z_gpu.CopyToHost(x.Entities, OutputInfo.Size, out z);
// Biases
using (DeviceMemory<float> b_gpu = DnnInstance.Gpu.AllocateDevice(Biases))
{
DnnInstance.AddTensor(1, BiasDescription, b_gpu.Ptr, 1, OutputDescription, z_gpu.Ptr);
}
z_gpu.CopyToHost(x.Entities, OutputInfo.Size, out z);

// Activation
if (ActivationFunctionType == ActivationFunctionType.Identity) z.Duplicate(out a);
else
{
DnnInstance.ActivationForward(z.Entities, z.Length, z_gpu.Ptr, z_gpu.Ptr, ActivationFunctions.Activation);
z_gpu.CopyToHost(z.Entities, z.Length, out a);
}
// Activation
if (ActivationFunctionType == ActivationFunctionType.Identity) z.Duplicate(out a);
else
{
DnnInstance.ActivationForward(z.Entities, z.Length, z_gpu.Ptr, z_gpu.Ptr, ActivationFunctions.Activation);
z_gpu.CopyToHost(z.Entities, z.Length, out a);
}
}
}

/// <inheritdoc/>
public override unsafe void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFunction activationPrime)
public override void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFunction activationPrime)
{
fixed (float* pw = Weights)
using (DeviceMemory<float> delta_gpu = DnnInstance.Gpu.AllocateDevice<float>(z.Size))
{
Tensor.Reshape(pw, OutputInfo.Channels, KernelInfo.Size, out Tensor wTensor);
// Convolution
DnnInstance.GetConvolutionBackwardDataAlgorithm(FilterDescription, OutputDescription, ConvolutionDescription, InputDescription, ConvolutionBwdDataPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionBwdDataAlgo algorithm);
DnnInstance.GetConvolutionBackwardDataWorkspaceSize(FilterDescription, OutputDescription, ConvolutionDescription, InputDescription, algorithm, out IntPtr size);
using (DeviceMemory<float> delta_gpu = DnnInstance.Gpu.AllocateDevice<float>(z.Size))
using (DeviceMemory<float>
delta_1_gpu = DnnInstance.Gpu.AllocateDevice(delta_1),
w_gpu = DnnInstance.Gpu.AllocateDevice(Weights))
using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
{
// Backwards convolution
using (DeviceMemory<float>
delta_1_gpu = DnnInstance.Gpu.AllocateDevice(delta_1),
w_gpu = DnnInstance.Gpu.AllocateDevice(wTensor))
using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
{
DnnInstance.ConvolutionBackwardData(1, FilterDescription, w_gpu.Ptr, OutputDescription, delta_1_gpu.Ptr, ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, InputDescription, delta_gpu.Ptr);
}
DnnInstance.ConvolutionBackwardData(1, FilterDescription, w_gpu.Ptr, OutputDescription, delta_1_gpu.Ptr, ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, InputDescription, delta_gpu.Ptr);
}

// Activation
using (DeviceMemory<float> z_gpu = DnnInstance.Gpu.AllocateDevice(z))
{
DnnInstance.ActivationBackward(z.Entities, z.Length, z_gpu.Ptr, delta_gpu.Ptr, activationPrime);
z_gpu.CopyTo(z);
}
// Activation
using (DeviceMemory<float> z_gpu = DnnInstance.Gpu.AllocateDevice(z))
{
DnnInstance.ActivationBackward(z.Entities, z.Length, z_gpu.Ptr, delta_gpu.Ptr, activationPrime);
z_gpu.CopyTo(z);
}
}
}
Expand All @@ -159,7 +149,7 @@ public override void ComputeGradient(in Tensor a, in Tensor delta, out Tensor dJ
{
DnnInstance.ConvolutionBackwardFilter(1, InputDescription, a_gpu.Ptr, OutputDescription, delta_gpu.Ptr, ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, FilterDescription, w_gpu.Ptr);
}
w_gpu.CopyToHost(Kernels, KernelInfo.Size, out dJdw);
w_gpu.CopyToHost(1, Weights.Length, out dJdw);
}

// Bias
Expand Down
44 changes: 18 additions & 26 deletions NeuralNetwork.NET.Cuda/Layers/CuDnnFullyConnectedLayer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,39 +30,31 @@ public CuDnnFullyConnectedLayer(in TensorInfo input, int neurons, [NotNull] floa
#region Implementation

/// <inheritdoc/>
public override unsafe void Forward(in Tensor x, out Tensor z, out Tensor a)
public override void Forward(in Tensor x, out Tensor z, out Tensor a)
{
fixed (float* pw = Weights)
using (DeviceMemory<float>
x_gpu = DnnInstance.Gpu.AllocateDevice(x),
w_gpu = DnnInstance.Gpu.AllocateDevice(Weights),
y_gpu = DnnInstance.Gpu.AllocateDevice<float>(x.Entities * OutputInfo.Size),
b_gpu = DnnInstance.Gpu.AllocateDevice(Biases))
{
Tensor.Reshape(pw, InputInfo.Size, OutputInfo.Size, out Tensor wTensor);
using (DeviceMemory<float>
x_gpu = DnnInstance.Gpu.AllocateDevice(x),
w_gpu = DnnInstance.Gpu.AllocateDevice(wTensor),
y_gpu = DnnInstance.Gpu.AllocateDevice<float>(x.Entities * OutputInfo.Size),
b_gpu = DnnInstance.Gpu.AllocateDevice(Biases))
{
DnnInstance.FullyConnectedForward(x.Entities, x.Length, OutputInfo.Size, x_gpu.Ptr, w_gpu.Ptr, b_gpu.Ptr, y_gpu.Ptr);
y_gpu.CopyToHost(x.Entities, OutputInfo.Size, out z);
DnnInstance.ActivationForward(z.Entities, z.Length, y_gpu.Ptr, y_gpu.Ptr, ActivationFunctions.Activation);
y_gpu.CopyToHost(z.Entities, z.Length, out a);
}
DnnInstance.FullyConnectedForward(x.Entities, x.Length, OutputInfo.Size, x_gpu.Ptr, w_gpu.Ptr, b_gpu.Ptr, y_gpu.Ptr);
y_gpu.CopyToHost(x.Entities, OutputInfo.Size, out z);
DnnInstance.ActivationForward(z.Entities, z.Length, y_gpu.Ptr, y_gpu.Ptr, ActivationFunctions.Activation);
y_gpu.CopyToHost(z.Entities, z.Length, out a);
}
}

/// <inheritdoc/>
public override unsafe void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFunction activationPrime)
public override void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFunction activationPrime)
{
fixed (float* pw = Weights)
using (DeviceMemory<float>
delta_1_gpu = DnnInstance.Gpu.AllocateDevice(delta_1),
w_gpu = DnnInstance.Gpu.AllocateDevice(Weights),
z_gpu = DnnInstance.Gpu.AllocateDevice(z))
{
Tensor.Reshape(pw, InputInfo.Size, OutputInfo.Size, out Tensor wTensor);
using (DeviceMemory<float>
delta_1_gpu = DnnInstance.Gpu.AllocateDevice(delta_1),
w_gpu = DnnInstance.Gpu.AllocateDevice(wTensor),
z_gpu = DnnInstance.Gpu.AllocateDevice(z))
{
DnnInstance.FullyConnectedBackwardData(z.Entities, InputInfo.Size, OutputInfo.Size, z_gpu.Ptr, delta_1_gpu.Ptr, w_gpu.Ptr, activationPrime);
z_gpu.CopyTo(z);
}
DnnInstance.FullyConnectedBackwardData(z.Entities, InputInfo.Size, OutputInfo.Size, z_gpu.Ptr, delta_1_gpu.Ptr, w_gpu.Ptr, activationPrime);
z_gpu.CopyTo(z);
}
}

Expand All @@ -75,7 +67,7 @@ public override void ComputeGradient(in Tensor a, in Tensor delta, out Tensor dJ
w_gpu = DnnInstance.Gpu.AllocateDevice<float>(a.Length * delta.Length))
{
DnnInstance.FullyConnectedBackwardFilter(a.Entities, a.Length, delta.Length, a_gpu.Ptr, delta_gpu.Ptr, w_gpu.Ptr);
w_gpu.CopyToHost(a.Length, delta.Length, out dJdw);
w_gpu.CopyToHost(1, Weights.Length, out dJdw);
}
delta.CompressVertically(out dJdb); // Doing this on CPU is generally faster than launching the kernels
}
Expand Down
Loading