diff --git a/Settings.mk b/Settings.mk
index b2f7470..bb735c0 100644
--- a/Settings.mk
+++ b/Settings.mk
@@ -11,7 +11,7 @@ NNFORGE_WORKING_DATA_PATH=/home/max/nnforge/working_data
 BOOST_LIBS=-lboost_regex-mt -lboost_chrono-mt -lboost_filesystem-mt -lboost_program_options-mt -lboost_random-mt -lboost_system-mt -lboost_date_time-mt
 OPENCV_LIBS=-lopencv_highgui -lopencv_imgproc -lopencv_core
-CPP_FLAGS_COMMON=-ffast-math -march=native -mfpmath=sse -msse2 # -mavx
+CPP_FLAGS_COMMON=-rdynamic -ffast-math -march=native -mfpmath=sse -msse2 # -mavx
 CPP_FLAGS_DEBUG_MODE=-g
 CPP_FLAGS_RELEASE_MODE=-O3
@@ -22,5 +22,5 @@ CUDA_FLAGS_COMMON=-use_fast_math
 CUDA_FLAGS_ARCH_FERMI=-gencode=arch=compute_20,code=sm_20
 CUDA_FLAGS_ARCH_KEPLER=-gencode=arch=compute_30,code=sm_30 -gencode=arch=compute_35,code=\"sm_35,compute_35\"
 CUDA_FLAGS_DEBUG_MODE=-g -G -lineinfo
-CUDA_FLAGS_RELEASE_MODE=-O3
+CUDA_FLAGS_RELEASE_MODE=-O3 -lineinfo
diff --git a/nnforge/cuda/absolute_layer_updater_cuda.cu b/nnforge/cuda/absolute_layer_updater_cuda.cu
index dc6c4ba..3cf183c 100644
--- a/nnforge/cuda/absolute_layer_updater_cuda.cu
+++ b/nnforge/cuda/absolute_layer_updater_cuda.cu
@@ -81,6 +81,7 @@ namespace nnforge
 			const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer,
 			cuda_linear_buffer_device_smart_ptr output_neurons_buffer,
 			const std::vector& additional_buffers,
+			std::vector& dynamic_memobjects,
 			unsigned int entry_count)
 		{
 			int elem_count = (input_elem_count_per_entry * entry_count + 3) / 4;
@@ -102,6 +103,7 @@ namespace nnforge
 			cuda_linear_buffer_device_smart_ptr output_errors_buffer,
 			cuda_linear_buffer_device_smart_ptr input_errors_buffer,
 			const std::vector& additional_buffers,
+			std::vector& dynamic_memobjects,
 			unsigned int entry_count)
 		{
 			int elem_count = (input_elem_count_per_entry * entry_count + 3) / 4;
diff --git a/nnforge/cuda/absolute_layer_updater_cuda.h b/nnforge/cuda/absolute_layer_updater_cuda.h
index 763d89d..f928fc0 100644
--- a/nnforge/cuda/absolute_layer_updater_cuda.h
+++ b/nnforge/cuda/absolute_layer_updater_cuda.h
@@ -37,6 +37,7 @@ namespace nnforge
 				const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer,
 				cuda_linear_buffer_device_smart_ptr output_neurons_buffer,
 				const std::vector& additional_buffers,
+				std::vector& dynamic_memobjects,
 				unsigned int entry_count);

 			virtual void enqueue_backprop(
@@ -48,6 +49,7 @@ namespace nnforge
 				cuda_linear_buffer_device_smart_ptr output_errors_buffer,
 				cuda_linear_buffer_device_smart_ptr input_errors_buffer,
 				const std::vector& additional_buffers,
+				std::vector& dynamic_memobjects,
 				unsigned int entry_count);

 		protected:
diff --git a/nnforge/cuda/average_subsampling_2d_layer_updater_cuda.cu b/nnforge/cuda/average_subsampling_2d_layer_updater_cuda.cu
index 50bb6e8..65d335f 100644
--- a/nnforge/cuda/average_subsampling_2d_layer_updater_cuda.cu
+++ b/nnforge/cuda/average_subsampling_2d_layer_updater_cuda.cu
@@ -339,6 +339,7 @@ namespace nnforge
 			const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer,
 			cuda_linear_buffer_device_smart_ptr output_neurons_buffer,
 			const std::vector& additional_buffers,
+			std::vector& dynamic_memobjects,
 			unsigned int entry_count)
 		{
 			cudaChannelFormatDesc desc = cudaCreateChannelDesc();
@@ -380,6 +381,7 @@ namespace nnforge
 			cuda_linear_buffer_device_smart_ptr output_errors_buffer,
 			cuda_linear_buffer_device_smart_ptr input_errors_buffer,
 			const std::vector& additional_buffers,
+			std::vector& dynamic_memobjects,
 			unsigned int entry_count)
 		{
 			int output_elem_count_per_feature_map_aligned =
cuda_util::get_power2_aligned_size(output_configuration_specific.dimension_sizes[0]) * output_configuration_specific.dimension_sizes[1]; diff --git a/nnforge/cuda/average_subsampling_2d_layer_updater_cuda.h b/nnforge/cuda/average_subsampling_2d_layer_updater_cuda.h index 4045502..1d69543 100644 --- a/nnforge/cuda/average_subsampling_2d_layer_updater_cuda.h +++ b/nnforge/cuda/average_subsampling_2d_layer_updater_cuda.h @@ -37,6 +37,7 @@ namespace nnforge const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer, cuda_linear_buffer_device_smart_ptr output_neurons_buffer, const std::vector& additional_buffers, + std::vector& dynamic_memobjects, unsigned int entry_count); virtual void enqueue_backprop( @@ -48,6 +49,7 @@ namespace nnforge cuda_linear_buffer_device_smart_ptr output_errors_buffer, cuda_linear_buffer_device_smart_ptr input_errors_buffer, const std::vector& additional_buffers, + std::vector& dynamic_memobjects, unsigned int entry_count); protected: diff --git a/nnforge/cuda/convolution_2d_layer_updater_cuda.cu b/nnforge/cuda/convolution_2d_layer_updater_cuda.cu deleted file mode 100644 index 939ea72..0000000 --- a/nnforge/cuda/convolution_2d_layer_updater_cuda.cu +++ /dev/null @@ -1,969 +0,0 @@ -/* - * Copyright 2011-2013 Maxim Milakov - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "convolution_2d_layer_updater_cuda.h" - -#include - -#include - -#include "util_cuda.h" -#include "neural_network_cuda_exception.h" -#include "../convolution_layer.h" - -texture input_tex_ref; - -template -__global__ void convolution_2d_tex_upd_kernel( - float * __restrict output, - const float * __restrict weights, - const float * __restrict biases, - int output_width, - int output_height, - int input_width, - int input_height, - int window_width, - int window_height, - int input_feature_map_count, - int output_feature_map_count, - int texture_offset, - int entry_count) -{ - int x = blockIdx.x * blockDim.x + threadIdx.x; - int _y = blockIdx.y * blockDim.y + threadIdx.y; - int output_feature_map_id = _y / output_height; - int entry_id = blockIdx.z * blockDim.z + threadIdx.z; - - bool in_bounds = (entry_id < entry_count) && (x < output_width) && (output_feature_map_id < output_feature_map_count); - if (in_bounds) - { - int y = _y - (output_feature_map_id * output_height); - int input_elem_id = ((different_input ? 
entry_id * input_feature_map_count * input_height : 0) + y) * input_width + x + texture_offset; - const float * current_weights = weights + (int)((entry_id * output_feature_map_count + output_feature_map_id) * window_width * window_height * input_feature_map_count); - - float sum = biases[output_feature_map_id + entry_id * output_feature_map_count]; - - for(int input_layer_id = 0; input_layer_id < input_feature_map_count; ++input_layer_id) - { - for(int input_y = 0; input_y < window_height; ++input_y) - { - #pragma unroll 4 - for(int input_x = 0; input_x < window_width; ++input_x) - { - sum += tex1Dfetch(input_tex_ref, input_elem_id) * *current_weights; - current_weights++; - input_elem_id++; - } - input_elem_id += input_width - window_width; - } - input_elem_id += input_width * (input_height - window_height); - } - - output[((entry_id * output_feature_map_count + output_feature_map_id) * output_height + y) * output_width + x] = sum; - } -} - -template -__global__ void convolution_2d_tex_exact_blocked_upd_kernel( - float * __restrict output, - const float * __restrict weights, - const float * __restrict biases, - int output_width, - int output_height, - int input_width, - int input_height, - int window_height, - int input_feature_map_count, - int output_feature_map_count, - int block_count, - int input_feature_map_group_count, - int input_feature_map_group_size, - int texture_offset, - int entry_count) -{ - int xy = blockIdx.x * blockDim.x + threadIdx.x; - int y = xy / block_count; - int dd = blockIdx.y * blockDim.y + threadIdx.y; - int input_feature_map_group_id = dd / output_feature_map_count; - int entry_id = blockIdx.z * blockDim.z + threadIdx.z; - - bool in_bounds = (entry_id < entry_count) && (y < output_height) && (input_feature_map_group_id < input_feature_map_group_count); - if (in_bounds) - { - int output_feature_map_id = dd - (input_feature_map_group_id * output_feature_map_count); - int base_input_feature_map_id = input_feature_map_group_id * input_feature_map_group_size; - int x = (xy - (y * block_count)) * BLOCK_SIZE; - int input_elem_id = ((((different_input ? 
entry_id * input_feature_map_count : 0) + base_input_feature_map_id) * input_height) + y) * input_width + x + texture_offset; - const float * current_weights = weights + (int)(((entry_id * output_feature_map_count + output_feature_map_id) * input_feature_map_count + base_input_feature_map_id) * WINDOW_WIDTH * window_height); - int iteration_count = min(input_feature_map_group_size, input_feature_map_count - base_input_feature_map_id); - - float initial_value = 0.0F; - if (input_feature_map_group_id == 0) - initial_value = biases[output_feature_map_id + entry_id * output_feature_map_count]; - float sums[BLOCK_SIZE]; - #pragma unroll - for(int i = 0; i < BLOCK_SIZE; ++i) - sums[i] = initial_value; - - for(int i = 0; i < iteration_count; ++i) - { - for(int input_y = 0; input_y < window_height; ++input_y) - { - #pragma unroll - for(int input_x = 0; input_x < WINDOW_WIDTH; ++input_x) - { - float weight = *current_weights; - #pragma unroll - for(int i = 0; i < BLOCK_SIZE; ++i) - sums[i] += tex1Dfetch(input_tex_ref, input_elem_id + i) * weight; - current_weights++; - input_elem_id++; - } - input_elem_id += input_width - WINDOW_WIDTH; - } - input_elem_id += input_width * (input_height - window_height); - } - - float * base_output = output + ((entry_id * output_feature_map_count + output_feature_map_id) * output_height + y) * output_width + x; - if (input_feature_map_group_count == 1) - { - #pragma unroll - for(int i = 0; i < BLOCK_SIZE; ++i) - { - if (i < output_width - x) - base_output[i] = sums[i]; - } - } - else - { - #pragma unroll - for(int i = 0; i < BLOCK_SIZE; ++i) - { - if (i < output_width - x) - atomicAdd(base_output + i, sums[i]); - } - } - } -} - -extern __shared__ float arr[]; -__global__ void convolution_2d_update_biases_upd_kernel( - float * __restrict biases, - const float * __restrict output_errors, - const float * __restrict training_speed, - int output_feature_map_count, - int output_elem_count_per_feature_map, - int min_iteration_count) -{ - int thread_id = threadIdx.x; - int output_feature_map_id = blockIdx.y; - int entry_id = blockIdx.z; - int threadblock_size = blockDim.x; - float sum = 0.0F; - const float * current_error = output_errors + (entry_id * output_feature_map_count + output_feature_map_id) * output_elem_count_per_feature_map; - int current_output_neuron_id = thread_id; - for(int i = 0; i < min_iteration_count; ++i) - { - sum += current_error[current_output_neuron_id]; - current_output_neuron_id += threadblock_size; - } - if (current_output_neuron_id < output_elem_count_per_feature_map) - sum += current_error[current_output_neuron_id]; - arr[thread_id] = sum; - __syncthreads(); - - int offset = entry_id * output_feature_map_count + output_feature_map_id; - float current_bias_val; - float current_training_speed_val; - if (thread_id == 0) - { - current_bias_val = biases[offset]; - current_training_speed_val = training_speed[offset]; - } - - int t_add_elems = threadblock_size >> 1; - int t_working_elems = (threadblock_size + 1) >> 1; - while (t_add_elems > 0) - { - if (thread_id < t_add_elems) - arr[thread_id] += arr[thread_id + t_working_elems]; - t_add_elems = t_working_elems >> 1; - t_working_elems = (t_working_elems + 1) >> 1; - __syncthreads(); - } - - if (thread_id == 0) - biases[offset] = arr[0] * current_training_speed_val + current_bias_val; -} - -texture output_tex_ref; - -__global__ void convolution_2d_deriviative_tex_upd_kernel( - float * __restrict input_errors, - const float * __restrict weights, - int output_width, - int output_height, - int 
input_width, - int input_height, - int window_width, - int window_height, - int input_feature_map_count, - int output_feature_map_count, - int entry_count) -{ - int x = blockIdx.x * blockDim.x + threadIdx.x; - int _y = blockIdx.y * blockDim.y + threadIdx.y; - int input_feature_map_id = _y / input_height; - int entry_id = blockIdx.z * blockDim.z + threadIdx.z; - - bool in_bounds = (entry_id < entry_count) && (x < input_width) && (input_feature_map_id < input_feature_map_count); - if (in_bounds) - { - int y = _y - (input_feature_map_id * input_height); - int output_elem_id = (entry_id * output_feature_map_count * output_height + y) * output_width + x; - const float * current_weights = weights + (int)((entry_id * input_feature_map_count * output_feature_map_count + input_feature_map_id) * window_width * window_height); - - float sum = 0.0F; - - int min_y_exclusive = y - output_height; - int max_y_inclusive = y; - int min_x_exclusive = x - output_width; - int max_x_inclusive = x; - for(int output_layer_id = 0; output_layer_id < output_feature_map_count; ++output_layer_id) - { - for(int input_y = 0; input_y < window_height; ++input_y) - { - bool b_fit1 = (input_y > min_y_exclusive) && (input_y <= max_y_inclusive); - for(int input_x = 0; input_x < window_width; ++input_x) - { - bool b_fit2 = b_fit1 && (input_x > min_x_exclusive) && (input_x <= max_x_inclusive); - if (b_fit2) - sum += tex1Dfetch(output_tex_ref, output_elem_id) * *current_weights; - current_weights++; - output_elem_id--; - } - output_elem_id -= output_width - window_width; - } - current_weights += window_width * window_height * (input_feature_map_count - 1); - output_elem_id += output_width * (output_height + window_height); - } - - input_errors[((entry_id * input_feature_map_count + input_feature_map_id) * input_height + y) * input_width + x] = sum; - } -} - -template -__global__ void convolution_2d_deriviative_tex_exact_blocked_upd_kernel( - float * __restrict input_errors, - const float * __restrict weights, - int output_width, - int output_height, - int input_width, - int input_height, - int window_height, - int input_feature_map_count, - int output_feature_map_count, - int block_count, - int output_feature_map_group_count, - int output_feature_map_group_size, - int entry_count) -{ - int xy = blockIdx.x * blockDim.x + threadIdx.x; - int y = xy / block_count; - int dd = blockIdx.y * blockDim.y + threadIdx.y; - int output_feature_map_group_id = dd / input_feature_map_count; - int entry_id = blockIdx.z * blockDim.z + threadIdx.z; - - bool in_bounds = (entry_id < entry_count) && (y < input_height) && (output_feature_map_group_id < output_feature_map_group_count); - if (in_bounds) - { - int input_feature_map_id = dd - (output_feature_map_group_id * input_feature_map_count); - int base_output_feature_map_id = output_feature_map_group_id * output_feature_map_group_size; - int x = (xy - (y * block_count)) * BLOCK_SIZE + (BLOCK_SIZE - 1); - int output_elem_id = ((entry_id * output_feature_map_count + base_output_feature_map_id) * output_height + y) * output_width + x; - const float * current_weights = weights + (int)(((entry_id * output_feature_map_count + base_output_feature_map_id) * input_feature_map_count + input_feature_map_id) * WINDOW_WIDTH * window_height); - int iteration_count = min(output_feature_map_group_size, output_feature_map_count - base_output_feature_map_id); - - float sums[BLOCK_SIZE]; - #pragma unroll - for(int i = 0; i < BLOCK_SIZE; ++i) - sums[i] = 0.0F; - - int min_y_exclusive = y - output_height; - int 
max_y_inclusive = y; - int min_x_exclusive = x - output_width; - int max_x_inclusive = x; - - unsigned int mask = 0; - for(int i = BLOCK_SIZE + WINDOW_WIDTH - 2; i >= 0; --i) - mask = mask << 1 | (((i > min_x_exclusive) && (i <= max_x_inclusive)) ? 1 : 0); - - for(int i = 0; i < iteration_count; ++i) - { - for(int input_y = 0; input_y < window_height; ++input_y) - { - bool b_fit1 = (input_y > min_y_exclusive) && (input_y <= max_y_inclusive); - - float output_vals[BLOCK_SIZE + WINDOW_WIDTH - 1]; - #pragma unroll - for(int i = 0; i < BLOCK_SIZE + WINDOW_WIDTH - 1; ++i) - { - bool b_fit2 = b_fit1 && (((1 << i) & mask) != 0); - if (b_fit2) - output_vals[i] = tex1Dfetch(output_tex_ref, output_elem_id - i); - else - output_vals[i] = 0.0F; - } - - #pragma unroll - for(int input_x = 0; input_x < WINDOW_WIDTH; ++input_x) - { - float weight = *current_weights; - #pragma unroll - for(int i = 0; i < BLOCK_SIZE; ++i) - sums[i] += output_vals[input_x + i] * weight; - current_weights++; - } - output_elem_id -= output_width; - } - current_weights += WINDOW_WIDTH * window_height * (input_feature_map_count - 1); - output_elem_id += output_width * (output_height + window_height); - } - - float * base_input = input_errors + ((entry_id * input_feature_map_count + input_feature_map_id) * input_height + y) * input_width + x; - if (output_feature_map_group_count == 1) - { - #pragma unroll - for(int i = 0; i < BLOCK_SIZE; ++i) - { - if (i > x - input_width) - *(base_input - i) = sums[i]; - } - } - else - { - #pragma unroll - for(int i = 0; i < BLOCK_SIZE; ++i) - { - if (i > x - input_width) - atomicAdd(base_input - i, sums[i]); - } - } - } -} - -template -__global__ void convolution_2d_update_weights_upd_kernel( - float * __restrict weights, - const float * __restrict output_errors, - const float * __restrict training_speed, - int output_width, - int output_height, - int input_width, - int input_height, - int window_width, - int window_height, - int input_feature_map_count, - int output_feature_map_count, - int texture_offset, - int entry_count) -{ - int internal_weights_id = blockIdx.x * blockDim.x + threadIdx.x; - int feature_map_pair_id = blockIdx.y * blockDim.y + threadIdx.y; - int entry_id = blockIdx.z * blockDim.z + threadIdx.z; - int weight_y = internal_weights_id / window_width; - int output_feature_map_id = feature_map_pair_id / input_feature_map_count; - - if ((weight_y < window_height) && (output_feature_map_id < output_feature_map_count) && (entry_id < entry_count)) - { - int weight_x = internal_weights_id - (weight_y * window_width); - int input_feature_map_id = feature_map_pair_id - (output_feature_map_id * input_feature_map_count); - - const float * current_output_errors = output_errors + ((entry_id * output_feature_map_count + output_feature_map_id) * output_height) * output_width; - int input_elem_id = (((different_input ? 
entry_id * input_feature_map_count : 0) + input_feature_map_id) * input_height + weight_y) * input_width + weight_x + texture_offset; - - float sum = 0.0F; - for(int y = 0; y < output_height; ++y) - { - for(int x = 0; x < output_width; ++x) - { - float inp = tex1Dfetch(input_tex_ref, input_elem_id); - sum += *current_output_errors * inp; - current_output_errors++; - input_elem_id++; - } - input_elem_id += (window_width - 1); - } - - int offset = (((entry_id * output_feature_map_count + output_feature_map_id) * input_feature_map_count + input_feature_map_id) * window_height + weight_y) * window_width + weight_x; - weights[offset] += sum * training_speed[offset]; - } -} - -template -__global__ void convolution_2d_update_weights_exact_blocked_upd_kernel( - float * __restrict weights, - const float * __restrict output_errors, - const float * __restrict training_speed, - int output_width, - int output_height, - int input_width, - int input_height, - int window_height, - int input_feature_map_count, - int output_feature_map_count, - int output_y_group_count, - int texture_offset, - int entry_count) -{ - int dd = blockIdx.x * blockDim.x + threadIdx.x; - int output_y_group_id = dd / window_height; - int feature_map_pair_id = blockIdx.y * blockDim.y + threadIdx.y; - int output_feature_map_id = feature_map_pair_id / input_feature_map_count; - int entry_id = blockIdx.z * blockDim.z + threadIdx.z; - - if ((output_y_group_id < output_y_group_count) && (output_feature_map_id < output_feature_map_count) && (entry_id < entry_count)) - { - int weight_y = dd - (output_y_group_id * window_height); - int input_feature_map_id = feature_map_pair_id - (output_feature_map_id * input_feature_map_count); - - const float * current_output_errors = output_errors + ((entry_id * output_feature_map_count + output_feature_map_id) * output_height + output_y_group_id) * output_width; - int input_elem_id = (((different_input ? 
entry_id * input_feature_map_count : 0) + input_feature_map_id) * input_height + weight_y + output_y_group_id) * input_width + texture_offset; - - float sums[WINDOW_WIDTH]; - #pragma unroll - for(int i = 0; i < WINDOW_WIDTH; ++i) - sums[i] = 0.0F; - - for(int output_y = output_y_group_id; output_y < output_height; output_y += output_y_group_count) - { - float input_buf[WINDOW_WIDTH]; - #pragma unroll - for(int i = 1; i < WINDOW_WIDTH; ++i) - { - input_buf[i] = tex1Dfetch(input_tex_ref, input_elem_id); - ++input_elem_id; - } - - for(int x = 0; x < output_width; ++x) - { - float current_output_error = *current_output_errors; - - #pragma unroll - for(int i = 0; i < WINDOW_WIDTH - 1; ++i) - input_buf[i] = input_buf[i + 1]; - input_buf[WINDOW_WIDTH - 1] = tex1Dfetch(input_tex_ref, input_elem_id); - - #pragma unroll - for(int i = 0; i < WINDOW_WIDTH; ++i) - sums[i] += current_output_error * input_buf[i]; - - current_output_errors++; - input_elem_id++; - } - - current_output_errors += output_width * (output_y_group_count - 1); - input_elem_id += input_width * (output_y_group_count - 1); - } - - int offset = (((entry_id * output_feature_map_count + output_feature_map_id) * input_feature_map_count + input_feature_map_id) * window_height + weight_y) * WINDOW_WIDTH; - float * cur_weights = weights + offset; - const float * cur_training_speed = training_speed + offset; - if (output_y_group_count == 1) - { - #pragma unroll - for(int i = 0; i < WINDOW_WIDTH; ++i) - cur_weights[i] += sums[i] * cur_training_speed[i]; - } - else - { - #pragma unroll - for(int i = 0; i < WINDOW_WIDTH; ++i) - atomicAdd(cur_weights + i, sums[i] * cur_training_speed[i]); - } - } -} - -namespace nnforge -{ - namespace cuda - { - convolution_2d_layer_updater_cuda::convolution_2d_layer_updater_cuda() - { - input_tex_ref.addressMode[0] = cudaAddressModeBorder; - input_tex_ref.normalized = false; - output_tex_ref.addressMode[0] = cudaAddressModeBorder; - output_tex_ref.normalized = false; - input_tex_ref.addressMode[0] = cudaAddressModeBorder; - input_tex_ref.normalized = false; - } - - convolution_2d_layer_updater_cuda::~convolution_2d_layer_updater_cuda() - { - } - -#define MAX_BLOCK_SIZE 5 -#define MAX_WINDOW_WIDTH 10 - -#define launch_exact_block_kernel_const_const(window_width_const, block_size_const, different_input_const) \ - convolution_2d_tex_exact_blocked_upd_kernel<<>>(*output_neurons_buffer, *data[0], *data[1], output_configuration_specific.dimension_sizes[0], output_configuration_specific.dimension_sizes[1], input_configuration_specific.dimension_sizes[0], input_configuration_specific.dimension_sizes[1], window_sizes[1], input_configuration_specific.feature_map_count, output_configuration_specific.feature_map_count, block_count, input_feature_map_group_count, input_feature_map_group_size, texture_offset, entry_count); - -#define launch_exact_block_kernel_const(window_width, block_size_const, different_input_const) \ - switch (window_width) \ - { \ - case 1: \ - launch_exact_block_kernel_const_const(1, block_size_const, different_input_const); \ - break; \ - case 2: \ - launch_exact_block_kernel_const_const(2, block_size_const, different_input_const); \ - break; \ - case 3: \ - launch_exact_block_kernel_const_const(3, block_size_const, different_input_const); \ - break; \ - case 4: \ - launch_exact_block_kernel_const_const(4, block_size_const, different_input_const); \ - break; \ - case 5: \ - launch_exact_block_kernel_const_const(5, block_size_const, different_input_const); \ - break; \ - case 6: \ - 
launch_exact_block_kernel_const_const(6, block_size_const, different_input_const); \ - break; \ - case 7: \ - launch_exact_block_kernel_const_const(7, block_size_const, different_input_const); \ - break; \ - case 8: \ - launch_exact_block_kernel_const_const(8, block_size_const, different_input_const); \ - break; \ - case 9: \ - launch_exact_block_kernel_const_const(9, block_size_const, different_input_const); \ - break; \ - case 10: \ - launch_exact_block_kernel_const_const(10, block_size_const, different_input_const); \ - break; \ - }; - -#define launch_exact_block_kernel(window_width, block_size, different_input_const) \ - switch (block_size) \ - { \ - case 1: \ - launch_exact_block_kernel_const(window_width, 1, different_input_const); \ - break; \ - case 2: \ - launch_exact_block_kernel_const(window_width, 2, different_input_const); \ - break; \ - case 3: \ - launch_exact_block_kernel_const(window_width, 3, different_input_const); \ - break; \ - case 4: \ - launch_exact_block_kernel_const(window_width, 4, different_input_const); \ - break; \ - case 5: \ - launch_exact_block_kernel_const(window_width, 5, different_input_const); \ - break; \ - }; - -#define launch_backprop_exact_block_kernel_const_const(window_width_const, block_size_const) \ - convolution_2d_deriviative_tex_exact_blocked_upd_kernel<<>>(*input_errors_buffer, *data[0], output_configuration_specific.dimension_sizes[0], output_configuration_specific.dimension_sizes[1], input_configuration_specific.dimension_sizes[0], input_configuration_specific.dimension_sizes[1], window_sizes[1], input_configuration_specific.feature_map_count, output_configuration_specific.feature_map_count, block_count, output_feature_map_group_count, output_feature_map_group_size, entry_count); - -#define launch_backprop_exact_block_kernel_const(window_width, block_size_const) \ - switch (window_width) \ - { \ - case 1: \ - launch_backprop_exact_block_kernel_const_const(1, block_size_const); \ - break; \ - case 2: \ - launch_backprop_exact_block_kernel_const_const(2, block_size_const); \ - break; \ - case 3: \ - launch_backprop_exact_block_kernel_const_const(3, block_size_const); \ - break; \ - case 4: \ - launch_backprop_exact_block_kernel_const_const(4, block_size_const); \ - break; \ - case 5: \ - launch_backprop_exact_block_kernel_const_const(5, block_size_const); \ - break; \ - case 6: \ - launch_backprop_exact_block_kernel_const_const(6, block_size_const); \ - break; \ - case 7: \ - launch_backprop_exact_block_kernel_const_const(7, block_size_const); \ - break; \ - case 8: \ - launch_backprop_exact_block_kernel_const_const(8, block_size_const); \ - break; \ - case 9: \ - launch_backprop_exact_block_kernel_const_const(9, block_size_const); \ - break; \ - case 10: \ - launch_backprop_exact_block_kernel_const_const(10, block_size_const); \ - break; \ - }; - -#define launch_backprop_exact_block_kernel(window_width, block_size) \ - switch (block_size) \ - { \ - case 1: \ - launch_backprop_exact_block_kernel_const(window_width, 1); \ - break; \ - case 2: \ - launch_backprop_exact_block_kernel_const(window_width, 2); \ - break; \ - case 3: \ - launch_backprop_exact_block_kernel_const(window_width, 3); \ - break; \ - case 4: \ - launch_backprop_exact_block_kernel_const(window_width, 4); \ - break; \ - case 5: \ - launch_backprop_exact_block_kernel_const(window_width, 5); \ - break; \ - }; - -#define launch_update_weights_exact_block_kernel_const(window_width_const, different_input_const) \ - 
convolution_2d_update_weights_exact_blocked_upd_kernel<<>>(*data[0], *output_errors_buffer, *training_speed[0], output_configuration_specific.dimension_sizes[0], output_configuration_specific.dimension_sizes[1], input_configuration_specific.dimension_sizes[0], input_configuration_specific.dimension_sizes[1], window_sizes[1], input_configuration_specific.feature_map_count, output_configuration_specific.feature_map_count, output_y_group_count, texture_offset, entry_count); - -#define launch_update_weights_exact_block_kernel(window_width, different_input_const) \ - switch (window_width) \ - { \ - case 1: \ - launch_update_weights_exact_block_kernel_const(1, different_input_const); \ - break; \ - case 2: \ - launch_update_weights_exact_block_kernel_const(2, different_input_const); \ - break; \ - case 3: \ - launch_update_weights_exact_block_kernel_const(3, different_input_const); \ - break; \ - case 4: \ - launch_update_weights_exact_block_kernel_const(4, different_input_const); \ - break; \ - case 5: \ - launch_update_weights_exact_block_kernel_const(5, different_input_const); \ - break; \ - case 6: \ - launch_update_weights_exact_block_kernel_const(6, different_input_const); \ - break; \ - case 7: \ - launch_update_weights_exact_block_kernel_const(7, different_input_const); \ - break; \ - case 8: \ - launch_update_weights_exact_block_kernel_const(8, different_input_const); \ - break; \ - case 9: \ - launch_update_weights_exact_block_kernel_const(9, different_input_const); \ - break; \ - case 10: \ - launch_update_weights_exact_block_kernel_const(10, different_input_const); \ - break; \ - }; - - void convolution_2d_layer_updater_cuda::enqueue_test( - unsigned int offset_input_entry_id, - cudaStream_t stream_id, - const std::vector& schema_data, - const std::vector& data, - const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer, - cuda_linear_buffer_device_smart_ptr output_neurons_buffer, - const std::vector& additional_buffers, - unsigned int entry_count) - { - cudaChannelFormatDesc desc = cudaCreateChannelDesc(); - size_t texture_offset; - cuda_safe_call(cudaBindTexture(&texture_offset, input_tex_ref, (const float *)(*input_neurons_buffer) + (offset_input_entry_id * input_elem_count_per_entry), desc, input_elem_count_per_entry * sizeof(float) * (different_input ? 
entry_count : 1))); - texture_offset /= sizeof(float); - - if (window_sizes[0] <= MAX_WINDOW_WIDTH) - { - int block_size = get_block_size(output_configuration_specific.dimension_sizes[0]); - int block_count = (output_configuration_specific.dimension_sizes[0] + block_size - 1) / block_size; - int input_feature_map_group_count = cuda_util::get_group_count( - *cuda_config, - block_count * output_configuration_specific.dimension_sizes[1] * output_configuration_specific.feature_map_count * entry_count, - input_configuration_specific.feature_map_count); - int input_feature_map_group_size = (input_configuration_specific.feature_map_count + input_feature_map_group_count - 1) / input_feature_map_group_count; - - if (input_feature_map_group_count > 1) - cuda_util::set_with_value( - *cuda_config, - *output_neurons_buffer, - 0.0F, - output_elem_count_per_entry * entry_count, - stream_id); - - std::pair kernel_dims = cuda_util::get_grid_and_threadblock_sizes_sequential_access( - *cuda_config, - block_count * output_configuration_specific.dimension_sizes[1], - output_configuration_specific.feature_map_count * input_feature_map_group_count, - entry_count); - - if (different_input) - { - launch_exact_block_kernel(window_sizes[0], block_size, true); - } - else - { - launch_exact_block_kernel(window_sizes[0], block_size, false); - } - } - else - { - std::pair kernel_dims = cuda_util::get_grid_and_threadblock_sizes_2d_access( - *cuda_config, - output_configuration_specific.dimension_sizes[0], - output_configuration_specific.dimension_sizes[1] * output_configuration_specific.feature_map_count, - entry_count); - - if (different_input) - convolution_2d_tex_upd_kernel<<>>( - *output_neurons_buffer, - *data[0], - *data[1], - output_configuration_specific.dimension_sizes[0], - output_configuration_specific.dimension_sizes[1], - input_configuration_specific.dimension_sizes[0], - input_configuration_specific.dimension_sizes[1], - window_sizes[0], - window_sizes[1], - input_configuration_specific.feature_map_count, - output_configuration_specific.feature_map_count, - texture_offset, - entry_count); - else - convolution_2d_tex_upd_kernel<<>>( - *output_neurons_buffer, - *data[0], - *data[1], - output_configuration_specific.dimension_sizes[0], - output_configuration_specific.dimension_sizes[1], - input_configuration_specific.dimension_sizes[0], - input_configuration_specific.dimension_sizes[1], - window_sizes[0], - window_sizes[1], - input_configuration_specific.feature_map_count, - output_configuration_specific.feature_map_count, - texture_offset, - entry_count); - } - } - - void convolution_2d_layer_updater_cuda::enqueue_backprop( - cudaStream_t stream_id, - const std::vector& schema_data, - const std::vector& data, - const_cuda_linear_buffer_device_smart_ptr output_neurons_buffer, - const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer, - cuda_linear_buffer_device_smart_ptr output_errors_buffer, - cuda_linear_buffer_device_smart_ptr input_errors_buffer, - const std::vector& additional_buffers, - unsigned int entry_count) - { - if (!different_input) - throw neural_network_exception("convolution_2d_layer_updater_cuda is not able to backprop to the same input"); - - cudaChannelFormatDesc desc = cudaCreateChannelDesc(); - cuda_safe_call(cudaBindTexture(0, output_tex_ref, *output_errors_buffer, desc, output_elem_count_per_entry * entry_count * sizeof(float))); - - if (window_sizes[0] <= MAX_WINDOW_WIDTH) - { - int block_size = get_block_size(input_configuration_specific.dimension_sizes[0]); - int block_count = 
(input_configuration_specific.dimension_sizes[0] + block_size - 1) / block_size; - int output_feature_map_group_count = cuda_util::get_group_count( - *cuda_config, - block_count * input_configuration_specific.dimension_sizes[1] * input_configuration_specific.feature_map_count * entry_count, - output_configuration_specific.feature_map_count); - int output_feature_map_group_size = (output_configuration_specific.feature_map_count + output_feature_map_group_count - 1) / output_feature_map_group_count; - - if (output_feature_map_group_count > 1) - cuda_util::set_with_value( - *cuda_config, - *input_errors_buffer, - 0.0F, - input_elem_count_per_entry * entry_count, - stream_id); - - std::pair kernel_dims = cuda_util::get_grid_and_threadblock_sizes_sequential_access( - *cuda_config, - block_count * input_configuration_specific.dimension_sizes[1], - input_configuration_specific.feature_map_count * output_feature_map_group_count, - entry_count); - launch_backprop_exact_block_kernel(window_sizes[0], block_size); - } - else - { - std::pair kernel_dims = cuda_util::get_grid_and_threadblock_sizes_2d_access( - *cuda_config, - input_configuration_specific.dimension_sizes[0], - input_configuration_specific.dimension_sizes[1] * input_configuration_specific.feature_map_count, - entry_count); - - convolution_2d_deriviative_tex_upd_kernel<<>>( - *input_errors_buffer, - *data[0], - output_configuration_specific.dimension_sizes[0], - output_configuration_specific.dimension_sizes[1], - input_configuration_specific.dimension_sizes[0], - input_configuration_specific.dimension_sizes[1], - window_sizes[0], - window_sizes[1], - input_configuration_specific.feature_map_count, - output_configuration_specific.feature_map_count, - entry_count); - } - } - - void convolution_2d_layer_updater_cuda::enqueue_update_weights( - unsigned int offset_input_entry_id, - cudaStream_t stream_id, - const std::vector& data, - const std::vector& schema_data, - const std::vector& training_speed, - cuda_linear_buffer_device_smart_ptr output_errors_buffer, - const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer, - const std::vector& additional_buffers, - unsigned int entry_count) - { - // Update biases - { - int threadblock_size = get_threadblock_size_biases(output_elem_count_per_feature_map); - dim3 grid_size(1, output_configuration_specific.feature_map_count, entry_count); - dim3 block_size(threadblock_size, 1, 1); - int smem_size = threadblock_size * sizeof(float); - int min_iteration_count = output_elem_count_per_feature_map / threadblock_size; - - convolution_2d_update_biases_upd_kernel<<>>( - *data[1], - *output_errors_buffer, - *training_speed[1], - output_configuration_specific.feature_map_count, - output_elem_count_per_feature_map, - min_iteration_count); - } - - cudaChannelFormatDesc desc = cudaCreateChannelDesc(); - size_t texture_offset; - cuda_safe_call(cudaBindTexture(&texture_offset, input_tex_ref, (const float *)(*input_neurons_buffer) + (offset_input_entry_id * input_elem_count_per_entry), desc, input_elem_count_per_entry * sizeof(float) * (different_input ? 
entry_count : 1))); - texture_offset /= sizeof(float); - - // Update weights - { - if (window_sizes[0] <= MAX_WINDOW_WIDTH) - { - int output_y_group_count = cuda_util::get_group_count( - *cuda_config, - output_configuration_specific.feature_map_count * input_configuration_specific.feature_map_count * window_sizes[1] * entry_count, - output_configuration_specific.dimension_sizes[1]); - std::pair kernel_dims = cuda_util::get_grid_and_threadblock_sizes_sequential_access( - *cuda_config, - window_sizes[1] * output_y_group_count, - output_configuration_specific.feature_map_count * input_configuration_specific.feature_map_count, - entry_count); - - if (different_input) - { - launch_update_weights_exact_block_kernel(window_sizes[0], true); - } - else - { - launch_update_weights_exact_block_kernel(window_sizes[0], false); - } - } - else - { - std::pair kernel_dims = cuda_util::get_grid_and_threadblock_sizes_sequential_access( - *cuda_config, - window_sizes[0] * window_sizes[1], - output_configuration_specific.feature_map_count * input_configuration_specific.feature_map_count, - entry_count); - - if (different_input) - convolution_2d_update_weights_upd_kernel<<>>( - *data[0], - *output_errors_buffer, - *training_speed[0], - output_configuration_specific.dimension_sizes[0], - output_configuration_specific.dimension_sizes[1], - input_configuration_specific.dimension_sizes[0], - input_configuration_specific.dimension_sizes[1], - window_sizes[0], - window_sizes[1], - input_configuration_specific.feature_map_count, - output_configuration_specific.feature_map_count, - texture_offset, - entry_count); - else - convolution_2d_update_weights_upd_kernel<<>>( - *data[0], - *output_errors_buffer, - *training_speed[0], - output_configuration_specific.dimension_sizes[0], - output_configuration_specific.dimension_sizes[1], - input_configuration_specific.dimension_sizes[0], - input_configuration_specific.dimension_sizes[1], - window_sizes[0], - window_sizes[1], - input_configuration_specific.feature_map_count, - output_configuration_specific.feature_map_count, - texture_offset, - entry_count); - } - } - } - - int convolution_2d_layer_updater_cuda::get_block_size(int width) - { - int block_count = (width + MAX_BLOCK_SIZE - 1) / MAX_BLOCK_SIZE; - int block_size = (width + block_count - 1) / block_count; - return block_size; - } - - void convolution_2d_layer_updater_cuda::updater_configured() - { - std::tr1::shared_ptr layer_derived = std::tr1::dynamic_pointer_cast(layer_schema); - - for(std::vector::const_iterator it = layer_derived->window_sizes.begin(); it != layer_derived->window_sizes.end(); ++it) - window_sizes.push_back(static_cast(*it)); - } - - bool convolution_2d_layer_updater_cuda::is_in_place_backprop() const - { - return false; - } - - std::vector convolution_2d_layer_updater_cuda::get_linear_addressing_through_texture_per_entry() const - { - std::vector res; - - res.push_back(input_elem_count_per_entry); - res.push_back(output_elem_count_per_entry); - - return res; - } - - int convolution_2d_layer_updater_cuda::get_threadblock_size_biases(int output_neuron_count) - { - if (output_neuron_count < 256) - return output_neuron_count; - - int threadblock_count = (output_neuron_count + 256 - 1) / 256; - int threadblock_size = (output_neuron_count + threadblock_count - 1) / threadblock_count; - threadblock_size = (threadblock_size + 32 - 1) / 32 * 32; - - return threadblock_size; - } - } -} diff --git a/nnforge/cuda/convolution_2d_layer_updater_cuda_fermi.cu 
b/nnforge/cuda/convolution_2d_layer_updater_cuda_fermi.cu new file mode 100644 index 0000000..ecaaf43 --- /dev/null +++ b/nnforge/cuda/convolution_2d_layer_updater_cuda_fermi.cu @@ -0,0 +1,1511 @@ +/* + * Copyright 2011-2013 Maxim Milakov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "convolution_2d_layer_updater_cuda_fermi.h" + +#include + +#include + +#include "util_cuda.h" +#include "neural_network_cuda_exception.h" +#include "../convolution_layer.h" + +texture input_tex_ref; +texture output_tex_ref; + +#define FEATURE_MAP_BLOCK_SIZE 4 +#define WINDOW_WIDTH_LOCAL 4 + +struct __align__(4) xy_config +{ + xy_config(int y, int x) + { + this->xy_pair = (((unsigned int)y) << 16) | (unsigned int)x; + } + + unsigned int xy_pair; +}; + +struct __align__(4) feature_map_config +{ + feature_map_config(int input_feature_map_id, int output_feature_map_id) + { + this->feature_map_pair = (((unsigned int)input_feature_map_id) << 16) | (unsigned int)output_feature_map_id; + } + + unsigned int feature_map_pair; +}; + +struct __align__(4) output_y_weight_y_config +{ + output_y_weight_y_config(int output_y, int weight_y) + { + this->output_y_window_y_pair = (((unsigned int)output_y) << 16) | (unsigned int)weight_y; + } + + unsigned int output_y_window_y_pair; +}; + +struct __align__(4) output_y_weight_y_weight_x_config +{ + output_y_weight_y_weight_x_config(int output_y, int weight_y, int weight_x) + { + this->output_y_window_y_window_x_pair = (((unsigned int)output_y) << 16) | (((unsigned int)weight_y) << 8) | ((unsigned int)weight_x); + } + + unsigned int output_y_window_y_window_x_pair; +}; + +template +__global__ void convolution_2d_tex_upd_kernel_fermi( + float * __restrict output, + const float * __restrict weights, + const float * __restrict biases, + const xy_config * __restrict xy_config_list, + const feature_map_config * __restrict feature_map_config_list, + int output_width, + int output_height, + int input_width, + int input_height, + int window_width, + int window_height, + int input_feature_map_count, + int output_feature_map_count, + int input_feature_map_group_size, + int texture_offset, + int entry_count, + bool different_input, + int xy_config_count, + int feature_map_config_count) +{ + int xy_config_id = blockIdx.x * blockDim.x + threadIdx.x; + int feature_map_config_id = blockIdx.y * blockDim.y + threadIdx.y; + int entry_id = blockIdx.z * blockDim.z + threadIdx.z; + + bool in_bounds = (entry_id < entry_count) && (xy_config_id < xy_config_count) && (feature_map_config_id < feature_map_config_count); + if (in_bounds) + { + xy_config xyc = xy_config_list[xy_config_id]; + int x = xyc.xy_pair & 0xFFFF; + int y = xyc.xy_pair >> 16; + + feature_map_config fmc = feature_map_config_list[feature_map_config_id]; + int output_feature_map_id = fmc.feature_map_pair & 0xFFFF; + int base_input_feature_map_id = fmc.feature_map_pair >> 16; + + int weight_count_per_output_feature_map = window_width * window_height * input_feature_map_count; + int input_elem_id = 
((((different_input ? entry_id * input_feature_map_count : 0) + base_input_feature_map_id) * input_height) + y) * input_width + x + texture_offset; + const float * current_weights = weights + (int)(((entry_id * output_feature_map_count + output_feature_map_id) * input_feature_map_count + base_input_feature_map_id) * window_width * window_height); + int iteration_count = min(input_feature_map_group_size, input_feature_map_count - base_input_feature_map_id); + + float initial_values[FEATURE_MAP_BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + initial_values[i] = 0.0F; + if (base_input_feature_map_id == 0) + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + if (i < output_feature_map_count - output_feature_map_id) + initial_values[i] = biases[entry_id * output_feature_map_count + output_feature_map_id + i]; + } + float sums[BLOCK_SIZE * FEATURE_MAP_BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + sums[i * BLOCK_SIZE + j] = initial_values[i]; + int weight_offsets[FEATURE_MAP_BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + weight_offsets[i] = (i < output_feature_map_count - output_feature_map_id) ? weight_count_per_output_feature_map * i : 0; + + for(int i = 0; i < iteration_count; ++i) + { + for(int input_y = 0; input_y < window_height; ++input_y) + { + #pragma unroll 4 + for(int input_x = 0; input_x < window_width; ++input_x) + { + float weight_list[FEATURE_MAP_BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + weight_list[i] = current_weights[weight_offsets[i]]; + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + { + float inp = tex1Dfetch(input_tex_ref, input_elem_id + j); + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + sums[i * BLOCK_SIZE + j] += inp * weight_list[i]; + } + current_weights++; + input_elem_id++; + } + input_elem_id += input_width - window_width; + } + input_elem_id += input_width * (input_height - window_height); + } + + float * base_output = output + ((entry_id * output_feature_map_count + output_feature_map_id) * output_height + y) * output_width + x; + int output_neuron_count_per_feature_map = output_height * output_width; + if (single_input_feature_map_group) + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + { + if (i < output_feature_map_count - output_feature_map_id) + { + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + { + if (j < output_width - x) + base_output[output_neuron_count_per_feature_map * i + j] = sums[i * BLOCK_SIZE + j]; + } + } + } + } + else + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + { + if (i < output_feature_map_count - output_feature_map_id) + { + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + { + if (j < output_width - x) + atomicAdd(base_output + output_neuron_count_per_feature_map * i + j, sums[i * BLOCK_SIZE + j]); + } + } + } + } + } +} + +template +__global__ void convolution_2d_tex_exact_upd_kernel_fermi( + float * __restrict output, + const float * __restrict weights, + const float * __restrict biases, + const xy_config * __restrict xy_config_list, + const feature_map_config * __restrict feature_map_config_list, + int output_width, + int output_height, + int input_width, + int input_height, + int window_height, + int input_feature_map_count, + int output_feature_map_count, + int input_feature_map_group_size, + int texture_offset, + int 
entry_count, + bool different_input, + int xy_config_count, + int feature_map_config_count) +{ + int xy_config_id = blockIdx.x * blockDim.x + threadIdx.x; + int feature_map_config_id = blockIdx.y * blockDim.y + threadIdx.y; + int entry_id = blockIdx.z * blockDim.z + threadIdx.z; + + bool in_bounds = (entry_id < entry_count) && (xy_config_id < xy_config_count) && (feature_map_config_id < feature_map_config_count); + if (in_bounds) + { + xy_config xyc = xy_config_list[xy_config_id]; + int x = xyc.xy_pair & 0xFFFF; + int y = xyc.xy_pair >> 16; + + feature_map_config fmc = feature_map_config_list[feature_map_config_id]; + int output_feature_map_id = fmc.feature_map_pair & 0xFFFF; + int base_input_feature_map_id = fmc.feature_map_pair >> 16; + + int weight_count_per_output_feature_map = WINDOW_WIDTH * window_height * input_feature_map_count; + int input_elem_id = ((((different_input ? entry_id * input_feature_map_count : 0) + base_input_feature_map_id) * input_height) + y) * input_width + x + texture_offset; + const float * current_weights = weights + (int)(((entry_id * output_feature_map_count + output_feature_map_id) * input_feature_map_count + base_input_feature_map_id) * WINDOW_WIDTH * window_height); + int iteration_count = min(input_feature_map_group_size, input_feature_map_count - base_input_feature_map_id); + + float initial_values[FEATURE_MAP_BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + initial_values[i] = 0.0F; + if (base_input_feature_map_id == 0) + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + if (i < output_feature_map_count - output_feature_map_id) + initial_values[i] = biases[entry_id * output_feature_map_count + output_feature_map_id + i]; + } + float sums[BLOCK_SIZE * FEATURE_MAP_BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + sums[i * BLOCK_SIZE + j] = initial_values[i]; + int weight_offsets[FEATURE_MAP_BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + weight_offsets[i] = (i < output_feature_map_count - output_feature_map_id) ? 
weight_count_per_output_feature_map * i : 0; + + for(int i = 0; i < iteration_count; ++i) + { + for(int input_y = 0; input_y < window_height; ++input_y) + { + #pragma unroll + for(int input_x = 0; input_x < WINDOW_WIDTH; ++input_x) + { + float weight_list[FEATURE_MAP_BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + weight_list[i] = current_weights[weight_offsets[i]]; + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + { + float inp = tex1Dfetch(input_tex_ref, input_elem_id + j); + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + sums[i * BLOCK_SIZE + j] += inp * weight_list[i]; + } + current_weights++; + input_elem_id++; + } + input_elem_id += input_width - WINDOW_WIDTH; + } + input_elem_id += input_width * (input_height - window_height); + } + + float * base_output = output + ((entry_id * output_feature_map_count + output_feature_map_id) * output_height + y) * output_width + x; + int output_neuron_count_per_feature_map = output_height * output_width; + if (single_input_feature_map_group) + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + { + if (i < output_feature_map_count - output_feature_map_id) + { + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + { + if (j < output_width - x) + base_output[output_neuron_count_per_feature_map * i + j] = sums[i * BLOCK_SIZE + j]; + } + } + } + } + else + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + { + if (i < output_feature_map_count - output_feature_map_id) + { + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + { + if (j < output_width - x) + atomicAdd(base_output + output_neuron_count_per_feature_map * i + j, sums[i * BLOCK_SIZE + j]); + } + } + } + } + } +} + +extern __shared__ float arr_sh[]; +__global__ void convolution_2d_update_biases_upd_kernel_fermi( + float * __restrict biases, + const float * __restrict output_errors, + const float * __restrict training_speed, + int output_feature_map_count, + int output_elem_count_per_feature_map, + int min_iteration_count) +{ + int thread_id = threadIdx.x; + int output_feature_map_id = blockIdx.y; + int entry_id = blockIdx.z; + int threadblock_size = blockDim.x; + + float sum = 0.0F; + const float * current_error = output_errors + (entry_id * output_feature_map_count + output_feature_map_id) * output_elem_count_per_feature_map; + int current_output_neuron_id = thread_id; + for(int i = 0; i < min_iteration_count; ++i) + { + sum += current_error[current_output_neuron_id]; + current_output_neuron_id += threadblock_size; + } + if (current_output_neuron_id < output_elem_count_per_feature_map) + sum += current_error[current_output_neuron_id]; + + volatile float * arr = arr_sh; + arr[thread_id] = sum; + int lane_id = thread_id & 31; + #pragma unroll + for(int tx = 16; tx > 0; tx >>= 1) + { + if (lane_id < tx) + arr[thread_id] += arr[thread_id + tx]; + } + sum = arr[thread_id]; + + if (lane_id == 0) + { + int offset = entry_id * output_feature_map_count + output_feature_map_id; + float current_training_speed_val = training_speed[offset]; + atomicAdd(biases + offset, sum * current_training_speed_val); + } +} + +template +__global__ void convolution_2d_deriviative_tex_upd_kernel_fermi( + float * __restrict input_errors, + const float * __restrict weights, + const xy_config * __restrict xy_config_list, + const feature_map_config * __restrict feature_map_config_list, + int output_width, + int output_height, + int input_width, + int input_height, + int window_width, + int window_height, + int 
input_feature_map_count, + int output_feature_map_count, + int output_feature_map_group_size, + int entry_count, + int xy_config_count, + int feature_map_config_count) +{ + int xy_config_id = blockIdx.x * blockDim.x + threadIdx.x; + int feature_map_config_id = blockIdx.y * blockDim.y + threadIdx.y; + int entry_id = blockIdx.z * blockDim.z + threadIdx.z; + + bool in_bounds = (entry_id < entry_count) && (xy_config_id < xy_config_count) && (feature_map_config_id < feature_map_config_count); + if (in_bounds) + { + xy_config xyc = xy_config_list[xy_config_id]; + int x = xyc.xy_pair & 0xFFFF; + int y = xyc.xy_pair >> 16; + + feature_map_config fmc = feature_map_config_list[feature_map_config_id]; + int base_output_feature_map_id = fmc.feature_map_pair & 0xFFFF; + int input_feature_map_id = fmc.feature_map_pair >> 16; + + int weight_count_per_input_feature_map = window_width * window_height; + int output_elem_id = ((entry_id * output_feature_map_count + base_output_feature_map_id) * output_height + y) * output_width + x; + const float * current_weights = weights + (int)(((entry_id * output_feature_map_count + base_output_feature_map_id) * input_feature_map_count + input_feature_map_id) * window_width * window_height); + int iteration_count = min(output_feature_map_group_size, output_feature_map_count - base_output_feature_map_id); + + float sums[FEATURE_MAP_BLOCK_SIZE * BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE * BLOCK_SIZE; ++i) + sums[i] = 0.0F; + + int weight_offsets[FEATURE_MAP_BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + weight_offsets[i] = (i < input_feature_map_count - input_feature_map_id) ? weight_count_per_input_feature_map * i : 0; + + int min_y_exclusive = y - output_height; + int max_y_inclusive = y; + int min_x_exclusive = x - output_width; + int max_x_inclusive = x; + + for(int i = 0; i < iteration_count; ++i) + { + for(int input_y = 0; input_y < window_height; ++input_y) + { + bool b_fit1 = (input_y > min_y_exclusive) && (input_y <= max_y_inclusive); + + int input_x = 0; + #pragma unroll 1 + for(; input_x < (window_width - (WINDOW_WIDTH_LOCAL - 1)); input_x += WINDOW_WIDTH_LOCAL) + { + float output_vals[BLOCK_SIZE + WINDOW_WIDTH_LOCAL - 1]; + #pragma unroll + for(int i = 0; i < BLOCK_SIZE + WINDOW_WIDTH_LOCAL - 1; ++i) + { + bool b_fit2 = b_fit1 && (i > min_x_exclusive) && (i <= max_x_inclusive);; + if (b_fit2) + output_vals[i] = tex1Dfetch(output_tex_ref, output_elem_id - i); + else + output_vals[i] = 0.0F; + } + output_elem_id -= WINDOW_WIDTH_LOCAL; + + #pragma unroll + for(int input_x_local = 0; input_x_local < WINDOW_WIDTH_LOCAL; ++input_x_local) + { + float weight_list[FEATURE_MAP_BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + weight_list[i] = current_weights[weight_offsets[i]]; + + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + sums[i * BLOCK_SIZE + j] += output_vals[input_x_local + j] * weight_list[i]; + } + current_weights++; + } + } + #pragma unroll 1 + for(; input_x < window_width; ++input_x) + { + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + { + bool b_fit2 = b_fit1 && (input_x + j > min_x_exclusive) && (input_x + j <= max_x_inclusive); + if (b_fit2) + { + float inp = tex1Dfetch(output_tex_ref, output_elem_id - j); + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + sums[i * BLOCK_SIZE + j] += inp * current_weights[weight_offsets[i]]; + } + } + current_weights++; 
+ output_elem_id--; + } + + output_elem_id += window_width - output_width; + } + current_weights += window_width * window_height * (input_feature_map_count - 1); + output_elem_id += output_width * (output_height + window_height); + } + + float * base_input = input_errors + ((entry_id * input_feature_map_count + input_feature_map_id) * input_height + y) * input_width + x; + int input_neuron_count_per_feature_map = input_height * input_width; + if (single_output_feature_map_group == 1) + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + { + if (i < input_feature_map_count - input_feature_map_id) + { + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + { + if (j > x - input_width) + *(base_input + input_neuron_count_per_feature_map * i - j) = sums[i * BLOCK_SIZE + j]; + } + } + } + } + else + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + { + if (i < input_feature_map_count - input_feature_map_id) + { + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + { + if (j > x - input_width) + atomicAdd(base_input + input_neuron_count_per_feature_map * i - j, sums[i * BLOCK_SIZE + j]); + } + } + } + } + } +} + +template +__global__ void convolution_2d_deriviative_tex_exact_upd_kernel_fermi( + float * __restrict input_errors, + const float * __restrict weights, + const xy_config * __restrict xy_config_list, + const feature_map_config * __restrict feature_map_config_list, + int output_width, + int output_height, + int input_width, + int input_height, + int window_height, + int input_feature_map_count, + int output_feature_map_count, + int output_feature_map_group_size, + int entry_count, + int xy_config_count, + int feature_map_config_count) +{ + int xy_config_id = blockIdx.x * blockDim.x + threadIdx.x; + int feature_map_config_id = blockIdx.y * blockDim.y + threadIdx.y; + int entry_id = blockIdx.z * blockDim.z + threadIdx.z; + + bool in_bounds = (entry_id < entry_count) && (xy_config_id < xy_config_count) && (feature_map_config_id < feature_map_config_count); + if (in_bounds) + { + xy_config xyc = xy_config_list[xy_config_id]; + int x = xyc.xy_pair & 0xFFFF; + int y = xyc.xy_pair >> 16; + + feature_map_config fmc = feature_map_config_list[feature_map_config_id]; + int base_output_feature_map_id = fmc.feature_map_pair & 0xFFFF; + int input_feature_map_id = fmc.feature_map_pair >> 16; + + int weight_count_per_input_feature_map = WINDOW_WIDTH * window_height; + int output_elem_id = ((entry_id * output_feature_map_count + base_output_feature_map_id) * output_height + y) * output_width + x; + const float * current_weights = weights + (int)(((entry_id * output_feature_map_count + base_output_feature_map_id) * input_feature_map_count + input_feature_map_id) * WINDOW_WIDTH * window_height); + int iteration_count = min(output_feature_map_group_size, output_feature_map_count - base_output_feature_map_id); + + float sums[FEATURE_MAP_BLOCK_SIZE * BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE * BLOCK_SIZE; ++i) + sums[i] = 0.0F; + + int weight_offsets[FEATURE_MAP_BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + weight_offsets[i] = (i < input_feature_map_count - input_feature_map_id) ? 
weight_count_per_input_feature_map * i : 0; + + int min_y_exclusive = y - output_height; + int max_y_inclusive = y; + int min_x_exclusive = x - output_width; + int max_x_inclusive = x; + + unsigned int mask = 0; + for(int i = BLOCK_SIZE + WINDOW_WIDTH - 2; i >= 0; --i) + mask = mask << 1 | (((i > min_x_exclusive) && (i <= max_x_inclusive)) ? 1 : 0); + + for(int i = 0; i < iteration_count; ++i) + { + for(int input_y = 0; input_y < window_height; ++input_y) + { + bool b_fit1 = (input_y > min_y_exclusive) && (input_y <= max_y_inclusive); + + float output_vals[BLOCK_SIZE + WINDOW_WIDTH - 1]; + #pragma unroll + for(int i = 0; i < BLOCK_SIZE + WINDOW_WIDTH - 1; ++i) + { + bool b_fit2 = b_fit1 && (((1 << i) & mask) != 0); + if (b_fit2) + output_vals[i] = tex1Dfetch(output_tex_ref, output_elem_id - i); + else + output_vals[i] = 0.0F; + } + + #pragma unroll + for(int input_x = 0; input_x < WINDOW_WIDTH; ++input_x) + { + float weight_list[FEATURE_MAP_BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + weight_list[i] = current_weights[weight_offsets[i]]; + + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + sums[i * BLOCK_SIZE + j] += output_vals[input_x + j] * weight_list[i]; + } + current_weights++; + } + output_elem_id -= output_width; + } + current_weights += WINDOW_WIDTH * window_height * (input_feature_map_count - 1); + output_elem_id += output_width * (output_height + window_height); + } + + float * base_input = input_errors + ((entry_id * input_feature_map_count + input_feature_map_id) * input_height + y) * input_width + x; + int input_neuron_count_per_feature_map = input_height * input_width; + if (single_output_feature_map_group == 1) + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + { + if (i < input_feature_map_count - input_feature_map_id) + { + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + { + if (j > x - input_width) + *(base_input + input_neuron_count_per_feature_map * i - j) = sums[i * BLOCK_SIZE + j]; + } + } + } + } + else + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + { + if (i < input_feature_map_count - input_feature_map_id) + { + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + { + if (j > x - input_width) + atomicAdd(base_input + input_neuron_count_per_feature_map * i - j, sums[i * BLOCK_SIZE + j]); + } + } + } + } + } +} + +template +__global__ void convolution_2d_update_weights_upd_kernel_fermi( + float * __restrict weights, + const float * __restrict output_errors, + const float * __restrict training_speed, + const output_y_weight_y_weight_x_config * __restrict output_y_weight_y_weight_x_config_list, + const feature_map_config * __restrict feature_map_config_list, + int output_width, + int output_height, + int input_width, + int input_height, + int window_width, + int window_height, + int input_feature_map_count, + int output_feature_map_count, + int output_y_group_count, + int texture_offset, + int entry_count, + bool different_input, + int output_y_weight_y_weight_x_config_count, + int feature_map_config_count) +{ + int output_y_weight_y_weight_x_config_id = blockIdx.x * blockDim.x + threadIdx.x; + int feature_map_config_id = blockIdx.y * blockDim.y + threadIdx.y; + int entry_id = blockIdx.z * blockDim.z + threadIdx.z; + + if ((output_y_weight_y_weight_x_config_id < output_y_weight_y_weight_x_config_count) && (feature_map_config_id < feature_map_config_count) && (entry_id < entry_count)) + { + 
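+		// Each thread accumulates weight-gradient contributions for WINDOW_WIDTH_LOCAL adjacent weight
+		// columns of up to FEATURE_MAP_BLOCK_SIZE output feature maps. The packed config stores weight_x
+		// in the low 8 bits, weight_y in the next 8 bits and the starting output row in the high 16 bits;
+		// threads with different starting rows process interleaved output rows (stride output_y_group_count)
+		// and merge their partial updates with atomicAdd when more than one row group is active.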
output_y_weight_y_weight_x_config yw = output_y_weight_y_weight_x_config_list[output_y_weight_y_weight_x_config_id]; + int weight_x = yw.output_y_window_y_window_x_pair & 0xFF; + int weight_y = (yw.output_y_window_y_window_x_pair & 0xFFFF) >> 8; + int output_y_start_id = yw.output_y_window_y_window_x_pair >> 16; + + feature_map_config fmc = feature_map_config_list[feature_map_config_id]; + int output_feature_map_id = fmc.feature_map_pair & 0xFFFF; + int input_feature_map_id = fmc.feature_map_pair >> 16; + + int output_neuron_count_per_feature_map = output_width * output_height; + const float * current_output_errors = output_errors + ((entry_id * output_feature_map_count + output_feature_map_id) * output_height + output_y_start_id) * output_width; + int input_elem_id = (((different_input ? entry_id * input_feature_map_count : 0) + input_feature_map_id) * input_height + weight_y + output_y_start_id) * input_width + texture_offset + weight_x; + + float sums[FEATURE_MAP_BLOCK_SIZE * WINDOW_WIDTH_LOCAL]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE * WINDOW_WIDTH_LOCAL; ++i) + sums[i] = 0.0F; + + int output_offsets[FEATURE_MAP_BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + output_offsets[i] = (i < output_feature_map_count - output_feature_map_id) ? output_neuron_count_per_feature_map * i : 0; + + for(int output_y = output_y_start_id; output_y < output_height; output_y += output_y_group_count) + { + float input_buf[WINDOW_WIDTH_LOCAL]; + #pragma unroll + for(int i = 1; i < WINDOW_WIDTH_LOCAL; ++i) + { + input_buf[i] = tex1Dfetch(input_tex_ref, input_elem_id); + ++input_elem_id; + } + + for(int x = 0; x < output_width; ++x) + { + float output_error_list[FEATURE_MAP_BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + output_error_list[i] = current_output_errors[output_offsets[i]]; + + #pragma unroll + for(int i = 0; i < WINDOW_WIDTH_LOCAL - 1; ++i) + input_buf[i] = input_buf[i + 1]; + input_buf[WINDOW_WIDTH_LOCAL - 1] = tex1Dfetch(input_tex_ref, input_elem_id); + + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + #pragma unroll + for(int j = 0; j < WINDOW_WIDTH_LOCAL; ++j) + sums[i * WINDOW_WIDTH_LOCAL + j] += output_error_list[i] * input_buf[j]; + + current_output_errors++; + input_elem_id++; + } + + current_output_errors += output_width * (output_y_group_count - 1); + input_elem_id += input_width * (output_y_group_count - 1) + (window_width - WINDOW_WIDTH_LOCAL); + } + + int offset = (((entry_id * output_feature_map_count + output_feature_map_id) * input_feature_map_count + input_feature_map_id) * window_height + weight_y) * window_width + weight_x; + int weight_count_per_output_feature_map = input_feature_map_count * window_height * window_width; + float * cur_weights = weights + offset; + const float * cur_training_speed = training_speed + offset; + if (single_output_y_group) + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + { + if (i < output_feature_map_count - output_feature_map_id) + { + #pragma unroll + for(int j = 0; j < WINDOW_WIDTH_LOCAL; ++j) + if (j < window_width - weight_x) + cur_weights[i * weight_count_per_output_feature_map + j] += sums[i * WINDOW_WIDTH_LOCAL + j] * cur_training_speed[i * weight_count_per_output_feature_map + j]; + } + } + } + else + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + { + if (i < output_feature_map_count - output_feature_map_id) + { + #pragma unroll + for(int j = 0; j < WINDOW_WIDTH_LOCAL; ++j) + if (j < 
window_width - weight_x) + atomicAdd(cur_weights + i * weight_count_per_output_feature_map + j, sums[i * WINDOW_WIDTH_LOCAL + j] * cur_training_speed[i * weight_count_per_output_feature_map + j]); + } + } + } + } +} + +template +__global__ void convolution_2d_update_weights_exact_upd_kernel_fermi( + float * __restrict weights, + const float * __restrict output_errors, + const float * __restrict training_speed, + const output_y_weight_y_config * __restrict output_y_weight_y_config_list, + const feature_map_config * __restrict feature_map_config_list, + int output_width, + int output_height, + int input_width, + int input_height, + int window_height, + int input_feature_map_count, + int output_feature_map_count, + int output_y_group_count, + int texture_offset, + int entry_count, + bool different_input, + int output_y_weight_y_config_count, + int feature_map_config_count) +{ + int output_y_weight_y_config_id = blockIdx.x * blockDim.x + threadIdx.x; + int feature_map_config_id = blockIdx.y * blockDim.y + threadIdx.y; + int entry_id = blockIdx.z * blockDim.z + threadIdx.z; + + if ((output_y_weight_y_config_id < output_y_weight_y_config_count) && (feature_map_config_id < feature_map_config_count) && (entry_id < entry_count)) + { + output_y_weight_y_config yw = output_y_weight_y_config_list[output_y_weight_y_config_id]; + int weight_y = yw.output_y_window_y_pair & 0xFFFF; + int output_y_start_id = yw.output_y_window_y_pair >> 16; + + feature_map_config fmc = feature_map_config_list[feature_map_config_id]; + int output_feature_map_id = fmc.feature_map_pair & 0xFFFF; + int input_feature_map_id = fmc.feature_map_pair >> 16; + + int output_neuron_count_per_feature_map = output_width * output_height; + const float * current_output_errors = output_errors + ((entry_id * output_feature_map_count + output_feature_map_id) * output_height + output_y_start_id) * output_width; + int input_elem_id = (((different_input ? entry_id * input_feature_map_count : 0) + input_feature_map_id) * input_height + weight_y + output_y_start_id) * input_width + texture_offset; + + float sums[FEATURE_MAP_BLOCK_SIZE * WINDOW_WIDTH]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE * WINDOW_WIDTH; ++i) + sums[i] = 0.0F; + + int output_offsets[FEATURE_MAP_BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + output_offsets[i] = (i < output_feature_map_count - output_feature_map_id) ? 
output_neuron_count_per_feature_map * i : 0; + + for(int output_y = output_y_start_id; output_y < output_height; output_y += output_y_group_count) + { + float input_buf[WINDOW_WIDTH]; + #pragma unroll + for(int i = 1; i < WINDOW_WIDTH; ++i) + { + input_buf[i] = tex1Dfetch(input_tex_ref, input_elem_id); + ++input_elem_id; + } + + for(int x = 0; x < output_width; ++x) + { + float output_error_list[FEATURE_MAP_BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + output_error_list[i] = current_output_errors[output_offsets[i]]; + + #pragma unroll + for(int i = 0; i < WINDOW_WIDTH - 1; ++i) + input_buf[i] = input_buf[i + 1]; + input_buf[WINDOW_WIDTH - 1] = tex1Dfetch(input_tex_ref, input_elem_id); + + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + #pragma unroll + for(int j = 0; j < WINDOW_WIDTH; ++j) + sums[i * WINDOW_WIDTH + j] += output_error_list[i] * input_buf[j]; + + current_output_errors++; + input_elem_id++; + } + + current_output_errors += output_width * (output_y_group_count - 1); + input_elem_id += input_width * (output_y_group_count - 1); + } + + int offset = (((entry_id * output_feature_map_count + output_feature_map_id) * input_feature_map_count + input_feature_map_id) * window_height + weight_y) * WINDOW_WIDTH; + int weight_count_per_output_feature_map = input_feature_map_count * window_height * WINDOW_WIDTH; + float * cur_weights = weights + offset; + const float * cur_training_speed = training_speed + offset; + if (single_output_y_group) + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + { + if (i < output_feature_map_count - output_feature_map_id) + { + #pragma unroll + for(int j = 0; j < WINDOW_WIDTH; ++j) + cur_weights[i * weight_count_per_output_feature_map + j] += sums[i * WINDOW_WIDTH + j] * cur_training_speed[i * weight_count_per_output_feature_map + j]; + } + } + } + else + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + { + if (i < output_feature_map_count - output_feature_map_id) + { + #pragma unroll + for(int j = 0; j < WINDOW_WIDTH; ++j) + atomicAdd(cur_weights + i * weight_count_per_output_feature_map + j, sums[i * WINDOW_WIDTH + j] * cur_training_speed[i * weight_count_per_output_feature_map + j]); + } + } + } + } +} + +namespace nnforge +{ + namespace cuda + { + convolution_2d_layer_updater_cuda_fermi::convolution_2d_layer_updater_cuda_fermi() + { + input_tex_ref.addressMode[0] = cudaAddressModeBorder; + input_tex_ref.normalized = false; + output_tex_ref.addressMode[0] = cudaAddressModeBorder; + output_tex_ref.normalized = false; + input_tex_ref.addressMode[0] = cudaAddressModeBorder; + input_tex_ref.normalized = false; + } + + convolution_2d_layer_updater_cuda_fermi::~convolution_2d_layer_updater_cuda_fermi() + { + } + +#define MAX_BLOCK_SIZE 5 +#define MAX_WINDOW_WIDTH 10 + +#define launch_exact_kernel_const_const(window_width_const, block_size_const, single_input_feature_map_group) \ + convolution_2d_tex_exact_upd_kernel_fermi<<>>(*output_neurons_buffer, *data[0], *data[1], xy_config_list, feature_map_config_list, output_configuration_specific.dimension_sizes[0], output_configuration_specific.dimension_sizes[1], input_configuration_specific.dimension_sizes[0], input_configuration_specific.dimension_sizes[1], window_sizes[1], input_configuration_specific.feature_map_count, output_configuration_specific.feature_map_count, forward_input_feature_map_group_size, texture_offset, entry_count, different_input, xy_config_count, feature_map_config_count); + +#define 
launch_exact_kernel_const(window_width, block_size_const, single_input_feature_map_group) \ + switch (window_width) \ + { \ + case 1: \ + launch_exact_kernel_const_const(1, block_size_const, single_input_feature_map_group); \ + break; \ + case 2: \ + launch_exact_kernel_const_const(2, block_size_const, single_input_feature_map_group); \ + break; \ + case 3: \ + launch_exact_kernel_const_const(3, block_size_const, single_input_feature_map_group); \ + break; \ + case 4: \ + launch_exact_kernel_const_const(4, block_size_const, single_input_feature_map_group); \ + break; \ + case 5: \ + launch_exact_kernel_const_const(5, block_size_const, single_input_feature_map_group); \ + break; \ + case 6: \ + launch_exact_kernel_const_const(6, block_size_const, single_input_feature_map_group); \ + break; \ + case 7: \ + launch_exact_kernel_const_const(7, block_size_const, single_input_feature_map_group); \ + break; \ + case 8: \ + launch_exact_kernel_const_const(8, block_size_const, single_input_feature_map_group); \ + break; \ + case 9: \ + launch_exact_kernel_const_const(9, block_size_const, single_input_feature_map_group); \ + break; \ + case 10: \ + launch_exact_kernel_const_const(10, block_size_const, single_input_feature_map_group); \ + break; \ + }; + +#define launch_exact_kernel(window_width, block_size, single_input_feature_map_group) \ + switch (block_size) \ + { \ + case 1: \ + launch_exact_kernel_const(window_width, 1, single_input_feature_map_group); \ + break; \ + case 2: \ + launch_exact_kernel_const(window_width, 2, single_input_feature_map_group); \ + break; \ + case 3: \ + launch_exact_kernel_const(window_width, 3, single_input_feature_map_group); \ + break; \ + case 4: \ + launch_exact_kernel_const(window_width, 4, single_input_feature_map_group); \ + break; \ + case 5: \ + launch_exact_kernel_const(window_width, 5, single_input_feature_map_group); \ + break; \ + }; + +#define launch_kernel_const(block_size_const, single_input_feature_map_group) \ + convolution_2d_tex_upd_kernel_fermi<<>>(*output_neurons_buffer, *data[0], *data[1], xy_config_list, feature_map_config_list, output_configuration_specific.dimension_sizes[0], output_configuration_specific.dimension_sizes[1], input_configuration_specific.dimension_sizes[0], input_configuration_specific.dimension_sizes[1], window_sizes[0], window_sizes[1], input_configuration_specific.feature_map_count, output_configuration_specific.feature_map_count, forward_input_feature_map_group_size, texture_offset, entry_count, different_input, xy_config_count, feature_map_config_count); + +#define launch_kernel(block_size, single_input_feature_map_group) \ + switch (block_size) \ + { \ + case 1: \ + launch_kernel_const(1, single_input_feature_map_group); \ + break; \ + case 2: \ + launch_kernel_const(2, single_input_feature_map_group); \ + break; \ + case 3: \ + launch_kernel_const(3, single_input_feature_map_group); \ + break; \ + case 4: \ + launch_kernel_const(4, single_input_feature_map_group); \ + break; \ + case 5: \ + launch_kernel_const(5, single_input_feature_map_group); \ + break; \ + }; + +#define launch_backprop_exact_kernel_const_const(window_width_const, block_size_const, single_output_feature_map_group) \ + convolution_2d_deriviative_tex_exact_upd_kernel_fermi<<>>(*input_errors_buffer, *data[0], xy_config_list, feature_map_config_list, output_configuration_specific.dimension_sizes[0], output_configuration_specific.dimension_sizes[1], input_configuration_specific.dimension_sizes[0], input_configuration_specific.dimension_sizes[1], 
window_sizes[1], input_configuration_specific.feature_map_count, output_configuration_specific.feature_map_count, backward_output_feature_map_group_size, entry_count, xy_config_count, feature_map_config_count); + +#define launch_backprop_exact_kernel_const(window_width, block_size_const, single_output_feature_map_group) \ + switch (window_width) \ + { \ + case 1: \ + launch_backprop_exact_kernel_const_const(1, block_size_const, single_output_feature_map_group); \ + break; \ + case 2: \ + launch_backprop_exact_kernel_const_const(2, block_size_const, single_output_feature_map_group); \ + break; \ + case 3: \ + launch_backprop_exact_kernel_const_const(3, block_size_const, single_output_feature_map_group); \ + break; \ + case 4: \ + launch_backprop_exact_kernel_const_const(4, block_size_const, single_output_feature_map_group); \ + break; \ + case 5: \ + launch_backprop_exact_kernel_const_const(5, block_size_const, single_output_feature_map_group); \ + break; \ + case 6: \ + launch_backprop_exact_kernel_const_const(6, block_size_const, single_output_feature_map_group); \ + break; \ + case 7: \ + launch_backprop_exact_kernel_const_const(7, block_size_const, single_output_feature_map_group); \ + break; \ + case 8: \ + launch_backprop_exact_kernel_const_const(8, block_size_const, single_output_feature_map_group); \ + break; \ + case 9: \ + launch_backprop_exact_kernel_const_const(9, block_size_const, single_output_feature_map_group); \ + break; \ + case 10: \ + launch_backprop_exact_kernel_const_const(10, block_size_const, single_output_feature_map_group); \ + break; \ + }; + +#define launch_backprop_exact_kernel(window_width, block_size, single_output_feature_map_group) \ + switch (block_size) \ + { \ + case 1: \ + launch_backprop_exact_kernel_const(window_width, 1, single_output_feature_map_group); \ + break; \ + case 2: \ + launch_backprop_exact_kernel_const(window_width, 2, single_output_feature_map_group); \ + break; \ + case 3: \ + launch_backprop_exact_kernel_const(window_width, 3, single_output_feature_map_group); \ + break; \ + case 4: \ + launch_backprop_exact_kernel_const(window_width, 4, single_output_feature_map_group); \ + break; \ + case 5: \ + launch_backprop_exact_kernel_const(window_width, 5, single_output_feature_map_group); \ + break; \ + }; + +#define launch_backprop_kernel_const(block_size_const, single_output_feature_map_group) \ + convolution_2d_deriviative_tex_upd_kernel_fermi<<>>(*input_errors_buffer, *data[0], xy_config_list, feature_map_config_list, output_configuration_specific.dimension_sizes[0], output_configuration_specific.dimension_sizes[1], input_configuration_specific.dimension_sizes[0], input_configuration_specific.dimension_sizes[1], window_sizes[0], window_sizes[1], input_configuration_specific.feature_map_count, output_configuration_specific.feature_map_count, backward_output_feature_map_group_size, entry_count, xy_config_count, feature_map_config_count); + +#define launch_backprop_kernel(block_size, single_output_feature_map_group) \ + switch (block_size) \ + { \ + case 1: \ + launch_backprop_kernel_const(1, single_output_feature_map_group); \ + break; \ + case 2: \ + launch_backprop_kernel_const(2, single_output_feature_map_group); \ + break; \ + case 3: \ + launch_backprop_kernel_const(3, single_output_feature_map_group); \ + break; \ + case 4: \ + launch_backprop_kernel_const(4, single_output_feature_map_group); \ + break; \ + case 5: \ + launch_backprop_kernel_const(5, single_output_feature_map_group); \ + break; \ + }; + +#define 
launch_update_weights_exact_kernel_const(window_width_const, single_output_y_group_const) \ + convolution_2d_update_weights_exact_upd_kernel_fermi<<>>(*data[0], *output_errors_buffer, *training_speed[0], output_y_weight_y_config_list, feature_map_config_list, output_configuration_specific.dimension_sizes[0], output_configuration_specific.dimension_sizes[1], input_configuration_specific.dimension_sizes[0], input_configuration_specific.dimension_sizes[1], window_sizes[1], input_configuration_specific.feature_map_count, output_configuration_specific.feature_map_count, updater_output_y_group_count, texture_offset, entry_count, different_input, output_y_weight_y_config_count, feature_map_config_count); + +#define launch_update_weights_exact_kernel(window_width, single_output_y_group_const) \ + switch (window_width) \ + { \ + case 1: \ + launch_update_weights_exact_kernel_const(1, single_output_y_group_const); \ + break; \ + case 2: \ + launch_update_weights_exact_kernel_const(2, single_output_y_group_const); \ + break; \ + case 3: \ + launch_update_weights_exact_kernel_const(3, single_output_y_group_const); \ + break; \ + case 4: \ + launch_update_weights_exact_kernel_const(4, single_output_y_group_const); \ + break; \ + case 5: \ + launch_update_weights_exact_kernel_const(5, single_output_y_group_const); \ + break; \ + case 6: \ + launch_update_weights_exact_kernel_const(6, single_output_y_group_const); \ + break; \ + case 7: \ + launch_update_weights_exact_kernel_const(7, single_output_y_group_const); \ + break; \ + case 8: \ + launch_update_weights_exact_kernel_const(8, single_output_y_group_const); \ + break; \ + case 9: \ + launch_update_weights_exact_kernel_const(9, single_output_y_group_const); \ + break; \ + case 10: \ + launch_update_weights_exact_kernel_const(10, single_output_y_group_const); \ + break; \ + }; + +#define launch_update_weights_kernel_const(single_output_y_group_const) \ + convolution_2d_update_weights_upd_kernel_fermi<<>>(*data[0], *output_errors_buffer, *training_speed[0], output_y_weight_y_weight_x_config_list, feature_map_config_list, output_configuration_specific.dimension_sizes[0], output_configuration_specific.dimension_sizes[1], input_configuration_specific.dimension_sizes[0], input_configuration_specific.dimension_sizes[1], window_sizes[0], window_sizes[1], input_configuration_specific.feature_map_count, output_configuration_specific.feature_map_count, updater_output_y_group_count, texture_offset, entry_count, different_input, output_y_weight_y_weight_x_config_count, feature_map_config_count); + + void convolution_2d_layer_updater_cuda_fermi::enqueue_test( + unsigned int offset_input_entry_id, + cudaStream_t stream_id, + const std::vector& schema_data, + const std::vector& data, + const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer, + cuda_linear_buffer_device_smart_ptr output_neurons_buffer, + const std::vector& additional_buffers, + std::vector& dynamic_memobjects, + unsigned int entry_count) + { + cudaChannelFormatDesc desc = cudaCreateChannelDesc(); + size_t texture_offset; + cuda_safe_call(cudaBindTexture(&texture_offset, input_tex_ref, (const float *)(*input_neurons_buffer) + (offset_input_entry_id * input_elem_count_per_entry), desc, input_elem_count_per_entry * sizeof(float) * (different_input ? 
entry_count : 1))); + texture_offset /= sizeof(float); + + int xy_config_count = forward_x_block_count * output_configuration_specific.dimension_sizes[1]; + const xy_config * xy_config_list = static_cast((const void *)*additional_buffers[0]); + + int feature_map_config_count = forward_input_feature_map_group_count * forward_output_feature_map_block_count; + const feature_map_config * feature_map_config_list = static_cast((const void *)*additional_buffers[1]); + + if (forward_input_feature_map_group_count > 1) + cuda_util::set_with_value( + *cuda_config, + *output_neurons_buffer, + 0.0F, + output_elem_count_per_entry * entry_count, + stream_id); + + std::pair kernel_dims = cuda_util::get_grid_and_threadblock_sizes_sequential_access( + *cuda_config, + xy_config_count, + feature_map_config_count, + entry_count); + + if (window_sizes[0] <= MAX_WINDOW_WIDTH) + { + if (forward_input_feature_map_group_count == 1) + { + launch_exact_kernel(window_sizes[0], forward_x_block_size, true); + } + else + { + launch_exact_kernel(window_sizes[0], forward_x_block_size, false); + } + } + else + { + if (forward_input_feature_map_group_count == 1) + { + launch_kernel(forward_x_block_size, true); + } + else + { + launch_kernel(forward_x_block_size, false); + } + } + } + + void convolution_2d_layer_updater_cuda_fermi::enqueue_backprop( + cudaStream_t stream_id, + const std::vector& schema_data, + const std::vector& data, + const_cuda_linear_buffer_device_smart_ptr output_neurons_buffer, + const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer, + cuda_linear_buffer_device_smart_ptr output_errors_buffer, + cuda_linear_buffer_device_smart_ptr input_errors_buffer, + const std::vector& additional_buffers, + std::vector& dynamic_memobjects, + unsigned int entry_count) + { + if (!different_input) + throw neural_network_exception("convolution_2d_layer_updater_cuda_fermi is not able to backprop to the same input"); + + if (!backprop_required) + throw neural_network_exception("convolution_2d_layer_updater_cuda_fermi is not configured to do backprop but requested to"); + + cudaChannelFormatDesc desc = cudaCreateChannelDesc(); + cuda_safe_call(cudaBindTexture(0, output_tex_ref, *output_errors_buffer, desc, output_elem_count_per_entry * entry_count * sizeof(float))); + + int xy_config_count = backward_x_block_count * input_configuration_specific.dimension_sizes[1]; + const xy_config * xy_config_list = static_cast((const void *)*additional_buffers[4]); + + int feature_map_config_count = backward_output_feature_map_group_count * backward_input_feature_map_block_count; + const feature_map_config * feature_map_config_list = static_cast((const void *)*additional_buffers[5]); + + if (backward_output_feature_map_group_count > 1) + cuda_util::set_with_value( + *cuda_config, + *input_errors_buffer, + 0.0F, + input_elem_count_per_entry * entry_count, + stream_id); + + std::pair kernel_dims = cuda_util::get_grid_and_threadblock_sizes_sequential_access( + *cuda_config, + xy_config_count, + feature_map_config_count, + entry_count); + + if (window_sizes[0] <= MAX_WINDOW_WIDTH) + { + if (backward_output_feature_map_group_count == 1) + { + launch_backprop_exact_kernel(window_sizes[0], backward_x_block_size, true); + } + else + { + launch_backprop_exact_kernel(window_sizes[0], backward_x_block_size, false); + } + } + else + { + if (backward_output_feature_map_group_count == 1) + { + launch_backprop_kernel(backward_x_block_size, true); + } + else + { + launch_backprop_kernel(backward_x_block_size, false); + } + } + } + + void 
convolution_2d_layer_updater_cuda_fermi::enqueue_update_weights( + unsigned int offset_input_entry_id, + cudaStream_t stream_id, + const std::vector& data, + const std::vector& schema_data, + const std::vector& training_speed, + cuda_linear_buffer_device_smart_ptr output_errors_buffer, + const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer, + const std::vector& additional_buffers, + std::vector& dynamic_memobjects, + unsigned int entry_count) + { + // Update biases + { + int threadblock_size = get_threadblock_size_biases(output_elem_count_per_feature_map); + dim3 grid_size(1, output_configuration_specific.feature_map_count, entry_count); + dim3 block_size(threadblock_size, 1, 1); + int smem_size = threadblock_size * sizeof(float); + int min_iteration_count = output_elem_count_per_feature_map / threadblock_size; + + convolution_2d_update_biases_upd_kernel_fermi<<>>( + *data[1], + *output_errors_buffer, + *training_speed[1], + output_configuration_specific.feature_map_count, + output_elem_count_per_feature_map, + min_iteration_count); + } + + cudaChannelFormatDesc desc = cudaCreateChannelDesc(); + size_t texture_offset; + cuda_safe_call(cudaBindTexture(&texture_offset, input_tex_ref, (const float *)(*input_neurons_buffer) + (offset_input_entry_id * input_elem_count_per_entry), desc, input_elem_count_per_entry * sizeof(float) * (different_input ? entry_count : 1))); + texture_offset /= sizeof(float); + + int feature_map_config_count = updater_output_feature_map_block_count * input_configuration_specific.feature_map_count; + const feature_map_config * feature_map_config_list = static_cast((const void *)*additional_buffers[3]); + + // Update weights + { + if (updater_window_x_block_count == 1) + { + int output_y_weight_y_config_count = updater_output_y_group_count * window_sizes[1]; + const output_y_weight_y_config * output_y_weight_y_config_list = static_cast((const void *)*additional_buffers[2]); + + std::pair kernel_dims = cuda_util::get_grid_and_threadblock_sizes_sequential_access( + *cuda_config, + output_y_weight_y_config_count, + feature_map_config_count, + entry_count); + + if (updater_output_y_group_count == 1) + { + launch_update_weights_exact_kernel(window_sizes[0], true); + } + else + { + launch_update_weights_exact_kernel(window_sizes[0], false); + } + } + else + { + int output_y_weight_y_weight_x_config_count = updater_output_y_group_count * window_sizes[1] * updater_window_x_block_count; + const output_y_weight_y_weight_x_config * output_y_weight_y_weight_x_config_list = static_cast((const void *)*additional_buffers[2]); + + std::pair kernel_dims = cuda_util::get_grid_and_threadblock_sizes_sequential_access( + *cuda_config, + output_y_weight_y_weight_x_config_count, + feature_map_config_count, + entry_count); + + if (updater_output_y_group_count == 1) + { + launch_update_weights_kernel_const(true); + } + else + { + launch_update_weights_kernel_const(false); + } + } + } + } + + int convolution_2d_layer_updater_cuda_fermi::get_block_size(int width) + { + int block_count = (width + MAX_BLOCK_SIZE - 1) / MAX_BLOCK_SIZE; + int block_size = (width + block_count - 1) / block_count; + return block_size; + } + + void convolution_2d_layer_updater_cuda_fermi::updater_configured() + { + std::tr1::shared_ptr layer_derived = std::tr1::dynamic_pointer_cast(layer_schema); + + for(std::vector::const_iterator it = layer_derived->window_sizes.begin(); it != layer_derived->window_sizes.end(); ++it) + window_sizes.push_back(static_cast(*it)); + + forward_x_block_size = 
get_block_size(output_configuration_specific.dimension_sizes[0]); + forward_x_block_count = (output_configuration_specific.dimension_sizes[0] + forward_x_block_size - 1) / forward_x_block_size; + forward_output_feature_map_block_count = (output_configuration_specific.feature_map_count + FEATURE_MAP_BLOCK_SIZE - 1) / FEATURE_MAP_BLOCK_SIZE; + + updater_output_feature_map_block_count = (output_configuration_specific.feature_map_count + FEATURE_MAP_BLOCK_SIZE - 1) / FEATURE_MAP_BLOCK_SIZE; + updater_window_x_block_count = (window_sizes[0] <= MAX_WINDOW_WIDTH) ? 1 : (window_sizes[0] + WINDOW_WIDTH_LOCAL - 1) / WINDOW_WIDTH_LOCAL; + + if (backprop_required) + { + backward_x_block_size = get_block_size(input_configuration_specific.dimension_sizes[0]); + backward_x_block_count = (input_configuration_specific.dimension_sizes[0] + backward_x_block_size - 1) / backward_x_block_size; + backward_input_feature_map_block_count = (input_configuration_specific.feature_map_count + FEATURE_MAP_BLOCK_SIZE - 1) / FEATURE_MAP_BLOCK_SIZE; + } + } + + bool convolution_2d_layer_updater_cuda_fermi::is_in_place_backprop() const + { + return false; + } + + std::vector convolution_2d_layer_updater_cuda_fermi::get_linear_addressing_through_texture_per_entry() const + { + std::vector res; + + res.push_back(input_elem_count_per_entry); + res.push_back(output_elem_count_per_entry); + + return res; + } + + int convolution_2d_layer_updater_cuda_fermi::get_threadblock_size_biases(int output_neuron_count) + { + int threadblock_size; + + if (output_neuron_count < 128) + { + threadblock_size = (output_neuron_count + 32 - 1) / 32 * 32; + } + else + { + int threadblock_count = (output_neuron_count + 128 - 1) / 128; + threadblock_size = (output_neuron_count + threadblock_count - 1) / threadblock_count; + threadblock_size = (threadblock_size + 32 - 1) / 32 * 32; + } + + return threadblock_size; + } + + std::vector convolution_2d_layer_updater_cuda_fermi::get_sizes_of_additional_buffers_fixed() const + { + std::vector res; + + res.push_back(sizeof(xy_config) * forward_x_block_count * output_configuration_specific.dimension_sizes[1]); + res.push_back(sizeof(feature_map_config) * input_configuration_specific.feature_map_count * forward_output_feature_map_block_count); + + res.push_back(sizeof(output_y_weight_y_config) * window_sizes[1] * output_configuration_specific.dimension_sizes[1] * updater_window_x_block_count); + res.push_back(sizeof(feature_map_config) * input_configuration_specific.feature_map_count * updater_output_feature_map_block_count); + + if (backprop_required) + { + res.push_back(sizeof(xy_config) * backward_x_block_count * input_configuration_specific.dimension_sizes[1]); + res.push_back(sizeof(feature_map_config) * output_configuration_specific.feature_map_count * backward_input_feature_map_block_count); + } + + return res; + } + + void convolution_2d_layer_updater_cuda_fermi::fill_additional_buffers(const std::vector& additional_buffers) const + { + { + std::vector task_list; + for(int y = 0; y < output_configuration_specific.dimension_sizes[1]; ++y) + for(int x = 0; x < forward_x_block_count; ++x) + task_list.push_back(xy_config(y, x * forward_x_block_size)); + + cuda_safe_call(cudaMemcpy(*additional_buffers[0], &(*task_list.begin()), sizeof(xy_config) * task_list.size(), cudaMemcpyHostToDevice)); + } + + { + std::vector task_list; + for(int input_feature_map_group_id = 0; input_feature_map_group_id < forward_input_feature_map_group_count; ++input_feature_map_group_id) + for(int output_feature_map_id = 0; 
output_feature_map_id < forward_output_feature_map_block_count; ++output_feature_map_id) + task_list.push_back(feature_map_config(input_feature_map_group_id * forward_input_feature_map_group_size, output_feature_map_id * FEATURE_MAP_BLOCK_SIZE)); + + cuda_safe_call(cudaMemcpy(*additional_buffers[1], &(*task_list.begin()), sizeof(feature_map_config) * task_list.size(), cudaMemcpyHostToDevice)); + } + + if (updater_window_x_block_count == 1) + { + std::vector task_list; + for(int output_y = 0; output_y < updater_output_y_group_count; ++output_y) + for(int weight_y = 0; weight_y < window_sizes[1]; ++weight_y) + task_list.push_back(output_y_weight_y_config(output_y, weight_y)); + + cuda_safe_call(cudaMemcpy(*additional_buffers[2], &(*task_list.begin()), sizeof(xy_config) * task_list.size(), cudaMemcpyHostToDevice)); + } + else + { + std::vector task_list; + for(int output_y = 0; output_y < updater_output_y_group_count; ++output_y) + for(int weight_y = 0; weight_y < window_sizes[1]; ++weight_y) + for(int weight_x = 0; weight_x < updater_window_x_block_count; ++weight_x) + task_list.push_back(output_y_weight_y_weight_x_config(output_y, weight_y, weight_x * FEATURE_MAP_BLOCK_SIZE)); + + cuda_safe_call(cudaMemcpy(*additional_buffers[2], &(*task_list.begin()), sizeof(xy_config) * task_list.size(), cudaMemcpyHostToDevice)); + } + + { + std::vector > pair_list; + cuda_util::fill_tiling_pattern(input_configuration_specific.feature_map_count, updater_output_feature_map_block_count, pair_list); + + std::vector task_list; + for(std::vector >::const_iterator it = pair_list.begin(); it != pair_list.end(); ++it) + task_list.push_back(feature_map_config(it->first, it->second * FEATURE_MAP_BLOCK_SIZE)); + + cuda_safe_call(cudaMemcpy(*additional_buffers[3], &(*task_list.begin()), sizeof(feature_map_config) * task_list.size(), cudaMemcpyHostToDevice)); + } + + if (backprop_required) + { + { + std::vector task_list; + for(int y = 0; y < input_configuration_specific.dimension_sizes[1]; ++y) + for(int x = 0; x < backward_x_block_count; ++x) + task_list.push_back(xy_config(y, x * backward_x_block_size + (backward_x_block_size - 1))); + + cuda_safe_call(cudaMemcpy(*additional_buffers[4], &(*task_list.begin()), sizeof(xy_config) * task_list.size(), cudaMemcpyHostToDevice)); + } + + { + std::vector task_list; + for(int output_feature_map_group_id = 0; output_feature_map_group_id < backward_output_feature_map_group_count; ++output_feature_map_group_id) + for(int input_feature_map_id = 0; input_feature_map_id < backward_input_feature_map_block_count; ++input_feature_map_id) + task_list.push_back(feature_map_config(input_feature_map_id * FEATURE_MAP_BLOCK_SIZE, output_feature_map_group_id * backward_output_feature_map_group_size)); + + cuda_safe_call(cudaMemcpy(*additional_buffers[5], &(*task_list.begin()), sizeof(feature_map_config) * task_list.size(), cudaMemcpyHostToDevice)); + } + } + } + + void convolution_2d_layer_updater_cuda_fermi::set_max_entry_count(unsigned int max_entry_count) + { + forward_input_feature_map_group_count = cuda_util::get_group_count( + *cuda_config, + forward_x_block_count * output_configuration_specific.dimension_sizes[1] * forward_output_feature_map_block_count * max_entry_count, + input_configuration_specific.feature_map_count); + forward_input_feature_map_group_size = (input_configuration_specific.feature_map_count + forward_input_feature_map_group_count - 1) / forward_input_feature_map_group_count; + + updater_output_y_group_count = cuda_util::get_group_count( + *cuda_config, + 
updater_output_feature_map_block_count * input_configuration_specific.feature_map_count * window_sizes[1] * max_entry_count * updater_window_x_block_count, + output_configuration_specific.dimension_sizes[1]); + updater_output_y_group_size = (output_configuration_specific.dimension_sizes[1] + updater_output_y_group_count - 1) / updater_output_y_group_count; + + if (backprop_required) + { + backward_output_feature_map_group_count = cuda_util::get_group_count( + *cuda_config, + backward_x_block_count * input_configuration_specific.dimension_sizes[1] * backward_input_feature_map_block_count * max_entry_count, + output_configuration_specific.feature_map_count); + backward_output_feature_map_group_size = (output_configuration_specific.feature_map_count + backward_output_feature_map_group_count - 1) / backward_output_feature_map_group_count; + } + } + } +} diff --git a/nnforge/cuda/convolution_2d_layer_updater_cuda.h b/nnforge/cuda/convolution_2d_layer_updater_cuda_fermi.h similarity index 67% rename from nnforge/cuda/convolution_2d_layer_updater_cuda.h rename to nnforge/cuda/convolution_2d_layer_updater_cuda_fermi.h index 0af38d4..86a7589 100644 --- a/nnforge/cuda/convolution_2d_layer_updater_cuda.h +++ b/nnforge/cuda/convolution_2d_layer_updater_cuda_fermi.h @@ -22,12 +22,12 @@ namespace nnforge { namespace cuda { - class convolution_2d_layer_updater_cuda : public layer_updater_cuda + class convolution_2d_layer_updater_cuda_fermi : public layer_updater_cuda { public: - convolution_2d_layer_updater_cuda(); + convolution_2d_layer_updater_cuda_fermi(); - virtual ~convolution_2d_layer_updater_cuda(); + virtual ~convolution_2d_layer_updater_cuda_fermi(); virtual void enqueue_test( unsigned int offset_input_entry_id, @@ -37,6 +37,7 @@ namespace nnforge const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer, cuda_linear_buffer_device_smart_ptr output_neurons_buffer, const std::vector& additional_buffers, + std::vector& dynamic_memobjects, unsigned int entry_count); virtual void enqueue_backprop( @@ -48,6 +49,7 @@ namespace nnforge cuda_linear_buffer_device_smart_ptr output_errors_buffer, cuda_linear_buffer_device_smart_ptr input_errors_buffer, const std::vector& additional_buffers, + std::vector& dynamic_memobjects, unsigned int entry_count); virtual void enqueue_update_weights( @@ -59,6 +61,7 @@ namespace nnforge cuda_linear_buffer_device_smart_ptr output_errors_buffer, const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer, const std::vector& additional_buffers, + std::vector& dynamic_memobjects, unsigned int entry_count); protected: @@ -66,10 +69,32 @@ namespace nnforge virtual void updater_configured(); + virtual std::vector get_sizes_of_additional_buffers_fixed() const; + + virtual void set_max_entry_count(unsigned int max_entry_count); + virtual std::vector get_linear_addressing_through_texture_per_entry() const; + virtual void fill_additional_buffers(const std::vector& additional_buffers) const; + std::vector window_sizes; + int forward_x_block_size; + int forward_x_block_count; + int forward_input_feature_map_group_count; + int forward_input_feature_map_group_size; + int forward_output_feature_map_block_count; + + int backward_x_block_size; + int backward_x_block_count; + int backward_output_feature_map_group_count; + int backward_output_feature_map_group_size; + int backward_input_feature_map_block_count; + + int updater_output_y_group_count; + int updater_output_y_group_size; + int updater_output_feature_map_block_count; + int updater_window_x_block_count; private: static int 
get_block_size(int width); diff --git a/nnforge/cuda/convolution_2d_layer_updater_cuda_kepler.cu b/nnforge/cuda/convolution_2d_layer_updater_cuda_kepler.cu new file mode 100644 index 0000000..1f51744 --- /dev/null +++ b/nnforge/cuda/convolution_2d_layer_updater_cuda_kepler.cu @@ -0,0 +1,1501 @@ +/* + * Copyright 2011-2013 Maxim Milakov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "convolution_2d_layer_updater_cuda_kepler.h" + +#include + +#include + +#include "util_cuda.h" +#include "neural_network_cuda_exception.h" +#include "cuda_texture.h" +#include "../convolution_layer.h" + +texture output_tex_ref; + +#define FEATURE_MAP_BLOCK_SIZE 4 +#define WINDOW_WIDTH_LOCAL 4 + +struct __align__(4) xy_config +{ + xy_config(int y, int x) + { + this->xy_pair = (((unsigned int)y) << 16) | (unsigned int)x; + } + + unsigned int xy_pair; +}; + +struct __align__(4) feature_map_config +{ + feature_map_config(int input_feature_map_id, int output_feature_map_id) + { + this->feature_map_pair = (((unsigned int)input_feature_map_id) << 16) | (unsigned int)output_feature_map_id; + } + + unsigned int feature_map_pair; +}; + +struct __align__(4) output_y_weight_y_config +{ + output_y_weight_y_config(int output_y, int weight_y) + { + this->output_y_window_y_pair = (((unsigned int)output_y) << 16) | (unsigned int)weight_y; + } + + unsigned int output_y_window_y_pair; +}; + +struct __align__(4) output_y_weight_y_weight_x_config +{ + output_y_weight_y_weight_x_config(int output_y, int weight_y, int weight_x) + { + this->output_y_window_y_window_x_pair = (((unsigned int)output_y) << 16) | (((unsigned int)weight_y) << 8) | ((unsigned int)weight_x); + } + + unsigned int output_y_window_y_window_x_pair; +}; + +template +__global__ void convolution_2d_tex_upd_kernel_kepler( + float * __restrict output, + cudaTextureObject_t input_tex, + cudaTextureObject_t weights_tex, + const float * __restrict biases, + const xy_config * __restrict xy_config_list, + const feature_map_config * __restrict feature_map_config_list, + int output_width, + int output_height, + int input_width, + int input_height, + int window_width, + int window_height, + int input_feature_map_count, + int output_feature_map_count, + int input_feature_map_group_size, + int texture_offset, + int entry_count, + bool different_input, + int xy_config_count, + int feature_map_config_count) +{ + int xy_config_id = blockIdx.x * blockDim.x + threadIdx.x; + int feature_map_config_id = blockIdx.y * blockDim.y + threadIdx.y; + int entry_id = blockIdx.z * blockDim.z + threadIdx.z; + + bool in_bounds = (entry_id < entry_count) && (xy_config_id < xy_config_count) && (feature_map_config_id < feature_map_config_count); + if (in_bounds) + { + xy_config xyc = xy_config_list[xy_config_id]; + int x = xyc.xy_pair & 0xFFFF; + int y = xyc.xy_pair >> 16; + + feature_map_config fmc = feature_map_config_list[feature_map_config_id]; + int output_feature_map_id = fmc.feature_map_pair & 0xFFFF; + int base_input_feature_map_id = fmc.feature_map_pair >> 16; + + 
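+		// Forward pass (Kepler path): each thread computes BLOCK_SIZE horizontally adjacent outputs for up
+		// to FEATURE_MAP_BLOCK_SIZE consecutive output feature maps, accumulating over the input feature
+		// maps assigned to this group. Only the group starting at input feature map 0 seeds the sums with
+		// the biases; when the input feature maps are split across several groups, the partial results are
+		// combined with atomicAdd.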
int weight_count_per_output_feature_map = window_width * window_height * input_feature_map_count; + int input_elem_id = ((((different_input ? entry_id * input_feature_map_count : 0) + base_input_feature_map_id) * input_height) + y) * input_width + x + texture_offset; + int weights_offset = ((entry_id * output_feature_map_count + output_feature_map_id) * input_feature_map_count + base_input_feature_map_id) * window_width * window_height; + int iteration_count = min(input_feature_map_group_size, input_feature_map_count - base_input_feature_map_id); + + float initial_values[FEATURE_MAP_BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + initial_values[i] = 0.0F; + if (base_input_feature_map_id == 0) + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + if (i < output_feature_map_count - output_feature_map_id) + initial_values[i] = biases[entry_id * output_feature_map_count + output_feature_map_id + i]; + } + float sums[BLOCK_SIZE * FEATURE_MAP_BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + sums[i * BLOCK_SIZE + j] = initial_values[i]; + + for(int i = 0; i < iteration_count; ++i) + { + for(int input_y = 0; input_y < window_height; ++input_y) + { + #pragma unroll 4 + for(int input_x = 0; input_x < window_width; ++input_x) + { + float weight_list[FEATURE_MAP_BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + weight_list[i] = tex1Dfetch(weights_tex, weights_offset + weight_count_per_output_feature_map * i); + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + { + float inp = tex1Dfetch(input_tex, input_elem_id + j); + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + sums[i * BLOCK_SIZE + j] += inp * weight_list[i]; + } + weights_offset++; + input_elem_id++; + } + input_elem_id += input_width - window_width; + } + input_elem_id += input_width * (input_height - window_height); + } + + float * base_output = output + ((entry_id * output_feature_map_count + output_feature_map_id) * output_height + y) * output_width + x; + int output_neuron_count_per_feature_map = output_height * output_width; + if (single_input_feature_map_group) + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + { + if (i < output_feature_map_count - output_feature_map_id) + { + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + { + if (j < output_width - x) + base_output[output_neuron_count_per_feature_map * i + j] = sums[i * BLOCK_SIZE + j]; + } + } + } + } + else + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + { + if (i < output_feature_map_count - output_feature_map_id) + { + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + { + if (j < output_width - x) + atomicAdd(base_output + output_neuron_count_per_feature_map * i + j, sums[i * BLOCK_SIZE + j]); + } + } + } + } + } +} + +template +__global__ void convolution_2d_tex_exact_upd_kernel_kepler( + float * __restrict output, + cudaTextureObject_t input_tex, + cudaTextureObject_t weights_tex, + const float * __restrict biases, + const xy_config * __restrict xy_config_list, + const feature_map_config * __restrict feature_map_config_list, + int output_width, + int output_height, + int input_width, + int input_height, + int window_height, + int input_feature_map_count, + int output_feature_map_count, + int input_feature_map_group_size, + int texture_offset, + int entry_count, + bool different_input, + int xy_config_count, + int 
feature_map_config_count) +{ + int xy_config_id = blockIdx.x * blockDim.x + threadIdx.x; + int feature_map_config_id = blockIdx.y * blockDim.y + threadIdx.y; + int entry_id = blockIdx.z * blockDim.z + threadIdx.z; + + bool in_bounds = (entry_id < entry_count) && (xy_config_id < xy_config_count) && (feature_map_config_id < feature_map_config_count); + if (in_bounds) + { + xy_config xyc = xy_config_list[xy_config_id]; + int x = xyc.xy_pair & 0xFFFF; + int y = xyc.xy_pair >> 16; + + feature_map_config fmc = feature_map_config_list[feature_map_config_id]; + int output_feature_map_id = fmc.feature_map_pair & 0xFFFF; + int base_input_feature_map_id = fmc.feature_map_pair >> 16; + + int weight_count_per_output_feature_map = WINDOW_WIDTH * window_height * input_feature_map_count; + int input_elem_id = ((((different_input ? entry_id * input_feature_map_count : 0) + base_input_feature_map_id) * input_height) + y) * input_width + x + texture_offset; + int weights_offset = ((entry_id * output_feature_map_count + output_feature_map_id) * input_feature_map_count + base_input_feature_map_id) * WINDOW_WIDTH * window_height; + int iteration_count = min(input_feature_map_group_size, input_feature_map_count - base_input_feature_map_id); + + float initial_values[FEATURE_MAP_BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + initial_values[i] = 0.0F; + if (base_input_feature_map_id == 0) + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + if (i < output_feature_map_count - output_feature_map_id) + initial_values[i] = biases[entry_id * output_feature_map_count + output_feature_map_id + i]; + } + float sums[BLOCK_SIZE * FEATURE_MAP_BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + sums[i * BLOCK_SIZE + j] = initial_values[i]; + + for(int i = 0; i < iteration_count; ++i) + { + for(int input_y = 0; input_y < window_height; ++input_y) + { + #pragma unroll + for(int input_x = 0; input_x < WINDOW_WIDTH; ++input_x) + { + float weight_list[FEATURE_MAP_BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + weight_list[i] = tex1Dfetch(weights_tex, weights_offset + weight_count_per_output_feature_map * i); + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + { + float inp = tex1Dfetch(input_tex, input_elem_id + j); + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + sums[i * BLOCK_SIZE + j] += inp * weight_list[i]; + } + weights_offset++; + input_elem_id++; + } + input_elem_id += input_width - WINDOW_WIDTH; + } + input_elem_id += input_width * (input_height - window_height); + } + + float * base_output = output + ((entry_id * output_feature_map_count + output_feature_map_id) * output_height + y) * output_width + x; + int output_neuron_count_per_feature_map = output_height * output_width; + if (single_input_feature_map_group) + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + { + if (i < output_feature_map_count - output_feature_map_id) + { + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + { + if (j < output_width - x) + base_output[output_neuron_count_per_feature_map * i + j] = sums[i * BLOCK_SIZE + j]; + } + } + } + } + else + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + { + if (i < output_feature_map_count - output_feature_map_id) + { + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + { + if (j < output_width - x) + atomicAdd(base_output + output_neuron_count_per_feature_map * 
i + j, sums[i * BLOCK_SIZE + j]); + } + } + } + } + } +} + +extern __shared__ float arr_sh[]; +__global__ void convolution_2d_update_biases_upd_kernel_kepler( + float * __restrict biases, + const float * __restrict output_errors, + const float * __restrict training_speed, + int output_feature_map_count, + int output_elem_count_per_feature_map, + int min_iteration_count) +{ + int thread_id = threadIdx.x; + int output_feature_map_id = blockIdx.y; + int entry_id = blockIdx.z; + int threadblock_size = blockDim.x; + + float sum = 0.0F; + const float * current_error = output_errors + (entry_id * output_feature_map_count + output_feature_map_id) * output_elem_count_per_feature_map; + int current_output_neuron_id = thread_id; + for(int i = 0; i < min_iteration_count; ++i) + { + sum += current_error[current_output_neuron_id]; + current_output_neuron_id += threadblock_size; + } + if (current_output_neuron_id < output_elem_count_per_feature_map) + sum += current_error[current_output_neuron_id]; + + volatile float * arr = arr_sh; + arr[thread_id] = sum; + int lane_id = thread_id & 31; + #pragma unroll + for(int tx = 16; tx > 0; tx >>= 1) + { + if (lane_id < tx) + arr[thread_id] += arr[thread_id + tx]; + } + sum = arr[thread_id]; + + if (lane_id == 0) + { + int offset = entry_id * output_feature_map_count + output_feature_map_id; + float current_training_speed_val = training_speed[offset]; + atomicAdd(biases + offset, sum * current_training_speed_val); + } +} + +template +__global__ void convolution_2d_deriviative_tex_upd_kernel_kepler( + float * __restrict input_errors, + cudaTextureObject_t output_tex, + cudaTextureObject_t weights_tex, + const xy_config * __restrict xy_config_list, + const feature_map_config * __restrict feature_map_config_list, + int output_width, + int output_height, + int input_width, + int input_height, + int window_width, + int window_height, + int input_feature_map_count, + int output_feature_map_count, + int output_feature_map_group_size, + int entry_count, + int xy_config_count, + int feature_map_config_count) +{ + int xy_config_id = blockIdx.x * blockDim.x + threadIdx.x; + int feature_map_config_id = blockIdx.y * blockDim.y + threadIdx.y; + int entry_id = blockIdx.z * blockDim.z + threadIdx.z; + + bool in_bounds = (entry_id < entry_count) && (xy_config_id < xy_config_count) && (feature_map_config_id < feature_map_config_count); + if (in_bounds) + { + xy_config xyc = xy_config_list[xy_config_id]; + int x = xyc.xy_pair & 0xFFFF; + int y = xyc.xy_pair >> 16; + + feature_map_config fmc = feature_map_config_list[feature_map_config_id]; + int base_output_feature_map_id = fmc.feature_map_pair & 0xFFFF; + int input_feature_map_id = fmc.feature_map_pair >> 16; + + int weight_count_per_input_feature_map = window_width * window_height; + int output_elem_id = ((entry_id * output_feature_map_count + base_output_feature_map_id) * output_height + y) * output_width + x; + int weights_offset = ((entry_id * output_feature_map_count + base_output_feature_map_id) * input_feature_map_count + input_feature_map_id) * window_width * window_height; + int iteration_count = min(output_feature_map_group_size, output_feature_map_count - base_output_feature_map_id); + + float sums[FEATURE_MAP_BLOCK_SIZE * BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE * BLOCK_SIZE; ++i) + sums[i] = 0.0F; + + int min_y_exclusive = y - output_height; + int max_y_inclusive = y; + int min_x_exclusive = x - output_width; + int max_x_inclusive = x; + + for(int i = 0; i < iteration_count; ++i) + { + 
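+			// One iteration per output feature map in this group. The min/max bounds computed above clip
+			// window taps whose corresponding output-error element would fall outside the error map:
+			// backprop is a full correlation, so positions near the border only see a partial window.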
for(int input_y = 0; input_y < window_height; ++input_y) + { + bool b_fit1 = (input_y > min_y_exclusive) && (input_y <= max_y_inclusive); + + int input_x = 0; + #pragma unroll 1 + for(; input_x < (window_width - (WINDOW_WIDTH_LOCAL - 1)); input_x += WINDOW_WIDTH_LOCAL) + { + float output_vals[BLOCK_SIZE + WINDOW_WIDTH_LOCAL - 1]; + #pragma unroll + for(int i = 0; i < BLOCK_SIZE + WINDOW_WIDTH_LOCAL - 1; ++i) + { + bool b_fit2 = b_fit1 && (i > min_x_exclusive) && (i <= max_x_inclusive);; + if (b_fit2) + output_vals[i] = tex1Dfetch(output_tex, output_elem_id - i); + else + output_vals[i] = 0.0F; + } + output_elem_id -= WINDOW_WIDTH_LOCAL; + + #pragma unroll + for(int input_x_local = 0; input_x_local < WINDOW_WIDTH_LOCAL; ++input_x_local) + { + float weight_list[FEATURE_MAP_BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + weight_list[i] = tex1Dfetch(weights_tex, weights_offset + weight_count_per_input_feature_map * i); + + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + sums[i * BLOCK_SIZE + j] += output_vals[input_x_local + j] * weight_list[i]; + } + weights_offset++; + } + } + #pragma unroll 1 + for(; input_x < window_width; ++input_x) + { + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + { + bool b_fit2 = b_fit1 && (input_x + j > min_x_exclusive) && (input_x + j <= max_x_inclusive); + if (b_fit2) + { + float inp = tex1Dfetch(output_tex, output_elem_id - j); + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + sums[i * BLOCK_SIZE + j] += inp * tex1Dfetch(weights_tex, weights_offset + weight_count_per_input_feature_map * i); + } + } + weights_offset++; + output_elem_id--; + } + + output_elem_id += window_width - output_width; + } + weights_offset += window_width * window_height * (input_feature_map_count - 1); + output_elem_id += output_width * (output_height + window_height); + } + + float * base_input = input_errors + ((entry_id * input_feature_map_count + input_feature_map_id) * input_height + y) * input_width + x; + int input_neuron_count_per_feature_map = input_height * input_width; + if (single_output_feature_map_group == 1) + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + { + if (i < input_feature_map_count - input_feature_map_id) + { + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + { + if (j > x - input_width) + *(base_input + input_neuron_count_per_feature_map * i - j) = sums[i * BLOCK_SIZE + j]; + } + } + } + } + else + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + { + if (i < input_feature_map_count - input_feature_map_id) + { + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + { + if (j > x - input_width) + atomicAdd(base_input + input_neuron_count_per_feature_map * i - j, sums[i * BLOCK_SIZE + j]); + } + } + } + } + } +} + +template +__global__ void convolution_2d_deriviative_tex_exact_upd_kernel_kepler( + float * __restrict input_errors, + cudaTextureObject_t output_tex, + cudaTextureObject_t weights_tex, + const xy_config * __restrict xy_config_list, + const feature_map_config * __restrict feature_map_config_list, + int output_width, + int output_height, + int input_width, + int input_height, + int window_height, + int input_feature_map_count, + int output_feature_map_count, + int output_feature_map_group_size, + int entry_count, + int xy_config_count, + int feature_map_config_count) +{ + int xy_config_id = blockIdx.x * blockDim.x + threadIdx.x; + int feature_map_config_id = 
blockIdx.y * blockDim.y + threadIdx.y; + int entry_id = blockIdx.z * blockDim.z + threadIdx.z; + + bool in_bounds = (entry_id < entry_count) && (xy_config_id < xy_config_count) && (feature_map_config_id < feature_map_config_count); + if (in_bounds) + { + xy_config xyc = xy_config_list[xy_config_id]; + int x = xyc.xy_pair & 0xFFFF; + int y = xyc.xy_pair >> 16; + + feature_map_config fmc = feature_map_config_list[feature_map_config_id]; + int base_output_feature_map_id = fmc.feature_map_pair & 0xFFFF; + int input_feature_map_id = fmc.feature_map_pair >> 16; + + int weight_count_per_input_feature_map = WINDOW_WIDTH * window_height; + int output_elem_id = ((entry_id * output_feature_map_count + base_output_feature_map_id) * output_height + y) * output_width + x; + int weights_offset = ((entry_id * output_feature_map_count + base_output_feature_map_id) * input_feature_map_count + input_feature_map_id) * WINDOW_WIDTH * window_height; + int iteration_count = min(output_feature_map_group_size, output_feature_map_count - base_output_feature_map_id); + + float sums[FEATURE_MAP_BLOCK_SIZE * BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE * BLOCK_SIZE; ++i) + sums[i] = 0.0F; + + int min_y_exclusive = y - output_height; + int max_y_inclusive = y; + int min_x_exclusive = x - output_width; + int max_x_inclusive = x; + + unsigned int mask = 0; + for(int i = BLOCK_SIZE + WINDOW_WIDTH - 2; i >= 0; --i) + mask = mask << 1 | (((i > min_x_exclusive) && (i <= max_x_inclusive)) ? 1 : 0); + + for(int i = 0; i < iteration_count; ++i) + { + for(int input_y = 0; input_y < window_height; ++input_y) + { + bool b_fit1 = (input_y > min_y_exclusive) && (input_y <= max_y_inclusive); + + float output_vals[BLOCK_SIZE + WINDOW_WIDTH - 1]; + #pragma unroll + for(int i = 0; i < BLOCK_SIZE + WINDOW_WIDTH - 1; ++i) + { + bool b_fit2 = b_fit1 && (((1 << i) & mask) != 0); + if (b_fit2) + output_vals[i] = tex1Dfetch(output_tex, output_elem_id - i); + else + output_vals[i] = 0.0F; + } + + #pragma unroll + for(int input_x = 0; input_x < WINDOW_WIDTH; ++input_x) + { + float weight_list[FEATURE_MAP_BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + weight_list[i] = tex1Dfetch(weights_tex, weights_offset + weight_count_per_input_feature_map * i); + + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + sums[i * BLOCK_SIZE + j] += output_vals[input_x + j] * weight_list[i]; + } + weights_offset++; + } + output_elem_id -= output_width; + } + weights_offset += WINDOW_WIDTH * window_height * (input_feature_map_count - 1); + output_elem_id += output_width * (output_height + window_height); + } + + float * base_input = input_errors + ((entry_id * input_feature_map_count + input_feature_map_id) * input_height + y) * input_width + x; + int input_neuron_count_per_feature_map = input_height * input_width; + if (single_output_feature_map_group == 1) + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + { + if (i < input_feature_map_count - input_feature_map_id) + { + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + { + if (j > x - input_width) + *(base_input + input_neuron_count_per_feature_map * i - j) = sums[i * BLOCK_SIZE + j]; + } + } + } + } + else + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + { + if (i < input_feature_map_count - input_feature_map_id) + { + #pragma unroll + for(int j = 0; j < BLOCK_SIZE; ++j) + { + if (j > x - input_width) + atomicAdd(base_input + 
input_neuron_count_per_feature_map * i - j, sums[i * BLOCK_SIZE + j]); + } + } + } + } + } +} + +template +__global__ void convolution_2d_update_weights_upd_kernel_kepler( + float * __restrict weights, + cudaTextureObject_t input_tex, + const float * __restrict training_speed, + const output_y_weight_y_weight_x_config * __restrict output_y_weight_y_weight_x_config_list, + const feature_map_config * __restrict feature_map_config_list, + int output_width, + int output_height, + int input_width, + int input_height, + int window_width, + int window_height, + int input_feature_map_count, + int output_feature_map_count, + int output_y_group_count, + int texture_offset, + int entry_count, + bool different_input, + int output_y_weight_y_weight_x_config_count, + int feature_map_config_count) +{ + int output_y_weight_y_weight_x_config_id = blockIdx.x * blockDim.x + threadIdx.x; + int feature_map_config_id = blockIdx.y * blockDim.y + threadIdx.y; + int entry_id = blockIdx.z * blockDim.z + threadIdx.z; + + if ((output_y_weight_y_weight_x_config_id < output_y_weight_y_weight_x_config_count) && (feature_map_config_id < feature_map_config_count) && (entry_id < entry_count)) + { + output_y_weight_y_weight_x_config yw = output_y_weight_y_weight_x_config_list[output_y_weight_y_weight_x_config_id]; + int weight_x = yw.output_y_window_y_window_x_pair & 0xFF; + int weight_y = (yw.output_y_window_y_window_x_pair & 0xFFFF) >> 8; + int output_y_start_id = yw.output_y_window_y_window_x_pair >> 16; + + feature_map_config fmc = feature_map_config_list[feature_map_config_id]; + int output_feature_map_id = fmc.feature_map_pair & 0xFFFF; + int input_feature_map_id = fmc.feature_map_pair >> 16; + + int output_neuron_count_per_feature_map = output_width * output_height; + int output_elem_id = ((entry_id * output_feature_map_count + output_feature_map_id) * output_height + output_y_start_id) * output_width; + int input_elem_id = (((different_input ? 
entry_id * input_feature_map_count : 0) + input_feature_map_id) * input_height + weight_y + output_y_start_id) * input_width + texture_offset + weight_x; + + float sums[FEATURE_MAP_BLOCK_SIZE * WINDOW_WIDTH_LOCAL]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE * WINDOW_WIDTH_LOCAL; ++i) + sums[i] = 0.0F; + + for(int output_y = output_y_start_id; output_y < output_height; output_y += output_y_group_count) + { + float input_buf[WINDOW_WIDTH_LOCAL]; + #pragma unroll + for(int i = 1; i < WINDOW_WIDTH_LOCAL; ++i) + { + input_buf[i] = tex1Dfetch(input_tex, input_elem_id); + ++input_elem_id; + } + + for(int x = 0; x < output_width; ++x) + { + float output_error_list[FEATURE_MAP_BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + output_error_list[i] = tex1Dfetch(output_tex_ref, output_elem_id + output_neuron_count_per_feature_map * i); + + #pragma unroll + for(int i = 0; i < WINDOW_WIDTH_LOCAL - 1; ++i) + input_buf[i] = input_buf[i + 1]; + input_buf[WINDOW_WIDTH_LOCAL - 1] = tex1Dfetch(input_tex, input_elem_id); + + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + #pragma unroll + for(int j = 0; j < WINDOW_WIDTH_LOCAL; ++j) + sums[i * WINDOW_WIDTH_LOCAL + j] += output_error_list[i] * input_buf[j]; + + output_elem_id++; + input_elem_id++; + } + + output_elem_id += output_width * (output_y_group_count - 1); + input_elem_id += input_width * (output_y_group_count - 1) + (window_width - WINDOW_WIDTH_LOCAL); + } + + int offset = (((entry_id * output_feature_map_count + output_feature_map_id) * input_feature_map_count + input_feature_map_id) * window_height + weight_y) * window_width + weight_x; + int weight_count_per_output_feature_map = input_feature_map_count * window_height * window_width; + float * cur_weights = weights + offset; + const float * cur_training_speed = training_speed + offset; + if (single_output_y_group) + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + { + if (i < output_feature_map_count - output_feature_map_id) + { + #pragma unroll + for(int j = 0; j < WINDOW_WIDTH_LOCAL; ++j) + if (j < window_width - weight_x) + cur_weights[i * weight_count_per_output_feature_map + j] += sums[i * WINDOW_WIDTH_LOCAL + j] * cur_training_speed[i * weight_count_per_output_feature_map + j]; + } + } + } + else + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + { + if (i < output_feature_map_count - output_feature_map_id) + { + #pragma unroll + for(int j = 0; j < WINDOW_WIDTH_LOCAL; ++j) + if (j < window_width - weight_x) + atomicAdd(cur_weights + i * weight_count_per_output_feature_map + j, sums[i * WINDOW_WIDTH_LOCAL + j] * cur_training_speed[i * weight_count_per_output_feature_map + j]); + } + } + } + } +} + +template +__global__ void convolution_2d_update_weights_exact_upd_kernel_kepler( + float * __restrict weights, + cudaTextureObject_t input_tex, + const float * __restrict training_speed, + const output_y_weight_y_config * __restrict output_y_weight_y_config_list, + const feature_map_config * __restrict feature_map_config_list, + int output_width, + int output_height, + int input_width, + int input_height, + int window_height, + int input_feature_map_count, + int output_feature_map_count, + int output_y_group_count, + int texture_offset, + int entry_count, + bool different_input, + int output_y_weight_y_config_count, + int feature_map_config_count) +{ + int output_y_weight_y_config_id = blockIdx.x * blockDim.x + threadIdx.x; + int feature_map_config_id = blockIdx.y * blockDim.y + 
threadIdx.y; + int entry_id = blockIdx.z * blockDim.z + threadIdx.z; + + if ((output_y_weight_y_config_id < output_y_weight_y_config_count) && (feature_map_config_id < feature_map_config_count) && (entry_id < entry_count)) + { + output_y_weight_y_config yw = output_y_weight_y_config_list[output_y_weight_y_config_id]; + int weight_y = yw.output_y_window_y_pair & 0xFFFF; + int output_y_start_id = yw.output_y_window_y_pair >> 16; + + feature_map_config fmc = feature_map_config_list[feature_map_config_id]; + int output_feature_map_id = fmc.feature_map_pair & 0xFFFF; + int input_feature_map_id = fmc.feature_map_pair >> 16; + + int output_neuron_count_per_feature_map = output_width * output_height; + int output_elem_id = ((entry_id * output_feature_map_count + output_feature_map_id) * output_height + output_y_start_id) * output_width; + int input_elem_id = (((different_input ? entry_id * input_feature_map_count : 0) + input_feature_map_id) * input_height + weight_y + output_y_start_id) * input_width + texture_offset; + + float sums[FEATURE_MAP_BLOCK_SIZE * WINDOW_WIDTH]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE * WINDOW_WIDTH; ++i) + sums[i] = 0.0F; + + for(int output_y = output_y_start_id; output_y < output_height; output_y += output_y_group_count) + { + float input_buf[WINDOW_WIDTH]; + #pragma unroll + for(int i = 1; i < WINDOW_WIDTH; ++i) + { + input_buf[i] = tex1Dfetch(input_tex, input_elem_id); + ++input_elem_id; + } + + for(int x = 0; x < output_width; ++x) + { + float output_error_list[FEATURE_MAP_BLOCK_SIZE]; + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + output_error_list[i] = tex1Dfetch(output_tex_ref, output_elem_id + output_neuron_count_per_feature_map * i); + + #pragma unroll + for(int i = 0; i < WINDOW_WIDTH - 1; ++i) + input_buf[i] = input_buf[i + 1]; + input_buf[WINDOW_WIDTH - 1] = tex1Dfetch(input_tex, input_elem_id); + + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + #pragma unroll + for(int j = 0; j < WINDOW_WIDTH; ++j) + sums[i * WINDOW_WIDTH + j] += output_error_list[i] * input_buf[j]; + + output_elem_id++; + input_elem_id++; + } + + output_elem_id += output_width * (output_y_group_count - 1); + input_elem_id += input_width * (output_y_group_count - 1); + } + + int offset = (((entry_id * output_feature_map_count + output_feature_map_id) * input_feature_map_count + input_feature_map_id) * window_height + weight_y) * WINDOW_WIDTH; + int weight_count_per_output_feature_map = input_feature_map_count * window_height * WINDOW_WIDTH; + float * cur_weights = weights + offset; + const float * cur_training_speed = training_speed + offset; + if (single_output_y_group) + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + { + if (i < output_feature_map_count - output_feature_map_id) + { + #pragma unroll + for(int j = 0; j < WINDOW_WIDTH; ++j) + cur_weights[i * weight_count_per_output_feature_map + j] += sums[i * WINDOW_WIDTH + j] * cur_training_speed[i * weight_count_per_output_feature_map + j]; + } + } + } + else + { + #pragma unroll + for(int i = 0; i < FEATURE_MAP_BLOCK_SIZE; ++i) + { + if (i < output_feature_map_count - output_feature_map_id) + { + #pragma unroll + for(int j = 0; j < WINDOW_WIDTH; ++j) + atomicAdd(cur_weights + i * weight_count_per_output_feature_map + j, sums[i * WINDOW_WIDTH + j] * cur_training_speed[i * weight_count_per_output_feature_map + j]); + } + } + } + } +} + +namespace nnforge +{ + namespace cuda + { + 
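+ // Note on resource handling in this updater: the input, weights and output-error
+ // textures are created lazily as cuda_texture objects and cached across calls in
+ // dynamic_memobjects (three slots, see get_dynamic_memobject_count()), while
+ // output_tex_ref used by the weight-update kernels remains a legacy texture
+ // reference bound explicitly in enqueue_update_weights().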
convolution_2d_layer_updater_cuda_kepler::convolution_2d_layer_updater_cuda_kepler() + { + output_tex_ref.addressMode[0] = cudaAddressModeBorder; + output_tex_ref.normalized = false; + } + + convolution_2d_layer_updater_cuda_kepler::~convolution_2d_layer_updater_cuda_kepler() + { + } + +#define MAX_BLOCK_SIZE 5 +#define MAX_WINDOW_WIDTH 10 + +#define launch_exact_kernel_const_const(window_width_const, block_size_const, single_input_feature_map_group) \ + convolution_2d_tex_exact_upd_kernel_kepler<<>>(*output_neurons_buffer, input_tex, weights_tex, *data[1], xy_config_list, feature_map_config_list, output_configuration_specific.dimension_sizes[0], output_configuration_specific.dimension_sizes[1], input_configuration_specific.dimension_sizes[0], input_configuration_specific.dimension_sizes[1], window_sizes[1], input_configuration_specific.feature_map_count, output_configuration_specific.feature_map_count, forward_input_feature_map_group_size, texture_offset, entry_count, different_input, xy_config_count, feature_map_config_count); + +#define launch_exact_kernel_const(window_width, block_size_const, single_input_feature_map_group) \ + switch (window_width) \ + { \ + case 1: \ + launch_exact_kernel_const_const(1, block_size_const, single_input_feature_map_group); \ + break; \ + case 2: \ + launch_exact_kernel_const_const(2, block_size_const, single_input_feature_map_group); \ + break; \ + case 3: \ + launch_exact_kernel_const_const(3, block_size_const, single_input_feature_map_group); \ + break; \ + case 4: \ + launch_exact_kernel_const_const(4, block_size_const, single_input_feature_map_group); \ + break; \ + case 5: \ + launch_exact_kernel_const_const(5, block_size_const, single_input_feature_map_group); \ + break; \ + case 6: \ + launch_exact_kernel_const_const(6, block_size_const, single_input_feature_map_group); \ + break; \ + case 7: \ + launch_exact_kernel_const_const(7, block_size_const, single_input_feature_map_group); \ + break; \ + case 8: \ + launch_exact_kernel_const_const(8, block_size_const, single_input_feature_map_group); \ + break; \ + case 9: \ + launch_exact_kernel_const_const(9, block_size_const, single_input_feature_map_group); \ + break; \ + case 10: \ + launch_exact_kernel_const_const(10, block_size_const, single_input_feature_map_group); \ + break; \ + }; + +#define launch_exact_kernel(window_width, block_size, single_input_feature_map_group) \ + switch (block_size) \ + { \ + case 1: \ + launch_exact_kernel_const(window_width, 1, single_input_feature_map_group); \ + break; \ + case 2: \ + launch_exact_kernel_const(window_width, 2, single_input_feature_map_group); \ + break; \ + case 3: \ + launch_exact_kernel_const(window_width, 3, single_input_feature_map_group); \ + break; \ + case 4: \ + launch_exact_kernel_const(window_width, 4, single_input_feature_map_group); \ + break; \ + case 5: \ + launch_exact_kernel_const(window_width, 5, single_input_feature_map_group); \ + break; \ + }; + +#define launch_kernel_const(block_size_const, single_input_feature_map_group) \ + convolution_2d_tex_upd_kernel_kepler<<>>(*output_neurons_buffer, input_tex, weights_tex, *data[1], xy_config_list, feature_map_config_list, output_configuration_specific.dimension_sizes[0], output_configuration_specific.dimension_sizes[1], input_configuration_specific.dimension_sizes[0], input_configuration_specific.dimension_sizes[1], window_sizes[0], window_sizes[1], input_configuration_specific.feature_map_count, output_configuration_specific.feature_map_count, forward_input_feature_map_group_size, 
texture_offset, entry_count, different_input, xy_config_count, feature_map_config_count); + +#define launch_kernel(block_size, single_input_feature_map_group) \ + switch (block_size) \ + { \ + case 1: \ + launch_kernel_const(1, single_input_feature_map_group); \ + break; \ + case 2: \ + launch_kernel_const(2, single_input_feature_map_group); \ + break; \ + case 3: \ + launch_kernel_const(3, single_input_feature_map_group); \ + break; \ + case 4: \ + launch_kernel_const(4, single_input_feature_map_group); \ + break; \ + case 5: \ + launch_kernel_const(5, single_input_feature_map_group); \ + break; \ + }; + +#define launch_backprop_exact_kernel_const_const(window_width_const, block_size_const, single_output_feature_map_group) \ + convolution_2d_deriviative_tex_exact_upd_kernel_kepler<<>>(*input_errors_buffer, output_tex, weights_tex, xy_config_list, feature_map_config_list, output_configuration_specific.dimension_sizes[0], output_configuration_specific.dimension_sizes[1], input_configuration_specific.dimension_sizes[0], input_configuration_specific.dimension_sizes[1], window_sizes[1], input_configuration_specific.feature_map_count, output_configuration_specific.feature_map_count, backward_output_feature_map_group_size, entry_count, xy_config_count, feature_map_config_count); + +#define launch_backprop_exact_kernel_const(window_width, block_size_const, single_output_feature_map_group) \ + switch (window_width) \ + { \ + case 1: \ + launch_backprop_exact_kernel_const_const(1, block_size_const, single_output_feature_map_group); \ + break; \ + case 2: \ + launch_backprop_exact_kernel_const_const(2, block_size_const, single_output_feature_map_group); \ + break; \ + case 3: \ + launch_backprop_exact_kernel_const_const(3, block_size_const, single_output_feature_map_group); \ + break; \ + case 4: \ + launch_backprop_exact_kernel_const_const(4, block_size_const, single_output_feature_map_group); \ + break; \ + case 5: \ + launch_backprop_exact_kernel_const_const(5, block_size_const, single_output_feature_map_group); \ + break; \ + case 6: \ + launch_backprop_exact_kernel_const_const(6, block_size_const, single_output_feature_map_group); \ + break; \ + case 7: \ + launch_backprop_exact_kernel_const_const(7, block_size_const, single_output_feature_map_group); \ + break; \ + case 8: \ + launch_backprop_exact_kernel_const_const(8, block_size_const, single_output_feature_map_group); \ + break; \ + case 9: \ + launch_backprop_exact_kernel_const_const(9, block_size_const, single_output_feature_map_group); \ + break; \ + case 10: \ + launch_backprop_exact_kernel_const_const(10, block_size_const, single_output_feature_map_group); \ + break; \ + }; + +#define launch_backprop_exact_kernel(window_width, block_size, single_output_feature_map_group) \ + switch (block_size) \ + { \ + case 1: \ + launch_backprop_exact_kernel_const(window_width, 1, single_output_feature_map_group); \ + break; \ + case 2: \ + launch_backprop_exact_kernel_const(window_width, 2, single_output_feature_map_group); \ + break; \ + case 3: \ + launch_backprop_exact_kernel_const(window_width, 3, single_output_feature_map_group); \ + break; \ + case 4: \ + launch_backprop_exact_kernel_const(window_width, 4, single_output_feature_map_group); \ + break; \ + case 5: \ + launch_backprop_exact_kernel_const(window_width, 5, single_output_feature_map_group); \ + break; \ + }; + +#define launch_backprop_kernel_const(block_size_const, single_output_feature_map_group) \ + convolution_2d_deriviative_tex_upd_kernel_kepler<<>>(*input_errors_buffer, 
output_tex, weights_tex, xy_config_list, feature_map_config_list, output_configuration_specific.dimension_sizes[0], output_configuration_specific.dimension_sizes[1], input_configuration_specific.dimension_sizes[0], input_configuration_specific.dimension_sizes[1], window_sizes[0], window_sizes[1], input_configuration_specific.feature_map_count, output_configuration_specific.feature_map_count, backward_output_feature_map_group_size, entry_count, xy_config_count, feature_map_config_count); + +#define launch_backprop_kernel(block_size, single_output_feature_map_group) \ + switch (block_size) \ + { \ + case 1: \ + launch_backprop_kernel_const(1, single_output_feature_map_group); \ + break; \ + case 2: \ + launch_backprop_kernel_const(2, single_output_feature_map_group); \ + break; \ + case 3: \ + launch_backprop_kernel_const(3, single_output_feature_map_group); \ + break; \ + case 4: \ + launch_backprop_kernel_const(4, single_output_feature_map_group); \ + break; \ + case 5: \ + launch_backprop_kernel_const(5, single_output_feature_map_group); \ + break; \ + }; + +#define launch_update_weights_exact_kernel_const(window_width_const, single_output_y_group_const) \ + convolution_2d_update_weights_exact_upd_kernel_kepler<<>>(*data[0], input_tex, *training_speed[0], output_y_weight_y_config_list, feature_map_config_list, output_configuration_specific.dimension_sizes[0], output_configuration_specific.dimension_sizes[1], input_configuration_specific.dimension_sizes[0], input_configuration_specific.dimension_sizes[1], window_sizes[1], input_configuration_specific.feature_map_count, output_configuration_specific.feature_map_count, updater_output_y_group_count, texture_offset, entry_count, different_input, output_y_weight_y_config_count, feature_map_config_count); + +#define launch_update_weights_exact_kernel(window_width, single_output_y_group_const) \ + switch (window_width) \ + { \ + case 1: \ + launch_update_weights_exact_kernel_const(1, single_output_y_group_const); \ + break; \ + case 2: \ + launch_update_weights_exact_kernel_const(2, single_output_y_group_const); \ + break; \ + case 3: \ + launch_update_weights_exact_kernel_const(3, single_output_y_group_const); \ + break; \ + case 4: \ + launch_update_weights_exact_kernel_const(4, single_output_y_group_const); \ + break; \ + case 5: \ + launch_update_weights_exact_kernel_const(5, single_output_y_group_const); \ + break; \ + case 6: \ + launch_update_weights_exact_kernel_const(6, single_output_y_group_const); \ + break; \ + case 7: \ + launch_update_weights_exact_kernel_const(7, single_output_y_group_const); \ + break; \ + case 8: \ + launch_update_weights_exact_kernel_const(8, single_output_y_group_const); \ + break; \ + case 9: \ + launch_update_weights_exact_kernel_const(9, single_output_y_group_const); \ + break; \ + case 10: \ + launch_update_weights_exact_kernel_const(10, single_output_y_group_const); \ + break; \ + }; + +#define launch_update_weights_kernel_const(single_output_y_group_const) \ + convolution_2d_update_weights_upd_kernel_kepler<<>>(*data[0], input_tex, *training_speed[0], output_y_weight_y_weight_x_config_list, feature_map_config_list, output_configuration_specific.dimension_sizes[0], output_configuration_specific.dimension_sizes[1], input_configuration_specific.dimension_sizes[0], input_configuration_specific.dimension_sizes[1], window_sizes[0], window_sizes[1], input_configuration_specific.feature_map_count, output_configuration_specific.feature_map_count, updater_output_y_group_count, texture_offset, entry_count, 
different_input, output_y_weight_y_weight_x_config_count, feature_map_config_count); + + void convolution_2d_layer_updater_cuda_kepler::enqueue_test( + unsigned int offset_input_entry_id, + cudaStream_t stream_id, + const std::vector& schema_data, + const std::vector& data, + const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer, + cuda_linear_buffer_device_smart_ptr output_neurons_buffer, + const std::vector& additional_buffers, + std::vector& dynamic_memobjects, + unsigned int entry_count) + { + if (dynamic_memobjects[0] == 0) + dynamic_memobjects[0] = cuda_texture_smart_ptr(new cuda_texture(input_neurons_buffer)); + cuda_texture& input_tex = *(dynamic_cast(dynamic_memobjects[0].get())); + int texture_offset = offset_input_entry_id * input_elem_count_per_entry; + + if (dynamic_memobjects[1] == 0) + dynamic_memobjects[1] = cuda_texture_smart_ptr(new cuda_texture(data[0])); + cuda_texture& weights_tex = *(dynamic_cast(dynamic_memobjects[1].get())); + + int xy_config_count = forward_x_block_count * output_configuration_specific.dimension_sizes[1]; + const xy_config * xy_config_list = static_cast((const void *)*additional_buffers[0]); + + int feature_map_config_count = forward_input_feature_map_group_count * forward_output_feature_map_block_count; + const feature_map_config * feature_map_config_list = static_cast((const void *)*additional_buffers[1]); + + if (forward_input_feature_map_group_count > 1) + cuda_util::set_with_value( + *cuda_config, + *output_neurons_buffer, + 0.0F, + output_elem_count_per_entry * entry_count, + stream_id); + + std::pair kernel_dims = cuda_util::get_grid_and_threadblock_sizes_sequential_access( + *cuda_config, + xy_config_count, + feature_map_config_count, + entry_count); + + if (window_sizes[0] <= MAX_WINDOW_WIDTH) + { + if (forward_input_feature_map_group_count == 1) + { + launch_exact_kernel(window_sizes[0], forward_x_block_size, true); + } + else + { + launch_exact_kernel(window_sizes[0], forward_x_block_size, false); + } + } + else + { + if (forward_input_feature_map_group_count == 1) + { + launch_kernel(forward_x_block_size, true); + } + else + { + launch_kernel(forward_x_block_size, false); + } + } + } + + void convolution_2d_layer_updater_cuda_kepler::enqueue_backprop( + cudaStream_t stream_id, + const std::vector& schema_data, + const std::vector& data, + const_cuda_linear_buffer_device_smart_ptr output_neurons_buffer, + const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer, + cuda_linear_buffer_device_smart_ptr output_errors_buffer, + cuda_linear_buffer_device_smart_ptr input_errors_buffer, + const std::vector& additional_buffers, + std::vector& dynamic_memobjects, + unsigned int entry_count) + { + if (!different_input) + throw neural_network_exception("convolution_2d_layer_updater_cuda_kepler is not able to backprop to the same input"); + + if (!backprop_required) + throw neural_network_exception("convolution_2d_layer_updater_cuda_kepler is not configured to do backprop but requested to"); + + if (dynamic_memobjects[2] == 0) + dynamic_memobjects[2] = cuda_texture_smart_ptr(new cuda_texture(output_errors_buffer)); + cuda_texture& output_tex = *(dynamic_cast(dynamic_memobjects[2].get())); + + if (dynamic_memobjects[1] == 0) + dynamic_memobjects[1] = cuda_texture_smart_ptr(new cuda_texture(data[0])); + cuda_texture& weights_tex = *(dynamic_cast(dynamic_memobjects[1].get())); + + int xy_config_count = backward_x_block_count * input_configuration_specific.dimension_sizes[1]; + const xy_config * xy_config_list = static_cast((const void 
*)*additional_buffers[4]); + + int feature_map_config_count = backward_output_feature_map_group_count * backward_input_feature_map_block_count; + const feature_map_config * feature_map_config_list = static_cast((const void *)*additional_buffers[5]); + + if (backward_output_feature_map_group_count > 1) + cuda_util::set_with_value( + *cuda_config, + *input_errors_buffer, + 0.0F, + input_elem_count_per_entry * entry_count, + stream_id); + + std::pair kernel_dims = cuda_util::get_grid_and_threadblock_sizes_sequential_access( + *cuda_config, + xy_config_count, + feature_map_config_count, + entry_count); + + if (window_sizes[0] <= MAX_WINDOW_WIDTH) + { + if (backward_output_feature_map_group_count == 1) + { + launch_backprop_exact_kernel(window_sizes[0], backward_x_block_size, true); + } + else + { + launch_backprop_exact_kernel(window_sizes[0], backward_x_block_size, false); + } + } + else + { + if (backward_output_feature_map_group_count == 1) + { + launch_backprop_kernel(backward_x_block_size, true); + } + else + { + launch_backprop_kernel(backward_x_block_size, false); + } + } + } + + void convolution_2d_layer_updater_cuda_kepler::enqueue_update_weights( + unsigned int offset_input_entry_id, + cudaStream_t stream_id, + const std::vector& data, + const std::vector& schema_data, + const std::vector& training_speed, + cuda_linear_buffer_device_smart_ptr output_errors_buffer, + const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer, + const std::vector& additional_buffers, + std::vector& dynamic_memobjects, + unsigned int entry_count) + { + // Update biases + { + int threadblock_size = get_threadblock_size_biases(output_elem_count_per_feature_map); + dim3 grid_size(1, output_configuration_specific.feature_map_count, entry_count); + dim3 block_size(threadblock_size, 1, 1); + int smem_size = threadblock_size * sizeof(float); + int min_iteration_count = output_elem_count_per_feature_map / threadblock_size; + + convolution_2d_update_biases_upd_kernel_kepler<<>>( + *data[1], + *output_errors_buffer, + *training_speed[1], + output_configuration_specific.feature_map_count, + output_elem_count_per_feature_map, + min_iteration_count); + } + + cudaChannelFormatDesc desc = cudaCreateChannelDesc(); + cuda_safe_call(cudaBindTexture(0, output_tex_ref, *output_errors_buffer, desc, output_elem_count_per_entry * entry_count * sizeof(float))); + + if (dynamic_memobjects[0] == 0) + dynamic_memobjects[0] = cuda_texture_smart_ptr(new cuda_texture(input_neurons_buffer)); + cuda_texture& input_tex = *(dynamic_cast(dynamic_memobjects[0].get())); + int texture_offset = offset_input_entry_id * input_elem_count_per_entry; + + + int feature_map_config_count = updater_output_feature_map_block_count * input_configuration_specific.feature_map_count; + const feature_map_config * feature_map_config_list = static_cast((const void *)*additional_buffers[3]); + + // Update weights + { + if (updater_window_x_block_count == 1) + { + int output_y_weight_y_config_count = updater_output_y_group_count * window_sizes[1]; + const output_y_weight_y_config * output_y_weight_y_config_list = static_cast((const void *)*additional_buffers[2]); + + std::pair kernel_dims = cuda_util::get_grid_and_threadblock_sizes_sequential_access( + *cuda_config, + output_y_weight_y_config_count, + feature_map_config_count, + entry_count); + + if (updater_output_y_group_count == 1) + { + launch_update_weights_exact_kernel(window_sizes[0], true); + } + else + { + launch_update_weights_exact_kernel(window_sizes[0], false); + } + } + else + { + int 
output_y_weight_y_weight_x_config_count = updater_output_y_group_count * window_sizes[1] * updater_window_x_block_count; + const output_y_weight_y_weight_x_config * output_y_weight_y_weight_x_config_list = static_cast((const void *)*additional_buffers[2]); + + std::pair kernel_dims = cuda_util::get_grid_and_threadblock_sizes_sequential_access( + *cuda_config, + output_y_weight_y_weight_x_config_count, + feature_map_config_count, + entry_count); + + if (updater_output_y_group_count == 1) + { + launch_update_weights_kernel_const(true); + } + else + { + launch_update_weights_kernel_const(false); + } + } + } + } + + int convolution_2d_layer_updater_cuda_kepler::get_block_size(int width) + { + int block_count = (width + MAX_BLOCK_SIZE - 1) / MAX_BLOCK_SIZE; + int block_size = (width + block_count - 1) / block_count; + return block_size; + } + + void convolution_2d_layer_updater_cuda_kepler::updater_configured() + { + std::tr1::shared_ptr layer_derived = std::tr1::dynamic_pointer_cast(layer_schema); + + for(std::vector::const_iterator it = layer_derived->window_sizes.begin(); it != layer_derived->window_sizes.end(); ++it) + window_sizes.push_back(static_cast(*it)); + + forward_x_block_size = get_block_size(output_configuration_specific.dimension_sizes[0]); + forward_x_block_count = (output_configuration_specific.dimension_sizes[0] + forward_x_block_size - 1) / forward_x_block_size; + forward_output_feature_map_block_count = (output_configuration_specific.feature_map_count + FEATURE_MAP_BLOCK_SIZE - 1) / FEATURE_MAP_BLOCK_SIZE; + + updater_output_feature_map_block_count = (output_configuration_specific.feature_map_count + FEATURE_MAP_BLOCK_SIZE - 1) / FEATURE_MAP_BLOCK_SIZE; + updater_window_x_block_count = (window_sizes[0] <= MAX_WINDOW_WIDTH) ? 1 : (window_sizes[0] + WINDOW_WIDTH_LOCAL - 1) / WINDOW_WIDTH_LOCAL; + + if (backprop_required) + { + backward_x_block_size = get_block_size(input_configuration_specific.dimension_sizes[0]); + backward_x_block_count = (input_configuration_specific.dimension_sizes[0] + backward_x_block_size - 1) / backward_x_block_size; + backward_input_feature_map_block_count = (input_configuration_specific.feature_map_count + FEATURE_MAP_BLOCK_SIZE - 1) / FEATURE_MAP_BLOCK_SIZE; + } + } + + bool convolution_2d_layer_updater_cuda_kepler::is_in_place_backprop() const + { + return false; + } + + std::vector convolution_2d_layer_updater_cuda_kepler::get_linear_addressing_through_texture_per_entry() const + { + std::vector res; + + res.push_back(input_elem_count_per_entry); + res.push_back(output_elem_count_per_entry); + + return res; + } + + int convolution_2d_layer_updater_cuda_kepler::get_threadblock_size_biases(int output_neuron_count) + { + int threadblock_size; + + if (output_neuron_count < 128) + { + threadblock_size = (output_neuron_count + 32 - 1) / 32 * 32; + } + else + { + int threadblock_count = (output_neuron_count + 128 - 1) / 128; + threadblock_size = (output_neuron_count + threadblock_count - 1) / threadblock_count; + threadblock_size = (threadblock_size + 32 - 1) / 32 * 32; + } + + return threadblock_size; + } + + std::vector convolution_2d_layer_updater_cuda_kepler::get_sizes_of_additional_buffers_fixed() const + { + std::vector res; + + res.push_back(sizeof(xy_config) * forward_x_block_count * output_configuration_specific.dimension_sizes[1]); + res.push_back(sizeof(feature_map_config) * input_configuration_specific.feature_map_count * forward_output_feature_map_block_count); + + res.push_back(sizeof(output_y_weight_y_config) * window_sizes[1] * 
output_configuration_specific.dimension_sizes[1] * updater_window_x_block_count); + res.push_back(sizeof(feature_map_config) * input_configuration_specific.feature_map_count * updater_output_feature_map_block_count); + + if (backprop_required) + { + res.push_back(sizeof(xy_config) * backward_x_block_count * input_configuration_specific.dimension_sizes[1]); + res.push_back(sizeof(feature_map_config) * output_configuration_specific.feature_map_count * backward_input_feature_map_block_count); + } + + return res; + } + + void convolution_2d_layer_updater_cuda_kepler::fill_additional_buffers(const std::vector& additional_buffers) const + { + { + std::vector task_list; + for(int y = 0; y < output_configuration_specific.dimension_sizes[1]; ++y) + for(int x = 0; x < forward_x_block_count; ++x) + task_list.push_back(xy_config(y, x * forward_x_block_size)); + + cuda_safe_call(cudaMemcpy(*additional_buffers[0], &(*task_list.begin()), sizeof(xy_config) * task_list.size(), cudaMemcpyHostToDevice)); + } + + { + std::vector task_list; + for(int input_feature_map_group_id = 0; input_feature_map_group_id < forward_input_feature_map_group_count; ++input_feature_map_group_id) + for(int output_feature_map_id = 0; output_feature_map_id < forward_output_feature_map_block_count; ++output_feature_map_id) + task_list.push_back(feature_map_config(input_feature_map_group_id * forward_input_feature_map_group_size, output_feature_map_id * FEATURE_MAP_BLOCK_SIZE)); + + cuda_safe_call(cudaMemcpy(*additional_buffers[1], &(*task_list.begin()), sizeof(feature_map_config) * task_list.size(), cudaMemcpyHostToDevice)); + } + + if (updater_window_x_block_count == 1) + { + std::vector task_list; + for(int output_y = 0; output_y < updater_output_y_group_count; ++output_y) + for(int weight_y = 0; weight_y < window_sizes[1]; ++weight_y) + task_list.push_back(output_y_weight_y_config(output_y, weight_y)); + + cuda_safe_call(cudaMemcpy(*additional_buffers[2], &(*task_list.begin()), sizeof(xy_config) * task_list.size(), cudaMemcpyHostToDevice)); + } + else + { + std::vector task_list; + for(int output_y = 0; output_y < updater_output_y_group_count; ++output_y) + for(int weight_y = 0; weight_y < window_sizes[1]; ++weight_y) + for(int weight_x = 0; weight_x < updater_window_x_block_count; ++weight_x) + task_list.push_back(output_y_weight_y_weight_x_config(output_y, weight_y, weight_x * FEATURE_MAP_BLOCK_SIZE)); + + cuda_safe_call(cudaMemcpy(*additional_buffers[2], &(*task_list.begin()), sizeof(xy_config) * task_list.size(), cudaMemcpyHostToDevice)); + } + + { + std::vector > pair_list; + cuda_util::fill_tiling_pattern(input_configuration_specific.feature_map_count, updater_output_feature_map_block_count, pair_list); + + std::vector task_list; + for(std::vector >::const_iterator it = pair_list.begin(); it != pair_list.end(); ++it) + task_list.push_back(feature_map_config(it->first, it->second * FEATURE_MAP_BLOCK_SIZE)); + + cuda_safe_call(cudaMemcpy(*additional_buffers[3], &(*task_list.begin()), sizeof(feature_map_config) * task_list.size(), cudaMemcpyHostToDevice)); + } + + if (backprop_required) + { + { + std::vector task_list; + for(int y = 0; y < input_configuration_specific.dimension_sizes[1]; ++y) + for(int x = 0; x < backward_x_block_count; ++x) + task_list.push_back(xy_config(y, x * backward_x_block_size + (backward_x_block_size - 1))); + + cuda_safe_call(cudaMemcpy(*additional_buffers[4], &(*task_list.begin()), sizeof(xy_config) * task_list.size(), cudaMemcpyHostToDevice)); + } + + { + std::vector task_list; + for(int 
output_feature_map_group_id = 0; output_feature_map_group_id < backward_output_feature_map_group_count; ++output_feature_map_group_id) + for(int input_feature_map_id = 0; input_feature_map_id < backward_input_feature_map_block_count; ++input_feature_map_id) + task_list.push_back(feature_map_config(input_feature_map_id * FEATURE_MAP_BLOCK_SIZE, output_feature_map_group_id * backward_output_feature_map_group_size)); + + cuda_safe_call(cudaMemcpy(*additional_buffers[5], &(*task_list.begin()), sizeof(feature_map_config) * task_list.size(), cudaMemcpyHostToDevice)); + } + } + } + + void convolution_2d_layer_updater_cuda_kepler::set_max_entry_count(unsigned int max_entry_count) + { + forward_input_feature_map_group_count = cuda_util::get_group_count( + *cuda_config, + forward_x_block_count * output_configuration_specific.dimension_sizes[1] * forward_output_feature_map_block_count * max_entry_count, + input_configuration_specific.feature_map_count); + forward_input_feature_map_group_size = (input_configuration_specific.feature_map_count + forward_input_feature_map_group_count - 1) / forward_input_feature_map_group_count; + + updater_output_y_group_count = cuda_util::get_group_count( + *cuda_config, + updater_output_feature_map_block_count * input_configuration_specific.feature_map_count * window_sizes[1] * max_entry_count * updater_window_x_block_count, + output_configuration_specific.dimension_sizes[1]); + updater_output_y_group_size = (output_configuration_specific.dimension_sizes[1] + updater_output_y_group_count - 1) / updater_output_y_group_count; + + if (backprop_required) + { + backward_output_feature_map_group_count = cuda_util::get_group_count( + *cuda_config, + backward_x_block_count * input_configuration_specific.dimension_sizes[1] * backward_input_feature_map_block_count * max_entry_count, + output_configuration_specific.feature_map_count); + backward_output_feature_map_group_size = (output_configuration_specific.feature_map_count + backward_output_feature_map_group_count - 1) / backward_output_feature_map_group_count; + } + } + + int convolution_2d_layer_updater_cuda_kepler::get_dynamic_memobject_count() const + { + return 3; + } + } +} diff --git a/nnforge/cuda/convolution_2d_layer_updater_cuda_kepler.h b/nnforge/cuda/convolution_2d_layer_updater_cuda_kepler.h new file mode 100644 index 0000000..4f0767d --- /dev/null +++ b/nnforge/cuda/convolution_2d_layer_updater_cuda_kepler.h @@ -0,0 +1,106 @@ +/* + * Copyright 2011-2013 Maxim Milakov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "layer_updater_cuda.h" + +namespace nnforge +{ + namespace cuda + { + class convolution_2d_layer_updater_cuda_kepler : public layer_updater_cuda + { + public: + convolution_2d_layer_updater_cuda_kepler(); + + virtual ~convolution_2d_layer_updater_cuda_kepler(); + + virtual void enqueue_test( + unsigned int offset_input_entry_id, + cudaStream_t stream_id, + const std::vector& schema_data, + const std::vector& data, + const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer, + cuda_linear_buffer_device_smart_ptr output_neurons_buffer, + const std::vector& additional_buffers, + std::vector& dynamic_memobjects, + unsigned int entry_count); + + virtual void enqueue_backprop( + cudaStream_t stream_id, + const std::vector& schema_data, + const std::vector& data, + const_cuda_linear_buffer_device_smart_ptr output_neurons_buffer, + const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer, + cuda_linear_buffer_device_smart_ptr output_errors_buffer, + cuda_linear_buffer_device_smart_ptr input_errors_buffer, + const std::vector& additional_buffers, + std::vector& dynamic_memobjects, + unsigned int entry_count); + + virtual void enqueue_update_weights( + unsigned int offset_input_entry_id, + cudaStream_t stream_id, + const std::vector& data, + const std::vector& schema_data, + const std::vector& training_speed, + cuda_linear_buffer_device_smart_ptr output_errors_buffer, + const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer, + const std::vector& additional_buffers, + std::vector& dynamic_memobjects, + unsigned int entry_count); + + protected: + virtual bool is_in_place_backprop() const; + + virtual void updater_configured(); + + virtual std::vector get_sizes_of_additional_buffers_fixed() const; + + virtual void set_max_entry_count(unsigned int max_entry_count); + + virtual std::vector get_linear_addressing_through_texture_per_entry() const; + + virtual void fill_additional_buffers(const std::vector& additional_buffers) const; + + virtual int get_dynamic_memobject_count() const; + + std::vector window_sizes; + + int forward_x_block_size; + int forward_x_block_count; + int forward_input_feature_map_group_count; + int forward_input_feature_map_group_size; + int forward_output_feature_map_block_count; + + int backward_x_block_size; + int backward_x_block_count; + int backward_output_feature_map_group_count; + int backward_output_feature_map_group_size; + int backward_input_feature_map_block_count; + + int updater_output_y_group_count; + int updater_output_y_group_size; + int updater_output_feature_map_block_count; + int updater_window_x_block_count; + private: + static int get_block_size(int width); + + static int get_threadblock_size_biases(int output_neuron_count); + }; + } +} diff --git a/nnforge/cuda/convolution_layer_updater_schema.cpp b/nnforge/cuda/convolution_layer_updater_schema.cpp index c0cb3ed..5e65650 100644 --- a/nnforge/cuda/convolution_layer_updater_schema.cpp +++ b/nnforge/cuda/convolution_layer_updater_schema.cpp @@ -18,7 +18,8 @@ #include "../convolution_layer.h" #include "../neural_network_exception.h" -#include "convolution_2d_layer_updater_cuda.h" +#include "convolution_2d_layer_updater_cuda_fermi.h" +#include "convolution_2d_layer_updater_cuda_kepler.h" #include "fully_connected_layer_updater_cuda.h" #include @@ -60,7 +61,10 @@ namespace nnforge switch (output_configuration_specific.dimension_sizes.size()) { case 2: - res = layer_updater_cuda_smart_ptr(new convolution_2d_layer_updater_cuda()); + if 
(cuda_config->get_compute_capability() >= 300) + res = layer_updater_cuda_smart_ptr(new convolution_2d_layer_updater_cuda_kepler()); + else + res = layer_updater_cuda_smart_ptr(new convolution_2d_layer_updater_cuda_fermi()); break; default: throw neural_network_exception((boost::format("No CUDA updater for the convolution layer of %1% dimensions") % output_configuration_specific.dimension_sizes.size()).str()); diff --git a/nnforge/cuda/cuda_running_configuration.cpp b/nnforge/cuda/cuda_running_configuration.cpp index 948ee8a..af5c1b9 100644 --- a/nnforge/cuda/cuda_running_configuration.cpp +++ b/nnforge/cuda/cuda_running_configuration.cpp @@ -81,6 +81,7 @@ namespace nnforge for(int i = 0; i < sizeof(max_grid_size) / sizeof(max_grid_size[0]); ++i) max_grid_size[i] = device_prop.maxGridSize[i]; max_texture_1d_linear = device_prop.maxTexture1DLinear; + texture_alignment = device_prop.textureAlignment; pci_bus_id = device_prop.pciBusID; pci_device_id = device_prop.pciDeviceID; #ifdef WIN32 @@ -131,6 +132,7 @@ namespace nnforge << running_configuration.max_grid_size[1] << " x " << running_configuration.max_grid_size[2] << std::endl; out << "Maximum size of 1D texture bound to linear memory = " << running_configuration.max_texture_1d_linear << std::endl; + out << "Texture alignment = " << running_configuration.texture_alignment << " bytes" << std::endl; out << "PCI Bus ID = " << running_configuration.pci_bus_id << std::endl; out << "PCI Location ID = " << running_configuration.pci_device_id << std::endl; #ifdef WIN32 diff --git a/nnforge/cuda/cuda_running_configuration.h b/nnforge/cuda/cuda_running_configuration.h index d539df5..0eb3b15 100644 --- a/nnforge/cuda/cuda_running_configuration.h +++ b/nnforge/cuda/cuda_running_configuration.h @@ -72,6 +72,7 @@ namespace nnforge int max_threads_dim[3]; int max_grid_size[3]; int max_texture_1d_linear; + int texture_alignment; // in bytes int pci_bus_id; int pci_device_id; diff --git a/nnforge/cuda/cuda_texture.cpp b/nnforge/cuda/cuda_texture.cpp index 5ce721f..c610399 100644 --- a/nnforge/cuda/cuda_texture.cpp +++ b/nnforge/cuda/cuda_texture.cpp @@ -26,6 +26,8 @@ namespace nnforge { cuda_texture::cuda_texture(const_cuda_linear_buffer_device_smart_ptr dev_smart_ptr) : tex(0) + , texture_offset_elems(0) + , dev_smart_ptr(dev_smart_ptr) { struct cudaResourceDesc res_desc; memset(&res_desc, 0, sizeof(res_desc)); @@ -43,6 +45,35 @@ namespace nnforge cuda_safe_call(cudaCreateTextureObject(&tex, &res_desc, &tex_desc, 0)); } + cuda_texture::cuda_texture( + const_cuda_linear_buffer_device_smart_ptr dev_smart_ptr, + int elem_offset, + int elem_count, + const cuda_running_configuration& cuda_config) + : tex(0) + , texture_offset_elems(0) + , dev_smart_ptr(dev_smart_ptr) + { + int texture_alignment_in_elems = cuda_config.texture_alignment / sizeof(float); + int elem_offset_aligned = (elem_offset / texture_alignment_in_elems) * texture_alignment_in_elems; + this->texture_offset_elems = elem_offset - elem_offset_aligned; + + struct cudaResourceDesc res_desc; + memset(&res_desc, 0, sizeof(res_desc)); + res_desc.resType = cudaResourceTypeLinear; + res_desc.res.linear.devPtr = const_cast(((const float *)(*dev_smart_ptr)) + elem_offset_aligned); + res_desc.res.linear.desc = cudaCreateChannelDesc(); + res_desc.res.linear.sizeInBytes = (elem_count + (this->texture_offset_elems)) * sizeof(float); + + struct cudaTextureDesc tex_desc; + memset(&tex_desc, 0, sizeof(tex_desc)); + tex_desc.addressMode[0] = cudaAddressModeBorder; + tex_desc.readMode = cudaReadModeElementType; + 
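+ // The linear buffer can only be bound at an address that is a multiple of the
+ // device textureAlignment, so elem_offset is rounded down to that boundary above
+ // and the remainder is kept in texture_offset_elems, exposed to callers via
+ // get_texture_offset_elems() so they can add it to their fetch indices.
+ // Illustrative example (alignment value hypothetical): with a 512-byte alignment
+ // (128 floats) and elem_offset = 300, the texture is bound at element 256 and
+ // texture_offset_elems = 44.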
tex_desc.normalizedCoords = 0; + + cuda_safe_call(cudaCreateTextureObject(&tex, &res_desc, &tex_desc, 0)); + } + cuda_texture::~cuda_texture() { if (tex != 0) @@ -61,5 +92,10 @@ namespace nnforge return res_desc.res.linear.sizeInBytes; } + + int cuda_texture::get_texture_offset_elems() const + { + return texture_offset_elems; + } } } diff --git a/nnforge/cuda/cuda_texture.h b/nnforge/cuda/cuda_texture.h index d175922..7fcda65 100644 --- a/nnforge/cuda/cuda_texture.h +++ b/nnforge/cuda/cuda_texture.h @@ -18,6 +18,7 @@ #include "cuda_memobject.h" #include "cuda_linear_buffer_device.h" +#include "cuda_running_configuration.h" #include @@ -30,15 +31,27 @@ namespace nnforge public: cuda_texture(const_cuda_linear_buffer_device_smart_ptr dev_smart_ptr); + cuda_texture( + const_cuda_linear_buffer_device_smart_ptr dev_smart_ptr, + int elem_offset, + int elem_count, + const cuda_running_configuration& cuda_config); + virtual ~cuda_texture(); operator cudaTextureObject_t () const; virtual size_t get_size() const; + int get_texture_offset_elems() const; + protected: cudaTextureObject_t tex; const_cuda_linear_buffer_device_smart_ptr dev_smart_ptr; + int texture_offset_elems; }; + + typedef std::tr1::shared_ptr cuda_texture_smart_ptr; + typedef std::tr1::shared_ptr const_cuda_texture_smart_ptr; } } diff --git a/nnforge/cuda/fully_connected_layer_updater_cuda.cu b/nnforge/cuda/fully_connected_layer_updater_cuda.cu index 37c8ae1..23e89dc 100644 --- a/nnforge/cuda/fully_connected_layer_updater_cuda.cu +++ b/nnforge/cuda/fully_connected_layer_updater_cuda.cu @@ -23,13 +23,12 @@ #include "neural_network_cuda_exception.h" #include "../convolution_layer.h" -extern __shared__ float arr[]; +extern __shared__ float arr_sh[]; __global__ void fully_connected_upd_kernel( - float * __restrict output, + float * output, const float * __restrict input, - const float * __restrict weights, - const float * __restrict biases, + const float * weights, int input_neuron_count, int output_neuron_count, int min_iteration_count) @@ -50,29 +49,32 @@ __global__ void fully_connected_upd_kernel( } if (current_input_neuron_id < input_neuron_count) sum += current_input[current_input_neuron_id] * current_weights[current_input_neuron_id]; - arr[thread_id] = sum; - __syncthreads(); - int output_offset = entry_id * output_neuron_count + output_neuron_id; - float bias; - if (thread_id == 0) - bias = biases[output_offset]; + int lane_id = thread_id & 31; - int t_add_elems = threadblock_size >> 1; - int t_working_elems = (threadblock_size + 1) >> 1; - while (t_add_elems > 0) +#if __CUDA_ARCH__ >= 300 + #pragma unroll + for(int tx = 16; tx > 0; tx >>= 1) + { + sum += __shfl_down(sum, tx); + } +#else + volatile float * arr = arr_sh; + arr[thread_id] = sum; + #pragma unroll + for(int tx = 16; tx > 0; tx >>= 1) { - if (thread_id < t_add_elems) - arr[thread_id] += arr[thread_id + t_working_elems]; - t_add_elems = t_working_elems >> 1; - t_working_elems = (t_working_elems + 1) >> 1; - __syncthreads(); + if (lane_id < tx) + arr[thread_id] += arr[thread_id + tx]; } + sum = arr[thread_id]; +#endif - if (thread_id == 0) - output[output_offset] = arr[0] + bias; + if (lane_id == 0) + atomicAdd(output + entry_id * output_neuron_count + output_neuron_id, sum); } +template __global__ void fully_connected_deriviative_upd_kernel( float * __restrict input_errors, const float * __restrict output_errors, @@ -89,22 +91,27 @@ __global__ void fully_connected_deriviative_upd_kernel( bool in_bounds = (input_neuron_id < input_neuron_count) && (output_group_id < 
output_group_count) && (entry_id < entry_count); if (in_bounds) { - const float * current_output = output_errors + (int)(entry_id * output_neuron_count + output_group_id); - const float * current_weights = weights + (int)((entry_id * output_neuron_count + output_group_id) * input_neuron_count + input_neuron_id); - int iteration_count = (max_iteration_count * output_group_count + output_group_id < output_neuron_count) ? max_iteration_count : max_iteration_count - 1; + int output_offset = entry_id * output_neuron_count + output_group_id; + int weights_offset = (entry_id * output_neuron_count + output_group_id) * input_neuron_count + input_neuron_id; + int iteration_count = ((max_iteration_count - 1) * output_group_count + output_group_id < output_neuron_count) ? max_iteration_count : max_iteration_count - 1; float sum = 0.0F; + #pragma unroll 4 for(int i = 0; i < iteration_count; ++i) { - sum += *current_output * *current_weights; - current_weights += input_neuron_count * output_group_count; - current_output += output_group_count; + sum += output_errors[output_offset] * weights[weights_offset]; + weights_offset += input_neuron_count * output_group_count; + output_offset += output_group_count; } float * current_input = input_errors + entry_id * input_neuron_count + input_neuron_id; - if (output_group_count == 1) + if (single_output_group_count) + { *current_input = sum; + } else + { atomicAdd(current_input, sum); + } } } @@ -169,21 +176,28 @@ namespace nnforge const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer, cuda_linear_buffer_device_smart_ptr output_neurons_buffer, const std::vector& additional_buffers, + std::vector& dynamic_memobjects, unsigned int entry_count) { if (different_input) { + cuda_util::copy_buffer( + *cuda_config, + *data[1], + *output_neurons_buffer, + output_elem_count_per_entry * entry_count, + stream_id); + int threadblock_size = get_threadblock_size_forward(input_elem_count_per_entry); dim3 grid_size(1, output_elem_count_per_entry, entry_count); dim3 block_size(threadblock_size, 1, 1); - int smem_size = threadblock_size * sizeof(float); + int smem_size = (cuda_config->get_compute_capability() >= 300) ? 
0 : (threadblock_size * sizeof(float)); int min_iteration_count = input_elem_count_per_entry / threadblock_size; fully_connected_upd_kernel<<>>( *output_neurons_buffer, *input_neurons_buffer, *data[0], - *data[1], input_elem_count_per_entry, output_elem_count_per_entry, min_iteration_count); @@ -218,15 +232,17 @@ namespace nnforge cuda_linear_buffer_device_smart_ptr output_errors_buffer, cuda_linear_buffer_device_smart_ptr input_errors_buffer, const std::vector& additional_buffers, + std::vector& dynamic_memobjects, unsigned int entry_count) { if (!different_input) throw neural_network_exception("fully_connected_layer_updater_cuda is not able to backprop to the same input"); int output_group_count = cuda_util::get_group_count( - *cuda_config, - input_elem_count_per_entry * entry_count, - output_elem_count_per_entry); + *cuda_config, + input_elem_count_per_entry * entry_count, + output_elem_count_per_entry, + true); int max_iteration_count = (output_elem_count_per_entry + output_group_count - 1) / output_group_count; if (output_group_count > 1) @@ -242,15 +258,27 @@ namespace nnforge input_elem_count_per_entry, output_group_count, entry_count); - fully_connected_deriviative_upd_kernel<<>>( - *input_errors_buffer, - *output_errors_buffer, - *data[0], - input_elem_count_per_entry, - output_elem_count_per_entry, - output_group_count, - max_iteration_count, - entry_count); + + if (output_group_count == 1) + fully_connected_deriviative_upd_kernel<<>>( + *input_errors_buffer, + *output_errors_buffer, + *data[0], + input_elem_count_per_entry, + output_elem_count_per_entry, + output_group_count, + max_iteration_count, + entry_count); + else + fully_connected_deriviative_upd_kernel<<>>( + *input_errors_buffer, + *output_errors_buffer, + *data[0], + input_elem_count_per_entry, + output_elem_count_per_entry, + output_group_count, + max_iteration_count, + entry_count); } void fully_connected_layer_updater_cuda::enqueue_update_weights( @@ -262,6 +290,7 @@ namespace nnforge cuda_linear_buffer_device_smart_ptr output_errors_buffer, const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer, const std::vector& additional_buffers, + std::vector& dynamic_memobjects, unsigned int entry_count) { // Update biases @@ -315,12 +344,18 @@ namespace nnforge int fully_connected_layer_updater_cuda::get_threadblock_size_forward(int input_neuron_count) { - if (input_neuron_count < 128) - return input_neuron_count; + int threadblock_size; - int threadblock_count = (input_neuron_count + 128 - 1) / 128; - int threadblock_size = (input_neuron_count + threadblock_count - 1) / threadblock_count; - threadblock_size = (threadblock_size + 32 - 1) / 32 * 32; + if (input_neuron_count < 128) + { + threadblock_size = (input_neuron_count + 32 - 1) / 32 * 32; + } + else + { + int threadblock_count = (input_neuron_count + 128 - 1) / 128; + threadblock_size = (input_neuron_count + threadblock_count - 1) / threadblock_count; + threadblock_size = (threadblock_size + 32 - 1) / 32 * 32; + } return threadblock_size; } diff --git a/nnforge/cuda/fully_connected_layer_updater_cuda.h b/nnforge/cuda/fully_connected_layer_updater_cuda.h index dbfcc25..577106f 100644 --- a/nnforge/cuda/fully_connected_layer_updater_cuda.h +++ b/nnforge/cuda/fully_connected_layer_updater_cuda.h @@ -37,6 +37,7 @@ namespace nnforge const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer, cuda_linear_buffer_device_smart_ptr output_neurons_buffer, const std::vector& additional_buffers, + std::vector& dynamic_memobjects, unsigned int entry_count); virtual void 
enqueue_backprop(
@@ -48,6 +49,7 @@ namespace nnforge
 			cuda_linear_buffer_device_smart_ptr output_errors_buffer,
 			cuda_linear_buffer_device_smart_ptr input_errors_buffer,
 			const std::vector& additional_buffers,
+			std::vector& dynamic_memobjects,
 			unsigned int entry_count);
 
 		virtual void enqueue_update_weights(
@@ -59,6 +61,7 @@ namespace nnforge
 			cuda_linear_buffer_device_smart_ptr output_errors_buffer,
 			const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer,
 			const std::vector& additional_buffers,
+			std::vector& dynamic_memobjects,
 			unsigned int entry_count);
 
 	protected:
diff --git a/nnforge/cuda/hyperbolic_tangent_layer_updater_cuda.cu b/nnforge/cuda/hyperbolic_tangent_layer_updater_cuda.cu
index 9052c32..f7e7339 100644
--- a/nnforge/cuda/hyperbolic_tangent_layer_updater_cuda.cu
+++ b/nnforge/cuda/hyperbolic_tangent_layer_updater_cuda.cu
@@ -103,6 +103,7 @@ namespace nnforge
 			const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer,
 			cuda_linear_buffer_device_smart_ptr output_neurons_buffer,
 			const std::vector& additional_buffers,
+			std::vector& dynamic_memobjects,
 			unsigned int entry_count)
 		{
 			int elem_count = (input_elem_count_per_entry * entry_count + 3) / 4;
@@ -126,6 +127,7 @@ namespace nnforge
 			cuda_linear_buffer_device_smart_ptr output_errors_buffer,
 			cuda_linear_buffer_device_smart_ptr input_errors_buffer,
 			const std::vector& additional_buffers,
+			std::vector& dynamic_memobjects,
 			unsigned int entry_count)
 		{
 			int elem_count = (input_elem_count_per_entry * entry_count + 3) / 4;
diff --git a/nnforge/cuda/hyperbolic_tangent_layer_updater_cuda.h b/nnforge/cuda/hyperbolic_tangent_layer_updater_cuda.h
index b9427c4..8c407f2 100644
--- a/nnforge/cuda/hyperbolic_tangent_layer_updater_cuda.h
+++ b/nnforge/cuda/hyperbolic_tangent_layer_updater_cuda.h
@@ -37,6 +37,7 @@ namespace nnforge
 			const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer,
 			cuda_linear_buffer_device_smart_ptr output_neurons_buffer,
 			const std::vector& additional_buffers,
+			std::vector& dynamic_memobjects,
 			unsigned int entry_count);
 
 		virtual void enqueue_backprop(
@@ -48,6 +49,7 @@ namespace nnforge
 			cuda_linear_buffer_device_smart_ptr output_errors_buffer,
 			cuda_linear_buffer_device_smart_ptr input_errors_buffer,
 			const std::vector& additional_buffers,
+			std::vector& dynamic_memobjects,
 			unsigned int entry_count);
 
 	protected:
diff --git a/nnforge/cuda/layer_updater_cuda.cu b/nnforge/cuda/layer_updater_cuda.cu
index 59605c1..f026e65 100644
--- a/nnforge/cuda/layer_updater_cuda.cu
+++ b/nnforge/cuda/layer_updater_cuda.cu
@@ -92,10 +92,14 @@ namespace nnforge
 
 		void layer_updater_cuda::update_buffer_configuration(buffer_cuda_size_configuration& buffer_configuration) const
 		{
-			std::vector sizes = get_sizes_of_additional_buffers_per_entry();
-			for(std::vector::const_iterator it = sizes.begin(); it != sizes.end(); ++it)
+			std::vector per_entry_sizes = get_sizes_of_additional_buffers_per_entry();
+			for(std::vector::const_iterator it = per_entry_sizes.begin(); it != per_entry_sizes.end(); ++it)
 				buffer_configuration.add_per_entry_buffer(*it);
 
+			std::vector fixed_sized = get_sizes_of_additional_buffers_fixed();
+			for(std::vector::const_iterator it = fixed_sized.begin(); it != fixed_sized.end(); ++it)
+				buffer_configuration.add_constant_buffer(*it);
+
 			buffer_configuration.add_per_entry_buffer(output_elem_count_per_entry * sizeof(float));
 
 			if (backprop_required && !is_in_place_backprop())
@@ -110,28 +114,35 @@ namespace nnforge
 			buffer_cuda_size_configuration& buffer_configuration,
 			unsigned int updater_entry_count) const
 		{
-			std::vector sizes = get_sizes_of_additional_buffers_per_entry();
-			for(std::vector::const_iterator it = sizes.begin(); it != sizes.end(); ++it)
+			std::vector per_entry_sizes = get_sizes_of_additional_buffers_per_entry();
+			for(std::vector::const_iterator it = per_entry_sizes.begin(); it != per_entry_sizes.end(); ++it)
 				buffer_configuration.add_constant_buffer(*it * updater_entry_count);
 
+			std::vector fixed_sizes = get_sizes_of_additional_buffers_fixed();
+			for(std::vector::const_iterator it = fixed_sizes.begin(); it != fixed_sizes.end(); ++it)
+				buffer_configuration.add_constant_buffer(*it);
+
 			buffer_configuration.add_constant_buffer(output_elem_count_per_entry * sizeof(float) * updater_entry_count);
 
 			if (backprop_required && !is_in_place_backprop())
 				buffer_configuration.add_constant_buffer(input_elem_count_per_entry * sizeof(float) * updater_entry_count);
 		}
 
-		layer_updater_cuda::buffer_set layer_updater_cuda::allocate_all_buffers(unsigned int max_entry_count) const
+		layer_updater_cuda::buffer_set layer_updater_cuda::allocate_all_buffers(unsigned int max_entry_count)
 		{
 			buffer_set res;
 
-			std::vector sizes = get_sizes_of_additional_buffers_per_entry();
+			set_max_entry_count(max_entry_count);
 
-			for(std::vector::const_iterator it = sizes.begin(); it != sizes.end(); ++it)
-			{
-				// Allow safe float4 accesses
-				size_t sz = *it * max_entry_count;
-				res.additional_buffers.push_back(cuda_linear_buffer_device_smart_ptr(new cuda_linear_buffer_device(sz)));
-			}
+			std::vector per_entry_sizes = get_sizes_of_additional_buffers_per_entry();
+			for(std::vector::const_iterator it = per_entry_sizes.begin(); it != per_entry_sizes.end(); ++it)
+				res.additional_buffers.push_back(cuda_linear_buffer_device_smart_ptr(new cuda_linear_buffer_device(*it * max_entry_count)));
+
+			std::vector fixed_sizes = get_sizes_of_additional_buffers_fixed();
+			for(std::vector::const_iterator it = fixed_sizes.begin(); it != fixed_sizes.end(); ++it)
+				res.additional_buffers.push_back(cuda_linear_buffer_device_smart_ptr(new cuda_linear_buffer_device(*it)));
+
+			fill_additional_buffers(res.additional_buffers);
 
 			{
 				size_t sz = output_elem_count_per_entry * sizeof(float) * max_entry_count;
@@ -144,6 +155,8 @@ namespace nnforge
 				res.input_errors_buffer = cuda_linear_buffer_device_smart_ptr(new cuda_linear_buffer_device(sz));
 			}
 
+			res.dynamic_memobjects.resize(get_dynamic_memobject_count());
+
 			return res;
 		}
 
@@ -156,6 +169,7 @@ namespace nnforge
 			cuda_linear_buffer_device_smart_ptr output_errors_buffer,
 			const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer,
 			const std::vector& additional_buffers,
+			std::vector& dynamic_memobjects,
 			unsigned int entry_count)
 		{
 		}
@@ -209,5 +223,23 @@ namespace nnforge
 				input_configuration_specific.feature_map_count,
 				entry_count);
 		}
+
+		void layer_updater_cuda::fill_additional_buffers(const std::vector& additional_buffers) const
+		{
+		}
+
+		std::vector layer_updater_cuda::get_sizes_of_additional_buffers_fixed() const
+		{
+			return std::vector();
+		}
+
+		void layer_updater_cuda::set_max_entry_count(unsigned int max_entry_count)
+		{
+		}
+
+		int layer_updater_cuda::get_dynamic_memobject_count() const
+		{
+			return 0;
+		}
 	}
 }
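The hunks above draw a line between per-entry buffers, which are scaled by the number of entries being processed, and the new fixed-size buffers, which are reported by get_sizes_of_additional_buffers_fixed() and allocated exactly once regardless of entry count. A minimal standalone sketch of that accounting follows; the byte sizes are made up and merely stand in for whatever a concrete layer would report.

#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
	// Hypothetical sizes, in bytes, standing in for the values a layer's
	// get_sizes_of_additional_buffers_per_entry() / _fixed() overrides would return.
	std::vector<size_t> per_entry_sizes;
	per_entry_sizes.push_back(1024);
	per_entry_sizes.push_back(4096);
	std::vector<size_t> fixed_sizes;
	fixed_sizes.push_back(65536);

	unsigned int max_entry_count = 128;

	// Per-entry buffers are multiplied by the entry count, fixed buffers are counted once.
	size_t total = 0;
	for (std::vector<size_t>::const_iterator it = per_entry_sizes.begin(); it != per_entry_sizes.end(); ++it)
		total += *it * max_entry_count;
	for (std::vector<size_t>::const_iterator it = fixed_sizes.begin(); it != fixed_sizes.end(); ++it)
		total += *it;

	std::printf("%zu bytes of additional buffers\n", total);
	return 0;
}

The overload that takes updater_entry_count applies the same rule, except that the per-entry buffers are folded into constant buffers scaled by updater_entry_count.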
diff --git a/nnforge/cuda/layer_updater_cuda.h b/nnforge/cuda/layer_updater_cuda.h
index e9ee14b..ff2f815 100644
--- a/nnforge/cuda/layer_updater_cuda.h
+++ b/nnforge/cuda/layer_updater_cuda.h
@@ -39,6 +39,8 @@ namespace nnforge
 			cuda_linear_buffer_device_smart_ptr output_neurons_buffer;
 			cuda_linear_buffer_device_smart_ptr input_errors_buffer;
 			std::vector additional_buffers;
+			// dynamic memobject list is intended to store shallow, lightweight objects, for example, texture objects
+			std::vector dynamic_memobjects;
 		};
 
 		virtual ~layer_updater_cuda();
@@ -51,7 +53,7 @@ namespace nnforge
 			bool backprop_required,
 			bool different_input);
 
-		buffer_set allocate_all_buffers(unsigned int max_entry_count) const;
+		buffer_set allocate_all_buffers(unsigned int max_entry_count);
 
 		void update_buffer_configuration(buffer_cuda_size_configuration& buffer_configuration) const;
 
@@ -67,6 +69,7 @@ namespace nnforge
 			const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer,
 			cuda_linear_buffer_device_smart_ptr output_neurons_buffer,
 			const std::vector& additional_buffers,
+			std::vector& dynamic_memobjects,
 			unsigned int entry_count) = 0;
 
 		// input_errors_buffer is null if is_in_place_backprop() is true
@@ -79,6 +82,7 @@ namespace nnforge
 			cuda_linear_buffer_device_smart_ptr output_errors_buffer,
 			cuda_linear_buffer_device_smart_ptr input_errors_buffer,
 			const std::vector& additional_buffers,
+			std::vector& dynamic_memobjects,
 			unsigned int entry_count) = 0;
 
 		virtual void enqueue_update_weights(
@@ -90,6 +94,7 @@ namespace nnforge
 			cuda_linear_buffer_device_smart_ptr output_errors_buffer,
 			const_cuda_linear_buffer_device_smart_ptr input_neurons_buffer,
 			const std::vector& additional_buffers,
+			std::vector& dynamic_memobjects,
 			unsigned int entry_count);
 
 		void enqueue_forward_dropout(
@@ -118,8 +123,16 @@ namespace nnforge
 
 		virtual std::vector get_sizes_of_additional_buffers_per_entry() const;
 
+		virtual std::vector get_sizes_of_additional_buffers_fixed() const;
+
+		virtual void fill_additional_buffers(const std::vector& additional_buffers) const;
+
+		virtual void set_max_entry_count(unsigned int max_entry_count);
+
 		virtual std::vector get_linear_addressing_through_texture_per_entry() const;
 
+		virtual int get_dynamic_memobject_count() const;
+
 		virtual bool is_in_place_backprop() const = 0;
 
 		const_layer_smart_ptr layer_schema;
diff --git a/nnforge/cuda/network_updater_cuda.cu b/nnforge/cuda/network_updater_cuda.cu
index b07b4a9..d80e1ce 100644
--- a/nnforge/cuda/network_updater_cuda.cu
+++ b/nnforge/cuda/network_updater_cuda.cu
@@ -337,6 +337,7 @@ namespace nnforge
 					input_and_all_buffers_pack_it->first,
 					input_and_all_buffers_pack_it->second.output_neurons_buffer,
 					input_and_all_buffers_pack_it->second.additional_buffers,
+					input_and_all_buffers_pack_it->second.dynamic_memobjects,
 					updater_entry_count);
 			}
 		}
@@ -379,6 +380,7 @@ namespace nnforge
 					*output_errors_it,
 					input_and_all_buffers_pack_it->second.input_errors_buffer,
 					input_and_all_buffers_pack_it->second.additional_buffers,
+					input_and_all_buffers_pack_it->second.dynamic_memobjects,
 					updater_entry_count);
 
 				std::map::const_iterator dropout_it = layer_to_dropout_rate_map.find(reverse_layer_id);
@@ -406,6 +408,7 @@ namespace nnforge
 					*output_errors_it,
 					input_and_all_buffers_pack_it->first,
 					input_and_all_buffers_pack_it->second.additional_buffers,
+					input_and_all_buffers_pack_it->second.dynamic_memobjects,
 					updater_entry_count);
 			}
 		}
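dynamic_memobjects gives each buffer set a per-layer slot list for shallow, cheap-to-copy objects such as CUDA texture objects: the list is sized by get_dynamic_memobject_count() and carried alongside additional_buffers through every enqueue_* call, as the network_updater_cuda hunks above show. Below is a hypothetical sketch of how a derived updater could lazily cache a texture object in such a slot. The element type of the vector is stripped from this listing, so std::shared_ptr<void> stands in for the real smart-pointer type, and texture_memobject / enqueue_test_sketch are illustrative names only, not part of the patch.

#include <memory>
#include <vector>
#include <cuda_runtime.h>

// Illustrative wrapper owning a texture object; destroyed together with the buffer set.
struct texture_memobject
{
	explicit texture_memobject(cudaTextureObject_t tex) : tex(tex) {}
	~texture_memobject() { cudaDestroyTextureObject(tex); }
	cudaTextureObject_t tex;
};

typedef std::shared_ptr<void> memobject_ptr; // stand-in for the real memobject smart pointer

// Pattern sketch: one slot reserved via get_dynamic_memobject_count(), the texture object
// is created on first use and reused afterwards (assumes the device buffer address is
// stable across calls, as it is once the buffer set has been allocated).
void enqueue_test_sketch(
	std::vector<memobject_ptr>& dynamic_memobjects,
	const float * dev_input,
	size_t elem_count)
{
	if (!dynamic_memobjects[0])
	{
		cudaResourceDesc res_desc = {};
		res_desc.resType = cudaResourceTypeLinear;
		res_desc.res.linear.devPtr = const_cast<float *>(dev_input);
		res_desc.res.linear.desc = cudaCreateChannelDesc<float>();
		res_desc.res.linear.sizeInBytes = elem_count * sizeof(float);

		cudaTextureDesc tex_desc = {};
		tex_desc.readMode = cudaReadModeElementType;

		cudaTextureObject_t tex = 0;
		cudaCreateTextureObject(&tex, &res_desc, &tex_desc, 0);
		dynamic_memobjects[0] = memobject_ptr(new texture_memobject(tex));
	}
	// ... launch kernels that read the input through the cached texture object ...
}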
diff --git a/nnforge/cuda/util_cuda.cu b/nnforge/cuda/util_cuda.cu
index 53dd925..4cf93e6 100644
--- a/nnforge/cuda/util_cuda.cu
+++ b/nnforge/cuda/util_cuda.cu
@@ -15,6 +15,7 @@
  */
 
 #include "util_cuda.h"
+#include "../neural_network_exception.h"
 
 __global__ void set_with_value_util_kernel(
 	float4 * __restrict buf,
@@ -67,6 +68,16 @@ __global__ void multiply_by_itself_training_util_kernel(
 	}
 }
 
+__global__ void copy_buffer_util_kernel(
+	const float4 * __restrict input_buf,
+	float4 * __restrict output_buf,
+	int elem_count)
+{
+	int elem_id = blockDim.x * (blockIdx.y * gridDim.x + blockIdx.x) + threadIdx.x;
+	if (elem_id < elem_count)
+		output_buf[elem_id] = input_buf[elem_id];
+}
+
 namespace nnforge
 {
 	namespace cuda
@@ -283,22 +294,109 @@ namespace nnforge
 			multiply_by_itself_training_util_kernel<<>>((const float4 *)input_buf_with_aligned_size, (float4 *)output_buf_with_aligned_size, new_elem_count);
 		}
 
+		void cuda_util::copy_buffer(
+			const cuda_running_configuration& cuda_config,
+			const float * input_buf_with_aligned_size,
+			float * output_buf_with_aligned_size,
+			int elem_count,
+			cudaStream_t cuda_stream)
+		{
+			int new_elem_count = (elem_count + 3) / 4;
+			std::pair kernel_dims = cuda_util::get_grid_and_threadblock_sizes_sequential_access(
+				cuda_config,
+				new_elem_count);
+			copy_buffer_util_kernel<<>>((const float4 *)input_buf_with_aligned_size, (float4 *)output_buf_with_aligned_size, new_elem_count);
+		}
+
 		int cuda_util::get_group_count(
 			const cuda_running_configuration& cuda_config,
 			int total_thread_count,
-			int divisible)
+			int divisible,
+			bool more_threadblocks)
 		{
 			int initial_threadblock_count = std::max(total_thread_count / 256, 1);
-			int minimum_threadblock_count = cuda_config.multiprocessor_count * 8;
+			int minimum_threadblock_count = cuda_config.multiprocessor_count * 4 * (more_threadblocks ? 4 : 1);
 
 			if (initial_threadblock_count >= minimum_threadblock_count)
 				return 1;
 
-			int group_count = std::min(minimum_threadblock_count / initial_threadblock_count, static_cast(sqrtf(static_cast(divisible))));
+			int group_count = std::min(minimum_threadblock_count / initial_threadblock_count, static_cast(powf(static_cast(divisible), 2.0F/3.0F)));
 			int iteration_count = (divisible + group_count - 1) / group_count;
 			group_count = (divisible + iteration_count - 1) / iteration_count;
 
 			return group_count;
 		}
+
+		void cuda_util::fill_tiling_pattern(
+			int size_x,
+			int size_y,
+			std::vector >& pair_list)
+		{
+			pair_list.clear();
+
+			std::stack work_set;
+
+			int size_max = std::max(size_x, size_y);
+			int size_aligned = 1;
+			while (size_aligned < size_max)
+				size_aligned <<= 1;
+
+			work_set.push(tile(0, size_aligned, 0, size_aligned));
+			int start_x = size_aligned - size_x;
+			int start_y = size_aligned - size_y;
+
+			while (!work_set.empty())
+			{
+				tile cur_tile = work_set.top();
+				work_set.pop();
+
+				if (cur_tile.is_point())
+				{
+					int x = cur_tile.left_x - start_x;
+					int y = cur_tile.top_y - start_y;
+
+					if ((x >= 0) && (y >= 0))
+						pair_list.push_back(std::make_pair(x, y));
+				}
+				else
+					cur_tile.split_to_stack(work_set, start_x, start_y);
+			}
+
+			if (pair_list.size() != size_x * size_y)
+				throw neural_network_exception("Internal error when generating tiling pattern");
+		}
+
+		cuda_util::tile::tile(int left_x, int right_x, int top_y, int bottom_y)
+			: left_x(left_x)
+			, right_x(right_x)
+			, top_y(top_y)
+			, bottom_y(bottom_y)
+		{
+		}
+
+		bool cuda_util::tile::is_point() const
+		{
+			return ((right_x - left_x) == 1) && ((bottom_y - top_y) == 1);
+		}
+
+		void cuda_util::tile::split_to_stack(
+			std::stack& st,
+			int start_x,
+			int start_y) const
+		{
+			int middle_x = (left_x + right_x) >> 1;
+			int middle_y = (top_y + bottom_y) >> 1;
+
+			st.push(tile(middle_x, right_x, middle_y, bottom_y));
+
+			if (middle_x > start_x)
+				st.push(tile(left_x, middle_x, middle_y, bottom_y));
+
+			if (middle_y > start_y)
+				st.push(tile(middle_x, right_x, top_y, middle_y));
+
+			if ((middle_x > start_x) && (middle_y > start_y))
+				st.push(tile(left_x, middle_x, top_y, middle_y));
+		}
 	}
 }
diff --git a/nnforge/cuda/util_cuda.h b/nnforge/cuda/util_cuda.h
index 7889798..1bceae7 100644
--- a/nnforge/cuda/util_cuda.h
+++ b/nnforge/cuda/util_cuda.h
@@ -18,6 +18,9 @@
 #include
 #include
 
+#include
+#include
+
 #include "cuda_running_configuration.h"
 
 namespace nnforge
@@ -81,10 +84,23 @@ namespace nnforge
 			int elem_count,
 			cudaStream_t cuda_stream);
 
+		static void copy_buffer(
+			const cuda_running_configuration& cuda_config,
+			const float * input_buf_with_aligned_size,
+			float * output_with_aligned_size,
+			int elem_count,
+			cudaStream_t cuda_stream);
+
 		static int get_group_count(
 			const cuda_running_configuration& cuda_config,
 			int total_thread_count,
-			int divisible);
+			int divisible,
+			bool more_threadblocks = false);
+
+		static void fill_tiling_pattern(
+			int size_x,
+			int size_y,
+			std::vector >& pair_list);
 
 	private:
 		cuda_util();
@@ -92,6 +108,23 @@ namespace nnforge
 		cuda_util& operator =(const cuda_util&);
 		~cuda_util();
 
+		struct tile
+		{
+			tile(int left_x, int right_x, int top_y, int bottom_y);
+
+			bool is_point() const;
+
+			void split_to_stack(
+				std::stack& st,
+				int start_x,
+				int start_y) const;
+
+			int left_x;
+			int right_x;
+			int top_y;
+			int bottom_y;
+		};
+
 		static const unsigned int preferred_width_2d_access;
 		static const unsigned int preferred_height_2d_access;
 		static const unsigned int preferred_threadblocksize_sequential_access;
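The revised get_group_count heuristic lowers the default threadblock target from eight per multiprocessor to four (sixteen when more_threadblocks is set) and raises the cap on splitting from sqrt(divisible) to divisible^(2/3). The snippet below is a standalone transcription of that arithmetic for experimentation, not the library function itself: the multiprocessor count is passed in directly instead of being read from cuda_running_configuration, and the static_cast target types, stripped from the listing above, are assumed to be int and float.

#include <algorithm>
#include <cmath>
#include <cstdio>

// Standalone transcription of the revised grouping heuristic.
int get_group_count_sketch(int multiprocessor_count, int total_thread_count, int divisible, bool more_threadblocks = false)
{
	int initial_threadblock_count = std::max(total_thread_count / 256, 1);
	// Previously multiprocessor_count * 8; now 4x by default, 16x when more threadblocks are requested.
	int minimum_threadblock_count = multiprocessor_count * 4 * (more_threadblocks ? 4 : 1);

	if (initial_threadblock_count >= minimum_threadblock_count)
		return 1;

	// Cap the split at divisible^(2/3) instead of sqrt(divisible).
	int group_count = std::min(minimum_threadblock_count / initial_threadblock_count,
		static_cast<int>(std::pow(static_cast<float>(divisible), 2.0F / 3.0F)));
	int iteration_count = (divisible + group_count - 1) / group_count;
	group_count = (divisible + iteration_count - 1) / iteration_count;

	return group_count;
}

int main()
{
	// Example: 8 multiprocessors, 2560 threads of work, a dimension of 75 to split over.
	std::printf("default: %d\n", get_group_count_sketch(8, 2560, 75));
	std::printf("more threadblocks: %d\n", get_group_count_sketch(8, 2560, 75, true));
	return 0;
}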