From e1316417c3583eb8e6cecd2f520305b3be1d7b25 Mon Sep 17 00:00:00 2001 From: Jose Flich Date: Wed, 14 Oct 2020 15:03:40 +0000 Subject: [PATCH 01/15] adaptation to FPGA completed for pipeline use case --- fpga_kernels/generate_makefile.cpp | 3 + .../kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp | 704 ++++++++++++++++++ fpga_kernels/test_fpga/Makefile | 19 + fpga_kernels/test_fpga/description.json | 64 ++ fpga_kernels/test_fpga/src/Makefile | 189 +++++ .../src/test_conv2D_K3x3_S1x1_P1x1_BS1.cpp | 383 ++++++++++ fpga_kernels/test_fpga/src/test_mult2d.cpp | 566 ++++++++++++++ fpga_kernels/test_fpga/src/test_relu.cpp | 304 ++++++++ fpga_kernels/test_fpga/utils.mk | 101 +++ include/eddl/hardware/fpga/fpga_enables.h | 3 +- include/eddl/hardware/fpga/fpga_hw.h | 5 +- include/eddl/profiling.h | 37 + include/eddl/tensor/tensor.h | 2 +- src/hardware/fpga/fpga_core.cpp | 7 +- src/hardware/fpga/nn/fpga_conv.cpp | 68 +- src/layers/core/layer_activation.cpp | 3 + src/layers/core/layer_reshape.cpp | 11 + src/tensor/nn/tensor_activations.cpp | 10 +- src/tensor/nn/tensor_conv.cpp | 10 +- src/tensor/nn/tensor_pool.cpp | 10 +- src/tensor/tensor.cpp | 64 +- src/tensor/tensor_comparison.cpp | 3 +- src/tensor/tensor_math.cpp | 17 +- 23 files changed, 2534 insertions(+), 49 deletions(-) create mode 100644 fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp create mode 100644 fpga_kernels/test_fpga/Makefile create mode 100644 fpga_kernels/test_fpga/description.json create mode 100644 fpga_kernels/test_fpga/src/Makefile create mode 100644 fpga_kernels/test_fpga/src/test_conv2D_K3x3_S1x1_P1x1_BS1.cpp create mode 100644 fpga_kernels/test_fpga/src/test_mult2d.cpp create mode 100644 fpga_kernels/test_fpga/src/test_relu.cpp create mode 100755 fpga_kernels/test_fpga/utils.mk create mode 100644 include/eddl/profiling.h diff --git a/fpga_kernels/generate_makefile.cpp b/fpga_kernels/generate_makefile.cpp index 2d19598d1..303ed3bec 100644 --- a/fpga_kernels/generate_makefile.cpp +++ b/fpga_kernels/generate_makefile.cpp @@ -166,6 +166,9 @@ int main(int argc, char **argv) { #ifdef K_ENABLED_CONV2D strcpy(szKernels[num_kernels++], "conv2d"); #endif + #ifdef K_ENABLED_CONV2D_K3X3_S1X1_P1X1_BS1 + strcpy(szKernels[num_kernels++], "conv2D_K3x3_S1x1_P1x1_BS1"); + #endif // Core diff --git a/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp b/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp new file mode 100644 index 000000000..20b25cbd2 --- /dev/null +++ b/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp @@ -0,0 +1,704 @@ +//KERNEL_CONV2D_4.cpp +//Modified by: Jorge García Martinez +//Date: 17/09/2020 +//Description: Based on kenel_conv2d_3.cpp. The goal of this code is to perform convolutions with a large number of inputs +//and outputs.For this, we use iteratively a limited number of input and output channels in the kernel. +//In all functions are used two loops for output and input iterations. In add function is added a buffer which stores +//the data that It should be written into the memory. 
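+//
+// Pipeline overview (informal sketch, summarising the stages defined below; the
+// stage names are the functions in this file, not an external API):
+//   read_input -> padding -> cvt -> mul -> add -> write_output
+// Channels are processed in groups: CPI input channels and CPO output channels
+// per pass, so the kernel iterates I_ITER = I/CPI times over the input for each
+// of the O_ITER = O/CPO output groups. For example, with I = 8, O = 8 and
+// CPI = CPO = 4 the kernel performs I_ITER = 2 and O_ITER = 2 passes, and the
+// add stage accumulates the two partial sums of each output group before the
+// result is written back to memory.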
+ + + +#include +#include +#include + +#include + +//#define DEBUG_VERBOSE + +extern "C" { + +// Fixed parameters (optimized at compilation/synthesis time) +#define KW 3 // kernel width +#define KH 3 // kernel height +//#define I 8 // number of input channels +//#define O 8 // number of output channels +#define CPI 4 // channels per input port +#define CPO 4 // channels per output port +//#define W 256 // input width +//#define H 256 // input height +//#define I_ITER I/CPI // iterations per input +//#define O_ITER O/CPO // iterations per output + +#define LOAD_MODEL +#define READ_MODEL +#define READ_INPUT +#define WRITE_OUTPUT + +// pixel_in +struct pixel_in_t { + float pixel[CPI]; +}; + +struct pixel_out_t { + float pixel[CPO]; +}; + +// frames struct +struct frame_t { + pixel_in_t pixel[9]; +}; + +// -------------------------------------------------------------------------------------- +// read_input: +// The function reads and writes the kernels, bias and data in different stream. +// Data are sent to padding module, kenels to mul and bias to add modules. +// LOOP FLOW +// ko = 0 +// b = 0 +// for o_iter 0 .. n +// read bias[b..b+3] +// b = b + 4 +// d = 0 +// ki = 0 +// for i_iter 0 .. n +// read kernel[ki..ki+3][ko..ko+3] +// ki = ki +4 +// read data[d..d+3] +// d = d + 4 +// +// ko = ko + 4 +// +// +// Arguments: +// ptr : Pointer to input data (in) +// k_ptr: pointer to kernels (in) +// b_ptr: pointer to bias (in) +// out : data output stream (out) +// k_out: pointer to kernel (out) +// b_out: pointer to bias (out) +// +static void read_input(int H, int W, int I, int O, int I_ITER, int O_ITER, pixel_in_t *ptr, float *k_ptr, float *b_ptr, hls::stream &k_out, hls::stream &b_out, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("read_input: start\n"); +#endif + + frame_t frame_k; + #pragma HLS ARRAY_PARTITION variable=frame_k dim=0 + + pixel_out_t bias; + #pragma HLS ARRAY_PARTITION variable=bias dim=0 + + pixel_in_t data; + #pragma HLS ARRAY_PARTITION variable=data dim=0 + + + read_input_o_iter_loop: + for (int o_iter = 0; o_iter < O_ITER; o_iter++){ + //Sending bias to add in pack of CPO bias + // int data_pointer = 0; + read_loop_bias_load: + for (int b=0; b &in, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("padding: start\n"); +#endif + +//we init zero only first time + +pixel_in_t data; +#pragma HLS ARRAY_PARTITION variable=data complete + +pixel_in_t zero; +#pragma HLS ARRAY_PARTITION variable=data complete + +for (int cpi=0; cpi &in, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("relu: start\n"); +#endif + + int data_size = W * H * O; + for (int i=0; i < data_size; i++) { + #pragma HLS PIPELINE II=1 + float data = in.read(); + if (data < 0) data = 0.f; + out << data; + } + +#ifdef DEBUG_VERBOSE + printf("relu: end\n"); +#endif +} + +// -------------------------------------------------------------------------------- +// write_output: Writes data comming from one stream into memory +// LOOP FLOW: +// for o_iter 0 .. n +// write data[do .. 
do+3] +// +// d = d + 4 +// +// Arguments: +// ptr: memory address pointer +// in: input stream +// +static void write_output(int H, int W, int O_ITER, pixel_out_t *ptr, hls::stream &in) { + +#ifdef DEBUG_VERBOSE + printf("write_output: start\n"); +#endif + + + + // int data_pointer = 0; + + // write_output_o_iter_loop: + // for (int o_iter = 0; o_iter &in, hls::stream &out, int id) { + +#ifdef DEBUG_VERBOSE + printf("cvt_%d: start\n", id); +#endif + +cvt_o_iter_loop: +for (int o_iter = 0; o_iter < O_ITER; o_iter++){ + cvt_i_iter_loop: + for(int i_iter = 0; i_iter < I_ITER; i_iter++){ + + // Now we process the input data and convert the data into frames + + // buffers (keep three rows) + pixel_in_t buffer0[W+2]; + pixel_in_t buffer1[W+2]; + pixel_in_t buffer2[W+2]; + #pragma HLS ARRAY_PARTITION variable=buffer0 cyclic dim=1 factor=2 + #pragma HLS ARRAY_PARTITION variable=buffer1 cyclic dim=1 factor=2 + #pragma HLS ARRAY_PARTITION variable=buffer2 cyclic dim=1 factor=2 + + // frame + frame_t frame; + #pragma HLS ARRAY_PARTITION variable=frame + + // We loop for every incoming pixel + cvt_loop_1: + for (int pin_row=0; pin_row < H+2; pin_row++) { + cvt_loop_2: + for (int pin_col=0; pin_col < W+2; pin_col++) { + // get the pixel + pixel_in_t pixel; + pixel = in.read(); + // row buffer write (in which buffer row we write the pixel) + int row0_buffer_write = (pin_row % 3) == 0; + int row1_buffer_write = (pin_row % 3) == 1; + // first row buffer + int row0 = (pin_row <= 2) | ((pin_row % 3) == 2); + int row1 = !row0 & ((pin_row % 3) == 0); + // we write the pixel into the buffer + if (row0_buffer_write) buffer0[pin_col] = pixel; else if (row1_buffer_write) buffer1[pin_col] = pixel; else buffer2[pin_col] = pixel; + // build the frame + pixel_in_t p0, p1, p2, p3, p4, p5, p6, p7, p8; + int shift_frame = (pin_row>1) & (pin_col > 2); + int send_frame = (pin_row>1) & (pin_col > 1); + pixel_in_t pixel_b0, pixel_b1, pixel_b2; + pixel_b0 = buffer0[pin_col]; + pixel_b1 = buffer1[pin_col]; + pixel_b2 = buffer2[pin_col]; + // p0, p1, p2 + if (shift_frame) {p0 = p1;} else if (pin_col==0) {if (row0) p0 = pixel_b0; else if (row1) p0 = pixel_b1; else p0 = pixel_b2;} + if (shift_frame) {p1 = p2;} else if (pin_col==1) {if (row0) p1 = pixel_b0; else if (row1) p1 = pixel_b1; else p1 = pixel_b2;} + if (row0) p2 = pixel_b0; else if (row1) p2 = pixel_b1; else p2 = pixel_b2; + // p3, p4, p5 + if (shift_frame) {p3 = p4;} else if (pin_col==0) {if (row0) p3 = pixel_b1; else if (row1) p3 = pixel_b2; else p3 = pixel_b0;} + if (shift_frame) {p4 = p5;} else if (pin_col==1) {if (row0) p4 = pixel_b1; else if (row1) p4 = pixel_b2; else p4 = pixel_b0;} + if (row0) p5 = pixel_b1; else if (row1) p5 = pixel_b2; else p5 = pixel_b0; + // p6, p7, p8 + if (shift_frame) {p6 = p7;} else if (pin_col==0) {if (row0) p6 = pixel_b2; else if (row1) p6 = pixel_b0; else p6 = pixel_b1;} + if (shift_frame) {p7 = p8;} else if (pin_col==1) {if (row0) p7 = pixel_b2; else if (row1) p7 = pixel_b0; else p7 = pixel_b1;} + if (row0) p8 = pixel_b2; else if (row1) p8 = pixel_b0; else p8 = pixel_b1; + + if (send_frame) { + frame.pixel[0] = p0; frame.pixel[1] = p1; frame.pixel[2] = p2; + frame.pixel[3] = p3; frame.pixel[4] = p4; frame.pixel[5] = p5; + frame.pixel[6] = p6; frame.pixel[7] = p7; frame.pixel[8] = p8; + out << frame; + #ifdef DEBUG_VERBOSE + printf("cvt_%d: frame sent:\n", id); + for (int cpi=0; cpi &in, hls::stream &k_in, hls::stream &out, int id) { + +#ifdef DEBUG_VERBOSE + printf("mul_%d: start\n", id); +#endif + + // first we read the kernels + 
frame_t kernel[CPI]; + #pragma HLS ARRAY_PARTITION variable=kernel dim=0 + frame_t data_in; + +#ifdef LOAD_MODEL + + mul_o_iter_loop: + for (int o_iter = 0; o_iter < O_ITER; o_iter++){ + mul_i_iter_loop: + for(int i_iter = 0; i_iter < I_ITER; i_iter++){ + //we load the kernels into pack of frames + loop_mul_kernels_load_cpo: + for (int cpi=0; cpi %6.4f\n", cpo, sum[cpo]); + #endif + p_out.pixel[cpo] = sum[cpo]; + sum[cpo] = 0.f; + } + out << p_out; + } + } //i_iter +} //o_iter + +#endif + + +#ifdef DEBUG_VERBOSE + printf("mul_%d: end\n", id); +#endif +} + +// ------------------------------------------------------------------------------- +// add: This function performs the addition of all subpixels for the same channel. +// It adds also the corresponding bias. +// LOOP FLOW +// for o_iter 0 .. n +// receive bias[b..b+3] +// init buff_o_channels with bias +// for i_iter 0 .. n +// receive data[do..d+3] +// buff_o_channels = buff_o_channels + data +// +// for num_iterations +// for CPO +// send data to write module +// +// Arguments: +// in: input streams data +// b_in: input stream bias +// out: output stream +// +static void add(int H, int W, int I_ITER, int O_ITER, hls::stream &in, hls::stream &b_in, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("add: start\n"); +#endif + + float bias[CPO]; + + //number of iterations by CPI || CPO channels + int num_iterations = W * H; + + //Buffer for all data and CPO channels + float buff_o_channels[CPO][num_iterations]; + #pragma HLS ARRAY_PARTITION variable=buff_o_channels dim=0 block factor=4 + + //We read Bias in O_iter packs of CPO size + add_o_iter_loop: + for (int o_iter = 0; o_iter &in, hls::stream &k_in, hls::stream &b_in, hls::stream &out) { + + // streams + static hls::stream str_pad_cvt; // padding->cvt + static hls::stream str_cvt_mul; // cvt->mul + static hls::stream str_mul_add; // mul->add + + + // topology + #pragma HLS dataflow + padding(H, W, I_ITER, O_ITER, in, str_pad_cvt); // padding + cvt(H, W, I_ITER, O_ITER, str_pad_cvt, str_cvt_mul, 0); // cvt + mul(H, W, I_ITER, O_ITER, str_cvt_mul, k_in, str_mul_add, 0); // mul + add(H, W, I_ITER, O_ITER, str_mul_add, b_in, out); // add +} + +void k_conv2D_K3x3_S1x1_P1x1_BS1(pixel_in_t *ptr_data, int H, int W, int I, float *ptr_kernel, float *ptr_bias, pixel_out_t *ptr_out, int O) { + + //#pragma HLS INTERFACE s_axilite port=W bundle=control + //#pragma HLS INTERFACE s_axilite port=H bundle=control + #pragma HLS INTERFACE m_axi port=ptr_data offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_kernel offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_bias offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_out offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE s_axilite port=return bundle=control + + // ptr_data struct to be packed as a single element vector (to improve memory read) + // the compiler will do full structure access (all elements of structure) + #pragma HLS data_pack variable = ptr_data + #pragma HLS data_pack variable = ptr_out + + int I_ITER = I/CPI; + int O_ITER = O/CPO; + + // input and output streams + static hls::stream out_read; + static hls::stream out_read_kernel; + static hls::stream out_read_bias; + static hls::stream out_conv; + + // stream sizes + #pragma HLS STREAM variable = out_read depth = 32 + #pragma HLS STREAM 
variable = out_read_kernel depth = 32 + #pragma HLS STREAM variable = out_read_bias depth = 32 + #pragma HLS STREAM variable = out_conv depth = 32 + #pragma HLS STREAM variable = out_relu depth = 32 + + #pragma HLS dataflow + read_input(H, W, I, O, I_ITER, O_ITER, ptr_data, ptr_kernel, ptr_bias, out_read_kernel, out_read_bias, out_read); + conv(H, W, I, O, I_ITER, O_ITER, out_read, out_read_kernel, out_read_bias, out_conv); + write_output(H, W, O_ITER, ptr_out, out_conv); +} + +} // end extern "C" diff --git a/fpga_kernels/test_fpga/Makefile b/fpga_kernels/test_fpga/Makefile new file mode 100644 index 000000000..7dc6b92d8 --- /dev/null +++ b/fpga_kernels/test_fpga/Makefile @@ -0,0 +1,19 @@ +# list of kernel test to compile +LIST ?=conv2D_K3x3_S1x1_P1x1_BS1 + +# default target +all build clean cleanall: KERNELS + +KERNELS : + for krnl in $(LIST); do \ + $(info Launch Makefile to generate test for kernel $(krnl)) \ + $(MAKE) -C ./src -e KNAME=$$krnl $(MAKECMDGOALS) ; \ + done +# build any target by forwarding to $(dirs) rule +#% : $(DIRS) ; + +.PHONY: KERNELS all build clean cleanall + + +$(info all done) + diff --git a/fpga_kernels/test_fpga/description.json b/fpga_kernels/test_fpga/description.json new file mode 100644 index 000000000..4625f665d --- /dev/null +++ b/fpga_kernels/test_fpga/description.json @@ -0,0 +1,64 @@ +{ + "name": "Data Transfer (C)", + "description": [ + "This example illustrates several ways to use the OpenCL API to transfer data to and from the FPGA" + ], + "keywords": [ + "enqueueWriteBuffer", + "enqueueReadBuffer", + "enqueueMapBuffer", + "enqueueUnmapMemObject", + "enqueueMigrateMemObjects" + ], + "key_concepts": [ + "OpenCL API", + "Data Transfer", + "Write Buffers", + "Read Buffers", + "Map Buffers", + "Async Memcpy" + ], + "os": [ + "Linux" + ], + "runtime": [ + "OpenCL" + ], + "host": { + "host_exe": "data_transfer", + "compiler": { + "sources": [ + "REPO_DIR/common/includes/xcl2" + ], + "includepaths": [ + "REPO_DIR/common/includes/xcl2" + ] + } + }, + "containers": [ + { + "accelerators": [ + { + "name": "dummy_kernel", + "location": "src/dummy_kernel.cpp" + } + ], + "name": "dummy_kernel" + } + ], + "launch": [ + { + "cmd_args": "BUILD/dummy_kernel.xclbin", + "name": "generic launch for all flows" + } + ], + "contributors": [ + { + "url": "http://www.xilinx.com", + "group": "Xilinx" + } + ], + "testinfo": { + "profile": "no" + } +} diff --git a/fpga_kernels/test_fpga/src/Makefile b/fpga_kernels/test_fpga/src/Makefile new file mode 100644 index 000000000..97b860376 --- /dev/null +++ b/fpga_kernels/test_fpga/src/Makefile @@ -0,0 +1,189 @@ +.PHONY: help + +help:: + $(ECHO) "Makefile Usage:" + $(ECHO) " make all TARGET= DEVICE= HOST_ARCH= EDGE_COMMON_SW=" + $(ECHO) " Command to generate the design for specified Target and Shell." + $(ECHO) " By default, HOST_ARCH=x86. HOST_ARCH and EDGE_COMMON_SW is required for SoC shells" + $(ECHO) "" + $(ECHO) " make clean " + $(ECHO) " Command to remove the generated non-hardware files." + $(ECHO) "" + $(ECHO) " make cleanall" + $(ECHO) " Command to remove all the generated files." + $(ECHO) "" + $(ECHO) " make test DEVICE=" + $(ECHO) " Command to run the application. This is same as 'check' target but does not have any makefile dependency." + $(ECHO) "" + $(ECHO) " make sd_card TARGET= DEVICE= HOST_ARCH= EDGE_COMMON_SW=" + $(ECHO) " Command to prepare sd_card files." + $(ECHO) " By default, HOST_ARCH=x86. 
HOST_ARCH and EDGE_COMMON_SW is required for SoC shells" + $(ECHO) "" + $(ECHO) " make check TARGET= DEVICE= HOST_ARCH= EDGE_COMMON_SW=" + $(ECHO) " Command to run application in emulation." + $(ECHO) " By default, HOST_ARCH=x86. HOST_ARCH and EDGE_COMMON_SW is required for SoC shells" + $(ECHO) "" + $(ECHO) " make build TARGET= DEVICE= HOST_ARCH= EDGE_COMMON_SW=" + $(ECHO) " Command to build xclbin application." + $(ECHO) " By default, HOST_ARCH=x86. HOST_ARCH and EDGE_COMMON_SW is required for SoC shells" + $(ECHO) "" + + +KNAME ?= DUMMY + +KRNL_DIR ?= ./../.. +KRNL_FUNC ?= k_$(KNAME) +KRNL_NAME ?= kernel_$(KNAME) +KRNL_SRCS ?= $(KRNL_DIR)/kernel_$(KNAME).cpp +KRNL_FILE := kernel_$(KNAME).cpp +TEST_SRCS ?= test_$(KNAME).cpp + +DEVICE ?= xilinx_u200_xdma_201830_2 +TARGET ?= sw_emu +HOST_ARCH ?= x86 +SYSROOT ?= + +$(info ) +$(info ) +$(info Running Makefile for KERNEL $(KNAME) DEVICE $(DEVICE) TARGET $(TARGET) ) +$(info ) + +BASE_DIR = ./.. + +# Points to top directory of Git repository +COMMON_REPO = $(BASE_DIR) +PWD = $(shell readlink -f .) +ABS_COMMON_REPO = $(shell readlink -f $(COMMON_REPO)) + + +include $(BASE_DIR)/utils.mk + +XSA := $(call device2xsa, $(DEVICE)) +TEMP_DIR := $(BASE_DIR)/_x.$(TARGET).$(XSA) +BUILD_DIR := $(BASE_DIR)/build_dir.$(TARGET).$(XSA) + +# SoC variables +RUN_APP_SCRIPT = run_app.sh +PACKAGE_OUT = package.$(TARGET) + +LAUNCH_EMULATOR = $(PACKAGE_OUT)/launch_$(TARGET).sh +RESULT_STRING = TEST PASSED + +VPP := v++ +SDCARD := sd_card + +#Include Libraries +include $(ABS_COMMON_REPO)/common/includes/opencl/opencl.mk +include $(ABS_COMMON_REPO)/common/includes/xcl2/xcl2.mk +CXXFLAGS += $(xcl2_CXXFLAGS) +LDFLAGS += $(xcl2_LDFLAGS) +HOST_SRCS += $(xcl2_SRCS) +CXXFLAGS += $(opencl_CXXFLAGS) -Wall -O0 -g -std=c++11 +LDFLAGS += $(opencl_LDFLAGS) +INCL_DIR := -I$(KRNL_DIR) + +HOST_SRCS += $(TEST_SRCS) + +# Host compiler global settings +CXXFLAGS += -fmessage-length=0 +LDFLAGS += -lrt -lstdc++ + + +ifneq ($(HOST_ARCH), x86) + LDFLAGS += --sysroot=$(SYSROOT) +endif + +# Kernel compiler global settings +CLFLAGS += -t $(TARGET) --platform $(DEVICE) --save-temps +ifneq ($(TARGET), hw) + CLFLAGS += -g +endif + +EXECUTABLE := $(TEST_SRCS:%.cpp=%) +#$(info "EXECUTABLE is $(EXECUTABLE)" ) +KRNL_XCLBIN = $(KRNL_NAME:%=%.xclbin) +KRNL_OBJ = $(KRNL_NAME:%=%.xo) + +CMD_ARGS = $(BUILD_DIR)/$(KRNL_XCLBIN) +EMCONFIG_DIR = $(TEMP_DIR) +EMU_DIR = $(SDCARD)/data/emulation + +BINARY_CONTAINERS += $(BUILD_DIR)/$(KRNL_XCLBIN) +BINARY_CONTAINER_kernel_OBJS += $(TEMP_DIR)/$(KRNL_OBJ) + +CP = cp -rf + +.PHONY: all clean cleanall docs emconfig +all: check-devices $(EXECUTABLE) $(BINARY_CONTAINERS) emconfig sd_card + +.PHONY: exe +exe: $(EXECUTABLE) + +.PHONY: build +build: check-vitis $(BINARY_CONTAINERS) + + +# Building kernel +$(TEMP_DIR)/$(KRNL_OBJ): $(KRNL_SRCS) + mkdir -p $(TEMP_DIR) + $(VPP) $(CLFLAGS) --temp_dir $(TEMP_DIR) -c -k $(KRNL_FUNC) -I'$( /* printf, scanf, NULL */ +#include /* malloc, free, rand */ + +#include +#include +#include +#include +#include +#include "xcl2.hpp" + +using std::vector; + +// CL +cl::Buffer buf; +cl::Context context; +cl::CommandQueue q; +cl::Program program; + + +#define W 256 //256 +#define H 256 //256 +#define C 4 //I +#define COUT 4 //O +#define KW 3 +#define KH 3 + +// buffers +float data_in[ W * H * C ] __attribute__ ((__aligned__(16))); +float kernel [ KW * KH * C * COUT] __attribute__ ((__aligned__(16))); +float bias [ COUT ] __attribute__ ((__aligned__(16))); +float out [ W * H * COUT ] __attribute__ ((__aligned__(16))); +float out_cpu[ W * H * COUT 
] __attribute__ ((__aligned__(16))); + +void cpu_conv2d() { + + int size_out = W * H * COUT; + for (int i=0; i 0.001) { + printf("Results mismatch at cout %d h %d w %d: %6.4f %6.4f (diff %6.4f)\n", cout, h, w, out_cpu[addr_o], out[addr_o], fabs(out_cpu[addr_o]-out[addr_o])); + error = 1; + return; + } + } + } + } + if (!error) printf("results OK!\n"); else { + printf("results differ:\n"); + //cpu_print_out(); + } +} + + +//--------------------------------------------------------------------------------------------------------------------- +//--------------------------------------------------------------------------------------------------------------------- + +// An event callback function that prints the operations performed by the OpenCL +// runtime. +void event_cb(cl_event event1, cl_int cmd_status, void *data) { + cl_int err; + cl_command_type command; + cl::Event event(event1, true); + OCL_CHECK(err, err = event.getInfo(CL_EVENT_COMMAND_TYPE, &command)); + cl_int status; + OCL_CHECK(err, + err = event.getInfo(CL_EVENT_COMMAND_EXECUTION_STATUS, &status)); + const char *command_str; + const char *status_str; + switch (command) { + case CL_COMMAND_READ_BUFFER: + command_str = "buffer read"; + break; + case CL_COMMAND_WRITE_BUFFER: + command_str = "buffer write"; + break; + case CL_COMMAND_NDRANGE_KERNEL: + command_str = "kernel"; + break; + case CL_COMMAND_MAP_BUFFER: + command_str = "kernel"; + break; + case CL_COMMAND_COPY_BUFFER: + command_str = "kernel"; + break; + case CL_COMMAND_MIGRATE_MEM_OBJECTS: + command_str = "buffer migrate"; + break; + default: + command_str = "unknown"; + } + switch (status) { + case CL_QUEUED: + status_str = "Queued"; + break; + case CL_SUBMITTED: + status_str = "Submitted"; + break; + case CL_RUNNING: + status_str = "Executing"; + break; + case CL_COMPLETE: + status_str = "Completed"; + break; + } + printf("[%s]: %s %s\n", reinterpret_cast(data), status_str, + command_str); + fflush(stdout); +} + +// Sets the callback for a particular event +void set_callback(cl::Event event, const char *queue_name) { + cl_int err; + OCL_CHECK(err, + err = event.setCallback(CL_COMPLETE, event_cb, (void *)queue_name)); +} + +//--------------------------------------------------------------------------------------------------------------------- + +int main(int argc, char **argv) { + if (argc != 2) { + std::cout << "Usage: " << argv[0] << " " << std::endl; + return EXIT_FAILURE; + } + + printf("Test CONV: [WxHxC] = [%dx%dx%d] -> [WxHxC] = [%dx%dx%d] (kernel [%dx%d], stride [1x1], padding [1x1])\n", W, H, C, W, H, COUT, KW, KH); + + std::string binaryFile = argv[1]; + cl_int err; + cl::Kernel kernel_conv2d_2; + + std::cout << "Creating Context..." << std::endl; + auto devices = xcl::get_xil_devices(); + auto device = devices[0]; + OCL_CHECK(err, cl::Context context(device, NULL, NULL, NULL, &err)); + OCL_CHECK(err, cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE, &err)); + + std::string device_name = device.getInfo(); + auto fileBuf = xcl::read_binary_file(binaryFile); + cl::Program::Binaries bins{{fileBuf.data(), fileBuf.size()}}; + devices.resize(1); + + OCL_CHECK(err, cl::Program program(context, devices, bins, NULL, &err)); + std::cout << "Device " << device_name.c_str() << ": program successful!" 
<< std::endl; + + OCL_CHECK(err, kernel_conv2d_2 = cl::Kernel(program,"k_conv2D_K3x3_S1x1_P1x1_BS1", &err)); + std::cout << "Kernel sucessfully created" << std::endl ; + + size_t size_data_in_bytes = W*H*C*sizeof(float); + size_t size_output_in_bytes = W*H*COUT * sizeof(float); + size_t size_kernel_in_bytes = KW * KH * C * COUT * sizeof(float); + size_t size_bias_in_bytes = COUT * sizeof(float); + // Allocate memory on the host and fill with random data. + + //----------------------------- + // fill data vector with random data + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + std::cout << "Filling buffer with useful data" << std::endl ; + int addr = 0; + for (int h=0; h kernel_events(1); + vector read_events(1); + vector write_events(1); + cl::Buffer buffer_a; + cl::Buffer buffer_b; + cl::Buffer buffer_k; + cl::Buffer buffer_bias; + + //----------------------------- + // Allocate Buffer in Global Memory + // Buffers are allocated using CL_MEM_USE_HOST_PTR for efficient memory and + // Device-to-host communication + std::cout << "Creating Buffers..." << std::endl; + + OCL_CHECK(err, buffer_a = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_data_in_bytes, &data_in, &err)); + OCL_CHECK(err, buffer_b = cl::Buffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR , size_output_in_bytes, &out, &err)); + OCL_CHECK(err, buffer_k = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_kernel_in_bytes, &kernel, &err)); + OCL_CHECK(err, buffer_bias = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_bias_in_bytes, &bias, &err)); + + // set kernel arguments + int arg = 0; + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_a)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, H)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, W)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, C)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_k)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_bias)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_b)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, COUT)); + + //----------------------------- + // Copy input data to device global memory + std::cout << "Copying data (Host to Device)..." << std::endl; + // Because we are passing the write_events, it returns an event object + // that identifies this particular command and can be used to query + // or queue a wait for this particular command to complete. + OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_a}, 0 /*0 means from host*/, NULL, &write_events[0])); + set_callback(write_events[0], "ooo_queue"); + + OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_k}, 0 /*0 means from host*/, NULL, &write_events[0])); + set_callback(write_events[0], "ooo_queue"); + + //----------------------------- + printf("Enqueueing NDRange kernel.\n"); + // This event needs to wait for the write buffer operations to complete + // before executing. We are sending the write_events into its wait list to + // ensure that the order of operations is correct. + // Launch the Kernel + std::vector waitList; + waitList.push_back(write_events[0]); + OCL_CHECK(err, err = q.enqueueNDRangeKernel(kernel_conv2d_2, 0, 1, 1, &waitList, &kernel_events[0])); + set_callback(kernel_events[0], "ooo_queue"); + + + + std::cout << "Getting Results (Device to Host)..." 
<< std::endl; + std::vector eventList; + eventList.push_back(kernel_events[0]); + // This operation only needs to wait for the kernel call. + OCL_CHECK(err, err = q.enqueueMigrateMemObjects({buffer_b}, CL_MIGRATE_MEM_OBJECT_HOST, &eventList, &read_events[0])); + set_callback(read_events[0], "ooo_queue"); + OCL_CHECK(err, err = read_events[0].wait()); + + // Wait for all of the OpenCL operations to complete + std::cout << "Waiting..." << std::endl; + OCL_CHECK(err, err = q.flush()); + OCL_CHECK(err, err = q.finish()); + + + std::cout << "computing conv in CPU..." << std::endl; + + // cpu_print_data_in(); + // cpu_print_kernels(); + // cpu_print_bias(); + // cpu_conv2d(); + // cpu_print_out(); + + // check_result(); + + //----------------------------- + std::cout << "" << std::endl; + std::cout << "All done" << std::endl; + std::cout << "quit now" << std::endl; + + // exit + return 0; +} diff --git a/fpga_kernels/test_fpga/src/test_mult2d.cpp b/fpga_kernels/test_fpga/src/test_mult2d.cpp new file mode 100644 index 000000000..37add80b6 --- /dev/null +++ b/fpga_kernels/test_fpga/src/test_mult2d.cpp @@ -0,0 +1,566 @@ +#include /* printf, scanf, NULL */ +#include /* malloc, free, rand */ + +#include +#include +#include +#include +#include +#include "xcl2.hpp" + +//#define VERBOSE +//#define DEBUG + +using std::vector; + +//--------------------------------------------------------------------------------------------------------------------- +//--------------------------------------------------------------------------------------------------------------------- + +// An event callback function that prints the operations performed by the OpenCL +// runtime. +void event_cb( + cl_event event1, + cl_int cmd_status + , void *data +) { + cl_int err; + cl_command_type command; + cl::Event event(event1, true); + OCL_CHECK(err, err = event.getInfo(CL_EVENT_COMMAND_TYPE, &command)); + cl_int status; + OCL_CHECK(err, + err = event.getInfo(CL_EVENT_COMMAND_EXECUTION_STATUS, &status)); + const char *command_str; + const char *status_str; + switch (command) { + case CL_COMMAND_READ_BUFFER: + command_str = "buffer read"; + break; + case CL_COMMAND_WRITE_BUFFER: + command_str = "buffer write"; + break; + case CL_COMMAND_NDRANGE_KERNEL: + command_str = "kernel"; + break; + case CL_COMMAND_MAP_BUFFER: + command_str = "kernel"; + break; + case CL_COMMAND_COPY_BUFFER: + command_str = "kernel"; + break; + case CL_COMMAND_MIGRATE_MEM_OBJECTS: + command_str = "buffer migrate"; + break; + default: + command_str = "unknown"; + } + switch (status) { + case CL_QUEUED: + status_str = "Queued"; + break; + case CL_SUBMITTED: + status_str = "Submitted"; + break; + case CL_RUNNING: + status_str = "Executing"; + break; + case CL_COMPLETE: + status_str = "Completed"; + break; + } + printf("[%s]: %s %s\n", reinterpret_cast(data), status_str, + command_str); + fflush(stdout); +} + +//--------------------------------------------------------------------------------------------------------------------- +// Sets the callback for a particular event +void set_callback( + cl::Event event, + const char *queue_name +) { + cl_int err; + OCL_CHECK(err, err = event.setCallback(CL_COMPLETE, event_cb, (void *)queue_name)); +} + +//--------------------------------------------------------------------------------------------------------------------- +void usage ( + char *p_name +) { + std::cout << "ERROR: unexpected number of parameters" << std::endl; + std::cout << "Usage: " << p_name << " " << " " << std::endl; + exit(EXIT_FAILURE); +} + 
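+//---------------------------------------------------------------------------------------------------------------------
+// What this test checks (informal sketch; the actual reference is computed in
+// run_cpu() further below): the kernel is expected to produce
+//   C = op(A) * op(B) + (incC ? C : 0)
+// where op(A) is A or its transpose depending on tA, and op(B) is B or its
+// transpose depending on tB. The FPGA result in c[] is then compared element by
+// element against the host-side c_local[].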
+//--------------------------------------------------------------------------------------------------------------------- +void fpga_init( + cl::Context &context, + cl::CommandQueue &q, + cl::Program &program, + cl::Kernel &kernel_ut, + const char *fname, // fpga device binary file name + const char *k_name // kernel name +) { + cl_int err; + std::string binaryFile = fname; + + // OPENCL HOST CODE AREA START + // get_xil_devices() is a utility API which will find the xilinx + // platforms and will return list of devices connected to Xilinx platform + std::cout << "Creating Context..." << std::endl; + // The get_xil_devices will return vector of Xilinx Devices + auto devices = xcl::get_xil_devices(); + auto device = devices[0]; + +// std::vector> host_memory(elements, 42); + // Creating Context and Command Queue for selected Device + OCL_CHECK(err, context = cl::Context(device, NULL, NULL, NULL, &err)); + + std::cout << " setting command queue" << std::endl; + OCL_CHECK(err, q = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err)); + + std::string device_name = device.getInfo(); + std::cout << "Allocating and transferring binary file to " << device_name.c_str() << std::endl; + + auto fileBuf = xcl::read_binary_file(binaryFile); + cl::Program::Binaries bins{{fileBuf.data(), fileBuf.size()}}; + devices.resize(1); + + + std::cout << "Loading program to " << device_name.c_str() << std::endl; + OCL_CHECK(err, program = cl::Program(context, devices, bins, NULL, &err)); + std::cout << " ... program successful!" << std::endl; + + std::cout << "Creating kernel in program" << std::endl; + OCL_CHECK(err, kernel_ut = cl::Kernel(program, k_name, &err)); + std::cout << " ... kernel sucessfully created" << std::endl ; + +} + +//--------------------------------------------------------------------------------------------------------------------- +void create_buffers( + cl::Context &context, + cl::Buffer &buffer_a, + cl::Buffer &buffer_b, + cl::Buffer &buffer_c, + vector> &a, + vector> &b, + vector> &c, + size_t size_a_in_bytes, + size_t size_b_in_bytes, + size_t size_c_in_bytes + +) { + cl_int err; + + //----------------------------- + // Allocate Buffer in Global Memory + // Buffers are allocated using CL_MEM_USE_HOST_PTR for efficient memory and + // Device-to-host communication + std::cout << "Creating Buffers..." 
<< std::endl; + + OCL_CHECK(err, buffer_a = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_a_in_bytes, &a[0], &err)); + OCL_CHECK(err, buffer_b = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_b_in_bytes, &b[0], &err)); + // buffer c will be used for write, and depending on the params can also be read (incremental mmul) + OCL_CHECK(err, buffer_c = cl::Buffer(context, CL_MEM_USE_HOST_PTR , size_c_in_bytes, &c[0], &err)); +} + +//--------------------------------------------------------------------------------------------------------------------- +void fill( + vector> &a, + vector> &b, + vector> &c, + vector> &c_local, + int Ashape0, int Ashape1, + int Bshape0, int Bshape1, + int Cshape0, int Cshape1 +) { + // Set/Initialize matrices + // fill data vectors + int val = 0; + std::cout << "Filling matrix A[" << Ashape0 << " , " << Ashape1 << "] with sequential values" << std::endl ; + for (int i = 0; i < Ashape0; i++) { + for (int j = 0; j < Ashape1; j++) { + int ind = i*Ashape1 + j; + a[ind] = val; + val += 1; + } + } + + std::cout << "Filling matrix B[" << Bshape0 << " , " << Bshape1 << "] to be the Identity Matrix" << std::endl ; + for (int i = 0; i < Bshape0; i++) { + for (int j = 0; j < Bshape1; j++) { + int ind = i*Bshape1 + j; + b[ind] = (i == j) ? 1 : 0; + } + } + + std::cout << "result matrix C will be dimensioned to: C[" << Cshape0 << ", " << Cshape1 << "]" << std::endl; + std::cout << "Filling matrix C[" << Cshape0 << " , " << Bshape1 << "] with 1s" << std::endl ; + for (int i = 0; i < Cshape0; i++) { + for (int j = 0; j < Cshape1; j++) { + int ind = i*Cshape1 + j; + c[ind] = 1; + c_local[ind] = c[ind]; + } + } + +} + +//--------------------------------------------------------------------------------------------------------------------- +void run( + cl::Context &context, + cl::CommandQueue &q, + cl::Kernel &kernel_ut, + cl::Buffer &buffer_a, + cl::Buffer &buffer_b, + cl::Buffer &buffer_c, + int Ashape0, int Ashape1, + int Bshape0, int Bshape1, + int tA, int tB, int incC +) { + cl_int err; + + // These events will be used to track when a kernel is finished with + // the input and output buffers. Once the kernel is finished processing the data, + // a new set of elements will be written into the output buffer. + vector kernel_events(1); + vector read_events(1); + vector write_events(1); + + + //----------------------------- + // These events will be used to track when a kernel is finished with + // the input and output buffers. Once the kernel is finished processing the data, + // a new set of elements will be written into the output buffer. + //vector kernel_events(1); + //vector read_events(1); + //vector write_events(1); + + // set kernel arguments + //test_run_index++; + std::cout << std::endl; + //std::cout << "RUN "<< test_run_index << std::endl; + std::cout << "Setting kernel arguments... 
tA " << tA << " tB " << tB << " incC " << incC << std::endl; + OCL_CHECK(err, err = kernel_ut.setArg(0, buffer_a)); + OCL_CHECK(err, err = kernel_ut.setArg(1, buffer_b)); + OCL_CHECK(err, err = kernel_ut.setArg(2, buffer_c)); + OCL_CHECK(err, err = kernel_ut.setArg(3, Ashape0)); + OCL_CHECK(err, err = kernel_ut.setArg(4, Ashape1)); + OCL_CHECK(err, err = kernel_ut.setArg(5, Bshape0)); + OCL_CHECK(err, err = kernel_ut.setArg(6, Bshape1)); + OCL_CHECK(err, err = kernel_ut.setArg(7, tA)); + OCL_CHECK(err, err = kernel_ut.setArg(8, tB)); + OCL_CHECK(err, err = kernel_ut.setArg(9, incC)); + + //----------------------------- + // Copy input data to device global memory + std::cout << "Copying data (Host to Device)..." << std::endl; + // Because we are passing the write_events, it returns an event object + // that identifies this particular command and can be used to query + // or queue a wait for this particular command to complete. + OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_a, buffer_b, buffer_c}, 0 /*0 means from host*/, NULL, &write_events[0])); + set_callback(write_events[0], "ooo_queue"); + + //----------------------------- + printf("Enqueueing NDRange kernel.\n"); + // This event needs to wait for the write buffer operations to complete + // before executing. We are sending the write_events into its wait list to + // ensure that the order of operations is correct. + // Launch the Kernel + std::vector waitList; + waitList.push_back(write_events[0]); + OCL_CHECK(err, err = q.enqueueNDRangeKernel(kernel_ut, 0, 1, 1, &waitList, &kernel_events[0])); + set_callback(kernel_events[0], "ooo_queue"); + + //----------------------------- + // Copy Result from Device Global Memory to Host Local Memory + std::cout << "Getting Results (Device to Host)..." << std::endl; + std::vector eventList; + eventList.push_back(kernel_events[0]); + // This operation only needs to wait for the kernel call. + OCL_CHECK(err, err = q.enqueueMigrateMemObjects({buffer_c}, CL_MIGRATE_MEM_OBJECT_HOST, &eventList, &read_events[0])); + set_callback(read_events[0], "ooo_queue"); + OCL_CHECK(err, err = read_events[0].wait()); + + std::cout << " Matrix C retrieved from device memory" << std::endl; + + //----------------------------- + // HEY !!!! + // It is necessary to release the resources, all of them, + // memories, buffers, kernels, programs,... + + // Wait for all of the OpenCL operations to complete + std::cout << "Waiting for all the operations to complete..." 
<< std::endl; + OCL_CHECK(err, err = q.flush()); + OCL_CHECK(err, err = q.finish()); + + // clear event queues + kernel_events.clear(); + kernel_events.shrink_to_fit(); + read_events.clear(); + read_events.shrink_to_fit(); + write_events.clear(); + write_events.shrink_to_fit(); + +} + +//--------------------------------------------------------------------------------------------------------------------- +void run_cpu(const vector> &a, + const vector> &b, + //const vector> &c, + vector> &c_local, + int Ashape0, int Ashape1, + int Bshape0, int Bshape1, + int Cshape0, int Cshape1, + int tA, int tB, int incC +) { + int *fA_sum, *fA_mult, *fB_sum, *fB_mult; + int kmax; // common dimension, + int i, j, k; + + std::cout << std::endl; + std::cout << "Performing kernel opeation in CPU" << std::endl; + + if (tA == 0) { + fA_mult = &i; + fA_sum = &k; + kmax = Ashape1; + } + else + { + fA_mult = &k; + fA_sum = &i; + kmax = Ashape0; + } + + if (tB == 0) { + fB_mult = &k; + fB_sum = &j; + } + else + { + fB_mult = &j; + fB_sum = &k; + } + + #ifdef VERBOSE + std::cout << "c_local" << std::endl; + for (i = 0; i < Cshape0; i++) { + for (j = 0; j < Cshape1; j++) { + int ind_c = i * Cshape1 + j; + std::cout << "C[" << i << "][" << j << "] = " << c_local[ind_c] << std::endl; + } + } + #endif + + for (i = 0; i < Cshape0; i++) { + for (j = 0; j < Cshape1; j++) { + int ind_c = i * Cshape1 + j; + float sum = 0.0f; + + #ifdef VERBOSE + std::cout << "C[" << i << "][" << j << "] = "; + #endif + for (k = 0; k < kmax; k++) { + int ind_x = ((*fA_mult) * Ashape1) + (*fA_sum); + int ind_y = ((*fB_mult) * Bshape1) + (*fB_sum); + sum += a[ind_x] * b[ind_y]; + #ifdef VERBOSE + std::cout << "a[" << ind_x << "] * b[" << ind_y << "] "; + if (k < (kmax -1)) std::cout << "+ "; + #endif + } + #ifdef VERBOSE + std::cout << " = " << c_local[ind_c] << " + " << sum << " = " << (c_local[ind_c] + sum) << std::endl; + #endif + c_local[ind_c] = (incC ? 
c_local[ind_c]:0) + sum; + } + } + + #ifdef VERBOSE + std::cout << "CPU result" << std::endl; + for (i = 0; i < Cshape0; i++) { + for (j = 0; j < Cshape1; j++) { + int ind_c = i * Cshape1 + j; + std::cout << "C[" << i << "][" << j << "] = " << c_local[ind_c] << std::endl; + } + } + #endif +} + +//--------------------------------------------------------------------------------------------------------------------- +// return status of comparison +// ret 1 - matrices match +// otherwise return 0 +int compare( const vector> &c, + const vector> &c_local, + size_t size +) { + int matrices_match = 1; + + for(size_t i = 0; i < size; i++) { + if (c[i] != c_local[i]) { + std::cout << "Data mismatch found" << std::endl; + matrices_match = 0; + break; + } + } + return matrices_match; +} + +//--------------------------------------------------------------------------------------------------------------------- +int main(int argc, char **argv) { + int Ashape0; + int Ashape1; + int Bshape0; + int Bshape1; + int Cshape0; + int Cshape1 ; + int tA; + int tB; + int incC; + + int test_ok = 1; + //int test_run_index = 0; + + // CL + cl::Context context; + cl::CommandQueue q; + cl::Program program; + cl::Kernel kernel_ut; + cl::Buffer buffer_a, buffer_b, buffer_c; + + //--------------------------------------------------------------------------- + if (argc != 9) { + usage(argv[0]); + } + + Ashape0 = atoi(argv[2]); + Ashape1 = atoi(argv[3]); + Bshape0 = atoi(argv[4]); + Bshape1 = atoi(argv[5]); + tA = atoi(argv[6]); + tB = atoi(argv[7]); + incC = atoi(argv[8]); + + // check input configuration + { + int ic_err = 0; + if ((tA == 0) && (tB == 0)) { + if (Ashape1 != Bshape0) { + ic_err = 1; + } + } + else if ((tA == 0) && (tB == 1)) { + if (Ashape1 != Bshape1) { + ic_err = 1; + } + } + else if ((tA == 1) && (tB == 0)) { + if (Ashape0 != Bshape0) { + ic_err = 1; + } + } + else if ((tA == 1) && (tB == 1)) { + if (Ashape0 != Bshape1) { + ic_err = 1; + } + } + else { + std::cout << "Unexpected configuration" << std::endl; + ic_err = 1; + } + + if (ic_err != 0) { + std::cout << "Error matrix dimensions mismatch for requested operation" << std::endl << std::endl; + return EXIT_FAILURE; + } + } + + //--------------------------------------------------------------------------- + std::ofstream outfile; + std::string outfname = "output.txt"; + outfile.open (outfname.c_str()); // we delete file content by open/close operations + outfile.close (); //we close the file in case any error happens and the test exits before completion + + //--------------------------------------------------------------------------- + // set matrices dimensions + Cshape0 = tA? Ashape1:Ashape0; + Cshape1 = tB? 
Bshape0:Bshape1; + + size_t size_a = Ashape0 * Ashape1; + size_t size_b = Bshape0 * Bshape1; + size_t size_c = Cshape0 * Cshape1; + size_t size_a_in_bytes = size_a * sizeof(float); + size_t size_b_in_bytes = size_b * sizeof(float); + size_t size_c_in_bytes = size_c * sizeof(float); + + // Allocate memory on the host + vector> a(size_a, 0); + vector> b(size_b, 0); + vector> c(size_c, 0); + vector> c_local(size_c, 0); + + std::cout << "tA " << tA << " tB " << tB << " incC " << incC << std::endl; + std::cout << "A[" << Ashape0 << "x" << Ashape1 << "] B[" << Bshape0 << "x" << Bshape1 << "] C[" << Cshape0 << "x" << Cshape1 << "] " << std::endl; + //--------------- + // fill matrices + fill(a, b, c, c_local, Ashape0, Ashape1, Bshape0, Bshape1, Cshape0, Cshape1); + + //--------------------------------------------------------------------------- + // Initialize fpga, load binary and kernel + fpga_init(context, q, program, kernel_ut, argv[1], "k_mult2d"); + + // create CL buffers + create_buffers(context, buffer_a, buffer_b, buffer_c, a, b, c, size_a_in_bytes, size_b_in_bytes, size_c_in_bytes); + + // Run the kernel + run(context, q, kernel_ut, buffer_a, buffer_b, buffer_c, Ashape0, Ashape1, Bshape0, Bshape1, tA, tB, incC); + + // locally calculate result + run_cpu(a, b, c_local, Ashape0, Ashape1, Bshape0, Bshape1, Cshape0, Cshape1, tA,tB,incC); + + // compare results + test_ok = compare (c, c_local, size_c); + + + outfile.open(outfname.c_str(), std::ofstream::out | std::ofstream::app); + if (test_ok != 0) { + std::cout << "" << std::endl; + std::cout << "TEST PASSED" << std::endl << std::endl; + + outfile << "" << std::endl; + outfile << "TEST PASSED" << std::endl << std::endl; + } + else { + std::cout << "" << std::endl; + std::cout << "ERRORS DETECTED" << std::endl << std::endl; + std::cout << "TEST KO" << std::endl; + + outfile << "" << std::endl; + outfile << "ERRORS DETECTED" << std::endl << std::endl; + outfile << "TEST KO" << std::endl; + } + outfile.close(); + + + //----------------------------- + // It is necessary to release the resources, all of them, + a.clear(); + b.clear(); + c.clear(); + c_local.clear(); + a.shrink_to_fit(); + b.shrink_to_fit(); + c.shrink_to_fit(); + c_local.shrink_to_fit(); + + //----------------------------- + std::cout << "" << std::endl; + std::cout << "All done" << std::endl; + std::cout << "quit now" << std::endl; + + // exit + return 0; +} diff --git a/fpga_kernels/test_fpga/src/test_relu.cpp b/fpga_kernels/test_fpga/src/test_relu.cpp new file mode 100644 index 000000000..ab83ab7bd --- /dev/null +++ b/fpga_kernels/test_fpga/src/test_relu.cpp @@ -0,0 +1,304 @@ +#include /* printf, scanf, NULL */ +#include /* malloc, free, rand */ + +#include +#include +#include +#include +#include +#include "xcl2.hpp" +//#include "/home/jorga20j/integration_eddl/eddl/fpga_kernels/test_fpga/test/src/xcl2.hpp" +//#include "/home/jomarm10/workspace/Vitis_Accel_Examples/common/includes/xcl2/xcl2.hpp" + +using std::vector; + +// CL +cl::Buffer buf; +cl::Context context; +cl::CommandQueue q; +cl::Program program; + + +#define SIZE 1024 + +static const int elements = 256; + + + +//--------------------------------------------------------------------------------------------------------------------- +//--------------------------------------------------------------------------------------------------------------------- + +// An event callback function that prints the operations performed by the OpenCL +// runtime. 
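+// The callback is registered per event by set_callback() below, which calls
+// event.setCallback(CL_COMPLETE, event_cb, ...) so that each enqueued transfer
+// and kernel launch prints its command type and completion status (same pattern
+// as the other host tests in this directory).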
+void event_cb(cl_event event1, cl_int cmd_status, void *data) { + cl_int err; + cl_command_type command; + cl::Event event(event1, true); + OCL_CHECK(err, err = event.getInfo(CL_EVENT_COMMAND_TYPE, &command)); + cl_int status; + OCL_CHECK(err, + err = event.getInfo(CL_EVENT_COMMAND_EXECUTION_STATUS, &status)); + const char *command_str; + const char *status_str; + switch (command) { + case CL_COMMAND_READ_BUFFER: + command_str = "buffer read"; + break; + case CL_COMMAND_WRITE_BUFFER: + command_str = "buffer write"; + break; + case CL_COMMAND_NDRANGE_KERNEL: + command_str = "kernel"; + break; + case CL_COMMAND_MAP_BUFFER: + command_str = "kernel"; + break; + case CL_COMMAND_COPY_BUFFER: + command_str = "kernel"; + break; + case CL_COMMAND_MIGRATE_MEM_OBJECTS: + command_str = "buffer migrate"; + break; + default: + command_str = "unknown"; + } + switch (status) { + case CL_QUEUED: + status_str = "Queued"; + break; + case CL_SUBMITTED: + status_str = "Submitted"; + break; + case CL_RUNNING: + status_str = "Executing"; + break; + case CL_COMPLETE: + status_str = "Completed"; + break; + } + printf("[%s]: %s %s\n", reinterpret_cast(data), status_str, + command_str); + fflush(stdout); +} + +// Sets the callback for a particular event +void set_callback(cl::Event event, const char *queue_name) { + cl_int err; + OCL_CHECK(err, + err = event.setCallback(CL_COMPLETE, event_cb, (void *)queue_name)); +} + + + +//--------------------------------------------------------------------------------------------------------------------- + + + + + + + + + + + + + +void fpga_init(){ // initialize only once + + + +} + +void create_buffers() { + + + + +} + +void fill(cl::Buffer *buf) { + + +} + +void run() { + + +} + +void run_cpu() { +} + +void compare() { +} + +int main(int argc, char **argv) { + if (argc != 2) { + std::cout << "Usage: " << argv[0] << " " << std::endl; + return EXIT_FAILURE; + } + + std::string binaryFile = argv[1]; + cl_int err; + cl::Kernel kernel_relu; + + + // size_t size_in_bytes = host_memory.size() * sizeof(int); + + // OPENCL HOST CODE AREA START + // get_xil_devices() is a utility API which will find the xilinx + // platforms and will return list of devices connected to Xilinx platform + std::cout << "Creating Context..." << std::endl; + // The get_xil_devices will return vector of Xilinx Devices + auto devices = xcl::get_xil_devices(); + auto device = devices[0]; + + std::vector> host_memory(elements, 42); + // Creating Context and Command Queue for selected Device + OCL_CHECK(err, cl::Context context(device, NULL, NULL, NULL, &err)); + OCL_CHECK(err, cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE, &err)); + + std::string device_name = device.getInfo(); + std::cout << "Allocating and transferring data to " << device_name.c_str() << std::endl; + + auto fileBuf = xcl::read_binary_file(binaryFile); + cl::Program::Binaries bins{{fileBuf.data(), fileBuf.size()}}; + devices.resize(1); + + OCL_CHECK(err, cl::Program program(context, devices, bins, NULL, &err)); + std::cout << "Device " << device_name.c_str() << ": program successful!" << std::endl; + + OCL_CHECK(err, kernel_relu = cl::Kernel(program,"k_relu", &err)); + std::cout << "Kernel sucessfully created" << std::endl ; + + size_t size_in_bytes = 4096*sizeof(float); + // Allocate memory on the host and fill with random data. 
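+  // (Sketch of the rationale, assuming the vectors below use the aligned_allocator
+  // helper from xcl2.hpp as in the other Vitis example hosts: page-aligned host
+  // memory lets the CL_MEM_USE_HOST_PTR buffers created later be shared with the
+  // device runtime without an additional staging copy.)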
+ vector> a(size_in_bytes); + vector> b(size_in_bytes); + + + //----------------------------- + // fill data vector with random data + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + std::cout << "Filling Tensor A with random values [-20.0, 30.0]" << std::endl ; + for (int i = 0; i < SIZE; i++) { + a[i] = dist(gen); + } + std::cout << "A[] = {" << std::endl; + for (int i = 0; i < 20; i++) { + std::cout << " " << a[i] << ","; + } + std::cout << " ...}" << std::endl ; + + //----------------------------- + // THIS PAIR OF EVENTS WILL BE USED TO TRACK WHEN A KERNEL IS FINISHED WITH + // THE INPUT BUFFERS. ONCE THE KERNEL IS FINISHED PROCESSING THE DATA, A NEW + // SET OF ELEMENTS WILL BE WRITTEN INTO THE BUFFER. + vector kernel_events(1); + vector read_events(1); + vector write_events(1); + cl::Buffer buffer_a, buffer_b; + + //----------------------------- + // Allocate Buffer in Global Memory + // Buffers are allocated using CL_MEM_USE_HOST_PTR for efficient memory and + // Device-to-host communication + std::cout << "Creating Buffers..." << std::endl; + + OCL_CHECK(err, buffer_a = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_in_bytes, &a[0], &err)); + OCL_CHECK(err, buffer_b = cl::Buffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR , size_in_bytes, &b[0], &err)); + + // set kernel arguments + OCL_CHECK(err, err = kernel_relu.setArg(0, buffer_a)); + OCL_CHECK(err, err = kernel_relu.setArg(1, buffer_b)); + OCL_CHECK(err, err = kernel_relu.setArg(2, (long int)SIZE)); + + //----------------------------- + // Copy input data to device global memory + std::cout << "Copying data (Host to Device)..." << std::endl; + // Because we are passing the write_events, it returns an event object + // that identifies this particular command and can be used to query + // or queue a wait for this particular command to complete. + OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_a}, 0 /*0 means from host*/, NULL, &write_events[0])); + set_callback(write_events[0], "ooo_queue"); + + //----------------------------- + printf("Enqueueing NDRange kernel.\n"); + // This event needs to wait for the write buffer operations to complete + // before executing. We are sending the write_events into its wait list to + // ensure that the order of operations is correct. + // Launch the Kernel + std::vector waitList; + waitList.push_back(write_events[0]); + OCL_CHECK(err, err = q.enqueueNDRangeKernel(kernel_relu, 0, 1, 1, &waitList, &kernel_events[0])); + set_callback(kernel_events[0], "ooo_queue"); + + //----------------------------- + // Copy Result from Device Global Memory to Host Local Memory + std::cout << "Getting Results (Device to Host)..." << std::endl; + std::vector eventList; + eventList.push_back(kernel_events[0]); + // This operation only needs to wait for the kernel call. + OCL_CHECK(err, err = q.enqueueMigrateMemObjects({buffer_b}, CL_MIGRATE_MEM_OBJECT_HOST, &eventList, &read_events[0])); + set_callback(read_events[0], "ooo_queue"); + OCL_CHECK(err, err = read_events[0].wait()); + + std::cout << "kernel returned" << std::endl ; + std::cout << " B [] = {" ; + for (int i = 0; i < 10; i++) { + std::cout << " " << b[i] << ","; + } + std::cout << " ... 
}" << std::endl; + + //----------------------------- + // check received data + std::cout << "Check kernel output, checking " << SIZE << " values"<< std::endl; + { + vector> res_local(size_in_bytes); + // perform kernel operation in host + for (int i = 0; i < SIZE; i++ ) { + if (a[i] < 0.0) res_local[i] = 0.0f; + else res_local[i] = a[i]; + } + // compare data vectors + int data_matches = 1; + for (int i = 0; i < SIZE; i++) { + if (res_local [i] != b[i]) { + data_matches = 0; + std::cout << "DATA MISMATCH v_local[= " << i << "] = " << res_local[i] << " != b[" << i << "] = " << b[i] << std::endl; + } + } + + if (data_matches) { + std::cout << "" << std::endl; + std::cout << "TEST PASSED" << std::endl; + } + else { + std::cout << "" << std::endl; + std::cout << "ERRORS DETECTED" << std::endl; + std::cout << "TEST KO" << std::endl; + } + } + + //----------------------------- + // HEY !!!! + // It is necessary to release the resources, all of them, + // memories, buffers, kernels, programs,... + + // Wait for all of the OpenCL operations to complete + std::cout << "Waiting..." << std::endl; + OCL_CHECK(err, err = q.flush()); + OCL_CHECK(err, err = q.finish()); + + //----------------------------- + std::cout << "" << std::endl; + std::cout << "All done" << std::endl; + std::cout << "quit now" << std::endl; + + // exit + return 0; +} diff --git a/fpga_kernels/test_fpga/utils.mk b/fpga_kernels/test_fpga/utils.mk new file mode 100755 index 000000000..c4a81e29c --- /dev/null +++ b/fpga_kernels/test_fpga/utils.mk @@ -0,0 +1,101 @@ +#+------------------------------------------------------------------------------- +# The following parameters are assigned with default values. These parameters can +# be overridden through the make command line +#+------------------------------------------------------------------------------- + +DEBUG := no +B_TEMP = `$(ABS_COMMON_REPO)/common/utility/parse_platform_list.py $(DEVICE)` + +#Generates debug summary report +ifeq ($(DEBUG), yes) +LDCLFLAGS += --dk list_ports +endif + +#Setting Platform Path +ifeq ($(findstring xpfm, $(DEVICE)), xpfm) + B_NAME = $(shell dirname $(DEVICE)) +else + B_NAME = $(B_TEMP)/$(DEVICE) +endif + +#Checks for XILINX_VITIS +check-vitis: +ifndef XILINX_VITIS + $(error XILINX_VITIS variable is not set, please set correctly and rerun) +endif + +#Checks for Device Family +ifeq ($(HOST_ARCH), aarch32) + DEV_FAM = 7Series +else ifeq ($(HOST_ARCH), aarch64) + DEV_FAM = Ultrascale +endif + +#Checks for XILINX_XRT +check-xrt: +ifeq ($(HOST_ARCH), x86) +ifndef XILINX_XRT + $(error XILINX_XRT variable is not set, please set correctly and rerun) +endif +else +ifndef XILINX_VITIS + $(error XILINX_VITIS variable is not set, please set correctly and rerun) +endif +endif + +#Checks for Correct architecture +ifneq ($(HOST_ARCH), $(filter $(HOST_ARCH),aarch64 aarch32 x86)) +$(error HOST_ARCH variable not set, please set correctly and rerun) +endif + +#Checks for EDGE_COMMON_SW +ifneq ($(HOST_ARCH), x86) +ifndef EDGE_COMMON_SW +$(error EDGE_COMMON_SW variable is not set, please set correctly and rerun) +endif +ifeq ($(HOST_ARCH), aarch64) +SYSROOT := $(EDGE_COMMON_SW)/sysroots/aarch64-xilinx-linux +SD_IMAGE_FILE := $(EDGE_COMMON_SW)/Image +CXX := $(XILINX_VITIS)/gnu/aarch64/lin/aarch64-linux/bin/aarch64-linux-gnu-g++ +else ifeq ($(HOST_ARCH), aarch32) +SYSROOT := $(EDGE_COMMON_SW)/sysroots/cortexa9t2hf-neon-xilinx-linux-gnueabi/ +SD_IMAGE_FILE := $(EDGE_COMMON_SW)/uImage +CXX := $(XILINX_VITIS)/gnu/aarch32/lin/gcc-arm-linux-gnueabi/bin/arm-linux-gnueabihf-g++ 
+endif +endif + +gen_run_app: +ifneq ($(HOST_ARCH), x86) + rm -rf run_app.sh + $(ECHO) 'export LD_LIBRARY_PATH=/mnt:/tmp:$(LD_LIBRARY_PATH)' >> run_app.sh + $(ECHO) 'export XILINX_XRT=/usr' >> run_app.sh +ifeq ($(TARGET),$(filter $(TARGET),sw_emu hw_emu)) + $(ECHO) 'export XILINX_VITIS=/mnt' >> run_app.sh + $(ECHO) 'export XCL_EMULATION_MODE=$(TARGET)' >> run_app.sh +endif + $(ECHO) './$(EXECUTABLE) dummy_kernel.xclbin' >> run_app.sh + $(ECHO) 'return_code=$$?' >> run_app.sh + $(ECHO) 'if [ $$return_code -ne 0 ]; then' >> run_app.sh + $(ECHO) 'echo "ERROR: host run failed, RC=$$return_code"' >> run_app.sh + $(ECHO) 'fi' >> run_app.sh + $(ECHO) 'echo "INFO: host run completed."' >> run_app.sh +endif +check-devices: +ifndef DEVICE + $(error DEVICE not set. Please set the DEVICE properly and rerun. Run "make help" for more details.) +endif + +# device2xsa - create a filesystem friendly name from device name +# $(1) - full name of device +device2xsa = $(strip $(patsubst %.xpfm, % , $(shell basename $(DEVICE)))) + +# Cleaning stuff +RM = rm -f +RMDIR = rm -rf + +ECHO:= @echo + +docs: README.md + +README.md: description.json + $(ABS_COMMON_REPO)/common/utility/readme_gen/readme_gen.py description.json diff --git a/include/eddl/hardware/fpga/fpga_enables.h b/include/eddl/hardware/fpga/fpga_enables.h index 4057aa3ef..55f183d85 100644 --- a/include/eddl/hardware/fpga/fpga_enables.h +++ b/include/eddl/hardware/fpga/fpga_enables.h @@ -2,7 +2,7 @@ // implemented on the FPGA //Activations -//#define K_ENABLED_RELU +#define K_ENABLED_RELU //#define K_ENABLED_D_RELU //#define K_ENABLED_THRESHOLDED_RELU //#define K_ENABLED_D_TRHESHOLDED_RELU @@ -65,6 +65,7 @@ //#define K_ENABLED_CONV2D //#define K_ENALBED_CONV2D_GRAD //#define K_ENABLED_CONV2D_BACK +#define K_ENABLED_CONV2D_K3X3_S1X1_P1X1_BS1 //Core //#define K_ENABLED_FILL_ diff --git a/include/eddl/hardware/fpga/fpga_hw.h b/include/eddl/hardware/fpga/fpga_hw.h index a3c8c9df6..c14c372d4 100644 --- a/include/eddl/hardware/fpga/fpga_hw.h +++ b/include/eddl/hardware/fpga/fpga_hw.h @@ -20,7 +20,7 @@ extern cl::CommandQueue q; -#define FPGA_DEBUG +//#define FPGA_DEBUG #include "eddl/hardware/fpga/fpga_enables.h" @@ -49,8 +49,9 @@ extern cl::Kernel kernel_select, kernel_select_back, kernel_set_select, ker extern cl::Kernel kernel_set_select2, kernel_deselect, kernel_concat; extern cl::Kernel kernel_select_nn, kernel_select_back_nn, kernel_set_select_back_nn, kernel_set_select_nn; -// conv kernels (2) +// conv kernels (3) extern cl::Kernel kernel_im2col, kernel_conv2d; +extern cl::Kernel kernel_conv2D_K3x3_S1x1_P1x1_BS1; // create kernels (3) extern cl::Kernel kernel_range, kernel_eye, kernel_diag; diff --git a/include/eddl/profiling.h b/include/eddl/profiling.h new file mode 100644 index 000000000..8cca92267 --- /dev/null +++ b/include/eddl/profiling.h @@ -0,0 +1,37 @@ +#ifndef _PROFILING + +#define _PROFILING + +#include + +#define PROFILING_ENABLE(fn) \ + unsigned long long prof_##fn##_time; \ + unsigned long long prof_##fn##_calls; \ + +#define PROFILING_HEADER(fn) \ + struct timeval prof_t1; \ + gettimeofday(&prof_t1, NULL); + +#define PROFILING_HEADER_EXTERN(fn) \ + extern unsigned long long prof_##fn##_time; \ + extern unsigned long long prof_##fn##_calls; \ + extern int prof_##fn##_device; \ + struct timeval prof_t1; \ + gettimeofday(&prof_t1, NULL); + +#define PROFILING_FOOTER(fn) \ + struct timeval prof_t2; \ + gettimeofday(&prof_t2, NULL); \ + prof_##fn##_time += ((prof_t2.tv_sec - prof_t1.tv_sec) * 1000000) + (prof_t2.tv_usec - prof_t1.tv_usec); \ 
+ prof_##fn##_calls += 1; + +#define PROFILING_PRINTF(fn) \ + if (prof_##fn##_calls > 0) printf(" %-50s: %8lld calls, %8lld us , %10.4f us/call\n", #fn, \ + prof_##fn##_calls, prof_##fn##_time, \ + (float) prof_##fn##_time / (float) prof_##fn##_calls); + +#define PROFILING_PRINTF2(fn, acc) \ + if (prof_##fn##_calls > 0) printf(" %-50s: %8lld calls, %8lld us (%6.2f), %10.4f us/call\n", #fn, \ + prof_##fn##_calls, prof_##fn##_time, \ + 100.0 * prof_##fn##_time / acc, (float) prof_##fn##_time / (float) prof_##fn##_calls); +#endif diff --git a/include/eddl/tensor/tensor.h b/include/eddl/tensor/tensor.h index 44552d4e9..b365cfc46 100644 --- a/include/eddl/tensor/tensor.h +++ b/include/eddl/tensor/tensor.h @@ -126,7 +126,7 @@ class Tensor { * @param dev One of ``DEV_CPU`` or ``DEV_GPU`` * @return a tensor */ - Tensor(const vector &shape, float *fptr, int dev); + Tensor(const vector &shape, float *fptr, int dev, void *fptr2=0); /** * @brief Constructor of an uninitialized tensor diff --git a/src/hardware/fpga/fpga_core.cpp b/src/hardware/fpga/fpga_core.cpp index 08ea5f7ae..d88e22c1a 100644 --- a/src/hardware/fpga/fpga_core.cpp +++ b/src/hardware/fpga/fpga_core.cpp @@ -60,8 +60,9 @@ cl::Kernel kernel_select, kernel_select_back, kernel_set_select, kernel_set cl::Kernel kernel_set_select2, kernel_deselect, kernel_concat; cl::Kernel kernel_select_nn, kernel_select_back_nn, kernel_set_select_nn, kernel_set_select_back_nn; -// conv kernels (2) +// conv kernels (3) cl::Kernel kernel_im2col, kernel_conv2d; +cl::Kernel kernel_conv2D_K3x3_S1x1_P1x1_BS1; // create kernels (3) cl::Kernel kernel_range, kernel_eye, kernel_diag; @@ -633,6 +634,10 @@ void fpga_init(){ // initialize only once OCL_CHECK(err, kernel_conv2d = cl::Kernel(program,"k_conv2d", &err)); if (err != CL_SUCCESS) printf("Error creating kernel\n"); #endif + #ifdef K_ENABLED_CONV2D_K3X3_S1X1_P1X1_BS1 + OCL_CHECK(err, kernel_conv2D_K3x3_S1x1_P1x1_BS1 = cl::Kernel(program, "k_conv2D_K3x3_S1x1_P1x1_BS1", &err)); + if (err != CL_SUCCESS) printf("Error creating kernel\n"); + #endif #ifdef K_ENABLED_RANGE OCL_CHECK(err, kernel_range = cl::Kernel(program,"k_range", &err)); if (err != CL_SUCCESS) printf("Error creating kernel\n"); diff --git a/src/hardware/fpga/nn/fpga_conv.cpp b/src/hardware/fpga/nn/fpga_conv.cpp index cf93d9c08..24f7ce232 100644 --- a/src/hardware/fpga/nn/fpga_conv.cpp +++ b/src/hardware/fpga/nn/fpga_conv.cpp @@ -37,10 +37,29 @@ void fpga_cpuemu_conv2D(ConvolDescriptor *D) { fpga_copy_memory_to_fpga(D->ptrI, D->fpga_ptrI, D->fpga_sizeI); } +// Convolution: Kernel(3x3), Stride(1x1), Padding(1x1), BatchSize=1 +void fpga_conv2D_K3x3_S1x1_P1x1_BS1(cl::Buffer I, int Irows, int Icols, int Ichannels, cl::Buffer K, cl::Buffer B, cl::Buffer O, int Ochannels) { + + cl_int err; + cl::Event event; + int arg=0; + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, I)); + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, Irows)); + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, Icols)); + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, Ichannels)); + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, K)); + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, B)); + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, O)); + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, Ochannels)); + + OCL_CHECK(err, err = q.enqueueTask(kernel_conv2D_K3x3_S1x1_P1x1_BS1, NULL, &event)); + q.finish(); +} + void 
fpga_conv2D(ConvolDescriptor *D) { _profile_fpga(_FPGA_CONV2D, 0); -#ifndef K_ENABLED_CONV2D +#if !defined(K_ENABLED_CONV2D) && !defined(K_ENABLED_CONV2D_K3X3_S1X1_P1X1_BS1) fpga_cpuemu_conv2D(D); #else cl_int err; @@ -66,27 +85,32 @@ void fpga_conv2D(ConvolDescriptor *D) int stride_rows = D->sr; // rows stride int stride_cols = D->sc; // cols stride - OCL_CHECK(err, err = kernel_conv2d.setArg(0, batch_size)); - OCL_CHECK(err, err = kernel_conv2d.setArg(1, I)); - OCL_CHECK(err, err = kernel_conv2d.setArg(2, Irows)); // input - OCL_CHECK(err, err = kernel_conv2d.setArg(3, Icols)); // output - OCL_CHECK(err, err = kernel_conv2d.setArg(4, Ichannels)); - OCL_CHECK(err, err = kernel_conv2d.setArg(5, K)); - OCL_CHECK(err, err = kernel_conv2d.setArg(6, Krows)); - OCL_CHECK(err, err = kernel_conv2d.setArg(7, Kcols)); - OCL_CHECK(err, err = kernel_conv2d.setArg(8, B)); - OCL_CHECK(err, err = kernel_conv2d.setArg(9, use_bias)); - OCL_CHECK(err, err = kernel_conv2d.setArg(10, O)); - OCL_CHECK(err, err = kernel_conv2d.setArg(11, Orows)); - OCL_CHECK(err, err = kernel_conv2d.setArg(12, Ocols)); - OCL_CHECK(err, err = kernel_conv2d.setArg(13, Ochannels)); - OCL_CHECK(err, err = kernel_conv2d.setArg(14, padding_rows)); - OCL_CHECK(err, err = kernel_conv2d.setArg(15, padding_cols)); - OCL_CHECK(err, err = kernel_conv2d.setArg(16, stride_rows)); - OCL_CHECK(err, err = kernel_conv2d.setArg(17, stride_cols)); - - OCL_CHECK(err, err = q.enqueueTask(kernel_conv2d, NULL, &event)); - q.finish(); + // depending on the conv parameters we select the kernel to launch + if ((stride_rows == 1) && (stride_cols == 1) && (Krows == 3) && (Kcols == 3) && (batch_size == 1) && (padding_rows == 1) && (padding_cols == 1)) { + fpga_conv2D_K3x3_S1x1_P1x1_BS1(I, Irows, Icols, Ichannels, K, B, O, Ochannels); + } else { + OCL_CHECK(err, err = kernel_conv2d.setArg(0, batch_size)); + OCL_CHECK(err, err = kernel_conv2d.setArg(1, I)); + OCL_CHECK(err, err = kernel_conv2d.setArg(2, Irows)); // input + OCL_CHECK(err, err = kernel_conv2d.setArg(3, Icols)); // output + OCL_CHECK(err, err = kernel_conv2d.setArg(4, Ichannels)); + OCL_CHECK(err, err = kernel_conv2d.setArg(5, K)); + OCL_CHECK(err, err = kernel_conv2d.setArg(6, Krows)); + OCL_CHECK(err, err = kernel_conv2d.setArg(7, Kcols)); + OCL_CHECK(err, err = kernel_conv2d.setArg(8, B)); + OCL_CHECK(err, err = kernel_conv2d.setArg(9, use_bias)); + OCL_CHECK(err, err = kernel_conv2d.setArg(10, O)); + OCL_CHECK(err, err = kernel_conv2d.setArg(11, Orows)); + OCL_CHECK(err, err = kernel_conv2d.setArg(12, Ocols)); + OCL_CHECK(err, err = kernel_conv2d.setArg(13, Ochannels)); + OCL_CHECK(err, err = kernel_conv2d.setArg(14, padding_rows)); + OCL_CHECK(err, err = kernel_conv2d.setArg(15, padding_cols)); + OCL_CHECK(err, err = kernel_conv2d.setArg(16, stride_rows)); + OCL_CHECK(err, err = kernel_conv2d.setArg(17, stride_cols)); + + OCL_CHECK(err, err = q.enqueueTask(kernel_conv2d, NULL, &event)); + q.finish(); + } #endif _profile_fpga(_FPGA_CONV2D, 1); } diff --git a/src/layers/core/layer_activation.cpp b/src/layers/core/layer_activation.cpp index 50ac3fe4c..6ba3bf1c0 100644 --- a/src/layers/core/layer_activation.cpp +++ b/src/layers/core/layer_activation.cpp @@ -26,6 +26,9 @@ LActivation::LActivation(Layer *parent, string act, vector params, string this->params = params; input = parent->output; +#ifdef DEBUG_FPGA + printf("creating output for RELU\n"); +#endif output = new Tensor(input->shape, dev); delta_bp = 0; diff --git a/src/layers/core/layer_reshape.cpp b/src/layers/core/layer_reshape.cpp 
index 5dc45ae9e..5d9668d13 100644 --- a/src/layers/core/layer_reshape.cpp +++ b/src/layers/core/layer_reshape.cpp @@ -71,6 +71,10 @@ LReshape::LReshape(Layer *parent, vector shape, string name, int dev, int m /////// // sharing the pointers to data +#ifdef cFPGA + printf("creating new tensor output for reshape (at constructor)\n"); +#endif + output = new Tensor(ls, parent->output); parent->addchild(this); @@ -85,6 +89,7 @@ LReshape::~LReshape(){ void LReshape::resize(int batch){ ls[0]=batch; #ifdef cFPGA + printf("voy a hacer resize!!!! batch %d shape[0] %d tensor_id %d, tensor_id parent %d fpga_ptr %p\n", batch, output->shape[0], output->fpga_tensor_id, parent[0]->output->fpga_tensor_id, parent[0]->output->fpga_ptr); output->resize(batch, parent[0]->output->ptr, parent[0]->output->fpga_ptr, false); #else output->resize(batch, parent[0]->output->ptr, nullptr, false); @@ -98,6 +103,9 @@ void LReshape::mem_delta() { parent[0]->mem_delta(); // Problem: Delta is always created, regardless of the low_mem +#ifdef cFPGA + printf("creating new delta tensor for reshape at mem_delta\n"); +#endif delta = new Tensor(ls, parent[0]->delta); if(this->verbosity_level >= 2){ @@ -111,6 +119,9 @@ void LReshape::free_delta() { if(this->delta != nullptr) { // Do not delete its delta directly (It's pointer points to parent's delta) delta->ptr = nullptr; +#ifdef cFPGA + delta->fpga_ptr = nullptr; +#endif delete delta; delta = nullptr; diff --git a/src/tensor/nn/tensor_activations.cpp b/src/tensor/nn/tensor_activations.cpp index f2232c099..b465a8918 100644 --- a/src/tensor/nn/tensor_activations.cpp +++ b/src/tensor/nn/tensor_activations.cpp @@ -8,6 +8,7 @@ */ #include "eddl/tensor/nn/tensor_nn.h" #include "eddl/hardware/cpu/nn/cpu_tensor_nn.h" +#include "eddl/profiling.h" #ifdef cFPGA #include "eddl/hardware/fpga/nn/fpga_nn.h" @@ -21,12 +22,16 @@ namespace tensorNN { + PROFILING_ENABLE(ReLu); + // ReLU void ReLu(Tensor *A, Tensor *B) { if (A->device != B->device) msg("Tensors in different devices", "Tensor::ReLu"); if (!Tensor::sameShape(A, B)) msg("Incompatible dims", "Tensor::ReLu"); + PROFILING_HEADER_EXTERN(ReLu); + B->tsem->lock(); if (A->isCPU()) { cpu_relu(A, B); @@ -44,6 +49,9 @@ namespace tensorNN { #endif B->tsem->unlock(); + + PROFILING_FOOTER(ReLu); + PROFILING_PRINTF(ReLu); } // RELU Derivative, always increment over parent delta @@ -622,4 +630,4 @@ namespace tensorNN { } -} \ No newline at end of file +} diff --git a/src/tensor/nn/tensor_conv.cpp b/src/tensor/nn/tensor_conv.cpp index 5103f78f8..ec876fc92 100644 --- a/src/tensor/nn/tensor_conv.cpp +++ b/src/tensor/nn/tensor_conv.cpp @@ -8,6 +8,7 @@ */ #include "eddl/tensor/nn/tensor_nn.h" #include "eddl/hardware/cpu/nn/cpu_tensor_nn.h" +#include "eddl/profiling.h" #ifdef cGPU #include "eddl/hardware/gpu/gpu_tensor.h" @@ -22,6 +23,8 @@ namespace tensorNN{ + PROFILING_ENABLE(Conv2D); + void Conv2D(ConvolDescriptor *D) { @@ -33,6 +36,8 @@ void Conv2D(ConvolDescriptor *D) { ///////////////////////////////////////////////////////////////////// if ((D->I->ndim != 4)) msg("Tensors are not 4D", "Tensor::Conv2D"); + PROFILING_HEADER_EXTERN(Conv2D); + D->O->tsem->lock(); if (D->I->isCPU()) { cpu_conv2D(D); @@ -50,6 +55,9 @@ void Conv2D(ConvolDescriptor *D) { } #endif D->O->tsem->unlock(); + + PROFILING_FOOTER(Conv2D); + PROFILING_PRINTF(Conv2D); } void Conv2D_grad(ConvolDescriptor *D) { @@ -106,4 +114,4 @@ void Conv2D_back(ConvolDescriptor *D) { D->ID->tsem->unlock(); } -} \ No newline at end of file +} diff --git a/src/tensor/nn/tensor_pool.cpp 
b/src/tensor/nn/tensor_pool.cpp index d742eaee6..d3e0a7a39 100644 --- a/src/tensor/nn/tensor_pool.cpp +++ b/src/tensor/nn/tensor_pool.cpp @@ -8,6 +8,7 @@ */ #include "eddl/tensor/nn/tensor_nn.h" #include "eddl/hardware/cpu/nn/cpu_tensor_nn.h" +#include "eddl/profiling.h" #ifdef cGPU #include "eddl/hardware/gpu/gpu_tensor.h" @@ -20,6 +21,8 @@ #include "eddl/hardware/fpga/nn/fpga_nn.h" #endif +PROFILING_ENABLE(MPool2D); + namespace tensorNN { @@ -32,6 +35,8 @@ namespace tensorNN { ///////////////////////////////////////////////////////////////////// if ((D->I->ndim != 4)) msg("Tensors are not 4D", "Tensor::MPool2D"); + PROFILING_HEADER(MPool2D); + D->O->tsem->lock(); if (D->I->isCPU()) { cpu_mpool2D(D); @@ -49,6 +54,9 @@ namespace tensorNN { } #endif D->O->tsem->unlock(); + + PROFILING_FOOTER(MPool2D); + PROFILING_PRINTF(MPool2D); } void MPool2D_back(PoolDescriptor *D) { @@ -136,4 +144,4 @@ namespace tensorNN { D->ID->tsem->unlock(); } -} \ No newline at end of file +} diff --git a/src/tensor/tensor.cpp b/src/tensor/tensor.cpp index 8be55e465..80e39fc37 100755 --- a/src/tensor/tensor.cpp +++ b/src/tensor/tensor.cpp @@ -54,7 +54,7 @@ void checkCompatibility(Tensor *A, Tensor *B, Tensor *C, const string &title){ Tensor::Tensor() : device(DEV_CPU), ndim(0), size(0) {} -Tensor::Tensor(const vector &shape, float *fptr, int dev){ +Tensor::Tensor(const vector &shape, float *fptr, int dev, void *fptr2){ /* * Important! If we are creating a GPU tensor, "fptr" must point to a GPU pointer. */ @@ -79,7 +79,7 @@ Tensor::Tensor(const vector &shape, float *fptr, int dev){ updateShape(shape); updateSize(); updateStrides(); - updateData(fptr); + updateData(fptr, fptr2); this->tsem = new mutex(); } @@ -88,7 +88,11 @@ Tensor::Tensor(const vector &shape, float *fptr, int dev){ Tensor::Tensor(const vector &shape, int dev):Tensor(shape, nullptr, dev){} // From shape and Tensor (sharing ptr) -Tensor::Tensor(const vector &shape, Tensor *T) : Tensor(shape,T->ptr, T->device) {} +Tensor::Tensor(const vector &shape, Tensor *T) : Tensor(shape,T->ptr, T->device +#ifdef cFPGA + , (void *)T->fpga_ptr +#endif + ) {} Tensor::Tensor(const vector& data, const vector &shape, int dev) : Tensor(shape, nullptr, DEV_CPU) { isshared=false; @@ -211,17 +215,20 @@ void Tensor::updateData(float *fptr, void *fptr2,bool setshared){ #ifdef cFPGA else if (this->isFPGA()) { + #ifdef FPGA_DEBUG + printf("Tensor::updateData: fptr=%p, fptr2=%p, setshared=%d\n", fptr, fptr2, setshared); + #endif fpga_device = device-DEV_FPGA; if (!initfpga[fpga_device]) { #ifdef FPGA_DEBUG - printf("Initializing FPGA device\n"); + printf(" initializing FPGA device\n"); #endif fpga_init(/*fpga_device*/); initfpga[fpga_device]=1; } if (fptr == nullptr) { #ifdef FPGA_DEBUG - printf(" ([updateData fptr==null] creating tensor size %d; id being assigned %d)\n", this->size, next_fpga_tensor_id); + printf(" creating tensor: size=%d fpga_tensor_id=%d\n", this->size, next_fpga_tensor_id); #endif this->fpga_ptr = fpga_create_tensor(fpga_device, this->size); this->fpga_size = this->size; @@ -231,27 +238,54 @@ void Tensor::updateData(float *fptr, void *fptr2,bool setshared){ this->fpga_tensor_id = next_fpga_tensor_id; next_fpga_tensor_id++; #ifdef FPGA_DEBUG - printf(" ([updateData] ptr %p fpga_ptr %p)\n", this->ptr, this->fpga_ptr); + printf(" new pointers: ptr=%p fpga_ptr=%p\n", this->ptr, this->fpga_ptr); #endif } else { - // The data has already been created in CPU, so we need now to create a buffer in FPGA and write the buffer into it - // we first update the cpu buffer 
+ printf(" info: fpga_ptr %p fptr2 %p\n", this->fpga_ptr, fptr2); + if ((this->fpga_ptr == (cl::Buffer *)nullptr) && (fptr2 == nullptr)) { + this->fpga_ptr = fpga_create_tensor(fpga_device, this->size); + this->fpga_size = this->size; + this->fpga_tensor_id = next_fpga_tensor_id; + next_fpga_tensor_id++; + fpga_copy_to_fpga(fptr, this); + #ifdef FPGA_DEBUG + printf(" fpga_ptr and fptr2 were null, we create a buffer with tensor id %d\n", this->fpga_tensor_id); + #endif + } else if ((this->fpga_ptr == (cl::Buffer *)nullptr) && (fptr2 != nullptr)) { + #ifdef FPGA_DEBUG + printf(" fpga_ptr null but fptr2 not\n"); + #endif + this->fpga_size = this->size; + this->fpga_ptr = (cl::Buffer *)fptr2; + this->fpga_tensor_id = next_fpga_tensor_id; + next_fpga_tensor_id++; + #ifdef FPGA_DEBUG + printf(" new fpga_size %d fpga_ptr %p fpga_tensor_id %d\n", this->fpga_size, this->fpga_ptr, this->fpga_tensor_id); + #endif + } else { + #ifdef FPGA_DEBUG + printf(" fpga_ptr and fptr2 are not null\n"); + #endif + this->fpga_size = this->size; + this->fpga_ptr = (cl::Buffer *)fptr2; + #ifdef FPGA_DEBUG + printf(" new fpga_size %d fpga_ptr %x\n", this->fpga_size, this->fpga_ptr); + #endif + } #ifdef FPGA_DEBUG - printf(" ([updateData fptr!=null] fptr %p tensor id %d ptr %p fpga_ptr %p size %d fpga_size %d)\n", fptr, this->fpga_tensor_id, this->ptr, this->fpga_ptr, this->size, this->fpga_size); + printf(" end of changes: fptr %p tensor id %d ptr %p fpga_ptr %p size %d fpga_size %d fptr2 %p)\n", fptr, this->fpga_tensor_id, this->ptr, this->fpga_ptr, this->size, this->fpga_size, fptr2); #endif - this->fpga_size = this->size; - #ifdef FPGA_DEBUG - printf(" reallocated tensor id %d new size %d\n", this->fpga_tensor_id, this->fpga_size); - #endif this->ptr = fptr; - this->fpga_ptr = (cl::Buffer *)fptr2; } // For 2 dimensions, map to data to Eigen for efficiency // Efficient operations will be done over ptr2, which also points to ptr if (this->ndim == 2) { this->ptr2= new Eigen::Map(this->ptr, this->shape[1], this->shape[0]); } - } + #ifdef FPGA_DEBUG + printf("-------------------------\n"); + #endif + } #endif } diff --git a/src/tensor/tensor_comparison.cpp b/src/tensor/tensor_comparison.cpp index a696eaee9..d0663761e 100644 --- a/src/tensor/tensor_comparison.cpp +++ b/src/tensor/tensor_comparison.cpp @@ -650,7 +650,8 @@ int Tensor::equivalent(Tensor *A, Tensor *B, float atol, float rtol, bool equal_ #endif #ifdef cFPGA else { - return fpga_equal2(A, B, epsilon); + printf("Error, please check (FPGA), epsilon does not exist\n"); +// return fpga_equal2(A, B, epsilon); } #endif diff --git a/src/tensor/tensor_math.cpp b/src/tensor/tensor_math.cpp index a79fb4d34..ac5f5d343 100644 --- a/src/tensor/tensor_math.cpp +++ b/src/tensor/tensor_math.cpp @@ -12,6 +12,7 @@ #include #include "eddl/tensor/tensor.h" +#include "eddl/profiling.h" #include "eddl/hardware/cpu/cpu_tensor.h" #ifdef cGPU @@ -25,6 +26,9 @@ using namespace std; +PROFILING_ENABLE(sum2D_rowwise); +PROFILING_ENABLE(mult2D); + // Math operations (Tensor-Tensor, Tensor-float) ************************ Tensor* Tensor::maximum(float v){ @@ -2276,9 +2280,6 @@ void Tensor::el_div(Tensor *A, Tensor *B, Tensor *C, int incC) { } - - - void Tensor::mult2D(Tensor *A, int tA, Tensor *B, int tB, Tensor *C, int incC) { /////////////////////////////////////// //// MULT2D C=A*B @@ -2288,6 +2289,8 @@ void Tensor::mult2D(Tensor *A, int tA, Tensor *B, int tB, Tensor *C, int incC) { //// Dimensions and types must be compatible //// Only for 2D Tensors 
/////////////////////////////////////// + + PROFILING_HEADER_EXTERN(mult2D); if ((A->device != B->device) || (A->device != C->device)) {A->info();B->info();C->info();msg("Tensors in different devices", "Tensor::mult2D");} if ((A->ndim != 2) || (B->ndim != 2) || (C->ndim != 2)) msg("Only 2D tensors", "Tensor::mult2D"); @@ -2325,6 +2328,9 @@ void Tensor::mult2D(Tensor *A, int tA, Tensor *B, int tB, Tensor *C, int incC) { } #endif C->tsem->unlock(); + + PROFILING_FOOTER(mult2D); + PROFILING_PRINTF(mult2D); } @@ -2374,6 +2380,8 @@ void Tensor::sum2D_rowwise(Tensor *A, Tensor *B, Tensor *C) { if ((A->ndim != 2) || (B->ndim != 1) || (C->ndim != 2)) msg("sum2D_rowwise dims"); if ((!sameShape(A, C)) || (A->shape[1] != B->shape[0])) msg("Incompatible dims", "Tensor::sum2D_rowwise"); + PROFILING_HEADER(sum2D_rowwise); + C->tsem->lock(); if (A->isCPU()) { cpu_sum2D_rowwise(A, B, C); @@ -2391,6 +2399,9 @@ void Tensor::sum2D_rowwise(Tensor *A, Tensor *B, Tensor *C) { } #endif C->tsem->unlock(); + + PROFILING_FOOTER(sum2D_rowwise); + PROFILING_PRINTF(sum2D_rowwise); } From c4bba3985e4a3681ba5b67b04aad63624c98bb51 Mon Sep 17 00:00:00 2001 From: Jose Flich Date: Sat, 17 Oct 2020 06:51:14 +0000 Subject: [PATCH 02/15] conv2D with arbitrary precission --- .../kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp | 73 +- .../kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp | 700 ++++++++++++++++++ fpga_kernels/setenv.sh | 4 + fpga_kernels/test_fpga/Makefile | 2 +- .../src/test_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp | 389 ++++++++++ include/eddl/hardware/fpga/fpga_enables.h | 2 +- include/eddl/profiling.h | 11 + src/hardware/fpga/fpga_core.cpp | 58 +- src/hardware/fpga/nn/fpga_activations.cpp | 19 + src/hardware/fpga/nn/fpga_conv.cpp | 4 + 10 files changed, 1219 insertions(+), 43 deletions(-) create mode 100644 fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp create mode 100644 fpga_kernels/setenv.sh create mode 100644 fpga_kernels/test_fpga/src/test_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp diff --git a/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp b/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp index 20b25cbd2..9ec02806c 100644 --- a/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp +++ b/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp @@ -18,17 +18,18 @@ extern "C" { +// To allow using defines inside Xilinx pragmas +#define PRAGMA_SUB(x) _Pragma (#x) +#define DO_PRAGMA(x) PRAGMA_SUB(x) + // Fixed parameters (optimized at compilation/synthesis time) #define KW 3 // kernel width #define KH 3 // kernel height -//#define I 8 // number of input channels -//#define O 8 // number of output channels #define CPI 4 // channels per input port #define CPO 4 // channels per output port -//#define W 256 // input width -//#define H 256 // input height -//#define I_ITER I/CPI // iterations per input -//#define O_ITER O/CPO // iterations per output +// +#define WMAX 512 +#define WHMAX 512*512 #define LOAD_MODEL #define READ_MODEL @@ -173,7 +174,7 @@ static void read_input(int H, int W, int I, int O, int I_ITER, int O_ITER, pixel // in : input stream // out : vector of output streams // -static void padding(int H, int W, int I_ITER, int O_ITER, hls::stream &in, hls::stream &out) { +static void padding(int H, int W, int ITER, hls::stream &in, hls::stream &out) { #ifdef DEBUG_VERBOSE printf("padding: start\n"); @@ -182,36 +183,29 @@ static void padding(int H, int W, int I_ITER, int O_ITER, hls::stream &in, // first we read the kernels frame_t kernel[CPI]; - #pragma HLS ARRAY_PARTITION variable=kernel dim=0 + DO_PRAGMA(HLS ARRAY_PARTITION variable=kernel 
dim=0) frame_t data_in; #ifdef LOAD_MODEL @@ -440,7 +434,7 @@ static void mul(int H, int W, int I_ITER, int O_ITER, hls::stream &in, // now we read frames and produce the pixels float sum[CPO]; - #pragma HLS ARRAY_PARTITION variable=sum dim=0 block factor=4 + DO_PRAGMA(HLS ARRAY_PARTITION variable=sum dim=0 block factor=CPO) //factor = 16 //the array_partition factor in this case is assumed to be CPO value int num_iterations = W * H; @@ -529,8 +523,8 @@ static void add(int H, int W, int I_ITER, int O_ITER, hls::stream & int num_iterations = W * H; //Buffer for all data and CPO channels - float buff_o_channels[CPO][num_iterations]; - #pragma HLS ARRAY_PARTITION variable=buff_o_channels dim=0 block factor=4 + float buff_o_channels[CPO][WHMAX]; + DO_PRAGMA(HLS ARRAY_PARTITION variable=buff_o_channels dim=0 block factor=CPO) //We read Bias in O_iter packs of CPO size add_o_iter_loop: @@ -655,10 +649,9 @@ static void conv(int H, int W, int I, int O, int I_ITER, int O_ITER, hls::stream static hls::stream str_cvt_mul; // cvt->mul static hls::stream str_mul_add; // mul->add - // topology #pragma HLS dataflow - padding(H, W, I_ITER, O_ITER, in, str_pad_cvt); // padding + padding(H, W, I_ITER * O_ITER, in, str_pad_cvt); // padding cvt(H, W, I_ITER, O_ITER, str_pad_cvt, str_cvt_mul, 0); // cvt mul(H, W, I_ITER, O_ITER, str_cvt_mul, k_in, str_mul_add, 0); // mul add(H, W, I_ITER, O_ITER, str_mul_add, b_in, out); // add @@ -666,8 +659,10 @@ static void conv(int H, int W, int I, int O, int I_ITER, int O_ITER, hls::stream void k_conv2D_K3x3_S1x1_P1x1_BS1(pixel_in_t *ptr_data, int H, int W, int I, float *ptr_kernel, float *ptr_bias, pixel_out_t *ptr_out, int O) { - //#pragma HLS INTERFACE s_axilite port=W bundle=control - //#pragma HLS INTERFACE s_axilite port=H bundle=control + #pragma HLS INTERFACE s_axilite port=W bundle=control + #pragma HLS INTERFACE s_axilite port=H bundle=control + #pragma HLS INTERFACE s_axilite port=I bundle=control + #pragma HLS INTERFACE s_axilite port=O bundle=control #pragma HLS INTERFACE m_axi port=ptr_data offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 #pragma HLS INTERFACE m_axi port=ptr_kernel offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 #pragma HLS INTERFACE m_axi port=ptr_bias offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 diff --git a/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp b/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp new file mode 100644 index 000000000..0f1e99ece --- /dev/null +++ b/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp @@ -0,0 +1,700 @@ +//KERNEL_CONV2D_4.cpp +//Modified by: Jorge García Martinez +//Date: 17/09/2020 +//Description: Based on kenel_conv2d_3.cpp. The goal of this code is to perform convolutions with a large number of inputs +//and outputs.For this, we use iteratively a limited number of input and output channels in the kernel. +//In all functions are used two loops for output and input iterations. In add function is added a buffer which stores +//the data that It should be written into the memory. 
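//
// (Editor's sketch, not part of the kernel source.) The channel-tiling scheme
// described in the header comment above can be pictured, assuming I and O are
// exact multiples of CPI and CPO, as the reference loop nest below: each
// (o_iter, i_iter) pair works on CPO output channels and CPI input channels at
// a time, and partial sums are accumulated across the i_iter loop before the
// CPO output channels are written back to memory.
//
//   int I_ITER = I / CPI;   // input-channel groups visited per output pass
//   int O_ITER = O / CPO;   // output-channel groups
//   for (int o_iter = 0; o_iter < O_ITER; o_iter++) {
//     // read bias[o_iter*CPO .. o_iter*CPO + CPO - 1]
//     for (int i_iter = 0; i_iter < I_ITER; i_iter++) {
//       // read the CPI x CPO kernel block and the CPI input channels,
//       // convolve, and accumulate into the CPO partial-sum buffer
//     }
//     // write the CPO accumulated output channels back to memory
//   }
//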
+ + + +#include +#include +#include + +#include + +#define DEBUG_VERBOSE + +extern "C" { + +#define data_type ap_fixed<8,4,AP_TRN,AP_WRAP> +//#define data_type float + +// To allow using defines inside Xilinx pragmas +#define PRAGMA_SUB(x) _Pragma (#x) +#define DO_PRAGMA(x) PRAGMA_SUB(x) + +// Fixed parameters (optimized at compilation/synthesis time) +#define KW 3 // kernel width +#define KH 3 // kernel height +#define CPI 16 // channels per input port +#define CPO 16 // channels per output port +// +#define WMAX 256 +#define WHMAX 256*256 + +#define LOAD_MODEL +#define READ_MODEL +#define READ_INPUT +#define WRITE_OUTPUT + +// pixel_in +struct pixel_in_t { + data_type pixel[CPI]; +}; + +struct pixel_out_t { + data_type pixel[CPO]; +}; + +// frames struct +struct frame_t { + pixel_in_t pixel[9]; +}; + +// -------------------------------------------------------------------------------------- +// read_input: +// The function reads and writes the kernels, bias and data in different stream. +// Data are sent to padding module, kenels to mul and bias to add modules. +// LOOP FLOW +// ko = 0 +// b = 0 +// for o_iter 0 .. n +// read bias[b..b+3] +// b = b + 4 +// d = 0 +// ki = 0 +// for i_iter 0 .. n +// read kernel[ki..ki+3][ko..ko+3] +// ki = ki +4 +// read data[d..d+3] +// d = d + 4 +// +// ko = ko + 4 +// +// +// Arguments: +// ptr : Pointer to input data (in) +// k_ptr: pointer to kernels (in) +// b_ptr: pointer to bias (in) +// out : data output stream (out) +// k_out: pointer to kernel (out) +// b_out: pointer to bias (out) +// +static void read_input(int H, int W, int I, int O, int I_ITER, int O_ITER, pixel_in_t *ptr, data_type *k_ptr, data_type *b_ptr, hls::stream &k_out, hls::stream &b_out, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("read_input: start\n"); +#endif + + frame_t frame_k; + #pragma HLS ARRAY_PARTITION variable=frame_k dim=0 + + pixel_out_t bias; + #pragma HLS ARRAY_PARTITION variable=bias dim=0 + + pixel_in_t data; + #pragma HLS ARRAY_PARTITION variable=data dim=0 + + + read_input_o_iter_loop: + for (int o_iter = 0; o_iter < O_ITER; o_iter++){ + //Sending bias to add in pack of CPO bias + // int data_pointer = 0; + read_loop_bias_load: + for (int b=0; b &in, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("padding: start\n"); +#endif + +//we init zero only first time + +pixel_in_t data; +DO_PRAGMA(HLS ARRAY_PARTITION variable=data complete) + +pixel_in_t zero; +DO_PRAGMA(HLS ARRAY_PARTITION variable=zero complete) + +for (int cpi=0; cpi &in, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("relu: start\n"); +#endif + + int data_size = W * H * O; + for (int i=0; i < data_size; i++) { + #pragma HLS PIPELINE II=1 + data_type data = in.read(); + if (data < 0) data = 0.f; + out << data; + } + +#ifdef DEBUG_VERBOSE + printf("relu: end\n"); +#endif +} + +// -------------------------------------------------------------------------------- +// write_output: Writes data comming from one stream into memory +// LOOP FLOW: +// for o_iter 0 .. n +// write data[do .. 
do+3] +// +// d = d + 4 +// +// Arguments: +// ptr: memory address pointer +// in: input stream +// +static void write_output(int H, int W, int O_ITER, pixel_out_t *ptr, hls::stream &in) { + +#ifdef DEBUG_VERBOSE + printf("write_output: start\n"); +#endif + + + + // int data_pointer = 0; + + // write_output_o_iter_loop: + // for (int o_iter = 0; o_iter &in, hls::stream &out, int id) { + +#ifdef DEBUG_VERBOSE + printf("cvt_%d: start\n", id); +#endif + +cvt_o_iter_loop: +for (int o_iter = 0; o_iter < O_ITER; o_iter++){ + cvt_i_iter_loop: + for(int i_iter = 0; i_iter < I_ITER; i_iter++){ + + // Now we process the input data and convert the data into frames + + // buffers (keep three rows) + pixel_in_t buffer0[WMAX+2]; + pixel_in_t buffer1[WMAX+2]; + pixel_in_t buffer2[WMAX+2]; + DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer0 cyclic dim=1 factor=CPI) + DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer1 cyclic dim=1 factor=CPI) + DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer2 cyclic dim=1 factor=CPI) + + // frame + frame_t frame; + DO_PRAGMA(HLS ARRAY_PARTITION variable=frame) + + // We loop for every incoming pixel + cvt_loop_1: + for (int pin_row=0; pin_row < H+2; pin_row++) { + cvt_loop_2: + for (int pin_col=0; pin_col < W+2; pin_col++) { + // get the pixel + pixel_in_t pixel; + pixel = in.read(); + // row buffer write (in which buffer row we write the pixel) + int row0_buffer_write = (pin_row % 3) == 0; + int row1_buffer_write = (pin_row % 3) == 1; + // first row buffer + int row0 = (pin_row <= 2) | ((pin_row % 3) == 2); + int row1 = !row0 & ((pin_row % 3) == 0); + // we write the pixel into the buffer + if (row0_buffer_write) buffer0[pin_col] = pixel; else if (row1_buffer_write) buffer1[pin_col] = pixel; else buffer2[pin_col] = pixel; + // build the frame + pixel_in_t p0, p1, p2, p3, p4, p5, p6, p7, p8; + int shift_frame = (pin_row>1) & (pin_col > 2); + int send_frame = (pin_row>1) & (pin_col > 1); + pixel_in_t pixel_b0, pixel_b1, pixel_b2; + pixel_b0 = buffer0[pin_col]; + pixel_b1 = buffer1[pin_col]; + pixel_b2 = buffer2[pin_col]; + // p0, p1, p2 + if (shift_frame) {p0 = p1;} else if (pin_col==0) {if (row0) p0 = pixel_b0; else if (row1) p0 = pixel_b1; else p0 = pixel_b2;} + if (shift_frame) {p1 = p2;} else if (pin_col==1) {if (row0) p1 = pixel_b0; else if (row1) p1 = pixel_b1; else p1 = pixel_b2;} + if (row0) p2 = pixel_b0; else if (row1) p2 = pixel_b1; else p2 = pixel_b2; + // p3, p4, p5 + if (shift_frame) {p3 = p4;} else if (pin_col==0) {if (row0) p3 = pixel_b1; else if (row1) p3 = pixel_b2; else p3 = pixel_b0;} + if (shift_frame) {p4 = p5;} else if (pin_col==1) {if (row0) p4 = pixel_b1; else if (row1) p4 = pixel_b2; else p4 = pixel_b0;} + if (row0) p5 = pixel_b1; else if (row1) p5 = pixel_b2; else p5 = pixel_b0; + // p6, p7, p8 + if (shift_frame) {p6 = p7;} else if (pin_col==0) {if (row0) p6 = pixel_b2; else if (row1) p6 = pixel_b0; else p6 = pixel_b1;} + if (shift_frame) {p7 = p8;} else if (pin_col==1) {if (row0) p7 = pixel_b2; else if (row1) p7 = pixel_b0; else p7 = pixel_b1;} + if (row0) p8 = pixel_b2; else if (row1) p8 = pixel_b0; else p8 = pixel_b1; + + if (send_frame) { + frame.pixel[0] = p0; frame.pixel[1] = p1; frame.pixel[2] = p2; + frame.pixel[3] = p3; frame.pixel[4] = p4; frame.pixel[5] = p5; + frame.pixel[6] = p6; frame.pixel[7] = p7; frame.pixel[8] = p8; + out << frame; + #ifdef DEBUG_VERBOSE + printf("cvt_%d: frame sent:\n", id); + for (int cpi=0; cpi &in, hls::stream &k_in, hls::stream &out, int id) { + +#ifdef DEBUG_VERBOSE + printf("mul_%d: start\n", id); +#endif + + // 
first we read the kernels + frame_t kernel[CPI]; + DO_PRAGMA(HLS ARRAY_PARTITION variable=kernel dim=0) + frame_t data_in; + +#ifdef LOAD_MODEL + + mul_o_iter_loop: + for (int o_iter = 0; o_iter < O_ITER; o_iter++){ + mul_i_iter_loop: + for(int i_iter = 0; i_iter < I_ITER; i_iter++){ + //we load the kernels into pack of frames + loop_mul_kernels_load_cpo: + for (int cpi=0; cpi %6.4f\n", cpo, float(sum[cpo])); + #endif + p_out.pixel[cpo] = sum[cpo]; + sum[cpo] = 0.f; + } + out << p_out; + } + } //i_iter +} //o_iter + +#endif + + +#ifdef DEBUG_VERBOSE + printf("mul_%d: end\n", id); +#endif +} + +// ------------------------------------------------------------------------------- +// add: This function performs the addition of all subpixels for the same channel. +// It adds also the corresponding bias. +// LOOP FLOW +// for o_iter 0 .. n +// receive bias[b..b+3] +// init buff_o_channels with bias +// for i_iter 0 .. n +// receive data[do..d+3] +// buff_o_channels = buff_o_channels + data +// +// for num_iterations +// for CPO +// send data to write module +// +// Arguments: +// in: input streams data +// b_in: input stream bias +// out: output stream +// +static void add(int H, int W, int I_ITER, int O_ITER, hls::stream &in, hls::stream &b_in, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("add: start\n"); +#endif + + data_type bias[CPO]; + + //number of iterations by CPI || CPO channels + int num_iterations = W * H; + + //Buffer for all data and CPO channels + data_type buff_o_channels[CPO][WHMAX]; + DO_PRAGMA(HLS ARRAY_PARTITION variable=buff_o_channels dim=0 block factor=CPO) + + //We read Bias in O_iter packs of CPO size + add_o_iter_loop: + for (int o_iter = 0; o_iter &in, hls::stream &k_in, hls::stream &b_in, hls::stream &out) { + + // streams + static hls::stream str_pad_cvt; // padding->cvt + static hls::stream str_cvt_mul; // cvt->mul + static hls::stream str_mul_add; // mul->add + + // topology + #pragma HLS dataflow + padding(H, W, I_ITER * O_ITER, in, str_pad_cvt); // padding + cvt(H, W, I_ITER, O_ITER, str_pad_cvt, str_cvt_mul, 0); // cvt + mul(H, W, I_ITER, O_ITER, str_cvt_mul, k_in, str_mul_add, 0); // mul + add(H, W, I_ITER, O_ITER, str_mul_add, b_in, out); // add +} + +void k_conv2D_K3x3_S1x1_P1x1_BS1_ap(pixel_in_t *ptr_data, int H, int W, int I, data_type *ptr_kernel, data_type *ptr_bias, pixel_out_t *ptr_out, int O) { + + #pragma HLS INTERFACE s_axilite port=W bundle=control + #pragma HLS INTERFACE s_axilite port=H bundle=control + #pragma HLS INTERFACE s_axilite port=I bundle=control + #pragma HLS INTERFACE s_axilite port=O bundle=control + #pragma HLS INTERFACE m_axi port=ptr_data offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_kernel offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_bias offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_out offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE s_axilite port=return bundle=control + + // ptr_data struct to be packed as a single element vector (to improve memory read) + // the compiler will do full structure access (all elements of structure) + #pragma HLS data_pack variable = ptr_data + #pragma HLS data_pack variable = ptr_out + + int I_ITER = I/CPI; + int O_ITER = O/CPO; + + // input and output streams + static hls::stream out_read; + static hls::stream out_read_kernel; 
+ static hls::stream out_read_bias; + static hls::stream out_conv; + + // stream sizes + #pragma HLS STREAM variable = out_read depth = 32 + #pragma HLS STREAM variable = out_read_kernel depth = 32 + #pragma HLS STREAM variable = out_read_bias depth = 32 + #pragma HLS STREAM variable = out_conv depth = 32 + #pragma HLS STREAM variable = out_relu depth = 32 + + #pragma HLS dataflow + read_input(H, W, I, O, I_ITER, O_ITER, ptr_data, ptr_kernel, ptr_bias, out_read_kernel, out_read_bias, out_read); + conv(H, W, I, O, I_ITER, O_ITER, out_read, out_read_kernel, out_read_bias, out_conv); + write_output(H, W, O_ITER, ptr_out, out_conv); +} + +} // end extern "C" diff --git a/fpga_kernels/setenv.sh b/fpga_kernels/setenv.sh new file mode 100644 index 000000000..2a72e6de9 --- /dev/null +++ b/fpga_kernels/setenv.sh @@ -0,0 +1,4 @@ +source /opt/xilinx/xrt/setup.sh +source /opt/Xilinx/Vitis/2019.2/settings64.sh +export XILINX_SDX=/opt/Xilinx/Vitis/2019.2 +export XCL_EMULATION_MODE=sw_emu diff --git a/fpga_kernels/test_fpga/Makefile b/fpga_kernels/test_fpga/Makefile index 7dc6b92d8..165a7c2d9 100644 --- a/fpga_kernels/test_fpga/Makefile +++ b/fpga_kernels/test_fpga/Makefile @@ -1,5 +1,5 @@ # list of kernel test to compile -LIST ?=conv2D_K3x3_S1x1_P1x1_BS1 +LIST ?=conv2D_K3x3_S1x1_P1x1_BS1_ap # default target all build clean cleanall: KERNELS diff --git a/fpga_kernels/test_fpga/src/test_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp b/fpga_kernels/test_fpga/src/test_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp new file mode 100644 index 000000000..facd8b604 --- /dev/null +++ b/fpga_kernels/test_fpga/src/test_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp @@ -0,0 +1,389 @@ +#include /* printf, scanf, NULL */ +#include /* malloc, free, rand */ + +#include +#include +#include +#include +#include +#include "xcl2.hpp" + +#include + +using std::vector; + +// data type +//#define data_type ap_fixed<8,4,AP_TRN,AP_WRAP> +#define data_type float + +// CL +cl::Buffer buf; +cl::Context context; +cl::CommandQueue q; +cl::Program program; + + +#define W 256 //256 +#define H 256 //256 +#define C 16 //I +#define COUT 16 //O +#define KW 3 +#define KH 3 + +// buffers +data_type data_in[ W * H * C ] __attribute__ ((__aligned__(16))); +data_type kernel [ KW * KH * C * COUT] __attribute__ ((__aligned__(16))); +data_type bias [ COUT ] __attribute__ ((__aligned__(16))); +data_type out [ W * H * COUT ] __attribute__ ((__aligned__(16))); +data_type out_cpu[ W * H * COUT ] __attribute__ ((__aligned__(16))); + +void cpu_conv2d() { + + int size_out = W * H * COUT; + for (int i=0; i 0.001) { + printf("Results mismatch at cout %d h %d w %d: %6.4f %6.4f (diff %6.4f)\n", cout, h, w, float(out_cpu[addr_o]), float(out[addr_o]), fabs(float(out_cpu[addr_o]-out[addr_o]))); + error = 1; + return; + } + } + } + } + if (!error) printf("results OK!\n"); else { + printf("results differ:\n"); + //cpu_print_out(); + } +} + + +//--------------------------------------------------------------------------------------------------------------------- +//--------------------------------------------------------------------------------------------------------------------- + +// An event callback function that prints the operations performed by the OpenCL +// runtime. 
+void event_cb(cl_event event1, cl_int cmd_status, void *data) { + cl_int err; + cl_command_type command; + cl::Event event(event1, true); + OCL_CHECK(err, err = event.getInfo(CL_EVENT_COMMAND_TYPE, &command)); + cl_int status; + OCL_CHECK(err, + err = event.getInfo(CL_EVENT_COMMAND_EXECUTION_STATUS, &status)); + const char *command_str; + const char *status_str; + switch (command) { + case CL_COMMAND_READ_BUFFER: + command_str = "buffer read"; + break; + case CL_COMMAND_WRITE_BUFFER: + command_str = "buffer write"; + break; + case CL_COMMAND_NDRANGE_KERNEL: + command_str = "kernel"; + break; + case CL_COMMAND_MAP_BUFFER: + command_str = "kernel"; + break; + case CL_COMMAND_COPY_BUFFER: + command_str = "kernel"; + break; + case CL_COMMAND_MIGRATE_MEM_OBJECTS: + command_str = "buffer migrate"; + break; + default: + command_str = "unknown"; + } + switch (status) { + case CL_QUEUED: + status_str = "Queued"; + break; + case CL_SUBMITTED: + status_str = "Submitted"; + break; + case CL_RUNNING: + status_str = "Executing"; + break; + case CL_COMPLETE: + status_str = "Completed"; + break; + } + printf("[%s]: %s %s\n", reinterpret_cast(data), status_str, + command_str); + fflush(stdout); +} + +// Sets the callback for a particular event +void set_callback(cl::Event event, const char *queue_name) { + cl_int err; + OCL_CHECK(err, + err = event.setCallback(CL_COMPLETE, event_cb, (void *)queue_name)); +} + +//--------------------------------------------------------------------------------------------------------------------- + +int main(int argc, char **argv) { + if (argc != 2) { + std::cout << "Usage: " << argv[0] << " " << std::endl; + return EXIT_FAILURE; + } + + printf("Test CONV: [WxHxC] = [%dx%dx%d] -> [WxHxC] = [%dx%dx%d] (kernel [%dx%d], stride [1x1], padding [1x1])\n", W, H, C, W, H, COUT, KW, KH); + + std::string binaryFile = argv[1]; + cl_int err; + cl::Kernel kernel_conv2d_2; + + std::cout << "Creating Context..." << std::endl; + auto devices = xcl::get_xil_devices(); + auto device = devices[0]; + OCL_CHECK(err, cl::Context context(device, NULL, NULL, NULL, &err)); + OCL_CHECK(err, cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE, &err)); + + std::string device_name = device.getInfo(); + auto fileBuf = xcl::read_binary_file(binaryFile); + cl::Program::Binaries bins{{fileBuf.data(), fileBuf.size()}}; + devices.resize(1); + + OCL_CHECK(err, cl::Program program(context, devices, bins, NULL, &err)); + std::cout << "Device " << device_name.c_str() << ": program successful!" << std::endl; + + OCL_CHECK(err, kernel_conv2d_2 = cl::Kernel(program,"k_conv2D_K3x3_S1x1_P1x1_BS1_ap", &err)); + std::cout << "Kernel sucessfully created" << std::endl ; + + size_t size_data_in_bytes = W*H*C*sizeof(data_type); + size_t size_output_in_bytes = W*H*COUT * sizeof(data_type); + size_t size_kernel_in_bytes = KW * KH * C * COUT * sizeof(data_type); + size_t size_bias_in_bytes = COUT * sizeof(data_type); + // Allocate memory on the host and fill with random data. 
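//
// (Editor's note, illustrative only.) The host arrays declared at the top of
// this file are statically allocated with __attribute__((aligned(16))) and are
// later wrapped in cl::Buffer objects created with CL_MEM_USE_HOST_PTR, so the
// OpenCL runtime can work on the host memory directly instead of keeping a
// separate copy. If dynamic allocation were preferred, a common alternative
// with the xcl2 helpers already included by this test would be roughly:
//
//   std::vector<data_type, aligned_allocator<data_type>> data_in(W * H * C);
//
// where aligned_allocator is the aligned allocator usually shipped with the
// Vitis xcl2.hpp utilities. Treat this as a sketch, not as part of the test.
//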
+ + //----------------------------- + // fill data vector with random data + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + std::cout << "Filling buffer with useful data" << std::endl ; + int addr = 0; + for (int h=0; h kernel_events(1); + vector read_events(1); + vector write_events(1); + cl::Buffer buffer_a; + cl::Buffer buffer_b; + cl::Buffer buffer_k; + cl::Buffer buffer_bias; + + //----------------------------- + // Allocate Buffer in Global Memory + // Buffers are allocated using CL_MEM_USE_HOST_PTR for efficient memory and + // Device-to-host communication + std::cout << "Creating Buffers..." << std::endl; + + OCL_CHECK(err, buffer_a = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_data_in_bytes, &data_in, &err)); + OCL_CHECK(err, buffer_b = cl::Buffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR , size_output_in_bytes, &out, &err)); + OCL_CHECK(err, buffer_k = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_kernel_in_bytes, &kernel, &err)); + OCL_CHECK(err, buffer_bias = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_bias_in_bytes, &bias, &err)); + + // set kernel arguments + int arg = 0; + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_a)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, H)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, W)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, C)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_k)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_bias)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_b)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, COUT)); + + //----------------------------- + // Copy input data to device global memory + std::cout << "Copying data (Host to Device)..." << std::endl; + // Because we are passing the write_events, it returns an event object + // that identifies this particular command and can be used to query + // or queue a wait for this particular command to complete. + OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_a}, 0 /*0 means from host*/, NULL, &write_events[0])); + set_callback(write_events[0], "ooo_queue"); + + OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_k}, 0 /*0 means from host*/, NULL, &write_events[0])); + set_callback(write_events[0], "ooo_queue"); + + //----------------------------- + printf("Enqueueing NDRange kernel.\n"); + // This event needs to wait for the write buffer operations to complete + // before executing. We are sending the write_events into its wait list to + // ensure that the order of operations is correct. + // Launch the Kernel + std::vector waitList; + waitList.push_back(write_events[0]); + OCL_CHECK(err, err = q.enqueueNDRangeKernel(kernel_conv2d_2, 0, 1, 1, &waitList, &kernel_events[0])); + set_callback(kernel_events[0], "ooo_queue"); + + + + std::cout << "Getting Results (Device to Host)..." << std::endl; + std::vector eventList; + eventList.push_back(kernel_events[0]); + // This operation only needs to wait for the kernel call. + OCL_CHECK(err, err = q.enqueueMigrateMemObjects({buffer_b}, CL_MIGRATE_MEM_OBJECT_HOST, &eventList, &read_events[0])); + set_callback(read_events[0], "ooo_queue"); + OCL_CHECK(err, err = read_events[0].wait()); + + // Wait for all of the OpenCL operations to complete + std::cout << "Waiting..." << std::endl; + OCL_CHECK(err, err = q.flush()); + OCL_CHECK(err, err = q.finish()); + + + std::cout << "computing conv in CPU..." 
<< std::endl; + + // cpu_print_data_in(); + // cpu_print_kernels(); + // cpu_print_bias(); + // cpu_conv2d(); + // cpu_print_out(); + + // check_result(); + + //----------------------------- + std::cout << "" << std::endl; + std::cout << "All done" << std::endl; + std::cout << "quit now" << std::endl; + + // exit + return 0; +} diff --git a/include/eddl/hardware/fpga/fpga_enables.h b/include/eddl/hardware/fpga/fpga_enables.h index 55f183d85..7aacfec2d 100644 --- a/include/eddl/hardware/fpga/fpga_enables.h +++ b/include/eddl/hardware/fpga/fpga_enables.h @@ -2,7 +2,7 @@ // implemented on the FPGA //Activations -#define K_ENABLED_RELU +//#define K_ENABLED_RELU //#define K_ENABLED_D_RELU //#define K_ENABLED_THRESHOLDED_RELU //#define K_ENABLED_D_TRHESHOLDED_RELU diff --git a/include/eddl/profiling.h b/include/eddl/profiling.h index 8cca92267..c02848225 100644 --- a/include/eddl/profiling.h +++ b/include/eddl/profiling.h @@ -35,3 +35,14 @@ prof_##fn##_calls, prof_##fn##_time, \ 100.0 * prof_##fn##_time / acc, (float) prof_##fn##_time / (float) prof_##fn##_calls); #endif + + + +//CxHxW +// +//HxWxC +// +//GxHxWxC (C=4) Reshape + Permute +// +//32xHxW -> Reshape -> 8x4xHxW -> Permute(0, 2, 3, 1) -> 8xHxWx4 // hay capas y funciones + diff --git a/src/hardware/fpga/fpga_core.cpp b/src/hardware/fpga/fpga_core.cpp index d88e22c1a..84bfee3f4 100644 --- a/src/hardware/fpga/fpga_core.cpp +++ b/src/hardware/fpga/fpga_core.cpp @@ -331,8 +331,11 @@ void fpga_init(){ // initialize only once cl_int err; std::string binaryFile = "eddl.xclbin"; unsigned fileBufSize; - std::vector devices = xcl::get_xil_devices(); - cl::Device device = devices[0]; + //std::vector devices = xcl::get_xil_devices(); + //cl::Device device = devices[0]; + auto devices = xcl::get_xil_devices(); + auto device = devices[0]; + OCL_CHECK(err, context = cl::Context(device, NULL, NULL, NULL, &err)); OCL_CHECK(err, q = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err)); char *fileBuf = xcl::read_binary_file(binaryFile, fileBufSize); @@ -342,8 +345,27 @@ void fpga_init(){ // initialize only once OCL_CHECK(err, program = cl::Program(context, devices, bins, NULL, &err)); #ifdef K_ENABLED_RELU + printf("creating ReLu kernel\n"); OCL_CHECK(err, kernel_relu = cl::Kernel(program,"k_relu", &err)); if (err != CL_SUCCESS) printf("Error creating kernel\n"); + + // prueba + cl::Event event; + cl::Buffer b1; + cl::Buffer b2; + long sizeA = 1024; + OCL_CHECK(err,b1 = cl::Buffer(context,CL_MEM_READ_WRITE, sizeA*sizeof(float), NULL, &err)); + OCL_CHECK(err,b2 = cl::Buffer(context,CL_MEM_READ_WRITE, sizeA*sizeof(float), NULL, &err)); + OCL_CHECK(err, err = kernel_relu.setArg(0, b1)); + OCL_CHECK(err, err = kernel_relu.setArg(1, b2)); + OCL_CHECK(err, err = kernel_relu.setArg(2, sizeA)); + OCL_CHECK(err, err = q.enqueueTask(kernel_relu, NULL, &event)); + printf("relu kernel lanzado en init\n"); + // event.wait(); + q.finish(); + + + #endif #ifdef K_ENABLED_D_RELU OCL_CHECK(err, kernel_d_relu = cl::Kernel(program,"k_d_relu", &err)); @@ -637,6 +659,38 @@ void fpga_init(){ // initialize only once #ifdef K_ENABLED_CONV2D_K3X3_S1X1_P1X1_BS1 OCL_CHECK(err, kernel_conv2D_K3x3_S1x1_P1x1_BS1 = cl::Kernel(program, "k_conv2D_K3x3_S1x1_P1x1_BS1", &err)); if (err != CL_SUCCESS) printf("Error creating kernel\n"); + + // prueba + cl::Event event1; + cl::Buffer I; + cl::Buffer K; + cl::Buffer B; + cl::Buffer O; + int Ich = 4; + int W = 256; + int H = 256; + int Och = 4; + int arg = 0; + + OCL_CHECK(err,I = 
cl::Buffer(context,CL_MEM_READ_WRITE, Ich * W * H * sizeof(float), NULL, &err)); + OCL_CHECK(err,K = cl::Buffer(context,CL_MEM_READ_WRITE, 3 * 3 * Ich * Och * sizeof(float), NULL, &err)); + OCL_CHECK(err,B = cl::Buffer(context,CL_MEM_READ_WRITE, Och * sizeof(float), NULL, &err)); + OCL_CHECK(err,O = cl::Buffer(context,CL_MEM_READ_WRITE, Och * W * H * sizeof(float), NULL, &err)); + + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, I)); + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, H)); + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, W)); + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, Ich)); + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, K)); + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, B)); + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, O)); + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, Och)); + + OCL_CHECK(err, err = q.enqueueTask(kernel_conv2D_K3x3_S1x1_P1x1_BS1, NULL, &event1)); + printf("conv kernel lanzado en init\n"); + // event.wait(); + q.finish(); + #endif #ifdef K_ENABLED_RANGE OCL_CHECK(err, kernel_range = cl::Kernel(program,"k_range", &err)); diff --git a/src/hardware/fpga/nn/fpga_activations.cpp b/src/hardware/fpga/nn/fpga_activations.cpp index 543bf3f22..fe3953259 100644 --- a/src/hardware/fpga/nn/fpga_activations.cpp +++ b/src/hardware/fpga/nn/fpga_activations.cpp @@ -17,6 +17,9 @@ #include "eddl/hardware/fpga/nn/fpga_nn.h" #include "eddl/hardware/cpu/nn/cpu_tensor_nn.h" // for cpu emulation purposes +// prueba +extern cl::Context context; + // ----------------------------------------------------------------- // relu // @@ -35,13 +38,29 @@ void fpga_relu(Tensor *A, Tensor *B){ cl_int err; cl::Event event; + // prueba +/* cl::Buffer b1; + cl::Buffer b2; + printf("1\n"); + OCL_CHECK(err,b1 = cl::Buffer(context,CL_MEM_READ_WRITE, A->size*sizeof(float), NULL, &err)); + printf("2\n"); + OCL_CHECK(err,b2 = cl::Buffer(context,CL_MEM_READ_WRITE, B->size*sizeof(float), NULL, &err)); + printf("3\n"); + OCL_CHECK(err, err = kernel_relu.setArg(0, b1)); + printf("4\n"); + OCL_CHECK(err, err = kernel_relu.setArg(1, b2)); + printf("5\n"); + OCL_CHECK(err, err = kernel_relu.setArg(2, A->size));*/ + OCL_CHECK(err, err = kernel_relu.setArg(0, *(A->fpga_ptr))); OCL_CHECK(err, err = kernel_relu.setArg(1, *(B->fpga_ptr))); OCL_CHECK(err, err = kernel_relu.setArg(2, A->size)); OCL_CHECK(err, err = q.enqueueTask(kernel_relu, NULL, &event)); + // event.wait(); q.finish(); + #endif _profile_fpga_tensor(B); _profile_fpga(_FPGA_RELU, 1); diff --git a/src/hardware/fpga/nn/fpga_conv.cpp b/src/hardware/fpga/nn/fpga_conv.cpp index 24f7ce232..8be62b2cc 100644 --- a/src/hardware/fpga/nn/fpga_conv.cpp +++ b/src/hardware/fpga/nn/fpga_conv.cpp @@ -89,6 +89,9 @@ void fpga_conv2D(ConvolDescriptor *D) if ((stride_rows == 1) && (stride_cols == 1) && (Krows == 3) && (Kcols == 3) && (batch_size == 1) && (padding_rows == 1) && (padding_cols == 1)) { fpga_conv2D_K3x3_S1x1_P1x1_BS1(I, Irows, Icols, Ichannels, K, B, O, Ochannels); } else { + #if !defined(K_ENABLED_CONV2D) + fpga_cpuemu_conv2D(D); + #else OCL_CHECK(err, err = kernel_conv2d.setArg(0, batch_size)); OCL_CHECK(err, err = kernel_conv2d.setArg(1, I)); OCL_CHECK(err, err = kernel_conv2d.setArg(2, Irows)); // input @@ -110,6 +113,7 @@ void fpga_conv2D(ConvolDescriptor *D) OCL_CHECK(err, err = q.enqueueTask(kernel_conv2d, NULL, &event)); q.finish(); + #endif } #endif 
_profile_fpga(_FPGA_CONV2D, 1); From 07513eeac240352e1be1f878f3b7cff82fb27192 Mon Sep 17 00:00:00 2001 From: jorga20j Date: Tue, 20 Oct 2020 14:30:10 +0000 Subject: [PATCH 03/15] new read distribution --- .../kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp | 1416 +++++++++-------- 1 file changed, 716 insertions(+), 700 deletions(-) diff --git a/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp b/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp index 0f1e99ece..28a2441d9 100644 --- a/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp +++ b/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp @@ -1,700 +1,716 @@ -//KERNEL_CONV2D_4.cpp -//Modified by: Jorge García Martinez -//Date: 17/09/2020 -//Description: Based on kenel_conv2d_3.cpp. The goal of this code is to perform convolutions with a large number of inputs -//and outputs.For this, we use iteratively a limited number of input and output channels in the kernel. -//In all functions are used two loops for output and input iterations. In add function is added a buffer which stores -//the data that It should be written into the memory. - - - -#include -#include -#include - -#include - -#define DEBUG_VERBOSE - -extern "C" { - -#define data_type ap_fixed<8,4,AP_TRN,AP_WRAP> -//#define data_type float - -// To allow using defines inside Xilinx pragmas -#define PRAGMA_SUB(x) _Pragma (#x) -#define DO_PRAGMA(x) PRAGMA_SUB(x) - -// Fixed parameters (optimized at compilation/synthesis time) -#define KW 3 // kernel width -#define KH 3 // kernel height -#define CPI 16 // channels per input port -#define CPO 16 // channels per output port -// -#define WMAX 256 -#define WHMAX 256*256 - -#define LOAD_MODEL -#define READ_MODEL -#define READ_INPUT -#define WRITE_OUTPUT - -// pixel_in -struct pixel_in_t { - data_type pixel[CPI]; -}; - -struct pixel_out_t { - data_type pixel[CPO]; -}; - -// frames struct -struct frame_t { - pixel_in_t pixel[9]; -}; - -// -------------------------------------------------------------------------------------- -// read_input: -// The function reads and writes the kernels, bias and data in different stream. -// Data are sent to padding module, kenels to mul and bias to add modules. -// LOOP FLOW -// ko = 0 -// b = 0 -// for o_iter 0 .. n -// read bias[b..b+3] -// b = b + 4 -// d = 0 -// ki = 0 -// for i_iter 0 .. 
n -// read kernel[ki..ki+3][ko..ko+3] -// ki = ki +4 -// read data[d..d+3] -// d = d + 4 -// -// ko = ko + 4 -// -// -// Arguments: -// ptr : Pointer to input data (in) -// k_ptr: pointer to kernels (in) -// b_ptr: pointer to bias (in) -// out : data output stream (out) -// k_out: pointer to kernel (out) -// b_out: pointer to bias (out) -// -static void read_input(int H, int W, int I, int O, int I_ITER, int O_ITER, pixel_in_t *ptr, data_type *k_ptr, data_type *b_ptr, hls::stream &k_out, hls::stream &b_out, hls::stream &out) { - -#ifdef DEBUG_VERBOSE - printf("read_input: start\n"); -#endif - - frame_t frame_k; - #pragma HLS ARRAY_PARTITION variable=frame_k dim=0 - - pixel_out_t bias; - #pragma HLS ARRAY_PARTITION variable=bias dim=0 - - pixel_in_t data; - #pragma HLS ARRAY_PARTITION variable=data dim=0 - - - read_input_o_iter_loop: - for (int o_iter = 0; o_iter < O_ITER; o_iter++){ - //Sending bias to add in pack of CPO bias - // int data_pointer = 0; - read_loop_bias_load: - for (int b=0; b &in, hls::stream &out) { - -#ifdef DEBUG_VERBOSE - printf("padding: start\n"); -#endif - -//we init zero only first time - -pixel_in_t data; -DO_PRAGMA(HLS ARRAY_PARTITION variable=data complete) - -pixel_in_t zero; -DO_PRAGMA(HLS ARRAY_PARTITION variable=zero complete) - -for (int cpi=0; cpi &in, hls::stream &out) { - -#ifdef DEBUG_VERBOSE - printf("relu: start\n"); -#endif - - int data_size = W * H * O; - for (int i=0; i < data_size; i++) { - #pragma HLS PIPELINE II=1 - data_type data = in.read(); - if (data < 0) data = 0.f; - out << data; - } - -#ifdef DEBUG_VERBOSE - printf("relu: end\n"); -#endif -} - -// -------------------------------------------------------------------------------- -// write_output: Writes data comming from one stream into memory -// LOOP FLOW: -// for o_iter 0 .. n -// write data[do .. 
do+3] -// -// d = d + 4 -// -// Arguments: -// ptr: memory address pointer -// in: input stream -// -static void write_output(int H, int W, int O_ITER, pixel_out_t *ptr, hls::stream &in) { - -#ifdef DEBUG_VERBOSE - printf("write_output: start\n"); -#endif - - - - // int data_pointer = 0; - - // write_output_o_iter_loop: - // for (int o_iter = 0; o_iter &in, hls::stream &out, int id) { - -#ifdef DEBUG_VERBOSE - printf("cvt_%d: start\n", id); -#endif - -cvt_o_iter_loop: -for (int o_iter = 0; o_iter < O_ITER; o_iter++){ - cvt_i_iter_loop: - for(int i_iter = 0; i_iter < I_ITER; i_iter++){ - - // Now we process the input data and convert the data into frames - - // buffers (keep three rows) - pixel_in_t buffer0[WMAX+2]; - pixel_in_t buffer1[WMAX+2]; - pixel_in_t buffer2[WMAX+2]; - DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer0 cyclic dim=1 factor=CPI) - DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer1 cyclic dim=1 factor=CPI) - DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer2 cyclic dim=1 factor=CPI) - - // frame - frame_t frame; - DO_PRAGMA(HLS ARRAY_PARTITION variable=frame) - - // We loop for every incoming pixel - cvt_loop_1: - for (int pin_row=0; pin_row < H+2; pin_row++) { - cvt_loop_2: - for (int pin_col=0; pin_col < W+2; pin_col++) { - // get the pixel - pixel_in_t pixel; - pixel = in.read(); - // row buffer write (in which buffer row we write the pixel) - int row0_buffer_write = (pin_row % 3) == 0; - int row1_buffer_write = (pin_row % 3) == 1; - // first row buffer - int row0 = (pin_row <= 2) | ((pin_row % 3) == 2); - int row1 = !row0 & ((pin_row % 3) == 0); - // we write the pixel into the buffer - if (row0_buffer_write) buffer0[pin_col] = pixel; else if (row1_buffer_write) buffer1[pin_col] = pixel; else buffer2[pin_col] = pixel; - // build the frame - pixel_in_t p0, p1, p2, p3, p4, p5, p6, p7, p8; - int shift_frame = (pin_row>1) & (pin_col > 2); - int send_frame = (pin_row>1) & (pin_col > 1); - pixel_in_t pixel_b0, pixel_b1, pixel_b2; - pixel_b0 = buffer0[pin_col]; - pixel_b1 = buffer1[pin_col]; - pixel_b2 = buffer2[pin_col]; - // p0, p1, p2 - if (shift_frame) {p0 = p1;} else if (pin_col==0) {if (row0) p0 = pixel_b0; else if (row1) p0 = pixel_b1; else p0 = pixel_b2;} - if (shift_frame) {p1 = p2;} else if (pin_col==1) {if (row0) p1 = pixel_b0; else if (row1) p1 = pixel_b1; else p1 = pixel_b2;} - if (row0) p2 = pixel_b0; else if (row1) p2 = pixel_b1; else p2 = pixel_b2; - // p3, p4, p5 - if (shift_frame) {p3 = p4;} else if (pin_col==0) {if (row0) p3 = pixel_b1; else if (row1) p3 = pixel_b2; else p3 = pixel_b0;} - if (shift_frame) {p4 = p5;} else if (pin_col==1) {if (row0) p4 = pixel_b1; else if (row1) p4 = pixel_b2; else p4 = pixel_b0;} - if (row0) p5 = pixel_b1; else if (row1) p5 = pixel_b2; else p5 = pixel_b0; - // p6, p7, p8 - if (shift_frame) {p6 = p7;} else if (pin_col==0) {if (row0) p6 = pixel_b2; else if (row1) p6 = pixel_b0; else p6 = pixel_b1;} - if (shift_frame) {p7 = p8;} else if (pin_col==1) {if (row0) p7 = pixel_b2; else if (row1) p7 = pixel_b0; else p7 = pixel_b1;} - if (row0) p8 = pixel_b2; else if (row1) p8 = pixel_b0; else p8 = pixel_b1; - - if (send_frame) { - frame.pixel[0] = p0; frame.pixel[1] = p1; frame.pixel[2] = p2; - frame.pixel[3] = p3; frame.pixel[4] = p4; frame.pixel[5] = p5; - frame.pixel[6] = p6; frame.pixel[7] = p7; frame.pixel[8] = p8; - out << frame; - #ifdef DEBUG_VERBOSE - printf("cvt_%d: frame sent:\n", id); - for (int cpi=0; cpi &in, hls::stream &k_in, hls::stream &out, int id) { - -#ifdef DEBUG_VERBOSE - printf("mul_%d: start\n", id); -#endif - - // 
first we read the kernels - frame_t kernel[CPI]; - DO_PRAGMA(HLS ARRAY_PARTITION variable=kernel dim=0) - frame_t data_in; - -#ifdef LOAD_MODEL - - mul_o_iter_loop: - for (int o_iter = 0; o_iter < O_ITER; o_iter++){ - mul_i_iter_loop: - for(int i_iter = 0; i_iter < I_ITER; i_iter++){ - //we load the kernels into pack of frames - loop_mul_kernels_load_cpo: - for (int cpi=0; cpi %6.4f\n", cpo, float(sum[cpo])); - #endif - p_out.pixel[cpo] = sum[cpo]; - sum[cpo] = 0.f; - } - out << p_out; - } - } //i_iter -} //o_iter - -#endif - - -#ifdef DEBUG_VERBOSE - printf("mul_%d: end\n", id); -#endif -} - -// ------------------------------------------------------------------------------- -// add: This function performs the addition of all subpixels for the same channel. -// It adds also the corresponding bias. -// LOOP FLOW -// for o_iter 0 .. n -// receive bias[b..b+3] -// init buff_o_channels with bias -// for i_iter 0 .. n -// receive data[do..d+3] -// buff_o_channels = buff_o_channels + data -// -// for num_iterations -// for CPO -// send data to write module -// -// Arguments: -// in: input streams data -// b_in: input stream bias -// out: output stream -// -static void add(int H, int W, int I_ITER, int O_ITER, hls::stream &in, hls::stream &b_in, hls::stream &out) { - -#ifdef DEBUG_VERBOSE - printf("add: start\n"); -#endif - - data_type bias[CPO]; - - //number of iterations by CPI || CPO channels - int num_iterations = W * H; - - //Buffer for all data and CPO channels - data_type buff_o_channels[CPO][WHMAX]; - DO_PRAGMA(HLS ARRAY_PARTITION variable=buff_o_channels dim=0 block factor=CPO) - - //We read Bias in O_iter packs of CPO size - add_o_iter_loop: - for (int o_iter = 0; o_iter &in, hls::stream &k_in, hls::stream &b_in, hls::stream &out) { - - // streams - static hls::stream str_pad_cvt; // padding->cvt - static hls::stream str_cvt_mul; // cvt->mul - static hls::stream str_mul_add; // mul->add - - // topology - #pragma HLS dataflow - padding(H, W, I_ITER * O_ITER, in, str_pad_cvt); // padding - cvt(H, W, I_ITER, O_ITER, str_pad_cvt, str_cvt_mul, 0); // cvt - mul(H, W, I_ITER, O_ITER, str_cvt_mul, k_in, str_mul_add, 0); // mul - add(H, W, I_ITER, O_ITER, str_mul_add, b_in, out); // add -} - -void k_conv2D_K3x3_S1x1_P1x1_BS1_ap(pixel_in_t *ptr_data, int H, int W, int I, data_type *ptr_kernel, data_type *ptr_bias, pixel_out_t *ptr_out, int O) { - - #pragma HLS INTERFACE s_axilite port=W bundle=control - #pragma HLS INTERFACE s_axilite port=H bundle=control - #pragma HLS INTERFACE s_axilite port=I bundle=control - #pragma HLS INTERFACE s_axilite port=O bundle=control - #pragma HLS INTERFACE m_axi port=ptr_data offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 - #pragma HLS INTERFACE m_axi port=ptr_kernel offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 - #pragma HLS INTERFACE m_axi port=ptr_bias offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 - #pragma HLS INTERFACE m_axi port=ptr_out offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 - #pragma HLS INTERFACE s_axilite port=return bundle=control - - // ptr_data struct to be packed as a single element vector (to improve memory read) - // the compiler will do full structure access (all elements of structure) - #pragma HLS data_pack variable = ptr_data - #pragma HLS data_pack variable = ptr_out - - int I_ITER = I/CPI; - int O_ITER = O/CPO; - - // input and output streams - static hls::stream out_read; - static hls::stream out_read_kernel; 
- static hls::stream out_read_bias; - static hls::stream out_conv; - - // stream sizes - #pragma HLS STREAM variable = out_read depth = 32 - #pragma HLS STREAM variable = out_read_kernel depth = 32 - #pragma HLS STREAM variable = out_read_bias depth = 32 - #pragma HLS STREAM variable = out_conv depth = 32 - #pragma HLS STREAM variable = out_relu depth = 32 - - #pragma HLS dataflow - read_input(H, W, I, O, I_ITER, O_ITER, ptr_data, ptr_kernel, ptr_bias, out_read_kernel, out_read_bias, out_read); - conv(H, W, I, O, I_ITER, O_ITER, out_read, out_read_kernel, out_read_bias, out_conv); - write_output(H, W, O_ITER, ptr_out, out_conv); -} - -} // end extern "C" +//KERNEL_CONV2D_4.cpp +//Modified by: Jorge García Martinez +//Date: 17/09/2020 +//Description: Based on kenel_conv2d_3.cpp. The goal of this code is to perform convolutions with a large number of inputs +//and outputs.For this, we use iteratively a limited number of input and output channels in the kernel. +//In all functions are used two loops for output and input iterations. In add function is added a buffer which stores +//the data that It should be written into the memory. + + + +#include +#include +#include + +#include + +// #define DEBUG_VERBOSE + +extern "C" { + +// #define data_type ap_fixed<8,4,AP_TRN,AP_WRAP> +#define data_type float + +// To allow using defines inside Xilinx pragmas +#define PRAGMA_SUB(x) _Pragma (#x) +#define DO_PRAGMA(x) PRAGMA_SUB(x) + +// Fixed parameters (optimized at compilation/synthesis time) +#define KW 3 // kernel width +#define KH 3 // kernel height +#define CPI 4 // channels per input port +#define CPO 4 // channels per output port + +#define WMAX 256 +#define WHMAX 256*256 + +#define LOAD_MODEL +#define READ_MODEL +#define READ_INPUT +#define WRITE_OUTPUT + +// pixel_in +struct pixel_in_t { + data_type pixel[CPI]; +}; + +struct pixel_out_t { + data_type pixel[CPO]; +}; + +// frames struct +struct frame_t { + pixel_in_t pixel[9]; +}; + +// --------------------------------------------------------------------------------------- +// read_bias. Reading bias from memory and sending to add module. 
+// +// Arguments: +// b_ptr : pointer to bias +// b_out : output streams +// +static void read_bias(int O_ITER, data_type *b_ptr, hls::stream &b_out){ + +#ifdef DEBUG_VERBOSE + printf("read_bias: start\n"); +#endif + pixel_out_t bias; + #pragma HLS ARRAY_PARTITION variable=bias dim=0 + + for (int o_iter = 0; o_iter < O_ITER; o_iter++){ + // printf("o_iter = %d \n ", o_iter); + //Sending bias to add in pack of CPO bias + read_loop_bias_load: + for (int b=0; b &k_out){ + +#ifdef DEBUG_VERBOSE + printf("read_kernel: start\n"); +#endif + + frame_t frame_k; + #pragma HLS ARRAY_PARTITION variable=frame_k dim=0 + //Sending kernels to mul in pack of CPI*CPO kernels + int kernel_size_cpo = CPO*KH*KW; //kernels size each i_iter + int i_offset = I_ITER * CPI * CPO * KH * KW; //addr_k offset for each i_iter + int cpo = 0; //index for kernel size + int kx = 0; //index for channels + + read_input_o_iter_loop: + for (int o_iter = 0; o_iter < O_ITER; o_iter++){ + read_input_i_iter_loop: + for (int i_iter = 0; i_iter < I_ITER; i_iter++){ + + read_loop_kernel_load_ext: + for(int i = 0; i < CPI; i++){ + // printf("i = %d -- kernel_size_cpo = %d \n", i, kernel_size_cpo); + read_loop_kernel_load_int: + for (int j = 0; j < kernel_size_cpo; j++) { + int addr_k = j + i*kernel_size_cpo*I_ITER + i_iter*i_offset + o_iter*kernel_size_cpo; + data_type v = k_ptr[addr_k]; + frame_k.pixel[kx].pixel[cpo] = v; + + #ifdef DEBUG_VERBOSE + printf("[%d]:", addr_k); + printf("%6.4f ", v); + #endif + + kx = kx + 1; + if (kx == 9) { + // printf("\n"); + kx = 0; + cpo = cpo + 1; + if (cpo == CPO) { + cpo = 0; + k_out << frame_k; + } + } + } + } + } //i_iter + } //o_iter + +#ifdef DEBUG_VERBOSE + printf("read_kernel: end\n"); +#endif + +} + +// -------------------------------------------------------------------------------------- +// read_data: Reading data from memory and sending to conv module +// Arguments: +// ptr : Pointer to input data (in) +// out : data output stream (out) +// +static void read_data(int H, int W, int I_ITER, int O_ITER, pixel_in_t *ptr, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("read_data: start\n"); +#endif + + read_input_o_iter_loop: + for (int o_iter = 0; o_iter < O_ITER; o_iter++){ + //Sending data to padding in pack of CPI channels + read_loop_data_load_i: + for (int r=0; r &in, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("padding: start\n"); +#endif + +//we init zero only first time + +pixel_in_t data; +DO_PRAGMA(HLS ARRAY_PARTITION variable=data complete) + +pixel_in_t zero; +DO_PRAGMA(HLS ARRAY_PARTITION variable=zero complete) + +for (int cpi=0; cpi &in, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("relu: start\n"); +#endif + + int data_size = W * H * O; + for (int i=0; i < data_size; i++) { + #pragma HLS PIPELINE II=1 + data_type data = in.read(); + if (data < 0) data = 0.f; + out << data; + } + +#ifdef DEBUG_VERBOSE + printf("relu: end\n"); +#endif +} + +// -------------------------------------------------------------------------------- +// write_output: Writes data comming from one stream into memory +// LOOP FLOW: +// for o_iter 0 .. n +// write data[do .. 
do+3] +// +// d = d + 4 +// +// Arguments: +// ptr: memory address pointer +// in: input stream +// +static void write_output(int H, int W, int O_ITER, pixel_out_t *ptr, hls::stream &in) { + +#ifdef DEBUG_VERBOSE + printf("write_output: start\n"); +#endif + + + write_output_data_size_loop: + for (int i=0; i &in, hls::stream &out, int id) { + +#ifdef DEBUG_VERBOSE + printf("cvt_%d: start\n", id); +#endif + +cvt_o_iter_loop: +for (int o_iter = 0; o_iter < O_ITER; o_iter++){ + cvt_i_iter_loop: + for(int i_iter = 0; i_iter < I_ITER; i_iter++){ + + // Now we process the input data and convert the data into frames + + // buffers (keep three rows) + pixel_in_t buffer0[WMAX+2]; + pixel_in_t buffer1[WMAX+2]; + pixel_in_t buffer2[WMAX+2]; + DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer0 cyclic dim=1 factor=CPI) + DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer1 cyclic dim=1 factor=CPI) + DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer2 cyclic dim=1 factor=CPI) + + // frame + frame_t frame; + DO_PRAGMA(HLS ARRAY_PARTITION variable=frame) + + // We loop for every incoming pixel + cvt_loop_1: + for (int pin_row=0; pin_row < H+2; pin_row++) { + cvt_loop_2: + for (int pin_col=0; pin_col < W+2; pin_col++) { + // get the pixel + pixel_in_t pixel; + pixel = in.read(); + // row buffer write (in which buffer row we write the pixel) + int row0_buffer_write = (pin_row % 3) == 0; + int row1_buffer_write = (pin_row % 3) == 1; + // first row buffer + int row0 = (pin_row <= 2) | ((pin_row % 3) == 2); + int row1 = !row0 & ((pin_row % 3) == 0); + // we write the pixel into the buffer + if (row0_buffer_write) buffer0[pin_col] = pixel; else if (row1_buffer_write) buffer1[pin_col] = pixel; else buffer2[pin_col] = pixel; + // build the frame + pixel_in_t p0, p1, p2, p3, p4, p5, p6, p7, p8; + int shift_frame = (pin_row>1) & (pin_col > 2); + int send_frame = (pin_row>1) & (pin_col > 1); + pixel_in_t pixel_b0, pixel_b1, pixel_b2; + pixel_b0 = buffer0[pin_col]; + pixel_b1 = buffer1[pin_col]; + pixel_b2 = buffer2[pin_col]; + // p0, p1, p2 + if (shift_frame) {p0 = p1;} else if (pin_col==0) {if (row0) p0 = pixel_b0; else if (row1) p0 = pixel_b1; else p0 = pixel_b2;} + if (shift_frame) {p1 = p2;} else if (pin_col==1) {if (row0) p1 = pixel_b0; else if (row1) p1 = pixel_b1; else p1 = pixel_b2;} + if (row0) p2 = pixel_b0; else if (row1) p2 = pixel_b1; else p2 = pixel_b2; + // p3, p4, p5 + if (shift_frame) {p3 = p4;} else if (pin_col==0) {if (row0) p3 = pixel_b1; else if (row1) p3 = pixel_b2; else p3 = pixel_b0;} + if (shift_frame) {p4 = p5;} else if (pin_col==1) {if (row0) p4 = pixel_b1; else if (row1) p4 = pixel_b2; else p4 = pixel_b0;} + if (row0) p5 = pixel_b1; else if (row1) p5 = pixel_b2; else p5 = pixel_b0; + // p6, p7, p8 + if (shift_frame) {p6 = p7;} else if (pin_col==0) {if (row0) p6 = pixel_b2; else if (row1) p6 = pixel_b0; else p6 = pixel_b1;} + if (shift_frame) {p7 = p8;} else if (pin_col==1) {if (row0) p7 = pixel_b2; else if (row1) p7 = pixel_b0; else p7 = pixel_b1;} + if (row0) p8 = pixel_b2; else if (row1) p8 = pixel_b0; else p8 = pixel_b1; + + if (send_frame) { + frame.pixel[0] = p0; frame.pixel[1] = p1; frame.pixel[2] = p2; + frame.pixel[3] = p3; frame.pixel[4] = p4; frame.pixel[5] = p5; + frame.pixel[6] = p6; frame.pixel[7] = p7; frame.pixel[8] = p8; + out << frame; + #ifdef DEBUG_VERBOSE + printf("cvt_%d: frame sent:\n", id); + for (int cpi=0; cpi &in, hls::stream &k_in, hls::stream &out, int id) { + +#ifdef DEBUG_VERBOSE + printf("mul_%d: start\n", id); +#endif + + // first we read the kernels + frame_t kernel[CPI]; 
+ DO_PRAGMA(HLS ARRAY_PARTITION variable=kernel dim=0) + frame_t data_in; + +#ifdef LOAD_MODEL + + mul_o_iter_loop: + for (int o_iter = 0; o_iter < O_ITER; o_iter++){ + mul_i_iter_loop: + for(int i_iter = 0; i_iter < I_ITER; i_iter++){ + //we load the kernels into pack of frames + loop_mul_kernels_load_cpo: + for (int cpi=0; cpi %6.4f\n", cpo, float(sum[cpo])); + #endif + p_out.pixel[cpo] = sum[cpo]; + sum[cpo] = 0.f; + } + out << p_out; + } + } //i_iter +} //o_iter + +#endif + + +#ifdef DEBUG_VERBOSE + printf("mul_%d: end\n", id); +#endif +} + +// ------------------------------------------------------------------------------- +// add: This function performs the addition of all subpixels for the same channel. +// It adds also the corresponding bias. +// LOOP FLOW +// for o_iter 0 .. n +// receive bias[b..b+3] +// init buff_o_channels with bias +// for i_iter 0 .. n +// receive data[do..d+3] +// buff_o_channels = buff_o_channels + data +// +// for num_iterations +// for CPO +// send data to write module +// +// Arguments: +// in: input streams data +// b_in: input stream bias +// out: output stream +// +static void add(int H, int W, int I_ITER, int O_ITER, hls::stream &in, hls::stream &b_in, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("add: start\n"); +#endif + + data_type bias[CPO]; + + //number of iterations by CPI || CPO channels + int num_iterations = W * H; + + //Buffer for all data and CPO channels + data_type buff_o_channels[CPO][WHMAX]; + DO_PRAGMA(HLS ARRAY_PARTITION variable=buff_o_channels dim=0 block factor=CPO) + + //We read Bias in O_iter packs of CPO size + add_o_iter_loop: + for (int o_iter = 0; o_iter &in, hls::stream &k_in, hls::stream &b_in, hls::stream &out) { + + // streams + static hls::stream str_pad_cvt; // padding->cvt + static hls::stream str_cvt_mul; // cvt->mul + static hls::stream str_mul_add; // mul->add + + int ITER = O_ITER*I_ITER; + // topology + #pragma HLS dataflow + padding(H, W, ITER, in, str_pad_cvt); // padding + cvt(H, W, I_ITER, O_ITER, str_pad_cvt, str_cvt_mul, 0); // cvt + mul(H, W, I_ITER, O_ITER, str_cvt_mul, k_in, str_mul_add, 0); // mul + add(H, W, I_ITER, O_ITER, str_mul_add, b_in, out); // add +} + +void k_conv2D_K3x3_S1x1_P1x1_BS1_ap(pixel_in_t *ptr_data, int H, int W, int I, data_type *ptr_kernel, data_type *ptr_bias, pixel_out_t *ptr_out, int O) { + + #pragma HLS INTERFACE s_axilite port=W bundle=control + #pragma HLS INTERFACE s_axilite port=H bundle=control + #pragma HLS INTERFACE s_axilite port=I bundle=control + #pragma HLS INTERFACE s_axilite port=O bundle=control + #pragma HLS INTERFACE m_axi port=ptr_data offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_kernel offset=slave bundle=gmem1 max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_bias offset=slave bundle=gmem2 max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_out offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE s_axilite port=return bundle=control + + // ptr_data struct to be packed as a single element vector (to improve memory read) + // the compiler will do full structure access (all elements of structure) + #pragma HLS data_pack variable = ptr_data + #pragma HLS data_pack variable = ptr_out + + int I_ITER = I/CPI; + int O_ITER = O/CPO; + + // input and output streams + static hls::stream out_read_data; + static hls::stream out_read_kernel; + static hls::stream 
out_read_bias; + static hls::stream out_conv; + + // stream sizes + #pragma HLS STREAM variable = out_read_data depth = 32 + #pragma HLS STREAM variable = out_read_kernel depth = 32 + #pragma HLS STREAM variable = out_read_bias depth = 32 + #pragma HLS STREAM variable = out_conv depth = 32 + // #pragma HLS STREAM variable = out_relu depth = 32 + + #pragma HLS dataflow + read_data(H, W, I_ITER, O_ITER, ptr_data, out_read_data); + read_bias(O_ITER, ptr_bias, out_read_bias); + read_kernel(O_ITER, I_ITER, ptr_kernel, out_read_kernel); + conv(H, W, I, O, I_ITER, O_ITER, out_read_data, out_read_kernel, out_read_bias, out_conv); + write_output(H, W, O_ITER, ptr_out, out_conv); +} + +} // end extern "C" From 089a3050645266bbd5240494a2b84d9d06e37d0a Mon Sep 17 00:00:00 2001 From: Jose Flich Date: Wed, 21 Oct 2020 07:33:32 +0000 Subject: [PATCH 04/15] improved conv --- .../kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp | 122 ++++++++---- .../src/test_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp | 184 +++++++++++++----- 2 files changed, 212 insertions(+), 94 deletions(-) diff --git a/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp b/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp index 28a2441d9..3e91eebb2 100644 --- a/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp +++ b/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp @@ -14,7 +14,7 @@ #include -// #define DEBUG_VERBOSE +#define DEBUG_VERBOSE extern "C" { @@ -28,8 +28,8 @@ extern "C" { // Fixed parameters (optimized at compilation/synthesis time) #define KW 3 // kernel width #define KH 3 // kernel height -#define CPI 4 // channels per input port -#define CPO 4 // channels per output port +#define CPI 2 // channels per input port +#define CPO 2 // channels per output port #define WMAX 256 #define WHMAX 256*256 @@ -68,7 +68,27 @@ static void read_bias(int O_ITER, data_type *b_ptr, hls::stream &b_ pixel_out_t bias; #pragma HLS ARRAY_PARTITION variable=bias dim=0 - for (int o_iter = 0; o_iter < O_ITER; o_iter++){ + // we read the bias + int size = O_ITER * CPO; + int cpo = 0; + for (int i=0; i &b_ bias.pixel[b] = v; } b_out << bias; - } + }*/ #ifdef DEBUG_VERBOSE printf("read_bias: end\n"); #endif @@ -97,8 +117,38 @@ static void read_kernel(int O_ITER, int I_ITER, data_type *k_ptr, hls::stream &in, #endif // first we read the kernels - frame_t kernel[CPI]; + frame_t kernel[CPO]; DO_PRAGMA(HLS ARRAY_PARTITION variable=kernel dim=0) frame_t data_in; @@ -419,19 +470,19 @@ static void mul(int H, int W, int I_ITER, int O_ITER, hls::stream &in, for(int i_iter = 0; i_iter < I_ITER; i_iter++){ //we load the kernels into pack of frames loop_mul_kernels_load_cpo: - for (int cpi=0; cpi &in, loop_mul_cpo: for (int cpo=0; cpo & } //i_iter #ifdef DEBUG_VERBOSE - printf("CH %d: ", o_iter*CPO); - for (int it=0; it /* printf, scanf, NULL */ #include /* malloc, free, rand */ @@ -23,27 +58,31 @@ cl::CommandQueue q; cl::Program program; -#define W 256 //256 -#define H 256 //256 -#define C 16 //I -#define COUT 16 //O +#define W 4 // 256 //256 +#define H 4 // 256 //256 +#define GI 2 +#define CPI 2 // 16 +#define I GI * CPI +#define GO 2 // 16 +#define CPO 2 +#define O GO * CPO #define KW 3 #define KH 3 // buffers -data_type data_in[ W * H * C ] __attribute__ ((__aligned__(16))); -data_type kernel [ KW * KH * C * COUT] __attribute__ ((__aligned__(16))); -data_type bias [ COUT ] __attribute__ ((__aligned__(16))); -data_type out [ W * H * COUT ] __attribute__ ((__aligned__(16))); -data_type out_cpu[ W * H * COUT ] __attribute__ ((__aligned__(16))); +data_type data_in[ GI * W * 
H * CPI ] __attribute__ ((__aligned__(16))); +data_type kernel [ GO * GI * CPO * CPI * KW * KH ] __attribute__ ((__aligned__(16))); +data_type bias [ O ] __attribute__ ((__aligned__(16))); +data_type out [ GO * W * H * CPO ] __attribute__ ((__aligned__(16))); +data_type out_cpu[ GO * W * H * CPO ] __attribute__ ((__aligned__(16))); void cpu_conv2d() { - int size_out = W * H * COUT; + int size_out = GO * W * H * CPO; for (int i=0; i 0.001) { printf("Results mismatch at cout %d h %d w %d: %6.4f %6.4f (diff %6.4f)\n", cout, h, w, float(out_cpu[addr_o]), float(out[addr_o]), fabs(float(out_cpu[addr_o]-out[addr_o]))); error = 1; @@ -231,7 +309,7 @@ int main(int argc, char **argv) { return EXIT_FAILURE; } - printf("Test CONV: [WxHxC] = [%dx%dx%d] -> [WxHxC] = [%dx%dx%d] (kernel [%dx%d], stride [1x1], padding [1x1])\n", W, H, C, W, H, COUT, KW, KH); + printf("Test CONV: [GIxWxHxCPI] = [%dx%dx%dx%d] -> [GOxWxHxCPO] = [%d%dx%dx%d] (kernel [%dx%d], stride [1x1], padding [1x1])\n", GI, W, H, CPI, GO, W, H, CPO, KW, KH); std::string binaryFile = argv[1]; cl_int err; @@ -254,10 +332,10 @@ int main(int argc, char **argv) { OCL_CHECK(err, kernel_conv2d_2 = cl::Kernel(program,"k_conv2D_K3x3_S1x1_P1x1_BS1_ap", &err)); std::cout << "Kernel sucessfully created" << std::endl ; - size_t size_data_in_bytes = W*H*C*sizeof(data_type); - size_t size_output_in_bytes = W*H*COUT * sizeof(data_type); - size_t size_kernel_in_bytes = KW * KH * C * COUT * sizeof(data_type); - size_t size_bias_in_bytes = COUT * sizeof(data_type); + size_t size_data_in_bytes = W * H * I * sizeof(data_type); + size_t size_output_in_bytes = W * H * O * sizeof(data_type); + size_t size_kernel_in_bytes = KW * KH * I * O * sizeof(data_type); + size_t size_bias_in_bytes = O * sizeof(data_type); // Allocate memory on the host and fill with random data. //----------------------------- @@ -268,25 +346,27 @@ int main(int argc, char **argv) { std::cout << "Filling buffer with useful data" << std::endl ; int addr = 0; - for (int h=0; h Date: Fri, 23 Oct 2020 16:00:45 +0000 Subject: [PATCH 05/15] Conv2d: o_iter in host --- .../kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp | 667 ++++++++++++++++++ .../src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp | 490 +++++++++++++ 2 files changed, 1157 insertions(+) create mode 100644 fpga_kernels/kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp create mode 100644 fpga_kernels/test_fpga/src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp diff --git a/fpga_kernels/kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp b/fpga_kernels/kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp new file mode 100644 index 000000000..fbfc6a162 --- /dev/null +++ b/fpga_kernels/kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp @@ -0,0 +1,667 @@ +//KERNEL_CONV2D_4.cpp +//Modified by: Jorge García Martinez +//Date: 17/09/2020 +//Description: Based on kenel_conv2d_3.cpp. The goal of this code is to perform convolutions with a large number of inputs +//and outputs.For this, we use iteratively a limited number of input and output channels in the kernel. +//In all functions are used two loops for output and input iterations. In add function is added a buffer which stores +//the data that It should be written into the memory. 
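
The header comment above only sketches the scheme in words, so here is a plain-C++ reference of the same channel-group tiling, written against the layouts documented in the accompanying test (data_in: GI x H x W x CPI, kernel: GO x GI x CPO x CPI x KH x KW, bias: O, data_out: GO x H x W x CPO, with GI = I/CPI, GO = O/CPO, 3x3 kernel, stride 1x1, padding 1x1). This is an illustrative host-side sketch, not the HLS kernel: the function name, the use of std::vector and the exact index arithmetic are assumptions drawn from those documented layouts.

// Reference 3x3 convolution over channel groups (sketch, not part of the patch).
#include <vector>

constexpr int CPI = 4, CPO = 4, KH = 3, KW = 3;   // values used by this kernel

void reference_conv2d(int H, int W, int I, int O,
                      const std::vector<float> &data_in,   // GI x H x W x CPI
                      const std::vector<float> &kernel,    // GO x GI x CPO x CPI x KH x KW
                      const std::vector<float> &bias,      // O
                      std::vector<float> &data_out) {      // GO x H x W x CPO
  int GI = I / CPI, GO = O / CPO;
  for (int go = 0; go < GO; go++)          // group of CPO output channels
    for (int o = 0; o < CPO; o++)
      for (int h = 0; h < H; h++)
        for (int w = 0; w < W; w++) {
          float sum = bias[go * CPO + o];
          for (int gi = 0; gi < GI; gi++)  // group of CPI input channels
            for (int i = 0; i < CPI; i++)
              for (int kh = 0; kh < KH; kh++)
                for (int kw = 0; kw < KW; kw++) {
                  int ih = h + kh - 1, iw = w + kw - 1;     // stride 1x1, padding 1x1
                  if (ih < 0 || ih >= H || iw < 0 || iw >= W) continue;
                  int a_in = (gi * H * W * CPI) + (ih * W * CPI) + (iw * CPI) + i;
                  int a_k  = (((((go * GI + gi) * CPO + o) * CPI + i) * KH + kh) * KW) + kw;
                  sum += data_in[a_in] * kernel[a_k];
                }
          data_out[(go * H * W * CPO) + (h * W * CPO) + (w * CPO) + o] = sum;
        }
}

The HLS kernel below produces the same result but streams the work: data is read in pixel_in_t packs of CPI values, padded, turned into 3x3 frames, multiplied against the kernel frames and accumulated per group of CPO outputs.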
+ + + +#include +#include +#include + +#include + +// #define DEBUG_VERBOSE + +extern "C" { + +// #define data_type ap_fixed<8,4,AP_TRN,AP_WRAP> +#define data_type float + +// To allow using defines inside Xilinx pragmas +#define PRAGMA_SUB(x) _Pragma (#x) +#define DO_PRAGMA(x) PRAGMA_SUB(x) + +// Fixed parameters (optimized at compilation/synthesis time) +#define KW 3 // kernel width +#define KH 3 // kernel height +#define CPI 4 // channels per input port +#define CPO 4 // channels per output port + +#define WMAX 256 +#define WHMAX 256*256 + +#define LOAD_MODEL +#define READ_MODEL +#define READ_INPUT +#define WRITE_OUTPUT + +// pixel_in +struct pixel_in_t { + data_type pixel[CPI]; +}; + +struct pixel_out_t { + data_type pixel[CPO]; +}; + +// frames struct +struct frame_t { + pixel_in_t pixel[9]; +}; + +// --------------------------------------------------------------------------------------- +// read_bias. Reading bias from memory and sending to add module. +// +// Arguments: +// b_ptr : pointer to bias +// b_out : output streams +// +static void read_bias(int offset_bias, data_type *b_ptr, hls::stream &b_out){ + +#ifdef DEBUG_VERBOSE + printf("read_bias: start\n"); +#endif + pixel_out_t bias; + #pragma HLS ARRAY_PARTITION variable=bias dim=0 + + // we read the bias + for (int i=0; i &k_out){ + +#ifdef DEBUG_VERBOSE + printf("read_kernel: start\n"); +#endif + + // we read all the kernels and send it through the stream + frame_t frame_k; + #pragma HLS ARRAY_PARTITION variable=frame_k dim=0 + int cpo = 0; + int p = 0; + + int size = KW * KH * CPO * I_ITER * CPI; + read_kernel_loop: + for (int i=0; i &out) { + +#ifdef DEBUG_VERBOSE + printf("read_data: start\n"); +#endif + + + read_loop_data_load_i: + for (int r=0; r &in, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("padding: start\n"); +#endif + +//we init zero only first time + +pixel_in_t data; +DO_PRAGMA(HLS ARRAY_PARTITION variable=data complete) + +pixel_in_t zero; +DO_PRAGMA(HLS ARRAY_PARTITION variable=zero complete) + +for (int cpi=0; cpi &in, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("relu: start\n"); +#endif + + int data_size = W * H * O; + for (int i=0; i < data_size; i++) { + #pragma HLS PIPELINE II=1 + data_type data = in.read(); + if (data < 0) data = 0.f; + out << data; + } + +#ifdef DEBUG_VERBOSE + printf("relu: end\n"); +#endif +} + +// -------------------------------------------------------------------------------- +// write_output: Writes data comming from one stream into memory +// LOOP FLOW: +// for o_iter 0 .. n +// write data[do .. 
do+3] +// +// d = d + 4 +// +// Arguments: +// ptr: memory address pointer +// in: input stream +// +static void write_output(int H, int W, int offset_data_out, pixel_out_t *ptr, hls::stream &in) { + +#ifdef DEBUG_VERBOSE + printf("write_output: start\n"); +#endif + + + write_output_data_size_loop: + for (int i=0; i &in, hls::stream &out, int id) { + +#ifdef DEBUG_VERBOSE + printf("cvt_%d: start\n", id); +#endif + + + cvt_i_iter_loop: + for(int i_iter = 0; i_iter < I_ITER; i_iter++){ + + // Now we process the input data and convert the data into frames + + // buffers (keep three rows) + pixel_in_t buffer0[WMAX+2]; + pixel_in_t buffer1[WMAX+2]; + pixel_in_t buffer2[WMAX+2]; + DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer0 cyclic dim=1 factor=CPI) + DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer1 cyclic dim=1 factor=CPI) + DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer2 cyclic dim=1 factor=CPI) + + // frame + frame_t frame; + DO_PRAGMA(HLS ARRAY_PARTITION variable=frame) + + // We loop for every incoming pixel + cvt_loop_1: + for (int pin_row=0; pin_row < H+2; pin_row++) { + cvt_loop_2: + for (int pin_col=0; pin_col < W+2; pin_col++) { + // get the pixel + pixel_in_t pixel; + pixel = in.read(); + // row buffer write (in which buffer row we write the pixel) + int row0_buffer_write = (pin_row % 3) == 0; + int row1_buffer_write = (pin_row % 3) == 1; + // first row buffer + int row0 = (pin_row <= 2) | ((pin_row % 3) == 2); + int row1 = !row0 & ((pin_row % 3) == 0); + // we write the pixel into the buffer + if (row0_buffer_write) buffer0[pin_col] = pixel; else if (row1_buffer_write) buffer1[pin_col] = pixel; else buffer2[pin_col] = pixel; + // build the frame + pixel_in_t p0, p1, p2, p3, p4, p5, p6, p7, p8; + int shift_frame = (pin_row>1) & (pin_col > 2); + int send_frame = (pin_row>1) & (pin_col > 1); + pixel_in_t pixel_b0, pixel_b1, pixel_b2; + pixel_b0 = buffer0[pin_col]; + pixel_b1 = buffer1[pin_col]; + pixel_b2 = buffer2[pin_col]; + // p0, p1, p2 + if (shift_frame) {p0 = p1;} else if (pin_col==0) {if (row0) p0 = pixel_b0; else if (row1) p0 = pixel_b1; else p0 = pixel_b2;} + if (shift_frame) {p1 = p2;} else if (pin_col==1) {if (row0) p1 = pixel_b0; else if (row1) p1 = pixel_b1; else p1 = pixel_b2;} + if (row0) p2 = pixel_b0; else if (row1) p2 = pixel_b1; else p2 = pixel_b2; + // p3, p4, p5 + if (shift_frame) {p3 = p4;} else if (pin_col==0) {if (row0) p3 = pixel_b1; else if (row1) p3 = pixel_b2; else p3 = pixel_b0;} + if (shift_frame) {p4 = p5;} else if (pin_col==1) {if (row0) p4 = pixel_b1; else if (row1) p4 = pixel_b2; else p4 = pixel_b0;} + if (row0) p5 = pixel_b1; else if (row1) p5 = pixel_b2; else p5 = pixel_b0; + // p6, p7, p8 + if (shift_frame) {p6 = p7;} else if (pin_col==0) {if (row0) p6 = pixel_b2; else if (row1) p6 = pixel_b0; else p6 = pixel_b1;} + if (shift_frame) {p7 = p8;} else if (pin_col==1) {if (row0) p7 = pixel_b2; else if (row1) p7 = pixel_b0; else p7 = pixel_b1;} + if (row0) p8 = pixel_b2; else if (row1) p8 = pixel_b0; else p8 = pixel_b1; + + if (send_frame) { + frame.pixel[0] = p0; frame.pixel[1] = p1; frame.pixel[2] = p2; + frame.pixel[3] = p3; frame.pixel[4] = p4; frame.pixel[5] = p5; + frame.pixel[6] = p6; frame.pixel[7] = p7; frame.pixel[8] = p8; + out << frame; + #ifdef DEBUG_VERBOSE + printf("cvt_%d: frame sent:\n", id); + for (int cpi=0; cpi &in, hls::stream &k_in, hls::stream &out, int id) { + +#ifdef DEBUG_VERBOSE + printf("mul_%d: start\n", id); +#endif + + // first we read the kernels + frame_t kernel[CPO]; + DO_PRAGMA(HLS ARRAY_PARTITION variable=kernel dim=0) + 
frame_t data_in; + +#ifdef LOAD_MODEL + + mul_i_iter_loop: + for(int i_iter = 0; i_iter < I_ITER; i_iter++){ + //we load the kernels into pack of frames + loop_mul_kernels_load_cpo: + for (int cpo=0; cpo %6.4f\n", cpo, float(sum[cpo])); + #endif + p_out.pixel[cpo] = sum[cpo]; + sum[cpo] = 0.f; + } + out << p_out; + } + } //i_iter + +#endif + + +#ifdef DEBUG_VERBOSE + printf("mul_%d: end\n", id); +#endif +} + +// ------------------------------------------------------------------------------- +// add: This function performs the addition of all subpixels for the same channel. +// It adds also the corresponding bias. +// LOOP FLOW +// for o_iter 0 .. n +// receive bias[b..b+3] +// init buff_o_channels with bias +// for i_iter 0 .. n +// receive data[do..d+3] +// buff_o_channels = buff_o_channels + data +// +// for num_iterations +// for CPO +// send data to write module +// +// Arguments: +// in: input streams data +// b_in: input stream bias +// out: output stream +// +static void add(int H, int W, int I_ITER, hls::stream &in, hls::stream &b_in, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("add: start\n"); +#endif + + data_type bias[CPO]; + + //number of iterations by CPI || CPO channels + int num_iterations = W * H; + + //Buffer for all data and CPO channels + data_type buff_o_channels[CPO][WHMAX]; + DO_PRAGMA(HLS ARRAY_PARTITION variable=buff_o_channels dim=0 block factor=CPO) + + //We receive bias in packs of CPO + pixel_out_t p_out; + p_out = b_in.read(); + add_load_bias_loop: + for (int b=0; b &in, hls::stream &k_in, hls::stream &b_in, hls::stream &out) { + + // streams + static hls::stream str_pad_cvt; // padding->cvt + static hls::stream str_cvt_mul; // cvt->mul + static hls::stream str_mul_add; // mul->add + + + // topology + #pragma HLS dataflow + padding(H, W, I_ITER, in, str_pad_cvt); // padding + cvt(H, W, I_ITER, str_pad_cvt, str_cvt_mul, 0); // cvt + mul(H, W, I_ITER, str_cvt_mul, k_in, str_mul_add, 0); // mul + add(H, W, I_ITER, str_mul_add, b_in, out); // add +} + + +void k_cn2D_K3x3_S1x1_P1x1_BS1_ap_2(pixel_in_t *ptr_data, int H, int W, int I, data_type *ptr_kernel, data_type *ptr_bias, pixel_out_t *ptr_out, int O, int offset_bias, int offset_kernel, int offset_data_out) { + + #pragma HLS INTERFACE s_axilite port=W bundle=control + #pragma HLS INTERFACE s_axilite port=H bundle=control + #pragma HLS INTERFACE s_axilite port=I bundle=control + #pragma HLS INTERFACE s_axilite port=O bundle=control + #pragma HLS INTERFACE m_axi port=ptr_data offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_kernel offset=slave bundle=gmem1 max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_bias offset=slave bundle=gmem2 max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_out offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE s_axilite port=offset_bias bundle=control + #pragma HLS INTERFACE s_axilite port=offset_kernel bundle=control + #pragma HLS INTERFACE s_axilite port=offset_data_out bundle=control + #pragma HLS INTERFACE s_axilite port=return bundle=control + + // ptr_data struct to be packed as a single element vector (to improve memory read) + // the compiler will do full structure access (all elements of structure) + #pragma HLS data_pack variable = ptr_data + #pragma HLS data_pack variable = ptr_out + + int I_ITER = I/CPI; + + // input and output streams + static hls::stream 
out_read_data; + static hls::stream out_read_kernel; + static hls::stream out_read_bias; + static hls::stream out_conv; + + // stream sizes + #pragma HLS STREAM variable = out_read_data depth = 32 + #pragma HLS STREAM variable = out_read_kernel depth = 32 + #pragma HLS STREAM variable = out_read_bias depth = 32 + #pragma HLS STREAM variable = out_conv depth = 32 + // #pragma HLS STREAM variable = out_relu depth = 32 + + #pragma HLS dataflow + read_data(H, W, I_ITER, ptr_data, out_read_data); + read_bias(offset_bias, ptr_bias, out_read_bias); + read_kernel(I_ITER, offset_kernel, ptr_kernel, out_read_kernel); + conv(H, W, I_ITER, out_read_data, out_read_kernel, out_read_bias, out_conv); + write_output(H, W, offset_data_out, ptr_out, out_conv); + +} + +} // end extern "C" diff --git a/fpga_kernels/test_fpga/src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp b/fpga_kernels/test_fpga/src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp new file mode 100644 index 000000000..bd7de32a4 --- /dev/null +++ b/fpga_kernels/test_fpga/src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp @@ -0,0 +1,490 @@ +// +// test_conv2D. +// +// Constants: +// +// - CPI +// - CPO +// - KW = 3 +// - KH = 3 +// - PW = 1 +// - PH = 1 +// - SW = 1 +// - SH = 1 +// +// Arguments: +// +// - W +// - H +// - I +// - O +// +// Data formats: +// +// - kernel : GO x GI x CPO x CPI x KH x KW +// - bias : O +// - data_in : GI x H x W x CPI +// - data_out : GO x H x W x CPO +// +// GI = I / CPI +// GO = O / CPO +// +// + + + +#include /* printf, scanf, NULL */ +#include /* malloc, free, rand */ + +#include +#include +#include +#include +#include +#include "xcl2.hpp" + +#include + +using std::vector; + +// data type +//#define data_type ap_fixed<8,4,AP_TRN,AP_WRAP> +#define data_type float + +// CL +cl::Buffer buf; +cl::Context context; +cl::CommandQueue q; +cl::Program program; + + +#define W 256 //256 +#define H 256 //256 +#define GI 2 +#define CPI 4 // 16 +#define I GI * CPI +#define GO 2 // 16 +#define CPO 4 +#define O GO * CPO +#define KW 3 +#define KH 3 + +// buffers +data_type data_in[ GI * W * H * CPI ] __attribute__ ((__aligned__(16))); +data_type kernel [ GO * GI * CPO * CPI * KW * KH ] __attribute__ ((__aligned__(16))); +data_type bias [ O ] __attribute__ ((__aligned__(16))); +data_type out [ GO * W * H * CPO ] __attribute__ ((__aligned__(16))); +data_type out_cpu[ GO * W * H * CPO ] __attribute__ ((__aligned__(16))); + +void cpu_conv2d() { + + int size_out = GO * W * H * CPO; + for (int i=0; i 0.001) { + printf("Results mismatch at cout %d h %d w %d: %6.4f %6.4f (diff %6.4f)\n", cout, h, w, float(out_cpu[addr_o]), float(out[addr_o]), fabs(float(out_cpu[addr_o]-out[addr_o]))); + error = 1; + return; + } + } + } + } + if (!error) printf("results OK!\n"); else { + printf("results differ:\n"); + //cpu_print_out(); + } +} + + +//--------------------------------------------------------------------------------------------------------------------- +//--------------------------------------------------------------------------------------------------------------------- + +// An event callback function that prints the operations performed by the OpenCL +// runtime. 
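
In this variant the O_ITER loop has moved to the host: the kernel is launched once per group of CPO output channels, and the host passes element offsets (not byte offsets) into the bias, kernel and output buffers via offset_bias, offset_kernel and offset_data_out. The sketch below shows how those offsets evolve per launch, mirroring the updates made in the host loop further down; the struct and helper name are hypothetical, and CPI/CPO/KW/KH are fixed to the values used in this kernel.

// Hypothetical helper (not in the patch): per-launch offsets for output group o_iter.
// Offsets are in elements: bias advances CPO values, the kernel advances one
// GI x CPO x CPI x KH x KW slice, and the output advances H*W pixel_out_t entries
// (each pixel_out_t packs CPO values).
#define CPI 4
#define CPO 4
#define KW  3
#define KH  3

struct group_offsets_t { int bias; int kernel; int data_out; };

static group_offsets_t offsets_for_group(int o_iter, int H, int W, int I) {
  int GI = I / CPI;                       // number of input channel groups
  group_offsets_t off;
  off.bias     = o_iter * CPO;
  off.kernel   = o_iter * KW * KH * CPO * GI * CPI;
  off.data_out = o_iter * H * W;
  return off;
}

Launching o_iter = 0 .. O/CPO - 1 with these offsets makes the per-group outputs tile the whole GO x H x W x CPO tensor.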
+void event_cb(cl_event event1, cl_int cmd_status, void *data) { + cl_int err; + cl_command_type command; + cl::Event event(event1, true); + OCL_CHECK(err, err = event.getInfo(CL_EVENT_COMMAND_TYPE, &command)); + cl_int status; + OCL_CHECK(err, + err = event.getInfo(CL_EVENT_COMMAND_EXECUTION_STATUS, &status)); + const char *command_str; + const char *status_str; + switch (command) { + case CL_COMMAND_READ_BUFFER: + command_str = "buffer read"; + break; + case CL_COMMAND_WRITE_BUFFER: + command_str = "buffer write"; + break; + case CL_COMMAND_NDRANGE_KERNEL: + command_str = "kernel"; + break; + case CL_COMMAND_MAP_BUFFER: + command_str = "kernel"; + break; + case CL_COMMAND_COPY_BUFFER: + command_str = "kernel"; + break; + case CL_COMMAND_MIGRATE_MEM_OBJECTS: + command_str = "buffer migrate"; + break; + default: + command_str = "unknown"; + } + switch (status) { + case CL_QUEUED: + status_str = "Queued"; + break; + case CL_SUBMITTED: + status_str = "Submitted"; + break; + case CL_RUNNING: + status_str = "Executing"; + break; + case CL_COMPLETE: + status_str = "Completed"; + break; + } + printf("[%s]: %s %s\n", reinterpret_cast(data), status_str, + command_str); + fflush(stdout); +} + +// Sets the callback for a particular event +void set_callback(cl::Event event, const char *queue_name) { + cl_int err; + OCL_CHECK(err, + err = event.setCallback(CL_COMPLETE, event_cb, (void *)queue_name)); +} + +//--------------------------------------------------------------------------------------------------------------------- + +int main(int argc, char **argv) { + if (argc != 2) { + std::cout << "Usage: " << argv[0] << " " << std::endl; + return EXIT_FAILURE; + } + + printf("Test CONV: [GIxWxHxCPI] = [%dx%dx%dx%d] -> [GOxWxHxCPO] = [%d%dx%dx%d] (kernel [%dx%d], stride [1x1], padding [1x1])\n", GI, W, H, CPI, GO, W, H, CPO, KW, KH); + + std::string binaryFile = argv[1]; + cl_int err; + cl::Kernel kernel_conv2d_2; + + std::cout << "Creating Context..." << std::endl; + auto devices = xcl::get_xil_devices(); + auto device = devices[0]; + OCL_CHECK(err, cl::Context context(device, NULL, NULL, NULL, &err)); + OCL_CHECK(err, cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE, &err)); + + std::string device_name = device.getInfo(); + auto fileBuf = xcl::read_binary_file(binaryFile); + cl::Program::Binaries bins{{fileBuf.data(), fileBuf.size()}}; + devices.resize(1); + + OCL_CHECK(err, cl::Program program(context, devices, bins, NULL, &err)); + std::cout << "Device " << device_name.c_str() << ": program successful!" << std::endl; + + OCL_CHECK(err, kernel_conv2d_2 = cl::Kernel(program,"k_cn2D_K3x3_S1x1_P1x1_BS1_ap_2", &err)); + std::cout << "Kernel sucessfully created" << std::endl ; + + size_t size_data_in_bytes = W * H * I * sizeof(data_type); + size_t size_output_in_bytes = W * H * O * sizeof(data_type); + size_t size_kernel_in_bytes = KW * KH * I * O * sizeof(data_type); + size_t size_bias_in_bytes = O * sizeof(data_type); + // Allocate memory on the host and fill with random data. 
+ + //----------------------------- + // fill data vector with random data + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + std::cout << "Filling buffer with useful data" << std::endl ; + int addr = 0; + for (int gi=0; gi kernel_events(1); + vector read_events(1); + vector write_events(1); + cl::Buffer buffer_a; + cl::Buffer buffer_b; + cl::Buffer buffer_k; + cl::Buffer buffer_bias; + + //----------------------------- + // Allocate Buffer in Global Memory + // Buffers are allocated using CL_MEM_USE_HOST_PTR for efficient memory and + // Device-to-host communication + std::cout << "Creating Buffers..." << std::endl; + + OCL_CHECK(err, buffer_a = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_data_in_bytes, &data_in, &err)); + OCL_CHECK(err, buffer_b = cl::Buffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR , size_output_in_bytes, &out, &err)); + OCL_CHECK(err, buffer_k = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_kernel_in_bytes, &kernel, &err)); + OCL_CHECK(err, buffer_bias = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_bias_in_bytes, &bias, &err)); + + //Arguments for loop + int I_ITER = I/CPI; //GO + int O_ITER = O/CPO; //GO + int offset_bias = 0; //offset to pointer bias each loop + int offset_kernel = 0; //offset to pointer kernel each loop + int offset_data_out = 0; //offset to poiter output data loop + + for (int o_iter = 0; o_iter < O_ITER; o_iter++){ + // set kernel arguments + int arg = 0; + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_a)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, H)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, W)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, I)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_k)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_bias)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_b)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, O)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, offset_bias)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, offset_kernel)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, offset_data_out)); + + // Update the offset poiter to bias, kernels and output data + offset_bias = offset_bias + CPO; + offset_kernel = offset_kernel + KW * KH * CPO * I_ITER * CPI; + offset_data_out = offset_data_out + H * W; + //----------------------------- + // Copy input data to device global memory + // std::cout << "Copying data (Host to Device)..." << std::endl; + // Because we are passing the write_events, it returns an event object + // that identifies this particular command and can be used to query + // or queue a wait for this particular command to complete. + OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_a}, 0 /*0 means from host*/, NULL, &write_events[0])); + set_callback(write_events[0], "ooo_queue"); + + OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_k}, 0 /*0 means from host*/, NULL, &write_events[0])); + set_callback(write_events[0], "ooo_queue"); + + //----------------------------- + // printf("Enqueueing NDRange kernel.\n"); + // This event needs to wait for the write buffer operations to complete + // before executing. We are sending the write_events into its wait list to + // ensure that the order of operations is correct. 
+ // Launch the Kernel + std::vector waitList; + waitList.push_back(write_events[0]); + OCL_CHECK(err, err = q.enqueueNDRangeKernel(kernel_conv2d_2, 0, 1, 1, &waitList, &kernel_events[0])); + set_callback(kernel_events[0], "ooo_queue"); + + // std::cout << "Getting Results (Device to Host)..." << std::endl; + std::vector eventList; + eventList.push_back(kernel_events[0]); + // This operation only needs to wait for the kernel call. + OCL_CHECK(err, err = q.enqueueMigrateMemObjects({buffer_b}, CL_MIGRATE_MEM_OBJECT_HOST, &eventList, &read_events[0])); + set_callback(read_events[0], "ooo_queue"); + OCL_CHECK(err, err = read_events[0].wait()); + + +} + + + + + + + // Wait for all of the OpenCL operations to complete + std::cout << "Waiting..." << std::endl; + OCL_CHECK(err, err = q.flush()); + OCL_CHECK(err, err = q.finish()); + + + std::cout << "computing conv in CPU..." << std::endl; + + // cpu_print_data_in(); + // cpu_print_kernels(); + // cpu_print_bias(); + cpu_conv2d(); + // cpu_print_out(); + + check_result(); + + //----------------------------- + std::cout << "" << std::endl; + std::cout << "All done" << std::endl; + std::cout << "quit now" << std::endl; + + // exit + return 0; +} From 851a0425ec752b60de911e1a4eb99e299c963f2a Mon Sep 17 00:00:00 2001 From: Jose Flich Date: Sun, 25 Oct 2020 07:15:33 +0000 Subject: [PATCH 06/15] stats to cn kernel --- .../kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp | 2 +- .../src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp | 192 ++++++++++++------ 2 files changed, 131 insertions(+), 63 deletions(-) diff --git a/fpga_kernels/kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp b/fpga_kernels/kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp index fbfc6a162..1545e91e4 100644 --- a/fpga_kernels/kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp +++ b/fpga_kernels/kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp @@ -18,7 +18,7 @@ extern "C" { -// #define data_type ap_fixed<8,4,AP_TRN,AP_WRAP> +//#define data_type ap_fixed<8,4,AP_TRN,AP_WRAP> #define data_type float // To allow using defines inside Xilinx pragmas diff --git a/fpga_kernels/test_fpga/src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp b/fpga_kernels/test_fpga/src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp index bd7de32a4..fc1fae149 100644 --- a/fpga_kernels/test_fpga/src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp +++ b/fpga_kernels/test_fpga/src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp @@ -44,6 +44,7 @@ #include "xcl2.hpp" #include +#include using std::vector; @@ -56,25 +57,66 @@ cl::Buffer buf; cl::Context context; cl::CommandQueue q; cl::Program program; +std::string binaryFile; +#define WMAX 256 +#define HMAX 256 +#define IMAX 512 +#define OMAX 512 + +#define CPI 4 +#define CPO 4 + +#define KW 3 +#define KH 3 + +int W; +int H; +int GI; +int GO; +int I; +int O; -#define W 256 //256 -#define H 256 //256 -#define GI 2 -#define CPI 4 // 16 -#define I GI * CPI -#define GO 2 // 16 -#define CPO 4 -#define O GO * CPO -#define KW 3 -#define KH 3 // buffers -data_type data_in[ GI * W * H * CPI ] __attribute__ ((__aligned__(16))); -data_type kernel [ GO * GI * CPO * CPI * KW * KH ] __attribute__ ((__aligned__(16))); -data_type bias [ O ] __attribute__ ((__aligned__(16))); -data_type out [ GO * W * H * CPO ] __attribute__ ((__aligned__(16))); -data_type out_cpu[ GO * W * H * CPO ] __attribute__ ((__aligned__(16))); +data_type *data_in; //[ IMAX * W * H * CPI ] __attribute__ ((__aligned__(16))); +data_type *kernel; // [ GO * GI * CPO * CPI * KW * KH ] __attribute__ ((__aligned__(16))); +data_type *bias;// [ O ] __attribute__ ((__aligned__(16))); +data_type *out; // [ GO * W * H 
* CPO ] __attribute__ ((__aligned__(16))); +data_type *out_cpu; //[ GO * W * H * CPO ] __attribute__ ((__aligned__(16))); + +void allocate_buffers() { + data_in = (data_type*)malloc(I * W * H * sizeof(data_type)); + kernel = (data_type*)malloc(I * O * KW * KH * sizeof(data_type)); + bias = (data_type*)malloc(O * sizeof(data_type)); + out = (data_type*)malloc(O * W * H * sizeof(data_type)); + out_cpu = (data_type*)malloc(O * W * H * sizeof(data_type)); +} + +void parse_arguments(int argc, char **argv) { + if (argc != 6) { + printf("syntax:\n%s \n", argv[0]); + exit(1); + } + + binaryFile = argv[1]; + W = atoi(argv[2]); + H = atoi(argv[3]); + I = atoi(argv[4]); + O = atoi(argv[5]); + if ((I % CPI) != 0) {printf("Error, I must me multiple of %d\n", CPI); exit(1);} + if ((O % CPO) != 0) {printf("Error, O must be multiple of %d\n", CPO); exit(1);} + GI = I / CPI; + GO = O / CPO; +} + +void deallocate_buffers() { + free(data_in); + free(kernel); + free(bias); + free(out); + free(out_cpu); +} void cpu_conv2d() { @@ -223,7 +265,7 @@ void check_result() { int go = cout / CPO; int o = cout % CPO; int addr_o = (go * W * H * CPO) + (h * W * CPO) + (w * CPO) + o; - if (fabs(out_cpu[addr_o] - out[addr_o]) > 0.001) { + if (fabs(float(out_cpu[addr_o]) - float(out[addr_o])) > 0.001) { printf("Results mismatch at cout %d h %d w %d: %6.4f %6.4f (diff %6.4f)\n", cout, h, w, float(out_cpu[addr_o]), float(out[addr_o]), fabs(float(out_cpu[addr_o]-out[addr_o]))); error = 1; return; @@ -304,28 +346,39 @@ void set_callback(cl::Event event, const char *queue_name) { //--------------------------------------------------------------------------------------------------------------------- int main(int argc, char **argv) { - if (argc != 2) { - std::cout << "Usage: " << argv[0] << " " << std::endl; - return EXIT_FAILURE; - } - printf("Test CONV: [GIxWxHxCPI] = [%dx%dx%dx%d] -> [GOxWxHxCPO] = [%d%dx%dx%d] (kernel [%dx%d], stride [1x1], padding [1x1])\n", GI, W, H, CPI, GO, W, H, CPO, KW, KH); + parse_arguments(argc, argv); + + printf("Test CONV: [GIxWxHxCPI] = [%dx%dx%dx%d] -> [GOxWxHxCPO] = [%dx%dx%dx%d] (kernel [%dx%d], stride [1x1], padding [1x1])\n", GI, W, H, CPI, GO, W, H, CPO, KW, KH); + + allocate_buffers(); - std::string binaryFile = argv[1]; cl_int err; cl::Kernel kernel_conv2d_2; std::cout << "Creating Context..." << std::endl; + + printf("1\n"); auto devices = xcl::get_xil_devices(); + printf("2\n"); auto device = devices[0]; + + printf("hola1\n"); + OCL_CHECK(err, cl::Context context(device, NULL, NULL, NULL, &err)); + + printf("hola2\n"); OCL_CHECK(err, cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE, &err)); + printf("hola\n"); + std::string device_name = device.getInfo(); auto fileBuf = xcl::read_binary_file(binaryFile); cl::Program::Binaries bins{{fileBuf.data(), fileBuf.size()}}; devices.resize(1); + printf("hola2\n"); + OCL_CHECK(err, cl::Program program(context, devices, bins, NULL, &err)); std::cout << "Device " << device_name.c_str() << ": program successful!" 
<< std::endl; @@ -342,7 +395,7 @@ int main(int argc, char **argv) { // fill data vector with random data std::random_device rd; std::mt19937 gen(rd()); - std::uniform_real_distribution dist(-1.0f, 1.0f); + std::uniform_real_distribution dist(-1.0f, 1.0f); std::cout << "Filling buffer with useful data" << std::endl ; int addr = 0; @@ -351,7 +404,7 @@ int main(int argc, char **argv) { for (int w=0; w kernel_events(1); + vector kernel_events(GO); vector read_events(1); - vector write_events(1); + vector write_events(3); cl::Buffer buffer_a; cl::Buffer buffer_b; cl::Buffer buffer_k; @@ -394,19 +447,37 @@ int main(int argc, char **argv) { // Device-to-host communication std::cout << "Creating Buffers..." << std::endl; - OCL_CHECK(err, buffer_a = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_data_in_bytes, &data_in, &err)); - OCL_CHECK(err, buffer_b = cl::Buffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR , size_output_in_bytes, &out, &err)); - OCL_CHECK(err, buffer_k = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_kernel_in_bytes, &kernel, &err)); - OCL_CHECK(err, buffer_bias = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_bias_in_bytes, &bias, &err)); + OCL_CHECK(err, buffer_a = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_data_in_bytes, data_in, &err)); + OCL_CHECK(err, buffer_b = cl::Buffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR , size_output_in_bytes, out, &err)); + OCL_CHECK(err, buffer_k = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_kernel_in_bytes, kernel, &err)); + OCL_CHECK(err, buffer_bias = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_bias_in_bytes, bias, &err)); //Arguments for loop - int I_ITER = I/CPI; //GO - int O_ITER = O/CPO; //GO int offset_bias = 0; //offset to pointer bias each loop int offset_kernel = 0; //offset to pointer kernel each loop int offset_data_out = 0; //offset to poiter output data loop - for (int o_iter = 0; o_iter < O_ITER; o_iter++){ + //----------------------------- + // Copy input data to device global memory + // std::cout << "Copying data (Host to Device)..." << std::endl; + // Because we are passing the write_events, it returns an event object + // that identifies this particular command and can be used to query + // or queue a wait for this particular command to complete. 
+ OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_a}, 0 /*0 means from host*/, NULL, &write_events[0])); + set_callback(write_events[0], "ooo_queue"); + + OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_k}, 0 /*0 means from host*/, NULL, &write_events[1])); + set_callback(write_events[1], "ooo_queue"); + + OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_bias}, 0 /*0 means from host*/, NULL, &write_events[2])); + set_callback(write_events[2], "ooo_queue"); + + // timint stats + unsigned long long prof_time; + struct timeval prof_t1; + gettimeofday(&prof_t1, NULL); + + for (int o_iter = 0; o_iter < GO; o_iter++){ // set kernel arguments int arg = 0; OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_a)); @@ -423,19 +494,8 @@ int main(int argc, char **argv) { // Update the offset poiter to bias, kernels and output data offset_bias = offset_bias + CPO; - offset_kernel = offset_kernel + KW * KH * CPO * I_ITER * CPI; + offset_kernel = offset_kernel + KW * KH * CPO * GI * CPI; offset_data_out = offset_data_out + H * W; - //----------------------------- - // Copy input data to device global memory - // std::cout << "Copying data (Host to Device)..." << std::endl; - // Because we are passing the write_events, it returns an event object - // that identifies this particular command and can be used to query - // or queue a wait for this particular command to complete. - OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_a}, 0 /*0 means from host*/, NULL, &write_events[0])); - set_callback(write_events[0], "ooo_queue"); - - OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_k}, 0 /*0 means from host*/, NULL, &write_events[0])); - set_callback(write_events[0], "ooo_queue"); //----------------------------- // printf("Enqueueing NDRange kernel.\n"); @@ -445,24 +505,30 @@ int main(int argc, char **argv) { // Launch the Kernel std::vector waitList; waitList.push_back(write_events[0]); - OCL_CHECK(err, err = q.enqueueNDRangeKernel(kernel_conv2d_2, 0, 1, 1, &waitList, &kernel_events[0])); - set_callback(kernel_events[0], "ooo_queue"); - - // std::cout << "Getting Results (Device to Host)..." << std::endl; - std::vector eventList; - eventList.push_back(kernel_events[0]); - // This operation only needs to wait for the kernel call. - OCL_CHECK(err, err = q.enqueueMigrateMemObjects({buffer_b}, CL_MIGRATE_MEM_OBJECT_HOST, &eventList, &read_events[0])); - set_callback(read_events[0], "ooo_queue"); - OCL_CHECK(err, err = read_events[0].wait()); - - -} - - + waitList.push_back(write_events[1]); + waitList.push_back(write_events[2]); + OCL_CHECK(err, err = q.enqueueNDRangeKernel(kernel_conv2d_2, 0, 1, 1, &waitList, &kernel_events[o_iter])); + set_callback(kernel_events[o_iter], "ooo_queue"); + } + // we wait all kernels to have completed + for (int o_iter = 0; o_iter < GO; o_iter++) { + OCL_CHECK(err, err = kernel_events[o_iter].wait()); + } + // timing + struct timeval prof_t2; + gettimeofday(&prof_t2, NULL); + prof_time = ((prof_t2.tv_sec - prof_t1.tv_sec) * 1000000) + (prof_t2.tv_usec - prof_t1.tv_usec); + printf("Timing: %8lld usec\n", prof_time); + // std::cout << "Getting Results (Device to Host)..." << std::endl; + std::vector eventList; + eventList.push_back(kernel_events[0]); + // This operation only needs to wait for the kernel call. 
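
The gettimeofday pair above measures the wall-clock time of all GO kernel launches in microseconds. If a throughput figure is wanted from that measurement, it can be derived from the layer size alone; the helper below is a hypothetical addition, not part of the patch, and counts one multiply plus one add per kernel tap.

// Hypothetical helper: effective throughput from the measured prof_time (in usec).
static double conv_gflops(int H, int W, int I, int O, unsigned long long usec) {
  const int KH = 3, KW = 3;
  double flops = 2.0 * (double)H * (double)W * (double)I * (double)O * (KH * KW);
  return flops / ((double)usec * 1e3);    // flops per nanosecond == GFLOP/s
}

For the largest configuration these buffers allow (W = H = 256, I = O = 512) that is roughly 3.1e11 floating-point operations per pass, i.e. about 309 GFLOP.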
+ OCL_CHECK(err, err = q.enqueueMigrateMemObjects({buffer_b}, CL_MIGRATE_MEM_OBJECT_HOST, &eventList, &read_events[0])); + set_callback(read_events[0], "ooo_queue"); + OCL_CHECK(err, err = read_events[0].wait()); // Wait for all of the OpenCL operations to complete std::cout << "Waiting..." << std::endl; @@ -485,6 +551,8 @@ int main(int argc, char **argv) { std::cout << "All done" << std::endl; std::cout << "quit now" << std::endl; + deallocate_buffers(); + // exit return 0; } From d1952fc786825d2cc8004503d39ec681d142afeb Mon Sep 17 00:00:00 2001 From: Jose Flich Date: Thu, 29 Oct 2020 08:05:03 +0000 Subject: [PATCH 07/15] Added final CONV kernel --- .../kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp | 1371 ++++++++--------- .../src/test_conv2D_K3x3_S1x1_P1x1_BS1.cpp | 366 +++-- 2 files changed, 929 insertions(+), 808 deletions(-) diff --git a/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp b/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp index 9ec02806c..a66c9bd67 100644 --- a/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp +++ b/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp @@ -1,699 +1,672 @@ -//KERNEL_CONV2D_4.cpp -//Modified by: Jorge García Martinez -//Date: 17/09/2020 -//Description: Based on kenel_conv2d_3.cpp. The goal of this code is to perform convolutions with a large number of inputs -//and outputs.For this, we use iteratively a limited number of input and output channels in the kernel. -//In all functions are used two loops for output and input iterations. In add function is added a buffer which stores -//the data that It should be written into the memory. - - - -#include -#include -#include - -#include - -//#define DEBUG_VERBOSE - -extern "C" { - -// To allow using defines inside Xilinx pragmas -#define PRAGMA_SUB(x) _Pragma (#x) -#define DO_PRAGMA(x) PRAGMA_SUB(x) - -// Fixed parameters (optimized at compilation/synthesis time) -#define KW 3 // kernel width -#define KH 3 // kernel height -#define CPI 4 // channels per input port -#define CPO 4 // channels per output port -// -#define WMAX 512 -#define WHMAX 512*512 - -#define LOAD_MODEL -#define READ_MODEL -#define READ_INPUT -#define WRITE_OUTPUT - -// pixel_in -struct pixel_in_t { - float pixel[CPI]; -}; - -struct pixel_out_t { - float pixel[CPO]; -}; - -// frames struct -struct frame_t { - pixel_in_t pixel[9]; -}; - -// -------------------------------------------------------------------------------------- -// read_input: -// The function reads and writes the kernels, bias and data in different stream. -// Data are sent to padding module, kenels to mul and bias to add modules. -// LOOP FLOW -// ko = 0 -// b = 0 -// for o_iter 0 .. n -// read bias[b..b+3] -// b = b + 4 -// d = 0 -// ki = 0 -// for i_iter 0 .. 
n -// read kernel[ki..ki+3][ko..ko+3] -// ki = ki +4 -// read data[d..d+3] -// d = d + 4 -// -// ko = ko + 4 -// -// -// Arguments: -// ptr : Pointer to input data (in) -// k_ptr: pointer to kernels (in) -// b_ptr: pointer to bias (in) -// out : data output stream (out) -// k_out: pointer to kernel (out) -// b_out: pointer to bias (out) -// -static void read_input(int H, int W, int I, int O, int I_ITER, int O_ITER, pixel_in_t *ptr, float *k_ptr, float *b_ptr, hls::stream &k_out, hls::stream &b_out, hls::stream &out) { - -#ifdef DEBUG_VERBOSE - printf("read_input: start\n"); -#endif - - frame_t frame_k; - #pragma HLS ARRAY_PARTITION variable=frame_k dim=0 - - pixel_out_t bias; - #pragma HLS ARRAY_PARTITION variable=bias dim=0 - - pixel_in_t data; - #pragma HLS ARRAY_PARTITION variable=data dim=0 - - - read_input_o_iter_loop: - for (int o_iter = 0; o_iter < O_ITER; o_iter++){ - //Sending bias to add in pack of CPO bias - // int data_pointer = 0; - read_loop_bias_load: - for (int b=0; b &in, hls::stream &out) { - -#ifdef DEBUG_VERBOSE - printf("padding: start\n"); -#endif - -//we init zero only first time - -pixel_in_t data; -DO_PRAGMA(HLS ARRAY_PARTITION variable=data complete) - -pixel_in_t zero; -DO_PRAGMA(HLS ARRAY_PARTITION variable=zero complete) - -for (int cpi=0; cpi &in, hls::stream &out) { - -#ifdef DEBUG_VERBOSE - printf("relu: start\n"); -#endif - - int data_size = W * H * O; - for (int i=0; i < data_size; i++) { - #pragma HLS PIPELINE II=1 - float data = in.read(); - if (data < 0) data = 0.f; - out << data; - } - -#ifdef DEBUG_VERBOSE - printf("relu: end\n"); -#endif -} - -// -------------------------------------------------------------------------------- -// write_output: Writes data comming from one stream into memory -// LOOP FLOW: -// for o_iter 0 .. n -// write data[do .. 
do+3] -// -// d = d + 4 -// -// Arguments: -// ptr: memory address pointer -// in: input stream -// -static void write_output(int H, int W, int O_ITER, pixel_out_t *ptr, hls::stream &in) { - -#ifdef DEBUG_VERBOSE - printf("write_output: start\n"); -#endif - - - - // int data_pointer = 0; - - // write_output_o_iter_loop: - // for (int o_iter = 0; o_iter &in, hls::stream &out, int id) { - -#ifdef DEBUG_VERBOSE - printf("cvt_%d: start\n", id); -#endif - -cvt_o_iter_loop: -for (int o_iter = 0; o_iter < O_ITER; o_iter++){ - cvt_i_iter_loop: - for(int i_iter = 0; i_iter < I_ITER; i_iter++){ - - // Now we process the input data and convert the data into frames - - // buffers (keep three rows) - pixel_in_t buffer0[WMAX+2]; - pixel_in_t buffer1[WMAX+2]; - pixel_in_t buffer2[WMAX+2]; - DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer0 cyclic dim=1 factor=CPI) - DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer1 cyclic dim=1 factor=CPI) - DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer2 cyclic dim=1 factor=CPI) - - // frame - frame_t frame; - DO_PRAGMA(HLS ARRAY_PARTITION variable=frame) - - // We loop for every incoming pixel - cvt_loop_1: - for (int pin_row=0; pin_row < H+2; pin_row++) { - cvt_loop_2: - for (int pin_col=0; pin_col < W+2; pin_col++) { - // get the pixel - pixel_in_t pixel; - pixel = in.read(); - // row buffer write (in which buffer row we write the pixel) - int row0_buffer_write = (pin_row % 3) == 0; - int row1_buffer_write = (pin_row % 3) == 1; - // first row buffer - int row0 = (pin_row <= 2) | ((pin_row % 3) == 2); - int row1 = !row0 & ((pin_row % 3) == 0); - // we write the pixel into the buffer - if (row0_buffer_write) buffer0[pin_col] = pixel; else if (row1_buffer_write) buffer1[pin_col] = pixel; else buffer2[pin_col] = pixel; - // build the frame - pixel_in_t p0, p1, p2, p3, p4, p5, p6, p7, p8; - int shift_frame = (pin_row>1) & (pin_col > 2); - int send_frame = (pin_row>1) & (pin_col > 1); - pixel_in_t pixel_b0, pixel_b1, pixel_b2; - pixel_b0 = buffer0[pin_col]; - pixel_b1 = buffer1[pin_col]; - pixel_b2 = buffer2[pin_col]; - // p0, p1, p2 - if (shift_frame) {p0 = p1;} else if (pin_col==0) {if (row0) p0 = pixel_b0; else if (row1) p0 = pixel_b1; else p0 = pixel_b2;} - if (shift_frame) {p1 = p2;} else if (pin_col==1) {if (row0) p1 = pixel_b0; else if (row1) p1 = pixel_b1; else p1 = pixel_b2;} - if (row0) p2 = pixel_b0; else if (row1) p2 = pixel_b1; else p2 = pixel_b2; - // p3, p4, p5 - if (shift_frame) {p3 = p4;} else if (pin_col==0) {if (row0) p3 = pixel_b1; else if (row1) p3 = pixel_b2; else p3 = pixel_b0;} - if (shift_frame) {p4 = p5;} else if (pin_col==1) {if (row0) p4 = pixel_b1; else if (row1) p4 = pixel_b2; else p4 = pixel_b0;} - if (row0) p5 = pixel_b1; else if (row1) p5 = pixel_b2; else p5 = pixel_b0; - // p6, p7, p8 - if (shift_frame) {p6 = p7;} else if (pin_col==0) {if (row0) p6 = pixel_b2; else if (row1) p6 = pixel_b0; else p6 = pixel_b1;} - if (shift_frame) {p7 = p8;} else if (pin_col==1) {if (row0) p7 = pixel_b2; else if (row1) p7 = pixel_b0; else p7 = pixel_b1;} - if (row0) p8 = pixel_b2; else if (row1) p8 = pixel_b0; else p8 = pixel_b1; - - if (send_frame) { - frame.pixel[0] = p0; frame.pixel[1] = p1; frame.pixel[2] = p2; - frame.pixel[3] = p3; frame.pixel[4] = p4; frame.pixel[5] = p5; - frame.pixel[6] = p6; frame.pixel[7] = p7; frame.pixel[8] = p8; - out << frame; - #ifdef DEBUG_VERBOSE - printf("cvt_%d: frame sent:\n", id); - for (int cpi=0; cpi &in, hls::stream &k_in, hls::stream &out, int id) { - -#ifdef DEBUG_VERBOSE - printf("mul_%d: start\n", id); -#endif - - // 
first we read the kernels - frame_t kernel[CPI]; - DO_PRAGMA(HLS ARRAY_PARTITION variable=kernel dim=0) - frame_t data_in; - -#ifdef LOAD_MODEL - - mul_o_iter_loop: - for (int o_iter = 0; o_iter < O_ITER; o_iter++){ - mul_i_iter_loop: - for(int i_iter = 0; i_iter < I_ITER; i_iter++){ - //we load the kernels into pack of frames - loop_mul_kernels_load_cpo: - for (int cpi=0; cpi %6.4f\n", cpo, sum[cpo]); - #endif - p_out.pixel[cpo] = sum[cpo]; - sum[cpo] = 0.f; - } - out << p_out; - } - } //i_iter -} //o_iter - -#endif - - -#ifdef DEBUG_VERBOSE - printf("mul_%d: end\n", id); -#endif -} - -// ------------------------------------------------------------------------------- -// add: This function performs the addition of all subpixels for the same channel. -// It adds also the corresponding bias. -// LOOP FLOW -// for o_iter 0 .. n -// receive bias[b..b+3] -// init buff_o_channels with bias -// for i_iter 0 .. n -// receive data[do..d+3] -// buff_o_channels = buff_o_channels + data -// -// for num_iterations -// for CPO -// send data to write module -// -// Arguments: -// in: input streams data -// b_in: input stream bias -// out: output stream -// -static void add(int H, int W, int I_ITER, int O_ITER, hls::stream &in, hls::stream &b_in, hls::stream &out) { - -#ifdef DEBUG_VERBOSE - printf("add: start\n"); -#endif - - float bias[CPO]; - - //number of iterations by CPI || CPO channels - int num_iterations = W * H; - - //Buffer for all data and CPO channels - float buff_o_channels[CPO][WHMAX]; - DO_PRAGMA(HLS ARRAY_PARTITION variable=buff_o_channels dim=0 block factor=CPO) - - //We read Bias in O_iter packs of CPO size - add_o_iter_loop: - for (int o_iter = 0; o_iter &in, hls::stream &k_in, hls::stream &b_in, hls::stream &out) { - - // streams - static hls::stream str_pad_cvt; // padding->cvt - static hls::stream str_cvt_mul; // cvt->mul - static hls::stream str_mul_add; // mul->add - - // topology - #pragma HLS dataflow - padding(H, W, I_ITER * O_ITER, in, str_pad_cvt); // padding - cvt(H, W, I_ITER, O_ITER, str_pad_cvt, str_cvt_mul, 0); // cvt - mul(H, W, I_ITER, O_ITER, str_cvt_mul, k_in, str_mul_add, 0); // mul - add(H, W, I_ITER, O_ITER, str_mul_add, b_in, out); // add -} - -void k_conv2D_K3x3_S1x1_P1x1_BS1(pixel_in_t *ptr_data, int H, int W, int I, float *ptr_kernel, float *ptr_bias, pixel_out_t *ptr_out, int O) { - - #pragma HLS INTERFACE s_axilite port=W bundle=control - #pragma HLS INTERFACE s_axilite port=H bundle=control - #pragma HLS INTERFACE s_axilite port=I bundle=control - #pragma HLS INTERFACE s_axilite port=O bundle=control - #pragma HLS INTERFACE m_axi port=ptr_data offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 - #pragma HLS INTERFACE m_axi port=ptr_kernel offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 - #pragma HLS INTERFACE m_axi port=ptr_bias offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 - #pragma HLS INTERFACE m_axi port=ptr_out offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 - #pragma HLS INTERFACE s_axilite port=return bundle=control - - // ptr_data struct to be packed as a single element vector (to improve memory read) - // the compiler will do full structure access (all elements of structure) - #pragma HLS data_pack variable = ptr_data - #pragma HLS data_pack variable = ptr_out - - int I_ITER = I/CPI; - int O_ITER = O/CPO; - - // input and output streams - static hls::stream out_read; - static hls::stream out_read_kernel; - static hls::stream 
out_read_bias; - static hls::stream out_conv; - - // stream sizes - #pragma HLS STREAM variable = out_read depth = 32 - #pragma HLS STREAM variable = out_read_kernel depth = 32 - #pragma HLS STREAM variable = out_read_bias depth = 32 - #pragma HLS STREAM variable = out_conv depth = 32 - #pragma HLS STREAM variable = out_relu depth = 32 - - #pragma HLS dataflow - read_input(H, W, I, O, I_ITER, O_ITER, ptr_data, ptr_kernel, ptr_bias, out_read_kernel, out_read_bias, out_read); - conv(H, W, I, O, I_ITER, O_ITER, out_read, out_read_kernel, out_read_bias, out_conv); - write_output(H, W, O_ITER, ptr_out, out_conv); -} - -} // end extern "C" +// Convolution kernel +// Description: This kernel computes the convolution operation for a given set of output +// channels. The kernel has a defined set of input channels (CPI) and output +// channels (CPO) where the convolution is performed in parallel. +// The kernel receives the input geometry (I, W, H) as arguments and performs +// the convolution over CPO channels. For I>CPI configurations the kernel iterates on the +// input channels to produce the output channels. For O>CPO the kernel must be called for each +// CPO set of channels to computer. For this, offsets are provided to the kernel as arguments +// to read from and write to the proper memory locations. +// The kernel uses DataFlow model and is optimized in order to be bounded by the memory bandwidth. +// +// Dataflow: +// +// ------- +// | | ---> read_bias --------------------------------------- +// | | | +// | | ---> read_kernel ---------------------------- | +// | DDR | | | +// | | ---> read_data ---> padding ---> cvt ---> mul ---> add ---> write_data +// | | | +// | | <---------------------------------------------------------------- +// ------- +// +// The kernels asumes the following memory allocation for data: +// - input data : GI x H x W x CPI +// - kernels : GO x GI x CPO x CPI x KH x KW +// - bias : O +// - output data: GO x H x W x CPO +// +// (GI = group of input channels, GO = group of output channels) +// (I = GI x CPI, O = GO x CPO) +// +// Fixed (static) parameters: +// - CPI: Number of input channels supported in one iteration of the kernel +// - CPO: Number of output channels supported in one iteration of the kernel +// - KH, KW: Kernel size (3x3) +// - PH, PW: Padding (1x1) (implicit in the code) +// - SH, SW: Stride (1x1) (implicit in the code) +// - WMAX: Maximum value of the width of an input channel +// - WHMAX: Maximum value of the width multiplied by the height of an input channels +// +// Arguments: +// - I: Number of input channels +// - O: Number of output channels +// - W: Channel width +// - H: Channel height +// - ptr_data: Memory pointer to input data +// - ptr_kernel: Memory pointer to kernels +// - ptr_bias: Memory pointer to bias +// - ptr_out: Memory pointer to output buffer +// - offset_kernel: Offset within kernel data +// - offset_bias: Offset within bias data +// - offset_data_out: Offset within output buffer +// + +// Headers +#include +#include +#include +#include + +// Enable this define to get information (sw_emu) +// #define DEBUG_VERBOSE + +extern "C" { + +// Data type to be used +#define data_type float + +// To allow using defines inside Xilinx pragmas +#define PRAGMA_SUB(x) _Pragma (#x) +#define DO_PRAGMA(x) PRAGMA_SUB(x) + +// Fixed parameters (optimized at compilation/synthesis time) +#define KW 3 // kernel width +#define KH 3 // kernel height +#define CPI 4 // channels per input port +#define CPO 4 // channels per output port + +// Maximum 
width and width*height +#define WMAX 256 +#define WHMAX 256*256 + +// Data type for input reads +struct pixel_in_t { // pixel in + data_type pixel[CPI]; +}; + +// Data type for output writes +struct pixel_out_t { // pixel out + data_type pixel[CPO]; +}; + +// frames struct (KWxKH) +struct frame_t { + pixel_in_t pixel[9]; +}; + +// --------------------------------------------------------------------------------------- +// read_bias. Reading bias from memory and sending to add module. +// +// Arguments: +// b_ptr : pointer to bias +// offset_bias : offset to bias +// b_out : output stream +// +static void read_bias(int offset_bias, data_type *b_ptr, hls::stream &b_out){ + + #ifdef DEBUG_VERBOSE + printf("read_bias: start\n"); + #endif + + pixel_out_t bias; + #pragma HLS ARRAY_PARTITION variable=bias dim=0 + + // we read the bias + for (int i=0; i &k_out){ + + #ifdef DEBUG_VERBOSE + printf("read_kernel: start\n"); + #endif + + // we read all the kernels and send them through the stream + frame_t frame_k; + #pragma HLS ARRAY_PARTITION variable=frame_k dim=0 + int cpo = 0; + int p = 0; + + int size = KW * KH * CPO * I_ITER * CPI; + read_kernel_loop: + for (int i=0; i &out) { + + #ifdef DEBUG_VERBOSE + printf("read_data: start\n"); + #endif + + read_loop_data_load_i: + for (int r=0; r &in, hls::stream &out) { + + #ifdef DEBUG_VERBOSE + printf("padding: start\n"); + #endif + + pixel_in_t data; + DO_PRAGMA(HLS ARRAY_PARTITION variable=data complete) + + pixel_in_t zero; + DO_PRAGMA(HLS ARRAY_PARTITION variable=zero complete) + + padding_cpi_loop: + for (int cpi=0; cpi &in) { + + #ifdef DEBUG_VERBOSE + printf("write_output: start\n"); + #endif + + write_output_data_size_loop: + for (int i=0; i &in, hls::stream &out) { + + #ifdef DEBUG_VERBOSE + printf("cvt: start\n"); + #endif + + cvt_i_iter_loop: + for(int i_iter = 0; i_iter < I_ITER; i_iter++){ + + // Now we process the input data and convert the data into frames + // buffers (keep three rows) + pixel_in_t buffer0[WMAX+2]; + pixel_in_t buffer1[WMAX+2]; + pixel_in_t buffer2[WMAX+2]; + DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer0 cyclic dim=1 factor=CPI) + DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer1 cyclic dim=1 factor=CPI) + DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer2 cyclic dim=1 factor=CPI) + + // frame + frame_t frame; + DO_PRAGMA(HLS ARRAY_PARTITION variable=frame) + + // We loop for every incoming pixel + cvt_loop_1: + for (int pin_row=0; pin_row < H+2; pin_row++) { + cvt_loop_2: + for (int pin_col=0; pin_col < W+2; pin_col++) { + // get the pixel + pixel_in_t pixel; + pixel = in.read(); + // row buffer write (in which buffer row we write the pixel) + int row0_buffer_write = (pin_row % 3) == 0; + int row1_buffer_write = (pin_row % 3) == 1; + // first row buffer + int row0 = (pin_row <= 2) | ((pin_row % 3) == 2); + int row1 = !row0 & ((pin_row % 3) == 0); + // we write the pixel into the buffer + if (row0_buffer_write) buffer0[pin_col] = pixel; else if (row1_buffer_write) buffer1[pin_col] = pixel; else buffer2[pin_col] = pixel; + // build the frame + pixel_in_t p0, p1, p2, p3, p4, p5, p6, p7, p8; + int shift_frame = (pin_row>1) & (pin_col > 2); + int send_frame = (pin_row>1) & (pin_col > 1); + pixel_in_t pixel_b0, pixel_b1, pixel_b2; + pixel_b0 = buffer0[pin_col]; + pixel_b1 = buffer1[pin_col]; + pixel_b2 = buffer2[pin_col]; + // p0, p1, p2 + if (shift_frame) {p0 = p1;} else if (pin_col==0) {if (row0) p0 = pixel_b0; else if (row1) p0 = pixel_b1; else p0 = pixel_b2;} + if (shift_frame) {p1 = p2;} else if (pin_col==1) {if (row0) p1 = 
pixel_b0; else if (row1) p1 = pixel_b1; else p1 = pixel_b2;} + if (row0) p2 = pixel_b0; else if (row1) p2 = pixel_b1; else p2 = pixel_b2; + // p3, p4, p5 + if (shift_frame) {p3 = p4;} else if (pin_col==0) {if (row0) p3 = pixel_b1; else if (row1) p3 = pixel_b2; else p3 = pixel_b0;} + if (shift_frame) {p4 = p5;} else if (pin_col==1) {if (row0) p4 = pixel_b1; else if (row1) p4 = pixel_b2; else p4 = pixel_b0;} + if (row0) p5 = pixel_b1; else if (row1) p5 = pixel_b2; else p5 = pixel_b0; + // p6, p7, p8 + if (shift_frame) {p6 = p7;} else if (pin_col==0) {if (row0) p6 = pixel_b2; else if (row1) p6 = pixel_b0; else p6 = pixel_b1;} + if (shift_frame) {p7 = p8;} else if (pin_col==1) {if (row0) p7 = pixel_b2; else if (row1) p7 = pixel_b0; else p7 = pixel_b1;} + if (row0) p8 = pixel_b2; else if (row1) p8 = pixel_b0; else p8 = pixel_b1; + + if (send_frame) { + frame.pixel[0] = p0; frame.pixel[1] = p1; frame.pixel[2] = p2; + frame.pixel[3] = p3; frame.pixel[4] = p4; frame.pixel[5] = p5; + frame.pixel[6] = p6; frame.pixel[7] = p7; frame.pixel[8] = p8; + out << frame; + #ifdef DEBUG_VERBOSE + printf("cvt_%d: frame sent:\n", id); + for (int cpi=0; cpi &in, hls::stream &k_in, hls::stream &out) { + + #ifdef DEBUG_VERBOSE + printf("mul: start\n"); + #endif + + frame_t kernel[CPO]; + DO_PRAGMA(HLS ARRAY_PARTITION variable=kernel dim=0) + frame_t data_in; + + // Reading the kernels + mul_i_iter_loop: + for(int i_iter = 0; i_iter < I_ITER; i_iter++){ + loop_mul_kernels_load_cpo: + for (int cpo=0; cpo %6.4f\n", cpo, float(sum[cpo])); + #endif + p_out.pixel[cpo] = sum[cpo]; + sum[cpo] = 0.f; + } + out << p_out; + } + } //i_iter + + #ifdef DEBUG_VERBOSE + printf("mul: end\n"); + #endif +} + +// ------------------------------------------------------------------------------- +// add: This function performs the addition of all subpixels for the same channel. +// It adds also the corresponding bias. 
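// (Sketch of the computation these stages implement, for reference; indices
//  are logical, with the 1x1 zero padding making out-of-range inputs zero.)
//
//   out(o, y, x) = bias(o) + sum_{i=0..I-1} sum_{kh=0..2} sum_{kw=0..2}
//                  in(i, y + kh - 1, x + kw - 1) * kernel(o, i, kh, kw)
//
//  mul produces the partial sums for one CPI-wide group of input channels and
//  one frame, and add accumulates those partials across the I_ITER groups in
//  buff_o_channels, which is initialized with the bias.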
+// +// Arguments: +// H : Height of input channel +// W : Width of input channel +// in : input streams data +// b_in : input stream bias +// out : output stream +// +static void add(int H, int W, int I_ITER, hls::stream &in, hls::stream &b_in, hls::stream &out) { + + #ifdef DEBUG_VERBOSE + printf("add: start\n"); + #endif + + data_type bias[CPO]; + + // number of iterations by CPI || CPO channels + int num_iterations = W * H; + + // Buffer for all data and CPO channels + data_type buff_o_channels[CPO][WHMAX]; + DO_PRAGMA(HLS ARRAY_PARTITION variable=buff_o_channels dim=0 block factor=CPO) + + // We receive bias in packs of CPO + pixel_out_t p_out; + p_out = b_in.read(); + add_load_bias_loop: + for (int b=0; b &in, hls::stream &k_in, hls::stream &b_in, hls::stream &out) { + + // streams + static hls::stream str_pad_cvt; // padding->cvt + static hls::stream str_cvt_mul; // cvt->mul + static hls::stream str_mul_add; // mul->add + + // topology + #pragma HLS dataflow + padding(H, W, I_ITER, in, str_pad_cvt); // padding + cvt(H, W, I_ITER, str_pad_cvt, str_cvt_mul); // cvt + mul(H, W, I_ITER, str_cvt_mul, k_in, str_mul_add); // mul + add(H, W, I_ITER, str_mul_add, b_in, out); // add +} + +// ------------------------------------------------------------------------------- +// k_conv2D_K3x3_S1x1_P1x1_BS1 +// Main kernel +// +// Arguments: +// ptr_data : pointer to input data +// H : Height of input channel +// W : Width of input channel +// I : Number of input channels +// ptr_kernel : pinter to kernels +// ptr_bias : pointer to bias +// ptr_out : pointer to output buffer +// O : Number of output channels +// offset_bias : Offset within bias buffer +// offset_kernel : Offset within kernel buffer +// offset_data_out: Offset within data out buffer +// +void k_conv2D_K3x3_S1x1_P1x1_BS1(pixel_in_t *ptr_data, int H, int W, int I, data_type *ptr_kernel, data_type *ptr_bias, pixel_out_t *ptr_out, int O, int offset_bias, int offset_kernel, int offset_data_out) { + + #pragma HLS INTERFACE s_axilite port=W bundle=control + #pragma HLS INTERFACE s_axilite port=H bundle=control + #pragma HLS INTERFACE s_axilite port=I bundle=control + #pragma HLS INTERFACE s_axilite port=O bundle=control + #pragma HLS INTERFACE m_axi port=ptr_data offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_kernel offset=slave bundle=gmem1 max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_bias offset=slave bundle=gmem2 max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_out offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE s_axilite port=offset_bias bundle=control + #pragma HLS INTERFACE s_axilite port=offset_kernel bundle=control + #pragma HLS INTERFACE s_axilite port=offset_data_out bundle=control + #pragma HLS INTERFACE s_axilite port=return bundle=control + + // ptr_data struct to be packed as a single element vector (to improve memory read) + // the compiler will do full structure access (all elements of structure) + #pragma HLS data_pack variable = ptr_data + #pragma HLS data_pack variable = ptr_out + + int I_ITER = I/CPI; + + // input and output streams + static hls::stream out_read_data; + static hls::stream out_read_kernel; + static hls::stream out_read_bias; + static hls::stream out_conv; + + // stream sizes + #pragma HLS STREAM variable = out_read_data depth = 32 + #pragma HLS STREAM variable = out_read_kernel depth = 32 + 
#pragma HLS STREAM variable = out_read_bias depth = 32 + #pragma HLS STREAM variable = out_conv depth = 32 + // #pragma HLS STREAM variable = out_relu depth = 32 + + #pragma HLS dataflow + read_data(H, W, I_ITER, ptr_data, out_read_data); + read_bias(offset_bias, ptr_bias, out_read_bias); + read_kernel(I_ITER, offset_kernel, ptr_kernel, out_read_kernel); + conv(H, W, I_ITER, out_read_data, out_read_kernel, out_read_bias, out_conv); + write_output(H, W, offset_data_out, ptr_out, out_conv); +} + +} // end extern "C" diff --git a/fpga_kernels/test_fpga/src/test_conv2D_K3x3_S1x1_P1x1_BS1.cpp b/fpga_kernels/test_fpga/src/test_conv2D_K3x3_S1x1_P1x1_BS1.cpp index 4b06398cf..0b5537b3e 100644 --- a/fpga_kernels/test_fpga/src/test_conv2D_K3x3_S1x1_P1x1_BS1.cpp +++ b/fpga_kernels/test_fpga/src/test_conv2D_K3x3_S1x1_P1x1_BS1.cpp @@ -1,3 +1,36 @@ +// +// test_conv2D. +// +// Constants: +// +// - CPI +// - CPO +// - KW = 3 +// - KH = 3 +// - PW = 1 +// - PH = 1 +// - SW = 1 +// - SH = 1 +// +// Arguments: +// +// - W +// - H +// - I +// - O +// +// Data formats: +// +// - kernel : GO x GI x CPO x CPI x KH x KW +// - bias : O +// - data_in : GI x H x W x CPI +// - data_out : GO x H x W x CPO +// +// GI = I / CPI +// GO = O / CPO +// +// + #include /* printf, scanf, NULL */ #include /* malloc, free, rand */ @@ -8,36 +41,87 @@ #include #include "xcl2.hpp" +#include +#include + using std::vector; +// data type +#define data_type float + // CL cl::Buffer buf; cl::Context context; cl::CommandQueue q; cl::Program program; +std::string binaryFile; + +#define WMAX 256 +#define HMAX 256 +#define IMAX 512 +#define OMAX 512 + +#define CPI 4 +#define CPO 4 +#define KW 3 +#define KH 3 + +int W; +int H; +int GI; +int GO; +int I; +int O; -#define W 256 //256 -#define H 256 //256 -#define C 4 //I -#define COUT 4 //O -#define KW 3 -#define KH 3 // buffers -float data_in[ W * H * C ] __attribute__ ((__aligned__(16))); -float kernel [ KW * KH * C * COUT] __attribute__ ((__aligned__(16))); -float bias [ COUT ] __attribute__ ((__aligned__(16))); -float out [ W * H * COUT ] __attribute__ ((__aligned__(16))); -float out_cpu[ W * H * COUT ] __attribute__ ((__aligned__(16))); +data_type *data_in; //[ IMAX * W * H * CPI ] __attribute__ ((__aligned__(16))); +data_type *kernel; //[ GO * GI * CPO * CPI * KW * KH ] __attribute__ ((__aligned__(16))); +data_type *bias; //[ O ] __attribute__ ((__aligned__(16))); +data_type *out; //[ GO * W * H * CPO ] __attribute__ ((__aligned__(16))); +data_type *out_cpu; //[ GO * W * H * CPO ] __attribute__ ((__aligned__(16))); + +void allocate_buffers() { + data_in = (data_type*)malloc(I * W * H * sizeof(data_type)); + kernel = (data_type*)malloc(I * O * KW * KH * sizeof(data_type)); + bias = (data_type*)malloc(O * sizeof(data_type)); + out = (data_type*)malloc(O * W * H * sizeof(data_type)); + out_cpu = (data_type*)malloc(O * W * H * sizeof(data_type)); +} + +void parse_arguments(int argc, char **argv) { + if (argc != 6) { + printf("syntax:\n%s \n", argv[0]); + exit(1); + } + + binaryFile = argv[1]; + W = atoi(argv[2]); + H = atoi(argv[3]); + I = atoi(argv[4]); + O = atoi(argv[5]); + if ((I % CPI) != 0) {printf("Error, I must me multiple of %d\n", CPI); exit(1);} + if ((O % CPO) != 0) {printf("Error, O must be multiple of %d\n", CPO); exit(1);} + GI = I / CPI; + GO = O / CPO; +} + +void deallocate_buffers() { + free(data_in); + free(kernel); + free(bias); + free(out); + free(out_cpu); +} void cpu_conv2d() { - int size_out = W * H * COUT; + int size_out = GO * W * H * CPO; for (int i=0; i 0.001) { - 
printf("Results mismatch at cout %d h %d w %d: %6.4f %6.4f (diff %6.4f)\n", cout, h, w, out_cpu[addr_o], out[addr_o], fabs(out_cpu[addr_o]-out[addr_o])); + // data_out pixel position + int go = cout / CPO; + int o = cout % CPO; + int addr_o = (go * W * H * CPO) + (h * W * CPO) + (w * CPO) + o; + if (fabs(float(out_cpu[addr_o]) - float(out[addr_o])) > 0.001) { + printf("Results mismatch at cout %d h %d w %d: %6.4f %6.4f (diff %6.4f)\n", cout, h, w, float(out_cpu[addr_o]), float(out[addr_o]), fabs(float(out_cpu[addr_o]-out[addr_o]))); error = 1; return; } @@ -220,23 +333,21 @@ void set_callback(cl::Event event, const char *queue_name) { //--------------------------------------------------------------------------------------------------------------------- int main(int argc, char **argv) { - if (argc != 2) { - std::cout << "Usage: " << argv[0] << " " << std::endl; - return EXIT_FAILURE; - } - printf("Test CONV: [WxHxC] = [%dx%dx%d] -> [WxHxC] = [%dx%dx%d] (kernel [%dx%d], stride [1x1], padding [1x1])\n", W, H, C, W, H, COUT, KW, KH); + parse_arguments(argc, argv); + + printf("Test CONV: [GIxWxHxCPI] = [%dx%dx%dx%d] -> [GOxWxHxCPO] = [%dx%dx%dx%d] (kernel [%dx%d], stride [1x1], padding [1x1])\n", GI, W, H, CPI, GO, W, H, CPO, KW, KH); + + allocate_buffers(); - std::string binaryFile = argv[1]; cl_int err; - cl::Kernel kernel_conv2d_2; + cl::Kernel kernel_conv2d; std::cout << "Creating Context..." << std::endl; auto devices = xcl::get_xil_devices(); auto device = devices[0]; OCL_CHECK(err, cl::Context context(device, NULL, NULL, NULL, &err)); OCL_CHECK(err, cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE, &err)); - std::string device_name = device.getInfo(); auto fileBuf = xcl::read_binary_file(binaryFile); cl::Program::Binaries bins{{fileBuf.data(), fileBuf.size()}}; @@ -245,13 +356,13 @@ int main(int argc, char **argv) { OCL_CHECK(err, cl::Program program(context, devices, bins, NULL, &err)); std::cout << "Device " << device_name.c_str() << ": program successful!" << std::endl; - OCL_CHECK(err, kernel_conv2d_2 = cl::Kernel(program,"k_conv2D_K3x3_S1x1_P1x1_BS1", &err)); + OCL_CHECK(err, kernel_conv2d = cl::Kernel(program,"k_conv2D_K3x3_S1x1_P1x1_BS1", &err)); std::cout << "Kernel sucessfully created" << std::endl ; - size_t size_data_in_bytes = W*H*C*sizeof(float); - size_t size_output_in_bytes = W*H*COUT * sizeof(float); - size_t size_kernel_in_bytes = KW * KH * C * COUT * sizeof(float); - size_t size_bias_in_bytes = COUT * sizeof(float); + size_t size_data_in_bytes = W * H * I * sizeof(data_type); + size_t size_output_in_bytes = W * H * O * sizeof(data_type); + size_t size_kernel_in_bytes = KW * KH * I * O * sizeof(data_type); + size_t size_bias_in_bytes = O * sizeof(data_type); // Allocate memory on the host and fill with random data. //----------------------------- @@ -262,25 +373,27 @@ int main(int argc, char **argv) { std::cout << "Filling buffer with useful data" << std::endl ; int addr = 0; - for (int h=0; h kernel_events(1); + vector kernel_events(GO); vector read_events(1); - vector write_events(1); + vector write_events(3); cl::Buffer buffer_a; cl::Buffer buffer_b; cl::Buffer buffer_k; @@ -308,48 +421,82 @@ int main(int argc, char **argv) { // Device-to-host communication std::cout << "Creating Buffers..." 
<< std::endl; - OCL_CHECK(err, buffer_a = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_data_in_bytes, &data_in, &err)); - OCL_CHECK(err, buffer_b = cl::Buffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR , size_output_in_bytes, &out, &err)); - OCL_CHECK(err, buffer_k = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_kernel_in_bytes, &kernel, &err)); - OCL_CHECK(err, buffer_bias = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_bias_in_bytes, &bias, &err)); - - // set kernel arguments - int arg = 0; - OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_a)); - OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, H)); - OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, W)); - OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, C)); - OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_k)); - OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_bias)); - OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_b)); - OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, COUT)); + OCL_CHECK(err, buffer_a = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_data_in_bytes, data_in, &err)); + OCL_CHECK(err, buffer_b = cl::Buffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR , size_output_in_bytes, out, &err)); + OCL_CHECK(err, buffer_k = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_kernel_in_bytes, kernel, &err)); + OCL_CHECK(err, buffer_bias = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_bias_in_bytes, bias, &err)); + + //Arguments for loop + int offset_bias = 0; //offset to pointer bias each loop + int offset_kernel = 0; //offset to pointer kernel each loop + int offset_data_out = 0; //offset to poiter output data loop //----------------------------- // Copy input data to device global memory - std::cout << "Copying data (Host to Device)..." << std::endl; + // std::cout << "Copying data (Host to Device)..." << std::endl; // Because we are passing the write_events, it returns an event object // that identifies this particular command and can be used to query // or queue a wait for this particular command to complete. OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_a}, 0 /*0 means from host*/, NULL, &write_events[0])); set_callback(write_events[0], "ooo_queue"); - OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_k}, 0 /*0 means from host*/, NULL, &write_events[0])); - set_callback(write_events[0], "ooo_queue"); - - //----------------------------- - printf("Enqueueing NDRange kernel.\n"); - // This event needs to wait for the write buffer operations to complete - // before executing. We are sending the write_events into its wait list to - // ensure that the order of operations is correct. 
- // Launch the Kernel - std::vector waitList; - waitList.push_back(write_events[0]); - OCL_CHECK(err, err = q.enqueueNDRangeKernel(kernel_conv2d_2, 0, 1, 1, &waitList, &kernel_events[0])); - set_callback(kernel_events[0], "ooo_queue"); + OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_k}, 0 /*0 means from host*/, NULL, &write_events[1])); + set_callback(write_events[1], "ooo_queue"); + + OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_bias}, 0 /*0 means from host*/, NULL, &write_events[2])); + set_callback(write_events[2], "ooo_queue"); + + // timint stats + unsigned long long prof_time; + struct timeval prof_t1; + gettimeofday(&prof_t1, NULL); + + for (int o_iter = 0; o_iter < GO; o_iter++){ + // set kernel arguments + int arg = 0; + OCL_CHECK(err, err = kernel_conv2d.setArg(arg++, buffer_a)); + OCL_CHECK(err, err = kernel_conv2d.setArg(arg++, H)); + OCL_CHECK(err, err = kernel_conv2d.setArg(arg++, W)); + OCL_CHECK(err, err = kernel_conv2d.setArg(arg++, I)); + OCL_CHECK(err, err = kernel_conv2d.setArg(arg++, buffer_k)); + OCL_CHECK(err, err = kernel_conv2d.setArg(arg++, buffer_bias)); + OCL_CHECK(err, err = kernel_conv2d.setArg(arg++, buffer_b)); + OCL_CHECK(err, err = kernel_conv2d.setArg(arg++, O)); + OCL_CHECK(err, err = kernel_conv2d.setArg(arg++, offset_bias)); + OCL_CHECK(err, err = kernel_conv2d.setArg(arg++, offset_kernel)); + OCL_CHECK(err, err = kernel_conv2d.setArg(arg++, offset_data_out)); + + // Update the offset poiter to bias, kernels and output data + offset_bias = offset_bias + CPO; + offset_kernel = offset_kernel + KW * KH * CPO * GI * CPI; + offset_data_out = offset_data_out + H * W; + + //----------------------------- + // printf("Enqueueing NDRange kernel.\n"); + // This event needs to wait for the write buffer operations to complete + // before executing. We are sending the write_events into its wait list to + // ensure that the order of operations is correct. + // Launch the Kernel + std::vector waitList; + waitList.push_back(write_events[0]); + waitList.push_back(write_events[1]); + waitList.push_back(write_events[2]); + OCL_CHECK(err, err = q.enqueueNDRangeKernel(kernel_conv2d, 0, 1, 1, &waitList, &kernel_events[o_iter])); + set_callback(kernel_events[o_iter], "ooo_queue"); + } + // we wait all kernels to have completed + for (int o_iter = 0; o_iter < GO; o_iter++) { + OCL_CHECK(err, err = kernel_events[o_iter].wait()); + } + // timing + struct timeval prof_t2; + gettimeofday(&prof_t2, NULL); + prof_time = ((prof_t2.tv_sec - prof_t1.tv_sec) * 1000000) + (prof_t2.tv_usec - prof_t1.tv_usec); + printf("Timing: %8lld usec\n", prof_time); - std::cout << "Getting Results (Device to Host)..." << std::endl; + // std::cout << "Getting Results (Device to Host)..." << std::endl; std::vector eventList; eventList.push_back(kernel_events[0]); // This operation only needs to wait for the kernel call. @@ -362,22 +509,23 @@ int main(int argc, char **argv) { OCL_CHECK(err, err = q.flush()); OCL_CHECK(err, err = q.finish()); - std::cout << "computing conv in CPU..." 
<< std::endl; - // cpu_print_data_in(); + // cpu_print_data_in(); // cpu_print_kernels(); - // cpu_print_bias(); - // cpu_conv2d(); - // cpu_print_out(); + // cpu_print_bias(); + cpu_conv2d(); + // cpu_print_out(); - // check_result(); + check_result(); //----------------------------- std::cout << "" << std::endl; std::cout << "All done" << std::endl; std::cout << "quit now" << std::endl; + deallocate_buffers(); + // exit return 0; } From 96c9256fafd4398292f936328ce1b11833c7896b Mon Sep 17 00:00:00 2001 From: Jose Flich Date: Mon, 2 Nov 2020 09:04:09 +0000 Subject: [PATCH 08/15] adding profiling info --- src/tensor/nn/tensor_activations.cpp | 1 - src/tensor/tensor_math.cpp | 248 +++++++++++++++++++++++++-- 2 files changed, 234 insertions(+), 15 deletions(-) diff --git a/src/tensor/nn/tensor_activations.cpp b/src/tensor/nn/tensor_activations.cpp index b465a8918..e3cb2c9ef 100644 --- a/src/tensor/nn/tensor_activations.cpp +++ b/src/tensor/nn/tensor_activations.cpp @@ -51,7 +51,6 @@ namespace tensorNN { B->tsem->unlock(); PROFILING_FOOTER(ReLu); - PROFILING_PRINTF(ReLu); } // RELU Derivative, always increment over parent delta diff --git a/src/tensor/tensor_math.cpp b/src/tensor/tensor_math.cpp index ac5f5d343..27711e3fd 100644 --- a/src/tensor/tensor_math.cpp +++ b/src/tensor/tensor_math.cpp @@ -26,8 +26,68 @@ using namespace std; -PROFILING_ENABLE(sum2D_rowwise); +// profiling declarations +PROFILING_ENABLE(maximum); +PROFILING_ENABLE(minimum); +PROFILING_ENABLE(max); +PROFILING_ENABLE(argmax); +PROFILING_ENABLE(argmax_d); +PROFILING_ENABLE(min); +PROFILING_ENABLE(argmin); +PROFILING_ENABLE(sum); +PROFILING_ENABLE(sum_abs); +PROFILING_ENABLE(prod); +PROFILING_ENABLE(mean); +PROFILING_ENABLE(median); +PROFILING_ENABLE(std); +PROFILING_ENABLE(var); +PROFILING_ENABLE(mode); +PROFILING_ENABLE(abs); +PROFILING_ENABLE(acos); +PROFILING_ENABLE(add); +PROFILING_ENABLE(asin); +PROFILING_ENABLE(atan); +PROFILING_ENABLE(cell); +PROFILING_ENABLE(clamp); +PROFILING_ENABLE(clampmax); +PROFILING_ENABLE(clampmin); +PROFILING_ENABLE(cos); +PROFILING_ENABLE(cosh); +PROFILING_ENABLE(div); +PROFILING_ENABLE(exp); +PROFILING_ENABLE(floor); +PROFILING_ENABLE(inv); +PROFILING_ENABLE(log); +PROFILING_ENABLE(log2); +PROFILING_ENABLE(log10); +PROFILING_ENABLE(logn); +PROFILING_ENABLE(mod); +PROFILING_ENABLE(mult); +PROFILING_ENABLE(neg); +PROFILING_ENABLE(normalize); +PROFILING_ENABLE(pow); +PROFILING_ENABLE(powb); +PROFILING_ENABLE(reciprocal); +PROFILING_ENABLE(remainder); +PROFILING_ENABLE(round); +PROFILING_ENABLE(rsqrt); +PROFILING_ENABLE(sigmoid); +PROFILING_ENABLE(sign); +PROFILING_ENABLE(sin); +PROFILING_ENABLE(sinh); +PROFILING_ENABLE(sqr); +PROFILING_ENABLE(sqrt); +PROFILING_ENABLE(sub); +PROFILING_ENABLE(tan); +PROFILING_ENABLE(tanh); +PROFILING_ENABLE(trunc); +PROFILING_ENABLE(inc); +PROFILING_ENABLE(eldiv); PROFILING_ENABLE(mult2D); +PROFILING_ENABLE(el_mult); +PROFILING_ENABLE(sum2D_rowwise); +PROFILING_ENABLE(reduce_sum2D); +PROFILING_ENABLE(sum2D_colwise); // Math operations (Tensor-Tensor, Tensor-float) ************************ @@ -44,6 +104,9 @@ Tensor* Tensor::maximum(Tensor* A, float v){ } void Tensor::maximum(Tensor* A, Tensor* B, float v){ + + PROFILING_HEADER(maximum); + if (A->isCPU() && B->isCPU()){ cpu_maximum(A, B, v); } @@ -59,6 +122,7 @@ void Tensor::maximum(Tensor* A, Tensor* B, float v){ } #endif + PROFILING_FOOTER(maximum); } Tensor* Tensor::maximum(Tensor* A, Tensor* B){ @@ -68,6 +132,9 @@ Tensor* Tensor::maximum(Tensor* A, Tensor* B){ } void Tensor::maximum(Tensor* A, Tensor* B, 
Tensor* C){ + + PROFILING_HEADER(maximum); + if (A->isCPU() && B->isCPU() && C->isCPU()){ cpu_maximum(A, B, C); } @@ -83,6 +150,7 @@ void Tensor::maximum(Tensor* A, Tensor* B, Tensor* C){ } #endif + PROFILING_FOOTER(maximum); } Tensor* Tensor::minimum(float v){ @@ -98,6 +166,9 @@ Tensor* Tensor::minimum(Tensor* A, float v){ } void Tensor::minimum(Tensor* A, Tensor* B, float v){ + + PROFILING_HEADER(minimum); + if (A->isCPU() && B->isCPU()){ cpu_minimum(A, B, v); } @@ -113,6 +184,7 @@ void Tensor::minimum(Tensor* A, Tensor* B, float v){ } #endif + PROFILING_FOOTER(minimum); } Tensor* Tensor::minimum(Tensor* A, Tensor* B){ @@ -122,6 +194,9 @@ Tensor* Tensor::minimum(Tensor* A, Tensor* B){ } void Tensor::minimum(Tensor* A, Tensor* B, Tensor* C){ + + PROFILING_HEADER(minimum); + if (A->isCPU() && B->isCPU() && C->isCPU()){ cpu_minimum(A, B, C); } @@ -137,6 +212,7 @@ void Tensor::minimum(Tensor* A, Tensor* B, Tensor* C){ } #endif + PROFILING_FOOTER(minimum); } @@ -149,6 +225,9 @@ float Tensor::max(){ float Tensor::max(Tensor* A){ + + PROFILING_HEADER(max); + if (A->isCPU()) { return cpu_max(A); } @@ -164,6 +243,9 @@ float Tensor::max(Tensor* A){ return fpga_max(A); } #endif + + PROFILING_FOOTER(max); + return 0.0f; // Never used, this is for the compiler warning } @@ -181,6 +263,9 @@ Tensor* Tensor::max(vector axis, bool keepdims){ } void Tensor::max(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ + + PROFILING_HEADER(max); + if (A->isCPU() && B->isCPU()) { cpu_max(A, B, rd); } @@ -195,6 +280,8 @@ void Tensor::max(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ fpga_max(A, B, rd); } #endif + + PROFILING_FOOTER(max); } @@ -205,21 +292,29 @@ int Tensor::argmax(){ int Tensor::argmax(Tensor* A){ + + PROFILING_HEADER(argmax); + if (A->isCPU()) { + PROFILING_FOOTER(argmax); return cpu_argmax(A); } #ifdef cGPU else if (A->isGPU()) { + PROFILING_FOOTER(argmax); return gpu_argmax(A); } #endif #ifdef cFPGA else { - fpga_argmax(A); + PROFILING_FOOTER(argmax); + return fpga_argmax(A); } #endif + PROFILING_FOOTER(argmax); + msg("Invalid device", "Tensor::argmax"); return 0.0f; // Never used, this is for the compiler warning } @@ -238,6 +333,9 @@ Tensor* Tensor::argmax(vector axis, bool keepdims){ } void Tensor::argmax(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ + + PROFILING_HEADER(argmax); + if (A->isCPU() && B->isCPU()) { cpu_argmax(A, B, rd); } @@ -252,9 +350,14 @@ void Tensor::argmax(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ fpga_argmax(A, B, rd); } #endif + + PROFILING_FOOTER(argmax); } void Tensor::argmax_d(Tensor *D, Tensor *O, Tensor *PD){ + + PROFILING_HEADER(argmax_d); + if (D->isCPU() && O->isCPU() && PD->isCPU()) { cpu_argmax_d(D, O, PD); } @@ -269,6 +372,8 @@ void Tensor::argmax_d(Tensor *D, Tensor *O, Tensor *PD){ //fpga_argmax_d(D, O, PD); } #endif + + PROFILING_FOOTER(argmax_d); } float Tensor::min(){ @@ -277,21 +382,29 @@ float Tensor::min(){ float Tensor::min(Tensor* A){ + + PROFILING_HEADER(min); + if (A->isCPU()) { + PROFILING_FOOTER(min); return cpu_min(A); } #ifdef cGPU else if (A->isGPU()) { - return gpu_min(A); + PROFILING_FOOTER(min); + return gpu_min(A); } #endif #ifdef cFPGA else { - fpga_min(A); + PROFILING_FOOTER(min); + return fpga_min(A); } #endif + PROFILING_FOOTER(min); + msg("Invalid device", "Tensor::min"); return 0.0f; // Never used, this is for the compiler warning } @@ -310,6 +423,9 @@ Tensor* Tensor::min(vector axis, bool keepdims){ } void Tensor::min(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ + + PROFILING_HEADER(min); + if (A->isCPU() && B->isCPU()) { cpu_min(A, 
B, rd); } @@ -324,6 +440,8 @@ void Tensor::min(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ fpga_min(A, B, rd); } #endif + + PROFILING_FOOTER(min); } @@ -333,21 +451,29 @@ int Tensor::argmin(){ int Tensor::argmin(Tensor* A){ + + PROFILING_HEADER(argmin); + if (A->isCPU()) { + PROFILING_FOOTER(argmin); return cpu_argmin(A); } #ifdef cGPU else if (A->isGPU()) { + PROFILING_FOOTER(argmin); return gpu_argmin(A); } #endif #ifdef cFPGA else { - fpga_argmin(A); + PROFILING_FOOTER(argmin); + return fpga_argmin(A); } #endif + PROFILING_FOOTER(argmin); + msg("Invalid device", "Tensor::argmax"); return 0.0f; // Never used, this is for the compiler warning } @@ -366,6 +492,9 @@ Tensor* Tensor::argmin(vector axis, bool keepdims){ } void Tensor::argmin(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ + + PROFILING_HEADER(argmin); + if (A->isCPU() && B->isCPU()) { cpu_argmin(A, B, rd); } @@ -381,6 +510,8 @@ void Tensor::argmin(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ fpga_argmin(A, B, rd); } #endif + + PROFILING_FOOTER(argmin); } @@ -390,21 +521,31 @@ float Tensor::sum(){ float Tensor::sum(Tensor* A){ + + PROFILING_HEADER(sum); + if (A->isCPU()) { + PROFILING_FOOTER(sum); return cpu_sum(A); } #ifdef cGPU else if (A->isGPU()) { + + PROFILING_FOOTER(sum); return gpu_sum(A); } #endif #ifdef cFPGA else { + + PROFILING_FOOTER(sum); return fpga_sum(A); } #endif + PROFILING_FOOTER(sum); + msg("Invalid device", "Tensor::sum"); return 0.0f; // Never used, this is for the compiler warning } @@ -423,6 +564,9 @@ Tensor* Tensor::sum(vector axis, bool keepdims){ } void Tensor::sum(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ + + PROFILING_HEADER(sum); + if (A->isCPU() && B->isCPU()) { cpu_sum(A, B, rd); } @@ -437,6 +581,8 @@ void Tensor::sum(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ fpga_sum(A, B, rd); } #endif + + PROFILING_FOOTER(sum); } float Tensor::sum_abs(){ @@ -445,21 +591,30 @@ float Tensor::sum_abs(){ float Tensor::sum_abs(Tensor* A){ + + PROFILING_HEADER(sum_abs); + if (A->isCPU()) { - return cpu_sum_abs(A); + + PROFILING_FOOTER(sum_abs); + return cpu_sum_abs(A); } #ifdef cGPU else if (A->isGPU()) { + PROFILING_FOOTER(sum_abs); return gpu_sum_abs(A); } #endif #ifdef cFPGA else { + PROFILING_FOOTER(sum_abs); return fpga_sum_abs(A); } #endif + PROFILING_FOOTER(sum_abs); + msg("Invalid device", "Tensor::sum_abs"); return 0.0f; // Never used, this is for the compiler warning } @@ -479,6 +634,9 @@ Tensor* Tensor::sum_abs(vector axis, bool keepdims){ } void Tensor::sum_abs(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ + + PROFILING_HEADER(sum_abs); + if (A->isCPU() && B->isCPU()) { cpu_sum_abs(A, B, rd); } @@ -493,6 +651,8 @@ void Tensor::sum_abs(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ fpga_sum_abs(A, B, rd); } #endif + + PROFILING_FOOTER(sum_abs); } float Tensor::prod(){ @@ -501,21 +661,29 @@ float Tensor::prod(){ float Tensor::prod(Tensor* A){ // AKA factorial + + PROFILING_HEADER(prod); + if (A->isCPU()) { + PROFILING_FOOTER(prod); return cpu_prod(A); } #ifdef cGPU else if (A->isGPU()) { + PROFILING_FOOTER(prod); return gpu_prod(A); } #endif #ifdef cFPGA else { - fpga_prod(A); + PROFILING_FOOTER(prod); + return fpga_prod(A); } #endif + PROFILING_FOOTER(prod); + msg("Invalid device", "Tensor::prod"); return 0.0f; // Never used, this is for the compiler warning } @@ -535,6 +703,9 @@ Tensor* Tensor::prod(vector axis, bool keepdims){ } void Tensor::prod(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ + + PROFILING_HEADER(prod); + if (A->isCPU() && B->isCPU()) { cpu_prod(A, B, rd); } @@ -549,6 +720,8 @@ 
void Tensor::prod(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ fpga_prod(A, B, rd); } #endif + + PROFILING_FOOTER(prod); } @@ -575,6 +748,9 @@ Tensor* Tensor::mean(vector axis, bool keepdims){ } void Tensor::mean(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ + + PROFILING_HEADER(mean); + if (A->isCPU() && B->isCPU()) { cpu_mean(A, B, rd); } @@ -589,6 +765,8 @@ void Tensor::mean(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ fpga_mean(A, B, rd); } #endif + + PROFILING_FOOTER(mean); } @@ -599,6 +777,9 @@ float Tensor::median(){ float Tensor::median(Tensor* A){ + + PROFILING_HEADER(median); + float res = 0.0f; // Clone tensor (needs to be sorted first) @@ -619,6 +800,8 @@ float Tensor::median(Tensor* A){ } #endif + PROFILING_FOOTER(median); + delete tmp; return res; } @@ -637,6 +820,9 @@ Tensor* Tensor::median(vector axis, bool keepdims){ } void Tensor::median(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ + + PROFILING_HEADER(median); + if (A->isCPU() && B->isCPU()) { cpu_median(A, B, rd); } @@ -651,10 +837,9 @@ void Tensor::median(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ fpga_median(A, B, rd); } #endif -} - - + PROFILING_FOOTER(median); +} float Tensor::std(bool unbiased){ return Tensor::std(this, unbiased); @@ -662,21 +847,29 @@ float Tensor::std(bool unbiased){ float Tensor::std(Tensor* A, bool unbiased){ + + PROFILING_HEADER(std); + if (A->isCPU()) { + PROFILING_FOOTER(std); return cpu_std(A, unbiased); } #ifdef cGPU else if (A->isGPU()) { + PROFILING_FOOTER(std); return gpu_std(A, unbiased); } #endif #ifdef cFPGA else { + PROFILING_FOOTER(std); fpga_std(A, unbiased); } #endif + PROFILING_FOOTER(std); + msg("Invalid device", "Tensor::std"); return 0.0f; // Never used, this is for the compiler warning } @@ -696,6 +889,9 @@ Tensor* Tensor::std(vector axis, bool keepdims, bool unbiased){ } void Tensor::std(Tensor* A, Tensor *B, ReduceDescriptor2 *rd, bool unbiased){ + + PROFILING_HEADER(std); + if (A->isCPU() && B->isCPU()) { cpu_std(A, B, rd, unbiased); } @@ -710,6 +906,8 @@ void Tensor::std(Tensor* A, Tensor *B, ReduceDescriptor2 *rd, bool unbiased){ fpga_std(A, B, rd, unbiased); } #endif + + PROFILING_FOOTER(std); } @@ -719,22 +917,33 @@ float Tensor::var(bool unbiased){ float Tensor::var(Tensor* A, bool unbiased){ + + PROFILING_HEADER(var); + if (A->isCPU()) { + + PROFILING_FOOTER(var); return cpu_var(A, unbiased); } #ifdef cGPU else if (A->isGPU()) { + + PROFILING_FOOTER(var); return gpu_var(A, unbiased); } #endif #ifdef cFPGA else if (A->isFPGA()) { - return fpga_var(A, unbiased); + + PROFILING_FOOTER(var); + return fpga_var(A, unbiased); } #endif + PROFILING_FOOTER(var); + msg("Invalid device", "Tensor::var"); return 0.0f; // Never used, this is for the compiler warning } @@ -754,6 +963,9 @@ Tensor* Tensor::var(vector axis, bool keepdims, bool unbiased){ } void Tensor::var(Tensor* A, Tensor *B, ReduceDescriptor2 *rd, bool unbiased){ + + PROFILING_HEADER(var); + if (A->isCPU() && B->isCPU()) { cpu_var(A, B, rd, unbiased); } @@ -769,6 +981,8 @@ void Tensor::var(Tensor* A, Tensor *B, ReduceDescriptor2 *rd, bool unbiased){ fpga_var(A, B, rd, unbiased); } #endif + + PROFILING_FOOTER(var); } @@ -778,22 +992,30 @@ int Tensor::mode(){ int Tensor::mode(Tensor* A){ + + PROFILING_HEADER(mode); + if (A->isCPU()) { + PROFILING_FOOTER(mode); return cpu_mode(A); } #ifdef cGPU else if (A->isGPU()) { + PROFILING_FOOTER(mode); return gpu_mode(A); } #endif #ifdef cFPGA else if (A->isFPGA()) { + PROFILING_FOOTER(mode); return fpga_mode(A); } #endif + PROFILING_FOOTER(mode); + msg("Invalid device", 
"Tensor::mode"); return 0; // Never used, this is for the compiler warning } @@ -2289,7 +2511,7 @@ void Tensor::mult2D(Tensor *A, int tA, Tensor *B, int tB, Tensor *C, int incC) { //// Dimensions and types must be compatible //// Only for 2D Tensors /////////////////////////////////////// - + PROFILING_HEADER_EXTERN(mult2D); if ((A->device != B->device) || (A->device != C->device)) {A->info();B->info();C->info();msg("Tensors in different devices", "Tensor::mult2D");} @@ -2330,7 +2552,6 @@ void Tensor::mult2D(Tensor *A, int tA, Tensor *B, int tB, Tensor *C, int incC) { C->tsem->unlock(); PROFILING_FOOTER(mult2D); - PROFILING_PRINTF(mult2D); } @@ -2401,7 +2622,6 @@ void Tensor::sum2D_rowwise(Tensor *A, Tensor *B, Tensor *C) { C->tsem->unlock(); PROFILING_FOOTER(sum2D_rowwise); - PROFILING_PRINTF(sum2D_rowwise); } From e74ee6d138d2de845233186aa733e861630a72b2 Mon Sep 17 00:00:00 2001 From: Jose Flich Date: Mon, 2 Nov 2020 13:30:41 +0100 Subject: [PATCH 09/15] - Update from UPV-GAP - CONV2D kernel optimized and finalized - Profiling added to arithmetic tensor operations - FPGA support for pipeline use case (skin lession classification) --- include/eddl/apis/eddl.h | 9 +- include/eddl/utils.h | 2 + src/apis/eddl.cpp | 6 + src/net/net_api.cpp | 10 -- src/tensor/tensor_math.cpp | 322 ++++++++++++++++++++++++++----------- src/utils.cpp | 135 ++++++++++++++++ 6 files changed, 376 insertions(+), 108 deletions(-) diff --git a/include/eddl/apis/eddl.h b/include/eddl/apis/eddl.h index be91e4b90..df94710e4 100644 --- a/include/eddl/apis/eddl.h +++ b/include/eddl/apis/eddl.h @@ -677,7 +677,9 @@ namespace eddl { * @param l Layer to detach * @return Detached Layer */ - layer detach(layer l);/** + layer detach(layer l); + + /** * @brief Sets the provided layers as detached, excluding them from the computation of the gradients. * * @param l Layers to detach @@ -685,6 +687,11 @@ namespace eddl { */ vlayer detach(vlayer l); + /** + * @brief Shows profile information. 
+ */ + void show_profile(); + /////////////////////////////////////// // LAYERS diff --git a/include/eddl/utils.h b/include/eddl/utils.h index de0917590..2ddd7d143 100755 --- a/include/eddl/utils.h +++ b/include/eddl/utils.h @@ -59,4 +59,6 @@ string printVector(vector myvector){ enum WrappingMode {Constant=0, Reflect=1, Nearest=2, Mirror=3, Wrap=4, Original=5}; WrappingMode getWrappingMode(string mode); +void __show_profile(); + #endif //EDDL_UTILS_H diff --git a/src/apis/eddl.cpp b/src/apis/eddl.cpp index 9941fffbb..ebdb394cd 100644 --- a/src/apis/eddl.cpp +++ b/src/apis/eddl.cpp @@ -15,6 +15,7 @@ #include #include "eddl/apis/eddl.h" +#include "eddl/utils.h" using namespace std; @@ -295,6 +296,11 @@ namespace eddl { net->train_batch(in, out, indices,1); } + void show_profile() { + printf("profile:\n"); + __show_profile(); + } + void next_batch(vector in,vector out) { int i,n; diff --git a/src/net/net_api.cpp b/src/net/net_api.cpp index 9dc563ac7..d5a7bfd14 100644 --- a/src/net/net_api.cpp +++ b/src/net/net_api.cpp @@ -143,9 +143,6 @@ void *update_t(void *t) { } ///////////////////////////////////////// - - - ///////////////////////////////////////// // "a ring to rule them all" void Net::run_snets(void *(*F)(void *t)) @@ -1113,11 +1110,4 @@ vtensor Net::predict(vtensor tin) { } - - - - - - - ////// diff --git a/src/tensor/tensor_math.cpp b/src/tensor/tensor_math.cpp index 27711e3fd..257ba5e43 100644 --- a/src/tensor/tensor_math.cpp +++ b/src/tensor/tensor_math.cpp @@ -26,69 +26,6 @@ using namespace std; -// profiling declarations -PROFILING_ENABLE(maximum); -PROFILING_ENABLE(minimum); -PROFILING_ENABLE(max); -PROFILING_ENABLE(argmax); -PROFILING_ENABLE(argmax_d); -PROFILING_ENABLE(min); -PROFILING_ENABLE(argmin); -PROFILING_ENABLE(sum); -PROFILING_ENABLE(sum_abs); -PROFILING_ENABLE(prod); -PROFILING_ENABLE(mean); -PROFILING_ENABLE(median); -PROFILING_ENABLE(std); -PROFILING_ENABLE(var); -PROFILING_ENABLE(mode); -PROFILING_ENABLE(abs); -PROFILING_ENABLE(acos); -PROFILING_ENABLE(add); -PROFILING_ENABLE(asin); -PROFILING_ENABLE(atan); -PROFILING_ENABLE(cell); -PROFILING_ENABLE(clamp); -PROFILING_ENABLE(clampmax); -PROFILING_ENABLE(clampmin); -PROFILING_ENABLE(cos); -PROFILING_ENABLE(cosh); -PROFILING_ENABLE(div); -PROFILING_ENABLE(exp); -PROFILING_ENABLE(floor); -PROFILING_ENABLE(inv); -PROFILING_ENABLE(log); -PROFILING_ENABLE(log2); -PROFILING_ENABLE(log10); -PROFILING_ENABLE(logn); -PROFILING_ENABLE(mod); -PROFILING_ENABLE(mult); -PROFILING_ENABLE(neg); -PROFILING_ENABLE(normalize); -PROFILING_ENABLE(pow); -PROFILING_ENABLE(powb); -PROFILING_ENABLE(reciprocal); -PROFILING_ENABLE(remainder); -PROFILING_ENABLE(round); -PROFILING_ENABLE(rsqrt); -PROFILING_ENABLE(sigmoid); -PROFILING_ENABLE(sign); -PROFILING_ENABLE(sin); -PROFILING_ENABLE(sinh); -PROFILING_ENABLE(sqr); -PROFILING_ENABLE(sqrt); -PROFILING_ENABLE(sub); -PROFILING_ENABLE(tan); -PROFILING_ENABLE(tanh); -PROFILING_ENABLE(trunc); -PROFILING_ENABLE(inc); -PROFILING_ENABLE(eldiv); -PROFILING_ENABLE(mult2D); -PROFILING_ENABLE(el_mult); -PROFILING_ENABLE(sum2D_rowwise); -PROFILING_ENABLE(reduce_sum2D); -PROFILING_ENABLE(sum2D_colwise); - // Math operations (Tensor-Tensor, Tensor-float) ************************ Tensor* Tensor::maximum(float v){ @@ -105,7 +42,7 @@ Tensor* Tensor::maximum(Tensor* A, float v){ void Tensor::maximum(Tensor* A, Tensor* B, float v){ - PROFILING_HEADER(maximum); + PROFILING_HEADER_EXTERN(maximum); if (A->isCPU() && B->isCPU()){ cpu_maximum(A, B, v); @@ -133,7 +70,7 @@ Tensor* Tensor::maximum(Tensor* A, Tensor* 
B){ void Tensor::maximum(Tensor* A, Tensor* B, Tensor* C){ - PROFILING_HEADER(maximum); + PROFILING_HEADER_EXTERN(maximum); if (A->isCPU() && B->isCPU() && C->isCPU()){ cpu_maximum(A, B, C); @@ -167,7 +104,7 @@ Tensor* Tensor::minimum(Tensor* A, float v){ void Tensor::minimum(Tensor* A, Tensor* B, float v){ - PROFILING_HEADER(minimum); + PROFILING_HEADER_EXTERN(minimum); if (A->isCPU() && B->isCPU()){ cpu_minimum(A, B, v); @@ -195,7 +132,7 @@ Tensor* Tensor::minimum(Tensor* A, Tensor* B){ void Tensor::minimum(Tensor* A, Tensor* B, Tensor* C){ - PROFILING_HEADER(minimum); + PROFILING_HEADER_EXTERN(minimum); if (A->isCPU() && B->isCPU() && C->isCPU()){ cpu_minimum(A, B, C); @@ -226,7 +163,7 @@ float Tensor::max(){ float Tensor::max(Tensor* A){ - PROFILING_HEADER(max); + PROFILING_HEADER_EXTERN(max); if (A->isCPU()) { return cpu_max(A); @@ -264,7 +201,7 @@ Tensor* Tensor::max(vector axis, bool keepdims){ void Tensor::max(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ - PROFILING_HEADER(max); + PROFILING_HEADER_EXTERN(max); if (A->isCPU() && B->isCPU()) { cpu_max(A, B, rd); @@ -293,7 +230,7 @@ int Tensor::argmax(){ int Tensor::argmax(Tensor* A){ - PROFILING_HEADER(argmax); + PROFILING_HEADER_EXTERN(argmax); if (A->isCPU()) { PROFILING_FOOTER(argmax); @@ -334,7 +271,7 @@ Tensor* Tensor::argmax(vector axis, bool keepdims){ void Tensor::argmax(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ - PROFILING_HEADER(argmax); + PROFILING_HEADER_EXTERN(argmax); if (A->isCPU() && B->isCPU()) { cpu_argmax(A, B, rd); @@ -356,7 +293,7 @@ void Tensor::argmax(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ void Tensor::argmax_d(Tensor *D, Tensor *O, Tensor *PD){ - PROFILING_HEADER(argmax_d); + PROFILING_HEADER_EXTERN(argmax_d); if (D->isCPU() && O->isCPU() && PD->isCPU()) { cpu_argmax_d(D, O, PD); @@ -383,7 +320,7 @@ float Tensor::min(){ float Tensor::min(Tensor* A){ - PROFILING_HEADER(min); + PROFILING_HEADER_EXTERN(min); if (A->isCPU()) { PROFILING_FOOTER(min); @@ -424,7 +361,7 @@ Tensor* Tensor::min(vector axis, bool keepdims){ void Tensor::min(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ - PROFILING_HEADER(min); + PROFILING_HEADER_EXTERN(min); if (A->isCPU() && B->isCPU()) { cpu_min(A, B, rd); @@ -452,7 +389,7 @@ int Tensor::argmin(){ int Tensor::argmin(Tensor* A){ - PROFILING_HEADER(argmin); + PROFILING_HEADER_EXTERN(argmin); if (A->isCPU()) { PROFILING_FOOTER(argmin); @@ -493,7 +430,7 @@ Tensor* Tensor::argmin(vector axis, bool keepdims){ void Tensor::argmin(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ - PROFILING_HEADER(argmin); + PROFILING_HEADER_EXTERN(argmin); if (A->isCPU() && B->isCPU()) { cpu_argmin(A, B, rd); @@ -522,7 +459,7 @@ float Tensor::sum(){ float Tensor::sum(Tensor* A){ - PROFILING_HEADER(sum); + PROFILING_HEADER_EXTERN(sum); if (A->isCPU()) { PROFILING_FOOTER(sum); @@ -565,7 +502,7 @@ Tensor* Tensor::sum(vector axis, bool keepdims){ void Tensor::sum(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ - PROFILING_HEADER(sum); + PROFILING_HEADER_EXTERN(sum); if (A->isCPU() && B->isCPU()) { cpu_sum(A, B, rd); @@ -592,7 +529,7 @@ float Tensor::sum_abs(){ float Tensor::sum_abs(Tensor* A){ - PROFILING_HEADER(sum_abs); + PROFILING_HEADER_EXTERN(sum_abs); if (A->isCPU()) { @@ -635,7 +572,7 @@ Tensor* Tensor::sum_abs(vector axis, bool keepdims){ void Tensor::sum_abs(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ - PROFILING_HEADER(sum_abs); + PROFILING_HEADER_EXTERN(sum_abs); if (A->isCPU() && B->isCPU()) { cpu_sum_abs(A, B, rd); @@ -662,7 +599,7 @@ float Tensor::prod(){ float Tensor::prod(Tensor* A){ // 
AKA factorial - PROFILING_HEADER(prod); + PROFILING_HEADER_EXTERN(prod); if (A->isCPU()) { PROFILING_FOOTER(prod); @@ -704,7 +641,7 @@ Tensor* Tensor::prod(vector axis, bool keepdims){ void Tensor::prod(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ - PROFILING_HEADER(prod); + PROFILING_HEADER_EXTERN(prod); if (A->isCPU() && B->isCPU()) { cpu_prod(A, B, rd); @@ -749,7 +686,7 @@ Tensor* Tensor::mean(vector axis, bool keepdims){ void Tensor::mean(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ - PROFILING_HEADER(mean); + PROFILING_HEADER_EXTERN(mean); if (A->isCPU() && B->isCPU()) { cpu_mean(A, B, rd); @@ -778,7 +715,7 @@ float Tensor::median(){ float Tensor::median(Tensor* A){ - PROFILING_HEADER(median); + PROFILING_HEADER_EXTERN(median); float res = 0.0f; @@ -821,7 +758,7 @@ Tensor* Tensor::median(vector axis, bool keepdims){ void Tensor::median(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ - PROFILING_HEADER(median); + PROFILING_HEADER_EXTERN(median); if (A->isCPU() && B->isCPU()) { cpu_median(A, B, rd); @@ -848,7 +785,7 @@ float Tensor::std(bool unbiased){ float Tensor::std(Tensor* A, bool unbiased){ - PROFILING_HEADER(std); + PROFILING_HEADER_EXTERN(std); if (A->isCPU()) { PROFILING_FOOTER(std); @@ -890,7 +827,7 @@ Tensor* Tensor::std(vector axis, bool keepdims, bool unbiased){ void Tensor::std(Tensor* A, Tensor *B, ReduceDescriptor2 *rd, bool unbiased){ - PROFILING_HEADER(std); + PROFILING_HEADER_EXTERN(std); if (A->isCPU() && B->isCPU()) { cpu_std(A, B, rd, unbiased); @@ -918,7 +855,7 @@ float Tensor::var(bool unbiased){ float Tensor::var(Tensor* A, bool unbiased){ - PROFILING_HEADER(var); + PROFILING_HEADER_EXTERN(var); if (A->isCPU()) { @@ -964,7 +901,7 @@ Tensor* Tensor::var(vector axis, bool keepdims, bool unbiased){ void Tensor::var(Tensor* A, Tensor *B, ReduceDescriptor2 *rd, bool unbiased){ - PROFILING_HEADER(var); + PROFILING_HEADER_EXTERN(var); if (A->isCPU() && B->isCPU()) { cpu_var(A, B, rd, unbiased); @@ -993,23 +930,23 @@ int Tensor::mode(){ int Tensor::mode(Tensor* A){ - PROFILING_HEADER(mode); + PROFILING_HEADER_EXTERN(mode); if (A->isCPU()) { - PROFILING_FOOTER(mode); + PROFILING_FOOTER(mode); return cpu_mode(A); } #ifdef cGPU else if (A->isGPU()) { - PROFILING_FOOTER(mode); + PROFILING_FOOTER(mode); return gpu_mode(A); } #endif #ifdef cFPGA else if (A->isFPGA()) { - PROFILING_FOOTER(mode); + PROFILING_FOOTER(mode); return fpga_mode(A); } #endif @@ -1035,6 +972,9 @@ Tensor* Tensor::mode(vector axis, bool keepdims){ } void Tensor::mode(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ + + PROFILING_HEADER_EXTERN(mode); + if (A->isCPU() && B->isCPU()) { cpu_mode(A, B, rd); } @@ -1050,6 +990,8 @@ void Tensor::mode(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ fpga_mode(A, B, rd); } #endif + + PROFILING_FOOTER(mode); } @@ -1064,6 +1006,9 @@ Tensor* Tensor::abs(){ } void Tensor::abs(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(abs); + if (A->isCPU() && B->isCPU()) { cpu_abs(A, B); } @@ -1079,6 +1024,8 @@ void Tensor::abs(Tensor *A, Tensor *B){ fpga_abs(A, B); } #endif + + PROFILING_FOOTER(abs); } @@ -1093,6 +1040,9 @@ Tensor* Tensor::acos(){ } void Tensor::acos(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(acos); + if (A->isCPU() && B->isCPU()) { cpu_acos(A, B); } @@ -1108,6 +1058,8 @@ void Tensor::acos(Tensor *A, Tensor *B){ fpga_acos(A, B); } #endif + + PROFILING_FOOTER(acos); } @@ -1132,6 +1084,8 @@ Tensor* Tensor::add(Tensor* A){ } void Tensor::add(Tensor *A, Tensor *B, float v){ + PROFILING_HEADER_EXTERN(add); + if (A->isCPU() && B->isCPU()) { cpu_add(A, B, v); } 
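The pattern applied across these tensor_math.cpp hunks is uniform: the per-operation counters are no longer defined in this file, and every dispatch function brackets its device-specific branch with PROFILING_HEADER_EXTERN(op) at entry and PROFILING_FOOTER(op) at exit, with the counters themselves defined once in src/utils.cpp. The sketch below shows how those pieces are meant to fit together; PROFILING_ENABLE and PROFILING_HEADER_EXTERN follow the definitions this patch adds to include/eddl/profiling.h, while the PROFILING_FOOTER and PROFILING_PRINTF bodies are not part of this excerpt, so the versions written here (microsecond accumulation plus a call counter) are illustrative assumptions only, not the library's definitions.

    // Sketch only. The footer/printf bodies below are assumptions for illustration.
    #include <sys/time.h>
    #include <cstdio>

    #define PROFILING_ENABLE(fn) \
      unsigned long long prof_##fn##_time; \
      unsigned long long prof_##fn##_calls;

    #define PROFILING_HEADER_EXTERN(fn) \
      extern unsigned long long prof_##fn##_time; \
      extern unsigned long long prof_##fn##_calls; \
      struct timeval prof_t1; \
      gettimeofday(&prof_t1, NULL);

    // Assumed footer: accumulate elapsed microseconds and bump the call count.
    #define PROFILING_FOOTER(fn) \
      { struct timeval prof_t2; \
        gettimeofday(&prof_t2, NULL); \
        prof_##fn##_time += (prof_t2.tv_sec - prof_t1.tv_sec) * 1000000ULL \
                          + (prof_t2.tv_usec - prof_t1.tv_usec); \
        prof_##fn##_calls++; }

    // Assumed report helper, one line per operation.
    #define PROFILING_PRINTF(fn) \
      printf("%-16s calls %10llu time %12llu us\n", #fn, prof_##fn##_calls, prof_##fn##_time);

    // Counters are defined once (in the library this lives in src/utils.cpp):
    PROFILING_ENABLE(maximum);

    // Every dispatcher then references the shared counters:
    void maximum_dispatch_example() {
      PROFILING_HEADER_EXTERN(maximum);
      // ... cpu_maximum / gpu_maximum / fpga_maximum branch would run here ...
      PROFILING_FOOTER(maximum);
    }

    int main() {
      maximum_dispatch_example();
      PROFILING_PRINTF(maximum);   // what __show_profile() does for each operation
      return 0;
    }

At the API level, eddl::show_profile() simply prints "profile:" and forwards to __show_profile(), which runs PROFILING_PRINTF over every registered operation, so an application can call it once after training to dump the accumulated per-operation call counts and times.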
@@ -1147,6 +1101,8 @@ void Tensor::add(Tensor *A, Tensor *B, float v){ fpga_add(A, B, v); } #endif + + PROFILING_FOOTER(add); } @@ -1162,6 +1118,9 @@ Tensor* Tensor::asin(){ } void Tensor::asin(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(asin); + if (A->isCPU() && B->isCPU()) { cpu_asin(A, B); } @@ -1177,6 +1136,8 @@ void Tensor::asin(Tensor *A, Tensor *B){ fpga_asin(A, B); } #endif + + PROFILING_FOOTER(asin); } @@ -1193,6 +1154,9 @@ Tensor* Tensor::atan(){ void Tensor::atan(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(atan); + if (A->isCPU() && B->isCPU()) { cpu_atan(A, B); } @@ -1208,6 +1172,8 @@ void Tensor::atan(Tensor *A, Tensor *B){ fpga_atan(A, B); } #endif + + PROFILING_FOOTER(atan); } @@ -1224,6 +1190,9 @@ Tensor* Tensor::ceil(){ void Tensor::ceil(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(ceil); + if (A->isCPU() && B->isCPU()) { cpu_ceil(A, B); } @@ -1239,6 +1208,8 @@ void Tensor::ceil(Tensor *A, Tensor *B){ fpga_ceil(A, B); } #endif + + PROFILING_FOOTER(ceil); } @@ -1255,6 +1226,9 @@ Tensor* Tensor::clamp(float min, float max){ void Tensor::clamp(Tensor *A, Tensor *B, float min, float max){ + + PROFILING_HEADER_EXTERN(clamp); + if (A->isCPU() && B->isCPU()) { cpu_clamp(A, B, min, max); } @@ -1270,6 +1244,8 @@ void Tensor::clamp(Tensor *A, Tensor *B, float min, float max){ fpga_clamp(A, B, min, max); } #endif + + PROFILING_FOOTER(clamp); } @@ -1319,6 +1295,9 @@ Tensor* Tensor::cos(){ void Tensor::cos(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(cos); + if (A->isCPU() && B->isCPU()) { cpu_cos(A, B); } @@ -1334,6 +1313,8 @@ void Tensor::cos(Tensor *A, Tensor *B){ fpga_cos(A, B); } #endif + + PROFILING_FOOTER(cos); } @@ -1349,6 +1330,9 @@ Tensor* Tensor::cosh(){ } void Tensor::cosh(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(cosh); + if (A->isCPU() && B->isCPU()) { cpu_cosh(A, B); } @@ -1364,6 +1348,8 @@ void Tensor::cosh(Tensor *A, Tensor *B){ fpga_cosh(A, B); } #endif + + PROFILING_FOOTER(cosh); } @@ -1409,6 +1395,9 @@ Tensor* Tensor::exp(){ void Tensor::exp(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(exp); + if (A->isCPU() && B->isCPU()) { cpu_exp(A, B); } @@ -1424,6 +1413,8 @@ void Tensor::exp(Tensor *A, Tensor *B){ fpga_exp(A, B); } #endif + + PROFILING_FOOTER(exp); } @@ -1440,6 +1431,9 @@ Tensor* Tensor::floor(){ void Tensor::floor(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(floor); + if (A->isCPU() && B->isCPU()) { cpu_floor(A, B); } @@ -1455,6 +1449,8 @@ void Tensor::floor(Tensor *A, Tensor *B){ fpga_floor(A, B); } #endif + + PROFILING_FOOTER(floor); } @@ -1471,6 +1467,9 @@ Tensor* Tensor::inv(float v){ void Tensor::inv(Tensor *A, Tensor *B, float v){ + + PROFILING_HEADER_EXTERN(inv); + if (A->isCPU() && B->isCPU()) { cpu_inv(A, B, v); } @@ -1486,6 +1485,8 @@ void Tensor::inv(Tensor *A, Tensor *B, float v){ fpga_inv(A, B, v); } #endif + + PROFILING_FOOTER(inv); } @@ -1502,6 +1503,9 @@ Tensor* Tensor::log(){ void Tensor::log(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(log); + if (A->isCPU() && B->isCPU()) { cpu_log(A, B); } @@ -1517,6 +1521,8 @@ void Tensor::log(Tensor *A, Tensor *B){ fpga_log(A, B); } #endif + + PROFILING_FOOTER(log); } @@ -1533,6 +1539,9 @@ Tensor* Tensor::log2(){ void Tensor::log2(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(log2); + if (A->isCPU() && B->isCPU()) { cpu_log2(A, B); } @@ -1548,6 +1557,8 @@ void Tensor::log2(Tensor *A, Tensor *B){ fpga_log2(A, B); } #endif + + PROFILING_FOOTER(log2); } @@ -1564,6 +1575,9 @@ Tensor* Tensor::log10(){ void Tensor::log10(Tensor *A, Tensor *B){ + + 
PROFILING_HEADER_EXTERN(log10); + if (A->isCPU() && B->isCPU()) { cpu_log10(A, B); } @@ -1579,6 +1593,8 @@ void Tensor::log10(Tensor *A, Tensor *B){ fpga_log10(A, B); } #endif + + PROFILING_FOOTER(log10); } @@ -1595,6 +1611,9 @@ Tensor* Tensor::logn(float n){ void Tensor::logn(Tensor *A, Tensor *B, float n){ + + PROFILING_HEADER_EXTERN(logn); + if (A->isCPU() && B->isCPU()) { cpu_logn(A, B, n); } @@ -1610,6 +1629,8 @@ void Tensor::logn(Tensor *A, Tensor *B, float n){ fpga_logn(A, B, n); } #endif + + PROFILING_FOOTER(logn); } @@ -1626,6 +1647,9 @@ Tensor* Tensor::mod(float v){ void Tensor::mod(Tensor *A, Tensor *B, float v){ + + PROFILING_HEADER_EXTERN(mod); + if (A->isCPU() && B->isCPU()) { cpu_mod(A, B, v); } @@ -1641,6 +1665,8 @@ void Tensor::mod(Tensor *A, Tensor *B, float v){ fpga_mod(A, B, v); } #endif + + PROFILING_FOOTER(mod); } @@ -1669,6 +1695,9 @@ Tensor* Tensor::mult(Tensor* A){ void Tensor::mult(Tensor *A, Tensor *B, float v){ + + PROFILING_HEADER_EXTERN(mult); + if (A->isCPU() && B->isCPU()) { cpu_mult(A, B, v); } @@ -1684,6 +1713,8 @@ void Tensor::mult(Tensor *A, Tensor *B, float v){ fpga_mult(A, B, v); } #endif + + PROFILING_FOOTER(mult); } @@ -1717,6 +1748,9 @@ Tensor* Tensor::normalize(float min, float max){ void Tensor::normalize(Tensor *A, Tensor *B, float min, float max){ + + PROFILING_HEADER_EXTERN(normalize); + if (A->isCPU() && B->isCPU()) { cpu_normalize(A, B, min, max); } @@ -1732,6 +1766,8 @@ void Tensor::normalize(Tensor *A, Tensor *B, float min, float max){ fpga_normalize(A, B, min, max); } #endif + + PROFILING_FOOTER(normalize); } @@ -1748,6 +1784,9 @@ Tensor* Tensor::pow(float exp){ void Tensor::pow(Tensor *A, Tensor *B, float exp){ + + PROFILING_HEADER_EXTERN(pow); + if (A->isCPU() && B->isCPU()) { cpu_pow(A, B, exp); } @@ -1763,6 +1802,8 @@ void Tensor::pow(Tensor *A, Tensor *B, float exp){ fpga_pow(A, B, exp); } #endif + + PROFILING_FOOTER(pow); } @@ -1779,6 +1820,9 @@ Tensor* Tensor::powb(float base){ void Tensor::powb(Tensor *A, Tensor *B, float base){ + + PROFILING_HEADER_EXTERN(powb); + if (A->isCPU() && B->isCPU()) { cpu_powb(A, B, base); } @@ -1794,6 +1838,8 @@ void Tensor::powb(Tensor *A, Tensor *B, float base){ fpga_powb(A, B, base); } #endif + + PROFILING_FOOTER(powb); } @@ -1827,6 +1873,9 @@ Tensor* Tensor::remainder(float v){ void Tensor::remainder(Tensor *A, Tensor *B, float v){ + + PROFILING_HEADER_EXTERN(remainder); + if (A->isCPU() && B->isCPU()) { cpu_remainder(A, B, v); } @@ -1842,6 +1891,9 @@ void Tensor::remainder(Tensor *A, Tensor *B, float v){ fpga_remainder(A, B, v); } #endif + + + PROFILING_FOOTER(remainder); } @@ -1858,6 +1910,9 @@ Tensor* Tensor::round(){ void Tensor::round(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(round); + if (A->isCPU() && B->isCPU()) { cpu_round(A, B); } @@ -1873,6 +1928,8 @@ void Tensor::round(Tensor *A, Tensor *B){ fpga_round(A, B); } #endif + + PROFILING_FOOTER(round); } @@ -1889,6 +1946,9 @@ Tensor* Tensor::rsqrt(){ void Tensor::rsqrt(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(rsqrt); + if (A->isCPU() && B->isCPU()) { cpu_rsqrt(A, B); } @@ -1904,6 +1964,8 @@ void Tensor::rsqrt(Tensor *A, Tensor *B){ fpga_rsqrt(A, B); } #endif + + PROFILING_FOOTER(rsqrt); } @@ -1920,6 +1982,9 @@ Tensor* Tensor::sigmoid(){ void Tensor::sigmoid(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(sigmoid); + if (A->isCPU() && B->isCPU()) { cpu_sigmoid(A, B); } @@ -1935,6 +2000,8 @@ void Tensor::sigmoid(Tensor *A, Tensor *B){ fpga_sigmoid(A, B); } #endif + + PROFILING_FOOTER(sigmoid); } @@ -1951,6 +2018,9 @@ 
Tensor* Tensor::sign(float zero_sign){ void Tensor::sign(Tensor *A, Tensor *B, float zero_sign) { + + PROFILING_HEADER_EXTERN(sign); + if (A->isCPU() && B->isCPU()) { cpu_sign(A, B, zero_sign); } @@ -1966,6 +2036,8 @@ void Tensor::sign(Tensor *A, Tensor *B, float zero_sign) { fpga_sign(A, B, zero_sign); } #endif + + PROFILING_FOOTER(sign); } @@ -1982,6 +2054,9 @@ Tensor* Tensor::sin(){ void Tensor::sin(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(sin); + if (A->isCPU() && B->isCPU()) { cpu_sin(A, B); } @@ -1997,6 +2072,8 @@ void Tensor::sin(Tensor *A, Tensor *B){ fpga_sin(A, B); } #endif + + PROFILING_FOOTER(sin); } @@ -2013,6 +2090,9 @@ Tensor* Tensor::sinh(){ void Tensor::sinh(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(sinh); + if (A->isCPU() && B->isCPU()) { cpu_sinh(A, B); } @@ -2028,6 +2108,8 @@ void Tensor::sinh(Tensor *A, Tensor *B){ fpga_sinh(A, B); } #endif + + PROFILING_FOOTER(sinh); } @@ -2044,6 +2126,9 @@ Tensor* Tensor::sqr(){ void Tensor::sqr(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(sqr); + if (A->isCPU() && B->isCPU()) { cpu_sqr(A, B); } @@ -2059,6 +2144,8 @@ void Tensor::sqr(Tensor *A, Tensor *B){ fpga_sqr(A, B); } #endif + + PROFILING_FOOTER(sqr); } @@ -2075,6 +2162,9 @@ Tensor* Tensor::sqrt(){ void Tensor::sqrt(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(sqrt); + if (A->isCPU() && B->isCPU()) { cpu_sqrt(A, B); } @@ -2090,6 +2180,8 @@ void Tensor::sqrt(Tensor *A, Tensor *B){ fpga_sqrt(A, B); } #endif + + PROFILING_FOOTER(sqrt); } @@ -2135,6 +2227,9 @@ Tensor* Tensor::tan(){ void Tensor::tan(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(tan); + if (A->isCPU() && B->isCPU()) { cpu_tan(A, B); } @@ -2150,6 +2245,8 @@ void Tensor::tan(Tensor *A, Tensor *B){ fpga_tan(A, B); } #endif + + PROFILING_FOOTER(tan); } @@ -2166,6 +2263,9 @@ Tensor* Tensor::tanh(){ void Tensor::tanh(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(tanh); + if (A->isCPU() && B->isCPU()) { cpu_tanh(A, B); } @@ -2181,6 +2281,8 @@ void Tensor::tanh(Tensor *A, Tensor *B){ fpga_tanh(A, B); } #endif + + PROFILING_FOOTER(tanh); } @@ -2197,6 +2299,9 @@ Tensor* Tensor::trunc(){ void Tensor::trunc(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(trunc); + if (A->isCPU() && B->isCPU()) { cpu_trunc(A, B); } @@ -2212,6 +2317,8 @@ void Tensor::trunc(Tensor *A, Tensor *B){ fpga_trunc(A, B); } #endif + + PROFILING_FOOTER(trunc); } @@ -2405,6 +2512,7 @@ void Tensor::add(float scA, Tensor *A, float scB, Tensor *B, Tensor *C, int incC /////////////////////////////////////// int aux = 0; + PROFILING_HEADER_EXTERN(add); if ((A->device != B->device) || (A->device != C->device)) msg("Tensors in different devices", "Tensor::add_"); if ((!sameShape(A, B)) || (!sameShape(A, C))) { @@ -2432,12 +2540,16 @@ void Tensor::add(float scA, Tensor *A, float scB, Tensor *B, Tensor *C, int incC #endif C->tsem->unlock(); + + PROFILING_FOOTER(add); } void Tensor::inc(Tensor *A, Tensor *B) { // TODO: Review against add + PROFILING_HEADER_EXTERN(inc); + if (!Tensor::sameShape(A, B)) msg("Tensors with different shape", "Tensor::inc"); @@ -2467,10 +2579,9 @@ void Tensor::inc(Tensor *A, Tensor *B) { Tensor::add(1,n,1,B,B,0); delete n; } -} - - + PROFILING_FOOTER(inc); +} void Tensor::el_div(Tensor *A, Tensor *B, Tensor *C, int incC) { /////////////////////////////////////// @@ -2482,6 +2593,8 @@ void Tensor::el_div(Tensor *A, Tensor *B, Tensor *C, int incC) { if ((A->device != B->device) || (A->device != C->device)) msg("Tensors in different devices", "Tensor::el_div"); if ((!sameShape(A, B)) || 
(!sameShape(A, C))) msg("Incompatible dims", "Tensor::el_div"); + PROFILING_HEADER_EXTERN(el_div); + C->tsem->lock(); if (A->isCPU()) { cpu_el_div(A, B, C, incC); @@ -2499,6 +2612,8 @@ void Tensor::el_div(Tensor *A, Tensor *B, Tensor *C, int incC) { } #endif C->tsem->unlock(); + + PROFILING_FOOTER(el_div); } @@ -2561,6 +2676,9 @@ void Tensor::el_mult(Tensor *A, Tensor *B, Tensor *C, int incC) { //// incC 1 means C+=A.*B (increment over C) //// Dimensions must be compatible /////////////////////////////////////// + + PROFILING_HEADER_EXTERN(el_mult); + C->tsem->lock(); if ((A->device != B->device) || (A->device != C->device)) msg("Tensors in different devices", "Tensor::el_mult"); if ((!sameShape(A, B)) || (!sameShape(A, C))) { @@ -2586,6 +2704,8 @@ void Tensor::el_mult(Tensor *A, Tensor *B, Tensor *C, int incC) { } #endif C->tsem->unlock(); + + PROFILING_FOOTER(el_mult); } @@ -2601,7 +2721,7 @@ void Tensor::sum2D_rowwise(Tensor *A, Tensor *B, Tensor *C) { if ((A->ndim != 2) || (B->ndim != 1) || (C->ndim != 2)) msg("sum2D_rowwise dims"); if ((!sameShape(A, C)) || (A->shape[1] != B->shape[0])) msg("Incompatible dims", "Tensor::sum2D_rowwise"); - PROFILING_HEADER(sum2D_rowwise); + PROFILING_HEADER_EXTERN(sum2D_rowwise); C->tsem->lock(); if (A->isCPU()) { @@ -2637,6 +2757,8 @@ void Tensor::reduce_sum2D(Tensor *A, Tensor *B, int axis, int incB) { if ((A->ndim - 1) != B->ndim) msg("Incorrect dims", "Tensor::reduce_sum2D"); if ((A->shape[1 - axis] != B->shape[0])) msg("Incompatible dims", "Tensor::reduce_sum2D"); + PROFILING_HEADER_EXTERN(reduce_sum2D); + B->tsem->lock(); if (A->isCPU()) { cpu_reduce_sum2D(A, B, axis, incB); @@ -2654,6 +2776,8 @@ void Tensor::reduce_sum2D(Tensor *A, Tensor *B, int axis, int incB) { } #endif B->tsem->unlock(); + + PROFILING_FOOTER(reduce_sum2D); } void Tensor::sum2D_colwise(Tensor *A, Tensor *B, Tensor *C) { @@ -2668,6 +2792,8 @@ void Tensor::sum2D_colwise(Tensor *A, Tensor *B, Tensor *C) { if ((A->ndim != 2) || (B->ndim != 1) || (C->ndim != 2)) msg("sum2D_colwise dims"); if ((!sameShape(A, C)) || (A->shape[0] != B->shape[0])) msg("Incompatible dims", "Tensor::sum2D_colwise"); + PROFILING_HEADER_EXTERN(sum2D_colwise); + C->tsem->lock(); if (A->isCPU()) { cpu_sum2D_colwise(A, B, C); @@ -2685,4 +2811,6 @@ void Tensor::sum2D_colwise(Tensor *A, Tensor *B, Tensor *C) { } #endif C->tsem->unlock(); + + PROFILING_FOOTER(sum2D_colwise); } diff --git a/src/utils.cpp b/src/utils.cpp index c61bac82d..0419e271b 100755 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -31,6 +31,7 @@ #include "eddl/system_info.h" #include "eddl/utils.h" +#include "eddl/profiling.h" #ifdef EDDL_LINUX #include "sys/mman.h" @@ -418,3 +419,137 @@ WrappingMode getWrappingMode(string mode){ return WrappingMode::Constant; } } + +// --------------------------------------------------------------------------------------------- +// Profiling + +// profiling declarations +PROFILING_ENABLE(maximum); +PROFILING_ENABLE(minimum); +PROFILING_ENABLE(max); +PROFILING_ENABLE(argmax); +PROFILING_ENABLE(argmax_d); +PROFILING_ENABLE(min); +PROFILING_ENABLE(argmin); +PROFILING_ENABLE(sum); +PROFILING_ENABLE(sum_abs); +PROFILING_ENABLE(prod); +PROFILING_ENABLE(mean); +PROFILING_ENABLE(median); +PROFILING_ENABLE(std); +PROFILING_ENABLE(var); +PROFILING_ENABLE(mode); +PROFILING_ENABLE(abs); +PROFILING_ENABLE(acos); +PROFILING_ENABLE(add); +PROFILING_ENABLE(asin); +PROFILING_ENABLE(atan); +PROFILING_ENABLE(cell); +PROFILING_ENABLE(clamp); +PROFILING_ENABLE(clampmax); +PROFILING_ENABLE(clampmin); +PROFILING_ENABLE(cos); 
+PROFILING_ENABLE(cosh); +PROFILING_ENABLE(div); +PROFILING_ENABLE(exp); +PROFILING_ENABLE(floor); +PROFILING_ENABLE(inv); +PROFILING_ENABLE(log); +PROFILING_ENABLE(log2); +PROFILING_ENABLE(log10); +PROFILING_ENABLE(logn); +PROFILING_ENABLE(mod); +PROFILING_ENABLE(mult); +PROFILING_ENABLE(neg); +PROFILING_ENABLE(normalize); +PROFILING_ENABLE(pow); +PROFILING_ENABLE(powb); +PROFILING_ENABLE(reciprocal); +PROFILING_ENABLE(remainder); +PROFILING_ENABLE(round); +PROFILING_ENABLE(rsqrt); +PROFILING_ENABLE(sigmoid); +PROFILING_ENABLE(sign); +PROFILING_ENABLE(sin); +PROFILING_ENABLE(sinh); +PROFILING_ENABLE(sqr); +PROFILING_ENABLE(sqrt); +PROFILING_ENABLE(sub); +PROFILING_ENABLE(tan); +PROFILING_ENABLE(tanh); +PROFILING_ENABLE(trunc); +PROFILING_ENABLE(inc); +PROFILING_ENABLE(el_div); +PROFILING_ENABLE(mult2D); +PROFILING_ENABLE(el_mult); +PROFILING_ENABLE(sum2D_rowwise); +PROFILING_ENABLE(reduce_sum2D); +PROFILING_ENABLE(sum2D_colwise); +PROFILING_ENABLE(ceil); + +void __show_profile() { + + // profiling declarations + PROFILING_PRINTF(maximum); + PROFILING_PRINTF(minimum); + PROFILING_PRINTF(max); + PROFILING_PRINTF(argmax); + PROFILING_PRINTF(argmax_d); + PROFILING_PRINTF(min); + PROFILING_PRINTF(argmin); + PROFILING_PRINTF(sum); + PROFILING_PRINTF(sum_abs); + PROFILING_PRINTF(prod); + PROFILING_PRINTF(mean); + PROFILING_PRINTF(median); + PROFILING_PRINTF(std); + PROFILING_PRINTF(var); + PROFILING_PRINTF(mode); + PROFILING_PRINTF(abs); + PROFILING_PRINTF(acos); + PROFILING_PRINTF(add); + PROFILING_PRINTF(asin); + PROFILING_PRINTF(atan); + PROFILING_PRINTF(cell); + PROFILING_PRINTF(clamp); + PROFILING_PRINTF(clampmax); + PROFILING_PRINTF(clampmin); + PROFILING_PRINTF(cos); + PROFILING_PRINTF(cosh); + PROFILING_PRINTF(div); + PROFILING_PRINTF(exp); + PROFILING_PRINTF(floor); + PROFILING_PRINTF(inv); + PROFILING_PRINTF(log); + PROFILING_PRINTF(log2); + PROFILING_PRINTF(log10); + PROFILING_PRINTF(logn); + PROFILING_PRINTF(mod); + PROFILING_PRINTF(mult); + PROFILING_PRINTF(neg); + PROFILING_PRINTF(normalize); + PROFILING_PRINTF(pow); + PROFILING_PRINTF(powb); + PROFILING_PRINTF(reciprocal); + PROFILING_PRINTF(remainder); + PROFILING_PRINTF(round); + PROFILING_PRINTF(rsqrt); + PROFILING_PRINTF(sigmoid); + PROFILING_PRINTF(sign); + PROFILING_PRINTF(sin); + PROFILING_PRINTF(sinh); + PROFILING_PRINTF(sqr); + PROFILING_PRINTF(sqrt); + PROFILING_PRINTF(sub); + PROFILING_PRINTF(tan); + PROFILING_PRINTF(tanh); + PROFILING_PRINTF(trunc); + PROFILING_PRINTF(inc); + PROFILING_PRINTF(el_div); + PROFILING_PRINTF(mult2D); + PROFILING_PRINTF(el_mult); + PROFILING_PRINTF(sum2D_rowwise); + PROFILING_PRINTF(reduce_sum2D); + PROFILING_PRINTF(sum2D_colwise); + PROFILING_PRINTF(ceil); +} \ No newline at end of file From 9cc6d9042877153dd226098cf8e7bce311734ca1 Mon Sep 17 00:00:00 2001 From: Jose Flich Date: Mon, 2 Nov 2020 13:57:24 +0100 Subject: [PATCH 10/15] UPV-GAP: added profiling support for data augmentation operations --- src/tensor/tensor_da.cpp | 57 ++++++++++++++++++++++++++++++++++++++++ src/utils.cpp | 32 ++++++++++++++++++++++ 2 files changed, 89 insertions(+) diff --git a/src/tensor/tensor_da.cpp b/src/tensor/tensor_da.cpp index 410f00811..3d74422d5 100644 --- a/src/tensor/tensor_da.cpp +++ b/src/tensor/tensor_da.cpp @@ -14,6 +14,7 @@ #include "eddl/tensor/tensor.h" #include "eddl/hardware/cpu/cpu_tensor.h" +#include "eddl/profiling.h" #ifdef cGPU #include "eddl/hardware/gpu/gpu_tensor.h" @@ -48,6 +49,8 @@ void Tensor::shift(Tensor *A, Tensor *B, vector shift, WrappingMode mode, f msg("This method 
requires two 4D tensors", "Tensor::shift"); } + PROFILING_HEADER_EXTERN(shift); + if (A->isCPU()) { cpu_shift(A, B, std::move(shift), mode, cval); } @@ -62,6 +65,8 @@ void Tensor::shift(Tensor *A, Tensor *B, vector shift, WrappingMode mode, f fpga_shift(A, B, std::move(shift), mode, cval); } #endif + + PROFILING_FOOTER(shift); } Tensor* Tensor::rotate(float angle, vector offset_center, WrappingMode mode, float cval){ @@ -78,6 +83,8 @@ void Tensor::rotate(Tensor *A, Tensor *B, float angle, vector offset_center msg("This method requires two 4D tensors", "Tensor::rotate"); } + PROFILING_HEADER_EXTERN(rotate); + if (A->isCPU()) { cpu_rotate(A, B, angle, std::move(offset_center), mode, cval); } @@ -92,6 +99,8 @@ void Tensor::rotate(Tensor *A, Tensor *B, float angle, vector offset_center fpga_rotate(A, B, angle, std::move(offset_center), mode, cval); } #endif + + PROFILING_FOOTER(rotate); } Tensor* Tensor::scale(vector new_shape, WrappingMode mode, float cval, bool keep_size) { @@ -120,6 +129,8 @@ void Tensor::scale(Tensor *A, Tensor *B, vector new_shape, WrappingMode mod msg("This method requires two 4D tensors", "Tensor::scale"); } + PROFILING_HEADER_EXTERN(scale); + if (A->isCPU()) { cpu_scale(A, B, std::move(new_shape), mode, cval); } @@ -134,6 +145,8 @@ void Tensor::scale(Tensor *A, Tensor *B, vector new_shape, WrappingMode mod fpga_scale(A, B, std::move(new_shape), mode, cval); } #endif + + PROFILING_FOOTER(scale); } @@ -156,6 +169,8 @@ void Tensor::flip(Tensor *A, Tensor *B, int axis) { msg("This method requires two 4D tensors", "Tensor::flip"); } + PROFILING_HEADER_EXTERN(flip); + if (A->isCPU()) { cpu_flip(A, B, axis); } @@ -170,6 +185,8 @@ void Tensor::flip(Tensor *A, Tensor *B, int axis) { fpga_flip(A, B, axis); } #endif + + PROFILING_FOOTER(flip); } Tensor* Tensor::crop(vector coords_from, vector coords_to, float cval, bool keep_size){ @@ -200,6 +217,8 @@ void Tensor::crop(Tensor *A, Tensor *B, vector coords_from, vector coo msg("This method requires two 4D tensors", "Tensor::crop"); } + PROFILING_HEADER_EXTERN(crop); + if (A->isCPU()) { cpu_crop(A, B, std::move(coords_from), std::move(coords_to), cval, false); } @@ -214,6 +233,8 @@ void Tensor::crop(Tensor *A, Tensor *B, vector coords_from, vector coo fpga_crop(A, B, std::move(coords_from), std::move(coords_to), cval, false); } #endif + + PROFILING_FOOTER(crop); } Tensor* Tensor::crop_scale(vector coords_from, vector coords_to, WrappingMode mode, float cval){ @@ -237,6 +258,8 @@ void Tensor::crop_scale(Tensor *A, Tensor *B, vector coords_from, vectorisCPU()) { cpu_crop_scale(A, B, std::move(coords_from), std::move(coords_to), mode, cval); } @@ -251,6 +274,8 @@ void Tensor::crop_scale(Tensor *A, Tensor *B, vector coords_from, vector coords_from, vector c msg("This method requires two 4D tensors", "Tensor::cutout"); } + PROFILING_HEADER_EXTERN(cutout); + if (A->isCPU()) { cpu_crop(A, B, std::move(coords_from), std::move(coords_to), cval, true); } @@ -291,6 +318,8 @@ void Tensor::cutout(Tensor *A, Tensor *B, vector coords_from, vector c fpga_crop(A, B, std::move(coords_from), std::move(coords_to), cval, true); } #endif + + PROFILING_FOOTER(cutout); } @@ -318,6 +347,8 @@ void Tensor::shift_random(Tensor *A, Tensor *B, vector factor_x, vectorisCPU()) { cpu_shift_random(A, B, std::move(factor_x), std::move(factor_y), mode, cval); } @@ -332,6 +363,8 @@ void Tensor::shift_random(Tensor *A, Tensor *B, vector factor_x, vector factor, vectorisCPU()) { cpu_rotate_random(A, B, std::move(factor), std::move(offset_center), mode, cval); } @@ -363,6 
+398,8 @@ void Tensor::rotate_random(Tensor *A, Tensor *B, vector factor, vector factor, WrappingMode mode, float cval){ @@ -384,6 +421,8 @@ void Tensor::scale_random(Tensor *A, Tensor *B, vector factor, WrappingMo msg("This method requires two 4D tensors", "Tensor::scale_random"); } + PROFILING_HEADER_EXTERN(scale_random); + if (A->isCPU()) { cpu_scale_random(A, B, std::move(factor), mode, cval); } @@ -398,6 +437,8 @@ void Tensor::scale_random(Tensor *A, Tensor *B, vector factor, WrappingMo fpga_scale_random(A, B, std::move(factor), mode, cval); } #endif + + PROFILING_FOOTER(scale_random); } @@ -420,6 +461,8 @@ void Tensor::flip_random(Tensor *A, Tensor *B, int axis) { msg("This method requires two 4D tensors", "Tensor::flip_random"); } + PROFILING_HEADER_EXTERN(flip_random); + if (A->isCPU()) { cpu_flip_random(A, B, axis); } @@ -434,6 +477,8 @@ void Tensor::flip_random(Tensor *A, Tensor *B, int axis) { fpga_flip_random(A, B, axis); } #endif + + PROFILING_FOOTER(flip_random); } Tensor* Tensor::crop_random(int height, int width, float cval, bool keep_size){ @@ -464,6 +509,8 @@ void Tensor::crop_random(Tensor *A, Tensor *B) { msg("This method requires two 4D tensors", "Tensor::crop_random"); } + PROFILING_HEADER_EXTERN(crop_random); + if (A->isCPU()) { cpu_crop_random(A, B); } @@ -478,6 +525,8 @@ void Tensor::crop_random(Tensor *A, Tensor *B) { fpga_crop_random(A, B); } #endif + + PROFILING_FOOTER(crop_random); } Tensor* Tensor::crop_scale_random(vector factor, WrappingMode mode, float cval){ @@ -498,6 +547,8 @@ void Tensor::crop_scale_random(Tensor *A, Tensor *B, vector factor, Wrapp msg("This method requires two 4D tensors", "Tensor::crop_scale_random"); } + PROFILING_HEADER_EXTERN(crop_scale_random); + if (A->isCPU()) { cpu_crop_scale_random(A, B, std::move(factor), mode, cval); } @@ -512,6 +563,8 @@ void Tensor::crop_scale_random(Tensor *A, Tensor *B, vector factor, Wrapp fpga_crop_scale_random(A, B, std::move(factor), mode, cval); } #endif + + PROFILING_FOOTER(crop_scale_random); } Tensor* Tensor::cutout_random(vector factor_x, vector factor_y, float cval){ @@ -536,6 +589,8 @@ void Tensor::cutout_random(Tensor *A, Tensor *B, vector factor_x, vector< msg("This method requires two 4D tensors", "Tensor::cutout_random"); } + PROFILING_HEADER_EXTERN(cutout_random); + if (A->isCPU()) { cpu_cutout_random(A, B, std::move(factor_x), std::move(factor_y), cval); } @@ -550,4 +605,6 @@ void Tensor::cutout_random(Tensor *A, Tensor *B, vector factor_x, vector< fpga_cutout_random(A, B, std::move(factor_x), std::move(factor_y), cval); } #endif + + PROFILING_FOOTER(cutout_random); } diff --git a/src/utils.cpp b/src/utils.cpp index 0419e271b..a661eb31c 100755 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -486,6 +486,22 @@ PROFILING_ENABLE(sum2D_rowwise); PROFILING_ENABLE(reduce_sum2D); PROFILING_ENABLE(sum2D_colwise); PROFILING_ENABLE(ceil); +// da +PROFILING_ENABLE(shift); +PROFILING_ENABLE(rotate); +PROFILING_ENABLE(scale); +PROFILING_ENABLE(flip); +PROFILING_ENABLE(crop); +PROFILING_ENABLE(crop_scale); +PROFILING_ENABLE(cutout); +PROFILING_ENABLE(shift_random); +PROFILING_ENABLE(rotate_random); +PROFILING_ENABLE(scale_random); +PROFILING_ENABLE(flip_random); +PROFILING_ENABLE(crop_random); +PROFILING_ENABLE(crop_scale_random); +PROFILING_ENABLE(cutout_random); + void __show_profile() { @@ -552,4 +568,20 @@ void __show_profile() { PROFILING_PRINTF(reduce_sum2D); PROFILING_PRINTF(sum2D_colwise); PROFILING_PRINTF(ceil); + // da + PROFILING_PRINTF(shift); + PROFILING_PRINTF(rotate); + 
PROFILING_PRINTF(scale); + PROFILING_PRINTF(flip); + PROFILING_PRINTF(crop); + PROFILING_PRINTF(crop_scale); + PROFILING_PRINTF(cutout); + PROFILING_PRINTF(shift_random); + PROFILING_PRINTF(rotate_random); + PROFILING_PRINTF(scale_random); + PROFILING_PRINTF(flip_random); + PROFILING_PRINTF(crop_random); + PROFILING_PRINTF(crop_scale_random); + PROFILING_PRINTF(cutout_random); + } \ No newline at end of file From e4005d888928df9d6d042f47af7481e9898838cc Mon Sep 17 00:00:00 2001 From: jorga20j Date: Tue, 3 Nov 2020 07:48:19 +0000 Subject: [PATCH 11/15] adding relu support to convolution --- .../kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp | 21 ++++++++++------ .../src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp | 25 ++++++++++++------- 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/fpga_kernels/kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp b/fpga_kernels/kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp index 1545e91e4..898b07c53 100644 --- a/fpga_kernels/kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp +++ b/fpga_kernels/kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp @@ -225,17 +225,19 @@ for (int cpi=0; cpi &in, hls::stream &out) { +static void relu(int flag_relu, int H, int W, hls::stream &in, hls::stream &out) { #ifdef DEBUG_VERBOSE printf("relu: start\n"); #endif - - int data_size = W * H * O; + pixel_out_t data; + int data_size = W * H; for (int i=0; i < data_size; i++) { #pragma HLS PIPELINE II=1 - data_type data = in.read(); - if (data < 0) data = 0.f; + data = in.read(); + for(int cpo = 0; cpo &in, hls::str } -void k_cn2D_K3x3_S1x1_P1x1_BS1_ap_2(pixel_in_t *ptr_data, int H, int W, int I, data_type *ptr_kernel, data_type *ptr_bias, pixel_out_t *ptr_out, int O, int offset_bias, int offset_kernel, int offset_data_out) { +void k_cn2D_K3x3_S1x1_P1x1_BS1_ap_2(pixel_in_t *ptr_data, int H, int W, int I, data_type *ptr_kernel, data_type *ptr_bias, pixel_out_t *ptr_out, int O, int offset_bias, int offset_kernel, int offset_data_out, int flag_relu) { #pragma HLS INTERFACE s_axilite port=W bundle=control #pragma HLS INTERFACE s_axilite port=H bundle=control @@ -633,6 +635,7 @@ void k_cn2D_K3x3_S1x1_P1x1_BS1_ap_2(pixel_in_t *ptr_data, int H, int W, int I, d #pragma HLS INTERFACE s_axilite port=offset_bias bundle=control #pragma HLS INTERFACE s_axilite port=offset_kernel bundle=control #pragma HLS INTERFACE s_axilite port=offset_data_out bundle=control + #pragma HLS INTERFACE s_axilite port=flag_relu bundle=control #pragma HLS INTERFACE s_axilite port=return bundle=control // ptr_data struct to be packed as a single element vector (to improve memory read) @@ -647,20 +650,22 @@ void k_cn2D_K3x3_S1x1_P1x1_BS1_ap_2(pixel_in_t *ptr_data, int H, int W, int I, d static hls::stream out_read_kernel; static hls::stream out_read_bias; static hls::stream out_conv; + static hls::stream out_relu; // stream sizes #pragma HLS STREAM variable = out_read_data depth = 32 #pragma HLS STREAM variable = out_read_kernel depth = 32 #pragma HLS STREAM variable = out_read_bias depth = 32 #pragma HLS STREAM variable = out_conv depth = 32 - // #pragma HLS STREAM variable = out_relu depth = 32 + #pragma HLS STREAM variable = out_relu depth = 32 #pragma HLS dataflow read_data(H, W, I_ITER, ptr_data, out_read_data); read_bias(offset_bias, ptr_bias, out_read_bias); read_kernel(I_ITER, offset_kernel, ptr_kernel, out_read_kernel); conv(H, W, I_ITER, out_read_data, out_read_kernel, out_read_bias, out_conv); - write_output(H, W, offset_data_out, ptr_out, out_conv); + relu(flag_relu, H, W, out_conv, out_relu); + write_output(H, W, offset_data_out, ptr_out, 
out_relu); } diff --git a/fpga_kernels/test_fpga/src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp b/fpga_kernels/test_fpga/src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp index fc1fae149..9f700f012 100644 --- a/fpga_kernels/test_fpga/src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp +++ b/fpga_kernels/test_fpga/src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp @@ -70,6 +70,8 @@ std::string binaryFile; #define KW 3 #define KH 3 +#define RELU 1 // 0/1 function relu is activated or not + int W; int H; int GI; @@ -99,7 +101,7 @@ void parse_arguments(int argc, char **argv) { exit(1); } - binaryFile = argv[1]; + binaryFile = argv[1]; W = atoi(argv[2]); H = atoi(argv[3]); I = atoi(argv[4]); @@ -174,14 +176,16 @@ void cpu_conv2d() { } // aplicamos relu -/* for (int cout=0; cout Date: Tue, 3 Nov 2020 13:38:11 +0100 Subject: [PATCH 12/15] UPV-GAP additions: - Profiling reduction functions --- src/utils.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/utils.cpp b/src/utils.cpp index a661eb31c..4393ec4f2 100755 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -501,7 +501,11 @@ PROFILING_ENABLE(flip_random); PROFILING_ENABLE(crop_random); PROFILING_ENABLE(crop_scale_random); PROFILING_ENABLE(cutout_random); - +// reduction +PROFILING_ENABLE(reduce); +PROFILING_ENABLE(reduce_op); +PROFILING_ENABLE(reduction); +PROFILING_ENABLE(reduction_back); void __show_profile() { @@ -583,5 +587,10 @@ void __show_profile() { PROFILING_PRINTF(crop_random); PROFILING_PRINTF(crop_scale_random); PROFILING_PRINTF(cutout_random); + //reduction + PROFILING_PRINTF(reduce); + PROFILING_PRINTF(reduce_op); + PROFILING_PRINTF(reduction); + PROFILING_PRINTF(reduction_back); } \ No newline at end of file From 4449f36dbc4c24e7d51cebd26234a95b9ac605a7 Mon Sep 17 00:00:00 2001 From: Jose Flich Date: Tue, 3 Nov 2020 16:45:22 +0100 Subject: [PATCH 13/15] UPV-GAP: Added profiling of activation functions, convs, reduction functions --- include/eddl/profiling.h | 12 ++- src/tensor/nn/tensor_activations.cpp | 123 ++++++++++++++++++++++++++- src/tensor/nn/tensor_conv.cpp | 19 +++-- src/tensor/tensor_reduction.cpp | 27 +++++- src/utils.cpp | 61 +++++++++++++ 5 files changed, 229 insertions(+), 13 deletions(-) diff --git a/include/eddl/profiling.h b/include/eddl/profiling.h index c02848225..43ce4b9c5 100644 --- a/include/eddl/profiling.h +++ b/include/eddl/profiling.h @@ -4,9 +4,18 @@ #include +#define PROFILING_HEADER(fn) \ + struct timeval prof_t1; \ + gettimeofday(&prof_t1, NULL); + + #define PROFILING_ENABLE(fn) \ unsigned long long prof_##fn##_time; \ - unsigned long long prof_##fn##_calls; \ + unsigned long long prof_##fn##_calls; + +#define PROFILING_ENABLE_EXTERN(fn) \ + extern unsigned long long prof_##fn##_time; \ + extern unsigned long long prof_##fn##_calls; \ #define PROFILING_HEADER(fn) \ struct timeval prof_t1; \ @@ -15,7 +24,6 @@ #define PROFILING_HEADER_EXTERN(fn) \ extern unsigned long long prof_##fn##_time; \ extern unsigned long long prof_##fn##_calls; \ - extern int prof_##fn##_device; \ struct timeval prof_t1; \ gettimeofday(&prof_t1, NULL); diff --git a/src/tensor/nn/tensor_activations.cpp b/src/tensor/nn/tensor_activations.cpp index e3cb2c9ef..92755f99f 100644 --- a/src/tensor/nn/tensor_activations.cpp +++ b/src/tensor/nn/tensor_activations.cpp @@ -20,17 +20,39 @@ #include "eddl/hardware/gpu/nn/gpu_tensor_nn.h" #endif -namespace tensorNN { - - PROFILING_ENABLE(ReLu); +PROFILING_ENABLE_EXTERN(ReLu); +PROFILING_ENABLE_EXTERN(D_ReLu); +PROFILING_ENABLE_EXTERN(ThresholdedReLu); +PROFILING_ENABLE_EXTERN(LeakyReLu); 
+PROFILING_ENABLE_EXTERN(D_ThresholdedReLu); +PROFILING_ENABLE_EXTERN(D_LeakyReLu); +PROFILING_ENABLE_EXTERN(ELu); +PROFILING_ENABLE_EXTERN(D_ELu); +PROFILING_ENABLE_EXTERN(Sigmoid); +PROFILING_ENABLE_EXTERN(D_Sigmoid); +PROFILING_ENABLE_EXTERN(HardSigmoid); +PROFILING_ENABLE_EXTERN(D_HardSigmoid); +PROFILING_ENABLE_EXTERN(Tanh); +PROFILING_ENABLE_EXTERN(D_Tanh); +PROFILING_ENABLE_EXTERN(Softmax); +PROFILING_ENABLE_EXTERN(D_Softmax); +PROFILING_ENABLE_EXTERN(Exp); +PROFILING_ENABLE_EXTERN(D_Exp); +PROFILING_ENABLE_EXTERN(Linear); +PROFILING_ENABLE_EXTERN(D_Linear); +PROFILING_ENABLE_EXTERN(Softsign); +PROFILING_ENABLE_EXTERN(D_softsign); +PROFILING_ENABLE_EXTERN(Softplus); +PROFILING_ENABLE_EXTERN(D_softplus); +namespace tensorNN { // ReLU void ReLu(Tensor *A, Tensor *B) { if (A->device != B->device) msg("Tensors in different devices", "Tensor::ReLu"); if (!Tensor::sameShape(A, B)) msg("Incompatible dims", "Tensor::ReLu"); - PROFILING_HEADER_EXTERN(ReLu); + PROFILING_HEADER(ReLu); B->tsem->lock(); if (A->isCPU()) { @@ -60,6 +82,8 @@ namespace tensorNN { } if ((!Tensor::sameShape(D, I)) || (!Tensor::sameShape(D, PD))) msg("Incompatible dims", "Tensor::D_ReLu"); + PROFILING_HEADER(D_ReLu); + PD->tsem->lock(); if (D->isCPU()) { cpu_d_relu(D, I, PD); @@ -77,6 +101,8 @@ namespace tensorNN { } #endif PD->tsem->unlock(); + + PROFILING_FOOTER(D_ReLu); } // ThresholdedReLu @@ -84,6 +110,8 @@ namespace tensorNN { if (A->device != B->device) msg("Tensors in different devices", "Tensor::ThresholdedReLu"); if (!Tensor::sameShape(A, B)) msg("Incompatible dims", "Tensor::ThresholdedReLu"); + PROFILING_HEADER(ThresholdedReLu); + B->tsem->lock(); if (A->isCPU()) { cpu_thresholded_relu(A, B, param); @@ -101,6 +129,8 @@ namespace tensorNN { #endif B->tsem->unlock(); + + PROFILING_FOOTER(ThresholdedReLu); } // ThresholdedReLu Derivative @@ -109,6 +139,8 @@ namespace tensorNN { msg("Tensors in different devices", "Tensor::D_ThresholdedReLu"); if ((!Tensor::sameShape(D, I)) || (!Tensor::sameShape(D, PD))) msg("Incompatible dims", "Tensor::D_ThresholdedReLu"); + PROFILING_HEADER(D_ThresholdedReLu); + PD->tsem->lock(); if (D->isCPU()) { cpu_d_thresholded_relu(D, I, PD, param); @@ -126,6 +158,8 @@ namespace tensorNN { } #endif PD->tsem->unlock(); + + PROFILING_FOOTER(D_ThresholdedReLu); } // LeakyReLU @@ -133,6 +167,8 @@ namespace tensorNN { if (A->device != B->device) msg("Tensors in different devices", "Tensor::LeakyReLu"); if (!Tensor::sameShape(A, B)) msg("Incompatible dims", "Tensor::LeakyReLu"); + PROFILING_HEADER(LeakyReLu); + B->tsem->lock(); if (A->isCPU()) { cpu_leaky_relu(A, B, param); @@ -150,6 +186,8 @@ namespace tensorNN { #endif B->tsem->unlock(); + + PROFILING_FOOTER(LeakyReLu); } // RELU Derivative, always increment over parent delta @@ -158,6 +196,8 @@ namespace tensorNN { msg("Tensors in different devices", "Tensor::D_ReLu"); if ((!Tensor::sameShape(D, I)) || (!Tensor::sameShape(D, PD))) msg("Incompatible dims", "Tensor::D_ReLu"); + PROFILING_HEADER(D_LeakyReLu); + PD->tsem->lock(); if (D->isCPU()) { cpu_d_leaky_relu(D, I, PD, param); @@ -175,6 +215,8 @@ namespace tensorNN { } #endif PD->tsem->unlock(); + + PROFILING_FOOTER(D_LeakyReLu); } @@ -183,6 +225,8 @@ namespace tensorNN { if (A->device != B->device) msg("Tensors in different devices", "Tensor::ELu"); if (!Tensor::sameShape(A, B)) msg("Incompatible dims", "Tensor::ELu"); + PROFILING_HEADER(ELu); + B->tsem->lock(); if (A->isCPU()) { cpu_elu(A, B, param); @@ -200,6 +244,8 @@ namespace tensorNN { #endif B->tsem->unlock(); + + 
PROFILING_FOOTER(ELu); } // ELU Derivative @@ -207,6 +253,8 @@ namespace tensorNN { if ((D->device != I->device) || (D->device != PD->device)) msg("Tensors in different devices", "Tensor::D_ELu"); if ((!Tensor::sameShape(D, I)) || (!Tensor::sameShape(D, PD))) msg("Incompatible dims", "Tensor::D_ELu"); + PROFILING_HEADER(D_ELu); + PD->tsem->lock(); if (D->isCPU()) { cpu_d_elu(D, I, PD, param); @@ -224,6 +272,8 @@ namespace tensorNN { } #endif PD->tsem->unlock(); + + PROFILING_FOOTER(D_ELu); } @@ -232,6 +282,8 @@ namespace tensorNN { if (A->device != B->device) msg("Tensors in different devices", "Tensor::Softplus"); if (!Tensor::sameShape(A, B)) msg("Incompatible dims", "Tensor::Softplus"); + PROFILING_HEADER(Softplus); + B->tsem->lock(); if (A->isCPU()) { cpu_softplus(A, B); @@ -249,6 +301,8 @@ namespace tensorNN { #endif B->tsem->unlock(); + + PROFILING_FOOTER(Softplus); } // Softplus Derivative @@ -257,6 +311,8 @@ namespace tensorNN { msg("Tensors in different devices", "Tensor::D_softplus"); if ((!Tensor::sameShape(D, I)) || (!Tensor::sameShape(D, PD))) msg("Incompatible dims", "Tensor::D_softplus"); + PROFILING_HEADER(D_softplus); + PD->tsem->lock(); if (D->isCPU()) { cpu_d_softplus(D, I, PD); @@ -274,6 +330,8 @@ namespace tensorNN { } #endif PD->tsem->unlock(); + + PROFILING_FOOTER(D_softplus); } @@ -282,6 +340,8 @@ namespace tensorNN { if (A->device != B->device) msg("Tensors in different devices", "Tensor::Softsign"); if (!Tensor::sameShape(A, B)) msg("Incompatible dims", "Tensor::Softsign"); + PROFILING_HEADER(Softsign); + B->tsem->lock(); if (A->isCPU()) { cpu_softsign(A, B); @@ -299,6 +359,8 @@ namespace tensorNN { #endif B->tsem->unlock(); + + PROFILING_FOOTER(Softsign); } // Softsign Derivative @@ -307,6 +369,8 @@ namespace tensorNN { msg("Tensors in different devices", "Tensor::D_softsign"); if ((!Tensor::sameShape(D, I)) || (!Tensor::sameShape(D, PD))) msg("Incompatible dims", "Tensor::D_softsign"); + PROFILING_HEADER(D_softsign); + PD->tsem->lock(); if (D->isCPU()) { cpu_d_softsign(D, I, PD); @@ -324,6 +388,8 @@ namespace tensorNN { } #endif PD->tsem->unlock(); + + PROFILING_FOOTER(D_softsign); } // Linear @@ -331,6 +397,8 @@ namespace tensorNN { if (A->device != B->device) msg("Tensors in different devices", "Tensor::Linear"); if (!Tensor::sameShape(A, B)) msg("Incompatible dims", "Tensor::Linear"); + PROFILING_HEADER(Linear); + B->tsem->lock(); if (A->isCPU()) { cpu_linear(A, B, param); @@ -348,6 +416,8 @@ namespace tensorNN { #endif B->tsem->unlock(); + + PROFILING_FOOTER(Linear); } // Linear Derivative @@ -356,6 +426,8 @@ namespace tensorNN { msg("Tensors in different devices", "Tensor::D_Linear"); if ((!Tensor::sameShape(D, I)) || (!Tensor::sameShape(D, PD))) msg("Incompatible dims", "Tensor::D_Linear"); + PROFILING_HEADER(D_Linear); + PD->tsem->lock(); if (D->isCPU()) { cpu_d_linear(D, I, PD, param); @@ -373,6 +445,9 @@ namespace tensorNN { } #endif PD->tsem->unlock(); + + PROFILING_FOOTER(D_Linear); + } // Sigmoid @@ -380,6 +455,8 @@ namespace tensorNN { if (A->device != B->device) msg("Tensors in different devices", "Tensor::Sigmoid"); if (!Tensor::sameShape(A, B)) msg("Incompatible dims", "Tensor::Sigmoid"); + PROFILING_HEADER(Sigmoid); + B->tsem->lock(); if (A->isCPU()) { cpu_sigmoid(A, B); @@ -397,6 +474,8 @@ namespace tensorNN { #endif B->tsem->unlock(); + + PROFILING_FOOTER(Sigmoid); } // Sigmoid Derivative, always increment over parent delta @@ -405,6 +484,8 @@ namespace tensorNN { msg("Tensors in different devices", "Tensor::D_Sigmoid"); if 
((!Tensor::sameShape(D, I)) || (!Tensor::sameShape(D, PD))) msg("Incompatible dims", "Tensor::D_Sigmoid"); + PROFILING_HEADER(D_Sigmoid); + PD->tsem->lock(); if (D->isCPU()) { cpu_d_sigmoid(D, I, PD); @@ -422,6 +503,8 @@ namespace tensorNN { } #endif PD->tsem->unlock(); + + PROFILING_FOOTER(D_Sigmoid); } // Hard Sigmoid @@ -429,6 +512,8 @@ namespace tensorNN { if (A->device != B->device) msg("Tensors in different devices", "Tensor::HardSigmoid"); if (!Tensor::sameShape(A, B)) msg("Incompatible dims", "Tensor::HardSigmoid"); + PROFILING_HEADER(HardSigmoid); + B->tsem->lock(); if (A->isCPU()) { cpu_hard_sigmoid(A, B); @@ -446,6 +531,8 @@ namespace tensorNN { #endif B->tsem->unlock(); + + PROFILING_FOOTER(HardSigmoid); } // Hard Sigmoid Derivative @@ -454,6 +541,8 @@ namespace tensorNN { msg("Tensors in different devices", "Tensor::D_HardSigmoid"); if ((!Tensor::sameShape(D, I)) || (!Tensor::sameShape(D, PD))) msg("Incompatible dims", "Tensor::D_HardSigmoid"); + PROFILING_HEADER(D_HardSigmoid); + PD->tsem->lock(); if (D->isCPU()) { cpu_d_hard_sigmoid(D, I, PD); @@ -471,6 +560,8 @@ namespace tensorNN { } #endif PD->tsem->unlock(); + + PROFILING_FOOTER(D_HardSigmoid); } // Exponential @@ -478,6 +569,8 @@ namespace tensorNN { if (A->device != B->device) msg("Tensors in different devices", "Tensor::Exp"); if (!Tensor::sameShape(A, B)) msg("Incompatible dims", "Tensor::Exp"); + PROFILING_HEADER(Exp); + B->tsem->lock(); if (A->isCPU()) { cpu_exp(A, B); @@ -496,6 +589,8 @@ namespace tensorNN { #endif B->tsem->unlock(); + + PROFILING_FOOTER(Exp); } // Exponential Derivative @@ -503,6 +598,8 @@ namespace tensorNN { if ((D->device != I->device) || (D->device != PD->device)) msg("Tensors in different devices", "Tensor::D_Exp"); if ((!Tensor::sameShape(D, I)) || (!Tensor::sameShape(D, PD))) msg("Incompatible dims", "Tensor::D_Exp"); + PROFILING_HEADER(D_Exp); + PD->tsem->lock(); if (D->isCPU()) { cpu_d_exp(D, I, PD); @@ -519,6 +616,8 @@ namespace tensorNN { } #endif PD->tsem->unlock(); + + PROFILING_FOOTER(D_Exp); } // Tanh @@ -526,6 +625,8 @@ namespace tensorNN { if (A->device != B->device) msg("Tensors in different devices", "Tensor::Tanh"); if (!Tensor::sameShape(A, B)) msg("Incompatible dims", "Tensor::Tanh"); + PROFILING_HEADER(Tanh); + B->tsem->lock(); if (A->isCPU()) { cpu_tanh(A, B); @@ -543,6 +644,8 @@ namespace tensorNN { #endif B->tsem->unlock(); + + PROFILING_FOOTER(Tanh); } // Tanh Derivative @@ -551,6 +654,8 @@ namespace tensorNN { msg("Tensors in different devices", "Tensor::D_Tanh"); if ((!Tensor::sameShape(D, I)) || (!Tensor::sameShape(D, PD))) msg("Incompatible dims", "Tensor::D_Tanh"); + PROFILING_HEADER(D_Tanh); + PD->tsem->lock(); if (D->isCPU()) { cpu_d_tanh(D, I, PD); @@ -568,6 +673,8 @@ namespace tensorNN { } #endif PD->tsem->unlock(); + + PROFILING_FOOTER(D_Tanh); } @@ -577,6 +684,8 @@ namespace tensorNN { if (!Tensor::sameShape(A, B)) msg("Incompatible dims", "Tensor::Softmax"); if (A->ndim != 2) msg("Softmax only over 2D Tensor (batch x logits)", "Tensor::Softmax"); + PROFILING_HEADER(Softmax); + B->tsem->lock(); if (A->isCPU()) { @@ -595,6 +704,8 @@ namespace tensorNN { #endif B->tsem->unlock(); + + PROFILING_FOOTER(Softmax); } // SOFTMAX DERIVATIVE @@ -604,6 +715,8 @@ namespace tensorNN { if ((!Tensor::sameShape(D, I)) || (!Tensor::sameShape(D, PD))) msg("Incompatible dims", "Tensor::D_Softmax"); if (D->ndim != 2) msg("D_Softmax only over 2D Tensor (batch x delta_probs)", "Tensor::D_Softmax"); + PROFILING_HEADER(D_Softmax); + if (D->isCPU()) { cpu_d_softmax(D, I, PD); } @@ 
-627,6 +740,8 @@ namespace tensorNN { } #endif + PROFILING_FOOTER(D_Softmax); + } } diff --git a/src/tensor/nn/tensor_conv.cpp b/src/tensor/nn/tensor_conv.cpp index ec876fc92..896e2257b 100644 --- a/src/tensor/nn/tensor_conv.cpp +++ b/src/tensor/nn/tensor_conv.cpp @@ -21,11 +21,11 @@ #include "eddl/hardware/fpga/nn/fpga_nn.h" #endif -namespace tensorNN{ - - PROFILING_ENABLE(Conv2D); - +PROFILING_ENABLE_EXTERN(Conv2D); +PROFILING_ENABLE_EXTERN(Conv2D_grad); +PROFILING_ENABLE_EXTERN(Conv2D_back); +namespace tensorNN{ void Conv2D(ConvolDescriptor *D) { ///////////////////////////////////////////////////////////////////// @@ -36,7 +36,7 @@ void Conv2D(ConvolDescriptor *D) { ///////////////////////////////////////////////////////////////////// if ((D->I->ndim != 4)) msg("Tensors are not 4D", "Tensor::Conv2D"); - PROFILING_HEADER_EXTERN(Conv2D); + PROFILING_HEADER(Conv2D); D->O->tsem->lock(); if (D->I->isCPU()) { @@ -57,7 +57,6 @@ void Conv2D(ConvolDescriptor *D) { D->O->tsem->unlock(); PROFILING_FOOTER(Conv2D); - PROFILING_PRINTF(Conv2D); } void Conv2D_grad(ConvolDescriptor *D) { @@ -69,6 +68,8 @@ void Conv2D_grad(ConvolDescriptor *D) { ///////////////////////////////////////////////////////////////////// if ((D->I->ndim != 4)) msg("Tensors are not 4D", "Tensor::Conv2D"); + PROFILING_HEADER(Conv2D_grad); + D->gK->tsem->lock(); if (D->I->isCPU()) { cpu_conv2D_grad(D); @@ -85,6 +86,8 @@ void Conv2D_grad(ConvolDescriptor *D) { } #endif D->gK->tsem->unlock(); + + PROFILING_FOOTER(Conv2D_grad); } void Conv2D_back(ConvolDescriptor *D) { @@ -96,6 +99,8 @@ void Conv2D_back(ConvolDescriptor *D) { ///////////////////////////////////////////////////////////////////// if ((D->I->ndim != 4)) msg("Tensors are not 4D", "Tensor::Conv2D"); + PROFILING_HEADER(Conv2D_back); + D->ID->tsem->lock(); if (D->I->isCPU()) { cpu_conv2D_back(D); @@ -112,6 +117,8 @@ void Conv2D_back(ConvolDescriptor *D) { } #endif D->ID->tsem->unlock(); + + PROFILING_FOOTER(Conv2D_back); } } diff --git a/src/tensor/tensor_reduction.cpp b/src/tensor/tensor_reduction.cpp index c5746c884..250b1865a 100644 --- a/src/tensor/tensor_reduction.cpp +++ b/src/tensor/tensor_reduction.cpp @@ -11,6 +11,7 @@ #include "eddl/tensor/tensor.h" #include "eddl/tensor/tensor_reduction.h" #include "eddl/hardware/cpu/cpu_tensor.h" +#include "eddl/profiling.h" #ifdef cGPU @@ -102,6 +103,8 @@ void reduce(Tensor *A, Tensor *B,string mode,vector axis,int* map) } } + PROFILING_HEADER_EXTERN(reduce); + if (map==nullptr) map=get_reduction_map(A,axis); @@ -118,6 +121,8 @@ void reduce(Tensor *A, Tensor *B,string mode,vector axis,int* map) fpga_reduce(A,B,mode,map); } #endif + + PROFILING_FOOTER(reduce); } void reduce_mean(Tensor *A, Tensor *B,vector axis,int* map) @@ -140,6 +145,9 @@ void reduce_min(Tensor *A, Tensor *B,vector axis,int* map) void reduce(Tensor *A, Tensor *B,string mode,MapReduceDescriptor *MD) { + + PROFILING_HEADER_EXTERN(reduce); + if (A->isCPU()) { cpu_reduce(A,B,mode,MD); } @@ -153,6 +161,8 @@ void reduce(Tensor *A, Tensor *B,string mode,MapReduceDescriptor *MD) fpga_reduce(A,B,mode,MD); } #endif + + PROFILING_FOOTER(reduce); } @@ -179,7 +189,7 @@ void reduce_op(Tensor *A, Tensor *B,string op,vector axis,int* map) { int i,j; - + PROFILING_HEADER_EXTERN(reduce_op); if (B->ndim!=A->ndim-axis.size()) msg("dims don't match in reduction","reduce"); @@ -209,6 +219,8 @@ void reduce_op(Tensor *A, Tensor *B,string op,vector axis,int* map) fpga_reduce_op(A,B,op,map); } #endif + + PROFILING_FOOTER(reduce_op); } void reduce_sum(Tensor *A, Tensor *B,vector 
axis,int* map) @@ -230,6 +242,9 @@ void reduce_div(Tensor *A, Tensor *B,vector axis,int* map) void reduce_op(Tensor *A, Tensor *B,string op, MapReduceDescriptor *MD) { + + PROFILING_HEADER_EXTERN(reduce_op); + if (A->isCPU()) { cpu_reduce_op(A,B,op,MD); } @@ -244,7 +259,9 @@ void reduce_div(Tensor *A, Tensor *B,vector axis,int* map) } #endif + PROFILING_FOOTER(reduce_op); } + void reduce_sum(Tensor *A, Tensor *B,MapReduceDescriptor *MD) { reduce_op(A,B,"sum",MD); @@ -264,6 +281,8 @@ void reduce_div(Tensor *A, Tensor *B,vector axis,int* map) //////////// void reduction(ReduceDescriptor *RD){ + PROFILING_HEADER_EXTERN(reduction); + if (RD->I->isCPU()) { cpu_reduction(RD); } @@ -278,12 +297,16 @@ void reduction(ReduceDescriptor *RD){ fpga_reduction(RD); } #endif + + PROFILING_FOOTER(reduction); } void reduction_back(ReduceDescriptor *RD) { + PROFILING_HEADER_EXTERN(reduction_back); + if (RD->I->isCPU()) { cpu_reduction_back(RD); } @@ -298,4 +321,6 @@ void reduction_back(ReduceDescriptor *RD) fpga_reduction_back(RD); } #endif + + PROFILING_FOOTER(reduction_back); } diff --git a/src/utils.cpp b/src/utils.cpp index 4393ec4f2..b75ba8a90 100755 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -506,6 +506,37 @@ PROFILING_ENABLE(reduce); PROFILING_ENABLE(reduce_op); PROFILING_ENABLE(reduction); PROFILING_ENABLE(reduction_back); +// activations +PROFILING_ENABLE(ELu); +PROFILING_ENABLE(Exp); +PROFILING_ENABLE(ReLu); +PROFILING_ENABLE(Tanh); +PROFILING_ENABLE(D_ELu); +PROFILING_ENABLE(D_Exp); +PROFILING_ENABLE(D_Tanh); +PROFILING_ENABLE(D_ThresholdedReLu); +PROFILING_ENABLE(D_HardSigmoid); +PROFILING_ENABLE(D_LeakyRelu); +PROFILING_ENABLE(D_Linear); +PROFILING_ENABLE(D_ReLu); +PROFILING_ENABLE(D_LeakyReLu); +PROFILING_ENABLE(D_Sigmoid); +PROFILING_ENABLE(D_Softmax); +PROFILING_ENABLE(D_softplus); +PROFILING_ENABLE(HardSigmoid); +PROFILING_ENABLE(D_softsign); +PROFILING_ENABLE(LeakyReLu); +PROFILING_ENABLE(Linear); +PROFILING_ENABLE(Sigmoid); +PROFILING_ENABLE(Softmax); +PROFILING_ENABLE(Softplus); +PROFILING_ENABLE(Softsign); +PROFILING_ENABLE(ThresholdedReLu); +// conv +PROFILING_ENABLE(Conv2D); +PROFILING_ENABLE(Conv2D_grad); +PROFILING_ENABLE(Conv2D_back); + void __show_profile() { @@ -592,5 +623,35 @@ void __show_profile() { PROFILING_PRINTF(reduce_op); PROFILING_PRINTF(reduction); PROFILING_PRINTF(reduction_back); + // activations + PROFILING_ENABLE(ELu); + PROFILING_PRINTF(Exp); + PROFILING_PRINTF(ReLu); + PROFILING_PRINTF(Tanh); + PROFILING_PRINTF(D_ELu); + PROFILING_PRINTF(D_Exp); + PROFILING_PRINTF(D_Tanh); + PROFILING_PRINTF(D_ThresholdedReLu); + PROFILING_PRINTF(D_HardSigmoid); + PROFILING_PRINTF(D_LeakyRelu); + PROFILING_PRINTF(D_Linear); + PROFILING_PRINTF(D_ReLu); + PROFILING_PRINTF(D_LeakyReLu); + PROFILING_PRINTF(D_Sigmoid); + PROFILING_PRINTF(D_Softmax); + PROFILING_PRINTF(D_softplus); + PROFILING_PRINTF(HardSigmoid); + PROFILING_PRINTF(D_softsign); + PROFILING_PRINTF(LeakyReLu); + PROFILING_PRINTF(Linear); + PROFILING_PRINTF(Sigmoid); + PROFILING_PRINTF(Softmax); + PROFILING_PRINTF(Softplus); + PROFILING_PRINTF(Softsign); + PROFILING_PRINTF(ThresholdedReLu); + // conv + PROFILING_PRINTF(Conv2D); + PROFILING_PRINTF(Conv2D_grad); + PROFILING_PRINTF(Conv2D_back); } \ No newline at end of file From 068fb04e480f74d4d91f6d8b754418227ac411da Mon Sep 17 00:00:00 2001 From: Jose Flich Date: Tue, 3 Nov 2020 17:20:00 +0100 Subject: [PATCH 14/15] UPV-GAP: Added profiling for losses, comparison, and generator operations --- src/tensor/nn/tensor_losses.cpp | 7 ++ src/tensor/tensor_comparison.cpp | 117 
+++++++++++++++++++++++++++++++ src/tensor/tensor_generator.cpp | 23 ++++++ src/utils.cpp | 58 ++++++++++++++- 4 files changed, 203 insertions(+), 2 deletions(-) diff --git a/src/tensor/nn/tensor_losses.cpp b/src/tensor/nn/tensor_losses.cpp index 4e2a0682e..2089ea492 100644 --- a/src/tensor/nn/tensor_losses.cpp +++ b/src/tensor/nn/tensor_losses.cpp @@ -8,6 +8,7 @@ */ #include "eddl/tensor/nn/tensor_nn.h" #include "eddl/hardware/cpu/nn/cpu_tensor_nn.h" +#include "eddl/profiling.h" #ifdef cGPU #include "eddl/hardware/gpu/gpu_tensor.h" @@ -20,6 +21,8 @@ #include "eddl/hardware/fpga/nn/fpga_nn.h" #endif +PROFILING_ENABLE_EXTERN(cent); + namespace tensorNN { @@ -28,6 +31,8 @@ namespace tensorNN { if (A->device != B->device) msg("Tensors in different devices", "Tensor::cross-entropy"); if ((!Tensor::sameShape(A, B)) || (!Tensor::sameShape(A, C))) msg("Incompatible dims", "Tensor::cross-entropy"); + PROFILING_HEADER(cent); + C->tsem->lock(); if (A->isCPU()) { cpu_cent(A, B, C); @@ -45,6 +50,8 @@ namespace tensorNN { } #endif C->tsem->unlock(); + + PROFILING_FOOTER(cent); } } \ No newline at end of file diff --git a/src/tensor/tensor_comparison.cpp b/src/tensor/tensor_comparison.cpp index fde935324..da1d8b22c 100644 --- a/src/tensor/tensor_comparison.cpp +++ b/src/tensor/tensor_comparison.cpp @@ -8,6 +8,7 @@ */ #include "eddl/tensor/tensor.h" #include "eddl/hardware/cpu/cpu_tensor.h" +#include "eddl/profiling.h" #ifdef cGPU #include "eddl/hardware/gpu/gpu_tensor.h" @@ -21,7 +22,32 @@ using namespace std; +PROFILING_ENABLE_EXTERN(all); +PROFILING_ENABLE_EXTERN(any); +PROFILING_ENABLE_EXTERN(isfinite); +PROFILING_ENABLE_EXTERN(isinf); +PROFILING_ENABLE_EXTERN(isnan); +PROFILING_ENABLE_EXTERN(isneginf); +PROFILING_ENABLE_EXTERN(isposinf); +PROFILING_ENABLE_EXTERN(logical_and); +PROFILING_ENABLE_EXTERN(logical_or); +PROFILING_ENABLE_EXTERN(logical_not); +PROFILING_ENABLE_EXTERN(logical_xor); +PROFILING_ENABLE_EXTERN(allclose); +PROFILING_ENABLE_EXTERN(isclose); +PROFILING_ENABLE_EXTERN(greater); +PROFILING_ENABLE_EXTERN(greater_equal); +PROFILING_ENABLE_EXTERN(less); +PROFILING_ENABLE_EXTERN(less_equal); +PROFILING_ENABLE_EXTERN(equal); +PROFILING_ENABLE_EXTERN(not_equal); +PROFILING_ENABLE_EXTERN(equivalent); + + bool Tensor::all(Tensor *A){ + + PROFILING_HEADER(all); + bool res = false; if (A->isCPU()) { @@ -39,10 +65,15 @@ bool Tensor::all(Tensor *A){ } #endif + PROFILING_FOOTER(all); + return res; } bool Tensor::any(Tensor *A){ + + PROFILING_HEADER(any); + bool res = false; if (A->isCPU()) { @@ -60,6 +91,8 @@ bool Tensor::any(Tensor *A){ } #endif + PROFILING_FOOTER(any); + return res; } @@ -67,6 +100,8 @@ bool Tensor::any(Tensor *A){ void Tensor::isfinite(Tensor *A, Tensor* B){ checkCompatibility(A, B, "Tensor::isfinite"); + PROFILING_HEADER(isfinite); + if (A->isCPU()) { cpu_isfinite(A, B); } @@ -81,11 +116,14 @@ void Tensor::isfinite(Tensor *A, Tensor* B){ fpga_isfinite(A, B); } #endif + PROFILING_FOOTER(isfinite); } void Tensor::isinf(Tensor *A, Tensor* B){ checkCompatibility(A, B, "Tensor::isinf"); + PROFILING_HEADER(isinf); + if (A->isCPU()) { cpu_isinf(A, B); } @@ -100,11 +138,14 @@ void Tensor::isinf(Tensor *A, Tensor* B){ fpga_isinf(A, B); } #endif + PROFILING_FOOTER(isinf); } void Tensor::isnan(Tensor *A, Tensor* B){ checkCompatibility(A, B, "Tensor::isnan"); + PROFILING_HEADER(isnan); + if (A->isCPU()) { cpu_isnan(A, B); } @@ -119,11 +160,14 @@ void Tensor::isnan(Tensor *A, Tensor* B){ fpga_isnan(A, B); } #endif + PROFILING_FOOTER(isnan); } void Tensor::isneginf(Tensor *A, Tensor* B){ 
checkCompatibility(A, B, "Tensor::isneginf"); + PROFILING_HEADER(isneginf); + if (A->isCPU()) { cpu_isneginf(A, B); } @@ -138,11 +182,14 @@ void Tensor::isneginf(Tensor *A, Tensor* B){ fpga_isneginf(A, B); } #endif + PROFILING_FOOTER(isneginf); } void Tensor::isposinf(Tensor *A, Tensor* B){ checkCompatibility(A, B, "Tensor::isposinf"); + PROFILING_HEADER(isposinf); + if (A->isCPU()) { cpu_isposinf(A, B); } @@ -157,6 +204,7 @@ void Tensor::isposinf(Tensor *A, Tensor* B){ fpga_isposinf(A, B); } #endif + PROFILING_FOOTER(isposinf); } @@ -165,6 +213,8 @@ void Tensor::isposinf(Tensor *A, Tensor* B){ void Tensor::logical_and(Tensor *A, Tensor *B, Tensor *C){ checkCompatibility(A, B, C, "Tensor::logical_and"); + PROFILING_HEADER(logical_and); + if (A->isCPU()) { cpu_logical_and(A, B, C); } @@ -179,11 +229,14 @@ void Tensor::logical_and(Tensor *A, Tensor *B, Tensor *C){ fpga_logical_and(A, B, C); } #endif + PROFILING_FOOTER(logical_and); } void Tensor::logical_or(Tensor *A, Tensor *B, Tensor *C){ checkCompatibility(A, B, C, "Tensor::logical_or"); + PROFILING_HEADER(logical_or); + if (A->isCPU()) { cpu_logical_or(A, B, C); } @@ -198,11 +251,14 @@ void Tensor::logical_or(Tensor *A, Tensor *B, Tensor *C){ fpga_logical_or(A, B, C); } #endif + PROFILING_FOOTER(logical_or); } void Tensor::logical_not(Tensor *A, Tensor *B){ checkCompatibility(A, B, "Tensor::logical_not"); + PROFILING_HEADER(logical_not); + if (A->isCPU()) { cpu_logical_not(A, B); } @@ -217,11 +273,14 @@ void Tensor::logical_not(Tensor *A, Tensor *B){ fpga_logical_not(A, B); } #endif + PROFILING_FOOTER(logical_not); } void Tensor::logical_xor(Tensor *A, Tensor *B, Tensor *C){ checkCompatibility(A, B, C, "Tensor::logical_xor"); + PROFILING_HEADER(logical_xor); + if (A->isCPU()) { cpu_logical_xor(A, B, C); } @@ -236,12 +295,15 @@ void Tensor::logical_xor(Tensor *A, Tensor *B, Tensor *C){ fpga_logical_xor(A, B, C); } #endif + PROFILING_FOOTER(logical_xor); } bool Tensor::allclose(Tensor *A, Tensor *B, float rtol, float atol, bool equal_nan){ checkCompatibility(A, B, "Tensor::allclose"); + PROFILING_HEADER(allclose); + if (A->isCPU()) { return cpu_allclose(A, B, rtol, atol, equal_nan); } @@ -256,6 +318,8 @@ bool Tensor::allclose(Tensor *A, Tensor *B, float rtol, float atol, bool equal_n return fpga_allclose(A, B, rtol, atol, equal_nan); } #endif + PROFILING_FOOTER(allclose); + return 0; } @@ -263,6 +327,8 @@ bool Tensor::allclose(Tensor *A, Tensor *B, float rtol, float atol, bool equal_n void Tensor::isclose(Tensor *A, Tensor *B, Tensor *C, float rtol, float atol, bool equal_nan){ checkCompatibility(A, B, C, "Tensor::isclose"); + PROFILING_HEADER(isclose); + if (A->isCPU()) { cpu_isclose(A, B, C, rtol, atol, equal_nan); } @@ -277,6 +343,7 @@ void Tensor::isclose(Tensor *A, Tensor *B, Tensor *C, float rtol, float atol, bo fpga_isclose(A, B, C, rtol, atol, equal_nan); } #endif + PROFILING_FOOTER(isclose); } void Tensor::greater_(float v){ @@ -292,6 +359,8 @@ Tensor* Tensor::greater(float v){ void Tensor::greater(Tensor *A, Tensor *B, float v){ checkCompatibility(A, B, "Tensor::greater"); + PROFILING_HEADER(greater); + if (A->isCPU()) { cpu_greater(A, B, v); } @@ -307,6 +376,8 @@ void Tensor::greater(Tensor *A, Tensor *B, float v){ fpga_greater(A, B, v); } #endif + + PROFILING_FOOTER(greater); } Tensor* Tensor::greater(Tensor *A){ @@ -318,6 +389,8 @@ Tensor* Tensor::greater(Tensor *A){ void Tensor::greater(Tensor *A, Tensor *B, Tensor *C){ checkCompatibility(A, B, C, "Tensor::greater"); + PROFILING_HEADER(greater); + if (A->isCPU()) { 
cpu_greater(A, B, C); } @@ -332,6 +405,8 @@ void Tensor::greater(Tensor *A, Tensor *B, Tensor *C){ fpga_greater(A, B, C); } #endif + + PROFILING_FOOTER(greater); } @@ -348,6 +423,8 @@ Tensor* Tensor::greater_equal(float v){ void Tensor::greater_equal(Tensor *A, Tensor *B, float v){ checkCompatibility(A, B, "Tensor::greater_equal"); + PROFILING_HEADER(greater_equal); + if (A->isCPU()) { cpu_greater_equal(A, B, v); } @@ -363,6 +440,7 @@ void Tensor::greater_equal(Tensor *A, Tensor *B, float v){ fpga_greater_equal(A, B, v); } #endif + PROFILING_FOOTER(greater_equal); } Tensor* Tensor::greater_equal(Tensor *A){ @@ -374,6 +452,8 @@ Tensor* Tensor::greater_equal(Tensor *A){ void Tensor::greater_equal(Tensor *A, Tensor *B, Tensor *C){ checkCompatibility(A, B, C, "Tensor::greater_equal"); + PROFILING_HEADER(greater_equal); + if (A->isCPU()) { cpu_greater_equal(A, B, C); } @@ -389,6 +469,8 @@ void Tensor::greater_equal(Tensor *A, Tensor *B, Tensor *C){ fpga_greater_equal(A, B, C); } #endif + + PROFILING_FOOTER(greater_equal); } @@ -405,6 +487,8 @@ Tensor* Tensor::less(float v){ void Tensor::less(Tensor *A, Tensor *B, float v){ checkCompatibility(A, B, "Tensor::less"); + PROFILING_HEADER(less); + if (A->isCPU()) { cpu_less(A, B, v); } @@ -420,6 +504,7 @@ void Tensor::less(Tensor *A, Tensor *B, float v){ fpga_less(A, B, v); } #endif + PROFILING_FOOTER(less); } Tensor* Tensor::less(Tensor *A){ @@ -431,6 +516,8 @@ Tensor* Tensor::less(Tensor *A){ void Tensor::less(Tensor *A, Tensor *B, Tensor *C){ checkCompatibility(A, B, C, "Tensor::less"); + PROFILING_HEADER(less); + if (A->isCPU()) { cpu_less(A, B, C); } @@ -445,6 +532,8 @@ void Tensor::less(Tensor *A, Tensor *B, Tensor *C){ fpga_less(A, B, C); } #endif + + PROFILING_FOOTER(less); } @@ -461,6 +550,8 @@ Tensor* Tensor::less_equal(float v){ void Tensor::less_equal(Tensor *A, Tensor *B, float v){ checkCompatibility(A, B, "Tensor::less_equal"); + PROFILING_HEADER(less_equal); + if (A->isCPU()) { cpu_less_equal(A, B, v); } @@ -476,6 +567,8 @@ void Tensor::less_equal(Tensor *A, Tensor *B, float v){ fpga_less_equal(A, B, v); } #endif + + PROFILING_FOOTER(less_equal); } @@ -488,6 +581,8 @@ Tensor* Tensor::less_equal(Tensor *A){ void Tensor::less_equal(Tensor *A, Tensor *B, Tensor *C){ checkCompatibility(A, B, C, "Tensor::less_equal"); + PROFILING_HEADER(less_equal); + if (A->isCPU()) { cpu_less_equal(A, B, C); } @@ -502,6 +597,7 @@ void Tensor::less_equal(Tensor *A, Tensor *B, Tensor *C){ fpga_less_equal(A, B, C); } #endif + PROFILING_FOOTER(less_equal); } @@ -518,6 +614,8 @@ Tensor* Tensor::equal(float v){ void Tensor::equal(Tensor *A, Tensor *B, float v){ checkCompatibility(A, B, "Tensor::equal"); + PROFILING_HEADER(equal); + if (A->isCPU()) { cpu_equal(A, B, v); } @@ -533,6 +631,8 @@ void Tensor::equal(Tensor *A, Tensor *B, float v){ fpga_equal(A, B, v); } #endif + + PROFILING_FOOTER(equal); } Tensor* Tensor::equal(Tensor *A){ @@ -544,6 +644,8 @@ Tensor* Tensor::equal(Tensor *A){ void Tensor::equal(Tensor *A, Tensor *B, Tensor *C){ checkCompatibility(A, B, C, "Tensor::equal"); + PROFILING_HEADER(equal); + if (A->isCPU()) { cpu_equal(A, B, C); } @@ -558,6 +660,8 @@ void Tensor::equal(Tensor *A, Tensor *B, Tensor *C){ fpga_equal(A, B, C); } #endif + + PROFILING_FOOTER(equal); } void Tensor::not_equal_(float v){ @@ -573,6 +677,8 @@ Tensor* Tensor::not_equal(float v){ void Tensor::not_equal(Tensor *A, Tensor *B, float v){ checkCompatibility(A, B, "Tensor::not_equal"); + PROFILING_HEADER(not_equal); + if (A->isCPU()) { cpu_not_equal(A, B, v); } @@ -588,6 
+694,8 @@ void Tensor::not_equal(Tensor *A, Tensor *B, float v){ fpga_not_equal(A, B, v); } #endif + + PROFILING_FOOTER(not_equal); } Tensor* Tensor::not_equal(Tensor *A){ @@ -599,6 +707,8 @@ Tensor* Tensor::not_equal(Tensor *A){ void Tensor::not_equal(Tensor *A, Tensor *B, Tensor *C){ checkCompatibility(A, B, C, "Tensor::not_equal"); + PROFILING_HEADER(not_equal); + if (A->isCPU()) { cpu_not_equal(A, B, C); } @@ -613,6 +723,8 @@ void Tensor::not_equal(Tensor *A, Tensor *B, Tensor *C){ fpga_not_equal(A, B, C); } #endif + + PROFILING_FOOTER(not_equal); } int Tensor::eqsize(Tensor *A, Tensor *B){ @@ -639,12 +751,15 @@ int Tensor::sameShape(Tensor *A, Tensor *B) { } int Tensor::equivalent(Tensor *A, Tensor *B, float atol, float rtol, bool equal_nan) { + // Equal device if (A->device != B->device) msg("Tensors in different devices", "Tensor::equivalent"); // Equal ndims and shapes if (!sameShape(A, B)) return 0; + PROFILING_HEADER(equivalent); + // Equal data if (A->isCPU() && B->isCPU()) { // return cpu_allclose(A, B, rtol, atol, equal_nan); @@ -663,5 +778,7 @@ int Tensor::equivalent(Tensor *A, Tensor *B, float atol, float rtol, bool equal_ } #endif + PROFILING_FOOTER(equivalent); + return 1; } diff --git a/src/tensor/tensor_generator.cpp b/src/tensor/tensor_generator.cpp index 7c6c7f9a4..c44306cd0 100644 --- a/src/tensor/tensor_generator.cpp +++ b/src/tensor/tensor_generator.cpp @@ -9,6 +9,7 @@ #include "eddl/tensor/tensor.h" #include "eddl/hardware/cpu/cpu_tensor.h" +#include "eddl/profiling.h" #ifdef cGPU #include "eddl/hardware/gpu/gpu_tensor.h" @@ -22,7 +23,15 @@ using namespace std; +PROFILING_ENABLE_EXTERN(fill_rand_uniform); +PROFILING_ENABLE_EXTERN(fill_rand_signed_uniform); +PROFILING_ENABLE_EXTERN(fill_rand_normal); +PROFILING_ENABLE_EXTERN(fill_rand_binary); + void Tensor::fill_rand_uniform_(float v) { + + PROFILING_HEADER(fill_rand_uniform); + if (isCPU()) { cpu_rand_uniform(this, v); } @@ -38,6 +47,8 @@ void Tensor::fill_rand_uniform_(float v) { } #endif + PROFILING_FOOTER(fill_rand_uniform); + } Tensor* Tensor::fill_rand_uniform(float v){ @@ -47,6 +58,9 @@ Tensor* Tensor::fill_rand_uniform(float v){ } void Tensor::fill_rand_signed_uniform_(float v) { + + PROFILING_HEADER(fill_rand_signed_uniform); + if (isCPU()) { cpu_rand_signed_uniform(this, v); } @@ -62,6 +76,7 @@ void Tensor::fill_rand_signed_uniform_(float v) { } #endif + PROFILING_FOOTER(fill_rand_signed_uniform); } Tensor* Tensor::fill_rand_signed_uniform(float v){ @@ -71,6 +86,9 @@ Tensor* Tensor::fill_rand_signed_uniform(float v){ } void Tensor::fill_rand_normal_(float m, float s, bool fast_math) { + + PROFILING_HEADER(fill_rand_normal); + if (isCPU()) { cpu_rand_normal(this, m, s, fast_math); } @@ -86,6 +104,7 @@ void Tensor::fill_rand_normal_(float m, float s, bool fast_math) { } #endif + PROFILING_FOOTER(fill_rand_normal); } Tensor* Tensor::fill_rand_normal(float m, float s, bool fast_math) { @@ -95,6 +114,9 @@ Tensor* Tensor::fill_rand_normal(float m, float s, bool fast_math) { } void Tensor::fill_rand_binary_(float v) { + + PROFILING_HEADER(fill_rand_binary); + if (isCPU()) { cpu_rand_binary(this, v); } @@ -110,6 +132,7 @@ void Tensor::fill_rand_binary_(float v) { } #endif + PROFILING_FOOTER(fill_rand_binary); } Tensor* Tensor::fill_rand_binary(float v) { diff --git a/src/utils.cpp b/src/utils.cpp index b75ba8a90..c151d1cb2 100755 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -536,7 +536,34 @@ PROFILING_ENABLE(ThresholdedReLu); PROFILING_ENABLE(Conv2D); PROFILING_ENABLE(Conv2D_grad); PROFILING_ENABLE(Conv2D_back); - 
+// losses +PROFILING_ENABLE(cent); +// generator +PROFILING_ENABLE(fill_rand_uniform); +PROFILING_ENABLE(fill_rand_signed_uniform); +PROFILING_ENABLE(fill_rand_normal); +PROFILING_ENABLE(fill_rand_binary); +// comparison +PROFILING_ENABLE(all); +PROFILING_ENABLE(any); +PROFILING_ENABLE(isfinite); +PROFILING_ENABLE(isinf); +PROFILING_ENABLE(isnan); +PROFILING_ENABLE(isneginf); +PROFILING_ENABLE(isposinf); +PROFILING_ENABLE(logical_and); +PROFILING_ENABLE(logical_or); +PROFILING_ENABLE(logical_not); +PROFILING_ENABLE(logical_xor); +PROFILING_ENABLE(allclose); +PROFILING_ENABLE(isclose); +PROFILING_ENABLE(greater); +PROFILING_ENABLE(greater_equal); +PROFILING_ENABLE(less); +PROFILING_ENABLE(less_equal); +PROFILING_ENABLE(equal); +PROFILING_ENABLE(not_equal); +PROFILING_ENABLE(equivalent); void __show_profile() { @@ -653,5 +680,32 @@ void __show_profile() { PROFILING_PRINTF(Conv2D); PROFILING_PRINTF(Conv2D_grad); PROFILING_PRINTF(Conv2D_back); - + // losses + PROFILING_PRINTF(cent); + // generator + PROFILING_PRINTF(fill_rand_uniform); + PROFILING_PRINTF(fill_rand_signed_uniform); + PROFILING_PRINTF(fill_rand_normal); + PROFILING_PRINTF(fill_rand_binary); + // comparison + PROFILING_PRINTF(all); + PROFILING_PRINTF(any); + PROFILING_PRINTF(isfinite); + PROFILING_PRINTF(isinf); + PROFILING_PRINTF(isnan); + PROFILING_PRINTF(isneginf); + PROFILING_PRINTF(isposinf); + PROFILING_PRINTF(logical_and); + PROFILING_PRINTF(logical_or); + PROFILING_PRINTF(logical_not); + PROFILING_PRINTF(logical_xor); + PROFILING_PRINTF(allclose); + PROFILING_PRINTF(isclose); + PROFILING_PRINTF(greater); + PROFILING_PRINTF(greater_equal); + PROFILING_PRINTF(less); + PROFILING_PRINTF(less_equal); + PROFILING_PRINTF(equal); + PROFILING_PRINTF(not_equal); + PROFILING_PRINTF(equivalent); } \ No newline at end of file From 962008edef5db40ef42a8a81c966b46878db198f Mon Sep 17 00:00:00 2001 From: Jose Flich Date: Tue, 3 Nov 2020 17:42:48 +0100 Subject: [PATCH 15/15] UPV-GAP: Added profling of batch normalization, core nn, metrics, pool functions --- src/tensor/nn/tensor_bn.cpp | 48 +++++++++++++++++++++++--------- src/tensor/nn/tensor_core_nn.cpp | 39 ++++++++++++++++++++++---- src/tensor/nn/tensor_metrics.cpp | 13 +++++++++ src/tensor/nn/tensor_pool.cpp | 22 ++++++++++++--- src/utils.cpp | 41 +++++++++++++++++++++++++++ 5 files changed, 141 insertions(+), 22 deletions(-) diff --git a/src/tensor/nn/tensor_bn.cpp b/src/tensor/nn/tensor_bn.cpp index db2d855ce..17b0ce0e4 100644 --- a/src/tensor/nn/tensor_bn.cpp +++ b/src/tensor/nn/tensor_bn.cpp @@ -8,6 +8,7 @@ */ #include "eddl/tensor/nn/tensor_nn.h" #include "eddl/hardware/cpu/nn/cpu_tensor_nn.h" +#include "eddl/profiling.h" #ifdef cFPGA #include "eddl/hardware/fpga/fpga_hw.h" @@ -20,10 +21,18 @@ #include "eddl/hardware/gpu/nn/gpu_tensor_nn.h" #endif +PROFILING_ENABLE_EXTERN(permute_channels_last); +PROFILING_ENABLE_EXTERN(permute_channels_first); +PROFILING_ENABLE_EXTERN(permute_batch_last); +PROFILING_ENABLE_EXTERN(permute_batch_first); + namespace tensorNN { void permute_channels_last(Tensor *A, Tensor *B) { + + PROFILING_HEADER(permute_channels_last); + if (A->isCPU()) { cpu_permute_channels_last(A, B); } @@ -34,13 +43,17 @@ namespace tensorNN { } #endif #ifdef cFPGA - else { - fpga_permute_channels_last(A, B); - } + else { + fpga_permute_channels_last(A, B); + } #endif - } + PROFILING_FOOTER(permute_channels_last); + } void permute_channels_first(Tensor *A, Tensor *B) { + + PROFILING_HEADER(permute_channels_first); + if (A->isCPU()) { cpu_permute_channels_first(A, B); } @@ 
-51,14 +64,18 @@ namespace tensorNN { } #endif #ifdef cFPGA - else { - fpga_permute_channels_first(A, B); - } + else { + fpga_permute_channels_first(A, B); + } #endif + PROFILING_FOOTER(permute_channels_first); } void permute_batch_last(Tensor *A, Tensor *B) { + + PROFILING_HEADER(permute_batch_last); + if (A->isCPU()) { cpu_permute_batch_last(A, B); } @@ -69,13 +86,17 @@ namespace tensorNN { } #endif #ifdef cFPGA - else { - fpga_permute_batch_last(A, B); - } + else { + fpga_permute_batch_last(A, B); + } #endif + PROFILING_FOOTER(permute_batch_last); } void permute_batch_first(Tensor *A, Tensor *B) { + + PROFILING_HEADER(permute_batch_first); + if (A->isCPU()) { cpu_permute_batch_first(A, B); } @@ -86,10 +107,11 @@ namespace tensorNN { } #endif #ifdef cFPGA - else { - fpga_permute_batch_first(A, B); - } + else { + fpga_permute_batch_first(A, B); + } #endif + PROFILING_FOOTER(permute_batch_last); } } \ No newline at end of file diff --git a/src/tensor/nn/tensor_core_nn.cpp b/src/tensor/nn/tensor_core_nn.cpp index 2db8eafc1..5635bd25e 100644 --- a/src/tensor/nn/tensor_core_nn.cpp +++ b/src/tensor/nn/tensor_core_nn.cpp @@ -11,6 +11,7 @@ #include "eddl/tensor/nn/tensor_nn.h" #include "eddl/hardware/cpu/nn/cpu_tensor_nn.h" +#include "eddl/profiling.h" #ifdef cGPU #include "eddl/hardware/gpu/gpu_tensor.h" @@ -25,6 +26,13 @@ extern int next_fpga_tensor_id; #endif +PROFILING_ENABLE_EXTERN(repeat_nn); +PROFILING_ENABLE_EXTERN(d_repeat_nn); +PROFILING_ENABLE_EXTERN(select); +PROFILING_ENABLE_EXTERN(select_back); +PROFILING_ENABLE_EXTERN(set_select); +PROFILING_ENABLE_EXTERN(set_select_back); + namespace tensorNN { @@ -41,6 +49,8 @@ namespace tensorNN { } } + PROFILING_HEADER(repeat_nn); + if (A->isCPU() && B->isCPU()) { cpu_repeat_nn(A, B, size); } @@ -51,15 +61,19 @@ namespace tensorNN { #endif #ifdef cFPGA else { - + printf("repeat_nn not supported yet on FPGA\n"); + exit(1); } #endif + PROFILING_FOOTER(repeat_nn); } void d_repeat_nn(Tensor *D, Tensor *A, vector size) { // TODO: Should be for N dimensions, not 2 (...and generic, not just NN) if ((D->device != A->device)) msg("Tensors in different devices", "Tensor::D_Repeat_NN"); + PROFILING_HEADER(d_repeat_nn); + if (D->isCPU() && A->isCPU()) { cpu_d_repeat_nn(D, A, size); } @@ -70,13 +84,18 @@ namespace tensorNN { #endif #ifdef cFPGA else { - + printf("d_repeat_nn not implemented in FPGA yet\n"); + exit(1); } #endif + PROFILING_FOOTER(d_repeat_nn); } void select(Tensor *A, Tensor* B, SelDescriptor *sd){ + + PROFILING_HEADER(select); + if (A->isCPU() && B->isCPU()) { cpu_select_nn(A, B, sd); } @@ -92,10 +111,13 @@ namespace tensorNN { fpga_select_nn(A, B, sd); } #endif - + PROFILING_FOOTER(select); } void select_back(Tensor *A, Tensor* B, SelDescriptor *sd){ + + PROFILING_HEADER(select_back); + if (A->isCPU() && B->isCPU()) { cpu_select_back_nn(A, B, sd); } @@ -111,10 +133,13 @@ namespace tensorNN { fpga_select_back_nn(A, B, sd); } #endif - + PROFILING_FOOTER(select_back); } void set_select(Tensor *A, Tensor *B, SelDescriptor *sd){ + + PROFILING_HEADER(set_select); + if (A->isCPU() && B->isCPU()) { cpu_set_select_nn(A, B, sd); } @@ -130,10 +155,14 @@ namespace tensorNN { fpga_set_select_nn(A, B, sd); } #endif + PROFILING_FOOTER(set_select); } void set_select_back(Tensor *A, Tensor* B, SelDescriptor *sd){ + + PROFILING_HEADER(set_select_back); + if (A->isCPU() && B->isCPU()) { cpu_set_select_back_nn(A, B, sd); } @@ -149,7 +178,7 @@ namespace tensorNN { fpga_set_select_back_nn(A, B, sd); } #endif - + PROFILING_FOOTER(set_select_back); } } diff 
--git a/src/tensor/nn/tensor_metrics.cpp b/src/tensor/nn/tensor_metrics.cpp index b1a6ffe5d..ac6753fe3 100644 --- a/src/tensor/nn/tensor_metrics.cpp +++ b/src/tensor/nn/tensor_metrics.cpp @@ -8,6 +8,7 @@ */ #include "eddl/tensor/nn/tensor_nn.h" #include "eddl/hardware/cpu/nn/cpu_tensor_nn.h" +#include "eddl/profiling.h" #ifdef cGPU #include "eddl/hardware/gpu/gpu_tensor.h" @@ -20,6 +21,9 @@ #include "eddl/hardware/fpga/nn/fpga_nn.h" #endif +PROFILING_ENABLE_EXTERN(accuracy); +PROFILING_ENABLE_EXTERN(bin_accuracy); + namespace tensorNN { @@ -28,6 +32,8 @@ namespace tensorNN { if (!Tensor::sameShape(A, B)) msg("Incompatible dims", "Tensor::accuracy"); if (A->ndim != 2) msg("Accuracy only over 2D Tensor (batch x probs)", "Tensor::Accuracy"); + PROFILING_HEADER(accuracy); + int acc = 0; B->tsem->lock(); @@ -46,6 +52,9 @@ namespace tensorNN { } #endif B->tsem->unlock(); + + PROFILING_FOOTER(accuracy); + return acc; } @@ -58,6 +67,7 @@ namespace tensorNN { if (A->shape[1] != 1) msg("Accuracy only over 2D Tensor (batch x prob) within shape:{batchx1}", "Tensor::Bin_Accuracy"); + PROFILING_HEADER(bin_accuracy); int acc = 0; @@ -77,6 +87,9 @@ namespace tensorNN { } #endif B->tsem->unlock(); + + PROFILING_FOOTER(bin_accuracy); + return acc; } diff --git a/src/tensor/nn/tensor_pool.cpp b/src/tensor/nn/tensor_pool.cpp index d3e0a7a39..c33db48c7 100644 --- a/src/tensor/nn/tensor_pool.cpp +++ b/src/tensor/nn/tensor_pool.cpp @@ -21,7 +21,10 @@ #include "eddl/hardware/fpga/nn/fpga_nn.h" #endif -PROFILING_ENABLE(MPool2D); +PROFILING_ENABLE_EXTERN(MPool2D); +PROFILING_ENABLE_EXTERN(MPool2D_back); +PROFILING_ENABLE_EXTERN(AvgPool2D); +PROFILING_ENABLE_EXTERN(AvgPool2D_back); namespace tensorNN { @@ -35,7 +38,7 @@ namespace tensorNN { ///////////////////////////////////////////////////////////////////// if ((D->I->ndim != 4)) msg("Tensors are not 4D", "Tensor::MPool2D"); - PROFILING_HEADER(MPool2D); + PROFILING_HEADER(MPool2D); D->O->tsem->lock(); if (D->I->isCPU()) { @@ -55,8 +58,7 @@ namespace tensorNN { #endif D->O->tsem->unlock(); - PROFILING_FOOTER(MPool2D); - PROFILING_PRINTF(MPool2D); + PROFILING_FOOTER(MPool2D); } void MPool2D_back(PoolDescriptor *D) { @@ -68,6 +70,8 @@ namespace tensorNN { ///////////////////////////////////////////////////////////////////// if ((D->I->ndim != 4)) msg("Tensors are not 4D", "Tensor::MPool2D_back"); + PROFILING_HEADER(MPool2D_back); + D->ID->tsem->lock(); if (D->I->isCPU()) { cpu_mpool2D_back(D); @@ -85,6 +89,8 @@ namespace tensorNN { } #endif D->ID->tsem->unlock(); + + PROFILING_FOOTER(MPool2D_back); } @@ -97,6 +103,8 @@ namespace tensorNN { ///////////////////////////////////////////////////////////////////// if ((D->I->ndim != 4)) msg("Tensors are not 4D", "Tensor::AvgPool2D"); + PROFILING_HEADER(AvgPool2D); + D->O->tsem->lock(); if (D->I->isCPU()) { cpu_avgpool2D(D); @@ -114,6 +122,8 @@ namespace tensorNN { } #endif D->O->tsem->unlock(); + + PROFILING_FOOTER(AvgPool2D); } void AvgPool2D_back(PoolDescriptor *D) { @@ -125,6 +135,8 @@ namespace tensorNN { ///////////////////////////////////////////////////////////////////// if ((D->I->ndim != 4)) msg("Tensors are not 4D", "Tensor::AvgPool2D_back"); + PROFILING_HEADER(AvgPool2D_back); + D->ID->tsem->lock(); if (D->I->isCPU()) { cpu_avgpool2D_back(D); @@ -142,6 +154,8 @@ namespace tensorNN { } #endif D->ID->tsem->unlock(); + + PROFILING_FOOTER(AvgPool2D_back); } } diff --git a/src/utils.cpp b/src/utils.cpp index c151d1cb2..7aa50e00a 100755 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -564,6 +564,26 @@ 
 PROFILING_ENABLE(less_equal);
 PROFILING_ENABLE(equal);
 PROFILING_ENABLE(not_equal);
 PROFILING_ENABLE(equivalent);
+// bn
+PROFILING_ENABLE(permute_channels_last);
+PROFILING_ENABLE(permute_channels_first);
+PROFILING_ENABLE(permute_batch_last);
+PROFILING_ENABLE(permute_batch_first);
+// core_nn
+PROFILING_ENABLE(repeat_nn);
+PROFILING_ENABLE(d_repeat_nn);
+PROFILING_ENABLE(select);
+PROFILING_ENABLE(select_back);
+PROFILING_ENABLE(set_select);
+PROFILING_ENABLE(set_select_back);
+// metrics
+PROFILING_ENABLE(accuracy);
+PROFILING_ENABLE(bin_accuracy);
+// pool
+PROFILING_ENABLE(MPool2D);
+PROFILING_ENABLE(MPool2D_back);
+PROFILING_ENABLE(AvgPool2D);
+PROFILING_ENABLE(AvgPool2D_back);
 
 void __show_profile() {
@@ -708,4 +728,25 @@ void __show_profile() {
   PROFILING_PRINTF(equal);
   PROFILING_PRINTF(not_equal);
   PROFILING_PRINTF(equivalent);
+  // bn
+  PROFILING_PRINTF(permute_channels_last);
+  PROFILING_PRINTF(permute_channels_first);
+  PROFILING_PRINTF(permute_batch_last);
+  PROFILING_PRINTF(permute_batch_first);
+  // core_nn
+  PROFILING_PRINTF(repeat_nn);
+  PROFILING_PRINTF(d_repeat_nn);
+  PROFILING_PRINTF(select);
+  PROFILING_PRINTF(select_back);
+  PROFILING_PRINTF(set_select);
+  PROFILING_PRINTF(set_select_back);
+  // metrics
+  PROFILING_PRINTF(accuracy);
+  PROFILING_PRINTF(bin_accuracy);
+  // pool
+  PROFILING_PRINTF(MPool2D);
+  PROFILING_PRINTF(MPool2D_back);
+  PROFILING_PRINTF(AvgPool2D);
+  PROFILING_PRINTF(AvgPool2D_back);
+
 }
\ No newline at end of file
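
Note on the PROFILING_* macros used throughout these patches: PROFILING_ENABLE(op) defines the per-operation counters once (in src/utils.cpp), PROFILING_ENABLE_EXTERN(op) makes them visible to each instrumented translation unit, PROFILING_HEADER(op) and PROFILING_FOOTER(op) bracket the timed region inside the operation, and __show_profile() reports every counter through PROFILING_PRINTF(op). The macros themselves are declared in eddl/profiling.h, the header these files #include, which is not shown here. The snippet below is only an illustrative sketch of how such a macro set could be implemented with std::chrono; it is not the actual contents of eddl/profiling.h, and the file name profiling_sketch.h and the identifiers prof_<op>_time / prof_<op>_calls are invented for the example.

// profiling_sketch.h : illustrative only, NOT the real eddl/profiling.h
#include <chrono>
#include <cstdio>

// Define one accumulator pair per profiled operation (done once per program).
#define PROFILING_ENABLE(fn) \
  unsigned long long prof_##fn##_time = 0;  /* accumulated microseconds */ \
  unsigned long long prof_##fn##_calls = 0  /* number of calls */

// Reference the accumulators from another translation unit.
#define PROFILING_ENABLE_EXTERN(fn) \
  extern unsigned long long prof_##fn##_time; \
  extern unsigned long long prof_##fn##_calls

// Open a timed region at the top of the instrumented function.
#define PROFILING_HEADER(fn) \
  auto prof_##fn##_start = std::chrono::high_resolution_clock::now()

// Close the timed region and accumulate the elapsed time and call count.
#define PROFILING_FOOTER(fn) \
  do { \
    auto prof_##fn##_end = std::chrono::high_resolution_clock::now(); \
    prof_##fn##_time += std::chrono::duration_cast<std::chrono::microseconds>( \
        prof_##fn##_end - prof_##fn##_start).count(); \
    prof_##fn##_calls++; \
  } while (0)

// Print one summary line per operation (called from __show_profile()).
#define PROFILING_PRINTF(fn) \
  printf("%-24s calls %10llu  time %14llu us\n", #fn, prof_##fn##_calls, prof_##fn##_time)

// Instrumentation pattern mirrored from the hunks above (op name taken from
// the cross-entropy hunk):
//   PROFILING_ENABLE_EXTERN(cent);   // at file scope in the instrumented .cpp
//   ...
//   PROFILING_HEADER(cent);          // before the CPU / cGPU / cFPGA dispatch
//   /* device dispatch */
//   PROFILING_FOOTER(cent);          // after the dispatch

One caveat visible in the hunks above: operations that return from inside the device-dispatch chain (for example Tensor::allclose, whose CPU, GPU, and FPGA branches return before the footer) never reach their PROFILING_FOOTER, so those counters only reflect the fall-through path.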