From e1316417c3583eb8e6cecd2f520305b3be1d7b25 Mon Sep 17 00:00:00 2001 From: Jose Flich Date: Wed, 14 Oct 2020 15:03:40 +0000 Subject: [PATCH 01/15] adaptation to FPGA completed for pipeline use case --- fpga_kernels/generate_makefile.cpp | 3 + .../kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp | 704 ++++++++++++++++++ fpga_kernels/test_fpga/Makefile | 19 + fpga_kernels/test_fpga/description.json | 64 ++ fpga_kernels/test_fpga/src/Makefile | 189 +++++ .../src/test_conv2D_K3x3_S1x1_P1x1_BS1.cpp | 383 ++++++++++ fpga_kernels/test_fpga/src/test_mult2d.cpp | 566 ++++++++++++++ fpga_kernels/test_fpga/src/test_relu.cpp | 304 ++++++++ fpga_kernels/test_fpga/utils.mk | 101 +++ include/eddl/hardware/fpga/fpga_enables.h | 3 +- include/eddl/hardware/fpga/fpga_hw.h | 5 +- include/eddl/profiling.h | 37 + include/eddl/tensor/tensor.h | 2 +- src/hardware/fpga/fpga_core.cpp | 7 +- src/hardware/fpga/nn/fpga_conv.cpp | 68 +- src/layers/core/layer_activation.cpp | 3 + src/layers/core/layer_reshape.cpp | 11 + src/tensor/nn/tensor_activations.cpp | 10 +- src/tensor/nn/tensor_conv.cpp | 10 +- src/tensor/nn/tensor_pool.cpp | 10 +- src/tensor/tensor.cpp | 64 +- src/tensor/tensor_comparison.cpp | 3 +- src/tensor/tensor_math.cpp | 17 +- 23 files changed, 2534 insertions(+), 49 deletions(-) create mode 100644 fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp create mode 100644 fpga_kernels/test_fpga/Makefile create mode 100644 fpga_kernels/test_fpga/description.json create mode 100644 fpga_kernels/test_fpga/src/Makefile create mode 100644 fpga_kernels/test_fpga/src/test_conv2D_K3x3_S1x1_P1x1_BS1.cpp create mode 100644 fpga_kernels/test_fpga/src/test_mult2d.cpp create mode 100644 fpga_kernels/test_fpga/src/test_relu.cpp create mode 100755 fpga_kernels/test_fpga/utils.mk create mode 100644 include/eddl/profiling.h diff --git a/fpga_kernels/generate_makefile.cpp b/fpga_kernels/generate_makefile.cpp index 2d19598d1..303ed3bec 100644 --- a/fpga_kernels/generate_makefile.cpp +++ b/fpga_kernels/generate_makefile.cpp @@ -166,6 +166,9 @@ int main(int argc, char **argv) { #ifdef K_ENABLED_CONV2D strcpy(szKernels[num_kernels++], "conv2d"); #endif + #ifdef K_ENABLED_CONV2D_K3X3_S1X1_P1X1_BS1 + strcpy(szKernels[num_kernels++], "conv2D_K3x3_S1x1_P1x1_BS1"); + #endif // Core diff --git a/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp b/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp new file mode 100644 index 000000000..20b25cbd2 --- /dev/null +++ b/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp @@ -0,0 +1,704 @@ +//KERNEL_CONV2D_4.cpp +//Modified by: Jorge García Martinez +//Date: 17/09/2020 +//Description: Based on kenel_conv2d_3.cpp. The goal of this code is to perform convolutions with a large number of inputs +//and outputs.For this, we use iteratively a limited number of input and output channels in the kernel. +//In all functions are used two loops for output and input iterations. In add function is added a buffer which stores +//the data that It should be written into the memory. 
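+//
+// Pipeline overview (informal sketch, summarising the stages defined below; the
+// stage names are the functions in this file, not an external API):
+//   read_input -> padding -> cvt -> mul -> add -> write_output
+// Channels are processed in groups: CPI input channels and CPO output channels
+// per pass, so the kernel iterates I_ITER = I/CPI times over the input for each
+// of the O_ITER = O/CPO output groups. For example, with I = 8, O = 8 and
+// CPI = CPO = 4 the kernel performs I_ITER = 2 and O_ITER = 2 passes, and the
+// add stage accumulates the two partial sums of each output group before the
+// result is written back to memory.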
+ + + +#include +#include +#include + +#include + +//#define DEBUG_VERBOSE + +extern "C" { + +// Fixed parameters (optimized at compilation/synthesis time) +#define KW 3 // kernel width +#define KH 3 // kernel height +//#define I 8 // number of input channels +//#define O 8 // number of output channels +#define CPI 4 // channels per input port +#define CPO 4 // channels per output port +//#define W 256 // input width +//#define H 256 // input height +//#define I_ITER I/CPI // iterations per input +//#define O_ITER O/CPO // iterations per output + +#define LOAD_MODEL +#define READ_MODEL +#define READ_INPUT +#define WRITE_OUTPUT + +// pixel_in +struct pixel_in_t { + float pixel[CPI]; +}; + +struct pixel_out_t { + float pixel[CPO]; +}; + +// frames struct +struct frame_t { + pixel_in_t pixel[9]; +}; + +// -------------------------------------------------------------------------------------- +// read_input: +// The function reads and writes the kernels, bias and data in different stream. +// Data are sent to padding module, kenels to mul and bias to add modules. +// LOOP FLOW +// ko = 0 +// b = 0 +// for o_iter 0 .. n +// read bias[b..b+3] +// b = b + 4 +// d = 0 +// ki = 0 +// for i_iter 0 .. n +// read kernel[ki..ki+3][ko..ko+3] +// ki = ki +4 +// read data[d..d+3] +// d = d + 4 +// +// ko = ko + 4 +// +// +// Arguments: +// ptr : Pointer to input data (in) +// k_ptr: pointer to kernels (in) +// b_ptr: pointer to bias (in) +// out : data output stream (out) +// k_out: pointer to kernel (out) +// b_out: pointer to bias (out) +// +static void read_input(int H, int W, int I, int O, int I_ITER, int O_ITER, pixel_in_t *ptr, float *k_ptr, float *b_ptr, hls::stream &k_out, hls::stream &b_out, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("read_input: start\n"); +#endif + + frame_t frame_k; + #pragma HLS ARRAY_PARTITION variable=frame_k dim=0 + + pixel_out_t bias; + #pragma HLS ARRAY_PARTITION variable=bias dim=0 + + pixel_in_t data; + #pragma HLS ARRAY_PARTITION variable=data dim=0 + + + read_input_o_iter_loop: + for (int o_iter = 0; o_iter < O_ITER; o_iter++){ + //Sending bias to add in pack of CPO bias + // int data_pointer = 0; + read_loop_bias_load: + for (int b=0; b &in, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("padding: start\n"); +#endif + +//we init zero only first time + +pixel_in_t data; +#pragma HLS ARRAY_PARTITION variable=data complete + +pixel_in_t zero; +#pragma HLS ARRAY_PARTITION variable=data complete + +for (int cpi=0; cpi &in, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("relu: start\n"); +#endif + + int data_size = W * H * O; + for (int i=0; i < data_size; i++) { + #pragma HLS PIPELINE II=1 + float data = in.read(); + if (data < 0) data = 0.f; + out << data; + } + +#ifdef DEBUG_VERBOSE + printf("relu: end\n"); +#endif +} + +// -------------------------------------------------------------------------------- +// write_output: Writes data comming from one stream into memory +// LOOP FLOW: +// for o_iter 0 .. n +// write data[do .. 
do+3] +// +// d = d + 4 +// +// Arguments: +// ptr: memory address pointer +// in: input stream +// +static void write_output(int H, int W, int O_ITER, pixel_out_t *ptr, hls::stream &in) { + +#ifdef DEBUG_VERBOSE + printf("write_output: start\n"); +#endif + + + + // int data_pointer = 0; + + // write_output_o_iter_loop: + // for (int o_iter = 0; o_iter &in, hls::stream &out, int id) { + +#ifdef DEBUG_VERBOSE + printf("cvt_%d: start\n", id); +#endif + +cvt_o_iter_loop: +for (int o_iter = 0; o_iter < O_ITER; o_iter++){ + cvt_i_iter_loop: + for(int i_iter = 0; i_iter < I_ITER; i_iter++){ + + // Now we process the input data and convert the data into frames + + // buffers (keep three rows) + pixel_in_t buffer0[W+2]; + pixel_in_t buffer1[W+2]; + pixel_in_t buffer2[W+2]; + #pragma HLS ARRAY_PARTITION variable=buffer0 cyclic dim=1 factor=2 + #pragma HLS ARRAY_PARTITION variable=buffer1 cyclic dim=1 factor=2 + #pragma HLS ARRAY_PARTITION variable=buffer2 cyclic dim=1 factor=2 + + // frame + frame_t frame; + #pragma HLS ARRAY_PARTITION variable=frame + + // We loop for every incoming pixel + cvt_loop_1: + for (int pin_row=0; pin_row < H+2; pin_row++) { + cvt_loop_2: + for (int pin_col=0; pin_col < W+2; pin_col++) { + // get the pixel + pixel_in_t pixel; + pixel = in.read(); + // row buffer write (in which buffer row we write the pixel) + int row0_buffer_write = (pin_row % 3) == 0; + int row1_buffer_write = (pin_row % 3) == 1; + // first row buffer + int row0 = (pin_row <= 2) | ((pin_row % 3) == 2); + int row1 = !row0 & ((pin_row % 3) == 0); + // we write the pixel into the buffer + if (row0_buffer_write) buffer0[pin_col] = pixel; else if (row1_buffer_write) buffer1[pin_col] = pixel; else buffer2[pin_col] = pixel; + // build the frame + pixel_in_t p0, p1, p2, p3, p4, p5, p6, p7, p8; + int shift_frame = (pin_row>1) & (pin_col > 2); + int send_frame = (pin_row>1) & (pin_col > 1); + pixel_in_t pixel_b0, pixel_b1, pixel_b2; + pixel_b0 = buffer0[pin_col]; + pixel_b1 = buffer1[pin_col]; + pixel_b2 = buffer2[pin_col]; + // p0, p1, p2 + if (shift_frame) {p0 = p1;} else if (pin_col==0) {if (row0) p0 = pixel_b0; else if (row1) p0 = pixel_b1; else p0 = pixel_b2;} + if (shift_frame) {p1 = p2;} else if (pin_col==1) {if (row0) p1 = pixel_b0; else if (row1) p1 = pixel_b1; else p1 = pixel_b2;} + if (row0) p2 = pixel_b0; else if (row1) p2 = pixel_b1; else p2 = pixel_b2; + // p3, p4, p5 + if (shift_frame) {p3 = p4;} else if (pin_col==0) {if (row0) p3 = pixel_b1; else if (row1) p3 = pixel_b2; else p3 = pixel_b0;} + if (shift_frame) {p4 = p5;} else if (pin_col==1) {if (row0) p4 = pixel_b1; else if (row1) p4 = pixel_b2; else p4 = pixel_b0;} + if (row0) p5 = pixel_b1; else if (row1) p5 = pixel_b2; else p5 = pixel_b0; + // p6, p7, p8 + if (shift_frame) {p6 = p7;} else if (pin_col==0) {if (row0) p6 = pixel_b2; else if (row1) p6 = pixel_b0; else p6 = pixel_b1;} + if (shift_frame) {p7 = p8;} else if (pin_col==1) {if (row0) p7 = pixel_b2; else if (row1) p7 = pixel_b0; else p7 = pixel_b1;} + if (row0) p8 = pixel_b2; else if (row1) p8 = pixel_b0; else p8 = pixel_b1; + + if (send_frame) { + frame.pixel[0] = p0; frame.pixel[1] = p1; frame.pixel[2] = p2; + frame.pixel[3] = p3; frame.pixel[4] = p4; frame.pixel[5] = p5; + frame.pixel[6] = p6; frame.pixel[7] = p7; frame.pixel[8] = p8; + out << frame; + #ifdef DEBUG_VERBOSE + printf("cvt_%d: frame sent:\n", id); + for (int cpi=0; cpi &in, hls::stream &k_in, hls::stream &out, int id) { + +#ifdef DEBUG_VERBOSE + printf("mul_%d: start\n", id); +#endif + + // first we read the kernels + 
frame_t kernel[CPI]; + #pragma HLS ARRAY_PARTITION variable=kernel dim=0 + frame_t data_in; + +#ifdef LOAD_MODEL + + mul_o_iter_loop: + for (int o_iter = 0; o_iter < O_ITER; o_iter++){ + mul_i_iter_loop: + for(int i_iter = 0; i_iter < I_ITER; i_iter++){ + //we load the kernels into pack of frames + loop_mul_kernels_load_cpo: + for (int cpi=0; cpi %6.4f\n", cpo, sum[cpo]); + #endif + p_out.pixel[cpo] = sum[cpo]; + sum[cpo] = 0.f; + } + out << p_out; + } + } //i_iter +} //o_iter + +#endif + + +#ifdef DEBUG_VERBOSE + printf("mul_%d: end\n", id); +#endif +} + +// ------------------------------------------------------------------------------- +// add: This function performs the addition of all subpixels for the same channel. +// It adds also the corresponding bias. +// LOOP FLOW +// for o_iter 0 .. n +// receive bias[b..b+3] +// init buff_o_channels with bias +// for i_iter 0 .. n +// receive data[do..d+3] +// buff_o_channels = buff_o_channels + data +// +// for num_iterations +// for CPO +// send data to write module +// +// Arguments: +// in: input streams data +// b_in: input stream bias +// out: output stream +// +static void add(int H, int W, int I_ITER, int O_ITER, hls::stream &in, hls::stream &b_in, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("add: start\n"); +#endif + + float bias[CPO]; + + //number of iterations by CPI || CPO channels + int num_iterations = W * H; + + //Buffer for all data and CPO channels + float buff_o_channels[CPO][num_iterations]; + #pragma HLS ARRAY_PARTITION variable=buff_o_channels dim=0 block factor=4 + + //We read Bias in O_iter packs of CPO size + add_o_iter_loop: + for (int o_iter = 0; o_iter &in, hls::stream &k_in, hls::stream &b_in, hls::stream &out) { + + // streams + static hls::stream str_pad_cvt; // padding->cvt + static hls::stream str_cvt_mul; // cvt->mul + static hls::stream str_mul_add; // mul->add + + + // topology + #pragma HLS dataflow + padding(H, W, I_ITER, O_ITER, in, str_pad_cvt); // padding + cvt(H, W, I_ITER, O_ITER, str_pad_cvt, str_cvt_mul, 0); // cvt + mul(H, W, I_ITER, O_ITER, str_cvt_mul, k_in, str_mul_add, 0); // mul + add(H, W, I_ITER, O_ITER, str_mul_add, b_in, out); // add +} + +void k_conv2D_K3x3_S1x1_P1x1_BS1(pixel_in_t *ptr_data, int H, int W, int I, float *ptr_kernel, float *ptr_bias, pixel_out_t *ptr_out, int O) { + + //#pragma HLS INTERFACE s_axilite port=W bundle=control + //#pragma HLS INTERFACE s_axilite port=H bundle=control + #pragma HLS INTERFACE m_axi port=ptr_data offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_kernel offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_bias offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_out offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE s_axilite port=return bundle=control + + // ptr_data struct to be packed as a single element vector (to improve memory read) + // the compiler will do full structure access (all elements of structure) + #pragma HLS data_pack variable = ptr_data + #pragma HLS data_pack variable = ptr_out + + int I_ITER = I/CPI; + int O_ITER = O/CPO; + + // input and output streams + static hls::stream out_read; + static hls::stream out_read_kernel; + static hls::stream out_read_bias; + static hls::stream out_conv; + + // stream sizes + #pragma HLS STREAM variable = out_read depth = 32 + #pragma HLS STREAM 
variable = out_read_kernel depth = 32 + #pragma HLS STREAM variable = out_read_bias depth = 32 + #pragma HLS STREAM variable = out_conv depth = 32 + #pragma HLS STREAM variable = out_relu depth = 32 + + #pragma HLS dataflow + read_input(H, W, I, O, I_ITER, O_ITER, ptr_data, ptr_kernel, ptr_bias, out_read_kernel, out_read_bias, out_read); + conv(H, W, I, O, I_ITER, O_ITER, out_read, out_read_kernel, out_read_bias, out_conv); + write_output(H, W, O_ITER, ptr_out, out_conv); +} + +} // end extern "C" diff --git a/fpga_kernels/test_fpga/Makefile b/fpga_kernels/test_fpga/Makefile new file mode 100644 index 000000000..7dc6b92d8 --- /dev/null +++ b/fpga_kernels/test_fpga/Makefile @@ -0,0 +1,19 @@ +# list of kernel test to compile +LIST ?=conv2D_K3x3_S1x1_P1x1_BS1 + +# default target +all build clean cleanall: KERNELS + +KERNELS : + for krnl in $(LIST); do \ + $(info Launch Makefile to generate test for kernel $(krnl)) \ + $(MAKE) -C ./src -e KNAME=$$krnl $(MAKECMDGOALS) ; \ + done +# build any target by forwarding to $(dirs) rule +#% : $(DIRS) ; + +.PHONY: KERNELS all build clean cleanall + + +$(info all done) + diff --git a/fpga_kernels/test_fpga/description.json b/fpga_kernels/test_fpga/description.json new file mode 100644 index 000000000..4625f665d --- /dev/null +++ b/fpga_kernels/test_fpga/description.json @@ -0,0 +1,64 @@ +{ + "name": "Data Transfer (C)", + "description": [ + "This example illustrates several ways to use the OpenCL API to transfer data to and from the FPGA" + ], + "keywords": [ + "enqueueWriteBuffer", + "enqueueReadBuffer", + "enqueueMapBuffer", + "enqueueUnmapMemObject", + "enqueueMigrateMemObjects" + ], + "key_concepts": [ + "OpenCL API", + "Data Transfer", + "Write Buffers", + "Read Buffers", + "Map Buffers", + "Async Memcpy" + ], + "os": [ + "Linux" + ], + "runtime": [ + "OpenCL" + ], + "host": { + "host_exe": "data_transfer", + "compiler": { + "sources": [ + "REPO_DIR/common/includes/xcl2" + ], + "includepaths": [ + "REPO_DIR/common/includes/xcl2" + ] + } + }, + "containers": [ + { + "accelerators": [ + { + "name": "dummy_kernel", + "location": "src/dummy_kernel.cpp" + } + ], + "name": "dummy_kernel" + } + ], + "launch": [ + { + "cmd_args": "BUILD/dummy_kernel.xclbin", + "name": "generic launch for all flows" + } + ], + "contributors": [ + { + "url": "http://www.xilinx.com", + "group": "Xilinx" + } + ], + "testinfo": { + "profile": "no" + } +} diff --git a/fpga_kernels/test_fpga/src/Makefile b/fpga_kernels/test_fpga/src/Makefile new file mode 100644 index 000000000..97b860376 --- /dev/null +++ b/fpga_kernels/test_fpga/src/Makefile @@ -0,0 +1,189 @@ +.PHONY: help + +help:: + $(ECHO) "Makefile Usage:" + $(ECHO) " make all TARGET= DEVICE= HOST_ARCH= EDGE_COMMON_SW=" + $(ECHO) " Command to generate the design for specified Target and Shell." + $(ECHO) " By default, HOST_ARCH=x86. HOST_ARCH and EDGE_COMMON_SW is required for SoC shells" + $(ECHO) "" + $(ECHO) " make clean " + $(ECHO) " Command to remove the generated non-hardware files." + $(ECHO) "" + $(ECHO) " make cleanall" + $(ECHO) " Command to remove all the generated files." + $(ECHO) "" + $(ECHO) " make test DEVICE=" + $(ECHO) " Command to run the application. This is same as 'check' target but does not have any makefile dependency." + $(ECHO) "" + $(ECHO) " make sd_card TARGET= DEVICE= HOST_ARCH= EDGE_COMMON_SW=" + $(ECHO) " Command to prepare sd_card files." + $(ECHO) " By default, HOST_ARCH=x86. 
HOST_ARCH and EDGE_COMMON_SW is required for SoC shells" + $(ECHO) "" + $(ECHO) " make check TARGET= DEVICE= HOST_ARCH= EDGE_COMMON_SW=" + $(ECHO) " Command to run application in emulation." + $(ECHO) " By default, HOST_ARCH=x86. HOST_ARCH and EDGE_COMMON_SW is required for SoC shells" + $(ECHO) "" + $(ECHO) " make build TARGET= DEVICE= HOST_ARCH= EDGE_COMMON_SW=" + $(ECHO) " Command to build xclbin application." + $(ECHO) " By default, HOST_ARCH=x86. HOST_ARCH and EDGE_COMMON_SW is required for SoC shells" + $(ECHO) "" + + +KNAME ?= DUMMY + +KRNL_DIR ?= ./../.. +KRNL_FUNC ?= k_$(KNAME) +KRNL_NAME ?= kernel_$(KNAME) +KRNL_SRCS ?= $(KRNL_DIR)/kernel_$(KNAME).cpp +KRNL_FILE := kernel_$(KNAME).cpp +TEST_SRCS ?= test_$(KNAME).cpp + +DEVICE ?= xilinx_u200_xdma_201830_2 +TARGET ?= sw_emu +HOST_ARCH ?= x86 +SYSROOT ?= + +$(info ) +$(info ) +$(info Running Makefile for KERNEL $(KNAME) DEVICE $(DEVICE) TARGET $(TARGET) ) +$(info ) + +BASE_DIR = ./.. + +# Points to top directory of Git repository +COMMON_REPO = $(BASE_DIR) +PWD = $(shell readlink -f .) +ABS_COMMON_REPO = $(shell readlink -f $(COMMON_REPO)) + + +include $(BASE_DIR)/utils.mk + +XSA := $(call device2xsa, $(DEVICE)) +TEMP_DIR := $(BASE_DIR)/_x.$(TARGET).$(XSA) +BUILD_DIR := $(BASE_DIR)/build_dir.$(TARGET).$(XSA) + +# SoC variables +RUN_APP_SCRIPT = run_app.sh +PACKAGE_OUT = package.$(TARGET) + +LAUNCH_EMULATOR = $(PACKAGE_OUT)/launch_$(TARGET).sh +RESULT_STRING = TEST PASSED + +VPP := v++ +SDCARD := sd_card + +#Include Libraries +include $(ABS_COMMON_REPO)/common/includes/opencl/opencl.mk +include $(ABS_COMMON_REPO)/common/includes/xcl2/xcl2.mk +CXXFLAGS += $(xcl2_CXXFLAGS) +LDFLAGS += $(xcl2_LDFLAGS) +HOST_SRCS += $(xcl2_SRCS) +CXXFLAGS += $(opencl_CXXFLAGS) -Wall -O0 -g -std=c++11 +LDFLAGS += $(opencl_LDFLAGS) +INCL_DIR := -I$(KRNL_DIR) + +HOST_SRCS += $(TEST_SRCS) + +# Host compiler global settings +CXXFLAGS += -fmessage-length=0 +LDFLAGS += -lrt -lstdc++ + + +ifneq ($(HOST_ARCH), x86) + LDFLAGS += --sysroot=$(SYSROOT) +endif + +# Kernel compiler global settings +CLFLAGS += -t $(TARGET) --platform $(DEVICE) --save-temps +ifneq ($(TARGET), hw) + CLFLAGS += -g +endif + +EXECUTABLE := $(TEST_SRCS:%.cpp=%) +#$(info "EXECUTABLE is $(EXECUTABLE)" ) +KRNL_XCLBIN = $(KRNL_NAME:%=%.xclbin) +KRNL_OBJ = $(KRNL_NAME:%=%.xo) + +CMD_ARGS = $(BUILD_DIR)/$(KRNL_XCLBIN) +EMCONFIG_DIR = $(TEMP_DIR) +EMU_DIR = $(SDCARD)/data/emulation + +BINARY_CONTAINERS += $(BUILD_DIR)/$(KRNL_XCLBIN) +BINARY_CONTAINER_kernel_OBJS += $(TEMP_DIR)/$(KRNL_OBJ) + +CP = cp -rf + +.PHONY: all clean cleanall docs emconfig +all: check-devices $(EXECUTABLE) $(BINARY_CONTAINERS) emconfig sd_card + +.PHONY: exe +exe: $(EXECUTABLE) + +.PHONY: build +build: check-vitis $(BINARY_CONTAINERS) + + +# Building kernel +$(TEMP_DIR)/$(KRNL_OBJ): $(KRNL_SRCS) + mkdir -p $(TEMP_DIR) + $(VPP) $(CLFLAGS) --temp_dir $(TEMP_DIR) -c -k $(KRNL_FUNC) -I'$( /* printf, scanf, NULL */ +#include /* malloc, free, rand */ + +#include +#include +#include +#include +#include +#include "xcl2.hpp" + +using std::vector; + +// CL +cl::Buffer buf; +cl::Context context; +cl::CommandQueue q; +cl::Program program; + + +#define W 256 //256 +#define H 256 //256 +#define C 4 //I +#define COUT 4 //O +#define KW 3 +#define KH 3 + +// buffers +float data_in[ W * H * C ] __attribute__ ((__aligned__(16))); +float kernel [ KW * KH * C * COUT] __attribute__ ((__aligned__(16))); +float bias [ COUT ] __attribute__ ((__aligned__(16))); +float out [ W * H * COUT ] __attribute__ ((__aligned__(16))); +float out_cpu[ W * H * COUT 
] __attribute__ ((__aligned__(16))); + +void cpu_conv2d() { + + int size_out = W * H * COUT; + for (int i=0; i 0.001) { + printf("Results mismatch at cout %d h %d w %d: %6.4f %6.4f (diff %6.4f)\n", cout, h, w, out_cpu[addr_o], out[addr_o], fabs(out_cpu[addr_o]-out[addr_o])); + error = 1; + return; + } + } + } + } + if (!error) printf("results OK!\n"); else { + printf("results differ:\n"); + //cpu_print_out(); + } +} + + +//--------------------------------------------------------------------------------------------------------------------- +//--------------------------------------------------------------------------------------------------------------------- + +// An event callback function that prints the operations performed by the OpenCL +// runtime. +void event_cb(cl_event event1, cl_int cmd_status, void *data) { + cl_int err; + cl_command_type command; + cl::Event event(event1, true); + OCL_CHECK(err, err = event.getInfo(CL_EVENT_COMMAND_TYPE, &command)); + cl_int status; + OCL_CHECK(err, + err = event.getInfo(CL_EVENT_COMMAND_EXECUTION_STATUS, &status)); + const char *command_str; + const char *status_str; + switch (command) { + case CL_COMMAND_READ_BUFFER: + command_str = "buffer read"; + break; + case CL_COMMAND_WRITE_BUFFER: + command_str = "buffer write"; + break; + case CL_COMMAND_NDRANGE_KERNEL: + command_str = "kernel"; + break; + case CL_COMMAND_MAP_BUFFER: + command_str = "kernel"; + break; + case CL_COMMAND_COPY_BUFFER: + command_str = "kernel"; + break; + case CL_COMMAND_MIGRATE_MEM_OBJECTS: + command_str = "buffer migrate"; + break; + default: + command_str = "unknown"; + } + switch (status) { + case CL_QUEUED: + status_str = "Queued"; + break; + case CL_SUBMITTED: + status_str = "Submitted"; + break; + case CL_RUNNING: + status_str = "Executing"; + break; + case CL_COMPLETE: + status_str = "Completed"; + break; + } + printf("[%s]: %s %s\n", reinterpret_cast(data), status_str, + command_str); + fflush(stdout); +} + +// Sets the callback for a particular event +void set_callback(cl::Event event, const char *queue_name) { + cl_int err; + OCL_CHECK(err, + err = event.setCallback(CL_COMPLETE, event_cb, (void *)queue_name)); +} + +//--------------------------------------------------------------------------------------------------------------------- + +int main(int argc, char **argv) { + if (argc != 2) { + std::cout << "Usage: " << argv[0] << " " << std::endl; + return EXIT_FAILURE; + } + + printf("Test CONV: [WxHxC] = [%dx%dx%d] -> [WxHxC] = [%dx%dx%d] (kernel [%dx%d], stride [1x1], padding [1x1])\n", W, H, C, W, H, COUT, KW, KH); + + std::string binaryFile = argv[1]; + cl_int err; + cl::Kernel kernel_conv2d_2; + + std::cout << "Creating Context..." << std::endl; + auto devices = xcl::get_xil_devices(); + auto device = devices[0]; + OCL_CHECK(err, cl::Context context(device, NULL, NULL, NULL, &err)); + OCL_CHECK(err, cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE, &err)); + + std::string device_name = device.getInfo(); + auto fileBuf = xcl::read_binary_file(binaryFile); + cl::Program::Binaries bins{{fileBuf.data(), fileBuf.size()}}; + devices.resize(1); + + OCL_CHECK(err, cl::Program program(context, devices, bins, NULL, &err)); + std::cout << "Device " << device_name.c_str() << ": program successful!" 
<< std::endl; + + OCL_CHECK(err, kernel_conv2d_2 = cl::Kernel(program,"k_conv2D_K3x3_S1x1_P1x1_BS1", &err)); + std::cout << "Kernel sucessfully created" << std::endl ; + + size_t size_data_in_bytes = W*H*C*sizeof(float); + size_t size_output_in_bytes = W*H*COUT * sizeof(float); + size_t size_kernel_in_bytes = KW * KH * C * COUT * sizeof(float); + size_t size_bias_in_bytes = COUT * sizeof(float); + // Allocate memory on the host and fill with random data. + + //----------------------------- + // fill data vector with random data + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + std::cout << "Filling buffer with useful data" << std::endl ; + int addr = 0; + for (int h=0; h kernel_events(1); + vector read_events(1); + vector write_events(1); + cl::Buffer buffer_a; + cl::Buffer buffer_b; + cl::Buffer buffer_k; + cl::Buffer buffer_bias; + + //----------------------------- + // Allocate Buffer in Global Memory + // Buffers are allocated using CL_MEM_USE_HOST_PTR for efficient memory and + // Device-to-host communication + std::cout << "Creating Buffers..." << std::endl; + + OCL_CHECK(err, buffer_a = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_data_in_bytes, &data_in, &err)); + OCL_CHECK(err, buffer_b = cl::Buffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR , size_output_in_bytes, &out, &err)); + OCL_CHECK(err, buffer_k = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_kernel_in_bytes, &kernel, &err)); + OCL_CHECK(err, buffer_bias = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_bias_in_bytes, &bias, &err)); + + // set kernel arguments + int arg = 0; + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_a)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, H)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, W)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, C)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_k)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_bias)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_b)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, COUT)); + + //----------------------------- + // Copy input data to device global memory + std::cout << "Copying data (Host to Device)..." << std::endl; + // Because we are passing the write_events, it returns an event object + // that identifies this particular command and can be used to query + // or queue a wait for this particular command to complete. + OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_a}, 0 /*0 means from host*/, NULL, &write_events[0])); + set_callback(write_events[0], "ooo_queue"); + + OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_k}, 0 /*0 means from host*/, NULL, &write_events[0])); + set_callback(write_events[0], "ooo_queue"); + + //----------------------------- + printf("Enqueueing NDRange kernel.\n"); + // This event needs to wait for the write buffer operations to complete + // before executing. We are sending the write_events into its wait list to + // ensure that the order of operations is correct. + // Launch the Kernel + std::vector waitList; + waitList.push_back(write_events[0]); + OCL_CHECK(err, err = q.enqueueNDRangeKernel(kernel_conv2d_2, 0, 1, 1, &waitList, &kernel_events[0])); + set_callback(kernel_events[0], "ooo_queue"); + + + + std::cout << "Getting Results (Device to Host)..." 
<< std::endl; + std::vector eventList; + eventList.push_back(kernel_events[0]); + // This operation only needs to wait for the kernel call. + OCL_CHECK(err, err = q.enqueueMigrateMemObjects({buffer_b}, CL_MIGRATE_MEM_OBJECT_HOST, &eventList, &read_events[0])); + set_callback(read_events[0], "ooo_queue"); + OCL_CHECK(err, err = read_events[0].wait()); + + // Wait for all of the OpenCL operations to complete + std::cout << "Waiting..." << std::endl; + OCL_CHECK(err, err = q.flush()); + OCL_CHECK(err, err = q.finish()); + + + std::cout << "computing conv in CPU..." << std::endl; + + // cpu_print_data_in(); + // cpu_print_kernels(); + // cpu_print_bias(); + // cpu_conv2d(); + // cpu_print_out(); + + // check_result(); + + //----------------------------- + std::cout << "" << std::endl; + std::cout << "All done" << std::endl; + std::cout << "quit now" << std::endl; + + // exit + return 0; +} diff --git a/fpga_kernels/test_fpga/src/test_mult2d.cpp b/fpga_kernels/test_fpga/src/test_mult2d.cpp new file mode 100644 index 000000000..37add80b6 --- /dev/null +++ b/fpga_kernels/test_fpga/src/test_mult2d.cpp @@ -0,0 +1,566 @@ +#include /* printf, scanf, NULL */ +#include /* malloc, free, rand */ + +#include +#include +#include +#include +#include +#include "xcl2.hpp" + +//#define VERBOSE +//#define DEBUG + +using std::vector; + +//--------------------------------------------------------------------------------------------------------------------- +//--------------------------------------------------------------------------------------------------------------------- + +// An event callback function that prints the operations performed by the OpenCL +// runtime. +void event_cb( + cl_event event1, + cl_int cmd_status + , void *data +) { + cl_int err; + cl_command_type command; + cl::Event event(event1, true); + OCL_CHECK(err, err = event.getInfo(CL_EVENT_COMMAND_TYPE, &command)); + cl_int status; + OCL_CHECK(err, + err = event.getInfo(CL_EVENT_COMMAND_EXECUTION_STATUS, &status)); + const char *command_str; + const char *status_str; + switch (command) { + case CL_COMMAND_READ_BUFFER: + command_str = "buffer read"; + break; + case CL_COMMAND_WRITE_BUFFER: + command_str = "buffer write"; + break; + case CL_COMMAND_NDRANGE_KERNEL: + command_str = "kernel"; + break; + case CL_COMMAND_MAP_BUFFER: + command_str = "kernel"; + break; + case CL_COMMAND_COPY_BUFFER: + command_str = "kernel"; + break; + case CL_COMMAND_MIGRATE_MEM_OBJECTS: + command_str = "buffer migrate"; + break; + default: + command_str = "unknown"; + } + switch (status) { + case CL_QUEUED: + status_str = "Queued"; + break; + case CL_SUBMITTED: + status_str = "Submitted"; + break; + case CL_RUNNING: + status_str = "Executing"; + break; + case CL_COMPLETE: + status_str = "Completed"; + break; + } + printf("[%s]: %s %s\n", reinterpret_cast(data), status_str, + command_str); + fflush(stdout); +} + +//--------------------------------------------------------------------------------------------------------------------- +// Sets the callback for a particular event +void set_callback( + cl::Event event, + const char *queue_name +) { + cl_int err; + OCL_CHECK(err, err = event.setCallback(CL_COMPLETE, event_cb, (void *)queue_name)); +} + +//--------------------------------------------------------------------------------------------------------------------- +void usage ( + char *p_name +) { + std::cout << "ERROR: unexpected number of parameters" << std::endl; + std::cout << "Usage: " << p_name << " " << " " << std::endl; + exit(EXIT_FAILURE); +} + 
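+//---------------------------------------------------------------------------------------------------------------------
+// What this test checks (informal sketch; the actual reference is computed in
+// run_cpu() further below): the kernel is expected to produce
+//   C = op(A) * op(B) + (incC ? C : 0)
+// where op(A) is A or its transpose depending on tA, and op(B) is B or its
+// transpose depending on tB. The FPGA result in c[] is then compared element by
+// element against the host-side c_local[].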
+//--------------------------------------------------------------------------------------------------------------------- +void fpga_init( + cl::Context &context, + cl::CommandQueue &q, + cl::Program &program, + cl::Kernel &kernel_ut, + const char *fname, // fpga device binary file name + const char *k_name // kernel name +) { + cl_int err; + std::string binaryFile = fname; + + // OPENCL HOST CODE AREA START + // get_xil_devices() is a utility API which will find the xilinx + // platforms and will return list of devices connected to Xilinx platform + std::cout << "Creating Context..." << std::endl; + // The get_xil_devices will return vector of Xilinx Devices + auto devices = xcl::get_xil_devices(); + auto device = devices[0]; + +// std::vector> host_memory(elements, 42); + // Creating Context and Command Queue for selected Device + OCL_CHECK(err, context = cl::Context(device, NULL, NULL, NULL, &err)); + + std::cout << " setting command queue" << std::endl; + OCL_CHECK(err, q = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err)); + + std::string device_name = device.getInfo(); + std::cout << "Allocating and transferring binary file to " << device_name.c_str() << std::endl; + + auto fileBuf = xcl::read_binary_file(binaryFile); + cl::Program::Binaries bins{{fileBuf.data(), fileBuf.size()}}; + devices.resize(1); + + + std::cout << "Loading program to " << device_name.c_str() << std::endl; + OCL_CHECK(err, program = cl::Program(context, devices, bins, NULL, &err)); + std::cout << " ... program successful!" << std::endl; + + std::cout << "Creating kernel in program" << std::endl; + OCL_CHECK(err, kernel_ut = cl::Kernel(program, k_name, &err)); + std::cout << " ... kernel sucessfully created" << std::endl ; + +} + +//--------------------------------------------------------------------------------------------------------------------- +void create_buffers( + cl::Context &context, + cl::Buffer &buffer_a, + cl::Buffer &buffer_b, + cl::Buffer &buffer_c, + vector> &a, + vector> &b, + vector> &c, + size_t size_a_in_bytes, + size_t size_b_in_bytes, + size_t size_c_in_bytes + +) { + cl_int err; + + //----------------------------- + // Allocate Buffer in Global Memory + // Buffers are allocated using CL_MEM_USE_HOST_PTR for efficient memory and + // Device-to-host communication + std::cout << "Creating Buffers..." 
<< std::endl; + + OCL_CHECK(err, buffer_a = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_a_in_bytes, &a[0], &err)); + OCL_CHECK(err, buffer_b = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_b_in_bytes, &b[0], &err)); + // buffer c will be used for write, and depending on the params can also be read (incremental mmul) + OCL_CHECK(err, buffer_c = cl::Buffer(context, CL_MEM_USE_HOST_PTR , size_c_in_bytes, &c[0], &err)); +} + +//--------------------------------------------------------------------------------------------------------------------- +void fill( + vector> &a, + vector> &b, + vector> &c, + vector> &c_local, + int Ashape0, int Ashape1, + int Bshape0, int Bshape1, + int Cshape0, int Cshape1 +) { + // Set/Initialize matrices + // fill data vectors + int val = 0; + std::cout << "Filling matrix A[" << Ashape0 << " , " << Ashape1 << "] with sequential values" << std::endl ; + for (int i = 0; i < Ashape0; i++) { + for (int j = 0; j < Ashape1; j++) { + int ind = i*Ashape1 + j; + a[ind] = val; + val += 1; + } + } + + std::cout << "Filling matrix B[" << Bshape0 << " , " << Bshape1 << "] to be the Identity Matrix" << std::endl ; + for (int i = 0; i < Bshape0; i++) { + for (int j = 0; j < Bshape1; j++) { + int ind = i*Bshape1 + j; + b[ind] = (i == j) ? 1 : 0; + } + } + + std::cout << "result matrix C will be dimensioned to: C[" << Cshape0 << ", " << Cshape1 << "]" << std::endl; + std::cout << "Filling matrix C[" << Cshape0 << " , " << Bshape1 << "] with 1s" << std::endl ; + for (int i = 0; i < Cshape0; i++) { + for (int j = 0; j < Cshape1; j++) { + int ind = i*Cshape1 + j; + c[ind] = 1; + c_local[ind] = c[ind]; + } + } + +} + +//--------------------------------------------------------------------------------------------------------------------- +void run( + cl::Context &context, + cl::CommandQueue &q, + cl::Kernel &kernel_ut, + cl::Buffer &buffer_a, + cl::Buffer &buffer_b, + cl::Buffer &buffer_c, + int Ashape0, int Ashape1, + int Bshape0, int Bshape1, + int tA, int tB, int incC +) { + cl_int err; + + // These events will be used to track when a kernel is finished with + // the input and output buffers. Once the kernel is finished processing the data, + // a new set of elements will be written into the output buffer. + vector kernel_events(1); + vector read_events(1); + vector write_events(1); + + + //----------------------------- + // These events will be used to track when a kernel is finished with + // the input and output buffers. Once the kernel is finished processing the data, + // a new set of elements will be written into the output buffer. + //vector kernel_events(1); + //vector read_events(1); + //vector write_events(1); + + // set kernel arguments + //test_run_index++; + std::cout << std::endl; + //std::cout << "RUN "<< test_run_index << std::endl; + std::cout << "Setting kernel arguments... 
tA " << tA << " tB " << tB << " incC " << incC << std::endl; + OCL_CHECK(err, err = kernel_ut.setArg(0, buffer_a)); + OCL_CHECK(err, err = kernel_ut.setArg(1, buffer_b)); + OCL_CHECK(err, err = kernel_ut.setArg(2, buffer_c)); + OCL_CHECK(err, err = kernel_ut.setArg(3, Ashape0)); + OCL_CHECK(err, err = kernel_ut.setArg(4, Ashape1)); + OCL_CHECK(err, err = kernel_ut.setArg(5, Bshape0)); + OCL_CHECK(err, err = kernel_ut.setArg(6, Bshape1)); + OCL_CHECK(err, err = kernel_ut.setArg(7, tA)); + OCL_CHECK(err, err = kernel_ut.setArg(8, tB)); + OCL_CHECK(err, err = kernel_ut.setArg(9, incC)); + + //----------------------------- + // Copy input data to device global memory + std::cout << "Copying data (Host to Device)..." << std::endl; + // Because we are passing the write_events, it returns an event object + // that identifies this particular command and can be used to query + // or queue a wait for this particular command to complete. + OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_a, buffer_b, buffer_c}, 0 /*0 means from host*/, NULL, &write_events[0])); + set_callback(write_events[0], "ooo_queue"); + + //----------------------------- + printf("Enqueueing NDRange kernel.\n"); + // This event needs to wait for the write buffer operations to complete + // before executing. We are sending the write_events into its wait list to + // ensure that the order of operations is correct. + // Launch the Kernel + std::vector waitList; + waitList.push_back(write_events[0]); + OCL_CHECK(err, err = q.enqueueNDRangeKernel(kernel_ut, 0, 1, 1, &waitList, &kernel_events[0])); + set_callback(kernel_events[0], "ooo_queue"); + + //----------------------------- + // Copy Result from Device Global Memory to Host Local Memory + std::cout << "Getting Results (Device to Host)..." << std::endl; + std::vector eventList; + eventList.push_back(kernel_events[0]); + // This operation only needs to wait for the kernel call. + OCL_CHECK(err, err = q.enqueueMigrateMemObjects({buffer_c}, CL_MIGRATE_MEM_OBJECT_HOST, &eventList, &read_events[0])); + set_callback(read_events[0], "ooo_queue"); + OCL_CHECK(err, err = read_events[0].wait()); + + std::cout << " Matrix C retrieved from device memory" << std::endl; + + //----------------------------- + // HEY !!!! + // It is necessary to release the resources, all of them, + // memories, buffers, kernels, programs,... + + // Wait for all of the OpenCL operations to complete + std::cout << "Waiting for all the operations to complete..." 
<< std::endl; + OCL_CHECK(err, err = q.flush()); + OCL_CHECK(err, err = q.finish()); + + // clear event queues + kernel_events.clear(); + kernel_events.shrink_to_fit(); + read_events.clear(); + read_events.shrink_to_fit(); + write_events.clear(); + write_events.shrink_to_fit(); + +} + +//--------------------------------------------------------------------------------------------------------------------- +void run_cpu(const vector> &a, + const vector> &b, + //const vector> &c, + vector> &c_local, + int Ashape0, int Ashape1, + int Bshape0, int Bshape1, + int Cshape0, int Cshape1, + int tA, int tB, int incC +) { + int *fA_sum, *fA_mult, *fB_sum, *fB_mult; + int kmax; // common dimension, + int i, j, k; + + std::cout << std::endl; + std::cout << "Performing kernel opeation in CPU" << std::endl; + + if (tA == 0) { + fA_mult = &i; + fA_sum = &k; + kmax = Ashape1; + } + else + { + fA_mult = &k; + fA_sum = &i; + kmax = Ashape0; + } + + if (tB == 0) { + fB_mult = &k; + fB_sum = &j; + } + else + { + fB_mult = &j; + fB_sum = &k; + } + + #ifdef VERBOSE + std::cout << "c_local" << std::endl; + for (i = 0; i < Cshape0; i++) { + for (j = 0; j < Cshape1; j++) { + int ind_c = i * Cshape1 + j; + std::cout << "C[" << i << "][" << j << "] = " << c_local[ind_c] << std::endl; + } + } + #endif + + for (i = 0; i < Cshape0; i++) { + for (j = 0; j < Cshape1; j++) { + int ind_c = i * Cshape1 + j; + float sum = 0.0f; + + #ifdef VERBOSE + std::cout << "C[" << i << "][" << j << "] = "; + #endif + for (k = 0; k < kmax; k++) { + int ind_x = ((*fA_mult) * Ashape1) + (*fA_sum); + int ind_y = ((*fB_mult) * Bshape1) + (*fB_sum); + sum += a[ind_x] * b[ind_y]; + #ifdef VERBOSE + std::cout << "a[" << ind_x << "] * b[" << ind_y << "] "; + if (k < (kmax -1)) std::cout << "+ "; + #endif + } + #ifdef VERBOSE + std::cout << " = " << c_local[ind_c] << " + " << sum << " = " << (c_local[ind_c] + sum) << std::endl; + #endif + c_local[ind_c] = (incC ? 
c_local[ind_c]:0) + sum; + } + } + + #ifdef VERBOSE + std::cout << "CPU result" << std::endl; + for (i = 0; i < Cshape0; i++) { + for (j = 0; j < Cshape1; j++) { + int ind_c = i * Cshape1 + j; + std::cout << "C[" << i << "][" << j << "] = " << c_local[ind_c] << std::endl; + } + } + #endif +} + +//--------------------------------------------------------------------------------------------------------------------- +// return status of comparison +// ret 1 - matrices match +// otherwise return 0 +int compare( const vector> &c, + const vector> &c_local, + size_t size +) { + int matrices_match = 1; + + for(size_t i = 0; i < size; i++) { + if (c[i] != c_local[i]) { + std::cout << "Data mismatch found" << std::endl; + matrices_match = 0; + break; + } + } + return matrices_match; +} + +//--------------------------------------------------------------------------------------------------------------------- +int main(int argc, char **argv) { + int Ashape0; + int Ashape1; + int Bshape0; + int Bshape1; + int Cshape0; + int Cshape1 ; + int tA; + int tB; + int incC; + + int test_ok = 1; + //int test_run_index = 0; + + // CL + cl::Context context; + cl::CommandQueue q; + cl::Program program; + cl::Kernel kernel_ut; + cl::Buffer buffer_a, buffer_b, buffer_c; + + //--------------------------------------------------------------------------- + if (argc != 9) { + usage(argv[0]); + } + + Ashape0 = atoi(argv[2]); + Ashape1 = atoi(argv[3]); + Bshape0 = atoi(argv[4]); + Bshape1 = atoi(argv[5]); + tA = atoi(argv[6]); + tB = atoi(argv[7]); + incC = atoi(argv[8]); + + // check input configuration + { + int ic_err = 0; + if ((tA == 0) && (tB == 0)) { + if (Ashape1 != Bshape0) { + ic_err = 1; + } + } + else if ((tA == 0) && (tB == 1)) { + if (Ashape1 != Bshape1) { + ic_err = 1; + } + } + else if ((tA == 1) && (tB == 0)) { + if (Ashape0 != Bshape0) { + ic_err = 1; + } + } + else if ((tA == 1) && (tB == 1)) { + if (Ashape0 != Bshape1) { + ic_err = 1; + } + } + else { + std::cout << "Unexpected configuration" << std::endl; + ic_err = 1; + } + + if (ic_err != 0) { + std::cout << "Error matrix dimensions mismatch for requested operation" << std::endl << std::endl; + return EXIT_FAILURE; + } + } + + //--------------------------------------------------------------------------- + std::ofstream outfile; + std::string outfname = "output.txt"; + outfile.open (outfname.c_str()); // we delete file content by open/close operations + outfile.close (); //we close the file in case any error happens and the test exits before completion + + //--------------------------------------------------------------------------- + // set matrices dimensions + Cshape0 = tA? Ashape1:Ashape0; + Cshape1 = tB? 
Bshape0:Bshape1; + + size_t size_a = Ashape0 * Ashape1; + size_t size_b = Bshape0 * Bshape1; + size_t size_c = Cshape0 * Cshape1; + size_t size_a_in_bytes = size_a * sizeof(float); + size_t size_b_in_bytes = size_b * sizeof(float); + size_t size_c_in_bytes = size_c * sizeof(float); + + // Allocate memory on the host + vector> a(size_a, 0); + vector> b(size_b, 0); + vector> c(size_c, 0); + vector> c_local(size_c, 0); + + std::cout << "tA " << tA << " tB " << tB << " incC " << incC << std::endl; + std::cout << "A[" << Ashape0 << "x" << Ashape1 << "] B[" << Bshape0 << "x" << Bshape1 << "] C[" << Cshape0 << "x" << Cshape1 << "] " << std::endl; + //--------------- + // fill matrices + fill(a, b, c, c_local, Ashape0, Ashape1, Bshape0, Bshape1, Cshape0, Cshape1); + + //--------------------------------------------------------------------------- + // Initialize fpga, load binary and kernel + fpga_init(context, q, program, kernel_ut, argv[1], "k_mult2d"); + + // create CL buffers + create_buffers(context, buffer_a, buffer_b, buffer_c, a, b, c, size_a_in_bytes, size_b_in_bytes, size_c_in_bytes); + + // Run the kernel + run(context, q, kernel_ut, buffer_a, buffer_b, buffer_c, Ashape0, Ashape1, Bshape0, Bshape1, tA, tB, incC); + + // locally calculate result + run_cpu(a, b, c_local, Ashape0, Ashape1, Bshape0, Bshape1, Cshape0, Cshape1, tA,tB,incC); + + // compare results + test_ok = compare (c, c_local, size_c); + + + outfile.open(outfname.c_str(), std::ofstream::out | std::ofstream::app); + if (test_ok != 0) { + std::cout << "" << std::endl; + std::cout << "TEST PASSED" << std::endl << std::endl; + + outfile << "" << std::endl; + outfile << "TEST PASSED" << std::endl << std::endl; + } + else { + std::cout << "" << std::endl; + std::cout << "ERRORS DETECTED" << std::endl << std::endl; + std::cout << "TEST KO" << std::endl; + + outfile << "" << std::endl; + outfile << "ERRORS DETECTED" << std::endl << std::endl; + outfile << "TEST KO" << std::endl; + } + outfile.close(); + + + //----------------------------- + // It is necessary to release the resources, all of them, + a.clear(); + b.clear(); + c.clear(); + c_local.clear(); + a.shrink_to_fit(); + b.shrink_to_fit(); + c.shrink_to_fit(); + c_local.shrink_to_fit(); + + //----------------------------- + std::cout << "" << std::endl; + std::cout << "All done" << std::endl; + std::cout << "quit now" << std::endl; + + // exit + return 0; +} diff --git a/fpga_kernels/test_fpga/src/test_relu.cpp b/fpga_kernels/test_fpga/src/test_relu.cpp new file mode 100644 index 000000000..ab83ab7bd --- /dev/null +++ b/fpga_kernels/test_fpga/src/test_relu.cpp @@ -0,0 +1,304 @@ +#include /* printf, scanf, NULL */ +#include /* malloc, free, rand */ + +#include +#include +#include +#include +#include +#include "xcl2.hpp" +//#include "/home/jorga20j/integration_eddl/eddl/fpga_kernels/test_fpga/test/src/xcl2.hpp" +//#include "/home/jomarm10/workspace/Vitis_Accel_Examples/common/includes/xcl2/xcl2.hpp" + +using std::vector; + +// CL +cl::Buffer buf; +cl::Context context; +cl::CommandQueue q; +cl::Program program; + + +#define SIZE 1024 + +static const int elements = 256; + + + +//--------------------------------------------------------------------------------------------------------------------- +//--------------------------------------------------------------------------------------------------------------------- + +// An event callback function that prints the operations performed by the OpenCL +// runtime. 
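+// The callback is registered per event by set_callback() below, which calls
+// event.setCallback(CL_COMPLETE, event_cb, ...) so that each enqueued transfer
+// and kernel launch prints its command type and completion status (same pattern
+// as the other host tests in this directory).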
+void event_cb(cl_event event1, cl_int cmd_status, void *data) { + cl_int err; + cl_command_type command; + cl::Event event(event1, true); + OCL_CHECK(err, err = event.getInfo(CL_EVENT_COMMAND_TYPE, &command)); + cl_int status; + OCL_CHECK(err, + err = event.getInfo(CL_EVENT_COMMAND_EXECUTION_STATUS, &status)); + const char *command_str; + const char *status_str; + switch (command) { + case CL_COMMAND_READ_BUFFER: + command_str = "buffer read"; + break; + case CL_COMMAND_WRITE_BUFFER: + command_str = "buffer write"; + break; + case CL_COMMAND_NDRANGE_KERNEL: + command_str = "kernel"; + break; + case CL_COMMAND_MAP_BUFFER: + command_str = "kernel"; + break; + case CL_COMMAND_COPY_BUFFER: + command_str = "kernel"; + break; + case CL_COMMAND_MIGRATE_MEM_OBJECTS: + command_str = "buffer migrate"; + break; + default: + command_str = "unknown"; + } + switch (status) { + case CL_QUEUED: + status_str = "Queued"; + break; + case CL_SUBMITTED: + status_str = "Submitted"; + break; + case CL_RUNNING: + status_str = "Executing"; + break; + case CL_COMPLETE: + status_str = "Completed"; + break; + } + printf("[%s]: %s %s\n", reinterpret_cast(data), status_str, + command_str); + fflush(stdout); +} + +// Sets the callback for a particular event +void set_callback(cl::Event event, const char *queue_name) { + cl_int err; + OCL_CHECK(err, + err = event.setCallback(CL_COMPLETE, event_cb, (void *)queue_name)); +} + + + +//--------------------------------------------------------------------------------------------------------------------- + + + + + + + + + + + + + +void fpga_init(){ // initialize only once + + + +} + +void create_buffers() { + + + + +} + +void fill(cl::Buffer *buf) { + + +} + +void run() { + + +} + +void run_cpu() { +} + +void compare() { +} + +int main(int argc, char **argv) { + if (argc != 2) { + std::cout << "Usage: " << argv[0] << " " << std::endl; + return EXIT_FAILURE; + } + + std::string binaryFile = argv[1]; + cl_int err; + cl::Kernel kernel_relu; + + + // size_t size_in_bytes = host_memory.size() * sizeof(int); + + // OPENCL HOST CODE AREA START + // get_xil_devices() is a utility API which will find the xilinx + // platforms and will return list of devices connected to Xilinx platform + std::cout << "Creating Context..." << std::endl; + // The get_xil_devices will return vector of Xilinx Devices + auto devices = xcl::get_xil_devices(); + auto device = devices[0]; + + std::vector> host_memory(elements, 42); + // Creating Context and Command Queue for selected Device + OCL_CHECK(err, cl::Context context(device, NULL, NULL, NULL, &err)); + OCL_CHECK(err, cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE, &err)); + + std::string device_name = device.getInfo(); + std::cout << "Allocating and transferring data to " << device_name.c_str() << std::endl; + + auto fileBuf = xcl::read_binary_file(binaryFile); + cl::Program::Binaries bins{{fileBuf.data(), fileBuf.size()}}; + devices.resize(1); + + OCL_CHECK(err, cl::Program program(context, devices, bins, NULL, &err)); + std::cout << "Device " << device_name.c_str() << ": program successful!" << std::endl; + + OCL_CHECK(err, kernel_relu = cl::Kernel(program,"k_relu", &err)); + std::cout << "Kernel sucessfully created" << std::endl ; + + size_t size_in_bytes = 4096*sizeof(float); + // Allocate memory on the host and fill with random data. 
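+  // (Sketch of the rationale, assuming the vectors below use the aligned_allocator
+  // helper from xcl2.hpp as in the other Vitis example hosts: page-aligned host
+  // memory lets the CL_MEM_USE_HOST_PTR buffers created later be shared with the
+  // device runtime without an additional staging copy.)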
+ vector> a(size_in_bytes); + vector> b(size_in_bytes); + + + //----------------------------- + // fill data vector with random data + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + std::cout << "Filling Tensor A with random values [-20.0, 30.0]" << std::endl ; + for (int i = 0; i < SIZE; i++) { + a[i] = dist(gen); + } + std::cout << "A[] = {" << std::endl; + for (int i = 0; i < 20; i++) { + std::cout << " " << a[i] << ","; + } + std::cout << " ...}" << std::endl ; + + //----------------------------- + // THIS PAIR OF EVENTS WILL BE USED TO TRACK WHEN A KERNEL IS FINISHED WITH + // THE INPUT BUFFERS. ONCE THE KERNEL IS FINISHED PROCESSING THE DATA, A NEW + // SET OF ELEMENTS WILL BE WRITTEN INTO THE BUFFER. + vector kernel_events(1); + vector read_events(1); + vector write_events(1); + cl::Buffer buffer_a, buffer_b; + + //----------------------------- + // Allocate Buffer in Global Memory + // Buffers are allocated using CL_MEM_USE_HOST_PTR for efficient memory and + // Device-to-host communication + std::cout << "Creating Buffers..." << std::endl; + + OCL_CHECK(err, buffer_a = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_in_bytes, &a[0], &err)); + OCL_CHECK(err, buffer_b = cl::Buffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR , size_in_bytes, &b[0], &err)); + + // set kernel arguments + OCL_CHECK(err, err = kernel_relu.setArg(0, buffer_a)); + OCL_CHECK(err, err = kernel_relu.setArg(1, buffer_b)); + OCL_CHECK(err, err = kernel_relu.setArg(2, (long int)SIZE)); + + //----------------------------- + // Copy input data to device global memory + std::cout << "Copying data (Host to Device)..." << std::endl; + // Because we are passing the write_events, it returns an event object + // that identifies this particular command and can be used to query + // or queue a wait for this particular command to complete. + OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_a}, 0 /*0 means from host*/, NULL, &write_events[0])); + set_callback(write_events[0], "ooo_queue"); + + //----------------------------- + printf("Enqueueing NDRange kernel.\n"); + // This event needs to wait for the write buffer operations to complete + // before executing. We are sending the write_events into its wait list to + // ensure that the order of operations is correct. + // Launch the Kernel + std::vector waitList; + waitList.push_back(write_events[0]); + OCL_CHECK(err, err = q.enqueueNDRangeKernel(kernel_relu, 0, 1, 1, &waitList, &kernel_events[0])); + set_callback(kernel_events[0], "ooo_queue"); + + //----------------------------- + // Copy Result from Device Global Memory to Host Local Memory + std::cout << "Getting Results (Device to Host)..." << std::endl; + std::vector eventList; + eventList.push_back(kernel_events[0]); + // This operation only needs to wait for the kernel call. + OCL_CHECK(err, err = q.enqueueMigrateMemObjects({buffer_b}, CL_MIGRATE_MEM_OBJECT_HOST, &eventList, &read_events[0])); + set_callback(read_events[0], "ooo_queue"); + OCL_CHECK(err, err = read_events[0].wait()); + + std::cout << "kernel returned" << std::endl ; + std::cout << " B [] = {" ; + for (int i = 0; i < 10; i++) { + std::cout << " " << b[i] << ","; + } + std::cout << " ... 
}" << std::endl; + + //----------------------------- + // check received data + std::cout << "Check kernel output, checking " << SIZE << " values"<< std::endl; + { + vector> res_local(size_in_bytes); + // perform kernel operation in host + for (int i = 0; i < SIZE; i++ ) { + if (a[i] < 0.0) res_local[i] = 0.0f; + else res_local[i] = a[i]; + } + // compare data vectors + int data_matches = 1; + for (int i = 0; i < SIZE; i++) { + if (res_local [i] != b[i]) { + data_matches = 0; + std::cout << "DATA MISMATCH v_local[= " << i << "] = " << res_local[i] << " != b[" << i << "] = " << b[i] << std::endl; + } + } + + if (data_matches) { + std::cout << "" << std::endl; + std::cout << "TEST PASSED" << std::endl; + } + else { + std::cout << "" << std::endl; + std::cout << "ERRORS DETECTED" << std::endl; + std::cout << "TEST KO" << std::endl; + } + } + + //----------------------------- + // HEY !!!! + // It is necessary to release the resources, all of them, + // memories, buffers, kernels, programs,... + + // Wait for all of the OpenCL operations to complete + std::cout << "Waiting..." << std::endl; + OCL_CHECK(err, err = q.flush()); + OCL_CHECK(err, err = q.finish()); + + //----------------------------- + std::cout << "" << std::endl; + std::cout << "All done" << std::endl; + std::cout << "quit now" << std::endl; + + // exit + return 0; +} diff --git a/fpga_kernels/test_fpga/utils.mk b/fpga_kernels/test_fpga/utils.mk new file mode 100755 index 000000000..c4a81e29c --- /dev/null +++ b/fpga_kernels/test_fpga/utils.mk @@ -0,0 +1,101 @@ +#+------------------------------------------------------------------------------- +# The following parameters are assigned with default values. These parameters can +# be overridden through the make command line +#+------------------------------------------------------------------------------- + +DEBUG := no +B_TEMP = `$(ABS_COMMON_REPO)/common/utility/parse_platform_list.py $(DEVICE)` + +#Generates debug summary report +ifeq ($(DEBUG), yes) +LDCLFLAGS += --dk list_ports +endif + +#Setting Platform Path +ifeq ($(findstring xpfm, $(DEVICE)), xpfm) + B_NAME = $(shell dirname $(DEVICE)) +else + B_NAME = $(B_TEMP)/$(DEVICE) +endif + +#Checks for XILINX_VITIS +check-vitis: +ifndef XILINX_VITIS + $(error XILINX_VITIS variable is not set, please set correctly and rerun) +endif + +#Checks for Device Family +ifeq ($(HOST_ARCH), aarch32) + DEV_FAM = 7Series +else ifeq ($(HOST_ARCH), aarch64) + DEV_FAM = Ultrascale +endif + +#Checks for XILINX_XRT +check-xrt: +ifeq ($(HOST_ARCH), x86) +ifndef XILINX_XRT + $(error XILINX_XRT variable is not set, please set correctly and rerun) +endif +else +ifndef XILINX_VITIS + $(error XILINX_VITIS variable is not set, please set correctly and rerun) +endif +endif + +#Checks for Correct architecture +ifneq ($(HOST_ARCH), $(filter $(HOST_ARCH),aarch64 aarch32 x86)) +$(error HOST_ARCH variable not set, please set correctly and rerun) +endif + +#Checks for EDGE_COMMON_SW +ifneq ($(HOST_ARCH), x86) +ifndef EDGE_COMMON_SW +$(error EDGE_COMMON_SW variable is not set, please set correctly and rerun) +endif +ifeq ($(HOST_ARCH), aarch64) +SYSROOT := $(EDGE_COMMON_SW)/sysroots/aarch64-xilinx-linux +SD_IMAGE_FILE := $(EDGE_COMMON_SW)/Image +CXX := $(XILINX_VITIS)/gnu/aarch64/lin/aarch64-linux/bin/aarch64-linux-gnu-g++ +else ifeq ($(HOST_ARCH), aarch32) +SYSROOT := $(EDGE_COMMON_SW)/sysroots/cortexa9t2hf-neon-xilinx-linux-gnueabi/ +SD_IMAGE_FILE := $(EDGE_COMMON_SW)/uImage +CXX := $(XILINX_VITIS)/gnu/aarch32/lin/gcc-arm-linux-gnueabi/bin/arm-linux-gnueabihf-g++ 
+endif +endif + +gen_run_app: +ifneq ($(HOST_ARCH), x86) + rm -rf run_app.sh + $(ECHO) 'export LD_LIBRARY_PATH=/mnt:/tmp:$(LD_LIBRARY_PATH)' >> run_app.sh + $(ECHO) 'export XILINX_XRT=/usr' >> run_app.sh +ifeq ($(TARGET),$(filter $(TARGET),sw_emu hw_emu)) + $(ECHO) 'export XILINX_VITIS=/mnt' >> run_app.sh + $(ECHO) 'export XCL_EMULATION_MODE=$(TARGET)' >> run_app.sh +endif + $(ECHO) './$(EXECUTABLE) dummy_kernel.xclbin' >> run_app.sh + $(ECHO) 'return_code=$$?' >> run_app.sh + $(ECHO) 'if [ $$return_code -ne 0 ]; then' >> run_app.sh + $(ECHO) 'echo "ERROR: host run failed, RC=$$return_code"' >> run_app.sh + $(ECHO) 'fi' >> run_app.sh + $(ECHO) 'echo "INFO: host run completed."' >> run_app.sh +endif +check-devices: +ifndef DEVICE + $(error DEVICE not set. Please set the DEVICE properly and rerun. Run "make help" for more details.) +endif + +# device2xsa - create a filesystem friendly name from device name +# $(1) - full name of device +device2xsa = $(strip $(patsubst %.xpfm, % , $(shell basename $(DEVICE)))) + +# Cleaning stuff +RM = rm -f +RMDIR = rm -rf + +ECHO:= @echo + +docs: README.md + +README.md: description.json + $(ABS_COMMON_REPO)/common/utility/readme_gen/readme_gen.py description.json diff --git a/include/eddl/hardware/fpga/fpga_enables.h b/include/eddl/hardware/fpga/fpga_enables.h index 4057aa3ef..55f183d85 100644 --- a/include/eddl/hardware/fpga/fpga_enables.h +++ b/include/eddl/hardware/fpga/fpga_enables.h @@ -2,7 +2,7 @@ // implemented on the FPGA //Activations -//#define K_ENABLED_RELU +#define K_ENABLED_RELU //#define K_ENABLED_D_RELU //#define K_ENABLED_THRESHOLDED_RELU //#define K_ENABLED_D_TRHESHOLDED_RELU @@ -65,6 +65,7 @@ //#define K_ENABLED_CONV2D //#define K_ENALBED_CONV2D_GRAD //#define K_ENABLED_CONV2D_BACK +#define K_ENABLED_CONV2D_K3X3_S1X1_P1X1_BS1 //Core //#define K_ENABLED_FILL_ diff --git a/include/eddl/hardware/fpga/fpga_hw.h b/include/eddl/hardware/fpga/fpga_hw.h index a3c8c9df6..c14c372d4 100644 --- a/include/eddl/hardware/fpga/fpga_hw.h +++ b/include/eddl/hardware/fpga/fpga_hw.h @@ -20,7 +20,7 @@ extern cl::CommandQueue q; -#define FPGA_DEBUG +//#define FPGA_DEBUG #include "eddl/hardware/fpga/fpga_enables.h" @@ -49,8 +49,9 @@ extern cl::Kernel kernel_select, kernel_select_back, kernel_set_select, ker extern cl::Kernel kernel_set_select2, kernel_deselect, kernel_concat; extern cl::Kernel kernel_select_nn, kernel_select_back_nn, kernel_set_select_back_nn, kernel_set_select_nn; -// conv kernels (2) +// conv kernels (3) extern cl::Kernel kernel_im2col, kernel_conv2d; +extern cl::Kernel kernel_conv2D_K3x3_S1x1_P1x1_BS1; // create kernels (3) extern cl::Kernel kernel_range, kernel_eye, kernel_diag; diff --git a/include/eddl/profiling.h b/include/eddl/profiling.h new file mode 100644 index 000000000..8cca92267 --- /dev/null +++ b/include/eddl/profiling.h @@ -0,0 +1,37 @@ +#ifndef _PROFILING + +#define _PROFILING + +#include + +#define PROFILING_ENABLE(fn) \ + unsigned long long prof_##fn##_time; \ + unsigned long long prof_##fn##_calls; \ + +#define PROFILING_HEADER(fn) \ + struct timeval prof_t1; \ + gettimeofday(&prof_t1, NULL); + +#define PROFILING_HEADER_EXTERN(fn) \ + extern unsigned long long prof_##fn##_time; \ + extern unsigned long long prof_##fn##_calls; \ + extern int prof_##fn##_device; \ + struct timeval prof_t1; \ + gettimeofday(&prof_t1, NULL); + +#define PROFILING_FOOTER(fn) \ + struct timeval prof_t2; \ + gettimeofday(&prof_t2, NULL); \ + prof_##fn##_time += ((prof_t2.tv_sec - prof_t1.tv_sec) * 1000000) + (prof_t2.tv_usec - prof_t1.tv_usec); \ 
+ prof_##fn##_calls += 1; + +#define PROFILING_PRINTF(fn) \ + if (prof_##fn##_calls > 0) printf(" %-50s: %8lld calls, %8lld us , %10.4f us/call\n", #fn, \ + prof_##fn##_calls, prof_##fn##_time, \ + (float) prof_##fn##_time / (float) prof_##fn##_calls); + +#define PROFILING_PRINTF2(fn, acc) \ + if (prof_##fn##_calls > 0) printf(" %-50s: %8lld calls, %8lld us (%6.2f), %10.4f us/call\n", #fn, \ + prof_##fn##_calls, prof_##fn##_time, \ + 100.0 * prof_##fn##_time / acc, (float) prof_##fn##_time / (float) prof_##fn##_calls); +#endif diff --git a/include/eddl/tensor/tensor.h b/include/eddl/tensor/tensor.h index 44552d4e9..b365cfc46 100644 --- a/include/eddl/tensor/tensor.h +++ b/include/eddl/tensor/tensor.h @@ -126,7 +126,7 @@ class Tensor { * @param dev One of ``DEV_CPU`` or ``DEV_GPU`` * @return a tensor */ - Tensor(const vector &shape, float *fptr, int dev); + Tensor(const vector &shape, float *fptr, int dev, void *fptr2=0); /** * @brief Constructor of an uninitialized tensor diff --git a/src/hardware/fpga/fpga_core.cpp b/src/hardware/fpga/fpga_core.cpp index 08ea5f7ae..d88e22c1a 100644 --- a/src/hardware/fpga/fpga_core.cpp +++ b/src/hardware/fpga/fpga_core.cpp @@ -60,8 +60,9 @@ cl::Kernel kernel_select, kernel_select_back, kernel_set_select, kernel_set cl::Kernel kernel_set_select2, kernel_deselect, kernel_concat; cl::Kernel kernel_select_nn, kernel_select_back_nn, kernel_set_select_nn, kernel_set_select_back_nn; -// conv kernels (2) +// conv kernels (3) cl::Kernel kernel_im2col, kernel_conv2d; +cl::Kernel kernel_conv2D_K3x3_S1x1_P1x1_BS1; // create kernels (3) cl::Kernel kernel_range, kernel_eye, kernel_diag; @@ -633,6 +634,10 @@ void fpga_init(){ // initialize only once OCL_CHECK(err, kernel_conv2d = cl::Kernel(program,"k_conv2d", &err)); if (err != CL_SUCCESS) printf("Error creating kernel\n"); #endif + #ifdef K_ENABLED_CONV2D_K3X3_S1X1_P1X1_BS1 + OCL_CHECK(err, kernel_conv2D_K3x3_S1x1_P1x1_BS1 = cl::Kernel(program, "k_conv2D_K3x3_S1x1_P1x1_BS1", &err)); + if (err != CL_SUCCESS) printf("Error creating kernel\n"); + #endif #ifdef K_ENABLED_RANGE OCL_CHECK(err, kernel_range = cl::Kernel(program,"k_range", &err)); if (err != CL_SUCCESS) printf("Error creating kernel\n"); diff --git a/src/hardware/fpga/nn/fpga_conv.cpp b/src/hardware/fpga/nn/fpga_conv.cpp index cf93d9c08..24f7ce232 100644 --- a/src/hardware/fpga/nn/fpga_conv.cpp +++ b/src/hardware/fpga/nn/fpga_conv.cpp @@ -37,10 +37,29 @@ void fpga_cpuemu_conv2D(ConvolDescriptor *D) { fpga_copy_memory_to_fpga(D->ptrI, D->fpga_ptrI, D->fpga_sizeI); } +// Convolution: Kernel(3x3), Stride(1x1), Padding(1x1), BatchSize=1 +void fpga_conv2D_K3x3_S1x1_P1x1_BS1(cl::Buffer I, int Irows, int Icols, int Ichannels, cl::Buffer K, cl::Buffer B, cl::Buffer O, int Ochannels) { + + cl_int err; + cl::Event event; + int arg=0; + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, I)); + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, Irows)); + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, Icols)); + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, Ichannels)); + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, K)); + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, B)); + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, O)); + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, Ochannels)); + + OCL_CHECK(err, err = q.enqueueTask(kernel_conv2D_K3x3_S1x1_P1x1_BS1, NULL, &event)); + q.finish(); +} + void 
fpga_conv2D(ConvolDescriptor *D) { _profile_fpga(_FPGA_CONV2D, 0); -#ifndef K_ENABLED_CONV2D +#if !defined(K_ENABLED_CONV2D) && !defined(K_ENABLED_CONV2D_K3X3_S1X1_P1X1_BS1) fpga_cpuemu_conv2D(D); #else cl_int err; @@ -66,27 +85,32 @@ void fpga_conv2D(ConvolDescriptor *D) int stride_rows = D->sr; // rows stride int stride_cols = D->sc; // cols stride - OCL_CHECK(err, err = kernel_conv2d.setArg(0, batch_size)); - OCL_CHECK(err, err = kernel_conv2d.setArg(1, I)); - OCL_CHECK(err, err = kernel_conv2d.setArg(2, Irows)); // input - OCL_CHECK(err, err = kernel_conv2d.setArg(3, Icols)); // output - OCL_CHECK(err, err = kernel_conv2d.setArg(4, Ichannels)); - OCL_CHECK(err, err = kernel_conv2d.setArg(5, K)); - OCL_CHECK(err, err = kernel_conv2d.setArg(6, Krows)); - OCL_CHECK(err, err = kernel_conv2d.setArg(7, Kcols)); - OCL_CHECK(err, err = kernel_conv2d.setArg(8, B)); - OCL_CHECK(err, err = kernel_conv2d.setArg(9, use_bias)); - OCL_CHECK(err, err = kernel_conv2d.setArg(10, O)); - OCL_CHECK(err, err = kernel_conv2d.setArg(11, Orows)); - OCL_CHECK(err, err = kernel_conv2d.setArg(12, Ocols)); - OCL_CHECK(err, err = kernel_conv2d.setArg(13, Ochannels)); - OCL_CHECK(err, err = kernel_conv2d.setArg(14, padding_rows)); - OCL_CHECK(err, err = kernel_conv2d.setArg(15, padding_cols)); - OCL_CHECK(err, err = kernel_conv2d.setArg(16, stride_rows)); - OCL_CHECK(err, err = kernel_conv2d.setArg(17, stride_cols)); - - OCL_CHECK(err, err = q.enqueueTask(kernel_conv2d, NULL, &event)); - q.finish(); + // depending on the conv parameters we select the kernel to launch + if ((stride_rows == 1) && (stride_cols == 1) && (Krows == 3) && (Kcols == 3) && (batch_size == 1) && (padding_rows == 1) && (padding_cols == 1)) { + fpga_conv2D_K3x3_S1x1_P1x1_BS1(I, Irows, Icols, Ichannels, K, B, O, Ochannels); + } else { + OCL_CHECK(err, err = kernel_conv2d.setArg(0, batch_size)); + OCL_CHECK(err, err = kernel_conv2d.setArg(1, I)); + OCL_CHECK(err, err = kernel_conv2d.setArg(2, Irows)); // input + OCL_CHECK(err, err = kernel_conv2d.setArg(3, Icols)); // output + OCL_CHECK(err, err = kernel_conv2d.setArg(4, Ichannels)); + OCL_CHECK(err, err = kernel_conv2d.setArg(5, K)); + OCL_CHECK(err, err = kernel_conv2d.setArg(6, Krows)); + OCL_CHECK(err, err = kernel_conv2d.setArg(7, Kcols)); + OCL_CHECK(err, err = kernel_conv2d.setArg(8, B)); + OCL_CHECK(err, err = kernel_conv2d.setArg(9, use_bias)); + OCL_CHECK(err, err = kernel_conv2d.setArg(10, O)); + OCL_CHECK(err, err = kernel_conv2d.setArg(11, Orows)); + OCL_CHECK(err, err = kernel_conv2d.setArg(12, Ocols)); + OCL_CHECK(err, err = kernel_conv2d.setArg(13, Ochannels)); + OCL_CHECK(err, err = kernel_conv2d.setArg(14, padding_rows)); + OCL_CHECK(err, err = kernel_conv2d.setArg(15, padding_cols)); + OCL_CHECK(err, err = kernel_conv2d.setArg(16, stride_rows)); + OCL_CHECK(err, err = kernel_conv2d.setArg(17, stride_cols)); + + OCL_CHECK(err, err = q.enqueueTask(kernel_conv2d, NULL, &event)); + q.finish(); + } #endif _profile_fpga(_FPGA_CONV2D, 1); } diff --git a/src/layers/core/layer_activation.cpp b/src/layers/core/layer_activation.cpp index 50ac3fe4c..6ba3bf1c0 100644 --- a/src/layers/core/layer_activation.cpp +++ b/src/layers/core/layer_activation.cpp @@ -26,6 +26,9 @@ LActivation::LActivation(Layer *parent, string act, vector params, string this->params = params; input = parent->output; +#ifdef DEBUG_FPGA + printf("creating output for RELU\n"); +#endif output = new Tensor(input->shape, dev); delta_bp = 0; diff --git a/src/layers/core/layer_reshape.cpp b/src/layers/core/layer_reshape.cpp 
index 5dc45ae9e..5d9668d13 100644 --- a/src/layers/core/layer_reshape.cpp +++ b/src/layers/core/layer_reshape.cpp @@ -71,6 +71,10 @@ LReshape::LReshape(Layer *parent, vector shape, string name, int dev, int m /////// // sharing the pointers to data +#ifdef cFPGA + printf("creating new tensor output for reshape (at constructor)\n"); +#endif + output = new Tensor(ls, parent->output); parent->addchild(this); @@ -85,6 +89,7 @@ LReshape::~LReshape(){ void LReshape::resize(int batch){ ls[0]=batch; #ifdef cFPGA + printf("voy a hacer resize!!!! batch %d shape[0] %d tensor_id %d, tensor_id parent %d fpga_ptr %p\n", batch, output->shape[0], output->fpga_tensor_id, parent[0]->output->fpga_tensor_id, parent[0]->output->fpga_ptr); output->resize(batch, parent[0]->output->ptr, parent[0]->output->fpga_ptr, false); #else output->resize(batch, parent[0]->output->ptr, nullptr, false); @@ -98,6 +103,9 @@ void LReshape::mem_delta() { parent[0]->mem_delta(); // Problem: Delta is always created, regardless of the low_mem +#ifdef cFPGA + printf("creating new delta tensor for reshape at mem_delta\n"); +#endif delta = new Tensor(ls, parent[0]->delta); if(this->verbosity_level >= 2){ @@ -111,6 +119,9 @@ void LReshape::free_delta() { if(this->delta != nullptr) { // Do not delete its delta directly (It's pointer points to parent's delta) delta->ptr = nullptr; +#ifdef cFPGA + delta->fpga_ptr = nullptr; +#endif delete delta; delta = nullptr; diff --git a/src/tensor/nn/tensor_activations.cpp b/src/tensor/nn/tensor_activations.cpp index f2232c099..b465a8918 100644 --- a/src/tensor/nn/tensor_activations.cpp +++ b/src/tensor/nn/tensor_activations.cpp @@ -8,6 +8,7 @@ */ #include "eddl/tensor/nn/tensor_nn.h" #include "eddl/hardware/cpu/nn/cpu_tensor_nn.h" +#include "eddl/profiling.h" #ifdef cFPGA #include "eddl/hardware/fpga/nn/fpga_nn.h" @@ -21,12 +22,16 @@ namespace tensorNN { + PROFILING_ENABLE(ReLu); + // ReLU void ReLu(Tensor *A, Tensor *B) { if (A->device != B->device) msg("Tensors in different devices", "Tensor::ReLu"); if (!Tensor::sameShape(A, B)) msg("Incompatible dims", "Tensor::ReLu"); + PROFILING_HEADER_EXTERN(ReLu); + B->tsem->lock(); if (A->isCPU()) { cpu_relu(A, B); @@ -44,6 +49,9 @@ namespace tensorNN { #endif B->tsem->unlock(); + + PROFILING_FOOTER(ReLu); + PROFILING_PRINTF(ReLu); } // RELU Derivative, always increment over parent delta @@ -622,4 +630,4 @@ namespace tensorNN { } -} \ No newline at end of file +} diff --git a/src/tensor/nn/tensor_conv.cpp b/src/tensor/nn/tensor_conv.cpp index 5103f78f8..ec876fc92 100644 --- a/src/tensor/nn/tensor_conv.cpp +++ b/src/tensor/nn/tensor_conv.cpp @@ -8,6 +8,7 @@ */ #include "eddl/tensor/nn/tensor_nn.h" #include "eddl/hardware/cpu/nn/cpu_tensor_nn.h" +#include "eddl/profiling.h" #ifdef cGPU #include "eddl/hardware/gpu/gpu_tensor.h" @@ -22,6 +23,8 @@ namespace tensorNN{ + PROFILING_ENABLE(Conv2D); + void Conv2D(ConvolDescriptor *D) { @@ -33,6 +36,8 @@ void Conv2D(ConvolDescriptor *D) { ///////////////////////////////////////////////////////////////////// if ((D->I->ndim != 4)) msg("Tensors are not 4D", "Tensor::Conv2D"); + PROFILING_HEADER_EXTERN(Conv2D); + D->O->tsem->lock(); if (D->I->isCPU()) { cpu_conv2D(D); @@ -50,6 +55,9 @@ void Conv2D(ConvolDescriptor *D) { } #endif D->O->tsem->unlock(); + + PROFILING_FOOTER(Conv2D); + PROFILING_PRINTF(Conv2D); } void Conv2D_grad(ConvolDescriptor *D) { @@ -106,4 +114,4 @@ void Conv2D_back(ConvolDescriptor *D) { D->ID->tsem->unlock(); } -} \ No newline at end of file +} diff --git a/src/tensor/nn/tensor_pool.cpp 
b/src/tensor/nn/tensor_pool.cpp index d742eaee6..d3e0a7a39 100644 --- a/src/tensor/nn/tensor_pool.cpp +++ b/src/tensor/nn/tensor_pool.cpp @@ -8,6 +8,7 @@ */ #include "eddl/tensor/nn/tensor_nn.h" #include "eddl/hardware/cpu/nn/cpu_tensor_nn.h" +#include "eddl/profiling.h" #ifdef cGPU #include "eddl/hardware/gpu/gpu_tensor.h" @@ -20,6 +21,8 @@ #include "eddl/hardware/fpga/nn/fpga_nn.h" #endif +PROFILING_ENABLE(MPool2D); + namespace tensorNN { @@ -32,6 +35,8 @@ namespace tensorNN { ///////////////////////////////////////////////////////////////////// if ((D->I->ndim != 4)) msg("Tensors are not 4D", "Tensor::MPool2D"); + PROFILING_HEADER(MPool2D); + D->O->tsem->lock(); if (D->I->isCPU()) { cpu_mpool2D(D); @@ -49,6 +54,9 @@ namespace tensorNN { } #endif D->O->tsem->unlock(); + + PROFILING_FOOTER(MPool2D); + PROFILING_PRINTF(MPool2D); } void MPool2D_back(PoolDescriptor *D) { @@ -136,4 +144,4 @@ namespace tensorNN { D->ID->tsem->unlock(); } -} \ No newline at end of file +} diff --git a/src/tensor/tensor.cpp b/src/tensor/tensor.cpp index 8be55e465..80e39fc37 100755 --- a/src/tensor/tensor.cpp +++ b/src/tensor/tensor.cpp @@ -54,7 +54,7 @@ void checkCompatibility(Tensor *A, Tensor *B, Tensor *C, const string &title){ Tensor::Tensor() : device(DEV_CPU), ndim(0), size(0) {} -Tensor::Tensor(const vector &shape, float *fptr, int dev){ +Tensor::Tensor(const vector &shape, float *fptr, int dev, void *fptr2){ /* * Important! If we are creating a GPU tensor, "fptr" must point to a GPU pointer. */ @@ -79,7 +79,7 @@ Tensor::Tensor(const vector &shape, float *fptr, int dev){ updateShape(shape); updateSize(); updateStrides(); - updateData(fptr); + updateData(fptr, fptr2); this->tsem = new mutex(); } @@ -88,7 +88,11 @@ Tensor::Tensor(const vector &shape, float *fptr, int dev){ Tensor::Tensor(const vector &shape, int dev):Tensor(shape, nullptr, dev){} // From shape and Tensor (sharing ptr) -Tensor::Tensor(const vector &shape, Tensor *T) : Tensor(shape,T->ptr, T->device) {} +Tensor::Tensor(const vector &shape, Tensor *T) : Tensor(shape,T->ptr, T->device +#ifdef cFPGA + , (void *)T->fpga_ptr +#endif + ) {} Tensor::Tensor(const vector& data, const vector &shape, int dev) : Tensor(shape, nullptr, DEV_CPU) { isshared=false; @@ -211,17 +215,20 @@ void Tensor::updateData(float *fptr, void *fptr2,bool setshared){ #ifdef cFPGA else if (this->isFPGA()) { + #ifdef FPGA_DEBUG + printf("Tensor::updateData: fptr=%p, fptr2=%p, setshared=%d\n", fptr, fptr2, setshared); + #endif fpga_device = device-DEV_FPGA; if (!initfpga[fpga_device]) { #ifdef FPGA_DEBUG - printf("Initializing FPGA device\n"); + printf(" initializing FPGA device\n"); #endif fpga_init(/*fpga_device*/); initfpga[fpga_device]=1; } if (fptr == nullptr) { #ifdef FPGA_DEBUG - printf(" ([updateData fptr==null] creating tensor size %d; id being assigned %d)\n", this->size, next_fpga_tensor_id); + printf(" creating tensor: size=%d fpga_tensor_id=%d\n", this->size, next_fpga_tensor_id); #endif this->fpga_ptr = fpga_create_tensor(fpga_device, this->size); this->fpga_size = this->size; @@ -231,27 +238,54 @@ void Tensor::updateData(float *fptr, void *fptr2,bool setshared){ this->fpga_tensor_id = next_fpga_tensor_id; next_fpga_tensor_id++; #ifdef FPGA_DEBUG - printf(" ([updateData] ptr %p fpga_ptr %p)\n", this->ptr, this->fpga_ptr); + printf(" new pointers: ptr=%p fpga_ptr=%p\n", this->ptr, this->fpga_ptr); #endif } else { - // The data has already been created in CPU, so we need now to create a buffer in FPGA and write the buffer into it - // we first update the cpu buffer 
+ printf(" info: fpga_ptr %p fptr2 %p\n", this->fpga_ptr, fptr2); + if ((this->fpga_ptr == (cl::Buffer *)nullptr) && (fptr2 == nullptr)) { + this->fpga_ptr = fpga_create_tensor(fpga_device, this->size); + this->fpga_size = this->size; + this->fpga_tensor_id = next_fpga_tensor_id; + next_fpga_tensor_id++; + fpga_copy_to_fpga(fptr, this); + #ifdef FPGA_DEBUG + printf(" fpga_ptr and fptr2 were null, we create a buffer with tensor id %d\n", this->fpga_tensor_id); + #endif + } else if ((this->fpga_ptr == (cl::Buffer *)nullptr) && (fptr2 != nullptr)) { + #ifdef FPGA_DEBUG + printf(" fpga_ptr null but fptr2 not\n"); + #endif + this->fpga_size = this->size; + this->fpga_ptr = (cl::Buffer *)fptr2; + this->fpga_tensor_id = next_fpga_tensor_id; + next_fpga_tensor_id++; + #ifdef FPGA_DEBUG + printf(" new fpga_size %d fpga_ptr %p fpga_tensor_id %d\n", this->fpga_size, this->fpga_ptr, this->fpga_tensor_id); + #endif + } else { + #ifdef FPGA_DEBUG + printf(" fpga_ptr and fptr2 are not null\n"); + #endif + this->fpga_size = this->size; + this->fpga_ptr = (cl::Buffer *)fptr2; + #ifdef FPGA_DEBUG + printf(" new fpga_size %d fpga_ptr %x\n", this->fpga_size, this->fpga_ptr); + #endif + } #ifdef FPGA_DEBUG - printf(" ([updateData fptr!=null] fptr %p tensor id %d ptr %p fpga_ptr %p size %d fpga_size %d)\n", fptr, this->fpga_tensor_id, this->ptr, this->fpga_ptr, this->size, this->fpga_size); + printf(" end of changes: fptr %p tensor id %d ptr %p fpga_ptr %p size %d fpga_size %d fptr2 %p)\n", fptr, this->fpga_tensor_id, this->ptr, this->fpga_ptr, this->size, this->fpga_size, fptr2); #endif - this->fpga_size = this->size; - #ifdef FPGA_DEBUG - printf(" reallocated tensor id %d new size %d\n", this->fpga_tensor_id, this->fpga_size); - #endif this->ptr = fptr; - this->fpga_ptr = (cl::Buffer *)fptr2; } // For 2 dimensions, map to data to Eigen for efficiency // Efficient operations will be done over ptr2, which also points to ptr if (this->ndim == 2) { this->ptr2= new Eigen::Map(this->ptr, this->shape[1], this->shape[0]); } - } + #ifdef FPGA_DEBUG + printf("-------------------------\n"); + #endif + } #endif } diff --git a/src/tensor/tensor_comparison.cpp b/src/tensor/tensor_comparison.cpp index a696eaee9..d0663761e 100644 --- a/src/tensor/tensor_comparison.cpp +++ b/src/tensor/tensor_comparison.cpp @@ -650,7 +650,8 @@ int Tensor::equivalent(Tensor *A, Tensor *B, float atol, float rtol, bool equal_ #endif #ifdef cFPGA else { - return fpga_equal2(A, B, epsilon); + printf("Error, please check (FPGA), epsilon does not exist\n"); +// return fpga_equal2(A, B, epsilon); } #endif diff --git a/src/tensor/tensor_math.cpp b/src/tensor/tensor_math.cpp index a79fb4d34..ac5f5d343 100644 --- a/src/tensor/tensor_math.cpp +++ b/src/tensor/tensor_math.cpp @@ -12,6 +12,7 @@ #include #include "eddl/tensor/tensor.h" +#include "eddl/profiling.h" #include "eddl/hardware/cpu/cpu_tensor.h" #ifdef cGPU @@ -25,6 +26,9 @@ using namespace std; +PROFILING_ENABLE(sum2D_rowwise); +PROFILING_ENABLE(mult2D); + // Math operations (Tensor-Tensor, Tensor-float) ************************ Tensor* Tensor::maximum(float v){ @@ -2276,9 +2280,6 @@ void Tensor::el_div(Tensor *A, Tensor *B, Tensor *C, int incC) { } - - - void Tensor::mult2D(Tensor *A, int tA, Tensor *B, int tB, Tensor *C, int incC) { /////////////////////////////////////// //// MULT2D C=A*B @@ -2288,6 +2289,8 @@ void Tensor::mult2D(Tensor *A, int tA, Tensor *B, int tB, Tensor *C, int incC) { //// Dimensions and types must be compatible //// Only for 2D Tensors 
/////////////////////////////////////// + + PROFILING_HEADER_EXTERN(mult2D); if ((A->device != B->device) || (A->device != C->device)) {A->info();B->info();C->info();msg("Tensors in different devices", "Tensor::mult2D");} if ((A->ndim != 2) || (B->ndim != 2) || (C->ndim != 2)) msg("Only 2D tensors", "Tensor::mult2D"); @@ -2325,6 +2328,9 @@ void Tensor::mult2D(Tensor *A, int tA, Tensor *B, int tB, Tensor *C, int incC) { } #endif C->tsem->unlock(); + + PROFILING_FOOTER(mult2D); + PROFILING_PRINTF(mult2D); } @@ -2374,6 +2380,8 @@ void Tensor::sum2D_rowwise(Tensor *A, Tensor *B, Tensor *C) { if ((A->ndim != 2) || (B->ndim != 1) || (C->ndim != 2)) msg("sum2D_rowwise dims"); if ((!sameShape(A, C)) || (A->shape[1] != B->shape[0])) msg("Incompatible dims", "Tensor::sum2D_rowwise"); + PROFILING_HEADER(sum2D_rowwise); + C->tsem->lock(); if (A->isCPU()) { cpu_sum2D_rowwise(A, B, C); @@ -2391,6 +2399,9 @@ void Tensor::sum2D_rowwise(Tensor *A, Tensor *B, Tensor *C) { } #endif C->tsem->unlock(); + + PROFILING_FOOTER(sum2D_rowwise); + PROFILING_PRINTF(sum2D_rowwise); } From c4bba3985e4a3681ba5b67b04aad63624c98bb51 Mon Sep 17 00:00:00 2001 From: Jose Flich Date: Sat, 17 Oct 2020 06:51:14 +0000 Subject: [PATCH 02/15] conv2D with arbitrary precission --- .../kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp | 73 +- .../kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp | 700 ++++++++++++++++++ fpga_kernels/setenv.sh | 4 + fpga_kernels/test_fpga/Makefile | 2 +- .../src/test_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp | 389 ++++++++++ include/eddl/hardware/fpga/fpga_enables.h | 2 +- include/eddl/profiling.h | 11 + src/hardware/fpga/fpga_core.cpp | 58 +- src/hardware/fpga/nn/fpga_activations.cpp | 19 + src/hardware/fpga/nn/fpga_conv.cpp | 4 + 10 files changed, 1219 insertions(+), 43 deletions(-) create mode 100644 fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp create mode 100644 fpga_kernels/setenv.sh create mode 100644 fpga_kernels/test_fpga/src/test_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp diff --git a/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp b/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp index 20b25cbd2..9ec02806c 100644 --- a/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp +++ b/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp @@ -18,17 +18,18 @@ extern "C" { +// To allow using defines inside Xilinx pragmas +#define PRAGMA_SUB(x) _Pragma (#x) +#define DO_PRAGMA(x) PRAGMA_SUB(x) + // Fixed parameters (optimized at compilation/synthesis time) #define KW 3 // kernel width #define KH 3 // kernel height -//#define I 8 // number of input channels -//#define O 8 // number of output channels #define CPI 4 // channels per input port #define CPO 4 // channels per output port -//#define W 256 // input width -//#define H 256 // input height -//#define I_ITER I/CPI // iterations per input -//#define O_ITER O/CPO // iterations per output +// +#define WMAX 512 +#define WHMAX 512*512 #define LOAD_MODEL #define READ_MODEL @@ -173,7 +174,7 @@ static void read_input(int H, int W, int I, int O, int I_ITER, int O_ITER, pixel // in : input stream // out : vector of output streams // -static void padding(int H, int W, int I_ITER, int O_ITER, hls::stream &in, hls::stream &out) { +static void padding(int H, int W, int ITER, hls::stream &in, hls::stream &out) { #ifdef DEBUG_VERBOSE printf("padding: start\n"); @@ -182,36 +183,29 @@ static void padding(int H, int W, int I_ITER, int O_ITER, hls::stream &in, // first we read the kernels frame_t kernel[CPI]; - #pragma HLS ARRAY_PARTITION variable=kernel dim=0 + DO_PRAGMA(HLS ARRAY_PARTITION variable=kernel 
dim=0) frame_t data_in; #ifdef LOAD_MODEL @@ -440,7 +434,7 @@ static void mul(int H, int W, int I_ITER, int O_ITER, hls::stream &in, // now we read frames and produce the pixels float sum[CPO]; - #pragma HLS ARRAY_PARTITION variable=sum dim=0 block factor=4 + DO_PRAGMA(HLS ARRAY_PARTITION variable=sum dim=0 block factor=CPO) //factor = 16 //the array_partition factor in this case is assumed to be CPO value int num_iterations = W * H; @@ -529,8 +523,8 @@ static void add(int H, int W, int I_ITER, int O_ITER, hls::stream & int num_iterations = W * H; //Buffer for all data and CPO channels - float buff_o_channels[CPO][num_iterations]; - #pragma HLS ARRAY_PARTITION variable=buff_o_channels dim=0 block factor=4 + float buff_o_channels[CPO][WHMAX]; + DO_PRAGMA(HLS ARRAY_PARTITION variable=buff_o_channels dim=0 block factor=CPO) //We read Bias in O_iter packs of CPO size add_o_iter_loop: @@ -655,10 +649,9 @@ static void conv(int H, int W, int I, int O, int I_ITER, int O_ITER, hls::stream static hls::stream str_cvt_mul; // cvt->mul static hls::stream str_mul_add; // mul->add - // topology #pragma HLS dataflow - padding(H, W, I_ITER, O_ITER, in, str_pad_cvt); // padding + padding(H, W, I_ITER * O_ITER, in, str_pad_cvt); // padding cvt(H, W, I_ITER, O_ITER, str_pad_cvt, str_cvt_mul, 0); // cvt mul(H, W, I_ITER, O_ITER, str_cvt_mul, k_in, str_mul_add, 0); // mul add(H, W, I_ITER, O_ITER, str_mul_add, b_in, out); // add @@ -666,8 +659,10 @@ static void conv(int H, int W, int I, int O, int I_ITER, int O_ITER, hls::stream void k_conv2D_K3x3_S1x1_P1x1_BS1(pixel_in_t *ptr_data, int H, int W, int I, float *ptr_kernel, float *ptr_bias, pixel_out_t *ptr_out, int O) { - //#pragma HLS INTERFACE s_axilite port=W bundle=control - //#pragma HLS INTERFACE s_axilite port=H bundle=control + #pragma HLS INTERFACE s_axilite port=W bundle=control + #pragma HLS INTERFACE s_axilite port=H bundle=control + #pragma HLS INTERFACE s_axilite port=I bundle=control + #pragma HLS INTERFACE s_axilite port=O bundle=control #pragma HLS INTERFACE m_axi port=ptr_data offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 #pragma HLS INTERFACE m_axi port=ptr_kernel offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 #pragma HLS INTERFACE m_axi port=ptr_bias offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 diff --git a/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp b/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp new file mode 100644 index 000000000..0f1e99ece --- /dev/null +++ b/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp @@ -0,0 +1,700 @@ +//KERNEL_CONV2D_4.cpp +//Modified by: Jorge García Martinez +//Date: 17/09/2020 +//Description: Based on kenel_conv2d_3.cpp. The goal of this code is to perform convolutions with a large number of inputs +//and outputs.For this, we use iteratively a limited number of input and output channels in the kernel. +//In all functions are used two loops for output and input iterations. In add function is added a buffer which stores +//the data that It should be written into the memory. 
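//
// (Editor's sketch, not part of the kernel source.) The channel-tiling scheme
// described in the header comment above can be pictured, assuming I and O are
// exact multiples of CPI and CPO, as the reference loop nest below: each
// (o_iter, i_iter) pair works on CPO output channels and CPI input channels at
// a time, and partial sums are accumulated across the i_iter loop before the
// CPO output channels are written back to memory.
//
//   int I_ITER = I / CPI;   // input-channel groups visited per output pass
//   int O_ITER = O / CPO;   // output-channel groups
//   for (int o_iter = 0; o_iter < O_ITER; o_iter++) {
//     // read bias[o_iter*CPO .. o_iter*CPO + CPO - 1]
//     for (int i_iter = 0; i_iter < I_ITER; i_iter++) {
//       // read the CPI x CPO kernel block and the CPI input channels,
//       // convolve, and accumulate into the CPO partial-sum buffer
//     }
//     // write the CPO accumulated output channels back to memory
//   }
//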
+ + + +#include +#include +#include + +#include + +#define DEBUG_VERBOSE + +extern "C" { + +#define data_type ap_fixed<8,4,AP_TRN,AP_WRAP> +//#define data_type float + +// To allow using defines inside Xilinx pragmas +#define PRAGMA_SUB(x) _Pragma (#x) +#define DO_PRAGMA(x) PRAGMA_SUB(x) + +// Fixed parameters (optimized at compilation/synthesis time) +#define KW 3 // kernel width +#define KH 3 // kernel height +#define CPI 16 // channels per input port +#define CPO 16 // channels per output port +// +#define WMAX 256 +#define WHMAX 256*256 + +#define LOAD_MODEL +#define READ_MODEL +#define READ_INPUT +#define WRITE_OUTPUT + +// pixel_in +struct pixel_in_t { + data_type pixel[CPI]; +}; + +struct pixel_out_t { + data_type pixel[CPO]; +}; + +// frames struct +struct frame_t { + pixel_in_t pixel[9]; +}; + +// -------------------------------------------------------------------------------------- +// read_input: +// The function reads and writes the kernels, bias and data in different stream. +// Data are sent to padding module, kenels to mul and bias to add modules. +// LOOP FLOW +// ko = 0 +// b = 0 +// for o_iter 0 .. n +// read bias[b..b+3] +// b = b + 4 +// d = 0 +// ki = 0 +// for i_iter 0 .. n +// read kernel[ki..ki+3][ko..ko+3] +// ki = ki +4 +// read data[d..d+3] +// d = d + 4 +// +// ko = ko + 4 +// +// +// Arguments: +// ptr : Pointer to input data (in) +// k_ptr: pointer to kernels (in) +// b_ptr: pointer to bias (in) +// out : data output stream (out) +// k_out: pointer to kernel (out) +// b_out: pointer to bias (out) +// +static void read_input(int H, int W, int I, int O, int I_ITER, int O_ITER, pixel_in_t *ptr, data_type *k_ptr, data_type *b_ptr, hls::stream &k_out, hls::stream &b_out, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("read_input: start\n"); +#endif + + frame_t frame_k; + #pragma HLS ARRAY_PARTITION variable=frame_k dim=0 + + pixel_out_t bias; + #pragma HLS ARRAY_PARTITION variable=bias dim=0 + + pixel_in_t data; + #pragma HLS ARRAY_PARTITION variable=data dim=0 + + + read_input_o_iter_loop: + for (int o_iter = 0; o_iter < O_ITER; o_iter++){ + //Sending bias to add in pack of CPO bias + // int data_pointer = 0; + read_loop_bias_load: + for (int b=0; b &in, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("padding: start\n"); +#endif + +//we init zero only first time + +pixel_in_t data; +DO_PRAGMA(HLS ARRAY_PARTITION variable=data complete) + +pixel_in_t zero; +DO_PRAGMA(HLS ARRAY_PARTITION variable=zero complete) + +for (int cpi=0; cpi &in, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("relu: start\n"); +#endif + + int data_size = W * H * O; + for (int i=0; i < data_size; i++) { + #pragma HLS PIPELINE II=1 + data_type data = in.read(); + if (data < 0) data = 0.f; + out << data; + } + +#ifdef DEBUG_VERBOSE + printf("relu: end\n"); +#endif +} + +// -------------------------------------------------------------------------------- +// write_output: Writes data comming from one stream into memory +// LOOP FLOW: +// for o_iter 0 .. n +// write data[do .. 
do+3] +// +// d = d + 4 +// +// Arguments: +// ptr: memory address pointer +// in: input stream +// +static void write_output(int H, int W, int O_ITER, pixel_out_t *ptr, hls::stream &in) { + +#ifdef DEBUG_VERBOSE + printf("write_output: start\n"); +#endif + + + + // int data_pointer = 0; + + // write_output_o_iter_loop: + // for (int o_iter = 0; o_iter &in, hls::stream &out, int id) { + +#ifdef DEBUG_VERBOSE + printf("cvt_%d: start\n", id); +#endif + +cvt_o_iter_loop: +for (int o_iter = 0; o_iter < O_ITER; o_iter++){ + cvt_i_iter_loop: + for(int i_iter = 0; i_iter < I_ITER; i_iter++){ + + // Now we process the input data and convert the data into frames + + // buffers (keep three rows) + pixel_in_t buffer0[WMAX+2]; + pixel_in_t buffer1[WMAX+2]; + pixel_in_t buffer2[WMAX+2]; + DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer0 cyclic dim=1 factor=CPI) + DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer1 cyclic dim=1 factor=CPI) + DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer2 cyclic dim=1 factor=CPI) + + // frame + frame_t frame; + DO_PRAGMA(HLS ARRAY_PARTITION variable=frame) + + // We loop for every incoming pixel + cvt_loop_1: + for (int pin_row=0; pin_row < H+2; pin_row++) { + cvt_loop_2: + for (int pin_col=0; pin_col < W+2; pin_col++) { + // get the pixel + pixel_in_t pixel; + pixel = in.read(); + // row buffer write (in which buffer row we write the pixel) + int row0_buffer_write = (pin_row % 3) == 0; + int row1_buffer_write = (pin_row % 3) == 1; + // first row buffer + int row0 = (pin_row <= 2) | ((pin_row % 3) == 2); + int row1 = !row0 & ((pin_row % 3) == 0); + // we write the pixel into the buffer + if (row0_buffer_write) buffer0[pin_col] = pixel; else if (row1_buffer_write) buffer1[pin_col] = pixel; else buffer2[pin_col] = pixel; + // build the frame + pixel_in_t p0, p1, p2, p3, p4, p5, p6, p7, p8; + int shift_frame = (pin_row>1) & (pin_col > 2); + int send_frame = (pin_row>1) & (pin_col > 1); + pixel_in_t pixel_b0, pixel_b1, pixel_b2; + pixel_b0 = buffer0[pin_col]; + pixel_b1 = buffer1[pin_col]; + pixel_b2 = buffer2[pin_col]; + // p0, p1, p2 + if (shift_frame) {p0 = p1;} else if (pin_col==0) {if (row0) p0 = pixel_b0; else if (row1) p0 = pixel_b1; else p0 = pixel_b2;} + if (shift_frame) {p1 = p2;} else if (pin_col==1) {if (row0) p1 = pixel_b0; else if (row1) p1 = pixel_b1; else p1 = pixel_b2;} + if (row0) p2 = pixel_b0; else if (row1) p2 = pixel_b1; else p2 = pixel_b2; + // p3, p4, p5 + if (shift_frame) {p3 = p4;} else if (pin_col==0) {if (row0) p3 = pixel_b1; else if (row1) p3 = pixel_b2; else p3 = pixel_b0;} + if (shift_frame) {p4 = p5;} else if (pin_col==1) {if (row0) p4 = pixel_b1; else if (row1) p4 = pixel_b2; else p4 = pixel_b0;} + if (row0) p5 = pixel_b1; else if (row1) p5 = pixel_b2; else p5 = pixel_b0; + // p6, p7, p8 + if (shift_frame) {p6 = p7;} else if (pin_col==0) {if (row0) p6 = pixel_b2; else if (row1) p6 = pixel_b0; else p6 = pixel_b1;} + if (shift_frame) {p7 = p8;} else if (pin_col==1) {if (row0) p7 = pixel_b2; else if (row1) p7 = pixel_b0; else p7 = pixel_b1;} + if (row0) p8 = pixel_b2; else if (row1) p8 = pixel_b0; else p8 = pixel_b1; + + if (send_frame) { + frame.pixel[0] = p0; frame.pixel[1] = p1; frame.pixel[2] = p2; + frame.pixel[3] = p3; frame.pixel[4] = p4; frame.pixel[5] = p5; + frame.pixel[6] = p6; frame.pixel[7] = p7; frame.pixel[8] = p8; + out << frame; + #ifdef DEBUG_VERBOSE + printf("cvt_%d: frame sent:\n", id); + for (int cpi=0; cpi &in, hls::stream &k_in, hls::stream &out, int id) { + +#ifdef DEBUG_VERBOSE + printf("mul_%d: start\n", id); +#endif + + // 
first we read the kernels + frame_t kernel[CPI]; + DO_PRAGMA(HLS ARRAY_PARTITION variable=kernel dim=0) + frame_t data_in; + +#ifdef LOAD_MODEL + + mul_o_iter_loop: + for (int o_iter = 0; o_iter < O_ITER; o_iter++){ + mul_i_iter_loop: + for(int i_iter = 0; i_iter < I_ITER; i_iter++){ + //we load the kernels into pack of frames + loop_mul_kernels_load_cpo: + for (int cpi=0; cpi %6.4f\n", cpo, float(sum[cpo])); + #endif + p_out.pixel[cpo] = sum[cpo]; + sum[cpo] = 0.f; + } + out << p_out; + } + } //i_iter +} //o_iter + +#endif + + +#ifdef DEBUG_VERBOSE + printf("mul_%d: end\n", id); +#endif +} + +// ------------------------------------------------------------------------------- +// add: This function performs the addition of all subpixels for the same channel. +// It adds also the corresponding bias. +// LOOP FLOW +// for o_iter 0 .. n +// receive bias[b..b+3] +// init buff_o_channels with bias +// for i_iter 0 .. n +// receive data[do..d+3] +// buff_o_channels = buff_o_channels + data +// +// for num_iterations +// for CPO +// send data to write module +// +// Arguments: +// in: input streams data +// b_in: input stream bias +// out: output stream +// +static void add(int H, int W, int I_ITER, int O_ITER, hls::stream &in, hls::stream &b_in, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("add: start\n"); +#endif + + data_type bias[CPO]; + + //number of iterations by CPI || CPO channels + int num_iterations = W * H; + + //Buffer for all data and CPO channels + data_type buff_o_channels[CPO][WHMAX]; + DO_PRAGMA(HLS ARRAY_PARTITION variable=buff_o_channels dim=0 block factor=CPO) + + //We read Bias in O_iter packs of CPO size + add_o_iter_loop: + for (int o_iter = 0; o_iter &in, hls::stream &k_in, hls::stream &b_in, hls::stream &out) { + + // streams + static hls::stream str_pad_cvt; // padding->cvt + static hls::stream str_cvt_mul; // cvt->mul + static hls::stream str_mul_add; // mul->add + + // topology + #pragma HLS dataflow + padding(H, W, I_ITER * O_ITER, in, str_pad_cvt); // padding + cvt(H, W, I_ITER, O_ITER, str_pad_cvt, str_cvt_mul, 0); // cvt + mul(H, W, I_ITER, O_ITER, str_cvt_mul, k_in, str_mul_add, 0); // mul + add(H, W, I_ITER, O_ITER, str_mul_add, b_in, out); // add +} + +void k_conv2D_K3x3_S1x1_P1x1_BS1_ap(pixel_in_t *ptr_data, int H, int W, int I, data_type *ptr_kernel, data_type *ptr_bias, pixel_out_t *ptr_out, int O) { + + #pragma HLS INTERFACE s_axilite port=W bundle=control + #pragma HLS INTERFACE s_axilite port=H bundle=control + #pragma HLS INTERFACE s_axilite port=I bundle=control + #pragma HLS INTERFACE s_axilite port=O bundle=control + #pragma HLS INTERFACE m_axi port=ptr_data offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_kernel offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_bias offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_out offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE s_axilite port=return bundle=control + + // ptr_data struct to be packed as a single element vector (to improve memory read) + // the compiler will do full structure access (all elements of structure) + #pragma HLS data_pack variable = ptr_data + #pragma HLS data_pack variable = ptr_out + + int I_ITER = I/CPI; + int O_ITER = O/CPO; + + // input and output streams + static hls::stream out_read; + static hls::stream out_read_kernel; 
+ static hls::stream out_read_bias; + static hls::stream out_conv; + + // stream sizes + #pragma HLS STREAM variable = out_read depth = 32 + #pragma HLS STREAM variable = out_read_kernel depth = 32 + #pragma HLS STREAM variable = out_read_bias depth = 32 + #pragma HLS STREAM variable = out_conv depth = 32 + #pragma HLS STREAM variable = out_relu depth = 32 + + #pragma HLS dataflow + read_input(H, W, I, O, I_ITER, O_ITER, ptr_data, ptr_kernel, ptr_bias, out_read_kernel, out_read_bias, out_read); + conv(H, W, I, O, I_ITER, O_ITER, out_read, out_read_kernel, out_read_bias, out_conv); + write_output(H, W, O_ITER, ptr_out, out_conv); +} + +} // end extern "C" diff --git a/fpga_kernels/setenv.sh b/fpga_kernels/setenv.sh new file mode 100644 index 000000000..2a72e6de9 --- /dev/null +++ b/fpga_kernels/setenv.sh @@ -0,0 +1,4 @@ +source /opt/xilinx/xrt/setup.sh +source /opt/Xilinx/Vitis/2019.2/settings64.sh +export XILINX_SDX=/opt/Xilinx/Vitis/2019.2 +export XCL_EMULATION_MODE=sw_emu diff --git a/fpga_kernels/test_fpga/Makefile b/fpga_kernels/test_fpga/Makefile index 7dc6b92d8..165a7c2d9 100644 --- a/fpga_kernels/test_fpga/Makefile +++ b/fpga_kernels/test_fpga/Makefile @@ -1,5 +1,5 @@ # list of kernel test to compile -LIST ?=conv2D_K3x3_S1x1_P1x1_BS1 +LIST ?=conv2D_K3x3_S1x1_P1x1_BS1_ap # default target all build clean cleanall: KERNELS diff --git a/fpga_kernels/test_fpga/src/test_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp b/fpga_kernels/test_fpga/src/test_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp new file mode 100644 index 000000000..facd8b604 --- /dev/null +++ b/fpga_kernels/test_fpga/src/test_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp @@ -0,0 +1,389 @@ +#include /* printf, scanf, NULL */ +#include /* malloc, free, rand */ + +#include +#include +#include +#include +#include +#include "xcl2.hpp" + +#include + +using std::vector; + +// data type +//#define data_type ap_fixed<8,4,AP_TRN,AP_WRAP> +#define data_type float + +// CL +cl::Buffer buf; +cl::Context context; +cl::CommandQueue q; +cl::Program program; + + +#define W 256 //256 +#define H 256 //256 +#define C 16 //I +#define COUT 16 //O +#define KW 3 +#define KH 3 + +// buffers +data_type data_in[ W * H * C ] __attribute__ ((__aligned__(16))); +data_type kernel [ KW * KH * C * COUT] __attribute__ ((__aligned__(16))); +data_type bias [ COUT ] __attribute__ ((__aligned__(16))); +data_type out [ W * H * COUT ] __attribute__ ((__aligned__(16))); +data_type out_cpu[ W * H * COUT ] __attribute__ ((__aligned__(16))); + +void cpu_conv2d() { + + int size_out = W * H * COUT; + for (int i=0; i 0.001) { + printf("Results mismatch at cout %d h %d w %d: %6.4f %6.4f (diff %6.4f)\n", cout, h, w, float(out_cpu[addr_o]), float(out[addr_o]), fabs(float(out_cpu[addr_o]-out[addr_o]))); + error = 1; + return; + } + } + } + } + if (!error) printf("results OK!\n"); else { + printf("results differ:\n"); + //cpu_print_out(); + } +} + + +//--------------------------------------------------------------------------------------------------------------------- +//--------------------------------------------------------------------------------------------------------------------- + +// An event callback function that prints the operations performed by the OpenCL +// runtime. 
+void event_cb(cl_event event1, cl_int cmd_status, void *data) { + cl_int err; + cl_command_type command; + cl::Event event(event1, true); + OCL_CHECK(err, err = event.getInfo(CL_EVENT_COMMAND_TYPE, &command)); + cl_int status; + OCL_CHECK(err, + err = event.getInfo(CL_EVENT_COMMAND_EXECUTION_STATUS, &status)); + const char *command_str; + const char *status_str; + switch (command) { + case CL_COMMAND_READ_BUFFER: + command_str = "buffer read"; + break; + case CL_COMMAND_WRITE_BUFFER: + command_str = "buffer write"; + break; + case CL_COMMAND_NDRANGE_KERNEL: + command_str = "kernel"; + break; + case CL_COMMAND_MAP_BUFFER: + command_str = "kernel"; + break; + case CL_COMMAND_COPY_BUFFER: + command_str = "kernel"; + break; + case CL_COMMAND_MIGRATE_MEM_OBJECTS: + command_str = "buffer migrate"; + break; + default: + command_str = "unknown"; + } + switch (status) { + case CL_QUEUED: + status_str = "Queued"; + break; + case CL_SUBMITTED: + status_str = "Submitted"; + break; + case CL_RUNNING: + status_str = "Executing"; + break; + case CL_COMPLETE: + status_str = "Completed"; + break; + } + printf("[%s]: %s %s\n", reinterpret_cast(data), status_str, + command_str); + fflush(stdout); +} + +// Sets the callback for a particular event +void set_callback(cl::Event event, const char *queue_name) { + cl_int err; + OCL_CHECK(err, + err = event.setCallback(CL_COMPLETE, event_cb, (void *)queue_name)); +} + +//--------------------------------------------------------------------------------------------------------------------- + +int main(int argc, char **argv) { + if (argc != 2) { + std::cout << "Usage: " << argv[0] << " " << std::endl; + return EXIT_FAILURE; + } + + printf("Test CONV: [WxHxC] = [%dx%dx%d] -> [WxHxC] = [%dx%dx%d] (kernel [%dx%d], stride [1x1], padding [1x1])\n", W, H, C, W, H, COUT, KW, KH); + + std::string binaryFile = argv[1]; + cl_int err; + cl::Kernel kernel_conv2d_2; + + std::cout << "Creating Context..." << std::endl; + auto devices = xcl::get_xil_devices(); + auto device = devices[0]; + OCL_CHECK(err, cl::Context context(device, NULL, NULL, NULL, &err)); + OCL_CHECK(err, cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE, &err)); + + std::string device_name = device.getInfo(); + auto fileBuf = xcl::read_binary_file(binaryFile); + cl::Program::Binaries bins{{fileBuf.data(), fileBuf.size()}}; + devices.resize(1); + + OCL_CHECK(err, cl::Program program(context, devices, bins, NULL, &err)); + std::cout << "Device " << device_name.c_str() << ": program successful!" << std::endl; + + OCL_CHECK(err, kernel_conv2d_2 = cl::Kernel(program,"k_conv2D_K3x3_S1x1_P1x1_BS1_ap", &err)); + std::cout << "Kernel sucessfully created" << std::endl ; + + size_t size_data_in_bytes = W*H*C*sizeof(data_type); + size_t size_output_in_bytes = W*H*COUT * sizeof(data_type); + size_t size_kernel_in_bytes = KW * KH * C * COUT * sizeof(data_type); + size_t size_bias_in_bytes = COUT * sizeof(data_type); + // Allocate memory on the host and fill with random data. 
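//
// (Editor's note, illustrative only.) The host arrays declared at the top of
// this file are statically allocated with __attribute__((aligned(16))) and are
// later wrapped in cl::Buffer objects created with CL_MEM_USE_HOST_PTR, so the
// OpenCL runtime can work on the host memory directly instead of keeping a
// separate copy. If dynamic allocation were preferred, a common alternative
// with the xcl2 helpers already included by this test would be roughly:
//
//   std::vector<data_type, aligned_allocator<data_type>> data_in(W * H * C);
//
// where aligned_allocator is the aligned allocator usually shipped with the
// Vitis xcl2.hpp utilities. Treat this as a sketch, not as part of the test.
//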
+ + //----------------------------- + // fill data vector with random data + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + std::cout << "Filling buffer with useful data" << std::endl ; + int addr = 0; + for (int h=0; h kernel_events(1); + vector read_events(1); + vector write_events(1); + cl::Buffer buffer_a; + cl::Buffer buffer_b; + cl::Buffer buffer_k; + cl::Buffer buffer_bias; + + //----------------------------- + // Allocate Buffer in Global Memory + // Buffers are allocated using CL_MEM_USE_HOST_PTR for efficient memory and + // Device-to-host communication + std::cout << "Creating Buffers..." << std::endl; + + OCL_CHECK(err, buffer_a = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_data_in_bytes, &data_in, &err)); + OCL_CHECK(err, buffer_b = cl::Buffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR , size_output_in_bytes, &out, &err)); + OCL_CHECK(err, buffer_k = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_kernel_in_bytes, &kernel, &err)); + OCL_CHECK(err, buffer_bias = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_bias_in_bytes, &bias, &err)); + + // set kernel arguments + int arg = 0; + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_a)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, H)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, W)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, C)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_k)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_bias)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_b)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, COUT)); + + //----------------------------- + // Copy input data to device global memory + std::cout << "Copying data (Host to Device)..." << std::endl; + // Because we are passing the write_events, it returns an event object + // that identifies this particular command and can be used to query + // or queue a wait for this particular command to complete. + OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_a}, 0 /*0 means from host*/, NULL, &write_events[0])); + set_callback(write_events[0], "ooo_queue"); + + OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_k}, 0 /*0 means from host*/, NULL, &write_events[0])); + set_callback(write_events[0], "ooo_queue"); + + //----------------------------- + printf("Enqueueing NDRange kernel.\n"); + // This event needs to wait for the write buffer operations to complete + // before executing. We are sending the write_events into its wait list to + // ensure that the order of operations is correct. + // Launch the Kernel + std::vector waitList; + waitList.push_back(write_events[0]); + OCL_CHECK(err, err = q.enqueueNDRangeKernel(kernel_conv2d_2, 0, 1, 1, &waitList, &kernel_events[0])); + set_callback(kernel_events[0], "ooo_queue"); + + + + std::cout << "Getting Results (Device to Host)..." << std::endl; + std::vector eventList; + eventList.push_back(kernel_events[0]); + // This operation only needs to wait for the kernel call. + OCL_CHECK(err, err = q.enqueueMigrateMemObjects({buffer_b}, CL_MIGRATE_MEM_OBJECT_HOST, &eventList, &read_events[0])); + set_callback(read_events[0], "ooo_queue"); + OCL_CHECK(err, err = read_events[0].wait()); + + // Wait for all of the OpenCL operations to complete + std::cout << "Waiting..." << std::endl; + OCL_CHECK(err, err = q.flush()); + OCL_CHECK(err, err = q.finish()); + + + std::cout << "computing conv in CPU..." 
<< std::endl; + + // cpu_print_data_in(); + // cpu_print_kernels(); + // cpu_print_bias(); + // cpu_conv2d(); + // cpu_print_out(); + + // check_result(); + + //----------------------------- + std::cout << "" << std::endl; + std::cout << "All done" << std::endl; + std::cout << "quit now" << std::endl; + + // exit + return 0; +} diff --git a/include/eddl/hardware/fpga/fpga_enables.h b/include/eddl/hardware/fpga/fpga_enables.h index 55f183d85..7aacfec2d 100644 --- a/include/eddl/hardware/fpga/fpga_enables.h +++ b/include/eddl/hardware/fpga/fpga_enables.h @@ -2,7 +2,7 @@ // implemented on the FPGA //Activations -#define K_ENABLED_RELU +//#define K_ENABLED_RELU //#define K_ENABLED_D_RELU //#define K_ENABLED_THRESHOLDED_RELU //#define K_ENABLED_D_TRHESHOLDED_RELU diff --git a/include/eddl/profiling.h b/include/eddl/profiling.h index 8cca92267..c02848225 100644 --- a/include/eddl/profiling.h +++ b/include/eddl/profiling.h @@ -35,3 +35,14 @@ prof_##fn##_calls, prof_##fn##_time, \ 100.0 * prof_##fn##_time / acc, (float) prof_##fn##_time / (float) prof_##fn##_calls); #endif + + + +//CxHxW +// +//HxWxC +// +//GxHxWxC (C=4) Reshape + Permute +// +//32xHxW -> Reshape -> 8x4xHxW -> Permute(0, 2, 3, 1) -> 8xHxWx4 // hay capas y funciones + diff --git a/src/hardware/fpga/fpga_core.cpp b/src/hardware/fpga/fpga_core.cpp index d88e22c1a..84bfee3f4 100644 --- a/src/hardware/fpga/fpga_core.cpp +++ b/src/hardware/fpga/fpga_core.cpp @@ -331,8 +331,11 @@ void fpga_init(){ // initialize only once cl_int err; std::string binaryFile = "eddl.xclbin"; unsigned fileBufSize; - std::vector devices = xcl::get_xil_devices(); - cl::Device device = devices[0]; + //std::vector devices = xcl::get_xil_devices(); + //cl::Device device = devices[0]; + auto devices = xcl::get_xil_devices(); + auto device = devices[0]; + OCL_CHECK(err, context = cl::Context(device, NULL, NULL, NULL, &err)); OCL_CHECK(err, q = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err)); char *fileBuf = xcl::read_binary_file(binaryFile, fileBufSize); @@ -342,8 +345,27 @@ void fpga_init(){ // initialize only once OCL_CHECK(err, program = cl::Program(context, devices, bins, NULL, &err)); #ifdef K_ENABLED_RELU + printf("creating ReLu kernel\n"); OCL_CHECK(err, kernel_relu = cl::Kernel(program,"k_relu", &err)); if (err != CL_SUCCESS) printf("Error creating kernel\n"); + + // prueba + cl::Event event; + cl::Buffer b1; + cl::Buffer b2; + long sizeA = 1024; + OCL_CHECK(err,b1 = cl::Buffer(context,CL_MEM_READ_WRITE, sizeA*sizeof(float), NULL, &err)); + OCL_CHECK(err,b2 = cl::Buffer(context,CL_MEM_READ_WRITE, sizeA*sizeof(float), NULL, &err)); + OCL_CHECK(err, err = kernel_relu.setArg(0, b1)); + OCL_CHECK(err, err = kernel_relu.setArg(1, b2)); + OCL_CHECK(err, err = kernel_relu.setArg(2, sizeA)); + OCL_CHECK(err, err = q.enqueueTask(kernel_relu, NULL, &event)); + printf("relu kernel lanzado en init\n"); + // event.wait(); + q.finish(); + + + #endif #ifdef K_ENABLED_D_RELU OCL_CHECK(err, kernel_d_relu = cl::Kernel(program,"k_d_relu", &err)); @@ -637,6 +659,38 @@ void fpga_init(){ // initialize only once #ifdef K_ENABLED_CONV2D_K3X3_S1X1_P1X1_BS1 OCL_CHECK(err, kernel_conv2D_K3x3_S1x1_P1x1_BS1 = cl::Kernel(program, "k_conv2D_K3x3_S1x1_P1x1_BS1", &err)); if (err != CL_SUCCESS) printf("Error creating kernel\n"); + + // prueba + cl::Event event1; + cl::Buffer I; + cl::Buffer K; + cl::Buffer B; + cl::Buffer O; + int Ich = 4; + int W = 256; + int H = 256; + int Och = 4; + int arg = 0; + + OCL_CHECK(err,I = 
cl::Buffer(context,CL_MEM_READ_WRITE, Ich * W * H * sizeof(float), NULL, &err)); + OCL_CHECK(err,K = cl::Buffer(context,CL_MEM_READ_WRITE, 3 * 3 * Ich * Och * sizeof(float), NULL, &err)); + OCL_CHECK(err,B = cl::Buffer(context,CL_MEM_READ_WRITE, Och * sizeof(float), NULL, &err)); + OCL_CHECK(err,O = cl::Buffer(context,CL_MEM_READ_WRITE, Och * W * H * sizeof(float), NULL, &err)); + + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, I)); + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, H)); + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, W)); + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, Ich)); + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, K)); + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, B)); + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, O)); + OCL_CHECK(err, err = kernel_conv2D_K3x3_S1x1_P1x1_BS1.setArg(arg++, Och)); + + OCL_CHECK(err, err = q.enqueueTask(kernel_conv2D_K3x3_S1x1_P1x1_BS1, NULL, &event1)); + printf("conv kernel lanzado en init\n"); + // event.wait(); + q.finish(); + #endif #ifdef K_ENABLED_RANGE OCL_CHECK(err, kernel_range = cl::Kernel(program,"k_range", &err)); diff --git a/src/hardware/fpga/nn/fpga_activations.cpp b/src/hardware/fpga/nn/fpga_activations.cpp index 543bf3f22..fe3953259 100644 --- a/src/hardware/fpga/nn/fpga_activations.cpp +++ b/src/hardware/fpga/nn/fpga_activations.cpp @@ -17,6 +17,9 @@ #include "eddl/hardware/fpga/nn/fpga_nn.h" #include "eddl/hardware/cpu/nn/cpu_tensor_nn.h" // for cpu emulation purposes +// prueba +extern cl::Context context; + // ----------------------------------------------------------------- // relu // @@ -35,13 +38,29 @@ void fpga_relu(Tensor *A, Tensor *B){ cl_int err; cl::Event event; + // prueba +/* cl::Buffer b1; + cl::Buffer b2; + printf("1\n"); + OCL_CHECK(err,b1 = cl::Buffer(context,CL_MEM_READ_WRITE, A->size*sizeof(float), NULL, &err)); + printf("2\n"); + OCL_CHECK(err,b2 = cl::Buffer(context,CL_MEM_READ_WRITE, B->size*sizeof(float), NULL, &err)); + printf("3\n"); + OCL_CHECK(err, err = kernel_relu.setArg(0, b1)); + printf("4\n"); + OCL_CHECK(err, err = kernel_relu.setArg(1, b2)); + printf("5\n"); + OCL_CHECK(err, err = kernel_relu.setArg(2, A->size));*/ + OCL_CHECK(err, err = kernel_relu.setArg(0, *(A->fpga_ptr))); OCL_CHECK(err, err = kernel_relu.setArg(1, *(B->fpga_ptr))); OCL_CHECK(err, err = kernel_relu.setArg(2, A->size)); OCL_CHECK(err, err = q.enqueueTask(kernel_relu, NULL, &event)); + // event.wait(); q.finish(); + #endif _profile_fpga_tensor(B); _profile_fpga(_FPGA_RELU, 1); diff --git a/src/hardware/fpga/nn/fpga_conv.cpp b/src/hardware/fpga/nn/fpga_conv.cpp index 24f7ce232..8be62b2cc 100644 --- a/src/hardware/fpga/nn/fpga_conv.cpp +++ b/src/hardware/fpga/nn/fpga_conv.cpp @@ -89,6 +89,9 @@ void fpga_conv2D(ConvolDescriptor *D) if ((stride_rows == 1) && (stride_cols == 1) && (Krows == 3) && (Kcols == 3) && (batch_size == 1) && (padding_rows == 1) && (padding_cols == 1)) { fpga_conv2D_K3x3_S1x1_P1x1_BS1(I, Irows, Icols, Ichannels, K, B, O, Ochannels); } else { + #if !defined(K_ENABLED_CONV2D) + fpga_cpuemu_conv2D(D); + #else OCL_CHECK(err, err = kernel_conv2d.setArg(0, batch_size)); OCL_CHECK(err, err = kernel_conv2d.setArg(1, I)); OCL_CHECK(err, err = kernel_conv2d.setArg(2, Irows)); // input @@ -110,6 +113,7 @@ void fpga_conv2D(ConvolDescriptor *D) OCL_CHECK(err, err = q.enqueueTask(kernel_conv2d, NULL, &event)); q.finish(); + #endif } #endif 
_profile_fpga(_FPGA_CONV2D, 1); From 07513eeac240352e1be1f878f3b7cff82fb27192 Mon Sep 17 00:00:00 2001 From: jorga20j Date: Tue, 20 Oct 2020 14:30:10 +0000 Subject: [PATCH 03/15] new read distribution --- .../kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp | 1416 +++++++++-------- 1 file changed, 716 insertions(+), 700 deletions(-) diff --git a/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp b/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp index 0f1e99ece..28a2441d9 100644 --- a/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp +++ b/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp @@ -1,700 +1,716 @@ -//KERNEL_CONV2D_4.cpp -//Modified by: Jorge García Martinez -//Date: 17/09/2020 -//Description: Based on kenel_conv2d_3.cpp. The goal of this code is to perform convolutions with a large number of inputs -//and outputs.For this, we use iteratively a limited number of input and output channels in the kernel. -//In all functions are used two loops for output and input iterations. In add function is added a buffer which stores -//the data that It should be written into the memory. - - - -#include -#include -#include - -#include - -#define DEBUG_VERBOSE - -extern "C" { - -#define data_type ap_fixed<8,4,AP_TRN,AP_WRAP> -//#define data_type float - -// To allow using defines inside Xilinx pragmas -#define PRAGMA_SUB(x) _Pragma (#x) -#define DO_PRAGMA(x) PRAGMA_SUB(x) - -// Fixed parameters (optimized at compilation/synthesis time) -#define KW 3 // kernel width -#define KH 3 // kernel height -#define CPI 16 // channels per input port -#define CPO 16 // channels per output port -// -#define WMAX 256 -#define WHMAX 256*256 - -#define LOAD_MODEL -#define READ_MODEL -#define READ_INPUT -#define WRITE_OUTPUT - -// pixel_in -struct pixel_in_t { - data_type pixel[CPI]; -}; - -struct pixel_out_t { - data_type pixel[CPO]; -}; - -// frames struct -struct frame_t { - pixel_in_t pixel[9]; -}; - -// -------------------------------------------------------------------------------------- -// read_input: -// The function reads and writes the kernels, bias and data in different stream. -// Data are sent to padding module, kenels to mul and bias to add modules. -// LOOP FLOW -// ko = 0 -// b = 0 -// for o_iter 0 .. n -// read bias[b..b+3] -// b = b + 4 -// d = 0 -// ki = 0 -// for i_iter 0 .. 
n -// read kernel[ki..ki+3][ko..ko+3] -// ki = ki +4 -// read data[d..d+3] -// d = d + 4 -// -// ko = ko + 4 -// -// -// Arguments: -// ptr : Pointer to input data (in) -// k_ptr: pointer to kernels (in) -// b_ptr: pointer to bias (in) -// out : data output stream (out) -// k_out: pointer to kernel (out) -// b_out: pointer to bias (out) -// -static void read_input(int H, int W, int I, int O, int I_ITER, int O_ITER, pixel_in_t *ptr, data_type *k_ptr, data_type *b_ptr, hls::stream &k_out, hls::stream &b_out, hls::stream &out) { - -#ifdef DEBUG_VERBOSE - printf("read_input: start\n"); -#endif - - frame_t frame_k; - #pragma HLS ARRAY_PARTITION variable=frame_k dim=0 - - pixel_out_t bias; - #pragma HLS ARRAY_PARTITION variable=bias dim=0 - - pixel_in_t data; - #pragma HLS ARRAY_PARTITION variable=data dim=0 - - - read_input_o_iter_loop: - for (int o_iter = 0; o_iter < O_ITER; o_iter++){ - //Sending bias to add in pack of CPO bias - // int data_pointer = 0; - read_loop_bias_load: - for (int b=0; b &in, hls::stream &out) { - -#ifdef DEBUG_VERBOSE - printf("padding: start\n"); -#endif - -//we init zero only first time - -pixel_in_t data; -DO_PRAGMA(HLS ARRAY_PARTITION variable=data complete) - -pixel_in_t zero; -DO_PRAGMA(HLS ARRAY_PARTITION variable=zero complete) - -for (int cpi=0; cpi &in, hls::stream &out) { - -#ifdef DEBUG_VERBOSE - printf("relu: start\n"); -#endif - - int data_size = W * H * O; - for (int i=0; i < data_size; i++) { - #pragma HLS PIPELINE II=1 - data_type data = in.read(); - if (data < 0) data = 0.f; - out << data; - } - -#ifdef DEBUG_VERBOSE - printf("relu: end\n"); -#endif -} - -// -------------------------------------------------------------------------------- -// write_output: Writes data comming from one stream into memory -// LOOP FLOW: -// for o_iter 0 .. n -// write data[do .. 
do+3] -// -// d = d + 4 -// -// Arguments: -// ptr: memory address pointer -// in: input stream -// -static void write_output(int H, int W, int O_ITER, pixel_out_t *ptr, hls::stream &in) { - -#ifdef DEBUG_VERBOSE - printf("write_output: start\n"); -#endif - - - - // int data_pointer = 0; - - // write_output_o_iter_loop: - // for (int o_iter = 0; o_iter &in, hls::stream &out, int id) { - -#ifdef DEBUG_VERBOSE - printf("cvt_%d: start\n", id); -#endif - -cvt_o_iter_loop: -for (int o_iter = 0; o_iter < O_ITER; o_iter++){ - cvt_i_iter_loop: - for(int i_iter = 0; i_iter < I_ITER; i_iter++){ - - // Now we process the input data and convert the data into frames - - // buffers (keep three rows) - pixel_in_t buffer0[WMAX+2]; - pixel_in_t buffer1[WMAX+2]; - pixel_in_t buffer2[WMAX+2]; - DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer0 cyclic dim=1 factor=CPI) - DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer1 cyclic dim=1 factor=CPI) - DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer2 cyclic dim=1 factor=CPI) - - // frame - frame_t frame; - DO_PRAGMA(HLS ARRAY_PARTITION variable=frame) - - // We loop for every incoming pixel - cvt_loop_1: - for (int pin_row=0; pin_row < H+2; pin_row++) { - cvt_loop_2: - for (int pin_col=0; pin_col < W+2; pin_col++) { - // get the pixel - pixel_in_t pixel; - pixel = in.read(); - // row buffer write (in which buffer row we write the pixel) - int row0_buffer_write = (pin_row % 3) == 0; - int row1_buffer_write = (pin_row % 3) == 1; - // first row buffer - int row0 = (pin_row <= 2) | ((pin_row % 3) == 2); - int row1 = !row0 & ((pin_row % 3) == 0); - // we write the pixel into the buffer - if (row0_buffer_write) buffer0[pin_col] = pixel; else if (row1_buffer_write) buffer1[pin_col] = pixel; else buffer2[pin_col] = pixel; - // build the frame - pixel_in_t p0, p1, p2, p3, p4, p5, p6, p7, p8; - int shift_frame = (pin_row>1) & (pin_col > 2); - int send_frame = (pin_row>1) & (pin_col > 1); - pixel_in_t pixel_b0, pixel_b1, pixel_b2; - pixel_b0 = buffer0[pin_col]; - pixel_b1 = buffer1[pin_col]; - pixel_b2 = buffer2[pin_col]; - // p0, p1, p2 - if (shift_frame) {p0 = p1;} else if (pin_col==0) {if (row0) p0 = pixel_b0; else if (row1) p0 = pixel_b1; else p0 = pixel_b2;} - if (shift_frame) {p1 = p2;} else if (pin_col==1) {if (row0) p1 = pixel_b0; else if (row1) p1 = pixel_b1; else p1 = pixel_b2;} - if (row0) p2 = pixel_b0; else if (row1) p2 = pixel_b1; else p2 = pixel_b2; - // p3, p4, p5 - if (shift_frame) {p3 = p4;} else if (pin_col==0) {if (row0) p3 = pixel_b1; else if (row1) p3 = pixel_b2; else p3 = pixel_b0;} - if (shift_frame) {p4 = p5;} else if (pin_col==1) {if (row0) p4 = pixel_b1; else if (row1) p4 = pixel_b2; else p4 = pixel_b0;} - if (row0) p5 = pixel_b1; else if (row1) p5 = pixel_b2; else p5 = pixel_b0; - // p6, p7, p8 - if (shift_frame) {p6 = p7;} else if (pin_col==0) {if (row0) p6 = pixel_b2; else if (row1) p6 = pixel_b0; else p6 = pixel_b1;} - if (shift_frame) {p7 = p8;} else if (pin_col==1) {if (row0) p7 = pixel_b2; else if (row1) p7 = pixel_b0; else p7 = pixel_b1;} - if (row0) p8 = pixel_b2; else if (row1) p8 = pixel_b0; else p8 = pixel_b1; - - if (send_frame) { - frame.pixel[0] = p0; frame.pixel[1] = p1; frame.pixel[2] = p2; - frame.pixel[3] = p3; frame.pixel[4] = p4; frame.pixel[5] = p5; - frame.pixel[6] = p6; frame.pixel[7] = p7; frame.pixel[8] = p8; - out << frame; - #ifdef DEBUG_VERBOSE - printf("cvt_%d: frame sent:\n", id); - for (int cpi=0; cpi &in, hls::stream &k_in, hls::stream &out, int id) { - -#ifdef DEBUG_VERBOSE - printf("mul_%d: start\n", id); -#endif - - // 
first we read the kernels - frame_t kernel[CPI]; - DO_PRAGMA(HLS ARRAY_PARTITION variable=kernel dim=0) - frame_t data_in; - -#ifdef LOAD_MODEL - - mul_o_iter_loop: - for (int o_iter = 0; o_iter < O_ITER; o_iter++){ - mul_i_iter_loop: - for(int i_iter = 0; i_iter < I_ITER; i_iter++){ - //we load the kernels into pack of frames - loop_mul_kernels_load_cpo: - for (int cpi=0; cpi %6.4f\n", cpo, float(sum[cpo])); - #endif - p_out.pixel[cpo] = sum[cpo]; - sum[cpo] = 0.f; - } - out << p_out; - } - } //i_iter -} //o_iter - -#endif - - -#ifdef DEBUG_VERBOSE - printf("mul_%d: end\n", id); -#endif -} - -// ------------------------------------------------------------------------------- -// add: This function performs the addition of all subpixels for the same channel. -// It adds also the corresponding bias. -// LOOP FLOW -// for o_iter 0 .. n -// receive bias[b..b+3] -// init buff_o_channels with bias -// for i_iter 0 .. n -// receive data[do..d+3] -// buff_o_channels = buff_o_channels + data -// -// for num_iterations -// for CPO -// send data to write module -// -// Arguments: -// in: input streams data -// b_in: input stream bias -// out: output stream -// -static void add(int H, int W, int I_ITER, int O_ITER, hls::stream &in, hls::stream &b_in, hls::stream &out) { - -#ifdef DEBUG_VERBOSE - printf("add: start\n"); -#endif - - data_type bias[CPO]; - - //number of iterations by CPI || CPO channels - int num_iterations = W * H; - - //Buffer for all data and CPO channels - data_type buff_o_channels[CPO][WHMAX]; - DO_PRAGMA(HLS ARRAY_PARTITION variable=buff_o_channels dim=0 block factor=CPO) - - //We read Bias in O_iter packs of CPO size - add_o_iter_loop: - for (int o_iter = 0; o_iter &in, hls::stream &k_in, hls::stream &b_in, hls::stream &out) { - - // streams - static hls::stream str_pad_cvt; // padding->cvt - static hls::stream str_cvt_mul; // cvt->mul - static hls::stream str_mul_add; // mul->add - - // topology - #pragma HLS dataflow - padding(H, W, I_ITER * O_ITER, in, str_pad_cvt); // padding - cvt(H, W, I_ITER, O_ITER, str_pad_cvt, str_cvt_mul, 0); // cvt - mul(H, W, I_ITER, O_ITER, str_cvt_mul, k_in, str_mul_add, 0); // mul - add(H, W, I_ITER, O_ITER, str_mul_add, b_in, out); // add -} - -void k_conv2D_K3x3_S1x1_P1x1_BS1_ap(pixel_in_t *ptr_data, int H, int W, int I, data_type *ptr_kernel, data_type *ptr_bias, pixel_out_t *ptr_out, int O) { - - #pragma HLS INTERFACE s_axilite port=W bundle=control - #pragma HLS INTERFACE s_axilite port=H bundle=control - #pragma HLS INTERFACE s_axilite port=I bundle=control - #pragma HLS INTERFACE s_axilite port=O bundle=control - #pragma HLS INTERFACE m_axi port=ptr_data offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 - #pragma HLS INTERFACE m_axi port=ptr_kernel offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 - #pragma HLS INTERFACE m_axi port=ptr_bias offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 - #pragma HLS INTERFACE m_axi port=ptr_out offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 - #pragma HLS INTERFACE s_axilite port=return bundle=control - - // ptr_data struct to be packed as a single element vector (to improve memory read) - // the compiler will do full structure access (all elements of structure) - #pragma HLS data_pack variable = ptr_data - #pragma HLS data_pack variable = ptr_out - - int I_ITER = I/CPI; - int O_ITER = O/CPO; - - // input and output streams - static hls::stream out_read; - static hls::stream out_read_kernel; 
- static hls::stream out_read_bias; - static hls::stream out_conv; - - // stream sizes - #pragma HLS STREAM variable = out_read depth = 32 - #pragma HLS STREAM variable = out_read_kernel depth = 32 - #pragma HLS STREAM variable = out_read_bias depth = 32 - #pragma HLS STREAM variable = out_conv depth = 32 - #pragma HLS STREAM variable = out_relu depth = 32 - - #pragma HLS dataflow - read_input(H, W, I, O, I_ITER, O_ITER, ptr_data, ptr_kernel, ptr_bias, out_read_kernel, out_read_bias, out_read); - conv(H, W, I, O, I_ITER, O_ITER, out_read, out_read_kernel, out_read_bias, out_conv); - write_output(H, W, O_ITER, ptr_out, out_conv); -} - -} // end extern "C" +//KERNEL_CONV2D_4.cpp +//Modified by: Jorge García Martinez +//Date: 17/09/2020 +//Description: Based on kenel_conv2d_3.cpp. The goal of this code is to perform convolutions with a large number of inputs +//and outputs.For this, we use iteratively a limited number of input and output channels in the kernel. +//In all functions are used two loops for output and input iterations. In add function is added a buffer which stores +//the data that It should be written into the memory. + + + +#include +#include +#include + +#include + +// #define DEBUG_VERBOSE + +extern "C" { + +// #define data_type ap_fixed<8,4,AP_TRN,AP_WRAP> +#define data_type float + +// To allow using defines inside Xilinx pragmas +#define PRAGMA_SUB(x) _Pragma (#x) +#define DO_PRAGMA(x) PRAGMA_SUB(x) + +// Fixed parameters (optimized at compilation/synthesis time) +#define KW 3 // kernel width +#define KH 3 // kernel height +#define CPI 4 // channels per input port +#define CPO 4 // channels per output port + +#define WMAX 256 +#define WHMAX 256*256 + +#define LOAD_MODEL +#define READ_MODEL +#define READ_INPUT +#define WRITE_OUTPUT + +// pixel_in +struct pixel_in_t { + data_type pixel[CPI]; +}; + +struct pixel_out_t { + data_type pixel[CPO]; +}; + +// frames struct +struct frame_t { + pixel_in_t pixel[9]; +}; + +// --------------------------------------------------------------------------------------- +// read_bias. Reading bias from memory and sending to add module. 
+// +// Arguments: +// b_ptr : pointer to bias +// b_out : output streams +// +static void read_bias(int O_ITER, data_type *b_ptr, hls::stream &b_out){ + +#ifdef DEBUG_VERBOSE + printf("read_bias: start\n"); +#endif + pixel_out_t bias; + #pragma HLS ARRAY_PARTITION variable=bias dim=0 + + for (int o_iter = 0; o_iter < O_ITER; o_iter++){ + // printf("o_iter = %d \n ", o_iter); + //Sending bias to add in pack of CPO bias + read_loop_bias_load: + for (int b=0; b &k_out){ + +#ifdef DEBUG_VERBOSE + printf("read_kernel: start\n"); +#endif + + frame_t frame_k; + #pragma HLS ARRAY_PARTITION variable=frame_k dim=0 + //Sending kernels to mul in pack of CPI*CPO kernels + int kernel_size_cpo = CPO*KH*KW; //kernels size each i_iter + int i_offset = I_ITER * CPI * CPO * KH * KW; //addr_k offset for each i_iter + int cpo = 0; //index for kernel size + int kx = 0; //index for channels + + read_input_o_iter_loop: + for (int o_iter = 0; o_iter < O_ITER; o_iter++){ + read_input_i_iter_loop: + for (int i_iter = 0; i_iter < I_ITER; i_iter++){ + + read_loop_kernel_load_ext: + for(int i = 0; i < CPI; i++){ + // printf("i = %d -- kernel_size_cpo = %d \n", i, kernel_size_cpo); + read_loop_kernel_load_int: + for (int j = 0; j < kernel_size_cpo; j++) { + int addr_k = j + i*kernel_size_cpo*I_ITER + i_iter*i_offset + o_iter*kernel_size_cpo; + data_type v = k_ptr[addr_k]; + frame_k.pixel[kx].pixel[cpo] = v; + + #ifdef DEBUG_VERBOSE + printf("[%d]:", addr_k); + printf("%6.4f ", v); + #endif + + kx = kx + 1; + if (kx == 9) { + // printf("\n"); + kx = 0; + cpo = cpo + 1; + if (cpo == CPO) { + cpo = 0; + k_out << frame_k; + } + } + } + } + } //i_iter + } //o_iter + +#ifdef DEBUG_VERBOSE + printf("read_kernel: end\n"); +#endif + +} + +// -------------------------------------------------------------------------------------- +// read_data: Reading data from memory and sending to conv module +// Arguments: +// ptr : Pointer to input data (in) +// out : data output stream (out) +// +static void read_data(int H, int W, int I_ITER, int O_ITER, pixel_in_t *ptr, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("read_data: start\n"); +#endif + + read_input_o_iter_loop: + for (int o_iter = 0; o_iter < O_ITER; o_iter++){ + //Sending data to padding in pack of CPI channels + read_loop_data_load_i: + for (int r=0; r &in, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("padding: start\n"); +#endif + +//we init zero only first time + +pixel_in_t data; +DO_PRAGMA(HLS ARRAY_PARTITION variable=data complete) + +pixel_in_t zero; +DO_PRAGMA(HLS ARRAY_PARTITION variable=zero complete) + +for (int cpi=0; cpi &in, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("relu: start\n"); +#endif + + int data_size = W * H * O; + for (int i=0; i < data_size; i++) { + #pragma HLS PIPELINE II=1 + data_type data = in.read(); + if (data < 0) data = 0.f; + out << data; + } + +#ifdef DEBUG_VERBOSE + printf("relu: end\n"); +#endif +} + +// -------------------------------------------------------------------------------- +// write_output: Writes data comming from one stream into memory +// LOOP FLOW: +// for o_iter 0 .. n +// write data[do .. 
do+3] +// +// d = d + 4 +// +// Arguments: +// ptr: memory address pointer +// in: input stream +// +static void write_output(int H, int W, int O_ITER, pixel_out_t *ptr, hls::stream &in) { + +#ifdef DEBUG_VERBOSE + printf("write_output: start\n"); +#endif + + + write_output_data_size_loop: + for (int i=0; i &in, hls::stream &out, int id) { + +#ifdef DEBUG_VERBOSE + printf("cvt_%d: start\n", id); +#endif + +cvt_o_iter_loop: +for (int o_iter = 0; o_iter < O_ITER; o_iter++){ + cvt_i_iter_loop: + for(int i_iter = 0; i_iter < I_ITER; i_iter++){ + + // Now we process the input data and convert the data into frames + + // buffers (keep three rows) + pixel_in_t buffer0[WMAX+2]; + pixel_in_t buffer1[WMAX+2]; + pixel_in_t buffer2[WMAX+2]; + DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer0 cyclic dim=1 factor=CPI) + DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer1 cyclic dim=1 factor=CPI) + DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer2 cyclic dim=1 factor=CPI) + + // frame + frame_t frame; + DO_PRAGMA(HLS ARRAY_PARTITION variable=frame) + + // We loop for every incoming pixel + cvt_loop_1: + for (int pin_row=0; pin_row < H+2; pin_row++) { + cvt_loop_2: + for (int pin_col=0; pin_col < W+2; pin_col++) { + // get the pixel + pixel_in_t pixel; + pixel = in.read(); + // row buffer write (in which buffer row we write the pixel) + int row0_buffer_write = (pin_row % 3) == 0; + int row1_buffer_write = (pin_row % 3) == 1; + // first row buffer + int row0 = (pin_row <= 2) | ((pin_row % 3) == 2); + int row1 = !row0 & ((pin_row % 3) == 0); + // we write the pixel into the buffer + if (row0_buffer_write) buffer0[pin_col] = pixel; else if (row1_buffer_write) buffer1[pin_col] = pixel; else buffer2[pin_col] = pixel; + // build the frame + pixel_in_t p0, p1, p2, p3, p4, p5, p6, p7, p8; + int shift_frame = (pin_row>1) & (pin_col > 2); + int send_frame = (pin_row>1) & (pin_col > 1); + pixel_in_t pixel_b0, pixel_b1, pixel_b2; + pixel_b0 = buffer0[pin_col]; + pixel_b1 = buffer1[pin_col]; + pixel_b2 = buffer2[pin_col]; + // p0, p1, p2 + if (shift_frame) {p0 = p1;} else if (pin_col==0) {if (row0) p0 = pixel_b0; else if (row1) p0 = pixel_b1; else p0 = pixel_b2;} + if (shift_frame) {p1 = p2;} else if (pin_col==1) {if (row0) p1 = pixel_b0; else if (row1) p1 = pixel_b1; else p1 = pixel_b2;} + if (row0) p2 = pixel_b0; else if (row1) p2 = pixel_b1; else p2 = pixel_b2; + // p3, p4, p5 + if (shift_frame) {p3 = p4;} else if (pin_col==0) {if (row0) p3 = pixel_b1; else if (row1) p3 = pixel_b2; else p3 = pixel_b0;} + if (shift_frame) {p4 = p5;} else if (pin_col==1) {if (row0) p4 = pixel_b1; else if (row1) p4 = pixel_b2; else p4 = pixel_b0;} + if (row0) p5 = pixel_b1; else if (row1) p5 = pixel_b2; else p5 = pixel_b0; + // p6, p7, p8 + if (shift_frame) {p6 = p7;} else if (pin_col==0) {if (row0) p6 = pixel_b2; else if (row1) p6 = pixel_b0; else p6 = pixel_b1;} + if (shift_frame) {p7 = p8;} else if (pin_col==1) {if (row0) p7 = pixel_b2; else if (row1) p7 = pixel_b0; else p7 = pixel_b1;} + if (row0) p8 = pixel_b2; else if (row1) p8 = pixel_b0; else p8 = pixel_b1; + + if (send_frame) { + frame.pixel[0] = p0; frame.pixel[1] = p1; frame.pixel[2] = p2; + frame.pixel[3] = p3; frame.pixel[4] = p4; frame.pixel[5] = p5; + frame.pixel[6] = p6; frame.pixel[7] = p7; frame.pixel[8] = p8; + out << frame; + #ifdef DEBUG_VERBOSE + printf("cvt_%d: frame sent:\n", id); + for (int cpi=0; cpi &in, hls::stream &k_in, hls::stream &out, int id) { + +#ifdef DEBUG_VERBOSE + printf("mul_%d: start\n", id); +#endif + + // first we read the kernels + frame_t kernel[CPI]; 
+ DO_PRAGMA(HLS ARRAY_PARTITION variable=kernel dim=0) + frame_t data_in; + +#ifdef LOAD_MODEL + + mul_o_iter_loop: + for (int o_iter = 0; o_iter < O_ITER; o_iter++){ + mul_i_iter_loop: + for(int i_iter = 0; i_iter < I_ITER; i_iter++){ + //we load the kernels into pack of frames + loop_mul_kernels_load_cpo: + for (int cpi=0; cpi %6.4f\n", cpo, float(sum[cpo])); + #endif + p_out.pixel[cpo] = sum[cpo]; + sum[cpo] = 0.f; + } + out << p_out; + } + } //i_iter +} //o_iter + +#endif + + +#ifdef DEBUG_VERBOSE + printf("mul_%d: end\n", id); +#endif +} + +// ------------------------------------------------------------------------------- +// add: This function performs the addition of all subpixels for the same channel. +// It adds also the corresponding bias. +// LOOP FLOW +// for o_iter 0 .. n +// receive bias[b..b+3] +// init buff_o_channels with bias +// for i_iter 0 .. n +// receive data[do..d+3] +// buff_o_channels = buff_o_channels + data +// +// for num_iterations +// for CPO +// send data to write module +// +// Arguments: +// in: input streams data +// b_in: input stream bias +// out: output stream +// +static void add(int H, int W, int I_ITER, int O_ITER, hls::stream &in, hls::stream &b_in, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("add: start\n"); +#endif + + data_type bias[CPO]; + + //number of iterations by CPI || CPO channels + int num_iterations = W * H; + + //Buffer for all data and CPO channels + data_type buff_o_channels[CPO][WHMAX]; + DO_PRAGMA(HLS ARRAY_PARTITION variable=buff_o_channels dim=0 block factor=CPO) + + //We read Bias in O_iter packs of CPO size + add_o_iter_loop: + for (int o_iter = 0; o_iter &in, hls::stream &k_in, hls::stream &b_in, hls::stream &out) { + + // streams + static hls::stream str_pad_cvt; // padding->cvt + static hls::stream str_cvt_mul; // cvt->mul + static hls::stream str_mul_add; // mul->add + + int ITER = O_ITER*I_ITER; + // topology + #pragma HLS dataflow + padding(H, W, ITER, in, str_pad_cvt); // padding + cvt(H, W, I_ITER, O_ITER, str_pad_cvt, str_cvt_mul, 0); // cvt + mul(H, W, I_ITER, O_ITER, str_cvt_mul, k_in, str_mul_add, 0); // mul + add(H, W, I_ITER, O_ITER, str_mul_add, b_in, out); // add +} + +void k_conv2D_K3x3_S1x1_P1x1_BS1_ap(pixel_in_t *ptr_data, int H, int W, int I, data_type *ptr_kernel, data_type *ptr_bias, pixel_out_t *ptr_out, int O) { + + #pragma HLS INTERFACE s_axilite port=W bundle=control + #pragma HLS INTERFACE s_axilite port=H bundle=control + #pragma HLS INTERFACE s_axilite port=I bundle=control + #pragma HLS INTERFACE s_axilite port=O bundle=control + #pragma HLS INTERFACE m_axi port=ptr_data offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_kernel offset=slave bundle=gmem1 max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_bias offset=slave bundle=gmem2 max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_out offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE s_axilite port=return bundle=control + + // ptr_data struct to be packed as a single element vector (to improve memory read) + // the compiler will do full structure access (all elements of structure) + #pragma HLS data_pack variable = ptr_data + #pragma HLS data_pack variable = ptr_out + + int I_ITER = I/CPI; + int O_ITER = O/CPO; + + // input and output streams + static hls::stream out_read_data; + static hls::stream out_read_kernel; + static hls::stream 
out_read_bias; + static hls::stream out_conv; + + // stream sizes + #pragma HLS STREAM variable = out_read_data depth = 32 + #pragma HLS STREAM variable = out_read_kernel depth = 32 + #pragma HLS STREAM variable = out_read_bias depth = 32 + #pragma HLS STREAM variable = out_conv depth = 32 + // #pragma HLS STREAM variable = out_relu depth = 32 + + #pragma HLS dataflow + read_data(H, W, I_ITER, O_ITER, ptr_data, out_read_data); + read_bias(O_ITER, ptr_bias, out_read_bias); + read_kernel(O_ITER, I_ITER, ptr_kernel, out_read_kernel); + conv(H, W, I, O, I_ITER, O_ITER, out_read_data, out_read_kernel, out_read_bias, out_conv); + write_output(H, W, O_ITER, ptr_out, out_conv); +} + +} // end extern "C" From 089a3050645266bbd5240494a2b84d9d06e37d0a Mon Sep 17 00:00:00 2001 From: Jose Flich Date: Wed, 21 Oct 2020 07:33:32 +0000 Subject: [PATCH 04/15] improved conv --- .../kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp | 122 ++++++++---- .../src/test_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp | 184 +++++++++++++----- 2 files changed, 212 insertions(+), 94 deletions(-) diff --git a/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp b/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp index 28a2441d9..3e91eebb2 100644 --- a/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp +++ b/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1_ap.cpp @@ -14,7 +14,7 @@ #include -// #define DEBUG_VERBOSE +#define DEBUG_VERBOSE extern "C" { @@ -28,8 +28,8 @@ extern "C" { // Fixed parameters (optimized at compilation/synthesis time) #define KW 3 // kernel width #define KH 3 // kernel height -#define CPI 4 // channels per input port -#define CPO 4 // channels per output port +#define CPI 2 // channels per input port +#define CPO 2 // channels per output port #define WMAX 256 #define WHMAX 256*256 @@ -68,7 +68,27 @@ static void read_bias(int O_ITER, data_type *b_ptr, hls::stream &b_ pixel_out_t bias; #pragma HLS ARRAY_PARTITION variable=bias dim=0 - for (int o_iter = 0; o_iter < O_ITER; o_iter++){ + // we read the bias + int size = O_ITER * CPO; + int cpo = 0; + for (int i=0; i &b_ bias.pixel[b] = v; } b_out << bias; - } + }*/ #ifdef DEBUG_VERBOSE printf("read_bias: end\n"); #endif @@ -97,8 +117,38 @@ static void read_kernel(int O_ITER, int I_ITER, data_type *k_ptr, hls::stream &in, #endif // first we read the kernels - frame_t kernel[CPI]; + frame_t kernel[CPO]; DO_PRAGMA(HLS ARRAY_PARTITION variable=kernel dim=0) frame_t data_in; @@ -419,19 +470,19 @@ static void mul(int H, int W, int I_ITER, int O_ITER, hls::stream &in, for(int i_iter = 0; i_iter < I_ITER; i_iter++){ //we load the kernels into pack of frames loop_mul_kernels_load_cpo: - for (int cpi=0; cpi &in, loop_mul_cpo: for (int cpo=0; cpo & } //i_iter #ifdef DEBUG_VERBOSE - printf("CH %d: ", o_iter*CPO); - for (int it=0; it /* printf, scanf, NULL */ #include /* malloc, free, rand */ @@ -23,27 +58,31 @@ cl::CommandQueue q; cl::Program program; -#define W 256 //256 -#define H 256 //256 -#define C 16 //I -#define COUT 16 //O +#define W 4 // 256 //256 +#define H 4 // 256 //256 +#define GI 2 +#define CPI 2 // 16 +#define I GI * CPI +#define GO 2 // 16 +#define CPO 2 +#define O GO * CPO #define KW 3 #define KH 3 // buffers -data_type data_in[ W * H * C ] __attribute__ ((__aligned__(16))); -data_type kernel [ KW * KH * C * COUT] __attribute__ ((__aligned__(16))); -data_type bias [ COUT ] __attribute__ ((__aligned__(16))); -data_type out [ W * H * COUT ] __attribute__ ((__aligned__(16))); -data_type out_cpu[ W * H * COUT ] __attribute__ ((__aligned__(16))); +data_type data_in[ GI * W * 
H * CPI ] __attribute__ ((__aligned__(16))); +data_type kernel [ GO * GI * CPO * CPI * KW * KH ] __attribute__ ((__aligned__(16))); +data_type bias [ O ] __attribute__ ((__aligned__(16))); +data_type out [ GO * W * H * CPO ] __attribute__ ((__aligned__(16))); +data_type out_cpu[ GO * W * H * CPO ] __attribute__ ((__aligned__(16))); void cpu_conv2d() { - int size_out = W * H * COUT; + int size_out = GO * W * H * CPO; for (int i=0; i 0.001) { printf("Results mismatch at cout %d h %d w %d: %6.4f %6.4f (diff %6.4f)\n", cout, h, w, float(out_cpu[addr_o]), float(out[addr_o]), fabs(float(out_cpu[addr_o]-out[addr_o]))); error = 1; @@ -231,7 +309,7 @@ int main(int argc, char **argv) { return EXIT_FAILURE; } - printf("Test CONV: [WxHxC] = [%dx%dx%d] -> [WxHxC] = [%dx%dx%d] (kernel [%dx%d], stride [1x1], padding [1x1])\n", W, H, C, W, H, COUT, KW, KH); + printf("Test CONV: [GIxWxHxCPI] = [%dx%dx%dx%d] -> [GOxWxHxCPO] = [%d%dx%dx%d] (kernel [%dx%d], stride [1x1], padding [1x1])\n", GI, W, H, CPI, GO, W, H, CPO, KW, KH); std::string binaryFile = argv[1]; cl_int err; @@ -254,10 +332,10 @@ int main(int argc, char **argv) { OCL_CHECK(err, kernel_conv2d_2 = cl::Kernel(program,"k_conv2D_K3x3_S1x1_P1x1_BS1_ap", &err)); std::cout << "Kernel sucessfully created" << std::endl ; - size_t size_data_in_bytes = W*H*C*sizeof(data_type); - size_t size_output_in_bytes = W*H*COUT * sizeof(data_type); - size_t size_kernel_in_bytes = KW * KH * C * COUT * sizeof(data_type); - size_t size_bias_in_bytes = COUT * sizeof(data_type); + size_t size_data_in_bytes = W * H * I * sizeof(data_type); + size_t size_output_in_bytes = W * H * O * sizeof(data_type); + size_t size_kernel_in_bytes = KW * KH * I * O * sizeof(data_type); + size_t size_bias_in_bytes = O * sizeof(data_type); // Allocate memory on the host and fill with random data. //----------------------------- @@ -268,25 +346,27 @@ int main(int argc, char **argv) { std::cout << "Filling buffer with useful data" << std::endl ; int addr = 0; - for (int h=0; h Date: Fri, 23 Oct 2020 16:00:45 +0000 Subject: [PATCH 05/15] Conv2d: o_iter in host --- .../kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp | 667 ++++++++++++++++++ .../src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp | 490 +++++++++++++ 2 files changed, 1157 insertions(+) create mode 100644 fpga_kernels/kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp create mode 100644 fpga_kernels/test_fpga/src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp diff --git a/fpga_kernels/kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp b/fpga_kernels/kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp new file mode 100644 index 000000000..fbfc6a162 --- /dev/null +++ b/fpga_kernels/kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp @@ -0,0 +1,667 @@ +//KERNEL_CONV2D_4.cpp +//Modified by: Jorge García Martinez +//Date: 17/09/2020 +//Description: Based on kenel_conv2d_3.cpp. The goal of this code is to perform convolutions with a large number of inputs +//and outputs.For this, we use iteratively a limited number of input and output channels in the kernel. +//In all functions are used two loops for output and input iterations. In add function is added a buffer which stores +//the data that It should be written into the memory. 
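
The header comment above only sketches the scheme in words, so here is a plain-C++ reference of the same channel-group tiling, written against the layouts documented in the accompanying test (data_in: GI x H x W x CPI, kernel: GO x GI x CPO x CPI x KH x KW, bias: O, data_out: GO x H x W x CPO, with GI = I/CPI, GO = O/CPO, 3x3 kernel, stride 1x1, padding 1x1). This is an illustrative host-side sketch, not the HLS kernel: the function name, the use of std::vector and the exact index arithmetic are assumptions drawn from those documented layouts.

// Reference 3x3 convolution over channel groups (sketch, not part of the patch).
#include <vector>

constexpr int CPI = 4, CPO = 4, KH = 3, KW = 3;   // values used by this kernel

void reference_conv2d(int H, int W, int I, int O,
                      const std::vector<float> &data_in,   // GI x H x W x CPI
                      const std::vector<float> &kernel,    // GO x GI x CPO x CPI x KH x KW
                      const std::vector<float> &bias,      // O
                      std::vector<float> &data_out) {      // GO x H x W x CPO
  int GI = I / CPI, GO = O / CPO;
  for (int go = 0; go < GO; go++)          // group of CPO output channels
    for (int o = 0; o < CPO; o++)
      for (int h = 0; h < H; h++)
        for (int w = 0; w < W; w++) {
          float sum = bias[go * CPO + o];
          for (int gi = 0; gi < GI; gi++)  // group of CPI input channels
            for (int i = 0; i < CPI; i++)
              for (int kh = 0; kh < KH; kh++)
                for (int kw = 0; kw < KW; kw++) {
                  int ih = h + kh - 1, iw = w + kw - 1;     // stride 1x1, padding 1x1
                  if (ih < 0 || ih >= H || iw < 0 || iw >= W) continue;
                  int a_in = (gi * H * W * CPI) + (ih * W * CPI) + (iw * CPI) + i;
                  int a_k  = (((((go * GI + gi) * CPO + o) * CPI + i) * KH + kh) * KW) + kw;
                  sum += data_in[a_in] * kernel[a_k];
                }
          data_out[(go * H * W * CPO) + (h * W * CPO) + (w * CPO) + o] = sum;
        }
}

The HLS kernel below produces the same result but streams the work: data is read in pixel_in_t packs of CPI values, padded, turned into 3x3 frames, multiplied against the kernel frames and accumulated per group of CPO outputs.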
+ + + +#include +#include +#include + +#include + +// #define DEBUG_VERBOSE + +extern "C" { + +// #define data_type ap_fixed<8,4,AP_TRN,AP_WRAP> +#define data_type float + +// To allow using defines inside Xilinx pragmas +#define PRAGMA_SUB(x) _Pragma (#x) +#define DO_PRAGMA(x) PRAGMA_SUB(x) + +// Fixed parameters (optimized at compilation/synthesis time) +#define KW 3 // kernel width +#define KH 3 // kernel height +#define CPI 4 // channels per input port +#define CPO 4 // channels per output port + +#define WMAX 256 +#define WHMAX 256*256 + +#define LOAD_MODEL +#define READ_MODEL +#define READ_INPUT +#define WRITE_OUTPUT + +// pixel_in +struct pixel_in_t { + data_type pixel[CPI]; +}; + +struct pixel_out_t { + data_type pixel[CPO]; +}; + +// frames struct +struct frame_t { + pixel_in_t pixel[9]; +}; + +// --------------------------------------------------------------------------------------- +// read_bias. Reading bias from memory and sending to add module. +// +// Arguments: +// b_ptr : pointer to bias +// b_out : output streams +// +static void read_bias(int offset_bias, data_type *b_ptr, hls::stream &b_out){ + +#ifdef DEBUG_VERBOSE + printf("read_bias: start\n"); +#endif + pixel_out_t bias; + #pragma HLS ARRAY_PARTITION variable=bias dim=0 + + // we read the bias + for (int i=0; i &k_out){ + +#ifdef DEBUG_VERBOSE + printf("read_kernel: start\n"); +#endif + + // we read all the kernels and send it through the stream + frame_t frame_k; + #pragma HLS ARRAY_PARTITION variable=frame_k dim=0 + int cpo = 0; + int p = 0; + + int size = KW * KH * CPO * I_ITER * CPI; + read_kernel_loop: + for (int i=0; i &out) { + +#ifdef DEBUG_VERBOSE + printf("read_data: start\n"); +#endif + + + read_loop_data_load_i: + for (int r=0; r &in, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("padding: start\n"); +#endif + +//we init zero only first time + +pixel_in_t data; +DO_PRAGMA(HLS ARRAY_PARTITION variable=data complete) + +pixel_in_t zero; +DO_PRAGMA(HLS ARRAY_PARTITION variable=zero complete) + +for (int cpi=0; cpi &in, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("relu: start\n"); +#endif + + int data_size = W * H * O; + for (int i=0; i < data_size; i++) { + #pragma HLS PIPELINE II=1 + data_type data = in.read(); + if (data < 0) data = 0.f; + out << data; + } + +#ifdef DEBUG_VERBOSE + printf("relu: end\n"); +#endif +} + +// -------------------------------------------------------------------------------- +// write_output: Writes data comming from one stream into memory +// LOOP FLOW: +// for o_iter 0 .. n +// write data[do .. 
do+3] +// +// d = d + 4 +// +// Arguments: +// ptr: memory address pointer +// in: input stream +// +static void write_output(int H, int W, int offset_data_out, pixel_out_t *ptr, hls::stream &in) { + +#ifdef DEBUG_VERBOSE + printf("write_output: start\n"); +#endif + + + write_output_data_size_loop: + for (int i=0; i &in, hls::stream &out, int id) { + +#ifdef DEBUG_VERBOSE + printf("cvt_%d: start\n", id); +#endif + + + cvt_i_iter_loop: + for(int i_iter = 0; i_iter < I_ITER; i_iter++){ + + // Now we process the input data and convert the data into frames + + // buffers (keep three rows) + pixel_in_t buffer0[WMAX+2]; + pixel_in_t buffer1[WMAX+2]; + pixel_in_t buffer2[WMAX+2]; + DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer0 cyclic dim=1 factor=CPI) + DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer1 cyclic dim=1 factor=CPI) + DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer2 cyclic dim=1 factor=CPI) + + // frame + frame_t frame; + DO_PRAGMA(HLS ARRAY_PARTITION variable=frame) + + // We loop for every incoming pixel + cvt_loop_1: + for (int pin_row=0; pin_row < H+2; pin_row++) { + cvt_loop_2: + for (int pin_col=0; pin_col < W+2; pin_col++) { + // get the pixel + pixel_in_t pixel; + pixel = in.read(); + // row buffer write (in which buffer row we write the pixel) + int row0_buffer_write = (pin_row % 3) == 0; + int row1_buffer_write = (pin_row % 3) == 1; + // first row buffer + int row0 = (pin_row <= 2) | ((pin_row % 3) == 2); + int row1 = !row0 & ((pin_row % 3) == 0); + // we write the pixel into the buffer + if (row0_buffer_write) buffer0[pin_col] = pixel; else if (row1_buffer_write) buffer1[pin_col] = pixel; else buffer2[pin_col] = pixel; + // build the frame + pixel_in_t p0, p1, p2, p3, p4, p5, p6, p7, p8; + int shift_frame = (pin_row>1) & (pin_col > 2); + int send_frame = (pin_row>1) & (pin_col > 1); + pixel_in_t pixel_b0, pixel_b1, pixel_b2; + pixel_b0 = buffer0[pin_col]; + pixel_b1 = buffer1[pin_col]; + pixel_b2 = buffer2[pin_col]; + // p0, p1, p2 + if (shift_frame) {p0 = p1;} else if (pin_col==0) {if (row0) p0 = pixel_b0; else if (row1) p0 = pixel_b1; else p0 = pixel_b2;} + if (shift_frame) {p1 = p2;} else if (pin_col==1) {if (row0) p1 = pixel_b0; else if (row1) p1 = pixel_b1; else p1 = pixel_b2;} + if (row0) p2 = pixel_b0; else if (row1) p2 = pixel_b1; else p2 = pixel_b2; + // p3, p4, p5 + if (shift_frame) {p3 = p4;} else if (pin_col==0) {if (row0) p3 = pixel_b1; else if (row1) p3 = pixel_b2; else p3 = pixel_b0;} + if (shift_frame) {p4 = p5;} else if (pin_col==1) {if (row0) p4 = pixel_b1; else if (row1) p4 = pixel_b2; else p4 = pixel_b0;} + if (row0) p5 = pixel_b1; else if (row1) p5 = pixel_b2; else p5 = pixel_b0; + // p6, p7, p8 + if (shift_frame) {p6 = p7;} else if (pin_col==0) {if (row0) p6 = pixel_b2; else if (row1) p6 = pixel_b0; else p6 = pixel_b1;} + if (shift_frame) {p7 = p8;} else if (pin_col==1) {if (row0) p7 = pixel_b2; else if (row1) p7 = pixel_b0; else p7 = pixel_b1;} + if (row0) p8 = pixel_b2; else if (row1) p8 = pixel_b0; else p8 = pixel_b1; + + if (send_frame) { + frame.pixel[0] = p0; frame.pixel[1] = p1; frame.pixel[2] = p2; + frame.pixel[3] = p3; frame.pixel[4] = p4; frame.pixel[5] = p5; + frame.pixel[6] = p6; frame.pixel[7] = p7; frame.pixel[8] = p8; + out << frame; + #ifdef DEBUG_VERBOSE + printf("cvt_%d: frame sent:\n", id); + for (int cpi=0; cpi &in, hls::stream &k_in, hls::stream &out, int id) { + +#ifdef DEBUG_VERBOSE + printf("mul_%d: start\n", id); +#endif + + // first we read the kernels + frame_t kernel[CPO]; + DO_PRAGMA(HLS ARRAY_PARTITION variable=kernel dim=0) + 
frame_t data_in; + +#ifdef LOAD_MODEL + + mul_i_iter_loop: + for(int i_iter = 0; i_iter < I_ITER; i_iter++){ + //we load the kernels into pack of frames + loop_mul_kernels_load_cpo: + for (int cpo=0; cpo %6.4f\n", cpo, float(sum[cpo])); + #endif + p_out.pixel[cpo] = sum[cpo]; + sum[cpo] = 0.f; + } + out << p_out; + } + } //i_iter + +#endif + + +#ifdef DEBUG_VERBOSE + printf("mul_%d: end\n", id); +#endif +} + +// ------------------------------------------------------------------------------- +// add: This function performs the addition of all subpixels for the same channel. +// It adds also the corresponding bias. +// LOOP FLOW +// for o_iter 0 .. n +// receive bias[b..b+3] +// init buff_o_channels with bias +// for i_iter 0 .. n +// receive data[do..d+3] +// buff_o_channels = buff_o_channels + data +// +// for num_iterations +// for CPO +// send data to write module +// +// Arguments: +// in: input streams data +// b_in: input stream bias +// out: output stream +// +static void add(int H, int W, int I_ITER, hls::stream &in, hls::stream &b_in, hls::stream &out) { + +#ifdef DEBUG_VERBOSE + printf("add: start\n"); +#endif + + data_type bias[CPO]; + + //number of iterations by CPI || CPO channels + int num_iterations = W * H; + + //Buffer for all data and CPO channels + data_type buff_o_channels[CPO][WHMAX]; + DO_PRAGMA(HLS ARRAY_PARTITION variable=buff_o_channels dim=0 block factor=CPO) + + //We receive bias in packs of CPO + pixel_out_t p_out; + p_out = b_in.read(); + add_load_bias_loop: + for (int b=0; b &in, hls::stream &k_in, hls::stream &b_in, hls::stream &out) { + + // streams + static hls::stream str_pad_cvt; // padding->cvt + static hls::stream str_cvt_mul; // cvt->mul + static hls::stream str_mul_add; // mul->add + + + // topology + #pragma HLS dataflow + padding(H, W, I_ITER, in, str_pad_cvt); // padding + cvt(H, W, I_ITER, str_pad_cvt, str_cvt_mul, 0); // cvt + mul(H, W, I_ITER, str_cvt_mul, k_in, str_mul_add, 0); // mul + add(H, W, I_ITER, str_mul_add, b_in, out); // add +} + + +void k_cn2D_K3x3_S1x1_P1x1_BS1_ap_2(pixel_in_t *ptr_data, int H, int W, int I, data_type *ptr_kernel, data_type *ptr_bias, pixel_out_t *ptr_out, int O, int offset_bias, int offset_kernel, int offset_data_out) { + + #pragma HLS INTERFACE s_axilite port=W bundle=control + #pragma HLS INTERFACE s_axilite port=H bundle=control + #pragma HLS INTERFACE s_axilite port=I bundle=control + #pragma HLS INTERFACE s_axilite port=O bundle=control + #pragma HLS INTERFACE m_axi port=ptr_data offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_kernel offset=slave bundle=gmem1 max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_bias offset=slave bundle=gmem2 max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_out offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE s_axilite port=offset_bias bundle=control + #pragma HLS INTERFACE s_axilite port=offset_kernel bundle=control + #pragma HLS INTERFACE s_axilite port=offset_data_out bundle=control + #pragma HLS INTERFACE s_axilite port=return bundle=control + + // ptr_data struct to be packed as a single element vector (to improve memory read) + // the compiler will do full structure access (all elements of structure) + #pragma HLS data_pack variable = ptr_data + #pragma HLS data_pack variable = ptr_out + + int I_ITER = I/CPI; + + // input and output streams + static hls::stream 
out_read_data; + static hls::stream out_read_kernel; + static hls::stream out_read_bias; + static hls::stream out_conv; + + // stream sizes + #pragma HLS STREAM variable = out_read_data depth = 32 + #pragma HLS STREAM variable = out_read_kernel depth = 32 + #pragma HLS STREAM variable = out_read_bias depth = 32 + #pragma HLS STREAM variable = out_conv depth = 32 + // #pragma HLS STREAM variable = out_relu depth = 32 + + #pragma HLS dataflow + read_data(H, W, I_ITER, ptr_data, out_read_data); + read_bias(offset_bias, ptr_bias, out_read_bias); + read_kernel(I_ITER, offset_kernel, ptr_kernel, out_read_kernel); + conv(H, W, I_ITER, out_read_data, out_read_kernel, out_read_bias, out_conv); + write_output(H, W, offset_data_out, ptr_out, out_conv); + +} + +} // end extern "C" diff --git a/fpga_kernels/test_fpga/src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp b/fpga_kernels/test_fpga/src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp new file mode 100644 index 000000000..bd7de32a4 --- /dev/null +++ b/fpga_kernels/test_fpga/src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp @@ -0,0 +1,490 @@ +// +// test_conv2D. +// +// Constants: +// +// - CPI +// - CPO +// - KW = 3 +// - KH = 3 +// - PW = 1 +// - PH = 1 +// - SW = 1 +// - SH = 1 +// +// Arguments: +// +// - W +// - H +// - I +// - O +// +// Data formats: +// +// - kernel : GO x GI x CPO x CPI x KH x KW +// - bias : O +// - data_in : GI x H x W x CPI +// - data_out : GO x H x W x CPO +// +// GI = I / CPI +// GO = O / CPO +// +// + + + +#include /* printf, scanf, NULL */ +#include /* malloc, free, rand */ + +#include +#include +#include +#include +#include +#include "xcl2.hpp" + +#include + +using std::vector; + +// data type +//#define data_type ap_fixed<8,4,AP_TRN,AP_WRAP> +#define data_type float + +// CL +cl::Buffer buf; +cl::Context context; +cl::CommandQueue q; +cl::Program program; + + +#define W 256 //256 +#define H 256 //256 +#define GI 2 +#define CPI 4 // 16 +#define I GI * CPI +#define GO 2 // 16 +#define CPO 4 +#define O GO * CPO +#define KW 3 +#define KH 3 + +// buffers +data_type data_in[ GI * W * H * CPI ] __attribute__ ((__aligned__(16))); +data_type kernel [ GO * GI * CPO * CPI * KW * KH ] __attribute__ ((__aligned__(16))); +data_type bias [ O ] __attribute__ ((__aligned__(16))); +data_type out [ GO * W * H * CPO ] __attribute__ ((__aligned__(16))); +data_type out_cpu[ GO * W * H * CPO ] __attribute__ ((__aligned__(16))); + +void cpu_conv2d() { + + int size_out = GO * W * H * CPO; + for (int i=0; i 0.001) { + printf("Results mismatch at cout %d h %d w %d: %6.4f %6.4f (diff %6.4f)\n", cout, h, w, float(out_cpu[addr_o]), float(out[addr_o]), fabs(float(out_cpu[addr_o]-out[addr_o]))); + error = 1; + return; + } + } + } + } + if (!error) printf("results OK!\n"); else { + printf("results differ:\n"); + //cpu_print_out(); + } +} + + +//--------------------------------------------------------------------------------------------------------------------- +//--------------------------------------------------------------------------------------------------------------------- + +// An event callback function that prints the operations performed by the OpenCL +// runtime. 
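
In this variant the O_ITER loop has moved to the host: the kernel is launched once per group of CPO output channels, and the host passes element offsets (not byte offsets) into the bias, kernel and output buffers via offset_bias, offset_kernel and offset_data_out. The sketch below shows how those offsets evolve per launch, mirroring the updates made in the host loop further down; the struct and helper name are hypothetical, and CPI/CPO/KW/KH are fixed to the values used in this kernel.

// Hypothetical helper (not in the patch): per-launch offsets for output group o_iter.
// Offsets are in elements: bias advances CPO values, the kernel advances one
// GI x CPO x CPI x KH x KW slice, and the output advances H*W pixel_out_t entries
// (each pixel_out_t packs CPO values).
#define CPI 4
#define CPO 4
#define KW  3
#define KH  3

struct group_offsets_t { int bias; int kernel; int data_out; };

static group_offsets_t offsets_for_group(int o_iter, int H, int W, int I) {
  int GI = I / CPI;                       // number of input channel groups
  group_offsets_t off;
  off.bias     = o_iter * CPO;
  off.kernel   = o_iter * KW * KH * CPO * GI * CPI;
  off.data_out = o_iter * H * W;
  return off;
}

Launching o_iter = 0 .. O/CPO - 1 with these offsets makes the per-group outputs tile the whole GO x H x W x CPO tensor.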
+void event_cb(cl_event event1, cl_int cmd_status, void *data) { + cl_int err; + cl_command_type command; + cl::Event event(event1, true); + OCL_CHECK(err, err = event.getInfo(CL_EVENT_COMMAND_TYPE, &command)); + cl_int status; + OCL_CHECK(err, + err = event.getInfo(CL_EVENT_COMMAND_EXECUTION_STATUS, &status)); + const char *command_str; + const char *status_str; + switch (command) { + case CL_COMMAND_READ_BUFFER: + command_str = "buffer read"; + break; + case CL_COMMAND_WRITE_BUFFER: + command_str = "buffer write"; + break; + case CL_COMMAND_NDRANGE_KERNEL: + command_str = "kernel"; + break; + case CL_COMMAND_MAP_BUFFER: + command_str = "kernel"; + break; + case CL_COMMAND_COPY_BUFFER: + command_str = "kernel"; + break; + case CL_COMMAND_MIGRATE_MEM_OBJECTS: + command_str = "buffer migrate"; + break; + default: + command_str = "unknown"; + } + switch (status) { + case CL_QUEUED: + status_str = "Queued"; + break; + case CL_SUBMITTED: + status_str = "Submitted"; + break; + case CL_RUNNING: + status_str = "Executing"; + break; + case CL_COMPLETE: + status_str = "Completed"; + break; + } + printf("[%s]: %s %s\n", reinterpret_cast(data), status_str, + command_str); + fflush(stdout); +} + +// Sets the callback for a particular event +void set_callback(cl::Event event, const char *queue_name) { + cl_int err; + OCL_CHECK(err, + err = event.setCallback(CL_COMPLETE, event_cb, (void *)queue_name)); +} + +//--------------------------------------------------------------------------------------------------------------------- + +int main(int argc, char **argv) { + if (argc != 2) { + std::cout << "Usage: " << argv[0] << " " << std::endl; + return EXIT_FAILURE; + } + + printf("Test CONV: [GIxWxHxCPI] = [%dx%dx%dx%d] -> [GOxWxHxCPO] = [%d%dx%dx%d] (kernel [%dx%d], stride [1x1], padding [1x1])\n", GI, W, H, CPI, GO, W, H, CPO, KW, KH); + + std::string binaryFile = argv[1]; + cl_int err; + cl::Kernel kernel_conv2d_2; + + std::cout << "Creating Context..." << std::endl; + auto devices = xcl::get_xil_devices(); + auto device = devices[0]; + OCL_CHECK(err, cl::Context context(device, NULL, NULL, NULL, &err)); + OCL_CHECK(err, cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE, &err)); + + std::string device_name = device.getInfo(); + auto fileBuf = xcl::read_binary_file(binaryFile); + cl::Program::Binaries bins{{fileBuf.data(), fileBuf.size()}}; + devices.resize(1); + + OCL_CHECK(err, cl::Program program(context, devices, bins, NULL, &err)); + std::cout << "Device " << device_name.c_str() << ": program successful!" << std::endl; + + OCL_CHECK(err, kernel_conv2d_2 = cl::Kernel(program,"k_cn2D_K3x3_S1x1_P1x1_BS1_ap_2", &err)); + std::cout << "Kernel sucessfully created" << std::endl ; + + size_t size_data_in_bytes = W * H * I * sizeof(data_type); + size_t size_output_in_bytes = W * H * O * sizeof(data_type); + size_t size_kernel_in_bytes = KW * KH * I * O * sizeof(data_type); + size_t size_bias_in_bytes = O * sizeof(data_type); + // Allocate memory on the host and fill with random data. 
+ + //----------------------------- + // fill data vector with random data + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + std::cout << "Filling buffer with useful data" << std::endl ; + int addr = 0; + for (int gi=0; gi kernel_events(1); + vector read_events(1); + vector write_events(1); + cl::Buffer buffer_a; + cl::Buffer buffer_b; + cl::Buffer buffer_k; + cl::Buffer buffer_bias; + + //----------------------------- + // Allocate Buffer in Global Memory + // Buffers are allocated using CL_MEM_USE_HOST_PTR for efficient memory and + // Device-to-host communication + std::cout << "Creating Buffers..." << std::endl; + + OCL_CHECK(err, buffer_a = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_data_in_bytes, &data_in, &err)); + OCL_CHECK(err, buffer_b = cl::Buffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR , size_output_in_bytes, &out, &err)); + OCL_CHECK(err, buffer_k = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_kernel_in_bytes, &kernel, &err)); + OCL_CHECK(err, buffer_bias = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_bias_in_bytes, &bias, &err)); + + //Arguments for loop + int I_ITER = I/CPI; //GO + int O_ITER = O/CPO; //GO + int offset_bias = 0; //offset to pointer bias each loop + int offset_kernel = 0; //offset to pointer kernel each loop + int offset_data_out = 0; //offset to poiter output data loop + + for (int o_iter = 0; o_iter < O_ITER; o_iter++){ + // set kernel arguments + int arg = 0; + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_a)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, H)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, W)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, I)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_k)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_bias)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_b)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, O)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, offset_bias)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, offset_kernel)); + OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, offset_data_out)); + + // Update the offset poiter to bias, kernels and output data + offset_bias = offset_bias + CPO; + offset_kernel = offset_kernel + KW * KH * CPO * I_ITER * CPI; + offset_data_out = offset_data_out + H * W; + //----------------------------- + // Copy input data to device global memory + // std::cout << "Copying data (Host to Device)..." << std::endl; + // Because we are passing the write_events, it returns an event object + // that identifies this particular command and can be used to query + // or queue a wait for this particular command to complete. + OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_a}, 0 /*0 means from host*/, NULL, &write_events[0])); + set_callback(write_events[0], "ooo_queue"); + + OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_k}, 0 /*0 means from host*/, NULL, &write_events[0])); + set_callback(write_events[0], "ooo_queue"); + + //----------------------------- + // printf("Enqueueing NDRange kernel.\n"); + // This event needs to wait for the write buffer operations to complete + // before executing. We are sending the write_events into its wait list to + // ensure that the order of operations is correct. 
+ // Launch the Kernel + std::vector waitList; + waitList.push_back(write_events[0]); + OCL_CHECK(err, err = q.enqueueNDRangeKernel(kernel_conv2d_2, 0, 1, 1, &waitList, &kernel_events[0])); + set_callback(kernel_events[0], "ooo_queue"); + + // std::cout << "Getting Results (Device to Host)..." << std::endl; + std::vector eventList; + eventList.push_back(kernel_events[0]); + // This operation only needs to wait for the kernel call. + OCL_CHECK(err, err = q.enqueueMigrateMemObjects({buffer_b}, CL_MIGRATE_MEM_OBJECT_HOST, &eventList, &read_events[0])); + set_callback(read_events[0], "ooo_queue"); + OCL_CHECK(err, err = read_events[0].wait()); + + +} + + + + + + + // Wait for all of the OpenCL operations to complete + std::cout << "Waiting..." << std::endl; + OCL_CHECK(err, err = q.flush()); + OCL_CHECK(err, err = q.finish()); + + + std::cout << "computing conv in CPU..." << std::endl; + + // cpu_print_data_in(); + // cpu_print_kernels(); + // cpu_print_bias(); + cpu_conv2d(); + // cpu_print_out(); + + check_result(); + + //----------------------------- + std::cout << "" << std::endl; + std::cout << "All done" << std::endl; + std::cout << "quit now" << std::endl; + + // exit + return 0; +} From 851a0425ec752b60de911e1a4eb99e299c963f2a Mon Sep 17 00:00:00 2001 From: Jose Flich Date: Sun, 25 Oct 2020 07:15:33 +0000 Subject: [PATCH 06/15] stats to cn kernel --- .../kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp | 2 +- .../src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp | 192 ++++++++++++------ 2 files changed, 131 insertions(+), 63 deletions(-) diff --git a/fpga_kernels/kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp b/fpga_kernels/kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp index fbfc6a162..1545e91e4 100644 --- a/fpga_kernels/kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp +++ b/fpga_kernels/kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp @@ -18,7 +18,7 @@ extern "C" { -// #define data_type ap_fixed<8,4,AP_TRN,AP_WRAP> +//#define data_type ap_fixed<8,4,AP_TRN,AP_WRAP> #define data_type float // To allow using defines inside Xilinx pragmas diff --git a/fpga_kernels/test_fpga/src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp b/fpga_kernels/test_fpga/src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp index bd7de32a4..fc1fae149 100644 --- a/fpga_kernels/test_fpga/src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp +++ b/fpga_kernels/test_fpga/src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp @@ -44,6 +44,7 @@ #include "xcl2.hpp" #include +#include using std::vector; @@ -56,25 +57,66 @@ cl::Buffer buf; cl::Context context; cl::CommandQueue q; cl::Program program; +std::string binaryFile; +#define WMAX 256 +#define HMAX 256 +#define IMAX 512 +#define OMAX 512 + +#define CPI 4 +#define CPO 4 + +#define KW 3 +#define KH 3 + +int W; +int H; +int GI; +int GO; +int I; +int O; -#define W 256 //256 -#define H 256 //256 -#define GI 2 -#define CPI 4 // 16 -#define I GI * CPI -#define GO 2 // 16 -#define CPO 4 -#define O GO * CPO -#define KW 3 -#define KH 3 // buffers -data_type data_in[ GI * W * H * CPI ] __attribute__ ((__aligned__(16))); -data_type kernel [ GO * GI * CPO * CPI * KW * KH ] __attribute__ ((__aligned__(16))); -data_type bias [ O ] __attribute__ ((__aligned__(16))); -data_type out [ GO * W * H * CPO ] __attribute__ ((__aligned__(16))); -data_type out_cpu[ GO * W * H * CPO ] __attribute__ ((__aligned__(16))); +data_type *data_in; //[ IMAX * W * H * CPI ] __attribute__ ((__aligned__(16))); +data_type *kernel; // [ GO * GI * CPO * CPI * KW * KH ] __attribute__ ((__aligned__(16))); +data_type *bias;// [ O ] __attribute__ ((__aligned__(16))); +data_type *out; // [ GO * W * H 
* CPO ] __attribute__ ((__aligned__(16))); +data_type *out_cpu; //[ GO * W * H * CPO ] __attribute__ ((__aligned__(16))); + +void allocate_buffers() { + data_in = (data_type*)malloc(I * W * H * sizeof(data_type)); + kernel = (data_type*)malloc(I * O * KW * KH * sizeof(data_type)); + bias = (data_type*)malloc(O * sizeof(data_type)); + out = (data_type*)malloc(O * W * H * sizeof(data_type)); + out_cpu = (data_type*)malloc(O * W * H * sizeof(data_type)); +} + +void parse_arguments(int argc, char **argv) { + if (argc != 6) { + printf("syntax:\n%s \n", argv[0]); + exit(1); + } + + binaryFile = argv[1]; + W = atoi(argv[2]); + H = atoi(argv[3]); + I = atoi(argv[4]); + O = atoi(argv[5]); + if ((I % CPI) != 0) {printf("Error, I must me multiple of %d\n", CPI); exit(1);} + if ((O % CPO) != 0) {printf("Error, O must be multiple of %d\n", CPO); exit(1);} + GI = I / CPI; + GO = O / CPO; +} + +void deallocate_buffers() { + free(data_in); + free(kernel); + free(bias); + free(out); + free(out_cpu); +} void cpu_conv2d() { @@ -223,7 +265,7 @@ void check_result() { int go = cout / CPO; int o = cout % CPO; int addr_o = (go * W * H * CPO) + (h * W * CPO) + (w * CPO) + o; - if (fabs(out_cpu[addr_o] - out[addr_o]) > 0.001) { + if (fabs(float(out_cpu[addr_o]) - float(out[addr_o])) > 0.001) { printf("Results mismatch at cout %d h %d w %d: %6.4f %6.4f (diff %6.4f)\n", cout, h, w, float(out_cpu[addr_o]), float(out[addr_o]), fabs(float(out_cpu[addr_o]-out[addr_o]))); error = 1; return; @@ -304,28 +346,39 @@ void set_callback(cl::Event event, const char *queue_name) { //--------------------------------------------------------------------------------------------------------------------- int main(int argc, char **argv) { - if (argc != 2) { - std::cout << "Usage: " << argv[0] << " " << std::endl; - return EXIT_FAILURE; - } - printf("Test CONV: [GIxWxHxCPI] = [%dx%dx%dx%d] -> [GOxWxHxCPO] = [%d%dx%dx%d] (kernel [%dx%d], stride [1x1], padding [1x1])\n", GI, W, H, CPI, GO, W, H, CPO, KW, KH); + parse_arguments(argc, argv); + + printf("Test CONV: [GIxWxHxCPI] = [%dx%dx%dx%d] -> [GOxWxHxCPO] = [%dx%dx%dx%d] (kernel [%dx%d], stride [1x1], padding [1x1])\n", GI, W, H, CPI, GO, W, H, CPO, KW, KH); + + allocate_buffers(); - std::string binaryFile = argv[1]; cl_int err; cl::Kernel kernel_conv2d_2; std::cout << "Creating Context..." << std::endl; + + printf("1\n"); auto devices = xcl::get_xil_devices(); + printf("2\n"); auto device = devices[0]; + + printf("hola1\n"); + OCL_CHECK(err, cl::Context context(device, NULL, NULL, NULL, &err)); + + printf("hola2\n"); OCL_CHECK(err, cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE, &err)); + printf("hola\n"); + std::string device_name = device.getInfo(); auto fileBuf = xcl::read_binary_file(binaryFile); cl::Program::Binaries bins{{fileBuf.data(), fileBuf.size()}}; devices.resize(1); + printf("hola2\n"); + OCL_CHECK(err, cl::Program program(context, devices, bins, NULL, &err)); std::cout << "Device " << device_name.c_str() << ": program successful!" 
<< std::endl; @@ -342,7 +395,7 @@ int main(int argc, char **argv) { // fill data vector with random data std::random_device rd; std::mt19937 gen(rd()); - std::uniform_real_distribution dist(-1.0f, 1.0f); + std::uniform_real_distribution dist(-1.0f, 1.0f); std::cout << "Filling buffer with useful data" << std::endl ; int addr = 0; @@ -351,7 +404,7 @@ int main(int argc, char **argv) { for (int w=0; w kernel_events(1); + vector kernel_events(GO); vector read_events(1); - vector write_events(1); + vector write_events(3); cl::Buffer buffer_a; cl::Buffer buffer_b; cl::Buffer buffer_k; @@ -394,19 +447,37 @@ int main(int argc, char **argv) { // Device-to-host communication std::cout << "Creating Buffers..." << std::endl; - OCL_CHECK(err, buffer_a = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_data_in_bytes, &data_in, &err)); - OCL_CHECK(err, buffer_b = cl::Buffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR , size_output_in_bytes, &out, &err)); - OCL_CHECK(err, buffer_k = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_kernel_in_bytes, &kernel, &err)); - OCL_CHECK(err, buffer_bias = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_bias_in_bytes, &bias, &err)); + OCL_CHECK(err, buffer_a = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_data_in_bytes, data_in, &err)); + OCL_CHECK(err, buffer_b = cl::Buffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR , size_output_in_bytes, out, &err)); + OCL_CHECK(err, buffer_k = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_kernel_in_bytes, kernel, &err)); + OCL_CHECK(err, buffer_bias = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_bias_in_bytes, bias, &err)); //Arguments for loop - int I_ITER = I/CPI; //GO - int O_ITER = O/CPO; //GO int offset_bias = 0; //offset to pointer bias each loop int offset_kernel = 0; //offset to pointer kernel each loop int offset_data_out = 0; //offset to poiter output data loop - for (int o_iter = 0; o_iter < O_ITER; o_iter++){ + //----------------------------- + // Copy input data to device global memory + // std::cout << "Copying data (Host to Device)..." << std::endl; + // Because we are passing the write_events, it returns an event object + // that identifies this particular command and can be used to query + // or queue a wait for this particular command to complete. 
+ OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_a}, 0 /*0 means from host*/, NULL, &write_events[0])); + set_callback(write_events[0], "ooo_queue"); + + OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_k}, 0 /*0 means from host*/, NULL, &write_events[1])); + set_callback(write_events[1], "ooo_queue"); + + OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_bias}, 0 /*0 means from host*/, NULL, &write_events[2])); + set_callback(write_events[2], "ooo_queue"); + + // timint stats + unsigned long long prof_time; + struct timeval prof_t1; + gettimeofday(&prof_t1, NULL); + + for (int o_iter = 0; o_iter < GO; o_iter++){ // set kernel arguments int arg = 0; OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_a)); @@ -423,19 +494,8 @@ int main(int argc, char **argv) { // Update the offset poiter to bias, kernels and output data offset_bias = offset_bias + CPO; - offset_kernel = offset_kernel + KW * KH * CPO * I_ITER * CPI; + offset_kernel = offset_kernel + KW * KH * CPO * GI * CPI; offset_data_out = offset_data_out + H * W; - //----------------------------- - // Copy input data to device global memory - // std::cout << "Copying data (Host to Device)..." << std::endl; - // Because we are passing the write_events, it returns an event object - // that identifies this particular command and can be used to query - // or queue a wait for this particular command to complete. - OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_a}, 0 /*0 means from host*/, NULL, &write_events[0])); - set_callback(write_events[0], "ooo_queue"); - - OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_k}, 0 /*0 means from host*/, NULL, &write_events[0])); - set_callback(write_events[0], "ooo_queue"); //----------------------------- // printf("Enqueueing NDRange kernel.\n"); @@ -445,24 +505,30 @@ int main(int argc, char **argv) { // Launch the Kernel std::vector waitList; waitList.push_back(write_events[0]); - OCL_CHECK(err, err = q.enqueueNDRangeKernel(kernel_conv2d_2, 0, 1, 1, &waitList, &kernel_events[0])); - set_callback(kernel_events[0], "ooo_queue"); - - // std::cout << "Getting Results (Device to Host)..." << std::endl; - std::vector eventList; - eventList.push_back(kernel_events[0]); - // This operation only needs to wait for the kernel call. - OCL_CHECK(err, err = q.enqueueMigrateMemObjects({buffer_b}, CL_MIGRATE_MEM_OBJECT_HOST, &eventList, &read_events[0])); - set_callback(read_events[0], "ooo_queue"); - OCL_CHECK(err, err = read_events[0].wait()); - - -} - - + waitList.push_back(write_events[1]); + waitList.push_back(write_events[2]); + OCL_CHECK(err, err = q.enqueueNDRangeKernel(kernel_conv2d_2, 0, 1, 1, &waitList, &kernel_events[o_iter])); + set_callback(kernel_events[o_iter], "ooo_queue"); + } + // we wait all kernels to have completed + for (int o_iter = 0; o_iter < GO; o_iter++) { + OCL_CHECK(err, err = kernel_events[o_iter].wait()); + } + // timing + struct timeval prof_t2; + gettimeofday(&prof_t2, NULL); + prof_time = ((prof_t2.tv_sec - prof_t1.tv_sec) * 1000000) + (prof_t2.tv_usec - prof_t1.tv_usec); + printf("Timing: %8lld usec\n", prof_time); + // std::cout << "Getting Results (Device to Host)..." << std::endl; + std::vector eventList; + eventList.push_back(kernel_events[0]); + // This operation only needs to wait for the kernel call. 
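
The gettimeofday pair above measures the wall-clock time of all GO kernel launches in microseconds. If a throughput figure is wanted from that measurement, it can be derived from the layer size alone; the helper below is a hypothetical addition, not part of the patch, and counts one multiply plus one add per kernel tap.

// Hypothetical helper: effective throughput from the measured prof_time (in usec).
static double conv_gflops(int H, int W, int I, int O, unsigned long long usec) {
  const int KH = 3, KW = 3;
  double flops = 2.0 * (double)H * (double)W * (double)I * (double)O * (KH * KW);
  return flops / ((double)usec * 1e3);    // flops per nanosecond == GFLOP/s
}

For the largest configuration these buffers allow (W = H = 256, I = O = 512) that is roughly 3.1e11 floating-point operations per pass, i.e. about 309 GFLOP.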
+ OCL_CHECK(err, err = q.enqueueMigrateMemObjects({buffer_b}, CL_MIGRATE_MEM_OBJECT_HOST, &eventList, &read_events[0])); + set_callback(read_events[0], "ooo_queue"); + OCL_CHECK(err, err = read_events[0].wait()); // Wait for all of the OpenCL operations to complete std::cout << "Waiting..." << std::endl; @@ -485,6 +551,8 @@ int main(int argc, char **argv) { std::cout << "All done" << std::endl; std::cout << "quit now" << std::endl; + deallocate_buffers(); + // exit return 0; } From d1952fc786825d2cc8004503d39ec681d142afeb Mon Sep 17 00:00:00 2001 From: Jose Flich Date: Thu, 29 Oct 2020 08:05:03 +0000 Subject: [PATCH 07/15] Added final CONV kernel --- .../kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp | 1371 ++++++++--------- .../src/test_conv2D_K3x3_S1x1_P1x1_BS1.cpp | 366 +++-- 2 files changed, 929 insertions(+), 808 deletions(-) diff --git a/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp b/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp index 9ec02806c..a66c9bd67 100644 --- a/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp +++ b/fpga_kernels/kernel_conv2D_K3x3_S1x1_P1x1_BS1.cpp @@ -1,699 +1,672 @@ -//KERNEL_CONV2D_4.cpp -//Modified by: Jorge García Martinez -//Date: 17/09/2020 -//Description: Based on kenel_conv2d_3.cpp. The goal of this code is to perform convolutions with a large number of inputs -//and outputs.For this, we use iteratively a limited number of input and output channels in the kernel. -//In all functions are used two loops for output and input iterations. In add function is added a buffer which stores -//the data that It should be written into the memory. - - - -#include -#include -#include - -#include - -//#define DEBUG_VERBOSE - -extern "C" { - -// To allow using defines inside Xilinx pragmas -#define PRAGMA_SUB(x) _Pragma (#x) -#define DO_PRAGMA(x) PRAGMA_SUB(x) - -// Fixed parameters (optimized at compilation/synthesis time) -#define KW 3 // kernel width -#define KH 3 // kernel height -#define CPI 4 // channels per input port -#define CPO 4 // channels per output port -// -#define WMAX 512 -#define WHMAX 512*512 - -#define LOAD_MODEL -#define READ_MODEL -#define READ_INPUT -#define WRITE_OUTPUT - -// pixel_in -struct pixel_in_t { - float pixel[CPI]; -}; - -struct pixel_out_t { - float pixel[CPO]; -}; - -// frames struct -struct frame_t { - pixel_in_t pixel[9]; -}; - -// -------------------------------------------------------------------------------------- -// read_input: -// The function reads and writes the kernels, bias and data in different stream. -// Data are sent to padding module, kenels to mul and bias to add modules. -// LOOP FLOW -// ko = 0 -// b = 0 -// for o_iter 0 .. n -// read bias[b..b+3] -// b = b + 4 -// d = 0 -// ki = 0 -// for i_iter 0 .. 
n -// read kernel[ki..ki+3][ko..ko+3] -// ki = ki +4 -// read data[d..d+3] -// d = d + 4 -// -// ko = ko + 4 -// -// -// Arguments: -// ptr : Pointer to input data (in) -// k_ptr: pointer to kernels (in) -// b_ptr: pointer to bias (in) -// out : data output stream (out) -// k_out: pointer to kernel (out) -// b_out: pointer to bias (out) -// -static void read_input(int H, int W, int I, int O, int I_ITER, int O_ITER, pixel_in_t *ptr, float *k_ptr, float *b_ptr, hls::stream &k_out, hls::stream &b_out, hls::stream &out) { - -#ifdef DEBUG_VERBOSE - printf("read_input: start\n"); -#endif - - frame_t frame_k; - #pragma HLS ARRAY_PARTITION variable=frame_k dim=0 - - pixel_out_t bias; - #pragma HLS ARRAY_PARTITION variable=bias dim=0 - - pixel_in_t data; - #pragma HLS ARRAY_PARTITION variable=data dim=0 - - - read_input_o_iter_loop: - for (int o_iter = 0; o_iter < O_ITER; o_iter++){ - //Sending bias to add in pack of CPO bias - // int data_pointer = 0; - read_loop_bias_load: - for (int b=0; b &in, hls::stream &out) { - -#ifdef DEBUG_VERBOSE - printf("padding: start\n"); -#endif - -//we init zero only first time - -pixel_in_t data; -DO_PRAGMA(HLS ARRAY_PARTITION variable=data complete) - -pixel_in_t zero; -DO_PRAGMA(HLS ARRAY_PARTITION variable=zero complete) - -for (int cpi=0; cpi &in, hls::stream &out) { - -#ifdef DEBUG_VERBOSE - printf("relu: start\n"); -#endif - - int data_size = W * H * O; - for (int i=0; i < data_size; i++) { - #pragma HLS PIPELINE II=1 - float data = in.read(); - if (data < 0) data = 0.f; - out << data; - } - -#ifdef DEBUG_VERBOSE - printf("relu: end\n"); -#endif -} - -// -------------------------------------------------------------------------------- -// write_output: Writes data comming from one stream into memory -// LOOP FLOW: -// for o_iter 0 .. n -// write data[do .. 
do+3] -// -// d = d + 4 -// -// Arguments: -// ptr: memory address pointer -// in: input stream -// -static void write_output(int H, int W, int O_ITER, pixel_out_t *ptr, hls::stream &in) { - -#ifdef DEBUG_VERBOSE - printf("write_output: start\n"); -#endif - - - - // int data_pointer = 0; - - // write_output_o_iter_loop: - // for (int o_iter = 0; o_iter &in, hls::stream &out, int id) { - -#ifdef DEBUG_VERBOSE - printf("cvt_%d: start\n", id); -#endif - -cvt_o_iter_loop: -for (int o_iter = 0; o_iter < O_ITER; o_iter++){ - cvt_i_iter_loop: - for(int i_iter = 0; i_iter < I_ITER; i_iter++){ - - // Now we process the input data and convert the data into frames - - // buffers (keep three rows) - pixel_in_t buffer0[WMAX+2]; - pixel_in_t buffer1[WMAX+2]; - pixel_in_t buffer2[WMAX+2]; - DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer0 cyclic dim=1 factor=CPI) - DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer1 cyclic dim=1 factor=CPI) - DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer2 cyclic dim=1 factor=CPI) - - // frame - frame_t frame; - DO_PRAGMA(HLS ARRAY_PARTITION variable=frame) - - // We loop for every incoming pixel - cvt_loop_1: - for (int pin_row=0; pin_row < H+2; pin_row++) { - cvt_loop_2: - for (int pin_col=0; pin_col < W+2; pin_col++) { - // get the pixel - pixel_in_t pixel; - pixel = in.read(); - // row buffer write (in which buffer row we write the pixel) - int row0_buffer_write = (pin_row % 3) == 0; - int row1_buffer_write = (pin_row % 3) == 1; - // first row buffer - int row0 = (pin_row <= 2) | ((pin_row % 3) == 2); - int row1 = !row0 & ((pin_row % 3) == 0); - // we write the pixel into the buffer - if (row0_buffer_write) buffer0[pin_col] = pixel; else if (row1_buffer_write) buffer1[pin_col] = pixel; else buffer2[pin_col] = pixel; - // build the frame - pixel_in_t p0, p1, p2, p3, p4, p5, p6, p7, p8; - int shift_frame = (pin_row>1) & (pin_col > 2); - int send_frame = (pin_row>1) & (pin_col > 1); - pixel_in_t pixel_b0, pixel_b1, pixel_b2; - pixel_b0 = buffer0[pin_col]; - pixel_b1 = buffer1[pin_col]; - pixel_b2 = buffer2[pin_col]; - // p0, p1, p2 - if (shift_frame) {p0 = p1;} else if (pin_col==0) {if (row0) p0 = pixel_b0; else if (row1) p0 = pixel_b1; else p0 = pixel_b2;} - if (shift_frame) {p1 = p2;} else if (pin_col==1) {if (row0) p1 = pixel_b0; else if (row1) p1 = pixel_b1; else p1 = pixel_b2;} - if (row0) p2 = pixel_b0; else if (row1) p2 = pixel_b1; else p2 = pixel_b2; - // p3, p4, p5 - if (shift_frame) {p3 = p4;} else if (pin_col==0) {if (row0) p3 = pixel_b1; else if (row1) p3 = pixel_b2; else p3 = pixel_b0;} - if (shift_frame) {p4 = p5;} else if (pin_col==1) {if (row0) p4 = pixel_b1; else if (row1) p4 = pixel_b2; else p4 = pixel_b0;} - if (row0) p5 = pixel_b1; else if (row1) p5 = pixel_b2; else p5 = pixel_b0; - // p6, p7, p8 - if (shift_frame) {p6 = p7;} else if (pin_col==0) {if (row0) p6 = pixel_b2; else if (row1) p6 = pixel_b0; else p6 = pixel_b1;} - if (shift_frame) {p7 = p8;} else if (pin_col==1) {if (row0) p7 = pixel_b2; else if (row1) p7 = pixel_b0; else p7 = pixel_b1;} - if (row0) p8 = pixel_b2; else if (row1) p8 = pixel_b0; else p8 = pixel_b1; - - if (send_frame) { - frame.pixel[0] = p0; frame.pixel[1] = p1; frame.pixel[2] = p2; - frame.pixel[3] = p3; frame.pixel[4] = p4; frame.pixel[5] = p5; - frame.pixel[6] = p6; frame.pixel[7] = p7; frame.pixel[8] = p8; - out << frame; - #ifdef DEBUG_VERBOSE - printf("cvt_%d: frame sent:\n", id); - for (int cpi=0; cpi &in, hls::stream &k_in, hls::stream &out, int id) { - -#ifdef DEBUG_VERBOSE - printf("mul_%d: start\n", id); -#endif - - // 
first we read the kernels - frame_t kernel[CPI]; - DO_PRAGMA(HLS ARRAY_PARTITION variable=kernel dim=0) - frame_t data_in; - -#ifdef LOAD_MODEL - - mul_o_iter_loop: - for (int o_iter = 0; o_iter < O_ITER; o_iter++){ - mul_i_iter_loop: - for(int i_iter = 0; i_iter < I_ITER; i_iter++){ - //we load the kernels into pack of frames - loop_mul_kernels_load_cpo: - for (int cpi=0; cpi %6.4f\n", cpo, sum[cpo]); - #endif - p_out.pixel[cpo] = sum[cpo]; - sum[cpo] = 0.f; - } - out << p_out; - } - } //i_iter -} //o_iter - -#endif - - -#ifdef DEBUG_VERBOSE - printf("mul_%d: end\n", id); -#endif -} - -// ------------------------------------------------------------------------------- -// add: This function performs the addition of all subpixels for the same channel. -// It adds also the corresponding bias. -// LOOP FLOW -// for o_iter 0 .. n -// receive bias[b..b+3] -// init buff_o_channels with bias -// for i_iter 0 .. n -// receive data[do..d+3] -// buff_o_channels = buff_o_channels + data -// -// for num_iterations -// for CPO -// send data to write module -// -// Arguments: -// in: input streams data -// b_in: input stream bias -// out: output stream -// -static void add(int H, int W, int I_ITER, int O_ITER, hls::stream &in, hls::stream &b_in, hls::stream &out) { - -#ifdef DEBUG_VERBOSE - printf("add: start\n"); -#endif - - float bias[CPO]; - - //number of iterations by CPI || CPO channels - int num_iterations = W * H; - - //Buffer for all data and CPO channels - float buff_o_channels[CPO][WHMAX]; - DO_PRAGMA(HLS ARRAY_PARTITION variable=buff_o_channels dim=0 block factor=CPO) - - //We read Bias in O_iter packs of CPO size - add_o_iter_loop: - for (int o_iter = 0; o_iter &in, hls::stream &k_in, hls::stream &b_in, hls::stream &out) { - - // streams - static hls::stream str_pad_cvt; // padding->cvt - static hls::stream str_cvt_mul; // cvt->mul - static hls::stream str_mul_add; // mul->add - - // topology - #pragma HLS dataflow - padding(H, W, I_ITER * O_ITER, in, str_pad_cvt); // padding - cvt(H, W, I_ITER, O_ITER, str_pad_cvt, str_cvt_mul, 0); // cvt - mul(H, W, I_ITER, O_ITER, str_cvt_mul, k_in, str_mul_add, 0); // mul - add(H, W, I_ITER, O_ITER, str_mul_add, b_in, out); // add -} - -void k_conv2D_K3x3_S1x1_P1x1_BS1(pixel_in_t *ptr_data, int H, int W, int I, float *ptr_kernel, float *ptr_bias, pixel_out_t *ptr_out, int O) { - - #pragma HLS INTERFACE s_axilite port=W bundle=control - #pragma HLS INTERFACE s_axilite port=H bundle=control - #pragma HLS INTERFACE s_axilite port=I bundle=control - #pragma HLS INTERFACE s_axilite port=O bundle=control - #pragma HLS INTERFACE m_axi port=ptr_data offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 - #pragma HLS INTERFACE m_axi port=ptr_kernel offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 - #pragma HLS INTERFACE m_axi port=ptr_bias offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 - #pragma HLS INTERFACE m_axi port=ptr_out offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 - #pragma HLS INTERFACE s_axilite port=return bundle=control - - // ptr_data struct to be packed as a single element vector (to improve memory read) - // the compiler will do full structure access (all elements of structure) - #pragma HLS data_pack variable = ptr_data - #pragma HLS data_pack variable = ptr_out - - int I_ITER = I/CPI; - int O_ITER = O/CPO; - - // input and output streams - static hls::stream out_read; - static hls::stream out_read_kernel; - static hls::stream 
out_read_bias; - static hls::stream out_conv; - - // stream sizes - #pragma HLS STREAM variable = out_read depth = 32 - #pragma HLS STREAM variable = out_read_kernel depth = 32 - #pragma HLS STREAM variable = out_read_bias depth = 32 - #pragma HLS STREAM variable = out_conv depth = 32 - #pragma HLS STREAM variable = out_relu depth = 32 - - #pragma HLS dataflow - read_input(H, W, I, O, I_ITER, O_ITER, ptr_data, ptr_kernel, ptr_bias, out_read_kernel, out_read_bias, out_read); - conv(H, W, I, O, I_ITER, O_ITER, out_read, out_read_kernel, out_read_bias, out_conv); - write_output(H, W, O_ITER, ptr_out, out_conv); -} - -} // end extern "C" +// Convolution kernel +// Description: This kernel computes the convolution operation for a given set of output +// channels. The kernel has a defined set of input channels (CPI) and output +// channels (CPO) where the convolution is performed in parallel. +// The kernel receives the input geometry (I, W, H) as arguments and performs +// the convolution over CPO channels. For I>CPI configurations the kernel iterates on the +// input channels to produce the output channels. For O>CPO the kernel must be called for each +// CPO set of channels to computer. For this, offsets are provided to the kernel as arguments +// to read from and write to the proper memory locations. +// The kernel uses DataFlow model and is optimized in order to be bounded by the memory bandwidth. +// +// Dataflow: +// +// ------- +// | | ---> read_bias --------------------------------------- +// | | | +// | | ---> read_kernel ---------------------------- | +// | DDR | | | +// | | ---> read_data ---> padding ---> cvt ---> mul ---> add ---> write_data +// | | | +// | | <---------------------------------------------------------------- +// ------- +// +// The kernels asumes the following memory allocation for data: +// - input data : GI x H x W x CPI +// - kernels : GO x GI x CPO x CPI x KH x KW +// - bias : O +// - output data: GO x H x W x CPO +// +// (GI = group of input channels, GO = group of output channels) +// (I = GI x CPI, O = GO x CPO) +// +// Fixed (static) parameters: +// - CPI: Number of input channels supported in one iteration of the kernel +// - CPO: Number of output channels supported in one iteration of the kernel +// - KH, KW: Kernel size (3x3) +// - PH, PW: Padding (1x1) (implicit in the code) +// - SH, SW: Stride (1x1) (implicit in the code) +// - WMAX: Maximum value of the width of an input channel +// - WHMAX: Maximum value of the width multiplied by the height of an input channels +// +// Arguments: +// - I: Number of input channels +// - O: Number of output channels +// - W: Channel width +// - H: Channel height +// - ptr_data: Memory pointer to input data +// - ptr_kernel: Memory pointer to kernels +// - ptr_bias: Memory pointer to bias +// - ptr_out: Memory pointer to output buffer +// - offset_kernel: Offset within kernel data +// - offset_bias: Offset within bias data +// - offset_data_out: Offset within output buffer +// + +// Headers +#include +#include +#include +#include + +// Enable this define to get information (sw_emu) +// #define DEBUG_VERBOSE + +extern "C" { + +// Data type to be used +#define data_type float + +// To allow using defines inside Xilinx pragmas +#define PRAGMA_SUB(x) _Pragma (#x) +#define DO_PRAGMA(x) PRAGMA_SUB(x) + +// Fixed parameters (optimized at compilation/synthesis time) +#define KW 3 // kernel width +#define KH 3 // kernel height +#define CPI 4 // channels per input port +#define CPO 4 // channels per output port + +// Maximum 
width and width*height +#define WMAX 256 +#define WHMAX 256*256 + +// Data type for input reads +struct pixel_in_t { // pixel in + data_type pixel[CPI]; +}; + +// Data type for output writes +struct pixel_out_t { // pixel out + data_type pixel[CPO]; +}; + +// frames struct (KWxKH) +struct frame_t { + pixel_in_t pixel[9]; +}; + +// --------------------------------------------------------------------------------------- +// read_bias. Reading bias from memory and sending to add module. +// +// Arguments: +// b_ptr : pointer to bias +// offset_bias : offset to bias +// b_out : output stream +// +static void read_bias(int offset_bias, data_type *b_ptr, hls::stream &b_out){ + + #ifdef DEBUG_VERBOSE + printf("read_bias: start\n"); + #endif + + pixel_out_t bias; + #pragma HLS ARRAY_PARTITION variable=bias dim=0 + + // we read the bias + for (int i=0; i &k_out){ + + #ifdef DEBUG_VERBOSE + printf("read_kernel: start\n"); + #endif + + // we read all the kernels and send them through the stream + frame_t frame_k; + #pragma HLS ARRAY_PARTITION variable=frame_k dim=0 + int cpo = 0; + int p = 0; + + int size = KW * KH * CPO * I_ITER * CPI; + read_kernel_loop: + for (int i=0; i &out) { + + #ifdef DEBUG_VERBOSE + printf("read_data: start\n"); + #endif + + read_loop_data_load_i: + for (int r=0; r &in, hls::stream &out) { + + #ifdef DEBUG_VERBOSE + printf("padding: start\n"); + #endif + + pixel_in_t data; + DO_PRAGMA(HLS ARRAY_PARTITION variable=data complete) + + pixel_in_t zero; + DO_PRAGMA(HLS ARRAY_PARTITION variable=zero complete) + + padding_cpi_loop: + for (int cpi=0; cpi &in) { + + #ifdef DEBUG_VERBOSE + printf("write_output: start\n"); + #endif + + write_output_data_size_loop: + for (int i=0; i &in, hls::stream &out) { + + #ifdef DEBUG_VERBOSE + printf("cvt: start\n"); + #endif + + cvt_i_iter_loop: + for(int i_iter = 0; i_iter < I_ITER; i_iter++){ + + // Now we process the input data and convert the data into frames + // buffers (keep three rows) + pixel_in_t buffer0[WMAX+2]; + pixel_in_t buffer1[WMAX+2]; + pixel_in_t buffer2[WMAX+2]; + DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer0 cyclic dim=1 factor=CPI) + DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer1 cyclic dim=1 factor=CPI) + DO_PRAGMA(HLS ARRAY_PARTITION variable=buffer2 cyclic dim=1 factor=CPI) + + // frame + frame_t frame; + DO_PRAGMA(HLS ARRAY_PARTITION variable=frame) + + // We loop for every incoming pixel + cvt_loop_1: + for (int pin_row=0; pin_row < H+2; pin_row++) { + cvt_loop_2: + for (int pin_col=0; pin_col < W+2; pin_col++) { + // get the pixel + pixel_in_t pixel; + pixel = in.read(); + // row buffer write (in which buffer row we write the pixel) + int row0_buffer_write = (pin_row % 3) == 0; + int row1_buffer_write = (pin_row % 3) == 1; + // first row buffer + int row0 = (pin_row <= 2) | ((pin_row % 3) == 2); + int row1 = !row0 & ((pin_row % 3) == 0); + // we write the pixel into the buffer + if (row0_buffer_write) buffer0[pin_col] = pixel; else if (row1_buffer_write) buffer1[pin_col] = pixel; else buffer2[pin_col] = pixel; + // build the frame + pixel_in_t p0, p1, p2, p3, p4, p5, p6, p7, p8; + int shift_frame = (pin_row>1) & (pin_col > 2); + int send_frame = (pin_row>1) & (pin_col > 1); + pixel_in_t pixel_b0, pixel_b1, pixel_b2; + pixel_b0 = buffer0[pin_col]; + pixel_b1 = buffer1[pin_col]; + pixel_b2 = buffer2[pin_col]; + // p0, p1, p2 + if (shift_frame) {p0 = p1;} else if (pin_col==0) {if (row0) p0 = pixel_b0; else if (row1) p0 = pixel_b1; else p0 = pixel_b2;} + if (shift_frame) {p1 = p2;} else if (pin_col==1) {if (row0) p1 = 
pixel_b0; else if (row1) p1 = pixel_b1; else p1 = pixel_b2;} + if (row0) p2 = pixel_b0; else if (row1) p2 = pixel_b1; else p2 = pixel_b2; + // p3, p4, p5 + if (shift_frame) {p3 = p4;} else if (pin_col==0) {if (row0) p3 = pixel_b1; else if (row1) p3 = pixel_b2; else p3 = pixel_b0;} + if (shift_frame) {p4 = p5;} else if (pin_col==1) {if (row0) p4 = pixel_b1; else if (row1) p4 = pixel_b2; else p4 = pixel_b0;} + if (row0) p5 = pixel_b1; else if (row1) p5 = pixel_b2; else p5 = pixel_b0; + // p6, p7, p8 + if (shift_frame) {p6 = p7;} else if (pin_col==0) {if (row0) p6 = pixel_b2; else if (row1) p6 = pixel_b0; else p6 = pixel_b1;} + if (shift_frame) {p7 = p8;} else if (pin_col==1) {if (row0) p7 = pixel_b2; else if (row1) p7 = pixel_b0; else p7 = pixel_b1;} + if (row0) p8 = pixel_b2; else if (row1) p8 = pixel_b0; else p8 = pixel_b1; + + if (send_frame) { + frame.pixel[0] = p0; frame.pixel[1] = p1; frame.pixel[2] = p2; + frame.pixel[3] = p3; frame.pixel[4] = p4; frame.pixel[5] = p5; + frame.pixel[6] = p6; frame.pixel[7] = p7; frame.pixel[8] = p8; + out << frame; + #ifdef DEBUG_VERBOSE + printf("cvt_%d: frame sent:\n", id); + for (int cpi=0; cpi &in, hls::stream &k_in, hls::stream &out) { + + #ifdef DEBUG_VERBOSE + printf("mul: start\n"); + #endif + + frame_t kernel[CPO]; + DO_PRAGMA(HLS ARRAY_PARTITION variable=kernel dim=0) + frame_t data_in; + + // Reading the kernels + mul_i_iter_loop: + for(int i_iter = 0; i_iter < I_ITER; i_iter++){ + loop_mul_kernels_load_cpo: + for (int cpo=0; cpo %6.4f\n", cpo, float(sum[cpo])); + #endif + p_out.pixel[cpo] = sum[cpo]; + sum[cpo] = 0.f; + } + out << p_out; + } + } //i_iter + + #ifdef DEBUG_VERBOSE + printf("mul: end\n"); + #endif +} + +// ------------------------------------------------------------------------------- +// add: This function performs the addition of all subpixels for the same channel. +// It adds also the corresponding bias. 
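// (Sketch of the computation these stages implement, for reference; indices
//  are logical, with the 1x1 zero padding making out-of-range inputs zero.)
//
//   out(o, y, x) = bias(o) + sum_{i=0..I-1} sum_{kh=0..2} sum_{kw=0..2}
//                  in(i, y + kh - 1, x + kw - 1) * kernel(o, i, kh, kw)
//
//  mul produces the partial sums for one CPI-wide group of input channels and
//  one frame, and add accumulates those partials across the I_ITER groups in
//  buff_o_channels, which is initialized with the bias.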
+// +// Arguments: +// H : Height of input channel +// W : Width of input channel +// in : input streams data +// b_in : input stream bias +// out : output stream +// +static void add(int H, int W, int I_ITER, hls::stream &in, hls::stream &b_in, hls::stream &out) { + + #ifdef DEBUG_VERBOSE + printf("add: start\n"); + #endif + + data_type bias[CPO]; + + // number of iterations by CPI || CPO channels + int num_iterations = W * H; + + // Buffer for all data and CPO channels + data_type buff_o_channels[CPO][WHMAX]; + DO_PRAGMA(HLS ARRAY_PARTITION variable=buff_o_channels dim=0 block factor=CPO) + + // We receive bias in packs of CPO + pixel_out_t p_out; + p_out = b_in.read(); + add_load_bias_loop: + for (int b=0; b &in, hls::stream &k_in, hls::stream &b_in, hls::stream &out) { + + // streams + static hls::stream str_pad_cvt; // padding->cvt + static hls::stream str_cvt_mul; // cvt->mul + static hls::stream str_mul_add; // mul->add + + // topology + #pragma HLS dataflow + padding(H, W, I_ITER, in, str_pad_cvt); // padding + cvt(H, W, I_ITER, str_pad_cvt, str_cvt_mul); // cvt + mul(H, W, I_ITER, str_cvt_mul, k_in, str_mul_add); // mul + add(H, W, I_ITER, str_mul_add, b_in, out); // add +} + +// ------------------------------------------------------------------------------- +// k_conv2D_K3x3_S1x1_P1x1_BS1 +// Main kernel +// +// Arguments: +// ptr_data : pointer to input data +// H : Height of input channel +// W : Width of input channel +// I : Number of input channels +// ptr_kernel : pinter to kernels +// ptr_bias : pointer to bias +// ptr_out : pointer to output buffer +// O : Number of output channels +// offset_bias : Offset within bias buffer +// offset_kernel : Offset within kernel buffer +// offset_data_out: Offset within data out buffer +// +void k_conv2D_K3x3_S1x1_P1x1_BS1(pixel_in_t *ptr_data, int H, int W, int I, data_type *ptr_kernel, data_type *ptr_bias, pixel_out_t *ptr_out, int O, int offset_bias, int offset_kernel, int offset_data_out) { + + #pragma HLS INTERFACE s_axilite port=W bundle=control + #pragma HLS INTERFACE s_axilite port=H bundle=control + #pragma HLS INTERFACE s_axilite port=I bundle=control + #pragma HLS INTERFACE s_axilite port=O bundle=control + #pragma HLS INTERFACE m_axi port=ptr_data offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_kernel offset=slave bundle=gmem1 max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_bias offset=slave bundle=gmem2 max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE m_axi port=ptr_out offset=slave bundle=gmem max_read_burst_length=256 max_write_burst_length=256 + #pragma HLS INTERFACE s_axilite port=offset_bias bundle=control + #pragma HLS INTERFACE s_axilite port=offset_kernel bundle=control + #pragma HLS INTERFACE s_axilite port=offset_data_out bundle=control + #pragma HLS INTERFACE s_axilite port=return bundle=control + + // ptr_data struct to be packed as a single element vector (to improve memory read) + // the compiler will do full structure access (all elements of structure) + #pragma HLS data_pack variable = ptr_data + #pragma HLS data_pack variable = ptr_out + + int I_ITER = I/CPI; + + // input and output streams + static hls::stream out_read_data; + static hls::stream out_read_kernel; + static hls::stream out_read_bias; + static hls::stream out_conv; + + // stream sizes + #pragma HLS STREAM variable = out_read_data depth = 32 + #pragma HLS STREAM variable = out_read_kernel depth = 32 + 
#pragma HLS STREAM variable = out_read_bias depth = 32 + #pragma HLS STREAM variable = out_conv depth = 32 + // #pragma HLS STREAM variable = out_relu depth = 32 + + #pragma HLS dataflow + read_data(H, W, I_ITER, ptr_data, out_read_data); + read_bias(offset_bias, ptr_bias, out_read_bias); + read_kernel(I_ITER, offset_kernel, ptr_kernel, out_read_kernel); + conv(H, W, I_ITER, out_read_data, out_read_kernel, out_read_bias, out_conv); + write_output(H, W, offset_data_out, ptr_out, out_conv); +} + +} // end extern "C" diff --git a/fpga_kernels/test_fpga/src/test_conv2D_K3x3_S1x1_P1x1_BS1.cpp b/fpga_kernels/test_fpga/src/test_conv2D_K3x3_S1x1_P1x1_BS1.cpp index 4b06398cf..0b5537b3e 100644 --- a/fpga_kernels/test_fpga/src/test_conv2D_K3x3_S1x1_P1x1_BS1.cpp +++ b/fpga_kernels/test_fpga/src/test_conv2D_K3x3_S1x1_P1x1_BS1.cpp @@ -1,3 +1,36 @@ +// +// test_conv2D. +// +// Constants: +// +// - CPI +// - CPO +// - KW = 3 +// - KH = 3 +// - PW = 1 +// - PH = 1 +// - SW = 1 +// - SH = 1 +// +// Arguments: +// +// - W +// - H +// - I +// - O +// +// Data formats: +// +// - kernel : GO x GI x CPO x CPI x KH x KW +// - bias : O +// - data_in : GI x H x W x CPI +// - data_out : GO x H x W x CPO +// +// GI = I / CPI +// GO = O / CPO +// +// + #include /* printf, scanf, NULL */ #include /* malloc, free, rand */ @@ -8,36 +41,87 @@ #include #include "xcl2.hpp" +#include +#include + using std::vector; +// data type +#define data_type float + // CL cl::Buffer buf; cl::Context context; cl::CommandQueue q; cl::Program program; +std::string binaryFile; + +#define WMAX 256 +#define HMAX 256 +#define IMAX 512 +#define OMAX 512 + +#define CPI 4 +#define CPO 4 +#define KW 3 +#define KH 3 + +int W; +int H; +int GI; +int GO; +int I; +int O; -#define W 256 //256 -#define H 256 //256 -#define C 4 //I -#define COUT 4 //O -#define KW 3 -#define KH 3 // buffers -float data_in[ W * H * C ] __attribute__ ((__aligned__(16))); -float kernel [ KW * KH * C * COUT] __attribute__ ((__aligned__(16))); -float bias [ COUT ] __attribute__ ((__aligned__(16))); -float out [ W * H * COUT ] __attribute__ ((__aligned__(16))); -float out_cpu[ W * H * COUT ] __attribute__ ((__aligned__(16))); +data_type *data_in; //[ IMAX * W * H * CPI ] __attribute__ ((__aligned__(16))); +data_type *kernel; //[ GO * GI * CPO * CPI * KW * KH ] __attribute__ ((__aligned__(16))); +data_type *bias; //[ O ] __attribute__ ((__aligned__(16))); +data_type *out; //[ GO * W * H * CPO ] __attribute__ ((__aligned__(16))); +data_type *out_cpu; //[ GO * W * H * CPO ] __attribute__ ((__aligned__(16))); + +void allocate_buffers() { + data_in = (data_type*)malloc(I * W * H * sizeof(data_type)); + kernel = (data_type*)malloc(I * O * KW * KH * sizeof(data_type)); + bias = (data_type*)malloc(O * sizeof(data_type)); + out = (data_type*)malloc(O * W * H * sizeof(data_type)); + out_cpu = (data_type*)malloc(O * W * H * sizeof(data_type)); +} + +void parse_arguments(int argc, char **argv) { + if (argc != 6) { + printf("syntax:\n%s \n", argv[0]); + exit(1); + } + + binaryFile = argv[1]; + W = atoi(argv[2]); + H = atoi(argv[3]); + I = atoi(argv[4]); + O = atoi(argv[5]); + if ((I % CPI) != 0) {printf("Error, I must me multiple of %d\n", CPI); exit(1);} + if ((O % CPO) != 0) {printf("Error, O must be multiple of %d\n", CPO); exit(1);} + GI = I / CPI; + GO = O / CPO; +} + +void deallocate_buffers() { + free(data_in); + free(kernel); + free(bias); + free(out); + free(out_cpu); +} void cpu_conv2d() { - int size_out = W * H * COUT; + int size_out = GO * W * H * CPO; for (int i=0; i 0.001) { - 
printf("Results mismatch at cout %d h %d w %d: %6.4f %6.4f (diff %6.4f)\n", cout, h, w, out_cpu[addr_o], out[addr_o], fabs(out_cpu[addr_o]-out[addr_o])); + // data_out pixel position + int go = cout / CPO; + int o = cout % CPO; + int addr_o = (go * W * H * CPO) + (h * W * CPO) + (w * CPO) + o; + if (fabs(float(out_cpu[addr_o]) - float(out[addr_o])) > 0.001) { + printf("Results mismatch at cout %d h %d w %d: %6.4f %6.4f (diff %6.4f)\n", cout, h, w, float(out_cpu[addr_o]), float(out[addr_o]), fabs(float(out_cpu[addr_o]-out[addr_o]))); error = 1; return; } @@ -220,23 +333,21 @@ void set_callback(cl::Event event, const char *queue_name) { //--------------------------------------------------------------------------------------------------------------------- int main(int argc, char **argv) { - if (argc != 2) { - std::cout << "Usage: " << argv[0] << " " << std::endl; - return EXIT_FAILURE; - } - printf("Test CONV: [WxHxC] = [%dx%dx%d] -> [WxHxC] = [%dx%dx%d] (kernel [%dx%d], stride [1x1], padding [1x1])\n", W, H, C, W, H, COUT, KW, KH); + parse_arguments(argc, argv); + + printf("Test CONV: [GIxWxHxCPI] = [%dx%dx%dx%d] -> [GOxWxHxCPO] = [%dx%dx%dx%d] (kernel [%dx%d], stride [1x1], padding [1x1])\n", GI, W, H, CPI, GO, W, H, CPO, KW, KH); + + allocate_buffers(); - std::string binaryFile = argv[1]; cl_int err; - cl::Kernel kernel_conv2d_2; + cl::Kernel kernel_conv2d; std::cout << "Creating Context..." << std::endl; auto devices = xcl::get_xil_devices(); auto device = devices[0]; OCL_CHECK(err, cl::Context context(device, NULL, NULL, NULL, &err)); OCL_CHECK(err, cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE, &err)); - std::string device_name = device.getInfo(); auto fileBuf = xcl::read_binary_file(binaryFile); cl::Program::Binaries bins{{fileBuf.data(), fileBuf.size()}}; @@ -245,13 +356,13 @@ int main(int argc, char **argv) { OCL_CHECK(err, cl::Program program(context, devices, bins, NULL, &err)); std::cout << "Device " << device_name.c_str() << ": program successful!" << std::endl; - OCL_CHECK(err, kernel_conv2d_2 = cl::Kernel(program,"k_conv2D_K3x3_S1x1_P1x1_BS1", &err)); + OCL_CHECK(err, kernel_conv2d = cl::Kernel(program,"k_conv2D_K3x3_S1x1_P1x1_BS1", &err)); std::cout << "Kernel sucessfully created" << std::endl ; - size_t size_data_in_bytes = W*H*C*sizeof(float); - size_t size_output_in_bytes = W*H*COUT * sizeof(float); - size_t size_kernel_in_bytes = KW * KH * C * COUT * sizeof(float); - size_t size_bias_in_bytes = COUT * sizeof(float); + size_t size_data_in_bytes = W * H * I * sizeof(data_type); + size_t size_output_in_bytes = W * H * O * sizeof(data_type); + size_t size_kernel_in_bytes = KW * KH * I * O * sizeof(data_type); + size_t size_bias_in_bytes = O * sizeof(data_type); // Allocate memory on the host and fill with random data. //----------------------------- @@ -262,25 +373,27 @@ int main(int argc, char **argv) { std::cout << "Filling buffer with useful data" << std::endl ; int addr = 0; - for (int h=0; h kernel_events(1); + vector kernel_events(GO); vector read_events(1); - vector write_events(1); + vector write_events(3); cl::Buffer buffer_a; cl::Buffer buffer_b; cl::Buffer buffer_k; @@ -308,48 +421,82 @@ int main(int argc, char **argv) { // Device-to-host communication std::cout << "Creating Buffers..." 
<< std::endl; - OCL_CHECK(err, buffer_a = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_data_in_bytes, &data_in, &err)); - OCL_CHECK(err, buffer_b = cl::Buffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR , size_output_in_bytes, &out, &err)); - OCL_CHECK(err, buffer_k = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_kernel_in_bytes, &kernel, &err)); - OCL_CHECK(err, buffer_bias = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_bias_in_bytes, &bias, &err)); - - // set kernel arguments - int arg = 0; - OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_a)); - OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, H)); - OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, W)); - OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, C)); - OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_k)); - OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_bias)); - OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, buffer_b)); - OCL_CHECK(err, err = kernel_conv2d_2.setArg(arg++, COUT)); + OCL_CHECK(err, buffer_a = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_data_in_bytes, data_in, &err)); + OCL_CHECK(err, buffer_b = cl::Buffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR , size_output_in_bytes, out, &err)); + OCL_CHECK(err, buffer_k = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_kernel_in_bytes, kernel, &err)); + OCL_CHECK(err, buffer_bias = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR , size_bias_in_bytes, bias, &err)); + + //Arguments for loop + int offset_bias = 0; //offset to pointer bias each loop + int offset_kernel = 0; //offset to pointer kernel each loop + int offset_data_out = 0; //offset to poiter output data loop //----------------------------- // Copy input data to device global memory - std::cout << "Copying data (Host to Device)..." << std::endl; + // std::cout << "Copying data (Host to Device)..." << std::endl; // Because we are passing the write_events, it returns an event object // that identifies this particular command and can be used to query // or queue a wait for this particular command to complete. OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_a}, 0 /*0 means from host*/, NULL, &write_events[0])); set_callback(write_events[0], "ooo_queue"); - OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_k}, 0 /*0 means from host*/, NULL, &write_events[0])); - set_callback(write_events[0], "ooo_queue"); - - //----------------------------- - printf("Enqueueing NDRange kernel.\n"); - // This event needs to wait for the write buffer operations to complete - // before executing. We are sending the write_events into its wait list to - // ensure that the order of operations is correct. 
- // Launch the Kernel - std::vector waitList; - waitList.push_back(write_events[0]); - OCL_CHECK(err, err = q.enqueueNDRangeKernel(kernel_conv2d_2, 0, 1, 1, &waitList, &kernel_events[0])); - set_callback(kernel_events[0], "ooo_queue"); + OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_k}, 0 /*0 means from host*/, NULL, &write_events[1])); + set_callback(write_events[1], "ooo_queue"); + + OCL_CHECK(err, err = q.enqueueMigrateMemObjects( {buffer_bias}, 0 /*0 means from host*/, NULL, &write_events[2])); + set_callback(write_events[2], "ooo_queue"); + + // timint stats + unsigned long long prof_time; + struct timeval prof_t1; + gettimeofday(&prof_t1, NULL); + + for (int o_iter = 0; o_iter < GO; o_iter++){ + // set kernel arguments + int arg = 0; + OCL_CHECK(err, err = kernel_conv2d.setArg(arg++, buffer_a)); + OCL_CHECK(err, err = kernel_conv2d.setArg(arg++, H)); + OCL_CHECK(err, err = kernel_conv2d.setArg(arg++, W)); + OCL_CHECK(err, err = kernel_conv2d.setArg(arg++, I)); + OCL_CHECK(err, err = kernel_conv2d.setArg(arg++, buffer_k)); + OCL_CHECK(err, err = kernel_conv2d.setArg(arg++, buffer_bias)); + OCL_CHECK(err, err = kernel_conv2d.setArg(arg++, buffer_b)); + OCL_CHECK(err, err = kernel_conv2d.setArg(arg++, O)); + OCL_CHECK(err, err = kernel_conv2d.setArg(arg++, offset_bias)); + OCL_CHECK(err, err = kernel_conv2d.setArg(arg++, offset_kernel)); + OCL_CHECK(err, err = kernel_conv2d.setArg(arg++, offset_data_out)); + + // Update the offset poiter to bias, kernels and output data + offset_bias = offset_bias + CPO; + offset_kernel = offset_kernel + KW * KH * CPO * GI * CPI; + offset_data_out = offset_data_out + H * W; + + //----------------------------- + // printf("Enqueueing NDRange kernel.\n"); + // This event needs to wait for the write buffer operations to complete + // before executing. We are sending the write_events into its wait list to + // ensure that the order of operations is correct. + // Launch the Kernel + std::vector waitList; + waitList.push_back(write_events[0]); + waitList.push_back(write_events[1]); + waitList.push_back(write_events[2]); + OCL_CHECK(err, err = q.enqueueNDRangeKernel(kernel_conv2d, 0, 1, 1, &waitList, &kernel_events[o_iter])); + set_callback(kernel_events[o_iter], "ooo_queue"); + } + // we wait all kernels to have completed + for (int o_iter = 0; o_iter < GO; o_iter++) { + OCL_CHECK(err, err = kernel_events[o_iter].wait()); + } + // timing + struct timeval prof_t2; + gettimeofday(&prof_t2, NULL); + prof_time = ((prof_t2.tv_sec - prof_t1.tv_sec) * 1000000) + (prof_t2.tv_usec - prof_t1.tv_usec); + printf("Timing: %8lld usec\n", prof_time); - std::cout << "Getting Results (Device to Host)..." << std::endl; + // std::cout << "Getting Results (Device to Host)..." << std::endl; std::vector eventList; eventList.push_back(kernel_events[0]); // This operation only needs to wait for the kernel call. @@ -362,22 +509,23 @@ int main(int argc, char **argv) { OCL_CHECK(err, err = q.flush()); OCL_CHECK(err, err = q.finish()); - std::cout << "computing conv in CPU..." 
<< std::endl; - // cpu_print_data_in(); + // cpu_print_data_in(); // cpu_print_kernels(); - // cpu_print_bias(); - // cpu_conv2d(); - // cpu_print_out(); + // cpu_print_bias(); + cpu_conv2d(); + // cpu_print_out(); - // check_result(); + check_result(); //----------------------------- std::cout << "" << std::endl; std::cout << "All done" << std::endl; std::cout << "quit now" << std::endl; + deallocate_buffers(); + // exit return 0; } From 96c9256fafd4398292f936328ce1b11833c7896b Mon Sep 17 00:00:00 2001 From: Jose Flich Date: Mon, 2 Nov 2020 09:04:09 +0000 Subject: [PATCH 08/15] adding profiling info --- src/tensor/nn/tensor_activations.cpp | 1 - src/tensor/tensor_math.cpp | 248 +++++++++++++++++++++++++-- 2 files changed, 234 insertions(+), 15 deletions(-) diff --git a/src/tensor/nn/tensor_activations.cpp b/src/tensor/nn/tensor_activations.cpp index b465a8918..e3cb2c9ef 100644 --- a/src/tensor/nn/tensor_activations.cpp +++ b/src/tensor/nn/tensor_activations.cpp @@ -51,7 +51,6 @@ namespace tensorNN { B->tsem->unlock(); PROFILING_FOOTER(ReLu); - PROFILING_PRINTF(ReLu); } // RELU Derivative, always increment over parent delta diff --git a/src/tensor/tensor_math.cpp b/src/tensor/tensor_math.cpp index ac5f5d343..27711e3fd 100644 --- a/src/tensor/tensor_math.cpp +++ b/src/tensor/tensor_math.cpp @@ -26,8 +26,68 @@ using namespace std; -PROFILING_ENABLE(sum2D_rowwise); +// profiling declarations +PROFILING_ENABLE(maximum); +PROFILING_ENABLE(minimum); +PROFILING_ENABLE(max); +PROFILING_ENABLE(argmax); +PROFILING_ENABLE(argmax_d); +PROFILING_ENABLE(min); +PROFILING_ENABLE(argmin); +PROFILING_ENABLE(sum); +PROFILING_ENABLE(sum_abs); +PROFILING_ENABLE(prod); +PROFILING_ENABLE(mean); +PROFILING_ENABLE(median); +PROFILING_ENABLE(std); +PROFILING_ENABLE(var); +PROFILING_ENABLE(mode); +PROFILING_ENABLE(abs); +PROFILING_ENABLE(acos); +PROFILING_ENABLE(add); +PROFILING_ENABLE(asin); +PROFILING_ENABLE(atan); +PROFILING_ENABLE(cell); +PROFILING_ENABLE(clamp); +PROFILING_ENABLE(clampmax); +PROFILING_ENABLE(clampmin); +PROFILING_ENABLE(cos); +PROFILING_ENABLE(cosh); +PROFILING_ENABLE(div); +PROFILING_ENABLE(exp); +PROFILING_ENABLE(floor); +PROFILING_ENABLE(inv); +PROFILING_ENABLE(log); +PROFILING_ENABLE(log2); +PROFILING_ENABLE(log10); +PROFILING_ENABLE(logn); +PROFILING_ENABLE(mod); +PROFILING_ENABLE(mult); +PROFILING_ENABLE(neg); +PROFILING_ENABLE(normalize); +PROFILING_ENABLE(pow); +PROFILING_ENABLE(powb); +PROFILING_ENABLE(reciprocal); +PROFILING_ENABLE(remainder); +PROFILING_ENABLE(round); +PROFILING_ENABLE(rsqrt); +PROFILING_ENABLE(sigmoid); +PROFILING_ENABLE(sign); +PROFILING_ENABLE(sin); +PROFILING_ENABLE(sinh); +PROFILING_ENABLE(sqr); +PROFILING_ENABLE(sqrt); +PROFILING_ENABLE(sub); +PROFILING_ENABLE(tan); +PROFILING_ENABLE(tanh); +PROFILING_ENABLE(trunc); +PROFILING_ENABLE(inc); +PROFILING_ENABLE(eldiv); PROFILING_ENABLE(mult2D); +PROFILING_ENABLE(el_mult); +PROFILING_ENABLE(sum2D_rowwise); +PROFILING_ENABLE(reduce_sum2D); +PROFILING_ENABLE(sum2D_colwise); // Math operations (Tensor-Tensor, Tensor-float) ************************ @@ -44,6 +104,9 @@ Tensor* Tensor::maximum(Tensor* A, float v){ } void Tensor::maximum(Tensor* A, Tensor* B, float v){ + + PROFILING_HEADER(maximum); + if (A->isCPU() && B->isCPU()){ cpu_maximum(A, B, v); } @@ -59,6 +122,7 @@ void Tensor::maximum(Tensor* A, Tensor* B, float v){ } #endif + PROFILING_FOOTER(maximum); } Tensor* Tensor::maximum(Tensor* A, Tensor* B){ @@ -68,6 +132,9 @@ Tensor* Tensor::maximum(Tensor* A, Tensor* B){ } void Tensor::maximum(Tensor* A, Tensor* B, 
Tensor* C){ + + PROFILING_HEADER(maximum); + if (A->isCPU() && B->isCPU() && C->isCPU()){ cpu_maximum(A, B, C); } @@ -83,6 +150,7 @@ void Tensor::maximum(Tensor* A, Tensor* B, Tensor* C){ } #endif + PROFILING_FOOTER(maximum); } Tensor* Tensor::minimum(float v){ @@ -98,6 +166,9 @@ Tensor* Tensor::minimum(Tensor* A, float v){ } void Tensor::minimum(Tensor* A, Tensor* B, float v){ + + PROFILING_HEADER(minimum); + if (A->isCPU() && B->isCPU()){ cpu_minimum(A, B, v); } @@ -113,6 +184,7 @@ void Tensor::minimum(Tensor* A, Tensor* B, float v){ } #endif + PROFILING_FOOTER(minimum); } Tensor* Tensor::minimum(Tensor* A, Tensor* B){ @@ -122,6 +194,9 @@ Tensor* Tensor::minimum(Tensor* A, Tensor* B){ } void Tensor::minimum(Tensor* A, Tensor* B, Tensor* C){ + + PROFILING_HEADER(minimum); + if (A->isCPU() && B->isCPU() && C->isCPU()){ cpu_minimum(A, B, C); } @@ -137,6 +212,7 @@ void Tensor::minimum(Tensor* A, Tensor* B, Tensor* C){ } #endif + PROFILING_FOOTER(minimum); } @@ -149,6 +225,9 @@ float Tensor::max(){ float Tensor::max(Tensor* A){ + + PROFILING_HEADER(max); + if (A->isCPU()) { return cpu_max(A); } @@ -164,6 +243,9 @@ float Tensor::max(Tensor* A){ return fpga_max(A); } #endif + + PROFILING_FOOTER(max); + return 0.0f; // Never used, this is for the compiler warning } @@ -181,6 +263,9 @@ Tensor* Tensor::max(vector axis, bool keepdims){ } void Tensor::max(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ + + PROFILING_HEADER(max); + if (A->isCPU() && B->isCPU()) { cpu_max(A, B, rd); } @@ -195,6 +280,8 @@ void Tensor::max(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ fpga_max(A, B, rd); } #endif + + PROFILING_FOOTER(max); } @@ -205,21 +292,29 @@ int Tensor::argmax(){ int Tensor::argmax(Tensor* A){ + + PROFILING_HEADER(argmax); + if (A->isCPU()) { + PROFILING_FOOTER(argmax); return cpu_argmax(A); } #ifdef cGPU else if (A->isGPU()) { + PROFILING_FOOTER(argmax); return gpu_argmax(A); } #endif #ifdef cFPGA else { - fpga_argmax(A); + PROFILING_FOOTER(argmax); + return fpga_argmax(A); } #endif + PROFILING_FOOTER(argmax); + msg("Invalid device", "Tensor::argmax"); return 0.0f; // Never used, this is for the compiler warning } @@ -238,6 +333,9 @@ Tensor* Tensor::argmax(vector axis, bool keepdims){ } void Tensor::argmax(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ + + PROFILING_HEADER(argmax); + if (A->isCPU() && B->isCPU()) { cpu_argmax(A, B, rd); } @@ -252,9 +350,14 @@ void Tensor::argmax(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ fpga_argmax(A, B, rd); } #endif + + PROFILING_FOOTER(argmax); } void Tensor::argmax_d(Tensor *D, Tensor *O, Tensor *PD){ + + PROFILING_HEADER(argmax_d); + if (D->isCPU() && O->isCPU() && PD->isCPU()) { cpu_argmax_d(D, O, PD); } @@ -269,6 +372,8 @@ void Tensor::argmax_d(Tensor *D, Tensor *O, Tensor *PD){ //fpga_argmax_d(D, O, PD); } #endif + + PROFILING_FOOTER(argmax_d); } float Tensor::min(){ @@ -277,21 +382,29 @@ float Tensor::min(){ float Tensor::min(Tensor* A){ + + PROFILING_HEADER(min); + if (A->isCPU()) { + PROFILING_FOOTER(min); return cpu_min(A); } #ifdef cGPU else if (A->isGPU()) { - return gpu_min(A); + PROFILING_FOOTER(min); + return gpu_min(A); } #endif #ifdef cFPGA else { - fpga_min(A); + PROFILING_FOOTER(min); + return fpga_min(A); } #endif + PROFILING_FOOTER(min); + msg("Invalid device", "Tensor::min"); return 0.0f; // Never used, this is for the compiler warning } @@ -310,6 +423,9 @@ Tensor* Tensor::min(vector axis, bool keepdims){ } void Tensor::min(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ + + PROFILING_HEADER(min); + if (A->isCPU() && B->isCPU()) { cpu_min(A, 
B, rd); } @@ -324,6 +440,8 @@ void Tensor::min(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ fpga_min(A, B, rd); } #endif + + PROFILING_FOOTER(min); } @@ -333,21 +451,29 @@ int Tensor::argmin(){ int Tensor::argmin(Tensor* A){ + + PROFILING_HEADER(argmin); + if (A->isCPU()) { + PROFILING_FOOTER(argmin); return cpu_argmin(A); } #ifdef cGPU else if (A->isGPU()) { + PROFILING_FOOTER(argmin); return gpu_argmin(A); } #endif #ifdef cFPGA else { - fpga_argmin(A); + PROFILING_FOOTER(argmin); + return fpga_argmin(A); } #endif + PROFILING_FOOTER(argmin); + msg("Invalid device", "Tensor::argmax"); return 0.0f; // Never used, this is for the compiler warning } @@ -366,6 +492,9 @@ Tensor* Tensor::argmin(vector axis, bool keepdims){ } void Tensor::argmin(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ + + PROFILING_HEADER(argmin); + if (A->isCPU() && B->isCPU()) { cpu_argmin(A, B, rd); } @@ -381,6 +510,8 @@ void Tensor::argmin(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ fpga_argmin(A, B, rd); } #endif + + PROFILING_FOOTER(argmin); } @@ -390,21 +521,31 @@ float Tensor::sum(){ float Tensor::sum(Tensor* A){ + + PROFILING_HEADER(sum); + if (A->isCPU()) { + PROFILING_FOOTER(sum); return cpu_sum(A); } #ifdef cGPU else if (A->isGPU()) { + + PROFILING_FOOTER(sum); return gpu_sum(A); } #endif #ifdef cFPGA else { + + PROFILING_FOOTER(sum); return fpga_sum(A); } #endif + PROFILING_FOOTER(sum); + msg("Invalid device", "Tensor::sum"); return 0.0f; // Never used, this is for the compiler warning } @@ -423,6 +564,9 @@ Tensor* Tensor::sum(vector axis, bool keepdims){ } void Tensor::sum(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ + + PROFILING_HEADER(sum); + if (A->isCPU() && B->isCPU()) { cpu_sum(A, B, rd); } @@ -437,6 +581,8 @@ void Tensor::sum(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ fpga_sum(A, B, rd); } #endif + + PROFILING_FOOTER(sum); } float Tensor::sum_abs(){ @@ -445,21 +591,30 @@ float Tensor::sum_abs(){ float Tensor::sum_abs(Tensor* A){ + + PROFILING_HEADER(sum_abs); + if (A->isCPU()) { - return cpu_sum_abs(A); + + PROFILING_FOOTER(sum_abs); + return cpu_sum_abs(A); } #ifdef cGPU else if (A->isGPU()) { + PROFILING_FOOTER(sum_abs); return gpu_sum_abs(A); } #endif #ifdef cFPGA else { + PROFILING_FOOTER(sum_abs); return fpga_sum_abs(A); } #endif + PROFILING_FOOTER(sum_abs); + msg("Invalid device", "Tensor::sum_abs"); return 0.0f; // Never used, this is for the compiler warning } @@ -479,6 +634,9 @@ Tensor* Tensor::sum_abs(vector axis, bool keepdims){ } void Tensor::sum_abs(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ + + PROFILING_HEADER(sum_abs); + if (A->isCPU() && B->isCPU()) { cpu_sum_abs(A, B, rd); } @@ -493,6 +651,8 @@ void Tensor::sum_abs(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ fpga_sum_abs(A, B, rd); } #endif + + PROFILING_FOOTER(sum_abs); } float Tensor::prod(){ @@ -501,21 +661,29 @@ float Tensor::prod(){ float Tensor::prod(Tensor* A){ // AKA factorial + + PROFILING_HEADER(prod); + if (A->isCPU()) { + PROFILING_FOOTER(prod); return cpu_prod(A); } #ifdef cGPU else if (A->isGPU()) { + PROFILING_FOOTER(prod); return gpu_prod(A); } #endif #ifdef cFPGA else { - fpga_prod(A); + PROFILING_FOOTER(prod); + return fpga_prod(A); } #endif + PROFILING_FOOTER(prod); + msg("Invalid device", "Tensor::prod"); return 0.0f; // Never used, this is for the compiler warning } @@ -535,6 +703,9 @@ Tensor* Tensor::prod(vector axis, bool keepdims){ } void Tensor::prod(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ + + PROFILING_HEADER(prod); + if (A->isCPU() && B->isCPU()) { cpu_prod(A, B, rd); } @@ -549,6 +720,8 @@ 
void Tensor::prod(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ fpga_prod(A, B, rd); } #endif + + PROFILING_FOOTER(prod); } @@ -575,6 +748,9 @@ Tensor* Tensor::mean(vector axis, bool keepdims){ } void Tensor::mean(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ + + PROFILING_HEADER(mean); + if (A->isCPU() && B->isCPU()) { cpu_mean(A, B, rd); } @@ -589,6 +765,8 @@ void Tensor::mean(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ fpga_mean(A, B, rd); } #endif + + PROFILING_FOOTER(mean); } @@ -599,6 +777,9 @@ float Tensor::median(){ float Tensor::median(Tensor* A){ + + PROFILING_HEADER(median); + float res = 0.0f; // Clone tensor (needs to be sorted first) @@ -619,6 +800,8 @@ float Tensor::median(Tensor* A){ } #endif + PROFILING_FOOTER(median); + delete tmp; return res; } @@ -637,6 +820,9 @@ Tensor* Tensor::median(vector axis, bool keepdims){ } void Tensor::median(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ + + PROFILING_HEADER(median); + if (A->isCPU() && B->isCPU()) { cpu_median(A, B, rd); } @@ -651,10 +837,9 @@ void Tensor::median(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ fpga_median(A, B, rd); } #endif -} - - + PROFILING_FOOTER(median); +} float Tensor::std(bool unbiased){ return Tensor::std(this, unbiased); @@ -662,21 +847,29 @@ float Tensor::std(bool unbiased){ float Tensor::std(Tensor* A, bool unbiased){ + + PROFILING_HEADER(std); + if (A->isCPU()) { + PROFILING_FOOTER(std); return cpu_std(A, unbiased); } #ifdef cGPU else if (A->isGPU()) { + PROFILING_FOOTER(std); return gpu_std(A, unbiased); } #endif #ifdef cFPGA else { + PROFILING_FOOTER(std); fpga_std(A, unbiased); } #endif + PROFILING_FOOTER(std); + msg("Invalid device", "Tensor::std"); return 0.0f; // Never used, this is for the compiler warning } @@ -696,6 +889,9 @@ Tensor* Tensor::std(vector axis, bool keepdims, bool unbiased){ } void Tensor::std(Tensor* A, Tensor *B, ReduceDescriptor2 *rd, bool unbiased){ + + PROFILING_HEADER(std); + if (A->isCPU() && B->isCPU()) { cpu_std(A, B, rd, unbiased); } @@ -710,6 +906,8 @@ void Tensor::std(Tensor* A, Tensor *B, ReduceDescriptor2 *rd, bool unbiased){ fpga_std(A, B, rd, unbiased); } #endif + + PROFILING_FOOTER(std); } @@ -719,22 +917,33 @@ float Tensor::var(bool unbiased){ float Tensor::var(Tensor* A, bool unbiased){ + + PROFILING_HEADER(var); + if (A->isCPU()) { + + PROFILING_FOOTER(var); return cpu_var(A, unbiased); } #ifdef cGPU else if (A->isGPU()) { + + PROFILING_FOOTER(var); return gpu_var(A, unbiased); } #endif #ifdef cFPGA else if (A->isFPGA()) { - return fpga_var(A, unbiased); + + PROFILING_FOOTER(var); + return fpga_var(A, unbiased); } #endif + PROFILING_FOOTER(var); + msg("Invalid device", "Tensor::var"); return 0.0f; // Never used, this is for the compiler warning } @@ -754,6 +963,9 @@ Tensor* Tensor::var(vector axis, bool keepdims, bool unbiased){ } void Tensor::var(Tensor* A, Tensor *B, ReduceDescriptor2 *rd, bool unbiased){ + + PROFILING_HEADER(var); + if (A->isCPU() && B->isCPU()) { cpu_var(A, B, rd, unbiased); } @@ -769,6 +981,8 @@ void Tensor::var(Tensor* A, Tensor *B, ReduceDescriptor2 *rd, bool unbiased){ fpga_var(A, B, rd, unbiased); } #endif + + PROFILING_FOOTER(var); } @@ -778,22 +992,30 @@ int Tensor::mode(){ int Tensor::mode(Tensor* A){ + + PROFILING_HEADER(mode); + if (A->isCPU()) { + PROFILING_FOOTER(mode); return cpu_mode(A); } #ifdef cGPU else if (A->isGPU()) { + PROFILING_FOOTER(mode); return gpu_mode(A); } #endif #ifdef cFPGA else if (A->isFPGA()) { + PROFILING_FOOTER(mode); return fpga_mode(A); } #endif + PROFILING_FOOTER(mode); + msg("Invalid device", 
"Tensor::mode"); return 0; // Never used, this is for the compiler warning } @@ -2289,7 +2511,7 @@ void Tensor::mult2D(Tensor *A, int tA, Tensor *B, int tB, Tensor *C, int incC) { //// Dimensions and types must be compatible //// Only for 2D Tensors /////////////////////////////////////// - + PROFILING_HEADER_EXTERN(mult2D); if ((A->device != B->device) || (A->device != C->device)) {A->info();B->info();C->info();msg("Tensors in different devices", "Tensor::mult2D");} @@ -2330,7 +2552,6 @@ void Tensor::mult2D(Tensor *A, int tA, Tensor *B, int tB, Tensor *C, int incC) { C->tsem->unlock(); PROFILING_FOOTER(mult2D); - PROFILING_PRINTF(mult2D); } @@ -2401,7 +2622,6 @@ void Tensor::sum2D_rowwise(Tensor *A, Tensor *B, Tensor *C) { C->tsem->unlock(); PROFILING_FOOTER(sum2D_rowwise); - PROFILING_PRINTF(sum2D_rowwise); } From e74ee6d138d2de845233186aa733e861630a72b2 Mon Sep 17 00:00:00 2001 From: Jose Flich Date: Mon, 2 Nov 2020 13:30:41 +0100 Subject: [PATCH 09/15] - Update from UPV-GAP - CONV2D kernel optimized and finalized - Profiling added to arithmetic tensor operations - FPGA support for pipeline use case (skin lession classification) --- include/eddl/apis/eddl.h | 9 +- include/eddl/utils.h | 2 + src/apis/eddl.cpp | 6 + src/net/net_api.cpp | 10 -- src/tensor/tensor_math.cpp | 322 ++++++++++++++++++++++++++----------- src/utils.cpp | 135 ++++++++++++++++ 6 files changed, 376 insertions(+), 108 deletions(-) diff --git a/include/eddl/apis/eddl.h b/include/eddl/apis/eddl.h index be91e4b90..df94710e4 100644 --- a/include/eddl/apis/eddl.h +++ b/include/eddl/apis/eddl.h @@ -677,7 +677,9 @@ namespace eddl { * @param l Layer to detach * @return Detached Layer */ - layer detach(layer l);/** + layer detach(layer l); + + /** * @brief Sets the provided layers as detached, excluding them from the computation of the gradients. * * @param l Layers to detach @@ -685,6 +687,11 @@ namespace eddl { */ vlayer detach(vlayer l); + /** + * @brief Shows profile information. 
+ */ + void show_profile(); + /////////////////////////////////////// // LAYERS diff --git a/include/eddl/utils.h b/include/eddl/utils.h index de0917590..2ddd7d143 100755 --- a/include/eddl/utils.h +++ b/include/eddl/utils.h @@ -59,4 +59,6 @@ string printVector(vector myvector){ enum WrappingMode {Constant=0, Reflect=1, Nearest=2, Mirror=3, Wrap=4, Original=5}; WrappingMode getWrappingMode(string mode); +void __show_profile(); + #endif //EDDL_UTILS_H diff --git a/src/apis/eddl.cpp b/src/apis/eddl.cpp index 9941fffbb..ebdb394cd 100644 --- a/src/apis/eddl.cpp +++ b/src/apis/eddl.cpp @@ -15,6 +15,7 @@ #include #include "eddl/apis/eddl.h" +#include "eddl/utils.h" using namespace std; @@ -295,6 +296,11 @@ namespace eddl { net->train_batch(in, out, indices,1); } + void show_profile() { + printf("profile:\n"); + __show_profile(); + } + void next_batch(vector in,vector out) { int i,n; diff --git a/src/net/net_api.cpp b/src/net/net_api.cpp index 9dc563ac7..d5a7bfd14 100644 --- a/src/net/net_api.cpp +++ b/src/net/net_api.cpp @@ -143,9 +143,6 @@ void *update_t(void *t) { } ///////////////////////////////////////// - - - ///////////////////////////////////////// // "a ring to rule them all" void Net::run_snets(void *(*F)(void *t)) @@ -1113,11 +1110,4 @@ vtensor Net::predict(vtensor tin) { } - - - - - - - ////// diff --git a/src/tensor/tensor_math.cpp b/src/tensor/tensor_math.cpp index 27711e3fd..257ba5e43 100644 --- a/src/tensor/tensor_math.cpp +++ b/src/tensor/tensor_math.cpp @@ -26,69 +26,6 @@ using namespace std; -// profiling declarations -PROFILING_ENABLE(maximum); -PROFILING_ENABLE(minimum); -PROFILING_ENABLE(max); -PROFILING_ENABLE(argmax); -PROFILING_ENABLE(argmax_d); -PROFILING_ENABLE(min); -PROFILING_ENABLE(argmin); -PROFILING_ENABLE(sum); -PROFILING_ENABLE(sum_abs); -PROFILING_ENABLE(prod); -PROFILING_ENABLE(mean); -PROFILING_ENABLE(median); -PROFILING_ENABLE(std); -PROFILING_ENABLE(var); -PROFILING_ENABLE(mode); -PROFILING_ENABLE(abs); -PROFILING_ENABLE(acos); -PROFILING_ENABLE(add); -PROFILING_ENABLE(asin); -PROFILING_ENABLE(atan); -PROFILING_ENABLE(cell); -PROFILING_ENABLE(clamp); -PROFILING_ENABLE(clampmax); -PROFILING_ENABLE(clampmin); -PROFILING_ENABLE(cos); -PROFILING_ENABLE(cosh); -PROFILING_ENABLE(div); -PROFILING_ENABLE(exp); -PROFILING_ENABLE(floor); -PROFILING_ENABLE(inv); -PROFILING_ENABLE(log); -PROFILING_ENABLE(log2); -PROFILING_ENABLE(log10); -PROFILING_ENABLE(logn); -PROFILING_ENABLE(mod); -PROFILING_ENABLE(mult); -PROFILING_ENABLE(neg); -PROFILING_ENABLE(normalize); -PROFILING_ENABLE(pow); -PROFILING_ENABLE(powb); -PROFILING_ENABLE(reciprocal); -PROFILING_ENABLE(remainder); -PROFILING_ENABLE(round); -PROFILING_ENABLE(rsqrt); -PROFILING_ENABLE(sigmoid); -PROFILING_ENABLE(sign); -PROFILING_ENABLE(sin); -PROFILING_ENABLE(sinh); -PROFILING_ENABLE(sqr); -PROFILING_ENABLE(sqrt); -PROFILING_ENABLE(sub); -PROFILING_ENABLE(tan); -PROFILING_ENABLE(tanh); -PROFILING_ENABLE(trunc); -PROFILING_ENABLE(inc); -PROFILING_ENABLE(eldiv); -PROFILING_ENABLE(mult2D); -PROFILING_ENABLE(el_mult); -PROFILING_ENABLE(sum2D_rowwise); -PROFILING_ENABLE(reduce_sum2D); -PROFILING_ENABLE(sum2D_colwise); - // Math operations (Tensor-Tensor, Tensor-float) ************************ Tensor* Tensor::maximum(float v){ @@ -105,7 +42,7 @@ Tensor* Tensor::maximum(Tensor* A, float v){ void Tensor::maximum(Tensor* A, Tensor* B, float v){ - PROFILING_HEADER(maximum); + PROFILING_HEADER_EXTERN(maximum); if (A->isCPU() && B->isCPU()){ cpu_maximum(A, B, v); @@ -133,7 +70,7 @@ Tensor* Tensor::maximum(Tensor* A, Tensor* 
B){ void Tensor::maximum(Tensor* A, Tensor* B, Tensor* C){ - PROFILING_HEADER(maximum); + PROFILING_HEADER_EXTERN(maximum); if (A->isCPU() && B->isCPU() && C->isCPU()){ cpu_maximum(A, B, C); @@ -167,7 +104,7 @@ Tensor* Tensor::minimum(Tensor* A, float v){ void Tensor::minimum(Tensor* A, Tensor* B, float v){ - PROFILING_HEADER(minimum); + PROFILING_HEADER_EXTERN(minimum); if (A->isCPU() && B->isCPU()){ cpu_minimum(A, B, v); @@ -195,7 +132,7 @@ Tensor* Tensor::minimum(Tensor* A, Tensor* B){ void Tensor::minimum(Tensor* A, Tensor* B, Tensor* C){ - PROFILING_HEADER(minimum); + PROFILING_HEADER_EXTERN(minimum); if (A->isCPU() && B->isCPU() && C->isCPU()){ cpu_minimum(A, B, C); @@ -226,7 +163,7 @@ float Tensor::max(){ float Tensor::max(Tensor* A){ - PROFILING_HEADER(max); + PROFILING_HEADER_EXTERN(max); if (A->isCPU()) { return cpu_max(A); @@ -264,7 +201,7 @@ Tensor* Tensor::max(vector axis, bool keepdims){ void Tensor::max(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ - PROFILING_HEADER(max); + PROFILING_HEADER_EXTERN(max); if (A->isCPU() && B->isCPU()) { cpu_max(A, B, rd); @@ -293,7 +230,7 @@ int Tensor::argmax(){ int Tensor::argmax(Tensor* A){ - PROFILING_HEADER(argmax); + PROFILING_HEADER_EXTERN(argmax); if (A->isCPU()) { PROFILING_FOOTER(argmax); @@ -334,7 +271,7 @@ Tensor* Tensor::argmax(vector axis, bool keepdims){ void Tensor::argmax(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ - PROFILING_HEADER(argmax); + PROFILING_HEADER_EXTERN(argmax); if (A->isCPU() && B->isCPU()) { cpu_argmax(A, B, rd); @@ -356,7 +293,7 @@ void Tensor::argmax(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ void Tensor::argmax_d(Tensor *D, Tensor *O, Tensor *PD){ - PROFILING_HEADER(argmax_d); + PROFILING_HEADER_EXTERN(argmax_d); if (D->isCPU() && O->isCPU() && PD->isCPU()) { cpu_argmax_d(D, O, PD); @@ -383,7 +320,7 @@ float Tensor::min(){ float Tensor::min(Tensor* A){ - PROFILING_HEADER(min); + PROFILING_HEADER_EXTERN(min); if (A->isCPU()) { PROFILING_FOOTER(min); @@ -424,7 +361,7 @@ Tensor* Tensor::min(vector axis, bool keepdims){ void Tensor::min(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ - PROFILING_HEADER(min); + PROFILING_HEADER_EXTERN(min); if (A->isCPU() && B->isCPU()) { cpu_min(A, B, rd); @@ -452,7 +389,7 @@ int Tensor::argmin(){ int Tensor::argmin(Tensor* A){ - PROFILING_HEADER(argmin); + PROFILING_HEADER_EXTERN(argmin); if (A->isCPU()) { PROFILING_FOOTER(argmin); @@ -493,7 +430,7 @@ Tensor* Tensor::argmin(vector axis, bool keepdims){ void Tensor::argmin(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ - PROFILING_HEADER(argmin); + PROFILING_HEADER_EXTERN(argmin); if (A->isCPU() && B->isCPU()) { cpu_argmin(A, B, rd); @@ -522,7 +459,7 @@ float Tensor::sum(){ float Tensor::sum(Tensor* A){ - PROFILING_HEADER(sum); + PROFILING_HEADER_EXTERN(sum); if (A->isCPU()) { PROFILING_FOOTER(sum); @@ -565,7 +502,7 @@ Tensor* Tensor::sum(vector axis, bool keepdims){ void Tensor::sum(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ - PROFILING_HEADER(sum); + PROFILING_HEADER_EXTERN(sum); if (A->isCPU() && B->isCPU()) { cpu_sum(A, B, rd); @@ -592,7 +529,7 @@ float Tensor::sum_abs(){ float Tensor::sum_abs(Tensor* A){ - PROFILING_HEADER(sum_abs); + PROFILING_HEADER_EXTERN(sum_abs); if (A->isCPU()) { @@ -635,7 +572,7 @@ Tensor* Tensor::sum_abs(vector axis, bool keepdims){ void Tensor::sum_abs(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ - PROFILING_HEADER(sum_abs); + PROFILING_HEADER_EXTERN(sum_abs); if (A->isCPU() && B->isCPU()) { cpu_sum_abs(A, B, rd); @@ -662,7 +599,7 @@ float Tensor::prod(){ float Tensor::prod(Tensor* A){ // 
AKA factorial - PROFILING_HEADER(prod); + PROFILING_HEADER_EXTERN(prod); if (A->isCPU()) { PROFILING_FOOTER(prod); @@ -704,7 +641,7 @@ Tensor* Tensor::prod(vector axis, bool keepdims){ void Tensor::prod(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ - PROFILING_HEADER(prod); + PROFILING_HEADER_EXTERN(prod); if (A->isCPU() && B->isCPU()) { cpu_prod(A, B, rd); @@ -749,7 +686,7 @@ Tensor* Tensor::mean(vector axis, bool keepdims){ void Tensor::mean(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ - PROFILING_HEADER(mean); + PROFILING_HEADER_EXTERN(mean); if (A->isCPU() && B->isCPU()) { cpu_mean(A, B, rd); @@ -778,7 +715,7 @@ float Tensor::median(){ float Tensor::median(Tensor* A){ - PROFILING_HEADER(median); + PROFILING_HEADER_EXTERN(median); float res = 0.0f; @@ -821,7 +758,7 @@ Tensor* Tensor::median(vector axis, bool keepdims){ void Tensor::median(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ - PROFILING_HEADER(median); + PROFILING_HEADER_EXTERN(median); if (A->isCPU() && B->isCPU()) { cpu_median(A, B, rd); @@ -848,7 +785,7 @@ float Tensor::std(bool unbiased){ float Tensor::std(Tensor* A, bool unbiased){ - PROFILING_HEADER(std); + PROFILING_HEADER_EXTERN(std); if (A->isCPU()) { PROFILING_FOOTER(std); @@ -890,7 +827,7 @@ Tensor* Tensor::std(vector axis, bool keepdims, bool unbiased){ void Tensor::std(Tensor* A, Tensor *B, ReduceDescriptor2 *rd, bool unbiased){ - PROFILING_HEADER(std); + PROFILING_HEADER_EXTERN(std); if (A->isCPU() && B->isCPU()) { cpu_std(A, B, rd, unbiased); @@ -918,7 +855,7 @@ float Tensor::var(bool unbiased){ float Tensor::var(Tensor* A, bool unbiased){ - PROFILING_HEADER(var); + PROFILING_HEADER_EXTERN(var); if (A->isCPU()) { @@ -964,7 +901,7 @@ Tensor* Tensor::var(vector axis, bool keepdims, bool unbiased){ void Tensor::var(Tensor* A, Tensor *B, ReduceDescriptor2 *rd, bool unbiased){ - PROFILING_HEADER(var); + PROFILING_HEADER_EXTERN(var); if (A->isCPU() && B->isCPU()) { cpu_var(A, B, rd, unbiased); @@ -993,23 +930,23 @@ int Tensor::mode(){ int Tensor::mode(Tensor* A){ - PROFILING_HEADER(mode); + PROFILING_HEADER_EXTERN(mode); if (A->isCPU()) { - PROFILING_FOOTER(mode); + PROFILING_FOOTER(mode); return cpu_mode(A); } #ifdef cGPU else if (A->isGPU()) { - PROFILING_FOOTER(mode); + PROFILING_FOOTER(mode); return gpu_mode(A); } #endif #ifdef cFPGA else if (A->isFPGA()) { - PROFILING_FOOTER(mode); + PROFILING_FOOTER(mode); return fpga_mode(A); } #endif @@ -1035,6 +972,9 @@ Tensor* Tensor::mode(vector axis, bool keepdims){ } void Tensor::mode(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ + + PROFILING_HEADER_EXTERN(mode); + if (A->isCPU() && B->isCPU()) { cpu_mode(A, B, rd); } @@ -1050,6 +990,8 @@ void Tensor::mode(Tensor* A, Tensor *B, ReduceDescriptor2 *rd){ fpga_mode(A, B, rd); } #endif + + PROFILING_FOOTER(mode); } @@ -1064,6 +1006,9 @@ Tensor* Tensor::abs(){ } void Tensor::abs(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(abs); + if (A->isCPU() && B->isCPU()) { cpu_abs(A, B); } @@ -1079,6 +1024,8 @@ void Tensor::abs(Tensor *A, Tensor *B){ fpga_abs(A, B); } #endif + + PROFILING_FOOTER(abs); } @@ -1093,6 +1040,9 @@ Tensor* Tensor::acos(){ } void Tensor::acos(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(acos); + if (A->isCPU() && B->isCPU()) { cpu_acos(A, B); } @@ -1108,6 +1058,8 @@ void Tensor::acos(Tensor *A, Tensor *B){ fpga_acos(A, B); } #endif + + PROFILING_FOOTER(acos); } @@ -1132,6 +1084,8 @@ Tensor* Tensor::add(Tensor* A){ } void Tensor::add(Tensor *A, Tensor *B, float v){ + PROFILING_HEADER_EXTERN(add); + if (A->isCPU() && B->isCPU()) { cpu_add(A, B, v); } 
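The pattern applied across these tensor_math.cpp hunks is uniform: the per-operation counters are no longer defined in this file, and every dispatch function brackets its device-specific branch with PROFILING_HEADER_EXTERN(op) at entry and PROFILING_FOOTER(op) at exit, with the counters themselves defined once in src/utils.cpp. The sketch below shows how those pieces are meant to fit together; PROFILING_ENABLE and PROFILING_HEADER_EXTERN follow the definitions this patch adds to include/eddl/profiling.h, while the PROFILING_FOOTER and PROFILING_PRINTF bodies are not part of this excerpt, so the versions written here (microsecond accumulation plus a call counter) are illustrative assumptions only, not the library's definitions.

    // Sketch only. The footer/printf bodies below are assumptions for illustration.
    #include <sys/time.h>
    #include <cstdio>

    #define PROFILING_ENABLE(fn) \
      unsigned long long prof_##fn##_time; \
      unsigned long long prof_##fn##_calls;

    #define PROFILING_HEADER_EXTERN(fn) \
      extern unsigned long long prof_##fn##_time; \
      extern unsigned long long prof_##fn##_calls; \
      struct timeval prof_t1; \
      gettimeofday(&prof_t1, NULL);

    // Assumed footer: accumulate elapsed microseconds and bump the call count.
    #define PROFILING_FOOTER(fn) \
      { struct timeval prof_t2; \
        gettimeofday(&prof_t2, NULL); \
        prof_##fn##_time += (prof_t2.tv_sec - prof_t1.tv_sec) * 1000000ULL \
                          + (prof_t2.tv_usec - prof_t1.tv_usec); \
        prof_##fn##_calls++; }

    // Assumed report helper, one line per operation.
    #define PROFILING_PRINTF(fn) \
      printf("%-16s calls %10llu time %12llu us\n", #fn, prof_##fn##_calls, prof_##fn##_time);

    // Counters are defined once (in the library this lives in src/utils.cpp):
    PROFILING_ENABLE(maximum);

    // Every dispatcher then references the shared counters:
    void maximum_dispatch_example() {
      PROFILING_HEADER_EXTERN(maximum);
      // ... cpu_maximum / gpu_maximum / fpga_maximum branch would run here ...
      PROFILING_FOOTER(maximum);
    }

    int main() {
      maximum_dispatch_example();
      PROFILING_PRINTF(maximum);   // what __show_profile() does for each operation
      return 0;
    }

At the API level, eddl::show_profile() simply prints "profile:" and forwards to __show_profile(), which runs PROFILING_PRINTF over every registered operation, so an application can call it once after training to dump the accumulated per-operation call counts and times.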
@@ -1147,6 +1101,8 @@ void Tensor::add(Tensor *A, Tensor *B, float v){ fpga_add(A, B, v); } #endif + + PROFILING_FOOTER(add); } @@ -1162,6 +1118,9 @@ Tensor* Tensor::asin(){ } void Tensor::asin(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(asin); + if (A->isCPU() && B->isCPU()) { cpu_asin(A, B); } @@ -1177,6 +1136,8 @@ void Tensor::asin(Tensor *A, Tensor *B){ fpga_asin(A, B); } #endif + + PROFILING_FOOTER(asin); } @@ -1193,6 +1154,9 @@ Tensor* Tensor::atan(){ void Tensor::atan(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(atan); + if (A->isCPU() && B->isCPU()) { cpu_atan(A, B); } @@ -1208,6 +1172,8 @@ void Tensor::atan(Tensor *A, Tensor *B){ fpga_atan(A, B); } #endif + + PROFILING_FOOTER(atan); } @@ -1224,6 +1190,9 @@ Tensor* Tensor::ceil(){ void Tensor::ceil(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(ceil); + if (A->isCPU() && B->isCPU()) { cpu_ceil(A, B); } @@ -1239,6 +1208,8 @@ void Tensor::ceil(Tensor *A, Tensor *B){ fpga_ceil(A, B); } #endif + + PROFILING_FOOTER(ceil); } @@ -1255,6 +1226,9 @@ Tensor* Tensor::clamp(float min, float max){ void Tensor::clamp(Tensor *A, Tensor *B, float min, float max){ + + PROFILING_HEADER_EXTERN(clamp); + if (A->isCPU() && B->isCPU()) { cpu_clamp(A, B, min, max); } @@ -1270,6 +1244,8 @@ void Tensor::clamp(Tensor *A, Tensor *B, float min, float max){ fpga_clamp(A, B, min, max); } #endif + + PROFILING_FOOTER(clamp); } @@ -1319,6 +1295,9 @@ Tensor* Tensor::cos(){ void Tensor::cos(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(cos); + if (A->isCPU() && B->isCPU()) { cpu_cos(A, B); } @@ -1334,6 +1313,8 @@ void Tensor::cos(Tensor *A, Tensor *B){ fpga_cos(A, B); } #endif + + PROFILING_FOOTER(cos); } @@ -1349,6 +1330,9 @@ Tensor* Tensor::cosh(){ } void Tensor::cosh(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(cosh); + if (A->isCPU() && B->isCPU()) { cpu_cosh(A, B); } @@ -1364,6 +1348,8 @@ void Tensor::cosh(Tensor *A, Tensor *B){ fpga_cosh(A, B); } #endif + + PROFILING_FOOTER(cosh); } @@ -1409,6 +1395,9 @@ Tensor* Tensor::exp(){ void Tensor::exp(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(exp); + if (A->isCPU() && B->isCPU()) { cpu_exp(A, B); } @@ -1424,6 +1413,8 @@ void Tensor::exp(Tensor *A, Tensor *B){ fpga_exp(A, B); } #endif + + PROFILING_FOOTER(exp); } @@ -1440,6 +1431,9 @@ Tensor* Tensor::floor(){ void Tensor::floor(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(floor); + if (A->isCPU() && B->isCPU()) { cpu_floor(A, B); } @@ -1455,6 +1449,8 @@ void Tensor::floor(Tensor *A, Tensor *B){ fpga_floor(A, B); } #endif + + PROFILING_FOOTER(floor); } @@ -1471,6 +1467,9 @@ Tensor* Tensor::inv(float v){ void Tensor::inv(Tensor *A, Tensor *B, float v){ + + PROFILING_HEADER_EXTERN(inv); + if (A->isCPU() && B->isCPU()) { cpu_inv(A, B, v); } @@ -1486,6 +1485,8 @@ void Tensor::inv(Tensor *A, Tensor *B, float v){ fpga_inv(A, B, v); } #endif + + PROFILING_FOOTER(inv); } @@ -1502,6 +1503,9 @@ Tensor* Tensor::log(){ void Tensor::log(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(log); + if (A->isCPU() && B->isCPU()) { cpu_log(A, B); } @@ -1517,6 +1521,8 @@ void Tensor::log(Tensor *A, Tensor *B){ fpga_log(A, B); } #endif + + PROFILING_FOOTER(log); } @@ -1533,6 +1539,9 @@ Tensor* Tensor::log2(){ void Tensor::log2(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(log2); + if (A->isCPU() && B->isCPU()) { cpu_log2(A, B); } @@ -1548,6 +1557,8 @@ void Tensor::log2(Tensor *A, Tensor *B){ fpga_log2(A, B); } #endif + + PROFILING_FOOTER(log2); } @@ -1564,6 +1575,9 @@ Tensor* Tensor::log10(){ void Tensor::log10(Tensor *A, Tensor *B){ + + 
PROFILING_HEADER_EXTERN(log10); + if (A->isCPU() && B->isCPU()) { cpu_log10(A, B); } @@ -1579,6 +1593,8 @@ void Tensor::log10(Tensor *A, Tensor *B){ fpga_log10(A, B); } #endif + + PROFILING_FOOTER(log10); } @@ -1595,6 +1611,9 @@ Tensor* Tensor::logn(float n){ void Tensor::logn(Tensor *A, Tensor *B, float n){ + + PROFILING_HEADER_EXTERN(logn); + if (A->isCPU() && B->isCPU()) { cpu_logn(A, B, n); } @@ -1610,6 +1629,8 @@ void Tensor::logn(Tensor *A, Tensor *B, float n){ fpga_logn(A, B, n); } #endif + + PROFILING_FOOTER(logn); } @@ -1626,6 +1647,9 @@ Tensor* Tensor::mod(float v){ void Tensor::mod(Tensor *A, Tensor *B, float v){ + + PROFILING_HEADER_EXTERN(mod); + if (A->isCPU() && B->isCPU()) { cpu_mod(A, B, v); } @@ -1641,6 +1665,8 @@ void Tensor::mod(Tensor *A, Tensor *B, float v){ fpga_mod(A, B, v); } #endif + + PROFILING_FOOTER(mod); } @@ -1669,6 +1695,9 @@ Tensor* Tensor::mult(Tensor* A){ void Tensor::mult(Tensor *A, Tensor *B, float v){ + + PROFILING_HEADER_EXTERN(mult); + if (A->isCPU() && B->isCPU()) { cpu_mult(A, B, v); } @@ -1684,6 +1713,8 @@ void Tensor::mult(Tensor *A, Tensor *B, float v){ fpga_mult(A, B, v); } #endif + + PROFILING_FOOTER(mult); } @@ -1717,6 +1748,9 @@ Tensor* Tensor::normalize(float min, float max){ void Tensor::normalize(Tensor *A, Tensor *B, float min, float max){ + + PROFILING_HEADER_EXTERN(normalize); + if (A->isCPU() && B->isCPU()) { cpu_normalize(A, B, min, max); } @@ -1732,6 +1766,8 @@ void Tensor::normalize(Tensor *A, Tensor *B, float min, float max){ fpga_normalize(A, B, min, max); } #endif + + PROFILING_FOOTER(normalize); } @@ -1748,6 +1784,9 @@ Tensor* Tensor::pow(float exp){ void Tensor::pow(Tensor *A, Tensor *B, float exp){ + + PROFILING_HEADER_EXTERN(pow); + if (A->isCPU() && B->isCPU()) { cpu_pow(A, B, exp); } @@ -1763,6 +1802,8 @@ void Tensor::pow(Tensor *A, Tensor *B, float exp){ fpga_pow(A, B, exp); } #endif + + PROFILING_FOOTER(pow); } @@ -1779,6 +1820,9 @@ Tensor* Tensor::powb(float base){ void Tensor::powb(Tensor *A, Tensor *B, float base){ + + PROFILING_HEADER_EXTERN(powb); + if (A->isCPU() && B->isCPU()) { cpu_powb(A, B, base); } @@ -1794,6 +1838,8 @@ void Tensor::powb(Tensor *A, Tensor *B, float base){ fpga_powb(A, B, base); } #endif + + PROFILING_FOOTER(powb); } @@ -1827,6 +1873,9 @@ Tensor* Tensor::remainder(float v){ void Tensor::remainder(Tensor *A, Tensor *B, float v){ + + PROFILING_HEADER_EXTERN(remainder); + if (A->isCPU() && B->isCPU()) { cpu_remainder(A, B, v); } @@ -1842,6 +1891,9 @@ void Tensor::remainder(Tensor *A, Tensor *B, float v){ fpga_remainder(A, B, v); } #endif + + + PROFILING_FOOTER(remainder); } @@ -1858,6 +1910,9 @@ Tensor* Tensor::round(){ void Tensor::round(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(round); + if (A->isCPU() && B->isCPU()) { cpu_round(A, B); } @@ -1873,6 +1928,8 @@ void Tensor::round(Tensor *A, Tensor *B){ fpga_round(A, B); } #endif + + PROFILING_FOOTER(round); } @@ -1889,6 +1946,9 @@ Tensor* Tensor::rsqrt(){ void Tensor::rsqrt(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(rsqrt); + if (A->isCPU() && B->isCPU()) { cpu_rsqrt(A, B); } @@ -1904,6 +1964,8 @@ void Tensor::rsqrt(Tensor *A, Tensor *B){ fpga_rsqrt(A, B); } #endif + + PROFILING_FOOTER(rsqrt); } @@ -1920,6 +1982,9 @@ Tensor* Tensor::sigmoid(){ void Tensor::sigmoid(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(sigmoid); + if (A->isCPU() && B->isCPU()) { cpu_sigmoid(A, B); } @@ -1935,6 +2000,8 @@ void Tensor::sigmoid(Tensor *A, Tensor *B){ fpga_sigmoid(A, B); } #endif + + PROFILING_FOOTER(sigmoid); } @@ -1951,6 +2018,9 @@ 
Tensor* Tensor::sign(float zero_sign){ void Tensor::sign(Tensor *A, Tensor *B, float zero_sign) { + + PROFILING_HEADER_EXTERN(sign); + if (A->isCPU() && B->isCPU()) { cpu_sign(A, B, zero_sign); } @@ -1966,6 +2036,8 @@ void Tensor::sign(Tensor *A, Tensor *B, float zero_sign) { fpga_sign(A, B, zero_sign); } #endif + + PROFILING_FOOTER(sign); } @@ -1982,6 +2054,9 @@ Tensor* Tensor::sin(){ void Tensor::sin(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(sin); + if (A->isCPU() && B->isCPU()) { cpu_sin(A, B); } @@ -1997,6 +2072,8 @@ void Tensor::sin(Tensor *A, Tensor *B){ fpga_sin(A, B); } #endif + + PROFILING_FOOTER(sin); } @@ -2013,6 +2090,9 @@ Tensor* Tensor::sinh(){ void Tensor::sinh(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(sinh); + if (A->isCPU() && B->isCPU()) { cpu_sinh(A, B); } @@ -2028,6 +2108,8 @@ void Tensor::sinh(Tensor *A, Tensor *B){ fpga_sinh(A, B); } #endif + + PROFILING_FOOTER(sinh); } @@ -2044,6 +2126,9 @@ Tensor* Tensor::sqr(){ void Tensor::sqr(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(sqr); + if (A->isCPU() && B->isCPU()) { cpu_sqr(A, B); } @@ -2059,6 +2144,8 @@ void Tensor::sqr(Tensor *A, Tensor *B){ fpga_sqr(A, B); } #endif + + PROFILING_FOOTER(sqr); } @@ -2075,6 +2162,9 @@ Tensor* Tensor::sqrt(){ void Tensor::sqrt(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(sqrt); + if (A->isCPU() && B->isCPU()) { cpu_sqrt(A, B); } @@ -2090,6 +2180,8 @@ void Tensor::sqrt(Tensor *A, Tensor *B){ fpga_sqrt(A, B); } #endif + + PROFILING_FOOTER(sqrt); } @@ -2135,6 +2227,9 @@ Tensor* Tensor::tan(){ void Tensor::tan(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(tan); + if (A->isCPU() && B->isCPU()) { cpu_tan(A, B); } @@ -2150,6 +2245,8 @@ void Tensor::tan(Tensor *A, Tensor *B){ fpga_tan(A, B); } #endif + + PROFILING_FOOTER(tan); } @@ -2166,6 +2263,9 @@ Tensor* Tensor::tanh(){ void Tensor::tanh(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(tanh); + if (A->isCPU() && B->isCPU()) { cpu_tanh(A, B); } @@ -2181,6 +2281,8 @@ void Tensor::tanh(Tensor *A, Tensor *B){ fpga_tanh(A, B); } #endif + + PROFILING_FOOTER(tanh); } @@ -2197,6 +2299,9 @@ Tensor* Tensor::trunc(){ void Tensor::trunc(Tensor *A, Tensor *B){ + + PROFILING_HEADER_EXTERN(trunc); + if (A->isCPU() && B->isCPU()) { cpu_trunc(A, B); } @@ -2212,6 +2317,8 @@ void Tensor::trunc(Tensor *A, Tensor *B){ fpga_trunc(A, B); } #endif + + PROFILING_FOOTER(trunc); } @@ -2405,6 +2512,7 @@ void Tensor::add(float scA, Tensor *A, float scB, Tensor *B, Tensor *C, int incC /////////////////////////////////////// int aux = 0; + PROFILING_HEADER_EXTERN(add); if ((A->device != B->device) || (A->device != C->device)) msg("Tensors in different devices", "Tensor::add_"); if ((!sameShape(A, B)) || (!sameShape(A, C))) { @@ -2432,12 +2540,16 @@ void Tensor::add(float scA, Tensor *A, float scB, Tensor *B, Tensor *C, int incC #endif C->tsem->unlock(); + + PROFILING_FOOTER(add); } void Tensor::inc(Tensor *A, Tensor *B) { // TODO: Review against add + PROFILING_HEADER_EXTERN(inc); + if (!Tensor::sameShape(A, B)) msg("Tensors with different shape", "Tensor::inc"); @@ -2467,10 +2579,9 @@ void Tensor::inc(Tensor *A, Tensor *B) { Tensor::add(1,n,1,B,B,0); delete n; } -} - - + PROFILING_FOOTER(inc); +} void Tensor::el_div(Tensor *A, Tensor *B, Tensor *C, int incC) { /////////////////////////////////////// @@ -2482,6 +2593,8 @@ void Tensor::el_div(Tensor *A, Tensor *B, Tensor *C, int incC) { if ((A->device != B->device) || (A->device != C->device)) msg("Tensors in different devices", "Tensor::el_div"); if ((!sameShape(A, B)) || 
(!sameShape(A, C))) msg("Incompatible dims", "Tensor::el_div"); + PROFILING_HEADER_EXTERN(el_div); + C->tsem->lock(); if (A->isCPU()) { cpu_el_div(A, B, C, incC); @@ -2499,6 +2612,8 @@ void Tensor::el_div(Tensor *A, Tensor *B, Tensor *C, int incC) { } #endif C->tsem->unlock(); + + PROFILING_FOOTER(el_div); } @@ -2561,6 +2676,9 @@ void Tensor::el_mult(Tensor *A, Tensor *B, Tensor *C, int incC) { //// incC 1 means C+=A.*B (increment over C) //// Dimensions must be compatible /////////////////////////////////////// + + PROFILING_HEADER_EXTERN(el_mult); + C->tsem->lock(); if ((A->device != B->device) || (A->device != C->device)) msg("Tensors in different devices", "Tensor::el_mult"); if ((!sameShape(A, B)) || (!sameShape(A, C))) { @@ -2586,6 +2704,8 @@ void Tensor::el_mult(Tensor *A, Tensor *B, Tensor *C, int incC) { } #endif C->tsem->unlock(); + + PROFILING_FOOTER(el_mult); } @@ -2601,7 +2721,7 @@ void Tensor::sum2D_rowwise(Tensor *A, Tensor *B, Tensor *C) { if ((A->ndim != 2) || (B->ndim != 1) || (C->ndim != 2)) msg("sum2D_rowwise dims"); if ((!sameShape(A, C)) || (A->shape[1] != B->shape[0])) msg("Incompatible dims", "Tensor::sum2D_rowwise"); - PROFILING_HEADER(sum2D_rowwise); + PROFILING_HEADER_EXTERN(sum2D_rowwise); C->tsem->lock(); if (A->isCPU()) { @@ -2637,6 +2757,8 @@ void Tensor::reduce_sum2D(Tensor *A, Tensor *B, int axis, int incB) { if ((A->ndim - 1) != B->ndim) msg("Incorrect dims", "Tensor::reduce_sum2D"); if ((A->shape[1 - axis] != B->shape[0])) msg("Incompatible dims", "Tensor::reduce_sum2D"); + PROFILING_HEADER_EXTERN(reduce_sum2D); + B->tsem->lock(); if (A->isCPU()) { cpu_reduce_sum2D(A, B, axis, incB); @@ -2654,6 +2776,8 @@ void Tensor::reduce_sum2D(Tensor *A, Tensor *B, int axis, int incB) { } #endif B->tsem->unlock(); + + PROFILING_FOOTER(reduce_sum2D); } void Tensor::sum2D_colwise(Tensor *A, Tensor *B, Tensor *C) { @@ -2668,6 +2792,8 @@ void Tensor::sum2D_colwise(Tensor *A, Tensor *B, Tensor *C) { if ((A->ndim != 2) || (B->ndim != 1) || (C->ndim != 2)) msg("sum2D_colwise dims"); if ((!sameShape(A, C)) || (A->shape[0] != B->shape[0])) msg("Incompatible dims", "Tensor::sum2D_colwise"); + PROFILING_HEADER_EXTERN(sum2D_colwise); + C->tsem->lock(); if (A->isCPU()) { cpu_sum2D_colwise(A, B, C); @@ -2685,4 +2811,6 @@ void Tensor::sum2D_colwise(Tensor *A, Tensor *B, Tensor *C) { } #endif C->tsem->unlock(); + + PROFILING_FOOTER(sum2D_colwise); } diff --git a/src/utils.cpp b/src/utils.cpp index c61bac82d..0419e271b 100755 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -31,6 +31,7 @@ #include "eddl/system_info.h" #include "eddl/utils.h" +#include "eddl/profiling.h" #ifdef EDDL_LINUX #include "sys/mman.h" @@ -418,3 +419,137 @@ WrappingMode getWrappingMode(string mode){ return WrappingMode::Constant; } } + +// --------------------------------------------------------------------------------------------- +// Profiling + +// profiling declarations +PROFILING_ENABLE(maximum); +PROFILING_ENABLE(minimum); +PROFILING_ENABLE(max); +PROFILING_ENABLE(argmax); +PROFILING_ENABLE(argmax_d); +PROFILING_ENABLE(min); +PROFILING_ENABLE(argmin); +PROFILING_ENABLE(sum); +PROFILING_ENABLE(sum_abs); +PROFILING_ENABLE(prod); +PROFILING_ENABLE(mean); +PROFILING_ENABLE(median); +PROFILING_ENABLE(std); +PROFILING_ENABLE(var); +PROFILING_ENABLE(mode); +PROFILING_ENABLE(abs); +PROFILING_ENABLE(acos); +PROFILING_ENABLE(add); +PROFILING_ENABLE(asin); +PROFILING_ENABLE(atan); +PROFILING_ENABLE(cell); +PROFILING_ENABLE(clamp); +PROFILING_ENABLE(clampmax); +PROFILING_ENABLE(clampmin); +PROFILING_ENABLE(cos); 
+PROFILING_ENABLE(cosh); +PROFILING_ENABLE(div); +PROFILING_ENABLE(exp); +PROFILING_ENABLE(floor); +PROFILING_ENABLE(inv); +PROFILING_ENABLE(log); +PROFILING_ENABLE(log2); +PROFILING_ENABLE(log10); +PROFILING_ENABLE(logn); +PROFILING_ENABLE(mod); +PROFILING_ENABLE(mult); +PROFILING_ENABLE(neg); +PROFILING_ENABLE(normalize); +PROFILING_ENABLE(pow); +PROFILING_ENABLE(powb); +PROFILING_ENABLE(reciprocal); +PROFILING_ENABLE(remainder); +PROFILING_ENABLE(round); +PROFILING_ENABLE(rsqrt); +PROFILING_ENABLE(sigmoid); +PROFILING_ENABLE(sign); +PROFILING_ENABLE(sin); +PROFILING_ENABLE(sinh); +PROFILING_ENABLE(sqr); +PROFILING_ENABLE(sqrt); +PROFILING_ENABLE(sub); +PROFILING_ENABLE(tan); +PROFILING_ENABLE(tanh); +PROFILING_ENABLE(trunc); +PROFILING_ENABLE(inc); +PROFILING_ENABLE(el_div); +PROFILING_ENABLE(mult2D); +PROFILING_ENABLE(el_mult); +PROFILING_ENABLE(sum2D_rowwise); +PROFILING_ENABLE(reduce_sum2D); +PROFILING_ENABLE(sum2D_colwise); +PROFILING_ENABLE(ceil); + +void __show_profile() { + + // profiling declarations + PROFILING_PRINTF(maximum); + PROFILING_PRINTF(minimum); + PROFILING_PRINTF(max); + PROFILING_PRINTF(argmax); + PROFILING_PRINTF(argmax_d); + PROFILING_PRINTF(min); + PROFILING_PRINTF(argmin); + PROFILING_PRINTF(sum); + PROFILING_PRINTF(sum_abs); + PROFILING_PRINTF(prod); + PROFILING_PRINTF(mean); + PROFILING_PRINTF(median); + PROFILING_PRINTF(std); + PROFILING_PRINTF(var); + PROFILING_PRINTF(mode); + PROFILING_PRINTF(abs); + PROFILING_PRINTF(acos); + PROFILING_PRINTF(add); + PROFILING_PRINTF(asin); + PROFILING_PRINTF(atan); + PROFILING_PRINTF(cell); + PROFILING_PRINTF(clamp); + PROFILING_PRINTF(clampmax); + PROFILING_PRINTF(clampmin); + PROFILING_PRINTF(cos); + PROFILING_PRINTF(cosh); + PROFILING_PRINTF(div); + PROFILING_PRINTF(exp); + PROFILING_PRINTF(floor); + PROFILING_PRINTF(inv); + PROFILING_PRINTF(log); + PROFILING_PRINTF(log2); + PROFILING_PRINTF(log10); + PROFILING_PRINTF(logn); + PROFILING_PRINTF(mod); + PROFILING_PRINTF(mult); + PROFILING_PRINTF(neg); + PROFILING_PRINTF(normalize); + PROFILING_PRINTF(pow); + PROFILING_PRINTF(powb); + PROFILING_PRINTF(reciprocal); + PROFILING_PRINTF(remainder); + PROFILING_PRINTF(round); + PROFILING_PRINTF(rsqrt); + PROFILING_PRINTF(sigmoid); + PROFILING_PRINTF(sign); + PROFILING_PRINTF(sin); + PROFILING_PRINTF(sinh); + PROFILING_PRINTF(sqr); + PROFILING_PRINTF(sqrt); + PROFILING_PRINTF(sub); + PROFILING_PRINTF(tan); + PROFILING_PRINTF(tanh); + PROFILING_PRINTF(trunc); + PROFILING_PRINTF(inc); + PROFILING_PRINTF(el_div); + PROFILING_PRINTF(mult2D); + PROFILING_PRINTF(el_mult); + PROFILING_PRINTF(sum2D_rowwise); + PROFILING_PRINTF(reduce_sum2D); + PROFILING_PRINTF(sum2D_colwise); + PROFILING_PRINTF(ceil); +} \ No newline at end of file From 9cc6d9042877153dd226098cf8e7bce311734ca1 Mon Sep 17 00:00:00 2001 From: Jose Flich Date: Mon, 2 Nov 2020 13:57:24 +0100 Subject: [PATCH 10/15] UPV-GAP: added profiling support for data augmentation operations --- src/tensor/tensor_da.cpp | 57 ++++++++++++++++++++++++++++++++++++++++ src/utils.cpp | 32 ++++++++++++++++++++++ 2 files changed, 89 insertions(+) diff --git a/src/tensor/tensor_da.cpp b/src/tensor/tensor_da.cpp index 410f00811..3d74422d5 100644 --- a/src/tensor/tensor_da.cpp +++ b/src/tensor/tensor_da.cpp @@ -14,6 +14,7 @@ #include "eddl/tensor/tensor.h" #include "eddl/hardware/cpu/cpu_tensor.h" +#include "eddl/profiling.h" #ifdef cGPU #include "eddl/hardware/gpu/gpu_tensor.h" @@ -48,6 +49,8 @@ void Tensor::shift(Tensor *A, Tensor *B, vector shift, WrappingMode mode, f msg("This method 
requires two 4D tensors", "Tensor::shift"); } + PROFILING_HEADER_EXTERN(shift); + if (A->isCPU()) { cpu_shift(A, B, std::move(shift), mode, cval); } @@ -62,6 +65,8 @@ void Tensor::shift(Tensor *A, Tensor *B, vector shift, WrappingMode mode, f fpga_shift(A, B, std::move(shift), mode, cval); } #endif + + PROFILING_FOOTER(shift); } Tensor* Tensor::rotate(float angle, vector offset_center, WrappingMode mode, float cval){ @@ -78,6 +83,8 @@ void Tensor::rotate(Tensor *A, Tensor *B, float angle, vector offset_center msg("This method requires two 4D tensors", "Tensor::rotate"); } + PROFILING_HEADER_EXTERN(rotate); + if (A->isCPU()) { cpu_rotate(A, B, angle, std::move(offset_center), mode, cval); } @@ -92,6 +99,8 @@ void Tensor::rotate(Tensor *A, Tensor *B, float angle, vector offset_center fpga_rotate(A, B, angle, std::move(offset_center), mode, cval); } #endif + + PROFILING_FOOTER(rotate); } Tensor* Tensor::scale(vector new_shape, WrappingMode mode, float cval, bool keep_size) { @@ -120,6 +129,8 @@ void Tensor::scale(Tensor *A, Tensor *B, vector new_shape, WrappingMode mod msg("This method requires two 4D tensors", "Tensor::scale"); } + PROFILING_HEADER_EXTERN(scale); + if (A->isCPU()) { cpu_scale(A, B, std::move(new_shape), mode, cval); } @@ -134,6 +145,8 @@ void Tensor::scale(Tensor *A, Tensor *B, vector new_shape, WrappingMode mod fpga_scale(A, B, std::move(new_shape), mode, cval); } #endif + + PROFILING_FOOTER(scale); } @@ -156,6 +169,8 @@ void Tensor::flip(Tensor *A, Tensor *B, int axis) { msg("This method requires two 4D tensors", "Tensor::flip"); } + PROFILING_HEADER_EXTERN(flip); + if (A->isCPU()) { cpu_flip(A, B, axis); } @@ -170,6 +185,8 @@ void Tensor::flip(Tensor *A, Tensor *B, int axis) { fpga_flip(A, B, axis); } #endif + + PROFILING_FOOTER(flip); } Tensor* Tensor::crop(vector coords_from, vector coords_to, float cval, bool keep_size){ @@ -200,6 +217,8 @@ void Tensor::crop(Tensor *A, Tensor *B, vector coords_from, vector coo msg("This method requires two 4D tensors", "Tensor::crop"); } + PROFILING_HEADER_EXTERN(crop); + if (A->isCPU()) { cpu_crop(A, B, std::move(coords_from), std::move(coords_to), cval, false); } @@ -214,6 +233,8 @@ void Tensor::crop(Tensor *A, Tensor *B, vector coords_from, vector coo fpga_crop(A, B, std::move(coords_from), std::move(coords_to), cval, false); } #endif + + PROFILING_FOOTER(crop); } Tensor* Tensor::crop_scale(vector coords_from, vector coords_to, WrappingMode mode, float cval){ @@ -237,6 +258,8 @@ void Tensor::crop_scale(Tensor *A, Tensor *B, vector coords_from, vectorisCPU()) { cpu_crop_scale(A, B, std::move(coords_from), std::move(coords_to), mode, cval); } @@ -251,6 +274,8 @@ void Tensor::crop_scale(Tensor *A, Tensor *B, vector coords_from, vector coords_from, vector c msg("This method requires two 4D tensors", "Tensor::cutout"); } + PROFILING_HEADER_EXTERN(cutout); + if (A->isCPU()) { cpu_crop(A, B, std::move(coords_from), std::move(coords_to), cval, true); } @@ -291,6 +318,8 @@ void Tensor::cutout(Tensor *A, Tensor *B, vector coords_from, vector c fpga_crop(A, B, std::move(coords_from), std::move(coords_to), cval, true); } #endif + + PROFILING_FOOTER(cutout); } @@ -318,6 +347,8 @@ void Tensor::shift_random(Tensor *A, Tensor *B, vector factor_x, vectorisCPU()) { cpu_shift_random(A, B, std::move(factor_x), std::move(factor_y), mode, cval); } @@ -332,6 +363,8 @@ void Tensor::shift_random(Tensor *A, Tensor *B, vector factor_x, vector factor, vectorisCPU()) { cpu_rotate_random(A, B, std::move(factor), std::move(offset_center), mode, cval); } @@ -363,6 
+398,8 @@ void Tensor::rotate_random(Tensor *A, Tensor *B, vector factor, vector factor, WrappingMode mode, float cval){ @@ -384,6 +421,8 @@ void Tensor::scale_random(Tensor *A, Tensor *B, vector factor, WrappingMo msg("This method requires two 4D tensors", "Tensor::scale_random"); } + PROFILING_HEADER_EXTERN(scale_random); + if (A->isCPU()) { cpu_scale_random(A, B, std::move(factor), mode, cval); } @@ -398,6 +437,8 @@ void Tensor::scale_random(Tensor *A, Tensor *B, vector factor, WrappingMo fpga_scale_random(A, B, std::move(factor), mode, cval); } #endif + + PROFILING_FOOTER(scale_random); } @@ -420,6 +461,8 @@ void Tensor::flip_random(Tensor *A, Tensor *B, int axis) { msg("This method requires two 4D tensors", "Tensor::flip_random"); } + PROFILING_HEADER_EXTERN(flip_random); + if (A->isCPU()) { cpu_flip_random(A, B, axis); } @@ -434,6 +477,8 @@ void Tensor::flip_random(Tensor *A, Tensor *B, int axis) { fpga_flip_random(A, B, axis); } #endif + + PROFILING_FOOTER(flip_random); } Tensor* Tensor::crop_random(int height, int width, float cval, bool keep_size){ @@ -464,6 +509,8 @@ void Tensor::crop_random(Tensor *A, Tensor *B) { msg("This method requires two 4D tensors", "Tensor::crop_random"); } + PROFILING_HEADER_EXTERN(crop_random); + if (A->isCPU()) { cpu_crop_random(A, B); } @@ -478,6 +525,8 @@ void Tensor::crop_random(Tensor *A, Tensor *B) { fpga_crop_random(A, B); } #endif + + PROFILING_FOOTER(crop_random); } Tensor* Tensor::crop_scale_random(vector factor, WrappingMode mode, float cval){ @@ -498,6 +547,8 @@ void Tensor::crop_scale_random(Tensor *A, Tensor *B, vector factor, Wrapp msg("This method requires two 4D tensors", "Tensor::crop_scale_random"); } + PROFILING_HEADER_EXTERN(crop_scale_random); + if (A->isCPU()) { cpu_crop_scale_random(A, B, std::move(factor), mode, cval); } @@ -512,6 +563,8 @@ void Tensor::crop_scale_random(Tensor *A, Tensor *B, vector factor, Wrapp fpga_crop_scale_random(A, B, std::move(factor), mode, cval); } #endif + + PROFILING_FOOTER(crop_scale_random); } Tensor* Tensor::cutout_random(vector factor_x, vector factor_y, float cval){ @@ -536,6 +589,8 @@ void Tensor::cutout_random(Tensor *A, Tensor *B, vector factor_x, vector< msg("This method requires two 4D tensors", "Tensor::cutout_random"); } + PROFILING_HEADER_EXTERN(cutout_random); + if (A->isCPU()) { cpu_cutout_random(A, B, std::move(factor_x), std::move(factor_y), cval); } @@ -550,4 +605,6 @@ void Tensor::cutout_random(Tensor *A, Tensor *B, vector factor_x, vector< fpga_cutout_random(A, B, std::move(factor_x), std::move(factor_y), cval); } #endif + + PROFILING_FOOTER(cutout_random); } diff --git a/src/utils.cpp b/src/utils.cpp index 0419e271b..a661eb31c 100755 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -486,6 +486,22 @@ PROFILING_ENABLE(sum2D_rowwise); PROFILING_ENABLE(reduce_sum2D); PROFILING_ENABLE(sum2D_colwise); PROFILING_ENABLE(ceil); +// da +PROFILING_ENABLE(shift); +PROFILING_ENABLE(rotate); +PROFILING_ENABLE(scale); +PROFILING_ENABLE(flip); +PROFILING_ENABLE(crop); +PROFILING_ENABLE(crop_scale); +PROFILING_ENABLE(cutout); +PROFILING_ENABLE(shift_random); +PROFILING_ENABLE(rotate_random); +PROFILING_ENABLE(scale_random); +PROFILING_ENABLE(flip_random); +PROFILING_ENABLE(crop_random); +PROFILING_ENABLE(crop_scale_random); +PROFILING_ENABLE(cutout_random); + void __show_profile() { @@ -552,4 +568,20 @@ void __show_profile() { PROFILING_PRINTF(reduce_sum2D); PROFILING_PRINTF(sum2D_colwise); PROFILING_PRINTF(ceil); + // da + PROFILING_PRINTF(shift); + PROFILING_PRINTF(rotate); + 
PROFILING_PRINTF(scale); + PROFILING_PRINTF(flip); + PROFILING_PRINTF(crop); + PROFILING_PRINTF(crop_scale); + PROFILING_PRINTF(cutout); + PROFILING_PRINTF(shift_random); + PROFILING_PRINTF(rotate_random); + PROFILING_PRINTF(scale_random); + PROFILING_PRINTF(flip_random); + PROFILING_PRINTF(crop_random); + PROFILING_PRINTF(crop_scale_random); + PROFILING_PRINTF(cutout_random); + } \ No newline at end of file From e4005d888928df9d6d042f47af7481e9898838cc Mon Sep 17 00:00:00 2001 From: jorga20j Date: Tue, 3 Nov 2020 07:48:19 +0000 Subject: [PATCH 11/15] adding relu support to convolution --- .../kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp | 21 ++++++++++------ .../src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp | 25 ++++++++++++------- 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/fpga_kernels/kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp b/fpga_kernels/kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp index 1545e91e4..898b07c53 100644 --- a/fpga_kernels/kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp +++ b/fpga_kernels/kernel_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp @@ -225,17 +225,19 @@ for (int cpi=0; cpi &in, hls::stream &out) { +static void relu(int flag_relu, int H, int W, hls::stream &in, hls::stream &out) { #ifdef DEBUG_VERBOSE printf("relu: start\n"); #endif - - int data_size = W * H * O; + pixel_out_t data; + int data_size = W * H; for (int i=0; i < data_size; i++) { #pragma HLS PIPELINE II=1 - data_type data = in.read(); - if (data < 0) data = 0.f; + data = in.read(); + for(int cpo = 0; cpo &in, hls::str } -void k_cn2D_K3x3_S1x1_P1x1_BS1_ap_2(pixel_in_t *ptr_data, int H, int W, int I, data_type *ptr_kernel, data_type *ptr_bias, pixel_out_t *ptr_out, int O, int offset_bias, int offset_kernel, int offset_data_out) { +void k_cn2D_K3x3_S1x1_P1x1_BS1_ap_2(pixel_in_t *ptr_data, int H, int W, int I, data_type *ptr_kernel, data_type *ptr_bias, pixel_out_t *ptr_out, int O, int offset_bias, int offset_kernel, int offset_data_out, int flag_relu) { #pragma HLS INTERFACE s_axilite port=W bundle=control #pragma HLS INTERFACE s_axilite port=H bundle=control @@ -633,6 +635,7 @@ void k_cn2D_K3x3_S1x1_P1x1_BS1_ap_2(pixel_in_t *ptr_data, int H, int W, int I, d #pragma HLS INTERFACE s_axilite port=offset_bias bundle=control #pragma HLS INTERFACE s_axilite port=offset_kernel bundle=control #pragma HLS INTERFACE s_axilite port=offset_data_out bundle=control + #pragma HLS INTERFACE s_axilite port=flag_relu bundle=control #pragma HLS INTERFACE s_axilite port=return bundle=control // ptr_data struct to be packed as a single element vector (to improve memory read) @@ -647,20 +650,22 @@ void k_cn2D_K3x3_S1x1_P1x1_BS1_ap_2(pixel_in_t *ptr_data, int H, int W, int I, d static hls::stream out_read_kernel; static hls::stream out_read_bias; static hls::stream out_conv; + static hls::stream out_relu; // stream sizes #pragma HLS STREAM variable = out_read_data depth = 32 #pragma HLS STREAM variable = out_read_kernel depth = 32 #pragma HLS STREAM variable = out_read_bias depth = 32 #pragma HLS STREAM variable = out_conv depth = 32 - // #pragma HLS STREAM variable = out_relu depth = 32 + #pragma HLS STREAM variable = out_relu depth = 32 #pragma HLS dataflow read_data(H, W, I_ITER, ptr_data, out_read_data); read_bias(offset_bias, ptr_bias, out_read_bias); read_kernel(I_ITER, offset_kernel, ptr_kernel, out_read_kernel); conv(H, W, I_ITER, out_read_data, out_read_kernel, out_read_bias, out_conv); - write_output(H, W, offset_data_out, ptr_out, out_conv); + relu(flag_relu, H, W, out_conv, out_relu); + write_output(H, W, offset_data_out, ptr_out, 
out_relu); } diff --git a/fpga_kernels/test_fpga/src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp b/fpga_kernels/test_fpga/src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp index fc1fae149..9f700f012 100644 --- a/fpga_kernels/test_fpga/src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp +++ b/fpga_kernels/test_fpga/src/test_cn2D_K3x3_S1x1_P1x1_BS1_ap_2.cpp @@ -70,6 +70,8 @@ std::string binaryFile; #define KW 3 #define KH 3 +#define RELU 1 // 0/1 function relu is activated or not + int W; int H; int GI; @@ -99,7 +101,7 @@ void parse_arguments(int argc, char **argv) { exit(1); } - binaryFile = argv[1]; + binaryFile = argv[1]; W = atoi(argv[2]); H = atoi(argv[3]); I = atoi(argv[4]); @@ -174,14 +176,16 @@ void cpu_conv2d() { } // aplicamos relu -/* for (int cout=0; cout Date: Tue, 3 Nov 2020 13:38:11 +0100 Subject: [PATCH 12/15] UPV-GAP additions: - Profiling reduction functions --- src/utils.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/utils.cpp b/src/utils.cpp index a661eb31c..4393ec4f2 100755 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -501,7 +501,11 @@ PROFILING_ENABLE(flip_random); PROFILING_ENABLE(crop_random); PROFILING_ENABLE(crop_scale_random); PROFILING_ENABLE(cutout_random); - +// reduction +PROFILING_ENABLE(reduce); +PROFILING_ENABLE(reduce_op); +PROFILING_ENABLE(reduction); +PROFILING_ENABLE(reduction_back); void __show_profile() { @@ -583,5 +587,10 @@ void __show_profile() { PROFILING_PRINTF(crop_random); PROFILING_PRINTF(crop_scale_random); PROFILING_PRINTF(cutout_random); + //reduction + PROFILING_PRINTF(reduce); + PROFILING_PRINTF(reduce_op); + PROFILING_PRINTF(reduction); + PROFILING_PRINTF(reduction_back); } \ No newline at end of file From 4449f36dbc4c24e7d51cebd26234a95b9ac605a7 Mon Sep 17 00:00:00 2001 From: Jose Flich Date: Tue, 3 Nov 2020 16:45:22 +0100 Subject: [PATCH 13/15] UPV-GAP: Added profiling of activation functions, convs, reduction functions --- include/eddl/profiling.h | 12 ++- src/tensor/nn/tensor_activations.cpp | 123 ++++++++++++++++++++++++++- src/tensor/nn/tensor_conv.cpp | 19 +++-- src/tensor/tensor_reduction.cpp | 27 +++++- src/utils.cpp | 61 +++++++++++++ 5 files changed, 229 insertions(+), 13 deletions(-) diff --git a/include/eddl/profiling.h b/include/eddl/profiling.h index c02848225..43ce4b9c5 100644 --- a/include/eddl/profiling.h +++ b/include/eddl/profiling.h @@ -4,9 +4,18 @@ #include +#define PROFILING_HEADER(fn) \ + struct timeval prof_t1; \ + gettimeofday(&prof_t1, NULL); + + #define PROFILING_ENABLE(fn) \ unsigned long long prof_##fn##_time; \ - unsigned long long prof_##fn##_calls; \ + unsigned long long prof_##fn##_calls; + +#define PROFILING_ENABLE_EXTERN(fn) \ + extern unsigned long long prof_##fn##_time; \ + extern unsigned long long prof_##fn##_calls; \ #define PROFILING_HEADER(fn) \ struct timeval prof_t1; \ @@ -15,7 +24,6 @@ #define PROFILING_HEADER_EXTERN(fn) \ extern unsigned long long prof_##fn##_time; \ extern unsigned long long prof_##fn##_calls; \ - extern int prof_##fn##_device; \ struct timeval prof_t1; \ gettimeofday(&prof_t1, NULL); diff --git a/src/tensor/nn/tensor_activations.cpp b/src/tensor/nn/tensor_activations.cpp index e3cb2c9ef..92755f99f 100644 --- a/src/tensor/nn/tensor_activations.cpp +++ b/src/tensor/nn/tensor_activations.cpp @@ -20,17 +20,39 @@ #include "eddl/hardware/gpu/nn/gpu_tensor_nn.h" #endif -namespace tensorNN { - - PROFILING_ENABLE(ReLu); +PROFILING_ENABLE_EXTERN(ReLu); +PROFILING_ENABLE_EXTERN(D_ReLu); +PROFILING_ENABLE_EXTERN(ThresholdedReLu); +PROFILING_ENABLE_EXTERN(LeakyReLu); 
+PROFILING_ENABLE_EXTERN(D_ThresholdedReLu); +PROFILING_ENABLE_EXTERN(D_LeakyReLu); +PROFILING_ENABLE_EXTERN(ELu); +PROFILING_ENABLE_EXTERN(D_ELu); +PROFILING_ENABLE_EXTERN(Sigmoid); +PROFILING_ENABLE_EXTERN(D_Sigmoid); +PROFILING_ENABLE_EXTERN(HardSigmoid); +PROFILING_ENABLE_EXTERN(D_HardSigmoid); +PROFILING_ENABLE_EXTERN(Tanh); +PROFILING_ENABLE_EXTERN(D_Tanh); +PROFILING_ENABLE_EXTERN(Softmax); +PROFILING_ENABLE_EXTERN(D_Softmax); +PROFILING_ENABLE_EXTERN(Exp); +PROFILING_ENABLE_EXTERN(D_Exp); +PROFILING_ENABLE_EXTERN(Linear); +PROFILING_ENABLE_EXTERN(D_Linear); +PROFILING_ENABLE_EXTERN(Softsign); +PROFILING_ENABLE_EXTERN(D_softsign); +PROFILING_ENABLE_EXTERN(Softplus); +PROFILING_ENABLE_EXTERN(D_softplus); +namespace tensorNN { // ReLU void ReLu(Tensor *A, Tensor *B) { if (A->device != B->device) msg("Tensors in different devices", "Tensor::ReLu"); if (!Tensor::sameShape(A, B)) msg("Incompatible dims", "Tensor::ReLu"); - PROFILING_HEADER_EXTERN(ReLu); + PROFILING_HEADER(ReLu); B->tsem->lock(); if (A->isCPU()) { @@ -60,6 +82,8 @@ namespace tensorNN { } if ((!Tensor::sameShape(D, I)) || (!Tensor::sameShape(D, PD))) msg("Incompatible dims", "Tensor::D_ReLu"); + PROFILING_HEADER(D_ReLu); + PD->tsem->lock(); if (D->isCPU()) { cpu_d_relu(D, I, PD); @@ -77,6 +101,8 @@ namespace tensorNN { } #endif PD->tsem->unlock(); + + PROFILING_FOOTER(D_ReLu); } // ThresholdedReLu @@ -84,6 +110,8 @@ namespace tensorNN { if (A->device != B->device) msg("Tensors in different devices", "Tensor::ThresholdedReLu"); if (!Tensor::sameShape(A, B)) msg("Incompatible dims", "Tensor::ThresholdedReLu"); + PROFILING_HEADER(ThresholdedReLu); + B->tsem->lock(); if (A->isCPU()) { cpu_thresholded_relu(A, B, param); @@ -101,6 +129,8 @@ namespace tensorNN { #endif B->tsem->unlock(); + + PROFILING_FOOTER(ThresholdedReLu); } // ThresholdedReLu Derivative @@ -109,6 +139,8 @@ namespace tensorNN { msg("Tensors in different devices", "Tensor::D_ThresholdedReLu"); if ((!Tensor::sameShape(D, I)) || (!Tensor::sameShape(D, PD))) msg("Incompatible dims", "Tensor::D_ThresholdedReLu"); + PROFILING_HEADER(D_ThresholdedReLu); + PD->tsem->lock(); if (D->isCPU()) { cpu_d_thresholded_relu(D, I, PD, param); @@ -126,6 +158,8 @@ namespace tensorNN { } #endif PD->tsem->unlock(); + + PROFILING_FOOTER(D_ThresholdedReLu); } // LeakyReLU @@ -133,6 +167,8 @@ namespace tensorNN { if (A->device != B->device) msg("Tensors in different devices", "Tensor::LeakyReLu"); if (!Tensor::sameShape(A, B)) msg("Incompatible dims", "Tensor::LeakyReLu"); + PROFILING_HEADER(LeakyReLu); + B->tsem->lock(); if (A->isCPU()) { cpu_leaky_relu(A, B, param); @@ -150,6 +186,8 @@ namespace tensorNN { #endif B->tsem->unlock(); + + PROFILING_FOOTER(LeakyReLu); } // RELU Derivative, always increment over parent delta @@ -158,6 +196,8 @@ namespace tensorNN { msg("Tensors in different devices", "Tensor::D_ReLu"); if ((!Tensor::sameShape(D, I)) || (!Tensor::sameShape(D, PD))) msg("Incompatible dims", "Tensor::D_ReLu"); + PROFILING_HEADER(D_LeakyReLu); + PD->tsem->lock(); if (D->isCPU()) { cpu_d_leaky_relu(D, I, PD, param); @@ -175,6 +215,8 @@ namespace tensorNN { } #endif PD->tsem->unlock(); + + PROFILING_FOOTER(D_LeakyReLu); } @@ -183,6 +225,8 @@ namespace tensorNN { if (A->device != B->device) msg("Tensors in different devices", "Tensor::ELu"); if (!Tensor::sameShape(A, B)) msg("Incompatible dims", "Tensor::ELu"); + PROFILING_HEADER(ELu); + B->tsem->lock(); if (A->isCPU()) { cpu_elu(A, B, param); @@ -200,6 +244,8 @@ namespace tensorNN { #endif B->tsem->unlock(); + + 
PROFILING_FOOTER(ELu); } // ELU Derivative @@ -207,6 +253,8 @@ namespace tensorNN { if ((D->device != I->device) || (D->device != PD->device)) msg("Tensors in different devices", "Tensor::D_ELu"); if ((!Tensor::sameShape(D, I)) || (!Tensor::sameShape(D, PD))) msg("Incompatible dims", "Tensor::D_ELu"); + PROFILING_HEADER(D_ELu); + PD->tsem->lock(); if (D->isCPU()) { cpu_d_elu(D, I, PD, param); @@ -224,6 +272,8 @@ namespace tensorNN { } #endif PD->tsem->unlock(); + + PROFILING_FOOTER(D_ELu); } @@ -232,6 +282,8 @@ namespace tensorNN { if (A->device != B->device) msg("Tensors in different devices", "Tensor::Softplus"); if (!Tensor::sameShape(A, B)) msg("Incompatible dims", "Tensor::Softplus"); + PROFILING_HEADER(Softplus); + B->tsem->lock(); if (A->isCPU()) { cpu_softplus(A, B); @@ -249,6 +301,8 @@ namespace tensorNN { #endif B->tsem->unlock(); + + PROFILING_FOOTER(Softplus); } // Softplus Derivative @@ -257,6 +311,8 @@ namespace tensorNN { msg("Tensors in different devices", "Tensor::D_softplus"); if ((!Tensor::sameShape(D, I)) || (!Tensor::sameShape(D, PD))) msg("Incompatible dims", "Tensor::D_softplus"); + PROFILING_HEADER(D_softplus); + PD->tsem->lock(); if (D->isCPU()) { cpu_d_softplus(D, I, PD); @@ -274,6 +330,8 @@ namespace tensorNN { } #endif PD->tsem->unlock(); + + PROFILING_FOOTER(D_softplus); } @@ -282,6 +340,8 @@ namespace tensorNN { if (A->device != B->device) msg("Tensors in different devices", "Tensor::Softsign"); if (!Tensor::sameShape(A, B)) msg("Incompatible dims", "Tensor::Softsign"); + PROFILING_HEADER(Softsign); + B->tsem->lock(); if (A->isCPU()) { cpu_softsign(A, B); @@ -299,6 +359,8 @@ namespace tensorNN { #endif B->tsem->unlock(); + + PROFILING_FOOTER(Softsign); } // Softsign Derivative @@ -307,6 +369,8 @@ namespace tensorNN { msg("Tensors in different devices", "Tensor::D_softsign"); if ((!Tensor::sameShape(D, I)) || (!Tensor::sameShape(D, PD))) msg("Incompatible dims", "Tensor::D_softsign"); + PROFILING_HEADER(D_softsign); + PD->tsem->lock(); if (D->isCPU()) { cpu_d_softsign(D, I, PD); @@ -324,6 +388,8 @@ namespace tensorNN { } #endif PD->tsem->unlock(); + + PROFILING_FOOTER(D_softsign); } // Linear @@ -331,6 +397,8 @@ namespace tensorNN { if (A->device != B->device) msg("Tensors in different devices", "Tensor::Linear"); if (!Tensor::sameShape(A, B)) msg("Incompatible dims", "Tensor::Linear"); + PROFILING_HEADER(Linear); + B->tsem->lock(); if (A->isCPU()) { cpu_linear(A, B, param); @@ -348,6 +416,8 @@ namespace tensorNN { #endif B->tsem->unlock(); + + PROFILING_FOOTER(Linear); } // Linear Derivative @@ -356,6 +426,8 @@ namespace tensorNN { msg("Tensors in different devices", "Tensor::D_Linear"); if ((!Tensor::sameShape(D, I)) || (!Tensor::sameShape(D, PD))) msg("Incompatible dims", "Tensor::D_Linear"); + PROFILING_HEADER(D_Linear); + PD->tsem->lock(); if (D->isCPU()) { cpu_d_linear(D, I, PD, param); @@ -373,6 +445,9 @@ namespace tensorNN { } #endif PD->tsem->unlock(); + + PROFILING_FOOTER(D_Linear); + } // Sigmoid @@ -380,6 +455,8 @@ namespace tensorNN { if (A->device != B->device) msg("Tensors in different devices", "Tensor::Sigmoid"); if (!Tensor::sameShape(A, B)) msg("Incompatible dims", "Tensor::Sigmoid"); + PROFILING_HEADER(Sigmoid); + B->tsem->lock(); if (A->isCPU()) { cpu_sigmoid(A, B); @@ -397,6 +474,8 @@ namespace tensorNN { #endif B->tsem->unlock(); + + PROFILING_FOOTER(Sigmoid); } // Sigmoid Derivative, always increment over parent delta @@ -405,6 +484,8 @@ namespace tensorNN { msg("Tensors in different devices", "Tensor::D_Sigmoid"); if 
((!Tensor::sameShape(D, I)) || (!Tensor::sameShape(D, PD))) msg("Incompatible dims", "Tensor::D_Sigmoid"); + PROFILING_HEADER(D_Sigmoid); + PD->tsem->lock(); if (D->isCPU()) { cpu_d_sigmoid(D, I, PD); @@ -422,6 +503,8 @@ namespace tensorNN { } #endif PD->tsem->unlock(); + + PROFILING_FOOTER(D_Sigmoid); } // Hard Sigmoid @@ -429,6 +512,8 @@ namespace tensorNN { if (A->device != B->device) msg("Tensors in different devices", "Tensor::HardSigmoid"); if (!Tensor::sameShape(A, B)) msg("Incompatible dims", "Tensor::HardSigmoid"); + PROFILING_HEADER(HardSigmoid); + B->tsem->lock(); if (A->isCPU()) { cpu_hard_sigmoid(A, B); @@ -446,6 +531,8 @@ namespace tensorNN { #endif B->tsem->unlock(); + + PROFILING_FOOTER(HardSigmoid); } // Hard Sigmoid Derivative @@ -454,6 +541,8 @@ namespace tensorNN { msg("Tensors in different devices", "Tensor::D_HardSigmoid"); if ((!Tensor::sameShape(D, I)) || (!Tensor::sameShape(D, PD))) msg("Incompatible dims", "Tensor::D_HardSigmoid"); + PROFILING_HEADER(D_HardSigmoid); + PD->tsem->lock(); if (D->isCPU()) { cpu_d_hard_sigmoid(D, I, PD); @@ -471,6 +560,8 @@ namespace tensorNN { } #endif PD->tsem->unlock(); + + PROFILING_FOOTER(D_HardSigmoid); } // Exponential @@ -478,6 +569,8 @@ namespace tensorNN { if (A->device != B->device) msg("Tensors in different devices", "Tensor::Exp"); if (!Tensor::sameShape(A, B)) msg("Incompatible dims", "Tensor::Exp"); + PROFILING_HEADER(Exp); + B->tsem->lock(); if (A->isCPU()) { cpu_exp(A, B); @@ -496,6 +589,8 @@ namespace tensorNN { #endif B->tsem->unlock(); + + PROFILING_FOOTER(Exp); } // Exponential Derivative @@ -503,6 +598,8 @@ namespace tensorNN { if ((D->device != I->device) || (D->device != PD->device)) msg("Tensors in different devices", "Tensor::D_Exp"); if ((!Tensor::sameShape(D, I)) || (!Tensor::sameShape(D, PD))) msg("Incompatible dims", "Tensor::D_Exp"); + PROFILING_HEADER(D_Exp); + PD->tsem->lock(); if (D->isCPU()) { cpu_d_exp(D, I, PD); @@ -519,6 +616,8 @@ namespace tensorNN { } #endif PD->tsem->unlock(); + + PROFILING_FOOTER(D_Exp); } // Tanh @@ -526,6 +625,8 @@ namespace tensorNN { if (A->device != B->device) msg("Tensors in different devices", "Tensor::Tanh"); if (!Tensor::sameShape(A, B)) msg("Incompatible dims", "Tensor::Tanh"); + PROFILING_HEADER(Tanh); + B->tsem->lock(); if (A->isCPU()) { cpu_tanh(A, B); @@ -543,6 +644,8 @@ namespace tensorNN { #endif B->tsem->unlock(); + + PROFILING_FOOTER(Tanh); } // Tanh Derivative @@ -551,6 +654,8 @@ namespace tensorNN { msg("Tensors in different devices", "Tensor::D_Tanh"); if ((!Tensor::sameShape(D, I)) || (!Tensor::sameShape(D, PD))) msg("Incompatible dims", "Tensor::D_Tanh"); + PROFILING_HEADER(D_Tanh); + PD->tsem->lock(); if (D->isCPU()) { cpu_d_tanh(D, I, PD); @@ -568,6 +673,8 @@ namespace tensorNN { } #endif PD->tsem->unlock(); + + PROFILING_FOOTER(D_Tanh); } @@ -577,6 +684,8 @@ namespace tensorNN { if (!Tensor::sameShape(A, B)) msg("Incompatible dims", "Tensor::Softmax"); if (A->ndim != 2) msg("Softmax only over 2D Tensor (batch x logits)", "Tensor::Softmax"); + PROFILING_HEADER(Softmax); + B->tsem->lock(); if (A->isCPU()) { @@ -595,6 +704,8 @@ namespace tensorNN { #endif B->tsem->unlock(); + + PROFILING_FOOTER(Softmax); } // SOFTMAX DERIVATIVE @@ -604,6 +715,8 @@ namespace tensorNN { if ((!Tensor::sameShape(D, I)) || (!Tensor::sameShape(D, PD))) msg("Incompatible dims", "Tensor::D_Softmax"); if (D->ndim != 2) msg("D_Softmax only over 2D Tensor (batch x delta_probs)", "Tensor::D_Softmax"); + PROFILING_HEADER(D_Softmax); + if (D->isCPU()) { cpu_d_softmax(D, I, PD); } @@ 
-627,6 +740,8 @@ namespace tensorNN { } #endif + PROFILING_FOOTER(D_Softmax); + } } diff --git a/src/tensor/nn/tensor_conv.cpp b/src/tensor/nn/tensor_conv.cpp index ec876fc92..896e2257b 100644 --- a/src/tensor/nn/tensor_conv.cpp +++ b/src/tensor/nn/tensor_conv.cpp @@ -21,11 +21,11 @@ #include "eddl/hardware/fpga/nn/fpga_nn.h" #endif -namespace tensorNN{ - - PROFILING_ENABLE(Conv2D); - +PROFILING_ENABLE_EXTERN(Conv2D); +PROFILING_ENABLE_EXTERN(Conv2D_grad); +PROFILING_ENABLE_EXTERN(Conv2D_back); +namespace tensorNN{ void Conv2D(ConvolDescriptor *D) { ///////////////////////////////////////////////////////////////////// @@ -36,7 +36,7 @@ void Conv2D(ConvolDescriptor *D) { ///////////////////////////////////////////////////////////////////// if ((D->I->ndim != 4)) msg("Tensors are not 4D", "Tensor::Conv2D"); - PROFILING_HEADER_EXTERN(Conv2D); + PROFILING_HEADER(Conv2D); D->O->tsem->lock(); if (D->I->isCPU()) { @@ -57,7 +57,6 @@ void Conv2D(ConvolDescriptor *D) { D->O->tsem->unlock(); PROFILING_FOOTER(Conv2D); - PROFILING_PRINTF(Conv2D); } void Conv2D_grad(ConvolDescriptor *D) { @@ -69,6 +68,8 @@ void Conv2D_grad(ConvolDescriptor *D) { ///////////////////////////////////////////////////////////////////// if ((D->I->ndim != 4)) msg("Tensors are not 4D", "Tensor::Conv2D"); + PROFILING_HEADER(Conv2D_grad); + D->gK->tsem->lock(); if (D->I->isCPU()) { cpu_conv2D_grad(D); @@ -85,6 +86,8 @@ void Conv2D_grad(ConvolDescriptor *D) { } #endif D->gK->tsem->unlock(); + + PROFILING_FOOTER(Conv2D_grad); } void Conv2D_back(ConvolDescriptor *D) { @@ -96,6 +99,8 @@ void Conv2D_back(ConvolDescriptor *D) { ///////////////////////////////////////////////////////////////////// if ((D->I->ndim != 4)) msg("Tensors are not 4D", "Tensor::Conv2D"); + PROFILING_HEADER(Conv2D_back); + D->ID->tsem->lock(); if (D->I->isCPU()) { cpu_conv2D_back(D); @@ -112,6 +117,8 @@ void Conv2D_back(ConvolDescriptor *D) { } #endif D->ID->tsem->unlock(); + + PROFILING_FOOTER(Conv2D_back); } } diff --git a/src/tensor/tensor_reduction.cpp b/src/tensor/tensor_reduction.cpp index c5746c884..250b1865a 100644 --- a/src/tensor/tensor_reduction.cpp +++ b/src/tensor/tensor_reduction.cpp @@ -11,6 +11,7 @@ #include "eddl/tensor/tensor.h" #include "eddl/tensor/tensor_reduction.h" #include "eddl/hardware/cpu/cpu_tensor.h" +#include "eddl/profiling.h" #ifdef cGPU @@ -102,6 +103,8 @@ void reduce(Tensor *A, Tensor *B,string mode,vector axis,int* map) } } + PROFILING_HEADER_EXTERN(reduce); + if (map==nullptr) map=get_reduction_map(A,axis); @@ -118,6 +121,8 @@ void reduce(Tensor *A, Tensor *B,string mode,vector axis,int* map) fpga_reduce(A,B,mode,map); } #endif + + PROFILING_FOOTER(reduce); } void reduce_mean(Tensor *A, Tensor *B,vector axis,int* map) @@ -140,6 +145,9 @@ void reduce_min(Tensor *A, Tensor *B,vector axis,int* map) void reduce(Tensor *A, Tensor *B,string mode,MapReduceDescriptor *MD) { + + PROFILING_HEADER_EXTERN(reduce); + if (A->isCPU()) { cpu_reduce(A,B,mode,MD); } @@ -153,6 +161,8 @@ void reduce(Tensor *A, Tensor *B,string mode,MapReduceDescriptor *MD) fpga_reduce(A,B,mode,MD); } #endif + + PROFILING_FOOTER(reduce); } @@ -179,7 +189,7 @@ void reduce_op(Tensor *A, Tensor *B,string op,vector axis,int* map) { int i,j; - + PROFILING_HEADER_EXTERN(reduce_op); if (B->ndim!=A->ndim-axis.size()) msg("dims don't match in reduction","reduce"); @@ -209,6 +219,8 @@ void reduce_op(Tensor *A, Tensor *B,string op,vector axis,int* map) fpga_reduce_op(A,B,op,map); } #endif + + PROFILING_FOOTER(reduce_op); } void reduce_sum(Tensor *A, Tensor *B,vector 
axis,int* map) @@ -230,6 +242,9 @@ void reduce_div(Tensor *A, Tensor *B,vector axis,int* map) void reduce_op(Tensor *A, Tensor *B,string op, MapReduceDescriptor *MD) { + + PROFILING_HEADER_EXTERN(reduce_op); + if (A->isCPU()) { cpu_reduce_op(A,B,op,MD); } @@ -244,7 +259,9 @@ void reduce_div(Tensor *A, Tensor *B,vector axis,int* map) } #endif + PROFILING_FOOTER(reduce_op); } + void reduce_sum(Tensor *A, Tensor *B,MapReduceDescriptor *MD) { reduce_op(A,B,"sum",MD); @@ -264,6 +281,8 @@ void reduce_div(Tensor *A, Tensor *B,vector axis,int* map) //////////// void reduction(ReduceDescriptor *RD){ + PROFILING_HEADER_EXTERN(reduction); + if (RD->I->isCPU()) { cpu_reduction(RD); } @@ -278,12 +297,16 @@ void reduction(ReduceDescriptor *RD){ fpga_reduction(RD); } #endif + + PROFILING_FOOTER(reduction); } void reduction_back(ReduceDescriptor *RD) { + PROFILING_HEADER_EXTERN(reduction_back); + if (RD->I->isCPU()) { cpu_reduction_back(RD); } @@ -298,4 +321,6 @@ void reduction_back(ReduceDescriptor *RD) fpga_reduction_back(RD); } #endif + + PROFILING_FOOTER(reduction_back); } diff --git a/src/utils.cpp b/src/utils.cpp index 4393ec4f2..b75ba8a90 100755 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -506,6 +506,37 @@ PROFILING_ENABLE(reduce); PROFILING_ENABLE(reduce_op); PROFILING_ENABLE(reduction); PROFILING_ENABLE(reduction_back); +// activations +PROFILING_ENABLE(ELu); +PROFILING_ENABLE(Exp); +PROFILING_ENABLE(ReLu); +PROFILING_ENABLE(Tanh); +PROFILING_ENABLE(D_ELu); +PROFILING_ENABLE(D_Exp); +PROFILING_ENABLE(D_Tanh); +PROFILING_ENABLE(D_ThresholdedReLu); +PROFILING_ENABLE(D_HardSigmoid); +PROFILING_ENABLE(D_LeakyRelu); +PROFILING_ENABLE(D_Linear); +PROFILING_ENABLE(D_ReLu); +PROFILING_ENABLE(D_LeakyReLu); +PROFILING_ENABLE(D_Sigmoid); +PROFILING_ENABLE(D_Softmax); +PROFILING_ENABLE(D_softplus); +PROFILING_ENABLE(HardSigmoid); +PROFILING_ENABLE(D_softsign); +PROFILING_ENABLE(LeakyReLu); +PROFILING_ENABLE(Linear); +PROFILING_ENABLE(Sigmoid); +PROFILING_ENABLE(Softmax); +PROFILING_ENABLE(Softplus); +PROFILING_ENABLE(Softsign); +PROFILING_ENABLE(ThresholdedReLu); +// conv +PROFILING_ENABLE(Conv2D); +PROFILING_ENABLE(Conv2D_grad); +PROFILING_ENABLE(Conv2D_back); + void __show_profile() { @@ -592,5 +623,35 @@ void __show_profile() { PROFILING_PRINTF(reduce_op); PROFILING_PRINTF(reduction); PROFILING_PRINTF(reduction_back); + // activations + PROFILING_ENABLE(ELu); + PROFILING_PRINTF(Exp); + PROFILING_PRINTF(ReLu); + PROFILING_PRINTF(Tanh); + PROFILING_PRINTF(D_ELu); + PROFILING_PRINTF(D_Exp); + PROFILING_PRINTF(D_Tanh); + PROFILING_PRINTF(D_ThresholdedReLu); + PROFILING_PRINTF(D_HardSigmoid); + PROFILING_PRINTF(D_LeakyRelu); + PROFILING_PRINTF(D_Linear); + PROFILING_PRINTF(D_ReLu); + PROFILING_PRINTF(D_LeakyReLu); + PROFILING_PRINTF(D_Sigmoid); + PROFILING_PRINTF(D_Softmax); + PROFILING_PRINTF(D_softplus); + PROFILING_PRINTF(HardSigmoid); + PROFILING_PRINTF(D_softsign); + PROFILING_PRINTF(LeakyReLu); + PROFILING_PRINTF(Linear); + PROFILING_PRINTF(Sigmoid); + PROFILING_PRINTF(Softmax); + PROFILING_PRINTF(Softplus); + PROFILING_PRINTF(Softsign); + PROFILING_PRINTF(ThresholdedReLu); + // conv + PROFILING_PRINTF(Conv2D); + PROFILING_PRINTF(Conv2D_grad); + PROFILING_PRINTF(Conv2D_back); } \ No newline at end of file From 068fb04e480f74d4d91f6d8b754418227ac411da Mon Sep 17 00:00:00 2001 From: Jose Flich Date: Tue, 3 Nov 2020 17:20:00 +0100 Subject: [PATCH 14/15] UPV-GAP: Added profiling for losses, comparison, and generator operations --- src/tensor/nn/tensor_losses.cpp | 7 ++ src/tensor/tensor_comparison.cpp | 117 
+++++++++++++++++++++++++++++++ src/tensor/tensor_generator.cpp | 23 ++++++ src/utils.cpp | 58 ++++++++++++++- 4 files changed, 203 insertions(+), 2 deletions(-) diff --git a/src/tensor/nn/tensor_losses.cpp b/src/tensor/nn/tensor_losses.cpp index 4e2a0682e..2089ea492 100644 --- a/src/tensor/nn/tensor_losses.cpp +++ b/src/tensor/nn/tensor_losses.cpp @@ -8,6 +8,7 @@ */ #include "eddl/tensor/nn/tensor_nn.h" #include "eddl/hardware/cpu/nn/cpu_tensor_nn.h" +#include "eddl/profiling.h" #ifdef cGPU #include "eddl/hardware/gpu/gpu_tensor.h" @@ -20,6 +21,8 @@ #include "eddl/hardware/fpga/nn/fpga_nn.h" #endif +PROFILING_ENABLE_EXTERN(cent); + namespace tensorNN { @@ -28,6 +31,8 @@ namespace tensorNN { if (A->device != B->device) msg("Tensors in different devices", "Tensor::cross-entropy"); if ((!Tensor::sameShape(A, B)) || (!Tensor::sameShape(A, C))) msg("Incompatible dims", "Tensor::cross-entropy"); + PROFILING_HEADER(cent); + C->tsem->lock(); if (A->isCPU()) { cpu_cent(A, B, C); @@ -45,6 +50,8 @@ namespace tensorNN { } #endif C->tsem->unlock(); + + PROFILING_FOOTER(cent); } } \ No newline at end of file diff --git a/src/tensor/tensor_comparison.cpp b/src/tensor/tensor_comparison.cpp index fde935324..da1d8b22c 100644 --- a/src/tensor/tensor_comparison.cpp +++ b/src/tensor/tensor_comparison.cpp @@ -8,6 +8,7 @@ */ #include "eddl/tensor/tensor.h" #include "eddl/hardware/cpu/cpu_tensor.h" +#include "eddl/profiling.h" #ifdef cGPU #include "eddl/hardware/gpu/gpu_tensor.h" @@ -21,7 +22,32 @@ using namespace std; +PROFILING_ENABLE_EXTERN(all); +PROFILING_ENABLE_EXTERN(any); +PROFILING_ENABLE_EXTERN(isfinite); +PROFILING_ENABLE_EXTERN(isinf); +PROFILING_ENABLE_EXTERN(isnan); +PROFILING_ENABLE_EXTERN(isneginf); +PROFILING_ENABLE_EXTERN(isposinf); +PROFILING_ENABLE_EXTERN(logical_and); +PROFILING_ENABLE_EXTERN(logical_or); +PROFILING_ENABLE_EXTERN(logical_not); +PROFILING_ENABLE_EXTERN(logical_xor); +PROFILING_ENABLE_EXTERN(allclose); +PROFILING_ENABLE_EXTERN(isclose); +PROFILING_ENABLE_EXTERN(greater); +PROFILING_ENABLE_EXTERN(greater_equal); +PROFILING_ENABLE_EXTERN(less); +PROFILING_ENABLE_EXTERN(less_equal); +PROFILING_ENABLE_EXTERN(equal); +PROFILING_ENABLE_EXTERN(not_equal); +PROFILING_ENABLE_EXTERN(equivalent); + + bool Tensor::all(Tensor *A){ + + PROFILING_HEADER(all); + bool res = false; if (A->isCPU()) { @@ -39,10 +65,15 @@ bool Tensor::all(Tensor *A){ } #endif + PROFILING_FOOTER(all); + return res; } bool Tensor::any(Tensor *A){ + + PROFILING_HEADER(any); + bool res = false; if (A->isCPU()) { @@ -60,6 +91,8 @@ bool Tensor::any(Tensor *A){ } #endif + PROFILING_FOOTER(any); + return res; } @@ -67,6 +100,8 @@ bool Tensor::any(Tensor *A){ void Tensor::isfinite(Tensor *A, Tensor* B){ checkCompatibility(A, B, "Tensor::isfinite"); + PROFILING_HEADER(isfinite); + if (A->isCPU()) { cpu_isfinite(A, B); } @@ -81,11 +116,14 @@ void Tensor::isfinite(Tensor *A, Tensor* B){ fpga_isfinite(A, B); } #endif + PROFILING_FOOTER(isfinite); } void Tensor::isinf(Tensor *A, Tensor* B){ checkCompatibility(A, B, "Tensor::isinf"); + PROFILING_HEADER(isinf); + if (A->isCPU()) { cpu_isinf(A, B); } @@ -100,11 +138,14 @@ void Tensor::isinf(Tensor *A, Tensor* B){ fpga_isinf(A, B); } #endif + PROFILING_FOOTER(isinf); } void Tensor::isnan(Tensor *A, Tensor* B){ checkCompatibility(A, B, "Tensor::isnan"); + PROFILING_HEADER(isnan); + if (A->isCPU()) { cpu_isnan(A, B); } @@ -119,11 +160,14 @@ void Tensor::isnan(Tensor *A, Tensor* B){ fpga_isnan(A, B); } #endif + PROFILING_FOOTER(isnan); } void Tensor::isneginf(Tensor *A, Tensor* B){ 
checkCompatibility(A, B, "Tensor::isneginf"); + PROFILING_HEADER(isneginf); + if (A->isCPU()) { cpu_isneginf(A, B); } @@ -138,11 +182,14 @@ void Tensor::isneginf(Tensor *A, Tensor* B){ fpga_isneginf(A, B); } #endif + PROFILING_FOOTER(isneginf); } void Tensor::isposinf(Tensor *A, Tensor* B){ checkCompatibility(A, B, "Tensor::isposinf"); + PROFILING_HEADER(isposinf); + if (A->isCPU()) { cpu_isposinf(A, B); } @@ -157,6 +204,7 @@ void Tensor::isposinf(Tensor *A, Tensor* B){ fpga_isposinf(A, B); } #endif + PROFILING_FOOTER(isposinf); } @@ -165,6 +213,8 @@ void Tensor::isposinf(Tensor *A, Tensor* B){ void Tensor::logical_and(Tensor *A, Tensor *B, Tensor *C){ checkCompatibility(A, B, C, "Tensor::logical_and"); + PROFILING_HEADER(logical_and); + if (A->isCPU()) { cpu_logical_and(A, B, C); } @@ -179,11 +229,14 @@ void Tensor::logical_and(Tensor *A, Tensor *B, Tensor *C){ fpga_logical_and(A, B, C); } #endif + PROFILING_FOOTER(logical_and); } void Tensor::logical_or(Tensor *A, Tensor *B, Tensor *C){ checkCompatibility(A, B, C, "Tensor::logical_or"); + PROFILING_HEADER(logical_or); + if (A->isCPU()) { cpu_logical_or(A, B, C); } @@ -198,11 +251,14 @@ void Tensor::logical_or(Tensor *A, Tensor *B, Tensor *C){ fpga_logical_or(A, B, C); } #endif + PROFILING_FOOTER(logical_or); } void Tensor::logical_not(Tensor *A, Tensor *B){ checkCompatibility(A, B, "Tensor::logical_not"); + PROFILING_HEADER(logical_not); + if (A->isCPU()) { cpu_logical_not(A, B); } @@ -217,11 +273,14 @@ void Tensor::logical_not(Tensor *A, Tensor *B){ fpga_logical_not(A, B); } #endif + PROFILING_FOOTER(logical_not); } void Tensor::logical_xor(Tensor *A, Tensor *B, Tensor *C){ checkCompatibility(A, B, C, "Tensor::logical_xor"); + PROFILING_HEADER(logical_xor); + if (A->isCPU()) { cpu_logical_xor(A, B, C); } @@ -236,12 +295,15 @@ void Tensor::logical_xor(Tensor *A, Tensor *B, Tensor *C){ fpga_logical_xor(A, B, C); } #endif + PROFILING_FOOTER(logical_xor); } bool Tensor::allclose(Tensor *A, Tensor *B, float rtol, float atol, bool equal_nan){ checkCompatibility(A, B, "Tensor::allclose"); + PROFILING_HEADER(allclose); + if (A->isCPU()) { return cpu_allclose(A, B, rtol, atol, equal_nan); } @@ -256,6 +318,8 @@ bool Tensor::allclose(Tensor *A, Tensor *B, float rtol, float atol, bool equal_n return fpga_allclose(A, B, rtol, atol, equal_nan); } #endif + PROFILING_FOOTER(allclose); + return 0; } @@ -263,6 +327,8 @@ bool Tensor::allclose(Tensor *A, Tensor *B, float rtol, float atol, bool equal_n void Tensor::isclose(Tensor *A, Tensor *B, Tensor *C, float rtol, float atol, bool equal_nan){ checkCompatibility(A, B, C, "Tensor::isclose"); + PROFILING_HEADER(isclose); + if (A->isCPU()) { cpu_isclose(A, B, C, rtol, atol, equal_nan); } @@ -277,6 +343,7 @@ void Tensor::isclose(Tensor *A, Tensor *B, Tensor *C, float rtol, float atol, bo fpga_isclose(A, B, C, rtol, atol, equal_nan); } #endif + PROFILING_FOOTER(isclose); } void Tensor::greater_(float v){ @@ -292,6 +359,8 @@ Tensor* Tensor::greater(float v){ void Tensor::greater(Tensor *A, Tensor *B, float v){ checkCompatibility(A, B, "Tensor::greater"); + PROFILING_HEADER(greater); + if (A->isCPU()) { cpu_greater(A, B, v); } @@ -307,6 +376,8 @@ void Tensor::greater(Tensor *A, Tensor *B, float v){ fpga_greater(A, B, v); } #endif + + PROFILING_FOOTER(greater); } Tensor* Tensor::greater(Tensor *A){ @@ -318,6 +389,8 @@ Tensor* Tensor::greater(Tensor *A){ void Tensor::greater(Tensor *A, Tensor *B, Tensor *C){ checkCompatibility(A, B, C, "Tensor::greater"); + PROFILING_HEADER(greater); + if (A->isCPU()) { 
cpu_greater(A, B, C); } @@ -332,6 +405,8 @@ void Tensor::greater(Tensor *A, Tensor *B, Tensor *C){ fpga_greater(A, B, C); } #endif + + PROFILING_FOOTER(greater); } @@ -348,6 +423,8 @@ Tensor* Tensor::greater_equal(float v){ void Tensor::greater_equal(Tensor *A, Tensor *B, float v){ checkCompatibility(A, B, "Tensor::greater_equal"); + PROFILING_HEADER(greater_equal); + if (A->isCPU()) { cpu_greater_equal(A, B, v); } @@ -363,6 +440,7 @@ void Tensor::greater_equal(Tensor *A, Tensor *B, float v){ fpga_greater_equal(A, B, v); } #endif + PROFILING_FOOTER(greater_equal); } Tensor* Tensor::greater_equal(Tensor *A){ @@ -374,6 +452,8 @@ Tensor* Tensor::greater_equal(Tensor *A){ void Tensor::greater_equal(Tensor *A, Tensor *B, Tensor *C){ checkCompatibility(A, B, C, "Tensor::greater_equal"); + PROFILING_HEADER(greater_equal); + if (A->isCPU()) { cpu_greater_equal(A, B, C); } @@ -389,6 +469,8 @@ void Tensor::greater_equal(Tensor *A, Tensor *B, Tensor *C){ fpga_greater_equal(A, B, C); } #endif + + PROFILING_FOOTER(greater_equal); } @@ -405,6 +487,8 @@ Tensor* Tensor::less(float v){ void Tensor::less(Tensor *A, Tensor *B, float v){ checkCompatibility(A, B, "Tensor::less"); + PROFILING_HEADER(less); + if (A->isCPU()) { cpu_less(A, B, v); } @@ -420,6 +504,7 @@ void Tensor::less(Tensor *A, Tensor *B, float v){ fpga_less(A, B, v); } #endif + PROFILING_FOOTER(less); } Tensor* Tensor::less(Tensor *A){ @@ -431,6 +516,8 @@ Tensor* Tensor::less(Tensor *A){ void Tensor::less(Tensor *A, Tensor *B, Tensor *C){ checkCompatibility(A, B, C, "Tensor::less"); + PROFILING_HEADER(less); + if (A->isCPU()) { cpu_less(A, B, C); } @@ -445,6 +532,8 @@ void Tensor::less(Tensor *A, Tensor *B, Tensor *C){ fpga_less(A, B, C); } #endif + + PROFILING_FOOTER(less); } @@ -461,6 +550,8 @@ Tensor* Tensor::less_equal(float v){ void Tensor::less_equal(Tensor *A, Tensor *B, float v){ checkCompatibility(A, B, "Tensor::less_equal"); + PROFILING_HEADER(less_equal); + if (A->isCPU()) { cpu_less_equal(A, B, v); } @@ -476,6 +567,8 @@ void Tensor::less_equal(Tensor *A, Tensor *B, float v){ fpga_less_equal(A, B, v); } #endif + + PROFILING_FOOTER(less_equal); } @@ -488,6 +581,8 @@ Tensor* Tensor::less_equal(Tensor *A){ void Tensor::less_equal(Tensor *A, Tensor *B, Tensor *C){ checkCompatibility(A, B, C, "Tensor::less_equal"); + PROFILING_HEADER(less_equal); + if (A->isCPU()) { cpu_less_equal(A, B, C); } @@ -502,6 +597,7 @@ void Tensor::less_equal(Tensor *A, Tensor *B, Tensor *C){ fpga_less_equal(A, B, C); } #endif + PROFILING_FOOTER(less_equal); } @@ -518,6 +614,8 @@ Tensor* Tensor::equal(float v){ void Tensor::equal(Tensor *A, Tensor *B, float v){ checkCompatibility(A, B, "Tensor::equal"); + PROFILING_HEADER(equal); + if (A->isCPU()) { cpu_equal(A, B, v); } @@ -533,6 +631,8 @@ void Tensor::equal(Tensor *A, Tensor *B, float v){ fpga_equal(A, B, v); } #endif + + PROFILING_FOOTER(equal); } Tensor* Tensor::equal(Tensor *A){ @@ -544,6 +644,8 @@ Tensor* Tensor::equal(Tensor *A){ void Tensor::equal(Tensor *A, Tensor *B, Tensor *C){ checkCompatibility(A, B, C, "Tensor::equal"); + PROFILING_HEADER(equal); + if (A->isCPU()) { cpu_equal(A, B, C); } @@ -558,6 +660,8 @@ void Tensor::equal(Tensor *A, Tensor *B, Tensor *C){ fpga_equal(A, B, C); } #endif + + PROFILING_FOOTER(equal); } void Tensor::not_equal_(float v){ @@ -573,6 +677,8 @@ Tensor* Tensor::not_equal(float v){ void Tensor::not_equal(Tensor *A, Tensor *B, float v){ checkCompatibility(A, B, "Tensor::not_equal"); + PROFILING_HEADER(not_equal); + if (A->isCPU()) { cpu_not_equal(A, B, v); } @@ -588,6 
+694,8 @@ void Tensor::not_equal(Tensor *A, Tensor *B, float v){ fpga_not_equal(A, B, v); } #endif + + PROFILING_FOOTER(not_equal); } Tensor* Tensor::not_equal(Tensor *A){ @@ -599,6 +707,8 @@ Tensor* Tensor::not_equal(Tensor *A){ void Tensor::not_equal(Tensor *A, Tensor *B, Tensor *C){ checkCompatibility(A, B, C, "Tensor::not_equal"); + PROFILING_HEADER(not_equal); + if (A->isCPU()) { cpu_not_equal(A, B, C); } @@ -613,6 +723,8 @@ void Tensor::not_equal(Tensor *A, Tensor *B, Tensor *C){ fpga_not_equal(A, B, C); } #endif + + PROFILING_FOOTER(not_equal); } int Tensor::eqsize(Tensor *A, Tensor *B){ @@ -639,12 +751,15 @@ int Tensor::sameShape(Tensor *A, Tensor *B) { } int Tensor::equivalent(Tensor *A, Tensor *B, float atol, float rtol, bool equal_nan) { + // Equal device if (A->device != B->device) msg("Tensors in different devices", "Tensor::equivalent"); // Equal ndims and shapes if (!sameShape(A, B)) return 0; + PROFILING_HEADER(equivalent); + // Equal data if (A->isCPU() && B->isCPU()) { // return cpu_allclose(A, B, rtol, atol, equal_nan); @@ -663,5 +778,7 @@ int Tensor::equivalent(Tensor *A, Tensor *B, float atol, float rtol, bool equal_ } #endif + PROFILING_FOOTER(equivalent); + return 1; } diff --git a/src/tensor/tensor_generator.cpp b/src/tensor/tensor_generator.cpp index 7c6c7f9a4..c44306cd0 100644 --- a/src/tensor/tensor_generator.cpp +++ b/src/tensor/tensor_generator.cpp @@ -9,6 +9,7 @@ #include "eddl/tensor/tensor.h" #include "eddl/hardware/cpu/cpu_tensor.h" +#include "eddl/profiling.h" #ifdef cGPU #include "eddl/hardware/gpu/gpu_tensor.h" @@ -22,7 +23,15 @@ using namespace std; +PROFILING_ENABLE_EXTERN(fill_rand_uniform); +PROFILING_ENABLE_EXTERN(fill_rand_signed_uniform); +PROFILING_ENABLE_EXTERN(fill_rand_normal); +PROFILING_ENABLE_EXTERN(fill_rand_binary); + void Tensor::fill_rand_uniform_(float v) { + + PROFILING_HEADER(fill_rand_uniform); + if (isCPU()) { cpu_rand_uniform(this, v); } @@ -38,6 +47,8 @@ void Tensor::fill_rand_uniform_(float v) { } #endif + PROFILING_FOOTER(fill_rand_uniform); + } Tensor* Tensor::fill_rand_uniform(float v){ @@ -47,6 +58,9 @@ Tensor* Tensor::fill_rand_uniform(float v){ } void Tensor::fill_rand_signed_uniform_(float v) { + + PROFILING_HEADER(fill_rand_signed_uniform); + if (isCPU()) { cpu_rand_signed_uniform(this, v); } @@ -62,6 +76,7 @@ void Tensor::fill_rand_signed_uniform_(float v) { } #endif + PROFILING_FOOTER(fill_rand_signed_uniform); } Tensor* Tensor::fill_rand_signed_uniform(float v){ @@ -71,6 +86,9 @@ Tensor* Tensor::fill_rand_signed_uniform(float v){ } void Tensor::fill_rand_normal_(float m, float s, bool fast_math) { + + PROFILING_HEADER(fill_rand_normal); + if (isCPU()) { cpu_rand_normal(this, m, s, fast_math); } @@ -86,6 +104,7 @@ void Tensor::fill_rand_normal_(float m, float s, bool fast_math) { } #endif + PROFILING_FOOTER(fill_rand_normal); } Tensor* Tensor::fill_rand_normal(float m, float s, bool fast_math) { @@ -95,6 +114,9 @@ Tensor* Tensor::fill_rand_normal(float m, float s, bool fast_math) { } void Tensor::fill_rand_binary_(float v) { + + PROFILING_HEADER(fill_rand_binary); + if (isCPU()) { cpu_rand_binary(this, v); } @@ -110,6 +132,7 @@ void Tensor::fill_rand_binary_(float v) { } #endif + PROFILING_FOOTER(fill_rand_binary); } Tensor* Tensor::fill_rand_binary(float v) { diff --git a/src/utils.cpp b/src/utils.cpp index b75ba8a90..c151d1cb2 100755 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -536,7 +536,34 @@ PROFILING_ENABLE(ThresholdedReLu); PROFILING_ENABLE(Conv2D); PROFILING_ENABLE(Conv2D_grad); PROFILING_ENABLE(Conv2D_back); - 
+// losses +PROFILING_ENABLE(cent); +// generator +PROFILING_ENABLE(fill_rand_uniform); +PROFILING_ENABLE(fill_rand_signed_uniform); +PROFILING_ENABLE(fill_rand_normal); +PROFILING_ENABLE(fill_rand_binary); +// comparison +PROFILING_ENABLE(all); +PROFILING_ENABLE(any); +PROFILING_ENABLE(isfinite); +PROFILING_ENABLE(isinf); +PROFILING_ENABLE(isnan); +PROFILING_ENABLE(isneginf); +PROFILING_ENABLE(isposinf); +PROFILING_ENABLE(logical_and); +PROFILING_ENABLE(logical_or); +PROFILING_ENABLE(logical_not); +PROFILING_ENABLE(logical_xor); +PROFILING_ENABLE(allclose); +PROFILING_ENABLE(isclose); +PROFILING_ENABLE(greater); +PROFILING_ENABLE(greater_equal); +PROFILING_ENABLE(less); +PROFILING_ENABLE(less_equal); +PROFILING_ENABLE(equal); +PROFILING_ENABLE(not_equal); +PROFILING_ENABLE(equivalent); void __show_profile() { @@ -653,5 +680,32 @@ void __show_profile() { PROFILING_PRINTF(Conv2D); PROFILING_PRINTF(Conv2D_grad); PROFILING_PRINTF(Conv2D_back); - + // losses + PROFILING_PRINTF(cent); + // generator + PROFILING_PRINTF(fill_rand_uniform); + PROFILING_PRINTF(fill_rand_signed_uniform); + PROFILING_PRINTF(fill_rand_normal); + PROFILING_PRINTF(fill_rand_binary); + // comparison + PROFILING_PRINTF(all); + PROFILING_PRINTF(any); + PROFILING_PRINTF(isfinite); + PROFILING_PRINTF(isinf); + PROFILING_PRINTF(isnan); + PROFILING_PRINTF(isneginf); + PROFILING_PRINTF(isposinf); + PROFILING_PRINTF(logical_and); + PROFILING_PRINTF(logical_or); + PROFILING_PRINTF(logical_not); + PROFILING_PRINTF(logical_xor); + PROFILING_PRINTF(allclose); + PROFILING_PRINTF(isclose); + PROFILING_PRINTF(greater); + PROFILING_PRINTF(greater_equal); + PROFILING_PRINTF(less); + PROFILING_PRINTF(less_equal); + PROFILING_PRINTF(equal); + PROFILING_PRINTF(not_equal); + PROFILING_PRINTF(equivalent); } \ No newline at end of file From 962008edef5db40ef42a8a81c966b46878db198f Mon Sep 17 00:00:00 2001 From: Jose Flich Date: Tue, 3 Nov 2020 17:42:48 +0100 Subject: [PATCH 15/15] UPV-GAP: Added profling of batch normalization, core nn, metrics, pool functions --- src/tensor/nn/tensor_bn.cpp | 48 +++++++++++++++++++++++--------- src/tensor/nn/tensor_core_nn.cpp | 39 ++++++++++++++++++++++---- src/tensor/nn/tensor_metrics.cpp | 13 +++++++++ src/tensor/nn/tensor_pool.cpp | 22 ++++++++++++--- src/utils.cpp | 41 +++++++++++++++++++++++++++ 5 files changed, 141 insertions(+), 22 deletions(-) diff --git a/src/tensor/nn/tensor_bn.cpp b/src/tensor/nn/tensor_bn.cpp index db2d855ce..17b0ce0e4 100644 --- a/src/tensor/nn/tensor_bn.cpp +++ b/src/tensor/nn/tensor_bn.cpp @@ -8,6 +8,7 @@ */ #include "eddl/tensor/nn/tensor_nn.h" #include "eddl/hardware/cpu/nn/cpu_tensor_nn.h" +#include "eddl/profiling.h" #ifdef cFPGA #include "eddl/hardware/fpga/fpga_hw.h" @@ -20,10 +21,18 @@ #include "eddl/hardware/gpu/nn/gpu_tensor_nn.h" #endif +PROFILING_ENABLE_EXTERN(permute_channels_last); +PROFILING_ENABLE_EXTERN(permute_channels_first); +PROFILING_ENABLE_EXTERN(permute_batch_last); +PROFILING_ENABLE_EXTERN(permute_batch_first); + namespace tensorNN { void permute_channels_last(Tensor *A, Tensor *B) { + + PROFILING_HEADER(permute_channels_last); + if (A->isCPU()) { cpu_permute_channels_last(A, B); } @@ -34,13 +43,17 @@ namespace tensorNN { } #endif #ifdef cFPGA - else { - fpga_permute_channels_last(A, B); - } + else { + fpga_permute_channels_last(A, B); + } #endif - } + PROFILING_FOOTER(permute_channels_last); + } void permute_channels_first(Tensor *A, Tensor *B) { + + PROFILING_HEADER(permute_channels_first); + if (A->isCPU()) { cpu_permute_channels_first(A, B); } @@ 
-51,14 +64,18 @@ namespace tensorNN { } #endif #ifdef cFPGA - else { - fpga_permute_channels_first(A, B); - } + else { + fpga_permute_channels_first(A, B); + } #endif + PROFILING_FOOTER(permute_channels_first); } void permute_batch_last(Tensor *A, Tensor *B) { + + PROFILING_HEADER(permute_batch_last); + if (A->isCPU()) { cpu_permute_batch_last(A, B); } @@ -69,13 +86,17 @@ namespace tensorNN { } #endif #ifdef cFPGA - else { - fpga_permute_batch_last(A, B); - } + else { + fpga_permute_batch_last(A, B); + } #endif + PROFILING_FOOTER(permute_batch_last); } void permute_batch_first(Tensor *A, Tensor *B) { + + PROFILING_HEADER(permute_batch_first); + if (A->isCPU()) { cpu_permute_batch_first(A, B); } @@ -86,10 +107,11 @@ namespace tensorNN { } #endif #ifdef cFPGA - else { - fpga_permute_batch_first(A, B); - } + else { + fpga_permute_batch_first(A, B); + } #endif + PROFILING_FOOTER(permute_batch_last); } } \ No newline at end of file diff --git a/src/tensor/nn/tensor_core_nn.cpp b/src/tensor/nn/tensor_core_nn.cpp index 2db8eafc1..5635bd25e 100644 --- a/src/tensor/nn/tensor_core_nn.cpp +++ b/src/tensor/nn/tensor_core_nn.cpp @@ -11,6 +11,7 @@ #include "eddl/tensor/nn/tensor_nn.h" #include "eddl/hardware/cpu/nn/cpu_tensor_nn.h" +#include "eddl/profiling.h" #ifdef cGPU #include "eddl/hardware/gpu/gpu_tensor.h" @@ -25,6 +26,13 @@ extern int next_fpga_tensor_id; #endif +PROFILING_ENABLE_EXTERN(repeat_nn); +PROFILING_ENABLE_EXTERN(d_repeat_nn); +PROFILING_ENABLE_EXTERN(select); +PROFILING_ENABLE_EXTERN(select_back); +PROFILING_ENABLE_EXTERN(set_select); +PROFILING_ENABLE_EXTERN(set_select_back); + namespace tensorNN { @@ -41,6 +49,8 @@ namespace tensorNN { } } + PROFILING_HEADER(repeat_nn); + if (A->isCPU() && B->isCPU()) { cpu_repeat_nn(A, B, size); } @@ -51,15 +61,19 @@ namespace tensorNN { #endif #ifdef cFPGA else { - + printf("repeat_nn not supported yet on FPGA\n"); + exit(1); } #endif + PROFILING_FOOTER(repeat_nn); } void d_repeat_nn(Tensor *D, Tensor *A, vector size) { // TODO: Should be for N dimensions, not 2 (...and generic, not just NN) if ((D->device != A->device)) msg("Tensors in different devices", "Tensor::D_Repeat_NN"); + PROFILING_HEADER(d_repeat_nn); + if (D->isCPU() && A->isCPU()) { cpu_d_repeat_nn(D, A, size); } @@ -70,13 +84,18 @@ namespace tensorNN { #endif #ifdef cFPGA else { - + printf("d_repeat_nn not implemented in FPGA yet\n"); + exit(1); } #endif + PROFILING_FOOTER(d_repeat_nn); } void select(Tensor *A, Tensor* B, SelDescriptor *sd){ + + PROFILING_HEADER(select); + if (A->isCPU() && B->isCPU()) { cpu_select_nn(A, B, sd); } @@ -92,10 +111,13 @@ namespace tensorNN { fpga_select_nn(A, B, sd); } #endif - + PROFILING_FOOTER(select); } void select_back(Tensor *A, Tensor* B, SelDescriptor *sd){ + + PROFILING_HEADER(select_back); + if (A->isCPU() && B->isCPU()) { cpu_select_back_nn(A, B, sd); } @@ -111,10 +133,13 @@ namespace tensorNN { fpga_select_back_nn(A, B, sd); } #endif - + PROFILING_FOOTER(select_back); } void set_select(Tensor *A, Tensor *B, SelDescriptor *sd){ + + PROFILING_HEADER(set_select); + if (A->isCPU() && B->isCPU()) { cpu_set_select_nn(A, B, sd); } @@ -130,10 +155,14 @@ namespace tensorNN { fpga_set_select_nn(A, B, sd); } #endif + PROFILING_FOOTER(set_select); } void set_select_back(Tensor *A, Tensor* B, SelDescriptor *sd){ + + PROFILING_HEADER(set_select_back); + if (A->isCPU() && B->isCPU()) { cpu_set_select_back_nn(A, B, sd); } @@ -149,7 +178,7 @@ namespace tensorNN { fpga_set_select_back_nn(A, B, sd); } #endif - + PROFILING_FOOTER(set_select_back); } } diff 
--git a/src/tensor/nn/tensor_metrics.cpp b/src/tensor/nn/tensor_metrics.cpp index b1a6ffe5d..ac6753fe3 100644 --- a/src/tensor/nn/tensor_metrics.cpp +++ b/src/tensor/nn/tensor_metrics.cpp @@ -8,6 +8,7 @@ */ #include "eddl/tensor/nn/tensor_nn.h" #include "eddl/hardware/cpu/nn/cpu_tensor_nn.h" +#include "eddl/profiling.h" #ifdef cGPU #include "eddl/hardware/gpu/gpu_tensor.h" @@ -20,6 +21,9 @@ #include "eddl/hardware/fpga/nn/fpga_nn.h" #endif +PROFILING_ENABLE_EXTERN(accuracy); +PROFILING_ENABLE_EXTERN(bin_accuracy); + namespace tensorNN { @@ -28,6 +32,8 @@ namespace tensorNN { if (!Tensor::sameShape(A, B)) msg("Incompatible dims", "Tensor::accuracy"); if (A->ndim != 2) msg("Accuracy only over 2D Tensor (batch x probs)", "Tensor::Accuracy"); + PROFILING_HEADER(accuracy); + int acc = 0; B->tsem->lock(); @@ -46,6 +52,9 @@ namespace tensorNN { } #endif B->tsem->unlock(); + + PROFILING_FOOTER(accuracy); + return acc; } @@ -58,6 +67,7 @@ namespace tensorNN { if (A->shape[1] != 1) msg("Accuracy only over 2D Tensor (batch x prob) within shape:{batchx1}", "Tensor::Bin_Accuracy"); + PROFILING_HEADER(bin_accuracy); int acc = 0; @@ -77,6 +87,9 @@ namespace tensorNN { } #endif B->tsem->unlock(); + + PROFILING_FOOTER(bin_accuracy); + return acc; } diff --git a/src/tensor/nn/tensor_pool.cpp b/src/tensor/nn/tensor_pool.cpp index d3e0a7a39..c33db48c7 100644 --- a/src/tensor/nn/tensor_pool.cpp +++ b/src/tensor/nn/tensor_pool.cpp @@ -21,7 +21,10 @@ #include "eddl/hardware/fpga/nn/fpga_nn.h" #endif -PROFILING_ENABLE(MPool2D); +PROFILING_ENABLE_EXTERN(MPool2D); +PROFILING_ENABLE_EXTERN(MPool2D_back); +PROFILING_ENABLE_EXTERN(AvgPool2D); +PROFILING_ENABLE_EXTERN(AvgPool2D_back); namespace tensorNN { @@ -35,7 +38,7 @@ namespace tensorNN { ///////////////////////////////////////////////////////////////////// if ((D->I->ndim != 4)) msg("Tensors are not 4D", "Tensor::MPool2D"); - PROFILING_HEADER(MPool2D); + PROFILING_HEADER(MPool2D); D->O->tsem->lock(); if (D->I->isCPU()) { @@ -55,8 +58,7 @@ namespace tensorNN { #endif D->O->tsem->unlock(); - PROFILING_FOOTER(MPool2D); - PROFILING_PRINTF(MPool2D); + PROFILING_FOOTER(MPool2D); } void MPool2D_back(PoolDescriptor *D) { @@ -68,6 +70,8 @@ namespace tensorNN { ///////////////////////////////////////////////////////////////////// if ((D->I->ndim != 4)) msg("Tensors are not 4D", "Tensor::MPool2D_back"); + PROFILING_HEADER(MPool2D_back); + D->ID->tsem->lock(); if (D->I->isCPU()) { cpu_mpool2D_back(D); @@ -85,6 +89,8 @@ namespace tensorNN { } #endif D->ID->tsem->unlock(); + + PROFILING_FOOTER(MPool2D_back); } @@ -97,6 +103,8 @@ namespace tensorNN { ///////////////////////////////////////////////////////////////////// if ((D->I->ndim != 4)) msg("Tensors are not 4D", "Tensor::AvgPool2D"); + PROFILING_HEADER(AvgPool2D); + D->O->tsem->lock(); if (D->I->isCPU()) { cpu_avgpool2D(D); @@ -114,6 +122,8 @@ namespace tensorNN { } #endif D->O->tsem->unlock(); + + PROFILING_FOOTER(AvgPool2D); } void AvgPool2D_back(PoolDescriptor *D) { @@ -125,6 +135,8 @@ namespace tensorNN { ///////////////////////////////////////////////////////////////////// if ((D->I->ndim != 4)) msg("Tensors are not 4D", "Tensor::AvgPool2D_back"); + PROFILING_HEADER(AvgPool2D_back); + D->ID->tsem->lock(); if (D->I->isCPU()) { cpu_avgpool2D_back(D); @@ -142,6 +154,8 @@ namespace tensorNN { } #endif D->ID->tsem->unlock(); + + PROFILING_FOOTER(AvgPool2D_back); } } diff --git a/src/utils.cpp b/src/utils.cpp index c151d1cb2..7aa50e00a 100755 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -564,6 +564,26 @@ 
 PROFILING_ENABLE(less_equal);
 PROFILING_ENABLE(equal);
 PROFILING_ENABLE(not_equal);
 PROFILING_ENABLE(equivalent);
+// bn
+PROFILING_ENABLE(permute_channels_last);
+PROFILING_ENABLE(permute_channels_first);
+PROFILING_ENABLE(permute_batch_last);
+PROFILING_ENABLE(permute_batch_first);
+// core_nn
+PROFILING_ENABLE(repeat_nn);
+PROFILING_ENABLE(d_repeat_nn);
+PROFILING_ENABLE(select);
+PROFILING_ENABLE(select_back);
+PROFILING_ENABLE(set_select);
+PROFILING_ENABLE(set_select_back);
+// metrics
+PROFILING_ENABLE(accuracy);
+PROFILING_ENABLE(bin_accuracy);
+// pool
+PROFILING_ENABLE(MPool2D);
+PROFILING_ENABLE(MPool2D_back);
+PROFILING_ENABLE(AvgPool2D);
+PROFILING_ENABLE(AvgPool2D_back);
 
 void __show_profile() {
@@ -708,4 +728,25 @@ void __show_profile() {
   PROFILING_PRINTF(equal);
   PROFILING_PRINTF(not_equal);
   PROFILING_PRINTF(equivalent);
+  // bn
+  PROFILING_PRINTF(permute_channels_last);
+  PROFILING_PRINTF(permute_channels_first);
+  PROFILING_PRINTF(permute_batch_last);
+  PROFILING_PRINTF(permute_batch_first);
+  // core_nn
+  PROFILING_PRINTF(repeat_nn);
+  PROFILING_PRINTF(d_repeat_nn);
+  PROFILING_PRINTF(select);
+  PROFILING_PRINTF(select_back);
+  PROFILING_PRINTF(set_select);
+  PROFILING_PRINTF(set_select_back);
+  // metrics
+  PROFILING_PRINTF(accuracy);
+  PROFILING_PRINTF(bin_accuracy);
+  // pool
+  PROFILING_PRINTF(MPool2D);
+  PROFILING_PRINTF(MPool2D_back);
+  PROFILING_PRINTF(AvgPool2D);
+  PROFILING_PRINTF(AvgPool2D_back);
+
 }
\ No newline at end of file
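
Note on the PROFILING_* macros used throughout these patches: PROFILING_ENABLE(op) defines the per-operation counters once (in src/utils.cpp), PROFILING_ENABLE_EXTERN(op) makes them visible to each instrumented translation unit, PROFILING_HEADER(op) and PROFILING_FOOTER(op) bracket the timed region inside the operation, and __show_profile() reports every counter through PROFILING_PRINTF(op). The macros themselves are declared in eddl/profiling.h, the header these files #include, which is not shown here. The snippet below is only an illustrative sketch of how such a macro set could be implemented with std::chrono; it is not the actual contents of eddl/profiling.h, and the file name profiling_sketch.h and the identifiers prof_<op>_time / prof_<op>_calls are invented for the example.

// profiling_sketch.h : illustrative only, NOT the real eddl/profiling.h
#include <chrono>
#include <cstdio>

// Define one accumulator pair per profiled operation (done once per program).
#define PROFILING_ENABLE(fn) \
  unsigned long long prof_##fn##_time = 0;  /* accumulated microseconds */ \
  unsigned long long prof_##fn##_calls = 0  /* number of calls */

// Reference the accumulators from another translation unit.
#define PROFILING_ENABLE_EXTERN(fn) \
  extern unsigned long long prof_##fn##_time; \
  extern unsigned long long prof_##fn##_calls

// Open a timed region at the top of the instrumented function.
#define PROFILING_HEADER(fn) \
  auto prof_##fn##_start = std::chrono::high_resolution_clock::now()

// Close the timed region and accumulate the elapsed time and call count.
#define PROFILING_FOOTER(fn) \
  do { \
    auto prof_##fn##_end = std::chrono::high_resolution_clock::now(); \
    prof_##fn##_time += std::chrono::duration_cast<std::chrono::microseconds>( \
        prof_##fn##_end - prof_##fn##_start).count(); \
    prof_##fn##_calls++; \
  } while (0)

// Print one summary line per operation (called from __show_profile()).
#define PROFILING_PRINTF(fn) \
  printf("%-24s calls %10llu  time %14llu us\n", #fn, prof_##fn##_calls, prof_##fn##_time)

// Instrumentation pattern mirrored from the hunks above (op name taken from
// the cross-entropy hunk):
//   PROFILING_ENABLE_EXTERN(cent);   // at file scope in the instrumented .cpp
//   ...
//   PROFILING_HEADER(cent);          // before the CPU / cGPU / cFPGA dispatch
//   /* device dispatch */
//   PROFILING_FOOTER(cent);          // after the dispatch

One caveat visible in the hunks above: operations that return from inside the device-dispatch chain (for example Tensor::allclose, whose CPU, GPU, and FPGA branches return before the footer) never reach their PROFILING_FOOTER, so those counters only reflect the fall-through path.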