diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py
index 8b3117af7a..7ba2ad4fbb 100644
--- a/hls4ml/backends/__init__.py
+++ b/hls4ml/backends/__init__.py
@@ -5,14 +5,19 @@
from hls4ml.backends.vivado.vivado_backend import VivadoBackend
from hls4ml.backends.vivado_accelerator.vivado_accelerator_backend import VivadoAcceleratorBackend
from hls4ml.backends.vivado_accelerator.vivado_accelerator_config import VivadoAcceleratorConfig # noqa: F401
-
from hls4ml.backends.catapult.catapult_backend import CatapultBackend # isort: skip
-
from hls4ml.backends.vitis.vitis_backend import VitisBackend # isort: skip
+from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_backend import (
+ VitisAcceleratorIPFlowBackend,
+)
+from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_config import (
+ VitisAcceleratorIPFlowConfig,
+)
register_backend('Vivado', VivadoBackend)
register_backend('VivadoAccelerator', VivadoAcceleratorBackend)
register_backend('Vitis', VitisBackend)
+register_backend('VitisAcceleratorIPFlow', VitisAcceleratorIPFlowBackend)
register_backend('Quartus', QuartusBackend)
register_backend('Catapult', CatapultBackend)
register_backend('SymbolicExpression', SymbolicExpressionBackend)
diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/__init__.py b/hls4ml/backends/vitis_accelerator_ip_flow/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/passes/__init__.py b/hls4ml/backends/vitis_accelerator_ip_flow/passes/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/supported_boards.json b/hls4ml/backends/vitis_accelerator_ip_flow/supported_boards.json
new file mode 100644
index 0000000000..1279ec22d0
--- /dev/null
+++ b/hls4ml/backends/vitis_accelerator_ip_flow/supported_boards.json
@@ -0,0 +1,42 @@
+{
+ "pynq-z2": {
+ "part": "xc7z020clg400-1",
+ "tcl_scripts": {"axi_lite": "axi_lite_design.tcl", "axi_stream": "axi_stream_design.tcl"},
+ "python_drivers": {"axi_stream": "axi_stream_driver.py"},
+ "c_drivers": {}
+ },
+ "zcu102": {
+ "part": "xczu9eg-ffvb1156-2-e",
+ "tcl_scripts": { "axi_stream": "axi_stream_design.tcl"},
+ "python_drivers": {"axi_stream": "axi_stream_driver.py"},
+ "c_drivers": {}
+ },
+ "alveo-u50": {
+ "part": "xcu50-fsvh2104-2-e",
+ "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"},
+ "python_drivers": {"axi_stream": "axi_stream_driver.py"},
+ "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"},
+ "c_drivers": {}
+ },
+ "alveo-u250": {
+ "part": "xcu250-figd2104-2L-e",
+ "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"},
+ "python_drivers": {"axi_stream": "axi_stream_driver.py"},
+ "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"},
+ "c_drivers": {}
+ },
+ "alveo-u200": {
+ "part": "xcu200-fsgd2104-2-e",
+ "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"},
+ "python_drivers": {"axi_stream": "axi_stream_driver.py"},
+ "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"},
+ "c_drivers": {}
+ },
+ "alveo-u280": {
+ "part": "xcu280-fsvh2892-2L-e",
+ "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"},
+ "python_drivers": {"axi_stream": "axi_stream_driver.py"},
+ "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"},
+ "c_drivers": {}
+ }
+}
diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py
new file mode 100644
index 0000000000..0372a75b75
--- /dev/null
+++ b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py
@@ -0,0 +1,122 @@
+import os
+
+from hls4ml.backends import VitisBackend, VivadoBackend
+from hls4ml.model.flow import register_flow
+from hls4ml.report import parse_vivado_report
+
+
class VitisAcceleratorIPFlowBackend(VitisBackend):
    """Vitis HLS backend that can additionally drive Vivado to produce a bitfile.

    The HLS part of the build is delegated to ``VitisBackend.build``; when
    requested, ``vivado -mode batch -source design.tcl`` is then run inside the
    project output directory.
    """

    def __init__(self):
        # Start the ``__init__`` lookup *after* VivadoBackend in the MRO so that
        # this backend is initialised under its own name rather than through
        # the parent backends' constructors.
        super(VivadoBackend, self).__init__(name='VitisAcceleratorIPFlow')
        self._register_layer_attributes()
        self._register_flows()

    def build(
        self,
        model,
        reset=False,
        csim=True,
        synth=True,
        cosim=False,
        validation=False,
        export=False,
        vsynth=False,
        bitfile=False,
    ):
        """Run the Vitis HLS build and, optionally, the Vivado bitfile flow.

        Args:
            model: model whose ``config.get_output_dir()`` points at the written project.
            reset, csim, synth, cosim, validation, export, vsynth: forwarded
                unchanged to ``VitisBackend.build``.
            bitfile: when True, run ``vivado -mode batch -source design.tcl``
                from the project output directory after the HLS build.

        Returns:
            The result of ``parse_vivado_report`` for the output directory.

        Note:
            FIFO depth optimisation (``fifo_opt``) is not wired up for this backend yet.
        """
        # Run the VitisBackend build first.
        super().build(
            model,
            reset=reset,
            csim=csim,
            synth=synth,
            cosim=cosim,
            validation=validation,
            export=export,
            vsynth=vsynth,
        )

        if bitfile:
            curr_dir = os.getcwd()
            os.chdir(model.config.get_output_dir())
            try:
                # os.system reports failure through its return value, not by raising.
                ret_val = os.system('vivado -mode batch -source design.tcl')
            except Exception:
                ret_val = -1
            finally:
                # Always restore the caller's working directory.
                os.chdir(curr_dir)
            if ret_val != 0:
                print("Something went wrong, check the Vivado logs")

        return parse_vivado_report(model.config.get_output_dir())

    def create_initial_config(
        self,
        board='pynq-z2',
        part=None,
        clock_period=5,
        clock_uncertainty='12.5%',
        io_type='io_parallel',
        interface='axi_stream',
        driver='python',
        input_type='float',
        output_type='float',
        platform='xilinx_u250_xdma_201830_2',
    ):
        '''
        Create initial accelerator config with default parameters

        Args:
            board: one of the keys defined in supported_boards.json. ``None`` selects 'pynq-z2'.
            part: FPGA part, forwarded to the parent backend's ``create_initial_config``.
            clock_period: clock period passed to hls project
            clock_uncertainty: clock uncertainty, forwarded to the parent backend.
            io_type: io_parallel or io_stream
            interface: `axi_stream`: generate hardware designs and drivers which exploit axi stream channels.
                       `axi_master`: generate hardware designs and drivers which exploit axi master channels.
                       `axi_lite` : generate hardware designs and drivers which exploit axi lite channels. (Don't use it
                       to exchange large amount of data)
            driver: `python`: generates the python driver to use the accelerator in the PYNQ stack.
                    `c`: generates the c driver to use the accelerator bare-metal.
            input_type: the wrapper input precision. Can be `float` or an `ap_type`. Note:
                        VitisAcceleratorIPFlowConfig rounds the number of bits up to the next multiple of 8.
            output_type: the wrapper output precision. Can be `float` or an `ap_type`. Note:
                         VitisAcceleratorIPFlowConfig rounds the number of bits up to the next multiple of 8.
            platform: development target platform. Accepted for interface compatibility but
                      currently not stored in the returned config.

        Returns:
            populated config
        '''
        board = board if board is not None else 'pynq-z2'
        config = super().create_initial_config(part, clock_period, clock_uncertainty, io_type)
        config['AcceleratorConfig'] = {
            'Board': board,
            'Interface': interface,  # axi_stream, axi_master, axi_lite
            'Driver': driver,
            'Precision': {
                'Input': input_type,  # float, double or ap_fixed
                'Output': output_type,  # float, double or ap_fixed
            },
        }
        return config

    def get_default_flow(self):
        """Return the flow registered as this backend's default."""
        return self._default_flow

    def get_writer_flow(self):
        """Return the flow that writes the HLS project."""
        return self._writer_flow

    def _register_flows(self):
        """Register the writer flow on top of the ``vivado:ip`` flow."""
        vivado_ip = 'vivado:ip'
        writer_passes = ['make_stamp', 'vitisacceleratoripflow:write_hls']
        self._writer_flow = register_flow('write', writer_passes, requires=[vivado_ip], backend=self.name)
        self._default_flow = vivado_ip
        # NOTE: no 'fifo_depth_optimization' flow is registered for this backend yet.
diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_config.py b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_config.py
new file mode 100644
index 0000000000..07961a9b6f
--- /dev/null
+++ b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_config.py
@@ -0,0 +1,169 @@
+import json
+import os
+
+import numpy as np
+
+from hls4ml.model.layers import FixedPrecisionType, IntegerPrecisionType
+
+
class VitisAcceleratorIPFlowConfig:
    """Resolved accelerator settings (board, interface, driver, I/O precision).

    The values are read from the ``AcceleratorConfig`` section of the hls4ml
    configuration dictionary and checked against ``supported_boards.json``,
    which lives next to this module.
    """

    # Root of this backend's template tree, as a path relative to the caller.
    _TEMPLATE_ROOT = '../templates/vitis_accelerator_ip_flow/'

    def __init__(self, config, model_inputs, model_outputs):
        """Read and validate the accelerator configuration.

        Args:
            config: object exposing the configuration dict as ``config.config`` and
                the backend as ``config.backend`` (used for ``convert_precision_string``).
            model_inputs: sequence holding exactly one input variable.
            model_outputs: sequence holding exactly one output variable.

        Raises:
            Exception: if the board is not listed in supported_boards.json, or if
                ``AcceleratorConfig`` is given without ``Precision``/``Input``/``Output``.
            AssertionError: if the model has more than one input or output.
        """
        self.config = config.config
        self.board = self.config.get('AcceleratorConfig', {}).get('Board', 'pynq-z2')

        boards_file = os.path.join(os.path.dirname(__file__), 'supported_boards.json')
        with open(boards_file) as f:
            self.supported_boards = json.load(f)
        if self.board not in self.supported_boards:
            raise Exception('The board does not appear in supported_boards.json file')
        self.part = self.supported_boards[self.board]['part']

        # The board dictates the part: override a conflicting user-supplied Part.
        user_part = self.config.get('Part')
        if user_part is not None and user_part != self.part:
            print(
                'WARNING: You set a Part that does not correspond to the Board you specified. The correct '
                'Part is now set.'
            )
            self.config['Part'] = self.part

        accel_config = self.config.get('AcceleratorConfig', None)
        if accel_config is not None:
            prec = accel_config.get('Precision')
            if prec is None:
                raise Exception('Precision must be provided in the AcceleratorConfig')
            if prec.get('Input') is None or prec.get('Output') is None:
                raise Exception('Input and Output fields must be provided in the AcceleratorConfig->Precision')
        else:
            # No accelerator section at all: install the defaults.
            accel_config = {
                'Precision': {'Input': 'float', 'Output': 'float'},
                'Driver': 'python',
                'Interface': 'axi_stream',
            }
            config.config['AcceleratorConfig'] = accel_config

        accel_config = self.config['AcceleratorConfig']
        self.interface = accel_config.get('Interface', 'axi_stream')  # axi_stream, axi_master, axi_lite
        self.driver = accel_config.get('Driver', 'python')  # python or c
        self.input_type = accel_config['Precision'].get('Input', 'float')  # float, double or ap_fixed
        self.output_type = accel_config['Precision'].get('Output', 'float')  # float, double or ap_fixed
        self.platform = accel_config.get('Platform', 'xilinx_u250_xdma_201830_2')  # platform folder name
        # NOTE(review): assumes the project clock period is stored under the
        # top-level 'ClockPeriod' key of the config dict - confirm against the
        # backend's create_initial_config. None when the key is absent.
        self.clock_period = self.config.get('ClockPeriod')

        assert (
            len(model_inputs) == 1
        ), "Only models with one input tensor are currently supported by VitisAcceleratorIPFlowBackend"
        assert (
            len(model_outputs) == 1
        ), "Only models with one output tensor are currently supported by VitisAcceleratorIPFlowBackend"
        self.inp = model_inputs[0]
        self.out = model_outputs[0]
        inp_axi_t = self.input_type
        out_axi_t = self.output_type

        # Non floating-point interface types are widened to a whole number of bytes.
        if inp_axi_t not in ['float', 'double']:
            self.input_type = self._next_factor8_type(config.backend.convert_precision_string(inp_axi_t))
        if out_axi_t not in ['float', 'double']:
            self.output_type = self._next_factor8_type(config.backend.convert_precision_string(out_axi_t))

        self.input_bitwidth = self._axi_bitwidth(config, inp_axi_t)
        self.output_bitwidth = self._axi_bitwidth(config, out_axi_t)

    @staticmethod
    def _axi_bitwidth(config, axi_t):
        """Return the bit width of an interface type given as a precision string.

        NOTE(review): for fixed/integer types this is the width as requested by
        the user, not the width rounded up by ``_next_factor8_type``.
        """
        if axi_t == 'float':
            return 32
        if axi_t == 'double':
            return 64
        return config.backend.convert_precision_string(axi_t).width

    def _next_factor8_type(self, p):
        '''Return a new type with the width rounded up to the next multiple of 8.

        Args:
            p : IntegerPrecisionType or FixedPrecisionType
        Returns:
            An IntegerPrecisionType or FixedPrecisionType with the width rounded up to the next multiple
            of 8 of p's width. Other parameters (integer bits, sign, rounding/saturation modes) stay the
            same. ``None`` is returned for any other type.
        '''
        W = p.width
        newW = int(np.ceil(W / 8) * 8)
        if isinstance(p, FixedPrecisionType):
            return FixedPrecisionType(newW, p.integer, p.signed, p.rounding_mode, p.saturation_mode, p.saturation_bits)
        elif isinstance(p, IntegerPrecisionType):
            return IntegerPrecisionType(newW, p.signed)

    def _board_template_dir(self):
        """Return the template sub-directory for the board (all Alveo boards share 'alveo')."""
        return 'alveo' if self.board.startswith('alveo') else self.board

    def get_io_bitwidth(self):
        """Return ``(input_bitwidth, output_bitwidth)``."""
        return self.input_bitwidth, self.output_bitwidth

    def get_corrected_types(self):
        """Return ``(input_type, output_type, input_variable, output_variable)``."""
        return self.input_type, self.output_type, self.inp, self.out

    def get_interface(self):
        return self.interface

    def get_board_info(self, board=None):
        """Return the supported_boards.json entry for ``board`` (default: the configured board).

        Raises:
            Exception: if the board is not listed.
        """
        if board is None:
            board = self.board
        if board in self.supported_boards:
            return self.supported_boards[board]
        raise Exception('The board is still not supported')

    def get_part(self):
        return self.part

    def get_driver(self):
        return self.driver

    def get_board(self):
        return self.board

    def get_platform(self):
        return self.platform

    def get_clock_period(self):
        """Return the clock period captured at construction (``None`` if not configured)."""
        return self.clock_period

    def get_driver_path(self):
        """Return the relative path of the driver template for the board, driver and interface."""
        return self._TEMPLATE_ROOT + self._board_template_dir() + '/' + self.driver + '_drivers/' + self.get_driver_file()

    def get_driver_file(self):
        """Return the driver file name: ``<interface>_driver.py`` for python, ``.h`` otherwise."""
        driver_ext = '.py' if self.driver == 'python' else '.h'
        return self.interface + '_driver' + driver_ext

    def get_krnl_rtl_src_dir(self):
        """Return the relative path of the Alveo ``krnl_rtl_src`` template directory."""
        return self._TEMPLATE_ROOT + 'alveo/krnl_rtl_src'

    def get_input_type(self):
        return self.input_type

    def get_output_type(self):
        return self.output_type

    def get_tcl_file_path(self):
        """Return the relative path of the Tcl design script for the board and interface.

        Raises:
            Exception: if the board entry defines no Tcl scripts, or none for the interface.
        """
        board_info = self.get_board_info(self.board)
        tcl_scripts = board_info.get('tcl_scripts', None)
        if tcl_scripts is None:
            raise Exception('No tcl scripts definition available for the board in supported_boards.json')
        tcl_script = tcl_scripts.get(self.interface, None)
        if tcl_script is None:
            raise Exception('No tcl script definition available for the desired interface in supported_boards.json')
        return self._TEMPLATE_ROOT + self._board_template_dir() + '/tcl_scripts/' + tcl_script
diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h
index 20b6fecb49..aad5d9a430 100644
--- a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h
+++ b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h
@@ -86,7 +86,7 @@ void separable_conv_1d_cl(hls::stream &data, hls::stream &res,
#pragma HLS DATAFLOW
hls::stream depthwise_res;
- unsigned res_depth = CONFIG_T::depthwise_config::out_width;
+ const unsigned res_depth = CONFIG_T::depthwise_config::out_width;
#pragma HLS STREAM variable=depthwise_res depth=res_depth
depthwise_conv_1d_buffer_cl(data, depthwise_res,
diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h
index a3747990e0..a119fb9e2a 100644
--- a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h
+++ b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h
@@ -120,7 +120,7 @@ void separable_conv_2d_cl(hls::stream &data, hls::stream &res,
#pragma HLS DATAFLOW
hls::stream depthwise_res;
- unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width;
+ const unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width;
#pragma HLS STREAM variable=depthwise_res depth=res_depth
depthwise_conv_2d_buffer_cl(data, depthwise_res,
diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/build_lib.sh b/hls4ml/templates/vitis_accelerator_ip_flow/build_lib.sh
new file mode 100644
index 0000000000..262ce00d63
--- /dev/null
+++ b/hls4ml/templates/vitis_accelerator_ip_flow/build_lib.sh
@@ -0,0 +1,21 @@
#!/bin/bash
# Compile the project sources (core, AXI wrapper and bridge) and link them
# into the shared library firmware/${PROJECT}-${LIB_STAMP}.so.

# Abort on the first failing compile/link step so that the caller sees a
# non-zero exit status instead of the status of the final cleanup.
set -e

# Remove intermediate object files on every exit path (success or failure).
trap 'rm -f *.o' EXIT

CC=g++
if [[ "$OSTYPE" == "linux-gnu"* ]]; then
    CFLAGS="-O3 -fPIC -std=c++11 -fno-gnu-unique"
elif [[ "$OSTYPE" == "darwin"* ]]; then
    CFLAGS="-O3 -fPIC -std=c++11"
else
    # Unknown platform: fall back to the portable flag set rather than
    # compiling without -fPIC / optimisation.
    CFLAGS="-O3 -fPIC -std=c++11"
fi
VITIS_ACCELERATOR_FLAGS="VITIS_ACCELERATOR_IP_FLOW"
CFLAGS="$CFLAGS -D$VITIS_ACCELERATOR_FLAGS"

INCFLAGS="-Ifirmware/ap_types/"

PROJECT=myproject
LIB_STAMP=mystamp

${CC} ${CFLAGS} ${INCFLAGS} -c firmware/${PROJECT}.cpp -o ${PROJECT}.o
${CC} ${CFLAGS} ${INCFLAGS} -c firmware/${PROJECT}_axi.cpp -o ${PROJECT}_axi.o
${CC} ${CFLAGS} ${INCFLAGS} -c ${PROJECT}_bridge.cpp -o ${PROJECT}_bridge.o
${CC} ${CFLAGS} ${INCFLAGS} -shared ${PROJECT}.o ${PROJECT}_axi.o ${PROJECT}_bridge.o -o firmware/${PROJECT}-${LIB_STAMP}.so
diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.cpp b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.cpp
new file mode 100644
index 0000000000..cf6c0b9c25
--- /dev/null
+++ b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.cpp
@@ -0,0 +1,14 @@
+// hls-fpga-machine-learning insert include
+
+void myproject_axi(hls::stream &in, hls::stream &out) {
+
+ // hls-fpga-machine-learning insert interface
+
+ // hls-fpga-machine-learning insert local vars
+
+ // hls-fpga-machine-learning insert enqueue
+
+ // hls-fpga-machine-learning insert call
+
+ // hls-fpga-machine-learning insert dequeue
+}
diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.h b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.h
new file mode 100644
index 0000000000..d0d88bfecf
--- /dev/null
+++ b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.h
@@ -0,0 +1,10 @@
+#ifndef MYPROJECT_AXI_H_
+#define MYPROJECT_AXI_H_
+
+#include
+// hls-fpga-machine-learning insert include
+
+// hls-fpga-machine-learning insert definitions
+
+void myproject_axi(hls::stream &in, hls::stream &out);
+#endif
diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/python_drivers/axi_stream_driver.py b/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/python_drivers/axi_stream_driver.py
new file mode 100644
index 0000000000..1aac79f2d3
--- /dev/null
+++ b/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/python_drivers/axi_stream_driver.py
@@ -0,0 +1,75 @@
+from datetime import datetime
+
+import numpy as np
+from pynq import Overlay, allocate
+
+
class NeuralNetworkOverlay(Overlay):
    """PYNQ overlay that streams data through the accelerator via ``axi_dma_0``."""

    def __init__(
        self, bitfile_name, x_shape, y_shape, dtype=np.float32, dtbo=None, download=True, ignore_version=False, device=None
    ):
        """Load the overlay and allocate the DMA buffers.

        Args:
            bitfile_name: bitstream to load.
            x_shape: shape of the input buffer.
            y_shape: shape of the output buffer.
            dtype: element type of both buffers. Use ``np.float32`` when the
                interface uses 'float'; for 'ap_fixed' interfaces use the numpy
                integer type of matching width and pass ``encode``/``decode``
                to ``predict``.
            dtbo, download, ignore_version, device: forwarded to ``Overlay``.
        """
        super().__init__(bitfile_name, dtbo=dtbo, download=download, ignore_version=ignore_version, device=device)
        # The DMA may live inside a 'hier_0' hierarchy or at the top level of
        # the block design, depending on the Tcl script used to build it.
        dma_parent = getattr(self, 'hier_0', self)
        self.sendchannel = dma_parent.axi_dma_0.sendchannel
        self.recvchannel = dma_parent.axi_dma_0.recvchannel
        self.input_buffer = allocate(shape=x_shape, dtype=dtype)
        self.output_buffer = allocate(shape=y_shape, dtype=dtype)

    def _print_dt(self, timea, timeb, N):
        """Print and return ``(elapsed_seconds, inferences_per_second)`` for N samples."""
        dts = (timeb - timea).total_seconds()
        rate = N / dts
        print(f"Classified {N} samples in {dts} seconds ({rate} inferences / s)")
        return dts, rate

    def predict(self, X, debug=False, profile=False, encode=None, decode=None):
        """
        Obtain the predictions of the NN implemented in the FPGA.
        Parameters:
        - X : the input vector. Should be numpy ndarray matching the `x_shape` given to the constructor.
              The buffers use the `dtype` given to the constructor; when the accelerator interface uses
              'ap_fixed', the encoding/decoding has to be computed by the PS. For example for
              'ap_fixed<16,6>' the following 2 functions are the correct one to use for encode/decode
              'float' -> 'ap_fixed<16,6>':
              ```
                def encode(xi):
                    return np.int16(round(xi * 2**10)) # note 2**10 = 2**(A-B)
                def decode(yi):
                    return yi * 2**-10
                encode_v = np.vectorize(encode) # to apply them element-wise
                decode_v = np.vectorize(decode)
              ```
        - debug : boolean. Set it to `True` to print a message after each DMA step.
        - profile : boolean. Set it to `True` to print the performance of the algorithm in term of `inference/s`.
        - encode/decode: function pointers applied to the input before sending / to the output after receiving.
        - return: the output (and, when `profile` is set, also the elapsed seconds and the rate).
                  Without `decode` the returned array is the overlay's own output buffer, which is
                  overwritten by the next call; copy it if it must outlive that call.
        """
        if profile:
            timea = datetime.now()
        if encode is not None:
            X = encode(X)
        self.input_buffer[:] = X
        self.sendchannel.transfer(self.input_buffer)
        self.recvchannel.transfer(self.output_buffer)
        if debug:
            print("Transfer OK")
        self.sendchannel.wait()
        if debug:
            print("Send OK")
        self.recvchannel.wait()
        if debug:
            print("Receive OK")

        # Keep self.output_buffer pointing at the allocated DMA buffer: the
        # decoded array is a separate object and must not replace it.
        result = self.output_buffer
        if decode is not None:
            result = decode(result)

        if profile:
            timeb = datetime.now()
            dts, rate = self._print_dt(timea, timeb, len(X))
            return result, dts, rate
        return result
diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/tcl_scripts/axi_stream_design.tcl
new file mode 100644
index 0000000000..e8db1e6782
--- /dev/null
+++ b/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/tcl_scripts/axi_stream_design.tcl
@@ -0,0 +1,69 @@
# Vivado block design for the pynq-z2 board: Zynq PS + AXI DMA + the HLS IP
# connected through AXI-Stream, implemented up to bitstream generation.
#
# project.tcl (next to this script) is expected to define ${project_name}.
# TODO: try to remove startgroup and endgroup and see if it works
set tcldir [file dirname [info script]]
source [file join $tcldir project.tcl]

create_project project_1 ${project_name}_vitis_accelerator_ip_flow -part xc7z020clg400-1 -force

# set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project]
set_property ip_repo_paths ${project_name}_prj [current_project]
update_ip_catalog

create_bd_design "design_1"

startgroup
create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_0
endgroup

apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells processing_system7_0]

startgroup
set_property -dict [list \
    CONFIG.PCW_USE_S_AXI_HP0 {1} \
    CONFIG.PCW_USE_S_AXI_HP2 {1} \
] [get_bd_cells processing_system7_0]
endgroup

startgroup
create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0
endgroup

set_property -dict [list CONFIG.c_m_axi_s2mm_data_width.VALUE_SRC USER] [get_bd_cells axi_dma_0]
set_property -dict [list \
    CONFIG.c_include_sg {0} \
    CONFIG.c_m_axi_mm2s_data_width {64} \
    CONFIG.c_m_axi_s2mm_data_width {64} \
    CONFIG.c_mm2s_burst_size {32} \
    CONFIG.c_sg_length_width {26} \
] [get_bd_cells axi_dma_0]

startgroup
create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0
endgroup

connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${project_name}_axi_0/in_r]
connect_bd_intf_net [get_bd_intf_pins ${project_name}_axi_0/out_r] [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM]

# TODO: make clock a variable
startgroup
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}}  [get_bd_intf_pins axi_dma_0/S_AXI_LITE]
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}}  [get_bd_intf_pins processing_system7_0/S_AXI_HP0]
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_S2MM} Slave {/processing_system7_0/S_AXI_HP2} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}}  [get_bd_intf_pins processing_system7_0/S_AXI_HP2]
# Use the instance name derived from ${project_name}: the HLS cell created
# above is only called myproject_axi_0 when the project keeps its default name.
apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/processing_system7_0/FCLK_CLK0 (50 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}}  [get_bd_pins ${project_name}_axi_0/ap_clk]
endgroup

validate_bd_design

# Double quotes (not braces) so that ${project_name} is substituted.
open_bd_design "./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/design_1.bd"

make_wrapper -files [get_files ./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top

add_files -norecurse ./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v

reset_run impl_1
reset_run synth_1
# TODO: make number of jobs a variable
launch_runs impl_1 -to_step write_bitstream -jobs 10
wait_on_run -timeout 360 impl_1

open_run impl_1
report_utilization -file util.rpt -hierarchical -hierarchical_percentages
diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py
new file mode 100644
index 0000000000..1d70e55406
--- /dev/null
+++ b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py
@@ -0,0 +1,83 @@
+from datetime import datetime
+
+import numpy as np
+from pynq import PL, Overlay, allocate
+
+
class NeuralNetworkOverlay(Overlay):
    """PYNQ overlay that streams data through the accelerator via ``axi_dma_0``."""

    def __init__(self, bitfile_name, dtbo=None, download=True, ignore_version=False, device=None):
        """Load the overlay; all keyword arguments are forwarded to ``Overlay``."""
        super().__init__(bitfile_name, dtbo=dtbo, download=download, ignore_version=ignore_version, device=device)

    def _print_dt(self, timea, timeb, N):
        """Print and return ``(elapsed_seconds, inferences_per_second)`` for N samples."""
        dts = (timeb - timea).total_seconds()
        rate = N / dts
        print(f"Classified {N} samples in {dts} seconds ({rate} inferences / s)")
        return dts, rate

    @staticmethod
    def reset_PL():
        """Call ``PL.reset()``. Usable both on the class and on an instance."""
        PL.reset()

    def predict(self, X, y_shape, dtype=np.float32, debug=None, profile=False, encode=None, decode=None):
        """
        Obtain the predictions of the NN implemented in the FPGA.
        Parameters:
        - X : the input vector. Should be numpy ndarray.
        - y_shape : the shape of the output vector. Needed to the accelerator to set the TLAST bit properly and
                    for sizing the output vector shape.
        - dtype : the data type of the elements of the input/output vectors.
                  Note: it should be set depending on the interface of the accelerator; if it uses 'float'
                  types for the 'data' AXI-Stream field, 'np.float32' dtype is the correct one to use.
                  Instead if it uses 'ap_fixed<A,B>', 'np.intA' is the correct one to use (note that A cannot be
                  any integer value, but it can assume {..., 8, 16, 32, ...} values. Check `numpy`
                  doc for more info).
                  In this case the encoding/decoding has to be computed by the PS. For example for
                  'ap_fixed<16,6>' type the following 2 functions are the correct one to use for encode/decode
                  'float' -> 'ap_fixed<16,6>':
                  ```
                    def encode(xi):
                        return np.int16(round(xi * 2**10)) # note 2**10 = 2**(A-B)
                    def decode(yi):
                        return yi * 2**-10
                    encode_v = np.vectorize(encode) # to apply them element-wise
                    decode_v = np.vectorize(decode)
                  ```
        - debug : set it to a true value to print a message after each DMA step.
        - profile : boolean. Set it to `True` to print the performance of the algorithm in term of `inference/s`.
        - encode/decode: function pointers. See `dtype` section for more information.
        - return: an output array based on `np.ndarray` with a shape equal to `y_shape` and a `dtype` equal to
                  the namesake parameter (plus the elapsed seconds and the rate when `profile` is set).
        """
        if encode is not None:
            X = encode(X)
        with allocate(shape=X.shape, dtype=dtype) as input_buffer, allocate(shape=y_shape, dtype=dtype) as output_buffer:
            input_buffer[:] = X

            if profile:
                timea = datetime.now()

            self.axi_dma_0.sendchannel.transfer(input_buffer)
            self.axi_dma_0.recvchannel.transfer(output_buffer)
            if debug:
                print("Transfer OK")
            self.axi_dma_0.sendchannel.wait()
            if debug:
                print("Send OK")
            self.axi_dma_0.recvchannel.wait()

            if profile:
                timeb = datetime.now()

            if debug:
                print("Receive OK")

            # Copy before the buffers are released at the end of the with-block.
            result = output_buffer.copy()

        if decode is not None:
            result = decode(result)

        if profile:
            dts, rate = self._print_dt(timea, timeb, len(X))
            return result, dts, rate

        return result
diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl
new file mode 100644
index 0000000000..103fec0178
--- /dev/null
+++ b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl
@@ -0,0 +1,65 @@
# Vivado block design for the zcu102 board: Zynq UltraScale+ PS + AXI DMA +
# the HLS IP connected through AXI-Stream, implemented up to bitstream generation.
#
# project.tcl (next to this script) is expected to define ${project_name}.
# TODO: try to remove startgroup and endgroup and see if it works
set tcldir [file dirname [info script]]
source [file join $tcldir project.tcl]

# Project directory named after this backend, matching the pynq-z2 script.
create_project project_1 ${project_name}_vitis_accelerator_ip_flow -part xczu9eg-ffvb1156-2-e -force

set_property board_part xilinx.com:zcu102:part0:3.3 [current_project]
set_property ip_repo_paths ${project_name}_prj [current_project]
update_ip_catalog

create_bd_design "design_1"
set_property ip_repo_paths ${project_name}_prj/solution1/impl/ip [current_project]
update_ip_catalog

startgroup
create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.5 zynq_ultra_ps_e_1
endgroup

apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" }  [get_bd_cells zynq_ultra_ps_e_1]

set_property -dict [list \
    CONFIG.PSU__SAXIGP2__DATA_WIDTH {64} \
    CONFIG.PSU__SAXIGP4__DATA_WIDTH {64} \
    CONFIG.PSU__USE__S_AXI_GP2 {1} \
    CONFIG.PSU__USE__S_AXI_GP4 {1} \
] [get_bd_cells zynq_ultra_ps_e_1]

startgroup
create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0
endgroup

set_property -dict [list CONFIG.c_m_axi_s2mm_data_width.VALUE_SRC USER] [get_bd_cells axi_dma_0]
set_property -dict [list \
    CONFIG.c_include_sg {0} \
    CONFIG.c_m_axi_mm2s_data_width {64} \
    CONFIG.c_m_axi_s2mm_data_width {64} \
    CONFIG.c_mm2s_burst_size {32} \
    CONFIG.c_sg_length_width {26} \
] [get_bd_cells axi_dma_0]

apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/zynq_ultra_ps_e_1/M_AXI_HPM0_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}}  [get_bd_intf_pins axi_dma_0/S_AXI_LITE]
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/zynq_ultra_ps_e_1/S_AXI_HP0_FPD} ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}}  [get_bd_intf_pins zynq_ultra_ps_e_1/S_AXI_HP0_FPD]
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_S2MM} Slave {/zynq_ultra_ps_e_1/S_AXI_HP2_FPD} ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}}  [get_bd_intf_pins zynq_ultra_ps_e_1/S_AXI_HP2_FPD]
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_1/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_1/pl_clk0 (99 MHz)} Master {/zynq_ultra_ps_e_1/M_AXI_HPM1_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {/ps8_0_axi_periph} master_apm {0}}  [get_bd_intf_pins zynq_ultra_ps_e_1/M_AXI_HPM1_FPD]

startgroup
create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0
endgroup

connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${project_name}_axi_0/in_r]
connect_bd_intf_net [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] [get_bd_intf_pins ${project_name}_axi_0/out_r]

apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ultra_ps_e_1/pl_clk0 (99 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}}  [get_bd_pins ${project_name}_axi_0/ap_clk]

make_wrapper -files [get_files ./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top

add_files -norecurse ./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v

reset_run impl_1
reset_run synth_1
launch_runs impl_1 -to_step write_bitstream -jobs 6
wait_on_run -timeout 360 impl_1

open_run impl_1
report_utilization -file util.rpt -hierarchical -hierarchical_percentages
diff --git a/hls4ml/templates/vivado/ap_types/ap_axi_sdata.h b/hls4ml/templates/vivado/ap_types/ap_axi_sdata.h
new file mode 100755
index 0000000000..e01c8a8cd1
--- /dev/null
+++ b/hls4ml/templates/vivado/ap_types/ap_axi_sdata.h
@@ -0,0 +1,441 @@
+// 67d7842dbbe25473c3c32b93c0da8047785f30d78e8a024de1b57352245f9689
+/*****************************************************************************
+ *
+ * Author: Xilinx, Inc.
+ *
+ * This text contains proprietary, confidential information of
+ * Xilinx, Inc. , is distributed by under license from Xilinx,
+ * Inc., and may be used, copied and/or disclosed only pursuant to
+ * the terms of a valid license agreement with Xilinx, Inc.
+ *
+ * XILINX IS PROVIDING THIS DESIGN, CODE, OR INFORMATION "AS IS"
+ * AS A COURTESY TO YOU, SOLELY FOR USE IN DEVELOPING PROGRAMS AND
+ * SOLUTIONS FOR XILINX DEVICES. BY PROVIDING THIS DESIGN, CODE,
+ * OR INFORMATION AS ONE POSSIBLE IMPLEMENTATION OF THIS FEATURE,
+ * APPLICATION OR STANDARD, XILINX IS MAKING NO REPRESENTATION
+ * THAT THIS IMPLEMENTATION IS FREE FROM ANY CLAIMS OF INFRINGEMENT,
+ * AND YOU ARE RESPONSIBLE FOR OBTAINING ANY RIGHTS YOU MAY REQUIRE
+ * FOR YOUR IMPLEMENTATION. XILINX EXPRESSLY DISCLAIMS ANY
+ * WARRANTY WHATSOEVER WITH RESPECT TO THE ADEQUACY OF THE
+ * IMPLEMENTATION, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OR
+ * REPRESENTATIONS THAT THIS IMPLEMENTATION IS FREE FROM CLAIMS OF
+ * INFRINGEMENT, IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE.
+ *
+ * Xilinx products are not intended for use in life support appliances,
+ * devices, or systems. Use in such applications is expressly prohibited.
+ *
+#- (c) Copyright 2011-2022 Xilinx, Inc. All rights reserved.
+#-
+#- This file contains confidential and proprietary information
+#- of Xilinx, Inc. and is protected under U.S. and
+#- international copyright and other intellectual property
+#- laws.
+#-
+#- DISCLAIMER
+#- This disclaimer is not a license and does not grant any
+#- rights to the materials distributed herewith. Except as
+#- otherwise provided in a valid license issued to you by
+#- Xilinx, and to the maximum extent permitted by applicable
+#- law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND
+#- WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES
+#- AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING
+#- BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON-
+#- INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and
+#- (2) Xilinx shall not be liable (whether in contract or tort,
+#- including negligence, or under any other theory of
+#- liability) for any loss or damage of any kind or nature
+#- related to, arising under or in connection with these
+#- materials, including for any direct, or any indirect,
+#- special, incidental, or consequential loss or damage
+#- (including loss of data, profits, goodwill, or any type of
+#- loss or damage suffered as a result of any action brought
+#- by a third party) even if such damage or loss was
+#- reasonably foreseeable or Xilinx had been advised of the
+#- possibility of the same.
+#-
+#- CRITICAL APPLICATIONS
+#- Xilinx products are not designed or intended to be fail-
+#- safe, or for use in any application requiring fail-safe
+#- performance, such as life-support or safety devices or
+#- systems, Class III medical devices, nuclear facilities,
+#- applications related to the deployment of airbags, or any
+#- other applications that could lead to death, personal
+#- injury, or severe property or environmental damage
+#- (individually and collectively, "Critical
+#- Applications"). Customer assumes the sole risk and
+#- liability of any use of Xilinx products in Critical
+#- Applications, subject only to applicable laws and
+#- regulations governing limitations on product liability.
+#-
+#- THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS
+#- PART OF THIS FILE AT ALL TIMES.
+#- ************************************************************************
+
+ *
+ *****************************************************************************/
+
+/*
+ * This file contains the definition of the data types for AXI streaming.
+ * ap_axi_s is a signed interpretation of the AXI stream
+ * ap_axi_u is an unsigned interpretation of the AXI stream
+ */
+
+#ifndef __AP__AXI_SDATA__
+#define __AP__AXI_SDATA__
+
+#include
+#include "ap_int.h"
+//#include "ap_fixed.h"
+template
+struct ap_fixed;
+template
+struct ap_ufixed;
+
+namespace hls {
+
+template constexpr std::size_t bitwidth = sizeof(T) * CHAR_BIT;
+
+template constexpr std::size_t bitwidth> = W;
+template constexpr std::size_t bitwidth> = W;
+template
+constexpr std::size_t bitwidth> = _AP_W;
+template
+constexpr std::size_t bitwidth> = _AP_W;
+
+template
+constexpr std::size_t bytewidth = (bitwidth + CHAR_BIT - 1) / CHAR_BIT;
+
+template struct axis {
+ static constexpr std::size_t NewWUser = (WUser == 0) ? 1 : WUser;
+ static constexpr std::size_t NewWId = (WId == 0) ? 1 : WId;
+ static constexpr std::size_t NewWDest = (WDest == 0) ? 1 : WDest;
+ T data;
+ ap_uint> keep;
+ ap_uint> strb;
+ ap_uint user;
+ ap_uint<1> last;
+ ap_uint id;
+ ap_uint dest;
+
+ ap_uint *get_user_ptr() {
+#pragma HLS inline
+ return (WUser == 0) ? nullptr : &user;
+ }
+ ap_uint *get_id_ptr() {
+#pragma HLS inline
+ return (WId == 0) ? nullptr : &id;
+ }
+ ap_uint *get_dest_ptr() {
+#pragma HLS inline
+ return (WDest == 0) ? nullptr : &dest;
+ }
+};
+
+} // namespace hls
+
+template
+using ap_axis = hls::axis, WUser, WId, WDest>;
+
+template
+using ap_axiu = hls::axis, WUser, WId, WDest>;
+
+// Isolate out qdma_axis from hls::axis for special APIs.
+template
+struct qdma_axis;
+
+template struct qdma_axis {
+ // private:
+ static constexpr std::size_t kBytes = (WData + 7) / 8;
+
+ ap_uint data;
+ ap_uint keep;
+ ap_uint<1> strb;
+ ap_uint<1> user;
+ ap_uint<1> last;
+ ap_uint<1> id;
+ ap_uint<1> dest;
+
+ ap_uint<1> *get_strb_ptr() {
+#pragma HLS inline
+ return nullptr;
+ }
+ ap_uint<1> *get_user_ptr() {
+#pragma HLS inline
+ return nullptr;
+ }
+ ap_uint<1> *get_id_ptr() {
+#pragma HLS inline
+ return nullptr;
+ }
+ ap_uint<1> *get_dest_ptr() {
+#pragma HLS inline
+ return nullptr;
+ }
+
+ // public:
+ ap_uint get_data() const {
+#pragma HLS inline
+ return data;
+ }
+ ap_uint get_keep() const {
+#pragma HLS inline
+ return keep;
+ }
+ ap_uint<1> get_last() const {
+#pragma HLS inline
+ return last;
+ }
+
+ void set_data(const ap_uint &d) {
+#pragma HLS inline
+ data = d;
+ }
+ void set_keep(const ap_uint &k) {
+#pragma HLS inline
+ keep = k;
+ }
+ void set_last(const ap_uint<1> &l) {
+#pragma HLS inline
+ last = l;
+ }
+ void keep_all() {
+#pragma HLS inline
+ ap_uint k = 0;
+ keep = ~k;
+ }
+
+ qdma_axis() {
+#pragma HLS inline
+ ;
+ }
+ qdma_axis(ap_uint d) : data(d) {
+#pragma HLS inline
+ ;
+ }
+ qdma_axis(ap_uint d, ap_uint k) : data(d), keep(k) {
+#pragma HLS inline
+ ;
+ }
+ qdma_axis(ap_uint d, ap_uint k, ap_uint<1> l)
+ : data(d), keep(k), last(l) {
+#pragma HLS inline
+ ;
+ }
+ qdma_axis(const qdma_axis &d)
+ : data(d.data), keep(d.keep), last(d.last) {
+#pragma HLS inline
+ ;
+ }
+ qdma_axis &operator=(const qdma_axis &d) {
+#pragma HLS inline
+ data = d.data;
+ keep = d.keep;
+ last = d.last;
+ return *this;
+ }
+};
+
+#ifdef AESL_SYN
+#if ((__clang_major__ != 3) || (__clang_minor__ != 1))
+#include "hls_stream.h"
+namespace hls {
+
+template
+class stream> final {
+ typedef axis __STREAM_T__;
+
+public:
+ /// Constructors
+ INLINE stream() {}
+
+ INLINE stream(const char *name) { (void)name; }
+
+ /// Make copy constructor and assignment operator private
+private:
+ INLINE stream(const stream<__STREAM_T__> &chn) : V(chn.V) {}
+
+public:
+ /// Overload >> and << operators to implement read() and write()
+ INLINE void operator>>(__STREAM_T__ &rdata) { read(rdata); }
+
+ INLINE void operator<<(const __STREAM_T__ &wdata) { write(wdata); }
+
+ /// empty & full
+ bool empty() {
+#pragma HLS inline
+ bool tmp = __fpga_axis_valid(&V.data, &V.keep, &V.strb, V.get_user_ptr(),
+ &V.last, V.get_id_ptr(), V.get_dest_ptr());
+ return !tmp;
+ }
+
+ bool full() {
+#pragma HLS inline
+ bool tmp = __fpga_axis_ready(&V.data, &V.keep, &V.strb, V.get_user_ptr(),
+ &V.last, V.get_id_ptr(), V.get_dest_ptr());
+ return !tmp;
+ }
+
+ /// Blocking read
+ void read(__STREAM_T__ &dout) {
+#pragma HLS inline
+ __STREAM_T__ tmp;
+ __fpga_axis_pop(&V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last,
+ V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep,
+ &tmp.strb, tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(),
+ tmp.get_dest_ptr());
+ dout = tmp;
+ }
+
+ __STREAM_T__ read() {
+#pragma HLS inline
+ __STREAM_T__ tmp;
+ __fpga_axis_pop(&V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last,
+ V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep,
+ &tmp.strb, tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(),
+ tmp.get_dest_ptr());
+ return tmp;
+ }
+
+ /// Blocking write
+ void write(const __STREAM_T__ &din) {
+#pragma HLS inline
+ __STREAM_T__ tmp = din;
+ __fpga_axis_push(&V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last,
+ V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep,
+ &tmp.strb, tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(),
+ tmp.get_dest_ptr());
+ }
+
+ /// Non-Blocking read
+ bool read_nb(__STREAM_T__ &dout) {
+#pragma HLS inline
+ __STREAM_T__ tmp;
+ if (__fpga_axis_nb_pop(&V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last,
+ V.get_id_ptr(), V.get_dest_ptr(), &tmp.data,
+ &tmp.keep, &tmp.strb, tmp.get_user_ptr(),
+ &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr())) {
+ dout = tmp;
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ /// Non-Blocking write
+ bool write_nb(const __STREAM_T__ &in) {
+#pragma HLS inline
+ __STREAM_T__ tmp = in;
+ bool full_n = __fpga_axis_nb_push(
+ &V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last, V.get_id_ptr(),
+ V.get_dest_ptr(), &tmp.data, &tmp.keep, &tmp.strb, tmp.get_user_ptr(),
+ &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr());
+ return full_n;
+ }
+
+private:
+ __STREAM_T__ V NO_CTOR;
+};
+
+// specialization for qdma
+template
+class stream> {
+ typedef qdma_axis __STREAM_T__;
+
+public:
+ /// Constructors
+ INLINE stream() {}
+
+ INLINE stream(const char *name) { (void)name; }
+
+ /// Make copy constructor and assignment operator private
+private:
+ INLINE stream(const stream<__STREAM_T__> &chn) : V(chn.V) {}
+
+public:
+ /// Overload >> and << operators to implement read() and write()
+ INLINE void operator>>(__STREAM_T__ &rdata) { read(rdata); }
+
+ INLINE void operator<<(const __STREAM_T__ &wdata) { write(wdata); }
+
+ /// empty & full
+ bool empty() {
+#pragma HLS inline
+ bool tmp = __fpga_axis_valid(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(),
+ &V.last, V.get_id_ptr(), V.get_dest_ptr());
+ return !tmp;
+ }
+
+ bool full() {
+#pragma HLS inline
+ bool tmp = __fpga_axis_ready(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(),
+ &V.last, V.get_id_ptr(), V.get_dest_ptr());
+ return !tmp;
+ }
+
+ /// Blocking read
+ void read(__STREAM_T__ &dout) {
+#pragma HLS inline
+ __STREAM_T__ tmp;
+ __fpga_axis_pop(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(),
+ &V.last, V.get_id_ptr(), V.get_dest_ptr(), &tmp.data,
+ &tmp.keep, tmp.get_strb_ptr(), tmp.get_user_ptr(),
+ &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr());
+ dout = tmp;
+ }
+
+ __STREAM_T__ read() {
+#pragma HLS inline
+ __STREAM_T__ tmp;
+ __fpga_axis_pop(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(), &V.last,
+ V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep,
+ tmp.get_strb_ptr(), tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(),
+ tmp.get_dest_ptr());
+ return tmp;
+ }
+
+ /// Blocking write
+ void write(const __STREAM_T__ &din) {
+#pragma HLS inline
+ __STREAM_T__ tmp = din;
+ __fpga_axis_push(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(), &V.last,
+ V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep,
+ tmp.get_strb_ptr(), tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(),
+ tmp.get_dest_ptr());
+ }
+
+ /// Non-Blocking read
+ bool read_nb(__STREAM_T__ &dout) {
+#pragma HLS inline
+ __STREAM_T__ tmp;
+
+ if (__fpga_axis_nb_pop(&V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last,
+ V.get_id_ptr(), V.get_dest_ptr(), &tmp.data,
+ &tmp.keep, &tmp.strb, tmp.get_user_ptr(),
+ &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr())) {
+ dout = tmp;
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ /// Non-Blocking write
+ bool write_nb(const __STREAM_T__ &in) {
+#pragma HLS inline
+ __STREAM_T__ tmp = in;
+ bool full_n = __fpga_axis_nb_push(
+ &V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(), &V.last, V.get_id_ptr(),
+ V.get_dest_ptr(), &tmp.data, &tmp.keep, tmp.get_strb_ptr(), tmp.get_user_ptr(),
+ &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr());
+ return full_n;
+ }
+
+private:
+ __STREAM_T__ V NO_CTOR;
+};
+
+} // namespace hls
+#endif
+#endif
+#endif
diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl
index af37b0f4aa..50596091f2 100644
--- a/hls4ml/templates/vivado/build_prj.tcl
+++ b/hls4ml/templates/vivado/build_prj.tcl
@@ -206,7 +206,7 @@ if {$opt(cosim)} {
set time_end [clock clicks -milliseconds]
puts "INFO:"
- if {[string equal "$backend" "vivadoaccelerator"]} {
+ if {[string equal "$backend" "vivadoaccelerator"] || [string equal $backend "vitisacceleratoripflow"]} {
puts [read [open ${project_name}_prj/solution1/sim/report/${project_name}_axi_cosim.rpt r]]
} else {
puts [read [open ${project_name}_prj/solution1/sim/report/${project_name}_cosim.rpt r]]
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h
index b8c2a48d19..2a695d4e5a 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h
@@ -11,6 +11,11 @@
#include
#include
+// ap_axi_sdata.h cannot be included by Vivado HLS, so it is only pulled in when
+// VITIS_ACCELERATOR_IP_FLOW is defined (in build_lib.sh of the `Vitis Accelerator` template files)
+#ifdef VITIS_ACCELERATOR_IP_FLOW
+#include "ap_axi_sdata.h"
+#endif
namespace nnet {
#ifndef __SYNTHESIS__
@@ -161,6 +166,26 @@ template void convert_data(hls::stre
}
}
+#ifdef VITIS_ACCELERATOR_IP_FLOW
+// TODO: avoid hardcoding hls::axis and take the packet type as a template parameter instead
+template
+void convert_data(srcType *src, hls::stream> &dst) {
+ for (size_t i = 0; i < SIZE; i++) {
+ hls::axis ctype;
+ ctype.data = dstType(src[i]);
+ dst.write(ctype);
+ }
+}
+
+template
+void convert_data(hls::stream> &src, dstType *dst) {
+ for (size_t i = 0; i < SIZE; i++) {
+ hls::axis ctype = src.read();
+ dst[i] = dstType(ctype.data);
+ }
+}
+#endif
+
extern bool trace_enabled;
extern std::map *trace_outputs;
extern size_t trace_type_size;
@@ -247,8 +272,6 @@ template void save_layer_output(hls::stream &data, const
}
}
-#endif
-
template void copy_data(std::vector src, dst_T dst[SIZE]) {
typename std::vector::const_iterator in_begin = src.cbegin() + OFFSET;
typename std::vector::const_iterator in_end = in_begin + SIZE;
@@ -272,14 +295,27 @@ void copy_data(std::vector src, hls::stream &dst) {
}
template void copy_data_axi(std::vector src, dst_T dst[SIZE]) {
- for (auto i = 0; i < SIZE; i++)
+ for (auto i = 0; i < SIZE; i++) {
+ dst[i].data = src[i];
if (i == SIZE - 1) {
- dst[i].data = src[i];
dst[i].last = 1;
} else {
- dst[i].data = src[i];
dst[i].last = 0;
}
+ }
+}
+
+template void copy_data_axi(std::vector src, hls::stream &dst) {
+ for (auto i = 0; i < SIZE; i++) {
+ dst_T pack;
+ pack.data = src[i];
+ if (i == SIZE - 1) {
+ pack.last = 1;
+ } else {
+ pack.last = 0;
+ }
+ dst.write(pack);
+ }
}
template void print_result(res_T result[SIZE], std::ostream &out, bool keep = false) {
@@ -289,21 +325,55 @@ template void print_result(res_T result[SIZE], std::o
out << std::endl;
}
-template void print_result(hls::stream &result, std::ostream &out, bool keep = false) {
+template ::value, int>::type = 0>
+void print_result(hls::stream &result, std::ostream &out, bool keep = false) {
for (int i = 0; i < SIZE / res_T::size; i++) {
res_T res_pack = result.read();
for (int j = 0; j < res_T::size; j++) {
out << res_pack[j] << " ";
}
- if (keep)
+ if (keep) {
+ result.write(res_pack);
+ }
+ }
+ out << std::endl;
+}
+
+// compatible with Vitis Accelerator for res_T = hls::axis<...> and io_parallel
+template ::value, int>::type = 0>
+void print_result(hls::stream &result, std::ostream &out, bool keep = false) {
+ for (int i = 0; i < SIZE; i++) {
+ res_T res_pack = result.read();
+
+ out << res_pack.data << " ";
+
+ if (keep) {
result.write(res_pack);
+ }
+ }
+ out << std::endl;
+}
+
+// compatible with Vitis Accelerator for res_T = hls::axis and io_stream
+template
+void print_result(hls::stream &result, std::ostream &out, bool keep = false) {
+ for (int i = 0; i < SIZE / underlying_res_T::size; i++) {
+ res_T res_pack;
+ for (int j = 0; j < underlying_res_T::size; j++) {
+ res_pack = result.read();
+ out << res_pack.data << " ";
+ if (keep) {
+ result.write(res_pack);
+ }
+ }
}
out << std::endl;
}
template void fill_zero(data_T data[SIZE]) { std::fill_n(data, SIZE, 0.); }
-template void fill_zero(hls::stream &data) {
+template ::value, int>::type = 0>
+void fill_zero(hls::stream &data) {
for (int i = 0; i < SIZE / data_T::size; i++) {
data_T data_pack;
for (int j = 0; j < data_T::size; j++) {
@@ -313,6 +383,36 @@ template void fill_zero(hls::stream &data) {
}
}
+template ::value, int>::type = 0>
+void fill_zero(hls::stream &data) {
+ for (int i = 0; i < SIZE; i++) {
+ data_T data_pack;
+ data_pack.data = 0.;
+ if (i == SIZE - 1) {
+ data_pack.last = 1;
+ } else {
+ data_pack.last = 0;
+ }
+ data.write(data_pack);
+ }
+}
+
+// compatible with Vitis Accelerator for data_T = hls::axis
+template void fill_zero(hls::stream &data) {
+ for (int i = 0; i < SIZE / underlying_data_T::size; i++) {
+ data_T data_pack;
+ for (int j = 0; j < underlying_data_T::size; j++) {
+ data_pack.data = 0.;
+ if ((i == (SIZE / underlying_data_T::size - 1)) && (j == (underlying_data_T::size - 1))) {
+ data_pack.last = 1;
+ } else {
+ data_pack.last = 0;
+ }
+ data.write(data_pack);
+ }
+ }
+}
+
template int read_file_1D(const char *filename, dataType data[nrows]) {
FILE *fp;
fp = fopen(filename, "r");
@@ -370,6 +470,7 @@ template void hls_stream_debug(hls::stream &dat
res << datareg;
}
}
+#endif
constexpr int ceillog2(int x) { return (x <= 2) ? 1 : 1 + ceillog2((x + 1) / 2); }
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h
index ca3143d01e..11622efbf0 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h
@@ -109,7 +109,7 @@ void separable_conv_1d_cl(hls::stream &data, hls::stream &res,
#pragma HLS DATAFLOW
hls::stream depthwise_res;
- unsigned res_depth = CONFIG_T::depthwise_config::out_width;
+ const unsigned res_depth = CONFIG_T::depthwise_config::out_width;
#pragma HLS STREAM variable=depthwise_res depth=res_depth
depthwise_conv_1d_cl(data, depthwise_res, depthwise_weights,
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h
index 7f4dd866c9..f5cafd2ee7 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h
@@ -133,7 +133,7 @@ void separable_conv_2d_cl(hls::stream &data, hls::stream &res,
#pragma HLS DATAFLOW
hls::stream depthwise_res;
- unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width;
+ const unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width;
#pragma HLS STREAM variable=depthwise_res depth=res_depth
depthwise_conv_2d_cl(data, depthwise_res, depthwise_weights,
diff --git a/hls4ml/writer/__init__.py b/hls4ml/writer/__init__.py
index f16cccc9fa..31238b18c8 100644
--- a/hls4ml/writer/__init__.py
+++ b/hls4ml/writer/__init__.py
@@ -1,6 +1,7 @@
from hls4ml.writer.catapult_writer import CatapultWriter
from hls4ml.writer.quartus_writer import QuartusWriter
from hls4ml.writer.symbolic_writer import SymbolicExpressionWriter
+from hls4ml.writer.vitis_accelerator_ip_flow_writer import VitisAcceleratorIPFlowWriter
from hls4ml.writer.vitis_writer import VitisWriter
from hls4ml.writer.vivado_accelerator_writer import VivadoAcceleratorWriter
from hls4ml.writer.vivado_writer import VivadoWriter
@@ -9,6 +10,7 @@
register_writer('Vivado', VivadoWriter)
register_writer('VivadoAccelerator', VivadoAcceleratorWriter)
register_writer('Vitis', VitisWriter)
+register_writer('VitisAcceleratorIPFlow', VitisAcceleratorIPFlowWriter)
register_writer('Quartus', QuartusWriter)
register_writer('Catapult', CatapultWriter)
register_writer('SymbolicExpression', SymbolicExpressionWriter)
diff --git a/hls4ml/writer/vitis_accelerator_ip_flow_writer.py b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py
new file mode 100644
index 0000000000..78e1fa982d
--- /dev/null
+++ b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py
@@ -0,0 +1,387 @@
+import os
+from distutils.dir_util import copy_tree
+from shutil import copyfile
+
+# from hls4ml.writer.vivado_writer import VivadoWriter
+from hls4ml.writer.vitis_writer import VitisWriter
+
+
+class VitisAcceleratorIPFlowWriter(VitisWriter):
+ def __init__(self):
+ super().__init__()
+ self.vitis_accelerator_ip_flow_config = None  # must hold a config object before write_axi_wrapper()/write_wrapper_test() run: they call get_corrected_types() and get_interface() on it
+
+    def write_axi_wrapper(self, model):
+        '''Write a top level HLS C++ header and source that wrap the hls4ml project with AXI interfaces.
+
+        The ``myproject_axi.h`` and ``myproject_axi.cpp`` templates are copied line by line into
+        ``<output_dir>/firmware/<project_name>_axi.h`` and ``..._axi.cpp``; each line holding a
+        ``// hls-fpga-machine-learning insert ...`` marker is replaced by generated code.
+
+        The generated code depends on the accelerator interface (``axi_stream``, ``axi_lite`` or
+        ``axi_master``) and on the model's ``IOType`` (``io_parallel`` or ``io_stream``).
+
+        Args:
+            model : The ModelGraph to write the wrapper for
+        '''
+        inp_axi_t, out_axi_t, inp, out = self.vitis_accelerator_ip_flow_config.get_corrected_types()
+        interface = self.vitis_accelerator_ip_flow_config.get_interface()
+        io_type = model.config.get_config_value("IOType")
+        project_name = model.config.get_project_name()
+        firmware_dir = f'{model.config.get_output_dir()}/firmware'
+        filedir = os.path.dirname(os.path.abspath(__file__))
+        indent = ' '
+
+        #######################
+        # myproject_axi.h
+        #######################
+
+        template = os.path.join(filedir, '../templates/vitis_accelerator_ip_flow/myproject_axi.h')
+        # Context managers close both files even if one of the replacements below raises.
+        with open(template) as f, open(f'{firmware_dir}/{project_name}_axi.h', 'w') as fout:
+            for line in f:
+                # The first matching rule wins, so the order of these checks is significant.
+                if 'MYPROJECT' in line:
+                    newline = line.replace('MYPROJECT', project_name.upper())
+                elif '// hls-fpga-machine-learning insert include' in line:
+                    newline = f'#include "{project_name}.h"\n'
+                    newline += '#include "ap_axi_sdata.h"\n'
+                elif 'myproject' in line:
+                    newline = line.replace('myproject', project_name)
+                elif '// hls-fpga-machine-learning insert definitions' in line:
+                    newline = f'static const unsigned N_IN = {inp.size()};\n'
+                    newline += f'static const unsigned N_OUT = {out.size()};\n'
+                    if interface == 'axi_stream':
+                        # hls::axis is a class template, so the typedef has to spell out its arguments.
+                        # "float" might need to become a variable according to the configuration
+                        # set by the user and the DMA available data widths.
+                        newline += 'typedef hls::axis<float, 0, 0, 0> my_pkt;\n'
+                    else:  # TODO: handle this case
+                        newline += f'typedef {inp_axi_t} input_axi_t;\n'
+                        newline += f'typedef {out_axi_t} output_axi_t;\n'
+                else:
+                    newline = line
+                fout.write(newline)
+
+        #######################
+        # myproject_axi.cpp
+        #######################
+
+        template = os.path.join(filedir, '../templates/vitis_accelerator_ip_flow/myproject_axi.cpp')
+        with open(template) as f, open(f'{firmware_dir}/{project_name}_axi.cpp', 'w') as fout:
+            for line in f:
+                if 'myproject' in line:
+                    newline = line.replace('myproject', project_name)
+                elif '// hls-fpga-machine-learning insert include' in line:
+                    newline = f'#include "{project_name}_axi.h"\n'
+                elif '// hls-fpga-machine-learning insert local vars' in line:
+                    newline = ''
+                    if interface == 'axi_stream':
+                        newline += indent + 'bool is_last = false;\n'
+                    if io_type == 'io_parallel':  # TODO: handle io_parallel
+                        newline += indent + inp.type.name + ' in_local[N_IN];\n'
+                        newline += indent + out.type.name + ' out_local[N_OUT];\n'
+                        newline += indent + 'my_pkt tmp;\n'
+                    elif io_type == 'io_stream':
+                        newline += indent + 'hls::stream<' + inp.type.name + '> in_local("input_1");\n'
+                        newline += indent + 'hls::stream<' + out.type.name + '> out_local("output_1");\n\n'
+                        newline += indent + '#pragma HLS STREAM variable=in_local depth={}\n'.format(
+                            model.get_input_variables()[0].pragma[1]
+                        )
+                        newline += indent + '#pragma HLS STREAM variable=out_local depth={}\n'.format(
+                            model.get_output_variables()[0].pragma[1]
+                        )
+                elif '// hls-fpga-machine-learning insert call' in line:
+                    newline = indent + f'{project_name}(in_local, out_local);\n'
+                elif '// hls-fpga-machine-learning insert interface' in line:
+                    # Start empty so an unrecognised interface emits nothing instead of re-emitting
+                    # whatever the previous template line produced.
+                    newline = ''
+                    if interface == 'axi_lite':  # TODO: handle axi_lite
+                        newline += indent + '#pragma HLS INTERFACE ap_ctrl_none port=return\n'
+                        newline += indent + '#pragma HLS INTERFACE s_axilite port=in\n'
+                        newline += indent + '#pragma HLS INTERFACE s_axilite port=out\n'
+                    elif interface == 'axi_master':  # TODO: handle axi_master
+                        newline += indent + '#pragma HLS INTERFACE s_axilite port=return bundle=CTRL_BUS\n'
+                        newline += indent + '#pragma HLS INTERFACE m_axi depth={} port=in offset=slave bundle=IN_BUS\n'.format(
+                            model.get_input_variables()[0].pragma[1]
+                        )
+                        newline += indent + '#pragma HLS INTERFACE m_axi depth={} port=out offset=slave bundle=OUT_BUS\n'.format(
+                            model.get_output_variables()[0].pragma[1]
+                        )
+                    elif interface == 'axi_stream':
+                        newline += indent + '#pragma HLS INTERFACE axis port=in\n'
+                        newline += indent + '#pragma HLS INTERFACE axis port=out\n'
+                        newline += indent + '#pragma HLS INTERFACE ap_ctrl_none port=return\n'
+                        if io_type == 'io_stream':
+                            newline += indent + '#pragma HLS DATAFLOW\n'
+                elif '// hls-fpga-machine-learning insert enqueue' in line:
+                    # Start empty so an unrecognised IOType never re-emits the previous output line.
+                    newline = ''
+                    if io_type == 'io_parallel':  # TODO: handle io_parallel
+                        newline += indent + 'for(unsigned i = 0; i < N_IN; i++){\n'
+                        if interface == 'axi_stream':
+                            newline += indent + indent + '#pragma HLS PIPELINE\n'
+                            newline += indent + indent + 'tmp = in.read(); // Read input with cast\n'
+                            newline += indent + indent + 'in_local[i] = tmp.data;\n'
+                            newline += indent + indent + 'is_last = tmp.last;\n'
+                        else:
+                            newline += indent + indent + '#pragma HLS UNROLL\n'
+                            newline += indent + indent + 'in_local[i] = in[i].data; // Read input with cast\n'
+                        newline += indent + '}\n'
+                        newline += indent + 'tmp.last = 0;\n'
+                    elif io_type == 'io_stream':
+                        # TODO: check whether the outer loop needs "#pragma HLS PIPELINE", ctype needs
+                        # "#pragma HLS aggregate" and the inner loop needs "#pragma HLS UNROLL".
+                        newline += indent + 'my_pkt tmp;\n'
+                        newline += indent + 'for(unsigned i = 0; i < N_IN / {input_t}::size; ++i) {{\n'
+                        newline += indent + indent + '{input_t} ctype;\n'
+                        newline += indent + indent + 'for(unsigned j = 0; j < {input_t}::size; j++) {{\n'
+                        if interface == 'axi_stream':
+                            newline += indent + indent + indent + 'in.read(tmp);\n'
+                            newline += indent + indent + indent + 'ctype[j] = tmp.data;\n'
+                            newline += indent + indent + indent + 'is_last = tmp.last;\n'
+                        else:  # TODO: handle this case
+                            newline += (
+                                indent
+                                + indent
+                                + indent
+                                + 'ctype[j] = typename {input_t}::value_type(in[i * {input_t}::size + j].data);\n'
+                            )
+                        newline += indent + indent + '}}\n'
+                        newline += indent + indent + 'in_local.write(ctype);\n'
+                        newline += indent + '}}\n'
+                        newline += indent + 'tmp.last = 0;\n'
+                        newline = newline.format(input_t=inp.type.name)
+                elif '// hls-fpga-machine-learning insert dequeue' in line:
+                    # Start empty so an unrecognised IOType never re-emits the previous output line.
+                    newline = ''
+                    if io_type == 'io_parallel':  # TODO: handle this case
+                        newline += indent + 'for(unsigned i = 0; i < N_OUT; i++){\n'
+                        if interface == 'axi_stream':
+                            newline += indent + indent + '#pragma HLS PIPELINE\n'
+                            newline += indent + indent + 'tmp.data = out_local[i];\n'
+                            newline += indent + indent + 'tmp.last = (is_last && (i == N_OUT - 1))? true : false;\n'
+                            newline += indent + indent + 'out.write(tmp);\n'
+                        else:
+                            newline += indent + indent + '#pragma HLS UNROLL\n'
+                            newline += indent + indent + 'out[i] = out_local[i]; // Write output with cast\n'
+                        newline += indent + '}\n'
+                    elif io_type == 'io_stream':
+                        newline += indent + 'for(unsigned i = 0; i < N_OUT / {result_t}::size; ++i) {{\n'
+                        newline += indent + indent + '{result_t} ctype = out_local.read();\n'
+                        newline += indent + indent + 'for(unsigned j = 0; j < {result_t}::size; j++) {{\n'
+                        if interface == 'axi_stream':
+                            # Results are cast through the *output* AXI type (the input type was used
+                            # here before). The type is substituted by format() below, together with
+                            # result_t, so the line is only formatted once.
+                            newline += indent + indent + indent + 'tmp.data = ({out_axi_t}) (ctype[j]);\n'
+                            newline += indent + indent + indent + 'if(is_last) {{tmp.last = (((i+1)*(j+1))==N_OUT);}}\n'
+                            newline += indent + indent + indent + 'out.write(tmp);\n'
+                        else:
+                            newline += indent + indent + indent + 'out[i * {result_t}::size + j] = output_axi_t(ctype[j]);\n'
+                        newline += indent + indent + '}}\n'
+                        newline += indent + '}}\n'
+                        newline = newline.format(result_t=out.type.name, out_axi_t=out_axi_t)
+                else:
+                    newline = line
+                fout.write(newline)
+
+    def modify_build_script(self, model):
+        '''
+        Modify the build_prj.tcl and build_lib.sh scripts to add the extra wrapper files and set the top function
+
+        Args:
+            model : The ModelGraph whose output directory holds the scripts to modify
+        '''
+        filedir = os.path.dirname(os.path.abspath(__file__))
+        output_dir = model.config.get_output_dir()
+        project_name = model.config.get_project_name()
+
+        ###################
+        # build_prj.tcl
+        ###################
+
+        oldfile = f'{output_dir}/build_prj.tcl'
+        newfile = f'{output_dir}/build_prj_axi.tcl'
+        # Context managers close both files even if a replacement raises.
+        with open(oldfile) as f, open(newfile, 'w') as fout:
+            for line in f:
+                if 'set_top' in line:
+                    # Drop only the line terminator (if there is one) and append _axi for the new top
+                    newline = line.rstrip('\n') + '_axi\n'
+                    newline += f'add_files firmware/{project_name}_axi.cpp -cflags "-std=c++0x"\n'
+                elif f'{project_name}_cosim' in line:
+                    newline = line.replace(f'{project_name}_cosim', f'{project_name}_axi_cosim')
+                elif '${project_name}.tcl' in line:
+                    newline = line.replace('${project_name}.tcl', '${project_name}_axi.tcl')
+                else:
+                    newline = line
+                fout.write(newline)
+        # os.replace overwrites the existing build_prj.tcl on every platform; os.rename fails on Windows
+        os.replace(newfile, oldfile)
+
+        ###################
+        # build_lib.sh
+        ###################
+
+        stamp = model.config.get_config_value('Stamp')
+        template = os.path.join(filedir, '../templates/vitis_accelerator_ip_flow/build_lib.sh')
+        with open(template) as f, open(f'{output_dir}/build_lib.sh', 'w') as fout:
+            for line in f:
+                fout.write(line.replace('myproject', project_name).replace('mystamp', stamp))
+
+    def write_wrapper_test(self, model):
+        """Patch the generated C++ testbench and bridge so they drive the ``<project>_axi`` wrapper.
+
+        ``<project>_test.cpp`` and ``<project>_bridge.cpp`` in the output directory are each
+        rewritten line by line into a temporary ``*_wrapper.cpp`` file, which then replaces the
+        original file.
+
+        Args:
+            model: Model providing the output directory, project name, ``IOType`` setting and
+                the input/output variables; only the first input and the first output are used.
+        """
+        output_dir = model.config.get_output_dir()
+        project_name = model.config.get_project_name()
+
+        # Only the two AXI type names are consumed here (by the bridge rewrite further down).
+        inp_axi_t, out_axi_t, _, _ = self.vitis_accelerator_ip_flow_config.get_corrected_types()
+
+        inp = model.get_input_variables()[0]
+        out = model.get_output_variables()[0]
+        io_type = model.config.get_config_value("IOType")
+        is_axi_stream = self.vitis_accelerator_ip_flow_config.get_interface() == 'axi_stream'
+
+        ###################
+        # write myproject_test_wrapper.cpp
+        ###################
+        oldfile = f'{output_dir}/{project_name}_test.cpp'
+        newfile = f'{output_dir}/{project_name}_test_wrapper.cpp'
+
+        # Context managers guarantee both handles are closed even if a substitution raises.
+        with open(oldfile) as f, open(newfile, 'w') as fout:
+            for line in f:
+                if f'{project_name}.h' in line:
+                    newline = line.replace(f'{project_name}.h', f'{project_name}_axi.h')
+                elif inp.definition_cpp() in line:
+                    # TODO instead of replacing strings, how about we use proper variables and their definition?
+                    newline = line.replace(inp.definition_cpp(), 'hls::stream< my_pkt > inputs')
+                elif out.definition_cpp() in line:
+                    newline = line.replace(out.definition_cpp(), 'hls::stream< my_pkt > outputs')
+                elif 'unsigned short' in line:
+                    # Lines mentioning 'unsigned short' are dropped from the wrapper testbench.
+                    newline = ''
+                elif f'{project_name}(' in line:
+                    # Keep whatever precedes the call (its indentation) and call the AXI top instead.
+                    indent_amount = line.split(project_name)[0]
+                    newline = indent_amount + f'{project_name}_axi(inputs,outputs);\n'
+                elif inp.size_cpp() in line or inp.name in line or inp.type.name in line:
+                    newline = line.replace(inp.size_cpp(), 'N_IN').replace(inp.name, 'inputs').replace(inp.type.name, 'my_pkt')
+                elif out.size_cpp() in line or out.name in line or out.type.name in line:
+                    newline = line.replace(out.size_cpp(), 'N_OUT').replace(out.name, 'outputs').replace(out.type.name, 'my_pkt')
+                else:
+                    newline = line
+
+                # The adjustments below are applied on top of whichever branch ran above.
+                if is_axi_stream:
+                    if 'copy_data' in line:
+                        newline = newline.replace('copy_data', 'copy_data_axi').replace("0,", "")
+
+                if io_type == 'io_stream':
+                    if 'nnet::fill_zero' in line:
+                        newline = newline.replace("nnet::fill_zero<", f"nnet::fill_zero<{inp.type.name}, ")
+                    if 'print_result' in line:
+                        newline = newline.replace("print_result<", f"print_result<{out.type.name}, ")
+                fout.write(newline)
+
+        # os.replace overwrites an existing destination on every platform.
+        os.replace(newfile, oldfile)
+
+        ###################
+        # write myproject_bridge_wrapper.cpp
+        ###################
+        oldfile = f'{output_dir}/{project_name}_bridge.cpp'
+        newfile = f'{output_dir}/{project_name}_bridge_wrapper.cpp'
+
+        with open(oldfile) as f, open(newfile, 'w') as fout:
+            for line in f:
+                if f'{project_name}.h' in line:
+                    newline = line.replace(f'{project_name}.h', f'{project_name}_axi.h')
+                elif inp.definition_cpp(name_suffix='_ap') in line:
+                    newline = line.replace(inp.definition_cpp(name_suffix='_ap'), f'hls::stream< my_pkt > {inp.name}_ap')
+                elif out.definition_cpp(name_suffix='_ap') in line:
+                    newline = line.replace(out.definition_cpp(name_suffix='_ap'), f'hls::stream< my_pkt > {out.name}_ap')
+                elif f'{project_name}(' in line:
+                    indent_amount = line.split(project_name)[0]
+                    newline = indent_amount + f'{project_name}_axi({inp.name}_ap,{out.name}_ap);\n'
+                elif inp.size_cpp() in line or inp.name in line or inp.type.name in line:
+                    newline = line.replace(inp.size_cpp(), 'N_IN').replace(inp.type.name, inp_axi_t)
+                elif out.size_cpp() in line or out.name in line or out.type.name in line:
+                    newline = line.replace(out.size_cpp(), 'N_OUT').replace(out.type.name, out_axi_t)
+                else:
+                    newline = line
+                fout.write(newline)
+
+        os.replace(newfile, oldfile)
+
+    def write_board_script(self, model):
+        '''
+        Write the tcl scripts and kernel sources to create a Vivado IPI project for the VitisAcceleratorIPFlow
+
+        Copies the board's tcl design script to ``design.tcl`` in the output directory, copies the
+        kernel RTL sources to ``src`` for boards whose name starts with ``alveo``, and writes
+        ``project.tcl`` with the project variables.
+
+        Args:
+            model: Model whose config supplies the output directory, project name and the
+                ``ClockPeriod``, ``ClockUncertainty`` and ``Version`` values.
+        '''
+        accel_config = self.vitis_accelerator_ip_flow_config
+        output_dir = model.config.get_output_dir()
+        filedir = os.path.dirname(os.path.abspath(__file__))
+
+        copyfile(
+            os.path.join(filedir, accel_config.get_tcl_file_path()),
+            f'{output_dir}/design.tcl',
+        )
+        # Generic alveo board
+        if accel_config.get_board().startswith('alveo'):
+            src_dir = os.path.join(filedir, accel_config.get_krnl_rtl_src_dir())
+            dst_dir = os.path.abspath(output_dir) + '/src'
+            copy_tree(src_dir, dst_dir)
+
+        ###################
+        # project.tcl
+        ###################
+        project_name = model.config.get_project_name()
+        part = accel_config.get_part()
+        clock_period = model.config.get_config_value('ClockPeriod')
+        clock_uncertainty = model.config.get_config_value('ClockUncertainty', '12.5%')
+        version = model.config.get_config_value('Version', '1.0.0')
+
+        # The context manager closes project.tcl even if one of the writes raises.
+        with open(f'{output_dir}/project.tcl', 'w') as f:
+            f.write('variable project_name\n')
+            f.write(f'set project_name "{project_name}"\n')
+            f.write('variable backend\n')
+            f.write('set backend "vitisacceleratoripflow"\n')
+            f.write('variable part\n')
+            f.write(f'set part "{part}"\n')
+            f.write('variable clock_period\n')
+            f.write(f'set clock_period {clock_period}\n')
+            f.write('variable clock_uncertainty\n')
+            f.write(f'set clock_uncertainty {clock_uncertainty}\n')
+            f.write('variable version\n')
+            f.write(f'set version "{version}"\n')
+            if accel_config.get_interface() == 'axi_stream':
+                in_bit, out_bit = accel_config.get_io_bitwidth()
+                # NOTE(review): in_bit is emitted as bit_width_hls_output and out_bit as
+                # bit_width_hls_input; kept exactly as before - confirm the cross-naming is intended.
+                f.write(f'set bit_width_hls_output {in_bit}\n')
+                f.write(f'set bit_width_hls_input {out_bit}\n')
+
+    def write_driver(self, model):
+        """Copy the driver file selected by the accelerator config into the model's output directory.
+
+        Args:
+            model: Model whose config supplies the output directory.
+        """
+        accel_config = self.vitis_accelerator_ip_flow_config
+        writer_dir = os.path.dirname(os.path.abspath(__file__))
+
+        # The driver path from the config is joined onto this writer's directory.
+        driver_src = os.path.join(writer_dir, accel_config.get_driver_path())
+        driver_dst_template = '{}/' + accel_config.get_driver_file()
+        copyfile(driver_src, driver_dst_template.format(model.config.get_output_dir()))
+
+    def write_new_tar(self, model):
+        """Invoke the parent writer's ``write_tar`` for ``model``.
+
+        No existing archive is removed here before the parent method is called.
+        """
+        parent_writer = super()
+        parent_writer.write_tar(model)
+
+    def write_hls(self, model):
+        """
+        Write the HLS project for the VitisAcceleratorIPFlow backend.
+
+        Builds the accelerator config for ``model``, runs the parent writer's ``write_hls``, then
+        applies the VitisAcceleratorIPFlow/AXI specific steps in a fixed order.
+        """
+        # TODO temporarily move config import here to avoid cyclic dependency, until config is moved to its own package
+        from hls4ml.backends import VitisAcceleratorIPFlowConfig
+
+        input_vars = model.get_input_variables()
+        output_vars = model.get_output_variables()
+        self.vitis_accelerator_ip_flow_config = VitisAcceleratorIPFlowConfig(model.config, input_vars, output_vars)
+
+        super().write_hls(model)
+
+        # Accelerator-specific post-processing; the order of these steps is preserved.
+        post_steps = (
+            self.write_board_script,
+            self.write_driver,
+            self.write_wrapper_test,
+            self.write_axi_wrapper,
+            self.modify_build_script,
+            self.write_new_tar,
+        )
+        for step in post_steps:
+            step(model)