diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py
index 8b3117af7a..7ba2ad4fbb 100644
--- a/hls4ml/backends/__init__.py
+++ b/hls4ml/backends/__init__.py
@@ -5,14 +5,19 @@
 from hls4ml.backends.vivado.vivado_backend import VivadoBackend
 from hls4ml.backends.vivado_accelerator.vivado_accelerator_backend import VivadoAcceleratorBackend
 from hls4ml.backends.vivado_accelerator.vivado_accelerator_config import VivadoAcceleratorConfig  # noqa: F401
-
 from hls4ml.backends.catapult.catapult_backend import CatapultBackend  # isort: skip
-
 from hls4ml.backends.vitis.vitis_backend import VitisBackend  # isort: skip
+from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_backend import (
+    VitisAcceleratorIPFlowBackend,
+)
+from hls4ml.backends.vitis_accelerator_ip_flow.vitis_accelerator_ip_flow_config import (
+    VitisAcceleratorIPFlowConfig,
+)
 
 register_backend('Vivado', VivadoBackend)
 register_backend('VivadoAccelerator', VivadoAcceleratorBackend)
 register_backend('Vitis', VitisBackend)
+register_backend('VitisAcceleratorIPFlow', VitisAcceleratorIPFlowBackend)
 register_backend('Quartus', QuartusBackend)
 register_backend('Catapult', CatapultBackend)
 register_backend('SymbolicExpression', SymbolicExpressionBackend)
diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/__init__.py b/hls4ml/backends/vitis_accelerator_ip_flow/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/passes/__init__.py b/hls4ml/backends/vitis_accelerator_ip_flow/passes/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/supported_boards.json b/hls4ml/backends/vitis_accelerator_ip_flow/supported_boards.json
new file mode 100644
index 0000000000..1279ec22d0
--- /dev/null
+++ b/hls4ml/backends/vitis_accelerator_ip_flow/supported_boards.json
@@ -0,0 +1,42 @@
+{
+  "pynq-z2": {
+    "part": "xc7z020clg400-1",
+    "tcl_scripts": {"axi_lite": "axi_lite_design.tcl", "axi_stream":  "axi_stream_design.tcl"},
+    "python_drivers": {"axi_stream":  "axi_stream_driver.py"},
+    "c_drivers": {}
+  },
+  "zcu102": {
+    "part": "xczu9eg-ffvb1156-2-e",
+    "tcl_scripts": { "axi_stream": "axi_stream_design.tcl"},
+    "python_drivers": {"axi_stream":  "axi_stream_driver.py"},
+    "c_drivers": {}
+  },
+  "alveo-u50": {
+    "part": "xcu50-fsvh2104-2-e",
+    "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"},
+    "python_drivers": {"axi_stream": "axi_stream_driver.py"},
+    "krnl_rtl_srcs": {"axi_stream":  "krnl_rtl_src"},
+    "c_drivers": {}
+  },
+  "alveo-u250": {
+    "part": "xcu250-figd2104-2L-e",
+    "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"},
+    "python_drivers": {"axi_stream": "axi_stream_driver.py"},
+    "krnl_rtl_srcs": {"axi_stream":  "krnl_rtl_src"},
+    "c_drivers": {}
+  },
+  "alveo-u200": {
+    "part": "xcu200-fsgd2104-2-e",
+    "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"},
+    "python_drivers": {"axi_stream": "axi_stream_driver.py"},
+    "krnl_rtl_srcs": {"axi_stream":  "krnl_rtl_src"},
+    "c_drivers": {}
+  },
+  "alveo-u280": {
+    "part": "xcu280-fsvh2892-2L-e",
+    "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"},
+    "python_drivers": {"axi_stream": "axi_stream_driver.py"},
+    "krnl_rtl_srcs": {"axi_stream":  "krnl_rtl_src"},
+    "c_drivers": {}
+  }
+}
diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py
new file mode 100644
index 0000000000..0372a75b75
--- /dev/null
+++ b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_backend.py
@@ -0,0 +1,122 @@
+import os
+
+from hls4ml.backends import VitisBackend, VivadoBackend
+from hls4ml.model.flow import register_flow
+from hls4ml.report import parse_vivado_report
+
+
+class VitisAcceleratorIPFlowBackend(VitisBackend):
+    '''Vitis backend variant registered under the name 'VitisAcceleratorIPFlow'.'''
+
+    def __init__(self):
+        # Resolve __init__ from the class that follows VivadoBackend in the MRO.
+        super(VivadoBackend, self).__init__(name='VitisAcceleratorIPFlow')
+        self._register_layer_attributes()
+        self._register_flows()
+
+    def build(
+        self,
+        model,
+        reset=False,
+        csim=True,
+        synth=True,
+        cosim=False,
+        validation=False,
+        export=False,
+        vsynth=False,
+        # fifo_opt=False,
+        bitfile=False,
+    ):
+        '''Run the VitisBackend build, optionally run Vivado to make a bitfile, and return the parsed report.
+
+        All arguments except ``bitfile`` are forwarded to the parent build. When ``bitfile`` is True,
+        ``vivado -mode batch -source design.tcl`` is executed inside the model's output directory.
+        '''
+        super().build(
+            model,
+            reset=reset,
+            csim=csim,
+            synth=synth,
+            cosim=cosim,
+            validation=validation,
+            export=export,
+            vsynth=vsynth,
+            # fifo_opt=fifo_opt,
+        )
+        if bitfile:
+            curr_dir = os.getcwd()
+            os.chdir(model.config.get_output_dir())
+            try:
+                # os.system reports failure through its return value; it does not raise on a failed command
+                if os.system('vivado -mode batch -source design.tcl') != 0:
+                    print("Something went wrong, check the Vivado logs")
+            finally:
+                # always restore the caller's working directory
+                os.chdir(curr_dir)
+
+        return parse_vivado_report(model.config.get_output_dir())
+
+    def create_initial_config(
+        self,
+        board='pynq-z2',
+        part=None,
+        clock_period=5,
+        clock_uncertainty='12.5%',
+        io_type='io_parallel',
+        interface='axi_stream',
+        driver='python',
+        input_type='float',
+        output_type='float',
+        platform='xilinx_u250_xdma_201830_2',
+    ):
+        '''
+        Create initial accelerator config with default parameters
+
+        Args:
+            board: one of the keys defined in supported_boards.json; None selects 'pynq-z2'
+            part: FPGA part, forwarded to the parent backend's create_initial_config
+            clock_period: clock period passed to hls project
+            clock_uncertainty: clock uncertainty, forwarded to the parent backend's create_initial_config
+            io_type: io_parallel or io_stream
+            interface: `axi_stream`: generate hardware designs and drivers which exploit axi stream channels.
+                       `axi_master`: generate hardware designs and drivers which exploit axi master channels.
+                       `axi_lite` : generate hardware designs and drivers which exploit axi lite channels. (Don't use it
+                       to exchange large amount of data)
+            driver: `python`: generates the python driver to use the accelerator in the PYNQ stack.
+                    `c`: generates the c driver to use the accelerator bare-metal.
+            input_type: the wrapper input precision. Can be `float` or an `ap_type`. Note: the bit width of an
+                             `ap_type` is rounded up to the next multiple of 8 by VitisAcceleratorIPFlowConfig.
+            output_type: the wrapper output precision. Can be `float` or an `ap_type`. Note: the bit width of an
+                              `ap_type` is rounded up to the next multiple of 8 by VitisAcceleratorIPFlowConfig.
+            platform: development target platform; accepted but currently not written into the config
+
+        Returns:
+            populated config
+        '''
+        board = board if board is not None else 'pynq-z2'
+        config = super().create_initial_config(part, clock_period, clock_uncertainty, io_type)
+        config['AcceleratorConfig'] = {
+            'Board': board,
+            'Interface': interface,  # axi_stream, axi_master, axi_lite
+            'Driver': driver,
+            'Precision': {
+                'Input': input_type,  # float, double or ap_fixed<a,b>
+                'Output': output_type,  # float, double or ap_fixed<a,b>
+            },
+        }
+
+        return config
+
+    def get_default_flow(self):
+        return self._default_flow
+
+    def get_writer_flow(self):
+        return self._writer_flow
+
+    def _register_flows(self):
+        '''Register the 'write' flow for this backend and use 'vivado:ip' as the default flow.'''
+        vivado_ip = 'vivado:ip'
+        writer_passes = ['make_stamp', 'vitisacceleratoripflow:write_hls']
+        self._writer_flow = register_flow('write', writer_passes, requires=[vivado_ip], backend=self.name)
+        self._default_flow = vivado_ip
+        # NOTE: no fifo_depth_optimization flow is registered for this backend yet.
diff --git a/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_config.py b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_config.py
new file mode 100644
index 0000000000..07961a9b6f
--- /dev/null
+++ b/hls4ml/backends/vitis_accelerator_ip_flow/vitis_accelerator_ip_flow_config.py
@@ -0,0 +1,169 @@
+import json
+import os
+
+import numpy as np
+
+from hls4ml.model.layers import FixedPrecisionType, IntegerPrecisionType
+
+
+class VitisAcceleratorIPFlowConfig:
+    '''Accelerator settings (board, interface, driver, I/O precision) read from a model's HLS config.'''
+
+    def __init__(self, config, model_inputs, model_outputs):
+        '''Validate the AcceleratorConfig section of ``config`` and cache the resulting settings.
+
+        Raises an Exception for a board missing from supported_boards.json or an incomplete Precision entry;
+        asserts that exactly one model input and one model output are given.
+        '''
+        self.config = config.config
+        self.board = self.config.get('AcceleratorConfig', {}).get('Board', 'pynq-z2')
+        # read the board table through a context manager so the file handle is always closed
+        with open(os.path.dirname(__file__) + '/supported_boards.json') as boards_file:
+            self.supported_boards = json.load(boards_file)
+        if self.board in self.supported_boards:
+            board_info = self.supported_boards[self.board]
+            self.part = board_info['part']
+        else:
+            raise Exception('The board does not appear in supported_boards.json file')
+
+        if self.config.get('Part') is not None:
+            if self.config.get('Part') != self.part:
+                print(
+                    'WARNING: You set a Part that does not correspond to the Board you specified. The correct '
+                    'Part is now set.'
+                )
+                self.config['Part'] = self.part
+        accel_config = self.config.get('AcceleratorConfig', None)
+        if accel_config is not None:
+            prec = accel_config.get('Precision')
+            if prec is None:
+                raise Exception('Precision must be provided in the AcceleratorConfig')
+            elif prec.get('Input') is None or prec.get('Output') is None:
+                raise Exception('Input and Output fields must be provided in the AcceleratorConfig->Precision')
+        else:
+            accel_config = {
+                'Precision': {'Input': 'float', 'Output': 'float'},
+                'Driver': 'python',
+                'Interface': 'axi_stream',
+            }
+            config.config['AcceleratorConfig'] = accel_config
+
+        self.interface = self.config['AcceleratorConfig'].get('Interface', 'axi_stream')  # axi_stream, axi_master, axi_lite
+        self.driver = self.config['AcceleratorConfig'].get('Driver', 'python')  # python or c
+        self.input_type = self.config['AcceleratorConfig']['Precision'].get(
+            'Input', 'float'
+        )  # float, double or ap_fixed<a,b>
+        self.output_type = self.config['AcceleratorConfig']['Precision'].get(
+            'Output', 'float'
+        )  # float, double or ap_fixed<a,b>
+        self.platform = self.config['AcceleratorConfig'].get(
+            'Platform', 'xilinx_u250_xdma_201830_2'
+        )  # Get platform folder name
+
+        assert (
+            len(model_inputs) == 1
+        ), "Only models with one input tensor are currently supported by VitisAcceleratorIPFlowBackend"
+        assert (
+            len(model_outputs) == 1
+        ), "Only models with one output tensor are currently supported by VitisAcceleratorIPFlowBackend"
+        self.inp = model_inputs[0]
+        self.out = model_outputs[0]
+        inp_axi_t = self.input_type
+        out_axi_t = self.output_type
+
+        if inp_axi_t not in ['float', 'double']:
+            self.input_type = self._next_factor8_type(config.backend.convert_precision_string(inp_axi_t))
+        if out_axi_t not in ['float', 'double']:
+            self.output_type = self._next_factor8_type(config.backend.convert_precision_string(out_axi_t))
+
+        if self.input_type == 'float':
+            self.input_bitwidth = 32
+        elif self.input_type == 'double':
+            self.input_bitwidth = 64
+        else:
+            self.input_bitwidth = config.backend.convert_precision_string(inp_axi_t).width
+
+        if out_axi_t == 'float':
+            self.output_bitwidth = 32
+        elif out_axi_t == 'double':
+            self.output_bitwidth = 64
+        else:
+            self.output_bitwidth = config.backend.convert_precision_string(out_axi_t).width
+
+    def _next_factor8_type(self, p):
+        '''Return a new type whose width is p's width rounded up to the next multiple of 8
+        Args:
+            p : IntegerPrecisionType or FixedPrecisionType
+        Returns:
+            An IntegerPrecisionType or FixedPrecisionType with the width rounded up to the next multiple of 8.
+            Other parameters (fractional bits, extra modes) stay the same; any other type of p yields None.
+        '''
+        W = p.width
+        newW = int(np.ceil(W / 8) * 8)
+        if isinstance(p, FixedPrecisionType):
+            return FixedPrecisionType(newW, p.integer, p.signed, p.rounding_mode, p.saturation_mode, p.saturation_bits)
+        elif isinstance(p, IntegerPrecisionType):
+            return IntegerPrecisionType(newW, p.signed)
+
+    def get_io_bitwidth(self):
+        return self.input_bitwidth, self.output_bitwidth
+
+    def get_corrected_types(self):
+        # (input type, output type, input variable, output variable) as stored by __init__
+        return self.input_type, self.output_type, self.inp, self.out
+
+    def get_interface(self):
+        return self.interface
+
+    def get_board_info(self, board=None):
+        if board is None:
+            board = self.board
+        if board in self.supported_boards:
+            return self.supported_boards[board]
+        raise Exception('The board is still not supported')
+
+    def get_part(self):
+        return self.part
+
+    def get_driver(self):
+        return self.driver
+
+    def get_board(self):
+        return self.board
+
+    def get_platform(self):
+        return self.platform
+
+    def get_clock_period(self):
+        # clock_period is never assigned in __init__; fall back to the config's 'ClockPeriod' entry (key assumed - confirm)
+        return getattr(self, 'clock_period', self.config.get('ClockPeriod'))
+
+    def get_driver_path(self):
+        # every alveo-* board shares the templates in the common 'alveo' folder
+        board_dir = 'alveo' if self.board.startswith('alveo') else self.board
+        return f'../templates/vitis_accelerator_ip_flow/{board_dir}/{self.driver}_drivers/{self.get_driver_file()}'
+
+    def get_driver_file(self):
+        driver_ext = '.py' if self.driver == 'python' else '.h'
+        return self.interface + '_driver' + driver_ext
+
+    def get_krnl_rtl_src_dir(self):
+        return '../templates/vitis_accelerator_ip_flow/alveo/krnl_rtl_src'
+
+    def get_input_type(self):
+        return self.input_type
+
+    def get_output_type(self):
+        return self.output_type
+
+    def get_tcl_file_path(self):
+        board_info = self.get_board_info(self.board)
+        tcl_scripts = board_info.get('tcl_scripts', None)
+        if tcl_scripts is None:
+            raise Exception('No tcl scripts definition available for the board in supported_boards.json')
+        tcl_script = tcl_scripts.get(self.interface, None)
+        if tcl_script is None:
+            raise Exception('No tcl script definition available for the desired interface in supported_boards.json')
+        # every alveo-* board shares the templates in the common 'alveo' folder
+        board_dir = 'alveo' if self.board.startswith('alveo') else self.board
+        return f'../templates/vitis_accelerator_ip_flow/{board_dir}/tcl_scripts/{tcl_script}'
diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h
index 20b6fecb49..aad5d9a430 100644
--- a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h
+++ b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv1d_stream.h
@@ -86,7 +86,7 @@ void separable_conv_1d_cl(hls::stream<data_T> &data, hls::stream<res_T> &res,
     #pragma HLS DATAFLOW
 
     hls::stream<dw_res_T> depthwise_res;
-    unsigned res_depth = CONFIG_T::depthwise_config::out_width;
+    const unsigned res_depth = CONFIG_T::depthwise_config::out_width;
     #pragma HLS STREAM variable=depthwise_res depth=res_depth
 
     depthwise_conv_1d_buffer_cl<data_T, dw_res_T, typename CONFIG_T::depthwise_config>(data, depthwise_res,
diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h
index a3747990e0..a119fb9e2a 100644
--- a/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h
+++ b/hls4ml/templates/vitis/nnet_utils/nnet_sepconv2d_stream.h
@@ -120,7 +120,7 @@ void separable_conv_2d_cl(hls::stream<data_T> &data, hls::stream<res_T> &res,
     #pragma HLS DATAFLOW
 
     hls::stream<dw_res_T> depthwise_res;
-    unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width;
+    const unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width;
     #pragma HLS STREAM variable=depthwise_res depth=res_depth
 
     depthwise_conv_2d_buffer_cl<data_T, dw_res_T, typename CONFIG_T::depthwise_config>(data, depthwise_res,
diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/build_lib.sh b/hls4ml/templates/vitis_accelerator_ip_flow/build_lib.sh
new file mode 100644
index 0000000000..262ce00d63
--- /dev/null
+++ b/hls4ml/templates/vitis_accelerator_ip_flow/build_lib.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+set -e  # abort on the first failing step so a broken compile is not hidden by the final cleanup
+
+CC=g++
+if [[ "$OSTYPE" == "linux-gnu" ]]; then
+    CFLAGS="-O3 -fPIC -std=c++11 -fno-gnu-unique"
+else  # darwin and every other platform: same flags without -fno-gnu-unique
+    CFLAGS="-O3 -fPIC -std=c++11"
+fi
+VITIS_ACCELERATOR_FLAGS="VITIS_ACCELERATOR_IP_FLOW"
+CFLAGS="$CFLAGS -D$VITIS_ACCELERATOR_FLAGS"
+INCFLAGS="-Ifirmware/ap_types/"
+
+PROJECT=myproject
+LIB_STAMP=mystamp
+
+${CC} ${CFLAGS} ${INCFLAGS} -c firmware/${PROJECT}.cpp -o ${PROJECT}.o
+${CC} ${CFLAGS} ${INCFLAGS} -c firmware/${PROJECT}_axi.cpp -o ${PROJECT}_axi.o
+${CC} ${CFLAGS} ${INCFLAGS} -c ${PROJECT}_bridge.cpp -o ${PROJECT}_bridge.o
+${CC} ${CFLAGS} ${INCFLAGS} -shared ${PROJECT}.o ${PROJECT}_axi.o ${PROJECT}_bridge.o -o firmware/${PROJECT}-${LIB_STAMP}.so
+rm -f *.o
diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.cpp b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.cpp
new file mode 100644
index 0000000000..cf6c0b9c25
--- /dev/null
+++ b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.cpp
@@ -0,0 +1,14 @@
+// hls-fpga-machine-learning insert include
+
+void myproject_axi(hls::stream<my_pkt> &in, hls::stream<my_pkt> &out) {
+
+    // hls-fpga-machine-learning insert interface
+
+    // hls-fpga-machine-learning insert local vars
+
+    // hls-fpga-machine-learning insert enqueue
+
+    // hls-fpga-machine-learning insert call
+
+    // hls-fpga-machine-learning insert dequeue
+}
diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.h b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.h
new file mode 100644
index 0000000000..d0d88bfecf
--- /dev/null
+++ b/hls4ml/templates/vitis_accelerator_ip_flow/myproject_axi.h
@@ -0,0 +1,10 @@
+#ifndef MYPROJECT_AXI_H_
+#define MYPROJECT_AXI_H_
+
+#include <iostream>
+// hls-fpga-machine-learning insert include
+
+// hls-fpga-machine-learning insert definitions
+
+void myproject_axi(hls::stream<my_pkt> &in, hls::stream<my_pkt> &out);
+#endif
diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/python_drivers/axi_stream_driver.py b/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/python_drivers/axi_stream_driver.py
new file mode 100644
index 0000000000..1aac79f2d3
--- /dev/null
+++ b/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/python_drivers/axi_stream_driver.py
@@ -0,0 +1,75 @@
+from datetime import datetime
+
+import numpy as np
+from pynq import Overlay, allocate
+
+
+class NeuralNetworkOverlay(Overlay):
+    def __init__(
+        self, bitfile_name, x_shape, y_shape, dtype=np.float32, dtbo=None, download=True, ignore_version=False, device=None
+    ):
+        # forward the caller's overlay options instead of always passing the defaults
+        super().__init__(bitfile_name, dtbo=dtbo, download=download, ignore_version=ignore_version, device=device)
+        self.sendchannel = self.hier_0.axi_dma_0.sendchannel
+        self.recvchannel = self.hier_0.axi_dma_0.recvchannel
+        self.input_buffer = allocate(shape=x_shape, dtype=dtype)
+        self.output_buffer = allocate(shape=y_shape, dtype=dtype)
+
+    def _print_dt(self, timea, timeb, N):
+        dts = (timeb - timea).total_seconds()
+        rate = N / dts
+        print(f"Classified {N} samples in {dts} seconds ({rate} inferences / s)")
+        return dts, rate
+
+    def predict(self, X, debug=False, profile=False, encode=None, decode=None):
+        """
+        Obtain the predictions of the NN implemented in the FPGA.
+        Parameters:
+        - X : the input vector. Should be numpy ndarray.
+        - dtype (set in the constructor) : the data type of the elements of the input/output vectors.
+                  Note: it should be set depending on the interface of the accelerator; if it uses 'float'
+                  types for the 'data' AXI-Stream field, 'np.float32' dtype is the correct one to use.
+                  Instead if it uses 'ap_fixed<A,B>', 'np.intA' is the correct one to use (note that A cannot
+                  be any integer value, but it can assume {..., 8, 16, 32, ...} values. Check `numpy`
+                  doc for more info).
+                  In this case the encoding/decoding has to be computed by the PS. For example for
+                  'ap_fixed<16,6>' type the following 2 functions are the correct one to use for encode/decode
+                  'float' -> 'ap_fixed<16,6>':
+                  ```
+                    def encode(xi):
+                        return np.int16(round(xi * 2**10)) # note 2**10 = 2**(A-B)
+                    def decode(yi):
+                        return yi * 2**-10
+                    encode_v = np.vectorize(encode) # to apply them element-wise
+                    decode_v = np.vectorize(decode)
+                  ```
+        - debug : boolean. Set it to `True` to print a message after each DMA transfer step.
+        - profile : boolean. Set it to `True` to print the performance of the algorithm in term of `inference/s`.
+        - encode/decode: function pointers. See `dtype` section for more information.
+        - return: an output array based on `np.ndarray` with a shape equal to `y_shape` and a `dtype` equal to
+                  the namesake parameter.
+        """
+        if profile:
+            timea = datetime.now()
+        if encode is not None:
+            X = encode(X)
+        self.input_buffer[:] = X
+        self.sendchannel.transfer(self.input_buffer)
+        self.recvchannel.transfer(self.output_buffer)
+        if debug:
+            print("Transfer OK")
+        self.sendchannel.wait()
+        if debug:
+            print("Send OK")
+        self.recvchannel.wait()
+        if debug:
+            print("Receive OK")
+        # copy out of the DMA buffer so decoding never replaces self.output_buffer, which later calls reuse
+        result = self.output_buffer.copy()
+        if decode is not None:
+            result = decode(result)
+        if profile:
+            timeb = datetime.now()
+            dts, rate = self._print_dt(timea, timeb, len(X))
+            return result, dts, rate
+        return result
diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/tcl_scripts/axi_stream_design.tcl
new file mode 100644
index 0000000000..e8db1e6782
--- /dev/null
+++ b/hls4ml/templates/vitis_accelerator_ip_flow/pynq-z2/tcl_scripts/axi_stream_design.tcl
@@ -0,0 +1,69 @@
+#@todo: try to remove startgroup and endgroup and see if it still works
+set tcldir [file dirname [info script]]
+source [file join $tcldir project.tcl]
+
+create_project project_1 ${project_name}_vitis_accelerator_ip_flow -part xc7z020clg400-1 -force
+
+# set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project]
+set_property  ip_repo_paths  ${project_name}_prj [current_project]
+update_ip_catalog
+
+create_bd_design "design_1"
+
+startgroup
+create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_0
+endgroup
+
+apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" }  [get_bd_cells processing_system7_0]
+
+startgroup
+set_property -dict [list \
+  CONFIG.PCW_USE_S_AXI_HP0 {1} \
+  CONFIG.PCW_USE_S_AXI_HP2 {1} \
+] [get_bd_cells processing_system7_0]
+endgroup
+
+startgroup
+create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0
+endgroup
+
+set_property -dict [list CONFIG.c_m_axi_s2mm_data_width.VALUE_SRC USER] [get_bd_cells axi_dma_0]
+set_property -dict [list \
+  CONFIG.c_include_sg {0} \
+  CONFIG.c_m_axi_mm2s_data_width {64} \
+  CONFIG.c_m_axi_s2mm_data_width {64} \
+  CONFIG.c_mm2s_burst_size {32} \
+  CONFIG.c_sg_length_width {26} \
+] [get_bd_cells axi_dma_0]
+
+startgroup
+create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0
+endgroup
+
+connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${project_name}_axi_0/in_r]
+connect_bd_intf_net [get_bd_intf_pins ${project_name}_axi_0/out_r] [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM]
+
+#todo: make clock a variable; the HLS cell is addressed through ${project_name}, matching the create_bd_cell call above
+startgroup
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}}  [get_bd_intf_pins axi_dma_0/S_AXI_LITE]
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}}  [get_bd_intf_pins processing_system7_0/S_AXI_HP0]
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_S2MM} Slave {/processing_system7_0/S_AXI_HP2} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}}  [get_bd_intf_pins processing_system7_0/S_AXI_HP2]
+apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/processing_system7_0/FCLK_CLK0 (50 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}}  [get_bd_pins ${project_name}_axi_0/ap_clk]
+endgroup
+
+validate_bd_design
+
+open_bd_design {./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/design_1.bd}
+
+make_wrapper -files [get_files ./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top
+
+add_files -norecurse ./${project_name}_vitis_accelerator_ip_flow/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v
+
+reset_run impl_1
+reset_run synth_1
+#todo: make number of jobs a variable
+launch_runs impl_1 -to_step write_bitstream -jobs 10
+wait_on_run -timeout 360 impl_1
+
+open_run impl_1
+report_utilization -file util.rpt -hierarchical -hierarchical_percentages
diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py
new file mode 100644
index 0000000000..1d70e55406
--- /dev/null
+++ b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/python_drivers/axi_stream_driver.py
@@ -0,0 +1,83 @@
+from datetime import datetime
+
+import numpy as np
+from pynq import PL, Overlay, allocate
+
+
+class NeuralNetworkOverlay(Overlay):
+    def __init__(self, bitfile_name, dtbo=None, download=True, ignore_version=False, device=None):
+        # forward the caller's overlay options instead of always passing the defaults
+        super().__init__(bitfile_name, dtbo=dtbo, download=download, ignore_version=ignore_version, device=device)
+
+    def _print_dt(self, timea, timeb, N):
+        dts = (timeb - timea).total_seconds()
+        rate = N / dts
+        print(f"Classified {N} samples in {dts} seconds ({rate} inferences / s)")
+        return dts, rate
+
+    @staticmethod
+    def reset_PL():
+        PL.reset()
+
+    def predict(self, X, y_shape, dtype=np.float32, debug=None, profile=False, encode=None, decode=None):
+        """
+        Obtain the predictions of the NN implemented in the FPGA.
+        Parameters:
+        - X : the input vector. Should be numpy ndarray.
+        - y_shape : the shape of the output vector. Needed to the accelerator to set the TLAST bit properly and
+                    for sizing the output vector shape.
+        - dtype : the data type of the elements of the input/output vectors.
+                  Note: it should be set depending on the interface of the accelerator; if it uses 'float'
+                  types for the 'data' AXI-Stream field, 'np.float32' dtype is the correct one to use.
+                  Instead if it uses 'ap_fixed<A,B>', 'np.intA' is the correct one to use (note that A cannot
+                  be any integer value, but it can assume {..., 8, 16, 32, ...} values. Check `numpy`
+                  doc for more info).
+                  In this case the encoding/decoding has to be computed by the PS. For example for
+                  'ap_fixed<16,6>' type the following 2 functions are the correct one to use for encode/decode
+                  'float' -> 'ap_fixed<16,6>':
+                  ```
+                    def encode(xi):
+                        return np.int16(round(xi * 2**10)) # note 2**10 = 2**(A-B)
+                    def decode(yi):
+                        return yi * 2**-10
+                    encode_v = np.vectorize(encode) # to apply them element-wise
+                    decode_v = np.vectorize(decode)
+                  ```
+        - debug : set it to a true value to print a message after each DMA transfer step.
+        - profile : boolean. Set it to `True` to print the performance of the algorithm in term of `inference/s`.
+        - encode/decode: function pointers. See `dtype` section for more information.
+        - return: an output array based on `np.ndarray` with a shape equal to `y_shape` and a `dtype` equal to
+                  the namesake parameter.
+        """
+        if encode is not None:
+            X = encode(X)
+        with allocate(shape=X.shape, dtype=dtype) as input_buffer, allocate(shape=y_shape, dtype=dtype) as output_buffer:
+            input_buffer[:] = X
+
+            if profile:
+                timea = datetime.now()
+
+            self.axi_dma_0.sendchannel.transfer(input_buffer)
+            self.axi_dma_0.recvchannel.transfer(output_buffer)
+            if debug:
+                print("Transfer OK")
+            self.axi_dma_0.sendchannel.wait()
+            if debug:
+                print("Send OK")
+            self.axi_dma_0.recvchannel.wait()
+
+            if profile:
+                timeb = datetime.now()
+
+            if debug:
+                print("Receive OK")
+            result = output_buffer.copy()
+
+        if decode is not None:
+            result = decode(result)
+
+        if profile:
+            dts, rate = self._print_dt(timea, timeb, len(X))
+            return result, dts, rate
+
+        return result
diff --git a/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl
new file mode 100644
index 0000000000..103fec0178
--- /dev/null
+++ b/hls4ml/templates/vitis_accelerator_ip_flow/zcu102/tcl_scripts/axi_stream_design.tcl
@@ -0,0 +1,65 @@
+#@todo: try to remove startgroup and endgroup and see if it still works
+set tcldir [file dirname [info script]]
+source [file join $tcldir project.tcl]
+# project.tcl is expected to define project_name: it is used below but never set in this script.
+create_project project_1 ${project_name}_vivado_accelerator -part xczu9eg-ffvb1156-2-e -force
+# Select the ZCU102 board part; the ip_repo_paths value set here is replaced again a few lines below.
+set_property board_part xilinx.com:zcu102:part0:3.3 [current_project]
+set_property  ip_repo_paths  ${project_name}_prj [current_project]
+update_ip_catalog
+# Create the block design and point the IP repository at the exported HLS IP directory.
+create_bd_design "design_1"
+set_property  ip_repo_paths ${project_name}_prj/solution1/impl/ip [current_project]
+update_ip_catalog
+# Processing system: instantiate the Zynq UltraScale+ PS cell, then apply the board preset.
+startgroup
+create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.5 zynq_ultra_ps_e_1
+endgroup
+
+apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" }  [get_bd_cells zynq_ultra_ps_e_1]
+# Enable the S_AXI_GP2 and S_AXI_GP4 ports of the PS with 64-bit data widths.
+set_property -dict [list \
+  CONFIG.PSU__SAXIGP2__DATA_WIDTH {64} \
+  CONFIG.PSU__SAXIGP4__DATA_WIDTH {64} \
+  CONFIG.PSU__USE__S_AXI_GP2 {1} \
+  CONFIG.PSU__USE__S_AXI_GP4 {1} \
+] [get_bd_cells zynq_ultra_ps_e_1]
+# AXI DMA: scatter-gather disabled, 64-bit MM2S/S2MM data widths.
+startgroup
+create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0
+endgroup
+
+set_property -dict [list CONFIG.c_m_axi_s2mm_data_width.VALUE_SRC USER] [get_bd_cells axi_dma_0]
+set_property -dict [list \
+  CONFIG.c_include_sg {0} \
+  CONFIG.c_m_axi_mm2s_data_width {64} \
+  CONFIG.c_m_axi_s2mm_data_width {64} \
+  CONFIG.c_mm2s_burst_size {32} \
+  CONFIG.c_sg_length_width {26} \
+] [get_bd_cells axi_dma_0]
+# Connection automation: PS M_AXI_HPM0/HPM1 -> DMA S_AXI_LITE, DMA MM2S -> PS S_AXI_HP0, DMA S2MM -> PS S_AXI_HP2.
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/zynq_ultra_ps_e_1/M_AXI_HPM0_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}}  [get_bd_intf_pins axi_dma_0/S_AXI_LITE]
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/zynq_ultra_ps_e_1/S_AXI_HP0_FPD} ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}}  [get_bd_intf_pins zynq_ultra_ps_e_1/S_AXI_HP0_FPD]
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_S2MM} Slave {/zynq_ultra_ps_e_1/S_AXI_HP2_FPD} ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}}  [get_bd_intf_pins zynq_ultra_ps_e_1/S_AXI_HP2_FPD]
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_1/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_1/pl_clk0 (99 MHz)} Master {/zynq_ultra_ps_e_1/M_AXI_HPM1_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {/ps8_0_axi_periph} master_apm {0}}  [get_bd_intf_pins zynq_ultra_ps_e_1/M_AXI_HPM1_FPD]
+# Instantiate the HLS-generated <project_name>_axi IP core.
+startgroup
+create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0
+endgroup
+# Connect the DMA stream master/slave to the in_r / out_r interfaces of the HLS IP.
+connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${project_name}_axi_0/in_r]
+connect_bd_intf_net [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] [get_bd_intf_pins ${project_name}_axi_0/out_r]
+# Drive the HLS IP clock (ap_clk) from pl_clk0 of the PS.
+apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ultra_ps_e_1/pl_clk0 (99 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}}  [get_bd_pins ${project_name}_axi_0/ap_clk]
+# Generate the top-level HDL wrapper of the block design and add it to the project.
+make_wrapper -files [get_files ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top
+
+add_files -norecurse ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v
+# Run implementation up to write_bitstream; wait_on_run gives up after the -timeout value (minutes).
+reset_run impl_1
+reset_run synth_1
+launch_runs impl_1 -to_step write_bitstream -jobs 6
+wait_on_run -timeout 360 impl_1
+# Open the implemented design and write the hierarchical utilization report.
+open_run impl_1
+report_utilization -file util.rpt -hierarchical -hierarchical_percentages
diff --git a/hls4ml/templates/vivado/ap_types/ap_axi_sdata.h b/hls4ml/templates/vivado/ap_types/ap_axi_sdata.h
new file mode 100755
index 0000000000..e01c8a8cd1
--- /dev/null
+++ b/hls4ml/templates/vivado/ap_types/ap_axi_sdata.h
@@ -0,0 +1,441 @@
+// 67d7842dbbe25473c3c32b93c0da8047785f30d78e8a024de1b57352245f9689
+/*****************************************************************************
+ *
+ *     Author: Xilinx, Inc.
+ *
+ *     This text contains proprietary, confidential information of
+ *     Xilinx, Inc. , is distributed by under license from Xilinx,
+ *     Inc., and may be used, copied and/or disclosed only pursuant to
+ *     the terms of a valid license agreement with Xilinx, Inc.
+ *
+ *     XILINX IS PROVIDING THIS DESIGN, CODE, OR INFORMATION "AS IS"
+ *     AS A COURTESY TO YOU, SOLELY FOR USE IN DEVELOPING PROGRAMS AND
+ *     SOLUTIONS FOR XILINX DEVICES.  BY PROVIDING THIS DESIGN, CODE,
+ *     OR INFORMATION AS ONE POSSIBLE IMPLEMENTATION OF THIS FEATURE,
+ *     APPLICATION OR STANDARD, XILINX IS MAKING NO REPRESENTATION
+ *     THAT THIS IMPLEMENTATION IS FREE FROM ANY CLAIMS OF INFRINGEMENT,
+ *     AND YOU ARE RESPONSIBLE FOR OBTAINING ANY RIGHTS YOU MAY REQUIRE
+ *     FOR YOUR IMPLEMENTATION.  XILINX EXPRESSLY DISCLAIMS ANY
+ *     WARRANTY WHATSOEVER WITH RESPECT TO THE ADEQUACY OF THE
+ *     IMPLEMENTATION, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OR
+ *     REPRESENTATIONS THAT THIS IMPLEMENTATION IS FREE FROM CLAIMS OF
+ *     INFRINGEMENT, IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE.
+ *
+ *     Xilinx products are not intended for use in life support appliances,
+ *     devices, or systems. Use in such applications is expressly prohibited.
+ *
+#-  (c) Copyright 2011-2022 Xilinx, Inc. All rights reserved.
+#-
+#-  This file contains confidential and proprietary information
+#-  of Xilinx, Inc. and is protected under U.S. and
+#-  international copyright and other intellectual property
+#-  laws.
+#-
+#-  DISCLAIMER
+#-  This disclaimer is not a license and does not grant any
+#-  rights to the materials distributed herewith. Except as
+#-  otherwise provided in a valid license issued to you by
+#-  Xilinx, and to the maximum extent permitted by applicable
+#-  law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND
+#-  WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES
+#-  AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING
+#-  BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON-
+#-  INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and
+#-  (2) Xilinx shall not be liable (whether in contract or tort,
+#-  including negligence, or under any other theory of
+#-  liability) for any loss or damage of any kind or nature
+#-  related to, arising under or in connection with these
+#-  materials, including for any direct, or any indirect,
+#-  special, incidental, or consequential loss or damage
+#-  (including loss of data, profits, goodwill, or any type of
+#-  loss or damage suffered as a result of any action brought
+#-  by a third party) even if such damage or loss was
+#-  reasonably foreseeable or Xilinx had been advised of the
+#-  possibility of the same.
+#-
+#-  CRITICAL APPLICATIONS
+#-  Xilinx products are not designed or intended to be fail-
+#-  safe, or for use in any application requiring fail-safe
+#-  performance, such as life-support or safety devices or
+#-  systems, Class III medical devices, nuclear facilities,
+#-  applications related to the deployment of airbags, or any
+#-  other applications that could lead to death, personal
+#-  injury, or severe property or environmental damage
+#-  (individually and collectively, "Critical
+#-  Applications"). Customer assumes the sole risk and
+#-  liability of any use of Xilinx products in Critical
+#-  Applications, subject only to applicable laws and
+#-  regulations governing limitations on product liability.
+#-
+#-  THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS
+#-  PART OF THIS FILE AT ALL TIMES. 
+#- ************************************************************************
+
+ *
+ *****************************************************************************/
+
+/*
+ * This file contains the definition of the data types for AXI streaming. 
+ * ap_axi_s is a signed interpretation of the AXI stream
+ * ap_axi_u is an unsigned interpretation of the AXI stream
+ */
+
+#ifndef __AP__AXI_SDATA__
+#define __AP__AXI_SDATA__
+
+#include <climits>
+#include "ap_int.h"
+//#include "ap_fixed.h"
+template <int _AP_W, int _AP_I, ap_q_mode _AP_Q, ap_o_mode _AP_O, int _AP_N>
+struct ap_fixed;
+template <int _AP_W, int _AP_I, ap_q_mode _AP_Q, ap_o_mode _AP_O, int _AP_N>
+struct ap_ufixed;
+
+namespace hls {
+
+template <typename T> constexpr std::size_t bitwidth = sizeof(T) * CHAR_BIT;
+
+template <std::size_t W> constexpr std::size_t bitwidth<ap_int<W>> = W;
+template <std::size_t W> constexpr std::size_t bitwidth<ap_uint<W>> = W;
+template <int _AP_W, int _AP_I, ap_q_mode _AP_Q, ap_o_mode _AP_O, int _AP_N>
+constexpr std::size_t bitwidth<ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>> = _AP_W;
+template <int _AP_W, int _AP_I, ap_q_mode _AP_Q, ap_o_mode _AP_O, int _AP_N>
+constexpr std::size_t bitwidth<ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>> = _AP_W;
+
+template <typename T>
+constexpr std::size_t bytewidth = (bitwidth<T> + CHAR_BIT - 1) / CHAR_BIT;
+
+template <typename T, std::size_t WUser, std::size_t WId, std::size_t WDest> struct axis {
+  static constexpr std::size_t NewWUser = (WUser == 0) ? 1 : WUser;
+  static constexpr std::size_t NewWId = (WId == 0) ? 1 : WId;
+  static constexpr std::size_t NewWDest = (WDest == 0) ? 1 : WDest;
+  T data;
+  ap_uint<bytewidth<T>> keep;
+  ap_uint<bytewidth<T>> strb;
+  ap_uint<NewWUser> user;
+  ap_uint<1> last;
+  ap_uint<NewWId> id;
+  ap_uint<NewWDest> dest;
+
+  ap_uint<NewWUser> *get_user_ptr() { 
+#pragma HLS inline
+    return (WUser == 0) ? nullptr : &user;
+  }
+  ap_uint<NewWId> *get_id_ptr() {
+#pragma HLS inline
+    return (WId == 0) ? nullptr : &id;
+  }
+  ap_uint<NewWDest> *get_dest_ptr() {
+#pragma HLS inline
+    return (WDest == 0) ? nullptr : &dest;
+  }
+};
+
+} // namespace hls
+
+template <std::size_t WData, std::size_t WUser, std::size_t WId, std::size_t WDest>
+using ap_axis = hls::axis<ap_int<WData>, WUser, WId, WDest>;
+
+template <std::size_t WData, std::size_t WUser, std::size_t WId, std::size_t WDest>
+using ap_axiu = hls::axis<ap_uint<WData>, WUser, WId, WDest>;
+
+// Isolate out qdma_axis from hls::axis for special APIs.
+template <std::size_t WData, std::size_t WUser, std::size_t WId, std::size_t WDest>
+struct qdma_axis;
+
+template <std::size_t WData> struct qdma_axis<WData, 0, 0, 0> {
+  //  private:
+  static constexpr std::size_t kBytes = (WData + 7) / 8;
+
+  ap_uint<WData> data;
+  ap_uint<kBytes> keep;
+  ap_uint<1> strb;
+  ap_uint<1> user;
+  ap_uint<1> last;
+  ap_uint<1> id;
+  ap_uint<1> dest;
+
+  ap_uint<1> *get_strb_ptr() {
+#pragma HLS inline
+    return nullptr;
+  }
+  ap_uint<1> *get_user_ptr() {
+#pragma HLS inline
+    return nullptr;
+  }
+  ap_uint<1> *get_id_ptr() {
+#pragma HLS inline
+    return nullptr;
+  }
+  ap_uint<1> *get_dest_ptr() {
+#pragma HLS inline
+    return nullptr;
+  }
+
+  //  public:
+  ap_uint<WData> get_data() const {
+#pragma HLS inline
+    return data;
+  }
+  ap_uint<kBytes> get_keep() const {
+#pragma HLS inline
+    return keep;
+  }
+  ap_uint<1> get_last() const {
+#pragma HLS inline
+    return last;
+  }
+
+  void set_data(const ap_uint<WData> &d) {
+#pragma HLS inline
+    data = d;
+  }
+  void set_keep(const ap_uint<kBytes> &k) {
+#pragma HLS inline
+    keep = k;
+  }
+  void set_last(const ap_uint<1> &l) {
+#pragma HLS inline
+    last = l;
+  }
+  void keep_all() {
+#pragma HLS inline
+    ap_uint<kBytes> k = 0;
+    keep = ~k;
+  }
+
+  qdma_axis() {
+#pragma HLS inline
+    ;
+  }
+  qdma_axis(ap_uint<WData> d) : data(d) {
+#pragma HLS inline
+    ;
+  }
+  qdma_axis(ap_uint<WData> d, ap_uint<kBytes> k) : data(d), keep(k) {
+#pragma HLS inline
+    ;
+  }
+  qdma_axis(ap_uint<WData> d, ap_uint<kBytes> k, ap_uint<1> l)
+      : data(d), keep(k), last(l) {
+#pragma HLS inline
+    ;
+  }
+  qdma_axis(const qdma_axis<WData, 0, 0, 0> &d)
+      : data(d.data), keep(d.keep), last(d.last) {
+#pragma HLS inline
+    ;
+  }
+  qdma_axis &operator=(const qdma_axis<WData, 0, 0, 0> &d) {
+#pragma HLS inline
+    data = d.data;
+    keep = d.keep;
+    last = d.last;
+    return *this;
+  }
+};
+
+#ifdef AESL_SYN 
+#if ((__clang_major__ != 3) || (__clang_minor__ != 1))
+#include "hls_stream.h"
+namespace hls {
+
+template <typename T, std::size_t WUser, std::size_t WId, std::size_t WDest>
+class stream<axis<T, WUser, WId, WDest>> final {
+  typedef axis<T, WUser, WId, WDest> __STREAM_T__;
+
+public:
+  /// Constructors
+  INLINE stream() {}
+
+  INLINE stream(const char *name) { (void)name; }
+
+  /// Make copy constructor and assignment operator private
+private:
+  INLINE stream(const stream<__STREAM_T__> &chn) : V(chn.V) {}
+
+public:
+  /// Overload >> and << operators to implement read() and write()
+  INLINE void operator>>(__STREAM_T__ &rdata) { read(rdata); }
+
+  INLINE void operator<<(const __STREAM_T__ &wdata) { write(wdata); }
+
+  /// empty & full
+  bool empty() {
+#pragma HLS inline
+    bool tmp = __fpga_axis_valid(&V.data, &V.keep, &V.strb, V.get_user_ptr(),
+                                 &V.last, V.get_id_ptr(), V.get_dest_ptr());
+    return !tmp;
+  }
+
+  bool full() {
+#pragma HLS inline
+    bool tmp = __fpga_axis_ready(&V.data, &V.keep, &V.strb, V.get_user_ptr(),
+                                 &V.last, V.get_id_ptr(), V.get_dest_ptr());
+    return !tmp;
+  }
+
+  /// Blocking read
+  void read(__STREAM_T__ &dout) {
+#pragma HLS inline
+    __STREAM_T__ tmp;
+    __fpga_axis_pop(&V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last,
+                    V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep,
+                    &tmp.strb, tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(),
+                    tmp.get_dest_ptr());
+    dout = tmp;
+  }
+
+  __STREAM_T__ read() {
+#pragma HLS inline
+    __STREAM_T__ tmp;
+    __fpga_axis_pop(&V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last,
+                    V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep,
+                    &tmp.strb, tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(),
+                    tmp.get_dest_ptr());
+    return tmp;
+  }
+
+  /// Blocking write
+  void write(const __STREAM_T__ &din) {
+#pragma HLS inline
+    __STREAM_T__ tmp = din;
+    __fpga_axis_push(&V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last,
+                     V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep,
+                     &tmp.strb, tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(),
+                     tmp.get_dest_ptr());
+  }
+
+  /// Non-Blocking read
+  bool read_nb(__STREAM_T__ &dout) {
+#pragma HLS inline
+    __STREAM_T__ tmp;
+    if (__fpga_axis_nb_pop(&V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last,
+                           V.get_id_ptr(), V.get_dest_ptr(), &tmp.data,
+                           &tmp.keep, &tmp.strb, tmp.get_user_ptr(),
+                           &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr())) {
+      dout = tmp;
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  /// Non-Blocking write
+  bool write_nb(const __STREAM_T__ &in) {
+#pragma HLS inline
+    __STREAM_T__ tmp = in;
+    bool full_n = __fpga_axis_nb_push(
+        &V.data, &V.keep, &V.strb, V.get_user_ptr(), &V.last, V.get_id_ptr(),
+        V.get_dest_ptr(), &tmp.data, &tmp.keep, &tmp.strb, tmp.get_user_ptr(),
+        &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr());
+    return full_n;
+  }
+
+private:
+  __STREAM_T__ V NO_CTOR;
+};
+
+// specialization for qdma
+template <std::size_t WData>
+class stream<qdma_axis<WData, 0, 0, 0>> {
+  typedef qdma_axis<WData, 0, 0, 0> __STREAM_T__;
+
+public:
+  /// Constructors
+  INLINE stream() {}
+
+  INLINE stream(const char *name) { (void)name; }
+
+  /// Make copy constructor and assignment operator private
+private:
+  INLINE stream(const stream<__STREAM_T__> &chn) : V(chn.V) {}
+
+public:
+  /// Overload >> and << operators to implement read() and write()
+  INLINE void operator>>(__STREAM_T__ &rdata) { read(rdata); }
+
+  INLINE void operator<<(const __STREAM_T__ &wdata) { write(wdata); }
+
+  /// empty & full
+  bool empty() {
+#pragma HLS inline
+    bool tmp = __fpga_axis_valid(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(),
+                                 &V.last, V.get_id_ptr(), V.get_dest_ptr());
+    return !tmp;
+  }
+
+  bool full() {
+#pragma HLS inline
+    bool tmp = __fpga_axis_ready(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(),
+                                 &V.last, V.get_id_ptr(), V.get_dest_ptr());
+    return !tmp;
+  }
+
+  /// Blocking read
+  void read(__STREAM_T__ &dout) {
+#pragma HLS inline
+    __STREAM_T__ tmp;
+    __fpga_axis_pop(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(),
+                    &V.last, V.get_id_ptr(), V.get_dest_ptr(), &tmp.data,
+                    &tmp.keep, tmp.get_strb_ptr(), tmp.get_user_ptr(),
+                    &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr());
+    dout = tmp;
+  }
+
+  __STREAM_T__ read() {
+#pragma HLS inline
+    __STREAM_T__ tmp;
+    __fpga_axis_pop(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(), &V.last,
+                    V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep,
+                    tmp.get_strb_ptr(), tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(),
+                    tmp.get_dest_ptr());
+    return tmp;
+  }
+
+  /// Blocking write
+  void write(const __STREAM_T__ &din) {
+#pragma HLS inline
+    __STREAM_T__ tmp = din;
+    __fpga_axis_push(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(), &V.last,
+                     V.get_id_ptr(), V.get_dest_ptr(), &tmp.data, &tmp.keep,
+                     tmp.get_strb_ptr(), tmp.get_user_ptr(), &tmp.last, tmp.get_id_ptr(),
+                     tmp.get_dest_ptr());
+  }
+
+  /// Non-Blocking read: returns true and copies the element into `dout` when the pop intrinsic succeeds.
+  bool read_nb(__STREAM_T__ &dout) {
+#pragma HLS inline
+    __STREAM_T__ tmp;
+    // Pass get_strb_ptr() (nullptr) like every other accessor of this specialization, not &strb.
+    if (__fpga_axis_nb_pop(&V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(), &V.last,
+                           V.get_id_ptr(), V.get_dest_ptr(), &tmp.data,
+                           &tmp.keep, tmp.get_strb_ptr(), tmp.get_user_ptr(),
+                           &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr())) {
+      dout = tmp;
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  /// Non-Blocking write
+  bool write_nb(const __STREAM_T__ &in) {
+#pragma HLS inline
+    __STREAM_T__ tmp = in;
+    bool full_n = __fpga_axis_nb_push(
+        &V.data, &V.keep, V.get_strb_ptr(), V.get_user_ptr(), &V.last, V.get_id_ptr(),
+        V.get_dest_ptr(), &tmp.data, &tmp.keep, tmp.get_strb_ptr(), tmp.get_user_ptr(),
+        &tmp.last, tmp.get_id_ptr(), tmp.get_dest_ptr());
+    return full_n;
+  }
+
+private:
+  __STREAM_T__ V NO_CTOR;
+};
+
+} // namespace hls
+#endif
+#endif
+#endif
diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl
index af37b0f4aa..50596091f2 100644
--- a/hls4ml/templates/vivado/build_prj.tcl
+++ b/hls4ml/templates/vivado/build_prj.tcl
@@ -206,7 +206,7 @@ if {$opt(cosim)} {
 
     set time_end [clock clicks -milliseconds]
     puts "INFO:"
-    if {[string equal "$backend" "vivadoaccelerator"]} {
+    if {[string equal "$backend" "vivadoaccelerator"] || [string equal $backend "vitisacceleratoripflow"]} {
         puts [read [open ${project_name}_prj/solution1/sim/report/${project_name}_axi_cosim.rpt r]]
     } else {
         puts [read [open ${project_name}_prj/solution1/sim/report/${project_name}_cosim.rpt r]]
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h
index b8c2a48d19..2a695d4e5a 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h
@@ -11,6 +11,11 @@
 #include <stdlib.h>
 #include <vector>
 
+// ap_axi_sdata.h cannot be included by Vivado HLS, so include it only when VITIS_ACCELERATOR_IP_FLOW is defined.
+// The macro is defined in the build_lib.sh of the `VitisAcceleratorIPFlow` template files.
+#ifdef VITIS_ACCELERATOR_IP_FLOW
+#include "ap_axi_sdata.h"
+#endif
 namespace nnet {
 
 #ifndef __SYNTHESIS__
@@ -161,6 +166,26 @@ template <class srcType, class dstType, size_t SIZE> void convert_data(hls::stre
     }
 }
 
+#ifdef VITIS_ACCELERATOR_IP_FLOW
+// Write SIZE values of src (cast to dstType) as AXI packets. TODO: template the hardcoded hls::axis<float, 0, 0, 0>.
+template <class srcType, typename dstType, size_t SIZE>
+void convert_data(srcType *src, hls::stream<hls::axis<float, 0, 0, 0>> &dst) {
+    for (size_t idx = 0; idx != SIZE; ++idx) {
+        hls::axis<float, 0, 0, 0> packet; // only `data` is assigned below
+        packet.data = dstType(src[idx]);
+        dst.write(packet);
+    }
+}
+
+// Read SIZE AXI packets from src and store each payload, cast to dstType, into dst; other packet fields are ignored.
+template <typename srcType, class dstType, size_t SIZE>
+void convert_data(hls::stream<hls::axis<float, 0, 0, 0>> &src, dstType *dst) {
+    for (size_t idx = 0; idx != SIZE; ++idx) {
+        dst[idx] = dstType(src.read().data);
+    }
+}
+#endif
+
 extern bool trace_enabled;
 extern std::map<std::string, void *> *trace_outputs;
 extern size_t trace_type_size;
@@ -247,8 +272,6 @@ template <class data_T> void save_layer_output(hls::stream<data_T> &data, const
     }
 }
 
-#endif
-
 template <class src_T, class dst_T, size_t OFFSET, size_t SIZE> void copy_data(std::vector<src_T> src, dst_T dst[SIZE]) {
     typename std::vector<src_T>::const_iterator in_begin = src.cbegin() + OFFSET;
     typename std::vector<src_T>::const_iterator in_end = in_begin + SIZE;
@@ -272,14 +295,27 @@ void copy_data(std::vector<src_T> src, hls::stream<dst_T> &dst) {
 }
 
 template <class src_T, class dst_T, size_t OFFSET, size_t SIZE> void copy_data_axi(std::vector<src_T> src, dst_T dst[SIZE]) {
-    for (auto i = 0; i < SIZE; i++)
+    for (auto i = 0; i < SIZE; i++) {
+        dst[i].data = src[OFFSET + i]; // start at OFFSET, matching copy_data<src_T, dst_T, OFFSET, SIZE>
         if (i == SIZE - 1) {
-            dst[i].data = src[i];
             dst[i].last = 1;
         } else {
-            dst[i].data = src[i];
             dst[i].last = 0;
         }
+    }
+}
+
+// Writes the first SIZE elements of `src` to `dst`, one packet per element. Only the `data` and
+// `last` members of each packet are assigned; `last` is 1 on the final packet and 0 otherwise.
+// `src` is taken by value, like the other copy_data helpers.
+template <class src_T, class dst_T, size_t SIZE> void copy_data_axi(std::vector<src_T> src, hls::stream<dst_T> &dst) {
+    for (auto idx = 0; idx < SIZE; idx++) {
+        const bool final_packet = (idx == SIZE - 1);
+        dst_T packet;
+        packet.data = src[idx];
+        packet.last = final_packet ? 1 : 0;
+        dst.write(packet);
+    }
+}
 
 template <class res_T, size_t SIZE> void print_result(res_T result[SIZE], std::ostream &out, bool keep = false) {
@@ -289,21 +325,55 @@ template <class res_T, size_t SIZE> void print_result(res_T result[SIZE], std::o
     out << std::endl;
 }
 
-template <class res_T, size_t SIZE> void print_result(hls::stream<res_T> &result, std::ostream &out, bool keep = false) {
+template <class T, class = void> struct has_pack_size : std::false_type {}; // true iff T has a static ::size > 0
+template <class T> struct has_pack_size<T, typename std::enable_if<(T::size > 0)>::type> : std::true_type {};
+template <class res_T, size_t SIZE, typename std::enable_if<has_pack_size<res_T>::value, int>::type = 0>
+void print_result(hls::stream<res_T> &result, std::ostream &out, bool keep = false) {
     for (int i = 0; i < SIZE / res_T::size; i++) {
         res_T res_pack = result.read();
         for (int j = 0; j < res_T::size; j++) {
             out << res_pack[j] << " ";
         }
-        if (keep)
+        if (keep) {
+            result.write(res_pack);
+        }
+    }
+    out << std::endl;
+}
+
+// res_T has no ::size, e.g. hls::axis<...> (Vitis Accelerator, io_parallel); std::is_array is false for class types
+template <class res_T, size_t SIZE, typename std::enable_if<!has_pack_size<res_T>::value, int>::type = 0>
+void print_result(hls::stream<res_T> &result, std::ostream &out, bool keep = false) {
+    for (int i = 0; i < SIZE; i++) {
+        res_T res_pack = result.read();
+        out << res_pack.data << " ";
+        if (keep) {
             result.write(res_pack);
+        }
+    }
+    out << std::endl;
+}
+
+// compatible with Vitis Accelerator for res_T = hls::axis<underlying_data_T, ...> and io_stream
+template <class underlying_res_T, class res_T, size_t SIZE>
+void print_result(hls::stream<res_T> &result, std::ostream &out, bool keep = false) {
+    for (int i = 0; i < SIZE / underlying_res_T::size; i++) {
+        res_T res_pack;
+        for (int j = 0; j < underlying_res_T::size; j++) {
+            res_pack = result.read();
+            out << res_pack.data << " ";
+            if (keep) {
+                result.write(res_pack);
+            }
+        }
     }
     out << std::endl;
 }
 
 template <class data_T, size_t SIZE> void fill_zero(data_T data[SIZE]) { std::fill_n(data, SIZE, 0.); }
 
-template <class data_T, size_t SIZE> void fill_zero(hls::stream<data_T> &data) {
+template <class data_T, size_t SIZE, typename std::enable_if<has_pack_size<data_T>::value, int>::type = 0>
+void fill_zero(hls::stream<data_T> &data) {
     for (int i = 0; i < SIZE / data_T::size; i++) {
         data_T data_pack;
         for (int j = 0; j < data_T::size; j++) {
@@ -313,6 +383,36 @@ template <class data_T, size_t SIZE> void fill_zero(hls::stream<data_T> &data) {
     }
 }
 
+template <class data_T, size_t SIZE, typename std::enable_if<!has_pack_size<data_T>::value, int>::type = 0>
+void fill_zero(hls::stream<data_T> &data) {
+    for (int i = 0; i < SIZE; i++) {
+        data_T data_pack;
+        data_pack.data = 0.;
+        if (i == SIZE - 1) {
+            data_pack.last = 1;
+        } else {
+            data_pack.last = 0;
+        }
+        data.write(data_pack);
+    }
+}
+
+// Vitis Accelerator, data_T = hls::axis<underlying_data_T, ...>: writes one zero-valued packet per
+// element of every underlying pack and sets `last` to 1 on the very last packet only.
+template <class underlying_data_T, class data_T, size_t SIZE> void fill_zero(hls::stream<data_T> &data) {
+    const auto n_packs = SIZE / underlying_data_T::size;
+    const auto pack_size = underlying_data_T::size;
+    for (int pack = 0; pack < n_packs; pack++) {
+        data_T packet;
+        for (int elem = 0; elem < pack_size; elem++) {
+            const bool final_packet = (pack == (n_packs - 1)) && (elem == (pack_size - 1));
+            packet.data = 0.;
+            packet.last = final_packet ? 1 : 0;
+            data.write(packet);
+        }
+    }
+}
+
 template <class dataType, unsigned int nrows> int read_file_1D(const char *filename, dataType data[nrows]) {
     FILE *fp;
     fp = fopen(filename, "r");
@@ -370,6 +470,7 @@ template <class data_T, int N_IN> void hls_stream_debug(hls::stream<data_T> &dat
         res << datareg;
     }
 }
+#endif
 
 constexpr int ceillog2(int x) { return (x <= 2) ? 1 : 1 + ceillog2((x + 1) / 2); }
 
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h
index ca3143d01e..11622efbf0 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv1d_stream.h
@@ -109,7 +109,7 @@ void separable_conv_1d_cl(hls::stream<data_T> &data, hls::stream<res_T> &res,
     #pragma HLS DATAFLOW
 
     hls::stream<dw_res_T> depthwise_res;
-    unsigned res_depth = CONFIG_T::depthwise_config::out_width;
+    const unsigned res_depth = CONFIG_T::depthwise_config::out_width;
     #pragma HLS STREAM variable=depthwise_res depth=res_depth
 
     depthwise_conv_1d_cl<data_T, dw_res_T, typename CONFIG_T::depthwise_config>(data, depthwise_res, depthwise_weights,
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h
index 7f4dd866c9..f5cafd2ee7 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_sepconv2d_stream.h
@@ -133,7 +133,7 @@ void separable_conv_2d_cl(hls::stream<data_T> &data, hls::stream<res_T> &res,
     #pragma HLS DATAFLOW
 
     hls::stream<dw_res_T> depthwise_res;
-    unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width;
+    const unsigned res_depth = CONFIG_T::depthwise_config::out_height * CONFIG_T::depthwise_config::out_width;
     #pragma HLS STREAM variable=depthwise_res depth=res_depth
 
     depthwise_conv_2d_cl<data_T, dw_res_T, typename CONFIG_T::depthwise_config>(data, depthwise_res, depthwise_weights,
diff --git a/hls4ml/writer/__init__.py b/hls4ml/writer/__init__.py
index f16cccc9fa..31238b18c8 100644
--- a/hls4ml/writer/__init__.py
+++ b/hls4ml/writer/__init__.py
@@ -1,6 +1,7 @@
 from hls4ml.writer.catapult_writer import CatapultWriter
 from hls4ml.writer.quartus_writer import QuartusWriter
 from hls4ml.writer.symbolic_writer import SymbolicExpressionWriter
+from hls4ml.writer.vitis_accelerator_ip_flow_writer import VitisAcceleratorIPFlowWriter
 from hls4ml.writer.vitis_writer import VitisWriter
 from hls4ml.writer.vivado_accelerator_writer import VivadoAcceleratorWriter
 from hls4ml.writer.vivado_writer import VivadoWriter
@@ -9,6 +10,7 @@
 register_writer('Vivado', VivadoWriter)
 register_writer('VivadoAccelerator', VivadoAcceleratorWriter)
 register_writer('Vitis', VitisWriter)
+register_writer('VitisAcceleratorIPFlow', VitisAcceleratorIPFlowWriter)
 register_writer('Quartus', QuartusWriter)
 register_writer('Catapult', CatapultWriter)
 register_writer('SymbolicExpression', SymbolicExpressionWriter)
diff --git a/hls4ml/writer/vitis_accelerator_ip_flow_writer.py b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py
new file mode 100644
index 0000000000..78e1fa982d
--- /dev/null
+++ b/hls4ml/writer/vitis_accelerator_ip_flow_writer.py
@@ -0,0 +1,414 @@
+import os
+from shutil import copyfile, copytree
+
+from hls4ml.writer.vitis_writer import VitisWriter
+
+# One level of indentation in the generated C++ sources.
+_INDENT = '    '
+
+
+class VitisAcceleratorIPFlowWriter(VitisWriter):
+    '''Writer for the VitisAcceleratorIPFlow backend.
+
+    Extends the project written by VitisWriter with an AXI wrapper around the top function, adapted
+    test bench and bridge sources, the board design/project tcl scripts and the board driver.
+    '''
+
+    def __init__(self):
+        super().__init__()
+        # Created in write_hls(); the other write_* methods read the board and interface settings from it.
+        self.vitis_accelerator_ip_flow_config = None
+
+    def write_axi_wrapper(self, model):
+        '''Write a top level HLS C++ file to wrap the hls4ml project with AXI interfaces
+
+        Fills the ``hls-fpga-machine-learning insert`` markers of the ``myproject_axi.h`` and
+        ``myproject_axi.cpp`` templates and writes the results to ``firmware/<project>_axi.h`` and
+        ``firmware/<project>_axi.cpp`` in the output directory.
+
+        Args:
+            model : The ModelGraph to write the wrapper for
+        '''
+        inp_axi_t, out_axi_t, inp, out = self.vitis_accelerator_ip_flow_config.get_corrected_types()
+        interface = self.vitis_accelerator_ip_flow_config.get_interface()
+        io_type = model.config.get_config_value('IOType')
+        project_name = model.config.get_project_name()
+        output_dir = model.config.get_output_dir()
+        filedir = os.path.dirname(os.path.abspath(__file__))
+
+        #######################
+        # myproject_axi.h
+        #######################
+
+        template = os.path.join(filedir, '../templates/vitis_accelerator_ip_flow/myproject_axi.h')
+        with open(template) as f, open(f'{output_dir}/firmware/{project_name}_axi.h', 'w') as fout:
+            for line in f:
+                if 'MYPROJECT' in line:
+                    newline = line.replace('MYPROJECT', project_name.upper())
+                elif '// hls-fpga-machine-learning insert include' in line:
+                    newline = f'#include "{project_name}.h"\n'
+                    newline += '#include "ap_axi_sdata.h"\n'
+                elif 'myproject' in line:
+                    newline = line.replace('myproject', project_name)
+                elif '// hls-fpga-machine-learning insert definitions' in line:
+                    newline = self._axi_definitions(interface, inp, out, inp_axi_t, out_axi_t)
+                else:
+                    newline = line
+                fout.write(newline)
+
+        #######################
+        # myproject_axi.cpp
+        #######################
+
+        template = os.path.join(filedir, '../templates/vitis_accelerator_ip_flow/myproject_axi.cpp')
+        with open(template) as f, open(f'{output_dir}/firmware/{project_name}_axi.cpp', 'w') as fout:
+            for line in f:
+                if 'myproject' in line:
+                    newline = line.replace('myproject', project_name)
+                elif '// hls-fpga-machine-learning insert include' in line:
+                    newline = f'#include "{project_name}_axi.h"\n'
+                elif '// hls-fpga-machine-learning insert local vars' in line:
+                    newline = self._axi_local_vars(model, interface, io_type, inp, out)
+                elif '// hls-fpga-machine-learning insert call' in line:
+                    newline = _INDENT + f'{project_name}(in_local, out_local);\n'
+                elif '// hls-fpga-machine-learning insert interface' in line:
+                    newline = self._axi_interface_pragmas(model, interface, io_type)
+                elif '// hls-fpga-machine-learning insert enqueue' in line:
+                    newline = self._axi_enqueue(interface, io_type, inp.type.name)
+                elif '// hls-fpga-machine-learning insert dequeue' in line:
+                    newline = self._axi_dequeue(interface, io_type, out.type.name, out_axi_t)
+                else:
+                    newline = line
+                # An unsupported interface/IOType yields no code: keep the marker line instead of
+                # re-writing the previous iteration's output (or failing on an unbound name).
+                fout.write(line if newline is None else newline)
+
+    @staticmethod
+    def _axi_definitions(interface, inp, out, inp_axi_t, out_axi_t):
+        '''Return the N_IN/N_OUT constants and the AXI typedefs inserted into the wrapper header.'''
+        code = f'static const unsigned N_IN = {inp.size()};\n'
+        code += f'static const unsigned N_OUT = {out.size()};\n'
+        if interface == 'axi_stream':
+            # TODO: "float" might need to become a variable that follows the user configuration
+            # and the data widths supported by the DMA
+            code += 'typedef hls::axis<float, 0, 0, 0> my_pkt;\n'
+        else:  # TODO: handle this case
+            code += f'typedef {inp_axi_t} input_axi_t;\n'
+            code += f'typedef {out_axi_t} output_axi_t;\n'
+        return code
+
+    @staticmethod
+    def _axi_local_vars(model, interface, io_type, inp, out):
+        '''Return the declarations of the local buffers/streams used inside the wrapper function.'''
+        code = ''
+        if interface == 'axi_stream':
+            code += _INDENT + 'bool is_last = false;\n'
+        if io_type == 'io_parallel':  # TODO: handle io_parallel
+            code += _INDENT + inp.type.name + ' in_local[N_IN];\n'
+            code += _INDENT + out.type.name + ' out_local[N_OUT];\n'
+            code += _INDENT + 'my_pkt tmp;\n'
+        elif io_type == 'io_stream':
+            in_depth = model.get_input_variables()[0].pragma[1]
+            out_depth = model.get_output_variables()[0].pragma[1]
+            code += _INDENT + 'hls::stream<' + inp.type.name + '> in_local("input_1");\n'
+            code += _INDENT + 'hls::stream<' + out.type.name + '> out_local("output_1");\n\n'
+            code += _INDENT + f'#pragma HLS STREAM variable=in_local depth={in_depth}\n'
+            code += _INDENT + f'#pragma HLS STREAM variable=out_local depth={out_depth}\n'
+        return code
+
+    @staticmethod
+    def _axi_interface_pragmas(model, interface, io_type):
+        '''Return the INTERFACE pragmas for the wrapper ports, or None for an unknown interface.'''
+        if interface == 'axi_lite':  # TODO: handle axi_lite
+            code = _INDENT + '#pragma HLS INTERFACE ap_ctrl_none port=return\n'
+            code += _INDENT + '#pragma HLS INTERFACE s_axilite port=in\n'
+            code += _INDENT + '#pragma HLS INTERFACE s_axilite port=out\n'
+        elif interface == 'axi_master':  # TODO: handle axi_master
+            in_depth = model.get_input_variables()[0].pragma[1]
+            out_depth = model.get_output_variables()[0].pragma[1]
+            code = _INDENT + '#pragma HLS INTERFACE s_axilite port=return bundle=CTRL_BUS\n'
+            code += _INDENT + f'#pragma HLS INTERFACE m_axi depth={in_depth} port=in offset=slave bundle=IN_BUS\n'
+            code += _INDENT + f'#pragma HLS INTERFACE m_axi depth={out_depth} port=out offset=slave bundle=OUT_BUS\n'
+        elif interface == 'axi_stream':
+            code = _INDENT + '#pragma HLS INTERFACE axis port=in\n'
+            code += _INDENT + '#pragma HLS INTERFACE axis port=out\n'
+            code += _INDENT + '#pragma HLS INTERFACE ap_ctrl_none port=return\n'
+            if io_type == 'io_stream':
+                code += _INDENT + '#pragma HLS DATAFLOW\n'
+        else:
+            code = None
+        return code
+
+    @staticmethod
+    def _axi_enqueue(interface, io_type, input_t):
+        '''Return the code that moves the AXI input into ``in_local``, or None for an unknown IOType.'''
+        if io_type == 'io_parallel':  # TODO: handle io_parallel
+            code = _INDENT + 'for(unsigned i = 0; i < N_IN; i++){\n'
+            if interface == 'axi_stream':
+                code += _INDENT * 2 + '#pragma HLS PIPELINE\n'
+                code += _INDENT * 2 + 'tmp = in.read(); // Read input with cast\n'
+                code += _INDENT * 2 + 'in_local[i] = tmp.data;\n'
+                code += _INDENT * 2 + 'is_last = tmp.last;\n'
+            else:
+                code += _INDENT * 2 + '#pragma HLS UNROLL\n'
+                code += _INDENT * 2 + 'in_local[i] = in[i].data; // Read input with cast\n'
+            code += _INDENT + '}\n'
+            code += _INDENT + 'tmp.last = 0;\n'
+        elif io_type == 'io_stream':
+            # TODO: check whether PIPELINE on the outer loop, aggregate (formerly DATA_PACK) on ctype
+            # or UNROLL on the inner loop are needed
+            code = _INDENT + 'my_pkt tmp;\n'
+            code += _INDENT + f'for(unsigned i = 0; i < N_IN / {input_t}::size; ++i) {{\n'
+            code += _INDENT * 2 + f'{input_t} ctype;\n'
+            code += _INDENT * 2 + f'for(unsigned j = 0; j < {input_t}::size; j++) {{\n'
+            if interface == 'axi_stream':
+                code += _INDENT * 3 + 'in.read(tmp);\n'
+                code += _INDENT * 3 + 'ctype[j] = tmp.data;\n'
+                code += _INDENT * 3 + 'is_last = tmp.last;\n'
+            else:  # TODO: handle this case
+                code += _INDENT * 3 + f'ctype[j] = typename {input_t}::value_type(in[i * {input_t}::size + j].data);\n'
+            code += _INDENT * 2 + '}\n'
+            code += _INDENT * 2 + 'in_local.write(ctype);\n'
+            code += _INDENT + '}\n'
+            code += _INDENT + 'tmp.last = 0;\n'
+        else:
+            code = None
+        return code
+
+    @staticmethod
+    def _axi_dequeue(interface, io_type, result_t, out_axi_t):
+        '''Return the code that moves ``out_local`` to the AXI output, or None for an unknown IOType.'''
+        if io_type == 'io_parallel':  # TODO: handle this case
+            code = _INDENT + 'for(unsigned i = 0; i < N_OUT; i++){\n'
+            if interface == 'axi_stream':
+                code += _INDENT * 2 + '#pragma HLS PIPELINE\n'
+                code += _INDENT * 2 + 'tmp.data = out_local[i];\n'
+                code += _INDENT * 2 + 'tmp.last = (is_last && (i == N_OUT - 1))? true : false;\n'
+                code += _INDENT * 2 + 'out.write(tmp);\n'
+            else:
+                code += _INDENT * 2 + '#pragma HLS UNROLL\n'
+                code += _INDENT * 2 + 'out[i] = out_local[i]; // Write output with cast\n'
+            code += _INDENT + '}\n'
+        elif io_type == 'io_stream':
+            code = _INDENT + f'for(unsigned i = 0; i < N_OUT / {result_t}::size; ++i) {{\n'
+            code += _INDENT * 2 + f'{result_t} ctype = out_local.read();\n'
+            code += _INDENT * 2 + f'for(unsigned j = 0; j < {result_t}::size; j++) {{\n'
+            if interface == 'axi_stream':
+                # The packet leaves through the output port, so cast to the output AXI type
+                # (the input AXI type was used here before).
+                code += _INDENT * 3 + f'tmp.data = ({out_axi_t}) (ctype[j]);\n'
+                # (i+1)*(j+1) only reaches N_OUT on the final element of the final pack
+                code += _INDENT * 3 + 'if(is_last) {tmp.last = (((i+1)*(j+1))==N_OUT);}\n'
+                code += _INDENT * 3 + 'out.write(tmp);\n'
+            else:
+                code += _INDENT * 3 + f'out[i * {result_t}::size + j] = output_axi_t(ctype[j]);\n'
+            code += _INDENT * 2 + '}\n'
+            code += _INDENT + '}\n'
+        else:
+            code = None
+        return code
+
+    def modify_build_script(self, model):
+        '''
+        Modify the build_prj.tcl and build_lib.sh scripts to add the extra wrapper files and set the top function
+
+        Args:
+            model : The ModelGraph whose output directory holds build_prj.tcl and receives build_lib.sh
+        '''
+        filedir = os.path.dirname(os.path.abspath(__file__))
+        project_name = model.config.get_project_name()
+        output_dir = model.config.get_output_dir()
+
+        ###################
+        # build_prj.tcl
+        ###################
+
+        oldfile = f'{output_dir}/build_prj.tcl'
+        newfile = f'{output_dir}/build_prj_axi.tcl'
+        with open(oldfile) as f, open(newfile, 'w') as fout:
+            for line in f:
+                if 'set_top' in line:
+                    # Append _axi to the top function name and add the wrapper source right after it.
+                    # rstrip() (rather than line[:-1]) keeps the name intact if the line has no newline.
+                    newline = line.rstrip('\n') + '_axi\n'
+                    newline += f'add_files firmware/{project_name}_axi.cpp -cflags "-std=c++0x"\n'
+                elif f'{project_name}_cosim' in line:
+                    newline = line.replace(f'{project_name}_cosim', f'{project_name}_axi_cosim')
+                elif '${project_name}.tcl' in line:
+                    newline = line.replace('${project_name}.tcl', '${project_name}_axi.tcl')
+                else:
+                    newline = line
+                fout.write(newline)
+        # os.replace() overwrites the destination on every platform; os.rename() fails on Windows when it exists.
+        os.replace(newfile, oldfile)
+
+        ###################
+        # build_lib.sh
+        ###################
+
+        stamp = model.config.get_config_value('Stamp')
+        template = os.path.join(filedir, '../templates/vitis_accelerator_ip_flow/build_lib.sh')
+        with open(template) as f, open(f'{output_dir}/build_lib.sh', 'w') as fout:
+            for line in f:
+                fout.write(line.replace('myproject', project_name).replace('mystamp', stamp))
+
+    def write_wrapper_test(self, model):
+        '''Rewrite the generated test bench and bridge so that they call the AXI wrapper.
+
+        Both ``<project>_test.cpp`` and ``<project>_bridge.cpp`` in the output directory are rewritten
+        in place by textual substitution.
+
+        Args:
+            model : The ModelGraph the files were generated for
+        '''
+        # Only the AXI type names are taken from the config; the variables come from the model.
+        inp_axi_t, out_axi_t, _, _ = self.vitis_accelerator_ip_flow_config.get_corrected_types()
+        interface = self.vitis_accelerator_ip_flow_config.get_interface()
+        inp = model.get_input_variables()[0]
+        out = model.get_output_variables()[0]
+        io_type = model.config.get_config_value('IOType')
+        project_name = model.config.get_project_name()
+        output_dir = model.config.get_output_dir()
+
+        ###################
+        # write myproject_test_wrapper.cpp
+        ###################
+        oldfile = f'{output_dir}/{project_name}_test.cpp'
+        newfile = f'{output_dir}/{project_name}_test_wrapper.cpp'
+
+        with open(oldfile) as f, open(newfile, 'w') as fout:
+            for line in f:
+                if f'{project_name}.h' in line:
+                    newline = line.replace(f'{project_name}.h', f'{project_name}_axi.h')
+                elif inp.definition_cpp() in line:
+                    # TODO instead of replacing strings, how about we use proper variables and their definition?
+                    newline = line.replace(inp.definition_cpp(), 'hls::stream< my_pkt > inputs')
+                elif out.definition_cpp() in line:
+                    newline = line.replace(out.definition_cpp(), 'hls::stream< my_pkt > outputs')
+                elif 'unsigned short' in line:
+                    newline = ''
+                elif f'{project_name}(' in line:
+                    indent_amount = line.split(project_name)[0]
+                    newline = indent_amount + f'{project_name}_axi(inputs,outputs);\n'
+                elif inp.size_cpp() in line or inp.name in line or inp.type.name in line:
+                    newline = line.replace(inp.size_cpp(), 'N_IN').replace(inp.name, 'inputs')
+                    newline = newline.replace(inp.type.name, 'my_pkt')
+                elif out.size_cpp() in line or out.name in line or out.type.name in line:
+                    newline = line.replace(out.size_cpp(), 'N_OUT').replace(out.name, 'outputs')
+                    newline = newline.replace(out.type.name, 'my_pkt')
+                else:
+                    newline = line
+                if interface == 'axi_stream':
+                    if 'copy_data' in line:
+                        newline = newline.replace('copy_data', 'copy_data_axi').replace("0,", "")
+                    if io_type == 'io_stream':
+                        if 'nnet::fill_zero' in line:
+                            newline = newline.replace("nnet::fill_zero<", f"nnet::fill_zero<{inp.type.name}, ")
+                        if 'print_result' in line:
+                            newline = newline.replace("print_result<", f"print_result<{out.type.name}, ")
+                fout.write(newline)
+        os.replace(newfile, oldfile)
+
+        ###################
+        # write myproject_bridge_wrapper.cpp
+        ###################
+        oldfile = f'{output_dir}/{project_name}_bridge.cpp'
+        newfile = f'{output_dir}/{project_name}_bridge_wrapper.cpp'
+
+        with open(oldfile) as f, open(newfile, 'w') as fout:
+            for line in f:
+                if f'{project_name}.h' in line:
+                    newline = line.replace(f'{project_name}.h', f'{project_name}_axi.h')
+                elif inp.definition_cpp(name_suffix='_ap') in line:
+                    newline = line.replace(inp.definition_cpp(name_suffix='_ap'), f'hls::stream< my_pkt > {inp.name}_ap')
+                elif out.definition_cpp(name_suffix='_ap') in line:
+                    newline = line.replace(out.definition_cpp(name_suffix='_ap'), f'hls::stream< my_pkt > {out.name}_ap')
+                elif f'{project_name}(' in line:
+                    indent_amount = line.split(project_name)[0]
+                    newline = indent_amount + f'{project_name}_axi({inp.name}_ap,{out.name}_ap);\n'
+                elif inp.size_cpp() in line or inp.name in line or inp.type.name in line:
+                    newline = line.replace(inp.size_cpp(), 'N_IN').replace(inp.type.name, inp_axi_t)
+                elif out.size_cpp() in line or out.name in line or out.type.name in line:
+                    newline = line.replace(out.size_cpp(), 'N_OUT').replace(out.type.name, out_axi_t)
+                else:
+                    newline = line
+                fout.write(newline)
+        os.replace(newfile, oldfile)
+
+    def write_board_script(self, model):
+        '''
+        Write the tcl scripts and kernel sources to create a Vivado IPI project for the VitisAcceleratorIPFlow
+
+        Args:
+            model : The ModelGraph whose output directory receives design.tcl and project.tcl
+        '''
+        filedir = os.path.dirname(os.path.abspath(__file__))
+        output_dir = model.config.get_output_dir()
+        config = self.vitis_accelerator_ip_flow_config
+        copyfile(os.path.join(filedir, config.get_tcl_file_path()), f'{output_dir}/design.tcl')
+        # Generic alveo board
+        if config.get_board().startswith('alveo'):
+            src_dir = os.path.join(filedir, config.get_krnl_rtl_src_dir())
+            dst_dir = os.path.abspath(output_dir) + '/src'
+            # shutil.copytree replaces distutils' copy_tree (distutils was removed in Python 3.12);
+            # dirs_exist_ok keeps copy_tree's behaviour of merging into an existing directory.
+            copytree(src_dir, dst_dir, dirs_exist_ok=True)
+
+        ###################
+        # project.tcl
+        ###################
+        with open(f'{output_dir}/project.tcl', 'w') as f:
+            f.write('variable project_name\n')
+            f.write(f'set project_name "{model.config.get_project_name()}"\n')
+            f.write('variable backend\n')
+            f.write('set backend "vitisacceleratoripflow"\n')
+            f.write('variable part\n')
+            f.write(f'set part "{config.get_part()}"\n')
+            f.write('variable clock_period\n')
+            f.write('set clock_period {}\n'.format(model.config.get_config_value('ClockPeriod')))
+            f.write('variable clock_uncertainty\n')
+            f.write('set clock_uncertainty {}\n'.format(model.config.get_config_value('ClockUncertainty', '12.5%')))
+            f.write('variable version\n')
+            f.write('set version "{}"\n'.format(model.config.get_config_value('Version', '1.0.0')))
+            if config.get_interface() == 'axi_stream':
+                in_bit, out_bit = config.get_io_bitwidth()
+                # NOTE(review): the first value is written as the *output* width and the second as the
+                # *input* width; confirm against get_io_bitwidth() and the design tcl that this is intended.
+                f.write(f'set bit_width_hls_output {in_bit}\n')
+                f.write(f'set bit_width_hls_input {out_bit}\n')
+
+    def write_driver(self, model):
+        '''Copy the driver file named by the accelerator config into the output directory.'''
+        filedir = os.path.dirname(os.path.abspath(__file__))
+        config = self.vitis_accelerator_ip_flow_config
+        # Built with an f-string rather than str.format() on the file name, so that braces in the
+        # driver file name cannot be misread as format fields.
+        copyfile(
+            os.path.join(filedir, config.get_driver_path()),
+            f'{model.config.get_output_dir()}/{config.get_driver_file()}',
+        )
+
+    def write_new_tar(self, model):
+        '''Run the parent writer's write_tar(); write_hls() calls this after the extra files are written.'''
+        super().write_tar(model)
+
+    def write_hls(self, model):
+        '''
+        Write the HLS project. Calls the VitisWriter, then the extra steps for VitisAcceleratorIPFlow/AXI interface
+
+        Args:
+            model : The ModelGraph to write the project for
+        '''
+        # TODO temporarily move config import here to avoid cyclic dependency, until config is moved to its own package
+        from hls4ml.backends import VitisAcceleratorIPFlowConfig
+
+        self.vitis_accelerator_ip_flow_config = VitisAcceleratorIPFlowConfig(
+            model.config, model.get_input_variables(), model.get_output_variables()
+        )
+        super().write_hls(model)
+        self.write_board_script(model)
+        self.write_driver(model)
+        self.write_wrapper_test(model)
+        self.write_axi_wrapper(model)
+        self.modify_build_script(model)
+        self.write_new_tar(model)