From 59d37ee850f57b5fa76cab1bd4f1f0db4ab69cbb Mon Sep 17 00:00:00 2001
From: Alex Forencich <alex@alexforencich.com>
Date: Tue, 28 Mar 2023 20:59:47 -0700
Subject: [PATCH] Add AXI virtual FIFO

Signed-off-by: Alex Forencich <alex@alexforencich.com>
---
 rtl/axi_vfifo.v                        | 607 +++++++++++++++++++
 rtl/axi_vfifo_dec.v                    | 720 ++++++++++++++++++++++
 rtl/axi_vfifo_enc.v                    | 794 ++++++++++++++++++++++++
 rtl/axi_vfifo_raw.v                    | 381 ++++++++++++
 rtl/axi_vfifo_raw_rd.v                 | 580 ++++++++++++++++++
 rtl/axi_vfifo_raw_wr.v                 | 567 +++++++++++++++++
 syn/vivado/axi_vfifo.tcl               |  86 +++
 syn/vivado/axi_vfifo_raw.tcl           |  31 +
 syn/vivado/axi_vfifo_raw_rd.tcl        |  77 +++
 syn/vivado/axi_vfifo_raw_wr.tcl        |  68 +++
 tb/axi_vfifo/Makefile                  |  94 +++
 tb/axi_vfifo/test_axi_vfifo.py         | 685 +++++++++++++++++++++
 tb/axi_vfifo_dec/Makefile              |  79 +++
 tb/axi_vfifo_dec/test_axi_vfifo_dec.py | 394 ++++++++++++
 tb/axi_vfifo_enc/Makefile              |  79 +++
 tb/axi_vfifo_enc/test_axi_vfifo_enc.py | 426 +++++++++++++
 tb/axi_vfifo_raw/Makefile              |  83 +++
 tb/axi_vfifo_raw/test_axi_vfifo_raw.py | 804 +++++++++++++++++++++++++
 18 files changed, 6555 insertions(+)
 create mode 100644 rtl/axi_vfifo.v
 create mode 100644 rtl/axi_vfifo_dec.v
 create mode 100644 rtl/axi_vfifo_enc.v
 create mode 100644 rtl/axi_vfifo_raw.v
 create mode 100644 rtl/axi_vfifo_raw_rd.v
 create mode 100644 rtl/axi_vfifo_raw_wr.v
 create mode 100644 syn/vivado/axi_vfifo.tcl
 create mode 100644 syn/vivado/axi_vfifo_raw.tcl
 create mode 100644 syn/vivado/axi_vfifo_raw_rd.tcl
 create mode 100644 syn/vivado/axi_vfifo_raw_wr.tcl
 create mode 100644 tb/axi_vfifo/Makefile
 create mode 100644 tb/axi_vfifo/test_axi_vfifo.py
 create mode 100644 tb/axi_vfifo_dec/Makefile
 create mode 100644 tb/axi_vfifo_dec/test_axi_vfifo_dec.py
 create mode 100644 tb/axi_vfifo_enc/Makefile
 create mode 100644 tb/axi_vfifo_enc/test_axi_vfifo_enc.py
 create mode 100644 tb/axi_vfifo_raw/Makefile
 create mode 100644 tb/axi_vfifo_raw/test_axi_vfifo_raw.py

diff --git a/rtl/axi_vfifo.v b/rtl/axi_vfifo.v
new file mode 100644
index 0000000..e8ae9fd
--- /dev/null
+++ b/rtl/axi_vfifo.v
@@ -0,0 +1,607 @@
+/*
+
+Copyright (c) 2023 Alex Forencich
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+*/
+
+// Language: Verilog 2001
+
+`resetall
+`timescale 1ns / 1ps
+`default_nettype none
+
+/*
+ * AXI4 virtual FIFO
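+ *
+ * Buffers an AXI stream in memory attached to the AXI master interfaces:
+ * axi_vfifo_enc segments and frames the incoming stream, one axi_vfifo_raw
+ * instance per AXI channel transfers segments using AXI bursts, and
+ * axi_vfifo_dec reassembles the original stream.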
+ */
+module axi_vfifo #
+(
+    // AXI channel count
+    parameter AXI_CH = 1,
+    // Width of AXI data bus in bits
+    parameter AXI_DATA_WIDTH = 32,
+    // Width of AXI address bus in bits
+    parameter AXI_ADDR_WIDTH = 16,
+    // Width of AXI wstrb (width of data bus in words)
+    parameter AXI_STRB_WIDTH = (AXI_DATA_WIDTH/8),
+    // Width of AXI ID signal
+    parameter AXI_ID_WIDTH = 8,
+    // Maximum AXI burst length to generate
+    parameter AXI_MAX_BURST_LEN = 16,
+    // Width of AXI stream interfaces in bits
+    parameter AXIS_DATA_WIDTH = AXI_DATA_WIDTH*AXI_CH/2,
+    // Use AXI stream tkeep signal
+    parameter AXIS_KEEP_ENABLE = (AXIS_DATA_WIDTH>8),
+    // AXI stream tkeep signal width (words per cycle)
+    parameter AXIS_KEEP_WIDTH = (AXIS_DATA_WIDTH/8),
+    // Use AXI stream tlast signal
+    parameter AXIS_LAST_ENABLE = 1,
+    // Propagate AXI stream tid signal
+    parameter AXIS_ID_ENABLE = 0,
+    // AXI stream tid signal width
+    parameter AXIS_ID_WIDTH = 8,
+    // Propagate AXI stream tdest signal
+    parameter AXIS_DEST_ENABLE = 0,
+    // AXI stream tdest signal width
+    parameter AXIS_DEST_WIDTH = 8,
+    // Propagate AXI stream tuser signal
+    parameter AXIS_USER_ENABLE = 1,
+    // AXI stream tuser signal width
+    parameter AXIS_USER_WIDTH = 1,
+    // Width of length field
+    parameter LEN_WIDTH = AXI_ADDR_WIDTH,
+    // Maximum segment width
+    parameter MAX_SEG_WIDTH = 256,
+    // Input FIFO depth for AXI write data (full-width words)
+    parameter WRITE_FIFO_DEPTH = 64,
+    // Max AXI write burst length
+    parameter WRITE_MAX_BURST_LEN = WRITE_FIFO_DEPTH/4,
+    // Output FIFO depth for AXI read data (full-width words)
+    parameter READ_FIFO_DEPTH = 128,
+    // Max AXI read burst length
+    parameter READ_MAX_BURST_LEN = WRITE_MAX_BURST_LEN
+)
+(
+    input  wire                               clk,
+    input  wire                               rst,
+
+    /*
+     * AXI stream data input
+     */
+    input  wire                               s_axis_clk,
+    input  wire                               s_axis_rst,
+    output wire                               s_axis_rst_out,
+    input  wire [AXIS_DATA_WIDTH-1:0]         s_axis_tdata,
+    input  wire [AXIS_KEEP_WIDTH-1:0]         s_axis_tkeep,
+    input  wire                               s_axis_tvalid,
+    output wire                               s_axis_tready,
+    input  wire                               s_axis_tlast,
+    input  wire [AXIS_ID_WIDTH-1:0]           s_axis_tid,
+    input  wire [AXIS_DEST_WIDTH-1:0]         s_axis_tdest,
+    input  wire [AXIS_USER_WIDTH-1:0]         s_axis_tuser,
+
+    /*
+     * AXI stream data output
+     */
+    input  wire                               m_axis_clk,
+    input  wire                               m_axis_rst,
+    output wire                               m_axis_rst_out,
+    output wire [AXIS_DATA_WIDTH-1:0]         m_axis_tdata,
+    output wire [AXIS_KEEP_WIDTH-1:0]         m_axis_tkeep,
+    output wire                               m_axis_tvalid,
+    input  wire                               m_axis_tready,
+    output wire                               m_axis_tlast,
+    output wire [AXIS_ID_WIDTH-1:0]           m_axis_tid,
+    output wire [AXIS_DEST_WIDTH-1:0]         m_axis_tdest,
+    output wire [AXIS_USER_WIDTH-1:0]         m_axis_tuser,
+
+    /*
+     * AXI master interfaces
+     */
+    input  wire [AXI_CH-1:0]                  m_axi_clk,
+    input  wire [AXI_CH-1:0]                  m_axi_rst,
+    output wire [AXI_CH*AXI_ID_WIDTH-1:0]     m_axi_awid,
+    output wire [AXI_CH*AXI_ADDR_WIDTH-1:0]   m_axi_awaddr,
+    output wire [AXI_CH*8-1:0]                m_axi_awlen,
+    output wire [AXI_CH*3-1:0]                m_axi_awsize,
+    output wire [AXI_CH*2-1:0]                m_axi_awburst,
+    output wire [AXI_CH-1:0]                  m_axi_awlock,
+    output wire [AXI_CH*4-1:0]                m_axi_awcache,
+    output wire [AXI_CH*3-1:0]                m_axi_awprot,
+    output wire [AXI_CH-1:0]                  m_axi_awvalid,
+    input  wire [AXI_CH-1:0]                  m_axi_awready,
+    output wire [AXI_CH*AXI_DATA_WIDTH-1:0]   m_axi_wdata,
+    output wire [AXI_CH*AXI_STRB_WIDTH-1:0]   m_axi_wstrb,
+    output wire [AXI_CH-1:0]                  m_axi_wlast,
+    output wire [AXI_CH-1:0]                  m_axi_wvalid,
+    input  wire [AXI_CH-1:0]                  m_axi_wready,
+    input  wire [AXI_CH*AXI_ID_WIDTH-1:0]     m_axi_bid,
+    input  wire [AXI_CH*2-1:0]                m_axi_bresp,
+    input  wire [AXI_CH-1:0]                  m_axi_bvalid,
+    output wire [AXI_CH-1:0]                  m_axi_bready,
+    output wire [AXI_CH*AXI_ID_WIDTH-1:0]     m_axi_arid,
+    output wire [AXI_CH*AXI_ADDR_WIDTH-1:0]   m_axi_araddr,
+    output wire [AXI_CH*8-1:0]                m_axi_arlen,
+    output wire [AXI_CH*3-1:0]                m_axi_arsize,
+    output wire [AXI_CH*2-1:0]                m_axi_arburst,
+    output wire [AXI_CH-1:0]                  m_axi_arlock,
+    output wire [AXI_CH*4-1:0]                m_axi_arcache,
+    output wire [AXI_CH*3-1:0]                m_axi_arprot,
+    output wire [AXI_CH-1:0]                  m_axi_arvalid,
+    input  wire [AXI_CH-1:0]                  m_axi_arready,
+    input  wire [AXI_CH*AXI_ID_WIDTH-1:0]     m_axi_rid,
+    input  wire [AXI_CH*AXI_DATA_WIDTH-1:0]   m_axi_rdata,
+    input  wire [AXI_CH*2-1:0]                m_axi_rresp,
+    input  wire [AXI_CH-1:0]                  m_axi_rlast,
+    input  wire [AXI_CH-1:0]                  m_axi_rvalid,
+    output wire [AXI_CH-1:0]                  m_axi_rready,
+
+    /*
+     * Configuration
+     */
+    input  wire [AXI_CH*AXI_ADDR_WIDTH-1:0]   cfg_fifo_base_addr,
+    input  wire [LEN_WIDTH-1:0]               cfg_fifo_size_mask,
+    input  wire                               cfg_enable,
+    input  wire                               cfg_reset,
+
+    /*
+     * Status
+     */
+    output wire [AXI_CH*(LEN_WIDTH+1)-1:0]    sts_fifo_occupancy,
+    output wire [AXI_CH-1:0]                  sts_fifo_empty,
+    output wire [AXI_CH-1:0]                  sts_fifo_full,
+    output wire [AXI_CH-1:0]                  sts_reset,
+    output wire [AXI_CH-1:0]                  sts_active,
+    output wire                               sts_hdr_parity_err
+);
+
+parameter CH_SEG_CNT = AXI_DATA_WIDTH > MAX_SEG_WIDTH ? AXI_DATA_WIDTH / MAX_SEG_WIDTH : 1;
+parameter SEG_CNT = CH_SEG_CNT * AXI_CH;
+parameter SEG_WIDTH = AXI_DATA_WIDTH / CH_SEG_CNT;
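+// Example: AXI_CH=2, AXI_DATA_WIDTH=512, MAX_SEG_WIDTH=256 gives CH_SEG_CNT=2,
+// SEG_CNT=4, and SEG_WIDTH=256, so each channel's axi_vfifo_raw instance handles
+// two 256-bit segments per cycle.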
+
+wire [AXI_CH-1:0]             ch_input_rst_out;
+wire [AXI_CH-1:0]             ch_input_watermark;
+wire [SEG_CNT*SEG_WIDTH-1:0]  ch_input_data;
+wire [SEG_CNT-1:0]            ch_input_valid;
+wire [SEG_CNT-1:0]            ch_input_ready;
+
+wire [AXI_CH-1:0]             ch_output_rst_out;
+wire [SEG_CNT*SEG_WIDTH-1:0]  ch_output_data;
+wire [SEG_CNT-1:0]            ch_output_valid;
+wire [SEG_CNT-1:0]            ch_output_ready;
+wire [SEG_CNT*SEG_WIDTH-1:0]  ch_output_ctrl_data;
+wire [SEG_CNT-1:0]            ch_output_ctrl_valid;
+wire [SEG_CNT-1:0]            ch_output_ctrl_ready;
+
+wire [AXI_CH-1:0] ch_rst_req;
+
+// config management
+reg [AXI_CH*AXI_ADDR_WIDTH-1:0] cfg_fifo_base_addr_reg = 0;
+reg [LEN_WIDTH-1:0] cfg_fifo_size_mask_reg = 0;
+reg cfg_enable_reg = 0;
+reg cfg_reset_reg = 0;
+
+always @(posedge clk) begin
+    if (cfg_enable_reg) begin
+        if (cfg_reset) begin
+            cfg_enable_reg <= 1'b0;
+        end
+    end else begin
+        if (cfg_enable) begin
+            cfg_enable_reg <= 1'b1;
+        end
+        cfg_fifo_base_addr_reg <= cfg_fifo_base_addr;
+        cfg_fifo_size_mask_reg <= cfg_fifo_size_mask;
+    end
+
+    cfg_reset_reg <= cfg_reset;
+
+    if (rst) begin
+        cfg_enable_reg <= 0;
+        cfg_reset_reg <= 0;
+    end
+end
+
+// status sync
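+// The occupancy vector crosses clock domains without handshaking: a flag toggled
+// every 8 clk cycles is synchronized into each channel clock domain, where its edge
+// triggers a capture of the local occupancy; that held value is re-sampled here on
+// the next toggle, so a stable multi-bit value is transferred. Single-bit status
+// flags use plain two-stage synchronizers.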
+wire [AXI_CH*(LEN_WIDTH+1)-1:0] sts_fifo_occupancy_int;
+wire [AXI_CH-1:0] sts_fifo_empty_int;
+wire [AXI_CH-1:0] sts_fifo_full_int;
+wire [AXI_CH-1:0] sts_reset_int;
+wire [AXI_CH-1:0] sts_active_int;
+wire sts_hdr_parity_err_int;
+reg [3:0] sts_hdr_parity_err_cnt_reg = 0;
+reg sts_hdr_parity_err_reg = 1'b0;
+
+reg [2:0] sts_sync_count_reg = 0;
+reg sts_sync_flag_reg = 1'b0;
+
+(* shreg_extract = "no" *)
+reg [AXI_CH*(LEN_WIDTH+1)-1:0] sts_fifo_occupancy_sync_reg = 0;
+(* shreg_extract = "no" *)
+reg [AXI_CH-1:0] sts_fifo_empty_sync_1_reg = 0, sts_fifo_empty_sync_2_reg = 0;
+(* shreg_extract = "no" *)
+reg [AXI_CH-1:0] sts_fifo_full_sync_1_reg = 0, sts_fifo_full_sync_2_reg = 0;
+(* shreg_extract = "no" *)
+reg [AXI_CH-1:0] sts_reset_sync_1_reg = 0, sts_reset_sync_2_reg = 0;
+(* shreg_extract = "no" *)
+reg [AXI_CH-1:0] sts_active_sync_1_reg = 0, sts_active_sync_2_reg = 0;
+(* shreg_extract = "no" *)
+reg sts_hdr_parity_err_sync_1_reg = 0, sts_hdr_parity_err_sync_2_reg = 0;
+
+assign sts_fifo_occupancy = sts_fifo_occupancy_sync_reg;
+assign sts_fifo_empty = sts_fifo_empty_sync_2_reg;
+assign sts_fifo_full = sts_fifo_full_sync_2_reg;
+assign sts_reset = sts_reset_sync_2_reg;
+assign sts_active = sts_active_sync_2_reg;
+assign sts_hdr_parity_err = sts_hdr_parity_err_sync_2_reg;
+
+always @(posedge m_axis_clk) begin
+    sts_hdr_parity_err_reg <= 1'b0;
+
+    if (sts_hdr_parity_err_cnt_reg) begin
+        sts_hdr_parity_err_reg <= 1'b1;
+        sts_hdr_parity_err_cnt_reg <= sts_hdr_parity_err_cnt_reg - 1;
+    end
+
+    if (sts_hdr_parity_err_int) begin
+        sts_hdr_parity_err_cnt_reg <= 4'hf;
+    end
+
+    if (m_axis_rst) begin
+        sts_hdr_parity_err_cnt_reg <= 4'h0;
+        sts_hdr_parity_err_reg <= 1'b0;
+    end
+end
+
+always @(posedge clk) begin
+    sts_sync_count_reg <= sts_sync_count_reg + 1;
+
+    if (sts_sync_count_reg == 0) begin
+        sts_sync_flag_reg <= !sts_sync_flag_reg;
+        sts_fifo_occupancy_sync_reg <= sts_fifo_occupancy_int;
+    end
+
+    sts_fifo_empty_sync_1_reg <= sts_fifo_empty_int;
+    sts_fifo_empty_sync_2_reg <= sts_fifo_empty_sync_1_reg;
+    sts_fifo_full_sync_1_reg <= sts_fifo_full_int;
+    sts_fifo_full_sync_2_reg <= sts_fifo_full_sync_1_reg;
+    sts_reset_sync_1_reg <= sts_reset_int;
+    sts_reset_sync_2_reg <= sts_reset_sync_1_reg;
+    sts_active_sync_1_reg <= sts_active_int;
+    sts_active_sync_2_reg <= sts_active_sync_1_reg;
+    sts_hdr_parity_err_sync_1_reg <= sts_hdr_parity_err_reg;
+    sts_hdr_parity_err_sync_2_reg <= sts_hdr_parity_err_sync_1_reg;
+end
+
+assign s_axis_rst_out = |ch_input_rst_out;
+
+axi_vfifo_enc #(
+    .SEG_WIDTH(SEG_WIDTH),
+    .SEG_CNT(SEG_CNT),
+    .AXIS_DATA_WIDTH(AXIS_DATA_WIDTH),
+    .AXIS_KEEP_ENABLE(AXIS_KEEP_ENABLE),
+    .AXIS_KEEP_WIDTH(AXIS_KEEP_WIDTH),
+    .AXIS_LAST_ENABLE(AXIS_LAST_ENABLE),
+    .AXIS_ID_ENABLE(AXIS_ID_ENABLE),
+    .AXIS_ID_WIDTH(AXIS_ID_WIDTH),
+    .AXIS_DEST_ENABLE(AXIS_DEST_ENABLE),
+    .AXIS_DEST_WIDTH(AXIS_DEST_WIDTH),
+    .AXIS_USER_ENABLE(AXIS_USER_ENABLE),
+    .AXIS_USER_WIDTH(AXIS_USER_WIDTH)
+)
+axi_vfifo_enc_inst (
+    .clk(s_axis_clk),
+    .rst(s_axis_rst),
+
+    /*
+     * AXI stream data input
+     */
+    .s_axis_tdata(s_axis_tdata),
+    .s_axis_tkeep(s_axis_tkeep),
+    .s_axis_tvalid(s_axis_tvalid),
+    .s_axis_tready(s_axis_tready),
+    .s_axis_tlast(s_axis_tlast),
+    .s_axis_tid(s_axis_tid),
+    .s_axis_tdest(s_axis_tdest),
+    .s_axis_tuser(s_axis_tuser),
+
+    /*
+     * Segmented data output (to virtual FIFO channel)
+     */
+    .fifo_rst_in(s_axis_rst_out),
+    .output_data(ch_input_data),
+    .output_valid(ch_input_valid),
+    .fifo_watermark_in(|ch_input_watermark)
+);
+
+generate
+
+genvar  n;
+
+for (n = 0; n < AXI_CH; n = n + 1) begin : axi_ch
+    
+    wire ch_clk = m_axi_clk[1*n +: 1];
+    wire ch_rst = m_axi_rst[1*n +: 1];
+
+    wire [AXI_ID_WIDTH-1:0]    ch_axi_awid;
+    wire [AXI_ADDR_WIDTH-1:0]  ch_axi_awaddr;
+    wire [7:0]                 ch_axi_awlen;
+    wire [2:0]                 ch_axi_awsize;
+    wire [1:0]                 ch_axi_awburst;
+    wire                       ch_axi_awlock;
+    wire [3:0]                 ch_axi_awcache;
+    wire [2:0]                 ch_axi_awprot;
+    wire                       ch_axi_awvalid;
+    wire                       ch_axi_awready;
+    wire [AXI_DATA_WIDTH-1:0]  ch_axi_wdata;
+    wire [AXI_STRB_WIDTH-1:0]  ch_axi_wstrb;
+    wire                       ch_axi_wlast;
+    wire                       ch_axi_wvalid;
+    wire                       ch_axi_wready;
+    wire [AXI_ID_WIDTH-1:0]    ch_axi_bid;
+    wire [1:0]                 ch_axi_bresp;
+    wire                       ch_axi_bvalid;
+    wire                       ch_axi_bready;
+    wire [AXI_ID_WIDTH-1:0]    ch_axi_arid;
+    wire [AXI_ADDR_WIDTH-1:0]  ch_axi_araddr;
+    wire [7:0]                 ch_axi_arlen;
+    wire [2:0]                 ch_axi_arsize;
+    wire [1:0]                 ch_axi_arburst;
+    wire                       ch_axi_arlock;
+    wire [3:0]                 ch_axi_arcache;
+    wire [2:0]                 ch_axi_arprot;
+    wire                       ch_axi_arvalid;
+    wire                       ch_axi_arready;
+    wire [AXI_ID_WIDTH-1:0]    ch_axi_rid;
+    wire [AXI_DATA_WIDTH-1:0]  ch_axi_rdata;
+    wire [1:0]                 ch_axi_rresp;
+    wire                       ch_axi_rlast;
+    wire                       ch_axi_rvalid;
+    wire                       ch_axi_rready;
+
+    assign m_axi_awid[AXI_ID_WIDTH*n +: AXI_ID_WIDTH] = ch_axi_awid;
+    assign m_axi_awaddr[AXI_ADDR_WIDTH*n +: AXI_ADDR_WIDTH] = ch_axi_awaddr;
+    assign m_axi_awlen[8*n +: 8] = ch_axi_awlen;
+    assign m_axi_awsize[3*n +: 3] = ch_axi_awsize;
+    assign m_axi_awburst[2*n +: 2] = ch_axi_awburst;
+    assign m_axi_awlock[1*n +: 1] = ch_axi_awlock;
+    assign m_axi_awcache[4*n +: 4] = ch_axi_awcache;
+    assign m_axi_awprot[3*n +: 3] = ch_axi_awprot;
+    assign m_axi_awvalid[1*n +: 1] = ch_axi_awvalid;
+    assign ch_axi_awready = m_axi_awready[1*n +: 1];
+    assign m_axi_wdata[AXI_DATA_WIDTH*n +: AXI_DATA_WIDTH] = ch_axi_wdata;
+    assign m_axi_wstrb[AXI_STRB_WIDTH*n +: AXI_STRB_WIDTH] = ch_axi_wstrb;
+    assign m_axi_wlast[1*n +: 1] = ch_axi_wlast;
+    assign m_axi_wvalid[1*n +: 1] = ch_axi_wvalid;
+    assign ch_axi_wready = m_axi_wready[1*n +: 1];
+    assign ch_axi_bid = m_axi_bid[AXI_ID_WIDTH*n +: AXI_ID_WIDTH];
+    assign ch_axi_bresp = m_axi_bresp[2*n +: 2];
+    assign ch_axi_bvalid = m_axi_bvalid[1*n +: 1];
+    assign m_axi_bready[1*n +: 1] = ch_axi_bready;
+    assign m_axi_arid[AXI_ID_WIDTH*n +: AXI_ID_WIDTH] = ch_axi_arid;
+    assign m_axi_araddr[AXI_ADDR_WIDTH*n +: AXI_ADDR_WIDTH] = ch_axi_araddr;
+    assign m_axi_arlen[8*n +: 8] = ch_axi_arlen;
+    assign m_axi_arsize[3*n +: 3] = ch_axi_arsize;
+    assign m_axi_arburst[2*n +: 2] = ch_axi_arburst;
+    assign m_axi_arlock[1*n +: 1] = ch_axi_arlock;
+    assign m_axi_arcache[4*n +: 4] = ch_axi_arcache;
+    assign m_axi_arprot[3*n +: 3] = ch_axi_arprot;
+    assign m_axi_arvalid[1*n +: 1] = ch_axi_arvalid;
+    assign ch_axi_arready = m_axi_arready[1*n +: 1];
+    assign ch_axi_rid = m_axi_rid[AXI_ID_WIDTH*n +: AXI_ID_WIDTH];
+    assign ch_axi_rdata = m_axi_rdata[AXI_DATA_WIDTH*n +: AXI_DATA_WIDTH];
+    assign ch_axi_rresp = m_axi_rresp[2*n +: 2];
+    assign ch_axi_rlast = m_axi_rlast[1*n +: 1];
+    assign ch_axi_rvalid = m_axi_rvalid[1*n +: 1];
+    assign m_axi_rready[1*n +: 1] = ch_axi_rready;
+
+    // control sync
+    (* shreg_extract = "no" *)
+    reg ch_cfg_enable_sync_1_reg = 1'b0,  ch_cfg_enable_sync_2_reg = 1'b0;
+
+    always @(posedge ch_clk) begin
+        ch_cfg_enable_sync_1_reg <= cfg_enable_reg;
+        ch_cfg_enable_sync_2_reg <= ch_cfg_enable_sync_1_reg;
+    end
+
+    // status sync
+    wire [LEN_WIDTH+1-1:0] ch_sts_fifo_occupancy;
+    reg [LEN_WIDTH+1-1:0] ch_sts_fifo_occupancy_reg;
+
+    (* shreg_extract = "no" *)
+    reg ch_sts_flag_sync_1_reg = 1'b0,  ch_sts_flag_sync_2_reg = 1'b0,  ch_sts_flag_sync_3_reg = 1'b0;
+
+    assign sts_fifo_occupancy_int[(LEN_WIDTH+1)*n +: LEN_WIDTH+1] = ch_sts_fifo_occupancy_reg;
+
+    always @(posedge ch_clk) begin
+        ch_sts_flag_sync_1_reg <= sts_sync_flag_reg;
+        ch_sts_flag_sync_2_reg <= ch_sts_flag_sync_1_reg;
+        ch_sts_flag_sync_3_reg <= ch_sts_flag_sync_2_reg;
+
+        if (ch_sts_flag_sync_3_reg ^ ch_sts_flag_sync_2_reg) begin
+            ch_sts_fifo_occupancy_reg <= ch_sts_fifo_occupancy;
+        end
+    end
+
+    axi_vfifo_raw #(
+        .SEG_WIDTH(SEG_WIDTH),
+        .SEG_CNT(CH_SEG_CNT),
+        .AXI_DATA_WIDTH(AXI_DATA_WIDTH),
+        .AXI_ADDR_WIDTH(AXI_ADDR_WIDTH),
+        .AXI_STRB_WIDTH(AXI_STRB_WIDTH),
+        .AXI_ID_WIDTH(AXI_ID_WIDTH),
+        .AXI_MAX_BURST_LEN(AXI_MAX_BURST_LEN),
+        .LEN_WIDTH(LEN_WIDTH),
+        .WRITE_FIFO_DEPTH(WRITE_FIFO_DEPTH),
+        .WRITE_MAX_BURST_LEN(WRITE_MAX_BURST_LEN),
+        .READ_FIFO_DEPTH(READ_FIFO_DEPTH),
+        .READ_MAX_BURST_LEN(READ_MAX_BURST_LEN),
+        .WATERMARK_LEVEL(WRITE_FIFO_DEPTH-4),
+        .CTRL_OUT_EN(1)
+    )
+    axi_vfifo_raw_inst (
+        .clk(ch_clk),
+        .rst(ch_rst),
+
+        /*
+         * Segmented data input (from encode logic)
+         */
+        .input_clk(s_axis_clk),
+        .input_rst(s_axis_rst),
+        .input_rst_out(ch_input_rst_out[n]),
+        .input_watermark(ch_input_watermark[n]),
+        .input_data(ch_input_data[SEG_WIDTH*CH_SEG_CNT*n +: SEG_WIDTH*CH_SEG_CNT]),
+        .input_valid(ch_input_valid[CH_SEG_CNT*n +: CH_SEG_CNT]),
+        .input_ready(ch_input_ready[CH_SEG_CNT*n +: CH_SEG_CNT]),
+
+        /*
+         * Segmented data output (to decode logic)
+         */
+        .output_clk(m_axis_clk),
+        .output_rst(m_axis_rst),
+        .output_rst_out(ch_output_rst_out[n]),
+        .output_data(ch_output_data[SEG_WIDTH*CH_SEG_CNT*n +: SEG_WIDTH*CH_SEG_CNT]),
+        .output_valid(ch_output_valid[CH_SEG_CNT*n +: CH_SEG_CNT]),
+        .output_ready(ch_output_ready[CH_SEG_CNT*n +: CH_SEG_CNT]),
+        .output_ctrl_data(ch_output_ctrl_data[SEG_WIDTH*CH_SEG_CNT*n +: SEG_WIDTH*CH_SEG_CNT]),
+        .output_ctrl_valid(ch_output_ctrl_valid[CH_SEG_CNT*n +: CH_SEG_CNT]),
+        .output_ctrl_ready(ch_output_ctrl_ready[CH_SEG_CNT*n +: CH_SEG_CNT]),
+
+        /*
+         * AXI master interface
+         */
+        .m_axi_awid(ch_axi_awid),
+        .m_axi_awaddr(ch_axi_awaddr),
+        .m_axi_awlen(ch_axi_awlen),
+        .m_axi_awsize(ch_axi_awsize),
+        .m_axi_awburst(ch_axi_awburst),
+        .m_axi_awlock(ch_axi_awlock),
+        .m_axi_awcache(ch_axi_awcache),
+        .m_axi_awprot(ch_axi_awprot),
+        .m_axi_awvalid(ch_axi_awvalid),
+        .m_axi_awready(ch_axi_awready),
+        .m_axi_wdata(ch_axi_wdata),
+        .m_axi_wstrb(ch_axi_wstrb),
+        .m_axi_wlast(ch_axi_wlast),
+        .m_axi_wvalid(ch_axi_wvalid),
+        .m_axi_wready(ch_axi_wready),
+        .m_axi_bid(ch_axi_bid),
+        .m_axi_bresp(ch_axi_bresp),
+        .m_axi_bvalid(ch_axi_bvalid),
+        .m_axi_bready(ch_axi_bready),
+        .m_axi_arid(ch_axi_arid),
+        .m_axi_araddr(ch_axi_araddr),
+        .m_axi_arlen(ch_axi_arlen),
+        .m_axi_arsize(ch_axi_arsize),
+        .m_axi_arburst(ch_axi_arburst),
+        .m_axi_arlock(ch_axi_arlock),
+        .m_axi_arcache(ch_axi_arcache),
+        .m_axi_arprot(ch_axi_arprot),
+        .m_axi_arvalid(ch_axi_arvalid),
+        .m_axi_arready(ch_axi_arready),
+        .m_axi_rid(ch_axi_rid),
+        .m_axi_rdata(ch_axi_rdata),
+        .m_axi_rresp(ch_axi_rresp),
+        .m_axi_rlast(ch_axi_rlast),
+        .m_axi_rvalid(ch_axi_rvalid),
+        .m_axi_rready(ch_axi_rready),
+
+        /*
+         * Reset sync
+         */
+        .rst_req_out(ch_rst_req[n]),
+        .rst_req_in(|ch_rst_req),
+
+        /*
+         * Configuration
+         */
+        .cfg_fifo_base_addr(cfg_fifo_base_addr_reg[AXI_ADDR_WIDTH*n +: AXI_ADDR_WIDTH]),
+        .cfg_fifo_size_mask(cfg_fifo_size_mask_reg),
+        .cfg_enable(ch_cfg_enable_sync_2_reg),
+        .cfg_reset(cfg_reset_reg),
+
+        /*
+         * Status
+         */
+        .sts_fifo_occupancy(ch_sts_fifo_occupancy),
+        .sts_fifo_empty(sts_fifo_empty_int[n]),
+        .sts_fifo_full(sts_fifo_full_int[n]),
+        .sts_reset(sts_reset_int[n]),
+        .sts_active(sts_active_int[n]),
+        .sts_write_active(),
+        .sts_read_active()
+    );
+
+end
+
+endgenerate
+
+assign m_axis_rst_out = |ch_output_rst_out;
+
+axi_vfifo_dec #(
+    .SEG_WIDTH(SEG_WIDTH),
+    .SEG_CNT(SEG_CNT),
+    .AXIS_DATA_WIDTH(AXIS_DATA_WIDTH),
+    .AXIS_KEEP_ENABLE(AXIS_KEEP_ENABLE),
+    .AXIS_KEEP_WIDTH(AXIS_KEEP_WIDTH),
+    .AXIS_LAST_ENABLE(AXIS_LAST_ENABLE),
+    .AXIS_ID_ENABLE(AXIS_ID_ENABLE),
+    .AXIS_ID_WIDTH(AXIS_ID_WIDTH),
+    .AXIS_DEST_ENABLE(AXIS_DEST_ENABLE),
+    .AXIS_DEST_WIDTH(AXIS_DEST_WIDTH),
+    .AXIS_USER_ENABLE(AXIS_USER_ENABLE),
+    .AXIS_USER_WIDTH(AXIS_USER_WIDTH)
+)
+axi_vfifo_dec_inst (
+    .clk(m_axis_clk),
+    .rst(m_axis_rst),
+
+    /*
+     * Segmented data input (from virtual FIFO channel)
+     */
+    .fifo_rst_in(m_axis_rst_out),
+    .input_data(ch_output_data),
+    .input_valid(ch_output_valid),
+    .input_ready(ch_output_ready),
+    .input_ctrl_data(ch_output_ctrl_data),
+    .input_ctrl_valid(ch_output_ctrl_valid),
+    .input_ctrl_ready(ch_output_ctrl_ready),
+
+    /*
+     * AXI stream data output
+     */
+    .m_axis_tdata(m_axis_tdata),
+    .m_axis_tkeep(m_axis_tkeep),
+    .m_axis_tvalid(m_axis_tvalid),
+    .m_axis_tready(m_axis_tready),
+    .m_axis_tlast(m_axis_tlast),
+    .m_axis_tid(m_axis_tid),
+    .m_axis_tdest(m_axis_tdest),
+    .m_axis_tuser(m_axis_tuser),
+
+    /*
+     * Status
+     */
+    .sts_hdr_parity_err(sts_hdr_parity_err_int)
+);
+
+endmodule
+
+`resetall
diff --git a/rtl/axi_vfifo_dec.v b/rtl/axi_vfifo_dec.v
new file mode 100644
index 0000000..e6c3ef7
--- /dev/null
+++ b/rtl/axi_vfifo_dec.v
@@ -0,0 +1,720 @@
+/*
+
+Copyright (c) 2023 Alex Forencich
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+*/
+
+// Language: Verilog 2001
+
+`resetall
+`timescale 1ns / 1ps
+`default_nettype none
+
+/*
+ * AXI4 virtual FIFO (decoder)
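+ *
+ * Parses the block headers read back from the virtual FIFO channels, strips them,
+ * and reassembles the framed data into an AXI stream output.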
+ */
+module axi_vfifo_dec #
+(
+    // Width of input segment
+    parameter SEG_WIDTH = 32,
+    // Segment count
+    parameter SEG_CNT = 2,
+    // Width of AXI stream interfaces in bits
+    parameter AXIS_DATA_WIDTH = SEG_WIDTH*SEG_CNT/2,
+    // Use AXI stream tkeep signal
+    parameter AXIS_KEEP_ENABLE = (AXIS_DATA_WIDTH>8),
+    // AXI stream tkeep signal width (words per cycle)
+    parameter AXIS_KEEP_WIDTH = (AXIS_DATA_WIDTH/8),
+    // Use AXI stream tlast signal
+    parameter AXIS_LAST_ENABLE = 1,
+    // Propagate AXI stream tid signal
+    parameter AXIS_ID_ENABLE = 0,
+    // AXI stream tid signal width
+    parameter AXIS_ID_WIDTH = 8,
+    // Propagate AXI stream tdest signal
+    parameter AXIS_DEST_ENABLE = 0,
+    // AXI stream tdest signal width
+    parameter AXIS_DEST_WIDTH = 8,
+    // Propagate AXI stream tuser signal
+    parameter AXIS_USER_ENABLE = 1,
+    // AXI stream tuser signal width
+    parameter AXIS_USER_WIDTH = 1
+)
+(
+    input  wire                          clk,
+    input  wire                          rst,
+
+    /*
+     * Segmented data input (from virtual FIFO channel)
+     */
+    input  wire                          fifo_rst_in,
+    input  wire [SEG_CNT*SEG_WIDTH-1:0]  input_data,
+    input  wire [SEG_CNT-1:0]            input_valid,
+    output wire [SEG_CNT-1:0]            input_ready,
+    input  wire [SEG_CNT*SEG_WIDTH-1:0]  input_ctrl_data,
+    input  wire [SEG_CNT-1:0]            input_ctrl_valid,
+    output wire [SEG_CNT-1:0]            input_ctrl_ready,
+
+    /*
+     * AXI stream data output
+     */
+    output wire [AXIS_DATA_WIDTH-1:0]    m_axis_tdata,
+    output wire [AXIS_KEEP_WIDTH-1:0]    m_axis_tkeep,
+    output wire                          m_axis_tvalid,
+    input  wire                          m_axis_tready,
+    output wire                          m_axis_tlast,
+    output wire [AXIS_ID_WIDTH-1:0]      m_axis_tid,
+    output wire [AXIS_DEST_WIDTH-1:0]    m_axis_tdest,
+    output wire [AXIS_USER_WIDTH-1:0]    m_axis_tuser,
+
+    /*
+     * Status
+     */
+    output wire                          sts_hdr_parity_err
+);
+
+parameter AXIS_KEEP_WIDTH_INT = AXIS_KEEP_ENABLE ? AXIS_KEEP_WIDTH : 1;
+parameter AXIS_BYTE_LANES = AXIS_KEEP_WIDTH_INT;
+parameter AXIS_BYTE_SIZE = AXIS_DATA_WIDTH/AXIS_BYTE_LANES;
+parameter AXIS_BYTE_IDX_WIDTH = $clog2(AXIS_BYTE_LANES);
+
+parameter BYTE_SIZE = AXIS_BYTE_SIZE;
+
+parameter SEG_BYTE_LANES = SEG_WIDTH / BYTE_SIZE;
+
+parameter EXPAND_INPUT = SEG_CNT < 2;
+
+parameter SEG_CNT_INT = EXPAND_INPUT ? SEG_CNT*2 : SEG_CNT;
+
+parameter SEG_IDX_WIDTH = $clog2(SEG_CNT_INT);
+parameter SEG_BYTE_IDX_WIDTH = $clog2(SEG_BYTE_LANES);
+
+parameter AXIS_SEG_CNT = (AXIS_DATA_WIDTH + SEG_WIDTH-1) / SEG_WIDTH;
+parameter AXIS_SEG_IDX_WIDTH = AXIS_SEG_CNT > 1 ? $clog2(AXIS_SEG_CNT) : 1;
+parameter AXIS_LEN_MASK = AXIS_BYTE_LANES-1;
+
+parameter OUT_OFFS_WIDTH = AXIS_SEG_IDX_WIDTH;
+
+parameter META_ID_OFFSET = 0;
+parameter META_DEST_OFFSET = META_ID_OFFSET + (AXIS_ID_ENABLE ? AXIS_ID_WIDTH : 0);
+parameter META_USER_OFFSET = META_DEST_OFFSET + (AXIS_DEST_ENABLE ? AXIS_DEST_WIDTH : 0);
+parameter META_WIDTH = META_USER_OFFSET + (AXIS_USER_ENABLE ? AXIS_USER_WIDTH : 0);
+parameter HDR_SIZE = (16 + META_WIDTH + BYTE_SIZE-1) / BYTE_SIZE;
+parameter HDR_WIDTH = HDR_SIZE * BYTE_SIZE;
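+// With 8-bit bytes and only tuser enabled (AXIS_USER_WIDTH=1), META_WIDTH = 1,
+// so HDR_SIZE = (16 + 1 + 7) / 8 = 3 bytes and HDR_WIDTH = 24 bits.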
+
+parameter HDR_LEN_WIDTH = 12;
+parameter HDR_SEG_LEN_WIDTH = HDR_LEN_WIDTH-SEG_BYTE_IDX_WIDTH;
+
+parameter CTRL_FIFO_ADDR_WIDTH = 5;
+parameter OUTPUT_FIFO_ADDR_WIDTH = 5;
+
+parameter CTRL_FIFO_PTR_WIDTH = CTRL_FIFO_ADDR_WIDTH + SEG_IDX_WIDTH;
+
+// validate parameters
+initial begin
+    if (AXIS_BYTE_SIZE * AXIS_KEEP_WIDTH_INT != AXIS_DATA_WIDTH) begin
+        $error("Error: AXI stream data width not evenly divisible (instance %m)");
+        $finish;
+    end
+
+    if (AXIS_SEG_CNT * SEG_WIDTH != AXIS_DATA_WIDTH) begin
+        $error("Error: AXI stream data width not evenly divisible into segments (instance %m)");
+        $finish;
+    end
+
+    if (SEG_WIDTH < HDR_WIDTH) begin
+        $error("Error: Segment smaller than header (instance %m)");
+        $finish;
+    end
+end
+
+reg frame_reg = 1'b0, frame_next, frame_cyc;
+reg last_reg = 1'b0, last_next, last_cyc;
+reg extra_cycle_reg = 1'b0, extra_cycle_next, extra_cycle_cyc;
+reg last_straddle_reg = 1'b0, last_straddle_next, last_straddle_cyc;
+reg [HDR_SEG_LEN_WIDTH-1:0] seg_cnt_reg = 0, seg_cnt_next, seg_cnt_cyc;
+reg hdr_parity_err_reg = 1'b0, hdr_parity_err_next, hdr_parity_err_cyc;
+
+reg out_frame_reg = 1'b0, out_frame_next, out_frame_cyc;
+reg [SEG_IDX_WIDTH-1:0] out_seg_offset_reg = 0, out_seg_offset_next, out_seg_offset_cyc;
+reg [OUT_OFFS_WIDTH-1:0] output_offset_reg = 0, output_offset_next, output_offset_cyc;
+reg [SEG_CNT_INT-1:0] out_seg_consumed;
+reg [SEG_CNT_INT-1:0] out_seg_consumed_reg = 0, out_seg_consumed_next;
+reg out_valid, out_valid_straddle, out_frame, out_last, out_abort, out_done;
+
+reg [SEG_CNT_INT-1:0] seg_valid;
+reg [SEG_CNT_INT-1:0] seg_valid_straddle;
+reg [SEG_CNT_INT-1:0] seg_hdr_start_pkt;
+reg [SEG_CNT_INT-1:0] seg_hdr_last;
+reg [SEG_CNT_INT-1:0] seg_hdr_last_straddle;
+reg [SEG_CNT_INT-1:0] seg_hdr_parity_err;
+reg [HDR_LEN_WIDTH-1:0] seg_hdr_len[SEG_CNT_INT-1:0];
+reg [HDR_SEG_LEN_WIDTH-1:0] seg_hdr_seg_cnt[SEG_CNT_INT-1:0];
+
+reg [SEG_CNT_INT-1:0] shift_out_seg_valid;
+reg [SEG_CNT_INT-1:0] shift_out_seg_valid_straddle;
+reg [SEG_CNT_INT-1:0] shift_out_seg_sop;
+reg [SEG_CNT_INT-1:0] shift_out_seg_eop;
+reg [SEG_CNT_INT-1:0] shift_out_seg_end;
+reg [SEG_CNT_INT-1:0] shift_out_seg_last;
+
+reg [SEG_CNT-1:0] input_ready_cmb;
+reg [SEG_CNT-1:0] input_ctrl_ready_cmb;
+
+reg [SEG_CNT*SEG_WIDTH-1:0] input_data_int_reg = 0, input_data_int_next;
+reg [SEG_CNT-1:0] input_valid_int_reg = 0, input_valid_int_next;
+
+wire [SEG_CNT_INT*SEG_WIDTH*2-1:0] input_data_full = EXPAND_INPUT ? {2{{input_data, input_data_int_reg}}} : {2{input_data}};
+wire [SEG_CNT_INT-1:0] input_valid_full = EXPAND_INPUT ? {input_valid, input_valid_int_reg} : input_valid;
+
+reg out_ctrl_en_reg = 0, out_ctrl_en_next;
+reg out_ctrl_hdr_reg = 0, out_ctrl_hdr_next;
+reg out_ctrl_last_reg = 0, out_ctrl_last_next;
+reg [AXIS_BYTE_IDX_WIDTH-1:0] out_ctrl_last_len_reg = 0, out_ctrl_last_len_next;
+reg [SEG_IDX_WIDTH-1:0] out_ctrl_seg_offset_reg = 0, out_ctrl_seg_offset_next;
+
+reg [AXIS_ID_WIDTH-1:0] axis_tid_reg = 0, axis_tid_next;
+reg [AXIS_DEST_WIDTH-1:0] axis_tdest_reg = 0, axis_tdest_next;
+reg [AXIS_USER_WIDTH-1:0] axis_tuser_reg = 0, axis_tuser_next;
+
+// internal datapath
+reg  [AXIS_DATA_WIDTH-1:0] m_axis_tdata_int;
+reg  [AXIS_KEEP_WIDTH-1:0] m_axis_tkeep_int;
+reg                        m_axis_tvalid_int;
+wire                       m_axis_tready_int;
+reg                        m_axis_tlast_int;
+reg  [AXIS_ID_WIDTH-1:0]   m_axis_tid_int;
+reg  [AXIS_DEST_WIDTH-1:0] m_axis_tdest_int;
+reg  [AXIS_USER_WIDTH-1:0] m_axis_tuser_int;
+
+assign input_ready = input_ready_cmb;
+assign input_ctrl_ready = input_ctrl_ready_cmb;
+
+assign sts_hdr_parity_err = hdr_parity_err_reg;
+
+// segmented control FIFO
+reg [CTRL_FIFO_PTR_WIDTH+1-1:0] ctrl_fifo_wr_ptr_reg = 0, ctrl_fifo_wr_ptr_next;
+reg [CTRL_FIFO_PTR_WIDTH+1-1:0] ctrl_fifo_rd_ptr_reg = 0, ctrl_fifo_rd_ptr_next;
+
+reg [SEG_CNT-1:0] ctrl_mem_rd_data_valid_reg = 0, ctrl_mem_rd_data_valid_next;
+
+reg [SEG_CNT-1:0] ctrl_fifo_wr_sop;
+reg [SEG_CNT-1:0] ctrl_fifo_wr_eop;
+reg [SEG_CNT-1:0] ctrl_fifo_wr_end;
+reg [SEG_CNT-1:0] ctrl_fifo_wr_last;
+reg [SEG_CNT*AXIS_BYTE_IDX_WIDTH-1:0] ctrl_fifo_wr_last_len;
+reg [SEG_CNT-1:0] ctrl_fifo_wr_en;
+
+wire [SEG_CNT-1:0] ctrl_fifo_rd_sop;
+wire [SEG_CNT-1:0] ctrl_fifo_rd_eop;
+wire [SEG_CNT-1:0] ctrl_fifo_rd_end;
+wire [SEG_CNT-1:0] ctrl_fifo_rd_last;
+wire [SEG_CNT*AXIS_BYTE_IDX_WIDTH-1:0] ctrl_fifo_rd_last_len;
+wire [SEG_CNT-1:0] ctrl_fifo_rd_valid;
+reg [SEG_CNT-1:0] ctrl_fifo_rd_en;
+
+wire [SEG_CNT-1:0] ctrl_fifo_seg_full;
+wire [SEG_CNT-1:0] ctrl_fifo_seg_half_full;
+wire [SEG_CNT-1:0] ctrl_fifo_seg_empty;
+
+wire ctrl_fifo_full = |ctrl_fifo_seg_full;
+wire ctrl_fifo_half_full = |ctrl_fifo_seg_half_full;
+wire ctrl_fifo_empty = |ctrl_fifo_seg_empty;
+
+generate
+
+genvar n;
+
+for (n = 0; n < SEG_CNT; n = n + 1) begin : ctrl_fifo_seg
+
+    reg [CTRL_FIFO_ADDR_WIDTH+1-1:0] seg_wr_ptr_reg = 0;
+    reg [CTRL_FIFO_ADDR_WIDTH+1-1:0] seg_rd_ptr_reg = 0;
+
+    (* ram_style = "distributed", ramstyle = "no_rw_check, mlab" *)
+    reg seg_mem_sop[2**CTRL_FIFO_ADDR_WIDTH-1:0];
+    (* ram_style = "distributed", ramstyle = "no_rw_check, mlab" *)
+    reg seg_mem_eop[2**CTRL_FIFO_ADDR_WIDTH-1:0];
+    (* ram_style = "distributed", ramstyle = "no_rw_check, mlab" *)
+    reg seg_mem_end[2**CTRL_FIFO_ADDR_WIDTH-1:0];
+    (* ram_style = "distributed", ramstyle = "no_rw_check, mlab" *)
+    reg seg_mem_last[2**CTRL_FIFO_ADDR_WIDTH-1:0];
+    (* ram_style = "distributed", ramstyle = "no_rw_check, mlab" *)
+    reg [AXIS_BYTE_IDX_WIDTH-1:0] seg_mem_last_len[2**CTRL_FIFO_ADDR_WIDTH-1:0];
+
+    reg seg_rd_sop_reg = 0;
+    reg seg_rd_eop_reg = 0;
+    reg seg_rd_end_reg = 0;
+    reg seg_rd_last_reg = 0;
+    reg [AXIS_BYTE_IDX_WIDTH-1:0] seg_rd_last_len_reg = 0;
+    reg seg_rd_valid_reg = 0;
+
+    reg seg_half_full_reg = 1'b0;
+
+    assign ctrl_fifo_rd_sop[n] = seg_rd_sop_reg;
+    assign ctrl_fifo_rd_eop[n] = seg_rd_eop_reg;
+    assign ctrl_fifo_rd_end[n] = seg_rd_end_reg;
+    assign ctrl_fifo_rd_last[n] = seg_rd_last_reg;
+    assign ctrl_fifo_rd_last_len[AXIS_BYTE_IDX_WIDTH*n +: AXIS_BYTE_IDX_WIDTH] = seg_rd_last_len_reg;
+    assign ctrl_fifo_rd_valid[n] = seg_rd_valid_reg;
+
+    wire seg_full = seg_wr_ptr_reg == (seg_rd_ptr_reg ^ {1'b1, {CTRL_FIFO_ADDR_WIDTH{1'b0}}});
+    wire seg_empty = seg_wr_ptr_reg == seg_rd_ptr_reg;
+
+    assign ctrl_fifo_seg_full[n] = seg_full;
+    assign ctrl_fifo_seg_half_full[n] = seg_half_full_reg;
+    assign ctrl_fifo_seg_empty[n] = seg_empty;
+
+    always @(posedge clk) begin
+        seg_rd_valid_reg <= seg_rd_valid_reg && !ctrl_fifo_rd_en[n];
+
+        seg_half_full_reg <= $unsigned(seg_wr_ptr_reg - seg_rd_ptr_reg) >= 2**(CTRL_FIFO_ADDR_WIDTH-1);
+
+        if (ctrl_fifo_wr_en[n]) begin
+            seg_mem_sop[seg_wr_ptr_reg[CTRL_FIFO_ADDR_WIDTH-1:0]] <= ctrl_fifo_wr_sop[n];
+            seg_mem_eop[seg_wr_ptr_reg[CTRL_FIFO_ADDR_WIDTH-1:0]] <= ctrl_fifo_wr_eop[n];
+            seg_mem_end[seg_wr_ptr_reg[CTRL_FIFO_ADDR_WIDTH-1:0]] <= ctrl_fifo_wr_end[n];
+            seg_mem_last[seg_wr_ptr_reg[CTRL_FIFO_ADDR_WIDTH-1:0]] <= ctrl_fifo_wr_last[n];
+            seg_mem_last_len[seg_wr_ptr_reg[CTRL_FIFO_ADDR_WIDTH-1:0]] <= ctrl_fifo_wr_last_len[AXIS_BYTE_IDX_WIDTH*n +: AXIS_BYTE_IDX_WIDTH];
+
+            seg_wr_ptr_reg <= seg_wr_ptr_reg + 1;
+        end
+
+        if (!seg_empty && (!seg_rd_valid_reg || ctrl_fifo_rd_en[n])) begin
+            seg_rd_sop_reg <= seg_mem_sop[seg_rd_ptr_reg[CTRL_FIFO_ADDR_WIDTH-1:0]];
+            seg_rd_eop_reg <= seg_mem_eop[seg_rd_ptr_reg[CTRL_FIFO_ADDR_WIDTH-1:0]];
+            seg_rd_end_reg <= seg_mem_end[seg_rd_ptr_reg[CTRL_FIFO_ADDR_WIDTH-1:0]];
+            seg_rd_last_reg <= seg_mem_last[seg_rd_ptr_reg[CTRL_FIFO_ADDR_WIDTH-1:0]];
+            seg_rd_last_len_reg <= seg_mem_last_len[seg_rd_ptr_reg[CTRL_FIFO_ADDR_WIDTH-1:0]];
+            seg_rd_valid_reg <= 1'b1;
+
+            seg_rd_ptr_reg <= seg_rd_ptr_reg + 1;
+        end
+
+        if (rst || fifo_rst_in) begin
+            seg_wr_ptr_reg <= 0;
+            seg_rd_ptr_reg <= 0;
+            seg_rd_valid_reg <= 1'b0;
+        end
+    end
+
+end
+
+endgenerate
+
+// parse segment headers
+integer seg;
+
+always @* begin
+    input_ctrl_ready_cmb = 0;
+
+    frame_next = frame_reg;
+    frame_cyc = frame_reg;
+    last_next = last_reg;
+    last_cyc = last_reg;
+    extra_cycle_next = extra_cycle_reg;
+    extra_cycle_cyc = extra_cycle_reg;
+    last_straddle_next = last_straddle_reg;
+    last_straddle_cyc = last_straddle_reg;
+    seg_cnt_next = seg_cnt_reg;
+    seg_cnt_cyc = seg_cnt_reg;
+    hdr_parity_err_next = 1'b0;
+    hdr_parity_err_cyc = 1'b0;
+
+    ctrl_fifo_wr_sop = 0;
+    ctrl_fifo_wr_eop = 0;
+    ctrl_fifo_wr_end = 0;
+    ctrl_fifo_wr_last = 0;
+    ctrl_fifo_wr_last_len = 0;
+    ctrl_fifo_wr_en = 0;
+
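+    // block header layout (low-order bits of the segment at each block start, as
+    // extracted below):
+    //   bit 0       start-of-packet flag
+    //   bit 1       last (end-of-frame) flag
+    //   bits 3:2    parity bits (checked over bits 2:0 and 15:3)
+    //   bits 15:4   block length in bytes
+    //   bits 16+    packed tid/tdest/tuser metadata (see META_*_OFFSET)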
+    // decode segment headers
+    for (seg = 0; seg < SEG_CNT; seg = seg + 1) begin
+        seg_valid[seg] = input_ctrl_valid[seg];
+        seg_hdr_start_pkt[seg] = input_ctrl_data[SEG_WIDTH*seg + 0 +: 1];
+        seg_hdr_last[seg] = input_ctrl_data[SEG_WIDTH*seg + 1 +: 1];
+        seg_hdr_len[seg] = input_ctrl_data[SEG_WIDTH*seg + 4 +: 12];
+        seg_hdr_seg_cnt[seg] = (seg_hdr_len[seg] + SEG_BYTE_LANES) >> SEG_BYTE_IDX_WIDTH;
+        seg_hdr_last_straddle[seg] = ((seg_hdr_len[seg] & (SEG_BYTE_LANES-1)) + HDR_SIZE) >> SEG_BYTE_IDX_WIDTH != 0;
+        seg_hdr_parity_err[seg] = ^input_ctrl_data[SEG_WIDTH*seg + 0 +: 3] || ^input_ctrl_data[SEG_WIDTH*seg + 3 +: 13];
+    end
+    seg_valid_straddle = {2{seg_valid}} >> 1;
+
+    for (seg = 0; seg < SEG_CNT; seg = seg + 1) begin
+        if (!frame_cyc) begin
+            if (seg_valid[seg]) begin
+                if (seg_hdr_start_pkt[seg]) begin
+                    // start of frame
+                    last_cyc = seg_hdr_last[seg];
+                    extra_cycle_cyc = 1'b0;
+                    last_straddle_cyc = seg_hdr_last_straddle[seg];
+                    seg_cnt_cyc = seg_hdr_seg_cnt[seg];
+
+                    ctrl_fifo_wr_sop[seg] = 1'b1;
+                    ctrl_fifo_wr_last_len[AXIS_BYTE_IDX_WIDTH*seg +: AXIS_BYTE_IDX_WIDTH] = seg_hdr_len[seg];
+
+                    frame_cyc = 1'b1;
+                end else  begin
+                    // consume null segment
+                end
+
+                if (seg_hdr_parity_err[seg]) begin
+                    hdr_parity_err_cyc = 1'b1;
+                end
+            end
+        end
+
+        if (frame_cyc) begin
+            if (extra_cycle_cyc) begin
+                // extra cycle
+                frame_cyc = 0;
+                extra_cycle_cyc = 0;
+
+                ctrl_fifo_wr_eop[seg] = 1'b1;
+            end else if (seg_cnt_cyc == 1) begin
+                // last output cycle
+                if (last_cyc) begin
+                    ctrl_fifo_wr_last[seg] = 1'b1;
+                end
+
+                if (last_straddle_cyc) begin
+                    // last output cycle, with segment straddle
+                    extra_cycle_cyc = 1'b1;
+
+                    ctrl_fifo_wr_end[seg] = 1'b1;
+                end else begin
+                    // last output cycle, no segment straddle
+                    frame_cyc = 0;
+
+                    ctrl_fifo_wr_eop[seg] = 1'b1;
+                    ctrl_fifo_wr_end[seg] = 1'b1;
+                end
+            end else begin
+                // middle cycle
+            end
+        end
+
+        seg_cnt_cyc = seg_cnt_cyc - 1;
+    end
+
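+    // commit a full row of headers only when every control segment is valid and the
+    // control FIFO is below half full, so per-cycle parse state updates stay atomic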
+    if (&seg_valid && !ctrl_fifo_half_full) begin
+        input_ctrl_ready_cmb = {SEG_CNT{1'b1}};
+
+        ctrl_fifo_wr_en = {SEG_CNT{1'b1}};
+
+        frame_next = frame_cyc;
+        last_next = last_cyc;
+        extra_cycle_next = extra_cycle_cyc;
+        last_straddle_next = last_straddle_cyc;
+        seg_cnt_next = seg_cnt_cyc;
+        hdr_parity_err_next = hdr_parity_err_cyc;
+    end
+end
+
+// re-pack data
+integer out_seg;
+reg [SEG_IDX_WIDTH-1:0] out_cur_seg;
+
+always @* begin
+    input_ready_cmb = 0;
+
+    out_frame_next = out_frame_reg;
+    out_frame_cyc = out_frame_reg;
+    out_seg_offset_next = out_seg_offset_reg;
+    out_seg_offset_cyc = out_seg_offset_reg;
+    output_offset_next = output_offset_reg;
+    // output_offset_cyc = output_offset_reg;
+    output_offset_cyc = 0;
+    out_seg_consumed_next = 0;
+
+
+    out_ctrl_en_next = 0;
+    out_ctrl_hdr_next = 0;
+    out_ctrl_last_next = 0;
+    out_ctrl_last_len_next = out_ctrl_last_len_reg;
+    out_ctrl_seg_offset_next = out_ctrl_seg_offset_reg;
+
+    axis_tid_next = axis_tid_reg;
+    axis_tdest_next = axis_tdest_reg;
+    axis_tuser_next = axis_tuser_reg;
+
+    input_data_int_next = input_data_int_reg;
+    input_valid_int_next = input_valid_int_reg;
+
+    ctrl_fifo_rd_en = 0;
+
+    // apply segment offset
+    shift_out_seg_valid = {2{ctrl_fifo_rd_valid}} >> out_seg_offset_reg;
+    shift_out_seg_valid_straddle = {2{ctrl_fifo_rd_valid}} >> (out_seg_offset_reg+1);
+    shift_out_seg_valid_straddle[SEG_CNT-1] = 1'b0; // wrapped, so cannot be consumed
+    shift_out_seg_sop = {2{ctrl_fifo_rd_sop}} >> out_seg_offset_reg;
+    shift_out_seg_eop = {2{ctrl_fifo_rd_eop}} >> out_seg_offset_reg;
+    shift_out_seg_end = {2{ctrl_fifo_rd_end}} >> out_seg_offset_reg;
+    shift_out_seg_last = {2{ctrl_fifo_rd_last}} >> out_seg_offset_reg;
+
+    // extract data
+    out_valid = 0;
+    out_valid_straddle = 0;
+    out_frame = out_frame_cyc;
+    out_abort = 0;
+    out_done = 0;
+    out_seg_consumed = 0;
+
+    out_ctrl_seg_offset_next = out_seg_offset_reg;
+
+    out_cur_seg = out_seg_offset_reg;
+    for (out_seg = 0; out_seg < SEG_CNT; out_seg = out_seg + 1) begin
+        out_seg_offset_cyc = out_seg_offset_cyc + 1;
+
+        // check for contiguous valid segments
+        out_valid = (~shift_out_seg_valid & ({SEG_CNT{1'b1}} >> (SEG_CNT-1 - out_seg))) == 0;
+        out_valid_straddle = shift_out_seg_valid_straddle[0];
+
+        if (!out_frame_cyc) begin
+            if (out_valid) begin
+                if (shift_out_seg_sop[0]) begin
+                    // start of frame
+                    out_frame_cyc = 1'b1;
+
+                    if (!out_done) begin
+                        out_ctrl_hdr_next = 1'b1;
+                        out_ctrl_last_len_next = ctrl_fifo_rd_last_len[AXIS_BYTE_IDX_WIDTH*out_cur_seg +: AXIS_BYTE_IDX_WIDTH];
+                        out_ctrl_seg_offset_next = out_cur_seg;
+                    end
+                end else if (!out_abort) begin
+                    // consume null segment
+                    out_seg_consumed[out_cur_seg] = 1'b1;
+                    out_seg_consumed_next = out_seg_consumed;
+                    ctrl_fifo_rd_en = out_seg_consumed;
+
+                    out_seg_offset_next = out_seg_offset_cyc;
+                end
+            end
+        end
+        out_frame = out_frame_cyc;
+
+        if (out_frame && !out_done) begin
+            if (shift_out_seg_end[0]) begin
+                // last output cycle
+                out_frame_cyc = 0;
+                out_done = 1;
+
+                if (shift_out_seg_last[0]) begin
+                    out_ctrl_last_next = 1'b1;
+                end
+
+                if (out_valid && (out_valid_straddle || shift_out_seg_eop[0]) && m_axis_tready_int) begin
+                    out_ctrl_en_next = 1'b1;
+                    out_seg_consumed[out_cur_seg] = 1'b1;
+                    out_seg_consumed_next = out_seg_consumed;
+                    ctrl_fifo_rd_en = out_seg_consumed;
+                    out_frame_next = out_frame_cyc;
+                    out_seg_offset_next = out_seg_offset_cyc;
+                end else begin
+                    out_abort = 1'b1;
+                end
+            end else if (output_offset_cyc == AXIS_SEG_CNT-1) begin
+                // output full
+                out_done = 1;
+
+                if (out_valid && out_valid_straddle && m_axis_tready_int) begin
+                    out_ctrl_en_next = 1'b1;
+                    out_seg_consumed[out_cur_seg] = 1'b1;
+                    out_seg_consumed_next = out_seg_consumed;
+                    ctrl_fifo_rd_en = out_seg_consumed;
+                    out_frame_next = out_frame_cyc;
+                    out_seg_offset_next = out_seg_offset_cyc;
+                end else begin
+                    out_abort = 1'b1;
+                end
+            end else begin
+                // middle cycle
+
+                if (out_valid && out_valid_straddle && m_axis_tready_int) begin
+                    out_seg_consumed[out_cur_seg] = 1'b1;
+                end else begin
+                    out_abort = 1'b1;
+                end
+            end
+
+            if (output_offset_cyc == AXIS_SEG_CNT-1) begin
+                output_offset_cyc = 0;
+            end else begin
+                output_offset_cyc = output_offset_cyc + 1;
+            end
+        end
+
+        out_cur_seg = out_cur_seg + 1;
+
+        // shift_out_seg_valid = shift_out_seg_valid >> 1;
+        shift_out_seg_valid_straddle = shift_out_seg_valid_straddle >> 1;
+        shift_out_seg_sop = shift_out_seg_sop >> 1;
+        shift_out_seg_eop = shift_out_seg_eop >> 1;
+        shift_out_seg_end = shift_out_seg_end >> 1;
+        shift_out_seg_last = shift_out_seg_last >> 1;
+    end
+
+    // construct output
+    input_ready_cmb = out_seg_consumed_reg;
+
+    m_axis_tdata_int = input_data_full >> (SEG_WIDTH*out_ctrl_seg_offset_reg + HDR_WIDTH);
+
+    if (out_ctrl_last_reg) begin
+        m_axis_tkeep_int = {AXIS_KEEP_WIDTH{1'b1}} >> (AXIS_KEEP_WIDTH-1 - out_ctrl_last_len_reg);
+    end else begin
+        m_axis_tkeep_int = {AXIS_KEEP_WIDTH{1'b1}};
+    end
+    m_axis_tlast_int = out_ctrl_last_reg;
+
+    if (out_ctrl_hdr_reg) begin
+        axis_tid_next = input_data_full >> (SEG_WIDTH*out_ctrl_seg_offset_reg + 16 + META_ID_OFFSET);
+        axis_tdest_next = input_data_full >> (SEG_WIDTH*out_ctrl_seg_offset_reg + 16 + META_DEST_OFFSET);
+        axis_tuser_next = input_data_full >> (SEG_WIDTH*out_ctrl_seg_offset_reg + 16 + META_USER_OFFSET);
+    end
+
+    m_axis_tvalid_int = out_ctrl_en_reg;
+
+    m_axis_tid_int = axis_tid_next;
+    m_axis_tdest_int = axis_tdest_next;
+    m_axis_tuser_int = axis_tuser_next;
+
+    if (EXPAND_INPUT) begin
+        for (seg = 0; seg < SEG_CNT; seg = seg + 1) begin
+            if (input_ready[seg] && input_valid[seg]) begin
+                input_data_int_next[SEG_WIDTH*seg +: SEG_WIDTH] = input_data[SEG_WIDTH*seg +: SEG_WIDTH];
+                input_valid_int_next[seg] = 1'b1;
+            end
+        end
+    end
+end
+
+always @(posedge clk) begin
+    frame_reg <= frame_next;
+    last_reg <= last_next;
+    extra_cycle_reg <= extra_cycle_next;
+    last_straddle_reg <= last_straddle_next;
+    seg_cnt_reg <= seg_cnt_next;
+    hdr_parity_err_reg <= hdr_parity_err_next;
+
+    out_frame_reg <= out_frame_next;
+    out_seg_offset_reg <= out_seg_offset_next;
+    output_offset_reg <= output_offset_next;
+    out_seg_consumed_reg <= out_seg_consumed_next;
+
+    input_data_int_reg <= input_data_int_next;
+    input_valid_int_reg <= input_valid_int_next;
+
+    out_ctrl_en_reg <= out_ctrl_en_next;
+    out_ctrl_hdr_reg <= out_ctrl_hdr_next;
+    out_ctrl_last_reg <= out_ctrl_last_next;
+    out_ctrl_last_len_reg <= out_ctrl_last_len_next;
+    out_ctrl_seg_offset_reg <= out_ctrl_seg_offset_next;
+
+    axis_tid_reg <= axis_tid_next;
+    axis_tdest_reg <= axis_tdest_next;
+    axis_tuser_reg <= axis_tuser_next;
+
+    if (rst || fifo_rst_in) begin
+        frame_reg <= 1'b0;
+        hdr_parity_err_reg <= 1'b0;
+        out_frame_reg <= 1'b0;
+        out_seg_offset_reg <= 0;
+        output_offset_reg <= 0;
+        out_seg_consumed_reg <= 0;
+        input_valid_int_reg <= 0;
+        out_ctrl_en_reg <= 1'b0;
+    end
+end
+
+// output datapath logic
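+// The registered output stage is decoupled by a small distributed-RAM FIFO;
+// back-pressure to the repack logic (m_axis_tready_int) comes from the registered
+// half-full flag rather than directly from m_axis_tready.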
+reg [AXIS_DATA_WIDTH-1:0] m_axis_tdata_reg  = {AXIS_DATA_WIDTH{1'b0}};
+reg [AXIS_KEEP_WIDTH-1:0] m_axis_tkeep_reg  = {AXIS_KEEP_WIDTH{1'b0}};
+reg                       m_axis_tvalid_reg = 1'b0;
+reg                       m_axis_tlast_reg  = 1'b0;
+reg [AXIS_ID_WIDTH-1:0]   m_axis_tid_reg    = {AXIS_ID_WIDTH{1'b0}};
+reg [AXIS_DEST_WIDTH-1:0] m_axis_tdest_reg  = {AXIS_DEST_WIDTH{1'b0}};
+reg [AXIS_USER_WIDTH-1:0] m_axis_tuser_reg  = {AXIS_USER_WIDTH{1'b0}};
+
+reg [OUTPUT_FIFO_ADDR_WIDTH+1-1:0] out_fifo_wr_ptr_reg = 0;
+reg [OUTPUT_FIFO_ADDR_WIDTH+1-1:0] out_fifo_rd_ptr_reg = 0;
+reg out_fifo_half_full_reg = 1'b0;
+
+wire out_fifo_full = out_fifo_wr_ptr_reg == (out_fifo_rd_ptr_reg ^ {1'b1, {OUTPUT_FIFO_ADDR_WIDTH{1'b0}}});
+wire out_fifo_empty = out_fifo_wr_ptr_reg == out_fifo_rd_ptr_reg;
+
+(* ram_style = "distributed", ramstyle = "no_rw_check, mlab" *)
+reg [AXIS_DATA_WIDTH-1:0] out_fifo_tdata[2**OUTPUT_FIFO_ADDR_WIDTH-1:0];
+(* ram_style = "distributed", ramstyle = "no_rw_check, mlab" *)
+reg [AXIS_KEEP_WIDTH-1:0] out_fifo_tkeep[2**OUTPUT_FIFO_ADDR_WIDTH-1:0];
+(* ram_style = "distributed", ramstyle = "no_rw_check, mlab" *)
+reg                       out_fifo_tlast[2**OUTPUT_FIFO_ADDR_WIDTH-1:0];
+(* ram_style = "distributed", ramstyle = "no_rw_check, mlab" *)
+reg [AXIS_ID_WIDTH-1:0]   out_fifo_tid[2**OUTPUT_FIFO_ADDR_WIDTH-1:0];
+(* ram_style = "distributed", ramstyle = "no_rw_check, mlab" *)
+reg [AXIS_DEST_WIDTH-1:0] out_fifo_tdest[2**OUTPUT_FIFO_ADDR_WIDTH-1:0];
+(* ram_style = "distributed", ramstyle = "no_rw_check, mlab" *)
+reg [AXIS_USER_WIDTH-1:0] out_fifo_tuser[2**OUTPUT_FIFO_ADDR_WIDTH-1:0];
+
+assign m_axis_tready_int = !out_fifo_half_full_reg;
+
+assign m_axis_tdata  = m_axis_tdata_reg;
+assign m_axis_tkeep  = AXIS_KEEP_ENABLE ? m_axis_tkeep_reg : {AXIS_KEEP_WIDTH{1'b1}};
+assign m_axis_tvalid = m_axis_tvalid_reg;
+assign m_axis_tlast  = AXIS_LAST_ENABLE ? m_axis_tlast_reg : 1'b1;
+assign m_axis_tid    = AXIS_ID_ENABLE   ? m_axis_tid_reg   : {AXIS_ID_WIDTH{1'b0}};
+assign m_axis_tdest  = AXIS_DEST_ENABLE ? m_axis_tdest_reg : {AXIS_DEST_WIDTH{1'b0}};
+assign m_axis_tuser  = AXIS_USER_ENABLE ? m_axis_tuser_reg : {AXIS_USER_WIDTH{1'b0}};
+
+always @(posedge clk) begin
+    m_axis_tvalid_reg <= m_axis_tvalid_reg && !m_axis_tready;
+
+    out_fifo_half_full_reg <= $unsigned(out_fifo_wr_ptr_reg - out_fifo_rd_ptr_reg) >= 2**(OUTPUT_FIFO_ADDR_WIDTH-1);
+
+    if (!out_fifo_full && m_axis_tvalid_int) begin
+        out_fifo_tdata[out_fifo_wr_ptr_reg[OUTPUT_FIFO_ADDR_WIDTH-1:0]] <= m_axis_tdata_int;
+        out_fifo_tkeep[out_fifo_wr_ptr_reg[OUTPUT_FIFO_ADDR_WIDTH-1:0]] <= m_axis_tkeep_int;
+        out_fifo_tlast[out_fifo_wr_ptr_reg[OUTPUT_FIFO_ADDR_WIDTH-1:0]] <= m_axis_tlast_int;
+        out_fifo_tid[out_fifo_wr_ptr_reg[OUTPUT_FIFO_ADDR_WIDTH-1:0]] <= m_axis_tid_int;
+        out_fifo_tdest[out_fifo_wr_ptr_reg[OUTPUT_FIFO_ADDR_WIDTH-1:0]] <= m_axis_tdest_int;
+        out_fifo_tuser[out_fifo_wr_ptr_reg[OUTPUT_FIFO_ADDR_WIDTH-1:0]] <= m_axis_tuser_int;
+        out_fifo_wr_ptr_reg <= out_fifo_wr_ptr_reg + 1;
+    end
+
+    if (!out_fifo_empty && (!m_axis_tvalid_reg || m_axis_tready)) begin
+        m_axis_tdata_reg <= out_fifo_tdata[out_fifo_rd_ptr_reg[OUTPUT_FIFO_ADDR_WIDTH-1:0]];
+        m_axis_tkeep_reg <= out_fifo_tkeep[out_fifo_rd_ptr_reg[OUTPUT_FIFO_ADDR_WIDTH-1:0]];
+        m_axis_tvalid_reg <= 1'b1;
+        m_axis_tlast_reg <= out_fifo_tlast[out_fifo_rd_ptr_reg[OUTPUT_FIFO_ADDR_WIDTH-1:0]];
+        m_axis_tid_reg <= out_fifo_tid[out_fifo_rd_ptr_reg[OUTPUT_FIFO_ADDR_WIDTH-1:0]];
+        m_axis_tdest_reg <= out_fifo_tdest[out_fifo_rd_ptr_reg[OUTPUT_FIFO_ADDR_WIDTH-1:0]];
+        m_axis_tuser_reg <= out_fifo_tuser[out_fifo_rd_ptr_reg[OUTPUT_FIFO_ADDR_WIDTH-1:0]];
+        out_fifo_rd_ptr_reg <= out_fifo_rd_ptr_reg + 1;
+    end
+
+    if (rst || fifo_rst_in) begin
+        out_fifo_wr_ptr_reg <= 0;
+        out_fifo_rd_ptr_reg <= 0;
+        m_axis_tvalid_reg <= 1'b0;
+    end
+end
+
+endmodule
+
+`resetall
diff --git a/rtl/axi_vfifo_enc.v b/rtl/axi_vfifo_enc.v
new file mode 100644
index 0000000..d9c60d5
--- /dev/null
+++ b/rtl/axi_vfifo_enc.v
@@ -0,0 +1,794 @@
+/*
+
+Copyright (c) 2023 Alex Forencich
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+*/
+
+// Language: Verilog 2001
+
+`resetall
+`timescale 1ns / 1ps
+`default_nettype none
+
+/*
+ * AXI4 virtual FIFO (encoder)
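+ *
+ * Packs the incoming AXI stream into framed blocks for the virtual FIFO channels,
+ * each prefixed with a header carrying start/last flags, a length field, parity,
+ * and packed sideband metadata (tid/tdest/tuser) for the decoder.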
+ */
+module axi_vfifo_enc #
+(
+    // Width of input segment
+    parameter SEG_WIDTH = 32,
+    // Segment count
+    parameter SEG_CNT = 2,
+    // Width of AXI stream interfaces in bits
+    parameter AXIS_DATA_WIDTH = SEG_WIDTH*SEG_CNT/2,
+    // Use AXI stream tkeep signal
+    parameter AXIS_KEEP_ENABLE = (AXIS_DATA_WIDTH>8),
+    // AXI stream tkeep signal width (words per cycle)
+    parameter AXIS_KEEP_WIDTH = (AXIS_DATA_WIDTH/8),
+    // Use AXI stream tlast signal
+    parameter AXIS_LAST_ENABLE = 1,
+    // Propagate AXI stream tid signal
+    parameter AXIS_ID_ENABLE = 0,
+    // AXI stream tid signal width
+    parameter AXIS_ID_WIDTH = 8,
+    // Propagate AXI stream tdest signal
+    parameter AXIS_DEST_ENABLE = 0,
+    // AXI stream tdest signal width
+    parameter AXIS_DEST_WIDTH = 8,
+    // Propagate AXI stream tuser signal
+    parameter AXIS_USER_ENABLE = 1,
+    // AXI stream tuser signal width
+    parameter AXIS_USER_WIDTH = 1
+)
+(
+    input  wire                          clk,
+    input  wire                          rst,
+
+    /*
+     * AXI stream data input
+     */
+    input  wire [AXIS_DATA_WIDTH-1:0]    s_axis_tdata,
+    input  wire [AXIS_KEEP_WIDTH-1:0]    s_axis_tkeep,
+    input  wire                          s_axis_tvalid,
+    output wire                          s_axis_tready,
+    input  wire                          s_axis_tlast,
+    input  wire [AXIS_ID_WIDTH-1:0]      s_axis_tid,
+    input  wire [AXIS_DEST_WIDTH-1:0]    s_axis_tdest,
+    input  wire [AXIS_USER_WIDTH-1:0]    s_axis_tuser,
+
+    /*
+     * Segmented data output (to virtual FIFO channel)
+     */
+    input  wire                          fifo_rst_in,
+    output wire [SEG_CNT*SEG_WIDTH-1:0]  output_data,
+    output wire [SEG_CNT-1:0]            output_valid,
+    input  wire                          fifo_watermark_in
+);
+
+parameter AXIS_KEEP_WIDTH_INT = AXIS_KEEP_ENABLE ? AXIS_KEEP_WIDTH : 1;
+parameter AXIS_BYTE_LANES = AXIS_KEEP_WIDTH_INT;
+parameter AXIS_BYTE_SIZE = AXIS_DATA_WIDTH/AXIS_BYTE_LANES;
+parameter CL_AXIS_BYTE_LANES = $clog2(AXIS_BYTE_LANES);
+
+parameter BYTE_SIZE = AXIS_BYTE_SIZE;
+
+parameter SEG_BYTE_LANES = SEG_WIDTH / BYTE_SIZE;
+
+parameter EXPAND_OUTPUT = SEG_CNT < 2;
+
+parameter SEG_CNT_INT = EXPAND_OUTPUT ? SEG_CNT*2 : SEG_CNT;
+
+parameter SEG_IDX_WIDTH = $clog2(SEG_CNT_INT);
+parameter SEG_BYTE_IDX_WIDTH = $clog2(SEG_BYTE_LANES);
+
+parameter AXIS_SEG_CNT = (AXIS_DATA_WIDTH + SEG_WIDTH-1) / SEG_WIDTH;
+parameter AXIS_SEG_IDX_WIDTH = AXIS_SEG_CNT > 1 ? $clog2(AXIS_SEG_CNT) : 1;
+parameter AXIS_LEN_MASK = AXIS_BYTE_LANES-1;
+
+parameter IN_OFFS_WIDTH = AXIS_SEG_IDX_WIDTH;
+
+parameter META_ID_OFFSET = 0;
+parameter META_DEST_OFFSET = META_ID_OFFSET + (AXIS_ID_ENABLE ? AXIS_ID_WIDTH : 0);
+parameter META_USER_OFFSET = META_DEST_OFFSET + (AXIS_DEST_ENABLE ? AXIS_DEST_WIDTH : 0);
+parameter META_WIDTH = META_USER_OFFSET + (AXIS_USER_ENABLE ? AXIS_USER_WIDTH : 0);
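+// block header: 16 bits of flags and length plus the packed sideband metadata, padded to a whole number of bytes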
+parameter HDR_SIZE = (16 + META_WIDTH + BYTE_SIZE-1) / BYTE_SIZE;
+parameter HDR_WIDTH = HDR_SIZE * BYTE_SIZE;
+
+parameter HDR_LEN_WIDTH = 12;
+parameter HDR_SEG_LEN_WIDTH = HDR_LEN_WIDTH-SEG_BYTE_IDX_WIDTH;
+
+parameter INPUT_FIFO_ADDR_WIDTH = 5;
+parameter HDR_FIFO_ADDR_WIDTH = INPUT_FIFO_ADDR_WIDTH + SEG_IDX_WIDTH;
+
+parameter INPUT_FIFO_PTR_WIDTH = INPUT_FIFO_ADDR_WIDTH + SEG_IDX_WIDTH;
+parameter HDR_FIFO_PTR_WIDTH = HDR_FIFO_ADDR_WIDTH;
+
+parameter INPUT_FIFO_SIZE = SEG_BYTE_LANES * SEG_CNT_INT * 2**INPUT_FIFO_ADDR_WIDTH;
+
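+// cap encoded block length at 4 KB or half of the input FIFO, whichever is smaller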
+parameter MAX_BLOCK_LEN = INPUT_FIFO_SIZE / 2 > 4096 ? 4096 : INPUT_FIFO_SIZE / 2;
+
+// validate parameters
+initial begin
+    if (AXIS_BYTE_SIZE * AXIS_KEEP_WIDTH_INT != AXIS_DATA_WIDTH) begin
+        $error("Error: AXI stream data width not evenly divisible (instance %m)");
+        $finish;
+    end
+
+    if (AXIS_SEG_CNT * SEG_WIDTH != AXIS_DATA_WIDTH) begin
+        $error("Error: AXI stream data width not evenly divisible into segments (instance %m)");
+        $finish;
+    end
+
+    if (SEG_WIDTH < HDR_SIZE*BYTE_SIZE) begin
+        $error("Error: Segment smaller than header (instance %m)");
+        $finish;
+    end
+end
+
+reg [INPUT_FIFO_PTR_WIDTH+1-1:0] input_fifo_wr_ptr_reg = 0, input_fifo_wr_ptr_next;
+reg [INPUT_FIFO_PTR_WIDTH+1-1:0] input_fifo_rd_ptr_reg = 0, input_fifo_rd_ptr_next;
+reg [HDR_FIFO_PTR_WIDTH+1-1:0] hdr_fifo_wr_ptr_reg = 0, hdr_fifo_wr_ptr_next;
+reg [HDR_FIFO_PTR_WIDTH+1-1:0] hdr_fifo_rd_ptr_reg = 0, hdr_fifo_rd_ptr_next;
+
+reg [SEG_CNT_INT-1:0] mem_rd_data_valid_reg = 0, mem_rd_data_valid_next;
+reg hdr_mem_rd_data_valid_reg = 0, hdr_mem_rd_data_valid_next;
+
+reg [AXIS_DATA_WIDTH-1:0] int_seg_data;
+reg [AXIS_SEG_CNT-1:0] int_seg_valid;
+
+reg [SEG_CNT_INT*SEG_WIDTH-1:0] seg_mem_wr_data;
+reg [SEG_CNT_INT-1:0] seg_mem_wr_valid;
+reg [SEG_CNT_INT*INPUT_FIFO_ADDR_WIDTH-1:0] seg_mem_wr_addr_reg = 0, seg_mem_wr_addr_next;
+reg [SEG_CNT_INT-1:0] seg_mem_wr_en;
+reg [SEG_CNT_INT*SEG_IDX_WIDTH-1:0] seg_mem_wr_sel;
+
+wire [SEG_CNT_INT*SEG_WIDTH-1:0] seg_mem_rd_data;
+reg [SEG_CNT_INT*INPUT_FIFO_ADDR_WIDTH-1:0] seg_mem_rd_addr_reg = 0, seg_mem_rd_addr_next;
+reg [SEG_CNT_INT-1:0] seg_mem_rd_en;
+
+reg [HDR_LEN_WIDTH-1:0] hdr_mem_wr_len;
+reg hdr_mem_wr_last;
+reg [META_WIDTH-1:0] hdr_mem_wr_meta;
+reg [HDR_FIFO_ADDR_WIDTH-1:0] hdr_mem_wr_addr;
+reg hdr_mem_wr_en;
+
+wire [HDR_LEN_WIDTH-1:0] hdr_mem_rd_len;
+wire hdr_mem_rd_last;
+wire [META_WIDTH-1:0] hdr_mem_rd_meta;
+reg [HDR_FIFO_ADDR_WIDTH-1:0] hdr_mem_rd_addr_reg = 0, hdr_mem_rd_addr_next;
+reg hdr_mem_rd_en;
+
+reg input_fifo_full_reg = 1'b0;
+reg input_fifo_half_full_reg = 1'b0;
+reg input_fifo_empty_reg = 1'b1;
+reg [INPUT_FIFO_PTR_WIDTH+1-1:0] input_fifo_count_reg = 0;
+reg hdr_fifo_full_reg = 1'b0;
+reg hdr_fifo_half_full_reg = 1'b0;
+reg hdr_fifo_empty_reg = 1'b1;
+reg [HDR_FIFO_PTR_WIDTH+1-1:0] hdr_fifo_count_reg = 0;
+
+reg [SEG_CNT*SEG_WIDTH-1:0] output_data_reg = 0, output_data_next;
+reg [SEG_CNT-1:0] output_valid_reg = 0, output_valid_next;
+
+assign s_axis_tready = !input_fifo_full_reg && !hdr_fifo_full_reg && !fifo_rst_in;
+
+assign output_data = output_data_reg;
+assign output_valid = output_valid_reg;
+
+generate
+
+genvar n;
+
+for (n = 0; n < SEG_CNT_INT; n = n + 1) begin : seg_ram
+
+    (* ram_style = "distributed", ramstyle = "no_rw_check, mlab" *)
+    reg [SEG_WIDTH-1:0] seg_mem_data[2**INPUT_FIFO_ADDR_WIDTH-1:0];
+
+    wire wr_en = seg_mem_wr_en[n];
+    wire [INPUT_FIFO_ADDR_WIDTH-1:0] wr_addr = seg_mem_wr_addr_reg[n*INPUT_FIFO_ADDR_WIDTH +: INPUT_FIFO_ADDR_WIDTH];
+    wire [SEG_WIDTH-1:0] wr_data = seg_mem_wr_data[n*SEG_WIDTH +: SEG_WIDTH];
+
+    wire rd_en = seg_mem_rd_en[n];
+    wire [INPUT_FIFO_ADDR_WIDTH-1:0] rd_addr = seg_mem_rd_addr_reg[n*INPUT_FIFO_ADDR_WIDTH +: INPUT_FIFO_ADDR_WIDTH];
+    reg [SEG_WIDTH-1:0] rd_data_reg = 0;
+
+    assign seg_mem_rd_data[n*SEG_WIDTH +: SEG_WIDTH] = rd_data_reg;
+
+    always @(posedge clk) begin
+        if (wr_en) begin
+            seg_mem_data[wr_addr] <= wr_data;
+        end
+    end
+
+    always @(posedge clk) begin
+        if (rd_en) begin
+            rd_data_reg <= seg_mem_data[rd_addr];
+        end
+    end
+
+end
+
+endgenerate
+
+(* ram_style = "distributed", ramstyle = "no_rw_check, mlab" *)
+reg [HDR_LEN_WIDTH-1:0] hdr_mem_len[2**HDR_FIFO_ADDR_WIDTH-1:0];
+(* ram_style = "distributed", ramstyle = "no_rw_check, mlab" *)
+reg hdr_mem_last[2**HDR_FIFO_ADDR_WIDTH-1:0];
+(* ram_style = "distributed", ramstyle = "no_rw_check, mlab" *)
+reg [META_WIDTH-1:0] hdr_mem_meta[2**HDR_FIFO_ADDR_WIDTH-1:0];
+
+reg [HDR_LEN_WIDTH-1:0] hdr_mem_rd_len_reg = 0;
+reg hdr_mem_rd_last_reg = 1'b0;
+reg [META_WIDTH-1:0] hdr_mem_rd_meta_reg = 0;
+
+assign hdr_mem_rd_len = hdr_mem_rd_len_reg;
+assign hdr_mem_rd_last = hdr_mem_rd_last_reg;
+assign hdr_mem_rd_meta = hdr_mem_rd_meta_reg;
+
+always @(posedge clk) begin
+    if (hdr_mem_wr_en) begin
+        hdr_mem_len[hdr_mem_wr_addr] <= hdr_mem_wr_len;
+        hdr_mem_last[hdr_mem_wr_addr] <= hdr_mem_wr_last;
+        hdr_mem_meta[hdr_mem_wr_addr] <= hdr_mem_wr_meta;
+    end
+end
+
+always @(posedge clk) begin
+    if (hdr_mem_rd_en) begin
+        hdr_mem_rd_len_reg <= hdr_mem_len[hdr_mem_rd_addr_reg];
+        hdr_mem_rd_last_reg <= hdr_mem_last[hdr_mem_rd_addr_reg];
+        hdr_mem_rd_meta_reg <= hdr_mem_meta[hdr_mem_rd_addr_reg];
+    end
+end
+
+// FIFO fill level thresholds (full is asserted a few entries early to leave headroom for in-flight writes)
+always @(posedge clk) begin
+    input_fifo_full_reg <= $unsigned(input_fifo_wr_ptr_reg - input_fifo_rd_ptr_reg) >= (2**INPUT_FIFO_ADDR_WIDTH*SEG_CNT_INT)-SEG_CNT_INT*2;
+    input_fifo_half_full_reg <= $unsigned(input_fifo_wr_ptr_reg - input_fifo_rd_ptr_reg) >= (2**INPUT_FIFO_ADDR_WIDTH*SEG_CNT_INT)/2;
+    hdr_fifo_full_reg <= $unsigned(hdr_fifo_wr_ptr_reg - hdr_fifo_rd_ptr_reg) >= 2**HDR_FIFO_ADDR_WIDTH-4;
+    hdr_fifo_half_full_reg <= $unsigned(hdr_fifo_wr_ptr_reg - hdr_fifo_rd_ptr_reg) >= 2**HDR_FIFO_ADDR_WIDTH/2;
+
+    if (rst) begin
+        input_fifo_full_reg <= 1'b0;
+        input_fifo_half_full_reg <= 1'b0;
+        hdr_fifo_full_reg <= 1'b0;
+        hdr_fifo_half_full_reg <= 1'b0;
+    end
+end
+
+// Split input segments
+integer si;
+
+always @* begin
+    int_seg_data = s_axis_tdata;
+    int_seg_valid = 0;
+
+    if (s_axis_tvalid) begin
+        if (s_axis_tlast) begin
+            for (si = 0; si < AXIS_SEG_CNT; si = si + 1) begin
+                int_seg_valid[si] = s_axis_tkeep[SEG_BYTE_LANES*si +: SEG_BYTE_LANES] != 0;
+            end
+        end else begin
+            int_seg_valid = {AXIS_SEG_CNT{1'b1}};
+        end
+    end else begin
+        int_seg_valid = 0;
+    end
+end
+
+// Write logic
+integer seg, k;
+reg [SEG_IDX_WIDTH+1-1:0] seg_count;
+reg [SEG_IDX_WIDTH-1:0] cur_seg;
+
+reg frame_reg = 1'b0, frame_next;
+reg [HDR_LEN_WIDTH-1:0] len_reg = 0, len_next;
+
+reg cycle_valid_reg = 1'b0, cycle_valid_next;
+reg cycle_last_reg = 1'b0, cycle_last_next;
+reg [CL_AXIS_BYTE_LANES+1-1:0] cycle_len_reg = 0, cycle_len_next;
+reg [META_WIDTH-1:0] cycle_meta_reg = 0, cycle_meta_next;
+
+reg [CL_AXIS_BYTE_LANES+1-1:0] cycle_len;
+
+reg [HDR_LEN_WIDTH-1:0] hdr_len_reg = 0, hdr_len_next;
+reg [META_WIDTH-1:0] hdr_meta_reg = 0, hdr_meta_next;
+reg hdr_last_reg = 0, hdr_last_next;
+reg hdr_commit_reg = 0, hdr_commit_next;
+reg hdr_commit_prev_reg = 0, hdr_commit_prev_next;
+reg hdr_valid_reg = 0, hdr_valid_next;
+
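+// pack the enabled sideband signals (tid, tdest, tuser) into a single metadata word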
+wire [META_WIDTH-1:0] s_axis_meta;
+
+generate
+
+if (AXIS_ID_ENABLE) assign s_axis_meta[META_ID_OFFSET +: AXIS_ID_WIDTH] = s_axis_tid;
+if (AXIS_DEST_ENABLE) assign s_axis_meta[META_DEST_OFFSET +: AXIS_DEST_WIDTH] = s_axis_tdest;
+if (AXIS_USER_ENABLE) assign s_axis_meta[META_USER_OFFSET +: AXIS_USER_WIDTH] = s_axis_tuser;
+
+endgenerate
+
+always @* begin
+    input_fifo_wr_ptr_next = input_fifo_wr_ptr_reg;
+    hdr_fifo_wr_ptr_next = hdr_fifo_wr_ptr_reg;
+
+    if (AXIS_KEEP_ENABLE) begin
+        cycle_len = 0;
+        for (k = 0; k < AXIS_BYTE_LANES; k = k + 1) begin
+            cycle_len = cycle_len + s_axis_tkeep[k];
+        end
+    end else begin
+        cycle_len = AXIS_BYTE_LANES;
+    end
+
+    // pack segments
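+    // rotate the valid input segments onto the per-segment memories, starting at the current write pointer offset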
+    seg_mem_wr_valid = 0;
+    seg_mem_wr_sel = 0;
+    cur_seg = input_fifo_wr_ptr_reg[SEG_IDX_WIDTH-1:0];
+    seg_count = 0;
+    for (seg = 0; seg < AXIS_SEG_CNT; seg = seg + 1) begin
+        if (int_seg_valid[seg]) begin
+            seg_mem_wr_valid[cur_seg +: 1] = 1'b1;
+            seg_mem_wr_sel[cur_seg*SEG_IDX_WIDTH +: SEG_IDX_WIDTH] = seg;
+            cur_seg = cur_seg + 1;
+            seg_count = seg_count + 1;
+        end
+    end
+
+    for (seg = 0; seg < SEG_CNT_INT; seg = seg + 1) begin
+        seg_mem_wr_data[seg*SEG_WIDTH +: SEG_WIDTH] = int_seg_data[seg_mem_wr_sel[seg*SEG_IDX_WIDTH +: SEG_IDX_WIDTH]*SEG_WIDTH +: SEG_WIDTH];
+    end
+
+    seg_mem_wr_addr_next = seg_mem_wr_addr_reg;
+    seg_mem_wr_en = 0;
+
+    hdr_mem_wr_len = hdr_len_reg;
+    hdr_mem_wr_last = hdr_last_reg;
+    hdr_mem_wr_meta = hdr_meta_reg;
+    hdr_mem_wr_addr = hdr_fifo_wr_ptr_reg;
+    hdr_mem_wr_en = 1'b0;
+
+    frame_next = frame_reg;
+    len_next = len_reg;
+
+    cycle_valid_next = 1'b0;
+    cycle_last_next = cycle_last_reg;
+    cycle_len_next = cycle_len_reg;
+    cycle_meta_next = cycle_meta_reg;
+
+    hdr_len_next = len_reg;
+    hdr_meta_next = cycle_meta_reg;
+    hdr_last_next = cycle_last_reg;
+    hdr_commit_next = 1'b0;
+    hdr_commit_prev_next = 1'b0;
+    hdr_valid_next = 1'b0;
+
+    if (s_axis_tvalid && s_axis_tready) begin
+        // transfer data
+        seg_mem_wr_en = seg_mem_wr_valid;
+        input_fifo_wr_ptr_next = input_fifo_wr_ptr_reg + seg_count;
+        for (seg = 0; seg < SEG_CNT_INT; seg = seg + 1) begin
+            seg_mem_wr_addr_next[seg*INPUT_FIFO_ADDR_WIDTH +: INPUT_FIFO_ADDR_WIDTH] = (input_fifo_wr_ptr_next + (SEG_CNT_INT-1 - seg)) >> SEG_IDX_WIDTH;
+        end
+
+        cycle_valid_next = 1'b1;
+        cycle_last_next = s_axis_tlast;
+        cycle_len_next = cycle_len;
+        cycle_meta_next = s_axis_meta;
+    end
+
+    if (cycle_valid_reg) begin
+        // process packets
+        if (!frame_reg) begin
+            frame_next = 1'b1;
+
+            if (cycle_last_reg) begin
+                len_next = cycle_len_reg;
+            end else begin
+                len_next = AXIS_BYTE_LANES;
+            end
+
+            hdr_len_next = len_next-1;
+            hdr_meta_next = cycle_meta_reg;
+            hdr_last_next = cycle_last_reg;
+            hdr_valid_next = 1'b1;
+
+            if (cycle_last_reg) begin
+                // end of frame
+
+                hdr_commit_next = 1'b1;
+
+                frame_next = 1'b0;
+            end
+        end else begin
+            if (cycle_meta_reg != hdr_meta_reg) begin
+                if (cycle_last_reg) begin
+                    len_next = cycle_len_reg;
+                end else begin
+                    len_next = AXIS_BYTE_LANES;
+                end
+            end else begin
+                if (cycle_last_reg) begin
+                    len_next = len_reg + cycle_len_reg;
+                end else begin
+                    len_next = len_reg + AXIS_BYTE_LANES;
+                end
+            end
+
+            hdr_len_next = len_next-1;
+            hdr_meta_next = cycle_meta_reg;
+            hdr_last_next = cycle_last_reg;
+            hdr_valid_next = 1'b1;
+
+            if (cycle_meta_reg != hdr_meta_reg) begin
+                // meta changed
+
+                hdr_commit_prev_next = 1'b1;
+
+                if (cycle_last_reg) begin
+                    hdr_commit_next = 1'b1;
+                    frame_next = 1'b0;
+                end
+            end else if (cycle_last_reg || len_next >= MAX_BLOCK_LEN) begin
+                // end of frame or block is full
+
+                hdr_commit_next = 1'b1;
+
+                frame_next = 1'b0;
+            end
+        end
+    end
+
+    if (hdr_valid_reg) begin
+        hdr_mem_wr_len = hdr_len_reg;
+        hdr_mem_wr_last = hdr_last_reg;
+        hdr_mem_wr_meta = hdr_meta_reg;
+        hdr_mem_wr_addr = hdr_fifo_wr_ptr_reg;
+        hdr_mem_wr_en = 1'b1;
+
+        if (hdr_commit_prev_reg) begin
+            if (hdr_commit_reg) begin
+                hdr_fifo_wr_ptr_next = hdr_fifo_wr_ptr_reg + 2;
+                hdr_mem_wr_addr = hdr_fifo_wr_ptr_reg + 1;
+            end else begin
+                hdr_fifo_wr_ptr_next = hdr_fifo_wr_ptr_reg + 1;
+                hdr_mem_wr_addr = hdr_fifo_wr_ptr_reg + 1;
+            end
+        end else begin
+            if (hdr_commit_reg) begin
+                hdr_fifo_wr_ptr_next = hdr_fifo_wr_ptr_reg + 1;
+                hdr_mem_wr_addr = hdr_fifo_wr_ptr_reg;
+            end
+        end
+    end
+end
+
+always @(posedge clk) begin
+    input_fifo_wr_ptr_reg <= input_fifo_wr_ptr_next;
+    hdr_fifo_wr_ptr_reg <= hdr_fifo_wr_ptr_next;
+
+    seg_mem_wr_addr_reg <= seg_mem_wr_addr_next;
+
+    frame_reg <= frame_next;
+    len_reg <= len_next;
+
+    cycle_valid_reg <= cycle_valid_next;
+    cycle_last_reg <= cycle_last_next;
+    cycle_len_reg <= cycle_len_next;
+    cycle_meta_reg <= cycle_meta_next;
+
+    hdr_len_reg <= hdr_len_next;
+    hdr_meta_reg <= hdr_meta_next;
+    hdr_last_reg <= hdr_last_next;
+    hdr_commit_reg <= hdr_commit_next;
+    hdr_commit_prev_reg <= hdr_commit_prev_next;
+    hdr_valid_reg <= hdr_valid_next;
+
+    if (rst || fifo_rst_in) begin
+        input_fifo_wr_ptr_reg <= 0;
+        hdr_fifo_wr_ptr_reg <= 0;
+
+        seg_mem_wr_addr_reg <= 0;
+
+        frame_reg <= 1'b0;
+
+        cycle_valid_reg <= 1'b0;
+        hdr_valid_reg <= 1'b0;
+    end
+end
+
+// Read logic
+integer rd_seg;
+reg [SEG_IDX_WIDTH-1:0] cur_rd_seg;
+reg rd_valid;
+
+reg out_frame_reg = 1'b0, out_frame_next;
+reg [HDR_LEN_WIDTH-1:0] out_len_reg = 0, out_len_next;
+reg out_split1_reg = 1'b0, out_split1_next;
+reg [HDR_SEG_LEN_WIDTH-1:0] out_seg_cnt_in_reg = 0, out_seg_cnt_in_next;
+reg out_seg_last_straddle_reg = 1'b0, out_seg_last_straddle_next;
+reg [SEG_IDX_WIDTH-1:0] out_seg_offset_reg = 0, out_seg_offset_next;
+reg [SEG_IDX_WIDTH-1:0] out_seg_fifo_offset_reg = 0, out_seg_fifo_offset_next;
+reg [SEG_IDX_WIDTH+1-1:0] out_seg_count_reg = 0, out_seg_count_next;
+
+reg [HDR_WIDTH-1:0] out_hdr_reg = 0, out_hdr_next;
+
+reg [SEG_CNT_INT-1:0] out_ctl_seg_hdr_reg = 0, out_ctl_seg_hdr_next, out_ctl_seg_hdr_raw;
+reg [SEG_CNT_INT-1:0] out_ctl_seg_split1_reg = 0, out_ctl_seg_split1_next, out_ctl_seg_split1_raw;
+reg [SEG_CNT_INT-1:0] out_ctl_seg_en_reg = 0, out_ctl_seg_en_next, out_ctl_seg_en_raw;
+reg [SEG_IDX_WIDTH-1:0] out_ctl_seg_idx_reg[SEG_CNT_INT-1:0], out_ctl_seg_idx_next[SEG_CNT_INT-1:0];
+reg [SEG_IDX_WIDTH-1:0] out_ctl_seg_offset_reg = 0, out_ctl_seg_offset_next;
+
+reg [HDR_WIDTH-1:0] out_shift_reg = 0, out_shift_next;
+
+reg [7:0] block_timeout_count_reg = 0, block_timeout_count_next;
+reg block_timeout_reg = 0, block_timeout_next;
+
+always @* begin
+    input_fifo_rd_ptr_next = input_fifo_rd_ptr_reg;
+    hdr_fifo_rd_ptr_next = hdr_fifo_rd_ptr_reg;
+
+    mem_rd_data_valid_next = mem_rd_data_valid_reg;
+    hdr_mem_rd_data_valid_next = hdr_mem_rd_data_valid_reg;
+
+    output_data_next = output_data_reg;
+    output_valid_next = 0;
+
+    seg_mem_rd_addr_next = seg_mem_rd_addr_reg;
+    seg_mem_rd_en = 0;
+
+    hdr_mem_rd_addr_next = hdr_mem_rd_addr_reg;
+    hdr_mem_rd_en = 0;
+
+    out_frame_next = out_frame_reg;
+    out_len_next = out_len_reg;
+    out_split1_next = out_split1_reg;
+    out_seg_cnt_in_next = out_seg_cnt_in_reg;
+    out_seg_last_straddle_next = out_seg_last_straddle_reg;
+    out_seg_offset_next = out_seg_offset_reg;
+    out_seg_fifo_offset_next = out_seg_fifo_offset_reg;
+
+    out_hdr_next = out_hdr_reg;
+
+    out_ctl_seg_hdr_raw = 0;
+    out_ctl_seg_hdr_next = 0;
+    out_ctl_seg_split1_raw = 0;
+    out_ctl_seg_split1_next = 0;
+    out_ctl_seg_en_raw = 0;
+    out_ctl_seg_en_next = 0;
+    out_ctl_seg_offset_next = out_seg_offset_reg;
+
+    for (seg = 0; seg < SEG_CNT_INT; seg = seg + 1) begin
+        out_ctl_seg_idx_next[seg] = out_seg_fifo_offset_reg - out_seg_offset_reg + seg;
+    end
+
+    // partial block timeout handling
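+    // if the output block stays partially filled for roughly 256 idle cycles, flag a timeout so padding can flush it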
+    block_timeout_count_next = block_timeout_count_reg;
+    block_timeout_next = block_timeout_count_reg == 0;
+    if (output_valid || out_seg_offset_reg == 0) begin
+        block_timeout_count_next = 8'hff;
+        block_timeout_next = 1'b0;
+    end else if (block_timeout_count_reg > 0) begin
+        block_timeout_count_next = block_timeout_count_reg - 1;
+    end
+
+    // process headers and generate output commands
+    if (!fifo_watermark_in) begin
+        if (out_frame_reg) begin
+            if (out_seg_cnt_in_next >= SEG_CNT_INT) begin
+                out_frame_next = out_seg_last_straddle_next || out_seg_cnt_in_next > SEG_CNT_INT;
+                out_ctl_seg_en_raw = {SEG_CNT_INT{1'b1}};
+                out_seg_offset_next = out_seg_offset_reg + SEG_CNT_INT;
+                out_seg_fifo_offset_next = out_seg_fifo_offset_reg + SEG_CNT_INT;
+            end else begin
+                out_frame_next = 1'b0;
+                if (out_seg_last_straddle_next) begin
+                    out_ctl_seg_split1_raw = 1 << out_seg_cnt_in_next;
+                    if (out_seg_cnt_in_next == SEG_CNT_INT-1) begin
+                        out_ctl_seg_en_raw = {SEG_CNT_INT{1'b1}};
+                    end else begin
+                        out_ctl_seg_en_raw = {SEG_CNT_INT{1'b1}} >> (SEG_CNT_INT - (out_seg_cnt_in_next+1));
+                    end
+                    out_seg_offset_next = out_seg_offset_reg + out_seg_cnt_in_next+1;
+                end else begin
+                    out_ctl_seg_en_raw = {SEG_CNT_INT{1'b1}} >> (SEG_CNT_INT - out_seg_cnt_in_next);
+                    out_seg_offset_next = out_seg_offset_reg + out_seg_cnt_in_next;
+                end
+                out_seg_fifo_offset_next = out_seg_fifo_offset_reg + out_seg_cnt_in_next;
+            end
+
+            out_seg_cnt_in_next = out_seg_cnt_in_next - SEG_CNT_INT;
+        end else begin
+            out_len_next = hdr_mem_rd_len;
+            out_seg_cnt_in_next = (hdr_mem_rd_len + SEG_BYTE_LANES) >> SEG_BYTE_IDX_WIDTH;
+            out_seg_last_straddle_next = ((hdr_mem_rd_len & (SEG_BYTE_LANES-1)) + HDR_SIZE) >> SEG_BYTE_IDX_WIDTH != 0;
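+            // build block header: bit 0 = header valid (cleared for padding), bit 1 = last block of frame,
+            // bit 2 = continuation, bit 3 = parity of the length, bits 15:4 = block length, metadata from bit 16 up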
+            out_hdr_next = 0;
+            out_hdr_next[0] = 1'b1;
+            out_hdr_next[1] = hdr_mem_rd_last;
+            out_hdr_next[2] = !hdr_mem_rd_last;
+            out_hdr_next[15:4] = hdr_mem_rd_len;
+            out_hdr_next[3] = ^hdr_mem_rd_len;
+            if (META_WIDTH > 0) begin
+                out_hdr_next[16 +: META_WIDTH] = hdr_mem_rd_meta;
+            end
+
+            out_ctl_seg_hdr_raw = 1;
+
+            if (hdr_mem_rd_data_valid_reg) begin
+                if (out_seg_cnt_in_next >= SEG_CNT_INT) begin
+                    out_frame_next = out_seg_last_straddle_next || out_seg_cnt_in_next > SEG_CNT_INT;
+                    out_ctl_seg_en_raw = {SEG_CNT_INT{1'b1}};
+                    out_seg_offset_next = out_seg_offset_reg + SEG_CNT_INT;
+                    out_seg_fifo_offset_next = out_seg_fifo_offset_reg + SEG_CNT_INT;
+                end else begin
+                    out_frame_next = 1'b0;
+                    if (out_seg_last_straddle_next) begin
+                        out_ctl_seg_split1_raw = 1 << out_seg_cnt_in_next;
+                        if (out_seg_cnt_in_next == SEG_CNT_INT-1) begin
+                            out_ctl_seg_en_raw = {SEG_CNT_INT{1'b1}};
+                        end else begin
+                            out_ctl_seg_en_raw = {SEG_CNT_INT{1'b1}} >> (SEG_CNT_INT - (out_seg_cnt_in_next+1));
+                        end
+                        out_seg_offset_next = out_seg_offset_reg + out_seg_cnt_in_next+1;
+                    end else begin
+                        out_ctl_seg_en_raw = {SEG_CNT_INT{1'b1}} >> (SEG_CNT_INT - out_seg_cnt_in_next);
+                        out_seg_offset_next = out_seg_offset_reg + out_seg_cnt_in_next;
+                    end
+                    out_seg_fifo_offset_next = out_seg_fifo_offset_reg + out_seg_cnt_in_next;
+                end
+
+                out_seg_cnt_in_next = out_seg_cnt_in_next - SEG_CNT_INT;
+
+                hdr_mem_rd_data_valid_next = 1'b0;
+            end else if (block_timeout_reg && out_seg_offset_reg) begin
+                // insert padding
+                out_hdr_next[15:0] = 0;
+
+                out_ctl_seg_en_raw = {SEG_CNT_INT{1'b1}} >> out_seg_offset_reg;
+                out_ctl_seg_hdr_raw = {SEG_CNT_INT{1'b1}};
+                out_ctl_seg_split1_raw = {SEG_CNT_INT{1'b1}};
+
+                out_seg_offset_next = 0;
+            end
+        end
+    end
+
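+    // rotate the raw per-segment control strobes left by the current output segment offset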
+    out_ctl_seg_hdr_next = {2{out_ctl_seg_hdr_raw}} >> (SEG_CNT_INT - out_seg_offset_reg);
+    out_ctl_seg_split1_next = {2{out_ctl_seg_split1_raw}} >> (SEG_CNT_INT - out_seg_offset_reg);
+    out_ctl_seg_en_next = {2{out_ctl_seg_en_raw}} >> (SEG_CNT_INT - out_seg_offset_reg);
+
+    out_shift_next = out_shift_reg;
+
+    // mux segments
+    cur_rd_seg = out_ctl_seg_offset_reg;
+    for (rd_seg = 0; rd_seg < SEG_CNT_INT; rd_seg = rd_seg + 1) begin
+        output_data_next[cur_rd_seg*SEG_WIDTH +: SEG_WIDTH] = out_shift_next;
+        output_data_next[cur_rd_seg*SEG_WIDTH+HDR_WIDTH +: SEG_WIDTH-HDR_WIDTH] = seg_mem_rd_data[out_ctl_seg_idx_reg[cur_rd_seg]*SEG_WIDTH +: SEG_WIDTH-HDR_WIDTH];
+
+        if (out_ctl_seg_hdr_reg[cur_rd_seg]) begin
+            output_data_next[cur_rd_seg*SEG_WIDTH +: HDR_WIDTH] = out_hdr_reg;
+        end
+
+        output_valid_next[cur_rd_seg] = out_ctl_seg_en_reg[cur_rd_seg];
+
+        if (out_ctl_seg_en_reg[cur_rd_seg] && !out_ctl_seg_split1_reg[cur_rd_seg]) begin
+            mem_rd_data_valid_next[out_ctl_seg_idx_reg[cur_rd_seg]] = 1'b0;
+        end
+
+        if (out_ctl_seg_en_reg[cur_rd_seg]) begin
+            out_shift_next = seg_mem_rd_data[(out_ctl_seg_idx_reg[cur_rd_seg]+1)*SEG_WIDTH-HDR_WIDTH +: HDR_WIDTH];
+        end
+
+        cur_rd_seg = cur_rd_seg + 1;
+    end
+
+    // read segments
+    cur_rd_seg = input_fifo_rd_ptr_reg[SEG_IDX_WIDTH-1:0];
+    rd_valid = 1;
+    for (rd_seg = 0; rd_seg < SEG_CNT_INT; rd_seg = rd_seg + 1) begin
+        if (!mem_rd_data_valid_next[cur_rd_seg] && input_fifo_count_reg > rd_seg && rd_valid) begin
+            input_fifo_rd_ptr_next = input_fifo_rd_ptr_reg + rd_seg+1;
+            seg_mem_rd_en[cur_rd_seg] = 1'b1;
+            seg_mem_rd_addr_next[cur_rd_seg*INPUT_FIFO_ADDR_WIDTH +: INPUT_FIFO_ADDR_WIDTH] = ((input_fifo_rd_ptr_reg + rd_seg) >> SEG_IDX_WIDTH) + 1;
+            mem_rd_data_valid_next[cur_rd_seg] = 1'b1;
+        end else begin
+            rd_valid = 0;
+        end
+        cur_rd_seg = cur_rd_seg + 1;
+    end
+
+    // read header
+    if (!hdr_mem_rd_data_valid_next && !hdr_fifo_empty_reg) begin
+        hdr_fifo_rd_ptr_next = hdr_fifo_rd_ptr_reg + 1;
+        hdr_mem_rd_en = 1'b1;
+        hdr_mem_rd_addr_next = hdr_fifo_rd_ptr_next;
+        hdr_mem_rd_data_valid_next = 1'b1;
+    end
+end
+
+integer i;
+
+always @(posedge clk) begin
+    input_fifo_rd_ptr_reg <= input_fifo_rd_ptr_next;
+    input_fifo_count_reg <= input_fifo_wr_ptr_next - input_fifo_rd_ptr_next;
+    input_fifo_empty_reg <= input_fifo_wr_ptr_next == input_fifo_rd_ptr_next;
+    hdr_fifo_rd_ptr_reg <= hdr_fifo_rd_ptr_next;
+    hdr_fifo_count_reg <= hdr_fifo_wr_ptr_next - hdr_fifo_rd_ptr_next;
+    hdr_fifo_empty_reg <= hdr_fifo_wr_ptr_next == hdr_fifo_rd_ptr_next;
+
+    seg_mem_rd_addr_reg <= seg_mem_rd_addr_next;
+    hdr_mem_rd_addr_reg <= hdr_mem_rd_addr_next;
+
+    mem_rd_data_valid_reg <= mem_rd_data_valid_next;
+    hdr_mem_rd_data_valid_reg <= hdr_mem_rd_data_valid_next;
+
+    output_data_reg <= output_data_next;
+    output_valid_reg <= output_valid_next;
+
+    out_frame_reg <= out_frame_next;
+    out_len_reg <= out_len_next;
+    out_split1_reg <= out_split1_next;
+    out_seg_cnt_in_reg <= out_seg_cnt_in_next;
+    out_seg_last_straddle_reg <= out_seg_last_straddle_next;
+    out_seg_offset_reg <= out_seg_offset_next;
+    out_seg_fifo_offset_reg <= out_seg_fifo_offset_next;
+
+    out_hdr_reg <= out_hdr_next;
+
+    out_ctl_seg_hdr_reg <= out_ctl_seg_hdr_next;
+    out_ctl_seg_split1_reg <= out_ctl_seg_split1_next;
+    out_ctl_seg_en_reg <= out_ctl_seg_en_next;
+    for (i = 0; i < SEG_CNT_INT; i = i + 1) begin
+        out_ctl_seg_idx_reg[i] <= out_ctl_seg_idx_next[i];
+    end
+    out_ctl_seg_offset_reg <= out_ctl_seg_offset_next;
+
+    out_shift_reg <= out_shift_next;
+
+    block_timeout_count_reg <= block_timeout_count_next;
+    block_timeout_reg <= block_timeout_next;
+
+    if (rst || fifo_rst_in) begin
+        input_fifo_rd_ptr_reg <= 0;
+        input_fifo_count_reg <= 0;
+        input_fifo_empty_reg <= 1'b1;
+        hdr_fifo_rd_ptr_reg <= 0;
+        hdr_fifo_count_reg <= 0;
+        hdr_fifo_empty_reg <= 1'b1;
+
+        seg_mem_rd_addr_reg <= 0;
+        hdr_mem_rd_addr_reg <= 0;
+
+        mem_rd_data_valid_reg <= 0;
+        hdr_mem_rd_data_valid_reg <= 0;
+
+        out_frame_reg <= 1'b0;
+        out_len_reg <= 0;
+        out_split1_reg <= 0;
+        out_seg_offset_reg <= 0;
+        out_seg_fifo_offset_reg <= 0;
+        out_seg_count_reg <= 0;
+    end
+end
+
+endmodule
+
+`resetall
diff --git a/rtl/axi_vfifo_raw.v b/rtl/axi_vfifo_raw.v
new file mode 100644
index 0000000..6e4ade2
--- /dev/null
+++ b/rtl/axi_vfifo_raw.v
@@ -0,0 +1,381 @@
+/*
+
+Copyright (c) 2023 Alex Forencich
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+*/
+
+// Language: Verilog 2001
+
+`resetall
+`timescale 1ns / 1ps
+`default_nettype none
+
+/*
+ * AXI4 virtual FIFO (raw)
+ */
+module axi_vfifo_raw #
+(
+    // Width of input segment
+    parameter SEG_WIDTH = 32,
+    // Segment count
+    parameter SEG_CNT = 2,
+    // Width of AXI data bus in bits
+    parameter AXI_DATA_WIDTH = SEG_WIDTH*SEG_CNT,
+    // Width of AXI address bus in bits
+    parameter AXI_ADDR_WIDTH = 16,
+    // Width of AXI wstrb (width of data bus in words)
+    parameter AXI_STRB_WIDTH = (AXI_DATA_WIDTH/8),
+    // Width of AXI ID signal
+    parameter AXI_ID_WIDTH = 8,
+    // Maximum AXI burst length to generate
+    parameter AXI_MAX_BURST_LEN = 16,
+    // Width of length field
+    parameter LEN_WIDTH = AXI_ADDR_WIDTH,
+    // Input FIFO depth for AXI write data (full-width words)
+    parameter WRITE_FIFO_DEPTH = 64,
+    // Max AXI write burst length
+    parameter WRITE_MAX_BURST_LEN = WRITE_FIFO_DEPTH/4,
+    // Output FIFO depth for AXI read data (full-width words)
+    parameter READ_FIFO_DEPTH = 128,
+    // Max AXI read burst length
+    parameter READ_MAX_BURST_LEN = WRITE_MAX_BURST_LEN,
+    // Watermark level
+    parameter WATERMARK_LEVEL = WRITE_FIFO_DEPTH/2,
+    // Use control output
+    parameter CTRL_OUT_EN = 0
+)
+(
+    input  wire                          clk,
+    input  wire                          rst,
+
+    /*
+     * Segmented data input (from encode logic)
+     */
+    input  wire                          input_clk,
+    input  wire                          input_rst,
+    output wire                          input_rst_out,
+    output wire                          input_watermark,
+    input  wire [SEG_CNT*SEG_WIDTH-1:0]  input_data,
+    input  wire [SEG_CNT-1:0]            input_valid,
+    output wire [SEG_CNT-1:0]            input_ready,
+
+    /*
+     * Segmented data output (to decode logic)
+     */
+    input  wire                          output_clk,
+    input  wire                          output_rst,
+    output wire                          output_rst_out,
+    output wire [SEG_CNT*SEG_WIDTH-1:0]  output_data,
+    output wire [SEG_CNT-1:0]            output_valid,
+    input  wire [SEG_CNT-1:0]            output_ready,
+    output wire [SEG_CNT*SEG_WIDTH-1:0]  output_ctrl_data,
+    output wire [SEG_CNT-1:0]            output_ctrl_valid,
+    input  wire [SEG_CNT-1:0]            output_ctrl_ready,
+
+    /*
+     * AXI master interface
+     */
+    output wire [AXI_ID_WIDTH-1:0]       m_axi_awid,
+    output wire [AXI_ADDR_WIDTH-1:0]     m_axi_awaddr,
+    output wire [7:0]                    m_axi_awlen,
+    output wire [2:0]                    m_axi_awsize,
+    output wire [1:0]                    m_axi_awburst,
+    output wire                          m_axi_awlock,
+    output wire [3:0]                    m_axi_awcache,
+    output wire [2:0]                    m_axi_awprot,
+    output wire                          m_axi_awvalid,
+    input  wire                          m_axi_awready,
+    output wire [AXI_DATA_WIDTH-1:0]     m_axi_wdata,
+    output wire [AXI_STRB_WIDTH-1:0]     m_axi_wstrb,
+    output wire                          m_axi_wlast,
+    output wire                          m_axi_wvalid,
+    input  wire                          m_axi_wready,
+    input  wire [AXI_ID_WIDTH-1:0]       m_axi_bid,
+    input  wire [1:0]                    m_axi_bresp,
+    input  wire                          m_axi_bvalid,
+    output wire                          m_axi_bready,
+    output wire [AXI_ID_WIDTH-1:0]       m_axi_arid,
+    output wire [AXI_ADDR_WIDTH-1:0]     m_axi_araddr,
+    output wire [7:0]                    m_axi_arlen,
+    output wire [2:0]                    m_axi_arsize,
+    output wire [1:0]                    m_axi_arburst,
+    output wire                          m_axi_arlock,
+    output wire [3:0]                    m_axi_arcache,
+    output wire [2:0]                    m_axi_arprot,
+    output wire                          m_axi_arvalid,
+    input  wire                          m_axi_arready,
+    input  wire [AXI_ID_WIDTH-1:0]       m_axi_rid,
+    input  wire [AXI_DATA_WIDTH-1:0]     m_axi_rdata,
+    input  wire [1:0]                    m_axi_rresp,
+    input  wire                          m_axi_rlast,
+    input  wire                          m_axi_rvalid,
+    output wire                          m_axi_rready,
+
+    /*
+     * Reset sync
+     */
+    output wire                          rst_req_out,
+    input  wire                          rst_req_in,
+
+    /*
+     * Configuration
+     */
+    input  wire [AXI_ADDR_WIDTH-1:0]     cfg_fifo_base_addr,
+    input  wire [LEN_WIDTH-1:0]          cfg_fifo_size_mask,
+    input  wire                          cfg_enable,
+    input  wire                          cfg_reset,
+
+    /*
+     * Status
+     */
+    output wire [LEN_WIDTH+1-1:0]        sts_fifo_occupancy,
+    output wire                          sts_fifo_empty,
+    output wire                          sts_fifo_full,
+    output wire                          sts_reset,
+    output wire                          sts_active,
+    output wire                          sts_write_active,
+    output wire                          sts_read_active
+);
+
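+// mask to align FIFO addressing to the width of the AXI data bus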
+localparam ADDR_MASK = {AXI_ADDR_WIDTH{1'b1}} << $clog2(AXI_STRB_WIDTH);
+
+reg fifo_reset_reg = 1'b1, fifo_reset_next;
+reg fifo_enable_reg = 1'b0, fifo_enable_next;
+reg [AXI_ADDR_WIDTH-1:0] fifo_base_addr_reg = 0, fifo_base_addr_next;
+reg [LEN_WIDTH-1:0] fifo_size_mask_reg = 0, fifo_size_mask_next;
+
+assign sts_reset = fifo_reset_reg;
+assign sts_active = fifo_enable_reg;
+
+wire [LEN_WIDTH+1-1:0] wr_start_ptr;
+wire [LEN_WIDTH+1-1:0] wr_finish_ptr;
+wire [LEN_WIDTH+1-1:0] rd_start_ptr;
+wire [LEN_WIDTH+1-1:0] rd_finish_ptr;
+
+axi_vfifo_raw_wr #(
+    .SEG_WIDTH(SEG_WIDTH),
+    .SEG_CNT(SEG_CNT),
+    .AXI_DATA_WIDTH(AXI_DATA_WIDTH),
+    .AXI_ADDR_WIDTH(AXI_ADDR_WIDTH),
+    .AXI_STRB_WIDTH(AXI_STRB_WIDTH),
+    .AXI_ID_WIDTH(AXI_ID_WIDTH),
+    .AXI_MAX_BURST_LEN(AXI_MAX_BURST_LEN),
+    .LEN_WIDTH(LEN_WIDTH),
+    .WRITE_FIFO_DEPTH(WRITE_FIFO_DEPTH),
+    .WRITE_MAX_BURST_LEN(WRITE_MAX_BURST_LEN),
+    .WATERMARK_LEVEL(WATERMARK_LEVEL)
+)
+axi_vfifo_raw_wr_inst (
+    .clk(clk),
+    .rst(rst),
+
+    /*
+     * Segmented data input (from encode logic)
+     */
+    .input_clk(input_clk),
+    .input_rst(input_rst),
+    .input_rst_out(input_rst_out),
+    .input_watermark(input_watermark),
+    .input_data(input_data),
+    .input_valid(input_valid),
+    .input_ready(input_ready),
+
+    /*
+     * AXI master interface
+     */
+    .m_axi_awid(m_axi_awid),
+    .m_axi_awaddr(m_axi_awaddr),
+    .m_axi_awlen(m_axi_awlen),
+    .m_axi_awsize(m_axi_awsize),
+    .m_axi_awburst(m_axi_awburst),
+    .m_axi_awlock(m_axi_awlock),
+    .m_axi_awcache(m_axi_awcache),
+    .m_axi_awprot(m_axi_awprot),
+    .m_axi_awvalid(m_axi_awvalid),
+    .m_axi_awready(m_axi_awready),
+    .m_axi_wdata(m_axi_wdata),
+    .m_axi_wstrb(m_axi_wstrb),
+    .m_axi_wlast(m_axi_wlast),
+    .m_axi_wvalid(m_axi_wvalid),
+    .m_axi_wready(m_axi_wready),
+    .m_axi_bid(m_axi_bid),
+    .m_axi_bresp(m_axi_bresp),
+    .m_axi_bvalid(m_axi_bvalid),
+    .m_axi_bready(m_axi_bready),
+
+    /*
+     * FIFO control
+     */
+    .wr_start_ptr_out(wr_start_ptr),
+    .wr_finish_ptr_out(wr_finish_ptr),
+    .rd_start_ptr_in(rd_start_ptr),
+    .rd_finish_ptr_in(rd_finish_ptr),
+
+    /*
+     * Configuration
+     */
+    .cfg_fifo_base_addr(fifo_base_addr_reg),
+    .cfg_fifo_size_mask(fifo_size_mask_reg),
+    .cfg_enable(fifo_enable_reg),
+    .cfg_reset(fifo_reset_reg),
+
+    /*
+     * Status
+     */
+    .sts_fifo_occupancy(sts_fifo_occupancy),
+    .sts_fifo_empty(sts_fifo_empty),
+    .sts_fifo_full(sts_fifo_full),
+    .sts_write_active(sts_write_active)
+);
+
+axi_vfifo_raw_rd #(
+    .SEG_WIDTH(SEG_WIDTH),
+    .SEG_CNT(SEG_CNT),
+    .AXI_DATA_WIDTH(AXI_DATA_WIDTH),
+    .AXI_ADDR_WIDTH(AXI_ADDR_WIDTH),
+    .AXI_STRB_WIDTH(AXI_STRB_WIDTH),
+    .AXI_ID_WIDTH(AXI_ID_WIDTH),
+    .AXI_MAX_BURST_LEN(AXI_MAX_BURST_LEN),
+    .LEN_WIDTH(LEN_WIDTH),
+    .READ_FIFO_DEPTH(READ_FIFO_DEPTH),
+    .READ_MAX_BURST_LEN(READ_MAX_BURST_LEN),
+    .CTRL_OUT_EN(CTRL_OUT_EN)
+)
+axi_vfifo_raw_rd_inst (
+    .clk(clk),
+    .rst(rst),
+
+    /*
+     * Segmented data output (to decode logic)
+     */
+    .output_clk(output_clk),
+    .output_rst(output_rst),
+    .output_rst_out(output_rst_out),
+    .output_data(output_data),
+    .output_valid(output_valid),
+    .output_ready(output_ready),
+    .output_ctrl_data(output_ctrl_data),
+    .output_ctrl_valid(output_ctrl_valid),
+    .output_ctrl_ready(output_ctrl_ready),
+
+    /*
+     * AXI master interface
+     */
+    .m_axi_arid(m_axi_arid),
+    .m_axi_araddr(m_axi_araddr),
+    .m_axi_arlen(m_axi_arlen),
+    .m_axi_arsize(m_axi_arsize),
+    .m_axi_arburst(m_axi_arburst),
+    .m_axi_arlock(m_axi_arlock),
+    .m_axi_arcache(m_axi_arcache),
+    .m_axi_arprot(m_axi_arprot),
+    .m_axi_arvalid(m_axi_arvalid),
+    .m_axi_arready(m_axi_arready),
+    .m_axi_rid(m_axi_rid),
+    .m_axi_rdata(m_axi_rdata),
+    .m_axi_rresp(m_axi_rresp),
+    .m_axi_rlast(m_axi_rlast),
+    .m_axi_rvalid(m_axi_rvalid),
+    .m_axi_rready(m_axi_rready),
+
+    /*
+     * FIFO control
+     */
+    .wr_start_ptr_in(wr_start_ptr),
+    .wr_finish_ptr_in(wr_finish_ptr),
+    .rd_start_ptr_out(rd_start_ptr),
+    .rd_finish_ptr_out(rd_finish_ptr),
+
+    /*
+     * Configuration
+     */
+    .cfg_fifo_base_addr(fifo_base_addr_reg),
+    .cfg_fifo_size_mask(fifo_size_mask_reg),
+    .cfg_enable(fifo_enable_reg),
+    .cfg_reset(fifo_reset_reg),
+
+    /*
+     * Status
+     */
+    .sts_read_active(sts_read_active)
+);
+
+// reset synchronization
+assign rst_req_out = rst | input_rst | output_rst | cfg_reset;
+
+wire rst_req_int = rst_req_in | rst_req_out;
+
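+// asynchronous assert, synchronous release, with extra stages to settle metastability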
+(* shreg_extract = "no" *)
+reg rst_sync_1_reg = 1'b1,  rst_sync_2_reg = 1'b1, rst_sync_3_reg = 1'b1;
+
+always @(posedge clk or posedge rst_req_int) begin
+    if (rst_req_int) begin
+        rst_sync_1_reg <= 1'b1;
+    end else begin
+        rst_sync_1_reg <= 1'b0;
+    end
+end
+
+always @(posedge clk) begin
+    rst_sync_2_reg <= rst_sync_1_reg;
+    rst_sync_3_reg <= rst_sync_2_reg;
+end
+
+// reset and enable logic
+always @* begin
+    fifo_reset_next = 1'b0;
+    fifo_enable_next = fifo_enable_reg;
+    fifo_base_addr_next = fifo_base_addr_reg;
+    fifo_size_mask_next = fifo_size_mask_reg;
+
+    if (cfg_reset || rst_sync_3_reg) begin
+        fifo_reset_next = 1'b1;
+    end
+
+    if (fifo_reset_reg) begin
+        fifo_enable_next = 1'b0;
+        // hold reset until everything is flushed
+        if (sts_write_active || sts_read_active) begin
+            fifo_reset_next = 1'b1;
+        end
+    end else if (!fifo_enable_reg && cfg_enable) begin
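+        // latch configuration when enabling: align the base address down and extend the size mask to cover whole AXI words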
+        fifo_base_addr_next = cfg_fifo_base_addr & ADDR_MASK;
+        fifo_size_mask_next = cfg_fifo_size_mask | ~ADDR_MASK;
+
+        fifo_enable_next = 1'b1;
+    end
+end
+
+always @(posedge clk) begin
+    fifo_reset_reg <= fifo_reset_next;
+    fifo_enable_reg <= fifo_enable_next;
+    fifo_base_addr_reg <= fifo_base_addr_next;
+    fifo_size_mask_reg <= fifo_size_mask_next;
+
+    if (rst) begin
+        fifo_reset_reg <= 1'b1;
+        fifo_enable_reg <= 1'b0;
+    end
+end
+
+endmodule
+
+`resetall
diff --git a/rtl/axi_vfifo_raw_rd.v b/rtl/axi_vfifo_raw_rd.v
new file mode 100644
index 0000000..d745ad0
--- /dev/null
+++ b/rtl/axi_vfifo_raw_rd.v
@@ -0,0 +1,580 @@
+/*
+
+Copyright (c) 2023 Alex Forencich
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+*/
+
+// Language: Verilog 2001
+
+`resetall
+`timescale 1ns / 1ps
+`default_nettype none
+
+/*
+ * AXI4 virtual FIFO (raw, read side)
+ */
+module axi_vfifo_raw_rd #
+(
+    // Width of input segment
+    parameter SEG_WIDTH = 32,
+    // Segment count
+    parameter SEG_CNT = 2,
+    // Width of AXI data bus in bits
+    parameter AXI_DATA_WIDTH = SEG_WIDTH*SEG_CNT,
+    // Width of AXI address bus in bits
+    parameter AXI_ADDR_WIDTH = 16,
+    // Width of AXI wstrb (width of data bus in words)
+    parameter AXI_STRB_WIDTH = (AXI_DATA_WIDTH/8),
+    // Width of AXI ID signal
+    parameter AXI_ID_WIDTH = 8,
+    // Maximum AXI burst length to generate
+    parameter AXI_MAX_BURST_LEN = 16,
+    // Width of length field
+    parameter LEN_WIDTH = AXI_ADDR_WIDTH,
+    // Output FIFO depth for AXI read data (full-width words)
+    parameter READ_FIFO_DEPTH = 128,
+    // Max AXI read burst length
+    parameter READ_MAX_BURST_LEN = READ_FIFO_DEPTH/4,
+    // Use control output
+    parameter CTRL_OUT_EN = 0
+)
+(
+    input  wire                          clk,
+    input  wire                          rst,
+
+    /*
+     * Segmented data output (to decode logic)
+     */
+    input  wire                          output_clk,
+    input  wire                          output_rst,
+    output wire                          output_rst_out,
+    output wire [SEG_CNT*SEG_WIDTH-1:0]  output_data,
+    output wire [SEG_CNT-1:0]            output_valid,
+    input  wire [SEG_CNT-1:0]            output_ready,
+    output wire [SEG_CNT*SEG_WIDTH-1:0]  output_ctrl_data,
+    output wire [SEG_CNT-1:0]            output_ctrl_valid,
+    input  wire [SEG_CNT-1:0]            output_ctrl_ready,
+
+    /*
+     * AXI master interface
+     */
+    output wire [AXI_ID_WIDTH-1:0]       m_axi_arid,
+    output wire [AXI_ADDR_WIDTH-1:0]     m_axi_araddr,
+    output wire [7:0]                    m_axi_arlen,
+    output wire [2:0]                    m_axi_arsize,
+    output wire [1:0]                    m_axi_arburst,
+    output wire                          m_axi_arlock,
+    output wire [3:0]                    m_axi_arcache,
+    output wire [2:0]                    m_axi_arprot,
+    output wire                          m_axi_arvalid,
+    input  wire                          m_axi_arready,
+    input  wire [AXI_ID_WIDTH-1:0]       m_axi_rid,
+    input  wire [AXI_DATA_WIDTH-1:0]     m_axi_rdata,
+    input  wire [1:0]                    m_axi_rresp,
+    input  wire                          m_axi_rlast,
+    input  wire                          m_axi_rvalid,
+    output wire                          m_axi_rready,
+
+    /*
+     * FIFO control
+     */
+    input  wire [LEN_WIDTH+1-1:0]        wr_start_ptr_in,
+    input  wire [LEN_WIDTH+1-1:0]        wr_finish_ptr_in,
+    output wire [LEN_WIDTH+1-1:0]        rd_start_ptr_out,
+    output wire [LEN_WIDTH+1-1:0]        rd_finish_ptr_out,
+
+    /*
+     * Configuration
+     */
+    input  wire [AXI_ADDR_WIDTH-1:0]     cfg_fifo_base_addr,
+    input  wire [LEN_WIDTH-1:0]          cfg_fifo_size_mask,
+    input  wire                          cfg_enable,
+    input  wire                          cfg_reset,
+
+    /*
+     * Status
+     */
+    output wire                          sts_read_active
+);
+
+localparam AXI_BYTE_LANES = AXI_STRB_WIDTH;
+localparam AXI_BYTE_SIZE = AXI_DATA_WIDTH/AXI_BYTE_LANES;
+localparam AXI_BURST_SIZE = $clog2(AXI_STRB_WIDTH);
+localparam AXI_MAX_BURST_SIZE = AXI_MAX_BURST_LEN << AXI_BURST_SIZE;
+
+localparam OFFSET_ADDR_WIDTH = AXI_STRB_WIDTH > 1 ? $clog2(AXI_STRB_WIDTH) : 1;
+localparam OFFSET_ADDR_MASK = AXI_STRB_WIDTH > 1 ? {OFFSET_ADDR_WIDTH{1'b1}} : 0;
+localparam ADDR_MASK = {AXI_ADDR_WIDTH{1'b1}} << $clog2(AXI_STRB_WIDTH);
+localparam CYCLE_COUNT_WIDTH = LEN_WIDTH - AXI_BURST_SIZE + 1;
+
+localparam READ_FIFO_ADDR_WIDTH = $clog2(READ_FIFO_DEPTH);
+
+// compute the largest power of two not exceeding
+// min(AXI_MAX_BURST_LEN, READ_MAX_BURST_LEN, 2**(READ_FIFO_ADDR_WIDTH-1), 4096/AXI_BYTE_LANES):
+// floor(log2(x)) = $clog2(x+1)-1, so 2**floor(log2(x))-1 is a mask of floor(log2(x)) ones;
+// ANDing the masks and adding 1 gives 2**floor(log2(min(x, y, z)))
+localparam READ_MAX_BURST_LEN_INT = ((2**($clog2(AXI_MAX_BURST_LEN+1)-1)-1) & (2**($clog2(READ_MAX_BURST_LEN+1)-1)-1) & (2**(READ_FIFO_ADDR_WIDTH-1)-1) & ((4096/AXI_BYTE_LANES)-1)) + 1;
+localparam READ_MAX_BURST_SIZE_INT = READ_MAX_BURST_LEN_INT << AXI_BURST_SIZE;
+localparam READ_BURST_LEN_WIDTH = $clog2(READ_MAX_BURST_LEN_INT);
+localparam READ_BURST_ADDR_WIDTH = $clog2(READ_MAX_BURST_SIZE_INT);
+localparam READ_BURST_ADDR_MASK = READ_BURST_ADDR_WIDTH > 1 ? {READ_BURST_ADDR_WIDTH{1'b1}} : 0;
+
+// validate parameters
+initial begin
+    if (AXI_BYTE_SIZE * AXI_STRB_WIDTH != AXI_DATA_WIDTH) begin
+        $error("Error: AXI data width not evenly divisible (instance %m)");
+        $finish;
+    end
+
+    if (2**$clog2(AXI_BYTE_LANES) != AXI_BYTE_LANES) begin
+        $error("Error: AXI byte lane count must be even power of two (instance %m)");
+        $finish;
+    end
+
+    if (AXI_MAX_BURST_LEN < 1 || AXI_MAX_BURST_LEN > 256) begin
+        $error("Error: AXI_MAX_BURST_LEN must be between 1 and 256 (instance %m)");
+        $finish;
+    end
+
+    if (SEG_CNT * SEG_WIDTH != AXI_DATA_WIDTH) begin
+        $error("Error: Width mismatch (instance %m)");
+        $finish;
+    end
+end
+
+localparam [1:0]
+    AXI_RESP_OKAY = 2'b00,
+    AXI_RESP_EXOKAY = 2'b01,
+    AXI_RESP_SLVERR = 2'b10,
+    AXI_RESP_DECERR = 2'b11;
+
+reg [AXI_ADDR_WIDTH-1:0] m_axi_araddr_reg = {AXI_ADDR_WIDTH{1'b0}}, m_axi_araddr_next;
+reg [7:0] m_axi_arlen_reg = 8'd0, m_axi_arlen_next;
+reg m_axi_arvalid_reg = 1'b0, m_axi_arvalid_next;
+
+assign m_axi_arid = {AXI_ID_WIDTH{1'b0}};
+assign m_axi_araddr = m_axi_araddr_reg;
+assign m_axi_arlen = m_axi_arlen_reg;
+assign m_axi_arsize = AXI_BURST_SIZE;
+assign m_axi_arburst = 2'b01;
+assign m_axi_arlock = 1'b0;
+assign m_axi_arcache = 4'b0011;
+assign m_axi_arprot = 3'b010;
+assign m_axi_arvalid = m_axi_arvalid_reg;
+
+// reset synchronization
+wire rst_req_int = cfg_reset;
+
+(* shreg_extract = "no" *)
+reg rst_sync_1_reg = 1'b1,  rst_sync_2_reg = 1'b1, rst_sync_3_reg = 1'b1;
+
+assign output_rst_out = rst_sync_3_reg;
+
+always @(posedge output_clk or posedge rst_req_int) begin
+    if (rst_req_int) begin
+        rst_sync_1_reg <= 1'b1;
+    end else begin
+        rst_sync_1_reg <= 1'b0;
+    end
+end
+
+always @(posedge output_clk) begin
+    rst_sync_2_reg <= rst_sync_1_reg;
+    rst_sync_3_reg <= rst_sync_2_reg;
+end
+
+// output datapath logic (read data)
+reg [AXI_DATA_WIDTH-1:0] m_axis_tdata_reg  = {AXI_DATA_WIDTH{1'b0}};
+reg                      m_axis_tvalid_reg = 1'b0;
+
+reg [READ_FIFO_ADDR_WIDTH-1:0] read_fifo_read_start_cnt = 0;
+reg read_fifo_read_start_en = 1'b0;
+
+reg [READ_FIFO_ADDR_WIDTH+1-1:0] read_fifo_read_start_ptr_reg = 0;
+reg [READ_FIFO_ADDR_WIDTH+1-1:0] read_fifo_wr_ptr_reg = 0;
+reg [READ_FIFO_ADDR_WIDTH+1-1:0] read_fifo_wr_ptr_gray_reg = 0;
+wire [READ_FIFO_ADDR_WIDTH+1-1:0] read_fifo_rd_ptr;
+wire [READ_FIFO_ADDR_WIDTH+1-1:0] read_fifo_rd_ptr_gray;
+wire [READ_FIFO_ADDR_WIDTH+1-1:0] read_fifo_ctrl_rd_ptr;
+wire [READ_FIFO_ADDR_WIDTH+1-1:0] read_fifo_ctrl_rd_ptr_gray;
+
+reg [READ_FIFO_ADDR_WIDTH+1-1:0] read_fifo_wr_ptr_temp;
+
+(* shreg_extract = "no" *)
+reg [READ_FIFO_ADDR_WIDTH+1-1:0] read_fifo_wr_ptr_gray_sync_1_reg = 0;
+(* shreg_extract = "no" *)
+reg [READ_FIFO_ADDR_WIDTH+1-1:0] read_fifo_wr_ptr_gray_sync_2_reg = 0;
+
+(* shreg_extract = "no" *)
+reg [READ_FIFO_ADDR_WIDTH+1-1:0] read_fifo_rd_ptr_gray_sync_1_reg = 0;
+(* shreg_extract = "no" *)
+reg [READ_FIFO_ADDR_WIDTH+1-1:0] read_fifo_rd_ptr_gray_sync_2_reg = 0;
+reg [READ_FIFO_ADDR_WIDTH+1-1:0] read_fifo_rd_ptr_sync_reg = 0;
+
+(* shreg_extract = "no" *)
+reg [READ_FIFO_ADDR_WIDTH+1-1:0] read_fifo_ctrl_rd_ptr_gray_sync_1_reg = 0;
+(* shreg_extract = "no" *)
+reg [READ_FIFO_ADDR_WIDTH+1-1:0] read_fifo_ctrl_rd_ptr_gray_sync_2_reg = 0;
+reg [READ_FIFO_ADDR_WIDTH+1-1:0] read_fifo_ctrl_rd_ptr_sync_reg = 0;
+
+reg read_fifo_half_full_reg = 1'b0;
+reg [READ_FIFO_ADDR_WIDTH+1-1:0] read_fifo_occupancy_reg = 0;
+reg [READ_FIFO_ADDR_WIDTH+1-1:0] read_fifo_occupancy_lookahead_reg = 0;
+
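+// standard Gray-code full/empty tests against the synchronized opposite-domain pointers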
+wire read_fifo_full = read_fifo_wr_ptr_gray_reg == (read_fifo_rd_ptr_gray_sync_2_reg ^ {2'b11, {READ_FIFO_ADDR_WIDTH-1{1'b0}}});
+wire read_fifo_empty = read_fifo_rd_ptr_gray == read_fifo_wr_ptr_gray_sync_2_reg;
+
+wire read_fifo_ctrl_full = read_fifo_wr_ptr_gray_reg == (read_fifo_ctrl_rd_ptr_gray_sync_2_reg ^ {2'b11, {READ_FIFO_ADDR_WIDTH-1{1'b0}}});
+wire read_fifo_ctrl_empty = read_fifo_ctrl_rd_ptr_gray == read_fifo_wr_ptr_gray_sync_2_reg;
+
+assign m_axi_rready = (!read_fifo_full && (!CTRL_OUT_EN || !read_fifo_ctrl_full)) || cfg_reset;
+
+genvar n;
+integer k;
+
+generate
+
+for (n = 0; n < SEG_CNT; n = n + 1) begin : read_fifo_seg
+
+    reg [READ_FIFO_ADDR_WIDTH+1-1:0] seg_rd_ptr_reg = 0;
+    reg [READ_FIFO_ADDR_WIDTH+1-1:0] seg_rd_ptr_gray_reg = 0;
+
+    reg [READ_FIFO_ADDR_WIDTH+1-1:0] seg_rd_ptr_temp;
+
+    (* ram_style = "distributed", ramstyle = "no_rw_check, mlab" *)
+    reg [SEG_WIDTH-1:0] seg_mem_data[2**READ_FIFO_ADDR_WIDTH-1:0];
+
+    reg [SEG_WIDTH-1:0] seg_rd_data_reg = 0;
+    reg seg_rd_data_valid_reg = 0;
+
+    wire seg_empty = seg_rd_ptr_gray_reg == read_fifo_wr_ptr_gray_sync_2_reg;
+
+    assign output_data[n*SEG_WIDTH +: SEG_WIDTH] = seg_rd_data_reg;
+    assign output_valid[n] = seg_rd_data_valid_reg;
+
+    if (n == SEG_CNT-1) begin
+        assign read_fifo_rd_ptr = seg_rd_ptr_reg;
+        assign read_fifo_rd_ptr_gray = seg_rd_ptr_gray_reg;
+    end
+
+    always @(posedge clk) begin
+        if (!read_fifo_full && m_axi_rready && m_axi_rvalid) begin
+            seg_mem_data[read_fifo_wr_ptr_reg[READ_FIFO_ADDR_WIDTH-1:0]] <= m_axi_rdata[n*SEG_WIDTH +: SEG_WIDTH];
+        end
+    end
+
+    // per-segment read logic
+    always @(posedge output_clk) begin
+        seg_rd_data_valid_reg <= seg_rd_data_valid_reg && !output_ready[n];
+
+        if (!seg_empty && (!seg_rd_data_valid_reg || output_ready[n])) begin
+            seg_rd_data_reg <= seg_mem_data[seg_rd_ptr_reg[READ_FIFO_ADDR_WIDTH-1:0]];
+            seg_rd_data_valid_reg <= 1'b1;
+
+            seg_rd_ptr_temp = seg_rd_ptr_reg + 1;
+            seg_rd_ptr_reg <= seg_rd_ptr_temp;
+            seg_rd_ptr_gray_reg <= seg_rd_ptr_temp ^ (seg_rd_ptr_temp >> 1);
+        end
+
+        if (output_rst || output_rst_out) begin
+            seg_rd_ptr_reg <= 0;
+            seg_rd_ptr_gray_reg <= 0;
+            seg_rd_data_valid_reg <= 1'b0;
+        end
+    end
+
+end
+
+endgenerate
+
+// read FIFO write logic (AXI R channel side)
+always @(posedge clk) begin
+    read_fifo_occupancy_reg <= read_fifo_wr_ptr_reg - read_fifo_rd_ptr_sync_reg;
+    read_fifo_half_full_reg <= $unsigned(read_fifo_wr_ptr_reg - read_fifo_rd_ptr_sync_reg) >= 2**(READ_FIFO_ADDR_WIDTH-1);
+
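+    // lookahead occupancy also counts read bursts that have been issued but whose data has not yet arrived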
+    if (read_fifo_read_start_en) begin
+        read_fifo_read_start_ptr_reg <= read_fifo_read_start_ptr_reg + read_fifo_read_start_cnt;
+        read_fifo_occupancy_lookahead_reg <= read_fifo_read_start_ptr_reg + read_fifo_read_start_cnt - read_fifo_rd_ptr_sync_reg;
+    end else begin
+        read_fifo_occupancy_lookahead_reg <= read_fifo_read_start_ptr_reg - read_fifo_rd_ptr_sync_reg;
+    end
+
+    if (!read_fifo_full && m_axi_rready && m_axi_rvalid) begin
+        read_fifo_wr_ptr_temp = read_fifo_wr_ptr_reg + 1;
+        read_fifo_wr_ptr_reg <= read_fifo_wr_ptr_temp;
+        read_fifo_wr_ptr_gray_reg <= read_fifo_wr_ptr_temp ^ (read_fifo_wr_ptr_temp >> 1);
+
+        read_fifo_occupancy_reg <= read_fifo_wr_ptr_temp - read_fifo_rd_ptr_sync_reg;
+    end
+
+    if (rst || cfg_reset) begin
+        read_fifo_read_start_ptr_reg <= 0;
+        read_fifo_wr_ptr_reg <= 0;
+        read_fifo_wr_ptr_gray_reg <= 0;
+    end
+end
+
+// pointer synchronization
+always @(posedge clk) begin
+    read_fifo_rd_ptr_gray_sync_1_reg <= read_fifo_rd_ptr_gray;
+    read_fifo_rd_ptr_gray_sync_2_reg <= read_fifo_rd_ptr_gray_sync_1_reg;
+
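+    // convert the synchronized Gray-coded read pointer back to binary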
+    for (k = 0; k < READ_FIFO_ADDR_WIDTH+1; k = k + 1) begin
+        read_fifo_rd_ptr_sync_reg[k] <= ^(read_fifo_rd_ptr_gray_sync_2_reg >> k);
+    end
+
+    if (rst || cfg_reset) begin
+        read_fifo_rd_ptr_gray_sync_1_reg <= 0;
+        read_fifo_rd_ptr_gray_sync_2_reg <= 0;
+        read_fifo_rd_ptr_sync_reg <= 0;
+    end
+end
+
+always @(posedge clk) begin
+    read_fifo_ctrl_rd_ptr_gray_sync_1_reg <= read_fifo_ctrl_rd_ptr_gray;
+    read_fifo_ctrl_rd_ptr_gray_sync_2_reg <= read_fifo_ctrl_rd_ptr_gray_sync_1_reg;
+
+    for (k = 0; k < READ_FIFO_ADDR_WIDTH+1; k = k + 1) begin
+        read_fifo_ctrl_rd_ptr_sync_reg[k] <= ^(read_fifo_ctrl_rd_ptr_gray_sync_2_reg >> k);
+    end
+
+    if (rst || cfg_reset) begin
+        read_fifo_ctrl_rd_ptr_gray_sync_1_reg <= 0;
+        read_fifo_ctrl_rd_ptr_gray_sync_2_reg <= 0;
+        read_fifo_ctrl_rd_ptr_sync_reg <= 0;
+    end
+end
+
+always @(posedge output_clk) begin
+    read_fifo_wr_ptr_gray_sync_1_reg <= read_fifo_wr_ptr_gray_reg;
+    read_fifo_wr_ptr_gray_sync_2_reg <= read_fifo_wr_ptr_gray_sync_1_reg;
+
+    if (output_rst || output_rst_out) begin
+        read_fifo_wr_ptr_gray_sync_1_reg <= 0;
+        read_fifo_wr_ptr_gray_sync_2_reg <= 0;
+    end
+end
+
+generate
+
+if (CTRL_OUT_EN) begin
+
+    for (n = 0; n < SEG_CNT; n = n + 1) begin : read_fifo_ctrl_seg
+
+        reg [READ_FIFO_ADDR_WIDTH+1-1:0] seg_rd_ptr_reg = 0;
+        reg [READ_FIFO_ADDR_WIDTH+1-1:0] seg_rd_ptr_gray_reg = 0;
+
+        reg [READ_FIFO_ADDR_WIDTH+1-1:0] seg_rd_ptr_temp;
+
+        (* ram_style = "distributed", ramstyle = "no_rw_check, mlab" *)
+        reg [SEG_WIDTH-1:0] seg_mem_data[2**READ_FIFO_ADDR_WIDTH-1:0];
+
+        reg [SEG_WIDTH-1:0] seg_rd_data_reg = 0;
+        reg seg_rd_data_valid_reg = 0;
+
+        reg seg_output_ready_reg = 1'b0;
+
+        wire seg_empty = seg_rd_ptr_gray_reg == read_fifo_wr_ptr_gray_sync_2_reg;
+
+        if (n == SEG_CNT-1) begin
+            assign read_fifo_ctrl_rd_ptr = seg_rd_ptr_reg;
+            assign read_fifo_ctrl_rd_ptr_gray = seg_rd_ptr_gray_reg;
+        end
+
+        always @(posedge clk) begin
+            if (!read_fifo_full && m_axi_rready && m_axi_rvalid) begin
+                seg_mem_data[read_fifo_wr_ptr_reg[READ_FIFO_ADDR_WIDTH-1:0]] <= m_axi_rdata[n*SEG_WIDTH +: SEG_WIDTH];
+            end
+        end
+
+        // per-segment read logic
+        always @(posedge output_clk) begin
+            seg_rd_data_valid_reg <= seg_rd_data_valid_reg && !seg_output_ready_reg;
+
+            if (!seg_empty && (!seg_rd_data_valid_reg || seg_output_ready_reg)) begin
+                seg_rd_data_reg <= seg_mem_data[seg_rd_ptr_reg[READ_FIFO_ADDR_WIDTH-1:0]];
+                seg_rd_data_valid_reg <= 1'b1;
+
+                seg_rd_ptr_temp = seg_rd_ptr_reg + 1;
+                seg_rd_ptr_reg <= seg_rd_ptr_temp;
+                seg_rd_ptr_gray_reg <= seg_rd_ptr_temp ^ (seg_rd_ptr_temp >> 1);
+            end
+
+            if (output_rst || output_rst_out) begin
+                seg_rd_ptr_reg <= 0;
+                seg_rd_ptr_gray_reg <= 0;
+                seg_rd_data_valid_reg <= 1'b0;
+            end
+        end
+
+        // skid buffer
+        reg [SEG_WIDTH-1:0] seg_output_data_reg = 0;
+        reg seg_output_valid_reg = 1'b0;
+
+        reg [SEG_WIDTH-1:0] temp_seg_output_data_reg = 0;
+        reg temp_seg_output_valid_reg = 1'b0;
+
+        assign output_ctrl_data[n*SEG_WIDTH +: SEG_WIDTH] = seg_output_data_reg;
+        assign output_ctrl_valid[n] = seg_output_valid_reg;
+
+        always @(posedge output_clk) begin
+            // enable ready input next cycle if output is ready or the temp reg will not be filled on the next cycle (output reg empty or no input)
+            seg_output_ready_reg <= output_ctrl_ready[n] || (!temp_seg_output_valid_reg && (!seg_output_valid_reg || !seg_rd_data_valid_reg));
+
+            if (seg_output_ready_reg) begin
+                // input is ready
+                if (output_ctrl_ready[n] || !seg_output_valid_reg) begin
+                    // output is ready or currently not valid, transfer data to output
+                    seg_output_data_reg <= seg_rd_data_reg;
+                    seg_output_valid_reg <= seg_rd_data_valid_reg;
+                end else begin
+                    // output is not ready, store input in temp
+                    temp_seg_output_data_reg <= seg_rd_data_reg;
+                    temp_seg_output_valid_reg <= seg_rd_data_valid_reg;
+                end
+            end else if (output_ctrl_ready[n]) begin
+                // input is not ready, but output is ready
+                seg_output_data_reg <= temp_seg_output_data_reg;
+                seg_output_valid_reg <= temp_seg_output_valid_reg;
+                temp_seg_output_valid_reg <= 1'b0;
+            end
+
+            if (output_rst || output_rst_out) begin
+                seg_output_ready_reg <= 1'b0;
+                seg_output_valid_reg <= 1'b0;
+                temp_seg_output_valid_reg <= 1'b0;
+            end
+        end
+
+    end
+
+end
+
+endgenerate
+
+reg [READ_BURST_LEN_WIDTH+1-1:0] rd_burst_len;
+reg [READ_BURST_LEN_WIDTH+1-1:0] rd_outstanding_inc;
+reg rd_outstanding_dec;
+reg [READ_FIFO_ADDR_WIDTH+1-1:0] rd_outstanding_reg = 0, rd_outstanding_next;
+reg [LEN_WIDTH+1-1:0] rd_start_ptr;
+reg [7:0] rd_timeout_count_reg = 0, rd_timeout_count_next;
+reg rd_timeout_reg = 0, rd_timeout_next;
+
+reg [LEN_WIDTH+1-1:0] rd_start_ptr_reg = 0, rd_start_ptr_next;
+reg [LEN_WIDTH+1-1:0] rd_finish_ptr_reg = 0, rd_finish_ptr_next;
+
+assign rd_start_ptr_out = rd_start_ptr_reg;
+assign rd_finish_ptr_out = rd_finish_ptr_reg;
+
+assign sts_read_active = rd_outstanding_reg != 0;
+
+// read logic
+always @* begin
+    rd_start_ptr_next = rd_start_ptr_reg;
+    rd_finish_ptr_next = rd_finish_ptr_reg;
+
+    rd_outstanding_inc = 0;
+    rd_outstanding_dec = 0;
+    rd_outstanding_next = rd_outstanding_reg;
+    rd_timeout_count_next = rd_timeout_count_reg;
+    rd_timeout_next = rd_timeout_reg;
+
+    m_axi_araddr_next = m_axi_araddr_reg;
+    m_axi_arlen_next = m_axi_arlen_reg;
+    m_axi_arvalid_next = m_axi_arvalid_reg && !m_axi_arready;
+
+    // partial burst timeout handling
+    rd_timeout_next = rd_timeout_count_reg == 0;
+    if (wr_finish_ptr_in == rd_start_ptr_reg || m_axi_arvalid) begin
+        rd_timeout_count_next = 8'hff;
+        rd_timeout_next = 1'b0;
+    end else if (rd_timeout_count_reg > 0) begin
+        rd_timeout_count_next = rd_timeout_count_reg - 1;
+    end
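+    // rd_timeout_reg asserts after roughly 255 idle cycles with unread data in
+    // DRAM and no read address phase in flight, permitting a partial-length burst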
+
+    // compute length based on DRAM occupancy
+    if ((wr_finish_ptr_in ^ rd_start_ptr_reg) >> READ_BURST_ADDR_WIDTH != 0) begin
+        // crosses burst boundary, read up to burst boundary
+        rd_burst_len = READ_MAX_BURST_LEN_INT - ((rd_start_ptr_reg & READ_BURST_ADDR_MASK) >> AXI_BURST_SIZE);
+        rd_start_ptr = (rd_start_ptr_reg & ~READ_BURST_ADDR_MASK) + (1 << READ_BURST_ADDR_WIDTH);
+    end else begin
+        // does not cross burst boundary, read available data
+        rd_burst_len = (wr_finish_ptr_in - rd_start_ptr_reg) >> AXI_BURST_SIZE;
+        rd_start_ptr = wr_finish_ptr_in;
+    end
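+    // e.g. with READ_MAX_BURST_LEN_INT = 16 and a start pointer 5 beats past a
+    // burst-aligned boundary, rd_burst_len = 16 - 5 = 11 beats (ARLEN = 10), so
+    // no read burst crosses a READ_BURST_ADDR boundary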
+
+    read_fifo_read_start_cnt = rd_burst_len;
+    read_fifo_read_start_en = 1'b0;
+
+    // generate AXI read bursts
+    if (!m_axi_arvalid_reg) begin
+        // ready to start new burst
+
+        m_axi_araddr_next = cfg_fifo_base_addr + (rd_start_ptr_reg & cfg_fifo_size_mask);
+        m_axi_arlen_next = rd_burst_len - 1;
+
+        if (cfg_enable && (wr_finish_ptr_in ^ rd_start_ptr_reg) != 0 && read_fifo_occupancy_lookahead_reg < 2**READ_FIFO_ADDR_WIDTH - READ_MAX_BURST_LEN_INT) begin
+            // enabled, have data to read, have space for read data
+            if ((wr_finish_ptr_in ^ rd_start_ptr_reg) >> READ_BURST_ADDR_WIDTH != 0 || rd_timeout_reg) begin
+                // have full burst or timed out
+                read_fifo_read_start_en = 1'b1;
+                rd_outstanding_inc = rd_burst_len;
+                m_axi_arvalid_next = 1'b1;
+                rd_start_ptr_next = rd_start_ptr;
+            end
+        end
+    end
+
+    // handle AXI read completions
+    if (m_axi_rready && m_axi_rvalid) begin
+        rd_finish_ptr_next = rd_finish_ptr_reg + AXI_BYTE_LANES;
+        rd_outstanding_dec = 1;
+    end
+
+    rd_outstanding_next = rd_outstanding_reg + rd_outstanding_inc - rd_outstanding_dec;
+
+    if (cfg_reset) begin
+        rd_start_ptr_next = 0;
+        rd_finish_ptr_next = 0;
+    end
+end
+
+always @(posedge clk) begin
+    rd_start_ptr_reg <= rd_start_ptr_next;
+    rd_finish_ptr_reg <= rd_finish_ptr_next;
+
+    rd_outstanding_reg <= rd_outstanding_next;
+    rd_timeout_count_reg <= rd_timeout_count_next;
+    rd_timeout_reg <= rd_timeout_next;
+
+    m_axi_araddr_reg <= m_axi_araddr_next;
+    m_axi_arlen_reg <= m_axi_arlen_next;
+    m_axi_arvalid_reg <= m_axi_arvalid_next;
+
+    if (rst) begin
+        rd_outstanding_reg <= 0;
+        m_axi_arvalid_reg <= 1'b0;
+    end
+end
+
+endmodule
+
+`resetall
diff --git a/rtl/axi_vfifo_raw_wr.v b/rtl/axi_vfifo_raw_wr.v
new file mode 100644
index 0000000..b9e685a
--- /dev/null
+++ b/rtl/axi_vfifo_raw_wr.v
@@ -0,0 +1,567 @@
+/*
+
+Copyright (c) 2023 Alex Forencich
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+*/
+
+// Language: Verilog 2001
+
+`resetall
+`timescale 1ns / 1ps
+`default_nettype none
+
+/*
+ * AXI4 virtual FIFO (raw, write side)
+ */
+module axi_vfifo_raw_wr #
+(
+    // Width of input segment
+    parameter SEG_WIDTH = 32,
+    // Segment count
+    parameter SEG_CNT = 2,
+    // Width of AXI data bus in bits
+    parameter AXI_DATA_WIDTH = SEG_WIDTH*SEG_CNT,
+    // Width of AXI address bus in bits
+    parameter AXI_ADDR_WIDTH = 16,
+    // Width of AXI wstrb (width of data bus in words)
+    parameter AXI_STRB_WIDTH = (AXI_DATA_WIDTH/8),
+    // Width of AXI ID signal
+    parameter AXI_ID_WIDTH = 8,
+    // Maximum AXI burst length to generate
+    parameter AXI_MAX_BURST_LEN = 16,
+    // Width of length field
+    parameter LEN_WIDTH = AXI_ADDR_WIDTH,
+    // Input FIFO depth for AXI write data (full-width words)
+    parameter WRITE_FIFO_DEPTH = 64,
+    // Max AXI write burst length
+    parameter WRITE_MAX_BURST_LEN = WRITE_FIFO_DEPTH/4,
+    // Watermark level
+    parameter WATERMARK_LEVEL = WRITE_FIFO_DEPTH/2
+)
+(
+    input  wire                          clk,
+    input  wire                          rst,
+
+    /*
+     * Segmented data input (from encode logic)
+     */
+    input  wire                          input_clk,
+    input  wire                          input_rst,
+    output wire                          input_rst_out,
+    output wire                          input_watermark,
+    input  wire [SEG_CNT*SEG_WIDTH-1:0]  input_data,
+    input  wire [SEG_CNT-1:0]            input_valid,
+    output wire [SEG_CNT-1:0]            input_ready,
+
+    /*
+     * AXI master interface
+     */
+    output wire [AXI_ID_WIDTH-1:0]       m_axi_awid,
+    output wire [AXI_ADDR_WIDTH-1:0]     m_axi_awaddr,
+    output wire [7:0]                    m_axi_awlen,
+    output wire [2:0]                    m_axi_awsize,
+    output wire [1:0]                    m_axi_awburst,
+    output wire                          m_axi_awlock,
+    output wire [3:0]                    m_axi_awcache,
+    output wire [2:0]                    m_axi_awprot,
+    output wire                          m_axi_awvalid,
+    input  wire                          m_axi_awready,
+    output wire [AXI_DATA_WIDTH-1:0]     m_axi_wdata,
+    output wire [AXI_STRB_WIDTH-1:0]     m_axi_wstrb,
+    output wire                          m_axi_wlast,
+    output wire                          m_axi_wvalid,
+    input  wire                          m_axi_wready,
+    input  wire [AXI_ID_WIDTH-1:0]       m_axi_bid,
+    input  wire [1:0]                    m_axi_bresp,
+    input  wire                          m_axi_bvalid,
+    output wire                          m_axi_bready,
+
+    /*
+     * FIFO control
+     */
+    output wire [LEN_WIDTH+1-1:0]        wr_start_ptr_out,
+    output wire [LEN_WIDTH+1-1:0]        wr_finish_ptr_out,
+    input  wire [LEN_WIDTH+1-1:0]        rd_start_ptr_in,
+    input  wire [LEN_WIDTH+1-1:0]        rd_finish_ptr_in,
+
+    /*
+     * Configuration
+     */
+    input  wire [AXI_ADDR_WIDTH-1:0]     cfg_fifo_base_addr,
+    input  wire [LEN_WIDTH-1:0]          cfg_fifo_size_mask,
+    input  wire                          cfg_enable,
+    input  wire                          cfg_reset,
+
+    /*
+     * Status
+     */
+    output wire [LEN_WIDTH+1-1:0]        sts_fifo_occupancy,
+    output wire                          sts_fifo_empty,
+    output wire                          sts_fifo_full,
+    output wire                          sts_write_active
+);
+
+localparam AXI_BYTE_LANES = AXI_STRB_WIDTH;
+localparam AXI_BYTE_SIZE = AXI_DATA_WIDTH/AXI_BYTE_LANES;
+localparam AXI_BURST_SIZE = $clog2(AXI_STRB_WIDTH);
+localparam AXI_MAX_BURST_SIZE = AXI_MAX_BURST_LEN << AXI_BURST_SIZE;
+
+localparam OFFSET_ADDR_WIDTH = AXI_STRB_WIDTH > 1 ? $clog2(AXI_STRB_WIDTH) : 1;
+localparam OFFSET_ADDR_MASK = AXI_STRB_WIDTH > 1 ? {OFFSET_ADDR_WIDTH{1'b1}} : 0;
+localparam ADDR_MASK = {AXI_ADDR_WIDTH{1'b1}} << $clog2(AXI_STRB_WIDTH);
+localparam CYCLE_COUNT_WIDTH = LEN_WIDTH - AXI_BURST_SIZE + 1;
+
+localparam WRITE_FIFO_ADDR_WIDTH = $clog2(WRITE_FIFO_DEPTH);
+localparam RESP_FIFO_ADDR_WIDTH = 5;
+
+// mask(x) = 2**floor(log2(x)) - 1
+// 2**floor(log2(min(x, y, z))) = (mask(x) & mask(y) & mask(z)) + 1
+// floor(log2(x)) = $clog2(x+1) - 1
+// WRITE_MAX_BURST_LEN_INT = 2**floor(log2(min(AXI_MAX_BURST_LEN, WRITE_MAX_BURST_LEN, 2**(WRITE_FIFO_ADDR_WIDTH-1), 4096/AXI_BYTE_LANES)))
+localparam WRITE_MAX_BURST_LEN_INT = ((2**($clog2(AXI_MAX_BURST_LEN+1)-1)-1) & (2**($clog2(WRITE_MAX_BURST_LEN+1)-1)-1) & (2**(WRITE_FIFO_ADDR_WIDTH-1)-1) & ((4096/AXI_BYTE_LANES)-1)) + 1;
+localparam WRITE_MAX_BURST_SIZE_INT = WRITE_MAX_BURST_LEN_INT << AXI_BURST_SIZE;
+localparam WRITE_BURST_LEN_WIDTH = $clog2(WRITE_MAX_BURST_LEN_INT);
+localparam WRITE_BURST_ADDR_WIDTH = $clog2(WRITE_MAX_BURST_SIZE_INT);
+localparam WRITE_BURST_ADDR_MASK = WRITE_BURST_ADDR_WIDTH > 1 ? {WRITE_BURST_ADDR_WIDTH{1'b1}} : 0;
+
+// validate parameters
+initial begin
+    if (AXI_BYTE_SIZE * AXI_STRB_WIDTH != AXI_DATA_WIDTH) begin
+        $error("Error: AXI data width not evenly divisible (instance %m)");
+        $finish;
+    end
+
+    if (2**$clog2(AXI_BYTE_LANES) != AXI_BYTE_LANES) begin
+        $error("Error: AXI byte lane count must be even power of two (instance %m)");
+        $finish;
+    end
+
+    if (AXI_MAX_BURST_LEN < 1 || AXI_MAX_BURST_LEN > 256) begin
+        $error("Error: AXI_MAX_BURST_LEN must be between 1 and 256 (instance %m)");
+        $finish;
+    end
+
+    if (SEG_CNT * SEG_WIDTH != AXI_DATA_WIDTH) begin
+        $error("Error: Width mismatch (instance %m)");
+        $finish;
+    end
+end
+
+localparam [1:0]
+    AXI_RESP_OKAY = 2'b00,
+    AXI_RESP_EXOKAY = 2'b01,
+    AXI_RESP_SLVERR = 2'b10,
+    AXI_RESP_DECERR = 2'b11;
+
+reg [AXI_ADDR_WIDTH-1:0] m_axi_awaddr_reg = {AXI_ADDR_WIDTH{1'b0}}, m_axi_awaddr_next;
+reg [7:0] m_axi_awlen_reg = 8'd0, m_axi_awlen_next;
+reg m_axi_awvalid_reg = 1'b0, m_axi_awvalid_next;
+reg [AXI_DATA_WIDTH-1:0] m_axi_wdata_reg = {AXI_DATA_WIDTH{1'b0}}, m_axi_wdata_next;
+reg [AXI_STRB_WIDTH-1:0] m_axi_wstrb_reg = {AXI_STRB_WIDTH{1'b0}}, m_axi_wstrb_next;
+reg m_axi_wlast_reg = 1'b0, m_axi_wlast_next;
+reg m_axi_wvalid_reg = 1'b0, m_axi_wvalid_next;
+reg m_axi_bready_reg = 1'b0, m_axi_bready_next;
+
+assign m_axi_awid = {AXI_ID_WIDTH{1'b0}};
+assign m_axi_awaddr = m_axi_awaddr_reg;
+assign m_axi_awlen = m_axi_awlen_reg;
+assign m_axi_awsize = AXI_BURST_SIZE;
+assign m_axi_awburst = 2'b01;
+assign m_axi_awlock = 1'b0;
+assign m_axi_awcache = 4'b0011;
+assign m_axi_awprot = 3'b010;
+assign m_axi_awvalid = m_axi_awvalid_reg;
+assign m_axi_wdata = m_axi_wdata_reg;
+assign m_axi_wstrb = m_axi_wstrb_reg;
+assign m_axi_wvalid = m_axi_wvalid_reg;
+assign m_axi_wlast = m_axi_wlast_reg;
+assign m_axi_bready = m_axi_bready_reg;
+
+// reset synchronization
+wire rst_req_int = cfg_reset;
+
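+// cfg_reset asserts rst_sync_1_reg asynchronously; the deassertion is then
+// synchronized into the input clock domain over two further stages before
+// being driven out as input_rst_out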
+(* shreg_extract = "no" *)
+reg rst_sync_1_reg = 1'b1,  rst_sync_2_reg = 1'b1, rst_sync_3_reg = 1'b1;
+
+assign input_rst_out = rst_sync_3_reg;
+
+always @(posedge input_clk or posedge rst_req_int) begin
+    if (rst_req_int) begin
+        rst_sync_1_reg <= 1'b1;
+    end else begin
+        rst_sync_1_reg <= 1'b0;
+    end
+end
+
+always @(posedge input_clk) begin
+    rst_sync_2_reg <= rst_sync_1_reg;
+    rst_sync_3_reg <= rst_sync_2_reg;
+end
+
+// input datapath logic (write data)
+wire [AXI_DATA_WIDTH-1:0] input_data_int;
+reg input_valid_int_reg = 1'b0;
+
+reg input_read_en;
+
+wire [WRITE_FIFO_ADDR_WIDTH+1-1:0] write_fifo_wr_ptr;
+wire [WRITE_FIFO_ADDR_WIDTH+1-1:0] write_fifo_wr_ptr_gray;
+reg [WRITE_FIFO_ADDR_WIDTH+1-1:0] write_fifo_rd_ptr_reg = 0;
+reg [WRITE_FIFO_ADDR_WIDTH+1-1:0] write_fifo_rd_ptr_gray_reg = 0;
+
+reg [WRITE_FIFO_ADDR_WIDTH+1-1:0] write_fifo_rd_ptr_temp;
+
+(* shreg_extract = "no" *)
+reg [WRITE_FIFO_ADDR_WIDTH+1-1:0] write_fifo_wr_ptr_gray_sync_1_reg = 0;
+(* shreg_extract = "no" *)
+reg [WRITE_FIFO_ADDR_WIDTH+1-1:0] write_fifo_wr_ptr_gray_sync_2_reg = 0;
+reg [WRITE_FIFO_ADDR_WIDTH+1-1:0] write_fifo_wr_ptr_sync_reg = 0;
+
+(* shreg_extract = "no" *)
+reg [WRITE_FIFO_ADDR_WIDTH+1-1:0] write_fifo_rd_ptr_gray_sync_1_reg = 0;
+(* shreg_extract = "no" *)
+reg [WRITE_FIFO_ADDR_WIDTH+1-1:0] write_fifo_rd_ptr_gray_sync_2_reg = 0;
+reg [WRITE_FIFO_ADDR_WIDTH+1-1:0] write_fifo_rd_ptr_sync_reg = 0;
+
+reg [WRITE_FIFO_ADDR_WIDTH+1-1:0] write_fifo_occupancy_reg = 0;
+
+wire [SEG_CNT-1:0] write_fifo_seg_full;
+wire [SEG_CNT-1:0] write_fifo_seg_empty;
+wire [SEG_CNT-1:0] write_fifo_seg_watermark;
+
+wire write_fifo_full = |write_fifo_seg_full;
+wire write_fifo_empty = |write_fifo_seg_empty;
+
+assign input_watermark = |write_fifo_seg_watermark | input_rst_out;
+
+genvar n;
+integer k;
+
+generate
+
+for (n = 0; n < SEG_CNT; n = n + 1) begin : write_fifo_seg
+
+    reg [WRITE_FIFO_ADDR_WIDTH+1-1:0] seg_wr_ptr_reg = 0;
+    reg [WRITE_FIFO_ADDR_WIDTH+1-1:0] seg_wr_ptr_gray_reg = 0;
+
+    reg [WRITE_FIFO_ADDR_WIDTH+1-1:0] seg_wr_ptr_temp;
+
+    reg [WRITE_FIFO_ADDR_WIDTH+1-1:0] seg_occupancy_reg = 0;
+
+    (* ram_style = "distributed", ramstyle = "no_rw_check, mlab" *)
+    reg [SEG_WIDTH-1:0] seg_mem_data[2**WRITE_FIFO_ADDR_WIDTH-1:0];
+
+    reg [SEG_WIDTH-1:0] seg_rd_data_reg = 0;
+
+    wire seg_full = seg_wr_ptr_gray_reg == (write_fifo_rd_ptr_gray_sync_2_reg ^ {2'b11, {WRITE_FIFO_ADDR_WIDTH-1{1'b0}}});
+    wire seg_empty = write_fifo_rd_ptr_reg == write_fifo_wr_ptr_sync_reg;
+    wire seg_watermark = seg_occupancy_reg > WATERMARK_LEVEL;
+
+    assign input_data_int[n*SEG_WIDTH +: SEG_WIDTH] = seg_rd_data_reg;
+
+    assign input_ready[n] = !seg_full && !input_rst_out;
+
+    assign write_fifo_seg_full[n] = seg_full;
+    assign write_fifo_seg_empty[n] = seg_empty;
+    assign write_fifo_seg_watermark[n] = seg_watermark;
+
+    if (n == SEG_CNT-1) begin
+        assign write_fifo_wr_ptr = seg_wr_ptr_reg;
+        assign write_fifo_wr_ptr_gray = seg_wr_ptr_gray_reg;
+    end
+
+    // per-segment write logic
+    always @(posedge input_clk) begin
+        seg_occupancy_reg <= seg_wr_ptr_reg - write_fifo_rd_ptr_sync_reg;
+
+        if (input_ready[n] && input_valid[n]) begin
+            seg_mem_data[seg_wr_ptr_reg[WRITE_FIFO_ADDR_WIDTH-1:0]] <= input_data[n*SEG_WIDTH +: SEG_WIDTH];
+
+            seg_wr_ptr_temp = seg_wr_ptr_reg + 1;
+            seg_wr_ptr_reg <= seg_wr_ptr_temp;
+            seg_wr_ptr_gray_reg <= seg_wr_ptr_temp ^ (seg_wr_ptr_temp >> 1);
+        end
+
+        if (input_rst || input_rst_out) begin
+            seg_wr_ptr_reg <= 0;
+            seg_wr_ptr_gray_reg <= 0;
+        end
+    end
+
+    always @(posedge clk) begin
+        if (!write_fifo_empty && (!input_valid_int_reg || input_read_en)) begin
+            seg_rd_data_reg <= seg_mem_data[write_fifo_rd_ptr_reg[WRITE_FIFO_ADDR_WIDTH-1:0]];
+        end
+    end
+
+end
+
+endgenerate
+
+// pointer synchronization
+always @(posedge input_clk) begin
+    write_fifo_rd_ptr_gray_sync_1_reg <= write_fifo_rd_ptr_gray_reg;
+    write_fifo_rd_ptr_gray_sync_2_reg <= write_fifo_rd_ptr_gray_sync_1_reg;
+
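+    // convert the synchronized Gray-coded pointer back to binary:
+    // binary bit k is the XOR of Gray bits k and above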
+    for (k = 0; k < WRITE_FIFO_ADDR_WIDTH+1; k = k + 1) begin
+        write_fifo_rd_ptr_sync_reg[k] <= ^(write_fifo_rd_ptr_gray_sync_2_reg >> k);
+    end
+
+    if (input_rst || input_rst_out) begin
+        write_fifo_rd_ptr_gray_sync_1_reg <= 0;
+        write_fifo_rd_ptr_gray_sync_2_reg <= 0;
+        write_fifo_rd_ptr_sync_reg <= 0;
+    end
+end
+
+always @(posedge clk) begin
+    write_fifo_wr_ptr_gray_sync_1_reg <= write_fifo_wr_ptr_gray;
+    write_fifo_wr_ptr_gray_sync_2_reg <= write_fifo_wr_ptr_gray_sync_1_reg;
+
+    for (k = 0; k < WRITE_FIFO_ADDR_WIDTH+1; k = k + 1) begin
+        write_fifo_wr_ptr_sync_reg[k] <= ^(write_fifo_wr_ptr_gray_sync_2_reg >> k);
+    end
+
+    if (rst || cfg_reset) begin
+        write_fifo_wr_ptr_gray_sync_1_reg <= 0;
+        write_fifo_wr_ptr_gray_sync_2_reg <= 0;
+        write_fifo_wr_ptr_sync_reg <= 0;
+    end
+end
+
+// read logic
+always @(posedge clk) begin
+    write_fifo_occupancy_reg <= write_fifo_wr_ptr_sync_reg - write_fifo_rd_ptr_reg + input_valid_int_reg;
+
+    if (input_read_en) begin
+        input_valid_int_reg <= 1'b0;
+        write_fifo_occupancy_reg <= write_fifo_wr_ptr_sync_reg - write_fifo_rd_ptr_reg;
+    end
+
+    if (!write_fifo_empty && (!input_valid_int_reg || input_read_en)) begin
+        input_valid_int_reg <= 1'b1;
+
+        write_fifo_rd_ptr_temp = write_fifo_rd_ptr_reg + 1;
+        write_fifo_rd_ptr_reg <= write_fifo_rd_ptr_temp;
+        write_fifo_rd_ptr_gray_reg <= write_fifo_rd_ptr_temp ^ (write_fifo_rd_ptr_temp >> 1);
+
+        write_fifo_occupancy_reg <= write_fifo_wr_ptr_sync_reg - write_fifo_rd_ptr_reg;
+    end
+
+    if (rst || cfg_reset) begin
+        write_fifo_rd_ptr_reg <= 0;
+        write_fifo_rd_ptr_gray_reg <= 0;
+        input_valid_int_reg <= 1'b0;
+    end
+end
+
+reg [WRITE_BURST_LEN_WIDTH+1-1:0] wr_burst_len;
+reg [LEN_WIDTH+1-1:0] wr_start_ptr;
+reg [LEN_WIDTH+1-1:0] wr_start_ptr_blk_adj;
+reg wr_burst_reg = 1'b0, wr_burst_next;
+reg [WRITE_BURST_LEN_WIDTH-1:0] wr_burst_len_reg = 0, wr_burst_len_next;
+reg [7:0] wr_timeout_count_reg = 0, wr_timeout_count_next;
+reg wr_timeout_reg = 0, wr_timeout_next;
+reg fifo_full_wr_blk_adj_reg = 1'b0, fifo_full_wr_blk_adj_next;
+
+reg [LEN_WIDTH+1-1:0] wr_start_ptr_reg = 0, wr_start_ptr_next;
+reg [LEN_WIDTH+1-1:0] wr_start_ptr_blk_adj_reg = 0, wr_start_ptr_blk_adj_next;
+reg [LEN_WIDTH+1-1:0] wr_finish_ptr_reg = 0, wr_finish_ptr_next;
+
+reg resp_fifo_we_reg = 1'b0, resp_fifo_we_next;
+reg [RESP_FIFO_ADDR_WIDTH+1-1:0] resp_fifo_wr_ptr_reg = 0;
+reg [RESP_FIFO_ADDR_WIDTH+1-1:0] resp_fifo_rd_ptr_reg = 0, resp_fifo_rd_ptr_next;
+reg [WRITE_BURST_LEN_WIDTH+1-1:0] resp_fifo_burst_len[(2**RESP_FIFO_ADDR_WIDTH)-1:0];
+reg [WRITE_BURST_LEN_WIDTH+1-1:0] resp_fifo_wr_burst_len_reg = 0, resp_fifo_wr_burst_len_next;
+
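+// the response FIFO records the length of each issued write burst so that
+// wr_finish_ptr_reg can advance by the correct number of bytes when the
+// matching B channel response is received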
+assign wr_start_ptr_out = wr_start_ptr_reg;
+assign wr_finish_ptr_out = wr_finish_ptr_reg;
+
+// FIFO occupancy using adjusted write start pointer
+wire [LEN_WIDTH+1-1:0] fifo_occupancy_wr_blk_adj = wr_start_ptr_blk_adj_reg - rd_finish_ptr_in;
+// FIFO full indication - no space to start writing a complete block
+wire fifo_full_wr_blk_adj = (fifo_occupancy_wr_blk_adj & ~cfg_fifo_size_mask) || ((~fifo_occupancy_wr_blk_adj & cfg_fifo_size_mask & ~WRITE_BURST_ADDR_MASK) == 0 && (fifo_occupancy_wr_blk_adj & WRITE_BURST_ADDR_MASK));
+
+// FIFO occupancy (including all in-progress reads and writes)
+assign sts_fifo_occupancy = wr_start_ptr_reg - rd_finish_ptr_in;
+// FIFO empty (including all in-progress reads and writes)
+assign sts_fifo_empty = wr_start_ptr_reg == rd_finish_ptr_in;
+// FIFO full
+assign sts_fifo_full = fifo_full_wr_blk_adj_reg;
+
+assign sts_write_active = wr_burst_reg || resp_fifo_we_reg || (resp_fifo_wr_ptr_reg != resp_fifo_rd_ptr_reg);
+
+// write logic
+always @* begin
+    wr_start_ptr_next = wr_start_ptr_reg;
+    wr_start_ptr_blk_adj_next = wr_start_ptr_blk_adj_reg;
+    wr_finish_ptr_next = wr_finish_ptr_reg;
+
+    wr_burst_next = wr_burst_reg;
+    wr_burst_len_next = wr_burst_len_reg;
+    wr_timeout_count_next = wr_timeout_count_reg;
+    wr_timeout_next = wr_timeout_reg;
+
+    fifo_full_wr_blk_adj_next = fifo_full_wr_blk_adj;
+
+    resp_fifo_we_next = 1'b0;
+    resp_fifo_rd_ptr_next = resp_fifo_rd_ptr_reg;
+    resp_fifo_wr_burst_len_next = wr_burst_len_reg;
+
+    input_read_en = 1'b0;
+
+    m_axi_awaddr_next = m_axi_awaddr_reg;
+    m_axi_awlen_next = m_axi_awlen_reg;
+    m_axi_awvalid_next = m_axi_awvalid_reg && !m_axi_awready;
+
+    m_axi_wdata_next = m_axi_wdata_reg;
+    m_axi_wstrb_next = m_axi_wstrb_reg;
+    m_axi_wlast_next = m_axi_wlast_reg;
+    m_axi_wvalid_next = m_axi_wvalid_reg && !m_axi_wready;
+
+    m_axi_bready_next = 1'b0;
+
+    // partial burst timeout handling
+    wr_timeout_next = wr_timeout_count_reg == 0;
+    if (!input_valid_int_reg || m_axi_awvalid) begin
+        wr_timeout_count_next = 8'hff;
+        wr_timeout_next = 1'b0;
+    end else if (wr_timeout_count_reg > 0) begin
+        wr_timeout_count_next = wr_timeout_count_reg - 1;
+    end
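+    // wr_timeout_reg asserts after roughly 255 idle cycles with write data waiting
+    // and no address phase in flight, permitting a partial-length burst to be issued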
+
+    // compute length based on input FIFO occupancy
+    if ((((wr_start_ptr_reg & WRITE_BURST_ADDR_MASK) >> AXI_BURST_SIZE) + write_fifo_occupancy_reg) >> WRITE_BURST_LEN_WIDTH != 0) begin
+        // crosses burst boundary, write up to burst boundary
+        wr_burst_len = WRITE_MAX_BURST_LEN_INT-1 - ((wr_start_ptr_reg & WRITE_BURST_ADDR_MASK) >> AXI_BURST_SIZE);
+        wr_start_ptr = (wr_start_ptr_reg & ~WRITE_BURST_ADDR_MASK) + (1 << WRITE_BURST_ADDR_WIDTH);
+        wr_start_ptr_blk_adj = (wr_start_ptr_reg & ~WRITE_BURST_ADDR_MASK) + (1 << WRITE_BURST_ADDR_WIDTH);
+    end else begin
+        // does not cross burst boundary, write available data
+        wr_burst_len = write_fifo_occupancy_reg-1;
+        wr_start_ptr = wr_start_ptr_reg + (write_fifo_occupancy_reg << AXI_BURST_SIZE);
+        wr_start_ptr_blk_adj = (wr_start_ptr_reg & ~WRITE_BURST_ADDR_MASK) + (1 << WRITE_BURST_ADDR_WIDTH);
+    end
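+    // e.g. with WRITE_MAX_BURST_LEN_INT = 16 and a start pointer 5 beats past a
+    // burst-aligned boundary, wr_burst_len = 15 - 5 = 10 (an 11-beat AWLEN value),
+    // so the write stops exactly at the next burst boundary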
+
+    resp_fifo_wr_burst_len_next = wr_burst_len;
+
+    // generate AXI write bursts
+    if (!m_axi_awvalid_reg && !wr_burst_reg) begin
+        // ready to start new burst
+
+        wr_burst_len_next = wr_burst_len;
+
+        m_axi_awaddr_next = cfg_fifo_base_addr + (wr_start_ptr_reg & cfg_fifo_size_mask);
+        m_axi_awlen_next = wr_burst_len;
+
+        if (cfg_enable && input_valid_int_reg && !fifo_full_wr_blk_adj_reg) begin
+            // enabled, have data to write, have space for data
+            if ((write_fifo_occupancy_reg) >> WRITE_BURST_LEN_WIDTH != 0 || wr_timeout_reg) begin
+                // have full burst or timed out
+                wr_burst_next = 1'b1;
+                m_axi_awvalid_next = 1'b1;
+                resp_fifo_we_next = 1'b1;
+                wr_start_ptr_next = wr_start_ptr;
+                wr_start_ptr_blk_adj_next = wr_start_ptr_blk_adj;
+            end
+        end
+    end
+
+    if (!m_axi_wvalid_reg || m_axi_wready) begin
+        // transfer data
+        m_axi_wdata_next = input_data_int;
+        m_axi_wlast_next = wr_burst_len_reg == 0;
+
+        if (wr_burst_reg) begin
+            m_axi_wstrb_next = {AXI_STRB_WIDTH{1'b1}};
+            if (cfg_reset) begin
+                m_axi_wstrb_next = 0;
+                m_axi_wvalid_next = 1'b1;
+                wr_burst_len_next = wr_burst_len_reg - 1;
+                wr_burst_next = wr_burst_len_reg != 0;
+            end else if (input_valid_int_reg) begin
+                input_read_en = 1'b1;
+                m_axi_wvalid_next = 1'b1;
+                wr_burst_len_next = wr_burst_len_reg - 1;
+                wr_burst_next = wr_burst_len_reg != 0;
+            end
+        end
+    end
+
+    // handle AXI write completions
+    m_axi_bready_next = 1'b1;
+    if (m_axi_bvalid) begin
+        wr_finish_ptr_next = wr_finish_ptr_reg + ((resp_fifo_burst_len[resp_fifo_rd_ptr_reg[RESP_FIFO_ADDR_WIDTH-1:0]]+1) << AXI_BURST_SIZE);
+        resp_fifo_rd_ptr_next = resp_fifo_rd_ptr_reg + 1;
+    end
+
+    if (cfg_reset) begin
+        wr_start_ptr_next = 0;
+        wr_start_ptr_blk_adj_next = 0;
+        wr_finish_ptr_next = 0;
+    end
+end
+
+always @(posedge clk) begin
+    wr_start_ptr_reg <= wr_start_ptr_next;
+    wr_start_ptr_blk_adj_reg <= wr_start_ptr_blk_adj_next;
+    wr_finish_ptr_reg <= wr_finish_ptr_next;
+
+    wr_burst_reg <= wr_burst_next;
+    wr_burst_len_reg <= wr_burst_len_next;
+    wr_timeout_count_reg <= wr_timeout_count_next;
+    wr_timeout_reg <= wr_timeout_next;
+    fifo_full_wr_blk_adj_reg <= fifo_full_wr_blk_adj_next;
+
+    m_axi_awaddr_reg <= m_axi_awaddr_next;
+    m_axi_awlen_reg <= m_axi_awlen_next;
+    m_axi_awvalid_reg <= m_axi_awvalid_next;
+
+    m_axi_wdata_reg <= m_axi_wdata_next;
+    m_axi_wstrb_reg <= m_axi_wstrb_next;
+    m_axi_wlast_reg <= m_axi_wlast_next;
+    m_axi_wvalid_reg <= m_axi_wvalid_next;
+
+    m_axi_bready_reg <= m_axi_bready_next;
+
+    resp_fifo_we_reg <= resp_fifo_we_next;
+    resp_fifo_wr_burst_len_reg <= resp_fifo_wr_burst_len_next;
+
+    if (resp_fifo_we_reg) begin
+        resp_fifo_burst_len[resp_fifo_wr_ptr_reg[RESP_FIFO_ADDR_WIDTH-1:0]] <= resp_fifo_wr_burst_len_reg;
+        resp_fifo_wr_ptr_reg <= resp_fifo_wr_ptr_reg + 1;
+    end
+    resp_fifo_rd_ptr_reg <= resp_fifo_rd_ptr_next;
+
+    if (rst) begin
+        wr_burst_reg <= 1'b0;
+        m_axi_awvalid_reg <= 1'b0;
+        m_axi_wvalid_reg <= 1'b0;
+        m_axi_bready_reg <= 1'b0;
+        resp_fifo_we_reg <= 1'b0;
+        resp_fifo_wr_ptr_reg <= 0;
+        resp_fifo_rd_ptr_reg <= 0;
+    end
+end
+
+endmodule
+
+`resetall
diff --git a/syn/vivado/axi_vfifo.tcl b/syn/vivado/axi_vfifo.tcl
new file mode 100644
index 0000000..a93297b
--- /dev/null
+++ b/syn/vivado/axi_vfifo.tcl
@@ -0,0 +1,86 @@
+# Copyright (c) 2023 Alex Forencich
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+# AXI virtual FIFO timing constraints
+
+foreach inst [get_cells -hier -filter {(ORIG_REF_NAME == axi_vfifo || REF_NAME == axi_vfifo)}] {
+    puts "Inserting timing constraints for axi_vfifo instance $inst"
+
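+    # helper procs: constrain_sync_chain marks the listed registers ASYNC_REG and
+    # bounds the driver-to-first-stage path with a datapath-only max delay of one
+    # source clock period; constrain_sync_chain_async instead cuts the path into
+    # the first synchronizer stage with a false path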
+    proc constrain_sync_chain {inst driver args} {
+        set sync_ffs [get_cells -hier [concat $driver $args] -filter "PARENT == $inst"]
+
+        if {[llength $sync_ffs]} {
+            set_property ASYNC_REG TRUE $sync_ffs
+
+            set src_clk [get_clocks -of_objects [get_cells "$inst/$driver"]]
+
+            set src_clk_period [if {[llength $src_clk]} {get_property -min PERIOD $src_clk} {expr 1.0}]
+
+            set_max_delay -from [get_cells "$inst/$driver"] -to [get_cells "$inst/[lindex $args 0]"] -datapath_only $src_clk_period
+        }
+    }
+
+    proc constrain_sync_chain_async {inst driver args} {
+        set sync_ffs [get_cells -hier [concat $driver $args] -filter "PARENT == $inst"]
+
+        if {[llength $sync_ffs]} {
+            set_property ASYNC_REG TRUE $sync_ffs
+
+            set_false_path -to [get_pins "$inst/$driver/D"]
+        }
+    }
+
+    # control
+    constrain_sync_chain $inst "cfg_enable_reg_reg" "axi_ch[*].ch_cfg_enable_sync_1_reg_reg" "axi_ch[*].ch_cfg_enable_sync_2_reg_reg"
+
+    set sync_ffs [get_cells "$inst/cfg_fifo_base_addr_reg_reg[*] $inst/axi_ch[*].axi_vfifo_raw_inst/fifo_base_addr_reg_reg[*]"]
+
+    if {[llength $sync_ffs]} {
+        set_property ASYNC_REG TRUE $sync_ffs
+
+        set src_clk [get_clocks -of_objects [get_cells "$inst/cfg_fifo_base_addr_reg_reg[*]"]]
+
+        set src_clk_period [if {[llength $src_clk]} {get_property -min PERIOD $src_clk} {expr 1.0}]
+
+        set_max_delay -from [get_cells "$inst/cfg_fifo_base_addr_reg_reg[*]"] -to [get_cells "$inst/axi_ch[*].axi_vfifo_raw_inst/fifo_base_addr_reg_reg[*]"] -datapath_only $src_clk_period
+    }
+
+    set sync_ffs [get_cells "$inst/cfg_fifo_size_mask_reg_reg[*] $inst/axi_ch[*].axi_vfifo_raw_inst/fifo_size_mask_reg_reg[*]"]
+
+    if {[llength $sync_ffs]} {
+        set_property ASYNC_REG TRUE $sync_ffs
+
+        set src_clk [get_clocks -of_objects [get_cells "$inst/cfg_fifo_size_mask_reg_reg[*]"]]
+
+        set src_clk_period [if {[llength $src_clk]} {get_property -min PERIOD $src_clk} {expr 1.0}]
+
+        set_max_delay -from [get_cells "$inst/cfg_fifo_size_mask_reg_reg[*]"] -to [get_cells "$inst/axi_ch[*].axi_vfifo_raw_inst/fifo_size_mask_reg_reg[*]"] -datapath_only $src_clk_period
+    }
+
+    # status
+    constrain_sync_chain $inst "sts_sync_flag_reg_reg" "axi_ch[*].ch_sts_flag_sync_1_reg_reg" "axi_ch[*].ch_sts_flag_sync_2_reg_reg"
+    constrain_sync_chain_async $inst "sts_fifo_occupancy_sync_reg_reg[*]"
+
+    constrain_sync_chain_async $inst "sts_fifo_empty_sync_1_reg_reg[*]" "sts_fifo_empty_sync_2_reg_reg[*]"
+    constrain_sync_chain_async $inst "sts_fifo_full_sync_1_reg_reg[*]" "sts_fifo_full_sync_2_reg_reg[*]"
+    constrain_sync_chain_async $inst "sts_reset_sync_1_reg_reg[*]" "sts_reset_sync_2_reg_reg[*]"
+    constrain_sync_chain_async $inst "sts_active_sync_1_reg_reg[*]" "sts_active_sync_2_reg_reg[*]"
+    constrain_sync_chain $inst "sts_hdr_parity_err_reg_reg" "sts_hdr_parity_err_sync_1_reg_reg" "sts_hdr_parity_err_sync_2_reg_reg"
+}
diff --git a/syn/vivado/axi_vfifo_raw.tcl b/syn/vivado/axi_vfifo_raw.tcl
new file mode 100644
index 0000000..5f1299e
--- /dev/null
+++ b/syn/vivado/axi_vfifo_raw.tcl
@@ -0,0 +1,31 @@
+# Copyright (c) 2023 Alex Forencich
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+# AXI virtual FIFO (raw) timing constraints
+
+foreach inst [get_cells -hier -regexp -filter {(ORIG_REF_NAME =~ "axi_vfifo_raw(__xdcDup__\d+)?" || REF_NAME =~ "axi_vfifo_raw(__xdcDup__\d+)?")}] {
+    puts "Inserting timing constraints for axi_vfifo_raw instance $inst"
+
+    # reset synchronization
+    set reset_ffs [get_cells -quiet -hier -regexp ".*/rst_sync_\[123\]_reg_reg" -filter "PARENT == $inst"]
+
+    set_property ASYNC_REG TRUE $reset_ffs
+    set_false_path -to [get_pins -of_objects $reset_ffs -filter {IS_PRESET || IS_RESET}]
+}
diff --git a/syn/vivado/axi_vfifo_raw_rd.tcl b/syn/vivado/axi_vfifo_raw_rd.tcl
new file mode 100644
index 0000000..dd8d0fa
--- /dev/null
+++ b/syn/vivado/axi_vfifo_raw_rd.tcl
@@ -0,0 +1,77 @@
+# Copyright (c) 2023 Alex Forencich
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+# AXI virtual FIFO (raw, read) timing constraints
+
+foreach inst [get_cells -hier -regexp -filter {(ORIG_REF_NAME =~ "axi_vfifo_raw_rd(__xdcDup__\d+)?" || REF_NAME =~ "axi_vfifo_raw_rd(__xdcDup__\d+)?")}] {
+    puts "Inserting timing constraints for axi_vfifo_raw_rd instance $inst"
+
+    # get clock periods
+    set clk [get_clocks -of_objects [get_cells "$inst/rd_start_ptr_reg_reg[*]"]]
+    set output_clk [get_clocks -of_objects [get_cells "$inst/read_fifo_wr_ptr_gray_sync_1_reg_reg[*]"]]
+
+    set clk_period [if {[llength $clk]} {get_property -min PERIOD $clk} {expr 1.0}]
+    set output_clk_period [if {[llength $output_clk]} {get_property -min PERIOD $output_clk} {expr 1.0}]
+
+    set min_clk_period [expr min($clk_period, $output_clk_period)]
+
+    # reset synchronization
+    set reset_ffs [get_cells -quiet -hier -regexp ".*/rst_sync_\[123\]_reg_reg" -filter "PARENT == $inst"]
+
+    if {[llength $reset_ffs]} {
+        set_property ASYNC_REG TRUE $reset_ffs
+        set_false_path -to [get_pins -of_objects $reset_ffs -filter {IS_PRESET || IS_RESET}]
+    }
+
+    # read FIFO pointer synchronization
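+    # (Gray-coded pointers cross between clk and output_clk; the max delay bounds
+    # the CDC latency to one source clock period and the bus skew keeps the
+    # pointer bits within one destination clock period of each other)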
+    set sync_ffs [get_cells -quiet -hier -regexp ".*/read_fifo_wr_ptr_gray_sync_\[12\]_reg_reg\\\[\\d+\\\]" -filter "PARENT == $inst"]
+
+    if {[llength $sync_ffs]} {
+        set_property ASYNC_REG TRUE $sync_ffs
+
+        set_max_delay -from [get_cells "$inst/read_fifo_wr_ptr_reg_reg[*] $inst/read_fifo_wr_ptr_gray_reg_reg[*]"] -to [get_cells "$inst/read_fifo_wr_ptr_gray_sync_1_reg_reg[*]"] -datapath_only $clk_period
+        set_bus_skew  -from [get_cells "$inst/read_fifo_wr_ptr_reg_reg[*] $inst/read_fifo_wr_ptr_gray_reg_reg[*]"] -to [get_cells "$inst/read_fifo_wr_ptr_gray_sync_1_reg_reg[*]"] $output_clk_period
+    }
+
+    set sync_ffs [get_cells -quiet -hier -regexp ".*/read_fifo_rd_ptr_gray_sync_\[12\]_reg_reg\\\[\\d+\\\]" -filter "PARENT == $inst"]
+
+    if {[llength $sync_ffs]} {
+        set_property ASYNC_REG TRUE $sync_ffs
+
+        set_max_delay -from [get_cells "$inst/read_fifo_seg[*].seg_rd_ptr_reg_reg[*] $inst/read_fifo_seg[*].seg_rd_ptr_gray_reg_reg[*]"] -to [get_cells "$inst/read_fifo_rd_ptr_gray_sync_1_reg_reg[*]"] -datapath_only $output_clk_period
+        set_bus_skew  -from [get_cells "$inst/read_fifo_seg[*].seg_rd_ptr_reg_reg[*] $inst/read_fifo_seg[*].seg_rd_ptr_gray_reg_reg[*]"] -to [get_cells "$inst/read_fifo_rd_ptr_gray_sync_1_reg_reg[*]"] $clk_period
+    }
+
+    set sync_ffs [get_cells -quiet -hier -regexp ".*/read_fifo_ctrl_rd_ptr_gray_sync_\[12\]_reg_reg\\\[\\d+\\\]" -filter "PARENT == $inst"]
+
+    if {[llength $sync_ffs]} {
+        set_property ASYNC_REG TRUE $sync_ffs
+
+        set_max_delay -from [get_cells "$inst/read_fifo_ctrl_seg[*].seg_rd_ptr_reg_reg[*] $inst/read_fifo_ctrl_seg[*].seg_rd_ptr_gray_reg_reg[*]"] -to [get_cells "$inst/read_fifo_ctrl_rd_ptr_gray_sync_1_reg_reg[*]"] -datapath_only $output_clk_period
+        set_bus_skew  -from [get_cells "$inst/read_fifo_ctrl_seg[*].seg_rd_ptr_reg_reg[*] $inst/read_fifo_ctrl_seg[*].seg_rd_ptr_gray_reg_reg[*]"] -to [get_cells "$inst/read_fifo_ctrl_rd_ptr_gray_sync_1_reg_reg[*]"] $clk_period
+    }
+
+    # read FIFO output register (needed for distributed RAM sync write/async read)
+    set output_reg_ffs [get_cells -quiet "$inst/read_fifo_seg[*].seg_rd_data_reg_reg[*] $inst/read_fifo_ctrl_seg[*].seg_rd_data_reg_reg[*]"]
+
+    if {[llength $output_reg_ffs] && [llength $clk]} {
+        set_false_path -from $clk -to $output_reg_ffs
+    }
+}
diff --git a/syn/vivado/axi_vfifo_raw_wr.tcl b/syn/vivado/axi_vfifo_raw_wr.tcl
new file mode 100644
index 0000000..b8b2d3d
--- /dev/null
+++ b/syn/vivado/axi_vfifo_raw_wr.tcl
@@ -0,0 +1,68 @@
+# Copyright (c) 2023 Alex Forencich
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+# AXI virtual FIFO (raw, write) timing constraints
+
+foreach inst [get_cells -hier -regexp -filter {(ORIG_REF_NAME =~ "axi_vfifo_raw_wr(__xdcDup__\d+)?" || REF_NAME =~ "axi_vfifo_raw_wr(__xdcDup__\d+)?")}] {
+    puts "Inserting timing constraints for axi_vfifo_raw_wr instance $inst"
+
+    # get clock periods
+    set clk [get_clocks -of_objects [get_cells "$inst/wr_start_ptr_reg_reg[*]"]]
+    set input_clk [get_clocks -of_objects [get_cells "$inst/write_fifo_rd_ptr_gray_sync_1_reg_reg[*]"]]
+
+    set clk_period [if {[llength $clk]} {get_property -min PERIOD $clk} {expr 1.0}]
+    set input_clk_period [if {[llength $input_clk]} {get_property -min PERIOD $input_clk} {expr 1.0}]
+
+    set min_clk_period [expr min($clk_period, $input_clk_period)]
+
+    # reset synchronization
+    set reset_ffs [get_cells -quiet -hier -regexp ".*/rst_sync_\[123\]_reg_reg" -filter "PARENT == $inst"]
+
+    if {[llength $reset_ffs]} {
+        set_property ASYNC_REG TRUE $reset_ffs
+        set_false_path -to [get_pins -of_objects $reset_ffs -filter {IS_PRESET || IS_RESET}]
+    }
+
+    # write FIFO pointer synchronization
+    set sync_ffs [get_cells -quiet -hier -regexp ".*/write_fifo_wr_ptr_gray_sync_\[12\]_reg_reg\\\[\\d+\\\]" -filter "PARENT == $inst"]
+
+    if {[llength $sync_ffs]} {
+        set_property ASYNC_REG TRUE $sync_ffs
+
+        set_max_delay -from [get_cells "$inst/write_fifo_seg[*].seg_wr_ptr_reg_reg[*] $inst/write_fifo_seg[*].seg_wr_ptr_gray_reg_reg[*]"] -to [get_cells "$inst/write_fifo_wr_ptr_gray_sync_1_reg_reg[*]"] -datapath_only $input_clk_period
+        set_bus_skew  -from [get_cells "$inst/write_fifo_seg[*].seg_wr_ptr_reg_reg[*] $inst/write_fifo_seg[*].seg_wr_ptr_gray_reg_reg[*]"] -to [get_cells "$inst/write_fifo_wr_ptr_gray_sync_1_reg_reg[*]"] $clk_period
+    }
+
+    set sync_ffs [get_cells -quiet -hier -regexp ".*/write_fifo_rd_ptr_gray_sync_\[12\]_reg_reg\\\[\\d+\\\]" -filter "PARENT == $inst"]
+
+    if {[llength $sync_ffs]} {
+        set_property ASYNC_REG TRUE $sync_ffs
+
+        set_max_delay -from [get_cells "$inst/write_fifo_rd_ptr_reg_reg[*] $inst/write_fifo_rd_ptr_gray_reg_reg[*]"] -to [get_cells "$inst/write_fifo_rd_ptr_gray_sync_1_reg_reg[*]"] -datapath_only $clk_period
+        set_bus_skew  -from [get_cells "$inst/write_fifo_rd_ptr_reg_reg[*] $inst/write_fifo_rd_ptr_gray_reg_reg[*]"] -to [get_cells "$inst/write_fifo_rd_ptr_gray_sync_1_reg_reg[*]"] $input_clk_period
+    }
+
+    # write FIFO output register (needed for distributed RAM sync write/async read)
+    set output_reg_ffs [get_cells -quiet "$inst/write_fifo_seg[*].seg_rd_data_reg_reg[*]"]
+
+    if {[llength $output_reg_ffs] && [llength $input_clk]} {
+        set_false_path -from $input_clk -to $output_reg_ffs
+    }
+}
diff --git a/tb/axi_vfifo/Makefile b/tb/axi_vfifo/Makefile
new file mode 100644
index 0000000..37eefc6
--- /dev/null
+++ b/tb/axi_vfifo/Makefile
@@ -0,0 +1,94 @@
+# Copyright (c) 2023 Alex Forencich
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+TOPLEVEL_LANG = verilog
+
+SIM ?= icarus
+WAVES ?= 0
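+# example invocations: "make" (Icarus, no waves) or "make SIM=verilator WAVES=1"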
+
+COCOTB_HDL_TIMEUNIT = 1ns
+COCOTB_HDL_TIMEPRECISION = 1ps
+
+DUT      = axi_vfifo
+TOPLEVEL = $(DUT)
+MODULE   = test_$(DUT)
+VERILOG_SOURCES += ../../rtl/$(DUT).v
+VERILOG_SOURCES += ../../rtl/axi_vfifo_raw.v
+VERILOG_SOURCES += ../../rtl/axi_vfifo_raw_wr.v
+VERILOG_SOURCES += ../../rtl/axi_vfifo_raw_rd.v
+VERILOG_SOURCES += ../../rtl/axi_vfifo_enc.v
+VERILOG_SOURCES += ../../rtl/axi_vfifo_dec.v
+
+# module parameters
+export PARAM_AXI_CH := 2
+export PARAM_AXI_DATA_WIDTH := 512
+export PARAM_AXI_ADDR_WIDTH := 16
+export PARAM_AXI_STRB_WIDTH := $(shell expr $(PARAM_AXI_DATA_WIDTH) / 8 )
+export PARAM_AXI_ID_WIDTH := 8
+export PARAM_AXI_MAX_BURST_LEN := 16
+export PARAM_AXIS_DATA_WIDTH := $(PARAM_AXI_DATA_WIDTH)
+export PARAM_AXIS_KEEP_ENABLE := $(shell expr $(PARAM_AXIS_DATA_WIDTH) \> 8 )
+export PARAM_AXIS_KEEP_WIDTH := $(shell expr $(PARAM_AXIS_DATA_WIDTH) / 8 )
+export PARAM_AXIS_LAST_ENABLE := 1
+export PARAM_AXIS_ID_ENABLE := 1
+export PARAM_AXIS_ID_WIDTH := 8
+export PARAM_AXIS_DEST_ENABLE := 1
+export PARAM_AXIS_DEST_WIDTH := 8
+export PARAM_AXIS_USER_ENABLE := 1
+export PARAM_AXIS_USER_WIDTH := 1
+export PARAM_LEN_WIDTH := $(PARAM_AXI_ADDR_WIDTH)
+export PARAM_MAX_SEG_WIDTH := 256
+export PARAM_WRITE_FIFO_DEPTH := 64
+export PARAM_WRITE_MAX_BURST_LEN := $(shell expr $(PARAM_WRITE_FIFO_DEPTH) / 4 )
+export PARAM_READ_FIFO_DEPTH := 128
+export PARAM_READ_MAX_BURST_LEN := $(PARAM_WRITE_MAX_BURST_LEN)
+
+ifeq ($(SIM), icarus)
+	PLUSARGS += -fst
+
+	COMPILE_ARGS += $(foreach v,$(filter PARAM_%,$(.VARIABLES)),-P $(TOPLEVEL).$(subst PARAM_,,$(v))=$($(v)))
+
+	ifeq ($(WAVES), 1)
+		VERILOG_SOURCES += iverilog_dump.v
+		COMPILE_ARGS += -s iverilog_dump
+	endif
+else ifeq ($(SIM), verilator)
+	COMPILE_ARGS += -Wno-SELRANGE -Wno-WIDTH -Wno-CASEINCOMPLETE
+
+	COMPILE_ARGS += $(foreach v,$(filter PARAM_%,$(.VARIABLES)),-G$(subst PARAM_,,$(v))=$($(v)))
+
+	ifeq ($(WAVES), 1)
+		COMPILE_ARGS += --trace-fst
+	endif
+endif
+
+include $(shell cocotb-config --makefiles)/Makefile.sim
+
+iverilog_dump.v:
+	echo 'module iverilog_dump();' > $@
+	echo 'initial begin' >> $@
+	echo '    $$dumpfile("$(TOPLEVEL).fst");' >> $@
+	echo '    $$dumpvars(0, $(TOPLEVEL));' >> $@
+	echo 'end' >> $@
+	echo 'endmodule' >> $@
+
+clean::
+	@rm -rf iverilog_dump.v
+	@rm -rf dump.fst $(TOPLEVEL).fst
diff --git a/tb/axi_vfifo/test_axi_vfifo.py b/tb/axi_vfifo/test_axi_vfifo.py
new file mode 100644
index 0000000..c6a6643
--- /dev/null
+++ b/tb/axi_vfifo/test_axi_vfifo.py
@@ -0,0 +1,685 @@
+"""
+
+Copyright (c) 2023 Alex Forencich
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+"""
+
+import itertools
+import logging
+import os
+import random
+
+import cocotb_test.simulator
+import pytest
+
+import cocotb
+
+from cocotb.clock import Clock
+from cocotb.triggers import RisingEdge
+from cocotb.regression import TestFactory
+
+from cocotbext.axi import AxiBus, AxiRam
+from cocotbext.axi import AxiStreamBus, AxiStreamFrame, AxiStreamSource, AxiStreamSink
+
+
+class TB(object):
+    def __init__(self, dut):
+        self.dut = dut
+
+        self.log = logging.getLogger("cocotb.tb")
+        self.log.setLevel(logging.DEBUG)
+
+        cocotb.start_soon(Clock(dut.clk, 8, units="ns").start())
+
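+        # the core, stream, and per-channel AXI interfaces run on separate clocks
+        # (8, 6, and 3 ns periods), exercising the DUT's clock domain crossings
+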
+        # streaming data in
+        cocotb.start_soon(Clock(dut.s_axis_clk, 6, units="ns").start())
+        self.source = AxiStreamSource(AxiStreamBus.from_prefix(dut, "s_axis"), dut.s_axis_clk, dut.s_axis_rst_out)
+
+        # streaming data out
+        cocotb.start_soon(Clock(dut.m_axis_clk, 6, units="ns").start())
+        self.sink = AxiStreamSink(AxiStreamBus.from_prefix(dut, "m_axis"), dut.m_axis_clk, dut.m_axis_rst_out)
+
+        # AXI interfaces
+        self.axi_ram = []
+        for ch in dut.axi_ch:
+            cocotb.start_soon(Clock(ch.ch_clk, 3, units="ns").start())
+            ram = AxiRam(AxiBus.from_prefix(ch.axi_vfifo_raw_inst, "m_axi"), ch.ch_clk, ch.ch_rst, size=2**16)
+            self.axi_ram.append(ram)
+
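+        # configuration inputs start out cleared; individual tests program the
+        # FIFO base address, size mask, and enable before sending traffic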
+        dut.cfg_fifo_base_addr.setimmediatevalue(0)
+        dut.cfg_fifo_size_mask.setimmediatevalue(0)
+        dut.cfg_enable.setimmediatevalue(0)
+        dut.cfg_reset.setimmediatevalue(0)
+
+    def set_idle_generator(self, generator=None):
+        if generator:
+            self.source.set_pause_generator(generator())
+            for ram in self.axi_ram:
+                ram.write_if.b_channel.set_pause_generator(generator())
+                ram.read_if.r_channel.set_pause_generator(generator())
+
+    def set_backpressure_generator(self, generator=None):
+        if generator:
+            self.sink.set_pause_generator(generator())
+            for ram in self.axi_ram:
+                ram.write_if.aw_channel.set_pause_generator(generator())
+                ram.write_if.w_channel.set_pause_generator(generator())
+                ram.read_if.ar_channel.set_pause_generator(generator())
+
+    def set_stream_idle_generator(self, generator=None):
+        if generator:
+            self.source.set_pause_generator(generator())
+
+    def set_stream_backpressure_generator(self, generator=None):
+        if generator:
+            self.sink.set_pause_generator(generator())
+
+    def set_axi_0_idle_generator(self, generator=None):
+        if generator:
+            self.axi_ram[0].write_if.b_channel.set_pause_generator(generator())
+            self.axi_ram[0].read_if.r_channel.set_pause_generator(generator())
+
+    def set_axi_0_backpressure_generator(self, generator=None):
+        if generator:
+            self.axi_ram[0].write_if.aw_channel.set_pause_generator(generator())
+            self.axi_ram[0].write_if.w_channel.set_pause_generator(generator())
+            self.axi_ram[0].read_if.ar_channel.set_pause_generator(generator())
+
+    def set_axi_idle_generator(self, generator=None):
+        if generator:
+            for ram in self.axi_ram:
+                ram.write_if.b_channel.set_pause_generator(generator())
+                ram.read_if.r_channel.set_pause_generator(generator())
+
+    def set_axi_backpressure_generator(self, generator=None):
+        if generator:
+            for ram in self.axi_ram:
+                ram.write_if.aw_channel.set_pause_generator(generator())
+                ram.write_if.w_channel.set_pause_generator(generator())
+                ram.read_if.ar_channel.set_pause_generator(generator())
+
+    async def reset(self):
+        self.dut.rst.setimmediatevalue(0)
+        self.dut.s_axis_rst.setimmediatevalue(0)
+        self.dut.m_axis_rst.setimmediatevalue(0)
+        for ram in self.axi_ram:
+            ram.write_if.reset.setimmediatevalue(0)
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+        self.dut.rst.value = 1
+        self.dut.s_axis_rst.value = 1
+        self.dut.m_axis_rst.value = 1
+        for ram in self.axi_ram:
+            ram.write_if.reset.value = 1
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+        self.dut.rst.value = 0
+        self.dut.s_axis_rst.value = 0
+        self.dut.m_axis_rst.value = 0
+        for ram in self.axi_ram:
+            ram.write_if.reset.value = 0
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+
+    async def reset_source(self):
+        self.dut.s_axis_rst.setimmediatevalue(0)
+        for k in range(10):
+            await RisingEdge(self.dut.s_axis_clk)
+        self.dut.s_axis_rst.value = 1
+        for k in range(10):
+            await RisingEdge(self.dut.s_axis_clk)
+        self.dut.s_axis_rst.value = 0
+        for k in range(10):
+            await RisingEdge(self.dut.s_axis_clk)
+
+    async def reset_sink(self):
+        self.dut.m_axis_rst.setimmediatevalue(0)
+        for k in range(10):
+            await RisingEdge(self.dut.m_axis_clk)
+        self.dut.m_axis_rst.value = 1
+        for k in range(10):
+            await RisingEdge(self.dut.m_axis_clk)
+        self.dut.m_axis_rst.value = 0
+        for k in range(10):
+            await RisingEdge(self.dut.m_axis_clk)
+
+    async def reset_axi_0(self):
+        self.axi_ram[0].write_if.reset.setimmediatevalue(0)
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+        self.axi_ram[0].write_if.reset.value = 1
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+        self.axi_ram[0].write_if.reset.value = 0
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+
+    async def reset_axi(self):
+        for ram in self.axi_ram:
+            ram.write_if.reset.setimmediatevalue(0)
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+        for ram in self.axi_ram:
+            ram.write_if.reset.value = 1
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+        for ram in self.axi_ram:
+            ram.write_if.reset.value = 0
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+
+    async def reset_cfg(self):
+        self.dut.cfg_reset.setimmediatevalue(0)
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+        self.dut.cfg_reset.value = 1
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+        self.dut.cfg_reset.value = 0
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+
+
+async def run_test(dut, payload_lengths=None, payload_data=None, space=False,
+        stream_idle_inserter=None, stream_backpressure_inserter=None,
+        axi_0_idle_inserter=None, axi_0_backpressure_inserter=None,
+        axi_idle_inserter=None, axi_backpressure_inserter=None):
+
+    tb = TB(dut)
+
+    id_count = 2**len(tb.source.bus.tid)
+
+    cur_id = 1
+
+    await tb.reset()
+
+    tb.set_stream_idle_generator(stream_idle_inserter)
+    tb.set_stream_backpressure_generator(stream_backpressure_inserter)
+    tb.set_axi_idle_generator(axi_idle_inserter)
+    tb.set_axi_backpressure_generator(axi_backpressure_inserter)
+    tb.set_axi_0_backpressure_generator(axi_0_backpressure_inserter)
+    tb.set_axi_0_idle_generator(axi_0_idle_inserter)
+
+    dut.cfg_fifo_base_addr.setimmediatevalue(0)
+    dut.cfg_fifo_size_mask.setimmediatevalue(2**16-1)
+    dut.cfg_enable.setimmediatevalue(1)
+
+    test_frames = []
+
+    for test_data in [payload_data(x) for x in payload_lengths()]:
+        test_frame = AxiStreamFrame(test_data)
+        test_frame.tid = cur_id
+        test_frame.tdest = cur_id
+
+        test_frames.append(test_frame)
+        await tb.source.send(test_frame)
+
+        cur_id = (cur_id + 1) % id_count
+
+        if space:
+            for k in range(1000):
+                await RisingEdge(dut.clk)
+
+                if dut.m_axis_tvalid.value.integer and dut.m_axis_tready.value.integer and dut.m_axis_tlast.value.integer:
+                    break
+
+    for test_frame in test_frames:
+        rx_frame = await tb.sink.recv()
+
+        assert rx_frame.tdata == test_frame.tdata
+        assert rx_frame.tid == test_frame.tid
+        assert rx_frame.tdest == test_frame.tdest
+        assert not rx_frame.tuser
+
+    assert tb.sink.empty()
+
+    await RisingEdge(dut.s_axis_clk)
+    await RisingEdge(dut.s_axis_clk)
+
+
+async def run_test_tuser_assert(dut):
+
+    tb = TB(dut)
+
+    byte_lanes = tb.source.byte_lanes
+
+    await tb.reset()
+
+    dut.cfg_fifo_base_addr.setimmediatevalue(0)
+    dut.cfg_fifo_size_mask.setimmediatevalue(2**16-1)
+    dut.cfg_enable.setimmediatevalue(1)
+
+    test_data = bytearray(itertools.islice(itertools.cycle(range(256)), 32*byte_lanes))
+    test_frame = AxiStreamFrame(test_data, tuser=1)
+    await tb.source.send(test_frame)
+
+    rx_frame = await tb.sink.recv()
+
+    assert rx_frame.tdata == test_data
+    assert rx_frame.tuser
+
+    assert tb.sink.empty()
+
+    await RisingEdge(dut.s_axis_clk)
+    await RisingEdge(dut.s_axis_clk)
+
+
+async def run_test_init_sink_pause(dut):
+
+    tb = TB(dut)
+
+    byte_lanes = tb.source.byte_lanes
+
+    await tb.reset()
+
+    dut.cfg_fifo_base_addr.setimmediatevalue(0)
+    dut.cfg_fifo_size_mask.setimmediatevalue(2**16-1)
+    dut.cfg_enable.setimmediatevalue(1)
+
+    tb.sink.pause = True
+
+    test_data = bytearray(itertools.islice(itertools.cycle(range(256)), 1024*byte_lanes))
+    test_frame = AxiStreamFrame(test_data)
+    await tb.source.send(test_frame)
+
+    for k in range(256):
+        await RisingEdge(dut.s_axis_clk)
+
+    tb.sink.pause = False
+
+    rx_frame = await tb.sink.recv()
+
+    assert rx_frame.tdata == test_data
+    assert not rx_frame.tuser
+
+    assert tb.sink.empty()
+
+    await RisingEdge(dut.s_axis_clk)
+    await RisingEdge(dut.s_axis_clk)
+
+
+async def run_test_init_sink_pause_reset(dut, reset_type=TB.reset):
+
+    tb = TB(dut)
+
+    byte_lanes = tb.source.byte_lanes
+
+    await tb.reset()
+
+    dut.cfg_fifo_base_addr.setimmediatevalue(0)
+    dut.cfg_fifo_size_mask.setimmediatevalue(2**16-1)
+    dut.cfg_enable.setimmediatevalue(1)
+
+    tb.sink.pause = True
+
+    test_data = bytearray(itertools.islice(itertools.cycle(range(256)), 1024*byte_lanes))
+    test_frame = AxiStreamFrame(test_data)
+    await tb.source.send(test_frame)
+
+    for k in range(256):
+        await RisingEdge(dut.s_axis_clk)
+
+    await reset_type(tb)
+
+    tb.sink.pause = False
+
+    for k in range(2048):
+        await RisingEdge(dut.s_axis_clk)
+
+    assert tb.sink.idle()
+    assert tb.sink.empty()
+
+    await tb.source.send(test_frame)
+
+    rx_frame = await tb.sink.recv()
+
+    assert rx_frame.tdata == test_data
+    assert not rx_frame.tuser
+
+    assert tb.sink.empty()
+
+    await RisingEdge(dut.s_axis_clk)
+    await RisingEdge(dut.s_axis_clk)
+
+
+async def run_test_shift_in_reset(dut, reset_type=TB.reset):
+
+    tb = TB(dut)
+
+    byte_lanes = tb.source.byte_lanes
+
+    await tb.reset()
+
+    dut.cfg_fifo_base_addr.setimmediatevalue(0)
+    dut.cfg_fifo_size_mask.setimmediatevalue(2**16-1)
+    dut.cfg_enable.setimmediatevalue(1)
+
+    test_data = bytearray(itertools.islice(itertools.cycle(range(256)), 1024*byte_lanes))
+    test_frame = AxiStreamFrame(test_data)
+    await tb.source.send(test_frame)
+
+    for k in range(256):
+        await RisingEdge(dut.s_axis_clk)
+
+    await reset_type(tb)
+
+    for k in range(2048):
+        await RisingEdge(dut.s_axis_clk)
+
+    assert tb.sink.idle()
+    assert tb.sink.empty()
+
+    await tb.source.send(test_frame)
+
+    rx_frame = await tb.sink.recv()
+
+    assert rx_frame.tdata == test_data
+    assert not rx_frame.tuser
+
+    assert tb.sink.empty()
+
+    await RisingEdge(dut.s_axis_clk)
+    await RisingEdge(dut.s_axis_clk)
+
+
+async def run_test_shift_out_reset(dut, reset_type=TB.reset):
+
+    tb = TB(dut)
+
+    byte_lanes = tb.source.byte_lanes
+
+    await tb.reset()
+
+    dut.cfg_fifo_base_addr.setimmediatevalue(0)
+    dut.cfg_fifo_size_mask.setimmediatevalue(2**16-1)
+    dut.cfg_enable.setimmediatevalue(1)
+
+    test_data = bytearray(itertools.islice(itertools.cycle(range(256)), 1024*byte_lanes))
+    test_frame = AxiStreamFrame(test_data)
+    await tb.source.send(test_frame)
+
+    await RisingEdge(dut.m_axis_tvalid)
+
+    for k in range(8):
+        await RisingEdge(dut.s_axis_clk)
+
+    await reset_type(tb)
+
+    for k in range(2048):
+        await RisingEdge(dut.s_axis_clk)
+
+    assert tb.sink.idle()
+    assert tb.sink.empty()
+
+    await tb.source.send(test_frame)
+
+    rx_frame = await tb.sink.recv()
+
+    assert rx_frame.tdata == test_data
+    assert not rx_frame.tuser
+
+    assert tb.sink.empty()
+
+    await RisingEdge(dut.s_axis_clk)
+    await RisingEdge(dut.s_axis_clk)
+
+
+async def run_test_overflow(dut):
+
+    tb = TB(dut)
+
+    await tb.reset()
+
+    dut.cfg_fifo_base_addr.setimmediatevalue(0)
+    dut.cfg_fifo_size_mask.setimmediatevalue(2**16-1)
+    dut.cfg_enable.setimmediatevalue(1)
+
+    tb.sink.pause = True
+
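+    # send a frame twice the size of the backing RAM while the sink is paused,
+    # so the FIFO can only drain once backpressure is released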
+    ram_size = 2**tb.axi_ram[0].write_if.address_width*len(tb.axi_ram)
+
+    test_data = bytearray(itertools.islice(itertools.cycle(range(256)), 2*ram_size))
+    test_frame = AxiStreamFrame(test_data)
+    await tb.source.send(test_frame)
+
+    for k in range(2048):
+        await RisingEdge(dut.s_axis_clk)
+
+    tb.sink.pause = False
+
+    rx_frame = await tb.sink.recv()
+
+    assert rx_frame.tdata == test_data
+    assert not rx_frame.tuser
+
+    assert tb.sink.empty()
+
+    await RisingEdge(dut.s_axis_clk)
+    await RisingEdge(dut.s_axis_clk)
+
+
+async def run_stress_test(dut, space=False,
+        stream_idle_inserter=None, stream_backpressure_inserter=None,
+        axi_0_idle_inserter=None, axi_0_backpressure_inserter=None,
+        axi_idle_inserter=None, axi_backpressure_inserter=None):
+
+    tb = TB(dut)
+
+    byte_lanes = tb.source.byte_lanes
+    id_count = 2**len(tb.source.bus.tid)
+
+    cur_id = 1
+
+    await tb.reset()
+
+    dut.cfg_fifo_base_addr.setimmediatevalue(0)
+    dut.cfg_fifo_size_mask.setimmediatevalue(2**16-1)
+    dut.cfg_enable.setimmediatevalue(1)
+
+    tb.set_stream_idle_generator(stream_idle_inserter)
+    tb.set_stream_backpressure_generator(stream_backpressure_inserter)
+    tb.set_axi_idle_generator(axi_idle_inserter)
+    tb.set_axi_backpressure_generator(axi_backpressure_inserter)
+    tb.set_axi_0_backpressure_generator(axi_0_backpressure_inserter)
+    tb.set_axi_0_idle_generator(axi_0_idle_inserter)
+
+    test_frames = []
+
+    for k in range(128):
+        length = random.randint(1, byte_lanes*16)
+        test_data = bytearray(itertools.islice(itertools.cycle(range(256)), length))
+        test_frame = AxiStreamFrame(test_data)
+        test_frame.tid = cur_id
+        test_frame.tdest = cur_id
+
+        test_frames.append(test_frame)
+        await tb.source.send(test_frame)
+
+        cur_id = (cur_id + 1) % id_count
+
+        if space:
+            for k in range(1000):
+                await RisingEdge(dut.clk)
+
+                if dut.m_axis_tvalid.value.integer and dut.m_axis_tready.value.integer and dut.m_axis_tlast.value.integer:
+                    break
+
+    for test_frame in test_frames:
+        rx_frame = await tb.sink.recv()
+
+        assert rx_frame.tdata == test_frame.tdata
+        assert rx_frame.tid == test_frame.tid
+        assert rx_frame.tdest == test_frame.tdest
+        assert not rx_frame.tuser
+
+    assert tb.sink.empty()
+
+    await RisingEdge(dut.s_axis_clk)
+    await RisingEdge(dut.s_axis_clk)
+
+
+def cycle_pause():
+    return itertools.cycle([1, 1, 1, 0])
+
+
+def size_list():
+    data_width = len(cocotb.top.m_axis_tdata)
+    byte_width = data_width // 8
+    return list(range(1, byte_width*4+1))+list(range(byte_width, byte_width*32, byte_width))+[2**14]+[1]*64
+
+
+def incrementing_payload(length):
+    return bytearray(itertools.islice(itertools.cycle(range(256)), length))
+
+
+if cocotb.SIM_NAME:
+
+    factory = TestFactory(run_test)
+    factory.add_option("payload_lengths", [size_list])
+    factory.add_option("payload_data", [incrementing_payload])
+    factory.add_option(("space",
+            "stream_idle_inserter", "stream_backpressure_inserter",
+            "axi_0_idle_inserter", "axi_0_backpressure_inserter",
+            "axi_idle_inserter", "axi_backpressure_inserter"), [
+        (False, None, None, None, None, None, None),
+        (False, cycle_pause, None, None, None, None, None),
+        (False, None, cycle_pause, None, None, None, None),
+        (False, None, None, cycle_pause, None, None, None),
+        (False, None, None, None, cycle_pause, None, None),
+        (False, None, None, None, None, cycle_pause, None),
+        (False, None, None, None, None, None, cycle_pause),
+        (True,  None, None, None, None, None, None),
+        (True,  cycle_pause, None, None, None, None, None),
+        (True,  None, cycle_pause, None, None, None, None),
+        (True,  None, None, cycle_pause, None, None, None),
+        (True,  None, None, None, cycle_pause, None, None),
+        (True,  None, None, None, None, cycle_pause, None),
+        (True,  None, None, None, None, None, cycle_pause),
+    ])
+    factory.generate_tests()
+
+    for test in [
+                run_test_tuser_assert,
+                run_test_init_sink_pause,
+                run_test_overflow
+            ]:
+
+        factory = TestFactory(test)
+        factory.generate_tests()
+
+    for test in [
+                run_test_init_sink_pause_reset,
+                run_test_shift_in_reset,
+                run_test_shift_out_reset,
+            ]:
+
+        factory = TestFactory(test)
+        factory.add_option("reset_type", [TB.reset, TB.reset_source,
+                TB.reset_sink, TB.reset_axi_0, TB.reset_axi, TB.reset_cfg])
+        factory.generate_tests()
+
+    factory = TestFactory(run_stress_test)
+    factory.add_option(("space",
+            "stream_idle_inserter", "stream_backpressure_inserter",
+            "axi_0_idle_inserter", "axi_0_backpressure_inserter",
+            "axi_idle_inserter", "axi_backpressure_inserter"), [
+        (False, None, None, None, None, None, None),
+        (False, cycle_pause, None, None, None, None, None),
+        (False, None, cycle_pause, None, None, None, None),
+        (False, None, None, cycle_pause, None, None, None),
+        (False, None, None, None, cycle_pause, None, None),
+        (False, None, None, None, None, cycle_pause, None),
+        (False, None, None, None, None, None, cycle_pause),
+        (True,  None, None, None, None, None, None),
+        (True,  cycle_pause, None, None, None, None, None),
+        (True,  None, cycle_pause, None, None, None, None),
+        (True,  None, None, cycle_pause, None, None, None),
+        (True,  None, None, None, cycle_pause, None, None),
+        (True,  None, None, None, None, cycle_pause, None),
+        (True,  None, None, None, None, None, cycle_pause),
+    ])
+    factory.generate_tests()
+
+
+# cocotb-test
+
+tests_dir = os.path.abspath(os.path.dirname(__file__))
+rtl_dir = os.path.abspath(os.path.join(tests_dir, '..', '..', 'rtl'))
+
+
+@pytest.mark.parametrize(("axis_data_width", "axi_ch", "axi_data_width"), [
+            # (32, 1, 32),
+            # (32, 2, 32),
+            (512, 2, 512),
+        ])
+def test_axi_vfifo(request, axis_data_width, axi_ch, axi_data_width):
+    dut = "axi_vfifo"
+    module = os.path.splitext(os.path.basename(__file__))[0]
+    toplevel = dut
+
+    verilog_sources = [
+        os.path.join(rtl_dir, f"{dut}.v"),
+        os.path.join(rtl_dir, "axi_vfifo_raw.v"),
+        os.path.join(rtl_dir, "axi_vfifo_raw_wr.v"),
+        os.path.join(rtl_dir, "axi_vfifo_raw_rd.v"),
+        os.path.join(rtl_dir, "axi_vfifo_enc.v"),
+        os.path.join(rtl_dir, "axi_vfifo_dec.v"),
+    ]
+
+    parameters = {}
+
+    parameters['AXI_CH'] = axi_ch
+    parameters['AXI_DATA_WIDTH'] = axi_data_width
+    parameters['AXI_ADDR_WIDTH'] = 16
+    parameters['AXI_STRB_WIDTH'] = parameters['AXI_DATA_WIDTH'] // 8
+    parameters['AXI_ID_WIDTH'] = 8
+    parameters['AXI_MAX_BURST_LEN'] = 16
+    parameters['AXIS_DATA_WIDTH'] = axis_data_width
+    parameters['AXIS_KEEP_ENABLE'] = int(parameters['AXIS_DATA_WIDTH'] > 8)
+    parameters['AXIS_KEEP_WIDTH'] = parameters['AXIS_DATA_WIDTH'] // 8
+    parameters['AXIS_LAST_ENABLE'] = 1
+    parameters['AXIS_ID_ENABLE'] = 1
+    parameters['AXIS_ID_WIDTH'] = 8
+    parameters['AXIS_DEST_ENABLE'] = 1
+    parameters['AXIS_DEST_WIDTH'] = 8
+    parameters['AXIS_USER_ENABLE'] = 1
+    parameters['AXIS_USER_WIDTH'] = 1
+    parameters['LEN_WIDTH'] = parameters['AXI_ADDR_WIDTH']
+    parameters['MAX_SEG_WIDTH'] = 256
+    parameters['WRITE_FIFO_DEPTH'] = 64
+    parameters['WRITE_MAX_BURST_LEN'] = parameters['WRITE_FIFO_DEPTH'] // 4
+    parameters['READ_FIFO_DEPTH'] = 128
+    parameters['READ_MAX_BURST_LEN'] = parameters['WRITE_MAX_BURST_LEN']
+
+    extra_env = {f'PARAM_{k}': str(v) for k, v in parameters.items()}
+
+    sim_build = os.path.join(tests_dir, "sim_build",
+        request.node.name.replace('[', '-').replace(']', ''))
+
+    cocotb_test.simulator.run(
+        python_search=[tests_dir],
+        verilog_sources=verilog_sources,
+        toplevel=toplevel,
+        module=module,
+        parameters=parameters,
+        sim_build=sim_build,
+        extra_env=extra_env,
+    )
diff --git a/tb/axi_vfifo_dec/Makefile b/tb/axi_vfifo_dec/Makefile
new file mode 100644
index 0000000..41db651
--- /dev/null
+++ b/tb/axi_vfifo_dec/Makefile
@@ -0,0 +1,79 @@
+# Copyright (c) 2023 Alex Forencich
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+TOPLEVEL_LANG = verilog
+
+SIM ?= icarus
+WAVES ?= 0
+
+COCOTB_HDL_TIMEUNIT = 1ns
+COCOTB_HDL_TIMEPRECISION = 1ps
+
+DUT      = axi_vfifo_dec
+TOPLEVEL = $(DUT)
+MODULE   = test_$(DUT)
+VERILOG_SOURCES += ../../rtl/$(DUT).v
+
+# module parameters
+export PARAM_SEG_WIDTH := 256
+export PARAM_SEG_CNT := 4
+export PARAM_AXIS_DATA_WIDTH := $(shell expr $(PARAM_SEG_WIDTH) \* $(PARAM_SEG_CNT) / 2 )
+export PARAM_AXIS_KEEP_ENABLE := $(shell expr $(PARAM_AXIS_DATA_WIDTH) \> 8 )
+export PARAM_AXIS_KEEP_WIDTH := $(shell expr $(PARAM_AXIS_DATA_WIDTH) / 8 )
+export PARAM_AXIS_LAST_ENABLE := 1
+export PARAM_AXIS_ID_ENABLE := 1
+export PARAM_AXIS_ID_WIDTH := 8
+export PARAM_AXIS_DEST_ENABLE := 1
+export PARAM_AXIS_DEST_WIDTH := 8
+export PARAM_AXIS_USER_ENABLE := 1
+export PARAM_AXIS_USER_WIDTH := 1
+
+ifeq ($(SIM), icarus)
+	PLUSARGS += -fst
+
+	COMPILE_ARGS += $(foreach v,$(filter PARAM_%,$(.VARIABLES)),-P $(TOPLEVEL).$(subst PARAM_,,$(v))=$($(v)))
+
+	ifeq ($(WAVES), 1)
+		VERILOG_SOURCES += iverilog_dump.v
+		COMPILE_ARGS += -s iverilog_dump
+	endif
+else ifeq ($(SIM), verilator)
+	COMPILE_ARGS += -Wno-SELRANGE -Wno-WIDTH -Wno-CASEINCOMPLETE
+
+	COMPILE_ARGS += $(foreach v,$(filter PARAM_%,$(.VARIABLES)),-G$(subst PARAM_,,$(v))=$($(v)))
+
+	ifeq ($(WAVES), 1)
+		COMPILE_ARGS += --trace-fst
+	endif
+endif
+
+include $(shell cocotb-config --makefiles)/Makefile.sim
+
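+# generate a dump module so Icarus Verilog writes an FST waveform when WAVES=1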
+iverilog_dump.v:
+	echo 'module iverilog_dump();' > $@
+	echo 'initial begin' >> $@
+	echo '    $$dumpfile("$(TOPLEVEL).fst");' >> $@
+	echo '    $$dumpvars(0, $(TOPLEVEL));' >> $@
+	echo 'end' >> $@
+	echo 'endmodule' >> $@
+
+clean::
+	@rm -rf iverilog_dump.v
+	@rm -rf dump.fst $(TOPLEVEL).fst
diff --git a/tb/axi_vfifo_dec/test_axi_vfifo_dec.py b/tb/axi_vfifo_dec/test_axi_vfifo_dec.py
new file mode 100644
index 0000000..a755c03
--- /dev/null
+++ b/tb/axi_vfifo_dec/test_axi_vfifo_dec.py
@@ -0,0 +1,394 @@
+"""
+
+Copyright (c) 2023 Alex Forencich
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+"""
+
+import itertools
+import logging
+import os
+
+import cocotb_test.simulator
+import pytest
+
+import cocotb
+
+from cocotb.queue import Queue
+from cocotb.clock import Clock
+from cocotb.triggers import RisingEdge
+from cocotb.regression import TestFactory
+from cocotb_bus.bus import Bus
+
+from cocotbext.axi import AxiStreamBus, AxiStreamFrame, AxiStreamSink
+
+
+class BaseBus(Bus):
+
+    _signals = ["data"]
+    _optional_signals = []
+
+    def __init__(self, entity=None, prefix=None, **kwargs):
+        super().__init__(entity, prefix, self._signals, optional_signals=self._optional_signals, **kwargs)
+
+    @classmethod
+    def from_entity(cls, entity, **kwargs):
+        return cls(entity, **kwargs)
+
+    @classmethod
+    def from_prefix(cls, entity, prefix, **kwargs):
+        return cls(entity, prefix, **kwargs)
+
+
+class DataBus(BaseBus):
+    _signals = ["data", "valid", "ready"]
+
+
+class DataSource:
+
+    def __init__(self, bus, clock, reset=None, *args, **kwargs):
+        self.bus = bus
+        self.clock = clock
+        self.reset = reset
+        self.log = logging.getLogger(f"cocotb.{bus._entity._name}.{bus._name}")
+
+        self.pause = False
+        self._pause_generator = None
+        self._pause_cr = None
+
+        self.width = len(self.bus.data)
+        self.byte_size = 8
+        self.byte_lanes = self.width // self.byte_size
+
+        self.seg_count = len(self.bus.valid)
+        self.seg_data_width = self.width // self.seg_count
+        self.seg_byte_lanes = self.seg_data_width // self.byte_size
+
+        self.seg_data_mask = 2**self.seg_data_width-1
+
+        # queue per segment
+        self.queue = [Queue() for x in range(self.seg_count)]
+
+        self.bus.data.setimmediatevalue(0)
+        self.bus.valid.setimmediatevalue(0)
+
+        cocotb.start_soon(self._run())
+
+    def set_pause_generator(self, generator=None):
+        if self._pause_cr is not None:
+            self._pause_cr.kill()
+            self._pause_cr = None
+
+        self._pause_generator = generator
+
+        if self._pause_generator is not None:
+            self._pause_cr = cocotb.start_soon(self._run_pause())
+
+    def clear_pause_generator(self):
+        self.set_pause_generator(None)
+
+    def empty(self):
+        for queue in self.queue:
+            if not queue.empty():
+                return False
+        return True
+
+    def clear(self):
+        for queue in self.queue:
+            while not queue.empty():
+                _ = queue.get_nowait()
+
+    async def write(self, data):
+        self.write_nowait(data)
+
+    def write_nowait(self, data):
+        data = bytearray(data)
+
+        # pad to interface width
+        if len(data) % self.byte_lanes:
+            data.extend(b'\x00'*(self.byte_lanes - (len(data) % self.byte_lanes)))
+
+        # stripe across segment queues
+        index = 0
+        for offset in range(0, len(data), self.seg_byte_lanes):
+            self.queue[index].put_nowait(data[offset:offset+self.seg_byte_lanes])
+            index = (index + 1) % self.seg_count
+
+    async def _run(self):
+        data = 0
+        valid = 0
+
+        clock_edge_event = RisingEdge(self.clock)
+
+        while True:
+            await clock_edge_event
+
+            ready_sample = self.bus.ready.value.integer
+
+            if self.reset is not None and self.reset.value:
+                self.bus.valid.setimmediatevalue(0)
+                valid = 0
+                continue
+
+            # process segments
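+            # each segment advances independently: drive new data when the
+            # previous beat was accepted or the segment is currently idle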
+            for seg in range(self.seg_count):
+                seg_mask = 1 << seg
+                if ((ready_sample & seg_mask) or not (valid & seg_mask)):
+                    if not self.queue[seg].empty() and not self.pause:
+                        d = self.queue[seg].get_nowait()
+                        data &= ~(self.seg_data_mask << self.seg_data_width*seg)
+                        data |= int.from_bytes(d, 'little') << self.seg_data_width*seg
+                        valid |= seg_mask
+
+                        self.log.info("TX seg: %d data: %s", seg, d)
+                    else:
+                        valid = valid & ~seg_mask
+
+            self.bus.data.value = data
+            self.bus.valid.value = valid
+
+    async def _run_pause(self):
+        clock_edge_event = RisingEdge(self.clock)
+
+        for val in self._pause_generator:
+            self.pause = val
+            await clock_edge_event
+
+
+class TB(object):
+    def __init__(self, dut):
+        self.dut = dut
+
+        self.log = logging.getLogger("cocotb.tb")
+        self.log.setLevel(logging.DEBUG)
+
+        cocotb.start_soon(Clock(dut.clk, 10, units="ns").start())
+
+        # streaming data in
+        self.source = DataSource(DataBus.from_prefix(dut, "input"), dut.clk, dut.rst)
+        self.source_ctrl = DataSource(DataBus.from_prefix(dut, "input_ctrl"), dut.clk, dut.rst)
+
+        # streaming data out
+        self.sink = AxiStreamSink(AxiStreamBus.from_prefix(dut, "m_axis"), dut.clk, dut.rst)
+
+        dut.fifo_rst_in.setimmediatevalue(0)
+
+    def set_idle_generator(self, generator=None):
+        if generator:
+            self.source_ctrl.set_pause_generator(generator())
+
+    def set_backpressure_generator(self, generator=None):
+        if generator:
+            self.sink.set_pause_generator(generator())
+
+    async def reset(self):
+        self.dut.rst.setimmediatevalue(0)
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+        self.dut.rst.value = 1
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+        self.dut.rst.value = 0
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+
+    async def reset_fifo(self):
+        self.dut.fifo_rst_in.setimmediatevalue(0)
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+        self.dut.fifo_rst_in.value = 1
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+        self.dut.fifo_rst_in.value = 0
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+
+
+async def run_test(dut, payload_lengths=None, payload_data=None, pack=False, idle_inserter=None, backpressure_inserter=None):
+
+    tb = TB(dut)
+
+    id_width = len(tb.sink.bus.tid)
+    dest_width = len(tb.sink.bus.tdest)
+    user_width = len(tb.sink.bus.tuser)
+
+    seg_cnt = tb.source.seg_count
+    seg_byte_lanes = tb.source.seg_byte_lanes
+
+    max_block_size = seg_byte_lanes*seg_cnt*16
+
+    meta_id_offset = 0
+    meta_dest_offset = meta_id_offset + id_width
+    meta_user_offset = meta_dest_offset + dest_width
+    meta_width = meta_user_offset + user_width
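+    # header is a 16-bit control field plus the packed sideband metadata, rounded up to whole bytes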
+    hdr_size = (16 + meta_width + 7) // 8
+
+    id_count = 2**id_width
+
+    cur_id = 1
+
+    await tb.reset()
+
+    tb.set_idle_generator(idle_inserter)
+    tb.set_backpressure_generator(backpressure_inserter)
+
+    test_frames = []
+
+    packed_data = bytearray()
+
+    for test_data in [payload_data(x) for x in payload_lengths()]:
+        test_frame = AxiStreamFrame(test_data)
+        test_frame.tid = cur_id
+        test_frame.tdest = cur_id
+        test_frame.tuser = 0
+
+        # encode frame
+        test_frame_enc = bytearray()
+
+        for offset in range(0, len(test_data), max_block_size):
+            block = test_data[offset:offset+max_block_size]
+            block_enc = bytearray()
+
+            meta = test_frame.tid << meta_id_offset
+            meta |= (test_frame.tdest) << meta_dest_offset
+            meta |= (test_frame.tuser) << meta_user_offset
+
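+            # header bit layout (as packed below): bit 0 = block valid,
+            # bit 1 = last block of frame, bit 2 = parity over bits [1:0],
+            # bit 3 = parity over bits [15:4], bits [15:4] = block length - 1,
+            # bits [16+] = tid/tdest/tuser metadata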
+            # pack header
+            hdr = 0x1
+            if offset+len(block) >= len(test_data):
+                # last block
+                hdr |= 0x2
+            hdr |= (len(block)-1) << 4
+            hdr |= (bin(hdr & 0x0003).count("1") & 1) << 2
+            hdr |= (bin(hdr & 0xfff0).count("1") & 1) << 3
+            hdr |= meta << 16
+
+            # pack data
+            block_enc.extend(hdr.to_bytes(hdr_size, 'little'))
+            block_enc.extend(block)
+
+            # zero pad to segment size
+            if len(block_enc) % seg_byte_lanes:
+                block_enc.extend(b'\x00'*(seg_byte_lanes - (len(block_enc) % seg_byte_lanes)))
+
+            test_frame_enc.extend(block_enc)
+
+        if pack:
+            packed_data.extend(test_frame_enc)
+        else:
+            await tb.source.write(test_frame_enc)
+            await tb.source_ctrl.write(test_frame_enc)
+
+        test_frames.append(test_frame)
+
+        cur_id = (cur_id + 1) % id_count
+
+    if pack:
+        await tb.source.write(packed_data)
+        await tb.source_ctrl.write(packed_data)
+
+    for test_frame in test_frames:
+        rx_frame = await tb.sink.recv()
+
+        assert rx_frame.tdata == test_frame.tdata
+        assert rx_frame.tid == test_frame.tid
+        assert rx_frame.tdest == test_frame.tdest
+        assert not rx_frame.tuser
+
+    assert tb.sink.empty()
+
+    await RisingEdge(dut.clk)
+    await RisingEdge(dut.clk)
+
+
+def cycle_pause():
+    return itertools.cycle([1, 1, 1, 0])
+
+
+def size_list():
+    data_width = len(cocotb.top.m_axis_tdata)
+    byte_width = data_width // 8
+    return list(range(1, byte_width*4+1))+list(range(byte_width, 2**14, byte_width))+[1]*64
+
+
+def incrementing_payload(length):
+    return bytearray(itertools.islice(itertools.cycle(range(256)), length))
+
+
+if cocotb.SIM_NAME:
+
+    factory = TestFactory(run_test)
+    factory.add_option("payload_lengths", [size_list])
+    factory.add_option("payload_data", [incrementing_payload])
+    factory.add_option("pack", [False, True])
+    factory.add_option("idle_inserter", [None, cycle_pause])
+    factory.add_option("backpressure_inserter", [None, cycle_pause])
+    factory.generate_tests()
+
+
+# cocotb-test
+
+tests_dir = os.path.abspath(os.path.dirname(__file__))
+rtl_dir = os.path.abspath(os.path.join(tests_dir, '..', '..', 'rtl'))
+
+
+@pytest.mark.parametrize(("axis_data_width", "seg_width", "seg_cnt"), [
+            # (32, 32, 2),
+            # (64, 256, 4),
+            (512, 256, 4),
+        ])
+def test_axi_vfifo_dec(request, axis_data_width, seg_width, seg_cnt):
+    dut = "axi_vfifo_dec"
+    module = os.path.splitext(os.path.basename(__file__))[0]
+    toplevel = dut
+
+    verilog_sources = [
+        os.path.join(rtl_dir, f"{dut}.v"),
+    ]
+
+    parameters = {}
+
+    parameters['SEG_WIDTH'] = seg_width
+    parameters['SEG_CNT'] = seg_cnt
+    parameters['AXIS_DATA_WIDTH'] = axis_data_width
+    parameters['AXIS_KEEP_ENABLE'] = int(parameters['AXIS_DATA_WIDTH'] > 8)
+    parameters['AXIS_KEEP_WIDTH'] = parameters['AXIS_DATA_WIDTH'] // 8
+    parameters['AXIS_LAST_ENABLE'] = 1
+    parameters['AXIS_ID_ENABLE'] = 1
+    parameters['AXIS_ID_WIDTH'] = 8
+    parameters['AXIS_DEST_ENABLE'] = 1
+    parameters['AXIS_DEST_WIDTH'] = 8
+    parameters['AXIS_USER_ENABLE'] = 1
+    parameters['AXIS_USER_WIDTH'] = 1
+
+    extra_env = {f'PARAM_{k}': str(v) for k, v in parameters.items()}
+
+    sim_build = os.path.join(tests_dir, "sim_build",
+        request.node.name.replace('[', '-').replace(']', ''))
+
+    cocotb_test.simulator.run(
+        python_search=[tests_dir],
+        verilog_sources=verilog_sources,
+        toplevel=toplevel,
+        module=module,
+        parameters=parameters,
+        sim_build=sim_build,
+        extra_env=extra_env,
+    )
diff --git a/tb/axi_vfifo_enc/Makefile b/tb/axi_vfifo_enc/Makefile
new file mode 100644
index 0000000..2160446
--- /dev/null
+++ b/tb/axi_vfifo_enc/Makefile
@@ -0,0 +1,79 @@
+# Copyright (c) 2023 Alex Forencich
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+TOPLEVEL_LANG = verilog
+
+SIM ?= icarus
+WAVES ?= 0
+
+COCOTB_HDL_TIMEUNIT = 1ns
+COCOTB_HDL_TIMEPRECISION = 1ps
+
+DUT      = axi_vfifo_enc
+TOPLEVEL = $(DUT)
+MODULE   = test_$(DUT)
+VERILOG_SOURCES += ../../rtl/$(DUT).v
+
+# module parameters
+export PARAM_SEG_WIDTH := 256
+export PARAM_SEG_CNT := 4
+export PARAM_AXIS_DATA_WIDTH := $(shell expr $(PARAM_SEG_WIDTH) \* $(PARAM_SEG_CNT) / 2 )
+export PARAM_AXIS_KEEP_ENABLE := $(shell expr $(PARAM_AXIS_DATA_WIDTH) \> 8 )
+export PARAM_AXIS_KEEP_WIDTH := $(shell expr $(PARAM_AXIS_DATA_WIDTH) / 8 )
+export PARAM_AXIS_LAST_ENABLE := 1
+export PARAM_AXIS_ID_ENABLE := 1
+export PARAM_AXIS_ID_WIDTH := 8
+export PARAM_AXIS_DEST_ENABLE := 1
+export PARAM_AXIS_DEST_WIDTH := 8
+export PARAM_AXIS_USER_ENABLE := 1
+export PARAM_AXIS_USER_WIDTH := 1
+
+ifeq ($(SIM), icarus)
+	PLUSARGS += -fst
+
+	COMPILE_ARGS += $(foreach v,$(filter PARAM_%,$(.VARIABLES)),-P $(TOPLEVEL).$(subst PARAM_,,$(v))=$($(v)))
+
+	ifeq ($(WAVES), 1)
+		VERILOG_SOURCES += iverilog_dump.v
+		COMPILE_ARGS += -s iverilog_dump
+	endif
+else ifeq ($(SIM), verilator)
+	COMPILE_ARGS += -Wno-SELRANGE -Wno-WIDTH -Wno-CASEINCOMPLETE
+
+	COMPILE_ARGS += $(foreach v,$(filter PARAM_%,$(.VARIABLES)),-G$(subst PARAM_,,$(v))=$($(v)))
+
+	ifeq ($(WAVES), 1)
+		COMPILE_ARGS += --trace-fst
+	endif
+endif
+
+include $(shell cocotb-config --makefiles)/Makefile.sim
+
+iverilog_dump.v:
+	echo 'module iverilog_dump();' > $@
+	echo 'initial begin' >> $@
+	echo '    $$dumpfile("$(TOPLEVEL).fst");' >> $@
+	echo '    $$dumpvars(0, $(TOPLEVEL));' >> $@
+	echo 'end' >> $@
+	echo 'endmodule' >> $@
+
+clean::
+	@rm -rf iverilog_dump.v
+	@rm -rf dump.fst $(TOPLEVEL).fst
diff --git a/tb/axi_vfifo_enc/test_axi_vfifo_enc.py b/tb/axi_vfifo_enc/test_axi_vfifo_enc.py
new file mode 100644
index 0000000..a7915c5
--- /dev/null
+++ b/tb/axi_vfifo_enc/test_axi_vfifo_enc.py
@@ -0,0 +1,426 @@
+"""
+
+Copyright (c) 2023 Alex Forencich
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+"""
+
+import itertools
+import logging
+import os
+
+import cocotb_test.simulator
+import pytest
+
+import cocotb
+
+from cocotb.queue import Queue
+from cocotb.clock import Clock
+from cocotb.triggers import RisingEdge, Event
+from cocotb.regression import TestFactory
+from cocotb_bus.bus import Bus
+
+from cocotbext.axi import AxiStreamBus, AxiStreamFrame, AxiStreamSource
+
+
+class BaseBus(Bus):
+
+    _signals = ["data"]
+    _optional_signals = []
+
+    def __init__(self, entity=None, prefix=None, **kwargs):
+        super().__init__(entity, prefix, self._signals, optional_signals=self._optional_signals, **kwargs)
+
+    @classmethod
+    def from_entity(cls, entity, **kwargs):
+        return cls(entity, **kwargs)
+
+    @classmethod
+    def from_prefix(cls, entity, prefix, **kwargs):
+        return cls(entity, prefix, **kwargs)
+
+
+class DataBus(BaseBus):
+    _signals = ["data", "valid", "ready"]
+
+
+class DataSink:
+
+    def __init__(self, bus, clock, reset=None, watermark=None, *args, **kwargs):
+        self.bus = bus
+        self.clock = clock
+        self.reset = reset
+        self.watermark = watermark
+        self.log = logging.getLogger(f"cocotb.{bus._entity._name}.{bus._name}")
+
+        self.pause = False
+        self._pause_generator = None
+        self._pause_cr = None
+
+        self.enqueue_event = Event()
+
+        self.watermark_level = 0
+
+        self.width = len(self.bus.data)
+        self.byte_size = 8
+        self.byte_lanes = self.width // self.byte_size
+
+        self.seg_count = len(self.bus.valid)
+        self.seg_data_width = self.width // self.seg_count
+        self.seg_byte_lanes = self.seg_data_width // self.byte_size
+
+        self.seg_data_mask = 2**self.seg_data_width-1
+
+        # queue per segment
+        self.queue = [Queue() for x in range(self.seg_count)]
+
+        self.read_queue = bytearray()
+
+        self.bus.data.setimmediatevalue(0)
+        self.bus.valid.setimmediatevalue(0)
+
+        cocotb.start_soon(self._run())
+
+    def set_pause_generator(self, generator=None):
+        if self._pause_cr is not None:
+            self._pause_cr.kill()
+            self._pause_cr = None
+
+        self._pause_generator = generator
+
+        if self._pause_generator is not None:
+            self._pause_cr = cocotb.start_soon(self._run_pause())
+
+    def clear_pause_generator(self):
+        self.set_pause_generator(None)
+
+    def empty(self):
+        for queue in self.queue:
+            if not queue.empty():
+                return False
+        return True
+
+    def clear(self):
+        for queue in self.queue:
+            while not queue.empty():
+                _ = queue.get_nowait()
+        self.read_queue.clear()
+
+    def _read_queues(self):
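+        # move data into the byte-level read queue only when every segment
+        # queue has an entry, preserving the striping order across segments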
+        while True:
+            for queue in self.queue:
+                if queue.empty():
+                    return
+            for queue in self.queue:
+                self.read_queue.extend(queue.get_nowait())
+
+    async def read(self, count=-1):
+        self._read_queues()
+        while not self.read_queue:
+            self.enqueue_event.clear()
+            await self.enqueue_event.wait()
+            self._read_queues()
+        return self.read_nowait(count)
+
+    def read_nowait(self, count=-1):
+        self._read_queues()
+        if count < 0:
+            count = len(self.read_queue)
+        data = self.read_queue[:count]
+        del self.read_queue[:count]
+        return data
+
+    async def _run(self):
+        data_sample = 0
+        valid_sample = 0
+        ready = 0
+        watermark = 0
+
+        has_ready = self.bus.ready is not None
+        has_watermark = self.watermark is not None
+
+        clock_edge_event = RisingEdge(self.clock)
+
+        while True:
+            await clock_edge_event
+
+            valid_sample = self.bus.valid.value.integer
+
+            if valid_sample:
+                data_sample = self.bus.data.value.integer
+
+            if self.reset is not None and self.reset.value:
+                if has_ready:
+                    self.bus.ready.setimmediatevalue(0)
+                ready = 0
+                continue
+
+            # process segments
+            watermark = 0
+            for seg in range(self.seg_count):
+                seg_mask = 1 << seg
+                if (ready & seg_mask or not has_ready) and (valid_sample & seg_mask):
+                    data = (data_sample >> self.seg_data_width*seg) & self.seg_data_mask
+
+                    data = data.to_bytes(self.seg_byte_lanes, 'little')
+
+                    self.queue[seg].put_nowait(data)
+                    self.enqueue_event.set()
+
+                    self.log.info("RX seg: %d data: %s", seg, data)
+
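+                # raise the watermark output if any segment queue exceeds the configured level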
+                if has_watermark and self.watermark_level > 0 and self.queue[seg].qsize() > self.watermark_level:
+                    watermark = 1
+
+            ready = 2**self.seg_count-1
+
+            if self.pause:
+                ready = 0
+                watermark = 1
+
+            if has_ready:
+                self.bus.ready.value = ready
+            if has_watermark:
+                self.watermark.value = watermark
+
+    async def _run_pause(self):
+        clock_edge_event = RisingEdge(self.clock)
+
+        for val in self._pause_generator:
+            self.pause = val
+            await clock_edge_event
+
+
+class TB(object):
+    def __init__(self, dut):
+        self.dut = dut
+
+        self.log = logging.getLogger("cocotb.tb")
+        self.log.setLevel(logging.DEBUG)
+
+        cocotb.start_soon(Clock(dut.clk, 10, units="ns").start())
+
+        # streaming data in
+        self.source = AxiStreamSource(AxiStreamBus.from_prefix(dut, "s_axis"), dut.clk, dut.rst)
+
+        # streaming data out
+        self.sink = DataSink(DataBus.from_prefix(dut, "output"), dut.clk, dut.rst, dut.fifo_watermark_in)
+
+        dut.fifo_rst_in.value = 0
+
+    def set_idle_generator(self, generator=None):
+        if generator:
+            self.source.set_pause_generator(generator())
+
+    def set_backpressure_generator(self, generator=None):
+        if generator:
+            self.sink.set_pause_generator(generator())
+
+    async def reset(self):
+        self.dut.rst.setimmediatevalue(0)
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+        self.dut.rst.value = 1
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+        self.dut.rst.value = 0
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+
+    async def reset_fifo(self):
+        self.dut.fifo_rst_in.setimmediatevalue(0)
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+        self.dut.fifo_rst_in.value = 1
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+        self.dut.fifo_rst_in.value = 0
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+
+
+async def run_test(dut, payload_lengths=None, payload_data=None, space=False, idle_inserter=None, backpressure_inserter=None):
+
+    tb = TB(dut)
+
+    id_width = len(tb.source.bus.tid)
+    dest_width = len(tb.source.bus.tdest)
+    user_width = len(tb.source.bus.tuser)
+
+    seg_cnt = tb.sink.seg_count
+    seg_byte_lanes = tb.sink.seg_byte_lanes
+
+    meta_id_offset = 0
+    meta_dest_offset = meta_id_offset + id_width
+    meta_user_offset = meta_dest_offset + dest_width
+    meta_width = meta_user_offset + user_width
+    hdr_size = (16 + meta_width + 7) // 8
+
+    id_count = 2**id_width
+
+    cur_id = 1
+
+    await tb.reset()
+
+    tb.set_idle_generator(idle_inserter)
+    tb.set_backpressure_generator(backpressure_inserter)
+
+    test_frames = []
+
+    for test_data in [payload_data(x) for x in payload_lengths()]:
+        test_frame = AxiStreamFrame(test_data)
+        test_frame.tid = cur_id
+        test_frame.tdest = cur_id
+
+        await tb.source.send(test_frame)
+
+        test_frames.append(test_frame)
+
+        cur_id = (cur_id + 1) % id_count
+
+        if space:
+            for k in range(1000):
+                await RisingEdge(dut.clk)
+
+    for test_frame in test_frames:
+        rx_frame = AxiStreamFrame()
+        while True:
+            # read block
+            block = await tb.sink.read(seg_byte_lanes)
+            # print(block)
+
+            # extract header
+            hdr = int.from_bytes(block[0:hdr_size], 'little')
+            # print(hex(hdr))
+
+            # check parity bits
+            assert bool(hdr & 0x4) == bool(bin(hdr & 0x0003).count("1") & 1)
+            assert bool(hdr & 0x8) == bool(bin(hdr & 0xfff0).count("1") & 1)
+
+            if not hdr & 1:
+                # null block, skip
+                continue
+
+            length = ((hdr >> 4) & 0xfff)+1
+            meta = hdr >> 16
+
+            rx_frame.tid = (meta >> meta_id_offset) & (2**id_width-1)
+            rx_frame.tdest = (meta >> meta_dest_offset) & (2**dest_width-1)
+            rx_frame.tuser = (meta >> meta_user_offset) & (2**user_width-1)
+
+            data = block[hdr_size:]
+
+            while len(data) < length:
+                block = await tb.sink.read(seg_byte_lanes)
+                data.extend(block)
+
+            if len(data) >= length:
+                rx_frame.tdata.extend(data[0:length])
+
+                if hdr & 0x2:
+                    break
+
+        tb.log.info("RX frame: %s", rx_frame)
+
+        assert rx_frame == test_frame
+
+    # assert tb.sink.empty()
+
+    for k in range(1000):
+        await RisingEdge(dut.clk)
+
+    await RisingEdge(dut.clk)
+    await RisingEdge(dut.clk)
+
+
+def cycle_pause():
+    return itertools.cycle([1, 1, 1, 0])
+
+
+def size_list():
+    data_width = len(cocotb.top.s_axis_tdata)
+    byte_width = data_width // 8
+    return list(range(1, byte_width*4+1))+list(range(byte_width, 2**14, byte_width))+[1]*64
+
+
+def incrementing_payload(length):
+    return bytearray(itertools.islice(itertools.cycle(range(256)), length))
+
+
+if cocotb.SIM_NAME:
+
+    factory = TestFactory(run_test)
+    factory.add_option("payload_lengths", [size_list])
+    factory.add_option("payload_data", [incrementing_payload])
+    factory.add_option("space", [False, True])
+    factory.add_option("idle_inserter", [None, cycle_pause])
+    factory.add_option("backpressure_inserter", [None, cycle_pause])
+    factory.generate_tests()
+
+
+# cocotb-test
+
+tests_dir = os.path.abspath(os.path.dirname(__file__))
+rtl_dir = os.path.abspath(os.path.join(tests_dir, '..', '..', 'rtl'))
+
+
+@pytest.mark.parametrize(("axis_data_width", "seg_width", "seg_cnt"), [
+            # (32, 32, 2),
+            # (64, 256, 4),
+            (512, 256, 4),
+        ])
+def test_axi_vfifo_enc(request, axis_data_width, seg_width, seg_cnt):
+    dut = "axi_vfifo_enc"
+    module = os.path.splitext(os.path.basename(__file__))[0]
+    toplevel = dut
+
+    verilog_sources = [
+        os.path.join(rtl_dir, f"{dut}.v"),
+    ]
+
+    parameters = {}
+
+    parameters['SEG_WIDTH'] = seg_width
+    parameters['SEG_CNT'] = seg_cnt
+    parameters['AXIS_DATA_WIDTH'] = axis_data_width
+    parameters['AXIS_KEEP_ENABLE'] = int(parameters['AXIS_DATA_WIDTH'] > 8)
+    parameters['AXIS_KEEP_WIDTH'] = parameters['AXIS_DATA_WIDTH'] // 8
+    parameters['AXIS_LAST_ENABLE'] = 1
+    parameters['AXIS_ID_ENABLE'] = 1
+    parameters['AXIS_ID_WIDTH'] = 8
+    parameters['AXIS_DEST_ENABLE'] = 1
+    parameters['AXIS_DEST_WIDTH'] = 8
+    parameters['AXIS_USER_ENABLE'] = 1
+    parameters['AXIS_USER_WIDTH'] = 1
+
+    extra_env = {f'PARAM_{k}': str(v) for k, v in parameters.items()}
+
+    sim_build = os.path.join(tests_dir, "sim_build",
+        request.node.name.replace('[', '-').replace(']', ''))
+
+    cocotb_test.simulator.run(
+        python_search=[tests_dir],
+        verilog_sources=verilog_sources,
+        toplevel=toplevel,
+        module=module,
+        parameters=parameters,
+        sim_build=sim_build,
+        extra_env=extra_env,
+    )
diff --git a/tb/axi_vfifo_raw/Makefile b/tb/axi_vfifo_raw/Makefile
new file mode 100644
index 0000000..63c0880
--- /dev/null
+++ b/tb/axi_vfifo_raw/Makefile
@@ -0,0 +1,83 @@
+# Copyright (c) 2023 Alex Forencich
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+TOPLEVEL_LANG = verilog
+
+SIM ?= icarus
+WAVES ?= 0
+
+COCOTB_HDL_TIMEUNIT = 1ns
+COCOTB_HDL_TIMEPRECISION = 1ps
+
+DUT      = axi_vfifo_raw
+TOPLEVEL = $(DUT)
+MODULE   = test_$(DUT)
+VERILOG_SOURCES += ../../rtl/$(DUT).v
+VERILOG_SOURCES += ../../rtl/$(DUT)_wr.v
+VERILOG_SOURCES += ../../rtl/$(DUT)_rd.v
+
+# module parameters
+export PARAM_SEG_WIDTH := 32
+export PARAM_SEG_CNT := 1
+export PARAM_AXI_DATA_WIDTH := $(shell expr $(PARAM_SEG_WIDTH) \* $(PARAM_SEG_CNT) )
+export PARAM_AXI_ADDR_WIDTH := 16
+export PARAM_AXI_STRB_WIDTH := $(shell expr $(PARAM_AXI_DATA_WIDTH) / 8 )
+export PARAM_AXI_ID_WIDTH := 8
+export PARAM_AXI_MAX_BURST_LEN := 16
+export PARAM_LEN_WIDTH := $(PARAM_AXI_ADDR_WIDTH)
+export PARAM_WRITE_FIFO_DEPTH := 64
+export PARAM_WRITE_MAX_BURST_LEN := $(shell expr $(PARAM_WRITE_FIFO_DEPTH) / 4 )
+export PARAM_READ_FIFO_DEPTH := 128
+export PARAM_READ_MAX_BURST_LEN := $(PARAM_WRITE_MAX_BURST_LEN)
+export PARAM_WATERMARK_LEVEL := $(shell expr $(PARAM_WRITE_FIFO_DEPTH) / 2 )
+export PARAM_CTRL_OUT_EN := 0
+
+ifeq ($(SIM), icarus)
+	PLUSARGS += -fst
+
+	COMPILE_ARGS += $(foreach v,$(filter PARAM_%,$(.VARIABLES)),-P $(TOPLEVEL).$(subst PARAM_,,$(v))=$($(v)))
+
+	ifeq ($(WAVES), 1)
+		VERILOG_SOURCES += iverilog_dump.v
+		COMPILE_ARGS += -s iverilog_dump
+	endif
+else ifeq ($(SIM), verilator)
+	COMPILE_ARGS += -Wno-SELRANGE -Wno-WIDTH -Wno-CASEINCOMPLETE
+
+	COMPILE_ARGS += $(foreach v,$(filter PARAM_%,$(.VARIABLES)),-G$(subst PARAM_,,$(v))=$($(v)))
+
+	ifeq ($(WAVES), 1)
+		COMPILE_ARGS += --trace-fst
+	endif
+endif
+
+include $(shell cocotb-config --makefiles)/Makefile.sim
+
+iverilog_dump.v:
+	echo 'module iverilog_dump();' > $@
+	echo 'initial begin' >> $@
+	echo '    $$dumpfile("$(TOPLEVEL).fst");' >> $@
+	echo '    $$dumpvars(0, $(TOPLEVEL));' >> $@
+	echo 'end' >> $@
+	echo 'endmodule' >> $@
+
+clean::
+	@rm -rf iverilog_dump.v
+	@rm -rf dump.fst $(TOPLEVEL).fst
diff --git a/tb/axi_vfifo_raw/test_axi_vfifo_raw.py b/tb/axi_vfifo_raw/test_axi_vfifo_raw.py
new file mode 100644
index 0000000..2a57215
--- /dev/null
+++ b/tb/axi_vfifo_raw/test_axi_vfifo_raw.py
@@ -0,0 +1,804 @@
+"""
+
+Copyright (c) 2023 Alex Forencich
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+"""
+
+import itertools
+import logging
+import os
+
+import cocotb_test.simulator
+import pytest
+
+import cocotb
+
+from cocotb.queue import Queue
+from cocotb.clock import Clock
+from cocotb.triggers import RisingEdge, Event
+from cocotb.regression import TestFactory
+from cocotb_bus.bus import Bus
+
+from cocotbext.axi import AxiBus, AxiRam
+
+
+class BaseBus(Bus):
+
+    _signals = ["data"]
+    _optional_signals = []
+
+    def __init__(self, entity=None, prefix=None, **kwargs):
+        super().__init__(entity, prefix, self._signals, optional_signals=self._optional_signals, **kwargs)
+
+    @classmethod
+    def from_entity(cls, entity, **kwargs):
+        return cls(entity, **kwargs)
+
+    @classmethod
+    def from_prefix(cls, entity, prefix, **kwargs):
+        return cls(entity, prefix, **kwargs)
+
+
+class DataBus(BaseBus):
+    _signals = ["data", "valid", "ready"]
+
+
+class DataSource:
+
+    def __init__(self, bus, clock, reset=None, *args, **kwargs):
+        self.bus = bus
+        self.clock = clock
+        self.reset = reset
+        self.log = logging.getLogger(f"cocotb.{bus._entity._name}.{bus._name}")
+
+        self.pause = False
+        self._pause_generator = None
+        self._pause_cr = None
+
+        self.width = len(self.bus.data)
+        self.byte_size = 8
+        self.byte_lanes = self.width // self.byte_size
+
+        self.seg_count = len(self.bus.valid)
+        self.seg_data_width = self.width // self.seg_count
+        self.seg_byte_lanes = self.seg_data_width // self.byte_size
+
+        self.seg_data_mask = 2**self.seg_data_width-1
+
+        # queue per segment
+        self.queue = [Queue() for x in range(self.seg_count)]
+
+        self.bus.data.setimmediatevalue(0)
+        self.bus.valid.setimmediatevalue(0)
+
+        cocotb.start_soon(self._run())
+
+    def set_pause_generator(self, generator=None):
+        if self._pause_cr is not None:
+            self._pause_cr.kill()
+            self._pause_cr = None
+
+        self._pause_generator = generator
+
+        if self._pause_generator is not None:
+            self._pause_cr = cocotb.start_soon(self._run_pause())
+
+    def clear_pause_generator(self):
+        self.set_pause_generator(None)
+
+    def empty(self):
+        for queue in self.queue:
+            if not queue.empty():
+                return False
+        return True
+
+    def clear(self):
+        for queue in self.queue:
+            while not queue.empty():
+                _ = queue.get_nowait()
+
+    async def write(self, data):
+        self.write_nowait(data)
+
+    def write_nowait(self, data):
+        data = bytearray(data)
+
+        # pad to interface width
+        if len(data) % self.byte_lanes:
+            data.extend(b'\x00'*(self.byte_lanes - (len(data) % self.byte_lanes)))
+
+        # stripe across segment queues
+        index = 0
+        for offset in range(0, len(data), self.seg_byte_lanes):
+            self.queue[index].put_nowait(data[offset:offset+self.seg_byte_lanes])
+            index = (index + 1) % self.seg_count
+
+    async def _run(self):
+        data = 0
+        valid = 0
+        ready_sample = 0
+
+        clock_edge_event = RisingEdge(self.clock)
+
+        while True:
+            await clock_edge_event
+
+            ready_sample = self.bus.ready.value.integer
+
+            if self.reset is not None and self.reset.value:
+                self.bus.valid.setimmediatevalue(0)
+                valid = 0
+                self.clear()
+                continue
+
+            # process segments
+            for seg in range(self.seg_count):
+                seg_mask = 1 << seg
+                if ((ready_sample & seg_mask) or not (valid & seg_mask)):
+                    if not self.queue[seg].empty() and not self.pause:
+                        d = self.queue[seg].get_nowait()
+                        data &= ~(self.seg_data_mask << self.seg_data_width*seg)
+                        data |= int.from_bytes(d, 'little') << self.seg_data_width*seg
+                        valid |= seg_mask
+
+                        self.log.info("TX seg: %d data: %s", seg, d)
+                    else:
+                        valid = valid & ~seg_mask
+
+            self.bus.data.value = data
+            self.bus.valid.value = valid
+
+    async def _run_pause(self):
+        clock_edge_event = RisingEdge(self.clock)
+
+        for val in self._pause_generator:
+            self.pause = val
+            await clock_edge_event
+
+
+class DataSink:
+
+    def __init__(self, bus, clock, reset=None, watermark=None, *args, **kwargs):
+        self.bus = bus
+        self.clock = clock
+        self.reset = reset
+        self.watermark = watermark
+        self.log = logging.getLogger(f"cocotb.{bus._entity._name}.{bus._name}")
+
+        self.pause = False
+        self._pause_generator = None
+        self._pause_cr = None
+
+        self.enqueue_event = Event()
+
+        self.watermark_level = 0
+
+        self.width = len(self.bus.data)
+        self.byte_size = 8
+        self.byte_lanes = self.width // self.byte_size
+
+        self.seg_count = len(self.bus.valid)
+        self.seg_data_width = self.width // self.seg_count
+        self.seg_byte_lanes = self.seg_data_width // self.byte_size
+
+        self.seg_data_mask = 2**self.seg_data_width-1
+
+        # queue per segment
+        self.queue = [Queue() for x in range(self.seg_count)]
+
+        self.read_queue = bytearray()
+
+        self.bus.data.setimmediatevalue(0)
+        self.bus.valid.setimmediatevalue(0)
+
+        cocotb.start_soon(self._run())
+
+    def set_pause_generator(self, generator=None):
+        if self._pause_cr is not None:
+            self._pause_cr.kill()
+            self._pause_cr = None
+
+        self._pause_generator = generator
+
+        if self._pause_generator is not None:
+            self._pause_cr = cocotb.start_soon(self._run_pause())
+
+    def clear_pause_generator(self):
+        self.set_pause_generator(None)
+
+    def empty(self):
+        for queue in self.queue:
+            if not queue.empty():
+                return False
+        return True
+
+    def clear(self):
+        for queue in self.queue:
+            while not queue.empty():
+                _ = queue.get_nowait()
+        self.read_queue.clear()
+
+    def _read_queues(self):
+        while True:
+            for queue in self.queue:
+                if queue.empty():
+                    return
+            for queue in self.queue:
+                self.read_queue.extend(queue.get_nowait())
+
+    async def read(self, count=-1):
+        self._read_queues()
+        while not self.read_queue:
+            self.enqueue_event.clear()
+            await self.enqueue_event.wait()
+            self._read_queues()
+        return self.read_nowait(count)
+
+    def read_nowait(self, count=-1):
+        self._read_queues()
+        if count < 0:
+            count = len(self.read_queue)
+        data = self.read_queue[:count]
+        del self.read_queue[:count]
+        return data
+
+    async def _run(self):
+        data_sample = 0
+        valid_sample = 0
+        ready = 0
+        watermark = 0
+
+        has_ready = self.bus.ready is not None
+        has_watermark = self.watermark is not None
+
+        clock_edge_event = RisingEdge(self.clock)
+
+        while True:
+            await clock_edge_event
+
+            valid_sample = self.bus.valid.value.integer
+
+            if valid_sample:
+                data_sample = self.bus.data.value.integer
+
+            if self.reset is not None and self.reset.value:
+                if has_ready:
+                    self.bus.ready.setimmediatevalue(0)
+                ready = 0
+                continue
+
+            # process segments
+            watermark = 0
+            for seg in range(self.seg_count):
+                seg_mask = 1 << seg
+                if ready & valid_sample & seg_mask:
+                    data = (data_sample >> self.seg_data_width*seg) & self.seg_data_mask
+
+                    data = data.to_bytes(self.seg_byte_lanes, 'little')
+
+                    self.queue[seg].put_nowait(data)
+                    self.enqueue_event.set()
+
+                    self.log.info("RX seg: %d data: %s", seg, data)
+
+                if has_watermark and self.watermark_level > 0 and self.queue[seg].qsize() > self.watermark_level:
+                    watermark = 1
+
+            ready = 2**self.seg_count-1
+
+            if self.pause:
+                ready = 0
+                watermark = 1
+
+            if has_ready:
+                self.bus.ready.value = ready
+            if has_watermark:
+                self.watermark.value = watermark
+
+    async def _run_pause(self):
+        clock_edge_event = RisingEdge(self.clock)
+
+        for val in self._pause_generator:
+            self.pause = val
+            await clock_edge_event
+
+
+class TB(object):
+    def __init__(self, dut):
+        self.dut = dut
+
+        self.log = logging.getLogger("cocotb.tb")
+        self.log.setLevel(logging.DEBUG)
+
+        cocotb.start_soon(Clock(dut.clk, 10, units="ns").start())
+
+        # streaming data in
+        cocotb.start_soon(Clock(dut.input_clk, 10, units="ns").start())
+        self.source = DataSource(DataBus.from_prefix(dut, "input"), dut.input_clk, dut.input_rst_out)
+
+        # streaming data out
+        cocotb.start_soon(Clock(dut.output_clk, 10, units="ns").start())
+        self.sink = DataSink(DataBus.from_prefix(dut, "output"), dut.output_clk, dut.output_rst_out)
+
+        # AXI interface
+        self.axi_ram = AxiRam(AxiBus.from_prefix(dut, "m_axi"), dut.clk, dut.rst, size=2**16)
+
+        dut.rst_req_in.setimmediatevalue(0)
+
+        dut.cfg_fifo_base_addr.setimmediatevalue(0)
+        dut.cfg_fifo_size_mask.setimmediatevalue(0)
+        dut.cfg_enable.setimmediatevalue(0)
+        dut.cfg_reset.setimmediatevalue(0)
+
+    def set_stream_idle_generator(self, generator=None):
+        if generator:
+            self.source.set_pause_generator(generator())
+
+    def set_stream_backpressure_generator(self, generator=None):
+        if generator:
+            self.sink.set_pause_generator(generator())
+
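+    # AXI "idle" is modeled by pausing the RAM's response channels (B, R);
+    # "backpressure" by pausing its request channels (AW, W, AR)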
+    def set_axi_idle_generator(self, generator=None):
+        if generator:
+            self.axi_ram.write_if.b_channel.set_pause_generator(generator())
+            self.axi_ram.read_if.r_channel.set_pause_generator(generator())
+
+    def set_axi_backpressure_generator(self, generator=None):
+        if generator:
+            self.axi_ram.write_if.aw_channel.set_pause_generator(generator())
+            self.axi_ram.write_if.w_channel.set_pause_generator(generator())
+            self.axi_ram.read_if.ar_channel.set_pause_generator(generator())
+
+    async def reset(self):
+        self.dut.rst.setimmediatevalue(0)
+        self.dut.input_rst.setimmediatevalue(0)
+        self.dut.output_rst.setimmediatevalue(0)
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+        self.dut.rst.value = 1
+        self.dut.input_rst.value = 1
+        self.dut.output_rst.value = 1
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+        self.dut.rst.value = 0
+        self.dut.input_rst.value = 0
+        self.dut.output_rst.value = 0
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+
+    async def reset_axi(self):
+        self.dut.rst.setimmediatevalue(0)
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+        self.dut.rst.value = 1
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+        self.dut.rst.value = 0
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+
+    async def reset_source(self):
+        self.dut.input_rst.setimmediatevalue(0)
+        for k in range(10):
+            await RisingEdge(self.dut.input_clk)
+        self.dut.input_rst.value = 1
+        for k in range(10):
+            await RisingEdge(self.dut.input_clk)
+        self.dut.input_rst.value = 0
+        for k in range(10):
+            await RisingEdge(self.dut.input_clk)
+
+    async def reset_sink(self):
+        self.dut.output_rst.setimmediatevalue(0)
+        for k in range(10):
+            await RisingEdge(self.dut.output_clk)
+        self.dut.output_rst.value = 1
+        for k in range(10):
+            await RisingEdge(self.dut.output_clk)
+        self.dut.output_rst.value = 0
+        for k in range(10):
+            await RisingEdge(self.dut.output_clk)
+
+    async def reset_cfg(self):
+        self.dut.cfg_reset.setimmediatevalue(0)
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+        self.dut.cfg_reset.value = 1
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+        self.dut.cfg_reset.value = 0
+        for k in range(10):
+            await RisingEdge(self.dut.clk)
+
+
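+# basic data transfer test: stream a series of frames through the virtual FIFO
+# and check that they come out unmodified, optionally with idle cycles and
+# backpressure applied on the stream and AXI interfaces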
+async def run_test(dut, payload_lengths=None, payload_data=None, space=False,
+        stream_idle_inserter=None, stream_backpressure_inserter=None,
+        axi_idle_inserter=None, axi_backpressure_inserter=None):
+
+    tb = TB(dut)
+
+    byte_lanes = tb.source.byte_lanes
+
+    await tb.reset()
+
+    tb.set_stream_idle_generator(stream_idle_inserter)
+    tb.set_stream_backpressure_generator(stream_backpressure_inserter)
+    tb.set_axi_idle_generator(axi_idle_inserter)
+    tb.set_axi_backpressure_generator(axi_backpressure_inserter)
+
+    dut.cfg_fifo_base_addr.setimmediatevalue(0)
+    dut.cfg_fifo_size_mask.setimmediatevalue(2**len(dut.m_axi_awaddr)-1)
+    dut.cfg_enable.setimmediatevalue(1)
+
+    test_frames = []
+
+    for test_data in [payload_data(x) for x in payload_lengths()]:
+        if len(test_data) % byte_lanes:
+            test_data.extend(b'\x00'*(byte_lanes - (len(test_data) % byte_lanes)))
+
+        test_frames.append(test_data)
+        await tb.source.write(test_data)
+
+        if space:
+            for k in range(1000):
+                await RisingEdge(dut.clk)
+
+    for test_data in test_frames:
+        rx_data = bytearray()
+        while len(rx_data) < len(test_data):
+            d = await tb.sink.read(len(test_data) - len(rx_data))
+            rx_data.extend(d)
+
+        assert rx_data == test_data
+
+    assert tb.sink.empty()
+
+    await RisingEdge(dut.clk)
+    await RisingEdge(dut.clk)
+
+
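+# buffer data in the FIFO while the sink is paused, then release the sink and
+# verify that all data is delivered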
+async def run_test_init_sink_pause(dut):
+
+    tb = TB(dut)
+
+    byte_lanes = tb.source.byte_lanes
+
+    await tb.reset()
+
+    dut.cfg_fifo_base_addr.setimmediatevalue(0)
+    dut.cfg_fifo_size_mask.setimmediatevalue(2**len(dut.m_axi_awaddr)-1)
+    dut.cfg_enable.setimmediatevalue(1)
+
+    tb.sink.pause = True
+
+    test_data = bytearray(itertools.islice(itertools.cycle(range(256)), 1024*byte_lanes))
+
+    if len(test_data) % byte_lanes:
+        test_data.extend(b'\x00'*(byte_lanes - (len(test_data) % byte_lanes)))
+
+    await tb.source.write(test_data)
+
+    for k in range(256):
+        await RisingEdge(dut.clk)
+
+    tb.sink.pause = False
+
+    rx_data = bytearray()
+    while len(rx_data) < len(test_data):
+        d = await tb.sink.read(len(test_data) - len(rx_data))
+        rx_data.extend(d)
+
+    assert rx_data == test_data
+
+    assert tb.sink.empty()
+
+    await RisingEdge(dut.clk)
+    await RisingEdge(dut.clk)
+
+
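+# buffer data while the sink is paused, apply a reset, then verify that the
+# FIFO is flushed and still passes data afterwards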
+async def run_test_init_sink_pause_reset(dut, reset_type=TB.reset):
+
+    tb = TB(dut)
+
+    byte_lanes = tb.source.byte_lanes
+
+    await tb.reset()
+
+    dut.cfg_fifo_base_addr.setimmediatevalue(0)
+    dut.cfg_fifo_size_mask.setimmediatevalue(2**len(dut.m_axi_awaddr)-1)
+    dut.cfg_enable.setimmediatevalue(1)
+
+    tb.sink.pause = True
+
+    test_data = bytearray(itertools.islice(itertools.cycle(range(256)), 1024*byte_lanes))
+
+    if len(test_data) % byte_lanes:
+        test_data.extend(b'\x00'*(byte_lanes - (len(test_data) % byte_lanes)))
+
+    await tb.source.write(test_data)
+
+    for k in range(256):
+        await RisingEdge(dut.clk)
+
+    await reset_type(tb)
+    tb.sink.clear()
+
+    tb.sink.pause = False
+
+    for k in range(1024):
+        await RisingEdge(dut.clk)
+
+    assert tb.sink.empty()
+
+    await tb.source.write(test_data)
+
+    rx_data = bytearray()
+    while len(rx_data) < len(test_data):
+        d = await tb.sink.read(len(test_data) - len(rx_data))
+        rx_data.extend(d)
+
+    assert rx_data == test_data
+
+    assert tb.sink.empty()
+
+    await RisingEdge(dut.clk)
+    await RisingEdge(dut.clk)
+
+
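+# apply a reset while data is being shifted into the FIFO, then verify recovery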
+async def run_test_shift_in_reset(dut, reset_type=TB.reset):
+
+    tb = TB(dut)
+
+    byte_lanes = tb.source.byte_lanes
+
+    await tb.reset()
+
+    dut.cfg_fifo_base_addr.setimmediatevalue(0)
+    dut.cfg_fifo_size_mask.setimmediatevalue(2**len(dut.m_axi_awaddr)-1)
+    dut.cfg_enable.setimmediatevalue(1)
+
+    test_data = bytearray(itertools.islice(itertools.cycle(range(256)), 1024*byte_lanes))
+
+    if len(test_data) % byte_lanes:
+        test_data.extend(b'\x00'*(byte_lanes - (len(test_data) % byte_lanes)))
+
+    await tb.source.write(test_data)
+
+    for k in range(256):
+        await RisingEdge(dut.clk)
+
+    await reset_type(tb)
+    tb.sink.clear()
+
+    for k in range(2048):
+        await RisingEdge(dut.clk)
+
+    assert tb.sink.empty()
+
+    await tb.source.write(test_data)
+
+    rx_data = bytearray()
+    while len(rx_data) < len(test_data):
+        d = await tb.sink.read(len(test_data) - len(rx_data))
+        rx_data.extend(d)
+
+    assert rx_data == test_data
+
+    assert tb.sink.empty()
+
+    await RisingEdge(dut.clk)
+    await RisingEdge(dut.clk)
+
+
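+# apply a reset while data is being shifted out of the FIFO, then verify recovery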
+async def run_test_shift_out_reset(dut, reset_type=TB.reset):
+
+    tb = TB(dut)
+
+    byte_lanes = tb.source.byte_lanes
+
+    await tb.reset()
+
+    dut.cfg_fifo_base_addr.setimmediatevalue(0)
+    dut.cfg_fifo_size_mask.setimmediatevalue(2**len(dut.m_axi_awaddr)-1)
+    dut.cfg_enable.setimmediatevalue(1)
+
+    test_data = bytearray(itertools.islice(itertools.cycle(range(256)), 1024*byte_lanes))
+
+    if len(test_data) % byte_lanes:
+        test_data.extend(b'\x00'*(byte_lanes - (len(test_data) % byte_lanes)))
+
+    await tb.source.write(test_data)
+
+    while not dut.output_valid.value:
+        await RisingEdge(dut.clk)
+
+    for k in range(8):
+        await RisingEdge(dut.clk)
+
+    await reset_type(tb)
+    tb.sink.clear()
+
+    for k in range(2048):
+        await RisingEdge(dut.clk)
+
+    assert tb.sink.empty()
+
+    await tb.source.write(test_data)
+
+    rx_data = bytearray()
+    while len(rx_data) < len(test_data):
+        d = await tb.sink.read(len(test_data) - len(rx_data))
+        rx_data.extend(d)
+
+    assert rx_data == test_data
+
+    assert tb.sink.empty()
+
+    await RisingEdge(dut.clk)
+    await RisingEdge(dut.clk)
+
+
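+# write twice the configured FIFO size while the sink is paused to exercise the
+# full/backpressure path, then release the sink and verify all data is delivered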
+async def run_test_overflow(dut):
+
+    tb = TB(dut)
+
+    byte_lanes = tb.source.byte_lanes
+
+    await tb.reset()
+
+    dut.cfg_fifo_base_addr.setimmediatevalue(0)
+    dut.cfg_fifo_size_mask.setimmediatevalue(2**len(dut.m_axi_awaddr)-1)
+    dut.cfg_enable.setimmediatevalue(1)
+
+    tb.sink.pause = True
+
+    test_data = bytearray(itertools.islice(itertools.cycle(range(256)), 2*2**len(dut.m_axi_awaddr)))
+
+    if len(test_data) % byte_lanes:
+        test_data.extend(b'\x00'*(byte_lanes - (len(test_data) % byte_lanes)))
+
+    await tb.source.write(test_data)
+
+    for k in range(2*2**len(dut.m_axi_awaddr)//len(dut.m_axi_wstrb)):
+        await RisingEdge(dut.clk)
+
+    tb.sink.pause = False
+
+    rx_data = bytearray()
+    while len(rx_data) < len(test_data):
+        d = await tb.sink.read(len(test_data) - len(rx_data))
+        rx_data.extend(d)
+
+    assert rx_data == test_data
+
+    assert tb.sink.empty()
+
+    await RisingEdge(dut.clk)
+    await RisingEdge(dut.clk)
+
+
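+# toggle pattern for the idle/backpressure generators: pause three cycles out of
+# every four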
+def cycle_pause():
+    return itertools.cycle([1, 1, 1, 0])
+
+
+def size_list():
+    data_width = len(cocotb.top.input_data)
+    byte_width = data_width // 8
+    return list(range(byte_width, byte_width*64, byte_width))+[1]*64
+
+
+def incrementing_payload(length):
+    return bytearray(itertools.islice(itertools.cycle(range(256)), length))
+
+
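+# register the cocotb test factories only when running under a simulator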
+if cocotb.SIM_NAME:
+
+    factory = TestFactory(run_test)
+    factory.add_option("payload_lengths", [size_list])
+    factory.add_option("payload_data", [incrementing_payload])
+    factory.add_option(("space",
+            "stream_idle_inserter", "stream_backpressure_inserter",
+            "axi_idle_inserter", "axi_backpressure_inserter"), [
+        (False, None, None, None, None),
+        (False, cycle_pause, None, None, None),
+        (False, None, cycle_pause, None, None),
+        (False, None, None, cycle_pause, None),
+        (False, None, None, None, cycle_pause),
+        (True,  None, None, None, None),
+        (True,  cycle_pause, None, None, None),
+        (True,  None, cycle_pause, None, None),
+        (True,  None, None, cycle_pause, None),
+        (True,  None, None, None, cycle_pause),
+    ])
+    factory.generate_tests()
+
+    for test in [
+                run_test_init_sink_pause,
+                run_test_overflow
+            ]:
+
+        factory = TestFactory(test)
+        factory.generate_tests()
+
+    for test in [
+                run_test_init_sink_pause_reset,
+                run_test_shift_in_reset,
+                run_test_shift_out_reset,
+            ]:
+
+        factory = TestFactory(test)
+        factory.add_option("reset_type", [TB.reset, TB.reset_source,
+                TB.reset_sink, TB.reset_axi, TB.reset_cfg])
+        factory.generate_tests()
+
+
+# cocotb-test
+
+tests_dir = os.path.abspath(os.path.dirname(__file__))
+rtl_dir = os.path.abspath(os.path.join(tests_dir, '..', '..', 'rtl'))
+
+
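+# run the cocotb testbench through cocotb-test/pytest for several segment
+# width and count combinations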
+@pytest.mark.parametrize(("seg_width", "seg_cnt"), [
+            (32, 1),
+            (16, 2),
+        ])
+def test_axi_vfifo_raw(request, seg_width, seg_cnt):
+    dut = "axi_vfifo_raw"
+    module = os.path.splitext(os.path.basename(__file__))[0]
+    toplevel = dut
+
+    verilog_sources = [
+        os.path.join(rtl_dir, f"{dut}.v"),
+        os.path.join(rtl_dir, f"{dut}_wr.v"),
+        os.path.join(rtl_dir, f"{dut}_rd.v"),
+    ]
+
+    parameters = {}
+
+    parameters['SEG_WIDTH'] = seg_width
+    parameters['SEG_CNT'] = seg_cnt
+    parameters['AXI_DATA_WIDTH'] = seg_width*seg_cnt
+    parameters['AXI_ADDR_WIDTH'] = 16
+    parameters['AXI_STRB_WIDTH'] = parameters['AXI_DATA_WIDTH'] // 8
+    parameters['AXI_ID_WIDTH'] = 8
+    parameters['AXI_MAX_BURST_LEN'] = 16
+    parameters['LEN_WIDTH'] = parameters['AXI_ADDR_WIDTH']
+    parameters['WRITE_FIFO_DEPTH'] = 64
+    parameters['WRITE_MAX_BURST_LEN'] = parameters['WRITE_FIFO_DEPTH'] // 4
+    parameters['READ_FIFO_DEPTH'] = 128
+    parameters['READ_MAX_BURST_LEN'] = parameters['WRITE_MAX_BURST_LEN']
+    parameters['WATERMARK_LEVEL'] = parameters['WRITE_FIFO_DEPTH'] // 2
+    parameters['CTRL_OUT_EN'] = 0
+
+    extra_env = {f'PARAM_{k}': str(v) for k, v in parameters.items()}
+
+    sim_build = os.path.join(tests_dir, "sim_build",
+        request.node.name.replace('[', '-').replace(']', ''))
+
+    cocotb_test.simulator.run(
+        python_search=[tests_dir],
+        verilog_sources=verilog_sources,
+        toplevel=toplevel,
+        module=module,
+        parameters=parameters,
+        sim_build=sim_build,
+        extra_env=extra_env,
+    )