diff --git a/cosim/black-parrot-example/flist.vcs b/cosim/black-parrot-example/flist.vcs index 3bfb7167..36c9326f 100644 --- a/cosim/black-parrot-example/flist.vcs +++ b/cosim/black-parrot-example/flist.vcs @@ -1,6 +1,7 @@ +incdir+$CURR_VSRC_DIR +incdir+$BASEJUMP_STL_DIR/bsg_tag $BASEJUMP_STL_DIR/bsg_misc/bsg_mux2_gatestack.sv +$BASEJUMP_STL_DIR/bsg_misc/bsg_counter_dynamic_limit.sv $BASEJUMP_STL_DIR/bsg_tag/bsg_tag_pkg.sv $BASEJUMP_STL_DIR/bsg_tag/bsg_tag_master_decentralized.sv $BASEJUMP_STL_DIR/bsg_tag/bsg_tag_client.sv @@ -12,6 +13,7 @@ $BASEJUMP_STL_DIR/bsg_tag/bsg_tag_bitbang.sv $BLACKPARROT_SUB_DIR/axi/v/bp_me_axil_master.sv $BLACKPARROT_SUB_DIR/axi/v/bp_me_axil_client.sv $BLACKPARROT_SUB_DIR/axi/v/bp_axi_top.sv +$BLACKPARROT_SUB_DIR/axi/v/bp_axi_cdl.sv $BLACKPARROT_SUB_DIR/axi/v/bsg_axil_fifo_client.sv $BLACKPARROT_SUB_DIR/axi/v/bsg_axil_fifo_master.sv $BLACKPARROT_SUB_DIR/axi/v/bsg_axil_store_packer.sv @@ -21,6 +23,9 @@ $COSIM_VSRC_DIR/bsg_zynq_pl_shell.sv $CURR_VSRC_DIR/bsg_blackparrot_pkg.sv $CURR_VSRC_DIR/bsg_bootrom.v $CURR_VSRC_DIR/zynq_pkg.sv +$CURR_VSRC_DIR/bp_profiler_pkg.sv +$CURR_VSRC_DIR/bp_nonsynth_core_profiler.sv +$CURR_VSRC_DIR/bp_commit_profiler.sv $CURR_VSRC_DIR/top_zynq.sv $CURR_VSRC_DIR/top.v diff --git a/cosim/black-parrot-example/ps.cpp b/cosim/black-parrot-example/ps.cpp index b3d94713..858d222a 100644 --- a/cosim/black-parrot-example/ps.cpp +++ b/cosim/black-parrot-example/ps.cpp @@ -12,6 +12,9 @@ #include #include #include +#include +#include +#include #include "ps.hpp" @@ -28,7 +31,7 @@ #endif #ifndef DRAM_ALLOCATE_SIZE_MB -#define DRAM_ALLOCATE_SIZE_MB 128 +#define DRAM_ALLOCATE_SIZE_MB 241 #endif #define DRAM_ALLOCATE_SIZE (DRAM_ALLOCATE_SIZE_MB * 1024 * 1024) @@ -40,9 +43,39 @@ #define BP_NCPUS 1 #endif +#ifndef SAMPLE_GATE_EN +#error "SAMPLE_GATE_EN not defined!" +#endif + +#ifndef SAMPLE_INTRVL +#error "SAMPLE_INTRVL not defined!" +#endif + +#ifndef DRAM_GATE_EN +#error "DRAM_GATE_EN not defined!" +#endif + +#ifndef DRAM_LATENCY +#error "DRAM_LATENCY not defined!" +#endif + // Helper functions void nbf_load(bsg_zynq_pl *zpl, char *filename); bool decode_bp_output(bsg_zynq_pl *zpl, long data); +void report(bsg_zynq_pl *zpl, char *); + +const char* metrics[] = { + "cycle", "mcycle", "minstret", + "ic_miss", + "br_ovr", "ret_ovr", "jal_ovr", "fe_cmd", "fe_cmd_fence", + "mispredict", "control_haz", "long_haz", "data_haz", + "catchup_dep", "aux_dep", "load_dep", "mul_dep", "fma_dep", "sb_iraw_dep", + "sb_fraw_dep", "sb_iwaw_dep", "sb_fwaw_dep", + "struct_haz", "idiv_haz", "fdiv_haz", + "ptw_busy", "special", "exception", "_interrupt", + "itlb_miss", "dtlb_miss", + "dc_miss", "dc_fail", "unknown", +}; // Globals std::queue getchar_queue; @@ -75,17 +108,43 @@ inline uint64_t get_counter_64(bsg_zynq_pl *zpl, uint64_t addr) { } while (1); } +void device_poll(bsg_zynq_pl *zpl) { + uint32_t pc; + uint8_t stall; + int instret; + while (1) { + // keep reading as long as there is data + if (zpl->shell_read(GP0_RD_PL2PS_FIFO_0_CTRS) != 0) { + decode_bp_output(zpl, zpl->shell_read(GP0_RD_PL2PS_FIFO_0_DATA)); + } + // break loop when all cores done + if (done_vec.all()) { + break; + } + + // drain sample data from FIFOs + int cnt = zpl->shell_read(GP0_RD_PL2PS_FIFO_1_CTRS); + if(cnt != 0) { + for(int i = 0; i < cnt; i++) { +/* + uint32x2_t data = zpl->axil_2read(GP0_RD_PL2PS_FIFO_1_DATA); + pc = data[0]; + stall = ((data[1] & 0x1) << 7) | (data[1] >> 1); +*/ + pc = zpl->shell_read(GP0_RD_PL2PS_FIFO_1_DATA); + uint32_t data = zpl->shell_read(GP0_RD_PL2PS_FIFO_2_DATA); + instret = (data & 0x1); + stall = (data >> 1); + } + } + zpl->poll_tick(); + } +} + int ps_main(int argc, char **argv) { bsg_zynq_pl *zpl = new bsg_zynq_pl(argc, argv); - long data; - long val1 = 0x1; - long val2 = 0x0; - long mask1 = 0xf; - long mask2 = 0xf; - - pthread_t thread_id; long allocated_dram = DRAM_ALLOCATE_SIZE; int32_t val; @@ -98,9 +157,9 @@ int ps_main(int argc, char **argv) { bsg_pr_info("ps.cpp: attempting to write and read register 0x8\n"); - zpl->shell_write(GP0_WR_CSR_DRAM_BASE, 0xDEADBEEF, mask1); + zpl->shell_write(GP0_WR_CSR_DRAM_BASE, 0xDEADBEEF, 0xf); assert((zpl->shell_read(GP0_RD_CSR_DRAM_BASE) == (0xDEADBEEF))); - zpl->shell_write(GP0_WR_CSR_DRAM_BASE, val, mask1); + zpl->shell_write(GP0_WR_CSR_DRAM_BASE, val, 0xf); assert((zpl->shell_read(GP0_RD_CSR_DRAM_BASE) == (val))); bsg_pr_info("ps.cpp: successfully wrote and read registers in bsg_zynq_shell " @@ -108,6 +167,7 @@ int ps_main(int argc, char **argv) { bsg_tag_bitbang *btb = new bsg_tag_bitbang(zpl, GP0_WR_CSR_TAG_BITBANG, TAG_NUM_CLIENTS, TAG_MAX_LEN); bsg_tag_client *pl_reset_client = new bsg_tag_client(TAG_CLIENT_PL_RESET_ID, TAG_CLIENT_PL_RESET_WIDTH); + bsg_tag_client *pl_cnten_client = new bsg_tag_client(TAG_CLIENT_PL_CNTEN_ID, TAG_CLIENT_PL_CNTEN_WIDTH); bsg_tag_client *wd_reset_client = new bsg_tag_client(TAG_CLIENT_WD_RESET_ID, TAG_CLIENT_WD_RESET_WIDTH); // Reset the bsg tag master @@ -118,7 +178,9 @@ int ps_main(int argc, char **argv) { btb->reset_client(wd_reset_client); // Set bsg client0 to 1 (assert BP reset) btb->set_client(pl_reset_client, 0x1); - // Set bsg client1 to 1 (assert WD reset) + // Set bsg client1 to 1 (assert BP counter en) + btb->set_client(pl_cnten_client, 0x1); + // Set bsg client2 to 1 (assert WD reset) btb->set_client(wd_reset_client, 0x1); // Set bsg client0 to 0 (deassert BP reset) btb->set_client(pl_reset_client, 0x0); @@ -130,24 +192,29 @@ int ps_main(int argc, char **argv) { unsigned long phys_ptr; volatile int32_t *buf; - data = zpl->shell_read(GP0_RD_CSR_DRAM_INITED); - if (data == 0) { + if (zpl->shell_read(GP0_RD_CSR_DRAM_INITED) == 0) { bsg_pr_info( "ps.cpp: CSRs do not contain a DRAM base pointer; calling allocate " "dram with size %ld\n", allocated_dram); buf = (volatile int32_t *)zpl->allocate_dram(allocated_dram, &phys_ptr); bsg_pr_info("ps.cpp: received %p (phys = %lx)\n", buf, phys_ptr); - zpl->shell_write(GP0_WR_CSR_DRAM_BASE, phys_ptr, mask1); + zpl->shell_write(GP0_WR_CSR_DRAM_BASE, phys_ptr, 0xf); assert((zpl->shell_read(GP0_RD_CSR_DRAM_BASE) == (phys_ptr))); bsg_pr_info("ps.cpp: wrote and verified base register\n"); - zpl->shell_write(GP0_WR_CSR_DRAM_INITED, 0x1, mask2); + zpl->shell_write(GP0_WR_CSR_DRAM_INITED, 0x1, 0xf); assert(zpl->shell_read(GP0_RD_CSR_DRAM_INITED) == 1); } else bsg_pr_info("ps.cpp: reusing dram base pointer %x\n", zpl->shell_read(GP0_RD_CSR_DRAM_BASE)); +#ifdef ZYNQ int outer = 1024 / 4; + long num_times = allocated_dram / 32768; +#else + int outer = 64 / 4; + long num_times = 64; +#endif if (argc == 1) { bsg_pr_warn( @@ -175,24 +242,23 @@ int ps_main(int argc, char **argv) { int y = zpl->shell_read(GP1_CSR_BASE_ADDR + 0x304000); bsg_pr_info("ps.cpp: writing mtimecmp\n"); - zpl->shell_write(GP1_CSR_BASE_ADDR + 0x304000, y + 1, mask1); + zpl->shell_write(GP1_CSR_BASE_ADDR + 0x304000, y + 1, 0xf); bsg_pr_info("ps.cpp: reading mtimecmp\n"); assert(zpl->shell_read(GP1_CSR_BASE_ADDR + 0x304000) == y + 1); #ifdef DRAM_TEST - long num_times = allocated_dram / 32768; bsg_pr_info( "ps.cpp: attempting to write L2 %ld times over %ld MB (testing ARM GP1 " "and HP0 connections)\n", num_times * outer, (allocated_dram) >> 20); - zpl->shell_write(GP1_DRAM_BASE_ADDR, 0x12345678, mask1); + zpl->shell_write(GP1_DRAM_BASE_ADDR, 0x12345678, 0xf); for (int s = 0; s < outer; s++) for (int t = 0; t < num_times; t++) { zpl->shell_write(GP1_DRAM_BASE_ADDR + 32768 * t + s * 4, 0x1ADACACA + t + s, - mask1); + 0xf); } bsg_pr_info("ps.cpp: finished write L2 %ld times over %ld MB\n", num_times * outer, (allocated_dram) >> 20); @@ -214,6 +280,8 @@ int ps_main(int argc, char **argv) { ((float)matches) / (float)(mismatches + matches)); #endif + mismatches = 0; + matches = 0; bsg_pr_info( "ps.cpp: attempting to read L2 %ld times over %ld MB (testing ARM GP1 " "and HP0 connections)\n", @@ -239,73 +307,84 @@ int ps_main(int argc, char **argv) { bsg_pr_info("ps.cpp: Zero-ing DRAM (%d bytes)\n", DRAM_ALLOCATE_SIZE); for (int i = 0; i < DRAM_ALLOCATE_SIZE; i+=4) { if (i % (1024*1024) == 0) bsg_pr_info("ps.cpp: zero-d %d MB\n", i/(1024*1024)); - zpl->shell_write(gp1_addr_base + i, 0x0, mask1); + zpl->shell_write(gp1_addr_base + i, 0x0, 0xf); } } #endif + bsg_pr_info("ps.cpp: clearing pl to ps fifo\n"); + while(zpl->shell_read(GP0_RD_PL2PS_FIFO_0_CTRS) != 0) { + zpl->shell_read(GP0_RD_PL2PS_FIFO_0_DATA); + } + bsg_pr_info("ps.cpp: beginning nbf load\n"); nbf_load(zpl, argv[1]); struct timespec start, end; clock_gettime(CLOCK_MONOTONIC, &start); - unsigned long long minstret_start = get_counter_64(zpl, GP0_RD_MINSTRET); unsigned long long mtime_start = get_counter_64(zpl, GP1_CSR_BASE_ADDR + 0x30bff8); bsg_pr_dbg_ps("ps.cpp: finished nbf load\n"); - // Set bsg client1 to 0 (deassert WD reset) + // Set bsg client2 to 0 (deassert WD reset) btb->set_client(wd_reset_client, 0x0); bsg_pr_info("ps.cpp: starting watchdog\n"); // We need some additional toggles for data to propagate through btb->idle(50); + bsg_pr_info("ps.cpp: Setting DRAM latency\n"); + zpl->shell_write(GP0_WR_CSR_DRAM_LATENCY, DRAM_LATENCY, 0xf); + + bsg_pr_info("ps.cpp: Setting sampling interval\n"); + zpl->shell_write(GP0_WR_CSR_SAMPLE_INTRVL, (SAMPLE_INTRVL - 1), 0xf); + + bsg_pr_info("ps.cpp: Asserting sampling clock gate enable\n"); + zpl->shell_write(GP0_WR_CSR_SAMPLE_GATE_EN, SAMPLE_GATE_EN, 0xf); + + bsg_pr_info("ps.cpp: Asserting DRAM clock gate enable\n"); + zpl->shell_write(GP0_WR_CSR_DRAM_GATE_EN, DRAM_GATE_EN, 0xf); + + bsg_pr_info("ps.cpp: Unfreezing BlackParrot\n"); + zpl->shell_write(GP1_CSR_BASE_ADDR + 0x200008, 0x0, 0xf); + bsg_pr_info("ps.cpp: Starting scan thread\n"); - pthread_create(&thread_id, NULL, monitor, NULL); + pthread_t monitor_id; + pthread_create(&monitor_id, NULL, monitor, NULL); bsg_pr_info("ps.cpp: Starting i/o polling thread\n"); - while (1) { - // keep reading as long as there is data - if (zpl->shell_read(GP0_RD_PL2PS_FIFO_CTRS) != 0) { - decode_bp_output(zpl, zpl->shell_read(GP0_RD_PL2PS_FIFO_DATA)); - } - // break loop when all cores done - if (done_vec.all()) { - break; - } - zpl->poll_tick(); - } + device_poll(zpl); - // Set bsg client1 to 1 (assert WD reset) + bsg_pr_info("ps.cpp: Deasserting sampling clock gate enable\n"); + zpl->shell_write(GP0_WR_CSR_SAMPLE_GATE_EN, 0x0, 0xf); + + bsg_pr_info("ps.cpp: Deasserting DRAM clock gate enable\n"); + zpl->shell_write(GP0_WR_CSR_DRAM_GATE_EN, 0x0, 0xf); + + // Set bsg client1 to 0 (deassert BP counter en) + btb->set_client(pl_cnten_client, 0x0); + // Set bsg client2 to 1 (assert WD reset) btb->set_client(wd_reset_client, 0x1); bsg_pr_info("ps.cpp: stopping watchdog\n"); // We need some additional toggles for data to propagate through btb->idle(50); + unsigned long long mcycle_stop = get_counter_64(zpl, GP0_RD_MCYCLE); + unsigned long long minstret_stop = get_counter_64(zpl, GP0_RD_MINSTRET); unsigned long long mtime_stop = get_counter_64(zpl, GP1_CSR_BASE_ADDR + 0x30bff8); + unsigned long long mtime_delta = mtime_stop - mtime_start; - unsigned long long minstret_stop = get_counter_64(zpl, GP0_RD_MINSTRET); - // test delay for reading counter - unsigned long long counter_data = get_counter_64(zpl, GP0_RD_MINSTRET); clock_gettime(CLOCK_MONOTONIC, &end); setlocale(LC_NUMERIC, ""); - bsg_pr_info("ps.cpp: end polling i/o\n"); - bsg_pr_info("ps.cpp: minstret (instructions retired): %'16llu (%16llx)\n", - minstret_start, minstret_start); + bsg_pr_info("ps.cpp: mcycle (instructions retired): %'16llu (%16llx)\n", + mcycle_stop, mcycle_stop); bsg_pr_info("ps.cpp: minstret (instructions retired): %'16llu (%16llx)\n", minstret_stop, minstret_stop); - unsigned long long minstret_delta = minstret_stop - minstret_start; - bsg_pr_info("ps.cpp: minstret delta: %'16llu (%16llx)\n", - minstret_delta, minstret_delta); bsg_pr_info("ps.cpp: MTIME start: %'16llu (%16llx)\n", mtime_start, mtime_start); bsg_pr_info("ps.cpp: MTIME stop: %'16llu (%16llx)\n", mtime_stop, mtime_stop); - unsigned long long mtime_delta = mtime_stop - mtime_start; bsg_pr_info("ps.cpp: MTIME delta (=1/8 BP cycles): %'16llu (%16llx)\n", mtime_delta, mtime_delta); bsg_pr_info("ps.cpp: IPC : %'16f\n", - ((double)minstret_delta) / ((double)(mtime_delta)) / 8.0); - bsg_pr_info("ps.cpp: minstret (instructions retired): %'16llu (%16llx)\n", - counter_data, counter_data); + ((double)minstret_stop) / ((double)(mcycle_stop))); unsigned long long diff_ns = 1000LL * 1000LL * 1000LL * ((unsigned long long)(end.tv_sec - start.tv_sec)) + @@ -313,7 +392,7 @@ int ps_main(int argc, char **argv) { bsg_pr_info("ps.cpp: wall clock time : %'16llu (%16llx) ns\n", diff_ns, diff_ns); bsg_pr_info( - "ps.cpp: sim/emul speed : %'16.2f BP cycles per minute\n", + "ps.cpp: sim/emul speed : %'16.2f BP cycles per minute\n", mtime_delta * 8 / ((double)(diff_ns) / (60.0 * 1000.0 * 1000.0 * 1000.0))); @@ -323,6 +402,10 @@ int ps_main(int argc, char **argv) { zpl->shell_read(GP0_RD_MEM_PROF_2), zpl->shell_read(GP0_RD_MEM_PROF_1), zpl->shell_read(GP0_RD_MEM_PROF_0)); + + report(zpl, argv[1]); + btb->idle(50000000); + // in general we do not want to free the dram; the Xilinx allocator has a // tendency to // fail after many allocate/fail cycle. instead we keep a pointer to the dram @@ -333,7 +416,7 @@ int ps_main(int argc, char **argv) { if (FREE_DRAM) { bsg_pr_info("ps.cpp: freeing DRAM buffer\n"); zpl->free_dram((void *)buf); - zpl->shell_write(GP0_WR_CSR_DRAM_INITED, 0x0, mask2); + zpl->shell_write(GP0_WR_CSR_DRAM_INITED, 0x0, 0xf); } zpl->done(); @@ -396,16 +479,16 @@ void nbf_load(bsg_zynq_pl *zpl, char *nbf_filename) { } else if (nbf[0] == 0x1) { int offset = nbf[1] % 4; - int shift = 2 * offset; + int shift = 8 * offset; data = zpl->shell_read(base_addr + nbf[1] - offset); - data = data & rotl((uint32_t)0xffff0000,shift) + nbf[2] & ((uint32_t)0x0000ffff << shift); + data = data & rotl((uint32_t)0xffff0000,shift) + ((nbf[2] & ((uint32_t)0x0000ffff)) << shift); zpl->shell_write(base_addr + nbf[1] - offset, data, 0xf); } else { int offset = nbf[1] % 4; - int shift = 2 * offset; + int shift = 8 * offset; data = zpl->shell_read(base_addr + nbf[1] - offset); - data = data & rotl((uint32_t)0xffffff00,shift) + nbf[2] & ((uint32_t)0x000000ff << shift); + data = data & rotl((uint32_t)0xffffff00,shift) + ((nbf[2] & ((uint32_t)0x000000ff)) << shift); zpl->shell_write(base_addr + nbf[1] - offset, data, 0xf); } } @@ -434,6 +517,8 @@ bool decode_bp_output(bsg_zynq_pl *zpl, long data) { if (address == 0x101000) { printf("%c", print_data); fflush(stdout); + } else if (address == 0x101004) { + return false; } else if (address >= 0x102000 && address < 0x103000) { done_vec[core] = true; if (print_data == 0) { @@ -479,6 +564,7 @@ bool decode_bp_output(bsg_zynq_pl *zpl, long data) { // if not implemented, print error } else { bsg_pr_err("ps.cpp: Errant read from (%lx)\n", address); + sleep(60); return false; } } @@ -486,3 +572,24 @@ bool decode_bp_output(bsg_zynq_pl *zpl, long data) { return true; } +void report(bsg_zynq_pl *zpl, char* nbf_filename) { + + char filename[100]; + if(strrchr(nbf_filename, '/') != NULL) + strcpy(filename, 1 + strrchr(nbf_filename, '/')); + else + strcpy(filename, nbf_filename); + *strrchr(filename, '.') = '\0'; + strcat(filename, ".rep"); + ofstream file(filename); + + if(file.is_open()) { + file << nbf_filename << endl; + for(int i=0; i> 1) - 1) + ,.clk_r_o(ds_clk) + ); + + bsg_icg_pos + clk_buf + (.clk_i(ds_clk) + ,.en_i(~gate_lo) + ,.clk_o(bp_clk) + ); +`endif // Address Translation (MBT): // @@ -435,13 +572,10 @@ module top_zynq // Zynq PA 0x8000_0000 .. 0x8FFF_FFFF -> AXI 0x0000_0000 .. 0x0FFF_FFFF -> BP 0x8000_0000 - 0x8FFF_FFFF // Zynq PA 0xA000_0000 .. 0xAFFF_FFFF -> AXI 0x2000_0000 .. 0x2FFF_FFFF -> BP 0x0000_0000 - 0x0FFF_FFFF - - wire [bp_axil_addr_width_lp-1:0] s01_awaddr_translated_lo = {~s01_axi_awaddr[29], 3'b0, s01_axi_awaddr[0+:28]}; - - // Zynq PA 0x8000_0000 .. 0x8FFF_FFFF -> AXI 0x0000_0000 .. 0x0FFF_FFFF -> BP 0x8000_0000 - 0x8FFF_FFFF - // Zynq PA 0xA000_0000 .. 0xAFFF_FFFF -> AXI 0x2000_0000 .. 0x2FFF_FFFF -> BP 0x0000_0000 - 0x0FFF_FFFF - - wire [bp_axil_addr_width_lp-1:0] s01_araddr_translated_lo = {~s01_axi_araddr[29], 3'b0, s01_axi_araddr[0+:28]}; + logic [bp_axil_addr_width_lp-1:0] s01_awaddr_translated_lo, s01_araddr_translated_lo; + assign s01_awaddr_translated_lo = (s01_axi_awaddr < 32'h20000000) ? (s01_axi_awaddr + 32'h80000000) : {4'b0, s01_axi_awaddr[0+:28]}; + assign s01_araddr_translated_lo = (s01_axi_araddr < 32'h20000000) ? (s01_axi_araddr + 32'h80000000) : {4'b0, s01_axi_araddr[0+:28]}; + logic [C_S01_AXI_ADDR_WIDTH-1 : 0] spack_axi_awaddr; logic [2 : 0] spack_axi_awprot; @@ -496,9 +630,9 @@ module top_zynq ,.s_axil_rvalid_o (spack_axi_rvalid) ,.s_axil_rready_i (spack_axi_rready) - ,.data_o (pl_to_ps_fifo_data_li) - ,.v_o (pl_to_ps_fifo_v_li) - ,.ready_i(pl_to_ps_fifo_ready_lo) + ,.data_o (pl_to_ps_fifo_data_li[0]) + ,.v_o (pl_to_ps_fifo_v_li[0]) + ,.ready_i(pl_to_ps_fifo_ready_lo[0]) ,.data_i(ps_to_pl_fifo_data_lo) ,.v_i(ps_to_pl_fifo_v_lo) @@ -576,6 +710,7 @@ module top_zynq ,.m01_axil_rready(m01_axi_rready) ); + // TODO: Bug in zero-extension of Xcelium 21.09 wire [bp_axil_addr_width_lp-1:0] s02_awaddr_translated_lo = s02_axi_awaddr; wire [bp_axil_addr_width_lp-1:0] s02_araddr_translated_lo = s02_axi_araddr; @@ -648,6 +783,7 @@ module top_zynq ,.m00_axil_rready (bp_s_axil_rready) ); + logic [bp_axi_addr_width_lp-1:0] axi_awaddr; logic [bp_axi_addr_width_lp-1:0] axi_araddr; @@ -666,21 +802,9 @@ module top_zynq assign m00_axi_awaddr = (axi_awaddr ^ 32'h8000_0000) + dram_base_li; assign m00_axi_araddr = (axi_araddr ^ 32'h8000_0000) + dram_base_li; - // synopsys translate_off - - always @(negedge aclk) - if (m00_axi_awvalid & m00_axi_awready) - if (debug_lp) $display("top_zynq: (BP DRAM) AXI Write Addr %x -> %x (AXI HP0)",axi_awaddr,m00_axi_awaddr); - - always @(negedge aclk) - if (m00_axi_arvalid & m00_axi_arready) - if (debug_lp) $display("top_zynq: (BP DRAM) AXI Write Addr %x -> %x (AXI HP0)",axi_araddr,m00_axi_araddr); - - // synopsys translate_on - bsg_dff_reset #(.width_p(128)) dff (.clk_i(aclk) - ,.reset_i(bp_reset_li) + ,.reset_i(~aresetn) ,.data_i(mem_profiler_r | m00_axi_awvalid << (axi_awaddr[29-:7]) | m00_axi_arvalid << (axi_araddr[29-:7]) @@ -699,13 +823,19 @@ module top_zynq ,.axi_id_width_p(6) ,.axi_size_width_p(3) ,.axi_len_width_p(4) - ,.axi_core_clk_async_p(0) + ,.axi_core_clk_async_p(axi_core_clk_async_lp) + ,.async_fifo_size_p(async_fifo_size_lp) ) blackparrot (.axi_clk_i(aclk) - ,.core_clk_i(aclk) + ,.core_clk_i(bp_clk) + ,.ds_clk_i(ds_clk) ,.rt_clk_i(rt_clk) - ,.async_reset_i(bp_reset_li) + ,.async_reset_i(bp_async_reset_li) + + ,.cdl_en_i(dram_gate_en_li) + ,.cdl_lat_i(dram_latency_li) + ,.cdl_gate_o(cdl_gate_lo) // these are reads/write from BlackParrot ,.m_axil_awaddr_o (bp_m_axil_awaddr) @@ -803,6 +933,184 @@ module top_zynq ,.m_axi_rresp_i (m00_axi_rresp) ); + // Performance Profiler + logic bp_reset_li; + bsg_sync_sync + #(.width_p(1)) + reset_bss + (.oclk_i(bp_clk) + ,.iclk_data_i(bp_async_reset_li) + ,.oclk_data_o(bp_reset_li) + ); + + logic prof_en_li; + bsg_sync_sync + #(.width_p(1)) + en_bss + (.oclk_i(bp_clk) + ,.iclk_data_i(prof_async_en_li) + ,.oclk_data_o(prof_en_li) + ); + + logic [31:0] sample_cnt_lo; + bsg_counter_dynamic_limit + #(.width_p(32)) + i_sample_counter + (.clk_i(bp_clk) + ,.reset_i(bp_reset_li) + ,.limit_i(sample_intrvl_li) + ,.counter_o(sample_cnt_lo) + ); + + bp_commit_profiler + #(.bp_params_p(bp_params_p) + ,.els_p(prof_els_lp) + ,.width_p(64) + ) + i_profiler + (.clk_i(bp_clk) + ,.reset_i(bp_reset_li) + ,.en_i(prof_en_li) + ,.freeze_i(`COREPATH.be.calculator.pipe_sys.csr.cfg_bus_cast_i.freeze) + + ,.fe_queue_ready_i(`COREPATH.fe.fe_queue_ready_and_i) + ,.fe_queue_empty_i(`COREPATH.be.scheduler.issue_queue.empty) + + ,.icache_yumi_i(`COREPATH.fe.icache_yumi_lo) + ,.icache_miss_i(~`COREPATH.fe.icache.is_ready + | (`COREPATH.fe.icache.v_tv + & ~`COREPATH.fe.icache.decode_tv_r.inval_op + & ~`COREPATH.fe.icache.hit_v_tv)) + ,.icache_tl_we_i(`COREPATH.fe.icache.tl_we) + ,.icache_tv_we_i(`COREPATH.fe.icache.tv_we) + + ,.br_ovr_i(`COREPATH.fe.pc_gen.ovr_btaken) + ,.ret_ovr_i(`COREPATH.fe.pc_gen.ovr_ret) + ,.jal_ovr_i(`COREPATH.fe.pc_gen.ovr_jmp) + + ,.fe_cmd_yumi_i(`COREPATH.fe.fe_cmd_yumi_o) + ,.fe_cmd_i(`COREPATH.fe.fe_cmd_cast_i) + ,.issue_v_i(`COREPATH.be.director.issue_pkt_cast_i.v) + ,.suppress_iss_i(`COREPATH.be.director.suppress_iss_o) + ,.clear_iss_i(`COREPATH.be.director.clear_iss_o) + ,.mispredict_i(`COREPATH.be.director.npc_mismatch_v) + ,.dispatch_v_i(~`COREPATH.be.scheduler.hazard_v_i) + ,.isd_expected_npc_i(`COREPATH.be.director.expected_npc_o) + + ,.data_haz_i(`COREPATH.be.detector.data_haz_v) + ,.catchup_dep_i(`COREPATH.be.detector.dep_status_r[0].fint_iwb_v + & `COREPATH.be.detector.data_haz_v + ) + ,.aux_dep_i((`COREPATH.be.detector.dep_status_r[0].aux_iwb_v + | `COREPATH.be.detector.dep_status_r[0].aux_fwb_v + ) & `COREPATH.be.detector.data_haz_v + ) + ,.load_dep_i((`COREPATH.be.detector.dep_status_r[0].emem_iwb_v + | `COREPATH.be.detector.dep_status_r[0].emem_fwb_v + | `COREPATH.be.detector.dep_status_r[0].fmem_iwb_v + | `COREPATH.be.detector.dep_status_r[1].fmem_iwb_v + | `COREPATH.be.detector.dep_status_r[0].fmem_fwb_v + | `COREPATH.be.detector.dep_status_r[1].fmem_fwb_v + ) & `COREPATH.be.detector.data_haz_v + ) + ,.mul_dep_i((`COREPATH.be.detector.dep_status_r[0].mul_iwb_v + | `COREPATH.be.detector.dep_status_r[1].mul_iwb_v + ) & `COREPATH.be.detector.data_haz_v + ) + ,.fma_dep_i((`COREPATH.be.detector.dep_status_r[0].fma_fwb_v + | `COREPATH.be.detector.dep_status_r[1].fma_fwb_v + | `COREPATH.be.detector.dep_status_r[2].fma_fwb_v + ) & `COREPATH.be.detector.data_haz_v + ) + + ,.sb_iraw_dep_i((`COREPATH.be.detector.irs1_sb_raw_haz_v + | `COREPATH.be.detector.irs2_sb_raw_haz_v + ) & `COREPATH.be.detector.data_haz_v + ) + ,.sb_fraw_dep_i((`COREPATH.be.detector.frs1_sb_raw_haz_v + | `COREPATH.be.detector.frs2_sb_raw_haz_v + | `COREPATH.be.detector.frs3_sb_raw_haz_v + ) & `COREPATH.be.detector.data_haz_v + ) + ,.sb_iwaw_dep_i(`COREPATH.be.detector.ird_sb_waw_haz_v & `COREPATH.be.detector.data_haz_v) + ,.sb_fwaw_dep_i(`COREPATH.be.detector.frd_sb_waw_haz_v & `COREPATH.be.detector.data_haz_v) + + ,.sb_int_v_i(`COREPATH.be.detector.score_int_v_li) + ,.sb_int_clr_i(`COREPATH.be.detector.clear_int_v_li) + ,.sb_fp_v_i(`COREPATH.be.detector.score_fp_v_li) + ,.sb_fp_clr_i(`COREPATH.be.detector.clear_fp_v_li) + ,.sb_irs_match_i(`COREPATH.be.detector.irs_match_lo) + ,.sb_frs_match_i(`COREPATH.be.detector.frs_match_lo) + ,.rs1_match_vector_i(`COREPATH.be.detector.rs1_match_vector) + ,.rs2_match_vector_i(`COREPATH.be.detector.rs2_match_vector) + ,.rs3_match_vector_i(`COREPATH.be.detector.rs3_match_vector) + + ,.control_haz_i(`COREPATH.be.detector.control_haz_v) + ,.long_haz_i(1'b0) + + ,.struct_haz_i(`COREPATH.be.detector.struct_haz_v) + ,.mem_haz_i(`COREPATH.be.detector.mem_busy_i + & (`COREPATH.be.detector.issue_pkt_cast_i.decode.pipe_mem_early_v + | `COREPATH.be.detector.issue_pkt_cast_i.decode.pipe_mem_final_v)) + ,.idiv_haz_i(`COREPATH.be.detector.idiv_busy_i & `COREPATH.be.detector.issue_pkt_cast_i.decode.pipe_long_v) + ,.fdiv_haz_i(`COREPATH.be.detector.fdiv_busy_i & `COREPATH.be.detector.issue_pkt_cast_i.decode.pipe_long_v) + ,.ptw_busy_i(1'b0) + + ,.dispatch_pkt_i(`COREPATH.be.detector.dispatch_pkt_i) + ,.retire_pkt_i(`COREPATH.be.calculator.pipe_sys.retire_pkt) + ,.commit_pkt_i(`COREPATH.be.calculator.pipe_sys.commit_pkt_cast_o) + ,.iwb_pkt_i(`COREPATH.be.calculator.pipe_sys.iwb_pkt_cast_i) + ,.fwb_pkt_i(`COREPATH.be.calculator.pipe_sys.fwb_pkt_cast_i) + + ,.data_o(prof_data_lo[prof_els_lp-1:0]) + ,.v_o(prof_v_lo) + ,.instret_o(prof_instret_lo) + ,.stall_o(prof_stall_lo) + ,.pc_o(prof_pc_lo) + ); + + bsg_async_fifo + #(.width_p(1+$bits(bp_stall_reason_e)+vaddr_width_p) + ,.lg_size_p(async_fifo_size_lp) + ) + i_afifo_prof + (.w_clk_i(bp_clk) + ,.w_reset_i(bp_reset_li) + + ,.w_enq_i(prof_v_lo & (sample_cnt_lo == '0) & ~prof_afifo_full_lo) + ,.w_data_i({prof_instret_lo, prof_stall_lo, prof_pc_lo}) + ,.w_full_o(prof_afifo_full_lo) + + ,.r_clk_i(aclk) + ,.r_reset_i(~aresetn) + + ,.r_valid_o(prof_afifo_v_lo) + ,.r_data_o({prof_afifo_instret_lo, prof_afifo_stall_lo, prof_afifo_pc_lo}) + ,.r_deq_i(prof_afifo_v_lo & prof_fifo_ready_lo) + ); + + bsg_fifo_1r1w_small + #(.width_p(1+$bits(bp_stall_reason_e)+vaddr_width_p) + ,.els_p(skid_buffer_els_lp) + ) + i_fifo_prof + (.clk_i(aclk) + ,.reset_i(~aresetn) + + ,.v_i(prof_afifo_v_lo) + ,.data_i({prof_afifo_instret_lo, prof_afifo_stall_lo, prof_afifo_pc_lo}) + ,.ready_param_o(prof_fifo_ready_lo) + + ,.v_o(prof_fifo_v_lo) + ,.data_o({prof_fifo_instret_lo, prof_fifo_stall_lo, prof_fifo_pc_lo}) + ,.yumi_i(pl_to_ps_fifo_v_li[1]) + ); + + assign pl_to_ps_fifo_v_li[1] = prof_fifo_v_lo & pl_to_ps_fifo_ready_lo[1] & pl_to_ps_fifo_ready_lo[2]; + assign pl_to_ps_fifo_v_li[2] = pl_to_ps_fifo_v_li[1]; + assign pl_to_ps_fifo_data_li[1] = prof_fifo_pc_lo[0+:32]; + assign pl_to_ps_fifo_data_li[2] = {prof_fifo_stall_lo, prof_fifo_instret_lo}; + // synopsys translate_off always @(negedge aclk) if (aresetn !== '0 & bb_v_li & ~bb_ready_and_lo == 1'b1) @@ -815,8 +1123,14 @@ module top_zynq always @(negedge aclk) if (s01_axi_arvalid & s01_axi_arready) if (debug_lp) $display("top_zynq: AXI Read Addr %x -> %x (BP)",s01_axi_araddr,s01_araddr_translated_lo); - // synopsys translate_on + always @(negedge aclk) + if (m00_axi_awvalid & m00_axi_awready) + if (debug_lp) $display("top_zynq: (BP DRAM) AXI Write Addr %x -> %x (AXI HP0)",axi_awaddr,m00_axi_awaddr); -endmodule + always @(negedge aclk) + if (m00_axi_arvalid & m00_axi_arready) + if (debug_lp) $display("top_zynq: (BP DRAM) AXI Write Addr %x -> %x (AXI HP0)",axi_araddr,m00_axi_araddr); + // synopsys translate_on +endmodule diff --git a/cosim/black-parrot-example/v/zynq_pkg.sv b/cosim/black-parrot-example/v/zynq_pkg.sv index da12581a..6143c0eb 100644 --- a/cosim/black-parrot-example/v/zynq_pkg.sv +++ b/cosim/black-parrot-example/v/zynq_pkg.sv @@ -17,6 +17,7 @@ package zynq_pkg; typedef struct packed { + bsg_tag_s counter_en; bsg_tag_s core_reset; } zynq_pl_tag_lines_s; localparam tag_pl_local_els_gp = $bits(zynq_pl_tag_lines_s)/$bits(bsg_tag_s); diff --git a/cosim/black-parrot-example/vcs/Makefile b/cosim/black-parrot-example/vcs/Makefile index 7299c5f0..36e41e14 100644 --- a/cosim/black-parrot-example/vcs/Makefile +++ b/cosim/black-parrot-example/vcs/Makefile @@ -3,26 +3,50 @@ include ../Makefile.design ############################# # Accelerator Software Settings ############################# + DROMAJO_COSIM ?= 0 -SKIP_DRAM ?= -DSKIP_DRAM_TESTING + BP_NCPUS ?= 1 +# AXI to core clock downsampling factor +CLK_DIV ?= 2 + +# Sampling gate enable and interval +SAMPLE_GATE_EN ?= 1 +SAMPLE_INTRVL ?= 1 + +# DRAM constant delay enable and latency +DRAM_GATE_EN ?= 1 +DRAM_LATENCY ?= 80 + +#CFLAGS += -DDRAM_TEST #CFLAGS += -DZYNQ_PL_DEBUG CFLAGS += -DZYNQ_PS_DEBUG -CFLAGS += -I$(BP_TOOLS_DIR)/dromajo/include CFLAGS += -DBP_NCPUS=$(BP_NCPUS) -CFLAGS += $(SKIP_DRAM) +CFLAGS += -DSAMPLE_GATE_EN=$(SAMPLE_GATE_EN) +CFLAGS += -DSAMPLE_INTRVL=$(SAMPLE_INTRVL) +CFLAGS += -DDRAM_GATE_EN=$(DRAM_GATE_EN) +CFLAGS += -DDRAM_LATENCY=$(DRAM_LATENCY) +ifeq ($(DROMAJO_COSIM),1) +CFLAGS += -I$(BP_TOOLS_DIR)/dromajo/include +endif +DEFINES += CLK_DIV=$(CLK_DIV) DEFINES += GP0_ENABLE -DEFINES += GP0_ADDR_BASE=0x40000000U GP0_ADDR_WIDTH=10 GP0_DATA_WIDTH=32 +DEFINES += GP0_ADDR_BASE=0x400000000U GP0_ADDR_WIDTH=10 GP0_DATA_WIDTH=32 DEFINES += GP0_HIER_BASE=bsg_nonsynth_zynq_testbench.axil0 DEFINES += GP1_ENABLE -DEFINES += GP1_ADDR_BASE=0x80000000U GP1_ADDR_WIDTH=30 GP1_DATA_WIDTH=32 +DEFINES += GP1_ADDR_BASE=0x500000000U GP1_ADDR_WIDTH=30 GP1_DATA_WIDTH=32 DEFINES += GP1_HIER_BASE=bsg_nonsynth_zynq_testbench.axil1 DEFINES += GP2_ENABLE -DEFINES += GP2_ADDR_BASE=0x80000000U GP2_ADDR_WIDTH=28 GP2_DATA_WIDTH=32 +DEFINES += GP2_ADDR_BASE=0x800000000U GP2_ADDR_WIDTH=28 GP2_DATA_WIDTH=32 DEFINES += GP2_HIER_BASE=bsg_nonsynth_zynq_testbench.axil2 +# watchdog cannot function if clock gating is enabled +ifndef SAMPLE_GATE_EN +ifndef DRAM_GATE_EN DEFINES += WATCHDOG_ENABLE +endif +endif DEFINES += HP0_ENABLE DEFINES += HP0_ADDR_BASE=0x0000000U HP0_ADDR_WIDTH=32 HP0_DATA_WIDTH=64 DEFINES += AXI_MEM_ENABLE @@ -58,10 +82,11 @@ $(FLIST): $(BP_FLIST) $(BASE_FLIST) echo "+incdir+$(COSIM_DIR)/include/vcs" >> $@ echo "$(BASEJUMP_STL_DIR)/bsg_test/bsg_nonsynth_clock_gen.sv" >> $@ echo "$(BASEJUMP_STL_DIR)/bsg_test/bsg_nonsynth_reset_gen.sv" >> $@ - echo "$(BASEJUMP_STL_DIR)/bsg_test/bsg_nonsynth_axi_mem.sv" >> $@ echo "$(BASEJUMP_STL_DIR)/bsg_test/bsg_nonsynth_dpi_clock_gen.cpp" >> $@ echo "$(BASEJUMP_STL_DIR)/bsg_test/bsg_nonsynth_dpi_clock_gen.sv" >> $@ echo "$(BASEJUMP_STL_DIR)/bsg_test/bsg_nonsynth_dpi_gpio.sv" >> $@ + echo "$(BASEJUMP_STL_DIR)/bsg_test/bsg_nonsynth_axi_mem.sv" >> $@ + echo "$(BASEJUMP_STL_DIR)/bsg_misc/bsg_icg_pos.sv" >> $@ echo "$(COSIM_VSRC_DIR)/bsg_nonsynth_dpi_to_axil.sv" >> $@ echo "$(COSIM_VSRC_DIR)/bsg_nonsynth_axil_to_dpi.sv" >> $@ echo "$(COSIM_VSRC_DIR)/bsg_nonsynth_zynq_testbench.sv" >> $@ diff --git a/cosim/black-parrot-example/verilator/Makefile b/cosim/black-parrot-example/verilator/Makefile index 42e6ecb1..cb8c2ae3 100644 --- a/cosim/black-parrot-example/verilator/Makefile +++ b/cosim/black-parrot-example/verilator/Makefile @@ -5,25 +5,48 @@ include ../Makefile.design ############################# DROMAJO_COSIM ?= 0 -SKIP_DRAM ?= -DSKIP_DRAM_TESTING + BP_NCPUS ?= 1 +# AXI to core clock downsampling factor +CLK_DIV ?= 2 + +# Sampling gate enable and interval +SAMPLE_GATE_EN ?= 1 +SAMPLE_INTRVL ?= 1 + +# DRAM constant delay enable and latency +DRAM_GATE_EN ?= 1 +DRAM_LATENCY ?= 80 + +#CFLAGS += -DDRAM_TEST #CFLAGS += -DZYNQ_PL_DEBUG CFLAGS += -DZYNQ_PS_DEBUG -CFLAGS += -I$(BP_TOOLS_DIR)/dromajo/include CFLAGS += -DBP_NCPUS=$(BP_NCPUS) -CFLAGS += $(SKIP_DRAM) +CFLAGS += -DSAMPLE_GATE_EN=$(SAMPLE_GATE_EN) +CFLAGS += -DSAMPLE_INTRVL=$(SAMPLE_INTRVL) +CFLAGS += -DDRAM_GATE_EN=$(DRAM_GATE_EN) +CFLAGS += -DDRAM_LATENCY=$(DRAM_LATENCY) +ifeq ($(DROMAJO_COSIM),1) +CFLAGS += -I$(BP_TOOLS_DIR)/dromajo/include +endif +DEFINES += CLK_DIV=$(CLK_DIV) DEFINES += GP0_ENABLE -DEFINES += GP0_ADDR_BASE=0x40000000U GP0_ADDR_WIDTH=10 GP0_DATA_WIDTH=32 +DEFINES += GP0_ADDR_BASE=0x400000000U GP0_ADDR_WIDTH=10 GP0_DATA_WIDTH=32 DEFINES += GP0_HIER_BASE=TOP.bsg_nonsynth_zynq_testbench.axil0 DEFINES += GP1_ENABLE -DEFINES += GP1_ADDR_BASE=0x80000000U GP1_ADDR_WIDTH=30 GP1_DATA_WIDTH=32 +DEFINES += GP1_ADDR_BASE=0x500000000U GP1_ADDR_WIDTH=30 GP1_DATA_WIDTH=32 DEFINES += GP1_HIER_BASE=TOP.bsg_nonsynth_zynq_testbench.axil1 DEFINES += GP2_ENABLE -DEFINES += GP2_ADDR_BASE=0x80000000U GP2_ADDR_WIDTH=28 GP2_DATA_WIDTH=32 +DEFINES += GP2_ADDR_BASE=0x800000000U GP2_ADDR_WIDTH=28 GP2_DATA_WIDTH=32 DEFINES += GP2_HIER_BASE=TOP.bsg_nonsynth_zynq_testbench.axil2 -#DEFINES += WATCHDOG_ENABLE +# watchdog cannot function if clock gating is enabled +ifndef SAMPLE_GATE_EN +ifndef DRAM_GATE_EN +DEFINES += WATCHDOG_ENABLE +endif +endif DEFINES += HP0_ENABLE DEFINES += HP0_ADDR_BASE=0x0000000U HP0_ADDR_WIDTH=32 HP0_DATA_WIDTH=64 DEFINES += AXI_MEM_ENABLE @@ -58,6 +81,8 @@ $(FLIST): $(BP_FLIST) $(BASE_FLIST) echo "$(BASEJUMP_STL_DIR)/bsg_test/bsg_nonsynth_dpi_clock_gen.cpp" >> $@ echo "$(BASEJUMP_STL_DIR)/bsg_test/bsg_nonsynth_dpi_clock_gen.sv" >> $@ echo "$(BASEJUMP_STL_DIR)/bsg_test/bsg_nonsynth_dpi_gpio.sv" >> $@ + echo "$(BASEJUMP_STL_DIR)/bsg_test/bsg_nonsynth_axi_mem.sv" >> $@ + echo "$(BASEJUMP_STL_DIR)/bsg_misc/bsg_icg_pos.sv" >> $@ echo "$(COSIM_VSRC_DIR)/bsg_nonsynth_dpi_to_axil.sv" >> $@ echo "$(COSIM_VSRC_DIR)/bsg_nonsynth_axil_to_dpi.sv" >> $@ echo "$(COSIM_VSRC_DIR)/bsg_nonsynth_zynq_testbench.sv" >> $@ diff --git a/cosim/black-parrot-example/vivado/Makefile b/cosim/black-parrot-example/vivado/Makefile index 988fa891..71e9ae41 100644 --- a/cosim/black-parrot-example/vivado/Makefile +++ b/cosim/black-parrot-example/vivado/Makefile @@ -3,6 +3,11 @@ include ../Makefile.design ############################# # Modify base flist ############################# +export CLK_FREQ ?= 160 +export CLK_DIV ?= 4 + +export VDEFINES ?= CLK_DIV=$(CLK_DIV) + BASE_FLIST ?= $(abspath ../flist.vcs) WD_FLIST ?= $(abspath flist.watchdog.vcs) TOP_FLIST ?= $(abspath flist.blackparrot.vcs) @@ -33,7 +38,8 @@ $(WD_FLIST): sed -i "s#.*bsg_mem_1rw_sync_mask_write_bit.sv#$(BASEJUMP_STL_DIR)/hard/ultrascale_plus/bsg_mem/bsg_mem_1rw_sync_mask_write_bit.sv#g" $@ $(TOP_FLIST): $(BLACKPARROT_DIR)/bp_top/syn/flist.vcs $(BASE_FLIST) - cat $^ | envsubst > $@ + echo "+define+CLK_DIV=$(CLK_DIV)" > $@ + cat $^ | envsubst >> $@ sed -i "s/BASEJUMP_STL_DIR/BP_BASEJUMP_STL_DIR/g" $@ sed -i "/bp_common_pkg.sv/d" $@ sed -i "1i $(CURR_VSRC_DIR)/bp_common_pkg.sv" $@ diff --git a/cosim/black-parrot-example/vivado/vivado-create-block.zynq.pynqz2.2022.1.tcl b/cosim/black-parrot-example/vivado/vivado-create-block.zynq.pynqz2.2022.1.tcl index 8f76b49e..aec74140 100644 --- a/cosim/black-parrot-example/vivado/vivado-create-block.zynq.pynqz2.2022.1.tcl +++ b/cosim/black-parrot-example/vivado/vivado-create-block.zynq.pynqz2.2022.1.tcl @@ -103,7 +103,6 @@ set_property range 1G [get_bd_addr_segs {processing_system7_0/Data/SEG_top_0_reg validate_bd_design make_wrapper -files [get_files ${project_name}.srcs/sources_1/bd/${project_bd}/${project_bd}.bd] -top add_files -norecurse ${project_name}.srcs/sources_1/bd/${project_bd}/hdl/${project_bd}_wrapper.v - #delete_bd_objs [get_bd_nets reset_rtl_0_1] [get_bd_ports reset_rtl_0] #connect_bd_net [get_bd_pins processing_system7_0/FCLK_RESET0_N] [get_bd_pins proc_sys_reset_0/ext_reset_in] diff --git a/cosim/black-parrot-example/vivado/vivado-create-block.zynq.ultra96v2.2020.1.tcl b/cosim/black-parrot-example/vivado/vivado-create-block.zynq.ultra96v2.2020.1.tcl index d6c77dc2..ed253f7c 100644 --- a/cosim/black-parrot-example/vivado/vivado-create-block.zynq.ultra96v2.2020.1.tcl +++ b/cosim/black-parrot-example/vivado/vivado-create-block.zynq.ultra96v2.2020.1.tcl @@ -2,6 +2,8 @@ set project_name $::env(BASENAME)_bd_proj set project_part $::env(PART) set project_bd $::env(BASENAME)_bd_1 set tcl_dir $::env(CURR_TCL_DIR) +set clk_freq $::env(CLK_FREQ) +set vdefines $::env(VDEFINES) create_project -force ${project_name} [pwd] -part ${project_part} create_bd_design "${project_bd}" @@ -12,7 +14,7 @@ update_ip_catalog startgroup create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.3 zynq_ultra_ps_e_0 -set_property -dict [list CONFIG.PSU__FPGA_PL0_ENABLE {1} CONFIG.PSU__CRL_APB__PL0_REF_CTRL__FREQMHZ {50}] [get_bd_cells zynq_ultra_ps_e_0] +set_property -dict [list CONFIG.PSU__FPGA_PL0_ENABLE {1} CONFIG.PSU__CRL_APB__PL0_REF_CTRL__FREQMHZ $clk_freq] [get_bd_cells zynq_ultra_ps_e_0] set_property -dict [list CONFIG.PSU__USE__M_AXI_GP0 {1} CONFIG.PSU__MAXIGP0__DATA_WIDTH {32}] [get_bd_cells zynq_ultra_ps_e_0] set_property -dict [list CONFIG.PSU__USE__M_AXI_GP1 {1} CONFIG.PSU__MAXIGP1__DATA_WIDTH {32}] [get_bd_cells zynq_ultra_ps_e_0] set_property -dict [list CONFIG.PSU__USE__S_AXI_GP3 {1} CONFIG.PSU__SAXIGP3__DATA_WIDTH {64}] [get_bd_cells zynq_ultra_ps_e_0] @@ -84,10 +86,10 @@ connect_bd_intf_net [get_bd_intf_pins watchdog_0/m_axil] [get_bd_intf_pins smart connect_bd_net [get_bd_pins top_0/tag_clk] [get_bd_pins watchdog_0/tag_clk] connect_bd_net [get_bd_pins top_0/tag_data] [get_bd_pins watchdog_0/tag_data] -apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config {Clk "/zynq_ultra_ps_e_0/pl_clk0 (50 MHz)" } [get_bd_pins zynq_ultra_ps_e_0/maxihpm0_fpd_aclk] -apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config {Clk "/zynq_ultra_ps_e_0/pl_clk0 (50 MHz)" } [get_bd_pins zynq_ultra_ps_e_0/maxihpm1_fpd_aclk] -apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config {Clk "/zynq_ultra_ps_e_0/pl_clk0 (50 MHz)" } [get_bd_pins zynq_ultra_ps_e_0/saxihp1_fpd_aclk] -apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config {Clk "/zynq_ultra_ps_e_0/pl_clk0 (50 MHz)" } [get_bd_pins proc_sys_reset_0/slowest_sync_clk] +apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config {Clk "/zynq_ultra_ps_e_0/pl_clk0 (${clk_freq} MHz)" } [get_bd_pins zynq_ultra_ps_e_0/maxihpm0_fpd_aclk] +apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config {Clk "/zynq_ultra_ps_e_0/pl_clk0 (${clk_freq} MHz)" } [get_bd_pins zynq_ultra_ps_e_0/maxihpm1_fpd_aclk] +apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config {Clk "/zynq_ultra_ps_e_0/pl_clk0 (${clk_freq} MHz)" } [get_bd_pins zynq_ultra_ps_e_0/saxihp1_fpd_aclk] +apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config {Clk "/zynq_ultra_ps_e_0/pl_clk0 (${clk_freq} MHz)" } [get_bd_pins proc_sys_reset_0/slowest_sync_clk] apply_bd_automation -rule xilinx.com:bd_rule:board -config {Manual_Source {Auto}} [get_bd_pins proc_sys_reset_0/ext_reset_in] create_bd_addr_seg -range 0x00002000 -offset 0x10000000 [get_bd_addr_spaces top_0/m01_axi] [get_bd_addr_segs axi_bram_ctrl_0/S_AXI/Mem0] SEG_axi_bram_ctrl_0_Mem0 @@ -101,13 +103,14 @@ validate_bd_design make_wrapper -files [get_files ${project_name}.srcs/sources_1/bd/${project_bd}/${project_bd}.bd] -top add_files -norecurse ${project_name}.srcs/sources_1/bd/${project_bd}/hdl/${project_bd}_wrapper.v - +set_property verilog_define ${vdefines} [current_fileset] save_bd_design # Change to 0 to have it stop before synthesis / implementation # so you can inspect the design with the GUI if {1} { + set_property STEPS.SYNTH_DESIGN.ARGS.GATED_CLOCK_CONVERSION auto [get_runs synth_1] launch_runs synth_1 -jobs 4 wait_on_run synth_1 open_run synth_1 -name synth_1 diff --git a/cosim/black-parrot-example/zynq/Makefile b/cosim/black-parrot-example/zynq/Makefile index e5e27ed2..04210d08 100644 --- a/cosim/black-parrot-example/zynq/Makefile +++ b/cosim/black-parrot-example/zynq/Makefile @@ -1,5 +1,13 @@ include ../Makefile.design +# Sampling gate enable and interval +SAMPLE_GATE_EN ?= 1 +SAMPLE_INTRVL ?= 1 + +# DRAM constant delay enable and latency +DRAM_GATE_EN ?= 1 +DRAM_LATENCY ?= 80 + ############################# # Accelerator Software Settings ############################# @@ -8,6 +16,10 @@ CFLAGS += -DZYNQ_PS_DEBUG CFLAGS += -DDRAM_TEST CFLAGS += -DFREE_DRAM=1 CFLAGS += -DZERO_DRAM=1 +CFLAGS += -DSAMPLE_GATE_EN=$(SAMPLE_GATE_EN) +CFLAGS += -DSAMPLE_INTRVL=$(SAMPLE_INTRVL) +CFLAGS += -DDRAM_GATE_EN=$(DRAM_GATE_EN) +CFLAGS += -DDRAM_LATENCY=$(DRAM_LATENCY) SIM_ARGS += $(NBF_FILE) @@ -16,17 +28,22 @@ DEFINES += GP0_ENABLE DEFINES += GP0_ADDR_BASE=0x40000000 GP0_ADDR_WIDTH=10 GP0_ADDR_SIZE_BYTES=4096 GP0_DATA_WIDTH=32 DEFINES += GP1_ENABLE DEFINES += GP1_ADDR_BASE=0x80000000 GP1_ADDR_WIDTH=30 GP1_ADDR_SIZE_BYTES=0x30000000 GP1_DATA_WIDTH=32 -DEFINES += DRAM_ALLOCATE_SIZE_MB=80 +DEFINES += DRAM_ALLOCATE_SIZE_MB=241 else ifeq ($(BOARDNAME),ultra96v2) DEFINES += GP0_ENABLE GP0_ADDR_WIDTH=10 DEFINES += GP0_ADDR_BASE=0x400000000 GP0_ADDR_WIDTH=10 GP0_ADDR_SIZE_BYTES=4096 GP0_DATA_WIDTH=32 DEFINES += GP1_ENABLE DEFINES += GP1_ADDR_BASE=0x500000000 GP1_ADDR_WIDTH=30 GP1_ADDR_SIZE_BYTES=0x30000000 GP1_DATA_WIDTH=32 -DEFINES += DRAM_ALLOCATE_SIZE_MB=200 +DEFINES += DRAM_ALLOCATE_SIZE_MB=241 endif DEFINES += GP2_ENABLE DEFINES += GP2_ADDR_BASE=0x80000000U GP2_ADDR_WIDTH=28 GP2_DATA_WIDTH=32 +# watchdog cannot function if clock gating is enabled +ifndef SAMPLE_GATE_EN +ifndef DRAM_GATE_EN DEFINES += WATCHDOG_ENABLE +endif +endif DEFINES += HP0_ENABLE DEFINES += HP0_ADDR_BASE=0x0000000U HP0_ADDR_WIDTH=32 HP0_DATA_WIDTH=64 DEFINES += AXI_MEM_ENABLE diff --git a/cosim/import/basejump_stl b/cosim/import/basejump_stl index 14f29e4d..209813fa 160000 --- a/cosim/import/basejump_stl +++ b/cosim/import/basejump_stl @@ -1 +1 @@ -Subproject commit 14f29e4d8a498ad62ec05fe232407f9d887e36a9 +Subproject commit 209813fae5f40ae148cf6026ff8c46097f98cd52 diff --git a/cosim/import/black-parrot b/cosim/import/black-parrot index ea328c9d..ed1a24b3 160000 --- a/cosim/import/black-parrot +++ b/cosim/import/black-parrot @@ -1 +1 @@ -Subproject commit ea328c9dcf918825e9c92353b00d32df24a63415 +Subproject commit ed1a24b3ddc4819852a4f08c1f4cd55b368f4844 diff --git a/cosim/import/black-parrot-subsystems b/cosim/import/black-parrot-subsystems index 6fd09ffd..642f29d3 160000 --- a/cosim/import/black-parrot-subsystems +++ b/cosim/import/black-parrot-subsystems @@ -1 +1 @@ -Subproject commit 6fd09ffd4d5dcb90457154da21e12a69cace199f +Subproject commit 642f29d3b4c656d3b60d6b81c51b593eed895901 diff --git a/cosim/include/common/bsg_axil.h b/cosim/include/common/bsg_axil.h index 3ba6f22c..1177c14f 100644 --- a/cosim/include/common/bsg_axil.h +++ b/cosim/include/common/bsg_axil.h @@ -17,7 +17,7 @@ #include "bsg_printing.h" #ifndef ZYNQ_AXI_TIMEOUT -#define ZYNQ_AXI_TIMEOUT 8000 +#define ZYNQ_AXI_TIMEOUT 50000 #endif using namespace std; diff --git a/cosim/include/common/bsg_zynq_pl_hardware.h b/cosim/include/common/bsg_zynq_pl_hardware.h index 88ac7f14..cce83ecc 100644 --- a/cosim/include/common/bsg_zynq_pl_hardware.h +++ b/cosim/include/common/bsg_zynq_pl_hardware.h @@ -26,11 +26,13 @@ #include "bsg_argparse.h" #include "bsg_printing.h" #include "zynq_headers.h" +#ifdef NEON +#include "arm_neon.h" +#endif using namespace std; class bsg_zynq_pl_hardware { protected: - bool debug = ZYNQ_PL_DEBUG; int serial_port; uintptr_t gp0_base_offset = 0; uintptr_t gp1_base_offset = 0; @@ -125,16 +127,12 @@ class bsg_zynq_pl_hardware { volatile int32_t *ptr32 = axil_get_ptr32(address); int32_t data = *ptr32; - if (debug) - printf(" bsg_zynq_pl: AXI reading [%" PRIxPTR "]->%8.8x\n", address, data); - + bsg_pr_dbg_pl("AXI reading [%" PRIxPTR "]->%8.8x\n", address, data); return data; } inline void axil_write(uintptr_t address, int32_t data, uint8_t wstrb) { - if (debug) - printf(" bsg_zynq_pl: AXI writing [%" PRIxPTR "]=%8.8x mask %" PRIu8 "\n", address, data, - wstrb); + bsg_pr_dbg_pl("AXI writing [%" PRIxPTR "]=%8.8x mask %" PRIu8 "\n", address, data, wstrb); // for now we don't support alternate write strobes assert(wstrb == 0XF || wstrb == 0x3 || wstrb == 0x1); @@ -152,6 +150,32 @@ class bsg_zynq_pl_hardware { assert(false); // Illegal write strobe } } + +#ifdef NEON + inline uint32x2_t axil_2read(uintptr_t address) { + // Only aligned 64B reads + assert(alignof(address) >= 8); + volatile uint32_t *ptr32x2 = (uint32_t *)axil_get_ptr32(address); + uint32x2_t data = vld1_u32((const uint32_t *)ptr32x2); + + bsg_pr_dbg_pl("AXI reading [%" PRIxPTR "]->%8.8x\n", address, data[0]); + bsg_pr_dbg_pl("AXI reading [%" PRIxPTR "]->%8.8x\n", address + 4, data[1]); + return data; + } + + inline uint32x4_t axil_4read(uintptr_t address) { + // Only aligned 128B reads + assert(alignof(address) >= 16); + volatile uint32_t *ptr32x4 = (uint32_t *)axil_get_ptr32(address); + uint32x4_t data = vld1q_u32((const uint32_t *)ptr32x4); + + bsg_pr_dbg_pl("AXI reading [%" PRIxPTR "]->%8.8x\n", address, data[0]); + bsg_pr_dbg_pl("AXI reading [%" PRIxPTR "]->%8.8x\n", address + 4, data[1]); + bsg_pr_dbg_pl("AXI reading [%" PRIxPTR "]->%8.8x\n", address + 8, data[2]); + bsg_pr_dbg_pl("AXI reading [%" PRIxPTR "]->%8.8x\n", address + 12, data[3]); + return data; + } +#endif #endif #ifdef UART_ENABLE diff --git a/cosim/mk/Makefile.verilator b/cosim/mk/Makefile.verilator index 897e0a16..fb3e385e 100644 --- a/cosim/mk/Makefile.verilator +++ b/cosim/mk/Makefile.verilator @@ -14,6 +14,7 @@ CFLAGS += -DVERILATOR CFLAGS += -DHOST_$(call upper,$(HOST)) CFLAGS += $(addprefix -D,$(DEFINES)) CFLAGS += -std=c++14 +CFLAGS += -lpthread LDFLAGS += -L$(BP_SDK_INSTALL_DIR)/lib LDFLAGS += -lboost_coroutine -lboost_context -lboost_system @@ -43,15 +44,15 @@ obj_dir/V$(TB_MODULE): $(FLIST) $(BUILD_COLLATERAL) clean: rm -rf flist*.vcs - rm -rf obj_dir/ *~ trace.fst + rm -rf obj_dir/ *~ rm -rf build.log rm -rf run.log - rm -rf *.nbf - rm -rf *.elf - rm -rf *.trace - rm -rf *.riscv* + #rm -rf *.nbf + #rm -rf *.elf + #rm -rf *.trace + #rm -rf *.riscv* rm -rf *.rv*o* - rm -rf *.dump* + #rm -rf *.dump* rm -rf $(BUILD_COLLATERAL) $(SIM_COLLATERAL) view: diff --git a/cosim/mk/Makefile.zynq b/cosim/mk/Makefile.zynq index f62516cd..dc35614c 100644 --- a/cosim/mk/Makefile.zynq +++ b/cosim/mk/Makefile.zynq @@ -11,8 +11,9 @@ CFLAGS += -DZYNQ CFLAGS += -DHOST_$(call upper,$(HOST)) CFLAGS += $(addprefix -D,$(DEFINES)) CFLAGS += -O2 -CFLAGS += -lcma -lpthread +CFLAGS += $(SIM_DEFINES) CFLAGS += -std=c++14 +LFLAGS += -lcma -lpthread CINCLUDES += -I$(COSIM_INCLUDE_DIR)/common CINCLUDES += -I$(COSIM_INCLUDE_DIR)/zynq @@ -29,7 +30,7 @@ run: $(EXE) $(RUN_COLLATERAL) sudo ./$< $(SIM_ARGS) | tee run.log $(EXE): $(HOST_PROGRAM) $(BUILD_COLLATERAL) - $(GCC) $(CFLAGS) $(LDFLAGS) $(CINCLUDES) $(CSOURCES) $< $(CFLAGS) $(LDFLAGS) -o $@ + $(GCC) $(CFLAGS) $(CINCLUDES) $(CSOURCES) $< $(LFLAGS) -o $@ reset_pl: echo 'from pynq import PL; PL.reset();' | sudo -E $(PYTHON3) diff --git a/cosim/v/bsg_nonsynth_zynq_testbench.sv b/cosim/v/bsg_nonsynth_zynq_testbench.sv index 026a9821..78dfe2c5 100644 --- a/cosim/v/bsg_nonsynth_zynq_testbench.sv +++ b/cosim/v/bsg_nonsynth_zynq_testbench.sv @@ -36,17 +36,6 @@ module bsg_nonsynth_zynq_testbench; aclk_gen (.o(aclk)); - logic core_clk; -`ifdef ASYNC_ACLK_CORE_CLK - localparam aclk_period_lp = 40000; - bsg_nonsynth_clock_gen - #(.cycle_time_p(core_clk_period_lp)) - core_clk_gen - (.o(core_clk)); -`elsif - assign core_clk = aclk; -`endif - logic areset; bsg_nonsynth_reset_gen #(.reset_cycles_lo_p(0), .reset_cycles_hi_p(10))