From 6f7261529b1db32b0dbb7a9dc7b316fdef89e4f3 Mon Sep 17 00:00:00 2001 From: stnolting <22944758+stnolting@users.noreply.github.com> Date: Wed, 13 Apr 2022 19:41:17 +0200 Subject: [PATCH 1/2] rework CPU shifter unit move register stage of barrel shifter to improve timing --- rtl/core/neorv32_cpu_cp_shifter.vhd | 80 +++++++++++++---------------- rtl/core/neorv32_package.vhd | 2 +- 2 files changed, 36 insertions(+), 46 deletions(-) diff --git a/rtl/core/neorv32_cpu_cp_shifter.vhd b/rtl/core/neorv32_cpu_cp_shifter.vhd index 4dd1e0c20..e4ad996d1 100644 --- a/rtl/core/neorv32_cpu_cp_shifter.vhd +++ b/rtl/core/neorv32_cpu_cp_shifter.vhd @@ -1,9 +1,8 @@ -- ################################################################################################# --- # << NEORV32 - CPU Co-Processor: Shifter (CPU Core ISA) >> # +-- # << NEORV32 - CPU Co-Processor: Shifter (CPU Base ISA) >> # -- # ********************************************************************************************* # --- # Bit-shift unit for base ISA. # --- # FAST_SHIFT_EN = false (default): Use bit-serial shifter architecture (small but slow) # --- # FAST_SHIFT_EN = true: Use barrel shifter architecture (large but fast) # +-- # FAST_SHIFT_EN = false (default) : Use bit-serial shifter architecture (small but slow) # +-- # FAST_SHIFT_EN = true : Use barrel shifter architecture (large but fast) # -- # ********************************************************************************************* # -- # BSD 3-Clause License # -- # # @@ -45,7 +44,7 @@ use neorv32.neorv32_package.all; entity neorv32_cpu_cp_shifter is generic ( - FAST_SHIFT_EN : boolean -- use barrel shifter for shift operations + FAST_SHIFT_EN : boolean -- implement fast but large barrel shifter ); port ( -- global control -- @@ -78,21 +77,24 @@ architecture neorv32_cpu_cp_shifter_rtl of neorv32_cpu_cp_shifter is -- barrel shifter -- type bs_level_t is array (index_size_f(data_width_c) downto 0) of std_ulogic_vector(data_width_c-1 downto 0); signal bs_level : bs_level_t; + signal bs_start : std_ulogic; signal bs_result : std_ulogic_vector(data_width_c-1 downto 0); begin - -- Iterative Shifter Core (small but slow) ------------------------------------------------ + -- Serial Shifter (small but slow) -------------------------------------------------------- -- ------------------------------------------------------------------------------------------- - serial_shifter_sync: + serial_shifter: if (FAST_SHIFT_EN = false) generate - shifter_unit_sync: process(rstn_i, clk_i) + + -- shifter core -- + serial_shifter_core: process(rstn_i, clk_i) begin if (rstn_i = '0') then - shifter.busy <= '0'; shifter.busy_ff <= def_rst_val_c; - shifter.sreg <= (others => def_rst_val_c); + shifter.busy <= '0'; shifter.cnt <= (others => def_rst_val_c); + shifter.sreg <= (others => def_rst_val_c); elsif rising_edge(clk_i) then shifter.busy_ff <= shifter.busy; if (start_i = '1') then @@ -102,8 +104,8 @@ begin end if; -- if (start_i = '1') then -- trigger new shift - shifter.sreg <= rs1_i; -- shift operand shifter.cnt <= shamt_i; -- shift amount + shifter.sreg <= rs1_i; -- shift data elsif (or_reduce_f(shifter.cnt) = '1') then -- running shift (cnt != 0) shifter.cnt <= std_ulogic_vector(unsigned(shifter.cnt) - 1); if (ctrl_i(ctrl_ir_funct3_2_c) = '0') then -- SLL: shift left logical @@ -113,23 +115,23 @@ begin end if; end if; end if; - end process shifter_unit_sync; - end generate; + end process serial_shifter_core; - -- shift control/output -- - serial_shifter_ctrl: - if (FAST_SHIFT_EN = false) generate + -- shift control/output -- shifter.done <= '1' when (or_reduce_f(shifter.cnt(shifter.cnt'left downto 1)) = '0') else '0'; valid_o <= shifter.busy and shifter.done; res_o <= shifter.sreg when (shifter.busy = '0') and (shifter.busy_ff = '1') else (others => '0'); - end generate; + + end generate; -- /serial_shifter - -- Barrel Shifter Core (fast but large) --------------------------------------------------- + -- Barrel Shifter (fast but large) -------------------------------------------------------- -- ------------------------------------------------------------------------------------------- - barrel_shifter_async: + barrel_shifter: if (FAST_SHIFT_EN = true) generate - shifter_unit_async: process(rs1_i, shamt_i, ctrl_i, bs_level) + + -- shifter core -- + barrel_shifter_core: process(rs1_i, shamt_i, ctrl_i, bs_level) begin -- input level: convert left shifts to right shifts -- if (ctrl_i(ctrl_ir_funct3_2_c) = '0') then -- is left shift? @@ -137,8 +139,7 @@ begin else bs_level(index_size_f(data_width_c)) <= rs1_i; end if; - - -- shifter array -- + -- shifter array (right-shifts only) -- for i in index_size_f(data_width_c)-1 downto 0 loop if (shamt_i(i) = '1') then bs_level(i)(data_width_c-1 downto data_width_c-(2**i)) <= (others => (bs_level(i+1)(data_width_c-1) and ctrl_i(ctrl_ir_funct12_10_c))); @@ -147,35 +148,24 @@ begin bs_level(i) <= bs_level(i+1); end if; end loop; + end process barrel_shifter_core; - -- re-convert original left shifts -- - if (ctrl_i(ctrl_ir_funct3_2_c) = '0') then - bs_result <= bit_rev_f(bs_level(0)); - else - bs_result <= bs_level(0); - end if; - end process shifter_unit_async; - end generate; - - -- output register -- - barrel_shifter_sync: - if (FAST_SHIFT_EN = true) generate - shifter_unit_sync: process(clk_i) + -- pipeline register -- + barrel_shifter_buf: process(clk_i) begin if rising_edge(clk_i) then - res_o <= (others => '0'); - if (start_i = '1') then - res_o <= bs_result; - end if; + bs_start <= start_i; + bs_result <= bs_level(0); -- this register can be moved by the register balancing end if; - end process shifter_unit_sync; - end generate; + end process barrel_shifter_buf; - -- shift control/output -- - barrel_shifter_ctrl: - if (FAST_SHIFT_EN = true) generate + -- output gate and re-convert original left shifts -- + res_o <= (others => '0') when (bs_start = '0') else bit_rev_f(bs_result) when (ctrl_i(ctrl_ir_funct3_2_c) = '0') else bs_result; + + -- processing done -- valid_o <= start_i; - end generate; + + end generate; -- /barrel_shifter end neorv32_cpu_cp_shifter_rtl; diff --git a/rtl/core/neorv32_package.vhd b/rtl/core/neorv32_package.vhd index bdb6b1dcd..86109902e 100644 --- a/rtl/core/neorv32_package.vhd +++ b/rtl/core/neorv32_package.vhd @@ -68,7 +68,7 @@ package neorv32_package is -- Architecture Constants (do not modify!) ------------------------------------------------ -- ------------------------------------------------------------------------------------------- constant data_width_c : natural := 32; -- native data path width - do not change! - constant hw_version_c : std_ulogic_vector(31 downto 0) := x"01070003"; -- NEORV32 version - no touchy! + constant hw_version_c : std_ulogic_vector(31 downto 0) := x"01070004"; -- NEORV32 version - no touchy! constant archid_c : natural := 19; -- official NEORV32 architecture ID - hands off! -- Check if we're inside the Matrix ------------------------------------------------------- From 6142ff7ed546f4930702670ac96b2054765f317e Mon Sep 17 00:00:00 2001 From: stnolting <22944758+stnolting@users.noreply.github.com> Date: Wed, 13 Apr 2022 19:47:45 +0200 Subject: [PATCH 2/2] add v1.7.0.4 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 62263d72c..cd64f9649 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,7 @@ mimpid = 0x01040312 => 01.04.03.12 => Version 01.04.03.12 => v1.4.3.12 | Date (*dd.mm.yyyy*) | Version | Comment | |:----------:|:-------:|:--------| +| 13.04.2022 | 1.7.0.4 | improve timing of CPU's barrel shifter (`FAST_SHIFT_EN` = true) by moving the register stage; [#301](https://github.com/stnolting/neorv32/pull/301) | | 12.04.2022 | 1.7.0.3 | CPU front-end is now controlled by a _synchronous_ state machine (all outgoing signals are driven by registers), reducing critical path of memory system & reducing area costs; :warning: `CPU_IPB_ENTRIES` now has to be >= 2; [#300](https://github.com/stnolting/neorv32/pull/300) | | 11.04.2022 | 1.7.0.2 | cleanup of CPU front-end (instruction fetch); cleaner code, less area costs; [#299](https://github.com/stnolting/neorv32/pull/299) | | 10.04.2022 | 1.7.0.1 | rework handling of `x0` register (`zero`): shortens critical path and reduces area costs; [#298](https://github.com/stnolting/neorv32/pull/298) |