From adf4b219aba1b4cb18cb3c85c9f035ad3d709acd Mon Sep 17 00:00:00 2001 From: stnolting <22944758+stnolting@users.noreply.github.com> Date: Sat, 7 May 2022 21:59:42 +0200 Subject: [PATCH 1/4] fix B instruction timings --- docs/datasheet/cpu.adoc | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/docs/datasheet/cpu.adoc b/docs/datasheet/cpu.adoc index 460e42571..21640fc16 100644 --- a/docs/datasheet/cpu.adoc +++ b/docs/datasheet/cpu.adoc @@ -822,14 +822,13 @@ configurations are presented in <<_cpu_performance>>. | Floating-point - misc | `Zfinx` | `fsgnj.s` `fsgnjn.s` `fsgnjx.s` `fclass.s` | 12 | Floating-point - conversion | `Zfinx` | `fcvt.w.s` `fcvt.wu.s` | 47 | Floating-point - conversion | `Zfinx` | `fcvt.s.w` `fcvt.s.wu` | 48 -| Bit-manipulation - arithmetic/logic | `B(Zbb)` | `sext.b` `sext.h` `min` `minu` `max` `maxu` `andn` `orn` `xnor` `zext`(pack) `rev8`(grevi) `orc.b`(gorci) | 3 -| Bit-manipulation - arithmetic/logic | `B(Zba)` | `sh1add` `sh2add` `sh3add` | 3 -| Bit-manipulation - shifts | `B(Zbb)` | `clz` `ctz` | 3 + 0..32 -| Bit-manipulation - shifts | `B(Zbb)` | `cpop` | 3 + 32 -| Bit-manipulation - shifts | `B(Zbb)` | `rol` `ror` `rori` | 3 + SA -| Bit-manipulation - single-bit | `B(Zbs)` | `sbset[i]` `sbclr[i]` `sbinv[i]` `sbext[i]` | 3 -| Bit-manipulation - shifted-add | `B(Zba)` | `sh1add` `sh2add` `sh3add` | 3 -| Bit-manipulation - carry-less multiply | `B(Zbc)` | `clmul` `clmulh` `clmulr` | 3 + 32 +| Bit-manipulation - arithmetic/logic | `B(Zbb)` | `min[u]` `max[u]` `sext.b` `sext.h` `andn` `orn` `xnor` `zext`(pack) `rev8`(grevi) `orc.b`(gorci) | 4 +| Bit-manipulation - shifts | `B(Zbb)` | `clz` `ctz` | 4 + 1..32; FAST_SHIFT: 4 +| Bit-manipulation - shifts | `B(Zbb)` | `cpop` | 4 + 32; FAST_SHIFT: 4 +| Bit-manipulation - shifts | `B(Zbb)` | `rol` `ror[i]` | 4 + SA; FAST_SHIFT: 4 +| Bit-manipulation - shifted-add | `B(Zba)` | `sh1add` `sh2add` `sh3add` | 4 +| Bit-manipulation - single-bit | `B(Zbs)` | `sbset[i]` 
`sbclr[i]` `sbinv[i]` `sbext[i]` | 4 +| Bit-manipulation - carry-less multiply | `B(Zbc)` | `clmul` `clmulh` `clmulr` | 4 + 32 | Custom instructions (CFU) | `Zxcfu` | - | min. 4 | | | | | _Illegal instructions_ | `Zicsr` | - | min. 2 From 14d3a29b23ecd83ea5a5c5c0f904a42298892de6 Mon Sep 17 00:00:00 2001 From: stnolting <22944758+stnolting@users.noreply.github.com> Date: Sat, 7 May 2022 21:59:56 +0200 Subject: [PATCH 2/4] code cleanup, faster shifts --- rtl/core/neorv32_cpu_cp_bitmanip.vhd | 154 +++++++++++++-------------- 1 file changed, 72 insertions(+), 82 deletions(-) diff --git a/rtl/core/neorv32_cpu_cp_bitmanip.vhd b/rtl/core/neorv32_cpu_cp_bitmanip.vhd index c83930f6d..525182994 100644 --- a/rtl/core/neorv32_cpu_cp_bitmanip.vhd +++ b/rtl/core/neorv32_cpu_cp_bitmanip.vhd @@ -7,6 +7,9 @@ -- # - Zbs: Single-bit instructions # -- # - Zbc: Carry-less multiplication instructions # -- # # +-- # Processor/CPU configuration generic FAST_SHIFT_EN can be used to enable implementation of fast # +-- # (full-parallel) logic for all shift-related instructions (ROL, ROR[I], CLZ, CTZ, CPOP). # +-- # # -- # NOTE: This is a first implementation of the bit-manipulation co-processor that supports all # -- # sub-sets of the B extension. Hence, it is not yet optimized for area, latency or speed. 
# -- # ********************************************************************************************* # @@ -80,43 +83,43 @@ architecture neorv32_cpu_cp_bitmanip_rtl of neorv32_cpu_cp_bitmanip is -- -------------------------------------------------------- -- Zbb - logic with negate -- - constant op_andn_c : natural := 0; - constant op_orn_c : natural := 1; - constant op_xnor_c : natural := 2; + constant op_andn_c : natural := 0; + constant op_orn_c : natural := 1; + constant op_xnor_c : natural := 2; -- Zbb - count leading/trailing zero bits -- - constant op_clz_c : natural := 3; - constant op_ctz_c : natural := 4; + constant op_clz_c : natural := 3; + constant op_ctz_c : natural := 4; -- Zbb - count population -- - constant op_cpop_c : natural := 5; + constant op_cpop_c : natural := 5; -- Zbb - integer minimum/maximum -- - constant op_max_c : natural := 6; -- signed/unsigned - constant op_min_c : natural := 7; -- signed/unsigned + constant op_max_c : natural := 6; -- signed/unsigned + constant op_min_c : natural := 7; -- signed/unsigned -- Zbb - sign- and zero-extension -- - constant op_sextb_c : natural := 8; - constant op_sexth_c : natural := 9; - constant op_zexth_c : natural := 10; + constant op_sextb_c : natural := 8; + constant op_sexth_c : natural := 9; + constant op_zexth_c : natural := 10; -- Zbb - bitwise rotation -- - constant op_rol_c : natural := 11; - constant op_ror_c : natural := 12; -- also rori + constant op_rol_c : natural := 11; + constant op_ror_c : natural := 12; -- also rori -- Zbb - or-combine -- - constant op_orcb_c : natural := 13; + constant op_orcb_c : natural := 13; -- Zbb - byte-reverse -- - constant op_rev8_c : natural := 14; + constant op_rev8_c : natural := 14; -- Zba - shifted-add -- - constant op_sh1add_c : natural := 15; - constant op_sh2add_c : natural := 16; - constant op_sh3add_c : natural := 17; + constant op_sh1add_c : natural := 15; + constant op_sh2add_c : natural := 16; + constant op_sh3add_c : natural := 17; -- Zbs - 
single-bit operations -- - constant op_bclr_c : natural := 18; - constant op_bext_c : natural := 19; - constant op_binv_c : natural := 20; - constant op_bset_c : natural := 21; + constant op_bclr_c : natural := 18; + constant op_bext_c : natural := 19; + constant op_binv_c : natural := 20; + constant op_bset_c : natural := 21; -- Zbc - carry-less multiplication -- - constant op_clmul_c : natural := 22; - constant op_clmulh_c : natural := 23; - constant op_clmulr_c : natural := 24; + constant op_clmul_c : natural := 22; + constant op_clmulh_c : natural := 23; + constant op_clmulr_c : natural := 24; -- - constant op_width_c : natural := 25; + constant op_width_c : natural := 25; -- controller -- type ctrl_state_t is (S_IDLE, S_START_SHIFT, S_BUSY_SHIFT, S_START_CLMUL, S_BUSY_CLMUL); @@ -125,10 +128,10 @@ architecture neorv32_cpu_cp_bitmanip_rtl of neorv32_cpu_cp_bitmanip is signal valid : std_ulogic; -- operand buffers -- - signal rs1_reg : std_ulogic_vector(data_width_c-1 downto 0); - signal rs2_reg : std_ulogic_vector(data_width_c-1 downto 0); - signal sha_reg : std_ulogic_vector(index_size_f(data_width_c)-1 downto 0); - signal less_ff : std_ulogic; + signal rs1_reg : std_ulogic_vector(data_width_c-1 downto 0); + signal rs2_reg : std_ulogic_vector(data_width_c-1 downto 0); + signal sha_reg : std_ulogic_vector(index_size_f(data_width_c)-1 downto 0); + signal less_reg : std_ulogic; -- serial shifter -- type shifter_t is record @@ -152,7 +155,7 @@ architecture neorv32_cpu_cp_bitmanip_rtl of neorv32_cpu_cp_bitmanip is -- shifted-add unit -- signal adder_core : std_ulogic_vector(data_width_c-1 downto 0); - -- one-hot shifter -- + -- one-hot decoder -- signal one_hot_core : std_ulogic_vector(data_width_c-1 downto 0); -- carry-less multiplier -- @@ -231,7 +234,7 @@ begin rs1_reg <= (others => def_rst_val_c); rs2_reg <= (others => def_rst_val_c); sha_reg <= (others => def_rst_val_c); - less_ff <= def_rst_val_c; + less_reg <= def_rst_val_c; clmul.start <= '0'; shifter.start 
<= '0'; valid <= '0'; @@ -241,24 +244,24 @@ begin clmul.start <= '0'; valid <= '0'; + -- operand registers -- + if (start_i = '1') then + less_reg <= cmp_i(cmp_less_c); + cmd_buf <= cmd; + rs1_reg <= rs1_i; + rs2_reg <= rs2_i; + sha_reg <= shamt_i; + end if; + -- fsm -- case ctrl_state is when S_IDLE => -- wait for operation trigger -- ------------------------------------------------------------ if (start_i = '1') then - less_ff <= cmp_i(cmp_less_c); - cmd_buf <= cmd; - rs1_reg <= rs1_i; - rs2_reg <= rs2_i; - sha_reg <= shamt_i; - if ((cmd(op_clz_c) or cmd(op_ctz_c) or cmd(op_cpop_c) or cmd(op_ror_c) or cmd(op_rol_c)) = '1') then -- multi-cycle shift operation - if (FAST_SHIFT_EN = false) then -- default: iterative computation - shifter.start <= '1'; - ctrl_state <= S_START_SHIFT; - else -- full-parallel computation - ctrl_state <= S_BUSY_SHIFT; - end if; + if (FAST_SHIFT_EN = false) and ((cmd(op_clz_c) or cmd(op_ctz_c) or cmd(op_cpop_c) or cmd(op_ror_c) or cmd(op_rol_c)) = '1') then -- multi-cycle shift operation + shifter.start <= '1'; + ctrl_state <= S_START_SHIFT; elsif (zbc_en_c = true) and ((cmd(op_clmul_c) or cmd(op_clmulh_c) or cmd(op_clmulr_c)) = '1') then -- multi-cycle clmul operation clmul.start <= '1'; ctrl_state <= S_START_CLMUL; @@ -303,6 +306,7 @@ begin -- ------------------------------------------------------------------------------------------- serial_shifter: if (FAST_SHIFT_EN = false) generate + shifter_unit: process(rstn_i, clk_i) variable new_bit_v : std_ulogic; begin @@ -315,7 +319,7 @@ begin if (shifter.start = '1') then -- trigger new shift shifter.cnt <= (others => '0'); -- shift operand -- - if (cmd_buf(op_clz_c) = '1') or (cmd_buf(op_rol_c) = '1') then -- count LEADING zeros / rotate LEFT + if (cmd_buf(op_clz_c) = '1') or (cmd_buf(op_rol_c) = '1') then -- clz, rol shifter.sreg <= bit_rev_f(rs1_reg); -- reverse - we can only do right shifts here else -- ctz, cpop, ror shifter.sreg <= rs1_reg; @@ -338,14 +342,11 @@ begin end if; end if; 
end process shifter_unit; - end generate; - -- run control -- - serial_shifter_ctrl: - if (FAST_SHIFT_EN = false) generate + -- run control -- shifter_unit_ctrl: process(cmd_buf, shifter) begin - -- keep shifting until ... -- + -- keep shifting until all bits are processed -- if (cmd_buf(op_clz_c) = '1') or (cmd_buf(op_ctz_c) = '1') then -- count leading/trailing zeros shifter.run <= not shifter.sreg(0); else -- population count / rotate @@ -356,40 +357,17 @@ begin end if; end if; end process shifter_unit_ctrl; - end generate; + + end generate; -- /serial_shifter -- Shifter Function Core (parallel: fast but large) --------------------------------------- -- ------------------------------------------------------------------------------------------- - barrel_shifter_async_sync: + parallel_shifter: if (FAST_SHIFT_EN = true) generate - shifter_unit_fast: process(rstn_i, clk_i) - variable new_bit_v : std_ulogic; - begin - if (rstn_i = '0') then - shifter.cnt <= (others => def_rst_val_c); - shifter.sreg <= (others => def_rst_val_c); - shifter.bcnt <= (others => def_rst_val_c); - elsif rising_edge(clk_i) then - -- population count -- - shifter.bcnt <= std_ulogic_vector(to_unsigned(popcount_f(rs1_reg), shifter.bcnt'length)); - -- count leading/trailing zeros -- - if cmd_buf(op_clz_c) = '1' then -- leading - shifter.cnt <= std_ulogic_vector(to_unsigned(leading_zeros_f(rs1_reg), shifter.cnt'length)); - else -- trailing - shifter.cnt <= std_ulogic_vector(to_unsigned(leading_zeros_f(bit_rev_f(rs1_reg)), shifter.cnt'length)); - end if; - -- barrel shifter -- - shifter.sreg <= bs_level(0); -- rol/ror[i] - end if; - end process shifter_unit_fast; - shifter.run <= '0'; -- we are done already! 
- end generate; - -- barrel shifter array -- - barrel_shifter_async: - if (FAST_SHIFT_EN = true) generate - shifter_unit_async: process(rs1_reg, sha_reg, cmd_buf, bs_level) + -- barrel shifter array -- + barrel_shifter: process(cmd_buf, rs1_reg, sha_reg, bs_level) begin -- input level: convert left shifts to right shifts -- if (cmd_buf(op_rol_c) = '1') then -- is left shift? @@ -397,7 +375,6 @@ begin else bs_level(index_size_f(data_width_c)) <= rs1_reg; end if; - -- shifter array -- for i in index_size_f(data_width_c)-1 downto 0 loop if (sha_reg(i) = '1') then @@ -407,8 +384,21 @@ begin bs_level(i) <= bs_level(i+1); end if; end loop; - end process shifter_unit_async; - end generate; + end process barrel_shifter; + + -- shift result -- + shifter.sreg <= bs_level(0); -- rol/ror[i] + + -- population count -- + shifter.bcnt <= std_ulogic_vector(to_unsigned(popcount_f(rs1_reg), shifter.bcnt'length)); -- CPOP + + -- count leading/trailing zeros -- + shifter.cnt <= std_ulogic_vector(to_unsigned(leading_zeros_f(rs1_reg), shifter.cnt'length)) when (cmd_buf(op_clz_c) = '1') else -- CLZ + std_ulogic_vector(to_unsigned(leading_zeros_f(bit_rev_f(rs1_reg)), shifter.cnt'length)); -- CTZ + + shifter.run <= '0'; -- we are done already! 
+ + end generate; -- /parallel_shifter -- Shifted-Add Core ----------------------------------------------------------------------- @@ -490,7 +480,7 @@ begin res_int(op_cpop_c)(shifter.bcnt'left downto 0) <= shifter.bcnt; -- min/max select -- - res_int(op_min_c) <= rs1_reg when ((less_ff xor cmd_buf(op_max_c)) = '1') else rs2_reg; + res_int(op_min_c) <= rs1_reg when ((less_reg xor cmd_buf(op_max_c)) = '1') else rs2_reg; res_int(op_max_c) <= (others => '0'); -- unused/redundant -- sign-extension -- From a47596d6e532d1bb4d95123e15d073b14b11eff6 Mon Sep 17 00:00:00 2001 From: stnolting <22944758+stnolting@users.noreply.github.com> Date: Tue, 10 May 2022 16:37:37 +0200 Subject: [PATCH 3/4] update version (1.7.1.5) --- rtl/core/neorv32_package.vhd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rtl/core/neorv32_package.vhd b/rtl/core/neorv32_package.vhd index d1a858dbe..f9e2de93b 100644 --- a/rtl/core/neorv32_package.vhd +++ b/rtl/core/neorv32_package.vhd @@ -68,7 +68,7 @@ package neorv32_package is -- Architecture Constants (do not modify!) ------------------------------------------------ -- ------------------------------------------------------------------------------------------- constant data_width_c : natural := 32; -- native data path width - do not change! - constant hw_version_c : std_ulogic_vector(31 downto 0) := x"01070104"; -- NEORV32 version - no touchy! + constant hw_version_c : std_ulogic_vector(31 downto 0) := x"01070105"; -- NEORV32 version - no touchy! constant archid_c : natural := 19; -- official NEORV32 architecture ID - hands off! 
-- Check if we're inside the Matrix ------------------------------------------------------- From b49e1522095f35baf7c5c3f296e93558c6bdbb29 Mon Sep 17 00:00:00 2001 From: stnolting <22944758+stnolting@users.noreply.github.com> Date: Tue, 10 May 2022 16:38:11 +0200 Subject: [PATCH 4/4] add v1.7.1.5 --- CHANGELOG.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c8665bdc2..6d027b62c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,11 +6,11 @@ The latest release is [![release](https://img.shields.io/github/v/release/stnolt A list of all releases can be found [here](https://github.com/stnolting/neorv32/releases). The most recent version of the *NEORV32 data sheet* can be found [online at GitHub-pages](https://stnolting.github.io/neorv32). -:information_source: Starting with version `1.5.7` this project uses [semantic versioning](https://semver.org). +Starting with version `1.5.7` this project uses [semantic versioning](https://semver.org). The _hardware version identifier_ uses an additional custom element (`MAJOR.MINOR.PATCH.custom`) to track _individual_ changes. The identifier number is incremented with every core RTL modification and also by major framework modifications. -:information_source: The version number is globally defined by the `hw_version_c` constant in the main VHDL +The version number is globally defined by the `hw_version_c` constant in the main VHDL [package file](https://github.com/stnolting/neorv32/blob/master/rtl/core/neorv32_package.vhd). The processor can determine its version by reading the `mimpid` CSR (at CSR address 0xf13). A 8x4-bit BCD representation is used. Leading zeros are optional. 
Example: @@ -32,6 +32,7 @@ mimpid = 0x01040312 => 01.04.03.12 => Version 01.04.03.12 => v1.4.3.12 | Date (*dd.mm.yyyy*) | Version | Comment | |:----------:|:-------:|:--------| +| 10.05.2022 | 1.7.1.5 | code clean-up and minor optimization of `B` extension (bit-manipulation) CPU co-processor; [#312](https://github.com/stnolting/neorv32/pull/312) | | 06.05.2022 | 1.7.1.4 | :sparkles: upgrade TRNG module to new [neoTRNG v2](https://github.com/stnolting/neoTRNG); [#311](https://github.com/stnolting/neorv32/pull/311) | | 05.05.2022 | 1.7.1.3 | :bug: bug fix in CPU counter overflow logic (`cycle` and `instret` counters); minor optimization of CPU execution unit; [#310](https://github.com/stnolting/neorv32/pull/310) | | 28.04.2022 | 1.7.1.2 | add flag to `mxisa` CSR to check if _this_ is a simulation (bit 20: _CSR_MXISA_IS_SIM_); add flag to `mxisa` CSR to check if all CPU core register have a dedicated reset (bit 21: _CSR_MXISA_HW_RESET_); [#309](https://github.com/stnolting/neorv32/pull/309) |