diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 0e9f20ea6e6cd..803e1ac68354e 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -394,6 +394,29 @@ jobs: make -C hw/ip/otbn/util asm-check displayName: Assemble & link code snippets +- job: otbn_crypto_tests + displayName: Run OTBN crypto tests + dependsOn: lint + condition: and(succeeded(), eq(dependencies.lint.outputs['DetermineBuildType.onlyCdcChanges'], '0')) + pool: + vmImage: ubuntu-20.04 + timeoutInMinutes: 60 + steps: + - template: ci/checkout-template.yml + - template: ci/install-package-dependencies.yml + - task: DownloadSecureFile@1 + condition: eq(variables['Build.SourceBranchName'], 'master') + name: bazelCacheGcpKey + inputs: + secureFile: "bazel_cache_gcp_key.json" + - bash: echo "##vso[task.setvariable variable=bazelCacheGcpKeyPath]$(bazelCacheGcpKey.secureFilePath)" + condition: eq(variables['Build.SourceBranchName'], 'master') + displayName: GCP key path + # Set the remote cache GCP key path + - bash: | + ci/bazelisk.sh test --test_tag_filters=-nightly //sw/otbn/crypto/... + displayName: Execute tests + - job: chip_darjeeling_cw310 displayName: CW310's Darjeeling Bitstream # Build CW310 variant of the Darjeeling toplevel design using Vivado diff --git a/ci/azure-pipelines-nightly.yml b/ci/azure-pipelines-nightly.yml index aafcde27f73e9..14224d13a428c 100644 --- a/ci/azure-pipelines-nightly.yml +++ b/ci/azure-pipelines-nightly.yml @@ -70,6 +70,29 @@ jobs: //sw/device/silicon_creator/rom/e2e/... 
displayName: "Run all ROM E2E tests" +- job: slow_otbn_crypto_tests + displayName: Run slow OTBN crypto tests + dependsOn: lint + condition: and(succeeded(), eq(dependencies.lint.outputs['DetermineBuildType.onlyCdcChanges'], '0')) + pool: + vmImage: ubuntu-20.04 + timeoutInMinutes: 120 + steps: + - template: ci/checkout-template.yml + - template: ci/install-package-dependencies.yml + - task: DownloadSecureFile@1 + condition: eq(variables['Build.SourceBranchName'], 'master') + name: bazelCacheGcpKey + inputs: + secureFile: "bazel_cache_gcp_key.json" + - bash: echo "##vso[task.setvariable variable=bazelCacheGcpKeyPath]$(bazelCacheGcpKey.secureFilePath)" + condition: eq(variables['Build.SourceBranchName'], 'master') + displayName: GCP key path + # Set the remote cache GCP key path + - bash: | + ci/bazelisk.sh test --test_tag_filters=nightly //sw/otbn/crypto/... + displayName: Execute tests + - job: bob_spi_i2c displayName: "BoB: SPI and I2C Tests" timeoutInMinutes: 30 diff --git a/hw/ip/otbn/dv/smoke/smoke_test.s b/hw/ip/otbn/dv/smoke/smoke_test.s index 0fd354ecd5cad..2e80a021d2fcb 100644 --- a/hw/ip/otbn/dv/smoke/smoke_test.s +++ b/hw/ip/otbn/dv/smoke/smoke_test.s @@ -98,24 +98,24 @@ test_label_3: # use mod WSR to load bignum registers with base li psuedo-instruction # mod = 0x78fccc06_2228e9d6_89c9b54f_887cf14e_c79af825_69be586e_9866bb3b_53769ada li x23, 0x78fccc06 -csrrw x0, 0x7d7, x23 +csrrw x0, mod7, x23 li x23, 0x2228e9d6 -csrrw x0, 0x7d6, x23 +csrrw x0, mod6, x23 li x23, 0x89c9b54f -csrrw x0, 0x7d5, x23 +csrrw x0, mod5, x23 li x23, 0x887cf14e -csrrw x0, 0x7d4, x23 +csrrw x0, mod4, x23 li x23, 0xc79af825 -csrrw x0, 0x7d3, x23 +csrrw x0, mod3, x23 li x23, 0x69be586e -csrrw x0, 0x7d2, x23 +csrrw x0, mod2, x23 li x23, 0x9866bb3b -csrrw x0, 0x7d1, x23 +csrrw x0, mod1, x23 li x23, 0x53769ada -csrrw x0, 0x7d0, x23 +csrrw x0, mod0, x23 # x22 = 0x89c9b54f -csrrs x23, 0x7d5, x0 +csrrs x23, mod5, x0 # Note that some instructions used the fixed inputs (from w1 and w2) 
others use # results from previous instructions. When debugging an failure it is recommened @@ -126,7 +126,7 @@ csrrs x23, 0x7d5, x0 bn.wsrr w1, 0x0 /* MOD */ # Request an RND value with a write to CSR RND_PREFETCH -csrrw x0, 0x7d8, x0 +csrrw x0, rnd_prefetch, x0 # sim environment provides a fixed value for RND (in other environment RND isn't # fixed so this test will have a different final state) @@ -185,7 +185,7 @@ bn.addc w15, w10, w11, FG0 bn.subb w17, w3, w4, FG1 # x24 = {fg1, fg0} = 0x52 -csrrs x24, 0x7c8, x0 +csrrs x24, flags, x0 # w18 = w1 + (w2 << 136) = 0x23a7769f_bbc28381_34745fe9_22168a4e_c79af825_69be586e_9866bb3b_53769ada bn.add w18, w1, w2 << 136 diff --git a/hw/top_darjeeling/rtl/autogen/chip_darjeeling_cw310.sv b/hw/top_darjeeling/rtl/autogen/chip_darjeeling_cw310.sv index 9b28f0721dd11..fbf2308ac094b 100644 --- a/hw/top_darjeeling/rtl/autogen/chip_darjeeling_cw310.sv +++ b/hw/top_darjeeling/rtl/autogen/chip_darjeeling_cw310.sv @@ -1499,30 +1499,31 @@ module chip_darjeeling_cw310 #( // Capture trigger. // We use the clkmgr_aon_idle signal of the IP of interest to form a precise capture trigger. - // GPIO[11:9] is used for selecting the IP of interest. The encoding is as follows (see + // GPIO[11:10] is used for selecting the IP of interest. The encoding is as follows (see // hint_names_e enum in clkmgr_pkg.sv for details). 
// - // IP - GPIO[11:9] - Index for clkmgr_aon_idle - // ------------------------------------------------------------ - // AES - 000 - 0 - // HMAC - 001 - 1 - not implemented on CW305 - // KMAC - 010 - 2 - not implemented on CW305 - // OTBN (IO_DIV4) - 011 - 3 - not implemented on CW305 - // OTBN - 100 - 4 - not implemented on CW305 + // IP - GPIO[11:10] - Index for clkmgr_aon_idle + // ------------------------------------------------------------- + // AES - 00 - 0 + // HMAC - 01 - 1 - not implemented on CW305 + // KMAC - 10 - 2 - not implemented on CW305 + // OTBN - 11 - 3 - not implemented on CW305 // - // In addition, GPIO8 is used for gating the capture trigger in software. - // Note that GPIO[11:8] are connected to LED[3:0] on the CW310. - // On the CW305, GPIO[9,8] are connected to LED[5,7]. + // GPIO9 is used for gating the selected capture trigger in software. Alternatively, GPIO8 + // can be used to implement a less precise but fully software-controlled capture trigger + // similar to what can be done on ASIC. + // + // Note that on the CW305, GPIO[9,8] are connected to LED[5(Green),7(Red)]. 
prim_mubi_pkg::mubi4_t clk_trans_idle, manual_in_io_clk_idle; clkmgr_pkg::hint_names_e trigger_sel; always_comb begin : trigger_sel_mux - unique case ({dio_out[DioGpioGpio11], dio_out[DioGpioGpio10], dio_out[DioGpioGpio9]}) - 3'b000: trigger_sel = clkmgr_pkg::HintMainAes; - 3'b001: trigger_sel = clkmgr_pkg::HintMainHmac; - 3'b010: trigger_sel = clkmgr_pkg::HintMainKmac; - 3'b100: trigger_sel = clkmgr_pkg::HintMainOtbn; + unique case ({dio_out[DioGpioGpio11], dio_out[DioGpioGpio10]}) + 2'b00: trigger_sel = clkmgr_pkg::HintMainAes; + 2'b01: trigger_sel = clkmgr_pkg::HintMainHmac; + 2'b10: trigger_sel = clkmgr_pkg::HintMainKmac; + 2'b11: trigger_sel = clkmgr_pkg::HintMainOtbn; default: trigger_sel = clkmgr_pkg::HintMainAes; endcase; end @@ -1530,23 +1531,39 @@ module chip_darjeeling_cw310 #( logic clk_io_div4_trigger_en, manual_in_io_clk_trigger_en; logic clk_io_div4_trigger_oe, manual_in_io_clk_trigger_oe; - assign clk_io_div4_trigger_en = dio_out[DioGpioGpio8]; - assign clk_io_div4_trigger_oe = dio_oe[DioGpioGpio8]; + logic clk_io_div4_trigger_hw_en, manual_in_io_clk_trigger_hw_en; + logic clk_io_div4_trigger_hw_oe, manual_in_io_clk_trigger_hw_oe; + logic clk_io_div4_trigger_sw_en, manual_in_io_clk_trigger_sw_en; + logic clk_io_div4_trigger_sw_oe, manual_in_io_clk_trigger_sw_oe; + assign clk_io_div4_trigger_hw_en = dio_out[DioGpioGpio9]; + assign clk_io_div4_trigger_hw_oe = dio_oe[DioGpioGpio9]; + assign clk_io_div4_trigger_sw_en = dio_out[DioGpioGpio8]; + assign clk_io_div4_trigger_sw_oe = dio_oe[DioGpioGpio8]; // Synchronize signals to manual_in_io_clk. 
prim_flop_2sync #( - .Width ($bits(clk_trans_idle) + 2) + .Width ($bits(clk_trans_idle) + 4) ) u_sync_trigger ( .clk_i (manual_in_io_clk), .rst_ni(manual_in_por_n), - .d_i ({clk_trans_idle, clk_io_div4_trigger_en, clk_io_div4_trigger_oe}), - .q_o ({manual_in_io_clk_idle, manual_in_io_clk_trigger_en, manual_in_io_clk_trigger_oe}) + .d_i ({clk_trans_idle, + clk_io_div4_trigger_hw_en, + clk_io_div4_trigger_hw_oe, + clk_io_div4_trigger_sw_en, + clk_io_div4_trigger_sw_oe}), + .q_o ({manual_in_io_clk_idle, + manual_in_io_clk_trigger_hw_en, + manual_in_io_clk_trigger_hw_oe, + manual_in_io_clk_trigger_sw_en, + manual_in_io_clk_trigger_sw_oe}) ); - // Generate the actual trigger signal. + // Generate the actual trigger signal as trigger_sw OR trigger_hw. assign manual_attr_io_trigger = '0; - assign manual_oe_io_trigger = manual_in_io_clk_trigger_oe; - assign manual_out_io_trigger = manual_in_io_clk_trigger_en & - prim_mubi_pkg::mubi4_test_false_strict(manual_in_io_clk_idle); + assign manual_oe_io_trigger = + manual_in_io_clk_trigger_sw_oe | manual_in_io_clk_trigger_hw_oe; + assign manual_out_io_trigger = + manual_in_io_clk_trigger_sw_en | (manual_in_io_clk_trigger_hw_en & + prim_mubi_pkg::mubi4_test_false_strict(manual_in_io_clk_idle)); endmodule : chip_darjeeling_cw310 diff --git a/hw/top_earlgrey/rtl/autogen/chip_earlgrey_cw310.sv b/hw/top_earlgrey/rtl/autogen/chip_earlgrey_cw310.sv index edfad81678840..84a463c23cf8b 100644 --- a/hw/top_earlgrey/rtl/autogen/chip_earlgrey_cw310.sv +++ b/hw/top_earlgrey/rtl/autogen/chip_earlgrey_cw310.sv @@ -1109,30 +1109,31 @@ module chip_earlgrey_cw310 #( // Capture trigger. // We use the clkmgr_aon_idle signal of the IP of interest to form a precise capture trigger. - // GPIO[11:9] is used for selecting the IP of interest. The encoding is as follows (see + // GPIO[11:10] is used for selecting the IP of interest. The encoding is as follows (see // hint_names_e enum in clkmgr_pkg.sv for details). 
// - // IP - GPIO[11:9] - Index for clkmgr_aon_idle - // ------------------------------------------------------------ - // AES - 000 - 0 - // HMAC - 001 - 1 - not implemented on CW305 - // KMAC - 010 - 2 - not implemented on CW305 - // OTBN (IO_DIV4) - 011 - 3 - not implemented on CW305 - // OTBN - 100 - 4 - not implemented on CW305 + // IP - GPIO[11:10] - Index for clkmgr_aon_idle + // ------------------------------------------------------------- + // AES - 00 - 0 + // HMAC - 01 - 1 - not implemented on CW305 + // KMAC - 10 - 2 - not implemented on CW305 + // OTBN - 11 - 3 - not implemented on CW305 // - // In addition, GPIO8 is used for gating the capture trigger in software. - // Note that GPIO[11:8] are connected to LED[3:0] on the CW310. - // On the CW305, GPIO[9,8] are connected to LED[5,7]. + // GPIO9 is used for gating the selected capture trigger in software. Alternatively, GPIO8 + // can be used to implement a less precise but fully software-controlled capture trigger + // similar to what can be done on ASIC. + // + // Note that on the CW305, GPIO[9,8] are connected to LED[5(Green),7(Red)]. 
prim_mubi_pkg::mubi4_t clk_trans_idle, manual_in_io_clk_idle; clkmgr_pkg::hint_names_e trigger_sel; always_comb begin : trigger_sel_mux - unique case ({mio_out[MioOutGpioGpio11], mio_out[MioOutGpioGpio10], mio_out[MioOutGpioGpio9]}) - 3'b000: trigger_sel = clkmgr_pkg::HintMainAes; - 3'b001: trigger_sel = clkmgr_pkg::HintMainHmac; - 3'b010: trigger_sel = clkmgr_pkg::HintMainKmac; - 3'b100: trigger_sel = clkmgr_pkg::HintMainOtbn; + unique case ({mio_out[MioOutGpioGpio11], mio_out[MioOutGpioGpio10]}) + 2'b00: trigger_sel = clkmgr_pkg::HintMainAes; + 2'b01: trigger_sel = clkmgr_pkg::HintMainHmac; + 2'b10: trigger_sel = clkmgr_pkg::HintMainKmac; + 2'b11: trigger_sel = clkmgr_pkg::HintMainOtbn; default: trigger_sel = clkmgr_pkg::HintMainAes; endcase; end @@ -1140,23 +1141,39 @@ module chip_earlgrey_cw310 #( logic clk_io_div4_trigger_en, manual_in_io_clk_trigger_en; logic clk_io_div4_trigger_oe, manual_in_io_clk_trigger_oe; - assign clk_io_div4_trigger_en = mio_out[MioOutGpioGpio8]; - assign clk_io_div4_trigger_oe = mio_oe[MioOutGpioGpio8]; + logic clk_io_div4_trigger_hw_en, manual_in_io_clk_trigger_hw_en; + logic clk_io_div4_trigger_hw_oe, manual_in_io_clk_trigger_hw_oe; + logic clk_io_div4_trigger_sw_en, manual_in_io_clk_trigger_sw_en; + logic clk_io_div4_trigger_sw_oe, manual_in_io_clk_trigger_sw_oe; + assign clk_io_div4_trigger_hw_en = mio_out[MioOutGpioGpio9]; + assign clk_io_div4_trigger_hw_oe = mio_oe[MioOutGpioGpio9]; + assign clk_io_div4_trigger_sw_en = mio_out[MioOutGpioGpio8]; + assign clk_io_div4_trigger_sw_oe = mio_oe[MioOutGpioGpio8]; // Synchronize signals to manual_in_io_clk. 
prim_flop_2sync #( - .Width ($bits(clk_trans_idle) + 2) + .Width ($bits(clk_trans_idle) + 4) ) u_sync_trigger ( .clk_i (manual_in_io_clk), .rst_ni(manual_in_por_n), - .d_i ({clk_trans_idle, clk_io_div4_trigger_en, clk_io_div4_trigger_oe}), - .q_o ({manual_in_io_clk_idle, manual_in_io_clk_trigger_en, manual_in_io_clk_trigger_oe}) + .d_i ({clk_trans_idle, + clk_io_div4_trigger_hw_en, + clk_io_div4_trigger_hw_oe, + clk_io_div4_trigger_sw_en, + clk_io_div4_trigger_sw_oe}), + .q_o ({manual_in_io_clk_idle, + manual_in_io_clk_trigger_hw_en, + manual_in_io_clk_trigger_hw_oe, + manual_in_io_clk_trigger_sw_en, + manual_in_io_clk_trigger_sw_oe}) ); - // Generate the actual trigger signal. + // Generate the actual trigger signal as trigger_sw OR trigger_hw. assign manual_attr_io_trigger = '0; - assign manual_oe_io_trigger = manual_in_io_clk_trigger_oe; - assign manual_out_io_trigger = manual_in_io_clk_trigger_en & - prim_mubi_pkg::mubi4_test_false_strict(manual_in_io_clk_idle); + assign manual_oe_io_trigger = + manual_in_io_clk_trigger_sw_oe | manual_in_io_clk_trigger_hw_oe; + assign manual_out_io_trigger = + manual_in_io_clk_trigger_sw_en | (manual_in_io_clk_trigger_hw_en & + prim_mubi_pkg::mubi4_test_false_strict(manual_in_io_clk_idle)); endmodule : chip_earlgrey_cw310 diff --git a/sw/device/sca/BUILD b/sw/device/sca/BUILD index 2b3d7bcb45a63..3104aa0c539cc 100644 --- a/sw/device/sca/BUILD +++ b/sw/device/sca/BUILD @@ -23,6 +23,7 @@ opentitan_flash_binary( "//sw/device/lib/testing/test_framework:check", "//sw/device/lib/testing/test_framework:ottf_ld_silicon_creator_slot_a", "//sw/device/lib/testing/test_framework:ottf_main", + "//sw/device/sca/lib:aes", "//sw/device/sca/lib:prng", "//sw/device/sca/lib:sca", "//sw/device/sca/lib:simple_serial", diff --git a/sw/device/sca/aes_serial.c b/sw/device/sca/aes_serial.c index e4fc551a99a5e..1bfd13ed8dadb 100644 --- a/sw/device/sca/aes_serial.c +++ b/sw/device/sca/aes_serial.c @@ -5,6 +5,7 @@ #include 
"sw/device/lib/testing/test_framework/check.h" #include "sw/device/lib/testing/test_framework/ottf_main.h" #include "sw/device/lib/testing/test_framework/ottf_test_config.h" +#include "sw/device/sca/lib/aes.h" #include "sw/device/sca/lib/prng.h" #include "sw/device/sca/lib/sca.h" #include "sw/device/sca/lib/simple_serial.h" @@ -26,12 +27,19 @@ * - Version ('v')+, * - Seed PRNG ('s')+, * - Batch encrypt ('b')*, - * - FvsR batch fixed key set ('t')*, + * - FvsR batch fixed key set ('f')*, * - FvsR batch generate ('g')*, - * - FvsR batch encrypt and generate ('f')*, + * - FvsR batch encrypt and generate ('e')*, + * - Batch encrypt alternative routine ('a')*, + * - Batch encrypt alternative routine, initial plaintext input ('i')*. + * - Set default values for AES-based data generation ('d')*, * Commands marked with * are implemented in this file. Those marked with + are * implemented in the simple serial library. Encryption is done in AES-ECB-128 * mode. See https://wiki.newae.com/SimpleSerial for details on the protocol. + * + * Data for running batch capture is generated according to: + * [DTR] Test Vector Leakage Assessment (TVLA) Derived Test Requirements (DTR) + * with AES */ OTTF_DEFINE_TEST_CONFIG(); @@ -45,17 +53,20 @@ enum { * noise during AES operations. Caution: This number should be chosen to * provide enough time. Otherwise, Ibex might wake up while AES is still busy * and disturb the capture. Currently, we use a start trigger delay of 320 - * clock cycles and the scope captures 60 clock cycles at kClockFreqCpuHz - * (1200 samples). + * clock cycles and the scope captures 60 clock cycles at kClockFreqCpuHz. */ kIbexAesSleepCycles = 680, /** - * Max number of encryption that can be captured with the scope - * 81 is selected for AES with CW Husky - * Note: Maybe it would be better if we use dynamic memory allocation but I - * am not sure whether we are supporting it or not. + * The maximum number of encryptions to do per batch. 
The ChipWhisperer Husky + * scope determines how many encryptions (capture segments) it wants to record + * per batch based on the number of samples per segment. As the plaintexts + * and keys are generated in advance for fixed-vs-random batch captures, we + * need to make sure the corresponding buffers are sufficiently large. Note + * that on both CW305 and CW310, the main SRAM has a size of 128 kBytes. So it + * should be fine to allocate space for 256 segments (2 * 16 Bytes * 256 = 8 + * kBytes). */ - kNumBatchOpsMax = 81, + kNumBatchOpsMax = 256, /** * Max number of encryptions that can be captured before we rewrite the key to * reset the internal block counter. Otherwise, the AES peripheral might @@ -81,9 +92,57 @@ uint8_t batch_plaintexts[kNumBatchOpsMax][kAesTextLength]; bool sample_fixed = true; /** - * Fixed key for fvsr key TVLA batch capture. + * An array to store pre-computed round keys derived from the generation key. + * The generation key (key_gen) is specified in [DTR] Section 5.1. + * This key is used for generating all pseudo-random data for batch captures. 
+ * kKeyGen[kAesKeyLength] = {0x12, 0x34, 0x56, 0x78, + * 0x9a, 0xbc, 0xde, 0xf1, + * 0x23, 0x45, 0x67, 0x89, + * 0xab, 0xcd, 0xe0, 0xf0}; */ -uint8_t key_fixed[kAesKeyLength]; +static const uint32_t kKeyGenRoundKeys[(kAesKeyLength / 4) * 11] = { + 0xab239a12, 0xcd45bc34, 0xe067de56, 0xf089f178, 0xbc1734ae, 0xe12c69d5, + 0x836304da, 0x9262eb1a, 0xcb776054, 0x9d7c5039, 0x71f29195, 0x64f6947f, + 0xd2196e0e, 0x2bb6ca9a, 0xc4b547d6, 0x6602f460, 0x528099f7, 0xd1fa4c86, + 0xd317a2e5, 0x452321d5, 0x92c040d9, 0x8756ace0, 0xed3e298b, 0x92d7f4d5, + 0xfc6eaeee, 0xc84f19b5, 0x3ed3edc4, 0x2bb96e9a, 0x7a86e846, 0x99511e07, + 0x350bd835, 0xd6fd442a, 0x3c46c028, 0x47de8f91, 0x25101bc3, 0x9f49b4f0, + 0x29155393, 0xb8ff21ae, 0x36130318, 0x79e6af1b, 0xa68f9ac9, 0xcd758aab, + 0x88beadae, 0x8ef711be}; + +/** + * Plaintext of the fixed set of fixed-vs-random-key TVLA + */ +static uint8_t plaintext_fixed[kAesTextLength] = { + 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, + 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa}; +/** + * Key of the fixed set of fixed-vs-random-key TVLA + */ +static uint8_t key_fixed[kAesTextLength] = {0x81, 0x1E, 0x37, 0x31, 0xB0, 0x12, + 0x0A, 0x78, 0x42, 0x78, 0x1E, 0x22, + 0xB2, 0x5C, 0xDD, 0xF9}; +/** + * Plaintext of the random set of fixed-vs-random-key TVLA + */ +static uint8_t plaintext_random[kAesTextLength] = { + 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, + 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc}; +/** + * Key of the random set of fixed-vs-random-key TVLA + */ +static uint8_t key_random[kAesTextLength] = {0x53, 0x53, 0x53, 0x53, 0x53, 0x53, + 0x53, 0x53, 0x53, 0x53, 0x53, 0x53, + 0x53, 0x53, 0x53, 0x53}; +/** + * Temp ciphertext variable + */ +static uint8_t ciphertext_temp[kAesTextLength]; + +/** + * batch_plaintext for batch capture to initially set it using command. 
+ */ +static uint8_t batch_plaintext[kAesTextLength]; /** * Block counter variable for manually handling reseeding operations of the @@ -141,7 +200,7 @@ static void aes_manual_trigger(void) { } /** - * Simple serial 't' (key set) command handler. + * Simple serial 'k' (key set) command handler. * * This command is designed to set the fixed_key variable and in addition also * configures the key into the AES peripheral. @@ -238,6 +297,30 @@ static void aes_serial_single_encrypt(const uint8_t *plaintext, aes_send_ciphertext(false); } +/** + * Advances data for fvsr-key TVLA - fixed set + * + * This function updates plaintext_fixed for fvsr-key TVLA, according + * to DTR recommendations. + */ +static void aes_serial_advance_fixed(void) { + aes_sw_encrypt_block(plaintext_fixed, kKeyGenRoundKeys, ciphertext_temp); + memcpy(plaintext_fixed, ciphertext_temp, kAesTextLength); +} + +/** + * Advances data for fvsr-key TVLA - random set + * + * This function updates plaintext_random and key_random for fvsr-key and + * random TVLA, according to DTR recommendations. + */ +static void aes_serial_advance_random(void) { + aes_sw_encrypt_block(plaintext_random, kKeyGenRoundKeys, ciphertext_temp); + memcpy(plaintext_random, ciphertext_temp, kAesTextLength); + aes_sw_encrypt_block(key_random, kKeyGenRoundKeys, ciphertext_temp); + memcpy(key_random, ciphertext_temp, kAesTextLength); +} + /** * Simple serial 'b' (batch encrypt) command handler. 
* @@ -279,9 +362,8 @@ static void aes_serial_batch_encrypt(const uint8_t *data, size_t data_len) { sca_set_trigger_high(); for (uint32_t i = 0; i < num_encryptions; ++i) { - uint8_t plaintext[kAesTextLength]; - prng_rand_bytes(plaintext, kAesTextLength); - aes_encrypt(plaintext, kAesTextLength); + aes_encrypt(plaintext_random, kAesTextLength); + aes_serial_advance_random(); } sca_set_trigger_low(); @@ -289,7 +371,90 @@ static void aes_serial_batch_encrypt(const uint8_t *data, size_t data_len) { } /** - * Simple serial 't' (fvsr key set) command handler. + * Simple serial 'a' (alternative batch encrypt) command handler. + * + * This command is designed to maximize the capture rate for side-channel + * attacks. It uses the first supplied plaintext and repeats AES encryptions + * by using every ciphertext as next plaintext with a constant key. This + * minimizes the overhead of UART communication and significantly improves the + * capture rate. + + * Packet payload must be a `uint32_t` representation of the number of + * encryptions to perform. Since generated plaintexts are not cached, there is + * no limit on the number of encryptions. + * + * The key should also be set using 'k' (key set) command. + * + * The host can verify the operation by checking the last 'r' (ciphertext) + * packet that is sent at the end. + * + * @param data Packet payload. + * @param data_len Packet payload length. + */ +static void aes_serial_batch_alternative_encrypt(const uint8_t *data, + size_t data_len) { + // Get num_encryptions from input + uint32_t num_encryptions = 0; + SS_CHECK(data_len == sizeof(num_encryptions)); + num_encryptions = read_32(data); + + // Add to current block_ctr to check if > kBlockCtrMax + block_ctr += num_encryptions; + // Rewrite the key to reset the internal block counter. Otherwise, the AES + // peripheral might trigger the reseeding of the internal masking PRNG which + // disturbs SCA measurements. 
+ if (block_ctr > kBlockCtrMax) { + aes_key_mask_and_config(key_fixed, kAesKeyLength); + block_ctr = num_encryptions; + } + + // First plaintext has been set through command into batch_plaintext + + // Set trigger high outside of loop + // On FPGA, the trigger is AND-ed with AES !IDLE and creates a LO-HI-LO per + // AES operation + sca_set_trigger_high(); + dif_aes_data_t ciphertext; + for (uint32_t i = 0; i < num_encryptions; ++i) { + // Encrypt + aes_encrypt(batch_plaintext, kAesTextLength); + + // Get ciphertext + bool ready = false; + do { + SS_CHECK_DIF_OK( + dif_aes_get_status(&aes, kDifAesStatusOutputValid, &ready)); + } while (!ready); + SS_CHECK_DIF_OK(dif_aes_read_output(&aes, &ciphertext)); + + // Use ciphertext as next plaintext (incl. next call to this function) + memcpy(batch_plaintext, ciphertext.data, kAesTextLength); + } + sca_set_trigger_low(); + + // send last ciphertext + simple_serial_send_packet('r', (uint8_t *)ciphertext.data, kAesTextLength); +} + +/** + * Simple serial 'i' (batch plaintext) command handler. + * + * This command is designed to set the initial plaintext for + * aes_serial_batch_alternative_encrypt. + * + * The plaintext must be `kAesTextLength` bytes long. + * + * @param plaintext. + * @param len. + */ +static void aes_serial_batch_plaintext_set(const uint8_t *plaintext, + size_t len) { + SS_CHECK(len == kAesTextLength); + memcpy(batch_plaintext, plaintext, len); +} + +/** + * Simple serial 'f' (fvsr key set) command handler. * * This command is designed to set the fixed key which is used for fvsr key TVLA * captures. 
@@ -339,20 +504,19 @@ static void aes_serial_fvsr_key_batch_generate(const uint8_t *data, for (uint32_t i = 0; i < num_encryptions; ++i) { if (sample_fixed) { memcpy(batch_keys[i], key_fixed, kAesKeyLength); + memcpy(batch_plaintexts[i], plaintext_fixed, kAesKeyLength); + aes_serial_advance_fixed(); } else { - prng_rand_bytes(batch_keys[i], kAesKeyLength); + memcpy(batch_keys[i], key_random, kAesKeyLength); + memcpy(batch_plaintexts[i], plaintext_random, kAesKeyLength); + aes_serial_advance_random(); } - // Note: To decrease memory usage, plaintexts may be generated before use in - // every encryption operation instead of generating and storing them for all - // encyrption operation in a batch. Also, a new method should be selected - // to set sample_fixed variable. - prng_rand_bytes(batch_plaintexts[i], kAesTextLength); sample_fixed = batch_plaintexts[i][0] & 0x1; } } /** - * Simple serial 'f' (fixed vs random key batch encrypt and generate) command + * Simple serial 'e' (fixed vs random key batch encrypt and generate) command * handler. * * This command is designed to maximize the capture rate for side-channel @@ -410,12 +574,55 @@ static void aes_serial_fvsr_key_batch_encrypt(const uint8_t *data, * Simple serial 'l' (seed lfsr) command handler. * * This function only supports 4-byte seeds. + * Enables/disables masking depending on seed value, i.e. 0 for disable. * * @param seed A buffer holding the seed. */ static void aes_serial_seed_lfsr(const uint8_t *seed, size_t seed_len) { SS_CHECK(seed_len == sizeof(uint32_t)); - sca_seed_lfsr(read_32(seed)); + uint32_t seed_local = read_32(seed); + if (seed_local == 0) { + // disable masking + transaction.force_masks = true; + } else { + // enable masking + transaction.force_masks = false; + } + sca_seed_lfsr(seed_local); +} + +/** + * Simple serial 'd' (set starting values) command handler. + * + * This function sets starting values for FvsR data generation + * if the received value is 1. 
+ * These values are specified in DTR for AES TVLA + * + * @param data Input command. For now only data == 1 resets values. + */ +static void aes_serial_set_default_values(const uint8_t *data, + size_t data_len) { + SS_CHECK(data_len == sizeof(uint32_t)); + uint32_t command = 0; + command = read_32(data); + static const uint8_t kPlaintextFixedStart[kAesTextLength] = { + 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, + 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa}; + static const uint8_t kKeyFixedStart[kAesTextLength] = { + 0x81, 0x1E, 0x37, 0x31, 0xB0, 0x12, 0x0A, 0x78, + 0x42, 0x78, 0x1E, 0x22, 0xB2, 0x5C, 0xDD, 0xF9}; + static const uint8_t kPlaintextRandomStart[kAesTextLength] = { + 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, + 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc}; + static const uint8_t kKeyRandomStart[kAesTextLength] = { + 0x53, 0x53, 0x53, 0x53, 0x53, 0x53, 0x53, 0x53, + 0x53, 0x53, 0x53, 0x53, 0x53, 0x53, 0x53, 0x53}; + if (command == 1) { + memcpy(plaintext_fixed, kPlaintextFixedStart, kAesTextLength); + memcpy(key_fixed, kKeyFixedStart, kAesKeyLength); + memcpy(plaintext_random, kPlaintextRandomStart, kAesTextLength); + memcpy(key_random, kKeyRandomStart, kAesKeyLength); + } } /** @@ -443,10 +650,13 @@ bool test_main(void) { simple_serial_register_handler('k', aes_serial_key_set); simple_serial_register_handler('p', aes_serial_single_encrypt); simple_serial_register_handler('b', aes_serial_batch_encrypt); - simple_serial_register_handler('t', aes_serial_fvsr_key_set); + simple_serial_register_handler('f', aes_serial_fvsr_key_set); simple_serial_register_handler('g', aes_serial_fvsr_key_batch_generate); - simple_serial_register_handler('f', aes_serial_fvsr_key_batch_encrypt); + simple_serial_register_handler('e', aes_serial_fvsr_key_batch_encrypt); simple_serial_register_handler('l', aes_serial_seed_lfsr); + simple_serial_register_handler('a', aes_serial_batch_alternative_encrypt); + simple_serial_register_handler('i', 
aes_serial_batch_plaintext_set); + simple_serial_register_handler('d', aes_serial_set_default_values); LOG_INFO("Initializing AES unit."); init_aes(); diff --git a/sw/device/sca/ecc384_serial.c b/sw/device/sca/ecc384_serial.c index 74a3a6889634c..7894643fbb0d1 100644 --- a/sw/device/sca/ecc384_serial.c +++ b/sw/device/sca/ecc384_serial.c @@ -8,29 +8,29 @@ #include "sw/device/sca/lib/sca.h" #include "sw/device/sca/lib/simple_serial.h" #include "sw/ip/entropy_src/test/utils/entropy_testutils.h" +#include "sw/lib/sw/device/base/abs_mmio.h" #include "sw/lib/sw/device/base/memory.h" #include "sw/lib/sw/device/base/mmio.h" #include "sw/lib/sw/device/runtime/ibex.h" #include "sw/lib/sw/device/runtime/log.h" #include "hw/top_darjeeling/sw/autogen/top_darjeeling.h" +#include "otbn_regs.h" /** * OpenTitan program for OTBN ECDSA-P384 side-channel analysis. * * This program implements the following simple serial commands: - * - Set ephemeral secret key and sign ('p')*, + * - Set ephemeral secret key ('k')*, * - Set private key ('d')*, * - Set message ('n')*, + * - Start signing ('p')* * - Version ('v')+, * - Seed PRNG ('s')+, + * Commands marked with * are implemented in this file. Those marked with + are + * implemented in the simple serial library. * See https://wiki.newae.com/SimpleSerial for details on the protocol. 
* - The OTBN-related code was developed based on - * https://github.com/lowRISC/opentitan/tree/master/sw/device/lib/crypto/ecc/ecdsa_p256.c - * and - * https://github.com/lowRISC/opentitan/blob/master/sw/device/tests/crypto/ecdsa_p256_functest.c - * */ OTTF_DEFINE_TEST_CONFIG(); @@ -53,17 +53,37 @@ enum { kEcc384NumWords = kEcc384NumBytes / sizeof(uint32_t), }; +/** + * Two shares of the ephemeral secret key k + * k = k0 + k1 + * k0 = ecc384_secret_k[0:11] (0x00000000...ffffffff) + * k1 = ecc384_secret_k[12:23] (0x00000000...00000000) + * + * The default values can be overwritten via + * the simpleserial command `k` (see ecc384_set_secret_key_k) + */ +uint32_t ecc384_secret_k[2 * kEcc384NumWords] = { + 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + /** * Private key d * I took this from here: https://www.rfc-editor.org/rfc/rfc6979#page-33 * The endianness may need to be fixed. * + * Delivered as 2 shares. The second share is set to all-zero by default. + * * The value of this variable can be overwritten via the simpleserial command * `d` (see ecc384_set_private_key_d) */ -uint32_t ecc384_private_key_d[12] = { +uint32_t ecc384_private_key_d[2 * kEcc384NumWords] = { 0xAD3D9D6B, 0x1C8C1B2E, 0x7598B105, 0x4D9F65B6, 0x663B3CE2, 0xBA97F27B, 0x4077A49A, 0xD8377178, 0x4E72D596, 0x25A8704C, 0xEAC972F8, 0xF5EDD260, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, }; /** @@ -71,7 +91,7 @@ uint32_t ecc384_private_key_d[12] = { * The value of this variable can be overwritten via the simpleserial command * `n` (see ecc384_set_msg). 
*/ -uint32_t ecc384_msg[12] = { +uint32_t ecc384_msg[kEcc384NumWords] = { 0x48656c6c, // 'Hell' 0x6f204f54, // 'o OT' 0x424e0000, // 'BN' @@ -85,10 +105,10 @@ OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, dptr_r); OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, dptr_s); OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, dptr_x); OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, dptr_y); -OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, dptr_d); -OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, - dptr_rnd); // x_r not used in p384 verify .s -OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, dptr_k); +OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, dptr_d0); +OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, dptr_d1); +OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, dptr_k0); +OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, dptr_k1); OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, mode); OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, msg); @@ -96,10 +116,10 @@ OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, r); OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, s); OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, x); OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, y); -OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, d); -OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, k); -OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, - rnd); // x_r not used in p384 verify .s file +OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, d0); +OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, d1); +OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, k0); +OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, k1); static const otbn_app_t kOtbnAppP384Ecdsa = OTBN_APP_T_INIT(p384_ecdsa_sca); @@ -113,12 +133,14 @@ static const otbn_addr_t kOtbnVarDptrX = OTBN_ADDR_T_INIT(p384_ecdsa_sca, dptr_x); static const otbn_addr_t kOtbnVarDptrY = OTBN_ADDR_T_INIT(p384_ecdsa_sca, dptr_y); -static const otbn_addr_t kOtbnVarDptrD = - OTBN_ADDR_T_INIT(p384_ecdsa_sca, dptr_d); -static const otbn_addr_t kOtbnVarDptrRnd = - OTBN_ADDR_T_INIT(p384_ecdsa_sca, dptr_rnd); -static const otbn_addr_t kOtbnVarDptrK = - OTBN_ADDR_T_INIT(p384_ecdsa_sca, dptr_k); +static const otbn_addr_t kOtbnVarDptrD0 = + 
OTBN_ADDR_T_INIT(p384_ecdsa_sca, dptr_d0); +static const otbn_addr_t kOtbnVarDptrD1 = + OTBN_ADDR_T_INIT(p384_ecdsa_sca, dptr_d1); +static const otbn_addr_t kOtbnVarDptrK0 = + OTBN_ADDR_T_INIT(p384_ecdsa_sca, dptr_k0); +static const otbn_addr_t kOtbnVarDptrK1 = + OTBN_ADDR_T_INIT(p384_ecdsa_sca, dptr_k1); static const otbn_addr_t kOtbnVarMode = OTBN_ADDR_T_INIT(p384_ecdsa_sca, mode); static const otbn_addr_t kOtbnVarMsg = OTBN_ADDR_T_INIT(p384_ecdsa_sca, msg); @@ -126,9 +148,10 @@ static const otbn_addr_t kOtbnVarR = OTBN_ADDR_T_INIT(p384_ecdsa_sca, r); static const otbn_addr_t kOtbnVarS = OTBN_ADDR_T_INIT(p384_ecdsa_sca, s); static const otbn_addr_t kOtbnVarX = OTBN_ADDR_T_INIT(p384_ecdsa_sca, x); static const otbn_addr_t kOtbnVarY = OTBN_ADDR_T_INIT(p384_ecdsa_sca, y); -static const otbn_addr_t kOtbnVarD = OTBN_ADDR_T_INIT(p384_ecdsa_sca, d); -static const otbn_addr_t kOtbnVarRnd = OTBN_ADDR_T_INIT(p384_ecdsa_sca, rnd); -static const otbn_addr_t kOtbnVarK = OTBN_ADDR_T_INIT(p384_ecdsa_sca, k); +static const otbn_addr_t kOtbnVarD0 = OTBN_ADDR_T_INIT(p384_ecdsa_sca, d0); +static const otbn_addr_t kOtbnVarD1 = OTBN_ADDR_T_INIT(p384_ecdsa_sca, d1); +static const otbn_addr_t kOtbnVarK0 = OTBN_ADDR_T_INIT(p384_ecdsa_sca, k0); +static const otbn_addr_t kOtbnVarK1 = OTBN_ADDR_T_INIT(p384_ecdsa_sca, k1); /** * Makes a single dptr in the P384 library point to where its value is stored. 
@@ -158,22 +181,50 @@ static void setup_data_pointers(void) { setup_data_pointer(kOtbnVarDptrS, kOtbnVarS); setup_data_pointer(kOtbnVarDptrX, kOtbnVarX); setup_data_pointer(kOtbnVarDptrY, kOtbnVarY); - setup_data_pointer(kOtbnVarDptrD, kOtbnVarD); - setup_data_pointer(kOtbnVarDptrRnd, kOtbnVarRnd); - setup_data_pointer(kOtbnVarDptrK, kOtbnVarK); + setup_data_pointer(kOtbnVarDptrD0, kOtbnVarD0); + setup_data_pointer(kOtbnVarDptrD1, kOtbnVarD1); + setup_data_pointer(kOtbnVarDptrK0, kOtbnVarK0); + setup_data_pointer(kOtbnVarDptrK1, kOtbnVarK1); +} + +/** + * Simple serial 'k' (set ephemeral key) command handler. + * + * This function sets both shares of the secret scalar k. + * The first 48 bytes (i.e, kEcc384NumBytes) are used as k0, and + * The last 48 bytes (i.e, kEcc384NumBytes) are used as k1. + * + * Any of the shares can be set to all zeros to simplify the SCA. + * + * As this function sets both shares, + * the data length must be `2*kEcc384NumBytes`. + * + * @param secret_k Key. + * @param secret_k_len Key length. + */ +static void ecc384_set_secret_key_k(const uint8_t *secret_k, + size_t secret_k_len) { + SS_CHECK(secret_k_len == 2 * kEcc384NumBytes); + memcpy(ecc384_secret_k, secret_k, secret_k_len); } /** * Simple serial 'd' (set private key) command handler. * - * This function does not use key shares to simplify side-channel analysis. - * The key must be `kEcc384NumBytes` bytes long. + * This function sets both shares of the private key d. + * The first 48 bytes (i.e, kEcc384NumBytes) are used as d0, and + * The last 48 bytes (i.e, kEcc384NumBytes) are used as d1. + * + * Any of the shares can be set to all zeros to simplify the SCA. + * + * As this function sets both shares, + * the data length must be `2*kEcc384NumBytes`. * * @param key_d Key. * @param key_d_len Key length. 
*/ static void ecc_384_set_private_key_d(const uint8_t *key_d, size_t key_d_len) { - SS_CHECK(key_d_len == kEcc384NumBytes); + SS_CHECK(key_d_len == 2 * kEcc384NumBytes); memcpy(ecc384_private_key_d, key_d, key_d_len); } @@ -231,23 +282,21 @@ static void p384_ecdsa_sign(const uint32_t *msg, const uint32_t *private_key_d, setup_data_pointers(); uint32_t mode = 1; // mode 1 => sign - LOG_INFO("Copy data"); + // LOG_INFO("Copy data"); SS_CHECK_STATUS_OK(otbn_dmem_write(/*num_words=*/1, &mode, kOtbnVarMode)); p384_dmem_write(msg, kOtbnVarMsg); - p384_dmem_write(private_key_d, kOtbnVarD); + p384_dmem_write(private_key_d, kOtbnVarD0); + p384_dmem_write(private_key_d + kEcc384NumWords, kOtbnVarD1); - SS_CHECK_STATUS_OK(otbn_dmem_write(kEcc384NumWords, k, kOtbnVarK)); + SS_CHECK_STATUS_OK(otbn_dmem_write(kEcc384NumWords, k, kOtbnVarK0)); + SS_CHECK_STATUS_OK( + otbn_dmem_write(kEcc384NumWords, k + kEcc384NumWords, kOtbnVarK1)); - LOG_INFO("Execute"); SS_CHECK_STATUS_OK(otbn_execute()); - LOG_INFO("Wait for done"); SS_CHECK_STATUS_OK(otbn_busy_wait_for_done()); - LOG_INFO("Get results"); SS_CHECK_STATUS_OK(otbn_dmem_read(kEcc384NumWords, kOtbnVarR, signature_r)); SS_CHECK_STATUS_OK(otbn_dmem_read(kEcc384NumWords, kOtbnVarS, signature_s)); - LOG_INFO("r[0]: 0x%02x", signature_r[0]); - LOG_INFO("s[0]: 0x%02x", signature_s[0]); } /** @@ -263,18 +312,13 @@ static void p384_ecdsa_sign(const uint32_t *msg, const uint32_t *private_key_d, * UART. * @param secret_k_len Length of the ephemeral key. */ -static void ecc_384_ecdsa(const uint8_t *ecc384_secret_k_bytes, - size_t secret_k_len) { - if (secret_k_len != kEcc384NumBytes) { - LOG_INFO("Invalid data length %hu", (uint8_t)secret_k_len); - return; - } - // Copy k to an aligned buffer. 
- uint32_t ecc384_secret_k[kEcc384NumWords]; - memcpy(ecc384_secret_k, ecc384_secret_k_bytes, kEcc384NumBytes); - +static void ecc384_ecdsa(const uint8_t *ecc384_secret_k_bytes, + size_t secret_k_len) { LOG_INFO("SSECDSA starting..."); SS_CHECK_STATUS_OK(otbn_load_app(kOtbnAppP384Ecdsa)); + LOG_INFO( + "otbn_status: 0x%08x", + abs_mmio_read32(TOP_DARJEELING_OTBN_BASE_ADDR + OTBN_STATUS_REG_OFFSET)); uint32_t ecc384_signature_r[kEcc384NumWords]; uint32_t ecc384_signature_s[kEcc384NumWords]; @@ -301,7 +345,7 @@ static void ecc_384_ecdsa(const uint8_t *ecc384_secret_k_bytes, simple_serial_send_packet('r', ecc384_signature_r_bytes, kEcc384NumBytes); simple_serial_send_packet('r', ecc384_signature_s_bytes, kEcc384NumBytes); - LOG_INFO("Clearing OTBN memory"); + // Clear OTBN memory SS_CHECK_STATUS_OK(otbn_dmem_sec_wipe()); SS_CHECK_STATUS_OK(otbn_imem_sec_wipe()); } @@ -321,11 +365,13 @@ static void simple_serial_main(void) { LOG_INFO("Initializing simple serial interface to capture board."); simple_serial_init(sca_get_uart()); - SS_CHECK(simple_serial_register_handler('p', ecc_384_ecdsa) != + SS_CHECK(simple_serial_register_handler('p', ecc384_ecdsa) == kSimpleSerialOk); - SS_CHECK(simple_serial_register_handler('d', ecc_384_set_private_key_d) != + SS_CHECK(simple_serial_register_handler('k', ecc384_set_secret_key_k) == kSimpleSerialOk); - SS_CHECK(simple_serial_register_handler('n', ecc384_set_msg) != + SS_CHECK(simple_serial_register_handler('d', ecc_384_set_private_key_d) == + kSimpleSerialOk); + SS_CHECK(simple_serial_register_handler('n', ecc384_set_msg) == kSimpleSerialOk); LOG_INFO("Starting simple serial packet handling."); @@ -335,6 +381,9 @@ static void simple_serial_main(void) { } bool test_main(void) { + (void)kOtbnVarX; + (void)kOtbnVarY; + simple_serial_main(); return true; } diff --git a/sw/device/sca/kmac_serial.c b/sw/device/sca/kmac_serial.c index 70419708d5505..0516b66e04aaa 100644 --- a/sw/device/sca/kmac_serial.c +++ b/sw/device/sca/kmac_serial.c 
@@ -429,10 +429,13 @@ static void kmac_init(void) { dif_kmac_config_t config = (dif_kmac_config_t){ .entropy_mode = kDifKmacEntropyModeSoftware, + .entropy_fast_process = kDifToggleDisabled, .entropy_seed = {0xaa25b4bf, 0x48ce8fff, 0x5a78282a, 0x48465647, 0x70410fef}, - .entropy_fast_process = false, - .msg_mask = true, + .message_big_endian = kDifToggleDisabled, + .output_big_endian = kDifToggleDisabled, + .sideload = kDifToggleDisabled, + .msg_mask = kDifToggleEnabled, }; SS_CHECK_DIF_OK(dif_kmac_configure(&kmac, config)); @@ -587,7 +590,7 @@ bool test_main(void) { simple_serial_register_handler('k', sha3_serial_set_key); simple_serial_register_handler('p', sha3_serial_single_absorb); simple_serial_register_handler('b', sha3_serial_batch); - simple_serial_register_handler('t', sha3_serial_fixed_key_set); + simple_serial_register_handler('f', sha3_serial_fixed_key_set); simple_serial_register_handler('l', sha3_serial_seed_lfsr); LOG_INFO("Initializing the KMAC peripheral."); diff --git a/sw/device/sca/lib/BUILD b/sw/device/sca/lib/BUILD index 6ee5934cc2136..c9ca5bc81f447 100644 --- a/sw/device/sca/lib/BUILD +++ b/sw/device/sca/lib/BUILD @@ -4,6 +4,15 @@ package(default_visibility = ["//visibility:public"]) +cc_library( + name = "aes", + srcs = ["aes.c"], + hdrs = ["aes.h"], + deps = [ + "//sw/lib/sw/device/base:memory", + ], +) + cc_library( name = "prng", srcs = ["prng.c"], @@ -25,6 +34,7 @@ cc_library( hdrs = ["simple_serial.h"], deps = [ ":prng", + ":sca", "//hw/top_darjeeling/sw/autogen:top_darjeeling", "//sw/ip/base/dif:base", "//sw/lib/sw/device/arch:device", diff --git a/sw/device/sca/lib/aes.c b/sw/device/sca/lib/aes.c new file mode 100644 index 0000000000000..03863b41f6dbe --- /dev/null +++ b/sw/device/sca/lib/aes.c @@ -0,0 +1,212 @@ +// Copyright lowRISC contributors. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 +/** + * NOTE: The only intended use of this code is to serve as a PRNG for generating + * input data for SCA experiments and penetration testing. + * The library is not hardened against any type of attacks, and it should not be + * used for any purpose other than stated. + * + * During the SCA experiments, encryptions are verified on the host side by + * running the same encryption using PyCryptodome package and comparing the + * result. + * + * Implementation of round-functions is based on a transposed-state technique + * for 32-bit architecture presented in: + * + * [1] Bertoni et. al., Efficient Software Implementation of AES on 32-Bit + * Platforms, CHES 2002. + * + * https://link.springer.com/content/pdf/10.1007/3-540-36400-5_13.pdf + * + */ +#include "aes.h" + +#include +#include +#include + +#include "sw/lib/sw/device/base/memory.h" + +enum { + kAesNumRounds = 10, + kAesNumKeyBytes = 16, + kAesNumTextBytes = 16, + kAesNumStateBytes = 16, + kAesNumStateWords = 4 +}; + +static void aes_add_round_key(uint32_t *state, const uint32_t *round_key) { + state[0] ^= round_key[0]; + state[1] ^= round_key[1]; + state[2] ^= round_key[2]; + state[3] ^= round_key[3]; +} + +static void aes_sub_bytes(uint32_t *state) { + // SubBytes on a transposed state + // Section 3.1 of [1] + for (size_t i = 0; i < 4; ++i) { + state[i] = (uint32_t)kSbox[state[i] & 0xff] | + ((uint32_t)kSbox[(state[i] >> 8) & 0xff] << 8) | + ((uint32_t)kSbox[(state[i] >> 16) & 0xff] << 16) | + ((uint32_t)kSbox[(state[i] >> 24) & 0xff] << 24); + } +} + +static uint32_t aes_mul2(uint32_t s) { + // Multiplication by 2 in Rijndael field. + // Each byte of the 32b input word is multiplied. 
+ uint32_t t; + t = (uint32_t)kMul2[s & 0xff] | ((uint32_t)kMul2[(s >> 8) & 0xff] << 8) | + ((uint32_t)kMul2[(s >> 16) & 0xff] << 16) | + ((uint32_t)kMul2[(s >> 24) & 0xff] << 24); + return t; +} + +static void aes_shift_rows(uint32_t *state) { + // ShiftRows on a transposed state + // Section 3.1 of [1] + state[1] = (state[1] >> 8) | (state[1] << 24); + state[2] = (state[2] >> 16) | (state[2] << 16); + state[3] = (state[3] >> 24) | (state[3] << 8); +} + +static void aes_mix_columns(uint32_t *state) { + // MixColumns on a transposed state + // Section 3.1 of [1] + uint32_t temp[kAesNumStateWords]; + + memcpy(temp, state, kAesNumStateBytes); + + state[0] = temp[1] ^ temp[2] ^ temp[3]; + state[1] = temp[0] ^ temp[2] ^ temp[3]; + state[2] = temp[0] ^ temp[1] ^ temp[3]; + state[3] = temp[0] ^ temp[1] ^ temp[2]; + + temp[0] = aes_mul2(temp[0]); + temp[1] = aes_mul2(temp[1]); + temp[2] = aes_mul2(temp[2]); + temp[3] = aes_mul2(temp[3]); + + state[0] ^= temp[0] ^ temp[1]; + state[1] ^= temp[1] ^ temp[2]; + state[2] ^= temp[2] ^ temp[3]; + state[3] ^= temp[3] ^ temp[0]; +} + +static void aes_transpose_to_32(uint8_t *in_data, uint32_t *out_data) { + out_data[0] = (uint32_t)in_data[0] | ((uint32_t)in_data[4] << 8) | + ((uint32_t)in_data[8] << 16) | ((uint32_t)in_data[12] << 24); + out_data[1] = (uint32_t)in_data[1] | ((uint32_t)in_data[5] << 8) | + ((uint32_t)in_data[9] << 16) | ((uint32_t)in_data[13] << 24); + out_data[2] = (uint32_t)in_data[2] | ((uint32_t)in_data[6] << 8) | + ((uint32_t)in_data[10] << 16) | ((uint32_t)in_data[14] << 24); + out_data[3] = (uint32_t)in_data[3] | ((uint32_t)in_data[7] << 8) | + ((uint32_t)in_data[11] << 16) | ((uint32_t)in_data[15] << 24); +} + +static void aes_transpose_from_32(uint32_t *in_data, uint8_t *out_data) { + out_data[0] = (uint8_t)(in_data[0] & 0xff); + out_data[1] = (uint8_t)(in_data[1] & 0xff); + out_data[2] = (uint8_t)(in_data[2] & 0xff); + out_data[3] = (uint8_t)(in_data[3] & 0xff); + out_data[4] = (uint8_t)(in_data[0] >> 8) & 
0xff; + out_data[5] = (uint8_t)(in_data[1] >> 8) & 0xff; + out_data[6] = (uint8_t)(in_data[2] >> 8) & 0xff; + out_data[7] = (uint8_t)(in_data[3] >> 8) & 0xff; + out_data[8] = (uint8_t)(in_data[0] >> 16) & 0xff; + out_data[9] = (uint8_t)(in_data[1] >> 16) & 0xff; + out_data[10] = (uint8_t)(in_data[2] >> 16) & 0xff; + out_data[11] = (uint8_t)(in_data[3] >> 16) & 0xff; + out_data[12] = (uint8_t)(in_data[0] >> 24) & 0xff; + out_data[13] = (uint8_t)(in_data[1] >> 24) & 0xff; + out_data[14] = (uint8_t)(in_data[2] >> 24) & 0xff; + out_data[15] = (uint8_t)(in_data[3] >> 24) & 0xff; +} + +static uint8_t aes_rcon_next(uint8_t rcon) { + // rcon cannot be 0 + if (rcon != 0) { + // update round constant + return kMul2[rcon]; + } else { + // init round constant to first-round value + return 0x1; + } +} + +static void aes_key_expand(uint8_t *round_key, uint8_t *rcon) { + uint8_t temp[kAesNumStateWords]; + uint8_t old_key[kAesNumKeyBytes]; + + // copy the current round key to old_key + memcpy(old_key, round_key, kAesNumKeyBytes); + + // shift last word + temp[0] = old_key[13]; + temp[1] = old_key[14]; + temp[2] = old_key[15]; + temp[3] = old_key[12]; + + // sub bytes in last word + temp[0] = kSbox[temp[0]]; + temp[1] = kSbox[temp[1]]; + temp[2] = kSbox[temp[2]]; + temp[3] = kSbox[temp[3]]; + + // update rcon + *rcon = aes_rcon_next(*rcon); + + // get new words + round_key[0] = temp[0] ^ old_key[0] ^ *rcon; + round_key[1] = temp[1] ^ old_key[1]; + round_key[2] = temp[2] ^ old_key[2]; + round_key[3] = temp[3] ^ old_key[3]; + + for (size_t i = 4; i < kAesNumKeyBytes; ++i) { + round_key[i] = round_key[i - 4] ^ old_key[i]; + } +} + +void aes_key_schedule(uint32_t *round_keys, const uint8_t *key) { + // Derives all round keys for AES128 + // Each key is stored in 4 32-bit words in a transposed-state form. 
+ uint8_t rcon = 0; + uint8_t key_temp[kAesNumKeyBytes]; + uint32_t key_temp_32[kAesNumStateWords]; + + memcpy(key_temp, key, kAesNumKeyBytes); + aes_transpose_to_32(key_temp, key_temp_32); + memcpy(round_keys, key_temp_32, kAesNumKeyBytes); + for (size_t i = 1; i < kAesNumRounds + 1; ++i) { + aes_key_expand(key_temp, &rcon); + aes_transpose_to_32(key_temp, key_temp_32); + memcpy(round_keys + i * kAesNumStateWords, key_temp_32, kAesNumKeyBytes); + } +} + +void aes_sw_encrypt_block(const uint8_t *plain_text, const uint32_t *round_keys, + uint8_t *cipher_text) { + uint32_t state[kAesNumStateWords]; + + // initially transpose state + uint8_t pt[kAesNumTextBytes]; + memcpy(pt, plain_text, kAesNumTextBytes); + aes_transpose_to_32(pt, state); + + // encrypt + aes_add_round_key(state, round_keys); + for (int j = 0; j < kAesNumRounds - 1; ++j) { + aes_sub_bytes(state); + aes_shift_rows(state); + aes_mix_columns(state); + aes_add_round_key(state, round_keys + (j + 1) * kAesNumStateWords); + } + aes_sub_bytes(state); + aes_shift_rows(state); + aes_add_round_key(state, round_keys + kAesNumStateWords * kAesNumRounds); + + // transpose the result back into the byte form + aes_transpose_from_32(state, cipher_text); +} diff --git a/sw/device/sca/lib/aes.h b/sw/device/sca/lib/aes.h new file mode 100644 index 0000000000000..b2db60cae28cc --- /dev/null +++ b/sw/device/sca/lib/aes.h @@ -0,0 +1,149 @@ +// Copyright lowRISC contributors. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +#ifndef OPENTITAN_SW_DEVICE_SCA_LIB_AES_H_ +#define OPENTITAN_SW_DEVICE_SCA_LIB_AES_H_ + +/** + * NOTE: The only intended use of this code is to serve as a PRNG for generating + * input data for SCA experiments and penetration testing. + * The library is not hardened against any type of attacks, and it should not be + * used for any purpose other than stated. 
+ * + * During the SCA experiments, encryptions are verified on the host side by + * running the same encryption using PyCryptodome package and comparing the + * result. + * + * Implementation of round-functions is based on a transposed-state technique + * for 32-bit architecture presented in: + * + * [1] Bertoni et. al., Efficient Software Implementation of AES on 32-Bit + * Platforms, CHES 2002. + * + * https://link.springer.com/content/pdf/10.1007/3-540-36400-5_13.pdf + * + */ + +#include +#include + +#include "sw/lib/sw/device/base/memory.h" + +/** + * Encrypt one data block (16 Bytes) in ECB mode. + * + * @param plain_text Input block to encrypt + * @param round_keys All round keys (pre-computed) + * @param[out] cipher_text Encrypted output block + */ +void aes_sw_encrypt_block(const uint8_t *plain_text, const uint32_t *round_keys, + uint8_t *cipher_text); + +/** + * Generate all round keys for AES-128 encryption. + * Store keys in a transposed-state form. + * + * @param[out] round_keys Round keys for all rounds + * @param key Encryption key + */ +void aes_key_schedule(uint32_t *round_keys, const uint8_t *key); + +static const uint8_t kSbox[256] = { + 0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, + 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76, + + 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, + 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, + + 0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, + 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15, + + 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, + 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75, + + 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, + 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, + + 0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, + 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF, + + 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, + 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8, + + 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, + 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, + + 0xCD, 
0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, + 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73, + + 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, + 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB, + + 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, + 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, + + 0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, + 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08, + + 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, + 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A, + + 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, + 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, + + 0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, + 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF, + + 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, + 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16}; + +static const uint8_t kMul2[256] = { + 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, + 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e, + + 0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e, + 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, + + 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, + 0x50, 0x52, 0x54, 0x56, 0x58, 0x5a, 0x5c, 0x5e, + + 0x60, 0x62, 0x64, 0x66, 0x68, 0x6a, 0x6c, 0x6e, + 0x70, 0x72, 0x74, 0x76, 0x78, 0x7a, 0x7c, 0x7e, + + 0x80, 0x82, 0x84, 0x86, 0x88, 0x8a, 0x8c, 0x8e, + 0x90, 0x92, 0x94, 0x96, 0x98, 0x9a, 0x9c, 0x9e, + + 0xa0, 0xa2, 0xa4, 0xa6, 0xa8, 0xaa, 0xac, 0xae, + 0xb0, 0xb2, 0xb4, 0xb6, 0xb8, 0xba, 0xbc, 0xbe, + + 0xc0, 0xc2, 0xc4, 0xc6, 0xc8, 0xca, 0xcc, 0xce, + 0xd0, 0xd2, 0xd4, 0xd6, 0xd8, 0xda, 0xdc, 0xde, + + 0xe0, 0xe2, 0xe4, 0xe6, 0xe8, 0xea, 0xec, 0xee, + 0xf0, 0xf2, 0xf4, 0xf6, 0xf8, 0xfa, 0xfc, 0xfe, + + 0x1b, 0x19, 0x1f, 0x1d, 0x13, 0x11, 0x17, 0x15, + 0x0b, 0x09, 0x0f, 0x0d, 0x03, 0x01, 0x07, 0x05, + + 0x3b, 0x39, 0x3f, 0x3d, 0x33, 0x31, 0x37, 0x35, + 0x2b, 0x29, 0x2f, 0x2d, 0x23, 0x21, 0x27, 0x25, + + 0x5b, 0x59, 0x5f, 0x5d, 0x53, 0x51, 0x57, 0x55, + 0x4b, 0x49, 0x4f, 0x4d, 0x43, 0x41, 0x47, 0x45, + + 0x7b, 0x79, 0x7f, 0x7d, 0x73, 
0x71, 0x77, 0x75, + 0x6b, 0x69, 0x6f, 0x6d, 0x63, 0x61, 0x67, 0x65, + + 0x9b, 0x99, 0x9f, 0x9d, 0x93, 0x91, 0x97, 0x95, + 0x8b, 0x89, 0x8f, 0x8d, 0x83, 0x81, 0x87, 0x85, + + 0xbb, 0xb9, 0xbf, 0xbd, 0xb3, 0xb1, 0xb7, 0xb5, + 0xab, 0xa9, 0xaf, 0xad, 0xa3, 0xa1, 0xa7, 0xa5, + + 0xdb, 0xd9, 0xdf, 0xdd, 0xd3, 0xd1, 0xd7, 0xd5, + 0xcb, 0xc9, 0xcf, 0xcd, 0xc3, 0xc1, 0xc7, 0xc5, + + 0xfb, 0xf9, 0xff, 0xfd, 0xf3, 0xf1, 0xf7, 0xf5, + 0xeb, 0xe9, 0xef, 0xed, 0xe3, 0xe1, 0xe7, 0xe5}; + +#endif // OPENTITAN_SW_DEVICE_SCA_LIB_AES_H_ diff --git a/sw/device/sca/lib/sca.c b/sw/device/sca/lib/sca.c index c22de550faf20..13210836b4764 100644 --- a/sw/device/sca/lib/sca.c +++ b/sw/device/sca/lib/sca.c @@ -32,21 +32,28 @@ /** * Bitfield for the trigger source. * - * Bits 9 to 11 are used to select the trigger source. See chiplevel.sv.tpl for - * details. + * Bits 10 and 11 are used to select the trigger source. See chiplevel.sv.tpl + * for details. */ static const bitfield_field32_t kTriggerSourceBitfield = { - .index = 9, - .mask = 0x7, + .index = 10, + .mask = 0x3, }; enum { /** - * Bit index of the trigger gate signal for gating the trigger from software. + * Bit index of the hardware trigger gate signal for gating the hardware + * trigger from software. * * See chiplevel.sv.tpl for details. */ - kTriggerGateBitIndex = 8, + kTriggerHwGateBitIndex = 9, + /** + * Bit index of the software trigger signal. + * + * See chiplevel.sv.tpl for details. + */ + kTriggerSwBitIndex = 8, /** * RV timer settings. */ @@ -54,6 +61,9 @@ enum { kRvTimerHart = kTopDarjeelingPlicTargetIbex0, }; +// By default, we use the precise, hardware-gated capture trigger. 
+static unsigned int trigger_bit_index = kTriggerHwGateBitIndex; + static dif_uart_t uart0; static dif_gpio_t gpio; static dif_pinmux_t pinmux; @@ -98,7 +108,8 @@ static void sca_init_gpio(sca_trigger_source_t trigger) { uint32_t select_mask = bitfield_field32_write(0, kTriggerSourceBitfield, UINT32_MAX); - uint32_t enable_mask = bitfield_bit32_write(0, kTriggerGateBitIndex, true); + uint32_t enable_mask = bitfield_bit32_write(0, kTriggerHwGateBitIndex, true); + enable_mask = bitfield_bit32_write(enable_mask, kTriggerSwBitIndex, true); OT_DISCARD(dif_gpio_output_set_enabled_all(&gpio, select_mask | enable_mask)); @@ -245,12 +256,20 @@ void sca_init(sca_trigger_source_t trigger, sca_peripherals_t enable) { const dif_uart_t *sca_get_uart(void) { return &uart0; } +void sca_select_trigger_type(sca_trigger_type_t trigger_type) { + if (trigger_type == kScaTriggerTypeHwGated) { + trigger_bit_index = kTriggerHwGateBitIndex; + } else if (trigger_type == kScaTriggerTypeSw) { + trigger_bit_index = kTriggerSwBitIndex; + } +} + void sca_set_trigger_high(void) { - OT_DISCARD(dif_gpio_write(&gpio, kTriggerGateBitIndex, true)); + OT_DISCARD(dif_gpio_write(&gpio, trigger_bit_index, true)); } void sca_set_trigger_low(void) { - OT_DISCARD(dif_gpio_write(&gpio, kTriggerGateBitIndex, false)); + OT_DISCARD(dif_gpio_write(&gpio, trigger_bit_index, false)); } void sca_call_and_sleep(sca_callee callee, uint32_t sleep_cycles) { diff --git a/sw/device/sca/lib/sca.h b/sw/device/sca/lib/sca.h index 8a7805b442d0a..9b8ef77e2c80e 100644 --- a/sw/device/sca/lib/sca.h +++ b/sw/device/sca/lib/sca.h @@ -24,28 +24,43 @@ typedef enum sca_trigger_source { /** * Use AES for capture trigger. * - * The trigger signal will go high 40 cycles after `dif_aes_trigger()` is + * The trigger signal will go high 320 cycles after `dif_aes_trigger()` is * called and remain high until the operation is complete. */ - kScaTriggerSourceAes, + kScaTriggerSourceAes = 0, /** * Use HMAC for capture trigger. 
*/ - kScaTriggerSourceHmac, + kScaTriggerSourceHmac = 1, /** * Use KMAC for capture trigger. */ - kScaTriggerSourceKmac, - /** - * Use OTBN (IO_DIV4 clock) for capture trigger. - */ - kScaTriggerSourceOtbnIoDiv4, + kScaTriggerSourceKmac = 2, /** * Use OTBN for capture trigger. */ - kScaTriggerSourceOtbn, + kScaTriggerSourceOtbn = 3, } sca_trigger_source_t; +/** + * Trigger type. + */ +typedef enum sca_trigger_type { + /** + * Use the precise hardware capture trigger gateable by software. If selected, + * the actual capture trigger is generated based on the clkmgr_aon_idle signal + * of the peripheral corresponding to selected trigger source. + * + * Note that this is available on FPGA only. + */ + kScaTriggerTypeHwGated = 0, + /** + * Use the fully software controlled capture trigger. If selected, the + * configured trigger source is not relevant. + */ + kScaTriggerTypeSw = 1, +} sca_trigger_type_t; + /** * Peripherals. * @@ -124,6 +139,13 @@ void sca_init(sca_trigger_source_t trigger, sca_peripherals_t enable); */ const dif_uart_t *sca_get_uart(void); +/** + * Select the capture trigger type. + * + * @param trigger_type The trigger type to select. + */ +void sca_select_trigger_type(sca_trigger_type_t trigger_type); + /** * Sets capture trigger high. * diff --git a/sw/device/sca/lib/simple_serial.c b/sw/device/sca/lib/simple_serial.c index 564a10f5c500e..b3ae942efb914 100644 --- a/sw/device/sca/lib/simple_serial.c +++ b/sw/device/sca/lib/simple_serial.c @@ -5,6 +5,7 @@ #include "sw/device/sca/lib/simple_serial.h" #include "sw/device/sca/lib/prng.h" +#include "sw/device/sca/lib/sca.h" #include "sw/ip/uart/dif/dif_uart.h" #include "sw/lib/sw/device/arch/device.h" #include "sw/lib/sw/device/base/macros.h" @@ -34,11 +35,11 @@ enum { * Command handlers. * * Clients can register handlers for commands 'a'-'z' using - * `simple_serial_register_handler()` except for 'v' (version) and 's' (seed - * PRNG), which are handled by this library. 
This array has an extra element - * (27) that is initialized in `simple_serial_init()` to point to - * `simple_serial_unknown_command()` in order to simplify handling of invalid - * commands in `simple_serial_process_packet()`. + * `simple_serial_register_handler()` except for 'v' (version), 's' (seed + * PRNG), and 't' (select trigger type) which are handled by this library. This + * array has an extra element (27) that is initialized in `simple_serial_init()` + * to point to `simple_serial_unknown_command()` in order to simplify handling + * of invalid commands in `simple_serial_process_packet()`. */ static simple_serial_command_handler handlers[27]; static const dif_uart_t *uart; @@ -161,6 +162,20 @@ static void simple_serial_seed_prng(const uint8_t *seed, size_t seed_len) { prng_seed(read_32(seed)); } +/** + * Simple serial 't' (select trigger type) command handler. + * + * This function only supports 1-byte trigger values. + * + * @param trigger A buffer holding the trigger type. + * @param trigger_len Buffer length. + */ +static void simple_serial_select_trigger_type(const uint8_t *trigger, + size_t trigger_len) { + SS_CHECK(trigger_len == 1); + sca_select_trigger_type((sca_trigger_type_t)trigger[0]); +} + /** * Handler for uninmplemented simple serial commands. * @@ -181,6 +196,8 @@ void simple_serial_init(const dif_uart_t *uart_) { handlers[i] = simple_serial_unknown_command; } handlers[simple_serial_get_handler_index('s')] = simple_serial_seed_prng; + handlers[simple_serial_get_handler_index('t')] = + simple_serial_select_trigger_type; handlers[simple_serial_get_handler_index('v')] = simple_serial_version; } @@ -188,7 +205,7 @@ simple_serial_result_t simple_serial_register_handler( uint8_t cmd, simple_serial_command_handler handler) { if (!simple_serial_is_valid_command(cmd)) { return kSimpleSerialError; - } else if (cmd == 's' || cmd == 'v') { + } else if (cmd == 's' || cmd == 't' || cmd == 'v') { // Cannot register handlers for built-in commands. 
return kSimpleSerialError; } else { diff --git a/sw/device/sca/otbn_vertical/ecc256_keygen_serial.c b/sw/device/sca/otbn_vertical/ecc256_keygen_serial.c index 1b9b6c02a5673..bcc972f02b7ea 100644 --- a/sw/device/sca/otbn_vertical/ecc256_keygen_serial.c +++ b/sw/device/sca/otbn_vertical/ecc256_keygen_serial.c @@ -123,6 +123,21 @@ uint32_t ecc256_seed[kEcc256SeedNumWords] = { 0x22b79d5f, 0x1176f31d, 0xb5ac3a51, 0x99a082d7, 0x484eb366, }; +uint32_t ecc256_C[kEcc256SeedNumWords] = { + 0x016064e9, 0x11e3f4d6, 0xac3a6fa7, 0xaba11a1b, 0x8f9271d1, + 0x22b79d5f, 0x1176f31d, 0xb5ac3a51, 0x99a082d7, 0x484eb366, +}; + +uint32_t random_number[kEcc256CoordNumWords] = { + 0x016064e9, 0x11e3f4d6, 0xac3a6fa7, 0xaba11a1b, + 0x22b79d5f, 0x1176f31d, 0xb5ac3a51, 0x99a082d7, +}; + +uint32_t ecc256_fixed_number[kEcc256CoordNumWords] = { + 0x04030201, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, +}; + void ecc256_en_masks(const uint8_t *enable, size_t enable_len) { SS_CHECK(enable_len == 1); if (*enable) { @@ -132,6 +147,14 @@ void ecc256_en_masks(const uint8_t *enable, size_t enable_len) { } } +/** + * Simple serial 'x' (set seed) command handler. + * + * The seed must be `kEcc256SeedNumBytes` bytes long. + * + * @param seed Value for seed share. + * @param seed_len Length of seed share. + */ void ecc256_set_seed(const uint8_t *seed, size_t seed_len) { SS_CHECK(seed_len == kEcc256SeedNumBytes); memcpy(ecc256_seed, seed, seed_len); @@ -141,6 +164,19 @@ void ecc256_set_seed(const uint8_t *seed, size_t seed_len) { prng_rand_bytes((unsigned char *)temp, kEcc256SeedNumBytes); } +/** + * Simple serial 'c' (set constant) command handler. + * + * The constant must be `kEcc256SeedNumBytes` bytes long. + * + * @param C Value of the C constant. + * @param len Length of the C constant. 
+ */ +void ecc256_set_c(const uint8_t *C, size_t len) { + SS_CHECK(len == kEcc256SeedNumBytes); + memcpy(ecc256_C, C, len); +} + /** * Callback wrapper for OTBN manual trigger function. */ @@ -173,7 +209,7 @@ static void p256_run_keygen(uint32_t mode, const uint32_t *share0, sca_set_trigger_low(); } -void ecc256_ecdsa_secret_keygen_batch(const uint8_t *data, size_t data_len) { +void ecc256_ecdsa_keygen_fvsr_seed_batch(const uint8_t *data, size_t data_len) { uint32_t num_traces = 0; uint32_t batch_digest[kEcc256SeedNumWords]; uint8_t dummy[kEcc256SeedNumBytes]; @@ -234,6 +270,103 @@ void ecc256_ecdsa_secret_keygen_batch(const uint8_t *data, size_t data_len) { kEcc256SeedNumWords * 4); } +/** + * Adds two integers stored in byte arrays. + * + * Adds the integer stored in source array to the integer stored in + * destination array. + * The user needs to ensure that dest_len is enough to store the result + * without overflow. + * + * @param[in] dest Location of the first input array and the result. + * @param[in] source Location of the second input array. + * @param[in] dest_len Length of the dest array in bytes. + * @param[in] source_len Length of the source array in bytes. 
+ */ +static void add_arrays(uint8_t *dest, uint8_t *source, size_t dest_len, + size_t source_len) { + uint16_t temp = 0; + + for (size_t i = 0; i < source_len; i++) { + temp += (uint16_t)source[i] + dest[i]; + dest[i] = (uint8_t)(temp & 0x00FF); + temp >>= 8; + } + + for (size_t i = source_len; i < dest_len; i++) { + temp += (uint16_t)dest[i]; + dest[i] = (uint8_t)(temp & 0x00FF); + temp >>= 8; + } +} + +void ecc256_ecdsa_keygen_fvsr_key_batch(const uint8_t *data, size_t data_len) { + uint32_t num_traces = 0; + uint32_t batch_digest[kEcc256SeedNumWords]; + uint8_t dummy[kEcc256SeedNumBytes]; + SS_CHECK(data_len == sizeof(num_traces)); + num_traces = read_32(data); + + if (num_traces > kNumBatchOpsMax) { + LOG_ERROR("Too many traces for one batch."); + return; + } + + // zero the batch digest + for (uint32_t j = 0; j < kEcc256SeedNumWords; ++j) { + batch_digest[j] = 0; + } + + for (uint32_t i = 0; i < num_traces; ++i) { + if (run_fixed) { + memcpy(batch_share0[i], ecc256_seed, kEcc256SeedNumBytes); + } else { + // Here change to random_number + C + // It is necessary to set the C first + memcpy(batch_share0[i], ecc256_C, kEcc256SeedNumBytes); + prng_rand_bytes((unsigned char *)random_number, kEcc256CoordNumBytes); + add_arrays((unsigned char *)batch_share0[i], + (unsigned char *)random_number, kEcc256SeedNumBytes, + kEcc256CoordNumBytes); + } + if (en_masks) { + prng_rand_bytes((unsigned char *)batch_share1[i], kEcc256SeedNumBytes); + } else { + for (uint32_t j = 0; j < kEcc256SeedNumWords; ++j) { + batch_share1[i][j] = 0; + } + } + for (uint32_t j = 0; j < kEcc256SeedNumWords; ++j) { + batch_share0[i][j] ^= batch_share1[i][j]; + } + // Another PRNG run to determine 'run_fixed' for the next cycle. + prng_rand_bytes(dummy, kEcc256SeedNumBytes); + + run_fixed = dummy[0] & 0x1; + } + + for (uint32_t i = 0; i < num_traces; ++i) { + p256_run_keygen(kEcc256ModePrivateKeyOnly, batch_share0[i], + batch_share1[i]); + + // Read results. 
+ SS_CHECK_STATUS_OK( + otbn_dmem_read(kEcc256SeedNumWords, kOtbnVarD0, d0_batch)); + SS_CHECK_STATUS_OK( + otbn_dmem_read(kEcc256SeedNumWords, kOtbnVarD1, d1_batch)); + + // The correctness of each batch is verified by computing and sending + // the batch digest. This digest is computed by XORing all d0 shares of + // the batch. + for (uint32_t j = 0; j < kEcc256SeedNumWords; ++j) { + batch_digest[j] ^= d0_batch[j]; + } + } + // Send the batch digest to the host for verification. + simple_serial_send_packet('r', (uint8_t *)batch_digest, + kEcc256SeedNumWords * 4); +} + /** * Generates a secret key from a masked seed. * diff --git a/sw/device/sca/otbn_vertical/ecc256_keygen_serial.h b/sw/device/sca/otbn_vertical/ecc256_keygen_serial.h index fa9b37cc4bf68..533e2e749075d 100644 --- a/sw/device/sca/otbn_vertical/ecc256_keygen_serial.h +++ b/sw/device/sca/otbn_vertical/ecc256_keygen_serial.h @@ -50,15 +50,46 @@ void ecc256_en_masks(const uint8_t *enable, size_t enable_len); */ void ecc256_set_seed(const uint8_t *seed, size_t seed_len); +/** + * Simple serial 'c' (set constant) command handler. + * + * The constant must be `kEcc256SeedNumBytes` bytes long. + * + * @param C Value of the C constant. + * @param len Length of the C constant. + */ +void ecc256_set_c(const uint8_t *C, size_t len); + +/** + * Simple serial 'e' (secret keygen fvsr key batch mode) command handler. + * + * Collects data for ECDSA keygen fixed-vs-random test in the KEY mode. + * In the KEY mode, the fixed set of measurements is generated using the fixed + * 320 bit seed. The random set of measurements is generated in two steps: + * 1. Choose a random 256 bit number r + * 2. Compute the seed as (C + r) where C is the fixed 320 bit constant. Note + * that in this case the used key is equal to (C + r) mod curve_order_n. + * Takes a number of traces that has to be captured in one batch as input. + * + * @param data Value for trace count. + * @param data_len Length of trace count input. 
+ */ +void ecc256_ecdsa_keygen_fvsr_key_batch(const uint8_t *data, size_t data_len); + /** * Simple serial 'b' (secret keygen batch mode) command handler. * + * Collects data for ECDSA keygen fixed-vs-random test in the SEED mode. + * In the SEED mode, the fixed set of measurements is generated using the fixed + * 320 bit seed. The random set of measurements is generated using a random 320 + * bit seed. In both cases, the used key is equal to seed mod curve_order_n + * * Takes a number of traces that has to be captured in one batch as input. * * @param data Value for trace count. * @param data_len Length of trace count input. */ -void ecc256_ecdsa_secret_keygen_batch(const uint8_t *data, size_t data_len); +void ecc256_ecdsa_keygen_fvsr_seed_batch(const uint8_t *data, size_t data_len); /** * Simple serial 'k' (secret keygen) command handler. diff --git a/sw/device/sca/otbn_vertical/otbn_vertical_serial.c b/sw/device/sca/otbn_vertical/otbn_vertical_serial.c index fdabf1489184c..db9fdc43ea210 100644 --- a/sw/device/sca/otbn_vertical/otbn_vertical_serial.c +++ b/sw/device/sca/otbn_vertical/otbn_vertical_serial.c @@ -74,13 +74,17 @@ static void simple_serial_main(void) { simple_serial_init(sca_get_uart()); SS_CHECK(simple_serial_register_handler( - 'b', ecc256_ecdsa_secret_keygen_batch) == kSimpleSerialOk); + 'b', ecc256_ecdsa_keygen_fvsr_seed_batch) == kSimpleSerialOk); + SS_CHECK(simple_serial_register_handler( + 'e', ecc256_ecdsa_keygen_fvsr_key_batch) == kSimpleSerialOk); SS_CHECK(simple_serial_register_handler('k', ecc256_ecdsa_secret_keygen) == kSimpleSerialOk); SS_CHECK(simple_serial_register_handler('p', ecc256_ecdsa_gen_keypair) == kSimpleSerialOk); SS_CHECK(simple_serial_register_handler('x', ecc256_set_seed) == kSimpleSerialOk); + SS_CHECK(simple_serial_register_handler('c', ecc256_set_c) == + kSimpleSerialOk); SS_CHECK(simple_serial_register_handler('m', ecc256_en_masks) == kSimpleSerialOk); SS_CHECK(simple_serial_register_handler('a', ecc256_app_select) == 
diff --git a/sw/device/sca/sha3_serial.c b/sw/device/sca/sha3_serial.c index d47435b9208c1..b7b1497cc73f9 100644 --- a/sw/device/sca/sha3_serial.c +++ b/sw/device/sca/sha3_serial.c @@ -70,6 +70,20 @@ enum { */ static dif_kmac_t kmac; +/** + * The KMAC config. + */ +static dif_kmac_config_t config = (dif_kmac_config_t){ + .entropy_mode = kDifKmacEntropyModeSoftware, + .entropy_fast_process = kDifToggleDisabled, + .entropy_seed = {0xaa25b4bf, 0x48ce8fff, 0x5a78282a, 0x48465647, + 0x70410fef}, + .message_big_endian = kDifToggleDisabled, + .output_big_endian = kDifToggleDisabled, + .sideload = kDifToggleDisabled, + .msg_mask = kDifToggleEnabled, +}; + /** * KMAC operation state. */ @@ -344,14 +358,6 @@ static void kmac_init(void) { SS_CHECK_DIF_OK(dif_kmac_init( mmio_region_from_addr(TOP_DARJEELING_KMAC_BASE_ADDR), &kmac)); - dif_kmac_config_t config = (dif_kmac_config_t){ - .entropy_mode = kDifKmacEntropyModeSoftware, - .entropy_seed = {0xaa25b4bf, 0x48ce8fff, 0x5a78282a, 0x48465647, - 0x70410fef}, - .entropy_fast_process = false, - .msg_mask = true, - }; - SS_CHECK_DIF_OK(dif_kmac_configure(&kmac, config)); kmac_block_until_idle(); @@ -367,14 +373,13 @@ static void kmac_disable_masking(const uint8_t *masks_off, size_t off_len) { SS_CHECK_DIF_OK(dif_kmac_init( mmio_region_from_addr(TOP_DARJEELING_KMAC_BASE_ADDR), &kmac)); - dif_kmac_config_t config; if (masks_off[0]) { - config.entropy_fast_process = true; - config.msg_mask = false; + config.entropy_fast_process = kDifToggleEnabled; + config.msg_mask = kDifToggleDisabled; LOG_INFO("Initializing the KMAC peripheral with masking disabled."); } else { - config.entropy_fast_process = false; - config.msg_mask = true; + config.entropy_fast_process = kDifToggleDisabled; + config.msg_mask = kDifToggleEnabled; LOG_INFO("Initializing the KMAC peripheral with masking enabled."); } SS_CHECK_DIF_OK(dif_kmac_configure(&kmac, config)); @@ -513,7 +518,7 @@ bool test_main(void) { simple_serial_init(sca_get_uart()); 
simple_serial_register_handler('p', sha3_serial_single_absorb); simple_serial_register_handler('b', sha3_serial_batch); - simple_serial_register_handler('t', sha3_serial_fixed_message_set); + simple_serial_register_handler('f', sha3_serial_fixed_message_set); simple_serial_register_handler('l', sha3_serial_seed_lfsr); simple_serial_register_handler('m', kmac_disable_masking); diff --git a/sw/device/silicon_creator/lib/drivers/BUILD b/sw/device/silicon_creator/lib/drivers/BUILD index 0cc5ba7931b4a..43b72777dda41 100644 --- a/sw/device/silicon_creator/lib/drivers/BUILD +++ b/sw/device/silicon_creator/lib/drivers/BUILD @@ -251,6 +251,7 @@ cc_library( "//hw/top_darjeeling/sw/autogen:top_darjeeling", "//sw/lib/sw/device/base:abs_mmio", "//sw/lib/sw/device/base:macros", + "//sw/lib/sw/device/runtime:hart", "//sw/lib/sw/device/silicon_creator:error", "//sw/lib/sw/device/silicon_creator:keymgr_binding", "//sw/lib/sw/device/silicon_creator/base:sec_mmio", diff --git a/sw/device/silicon_creator/lib/drivers/keymgr.c b/sw/device/silicon_creator/lib/drivers/keymgr.c index f08b96c67ae7a..ceb36af1d6252 100644 --- a/sw/device/silicon_creator/lib/drivers/keymgr.c +++ b/sw/device/silicon_creator/lib/drivers/keymgr.c @@ -8,6 +8,7 @@ #include "sw/lib/sw/device/base/abs_mmio.h" #include "sw/lib/sw/device/base/macros.h" +#include "sw/lib/sw/device/runtime/hart.h" #include "sw/lib/sw/device/silicon_creator/base/sec_mmio.h" #include "hw/top_darjeeling/sw/autogen/top_darjeeling.h" @@ -132,3 +133,131 @@ void keymgr_advance_state(void) { rom_error_t keymgr_state_check(keymgr_state_t expected_state) { return expected_state_check(expected_state); } + +/** + * Fails if the keymgr is not idle. + * + * @return OK if the key manager is idle, kErrorKeymgrInternal otherwise. 
+ */ +OT_WARN_UNUSED_RESULT +static rom_error_t keymgr_is_idle(void) { + uint32_t reg = abs_mmio_read32(kBase + KEYMGR_OP_STATUS_REG_OFFSET); + uint32_t status = bitfield_field32_read(reg, KEYMGR_OP_STATUS_STATUS_FIELD); + if (launder32(status) == KEYMGR_OP_STATUS_STATUS_VALUE_IDLE) { + HARDENED_CHECK_EQ(status, KEYMGR_OP_STATUS_STATUS_VALUE_IDLE); + return kErrorOk; + } + return kErrorKeymgrInternal; +} + +/** + * Wait for the key manager to finish an operation. + * + * Polls the key manager until it is no longer busy. If the operation completed + * successfully or the key manager was already idle, returns kErrorOk. If + * there was an error during the operation, reads and clears the error code + * and returns kErrorKeymgrInternal. + * + * @return OK or error. + */ +OT_WARN_UNUSED_RESULT +static rom_error_t keymgr_wait_until_done(void) { + // Poll the OP_STATUS register until it is something other than "WIP". + uint32_t reg; + uint32_t status; + do { + // Read OP_STATUS and then clear by writing back the value we read. + reg = abs_mmio_read32(kBase + KEYMGR_OP_STATUS_REG_OFFSET); + abs_mmio_write32(kBase + KEYMGR_OP_STATUS_REG_OFFSET, reg); + status = bitfield_field32_read(reg, KEYMGR_OP_STATUS_STATUS_FIELD); + } while (status == KEYMGR_OP_STATUS_STATUS_VALUE_WIP); + + // Check if the key manager reported errors. If it is already idle or + // completed an operation successfully, return an OK status. A `WIP` status + // should not be possible because of the check above. + switch (launder32(status)) { + case KEYMGR_OP_STATUS_STATUS_VALUE_IDLE: + HARDENED_CHECK_EQ(status, KEYMGR_OP_STATUS_STATUS_VALUE_IDLE); + return kErrorOk; + case KEYMGR_OP_STATUS_STATUS_VALUE_DONE_SUCCESS: + HARDENED_CHECK_EQ(status, KEYMGR_OP_STATUS_STATUS_VALUE_DONE_SUCCESS); + return kErrorOk; + case KEYMGR_OP_STATUS_STATUS_VALUE_DONE_ERROR: { + // Clear the ERR_CODE register before returning. 
+ uint32_t err_code = abs_mmio_read32(kBase + KEYMGR_ERR_CODE_REG_OFFSET); + abs_mmio_write32(kBase + KEYMGR_ERR_CODE_REG_OFFSET, err_code); + return kErrorKeymgrInternal; + } + } + + // Should be unreachable. + HARDENED_TRAP(); + return kErrorKeymgrInternal; +} + +rom_error_t keymgr_generate_attestation_key_otbn( + keymgr_diversification_t diversification) { + HARDENED_RETURN_IF_ERROR(keymgr_is_idle()); + + // Select OTBN as the destination. + uint32_t ctrl = + bitfield_field32_write(0, KEYMGR_CONTROL_SHADOWED_DEST_SEL_FIELD, + KEYMGR_CONTROL_SHADOWED_DEST_SEL_VALUE_OTBN); + + // Select the attestation CDI. + ctrl = bitfield_bit32_write(ctrl, KEYMGR_CONTROL_SHADOWED_CDI_SEL_BIT, true); + + // Select the "generate" operation. + ctrl = bitfield_field32_write( + ctrl, KEYMGR_CONTROL_SHADOWED_OPERATION_FIELD, + KEYMGR_CONTROL_SHADOWED_OPERATION_VALUE_GENERATE_HW_OUTPUT); + + // Write the control register. + abs_mmio_write32_shadowed(kBase + KEYMGR_CONTROL_SHADOWED_REG_OFFSET, ctrl); + + // Set the version. + abs_mmio_write32(kBase + KEYMGR_KEY_VERSION_REG_OFFSET, + diversification.version); + // Set the salt. + for (size_t i = 0; i < kKeymgrSaltNumWords; i++) { + abs_mmio_write32(kBase + KEYMGR_SALT_0_REG_OFFSET + (i * sizeof(uint32_t)), + diversification.salt[i]); + } + + // Issue the start command. + abs_mmio_write32(kBase + KEYMGR_START_REG_OFFSET, 1 << KEYMGR_START_EN_BIT); + + // Block until keymgr is done. + return keymgr_wait_until_done(); +} + +rom_error_t keymgr_sideload_clear_otbn(void) { + HARDENED_RETURN_IF_ERROR(keymgr_is_idle()); + + // Set SIDELOAD_CLEAR to begin continuously clearing the requested slot. + abs_mmio_write32( + kBase + KEYMGR_SIDELOAD_CLEAR_REG_OFFSET, + bitfield_field32_write(0, KEYMGR_SIDELOAD_CLEAR_VAL_FIELD, + KEYMGR_SIDELOAD_CLEAR_VAL_VALUE_OTBN)); + + // Read back the value (hardening measure). 
+ uint32_t sideload_clear = + abs_mmio_read32(kBase + KEYMGR_SIDELOAD_CLEAR_REG_OFFSET); + if (bitfield_field32_read(sideload_clear, KEYMGR_SIDELOAD_CLEAR_VAL_FIELD) != + KEYMGR_SIDELOAD_CLEAR_VAL_VALUE_OTBN) { + return kErrorKeymgrInternal; + } + + // Spin for 100 microseconds. + // TODO(#20024): this value seems to work for tests, but it would be good to + // run a more principled analysis. + busy_spin_micros(100); + + // Stop continuous clearing. + abs_mmio_write32( + kBase + KEYMGR_SIDELOAD_CLEAR_REG_OFFSET, + bitfield_field32_write(0, KEYMGR_SIDELOAD_CLEAR_VAL_FIELD, + KEYMGR_SIDELOAD_CLEAR_VAL_VALUE_NONE)); + + return kErrorOk; +} diff --git a/sw/device/silicon_creator/lib/drivers/keymgr.h b/sw/device/silicon_creator/lib/drivers/keymgr.h index 6b561278669f1..b01f4ffbceb60 100644 --- a/sw/device/silicon_creator/lib/drivers/keymgr.h +++ b/sw/device/silicon_creator/lib/drivers/keymgr.h @@ -57,6 +57,27 @@ typedef enum keymgr_state { kKeymgrStateNumStates, } keymgr_state_t; +enum { + /** + * Number of 32-bit words for the salt. + */ + kKeymgrSaltNumWords = 8, +}; + +/** + * Data used to differentiate a generated keymgr key. + */ +typedef struct keymgr_diversification { + /** + * Salt value to use for key generation. + */ + uint32_t salt[kKeymgrSaltNumWords]; + /** + * Version for key generation (anti-rollback protection). + */ + uint32_t version; +} keymgr_diversification_t; + /** * The following constants represent the expected number of sec_mmio register * writes performed by functions in provided in this module. See @@ -160,6 +181,31 @@ void keymgr_advance_state(void); OT_WARN_UNUSED_RESULT rom_error_t keymgr_state_check(keymgr_state_t expected_state); +/** + * Derive a key manager key for the OTBN block. + * + * Calls the key manager to sideload a key into the OTBN hardware block and + * waits until the operation is complete before returning. Always uses the + * attestation (not sealing) CDI; call this only for attestation keys. 
+ * + * @param diversification Diversification input for the key derivation. + * @return OK or error. + */ +OT_WARN_UNUSED_RESULT +rom_error_t keymgr_generate_attestation_key_otbn( + const keymgr_diversification_t diversification); + +/** + * Clear OTBN's sideloaded key slot. + * + * The entropy complex needs to be initialized before calling this function, so + * that keymgr can use it to clear the slot. + * + * @return OK or error. + */ +OT_WARN_UNUSED_RESULT +rom_error_t keymgr_sideload_clear_otbn(void); + #ifdef __cplusplus } #endif diff --git a/sw/device/silicon_creator/lib/drivers/keymgr_unittest.cc b/sw/device/silicon_creator/lib/drivers/keymgr_unittest.cc index 02ad25df0ea39..9916f1c4457b0 100644 --- a/sw/device/silicon_creator/lib/drivers/keymgr_unittest.cc +++ b/sw/device/silicon_creator/lib/drivers/keymgr_unittest.cc @@ -36,6 +36,39 @@ class KeymgrTest : public rom_test::RomTest { EXPECT_SEC_READ32(base_ + KEYMGR_WORKING_STATE_REG_OFFSET, km_state); } + void ExpectIdleCheck(uint32_t op_status) { + EXPECT_ABS_READ32(base_ + KEYMGR_OP_STATUS_REG_OFFSET, op_status); + } + void ExpectDiversificationWrite(keymgr_diversification_t diversification) { + EXPECT_ABS_WRITE32(base_ + KEYMGR_KEY_VERSION_REG_OFFSET, + diversification.version); + EXPECT_ABS_WRITE32(base_ + KEYMGR_SALT_0_REG_OFFSET, + diversification.salt[0]); + EXPECT_ABS_WRITE32(base_ + KEYMGR_SALT_1_REG_OFFSET, + diversification.salt[1]); + EXPECT_ABS_WRITE32(base_ + KEYMGR_SALT_2_REG_OFFSET, + diversification.salt[2]); + EXPECT_ABS_WRITE32(base_ + KEYMGR_SALT_3_REG_OFFSET, + diversification.salt[3]); + EXPECT_ABS_WRITE32(base_ + KEYMGR_SALT_4_REG_OFFSET, + diversification.salt[4]); + EXPECT_ABS_WRITE32(base_ + KEYMGR_SALT_5_REG_OFFSET, + diversification.salt[5]); + EXPECT_ABS_WRITE32(base_ + KEYMGR_SALT_6_REG_OFFSET, + diversification.salt[6]); + EXPECT_ABS_WRITE32(base_ + KEYMGR_SALT_7_REG_OFFSET, + diversification.salt[7]); + } + void ExpectWaitUntilDone(size_t busy_cycles, uint32_t end_status) { + 
for (size_t i = 0; i < busy_cycles; i++) { + EXPECT_ABS_READ32(base_ + KEYMGR_OP_STATUS_REG_OFFSET, + KEYMGR_OP_STATUS_STATUS_VALUE_WIP); + EXPECT_ABS_WRITE32(base_ + KEYMGR_OP_STATUS_REG_OFFSET, + KEYMGR_OP_STATUS_STATUS_VALUE_WIP); + } + EXPECT_ABS_READ32(base_ + KEYMGR_OP_STATUS_REG_OFFSET, end_status); + EXPECT_ABS_WRITE32(base_ + KEYMGR_OP_STATUS_REG_OFFSET, end_status); + } uint32_t base_ = TOP_DARJEELING_KEYMGR_BASE_ADDR; SwBindingCfg cfg_ = { .max_key_ver = 0xA5A5A5A5, @@ -162,5 +195,132 @@ TEST_F(KeymgrTest, CheckStateInvalidResponse) { kErrorKeymgrInternal); } +TEST_F(KeymgrTest, GenAttestationKey) { + keymgr_diversification_t test_diversification = { + .salt = {0xf0f1f2f3, 0xf4f5f6f7, 0xf8f9fafb, 0xfcfdfeff, 0xd0d1d2d3, + 0xd4d5d6d7, 0xd8d9dadb, 0xdcdddedf}, + .version = cfg_.max_key_ver - 1, + }; + + ExpectIdleCheck(KEYMGR_OP_STATUS_STATUS_VALUE_IDLE); + EXPECT_ABS_WRITE32_SHADOWED( + base_ + KEYMGR_CONTROL_SHADOWED_REG_OFFSET, + { + {KEYMGR_CONTROL_SHADOWED_DEST_SEL_OFFSET, + KEYMGR_CONTROL_SHADOWED_DEST_SEL_VALUE_OTBN}, + {KEYMGR_CONTROL_SHADOWED_CDI_SEL_BIT, true}, + {KEYMGR_CONTROL_SHADOWED_OPERATION_OFFSET, + KEYMGR_CONTROL_SHADOWED_OPERATION_VALUE_GENERATE_HW_OUTPUT}, + }); + ExpectDiversificationWrite(test_diversification); + EXPECT_ABS_WRITE32(base_ + KEYMGR_START_REG_OFFSET, + { + {KEYMGR_START_EN_BIT, true}, + }); + ExpectWaitUntilDone(/*busy_cycles=*/2, + KEYMGR_OP_STATUS_STATUS_VALUE_DONE_SUCCESS); + + EXPECT_EQ(keymgr_generate_attestation_key_otbn(test_diversification), + kErrorOk); +} + +TEST_F(KeymgrTest, GenAttestationKeyNotIdle) { + keymgr_diversification_t test_diversification = { + .salt = {0xf0f1f2f3, 0xf4f5f6f7, 0xf8f9fafb, 0xfcfdfeff, 0xd0d1d2d3, + 0xd4d5d6d7, 0xd8d9dadb, 0xdcdddedf}, + .version = cfg_.max_key_ver - 1, + }; + + ExpectIdleCheck(KEYMGR_OP_STATUS_STATUS_VALUE_WIP); + EXPECT_EQ(keymgr_generate_attestation_key_otbn(test_diversification), + kErrorKeymgrInternal); + 
ExpectIdleCheck(KEYMGR_OP_STATUS_STATUS_VALUE_DONE_ERROR); + EXPECT_EQ(keymgr_generate_attestation_key_otbn(test_diversification), + kErrorKeymgrInternal); + ExpectIdleCheck(KEYMGR_OP_STATUS_STATUS_VALUE_DONE_SUCCESS); + EXPECT_EQ(keymgr_generate_attestation_key_otbn(test_diversification), + kErrorKeymgrInternal); +} + +TEST_F(KeymgrTest, GenAttestationKeyError) { + keymgr_diversification_t test_diversification = { + .salt = {0xf0f1f2f3, 0xf4f5f6f7, 0xf8f9fafb, 0xfcfdfeff, 0xd0d1d2d3, + 0xd4d5d6d7, 0xd8d9dadb, 0xdcdddedf}, + .version = cfg_.max_key_ver - 1, + }; + uint32_t err_code = 0x1; + + ExpectIdleCheck(KEYMGR_OP_STATUS_STATUS_VALUE_IDLE); + EXPECT_ABS_WRITE32_SHADOWED( + base_ + KEYMGR_CONTROL_SHADOWED_REG_OFFSET, + { + {KEYMGR_CONTROL_SHADOWED_DEST_SEL_OFFSET, + KEYMGR_CONTROL_SHADOWED_DEST_SEL_VALUE_OTBN}, + {KEYMGR_CONTROL_SHADOWED_CDI_SEL_BIT, true}, + {KEYMGR_CONTROL_SHADOWED_OPERATION_OFFSET, + KEYMGR_CONTROL_SHADOWED_OPERATION_VALUE_GENERATE_HW_OUTPUT}, + }); + ExpectDiversificationWrite(test_diversification); + EXPECT_ABS_WRITE32(base_ + KEYMGR_START_REG_OFFSET, + { + {KEYMGR_START_EN_BIT, true}, + }); + ExpectWaitUntilDone(/*busy_cycles=*/2, + KEYMGR_OP_STATUS_STATUS_VALUE_DONE_ERROR); + EXPECT_ABS_READ32(base_ + KEYMGR_ERR_CODE_REG_OFFSET, err_code); + EXPECT_ABS_WRITE32(base_ + KEYMGR_ERR_CODE_REG_OFFSET, err_code); + + EXPECT_EQ(keymgr_generate_attestation_key_otbn(test_diversification), + kErrorKeymgrInternal); +} + +TEST_F(KeymgrTest, SideloadClearOtbn) { + ExpectIdleCheck(KEYMGR_OP_STATUS_STATUS_VALUE_IDLE); + EXPECT_ABS_WRITE32(base_ + KEYMGR_SIDELOAD_CLEAR_REG_OFFSET, + { + {KEYMGR_SIDELOAD_CLEAR_VAL_OFFSET, + KEYMGR_SIDELOAD_CLEAR_VAL_VALUE_OTBN}, + }); + EXPECT_ABS_READ32(base_ + KEYMGR_SIDELOAD_CLEAR_REG_OFFSET, + { + {KEYMGR_SIDELOAD_CLEAR_VAL_OFFSET, + KEYMGR_SIDELOAD_CLEAR_VAL_VALUE_OTBN}, + }); + EXPECT_ABS_WRITE32(base_ + KEYMGR_SIDELOAD_CLEAR_REG_OFFSET, + { + {KEYMGR_SIDELOAD_CLEAR_VAL_OFFSET, + 
KEYMGR_SIDELOAD_CLEAR_VAL_VALUE_NONE}, + }); + + EXPECT_EQ(keymgr_sideload_clear_otbn(), kErrorOk); +} + +TEST_F(KeymgrTest, SideloadClearOtbnNotIdle) { + ExpectIdleCheck(KEYMGR_OP_STATUS_STATUS_VALUE_WIP); + EXPECT_EQ(keymgr_sideload_clear_otbn(), kErrorKeymgrInternal); + ExpectIdleCheck(KEYMGR_OP_STATUS_STATUS_VALUE_DONE_SUCCESS); + EXPECT_EQ(keymgr_sideload_clear_otbn(), kErrorKeymgrInternal); + ExpectIdleCheck(KEYMGR_OP_STATUS_STATUS_VALUE_DONE_ERROR); + EXPECT_EQ(keymgr_sideload_clear_otbn(), kErrorKeymgrInternal); +} + +TEST_F(KeymgrTest, SideloadClearOtbnReadbackMismatch) { + ExpectIdleCheck(KEYMGR_OP_STATUS_STATUS_VALUE_IDLE); + EXPECT_ABS_WRITE32(base_ + KEYMGR_SIDELOAD_CLEAR_REG_OFFSET, + { + {KEYMGR_SIDELOAD_CLEAR_VAL_OFFSET, + KEYMGR_SIDELOAD_CLEAR_VAL_VALUE_OTBN}, + }); + + // Readback does not match the value written. + EXPECT_ABS_READ32(base_ + KEYMGR_SIDELOAD_CLEAR_REG_OFFSET, + { + {KEYMGR_SIDELOAD_CLEAR_VAL_OFFSET, + KEYMGR_SIDELOAD_CLEAR_VAL_VALUE_AES}, + }); + + EXPECT_EQ(keymgr_sideload_clear_otbn(), kErrorKeymgrInternal); +} + } // namespace } // namespace keymgr_unittest diff --git a/sw/device/silicon_creator/lib/drivers/otbn.h b/sw/device/silicon_creator/lib/drivers/otbn.h index fab051a2a1f4c..f76f27dc8192d 100644 --- a/sw/device/silicon_creator/lib/drivers/otbn.h +++ b/sw/device/silicon_creator/lib/drivers/otbn.h @@ -210,7 +210,7 @@ typedef struct otbn_app { ((uint32_t)OTBN_SYMBOL_ADDR(app_name, symbol_name)) /** - * (Re-)loads the RSA application into OTBN. + * (Re-)loads an application into OTBN. * * Load the application image with both instruction and data segments into * OTBN. 
diff --git a/sw/lib/sw/device/silicon_creator/BUILD b/sw/lib/sw/device/silicon_creator/BUILD index ae85b613a7213..42bf8db1b991d 100644 --- a/sw/lib/sw/device/silicon_creator/BUILD +++ b/sw/lib/sw/device/silicon_creator/BUILD @@ -140,6 +140,29 @@ dual_cc_library( ), ) +cc_library( + name = "attestation", + hdrs = ["attestation.h"], +) + +cc_library( + name = "otbn_boot_services", + srcs = ["otbn_boot_services.c"], + hdrs = ["otbn_boot_services.h"], + # This target uses OTBN pointers internally, so it cannot work host-side. + target_compatible_with = [OPENTITAN_CPU], + deps = [ + ":attestation", + "//sw/device/silicon_creator/lib/drivers:hmac", + "//sw/device/silicon_creator/lib/drivers:keymgr", + "//sw/device/silicon_creator/lib/drivers:otbn", + "//sw/lib/sw/device/base:macros", + "//sw/lib/sw/device/silicon_creator:error", + "//sw/lib/sw/device/silicon_creator/base:sec_mmio", + "//sw/lib/sw/device/silicon_creator/sigverify:rsa_key", + ], +) + exports_files([ "boot_data.h", "boot_data.c", diff --git a/sw/lib/sw/device/silicon_creator/attestation.h b/sw/lib/sw/device/silicon_creator/attestation.h new file mode 100644 index 0000000000000..720161d2f8212 --- /dev/null +++ b/sw/lib/sw/device/silicon_creator/attestation.h @@ -0,0 +1,86 @@ +// Copyright lowRISC contributors. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef OPENTITAN_SW_LIB_SW_DEVICE_SILICON_CREATOR_ATTESTATION_H_ +#define OPENTITAN_SW_LIB_SW_DEVICE_SILICON_CREATOR_ATTESTATION_H_ + +#include "sw/lib/sw/device/silicon_creator/error.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +enum { + /** + * Size of the additional seed for attestation key generation in bits. + */ + kAttestationSeedBits = 320, + /** + * Size of the additional seed for attestation key generation in bytes. + */ + kAttestationSeedBytes = kAttestationSeedBits / 8, + /** + * Size of the additional seed for attestation key generation in 32b words. 
+ */ + kAttestationSeedWords = kAttestationSeedBytes / sizeof(uint32_t), + /** + * Size of a coordinate for an attestation public key in bits. + */ + kAttestationPublicKeyCoordBits = 256, + /** + * Size of a coordinate for an attestation public key in bytes. + */ + kAttestationPublicKeyCoordBytes = kAttestationPublicKeyCoordBits / 8, + /** + * Size of a coordinate for an attestation public key in 32b words. + */ + kAttestationPublicKeyCoordWords = + kAttestationPublicKeyCoordBytes / sizeof(uint32_t), + /** + * Size of an attestation signature in bits. + */ + kAttestationSignatureBits = 512, + /** + * Size of an attestation signature in bytes. + */ + kAttestationSignatureBytes = kAttestationSignatureBits / 8, + /** + * Size of an attestation signature in 32b words. + */ + kAttestationSignatureWords = kAttestationSignatureBytes / sizeof(uint32_t), +}; + +/** + * Holds an additional seed for use in attestation key generation. + */ +typedef struct attestation_seed { + uint32_t seed[kAttestationSeedWords]; +} attestation_seed_t; + +/** + * Holds an attestation public key (ECDSA-P256). + */ +typedef struct attestation_public_key { + /** + * Affine x-coordinate of the point. + */ + uint32_t x[kAttestationPublicKeyCoordWords]; + /** + * Affine y-coordinate of the point. + */ + uint32_t y[kAttestationPublicKeyCoordWords]; +} attestation_public_key_t; + +/** + * Holds an attestation signature (ECDSA-P256). + */ +typedef struct attestation_signature { + uint32_t sig[kAttestationSignatureWords]; +} attestation_signature_t; + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // OPENTITAN_SW_LIB_SW_DEVICE_SILICON_CREATOR_ATTESTATION_H_ diff --git a/sw/lib/sw/device/silicon_creator/otbn_boot_services.c b/sw/lib/sw/device/silicon_creator/otbn_boot_services.c new file mode 100644 index 0000000000000..210fe11f83952 --- /dev/null +++ b/sw/lib/sw/device/silicon_creator/otbn_boot_services.c @@ -0,0 +1,190 @@ +// Copyright lowRISC contributors. 
+// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include "sw/lib/sw/device/silicon_creator/otbn_boot_services.h" + +#include "sw/device/silicon_creator/lib/drivers/hmac.h" +#include "sw/device/silicon_creator/lib/drivers/keymgr.h" +#include "sw/device/silicon_creator/lib/drivers/otbn.h" +#include "sw/lib/sw/device/silicon_creator/attestation.h" +#include "sw/lib/sw/device/silicon_creator/base/sec_mmio.h" + +OTBN_DECLARE_APP_SYMBOLS(boot); // The OTBN boot-services app. +OTBN_DECLARE_SYMBOL_ADDR(boot, mode); // Application mode. +OTBN_DECLARE_SYMBOL_ADDR(boot, rsa_mod); // RSA modulus. +OTBN_DECLARE_SYMBOL_ADDR(boot, rsa_m0inv); // RSA Montgomery constant. +OTBN_DECLARE_SYMBOL_ADDR(boot, rsa_inout); // RSA input/output buffer. +OTBN_DECLARE_SYMBOL_ADDR(boot, msg); // ECDSA message digest. +OTBN_DECLARE_SYMBOL_ADDR(boot, x); // ECDSA public key x-coordinate. +OTBN_DECLARE_SYMBOL_ADDR(boot, y); // ECDSA public key y-coordinate. +OTBN_DECLARE_SYMBOL_ADDR(boot, r); // ECDSA signature component r. +OTBN_DECLARE_SYMBOL_ADDR(boot, s); // ECDSA signature component s. +OTBN_DECLARE_SYMBOL_ADDR( + boot, attestation_additional_seed); // Additional seed for ECDSA keygen. 
+ +static const otbn_app_t kOtbnAppBoot = OTBN_APP_T_INIT(boot); +static const otbn_addr_t kOtbnVarBootMode = OTBN_ADDR_T_INIT(boot, mode); +static const otbn_addr_t kOtbnVarBootRsaMod = OTBN_ADDR_T_INIT(boot, rsa_mod); +static const otbn_addr_t kOtbnVarBootRsaM0inv = + OTBN_ADDR_T_INIT(boot, rsa_m0inv); +static const otbn_addr_t kOtbnVarBootRsaInout = + OTBN_ADDR_T_INIT(boot, rsa_inout); +static const otbn_addr_t kOtbnVarBootMsg = OTBN_ADDR_T_INIT(boot, msg); +static const otbn_addr_t kOtbnVarBootX = OTBN_ADDR_T_INIT(boot, x); +static const otbn_addr_t kOtbnVarBootY = OTBN_ADDR_T_INIT(boot, y); +static const otbn_addr_t kOtbnVarBootR = OTBN_ADDR_T_INIT(boot, r); +static const otbn_addr_t kOtbnVarBootS = OTBN_ADDR_T_INIT(boot, s); +static const otbn_addr_t kOtbnVarBootAttestationAdditionalSeed = + OTBN_ADDR_T_INIT(boot, attestation_additional_seed); + +enum { + /* + * Mode is represented by a single word. + */ + kOtbnBootModeWords = 1, + /* + * Mode to run RSA modular exponentiation. + * + * Value taken from `boot.s`. + */ + kOtbnBootModeSecBootModexp = 0x7d3, + /* + * Mode to generate an attestation keypair. + * + * Value taken from `boot.s`. + */ + kOtbnBootModeAttestationKeygen = 0x2bf, + /* + * Mode to endorse a message with a saved private key. + * + * Value taken from `boot.s`. + */ + kOtbnBootModeAttestationEndorse = 0x5e8, + /* + * Mode to save an attestation private key. + * + * Value taken from `boot.s`. + */ + kOtbnBootModeAttestationKeySave = 0x64d, +}; + +rom_error_t otbn_boot_app_load(void) { return otbn_load_app(kOtbnAppBoot); } + +rom_error_t otbn_boot_attestation_keygen( + const attestation_seed_t *additional_seed, + keymgr_diversification_t diversification, + attestation_public_key_t *public_key) { + // Trigger key manager to sideload the attestation key into OTBN. + HARDENED_RETURN_IF_ERROR( + keymgr_generate_attestation_key_otbn(diversification)); + + // Write the mode. 
+ uint32_t mode = kOtbnBootModeAttestationKeygen; + HARDENED_RETURN_IF_ERROR( + otbn_dmem_write(kOtbnBootModeWords, &mode, kOtbnVarBootMode)); + + // Write the additional seed. + HARDENED_RETURN_IF_ERROR( + otbn_dmem_write(kAttestationSeedWords, additional_seed->seed, + kOtbnVarBootAttestationAdditionalSeed)); + + // Run the OTBN program (blocks until OTBN is done). + HARDENED_RETURN_IF_ERROR(otbn_execute()); + SEC_MMIO_WRITE_INCREMENT(kOtbnSecMmioExecute); + + // TODO(#20023): Check the instruction count register (see `mod_exp_otbn`). + + // Retrieve the public key. + HARDENED_RETURN_IF_ERROR(otbn_dmem_read(kAttestationPublicKeyCoordWords, + kOtbnVarBootX, public_key->x)); + HARDENED_RETURN_IF_ERROR(otbn_dmem_read(kAttestationPublicKeyCoordWords, + kOtbnVarBootY, public_key->y)); + + return kErrorOk; +} + +rom_error_t otbn_boot_attestation_key_save( + const attestation_seed_t *additional_seed, + keymgr_diversification_t diversification) { + // Trigger key manager to sideload the attestation key into OTBN. + HARDENED_RETURN_IF_ERROR( + keymgr_generate_attestation_key_otbn(diversification)); + + // Write the mode. + uint32_t mode = kOtbnBootModeAttestationKeySave; + HARDENED_RETURN_IF_ERROR( + otbn_dmem_write(kOtbnBootModeWords, &mode, kOtbnVarBootMode)); + + // Write the additional seed. + HARDENED_RETURN_IF_ERROR( + otbn_dmem_write(kAttestationSeedWords, additional_seed->seed, + kOtbnVarBootAttestationAdditionalSeed)); + + // Run the OTBN program (blocks until OTBN is done). + HARDENED_RETURN_IF_ERROR(otbn_execute()); + SEC_MMIO_WRITE_INCREMENT(kOtbnSecMmioExecute); + + // TODO(#20023): Check the instruction count register (see `mod_exp_otbn`). + + return kErrorOk; +} + +rom_error_t otbn_boot_attestation_key_clear(void) { + return otbn_dmem_sec_wipe(); +} + +rom_error_t otbn_boot_attestation_endorse(const hmac_digest_t *digest, + attestation_signature_t *sig) { + // Write the mode. 
+ uint32_t mode = kOtbnBootModeAttestationEndorse; + HARDENED_RETURN_IF_ERROR( + otbn_dmem_write(kOtbnBootModeWords, &mode, kOtbnVarBootMode)); + + // Write the message digest. + HARDENED_RETURN_IF_ERROR( + otbn_dmem_write(kHmacDigestNumWords, digest->digest, kOtbnVarBootMsg)); + + // Run the OTBN program (blocks until OTBN is done). + HARDENED_RETURN_IF_ERROR(otbn_execute()); + SEC_MMIO_WRITE_INCREMENT(kOtbnSecMmioExecute); + + // TODO(#20023): Check the instruction count register (see `mod_exp_otbn`). + + // Retrieve the signature (in two parts, r and s). + size_t half_num_words = kAttestationSignatureWords / 2; + uint32_t *r_dest = sig->sig; + uint32_t *s_dest = &sig->sig[half_num_words]; + HARDENED_RETURN_IF_ERROR( + otbn_dmem_read(half_num_words, kOtbnVarBootR, r_dest)); + HARDENED_RETURN_IF_ERROR( + otbn_dmem_read(half_num_words, kOtbnVarBootS, s_dest)); + + return kErrorOk; +} + +rom_error_t otbn_boot_sigverify_mod_exp(const sigverify_rsa_key_t *key, + const sigverify_rsa_buffer_t *sig, + sigverify_rsa_buffer_t *result) { + // Set the modulus (n). + HARDENED_RETURN_IF_ERROR( + otbn_dmem_write(kSigVerifyRsaNumWords, key->n.data, kOtbnVarBootRsaMod)); + + // Set the encoded message. + HARDENED_RETURN_IF_ERROR( + otbn_dmem_write(kSigVerifyRsaNumWords, sig->data, kOtbnVarBootRsaInout)); + + // Set the precomputed constant m0_inv. + HARDENED_RETURN_IF_ERROR(otbn_dmem_write(kOtbnWideWordNumWords, key->n0_inv, + kOtbnVarBootRsaM0inv)); + + // Start the OTBN routine. + HARDENED_RETURN_IF_ERROR(otbn_execute()); + SEC_MMIO_WRITE_INCREMENT(kOtbnSecMmioExecute); + + // TODO(#20023): Check the instruction count register (see `mod_exp_otbn`). + + // Read recovered message out of OTBN dmem. 
+ return otbn_dmem_read(kSigVerifyRsaNumWords, kOtbnVarBootRsaInout, + result->data); +} diff --git a/sw/lib/sw/device/silicon_creator/otbn_boot_services.h b/sw/lib/sw/device/silicon_creator/otbn_boot_services.h new file mode 100644 index 0000000000000..d5e66759a701c --- /dev/null +++ b/sw/lib/sw/device/silicon_creator/otbn_boot_services.h @@ -0,0 +1,138 @@ +// Copyright lowRISC contributors. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef OPENTITAN_SW_LIB_SW_DEVICE_SILICON_CREATOR_OTBN_BOOT_SERVICES_H_ +#define OPENTITAN_SW_LIB_SW_DEVICE_SILICON_CREATOR_OTBN_BOOT_SERVICES_H_ + +#include +#include + +#include "sw/device/silicon_creator/lib/drivers/hmac.h" +#include "sw/device/silicon_creator/lib/drivers/keymgr.h" +#include "sw/lib/sw/device/silicon_creator/attestation.h" +#include "sw/lib/sw/device/silicon_creator/sigverify/rsa_key.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/** + * Loads the OTBN boot-services application. + * + * Loads the OTBN program that runs attestation and code-signature + * verification. The program can later be cleared by wiping OTBN's IMEM and + * DMEM, or by loading a different OTBN application. + * + * @return The result of the operation. + */ +OT_WARN_UNUSED_RESULT +rom_error_t otbn_boot_app_load(void); + +/** + * Generate an attestation public key from a keymgr-derived secret. + * + * This routine triggers the key manager to sideload key material into OTBN, + * and also takes in an extra seed to XOR with the key material. The final + * private key is: + * d = (additional_seed ^ keymgr_seed) mod n + * ...where n is the P256 curve order. The public key is d*G, where G is the + * P256 base point. + * + * The `additional_seed` is expected to be the output from a specially seeded + * DRBG. It must be fully independent from the key manager seed. 
+ * + * Expects the OTBN boot-services program to already be loaded; see + * `otbn_boot_app_load`. + * + * @param additional_seed Seed material from DRBG. + * @param diversification Salt and version information for key manager. + * @param[out] public_key Attestation public key. + * @return The result of the operation. + */ +OT_WARN_UNUSED_RESULT +rom_error_t otbn_boot_attestation_keygen( + const attestation_seed_t *additional_seed, + keymgr_diversification_t diversification, + attestation_public_key_t *public_key); + +/** + * Saves an attestation private key to OTBN's scratchpad. + * + * This routine takes the same arguments as `otbn_boot_attestation_keygen`, but + * instead of computing the public key, it computes only the private key and + * saves it to OTBN's scratchpad memory. + * + * Expects the OTBN boot-services program to already be loaded; see + * `otbn_boot_app_load`. + * + * @param additional_seed Seed material from DRBG. + * @param diversification Salt and version information for key manager. + * @return The result of the operation. + */ +OT_WARN_UNUSED_RESULT +rom_error_t otbn_boot_attestation_key_save( + const attestation_seed_t *additional_seed, + keymgr_diversification_t diversification); + +/** + * Clears any saved attestation key from OTBN's scratchpad. + * + * This routine clears OTBN's DMEM. If called after + * `otbn_boot_attestation_key_save`, it will clear the saved key. + * + * @return The result of the operation. + */ +OT_WARN_UNUSED_RESULT +rom_error_t otbn_boot_attestation_key_clear(void); + +/** + * Signs the message with the saved attestation key, and clears the key. + * + * Must be called when there is a saved attestation key in OTBN's scratchpad; + * use `otbn_boot_attestation_key_save` to store one. + * + * The intended purpose of this function is to sign the current stage's + * attestation certificate with the private key of the previous stage. The + * caller should hash the certificate with SHA-256 before calling this + * function. 
+ * + * Expects the OTBN boot-services program to already be loaded; see + * `otbn_boot_app_load`. + * + * @param digest Digest to sign. + * @param[out] sig Resulting signature. + * @return The result of the operation. + */ +OT_WARN_UNUSED_RESULT +rom_error_t otbn_boot_attestation_endorse(const hmac_digest_t *digest, + attestation_signature_t *sig); + +/** + * Computes the modular exponentiation of an RSA signature on OTBN. + * + * Given an RSA public key and sig, this function computes sig^e mod n using + * Montgomery multiplication, where + * - sig is an RSA signature, + * - e and n are the exponent and the modulus of the key, respectively. + * + * The key exponent is always 65537; no other exponents are supported. + * + * Expects the OTBN boot-services program to already be loaded; see + * `otbn_boot_app_load`. + * + * @param key An RSA public key. + * @param sig Buffer that holds the signature, little-endian. + * @param[out] result Buffer to write the result to, little-endian. + * @return The result of the operation. 
+ */ +OT_WARN_UNUSED_RESULT +rom_error_t otbn_boot_sigverify_mod_exp(const sigverify_rsa_key_t *key, + const sigverify_rsa_buffer_t *sig, + sigverify_rsa_buffer_t *result); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // OPENTITAN_SW_LIB_SW_DEVICE_SILICON_CREATOR_OTBN_BOOT_SERVICES_H_ diff --git a/sw/otbn/crypto/BUILD b/sw/otbn/crypto/BUILD index aded20c8ec6d5..70783f9aeddff 100644 --- a/sw/otbn/crypto/BUILD +++ b/sw/otbn/crypto/BUILD @@ -6,6 +6,19 @@ load("//rules:otbn.bzl", "otbn_binary", "otbn_library") package(default_visibility = ["//visibility:public"]) +otbn_binary( + name = "boot", + srcs = [ + "boot.s", + ], + deps = [ + ":p256_base", + ":p256_sign", + ":rsa_verify_3072", + ":rsa_verify_3072_rr", + ], +) + otbn_library( name = "ed25519", srcs = [ @@ -70,9 +83,37 @@ otbn_library( ) otbn_library( - name = "p256", + name = "p256_shared_key", + srcs = [ + "p256_shared_key.s", + ], +) + +otbn_library( + name = "p256_base", srcs = [ - "p256.s", + "p256_base.s", + ], +) + +otbn_library( + name = "p256_isoncurve", + srcs = [ + "p256_isoncurve.s", + ], +) + +otbn_library( + name = "p256_sign", + srcs = [ + "p256_sign.s", + ], +) + +otbn_library( + name = "p256_verify", + srcs = [ + "p256_verify.s", ], ) @@ -82,7 +123,9 @@ otbn_binary( "p256_ecdh.s", ], deps = [ - ":p256", + ":p256_base", + ":p256_isoncurve", + ":p256_shared_key", ], ) @@ -92,7 +135,10 @@ otbn_binary( "p256_ecdsa.s", ], deps = [ - ":p256", + ":p256_base", + ":p256_isoncurve", + ":p256_sign", + ":p256_verify", ], ) @@ -103,6 +149,20 @@ otbn_library( ], ) +otbn_library( + name = "p384_a2b", + srcs = [ + "p384_a2b.s", + ], +) + +otbn_library( + name = "p384_isoncurve", + srcs = [ + "p384_isoncurve.s", + ], +) + otbn_library( name = "p384_sign", srcs = [ @@ -110,6 +170,41 @@ otbn_library( ], ) +otbn_library( + name = "p384_internal_mult", + srcs = [ + "p384_internal_mult.s", + ], +) + +otbn_library( + name = "p384_keygen", + srcs = [ + "p384_keygen.s", + ], +) + +otbn_library( 
+ name = "p384_base_mult", + srcs = [ + "p384_base_mult.s", + ], +) + +otbn_library( + name = "p384_modinv", + srcs = [ + "p384_modinv.s", + ], +) + +otbn_library( + name = "p384_scalar_mult", + srcs = [ + "p384_scalar_mult.s", + ], +) + otbn_library( name = "p384_verify", srcs = [ @@ -226,7 +321,10 @@ otbn_binary( "p256_ecdsa_sca.s", ], deps = [ - ":p256", + ":p256_base", + ":p256_isoncurve", + ":p256_sign", + ":p256_verify", ], ) @@ -236,7 +334,7 @@ otbn_binary( "p256_key_from_seed_sca.s", ], deps = [ - ":p256", + ":p256_base", ], ) @@ -246,7 +344,7 @@ otbn_binary( "p256_mod_inv_sca.s", ], deps = [ - ":p256", + ":p256_base", ], ) @@ -257,10 +355,80 @@ otbn_binary( ], deps = [ ":p384_base", + ":p384_internal_mult", + ":p384_modinv", + ":p384_sign", + ], +) + +otbn_binary( + name = "p384_curve_point_valid", + srcs = [ + "p384_curve_point_valid.s", + ], + deps = [ + ":p384_base", + ":p384_isoncurve", + ], +) + +otbn_binary( + name = "p384_ecdh", + srcs = [ + "p384_ecdh.s", + ], + deps = [ + ":p384_a2b", + ":p384_base", + ":p384_base_mult", + ":p384_internal_mult", + ":p384_keygen", + ":p384_scalar_mult", + ], +) + +otbn_binary( + name = "p384_ecdsa_keygen", + srcs = [ + "p384_ecdsa_keygen.s", + ], + deps = [ + ":p384_base", + ":p384_base_mult", + ":p384_internal_mult", + ":p384_keygen", + ], +) + +otbn_binary( + name = "p384_ecdsa_sign", + srcs = [ + "p384_ecdsa_sign.s", + ], + deps = [ + ":p384_base", + ":p384_base_mult", + ":p384_internal_mult", + ":p384_keygen", + ":p384_modinv", ":p384_sign", ], ) +otbn_binary( + name = "p384_ecdsa_verify", + srcs = [ + "p384_ecdsa_verify.s", + ], + deps = [ + ":p384_base", + ":p384_base_mult", + ":p384_internal_mult", + ":p384_modinv", + ":p384_verify", + ], +) + otbn_library( name = "sha256", srcs = [ diff --git a/sw/otbn/crypto/boot.s b/sw/otbn/crypto/boot.s new file mode 100644 index 0000000000000..25f6200010281 --- /dev/null +++ b/sw/otbn/crypto/boot.s @@ -0,0 +1,374 @@ +/* Copyright lowRISC contributors. 
*/ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/** + * Unified boot-services OTBN program. + * + * During the boot process, this program should remain loaded. This binary has + * the following modes: + * 1. MODE_SEC_BOOT_MODEXP: RSA-3072 modexp (to verify a code signature). + * 2. MODE_ATTESTATION_KEYGEN: Derive a new attestation keypair (ECDSA-P256). + * 3. MODE_ATTESTATION_ENDORSE: Sign with a saved attestation signing key. + * 4. MODE_ATTESTATION_KEY_SAVE: Save an attestation signing key. + * + * Ibex will run `MODE_SEC_BOOT_MODEXP` as part of checking the code + * signature of the next boot stage. This mode doesn't interact or interfere + * with any other modes, and can be called at any point. + * + * The attestation modes are more entangled with each other. Part of the + * purpose of this program is to store the attestation key of a particular key + * manager stage long enough to sign the public key of the next stage, without + * rebooting. At each key manager stage, Ibex should: + * - Call `MODE_ATTESTATION_KEYGEN` to get the current public key + * - Construct the attestation certificate for the current stage, including + * the public key + * - Call `MODE_ATTESTATION_ENDORSE` to sign the certificate with the stored + * signing key from the *previous stage* and clear the key + * - Call `MODE_ATTESTATION_KEY_SAVE` to save the current stage's signing + * key, which will later endorse the next stage's certificate + * + * Of course, in the first stage there is no previous stage signing key and no + * certificate, so Ibex should skip the `MODE_ATTESTATION_ENDORSE` step. Ibex + * may clear IMEM/DMEM if it needs to run a different OTBN routine (e.g. + * signature verification for ownership transfer), but doing so will wipe any + * saved keys. This binary is designed so that it should not need to be + * cleared and re-loaded on a normal boot. 
+ * + * The attestation keys are derived from a key manager seed value, which is + * XORed with output from a specially seeded DRBG in order to satisfy the FIPS + * 186-5 requirement that the seed comes from a DRBG (other FIPS documents say + * it is permissible to XOR DRBG output with implementation-specific values, so + * the key manager seed is effectively ignored for FIPS compliance). The saved + * signing key is stored in OTBN's scratchpad memory, which is not accessible + * to Ibex over the bus. + */ + +/** + * Mode magic values, generated with + * $ ./util/design/sparse-fsm-encode.py -d 6 -m 4 -n 11 --avoid-zero -s 3357382482 + * + * Call the same utility with the same arguments and a higher -m to generate + * additional value(s) without changing the others or sacrificing mutual HD. + * + * TODO(#17727): in some places the OTBN assembler support for .equ directives + * is lacking, so they cannot be used in bignum instructions or pseudo-ops such + * as `li`. If support is added, we could use 32-bit values here instead of + * 11-bit. + */ +.equ MODE_SEC_BOOT_MODEXP, 0x7d3 +.equ MODE_ATTESTATION_KEYGEN, 0x2bf +.equ MODE_ATTESTATION_ENDORSE, 0x5e8 +.equ MODE_ATTESTATION_KEY_SAVE, 0x64d + +.section .text.start +start: + /* Read the mode and tail-call the requested operation. */ + la x2, mode + lw x2, 0(x2) + + addi x3, x0, MODE_SEC_BOOT_MODEXP + beq x2, x3, sec_boot_modexp + + addi x3, x0, MODE_ATTESTATION_KEYGEN + beq x2, x3, attestation_keygen + + addi x3, x0, MODE_ATTESTATION_ENDORSE + beq x2, x3, attestation_endorse + + addi x3, x0, MODE_ATTESTATION_KEY_SAVE + beq x2, x3, attestation_key_save + + /* Invalid mode; fail. */ + unimp + unimp + unimp + +/** + * RSA-3072 modular exponentiation. + * + * Computes msg = (sig^65537) mod M, where + * sig is the signature + * M is the public key modulus + * + * Uses the specialized RSA-3072 OTBN modexp implementation to recover an + * encoded message from an input signature. 
Ibex needs to check that the + * encoded message matches the encoding of the expected message to complete + * signature verification. + * + * Assumes that the Montgomery constant m0_inv is provided, but computes the RR + * constant on the fly. The only exponent supported is e=65537. + * + * @param[in] dmem[rsa_mod]: Modulus of the RSA public key + * @param[in] dmem[rsa_inout]: Signature to check against + * @param[in] dmem[rsa_m0inv]: Montgomery constant (-(M^-1)) mod 2^256 + * @param[out] dmem[rsa_inout]: Recovered message digest + */ +sec_boot_modexp: + /* Compute R^2 (same for both exponents): dmem[rr] <= R^2 */ + jal x1, compute_rr + + /* Set pointers to buffers for modexp (x17 must name the rsa_m0inv input buffer from .bss). */ + la x24, rsa_inout + la x16, rsa_mod + la x23, rsa_inout + la x26, rr + la x17, rsa_m0inv + + /* run modular exponentiation */ + jal x1, modexp_var_3072_f4 + + ecall + +/** + * Generate an attestation keypair from a sideloaded seed. + * + * Takes two input seeds, one from the key manager in the key-sideload slots + * and one from DMEM that is expected to be the output of a DRBG and fully + * independent from the first. For both seeds, only the first 320 bits are used + * and the rest are ignored. + * + * @param[in] dmem[attestation_additional_seed]: DRBG output. + * @param[out] dmem[x]: Public key x-coordinate. + * @param[out] dmem[y]: Public key y-coordinate. + */ +attestation_keygen: + /* Initialize all-zero register. */ + bn.xor w31, w31, w31 + + /* Generate secret key in shares. + w20, w21 <= d0 (first share of secret key) + w10, w11 <= d1 (second share of secret key) */ + jal x1, attestation_secret_key_from_seed + + /* Call scalar multiplication with base point. + R = (x_p, y_p, z_p) = (w8, w9, w10) <= d*G */ + la x21, p256_gx + la x22, p256_gy + jal x1, scalar_mult_int + + /* Convert masked result back to affine coordinates. + R = (x_a, y_a) = (w11, w12) */ + jal x1, proj_to_affine + + /* Store public key in DMEM. 
+ dmem[x] <= x_a = w11 + dmem[y] <= y_a = w12 */ + li x2, 11 + la x21, x + bn.sid x2++, 0(x21) + la x22, y + bn.sid x2, 0(x22) + + ecall + +/** + * Sign a message using the saved signing key from the scratchpad. + * + * Clears the saved key after use, so only one signature is possible with a + * saved key. + * + * @param[in] dmem[msg]: Message digest (256 bits) + * @param[in] dmem[d0]: First share of private key d (320 bits) + * @param[in] dmem[d1]: Second share of private key d (320 bits) + * @param[out] dmem[r]: Buffer for r component of signature (256 bits) + * @param[out] dmem[s]: Buffer for s component of signature (256 bits) + */ +attestation_endorse: + /* Generate a fresh random scalar for signing. + dmem[k0] <= first share of k + dmem[k1] <= second share of k */ + jal x1, p256_generate_k + + /* Generate the signature. + dmem[r], dmem[s] <= signature */ + jal x1, p256_sign + + /* Clear the saved key by overwriting with random data. + dmem[d0], dmem[d1] <= RND */ + li x20, 20 + la x2, d0 + bn.wsrr w20, RND + bn.sid x20, 0(x2++) + bn.wsrr w20, RND + bn.sid x20, 0(x2) + la x2, d1 + bn.wsrr w20, RND + bn.sid x20, 0(x2++) + bn.wsrr w20, RND + bn.sid x20, 0(x2) + + ecall + +/** + * Save an attestation signing key to the scratchpad. + * + * @param[in] dmem[attestation_additional_seed]: DRBG output. + * @param[out] dmem[d0]: First share of private key (320 bits). + * @param[out] dmem[d1]: Second share of private key (320 bits). + */ +attestation_key_save: + /* Initialize all-zero register. */ + bn.xor w31, w31, w31 + + /* Generate secret key in shares. + w20, w21 <= d0 (first share of secret key) + w10, w11 <= d1 (second share of secret key) */ + jal x1, attestation_secret_key_from_seed + + /* Store secret key in DMEM. 
+ dmem[d0] <= w20, w21 = d0 + dmem[d1] <= w10, w11 = d1 */ + li x2, 20 + la x3, d0 + bn.sid x2++, 0(x3) + bn.sid x2, 32(x3) + li x2, 10 + la x3, d1 + bn.sid x2++, 0(x3) + bn.sid x2, 32(x3) + + ecall + +/** + * Generate an attestation secret key from a sideloaded seed. + * + * Takes two input seeds, one from the key manager in the key-sideload slots + * and one from DMEM that is expected to be the output of a DRBG and fully + * independent from the first. For both seeds, only the first 320 bits are used + * and the rest are ignored. + * + * Returns the key in two 320-bit shares d0 and d1, such that the secret key d + * = (d0 + d1) mod n. + * + * @param[in] w31: all-zero + * @param[in] dmem[attestation_additional_seed]: DRBG output seed + * @param[out] w20: Lower 256 bits of first share of secret key (d0) + * @param[out] w21: Upper 64 bits of first share of secret key (d0) + * @param[out] w10: Lower 256 bits of second share of secret key (d1) + * @param[out] w11: Upper 64 bits of second share of secret key (d1) + * + * clobbered registers: x2, x3, x20, w1 to w4, w10, w11, w20 to w29 + * clobbered flag groups: FG0 + */ +attestation_secret_key_from_seed: + /* Load keymgr seeds from WSRs. + w20,w21 <= seed0 + w10,w11 <= seed1 */ + bn.wsrr w20, KEY_S0_L + bn.wsrr w10, KEY_S1_L + bn.wsrr w21, KEY_S0_H + bn.wsrr w11, KEY_S1_H + + /* Load the additional DRBG seed from DMEM and XOR with one share of the + sideloaded seed (each word is loaded into w22/w23 before it is XORed). + w20, w21 <= seed0 ^ dmem[attestation_additional_seed] */ + la x2, attestation_additional_seed + li x3, 22 + bn.lid x3++, 0(x2) + bn.xor w20, w20, w22 + bn.lid x3, 32(x2) + bn.xor w21, w21, w23 + + /* Tail-call `p256_key_from_seed` to generate secret key shares. + w20, w21 <= d0 + w10, w11 <= d1 */ + jal x0, p256_key_from_seed + +.bss + +/* Operation mode. */ +.globl mode +.balign 4 +mode: +.zero 4 + +/* Input buffer for RSA-3072 modulus. 
 */ +.globl rsa_mod +.balign 32 +rsa_mod: +.zero 384 + +/* Input buffer for precomputed RSA-3072 Montgomery constant: + m0' = (-(M^-1)) mod 2^256. */ +.globl rsa_m0inv +.balign 32 +rsa_m0inv: +.zero 32 + +/* Input/output buffer for RSA-3072 modexp: + input: signature + output: recovered message = (signature ^ 65537) mod M */ +.globl rsa_inout +.balign 32 +rsa_inout: +.zero 384 + +/* Input buffer for an ECDSA-P256 message digest. */ +.globl msg +.balign 32 +msg: +.zero 32 + +/* Output buffer for the first part of an ECDSA-P256 signature. */ +.globl r +.balign 32 +r: +.zero 32 + +/* Output buffer for the second part of an ECDSA-P256 signature. */ +.globl s +.balign 32 +s: +.zero 32 + +/* ECDSA-P256 public key x-coordinate. */ +.globl x +.balign 32 +x: +.zero 32 + +/* ECDSA-P256 public key y-coordinate. */ +.globl y +.balign 32 +y: +.zero 32 + +/* DRBG output to XOR with key manager seed. */ +.globl attestation_additional_seed +.balign 32 +attestation_additional_seed: +.zero 64 + +.section .scratchpad + +/* First share of the saved attestation ECDSA-P256 private key (d). */ +.globl d0 +.balign 32 +d0: +.zero 64 + +/* Second share of the saved attestation ECDSA-P256 private key (d). */ +.globl d1 +.balign 32 +d1: +.zero 64 + +/* First share of the per-signature ECDSA-P256 secret scalar (k). */ +.globl k0 +.balign 32 +k0: +.zero 64 + +/* Second share of the per-signature ECDSA-P256 secret scalar (k). */ +.globl k1 +.balign 32 +k1: +.zero 64 + +/* Buffer for the squared Montgomery Radix RR = (2^3072)^2 mod M. + Populated by the RSA-3072 implementation. */ +.balign 32 +.globl rr +rr: +.zero 384 diff --git a/sw/otbn/crypto/div.s b/sw/otbn/crypto/div.s index ee9597b797840..ef9c94b0a8244 100644 --- a/sw/otbn/crypto/div.s +++ b/sw/otbn/crypto/div.s @@ -2,6 +2,9 @@ /* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ /* SPDX-License-Identifier: Apache-2.0 */ +/* Public interface. */ +.globl div + /** * Shift a bignum one bit to the right. 
* @@ -269,7 +272,6 @@ cond_sub_shifted: * clobbered registers: x2 to x5, x8, x23 to x25, w23 to w27 * clobbered flag groups: FG0 */ -.globl div div: /* Initialize quotient to zero. dmem[dptr_q..dptr_q+n*32] = 0 */ diff --git a/sw/otbn/crypto/ed25519_scalar.s b/sw/otbn/crypto/ed25519_scalar.s index c3747ce145f50..135ecff866fe1 100644 --- a/sw/otbn/crypto/ed25519_scalar.s +++ b/sw/otbn/crypto/ed25519_scalar.s @@ -35,7 +35,7 @@ sc_init: li x2, 14 la x3, ed25519_scalar_L bn.lid x2, 0(x3) - bn.wsrw 0x0, w14 + bn.wsrw MOD, w14 /* Load lower half of precomputed constant mu (260 bits). w14 <= mu mod 2^256 */ @@ -188,7 +188,7 @@ sc_reduce: /* Load L from the MOD register. w11 <= WSR[0x0] = MOD = L */ - bn.wsrr w11, 0x0 + bn.wsrr w11, MOD /* Compute the value r2 = (q3 * L) mod 2^256. Since q3 has 260 bits and L has 253, we use a 320x256-bit multiplication, but we stop after the lowest 256 diff --git a/sw/otbn/crypto/handwritten/rsa_verify_3072.s b/sw/otbn/crypto/handwritten/rsa_verify_3072.s index d907b418a7d32..97fdea07b9604 100644 --- a/sw/otbn/crypto/handwritten/rsa_verify_3072.s +++ b/sw/otbn/crypto/handwritten/rsa_verify_3072.s @@ -229,7 +229,7 @@ mont_loop: bn.movr x10++, x13 /* No subtracion if carry bit of addition of carry words not set. */ - csrrs x2, 0x7c1, x0 + csrrs x2, FG1, x0 andi x2, x2, 1 beq x2, x0, mont_loop_no_sub @@ -388,7 +388,7 @@ modexp_var_3072_f4: bn.lid x9, 0(x16++) bn.subb w2, w2, w3 bn.movr x17++, x11 - csrrs x2, 0x7c0, x0 + csrrs x2, FG0, x0 /* TODO: currently we subtract the modulus if out_buf == M. This should never happen in an RSA context. We could catch this and raise an alert. 
 */ diff --git a/sw/otbn/crypto/modexp.s b/sw/otbn/crypto/modexp.s index 233b620d345db..fec8b142fca2a 100644 --- a/sw/otbn/crypto/modexp.s +++ b/sw/otbn/crypto/modexp.s @@ -305,3 +305,151 @@ modexp_65537: jal x1, montmul_mul1 ret + +/** + * Constant time conditional bigint subtraction + * + * Returns C = A-x*B + * with A being a bigint of length 256..4096 bit + * B being a bigint of length 256..4096 bit + * C being a bigint of length 256..4096 bit + * x being a boolean value [0,1] + * + * Depending on state of FG1.C subtracts a bigint B located in dmem from + * another bigint A, located in the wide reg file and stores result C in dmem. + * + * Flags: When leaving this subroutine, flags of FG0 depend on a + * potentially discarded value and therefore are not usable after + * return. FG1 is not modified in this subroutine. + * + * @param[in] x16: dmem pointer to first limb of subtrahend (B) + * @param[in] x8: regfile pointer to first limb of minuend (input A) + * @param[in] x21: dmem pointer to first limb of result (C) + * @param[in] x30: N, number of limbs + * @param[in] FG1.C: subtraction condition, subtract if 1 (x) + * @param[in] x9: pointer to temp reg, must be set to 3 + * @param[in] x11: pointer to temp reg, must be set to 2 + * @param[in] FG0.C: needs to be set to 0 + * + * clobbered registers: x8, x16, x21, w2, w3 + * clobbered Flag Groups: FG0 + */ +cond_sub_to_dmem: + /* iterate over all limbs for conditional limb-wise subtraction */ + loop x30, 5 + /* load limb of subtrahend (input B): w3 = dmem[x16+i] */ + bn.lid x9, 0(x16++) + + /* move limb from bignum buffer to w2 */ + bn.movr x11, x8++ + + /* perform subtraction for a limb: w3 = w2 - w3 - borrow (FG0.C) */ + bn.subb w3, w2, w3 + + /* conditionally select subtraction result or unmodified limb */ + bn.sel w2, w3, w2, FG1.C + + /* store selection result in dmem */ + bn.sid x11, 0(x21++) + + ret + +/** + * Constant-time Montgomery modular multiply by one + * + * Returns: C = montmul(1,A) = A*R^(-1) mod M + * + * Routine for 
back-conversion from Montgomery domain. + * This implements the limb-by-limb interleaved Montgomery Modular + * Multiplication Algorithm, with one operand fixed to 1. This is only a + * wrapper around the main loop body. For algorithmic implementation details + * see the mont_loop subroutine. + * + * Flags: The states of both FG0 and FG1 depend on intermediate values and are + * not usable after return. + * + * @param[in] x16: dmem pointer to first limb of modulus M + * @param[in] x17: dptr_m0d, dmem pointer to Montgomery Constant m0' + * @param[in] x19: dmem pointer to first limb of operand A + * @param[in] x21: dmem pointer to first limb of result C + * @param[in] x30: N, number of limbs + * @param[in] x31: N-1, number of limbs minus one + * @param[in] x8: pointer to temp reg, must be set to 4 + * @param[in] x9: pointer to temp reg, must be set to 3 + * @param[in] x10: pointer to temp reg, must be set to 4 + * @param[in] x11: pointer to temp reg, must be set to 2 + * @param[in] w31: all-zero + * + * clobbered registers: x6, x7, x8, x12, x13, x21, x22, + * w2, w3, w4 to w[4+N-1], w24 to w30 + * clobbered Flag Groups: FG0, FG1 + */ +montmul_mul1: + /* load Montgomery constant: w3 = dmem[x17] = dmem[dptr_m0d] = m0' */ + bn.lid x9, 0(x17) + + /* init regfile bigint buffer with zeros */ + bn.mov w2, w31 + loop x30, 1 + bn.movr x10++, x11 + + /* w2=1 this is operand B */ + bn.xor w2, w2, w2 + bn.addi w2, w2, 1 + + /* save dmem pointers for operand A and modulus */ + addi x6, x16, 0 + addi x7, x19, 0 + + /* iterate over limbs of operand B */ + loop x30, 4 + + /* restore dmem pointers for operand A and modulus */ + addi x16, x6, 0 + addi x19, x7, 0 + + /* Main loop body of Montgomery Multiplication algorithm */ + /* 1[i]*A */ + jal x1, mont_loop + + /* all subsequent limbs of operand B are zero since B=1 */ + bn.mov w2, w31 + + /* restore dmem pointers for operand A and modulus */ + addi x16, x6, 0 + addi x19, x7, 0 + + /* zeroize w2 and clear flags */ + bn.sub w2, w2, 
w2, FG1 + + /* iterate over all limbs of bigint buffer for limbwise comparison of + buffer with the Modulus. After last loop cycle, FG1.C is set if bigint + in buffer is larger than Modulus */ + loop x30, 3 + + /* load limb of limb of Modulus to w3 */ + bn.lid x9, 0(x16++) + + /* load limb from bigint buffer to w2 */ + bn.movr x11, x8++ + + /* compare limb of flag with limb of Modulus */ + bn.cmpb w3, w2, FG1 + + /* restore pointers to bigint buffer in regfile */ + li x8, 4 + li x10, 4 + + /* restore dmem pointers for operand A and modulus */ + addi x16, x6, 0 + addi x19, x7, 0 + + /* conditionally subtract Modulus from buffer and store result in + dmem[x21] to dmem[x21+N] */ + jal x1, cond_sub_to_dmem + + /* restore dmem pointers for operand A and modulus */ + addi x16, x6, 0 + addi x19, x7, 0 + + ret diff --git a/sw/otbn/crypto/montmul.s b/sw/otbn/crypto/montmul.s index 56c0da011e6e5..6ae2cc26a2857 100644 --- a/sw/otbn/crypto/montmul.s +++ b/sw/otbn/crypto/montmul.s @@ -11,7 +11,7 @@ .text .globl modload .globl montmul -.globl montmul_mul1 +.globl mont_loop /** * Precomputation of a constant m0' for Montgomery modular arithmetic @@ -92,69 +92,82 @@ m0inv: ret /** - * Constant time conditional subtraction of modulus from a bigint + * Doubles a number and reduces modulo M in-place. * - * Returns C <= C-s*M - * with C being a bigint of length 256..4096 bit - * M being the modulus of length 256..4096 bit - * s being a boolean value [0,1] + * Returns: C = (A + A) mod M * - * Conditionally subtracts the modulus located in dmem from the bigint - * located in a buffer in the wide regfile (starting at w5). The subtracted - * value is selected when FG1.C equals 1, otherwise the unmodified value is - * selected. + * Requires that A < M < 2^(256*N). Writes output to the A buffer in DMEM. * - * Note that the interpretation of the subtrahend as a modulus is only - * contextual. In theory, it can be any bigint. 
However, the subtrahend is - * expected in dmem at a location that is reserved for the modulus according - * to the calling conventions within this library. + * This routine runs in constant time. * - * Flags: When leaving this subroutine, flags of FG0 depend on a - * potentially discarded value and therefore are not usable after - * return. - * FG1 is not modified in this subroutine. + * Flags: Flags have no meaning beyond the scope of this subroutine. * - * @param[in] x16: dptr_m, pointer to 1st limb of modulus M - * @param[in] x30: N, number of 256 bit limbs in modulus and bigint + * @param[in] x16: dmem pointer to first limb of modulus M + * @param[in] x30: N, number of limbs + * @param[in] [w4:w(4+N-1)]: operand A * @param[in] w31: all-zero - * @param[in] FG1.C: s, selection flag - * @param[out] [w[5+N-1]:w5]: new bigint value - * @param[in] FG0.C: needs to be set to 0 + * @param[out] [w4:w(4+N-1)]: result C * - * clobbered registers: x8, x10, x11, x16, w2, w3, w4, w5 to w[5+N-1] - * clobbered flag groups: FG0 + * clobbered registers: x2, x3, x8, x10 to x13 + * w2, w3, w4 to w(4+N-1), w24, w29, w30 + * clobbered Flag Groups: FG0, FG1 */ -cond_sub_mod: - - /* setup pointers */ - li x8, 5 - li x10, 3 - li x11, 2 - - /* reset flags for FG0 */ - bn.add w31, w31, w31 - - /* iterate over all limbs for limb-wise subtraction + conditional selection*/ +double_and_reduce: + /* Clear carry flags. */ + bn.sub w31, w31, w31 + bn.sub w31, w31, w31, FG1 + + /* Double the input and compare the sum to the modulus. 
+ [w4:w(4+N-1)] <= (A+A) mod 2^(256*N) + FG1.C <= (A+A-M) < 0 */ + li x2, 2 + li x3, 3 + li x10, 4 + addi x11, x16, 0 loop x30, 5 + /* w3 <= a[i] */ + bn.movr x3, x10 + /* FG0.C, w3 <= w3 + w3 + FG0.C */ + bn.addc w3, w3, w3 + /* w2 <= M[i] */ + bn.lid x2, 0(x11++) + /* FG1.C <= (w3 - M[i] - FG1.C) < 0 */ + bn.cmpb w3, w2, FG1 + /* w[4+i] <= w3 */ + bn.movr x10++, x3 + + /* Now, FG0.C is 1 if (A + A) >= 2^(256*N) and 0 otherwise, and FG1.C is 1 if + (A + A) mod 2^(256*N) < M. So we have the following cases: + 1) FG0.C is 0, FG1.C is 0 : A+A < 2^(256*N) and A + A >= M + 2) FG0.C is 0, FG1.C is 1 : A+A < 2^(256*N) and A + A < M + 3) FG0.C is 1, FG1.C is 0 : A+A >= 2^(256*N) and (A + A) mod 2^(256*N) >= M + 4) FG0.C is 1, FG1.C is 1 : A+A >= 2^(256*N) and (A + A) mod 2^(256*N) < M + + Case (3) is impossible given the bounds on A and M, because it would + require that A + A > 2^(256*N) + M. Case (2) is the only one in which we + don't need to subtract the modulus, since A + A < M. In cases (1) and (4) + we need to subtract the modulus. */ + + /* Clear FG0.C, and set FG1.C so that it is 1 if and only if FG0.C and FG1.C + match. + FG0.C <= 0 + FG1.C <= (FG0.C ^ FG1.C) R^2 mod M can be computed by performing N*w duplications of R. - We directly perform a modulo reduction in each step such that the - final result will already be reduced. */ - loop x24, 18 - /* reset pointer */ - li x8, 5 - - /* zeroize w3 reset flags of FG1 */ - bn.sub w3, w3, w3, FG1 - - /* Duplicate the intermediate bigint result. This can overflow such that - bit 2^(N*w) (represented by the carry bit after the final loop cycle) - is set. */ - loop x30, 3 - /* copy current limb of bigint to w2 */ - bn.movr x11, x8 - - /* perform the doubling */ - bn.addc w2, w2, w2, FG1 - - /* copy result back to bigint in regfile */ - bn.movr x8++, x11 - - /* Conditionally subtract the modulus from the current bigint Y if there - was an overflow. 
Again, just considering the lowest N*w bits is - sufficient, since (in case of an overflow) we can write - 2*Y as 2^(N*w) + X with M > X >= 0. - Then, 2*Y - M = 2^(N*w) + X - M = X + unsigned(0-M) */ - addi x16, x22, 0 - jal x1, cond_sub_mod - - /* reset pointer to 1st limb of bigint in regfile */ - li x8, 5 - - /* reset pointer to modulus in dmem */ - addi x16, x22, 0 - - /* reset flags of FG1 */ - bn.sub w3, w3, w3, FG1 - - /* compare intermediate bigint y with modulus - subtract modulus if Y > M */ - loop x30, 3 - bn.lid x10, 0(x16++) - bn.movr x11, x8++ - bn.cmpb w3, w2, FG1 - addi x16, x22, 0 - jal x1, cond_sub_mod - - li x0, 0 - - /* reset pointer to 1st limb of bigint in regfile */ - li x8, 5 - - /* reset pointer to modulus */ - addi x16, x22, 0 + /* Prepare a pointer to the w4 register for storing the result. */ + li x8, 4 - /* store computed RR in dmem */ - addi x3, x18, 0 - loop x30, 2 - bn.sid x8, 0(x3++) - addi x8, x8, 1 + /* Five montgomery squares to compute RR = (T^(2^5) * R) mod M. 
*/ + loopi 5,9 + /* [w4:w(4+N-1)] <= montmul(dmem[rr], dmem[rr]) */ + addi x19, x18, 0 + addi x20, x18, 0 + jal x1, montmul + /* Store result: dmem[rr] <= [w4:w(4+N-1)] */ + addi x2, x18, 0 + addi x3, x8, 0 + loop x30, 2 + bn.sid x3, 0(x2++) + addi x3, x3, 1 + nop ret @@ -368,7 +348,7 @@ mul256_w30xw2: * @param[in] x30: number of limbs * @param[in] FG0.C: needs to be set to 0 * - * clobbered registers: x8, x16, w24, w29, w30, w[x8] to w[x8+N-1] + * clobbered registers: x8, x12, x13, x16, w24, w29, w30, w[x8] to w[x8+N-1] * clobbered Flag Groups: FG0 */ cond_sub_to_reg: @@ -378,7 +358,7 @@ cond_sub_to_reg: li x13, 24 /* iterate over all limbs for conditional limb-wise subtraction */ - loop x30, 6 + loop x30, 5 /* load limb of subtrahend (input B) to w24 */ bn.lid x13, 0(x16++) @@ -388,8 +368,6 @@ cond_sub_to_reg: /* perform subtraction for a limb */ bn.subb w29, w30, w24 - bn.movr x8, x13 - /* conditionally select subtraction result or unmodified limb */ bn.sel w24, w29, w30, FG1.C @@ -567,156 +545,6 @@ mont_loop: ret -/** - * Constant time conditional bigint subtraction - * - * Returns C = A-x*B - * with A being a bigint of length 256..4096 bit - * B being a bigint of length 256..4096 bit - * C being a bigint of length 256..4096 bit - * x being a boolean value [0,1] - * - * Depending on state of FG1.C subtracts a bigint B located in dmem from - * another bigint A, located in the wide reg file and stores result C in dmem. - * - * Flags: When leaving this subroutine, flags of FG0 depend on a - * potentially discarded value and therefore are not usable after - * return. FG1 is not modified in this subroutine. 
- * - * @param[in] x16: dmem pointer to first limb of subtrahend (B) - * @param[in] x8: regfile pointer to first limb of minuend (input A) - * @param[in] x21: dmem pointer to first limb of result (C) - * @param[in] x30: N, number of limbs - * @param[in] FG1.C: subtraction condition, subtract if 1 (x) - * @param[in] x9: pointer to temp reg, must be set to 3 - * @param[in] x11: pointer to temp reg, must be set to 2 - * @param[in] FG0.C: needs to be set to 0 - * - * clobbered registers: x8, x16, x21, w2, w3 - * clobbered Flag Groups: FG0 - */ -cond_sub_to_dmem: - /* iterate over all limbs for conditional limb-wise subtraction */ - loop x30, 5 - /* load limb of subtrahend (input B): w3 = dmem[x16+i] */ - bn.lid x9, 0(x16++) - - /* move limb from bignum bufer to w2 */ - bn.movr x11, x8++ - - /* perform subtraction for a limb w3 = w2-1 */ - bn.subb w3, w2, w3 - - /* conditionally select subtraction result or unmodified limb */ - bn.sel w2, w3, w2, FG1.C - - /* store selection result in dmem */ - bn.sid x11, 0(x21++) - - ret - - -/** - * Constant-time Montgomery modular multiply by one - * - * Returns: C = montmul(1,A) = A*R^(-1) mod M - * - * Routine for back-conversion from Montgomery domain. - * This implements the limb-by-limb interleaved Montgomery Modular - * Multiplication Algorithm, with one operand fixed to 1. This is only a - * wrapper around the main loop body. For algorithmic implementation details - * see the mont_loop subroutine. - * - * Flags: The states of both FG0 and FG1 depend on intermediate values and are - * not usable after return. 
- * - * @param[in] x16: dmem pointer to first limb of modulus M - * @param[in] x17: dptr_m0d, dmem pointer to Montgomery Constant m0' - * @param[in] x19: dmem pointer to first limb of operand A - * @param[in] x21: dmem pointer to first limb of result C - * @param[in] x30: N, number of limbs - * @param[in] x31: N-1, number of limbs minus one - * @param[in] x8: pointer to temp reg, must be set to 4 - * @param[in] x9: pointer to temp reg, must be set to 3 - * @param[in] x10: pointer to temp reg, must be set to 4 - * @param[in] x11: pointer to temp reg, must be set to 2 - * @param[in] w31: all-zero - * - * clobbered registers: x6, x7, x8, x12, x13, x21, x22, - * w2, w3, w4 to w[4+N-1], w24 to w30 - * clobbered Flag Groups: FG0, FG1 - */ -montmul_mul1: - /* load Montgomery constant: w3 = dmem[x17] = dmem[dptr_m0d] = m0' */ - bn.lid x9, 0(x17) - - /* init regfile bigint buffer with zeros */ - bn.mov w2, w31 - loop x30, 1 - bn.movr x10++, x11 - - /* w2=1 this is operand B */ - bn.xor w2, w2, w2 - bn.addi w2, w2, 1 - - /* save dmem pointers for operand A and modulus */ - addi x6, x16, 0 - addi x7, x19, 0 - - /* iterate over limbs of operand B */ - loop x30, 4 - - /* restore dmem pointers for operand A and modulus */ - addi x16, x6, 0 - addi x19, x7, 0 - - /* Main loop body of Montgomery Multiplication algorithm */ - /* 1[i]*A */ - jal x1, mont_loop - - /* all subsequent limbs of operand B are zero since B=1 */ - bn.mov w2, w31 - - /* restore dmem pointers for operand A and modulus */ - addi x16, x6, 0 - addi x19, x7, 0 - - /* zeroize w2 and clear flags */ - bn.sub w2, w2, w2, FG1 - - /* iterate over all limbs of bigint buffer for limbwise comparison of - buffer with the Modulus. 
After last loop cycle, FG1.C is set if bigint - in buffer is larger than Modulus */ - loop x30, 3 - - /* load limb of limb of Modulus to w3 */ - bn.lid x9, 0(x16++) - - /* load limb from bigint buffer to w2 */ - bn.movr x11, x8++ - - /* compare limb of flag with limb of Modulus */ - bn.cmpb w3, w2, FG1 - - /* restore pointers to bigint buffer in regfile */ - li x8, 4 - li x10, 4 - - /* restore dmem pointers for operand A and modulus */ - addi x16, x6, 0 - addi x19, x7, 0 - - /* conditionally subtract Modulus from buffer and store result in - dmem[x21] to dmem[x21+N] */ - jal x1, cond_sub_to_dmem - - /* restore dmem pointers for operand A and modulus */ - addi x16, x6, 0 - addi x19, x7, 0 - - ret - - /** * Constant-time Montgomery Modular Multiplication * @@ -741,7 +569,7 @@ montmul_mul1: * @param[in] x11: pointer to temp reg, must be set to 2 * @param[out] [w[4+N-1]:w4]: result C * - * clobbered registers: x5, x6, x7, x8, x10, x12, x13, x20, x22 + * clobbered registers: x5 to x9, x12, x13, x20, x22 * w2, w3, w4 to w[4+N-1], w24 to w30 * clobbered Flag Groups: FG0, FG1 */ @@ -776,6 +604,7 @@ montmul: /* restore pointers */ li x8, 4 li x10, 4 + li x11, 2 ret @@ -802,6 +631,10 @@ modload: li x8, 28 bn.lid x8, 0(x16) + /* x31 <= N - 1 */ + li x2, 1 + sub x31, x30, x2 + /* Compute Montgomery constant */ jal x1, m0inv diff --git a/sw/otbn/crypto/p256.s b/sw/otbn/crypto/p256_base.s similarity index 66% rename from sw/otbn/crypto/p256.s rename to sw/otbn/crypto/p256_base.s index 857f423da9367..f9593701f7cde 100644 --- a/sw/otbn/crypto/p256.s +++ b/sw/otbn/crypto/p256_base.s @@ -1,5 +1,8 @@ -/* Copyright lowRISC Contributors. - * Copyright 2016 The Chromium OS Authors. All rights reserved. +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/* Copyright 2016 The Chromium OS Authors. All rights reserved. 
* Use of this source code is governed by a BSD-style license that can be * found in the LICENSE.dcrypto file. * @@ -7,19 +10,22 @@ * https://chromium.googlesource.com/chromiumos/platform/ec/+/refs/heads/cr50_stab/chip/g/dcrypto/dcrypto_p256.c */ -.globl p256_isoncurve .globl p256_scalar_mult .globl p256_base_mult -.globl p256_sign -.globl p256_verify .globl p256_generate_k .globl p256_generate_random_key .globl p256_key_from_seed +.globl trigger_fault_if_fg0_z +.globl mul_modp +.globl setup_modp +.globl mod_mul_256x256 +.globl mod_mul_320x128 +.globl scalar_mult_int +.globl proj_add +.globl proj_to_affine /* Exposed only for testing or SCA purposes. */ -.globl proj_add .globl mod_inv -.globl mod_mul_320x128 .text @@ -43,7 +49,7 @@ trigger_fault_if_fg0_z: /* Read the FG0.Z flag (position 3). x2 <= FG0.Z */ - csrrw x2, 0x7c0, x0 + csrrw x2, FG0, x0 andi x2, x2, 8 srli x2, x2, 3 @@ -312,104 +318,214 @@ mod_mul_320x128: ret /** - * Checks if a point is a valid curve point on curve P-256 (secp256r1) - * - * Returns r = x^3 + ax + b mod p - * and s = y^2 mod p - * with x,y being the affine coordinates of the curve point - * a, b and p being the domain parameters of P-256 - * - * This routine checks if a point with given x- and y-coordinate is a valid - * curve point on P-256. - * The routine checks whether the coordinates are a solution of the - * Weierstrass equation y^2 = x^3 + ax + b mod p. - * The routine makes use of the property that the domain parameter 'a' can be - * written as a=-3 for the P-256 curve, hence the routine is limited to P-256. - * The routine does not return a boolean result but computes the left side - * and the right sight of the Weierstrass equation and leaves the final - * comparison to the caller. - * The routine runs in constant time. + * 256-bit modular multiplication for P-256 coordinate field. 
+ * + * Returns c = a * b mod p + * + * Uses a specialized algorithm to quicly multiply modulo the P-256 coordinate + * modulus p = 2^256 - 2^224 + 2^192 + 2^96 - 1. + * + * This code has been proven correct in Coq here against a simplified model of + * OTBN (simplified in the sense of only including the instructions and + * functionality that this code uses): + * https://gist.github.com/jadephilipoom/5c1910fd355f730238c99ce620aed98a + * + * For more details about the code and how to read the proofs above, see the PR + * description here: https://github.com/lowRISC/opentitan/pull/20701 * * Flags: Flags have no meaning beyond the scope of this subroutine. * - * @param[in] dmem[x]: affine x-coordinate of input point - * @param[in] dmem[y]: affine y-coordinate of input point - * @param[out] dmem[r]: right side result r - * @param[out] dmem[s]: left side result s + * @param[in] w24: a, first 256 bit operand (a < p) + * @param[in] w25: b, second 256 bit operand (b < p) + * @param[in] w28: r256, constant, 2^256 mod p = 2^256 - p + * @param[in] w29: r448, constant, 2^448 mod p + * @param[in] w31: all-zero + * @param[in] MOD: p, modulus of P-256 underlying finite field + * @param[out] w19: c, result * - * clobbered registers: x2, x3, x19, x20, w0, w19 to w25 + * clobbered registers: w19, w20, w21, w22, w23, w24, w25 * clobbered flag groups: FG0 */ -p256_isoncurve: +mul_modp: + /* First, compute the high partial products (coefficient 2^192 or higher). 
+ w19,w20.U <= 2^192*(a0b3 + a1b2 + a2b1 + a3b0) + + 2^256*(a1b3 + a2b2 + a3b1) + + 2^320*(a2b3 + a3b2) + + 2^384*a3b3 */ + bn.mulqacc.z w24.0, w25.3, 64 /* a0b3 */ + bn.mulqacc w24.1, w25.2, 64 /* a1b2 */ + bn.mulqacc w24.2, w25.1, 64 /* a2b1 */ + bn.mulqacc.so w20.U, w24.3, w25.0, 64 /* a3b0 */ + bn.mulqacc w24.1, w25.3, 0 /* a1b3 */ + bn.mulqacc w24.2, w25.2, 0 /* a2b2 */ + bn.mulqacc w24.3, w25.1, 0 /* a3b1 */ + bn.mulqacc w24.2, w25.3, 64 /* a2b3 */ + bn.mulqacc w24.3, w25.2, 64 /* a3b2 */ + bn.mulqacc.wo w19, w24.3, w25.3, 128 /* a3b3 */ + + /* Now, we have: + a * b = a0b0 + 2^64*(a0b1 + a1b0) + 2^128*(a0b2 + a1b1 + a2b0 + w20.U) + + 2^256*w19 + + If we separate w19 into limbs t0, t1, t2, and t3, that gives us + a * b = a0b0 + 2^64*(a0b1 + a1b0) + 2^128*(a0b2 + a1b1 + a2b0 + w20.U) + + 2^256*t0 + 2^320*t1 + 2^384*t2 + 2^448*t3 + + This implies the modular equivalence: + (a * b) mod p + \equiv (a0b0 + 2^64*(a0b1 + a1b0) + 2^128*(a0b2 + a1b1 + a2b0 + w20.U) + + (2^256 mod p)*t0 + (2^448 mod p)*t3 - ((-2^320) mod p)*t1 + - ((-2^384) mod p)*t2 + + The only reason above for using ((-2^320) mod p) and ((-2^384) mod p) + instead of (2^320 mod p) and (2^384 mod p) is that, for these specific + values, the positive terms are ~256 bits and the negative ones are ~224 + bits, so the negative ones are quicker to compute. + + For simplicity, let's call the additive terms u and the subtractive ones v: + u = a0b0 + 2^64*(a0b1 + a1b0) + 2^128*(a0b2 + a1b1 + a2b0 + w20.U) + + (2^256 mod p)*t0 + (2^448 mod p)*t3 + v = ((-2^320) mod p)*t1 + ((-2^384) mod p)*t2 + (a * b) mod p \equiv (u - v) mod p + */ - /* setup all-zero reg */ - bn.xor w31, w31, w31 + /* Compute the additive terms (u). The term in w21 is offset 128 bits to save + a writeback instruction. 
+ w20 + w21 << 384 = u */ + bn.mulqacc.z w24.0, w25.0, 0 /* a0b0 */ + bn.mulqacc w28.0, w19.0, 0 /* r256[0] * t0 */ + bn.mulqacc w29.0, w19.3, 0 /* r448[0] * t3 */ + bn.mulqacc w24.0, w25.1, 64 /* a0b1 */ + bn.mulqacc w24.1, w25.0, 64 /* a1b0 */ + bn.mulqacc w28.1, w19.0, 64 /* r256[1] * t0 */ + bn.mulqacc.so w20.L, w29.1, w19.3, 64 /* r448[1] * t3 */ + bn.mulqacc w24.0, w25.2, 0 /* a0b2 */ + bn.mulqacc w24.1, w25.1, 0 /* a1b1 */ + bn.mulqacc w24.2, w25.0, 0 /* a2b0 */ + bn.mulqacc w28.2, w19.0, 0 /* r256[2] * t0 */ + bn.mulqacc w29.2, w19.3, 0 /* r448[2] * t3 */ + bn.mulqacc w28.3, w19.0, 64 /* r256[3] * t0 */ + bn.mulqacc.wo w21, w29.3, w19.3, 64 /* r448[3] * t3 */ + + /* To fully reduce u mod p, we'll separate the low 256 bits (u0) from the + high 33 bits (u1) and compute: + u0 + (2^256 mod p)*u1 = u0 + (2^224 - 2^192 - 2^96 + 1) * u1 */ + + /* Rotate 128 bits to undo the offset and put u1 in the least significant + position. + w22 <= w21[128:0] << 128 | w21[255:127] */ + bn.rshi w22, w21, w21 >> 128 + + /* w21 <= (u0 + u1) mod p */ + bn.addm w20, w20, w31 + bn.addm w21, w22, w31 + bn.addm w21, w20, w21 + + /* w24 <= u1 << 223 */ + bn.rshi w24, w22, w31 >> 33 + + /* w25 <= u1 * (2^223 - 2^191 - 2^95) */ + bn.sub w25, w24, w24 >> 32 + bn.sub w25, w25, w24 >> 128 + + /* Note: the value in w25 is small enough for addm because u1 < 2^33, and + 2^33*(2^223 - 2^191 - 2^95) < p. + w25 <= (u0 + (2^224 - 2^192 - 2^96 + 1) * u1) mod p = u mod p */ + bn.addm w25, w25, w25 + bn.addm w25, w25, w21 + + /* Now, compute the subtractive terms (v). We don't store constants for this + one; instead we transform the expression into something that is + computable with (the minimum number of) shifts and adds. 
+ v = ((-2^320) mod p)*t1 + ((-2^384) mod p)*t2 + = t1 * (2^224 + 2^160 + 2^128 - 2^64 - 2^32) + + t2 * (2^224 - 2*2^128 - 2*2^96 + 2^32 + 1) + = 2^224 * (t1 + t2) + (2^32 + 1) * (t1*2^128 + t2) + - 2^32 * (2^32 + 1) * (t1 + t2*2*2^64) */ + + /* First, isolate t1 and t2 using `mulqacc` and the lowest limb of r256, + which happens to be 1. This method is faster than using shifts. + w20 <= t1 + w21 <= t2 */ + bn.mulqacc.wo.z w20, w28.0, w19.1, 0 + bn.mulqacc.wo.z w21, w28.0, w19.2, 0 + + /* w22 <= (2^32 + 1) * (t1*2^128 + t2) */ + bn.add w22, w21, w20 << 128 + bn.add w22, w22, w22 << 32 + + /* w23 <= t1 + t2 */ + bn.add w23, w20, w21 + + /* w24 <= (2^32 + 1) * (t1 + 2*2^64*t2) */ + bn.add w24, w20, w21 << 64 + bn.add w24, w24, w21 << 64 + bn.add w24, w24, w24 << 32 + + /* w21, w20 <= v */ + bn.add w20, w22, w23 << 224 + bn.addc w21, w31, w23 >> 32 + bn.sub w20, w20, w24 << 32 + bn.subb w21, w21, w31 + + /* The maximum value of v is 289 bits, so we can now reduce v the same way we + reduced u earlier. */ + + /* w22 <= (v0 + v1) mod p */ + bn.addm w22, w20, w21 + + /* w24 <= v1 << 223 */ + bn.rshi w24, w21, w31 >> 33 + + /* w23 <= v1 * (2^223 - 2^191 - 2^95) */ + bn.sub w23, w24, w24 >> 32 + bn.sub w23, w23, w24 >> 128 + + /* w23 <= (v0 + (2^224 - 2^192 - 2^96 + 1) * v1) mod p = v mod p */ + bn.addm w23, w23, w23 + bn.addm w23, w23, w22 + + /* w19 = (u - v) mod p = (a * b) mod p */ + bn.subm w19, w25, w23 - /* setup modulus p and Barrett constant u - MOD <= w29 <= dmem[p256_p] = p; w28 <= dmem[p256_u_p] = u_p */ - li x2, 29 - la x3, p256_p - bn.lid x2, 0(x3) - bn.wsrw 0, w29 - li x2, 28 - la x3, p256_u_p - bn.lid x2, 0(x3) + ret - /* load domain parameter b from dmem - w27 <= b = dmem[p256_b] */ - li x2, 27 - la x3, p256_b - bn.lid x2, 0(x3) - /* load affine y-coordinate of curve point from dmem - w26 <= dmem[y] */ - la x3, y - li x2, 24 +/** + * Set up for coordinate field operations modulo the prime p. 
+ * + * Loads the constants required by `mul_modp` and other coordinate-arithmetic + * routines. + * + * Flags: Flags have no meaning beyond the scope of this subroutine. + * + * @param[in] w31: all-zero + * @param[out] MOD: p, modulus of P-256 underlying finite field + * @param[out] w28: r256, constant, 2^256 mod p = 2^256 - p + * @param[out] w29: r448, constant, 2^448 mod p + * + * clobbered registers: w28, w29 + * clobbered flag groups: FG0 + */ +setup_modp: + /* Load the modulus p from DMEM and store it in MOD. + MOD <= w29 <= p = dmem[p256_p] */ + li x2, 29 + la x3, p256_p bn.lid x2, 0(x3) + bn.wsrw MOD, w29 - /* w19 <= y^2 = w24*w24 */ - bn.mov w25, w24 - jal x1, mod_mul_256x256 - - /* store left side result: dmem[s] <= w19 = y^2 mod p */ - la x20, s - li x2, 19 - bn.sid x2, 0(x20) + /* Compute the constant r256 for reduction modulo p. + w28 <= 2^256 - p = r256 */ + bn.sub w28, w31, w29 - /* load affine x-coordinate of curve point from dmem - w26 <= dmem[x] */ - la x3, x - li x2, 26 + /* Load the constant r448 for reduction modulo p. + w29 <= dmem[p256_r448] = r448 */ + li x2, 29 + la x3, p256_r448 bn.lid x2, 0(x3) - - /* w19 <= x^2 = w26*w26 */ - bn.mov w25, w26 - bn.mov w24, w26 - jal x1, mod_mul_256x256 - - /* w19 = x^3 <= x^2 * x = w25*w24 = w26*w19 */ - bn.mov w25, w19 - bn.mov w24, w26 - jal x1, mod_mul_256x256 - - /* for curve P-256, 'a' can be written as a = -3, therefore we subtract - x three times from x^3. - w19 = x^3 + ax <= x^3 - 3x mod p */ - bn.subm w19, w19, w26 - bn.subm w19, w19, w26 - bn.subm w19, w19, w26 - - /* w24 <= x^3 + ax + b mod p = w19 + w27 mod p */ - bn.addm w19, w19, w27 - - /* store right side result: dmem[r] <= w19 = x^3 + ax + b mod p */ - la x19, r - li x2, 19 - bn.sid x2, 0(x19) - ret - /** * P-256 point addition in projective coordinates * @@ -427,7 +543,7 @@ p256_isoncurve: * terminology of Algorithm 4 of [2]. 
* The routine is limited to P-256 curve points due to: * - fixed a=-3 domain parameter - * - usage of a P-256 optimized Barrett multiplication kernel + * - usage of a P-256 optimized modular multiplication kernel * This routine runs in constant time. * * [1] https://doi.org/10.1006/jnth.1995.1088 @@ -440,9 +556,8 @@ p256_isoncurve: * @param[in] w12: y_q, x-coordinate of input point Q * @param[in] w13: z_q, x-coordinate of input point Q * @param[in] w27: b, curve domain parameter - * @param[in] w29: p, modulus, 2^256 > p > 2^255. - * @param[in] w28: u, pre-computed Barrett constant (without u[256]/MSb - * of u which is always 1 for the allowed range. + * @param[in] w28: r256, constant, 2^256 mod p = 2^256 - p + * @param[in] w29: r448, constant, 2^448 mod p * @param[in] w31: all-zero. * @param[in] MOD: p, modulus, 2^256 > p > 2^255. * @param[out] w11: x_r, x-coordinate of resulting point R @@ -462,19 +577,19 @@ proj_add: /* 1: w14 = t0 <= X1*X2 = w11*w8 */ bn.mov w24, w11 bn.mov w25, w8 - jal x1, mod_mul_256x256 + jal x1, mul_modp bn.mov w14, w19 /* 2: w15 = t1 <= Y1*Y2 = w12*w9 */ bn.mov w24, w12 bn.mov w25, w9 - jal x1, mod_mul_256x256 + jal x1, mul_modp bn.mov w15, w19 /* 3: w16 = t2 <= Z1*Z2 = w13*w10*/ bn.mov w24, w13 bn.mov w25, w10 - jal x1, mod_mul_256x256 + jal x1, mul_modp bn.mov w16, w19 /* 5: w17 = t4 <= X2+Y2 = w11 + w12 */ @@ -486,7 +601,7 @@ proj_add: /* 6: w19 = t3 <= t3*t4 = w18*w17 */ bn.mov w24, w17 bn.mov w25, w18 - jal x1, mod_mul_256x256 + jal x1, mul_modp /* 7: w18 = t4 <= t0+t1 = w14+w15 */ bn.addm w18, w14, w15 @@ -503,7 +618,7 @@ proj_add: /* 11: w18 = t4 <= t4 * X3 = w19 * w18 */ bn.mov w24, w18 bn.mov w25, w19 - jal x1, mod_mul_256x256 + jal x1, mul_modp bn.mov w18, w19 /* 12: w19 = X3 <= t1 + t2 = w15 + w16 */ @@ -521,7 +636,7 @@ proj_add: /* 16: w11 = X3 <= X3 * Y3 = w12 * w19 */ bn.mov w24, w19 bn.mov w25, w12 - jal x1, mod_mul_256x256 + jal x1, mul_modp bn.mov w11, w19 /* 17: w12 = Y3 <= t0 + t2 = w14 + w16 */ @@ -533,7 +648,7 @@ 
proj_add: /* 19: w19 = Z3 <= b * t2 = w27 * w16 */ bn.mov w24, w27 bn.mov w25, w16 - jal x1, mod_mul_256x256 + jal x1, mul_modp /* 20: w11 = X3 <= Y3 -Z3 = w12 - w19 */ bn.subm w11, w12, w19 @@ -553,7 +668,7 @@ proj_add: /* 25: w19 = Y3 <= w27 * w12 = b * Y3 */ bn.mov w24, w27 bn.mov w25, w12 - jal x1, mod_mul_256x256 + jal x1, mul_modp /* 26: w15 = t1 <= t2 + t2 = w16 + w16 */ bn.addm w15, w16, w16 @@ -585,19 +700,19 @@ proj_add: /* 35: w15 = t1 <= t4 * Y3 = w18 * w12 */ bn.mov w24, w18 bn.mov w25, w12 - jal x1, mod_mul_256x256 + jal x1, mul_modp bn.mov w15, w19 /* 36: w16 = t2 <= t0 * Y3 = w14 * w12 */ bn.mov w24, w14 bn.mov w25, w12 - jal x1, mod_mul_256x256 + jal x1, mul_modp bn.mov w16, w19 /* 37: w12 = Y3 <= X3 * Z3 = w11 * w13 */ bn.mov w24, w11 bn.mov w25, w13 - jal x1, mod_mul_256x256 + jal x1, mul_modp /* 38: w12 = Y3 <= Y3 + t2 = w19 + w16 */ bn.addm w12, w19, w16 @@ -605,7 +720,7 @@ proj_add: /* 39: w19 = X3 <= t3 * X3 = w17 * w11 */ bn.mov w24, w17 bn.mov w25, w11 - jal x1, mod_mul_256x256 + jal x1, mul_modp /* 40: w11 = X3 <= X3 - t1 = w19 - w15 */ bn.subm w11, w19, w15 @@ -613,13 +728,13 @@ proj_add: /* 41: w13 = Z3 <= t4 * Z3 = w18 * w13 */ bn.mov w24, w18 bn.mov w25, w13 - jal x1, mod_mul_256x256 + jal x1, mul_modp bn.mov w13, w19 /* 42: w19 = t1 <= t3 * t0 = w17 * w14 */ bn.mov w24, w17 bn.mov w25, w14 - jal x1, mod_mul_256x256 + jal x1, mul_modp /* 43: w13 = Z3 <= Z3 + t1 = w13 + w19 */ bn.addm w13, w13, w19 @@ -659,12 +774,12 @@ proj_add: * @param[in] w8: x, x-coordinate of curve point (projective) * @param[in] w9: y, y-coordinate of curve point (projective) * @param[in] w10: z, z-coordinate of curve point (projective) - * @param[in] w29: p, modulus, 2^256 > p > 2^255. - * @param[in] w28: u, pre-computed Barrett constant (without u[256]/MSb - * of u which is always 1 for the allowed range. 
+ * @param[in] w28: r256, constant, 2^256 mod p = 2^256 - p + * @param[in] w29: r448, constant, 2^448 mod p * @param[in] MOD: p, modulus of the finite field of P-256 * @param[out] w11: x_a, x-coordinate of curve point (affine) * @param[out] w12: y_a, y-coordinate of curve point (affine) + * @param[out] w14: z^-1, modular inverse of the projective z-coordinate * * clobbered registers: w10 to w19, w24, w25 * clobbered flag groups: FG0 @@ -677,81 +792,81 @@ proj_to_affine: /* 2: exp = 0x2 = 2*0x1 */ bn.mov w24, w10 bn.mov w25, w10 - jal x1, mod_mul_256x256 + jal x1, mul_modp /* 3: exp = 0x3 = 0x2+0x1 */ bn.mov w24, w19 bn.mov w25, w10 - jal x1, mod_mul_256x256 + jal x1, mul_modp bn.mov w12, w19 /* 4: exp = 0x6 = 2*0x3 */ bn.mov w24, w19 bn.mov w25, w19 - jal x1, mod_mul_256x256 + jal x1, mul_modp /* 5: exp = 0xc = 2*0x6 */ bn.mov w24, w19 bn.mov w25, w19 - jal x1, mod_mul_256x256 + jal x1, mul_modp /* 6: exp = 0xf = 0xc+0x3 */ bn.mov w24, w19 bn.mov w25, w12 - jal x1, mod_mul_256x256 + jal x1, mul_modp bn.mov w13, w19 /* 7: exp = 0xf0 = 16*0xf */ loopi 4, 4 bn.mov w24, w19 bn.mov w25, w19 - jal x1, mod_mul_256x256 + jal x1, mul_modp nop /* 8: exp = 0xff = 0xf0+0xf */ bn.mov w24, w19 bn.mov w25, w13 - jal x1, mod_mul_256x256 + jal x1, mul_modp bn.mov w14, w19 /* 9: exp = 0xff00 = 256*0xff */ loopi 8, 4 bn.mov w24, w19 bn.mov w25, w19 - jal x1, mod_mul_256x256 + jal x1, mul_modp nop /* 10: exp = 0xffff = 0xff00+0xff */ bn.mov w24, w19 bn.mov w25, w14 - jal x1, mod_mul_256x256 + jal x1, mul_modp bn.mov w15, w19 /* 11: exp = 0xffff0000 = 2^16*0xffff */ loopi 16, 4 bn.mov w24, w19 bn.mov w25, w19 - jal x1, mod_mul_256x256 + jal x1, mul_modp nop /* 12: exp = 0xffffffff = 0xffff0000+0xffff */ bn.mov w24, w19 bn.mov w25, w15 - jal x1, mod_mul_256x256 + jal x1, mul_modp bn.mov w16, w19 /* 13: exp = 0xffffffff00000000 = 2^32*0xffffffff */ loopi 32, 4 bn.mov w24, w19 bn.mov w25, w19 - jal x1, mod_mul_256x256 + jal x1, mul_modp nop bn.mov w17, w19 /* 14: exp = 0xffffffff00000001 
= 0xffffffff00000000+0x1 */ bn.mov w24, w10 bn.mov w25, w19 - jal x1, mod_mul_256x256 + jal x1, mul_modp /* 15: exp = 0xffffffff00000001000000000000000000000000000000000000000000000000 @@ -759,74 +874,74 @@ proj_to_affine: loopi 192, 4 bn.mov w24, w19 bn.mov w25, w19 - jal x1, mod_mul_256x256 + jal x1, mul_modp nop bn.mov w18, w19 /* 16: exp = 0xffffffffffffffff = 0xffffffff00000000+0xffffffff */ bn.mov w24, w17 bn.mov w25, w16 - jal x1, mod_mul_256x256 + jal x1, mul_modp /* 17: exp = 0xffffffffffffffff0000 = 2^16*0xffffffffffffffff */ loopi 16, 4 bn.mov w24, w19 bn.mov w25, w19 - jal x1, mod_mul_256x256 + jal x1, mul_modp nop /* 18: exp = 0xffffffffffffffffffff = 0xffffffffffffffff0000+0xffff */ bn.mov w24, w15 bn.mov w25, w19 - jal x1, mod_mul_256x256 + jal x1, mul_modp /* 19: exp = 0xffffffffffffffffffff00 = 256*0xffffffffffffffffffff */ loopi 8, 4 bn.mov w24, w19 bn.mov w25, w19 - jal x1, mod_mul_256x256 + jal x1, mul_modp nop /* 20: exp = 0xffffffffffffffffffffff = 0xffffffffffffffffffff00+0xff */ bn.mov w24, w14 bn.mov w25, w19 - jal x1, mod_mul_256x256 + jal x1, mul_modp /* 21: exp = 0xffffffffffffffffffffff0 = 16*0xffffffffffffffffffffff */ loopi 4, 4 bn.mov w24, w19 bn.mov w25, w19 - jal x1, mod_mul_256x256 + jal x1, mul_modp nop /* 22: exp = 0xfffffffffffffffffffffff = 0xffffffffffffffffffffff0+0xf */ bn.mov w24, w13 bn.mov w25, w19 - jal x1, mod_mul_256x256 + jal x1, mul_modp /* 23: exp = 0x3ffffffffffffffffffffffc = 4*0xfffffffffffffffffffffff */ loopi 2, 4 bn.mov w24, w19 bn.mov w25, w19 - jal x1, mod_mul_256x256 + jal x1, mul_modp nop /* 24: exp = 0x3fffffffffffffffffffffff = 0x3ffffffffffffffffffffffc+0x3 */ bn.mov w24, w12 bn.mov w25, w19 - jal x1, mod_mul_256x256 + jal x1, mul_modp /* 25: exp = 0xfffffffffffffffffffffffc = 4*0x3fffffffffffffffffffffff */ loopi 2, 4 bn.mov w24, w19 bn.mov w25, w19 - jal x1, mod_mul_256x256 + jal x1, mul_modp nop /* 26: exp = 0xfffffffffffffffffffffffd = 0xfffffffffffffffffffffffc+0x1 */ bn.mov w24, w10 bn.mov w25, 
w19 - jal x1, mod_mul_256x256 + jal x1, mul_modp /* 27: exp = p-2 = 0xffffffff00000001000000000000000000000000fffffffffffffffffffffffd @@ -835,21 +950,21 @@ proj_to_affine: w14 = z^exp = z^(p-2) = z^-1 mod p */ bn.mov w24, w19 bn.mov w25, w18 - jal x1, mod_mul_256x256 + jal x1, mul_modp bn.mov w14, w19 /* convert x-coordinate to affine w11 = x_a = x/z = x * z^(-1) = w8 * w14 */ bn.mov w24, w8 bn.mov w25, w14 - jal x1, mod_mul_256x256 + jal x1, mul_modp bn.mov w11, w19 /* convert y-coordinate to affine w12 = y_a = y/z = y * z^(-1) = w9 * w14 */ bn.mov w24, w9 bn.mov w25, w14 - jal x1, mod_mul_256x256 + jal x1, mul_modp bn.mov w12, w19 ret @@ -884,7 +999,7 @@ mod_inv: /* subtract 2 from modulus for Fermat's little theorem w2 = MOD - 2 = m - 2 */ - bn.wsrr w2, 0 + bn.wsrr w2, MOD bn.subi w2, w2, 2 /* init square and multiply: w1 = 1 */ @@ -905,7 +1020,7 @@ mod_inv: /* skip multiplication if C flag not set */ bn.sel w1, w1, w3, C - csrrs x2, 0x7c0, x0 + csrrs x2, FG0, x0 andi x2, x2, 1 beq x2, x0, nomul @@ -939,8 +1054,8 @@ mod_inv: * x-coordinate of input point * @param[in] x22: dptr_y, pointer to dmem location containing affine * y-coordinate of input point - * @param[in] w28: u, lower 256 bit of Barrett constant for curve P-256 - * @param[in] w29: p, modulus of P-256 underlying finite field + * @param[in] w28: r256, constant, 2^256 mod p = 2^256 - p + * @param[in] w29: r448, constant, 2^448 mod p * @param[in] w31: all-zero * @param[in] MOD: p, modulus of P-256 underlying finite field * @param[out] w14: x, projective x-coordinate @@ -956,7 +1071,7 @@ mod_inv: fetch_proj_randomize: /* get random number from URND */ - bn.wsrr w16, 2 /* URND */ + bn.wsrr w16, URND /* reduce random number w16 = z <= w16 mod p */ @@ -969,7 +1084,7 @@ fetch_proj_randomize: /* scale x-coordinate w14 = x <= w24*w16 = x_a*z mod p */ bn.mov w25, w16 - jal x1, mod_mul_256x256 + jal x1, mul_modp bn.mov w14, w19 /* fetch y-coordinate from dmem @@ -979,7 +1094,7 @@ fetch_proj_randomize: /* scale 
y-coordinate w15 = y <= w24*w16 = y_a*z mod p */ bn.mov w25, w16 - jal x1, mod_mul_256x256 + jal x1, mul_modp bn.mov w15, w19 ret @@ -1000,8 +1115,8 @@ fetch_proj_randomize: * @param[in] w9: y_p, y-coordinate of input point * @param[in] w10: z_p, z-coordinate of input point * @param[in] w27: b, curve domain parameter - * @param[in] w29: p, p, modulus of P-256 underlying finite field - * @param[in] w28: u, u, lower 256 bit of Barrett constant for curve P-256 + * @param[in] w28: r256, constant, 2^256 mod p = 2^256 - p + * @param[in] w29: r448, constant, 2^448 mod p * @param[in] w31: all-zero. * @param[in] MOD: p, modulus of P-256 underlying finite field * @param[out] w11: x_r, x-coordinate of resulting point @@ -1066,8 +1181,9 @@ proj_double: * @param[in] w27: b, curve domain parameter * @param[in] w31: all-zero * @param[in] MOD: p, modulus, 2^256 > p > 2^255. - * @param[out] w11: x_r, affine x-coordinate of resulting point - * @param[out] w12: y_r, affine y-coordinate of resulting point + * @param[out] w8: x, x-coordinate of curve point (projective) + * @param[out] w9: y, y-coordinate of curve point (projective) + * @param[out] w10: z, z-coordinate of curve point (projective) * * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on * the computed affine y-coordinate. @@ -1076,21 +1192,11 @@ proj_double: * clobbered flag groups: FG0 */ scalar_mult_int: - - /* load field modulus p from dmem - w29 <= p = dmem[p256_p] */ - li x2, 29 - la x3, p256_p - bn.lid x2, 0(x3) - - /* store modulus to MOD WSR */ - bn.wsrw 0, w29 - - /* load lower 256 bit of Barrett constant u for modulus p from dmem - w28 <= u = dmem[p256_u_p] */ - li x2, 28 - la x3, p256_u_p - bn.lid x2, 0(x3) + /* Set up for coordinate arithmetic. 
+ MOD <= p + w28 <= r256 + w29 <= r448 */ + jal x1, setup_modp /* load domain parameter b from dmem w27 <= b = dmem[p256_b] */ @@ -1190,31 +1296,31 @@ scalar_mult_int: bn.rshi w2, w2, w31 >> 255 /* init regs with random numbers from URND */ - bn.wsrr w11, 2 - bn.wsrr w12, 2 - bn.wsrr w13, 2 + bn.wsrr w11, URND + bn.wsrr w12, URND + bn.wsrr w13, URND /* get a fresh random number from URND and scale the coordinates of 2P = (w3, w4, w5) (scaling each projective coordinate with same factor results in same point) */ - bn.wsrr w7, 2 + bn.wsrr w7, URND /* w4 = w4 * w7 */ bn.mov w24, w4 bn.mov w25, w7 - jal x1, mod_mul_256x256 + jal x1, mul_modp bn.mov w4, w19 /* w5 = w5 * w7 */ bn.mov w24, w5 bn.mov w25, w7 - jal x1, mod_mul_256x256 + jal x1, mul_modp bn.mov w5, w19 /* w6 = w6 * w7 */ bn.mov w24, w6 bn.mov w25, w7 - jal x1, mod_mul_256x256 + jal x1, mul_modp bn.mov w6, w19 /* Check if the z-coordinate of Q is 0. If so, fail; this represents the @@ -1225,244 +1331,8 @@ scalar_mult_int: bn.cmp w10, w31 jal x1, trigger_fault_if_fg0_z - /* convert back to affine coordinates - R = (x_a, y_a) = (w11, w12) */ - jal x1, proj_to_affine - - ret - - -/** - * P-256 ECDSA signature generation - * - * returns the signature as the pair r, s with - * r = x_1 mod n - * and s = k^(-1)(msg + r*d) mod n - * with x_1 being the affine x-coordinate of the curve point k*G, - * where G is the curve's base point. - * k being a supplied secret random number, - * n being the order of the base point G of P-256, - * msg being the msg to be signed, - * d being the private key. - * - * This routine runs in constant time. - * - * Note: Some versions of the ECDSA spec suggest that msg must be reduced - * modulo n (e.g. RFC 6979, section 2.4). However, for this implementation, it - * is sufficient that msg < 2^256, because the message is multiplied with - * k^(-1) mod n, and our Barrett multiplication implementation accepts any - * operands a and b such that a * b < 2^256 * p and fully reduces the result. 
- * - * This routine assumes that the secret scalars d and k are provided in two - * shares each (d0/d1 and k0/k1 respectively), where - * d = (d0 + d1) mod n - * k = (k0 + k1) mod n - * - * Each share is 320 bits, which gives us 64 bits of extra redundancy modulo n - * (256 bits). This is a protection measure against side-channel attacks. - * - * For s = k^-1 * (r * d + msg), we compute a random nonzero masking scalar - * alpha, and compute s as: - * s = ((k * alpha)^-1 * (r * (d * alpha) + alpha * msg)) mod n - * - * We choose alpha to be at most 128 bits, so the product with a 320b share - * produces fits in the same 512-bit modular reduction routine that we use for - * 256x256-bit multiplications. It should be safe to compute e.g. k * alpha = - * (k0 * alpha + k1 * alpha) mod n, because alpha has enough randomness to mask - * the true value of k. - * - * @param[in] dmem[k0]: first share of secret scalar (320 bits) - * @param[in] dmem[k1]: second share of secret scalar (320 bits) - * @param[in] dmem[msg]: message to be signed (256 bits) - * @param[in] dmem[r]: dmem buffer for r component of signature (256 bits) - * @param[in] dmem[s]: dmem buffer for s component of signature (256 bits) - * @param[in] dmem[d0]: first share of private key d (320 bits) - * @param[in] dmem[d1]: second share of private key d (320 bits) - * - * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on - * the computed affine y-coordinate. 
- * - * clobbered registers: x2, x3, x16 to x23, w0 to w26 - * clobbered flag groups: FG0 - */ -p256_sign: - - /* init all-zero register */ - bn.xor w31, w31, w31 - - /* load first share of secret scalar k from dmem: w0,w1 = dmem[k0] */ - la x16, k0 - li x2, 0 - bn.lid x2, 0(x16++) - li x2, 1 - bn.lid x2, 0(x16) - - /* load second share of secret scalar k from dmem: w2,w3 = dmem[k1] */ - la x16, k1 - li x2, 2 - bn.lid x2, 0(x16++) - li x2, 3 - bn.lid x2, 0(x16) - - /* setup modulus n (curve order) and Barrett constant - MOD <= w29 <= n = dmem[p256_n]; w28 <= u_n = dmem[p256_u_n] */ - li x2, 29 - la x3, p256_n - bn.lid x2, 0(x3) - bn.wsrw 0, w29 - li x2, 28 - la x3, p256_u_n - bn.lid x2, 0(x3) - - /* scalar multiplication with base point - (x_1, y_1) = (w11, w12) <= k*G = w0*(dmem[p256_gx], dmem[p256_gy]) */ - la x21, p256_gx - la x22, p256_gy - jal x1, scalar_mult_int - - /* setup modulus n (curve order) and Barrett constant - MOD <= w29 <= n = dmem[p256_n]; w28 <= u_n = dmem[p256_u_n] */ - li x2, 29 - la x3, p256_n - bn.lid x2, 0(x3) - bn.wsrw 0, w29 - li x2, 28 - la x3, p256_u_n - bn.lid x2, 0(x3) - - /* re-load first share of secret scalar k from dmem: w0,w1 = dmem[k0] */ - la x16, k0 - li x2, 0 - bn.lid x2, 0(x16++) - li x2, 1 - bn.lid x2, 0(x16) - - /* re-load second share of secret scalar k from dmem: w2,w3 = dmem[k1] */ - la x16, k1 - li x2, 2 - bn.lid x2, 0(x16++) - li x2, 3 - bn.lid x2, 0(x16) - - /* Generate a random 127-bit number. - w4 <= URND()[255:129] */ - bn.wsrr w4, 0x2 /* URND */ - bn.rshi w4, w31, w4 >> 129 - - /* Add 1 to get a 128-bit nonzero scalar for masking. 
- w4 <= w4 + 1 = alpha */ - bn.addi w4, w4, 1 - - /* w0 <= ([w0,w1] * w4) mod n = (k0 * alpha) mod n */ - bn.mov w24, w0 - bn.mov w25, w1 - bn.mov w26, w4 - jal x1, mod_mul_320x128 - bn.mov w0, w19 - - /* w19 <= ([w2,w3] * w26) mod n = (k1 * alpha) mod n */ - bn.mov w24, w2 - bn.mov w25, w3 - jal x1, mod_mul_320x128 - - /* w0 <= (w0+w19) mod n = (k * alpha) mod n */ - bn.addm w0, w0, w19 - - /* w1 <= w0^-1 mod n = (k * alpha)^-1 mod n */ - jal x1, mod_inv - - /* Load first share of secret key d from dmem. - w2,w3 = dmem[d0] */ - la x16, d0 - li x2, 2 - bn.lid x2, 0(x16++) - li x2, 3 - bn.lid x2, 0(x16) - - /* Load second share of secret key d from dmem. - w5,w6 = dmem[d1] */ - la x16, d1 - li x2, 5 - bn.lid x2, 0(x16++) - li x2, 6 - bn.lid x2, 0(x16) - - /* w0 <= ([w2,w3] * w4) mod n = (d0 * alpha) mod n */ - bn.mov w24, w2 - bn.mov w25, w3 - bn.mov w26, w4 - jal x1, mod_mul_320x128 - bn.mov w0, w19 - - /* w19 <= ([w5,w6] * w4) mod n = (d1 * alpha) mod n */ - bn.mov w24, w5 - bn.mov w25, w6 - bn.mov w26, w4 - jal x1, mod_mul_320x128 - - /* w0 <= (w0+w19) mod n = (d * alpha) mod n */ - bn.addm w0, w0, w19 - - /* Compare to 0. - FG0.Z <= (w0 =? w31) = ((d * alpha) mod n =? 0) */ - bn.cmp w0, w31 - - /* Trigger a fault if FG0.Z is set, aborting the computation. - - Since alpha is nonzero mod n, (d * alpha) mod n = 0 means d is zero mod n, - which violates ECDSA private key requirements. This could technically be - triggered by an unlucky key manager seed, but the probability is so low (~1/n) - that it more likely indicates a fault attack. */ - jal x1, trigger_fault_if_fg0_z - - /* w24 = r <= w11 mod n */ - bn.addm w24, w11, w31 - - /* Store r of signature in dmem. 
- dmem[r] <= r = w24 */ - la x19, r - li x2, 24 - bn.sid x2, 0(x19) - - /* w19 <= (w24 * w0) mod n = (r * d * alpha) mod n */ - bn.mov w25, w0 - jal x1, mod_mul_256x256 - - /* w0 <= (w1 * w19) mod n = ((k * alpha)^-1 * (r * d * alpha)) mod n - = (k^-1 * r * d) mod n */ - bn.mov w24, w1 - bn.mov w25, w19 - jal x1, mod_mul_256x256 - bn.mov w0, w19 - - /* Load message from dmem: - w24 = msg <= dmem[msg] */ - la x18, msg - li x2, 24 - bn.lid x2, 0(x18) - - /* w19 = (w24 * w4) mod n = <= (msg * alpha) mod n */ - bn.mov w25, w4 - jal x1, mod_mul_256x256 - - /* w19 = (w1 * w19) mod n = ((k * alpha)^-1 * (msg * alpha)) mod n - = (k^-1 * msg) mod n */ - bn.mov w24, w1 - bn.mov w25, w19 - jal x1, mod_mul_256x256 - - /* w0 = (w0 + w19) mod n = (k^-1*r*d + k^-1*msg) mod n = s */ - bn.addm w0, w0, w19 - - /* Store s of signature in dmem. - dmem[s] <= s = w0 */ - la x20, s - li x2, 0 - bn.sid x2, 0(x20) - ret - /** * P-256 scalar multiplication with base point G * @@ -1514,11 +1384,15 @@ p256_base_mult: bn.lid x2, 0(x16) /* call internal scalar multiplication routine - R = (x_a, y_a) = (w11, w12) <= d*P = (w0 + w1)*P */ + R = (x_p, y_p, z_p) = (w8, w9, w10) <= d*P = (w0 + w1)*P */ la x21, p256_gx la x22, p256_gy jal x1, scalar_mult_int + /* Convert masked result back to affine coordinates. + R = (x_a, y_a) = (w11, w12) */ + jal x1, proj_to_affine + /* store result (affine coordinates) in dmem dmem[x] <= x_a = w11 dmem[y] <= y_a = w12 */ @@ -1531,440 +1405,6 @@ p256_base_mult: ret -/** - * Variable time modular multiplicative inverse computation - * - * Returns c <= a^(-1) mod m - * with a being a bigint of length 256 bit with a < m - * m being the modulus with a length of 256 bit - * c being a 256-bit result - * - * This routine implements the computation of the modular multiplicative - * inverse based on the binary GCD or Stein's algorithm. 
- * The implemented variant is based on the - * "right-shift binary extended GCD" as it is described in section 3.1 of [1] - * (Algorithm 1). - * [1] https://doi.org/10.1155/ES/2006/32192 - * - * Note that this is a variable time implementation. I.e. this routine will - * show a data dependent timing and execution profile. Only use in situations - * where a full white-box environment is acceptable. - * - * Flags: Flags have no meaning beyond the scope of this subroutine. - * - * @param[in] w0: a, operand - * @param[in] MOD: m, modulus - * @param[in] w31: all-zero - * @param[out] w1: result c - * - * clobbered registers: x2, w2, w3, w4, w7 - * clobbered flag groups: FG0 - */ -mod_inv_var: - - /* w2 = r = 0 */ - bn.mov w2, w31 - - /* w3 = s = 1 */ - bn.addi w3, w31, 1 - - /* w4 = u = MOD */ - bn.wsrr w4, 0 - bn.wsrr w7, 0 - - /* w5 = v = w0 */ - bn.mov w5, w0 - - ebgcd_loop: - /* test if u is odd */ - bn.or w4, w4, w4 - csrrs x2, 0x7c0, x0 - andi x2, x2, 4 - bne x2, x0, ebgcd_u_odd - - /* u is even: */ - /* w4 = u <= u/2 = w4 >> 1 */ - bn.rshi w4, w31, w4 >> 1 - - /* test if r is odd */ - bn.or w2, w2, w2 - csrrs x2, 0x7c0, x0 - andi x2, x2, 4 - bne x2, x0, ebgcd_r_odd - - /* r is even: */ - /* w2 = r <= r/2 = w2 >> 1 */ - bn.rshi w2, w31, w2 >> 1 - jal x0, ebgcd_loop - - ebgcd_r_odd: - /* w2 = r <= (r + m)/2 = (w2 + w7) >> 1 */ - bn.add w2, w7, w2 - bn.addc w6, w31, w31 - bn.rshi w2, w6, w2 >> 1 - jal x0, ebgcd_loop - - ebgcd_u_odd: - /* test if v is odd */ - bn.or w5, w5, w5 - csrrs x2, 0x7c0, x0 - andi x2, x2, 4 - bne x2, x0, ebgcd_uv_odd - - /* v is even: */ - /* w5 = v <= v/2 = w5 >> 1 */ - bn.rshi w5, w31, w5 >> 1 - - /* test if s is odd */ - bn.or w3, w3, w3 - csrrs x2, 0x7c0, x0 - andi x2, x2, 4 - bne x2, x0, ebgcd_s_odd - - /* s is even: */ - /* w3 = s <= s/2 = w3 >> 1 */ - bn.rshi w3, w31, w3 >> 1 - jal x0, ebgcd_loop - - ebgcd_s_odd: - /* w3 = s <= (s + m)/2 = (w3 + w7) >> 1 */ - bn.add w3, w7, w3 - bn.addc w6, w31, w31 - bn.rshi w3, w6, w3 >> 1 - jal x0, 
ebgcd_loop - - ebgcd_uv_odd: - /* test if v >= u */ - bn.cmp w5, w4 - csrrs x2, 0x7c0, x0 - andi x2, x2, 1 - beq x2, x0, ebgcd_v_gte_u - - /* u > v: */ - /* w2 = r <= r - s = w2 - w3; if (r < 0): r <= r + m */ - bn.subm w2, w2, w3 - - /* w4 = u <= u - v = w4 - w5 */ - bn.sub w4, w4, w5 - jal x0, ebgcd_loop - - ebgcd_v_gte_u: - /* w3 = s <= s - r = w3 - w2; if (s < 0) s <= s + m */ - bn.subm w3, w3, w2 - - /* w5 = v <= v - u = w5 - w4 */ - bn.sub w5, w5, w4 - - /* if v > 0 go back to start of loop */ - csrrs x2, 0x7c0, x0 - andi x2, x2, 8 - beq x2, x0, ebgcd_loop - - /* v <= 0: */ - /* if (r > m): w1 = a = r - m = w2 - MOD else: w1 = a = r = w2 */ - bn.addm w1, w2, w31 - - ret - - -/** - * P-256 ECDSA signature verification - * - * returns the affine x-coordinate of - * (x1, y1) = u1*G + u2*Q - * with u1 = z*s^-1 mod n and u2 = r*s^-1 mod n - * with G being the curve's base point, - * z being the message - * r, s being the signature - * Q being the public key. - * - * The routine computes the x1 coordinate and places it in dmem. x1 will be - * reduced (mod n), however, the final comparison has to be performed on the - * host side. The signature is valid if x1 == r. - * This routine runs in variable time. - * - * @param[in] dmem[msg]: message to be verified (256 bits) - * @param[in] dmem[r]: r component of signature (256 bits) - * @param[in] dmem[s]: s component of signature (256 bits) - * @param[in] dmem[x]: affine x-coordinate of public key (256 bits) - * @param[in] dmem[y]: affine y-coordinate of public key (256 bits) - * @param[out] dmem[x_r]: dmem buffer for reduced affine x_r-coordinate (x_1) - * - * Flags: Flags have no meaning beyond the scope of this subroutine. 
- * - * clobbered registers: x2, x3, x13, x14, x17 to x24, w0 to w25 - * clobbered flag groups: FG0 - */ -p256_verify: - - /* init all-zero register */ - bn.xor w31, w31, w31 - - /* load domain parameter b from dmem - w27 <= b = dmem[p256_b] */ - li x2, 27 - la x3, p256_b - bn.lid x2, 0(x3) - - /* load r of signature from dmem: w24 = r = dmem[r] */ - la x19, r - li x2, 11 - bn.lid x2, 0(x19) - - /* setup modulus n (curve order) and Barrett constant - MOD <= w29 <= n = dmem[p256_n]; w28 <= u_n = dmem[p256_u_n] */ - li x2, 29 - la x3, p256_n - bn.lid x2, 0(x3) - bn.wsrw 0, w29 - li x2, 28 - la x3, p256_u_n - bn.lid x2, 0(x3) - - /* load s of signature from dmem: w0 = s = dmem[s] */ - la x20, s - bn.lid x0, 0(x20) - - /* goto 'fail' if w0 == w31 <=> s == 0 */ - bn.cmp w0, w31 - csrrs x2, 0x7c0, x0 - andi x2, x2, 8 - bne x2, x0, fail - - /* goto 'fail' if w0 >= w29 <=> s >= n */ - bn.cmp w0, w29 - csrrs x2, 0x7c0, x0 - andi x2, x2, 1 - beq x2, x0, fail - - /* w1 = s^-1 mod n */ - jal x1, mod_inv_var - - /* load r of signature from dmem: w24 = r = dmem[r] */ - la x19, r - li x2, 24 - bn.lid x2, 0(x19) - - /* goto 'fail' if w24 == w31 <=> r == 0 */ - bn.cmp w24, w31 - csrrs x2, 0x7c0, x0 - andi x2, x2, 8 - bne x2, x0, fail - - /* goto 'fail' if w0 >= w29 <=> r >= n */ - bn.cmp w24, w29 - csrrs x2, 0x7c0, x0 - andi x2, x2, 1 - beq x2, x0, fail - - /* w25 = s^-1 = w1 */ - bn.mov w25, w1 - - /* u2 = w0 = w19 <= w24*w25 = r*s^-1 mod n */ - jal x1, mod_mul_256x256 - bn.mov w0, w19 - - /* load message, w24 = msg = dmem[msg] */ - la x18, msg - li x2, 24 - bn.lid x2, 0(x18) - - /* u1 = w1 = w19 <= w24*w25 = w24*w1 = msg*s^-1 mod n */ - bn.mov w25, w1 - jal x1, mod_mul_256x256 - bn.mov w1, w19 - - /* setup modulus p and Barrett constant */ - li x2, 29 - la x3, p256_p - bn.lid x2, 0(x3) - bn.wsrw 0, w29 - li x2, 28 - la x3, p256_u_p - bn.lid x2, 0(x3) - - /* load public key Q from dmem and use in projective form (set z to 1) - Q = (w11, w12, w13) = (dmem[x], dmem[y], 1) */ - li 
x2, 11 - la x21, x - bn.lid x2++, 0(x21) - la x22, y - bn.lid x2, 0(x22) - bn.addi w13, w31, 1 - - /* load base point G and use in projective form (set z to 1) - G = (w8, w9, w10) = (x_g, y_g, 1) */ - li x13, 8 - la x23, p256_gx - bn.lid x13, 0(x23) - li x14, 9 - la x24, p256_gy - bn.lid x14, 0(x24) - bn.addi w10, w31, 1 - - /* The rest of the routine implements a variable time double-and-add - algorithm. For the signature verification we need to compute the point - C = (x1, y1) = u_1*G + u_2*Q. This can be done in a single - double-and-add routine by using Shamir's Trick. */ - - /* G+Q = (w3,w4,w5) = (w11,w12,w13) = (w8,w9,w10) (+) (w11,w12,w13) */ - jal x1, proj_add - bn.mov w3, w11 - bn.mov w4, w12 - bn.mov w5, w13 - - /* w2 = u_2 & u_0 = w0 & w1*/ - bn.and w2, w0, w1 - - /* init double and add algorithm with (0, 1, 0) */ - bn.mov w11, w31 - bn.addi w12, w31, 1 - bn.mov w13, w31 - - /* main loop with dicreasing index i (i=255 downto 0) */ - loopi 256, 31 - - /* always double: C = (w11,w12,w13) <= 2 (*) C = 2 (*) (w11,w12,w13) */ - bn.mov w8, w11 - bn.mov w9, w12 - bn.mov w10, w13 - jal x1, proj_add - - /* if either u_1[i] == 0 or u_2[i] == 0 jump to 'no_both' */ - bn.add w2, w2, w2 - csrrs x2, 0x7c0, x0 - andi x2, x2, 1 - beq x2, x0, no_both - - /* both bits at current index (u1[i] and u2[i]) are set: - do C <= C + (P + Q) and jump to end */ - bn.mov w8, w3 - bn.mov w9, w4 - bn.mov w10, w5 - jal x1, proj_add - jal x0, no_q - - /* either u1[i] or u2[i] is set, but not both */ - no_both: - - /* if u2[i] is not set jump to 'no_g' */ - bn.add w6, w0, w0 - csrrs x2, 0x7c0, x0 - andi x2, x2, 1 - beq x2, x0, no_g - - /* u2[i] is set: do C <= C + Q */ - bn.lid x13, 0(x21) - bn.lid x14, 0(x22) - bn.addi w10, w31, 1 - jal x1, proj_add - - no_g: - /* if u1[i] is not set jump to 'no_q' */ - bn.add w6, w1, w1 - csrrs x2, 0x7c0, x0 - andi x2, x2, 1 - beq x2, x0, no_q - - /* load base point x-coordinate - w8 <= g_x = dmem [p256_gx]; w9 <= g_y = dmem[p256_gy] */ - bn.lid x13, 
0(x23) - bn.lid x14, 0(x24) - - /* u1[i] is set: do C <= C + G */ - bn.addi w10, w31, 1 - jal x1, proj_add - - no_q: - /* left shift w0 and w1 to decrease index */ - bn.add w0, w0, w0 - bn.add w1, w1, w1 - - /* compute inverse of z-coordinate: w1 = z_c^-1 mod p */ - bn.mov w0, w13 - jal x1, mod_inv_var - - /* convert x-coordinate of C back to affine: x1 = x_c * z_c^-1 mod p */ - bn.mov w24, w1 - bn.mov w25, w11 - jal x1, mod_mul_256x256 - - /* final reduction: w24 = x1 <= x1 mod n */ - la x3, p256_n - bn.lid x0, 0(x3) - bn.wsrw 0, w0 - bn.subm w24, w19, w31 - - fail: - /* store affine x-coordinate in dmem: dmem[x_r] = w24 = x_r */ - la x17, x_r - li x2, 24 - bn.sid x2, 0(x17) - - ret - - -/** - * Externally callable wrapper for P-256 scalar point multiplication - * - * returns R = k*P = k*(x_p, y_p, z_p) - * with R, P being valid P-256 curve points in projective form, - * k being a 256 bit scalar. - * - * This routine assumes that the scalar k is provided in two shares, k0 and k1, - * where: - * k = (k0 + k1) mod n - * - * Sets up context and calls internal scalar multiplication routine. - * This routine runs in constant time. - * - * @param[in] dmem[k0]: first share of scalar k (256 bits) - * @param[in] dmem[k1]: second share of scalar k (256 bits) - * @param[in,out] dmem[x]: affine x-coordinate in dmem - * @param[in,out] dmem[y]: affine y-coordinate in dmem - * - * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on - * the computed affine y-coordinate. - * - * clobbered registers: x2, x3, x16, x17, x21, x22, w0 to w25 - * clobbered flag groups: FG0 - */ -p256_scalar_mult: - - /* init all-zero register */ - bn.xor w31, w31, w31 - - /* Load first share of secret key k from dmem. - w0,w1 = dmem[k0] */ - la x16, k0 - li x2, 0 - bn.lid x2, 0(x16++) - li x2, 1 - bn.lid x2, 0(x16) - - /* Load second share of secret key d from dmem. 
- w2,w3 = dmem[k1] */ - la x16, k1 - li x2, 2 - bn.lid x2, 0(x16++) - li x2, 3 - bn.lid x2, 0(x16) - - /* call internal scalar multiplication routine - R = (x_a, y_a) = (w11, w12) <= k*P = w0*P */ - la x21, x - la x22, y - jal x1, scalar_mult_int - - /* store result (affine coordinates) in dmem - dmem[x] <= x_a = w11 - dmem[y] <= y_a = w12 */ - li x2, 11 - bn.sid x2++, 0(x21) - bn.sid x2, 0(x22) - - ret - /** * Generate a nonzero random value in the scalar field. * @@ -2013,7 +1453,7 @@ p256_random_scalar: bn.lid x2, 0(x3) /* Copy n into the MOD register. */ - bn.wsrw 0, w29 + bn.wsrw MOD, w29 /* Load Barrett constant for n. w28 <= u_n = dmem[p256_u_n] */ @@ -2023,18 +1463,18 @@ p256_random_scalar: random_scalar_retry: /* Obtain 768 bits of randomness from RND. */ - bn.wsrr w15, 0x1 /* RND */ - bn.wsrr w16, 0x1 /* RND */ - bn.wsrr w17, 0x1 /* RND */ + bn.wsrr w15, RND + bn.wsrr w16, RND + bn.wsrr w17, RND /* XOR with bits from URND, just in case there's any vulnerability in EDN that lets the attacker recover bits before they reach OTBN. */ - bn.wsrr w20, 0x2 /* URND */ + bn.wsrr w20, URND + bn.xor w15, w15, w20 + bn.wsrr w20, URND bn.xor w16, w16, w20 - bn.wsrr w20, 0x2 /* URND */ + bn.wsrr w20, URND bn.xor w17, w17, w20 - bn.wsrr w20, 0x2 /* URND */ - bn.xor w18, w18, w20 /* Shift bits to get 320-bit seeds. w18 <= w16[255:192] @@ -2045,7 +1485,7 @@ p256_random_scalar: /* Generate a random masking parameter. w14 <= URND(127) + 1 = x */ - bn.wsrr w14, 0x2 /* URND */ + bn.wsrr w14, URND bn.addi w14, w14, 1 /* w12 <= ([w15,w16] * w14) mod n = (seed0 * x) mod n */ @@ -2071,7 +1511,7 @@ p256_random_scalar: /* Read the FG0.Z flag (position 3). x2 <= 8 if FG0.Z else 0 */ - csrrw x2, 0x7c0, x0 + csrrw x2, FG0, x0 andi x2, x2, 8 /* Retry if x2 != 0. */ @@ -2187,13 +1627,20 @@ p256_generate_k: * * This routine runs in constant time. * + * We are aware that the MSB of the intermediate values here may leak 1 bit of + * the secret seed.
We observed this with a formal masking analysis tool and in + * FPGA experiments. The algorithm runs with 64 bits of excess randomness, so + * we do not expect this leakage to be usable for retrieving secret values. + * We also verified that the leakage disappears when the routine runs on + * 320 bits instead of 321 bits. + * * Flags: Flags have no meaning beyond the scope of this subroutine. * * @param[in] [w21, w20]: s0, first share of seed (320 bits) - * @param[in] [w23, w22]: s1, second share of seed (320 bits) + * @param[in] [w11, w10]: s1, second share of seed (320 bits) * @param[in] w31: all-zero * @param[out] [w21, w20]: result x0 (321 bits) - * @param[out] [w23, w22]: result x1 (320 bits) + * @param[out] [w11, w10]: result x1 (320 bits) * * clobbered registers: w1 to w5, w20 to w23 * clobbered flag groups: FG0 @@ -2201,16 +1648,17 @@ p256_generate_k: boolean_to_arithmetic: /* Mask out excess bits from seed shares. [w21, w20] <= s0 mod 2^320 - [w23, w22] <= s1 mod 2^320 = x1 */ + [w11, w10] <= s1 mod 2^320 = x1 */ bn.rshi w21, w21, w31 >> 64 bn.rshi w21, w31, w21 >> 192 - bn.rshi w23, w23, w31 >> 64 - bn.rshi w23, w31, w23 >> 192 + bn.rshi w31, w31, w31 >> 192 # dummy instruction to flush ALU datapath + bn.rshi w11, w11, w31 >> 64 + bn.rshi w11, w31, w11 >> 192 /* Fetch 321 bits of randomness from URND. [w2, w1] <= gamma */ - bn.wsrr w1, 2 - bn.wsrr w2, 2 + bn.wsrr w1, URND + bn.wsrr w2, URND bn.rshi w2, w31, w2 >> 191 /* [w4, w3] <= [w21, w20] ^ [w2, w1] = s0 ^ gamma */ @@ -2222,6 +1670,7 @@ boolean_to_arithmetic: [w4, w3] <= [w4, w3] - [w2, w1] = ((s0 ^ gamma) - gamma) mod 2^512 */ bn.sub w3, w3, w1 bn.subb w4, w4, w2 + bn.sub w31, w31, w31 # dummy instruction to clear flags /* Truncate subtraction result to 321 bits.
[w4, w3] <= [w4, w3] mod 2^321 = T */ @@ -2232,9 +1681,9 @@ boolean_to_arithmetic: bn.xor w3, w3, w20 bn.xor w4, w4, w21 - /* [w2, w1] <= [w2, w1] ^ [w23, w22] = gamma ^ s1 = G */ - bn.xor w1, w1, w22 - bn.xor w2, w2, w23 + /* [w2, w1] <= [w2, w1] ^ [w11, w10] = gamma ^ s1 = G */ + bn.xor w1, w1, w10 + bn.xor w2, w2, w11 /* [w21, w20] <= [w21, w20] ^ [w2, w1] = s0 ^ G */ bn.xor w20, w20, w1 @@ -2243,15 +1692,26 @@ boolean_to_arithmetic: /* [w21, w20] <= [w21, w20] - [w2, w1] = ((s0 ^ G) - G) mod 2^512 */ bn.sub w20, w20, w1 bn.subb w21, w21, w2 + bn.sub w31, w31, w31 # dummy instruction to clear flags /* [w21, w20] <= [w21, w20] mod 2^321 = A */ bn.rshi w21, w21, w31 >> 65 bn.rshi w21, w31, w21 >> 191 + /* apply fresh mask to w20 and w21 before xoring with w3 and w4 */ + bn.wsrr w28, RND + bn.wsrr w29, RND + bn.xor w20, w28, w20 + bn.xor w21, w29, w21 + /* [w21, w20] <= [w21, w20] ^ [w4, w3] = A ^ T2 = x0 */ bn.xor w20, w20, w3 bn.xor w21, w21, w4 + /* remove fresh mask */ + bn.xor w20, w28, w20 + bn.xor w21, w29, w21 + ret /** @@ -2290,10 +1750,10 @@ boolean_to_arithmetic: * Flags: Flags have no meaning beyond the scope of this subroutine. * * @param[in] [w21, w20]: seed0, first share of seed (320 bits) - * @param[in] [w23, w22]: seed1, second share of seed (320 bits) + * @param[in] [w11, w10]: seed1, second share of seed (320 bits) * @param[in] w31: all-zero * @param[out] [w21, w20]: d0, first share of private key d (320 bits) - * @param[out] [w23, w22]: d1, second share of private key d (320 bits) + * @param[out] [w11, w10]: d1, second share of private key d (320 bits) * * clobbered registers: x2, x3, w1 to w4, w20 to w29 * clobbered flag groups: FG0 @@ -2305,7 +1765,7 @@ p256_key_from_seed: /* At this point, we have arithmetic shares modulo 2^321: [w21, w20] : x0 - [w23, w22] : x1 + [w11, w10] : x1 We know that x1=seed1, and seed and x1 are at most 320 bits. 
Therefore, the highest bit of x0 holds a carry bit modulo 2^320: @@ -2343,15 +1803,15 @@ p256_key_from_seed: bn.rshi w29, w31, w29 >> 192 /* [w25,w24] <= (x1 - (n << 64)) mod 2^512 */ - bn.sub w24, w22, w28 - bn.subb w25, w23, w29 + bn.sub w24, w10, w28 + bn.subb w25, w11, w29 /* Compute d1. Because 2^320 < 2 * (n << 64), a conditional subtraction is sufficient to reduce. Similarly to the carry bit, the conditional bit here is not very sensitive because the shares are large relative to n. - [w23,w22] <= x1 mod (n << 64) = d1 */ - bn.sel w22, w22, w24, FG0.C - bn.sel w23, w23, w25, FG0.C + [w11,w10] <= x1 mod (n << 64) = d1 */ + bn.sel w10, w10, w24, FG0.C + bn.sel w11, w11, w25, FG0.C /* Isolate the carry bit and shift it back into position. w25 <= x0[320] << 64 */ @@ -2418,18 +1878,18 @@ p256_p: .word 0x00000001 .word 0xffffffff -/* Barrett constant u for modulus p */ -.globl p256_u_p +/* Constant ((2^448) mod p) for reduction modulo p. */ +.globl p256_r448 .balign 32 -p256_u_p: - .word 0x00000003 - .word 0x00000000 +p256_r448: .word 0xffffffff .word 0xfffffffe .word 0xfffffffe - .word 0xfffffffe .word 0xffffffff .word 0x00000000 + .word 0x00000002 + .word 0x00000003 + .word 0x00000000 /* P-256 domain parameter n (order of base point) */ .globl p256_n diff --git a/sw/otbn/crypto/p256_ecdh.s b/sw/otbn/crypto/p256_ecdh.s index 57fff9fdcc0b9..01489fd39a46a 100644 --- a/sw/otbn/crypto/p256_ecdh.s +++ b/sw/otbn/crypto/p256_ecdh.s @@ -106,36 +106,15 @@ keypair_random: * @param[out] dmem[y]: x1, second share of shared key. */ shared_key: - /* Generate shared key d*Q. - dmem[x] <= (d*Q).x - dmem[y] <= (d*Q).y */ - jal x1, p256_scalar_mult - - /* TODO: `p256_scalar_mult` and the code below briefly handle the shared key - in unmasked form. 
The best way to fixing this is likely: - - modify scalar_mult_int to return projective coordinates - - get additive arithmetic mask for x before converting it to affine - - multiply both shares by Z^-1 to convert to affine form - - run a safe arithmetic-to-boolean conversion algorithm - */ - - /* Fetch a fresh random number for blinding. - w2 <= URND() */ - bn.wsrr w2, 0x2 /* URND */ + /* Validate the public key. Halts the program if the key is invalid and jumps + back here if it's OK. */ + jal x0, check_public_key_valid + _pk_valid: - /* Store the random number as the second share. - dmem[y] <= w2 */ - li x2, 2 - la x4, y - bn.sid x2, 0(x4) - - /* Blind the x-coordinate. - dmem[x] <= dmem[x] ^ w2 */ - li x3, 3 - la x4, x - bn.lid x3, 0(x4) - bn.xor w3, w3, w2 - bn.sid x3, 0(x4) + /* Generate boolean-masked shared key (d*Q).x. + dmem[x] <= x0 + dmem[y] <= x1 */ + jal x1, p256_shared_key ecall @@ -212,15 +191,15 @@ shared_key_from_seed: secret_key_from_seed: /* Load keymgr seeds from WSRs. w20,w21 <= seed0 - w22,w23 <= seed1 */ - bn.wsrr w20, 0x4 /* KEY_S0_L */ - bn.wsrr w21, 0x5 /* KEY_S0_H */ - bn.wsrr w22, 0x6 /* KEY_S1_L */ - bn.wsrr w23, 0x7 /* KEY_S1_H */ + w10,w11 <= seed1 */ + bn.wsrr w20, KEY_S0_L + bn.wsrr w21, KEY_S0_H + bn.wsrr w10, KEY_S1_L + bn.wsrr w11, KEY_S1_H /* Generate secret key shares. w20, w21 <= d0 - w22, w23 <= d1 */ + w10, w11 <= d1 */ jal x1, p256_key_from_seed /* Store secret key shares. @@ -230,12 +209,95 @@ secret_key_from_seed: la x3, d0 bn.sid x2++, 0(x3) bn.sid x2++, 32(x3) - la x3, d0 + li x2, 10 + la x3, d1 bn.sid x2++, 0(x3) bn.sid x2, 32(x3) ret +/** + * Check if a provided public key is valid. + * + * For a given public key (x, y), check that: + * - x and y are both fully reduced mod p + * - (x, y) is on the P-256 curve. + * + * Note that, because the point is in affine form, it is not possible that (x, + * y) is the point at infinity. In some other forms such as projective + * coordinates, we would need to check for this also. 
+ * + * This routine raises a software error and halts operation if the public key + * is invalid. + * + * @param[in] dmem[x]: Public key x-coordinate. + * @param[in] dmem[y]: Public key y-coordinate. + */ +check_public_key_valid: + /* Init all-zero register. */ + bn.xor w31, w31, w31 + + /* Load domain parameter p. + w29 <= dmem[p256_p] = p */ + li x2, 29 + la x3, p256_p + bn.lid x2, 0(x3) + + /* Load public key x-coordinate. + w2 <= dmem[x] = x */ + li x2, 2 + la x3, x + bn.lid x2, 0(x3) + + /* Compare x to p. + FG0.C <= (x < p) */ + bn.cmp w2, w29 + + /* Trigger a fault if FG0.C is false. */ + csrrs x2, FG0, x0 + andi x2, x2, 1 + bne x2, x0, _x_valid + unimp + + _x_valid: + + /* Load public key y-coordinate. + w2 <= dmem[y] = y */ + li x2, 2 + la x3, y + bn.lid x2, 0(x3) + + /* Compare y to p. + FG0.C <= (y < p) */ + bn.cmp w2, w29 + + /* Trigger a fault if FG0.C is false. */ + csrrs x2, FG0, x0 + andi x2, x2, 1 + bne x2, x0, _y_valid + unimp + + _y_valid: + + /* Compute both sides of the Weierstrass equation. + w18 <= (x^3 + ax + b) mod p + w19 <= (y^2) mod p */ + jal x1, p256_isoncurve + + /* Compare the two sides of the equation. + FG0.Z <= (y^2) mod p == (x^3 + ax + b) mod p */ + bn.cmp w18, w19 + + /* Trigger a fault if FG0.Z is false; otherwise jump back to the single call + site. */ + csrrs x2, FG0, x0 + srli x2, x2, 3 + andi x2, x2, 1 + bne x2, x0, _pk_valid + unimp + unimp + unimp + .bss /* Operational mode. */ @@ -256,20 +318,14 @@ x: y: .zero 32 -/* Secret key (d) in two shares: d = (d0 + d1) mod n. - - Note: This is also labeled k0, k1 because the `p256_scalar_mult` algorithm - is also used for ECDSA signing and reads from those labels; in the case of - ECDH, the scalar in `p256_scalar_mult` is always the private key (d). */ +/* Secret key (d) in two shares: d = (d0 + d1) mod n.
*/ .globl d0 -.globl k0 .balign 32 d0: k0: .zero 64 .globl d1 -.globl k1 .balign 32 d1: k1: diff --git a/sw/otbn/crypto/p256_ecdsa.s b/sw/otbn/crypto/p256_ecdsa.s index a0fb63364c0e9..7fd02dfdf3cf4 100644 --- a/sw/otbn/crypto/p256_ecdsa.s +++ b/sw/otbn/crypto/p256_ecdsa.s @@ -111,8 +111,9 @@ ecdsa_sign: * @param[out] dmem[x_r]: dmem buffer for reduced affine x_r-coordinate (x_1) */ ecdsa_verify: - /* Validate the public key. */ - jal x1, check_public_key_valid + /* Validate the public key (jumps back here if successful). */ + jal x0, check_public_key_valid + _pk_valid: /* Verify the signature (compute x_r). */ jal x1, p256_verify @@ -165,18 +166,18 @@ sideload_ecdsa_sign: secret_key_from_seed: /* Load keymgr seeds from WSRs. w20,w21 <= seed0 - w22,w23 <= seed1 */ - bn.wsrr w20, 4 /*KEY_S0_L*/ - bn.wsrr w21, 5 /*KEY_S0_H*/ - bn.wsrr w22, 6 /*KEY_S1_L*/ - bn.wsrr w23, 7 /*KEY_S1_H*/ + w10,w11 <= seed1 */ + bn.wsrr w20, KEY_S0_L + bn.wsrr w21, KEY_S0_H + bn.wsrr w10, KEY_S1_L + bn.wsrr w11, KEY_S1_H /* Init all-zero register. */ bn.xor w31, w31, w31 /* Generate secret key shares. w20, w21 <= d0 - w22, w23 <= d1 */ + w10, w11 <= d1 */ jal x1, p256_key_from_seed /* Store secret key shares. @@ -186,7 +187,8 @@ secret_key_from_seed: la x3, d0 bn.sid x2++, 0(x3) bn.sid x2++, 32(x3) - la x3, d0 + li x2, 10 + la x3, d1 bn.sid x2++, 0(x3) bn.sid x2, 32(x3) @@ -230,7 +232,7 @@ check_public_key_valid: bn.cmp w2, w29 /* Trigger a fault if FG0.C is false. */ - csrrs x2, 0x7c0, x0 + csrrs x2, FG0, x0 andi x2, x2, 1 bne x2, x0, _x_valid unimp @@ -248,59 +250,31 @@ check_public_key_valid: bn.cmp w2, w29 /* Trigger a fault if FG0.C is false. */ - csrrs x2, 0x7c0, x0 + csrrs x2, FG0, x0 andi x2, x2, 1 bne x2, x0, _y_valid unimp _y_valid: - /* Save the signature values to registers. - w4 <= dmem[r] - w5 <= dmem[s] */ - li x2, 4 - la x3, r - bn.lid x2++, 0(x3) - la x3, s - bn.lid x2, 0(x3) - /* Compute both sides of the Weierstrauss equation. 
- dmem[r] <= (x^3 + ax + b) mod p - dmem[s] <= (y^2) mod p */ + w18 <= (x^3 + ax + b) mod p + w19 <= (y^2) mod p */ jal x1, p256_isoncurve - /* Load both sides of the equation. - w2 <= dmem[r] - w3 <= dmem[s] */ - li x2, 2 - la x3, r - bn.lid x2++, 0(x3) - la x3, s - bn.lid x2, 0(x3) - /* Compare the two sides of the equation. FG0.Z <= (y^2) mod p == (x^2 + ax + b) mod p */ - bn.cmp w2, w3 + bn.cmp w18, w19 - /* Trigger a fault if FG0.Z is false. */ - csrrs x2, 0x7c0, x0 + /* Trigger a fault if FG0.Z is false; otherwise jump back to the single call + site. */ + csrrs x2, FG0, x0 srli x2, x2, 3 andi x2, x2, 1 bne x2, x0, _pk_valid unimp - - _pk_valid: - - /* Write back the saved signature values. - dmem[r] <= w4 - dmem[s] <= w5 */ - li x2, 4 - la x3, r - bn.sid x2++, 0(x3) - la x3, s - bn.sid x2, 0(x3) - - ret + unimp + unimp .bss diff --git a/sw/otbn/crypto/p256_isoncurve.s b/sw/otbn/crypto/p256_isoncurve.s new file mode 100644 index 0000000000000..07d139bb5e404 --- /dev/null +++ b/sw/otbn/crypto/p256_isoncurve.s @@ -0,0 +1,86 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +.globl p256_isoncurve + +/** + * Checks if a point is a valid curve point on curve P-256 (secp256r1) + * + * Returns rhs = x^3 + ax + b mod p + * and lhs = y^2 mod p + * with x,y being the affine coordinates of the curve point + * a, b and p being the domain parameters of P-256 + * + * This routine checks if a point with given x- and y-coordinate is a valid + * curve point on P-256. + * The routine checks whether the coordinates are a solution of the + * Weierstrass equation y^2 = x^3 + ax + b mod p. + * The routine makes use of the property that the domain parameter 'a' can be + * written as a=-3 for the P-256 curve, hence the routine is limited to P-256.
+ * The routine does not return a boolean result but computes the left side + * and the right side of the Weierstrass equation and leaves the final + * comparison to the caller. + * The routine runs in constant time. + * + * Flags: Flags have no meaning beyond the scope of this subroutine. + * + * @param[in] w31: all-zero + * @param[in] dmem[x]: affine x-coordinate of input point + * @param[in] dmem[y]: affine y-coordinate of input point + * @param[out] w18: rhs, right side of equation = (x^3 + ax + b) mod p + * @param[out] w19: lhs, left side of equation = y^2 mod p + * + * clobbered registers: x2, x3, x19, x20, w0, w19 to w29 + * clobbered flag groups: FG0 + */ +p256_isoncurve: + /* Set up for coordinate arithmetic. + MOD <= p + w28 <= r256 + w29 <= r448 */ + jal x1, setup_modp + + /* load domain parameter b from dmem + w27 <= b = dmem[p256_b] */ + li x2, 27 + la x3, p256_b + bn.lid x2, 0(x3) + + /* load affine x-coordinate of curve point from dmem + w26 <= dmem[x] */ + la x3, x + li x2, 26 + bn.lid x2, 0(x3) + + /* w19 <= x^2 = w26*w26 */ + bn.mov w25, w26 + bn.mov w24, w26 + jal x1, mul_modp + + /* w19 = x^3 <= x^2 * x = w25*w24 = w26*w19 */ + bn.mov w25, w19 + bn.mov w24, w26 + jal x1, mul_modp + + /* for curve P-256, 'a' can be written as a = -3, therefore we subtract + x three times from x^3.
+ w19 = x^3 + ax <= x^3 - 3x mod p */ + bn.subm w19, w19, w26 + bn.subm w19, w19, w26 + bn.subm w19, w19, w26 + + /* w18 <= x^3 + ax + b mod p = w19 + w27 mod p = rhs */ + bn.addm w18, w19, w27 + + /* Load affine y-coordinate of curve point from dmem + w24 <= dmem[y] */ + la x3, y + li x2, 24 + bn.lid x2, 0(x3) + + /* w19 <= w24*w24 mod p = y^2 mod p = lhs */ + bn.mov w25, w24 + jal x1, mul_modp + + ret diff --git a/sw/otbn/crypto/p256_key_from_seed_sca.s b/sw/otbn/crypto/p256_key_from_seed_sca.s index 5a6b7d04538d6..5429b75981d61 100644 --- a/sw/otbn/crypto/p256_key_from_seed_sca.s +++ b/sw/otbn/crypto/p256_key_from_seed_sca.s @@ -53,33 +53,35 @@ run_gen_secret_key: /* Load shares of seed from DMEM. [w21,w20] <= dmem[seed0] - [w23,w33] <= dmem[seed1] */ + [w11,w10] <= dmem[seed1] */ li x2, 20 la x3, seed0 bn.lid x2, 0(x3++) li x2, 21 - bn.lid x2++, 0(x3) + bn.lid x2, 0(x3) + li x2, 10 la x3, seed1 bn.lid x2, 0(x3++) - li x2, 23 + li x2, 11 bn.lid x2, 0(x3) /* Generate the derived secret key. [w21,w20] <= d0 - [w23,w33] <= d1 */ + [w11,w10] <= d1 */ jal x1, p256_key_from_seed /* Write the results to DMEM. dmem[d0] <= [w21, w20] - dmem[d1] <= [w23, w22] */ + dmem[d1] <= [w11, w10] */ li x2, 20 la x3, d0 bn.sid x2, 0(x3++) li x2, 21 - bn.sid x2++, 0(x3) + bn.sid x2, 0(x3) + li x2, 10 la x3, d1 bn.sid x2, 0(x3++) - li x2, 23 + li x2, 11 bn.sid x2, 0(x3) ret diff --git a/sw/otbn/crypto/p256_mod_inv_sca.s b/sw/otbn/crypto/p256_mod_inv_sca.s index ac8464dca7744..5b7c3d53b0fcf 100644 --- a/sw/otbn/crypto/p256_mod_inv_sca.s +++ b/sw/otbn/crypto/p256_mod_inv_sca.s @@ -25,7 +25,7 @@ main: li x2, 29 la x3, p256_n bn.lid x2, 0(x3) - bn.wsrw 0, w29 + bn.wsrw MOD, w29 /* Load first share of input. w0, w1 <= dmem[k0] */ @@ -48,7 +48,7 @@ main: /* Generate a random 127-bit number. w4 <= URND()[255:129] */ - bn.wsrr w4, 0x2 /* URND */ + bn.wsrr w4, URND bn.rshi w4, w31, w4 >> 129 /* Add 1 to get a 128-bit nonzero scalar for masking.
diff --git a/sw/otbn/crypto/p256_shared_key.s b/sw/otbn/crypto/p256_shared_key.s new file mode 100644 index 0000000000000..122f4926a2473 --- /dev/null +++ b/sw/otbn/crypto/p256_shared_key.s @@ -0,0 +1,358 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/* Public interface. */ +.globl p256_shared_key + +/* Exposed only for testing or SCA purposes. */ +.globl arithmetic_to_boolean_mod +.globl arithmetic_to_boolean + +.text + +/** + * Externally callable wrapper for P-256 scalar point multiplication. + * + * Returns x0, x1 such that x0 ^ x1 = x-coordinate of (d * P). + * + * This routine is specialized for ECDH shared key generation and includes an + * arithmetic-to-boolean masking conversion. + * + * This routine assumes that the scalar d is provided in two arithmetic shares, + * d0 and d1, where d = (d0 + d1) mod n. + * + * This routine runs in constant time. + * + * @param[in] dmem[d0]: first share of scalar d (320 bits) + * @param[in] dmem[d1]: second share of scalar d (320 bits) + * @param[in] dmem[x]: affine x-coordinate in dmem + * @param[in] dmem[y]: affine y-coordinate in dmem + * @param[out] dmem[x]: x0, first share of x-coordinate in dmem + * @param[out] dmem[y]: x1, second share of x-coordinate in dmem + * + * Flags: Flags have no meaning beyond the scope of this subroutine (FG0 is + * last written by the comparison inside arithmetic_to_boolean_mod). + * + * clobbered registers: x2 to x4, x16, x17, x21, x22, w0 to w27, w29 + * clobbered flag groups: FG0 + */ +p256_shared_key: + /* Init all-zero register. */ + bn.xor w31, w31, w31 + + /* Load first share of secret key d from dmem. + w0,w1 = dmem[d0] */ + la x16, d0 + li x2, 0 + bn.lid x2, 0(x16++) + li x2, 1 + bn.lid x2, 0(x16) + + /* Load second share of secret key d from dmem.
+ w2,w3 = dmem[d1] */ + la x16, d1 + li x2, 2 + bn.lid x2, 0(x16++) + li x2, 3 + bn.lid x2, 0(x16) + + /* Call internal scalar multiplication routine. + Returns point in projective coordinates. + R = (x, y, z) = (w8, w9, w10) <= k*P = w0*P */ + la x21, x + la x22, y + jal x1, scalar_mult_int + + /* Arithmetic masking: + 1. Generate a random mask + 2. Subtract masks from projective x coordinate + (x, y, z) -> ((x - m) mod p, + y, + z) + 3. Convert masked curve point back to affine + form. + 4. Multiply mask with z^-1 for use in + affine space. */ + + /* Fetch a fresh random number as mask. + w2 <= URND() */ + bn.wsrr w2, URND + + /* Subtract random mask from x coordinate of + projective point. + The subtraction has to be done within the underlying + finite field -> mod p. + w8 = (w8 - w2) mod p */ + bn.subm w8, w8, w2 + + /* Convert masked result back to affine coordinates. + R = (x_a, y_a) = (w11, w12) */ + jal x1, proj_to_affine + + /* Store result (masked affine x-coordinate) in DMEM. + Y-coordinate not needed, will be overwritten with + mask value below. + dmem[x] <= x_a = w11 */ + li x2, 11 + bn.sid x2, 0(x21) + + /* Get modular inverse z^-1 of projective z coordinate + and multiply the random masks with z^-1 to + also convert them into affine space. */ + + /* Move z^-1 and x coordinate mask to mul_modp input WDRs. + z^-1 is still stored in w14 from previous + proj_to_affine call. + w25 <= w14 = z^-1 + w24 <= w2 = m_x */ + bn.mov w25, w14 + bn.mov w24, w2 + + /* Compute modular multiplication of m_x and z^-1. + w19 = w24 * w25 mod p = m_x * z^-1 mod p = x1 */ + jal x1, mul_modp + + /* Store "affine" mask to DMEM. Use the y-coordinate + to save memory (not needed afterwards) + dmem[y] <= w19 = x1 */ + li x2, 19 + bn.sid x2, 0(x22) + + /* Arithmetic-to-boolean conversion. 
+ w20 <= x ^ x1 = x0 */ + jal x1, arithmetic_to_boolean_mod + + /* dmem[x] <= w20 = x0 */ + li x3, 20 + la x4, x + bn.sid x3, 0(x4) + + ret + +/** + * Converts arithmetic shares mod p to boolean shares. + * + * Calls the 257-bit A2B function twice, first using unmodified 256-bit shares + * in reduced form, and then using modified 257-bit shares in unreduced form. + * + * It then checks if the MSB (carry bit) is true or false, to decide + * which of the two A2B results is used. This detects and handles an + * underflow during the subtraction of arithmetic masking. + * + * The logic behind the carry bit handling is as follows: + * If x >= r, then A = (x - r) mod p = x - r exactly. + * So when we add 2^257 - p and then add A and x, we get + * (2^257 - p + x - r + r) mod 2^257 = 2^257 - p + x. + * In this case, the high bit is always true since p - x <= p < 2^256, + * so we choose the A2B conversion without the 2^257 - p added. + * On the other hand, if x < r, then A = (x - r) mod p = x - r + p. + * When we add 2^257 - p and then add A and x, we get + * (2^257 - p + x - r + p + r) mod 2^257 = (2^257 + x) mod 2^257 = x. + * In this case, the high bit is always false since x < p < 2^256, so we + * choose this second A2B conversion. + * + * This routine runs in constant time. + * + * Flags: Flags have no meaning beyond the scope of this subroutine. + * + * @param[in] w31: all-zero wide data register + * @param[in] w19: mask r + * @param[in] w11: arithmetically masked value A, such that x = A + r + * @param[out] w20: boolean masked value x', such that x = x' ^ r + * + * clobbered registers: w1 to w6, w11, w12, w18, w20 to w27, and w29 + * clobbered flag groups: FG0 + */ +arithmetic_to_boolean_mod: + /* First step: calculate A2B from reduced values. */ + + /* Save inputs for second A2B execution. 
+ w24 <= w19 = r + w25 <= w11 = A */ + bn.mov w24, w19 + bn.mov w25, w11 + + /* Expand inputs r and A (w19 and w11) to 257-bit values [w19,w18] + and [w12,w11] and prepare input for 257-bit A2B function. + w18 <= w19 + w19 <= w31 + w11 <= w11 -> obsolete + w12 <= w31 */ + bn.mov w18, w19 + bn.mov w19, w31 + bn.mov w12, w31 + + /* Call 257-bit A2B function. + [w21,w20] <= x' */ + jal x1, arithmetic_to_boolean + + /* Save intermediate result of reduced inputs. + w26 <= w20 = x' (lower part) + w27 <= w21 = x' (upper part) */ + bn.mov w26, w20 + bn.mov w27, w21 + + /* Second step: calculate A2B from unreduced values. */ + + /* Restore and expand inputs r and A (w19 and w11) to 257-bit + values [w19,w18] and [w12,w11] and prepare input for + 257-bit A2B function. + w18 <= w24 + w19 <= w31 + w11 <= w25 + w12 <= w31 */ + bn.mov w18, w24 + bn.mov w19, w31 + bn.mov w11, w25 + bn.mov w12, w31 + + /* Get field modulus p. + w29 <= MOD() */ + bn.wsrr w29, MOD + + /* Convert input A ([w12,w11]) to an unreduced value + in the 2^257 domain. For this add (2^257 - p) to A. + [w12,w11] <= [w12,w11] + 2^257 - w29 = A + 2^257 - p + w12 <= w12 + 0x2 = A + 2^257 + -> equal to addition of 2^257 + (w11 doesn't need to be touched) + [w12,w11] <= [w12,w11] - w29 = (A + 2^257) - p */ + bn.addi w12, w12, 0x2 + bn.sub w11, w11, w29 + bn.subb w12, w12, w31 + + /* Call 257-bit A2B function. + [w21,w20] <= x' */ + jal x1, arithmetic_to_boolean + + /* Restore initial mask input of w19 for consistency + in calling functions. + w19 <= w24 */ + bn.mov w19, w24 + + /* Check MSB (carry bit) of second A2B result for true or false. */ + bn.cmp w21, w31 /* w21 can only be 0x1 or 0x0 */ + + /* Return the unreduced A2B computation (second result), + if zero flag is set, otherwise return the reduced + A2B computation (first result). */ + bn.sel w20, w20, w26, FG0.Z + + ret + +/** + * Convert arithmetic shares to boolean ones using Goubin's algorithm. 
+ * + * We use Goubin's arithmetic-to-boolean masking algorithm to switch from + * an arithmetic masking scheme to a boolean one without ever unmasking the + * seed. See Algorithm 2 here: + * https://link.springer.com/content/pdf/10.1007/3-540-44709-1_2.pdf + * + * This implementation expands the algorithm to 257 bits for carry bit + * handling. The carry bit can be used to detect and handle an + * underflow during the subtraction of arithmetic masking. + * + * This routine runs in constant time. + * + * Flags: Flags have no meaning beyond the scope of this subroutine. + * + * @param[in] w31: all-zero wide data register + * @param[in] w18: lower part of mask r + * @param[in] w19: upper part of mask r + * @param[in] w11: lower part of arithmetically masked value A, + * such that x = A + r + * @param[in] w12: upper part of arithmetically masked value A, + * such that x = A + r + * @param[out] w20: lower part of boolean masked value x', + * such that x = x' ^ r + * @param[out] w21: upper part of boolean masked value x', + * such that x = x' ^ r + * + * clobbered registers: w1 - w6, w11, w12, and w18 - w21 + * clobbered flag groups: FG0 + */ +arithmetic_to_boolean: + /* Initialize inputs: in case of randomness in upper part of inputs + truncate to 257 bits. */ + bn.rshi w19, w19, w31 >> 1 + bn.rshi w19, w31, w19 >> 255 + bn.rshi w12, w12, w31 >> 1 + bn.rshi w12, w31, w12 >> 255 + + /* Fetch 257 bits of randomness. + [w2,w1] = gamma <= URND */ + bn.wsrr w1, URND + bn.wsrr w2, URND + bn.rshi w2, w31, w2 >> 255 + + /* Double gamma and truncate to 257 bits.
+ [w4,w3] = T <= 2 * [w2,w1] = 2 * gamma */ + bn.add w3, w1, w1 + bn.addc w4, w2, w2 + bn.rshi w4, w4, w31 >> 1 + bn.rshi w4, w31, w4 >> 255 + + /* [w21,w20] = x' <= [w2,w1] ^ [w19,w18] = gamma ^ r */ + bn.xor w20, w1, w18 + bn.xor w21, w2, w19 + + /* [w6,w5] = omega <= [w2,w1] & [w21,w20] = gamma & x' */ + bn.and w5, w1, w20 + bn.and w6, w2, w21 + + /* [w21,w20] = x' <= [w4,w3] ^ [w12,w11] = T ^ A */ + bn.xor w20, w3, w11 + bn.xor w21, w4, w12 + + /* [w2,w1] = gamma <= [w2,w1] ^ [w21,w20] = gamma ^ x' */ + bn.xor w1, w1, w20 + bn.xor w2, w2, w21 + + /* [w2,w1] = gamma <= [w2,w1] & [w19,w18] = gamma & r */ + bn.and w1, w1, w18 + bn.and w2, w2, w19 + + /* [w6,w5] = omega <= [w6,w5] ^ [w2,w1] = omega ^ gamma */ + bn.xor w5, w5, w1 + bn.xor w6, w6, w2 + + /* [w2,w1] = gamma <= [w4,w3] & [w12,w11] = T & A */ + bn.and w1, w3, w11 + bn.and w2, w4, w12 + + /* [w6,w5] = omega <= [w6,w5] ^ [w2,w1] = omega ^ gamma */ + bn.xor w5, w5, w1 + bn.xor w6, w6, w2 + + /* Loop for k = 1 to K - 1 = 257 - 1 */ + loopi 256, 12 + + /* [w2,w1] = gamma <= [w4,w3] & [w19,w18] = T & r */ + bn.and w1, w3, w18 + bn.and w2, w4, w19 + + /* [w2,w1] = gamma <= [w2,w1] ^ [w6,w5] = gamma ^ omega */ + bn.xor w1, w1, w5 + bn.xor w2, w2, w6 + + /* [w4,w3] = T <= [w4,w3] & [w12,w11] = T & A */ + bn.and w3, w3, w11 + bn.and w4, w4, w12 + + /* [w2,w1] = gamma <= [w2,w1] ^ [w4,w3] = gamma ^ T */ + bn.xor w1, w1, w3 + bn.xor w2, w2, w4 + + /* Double gamma and truncate to 257 bits. + [w4,w3] = T <= 2 * [w2,w1] = 2 * gamma */ + bn.add w3, w1, w1 + bn.addc w4, w2, w2 + bn.rshi w4, w4, w31 >> 1 + bn.rshi w4, w31, w4 >> 255 + + /* [w21,w20] = x' <= [w21,w20] ^ [w4,w3] = x' ^ T */ + bn.xor w20, w20, w3 + bn.xor w21, w21, w4 + + ret diff --git a/sw/otbn/crypto/p256_sign.s b/sw/otbn/crypto/p256_sign.s new file mode 100644 index 0000000000000..0adffc0d5b9b0 --- /dev/null +++ b/sw/otbn/crypto/p256_sign.s @@ -0,0 +1,289 @@ +/* Copyright lowRISC contributors. 
*/ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/* Copyright 2016 The Chromium OS Authors. All rights reserved. + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE.dcrypto file. + * + * Derived from code in + * https://chromium.googlesource.com/chromiumos/platform/ec/+/refs/heads/cr50_stab/chip/g/dcrypto/dcrypto_p256.c + */ + +.globl p256_sign + +.text + + /** + * P-256 ECDSA signature generation + * + * returns the signature as the pair r, s with + * r = x_1 mod n + * and s = k^(-1)(msg + r*d) mod n + * with x_1 being the affine x-coordinate of the curve point k*G, + * where G is the curve's base point. + * k being a supplied secret random number, + * n being the order of the base point G of P-256, + * msg being the msg to be signed, + * d being the private key. + * + * This routine runs in constant time. + * + * Note: Some versions of the ECDSA spec suggest that msg must be reduced + * modulo n (e.g. RFC 6979, section 2.4). However, for this implementation, it + * is sufficient that msg < 2^256, because the message is multiplied with + * k^(-1) mod n, and our Barrett multiplication implementation accepts any + * operands a and b such that a * b < 2^256 * p and fully reduces the result. + * + * This routine assumes that the secret scalars d and k are provided in two + * shares each (d0/d1 and k0/k1 respectively), where + * d = (d0 + d1) mod n + * k = (k0 + k1) mod n + * + * Each share is 320 bits, which gives us 64 bits of extra redundancy modulo n + * (256 bits). This is a protection measure against side-channel attacks. 
+ * + * For s = k^-1 * (r * d + msg), we compute a random nonzero masking scalar + * alpha, and compute s as: + * s = ((k * alpha)^-1 * (r * (d * alpha) + alpha * msg)) mod n + * + * We choose alpha to be at most 128 bits, so the product with a 320b share + * fits in the same 512-bit modular reduction routine that we use for + * 256x256-bit multiplications. It should be safe to compute e.g. k * alpha = + * (k0 * alpha + k1 * alpha) mod n, because alpha has enough randomness to mask + * the true value of k. + * + * @param[in] dmem[k0]: first share of secret scalar (320 bits) + * @param[in] dmem[k1]: second share of secret scalar (320 bits) + * @param[in] dmem[msg]: message to be signed (256 bits) + * @param[out] dmem[r]: dmem buffer for r component of signature (256 bits) + * @param[out] dmem[s]: dmem buffer for s component of signature (256 bits) + * @param[in] dmem[d0]: first share of private key d (320 bits) + * @param[in] dmem[d1]: second share of private key d (320 bits) + * + * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on + * the computed affine y-coordinate.
+ * + * clobbered registers: x2, x3, x16 to x23, w0 to w26 + * clobbered flag groups: FG0 + */ +p256_sign: + + /* init all-zero register */ + bn.xor w31, w31, w31 + + /* load first share of secret scalar k from dmem: w0,w1 = dmem[k0] */ + la x16, k0 + li x2, 0 + bn.lid x2, 0(x16++) + li x2, 1 + bn.lid x2, 0(x16) + + /* load second share of secret scalar k from dmem: w2,w3 = dmem[k1] */ + la x16, k1 + li x2, 2 + bn.lid x2, 0(x16++) + li x2, 3 + bn.lid x2, 0(x16) + + /* setup modulus n (curve order) and Barrett constant + MOD <= w29 <= n = dmem[p256_n]; w28 <= u_n = dmem[p256_u_n] */ + li x2, 29 + la x3, p256_n + bn.lid x2, 0(x3) + bn.wsrw MOD, w29 + li x2, 28 + la x3, p256_u_n + bn.lid x2, 0(x3) + + /* scalar multiplication with base point (projective) + (x_1, y_1, z_1) = (w8, w9, w10) <= k*G = w0*(dmem[p256_gx], dmem[p256_gy]) */ + la x21, p256_gx + la x22, p256_gy + jal x1, scalar_mult_int + + /* Convert masked result back to affine coordinates. + R = (x_a, y_a) = (w11, w12) */ + jal x1, proj_to_affine + + /* setup modulus n (curve order) and Barrett constant + MOD <= w29 <= n = dmem[p256_n]; w28 <= u_n = dmem[p256_u_n] */ + li x2, 29 + la x3, p256_n + bn.lid x2, 0(x3) + bn.wsrw MOD, w29 + li x2, 28 + la x3, p256_u_n + bn.lid x2, 0(x3) + + /* re-load first share of secret scalar k from dmem: w0,w1 = dmem[k0] */ + la x16, k0 + li x2, 0 + bn.lid x2, 0(x16++) + li x2, 1 + bn.lid x2, 0(x16) + + /* re-load second share of secret scalar k from dmem: w2,w3 = dmem[k1] */ + la x16, k1 + li x2, 2 + bn.lid x2, 0(x16++) + li x2, 3 + bn.lid x2, 0(x16) + + /* Generate a random 127-bit number. + w4 <= URND()[255:129] */ + bn.wsrr w4, URND + bn.rshi w4, w31, w4 >> 129 + + /* Add 1 to get a 128-bit nonzero scalar for masking. 
+ w4 <= w4 + 1 = alpha */ + bn.addi w4, w4, 1 + + /* w0 <= ([w0,w1] * w4) mod n = (k0 * alpha) mod n */ + bn.mov w24, w0 + bn.mov w25, w1 + bn.mov w26, w4 + jal x1, mod_mul_320x128 + bn.mov w0, w19 + + /* w19 <= ([w2,w3] * w26) mod n = (k1 * alpha) mod n */ + bn.mov w24, w2 + bn.mov w25, w3 + jal x1, mod_mul_320x128 + + /* w0 <= (w0+w19) mod n = (k * alpha) mod n */ + bn.addm w0, w0, w19 + + /* w1 <= w0^-1 mod n = (k * alpha)^-1 mod n */ + jal x1, mod_inv + + /* Load first share of secret key d from dmem. + w2,w3 = dmem[d0] */ + la x16, d0 + li x2, 2 + bn.lid x2, 0(x16++) + li x2, 3 + bn.lid x2, 0(x16) + + /* Load second share of secret key d from dmem. + w5,w6 = dmem[d1] */ + la x16, d1 + li x2, 5 + bn.lid x2, 0(x16++) + li x2, 6 + bn.lid x2, 0(x16) + + /* w0 <= ([w2,w3] * w4) mod n = (d0 * alpha) mod n */ + bn.mov w24, w2 + bn.mov w25, w3 + bn.mov w26, w4 + jal x1, mod_mul_320x128 + bn.mov w0, w19 + + /* w19 <= ([w5,w6] * w4) mod n = (d1 * alpha) mod n */ + bn.mov w24, w5 + bn.mov w25, w6 + bn.mov w26, w4 + jal x1, mod_mul_320x128 + + /* w0 <= (w0+w19) mod n = (d * alpha) mod n */ + bn.addm w0, w0, w19 + + /* Compare to 0. + FG0.Z <= (w0 =? w31) = ((d * alpha) mod n =? 0) */ + bn.cmp w0, w31 + + /* Trigger a fault if FG0.Z is set, aborting the computation. + + Since alpha is nonzero mod n, (d * alpha) mod n = 0 means d is zero mod n, + which violates ECDSA private key requirements. This could technically be + triggered by an unlucky key manager seed, but the probability is so low (~1/n) + that it more likely indicates a fault attack. */ + jal x1, trigger_fault_if_fg0_z + + /* w24 = r <= w11 mod n */ + bn.addm w24, w11, w31 + + /* Store r of signature in dmem. 
+ dmem[r] <= r = w24 */ + la x19, r + li x2, 24 + bn.sid x2, 0(x19) + + /* w19 <= (w24 * w0) mod n = (r * d * alpha) mod n */ + bn.mov w25, w0 + jal x1, mod_mul_256x256 + + /* w0 <= (w1 * w19) mod n = ((k * alpha)^-1 * (r * d * alpha)) mod n + = (k^-1 * r * d) mod n */ + bn.mov w24, w1 + bn.mov w25, w19 + jal x1, mod_mul_256x256 + bn.mov w0, w19 + + /* Load message from dmem: + w24 = msg <= dmem[msg] */ + la x18, msg + li x2, 24 + bn.lid x2, 0(x18) + + /* w19 <= (w24 * w4) mod n = (msg * alpha) mod n */ + bn.mov w25, w4 + jal x1, mod_mul_256x256 + + /* w19 <= (w1 * w19) mod n = ((k * alpha)^-1 * (msg * alpha)) mod n + = (k^-1 * msg) mod n */ + bn.mov w24, w1 + bn.mov w25, w19 + jal x1, mod_mul_256x256 + + /* w0 <= (w0 + w19) mod n = (k^-1*r*d + k^-1*msg) mod n = s */ + bn.addm w0, w0, w19 + + /* Store s of signature in dmem. + dmem[s] <= s = w0 */ + la x20, s + li x2, 0 + bn.sid x2, 0(x20) + + ret + +.section .bss + +/* random scalar k (in two 320b shares) */ +.balign 32 +.weak k0 +k0: + .zero 64 +.balign 32 +.weak k1 +k1: + .zero 64 + +/* message digest */ +.balign 32 +.weak msg +msg: + .zero 32 + +/* signature R */ +.balign 32 +.weak r +r: + .zero 32 + +/* signature S */ +.balign 32 +.weak s +s: + .zero 32 + +/* private key d (in two 320b shares) */ +.balign 32 +.weak d0 +d0: + .zero 64 +.balign 32 +.weak d1 +d1: + .zero 64 diff --git a/sw/otbn/crypto/p256_verify.s b/sw/otbn/crypto/p256_verify.s new file mode 100644 index 0000000000000..78315f7e51319 --- /dev/null +++ b/sw/otbn/crypto/p256_verify.s @@ -0,0 +1,417 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/* Copyright 2016 The Chromium OS Authors. All rights reserved. + * Use of this source code is governed by a BSD-style license that can be + * found in the LICENSE.dcrypto file.
+ * + * Derived from code in + * https://chromium.googlesource.com/chromiumos/platform/ec/+/refs/heads/cr50_stab/chip/g/dcrypto/dcrypto_p256.c + */ + +.globl p256_verify + +.text + + /** + * P-256 ECDSA signature verification + * + * returns the affine x-coordinate of + * (x1, y1) = u1*G + u2*Q + * with u1 = z*s^-1 mod n and u2 = r*s^-1 mod n + * with G being the curve's base point, + * z being the message + * r, s being the signature + * Q being the public key. + * + * The routine computes the x1 coordinate and places it in dmem. x1 will be + * reduced (mod n), however, the final comparison has to be performed on the + * host side. The signature is valid if x1 == r. + * This routine runs in variable time. + * + * @param[in] dmem[msg]: message to be verified (256 bits) + * @param[in] dmem[r]: r component of signature (256 bits) + * @param[in] dmem[s]: s component of signature (256 bits) + * @param[in] dmem[x]: affine x-coordinate of public key (256 bits) + * @param[in] dmem[y]: affine y-coordinate of public key (256 bits) + * @param[out] dmem[x_r]: dmem buffer for reduced affine x_r-coordinate (x_1) + * + * Flags: Flags have no meaning beyond the scope of this subroutine. 
+ * + * clobbered registers: x2, x3, x13, x14, x17 to x24, w0 to w25 + * clobbered flag groups: FG0 + */ +p256_verify: + + /* init all-zero register */ + bn.xor w31, w31, w31 + + /* load domain parameter b from dmem + w27 <= b = dmem[p256_b] */ + li x2, 27 + la x3, p256_b + bn.lid x2, 0(x3) + + /* setup modulus n (curve order) and Barrett constant + MOD <= w29 <= n = dmem[p256_n]; w28 <= u_n = dmem[p256_u_n] */ + li x2, 29 + la x3, p256_n + bn.lid x2, 0(x3) + bn.wsrw MOD, w29 + li x2, 28 + la x3, p256_u_n + bn.lid x2, 0(x3) + + /* load s of signature from dmem: w0 = s = dmem[s] */ + la x20, s + bn.lid x0, 0(x20) + + /* goto 'fail' if w0 == w31 <=> s == 0 */ + bn.cmp w0, w31 + csrrs x2, FG0, x0 + andi x2, x2, 8 + bne x2, x0, fail + + /* goto 'fail' if w0 >= w29 <=> s >= n */ + bn.cmp w0, w29 + csrrs x2, FG0, x0 + andi x2, x2, 1 + beq x2, x0, fail + + /* w1 = s^-1 mod n */ + jal x1, mod_inv_var + + /* load r of signature from dmem: w24 = r = dmem[r] */ + la x19, r + li x2, 24 + bn.lid x2, 0(x19) + + /* goto 'fail' if w24 == w31 <=> r == 0 */ + bn.cmp w24, w31 + csrrs x2, FG0, x0 + andi x2, x2, 8 + bne x2, x0, fail + + /* goto 'fail' if w24 >= w29 <=> r >= n */ + bn.cmp w24, w29 + csrrs x2, FG0, x0 + andi x2, x2, 1 + beq x2, x0, fail + + /* w25 = s^-1 = w1 */ + bn.mov w25, w1 + + /* u2 = w0 = w19 <= w24*w25 = r*s^-1 mod n */ + jal x1, mod_mul_256x256 + bn.mov w0, w19 + + /* load message, w24 = msg = dmem[msg] */ + la x18, msg + li x2, 24 + bn.lid x2, 0(x18) + + /* u1 = w1 = w19 <= w24*w25 = w24*w1 = msg*s^-1 mod n */ + bn.mov w25, w1 + jal x1, mod_mul_256x256 + bn.mov w1, w19 + + /* Set up for coordinate arithmetic.
+ MOD <= p + w28 <= r256 + w29 <= r448 */ + jal x1, setup_modp + + /* load public key Q from dmem and use in projective form (set z to 1) + Q = (w11, w12, w13) = (dmem[x], dmem[y], 1) */ + li x2, 11 + la x21, x + bn.lid x2++, 0(x21) + la x22, y + bn.lid x2, 0(x22) + bn.addi w13, w31, 1 + + /* load base point G and use in projective form (set z to 1) + G = (w8, w9, w10) = (x_g, y_g, 1) */ + li x13, 8 + la x23, p256_gx + bn.lid x13, 0(x23) + li x14, 9 + la x24, p256_gy + bn.lid x14, 0(x24) + bn.addi w10, w31, 1 + + /* The rest of the routine implements a variable time double-and-add + algorithm. For the signature verification we need to compute the point + C = (x1, y1) = u_1*G + u_2*Q. This can be done in a single + double-and-add routine by using Shamir's Trick. */ + + /* G+Q = (w3,w4,w5) = (w11,w12,w13) = (w8,w9,w10) (+) (w11,w12,w13) */ + jal x1, proj_add + bn.mov w3, w11 + bn.mov w4, w12 + bn.mov w5, w13 + + /* w2 = u_2 & u_1 = w0 & w1*/ + bn.and w2, w0, w1 + + /* init double and add algorithm with (0, 1, 0) */ + bn.mov w11, w31 + bn.addi w12, w31, 1 + bn.mov w13, w31 + + /* main loop with decreasing index i (i=255 downto 0) */ + loopi 256, 31 + + /* always double: C = (w11,w12,w13) <= 2 (*) C = 2 (*) (w11,w12,w13) */ + bn.mov w8, w11 + bn.mov w9, w12 + bn.mov w10, w13 + jal x1, proj_add + + /* if either u_1[i] == 0 or u_2[i] == 0 jump to 'no_both' */ + bn.add w2, w2, w2 + csrrs x2, FG0, x0 + andi x2, x2, 1 + beq x2, x0, no_both + + /* both bits at current index (u1[i] and u2[i]) are set: + do C <= C + (G + Q) and jump to end */ + bn.mov w8, w3 + bn.mov w9, w4 + bn.mov w10, w5 + jal x1, proj_add + jal x0, no_q + + /* either u1[i] or u2[i] is set, but not both */ + no_both: + + /* if u2[i] is not set jump to 'no_g' */ + bn.add w6, w0, w0 + csrrs x2, FG0, x0 + andi x2, x2, 1 + beq x2, x0, no_g + + /* u2[i] is set: do C <= C + Q */ + bn.lid x13, 0(x21) + bn.lid x14, 0(x22) + bn.addi w10, w31, 1 + jal x1, proj_add + + no_g: + /* if u1[i] is not set jump to 'no_q' */
+ bn.add w6, w1, w1 + csrrs x2, FG0, x0 + andi x2, x2, 1 + beq x2, x0, no_q + + /* load base point x-coordinate + w8 <= g_x = dmem [p256_gx]; w9 <= g_y = dmem[p256_gy] */ + bn.lid x13, 0(x23) + bn.lid x14, 0(x24) + + /* u1[i] is set: do C <= C + G */ + bn.addi w10, w31, 1 + jal x1, proj_add + + no_q: + /* left shift w0 and w1 to decrease index */ + bn.add w0, w0, w0 + bn.add w1, w1, w1 + + /* compute inverse of z-coordinate: w1 = z_c^-1 mod p */ + bn.mov w0, w13 + jal x1, mod_inv_var + + /* convert x-coordinate of C back to affine: x1 = x_c * z_c^-1 mod p */ + bn.mov w24, w1 + bn.mov w25, w11 + jal x1, mul_modp + + /* final reduction: w24 = x1 <= x1 mod n */ + la x3, p256_n + bn.lid x0, 0(x3) + bn.wsrw MOD, w0 + bn.subm w24, w19, w31 + + fail: + /* store affine x-coordinate in dmem: dmem[x_r] = w24 = x_r */ + la x17, x_r + li x2, 24 + bn.sid x2, 0(x17) + + ret + + +/** + * Variable time modular multiplicative inverse computation + * + * Returns c <= a^(-1) mod m + * with a being a bigint of length 256 bit with a < m + * m being the modulus with a length of 256 bit + * c being a 256-bit result + * + * This routine implements the computation of the modular multiplicative + * inverse based on the binary GCD or Stein's algorithm. + * The implemented variant is based on the + * "right-shift binary extended GCD" as it is described in section 3.1 of [1] + * (Algorithm 1). + * [1] https://doi.org/10.1155/ES/2006/32192 + * + * Note that this is a variable time implementation. I.e. this routine will + * show a data dependent timing and execution profile. Only use in situations + * where a full white-box environment is acceptable. + * + * Flags: Flags have no meaning beyond the scope of this subroutine. 
+ * + * @param[in] w0: a, operand + * @param[in] MOD: m, modulus + * @param[in] w31: all-zero + * @param[out] w1: result c + * + * clobbered registers: x2, w2 to w7 + * clobbered flag groups: FG0 + */ +mod_inv_var: + + /* w2 = r = 0 */ + bn.mov w2, w31 + + /* w3 = s = 1 */ + bn.addi w3, w31, 1 + + /* w4 = u = MOD; w7 = m = MOD */ + bn.wsrr w4, MOD + bn.wsrr w7, MOD + + /* w5 = v = w0 */ + bn.mov w5, w0 + + ebgcd_loop: + /* test if u is odd */ + bn.or w4, w4, w4 + csrrs x2, FG0, x0 + andi x2, x2, 4 + bne x2, x0, ebgcd_u_odd + + /* u is even: */ + /* w4 = u <= u/2 = w4 >> 1 */ + bn.rshi w4, w31, w4 >> 1 + + /* test if r is odd */ + bn.or w2, w2, w2 + csrrs x2, FG0, x0 + andi x2, x2, 4 + bne x2, x0, ebgcd_r_odd + + /* r is even: */ + /* w2 = r <= r/2 = w2 >> 1 */ + bn.rshi w2, w31, w2 >> 1 + jal x0, ebgcd_loop + + ebgcd_r_odd: + /* w2 = r <= (r + m)/2 = (w2 + w7) >> 1 */ + bn.add w2, w7, w2 + bn.addc w6, w31, w31 + bn.rshi w2, w6, w2 >> 1 + jal x0, ebgcd_loop + + ebgcd_u_odd: + /* test if v is odd */ + bn.or w5, w5, w5 + csrrs x2, FG0, x0 + andi x2, x2, 4 + bne x2, x0, ebgcd_uv_odd + + /* v is even: */ + /* w5 = v <= v/2 = w5 >> 1 */ + bn.rshi w5, w31, w5 >> 1 + + /* test if s is odd */ + bn.or w3, w3, w3 + csrrs x2, FG0, x0 + andi x2, x2, 4 + bne x2, x0, ebgcd_s_odd + + /* s is even: */ + /* w3 = s <= s/2 = w3 >> 1 */ + bn.rshi w3, w31, w3 >> 1 + jal x0, ebgcd_loop + + ebgcd_s_odd: + /* w3 = s <= (s + m)/2 = (w3 + w7) >> 1 */ + bn.add w3, w7, w3 + bn.addc w6, w31, w31 + bn.rshi w3, w6, w3 >> 1 + jal x0, ebgcd_loop + + ebgcd_uv_odd: + /* test if v >= u */ + bn.cmp w5, w4 + csrrs x2, FG0, x0 + andi x2, x2, 1 + beq x2, x0, ebgcd_v_gte_u + + /* u > v: */ + /* w2 = r <= r - s = w2 - w3; if (r < 0): r <= r + m */ + bn.subm w2, w2, w3 + + /* w4 = u <= u - v = w4 - w5 */ + bn.sub w4, w4, w5 + jal x0, ebgcd_loop + + ebgcd_v_gte_u: + /* w3 = s <= s - r = w3 - w2; if (s < 0) s <= s + m */ + bn.subm w3, w3, w2 + + /* w5 = v <= v - u = w5 - w4 */ + bn.sub w5, w5, w4 + + /* if v > 0 go
back to start of loop */ + csrrs x2, FG0, x0 + andi x2, x2, 8 + beq x2, x0, ebgcd_loop + + /* v <= 0: */ + /* if (r > m): w1 = a = r - m = w2 - MOD else: w1 = a = r = w2 */ + bn.addm w1, w2, w31 + + ret + +.section .bss + +/* message digest */ +.balign 32 +.weak msg +msg: + .zero 32 + +/* signature R */ +.balign 32 +.weak r +r: + .zero 32 + +/* signature S */ +.balign 32 +.weak s +s: + .zero 32 + +/* public key x-coordinate */ +.balign 32 +.weak x +x: + .zero 32 + +/* public key y-coordinate */ +.balign 32 +.weak y +y: + .zero 32 + +/* verification result x_r (aka x_1) */ +.balign 32 +.weak x_r +x_r: + .zero 32 diff --git a/sw/otbn/crypto/p384_a2b.s b/sw/otbn/crypto/p384_a2b.s new file mode 100644 index 0000000000000..8c851829af907 --- /dev/null +++ b/sw/otbn/crypto/p384_a2b.s @@ -0,0 +1,211 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +.globl p384_arithmetic_to_boolean_mod +.globl p384_arithmetic_to_boolean + +.text + +/** + * Converts arithmetic shares mod p to boolean shares. + * + * Calls the 385-bit A2B function twice, first using unmodified 384-bit shares + * in reduced form, and then using modified 385-bit shares in unreduced form. + * + * It then checks if the MSB (carry bit) is true or false, to decide + * which of the two A2B results is used. This detects and handles an + * underflow during the subtraction of arithmetic masking. + * + * The logic behind the carry bit handling is as follows: + * If x >= r, then A = (x - r) mod p = x - r exactly. + * So when we add 2^385 - p and then add A and r, we get + * (2^385 - p + x - r + r) mod 2^385 = 2^385 - p + x. + * In this case, the high bit is always true since p - x <= p < 2^384, + * so we choose the A2B conversion without the 2^385 - p added. + * On the other hand, if x < r, then A = (x - r) mod p = x - r + p.
+ * When we add 2^385 - p and then add A and r, we get + * (2^385 - p + x - r + p + r) mod 2^385 = (2^385 + x) mod 2^385 = x. + * In this case, the high bit is always false since x < p < 2^384, so we + * choose this second A2B conversion. + * + * This routine runs in constant time. + * + * Flags: Flags have no meaning beyond the scope of this subroutine. + * + * @param[in] w31: all-zero wide data register + * @param[in] [w14,w13]: field modulus p + * @param[in] [w19,w18]: mask r + * @param[in] [w12,w11]: arithmetically masked value A, such that x = A + r + * @param[out] [w21,w20]: boolean masked value x', such that x = x' ^ r + * + * clobbered registers: w1 to w6, w10 to w12, w20, w21, w23 to w28 + * clobbered flag groups: FG0 + */ +p384_arithmetic_to_boolean_mod: + /* First step: calculate A2B from reduced values. */ + + /* Save inputs for second A2B execution. + [w24,w23] <= [w19,w18] = r + [w26,w25] <= [w12,w11] = A */ + bn.mov w23, w18 + bn.mov w24, w19 + bn.mov w25, w11 + bn.mov w26, w12 + + /* Call 385-bit A2B function. + [w21,w20] <= x' */ + jal x1, p384_arithmetic_to_boolean + + /* Save intermediate result of reduced inputs. + [w28,w27] <= [w21,w20] = x' */ + bn.mov w27, w20 + bn.mov w28, w21 + + /* Second step: calculate A2B from unreduced values. */ + + /* Restore inputs r and A values [w19,w18] and [w12,w11] and + prepare input for 385-bit A2B function. */ + bn.mov w18, w23 + bn.mov w19, w24 + bn.mov w11, w25 + bn.mov w12, w26 + + /* Convert input A ([w12,w11]) to an unreduced value + in the 2^385 domain. For this add (2^385 - p) to A. + [w12,w11] <= [w12,w11] + 2^385 - [w14,w13] = A + 2^385 - p */ + bn.addi w10, w31, 0x2 + bn.add w12, w12, w10 << 128 + bn.sub w11, w11, w13 + bn.subb w12, w12, w14 + + /* Call 385-bit A2B function. + [w21,w20] <= x' */ + jal x1, p384_arithmetic_to_boolean + + /* Restore initial mask input of w19 for consistency + in calling functions.
+ w18 <= w23 + w19 <= w24 */ + bn.mov w18, w23 + bn.mov w19, w24 + + /* Check MSB (carry bit) of second A2B result for true or false. */ + bn.cmp w31, w21 >> 128 + + /* Return the unreduced A2B computation (second result), + if zero flag is set, otherwise return the reduced + A2B computation (first result). */ + bn.sel w20, w20, w27, FG0.Z + bn.sel w21, w21, w28, FG0.Z + + ret + +/** + * Convert arithmetic shares to boolean ones using Goubin's algorithm. + * + * We use Goubin's boolean-to-arithmetic masking algorithm to switch from + * an arithmetic masking scheme to a boolean one without ever unmasking the + * seed. See Algorithm 2 here: + * https://link.springer.com/content/pdf/10.1007/3-540-44709-1_2.pdf + * + * This implementation expands the algorithm to 385 bits for carry bit + * handling. The carry bit can be used to detect and handle an + * underflow during the subtraction of arithmetic masking. + * + * This routine runs in constant time. + * + * Flags: Flags have no meaning beyond the scope of this subroutine. + * + * @param[in] w31: all-zero wide data register + * @param[in] w18: lower part of mask r + * @param[in] w19: upper part of mask r + * @param[in] w11: lower part of arithmetically masked value A, + * such that x = A + r + * @param[in] w12: upper part of arithmetically masked value A, + * such that x = A + r + * @param[out] w20: lower part of boolean masked value x', + * such that x = x' ^ r + * @param[out] w21: upper part of boolean masked value x', + * such that x = x' ^ r + * + * clobbered registers: w1 to w6, w11, w12, and w18 to w21 + * clobbered flag groups: FG0 + */ +p384_arithmetic_to_boolean: + /* Fetch 385 bits of randomness. + [w2,w1] = gamma <= URND */ + bn.wsrr w1, 2 + bn.wsrr w2, 2 + bn.rshi w2, w31, w2 >> 127 + + /* Double gamma and truncate to 385 bits. 
+ [w4,w3] = T <= 2 * [w2,w1] = 2 * gamma */ + bn.add w3, w1, w1 + bn.addc w4, w2, w2 + bn.rshi w4, w4, w31 >> 129 + bn.rshi w4, w31, w4 >> 127 + + /* [w21,w20] = x' <= [w2,w1] ^ [w19,w18] = gamma ^ r */ + bn.xor w20, w1, w18 + bn.xor w21, w2, w19 + + /* [w6,w5] = omega <= [w2,w1] & [w21,w20] = gamma & x' */ + bn.and w5, w1, w20 + bn.and w6, w2, w21 + + /* [w21,w20] = x' <= [w4,w3] ^ [w12,w11] = T ^ A */ + bn.xor w20, w3, w11 + bn.xor w21, w4, w12 + + /* [w2,w1] = gamma <= [w2,w1] ^ [w21,w20] = gamma ^ x' */ + bn.xor w1, w1, w20 + bn.xor w2, w2, w21 + + /* [w2,w1] = gamma <= [w2,w1] & [w19,w18] = gamma & r */ + bn.and w1, w1, w18 + bn.and w2, w2, w19 + + /* [w6,w5] = omega <= [w6,w5] ^ [w2,w1] = omega ^ gamma */ + bn.xor w5, w5, w1 + bn.xor w6, w6, w2 + + /* [w2,w1] = gamma <= [w4,w3] & [w12,w11] = T & A */ + bn.and w1, w3, w11 + bn.and w2, w4, w12 + + /* [w6,w5] = omega <= [w6,w5] ^ [w2,w1] = omega ^ gamma */ + bn.xor w5, w5, w1 + bn.xor w6, w6, w2 + + /* Loop for k = 1 to K - 1 = 385 - 1 */ + loopi 384, 12 + + /* [w2,w1] = gamma <= [w4,w3] & [w19,w18] = T & r */ + bn.and w1, w3, w18 + bn.and w2, w4, w19 + + /* [w2,w1] = gamma <= [w2,w1] ^ [w6,w5] = gamma ^ omega */ + bn.xor w1, w1, w5 + bn.xor w2, w2, w6 + + /* [w4,w3] = T <= [w4,w3] & [w12,w11] = T & A */ + bn.and w3, w3, w11 + bn.and w4, w4, w12 + + /* [w2,w1] = gamma <= [w2,w1] ^ [w4,w3] = gamma ^ T */ + bn.xor w1, w1, w3 + bn.xor w2, w2, w4 + + /* Double gamma and truncate to 385 bits. 
+ [w4,w3] = T <= 2 * [w2,w1] = 2 * gamma */ + bn.add w3, w1, w1 + bn.addc w4, w2, w2 + bn.rshi w4, w4, w31 >> 129 + bn.rshi w4, w31, w4 >> 127 + + /* [w21,w20] = x' <= [w21,w20] ^ [w4,w3] = x' ^ T */ + bn.xor w20, w20, w3 + bn.xor w21, w21, w4 + + ret diff --git a/sw/otbn/crypto/p384_base.s b/sw/otbn/crypto/p384_base.s index 41e272ced942a..ded113c92c658 100644 --- a/sw/otbn/crypto/p384_base.s +++ b/sw/otbn/crypto/p384_base.s @@ -5,6 +5,7 @@ * This library contains: * - P-384 specific routines for point addition in projective space * - P-384 domain parameters + * - P-384 specific routines for multiplication and reduction of large values */ .section .text @@ -66,11 +67,46 @@ mul384: ret +/** + * Unrolled 576=448x128 bit multiplication. + * + * Returns c = a x b. + * + * This routine runs in constant time. + * + * Flags: Flags have no meaning beyond the scope of this subroutine. + * + * @param[in] [w11, w10]: a, first operand, max. length 448 bit, a < m. + * @param[in] w16: b, second operand, max. length 128 bit, b < m. + * @param[in] w31: all-zero. + * @param[out] [w20:w18]: c, result, max. length 576 bit. + * + * Clobbered registers: w18 to w20 + * Clobbered flag groups: FG0 + */ +mul448x128: + bn.mulqacc.z w10.0, w16.0, 0 + bn.mulqacc w10.0, w16.1, 64 + bn.mulqacc.so w18.L, w10.1, w16.0, 64 + bn.mulqacc w10.1, w16.1, 0 + bn.mulqacc w10.2, w16.0, 0 + bn.mulqacc w10.2, w16.1, 64 + bn.mulqacc.so w18.U, w10.3, w16.0, 64 + bn.mulqacc w10.3, w16.1, 0 + bn.mulqacc w11.0, w16.0, 0 + bn.mulqacc w11.1, w16.0, 64 + bn.mulqacc.so w19.L, w11.0, w16.1, 64 + bn.mulqacc w11.2, w16.0, 0 + bn.mulqacc w11.1, w16.1, 0 + bn.mulqacc.so w19.U, w11.2, w16.1, 64 + bn.mulqacc.wo w20, w31.0, w31.0, 0 + + ret /** - * 384-bit modular multiplication based on Solinas reduction algorithm. + * Solinas reduction algorithm. * - * Returns c = a x b % p. + * Returns c = a mod m = (x + 2^384 * y) mod m.
* * This subroutine is specialized to the coordinate field of P-384 and cannot * be used for other moduli. @@ -90,8 +126,7 @@ mul384: * * Flags: Flags have no meaning beyond the scope of this subroutine. * - * @param[in] [w11, w10]: a, first operand, max. length 384 bit, b < m. - * @param[in] [w17, w16]: b, second operand, max. length 384 bit, b < m. + * @param[in] [w20:w18]: a, input to reduce, max. length 768 bit. * @param[in] [w13, w12]: m, modulus, 2^383 <= m < 2^384. * @param[in] w31: all-zero. * @param[out] [w17, w16]: c, result, max. length 384 bit. @@ -99,12 +134,8 @@ mul384: * Clobbered registers: w16 to w24 * Clobbered flag groups: FG0 */ -.globl p384_mulmod_p -p384_mulmod_p: - /* Compute the raw 768-bit product: - ab = [w20:w18] <= a * b */ - jal x1, mul384 - +.globl p384_reduce_p +p384_reduce_p: /* Solinas reduction step. Based on the observation that: (x + 2^384 * y) mod (2^384 - K) = (x + K * y) mod (2^384 - K). @@ -196,13 +227,12 @@ p384_mulmod_p: bn.sel w16, w18, w16, C bn.sel w17, w19, w17, C - /* return result: c =[w17, w16] = a * b % m. */ ret /** - * 384-bit modular multiplication based on Solinas reduction algorithm. + * Solinas reduction algorithm. * - * Returns c = a x b % m. + * Returns c = a mod m = (x + 2^384 * y) mod m. * * This subroutine is intended for use with the group order (n) of P-384, but * will work for any modulus m such that 2^384 - 2^191 < m < 2^384. @@ -220,8 +250,7 @@ p384_mulmod_p: * * Flags: Flags have no meaning beyond the scope of this subroutine. * - * @param[in] [w11, w10]: a, first operand, max. length 384 bit, b < m. - * @param[in] [w17, w16]: b, second operand, max. length 384 bit, b < m. + * @param[in] [w20:w18]: a, input to reduce, max. length 768 bit. * @param[in] [w13, w12]: m, modulus, 2^383 <= m < 2^384. * @param[in] w14: k, Solinas constant (2^384 - modulus), max. length 191 bit. * @param[in] w31: all-zero. 
@@ -230,15 +259,8 @@ p384_mulmod_p: * Clobbered registers: w16 to w24 * Clobbered flag groups: FG0 */ -.globl p384_mulmod_n -p384_mulmod_n: - /* Compute the raw 768-bit product: - ab = [w20:w18] <= a * b */ - jal x1, mul384 - - /* Solinas reduction step. Based on the observation that: - (x + 2^384 * y) mod (2^384 - K) = (x + K * y) mod (2^384 - K). */ - +.globl p384_reduce_n +p384_reduce_n: /* Extract the high 128 bits from the middle term and the low 128 bits from the high term: w21 <= ab[639:384] */ @@ -336,9 +358,106 @@ p384_mulmod_n: bn.sel w16, w18, w16, C bn.sel w17, w19, w17, C - /* return result: c =[w17, w16] = a * b % m. */ ret +/** + * 384-bit modular multiplication based on Solinas reduction algorithm. + * + * Returns c = a x b % p. + * + * This subroutine is specialized to the coordinate field of P-384 and cannot + * be used for other moduli. + * + * For more information on the reduction algorithm, see 'p384_reduce_p'. + * + * This routine runs in constant time. + * + * Flags: Flags have no meaning beyond the scope of this subroutine. + * + * @param[in] [w11, w10]: a, first operand, max. length 384 bit, a < m. + * @param[in] [w17, w16]: b, second operand, max. length 384 bit, b < m. + * @param[in] [w13, w12]: m, modulus, 2^383 <= m < 2^384. + * @param[in] w31: all-zero. + * @param[out] [w17, w16]: c, result, max. length 384 bit. + * + * Clobbered registers: w16 to w24 + * Clobbered flag groups: FG0 + */ +.globl p384_mulmod_p +p384_mulmod_p: + /* Compute the raw 768-bit product: + ab = [w20:w18] <= a * b */ + jal x1, mul384 + + /* return [w17, w16] = ab mod m = [w20:w18] mod m */ + jal x0, p384_reduce_p + +/** + * 384-bit modular multiplication based on Solinas reduction algorithm. + * + * Returns c = a * b mod m. + * + * This subroutine is intended for use with the group order (n) of P-384, but + * will work for any modulus m such that 2^384 - 2^191 < m < 2^384. + * + * For more information on the reduction algorithm, see 'p384_reduce_n'.
+ * + * This routine runs in constant time. + * + * Flags: Flags have no meaning beyond the scope of this subroutine. + * + * @param[in] [w11, w10]: a, first operand, max. length 384 bit, a < m. + * @param[in] [w17, w16]: b, second operand, max. length 384 bit, b < m. + * @param[in] [w13, w12]: m, modulus, 2^383 <= m < 2^384. + * @param[in] w14: k, Solinas constant (2^384 - modulus), max. length 191 bit. + * @param[in] w31: all-zero. + * @param[out] [w17, w16]: c, result, max. length 384 bit. + * + * Clobbered registers: w16 to w24 + * Clobbered flag groups: FG0 + */ +.globl p384_mulmod_n +p384_mulmod_n: + /* Compute the raw 768-bit product: + ab = [w20:w18] <= a * b */ + jal x1, mul384 + + /* return [w17, w16] = ab mod m = [w20:w18] mod m */ + jal x0, p384_reduce_n + +/** + * 448x128=576-bit modular multiplication based on Solinas reduction algorithm. + * + * Returns c = a * b mod m. + * + * This subroutine is intended for use with the group order (n) of P-384, but + * will work for any modulus m such that 2^384 - 2^191 < m < 2^384. + * + * For more information on the reduction algorithm, see 'p384_reduce_n'. + * + * This routine runs in constant time. + * + * Flags: Flags have no meaning beyond the scope of this subroutine. + * + * @param[in] [w11, w10]: a, first operand, max. length 448 bit. + * @param[in] w16: b, second operand, max. length 128 bit, b < m. + * @param[in] [w13, w12]: m, modulus, 2^383 <= m < 2^384. + * @param[in] w14: k, Solinas constant (2^384 - modulus), max. length 191 bit. + * @param[in] w31: all-zero. + * @param[out] [w17, w16]: c, result, max. length 384 bit.
+ * + * Clobbered registers: w16 to w24 + * Clobbered flag groups: FG0 + */ +.globl p384_mulmod448x128_n +p384_mulmod448x128_n: + /* Compute the raw 768-bit product: + ab = [w20:w18] <= a * b */ + jal x1, mul448x128 + + /* return [w17, w16] = ab mod m = [w20:w18] mod m */ + jal x0, p384_reduce_n + /** * P-384 point addition in projective space * @@ -422,10 +541,8 @@ proj_add_p384: bn.addc w17, w11, w17 bn.sub w10, w16, w12 bn.subb w11, w17, w13 - bn.sel w16, w16, w10, C - bn.sel w17, w17, w11, C - bn.mov w6, w16 - bn.mov w7, w17 + bn.sel w6, w16, w10, C + bn.sel w7, w17, w11, C /* 5: [w9, w8] = t4 <= X2+Y2 = dmem[x27+0]+dmem[x27+64] */ bn.lid x22, 0(x27) @@ -436,10 +553,8 @@ proj_add_p384: bn.addc w17, w11, w17 bn.sub w10, w16, w12 bn.subb w11, w17, w13 - bn.sel w16, w16, w10, C - bn.sel w17, w17, w11, C - bn.mov w8, w16 - bn.mov w9, w17 + bn.sel w8, w16, w10, C + bn.sel w9, w17, w11, C /* 6: [w7, w6] = t3 <= t3*t4 = [w7, w6]*[w9, w8] */ bn.mov w10, w6 @@ -455,20 +570,16 @@ proj_add_p384: bn.addc w17, w1, w3 bn.sub w10, w16, w12 bn.subb w11, w17, w13 - bn.sel w16, w16, w10, C - bn.sel w17, w17, w11, C - bn.mov w8, w16 - bn.mov w9, w17 + bn.sel w8, w16, w10, C + bn.sel w9, w17, w11, C /* 8: [w7, w6] = t3 <= t3-t4 = [w7, w6]-[w9, w8] */ bn.sub w16, w6, w8 bn.subb w17, w7, w9 bn.add w10, w16, w12 bn.addc w11, w17, w13 - bn.sel w16, w10, w16, C - bn.sel w17, w11, w17, C - bn.mov w6, w16 - bn.mov w7, w17 + bn.sel w6, w10, w16, C + bn.sel w7, w11, w17, C /* 9: [w9, w8] = t4 <= Y1+Z1 = dmem[x26+64]+dmem[x26+128] */ bn.lid x22, 64(x26) @@ -479,10 +590,8 @@ proj_add_p384: bn.addc w17, w11, w17 bn.sub w10, w16, w12 bn.subb w11, w17, w13 - bn.sel w16, w16, w10, C - bn.sel w17, w17, w11, C - bn.mov w8, w16 - bn.mov w9, w17 + bn.sel w8, w16, w10, C + bn.sel w9, w17, w11, C /* 10: [w26, w25] = X3 <= Y2+Z2 = dmem[x27+64]+dmem[x27+128] */ bn.lid x22, 64(x27) @@ -493,10 +602,8 @@ proj_add_p384: bn.addc w17, w11, w17 bn.sub w10, w16, w12 bn.subb w11, w17, w13 - bn.sel w16, w16, w10, 
C - bn.sel w17, w17, w11, C - bn.mov w25, w16 - bn.mov w26, w17 + bn.sel w25, w16, w10, C + bn.sel w26, w17, w11, C /* 11: [w9, w8] = t4 <= t4*X3 = [w9, w8]*[w26, w25] */ bn.mov w10, w8 @@ -512,20 +619,16 @@ proj_add_p384: bn.addc w17, w3, w5 bn.sub w10, w16, w12 bn.subb w11, w17, w13 - bn.sel w16, w16, w10, C - bn.sel w17, w17, w11, C - bn.mov w25, w16 - bn.mov w26, w17 + bn.sel w25, w16, w10, C + bn.sel w26, w17, w11, C /* 13: [w9, w8] = t4 <= t4-X3 = [w9, w8]-[w26, w25] */ bn.sub w16, w8, w25 bn.subb w17, w9, w26 bn.add w10, w16, w12 bn.addc w11, w17, w13 - bn.sel w16, w10, w16, C - bn.sel w17, w11, w17, C - bn.mov w8, w16 - bn.mov w9, w17 + bn.sel w8, w10, w16, C + bn.sel w9, w11, w17, C /* 14: [w26, w25] = X3 <= X1+Z1 = dmem[x26+0]+dmem[x26+128] */ bn.lid x22, 0(x26) @@ -536,10 +639,8 @@ proj_add_p384: bn.addc w17, w11, w17 bn.sub w10, w16, w12 bn.subb w11, w17, w13 - bn.sel w16, w16, w10, C - bn.sel w17, w17, w11, C - bn.mov w25, w16 - bn.mov w26, w17 + bn.sel w25, w16, w10, C + bn.sel w26, w17, w11, C /* 15: [w28, w27] = Y3 <= X2+Z2 = dmem[x27+0]+dmem[x27+128] */ bn.lid x22, 0(x27) @@ -550,10 +651,8 @@ proj_add_p384: bn.addc w17, w11, w17 bn.sub w10, w16, w12 bn.subb w11, w17, w13 - bn.sel w16, w16, w10, C - bn.sel w17, w17, w11, C - bn.mov w27, w16 - bn.mov w28, w17 + bn.sel w27, w16, w10, C + bn.sel w28, w17, w11, C /* 16: [w26, w25] = X3 <= X3*Y3 = [w26, w25]*[w28, w27] */ bn.mov w10, w25 @@ -569,20 +668,16 @@ proj_add_p384: bn.addc w17, w1, w5 bn.sub w10, w16, w12 bn.subb w11, w17, w13 - bn.sel w16, w16, w10, C - bn.sel w17, w17, w11, C - bn.mov w27, w16 - bn.mov w28, w17 + bn.sel w27, w16, w10, C + bn.sel w28, w17, w11, C /* 18: [w28, w27] = Y3 <= X3-Y3 = [w26, w25]-[w28, w27] */ bn.sub w16, w25, w27 bn.subb w17, w26, w28 bn.add w10, w16, w12 bn.addc w11, w17, w13 - bn.sel w16, w10, w16, C - bn.sel w17, w11, w17, C - bn.mov w27, w16 - bn.mov w28, w17 + bn.sel w27, w10, w16, C + bn.sel w28, w11, w17, C /* 19: [w30, w29] = Z3 <= b*t2 = dmem[x28+0]*[w5, 
w4] */ bn.lid x22, 0(x28) @@ -598,50 +693,40 @@ proj_add_p384: bn.subb w17, w28, w30 bn.add w10, w16, w12 bn.addc w11, w17, w13 - bn.sel w16, w10, w16, C - bn.sel w17, w11, w17, C - bn.mov w25, w16 - bn.mov w26, w17 + bn.sel w25, w10, w16, C + bn.sel w26, w11, w17, C /* 21: [w30, w29] = Z3 <= X3+X3 = [w26, w25]+[w26, w25] */ bn.add w16, w25, w25 bn.addc w17, w26, w26 bn.sub w10, w16, w12 bn.subb w11, w17, w13 - bn.sel w16, w16, w10, C - bn.sel w17, w17, w11, C - bn.mov w29, w16 - bn.mov w30, w17 + bn.sel w29, w16, w10, C + bn.sel w30, w17, w11, C /* 22: [w26, w25] = X3 <= X3+Z3 = [w26, w25]+[w30, w29] */ bn.add w16, w25, w29 bn.addc w17, w26, w30 bn.sub w10, w16, w12 bn.subb w11, w17, w13 - bn.sel w16, w16, w10, C - bn.sel w17, w17, w11, C - bn.mov w25, w16 - bn.mov w26, w17 + bn.sel w25, w16, w10, C + bn.sel w26, w17, w11, C /* 23: [w30, w29] = Z3 <= t1-X3 = [w3, w2]-[w26, w25] */ bn.sub w16, w2, w25 bn.subb w17, w3, w26 bn.add w10, w16, w12 bn.addc w11, w17, w13 - bn.sel w16, w10, w16, C - bn.sel w17, w11, w17, C - bn.mov w29, w16 - bn.mov w30, w17 + bn.sel w29, w10, w16, C + bn.sel w30, w11, w17, C /* 24: [w26, w25] = X3 <= t1+X3 = [w3, w2]+[w26, w25] */ bn.add w16, w2, w25 bn.addc w17, w3, w26 bn.sub w10, w16, w12 bn.subb w11, w17, w13 - bn.sel w16, w16, w10, C - bn.sel w17, w17, w11, C - bn.mov w25, w16 - bn.mov w26, w17 + bn.sel w25, w16, w10, C + bn.sel w26, w17, w11, C /* 25: [w28, w27] = Y3 <= b*Y3 = dmem[x28+0]*[w28, w27] */ bn.lid x22, 0(x28) @@ -657,90 +742,72 @@ proj_add_p384: bn.addc w17, w5, w5 bn.sub w10, w16, w12 bn.subb w11, w17, w13 - bn.sel w16, w16, w10, C - bn.sel w17, w17, w11, C - bn.mov w2, w16 - bn.mov w3, w17 + bn.sel w2, w16, w10, C + bn.sel w3, w17, w11, C /* 27: [w5, w4] = t2 <= t1+t2 = [w3, w2]+[w5, w4] */ bn.add w16, w2, w4 bn.addc w17, w3, w5 bn.sub w10, w16, w12 bn.subb w11, w17, w13 - bn.sel w16, w16, w10, C - bn.sel w17, w17, w11, C - bn.mov w4, w16 - bn.mov w5, w17 + bn.sel w4, w16, w10, C + bn.sel w5, w17, w11, C /* 28: [w28, 
w27] = Y3 <= Y3-t2 = [w28, w27]-[w5, w4] */ bn.sub w16, w27, w4 bn.subb w17, w28, w5 bn.add w10, w16, w12 bn.addc w11, w17, w13 - bn.sel w16, w10, w16, C - bn.sel w17, w11, w17, C - bn.mov w27, w16 - bn.mov w28, w17 + bn.sel w27, w10, w16, C + bn.sel w28, w11, w17, C /* 29: [w28, w27] = Y3 <= Y3-t0 = [w28, w27]-[w1, w0] */ bn.sub w16, w27, w0 bn.subb w17, w28, w1 bn.add w10, w16, w12 bn.addc w11, w17, w13 - bn.sel w16, w10, w16, C - bn.sel w17, w11, w17, C - bn.mov w27, w16 - bn.mov w28, w17 + bn.sel w27, w10, w16, C + bn.sel w28, w11, w17, C /* 30: [w3, w2] = t1 <= Y3+Y3 = [w28, w27]+[w28, w27] */ bn.add w16, w27, w27 bn.addc w17, w28, w28 bn.sub w10, w16, w12 bn.subb w11, w17, w13 - bn.sel w16, w16, w10, C - bn.sel w17, w17, w11, C - bn.mov w2, w16 - bn.mov w3, w17 + bn.sel w2, w16, w10, C + bn.sel w3, w17, w11, C /* 31: [w28, w27] = Y3 <= t1+Y3 = [w3, w2]+[w28, w27] */ bn.add w16, w2, w27 bn.addc w17, w3, w28 bn.sub w10, w16, w12 bn.subb w11, w17, w13 - bn.sel w16, w16, w10, C - bn.sel w17, w17, w11, C - bn.mov w27, w16 - bn.mov w28, w17 + bn.sel w27, w16, w10, C + bn.sel w28, w17, w11, C /* 32: [w3, w2] = t1 <= t0+t0 = [w1, w0]+[w1, w0] */ bn.add w16, w0, w0 bn.addc w17, w1, w1 bn.sub w10, w16, w12 bn.subb w11, w17, w13 - bn.sel w16, w16, w10, C - bn.sel w17, w17, w11, C - bn.mov w2, w16 - bn.mov w3, w17 + bn.sel w2, w16, w10, C + bn.sel w3, w17, w11, C /* 33: [w1, w0] = t0 <= t1+t0 = [w3, w2]+[w1, w0] */ bn.add w16, w2, w0 bn.addc w17, w3, w1 bn.sub w10, w16, w12 bn.subb w11, w17, w13 - bn.sel w16, w16, w10, C - bn.sel w17, w17, w11, C - bn.mov w0, w16 - bn.mov w1, w17 + bn.sel w0, w16, w10, C + bn.sel w1, w17, w11, C /* 34: [w1, w0] = t0 <= t0-t2 = [w1, w0]-[w5, w4] */ bn.sub w16, w0, w4 bn.subb w17, w1, w5 bn.add w10, w16, w12 bn.addc w11, w17, w13 - bn.sel w16, w10, w16, C - bn.sel w17, w11, w17, C - bn.mov w0, w16 - bn.mov w1, w17 + bn.sel w0, w10, w16, C + bn.sel w1, w11, w17, C /* 35: [w3, w2] = t1 <= t4*Y3 = [w9, w8]*[w28, w27] */ bn.mov w10, w8 @@ 
-774,10 +841,8 @@ proj_add_p384: bn.addc w17, w28, w5 bn.sub w10, w16, w12 bn.subb w11, w17, w13 - bn.sel w16, w16, w10, C - bn.sel w17, w17, w11, C - bn.mov w27, w16 - bn.mov w28, w17 + bn.sel w27, w16, w10, C + bn.sel w28, w17, w11, C /* 39: [w26, w25] = X3 <= t3*X3 = [w7, w6]*[w26, w25] */ bn.mov w10, w6 @@ -793,10 +858,8 @@ proj_add_p384: bn.subb w17, w26, w3 bn.add w10, w16, w12 bn.addc w11, w17, w13 - bn.sel w16, w10, w16, C - bn.sel w17, w11, w17, C - bn.mov w25, w16 - bn.mov w26, w17 + bn.sel w25, w10, w16, C + bn.sel w26, w11, w17, C /* 41: [w30, w29] = Z3 <= t4*Z3 = [w9, w8]*[w30, w29] */ bn.mov w10, w8 @@ -821,16 +884,272 @@ proj_add_p384: bn.addc w17, w30, w3 bn.sub w10, w16, w12 bn.subb w11, w17, w13 - bn.sel w16, w16, w10, C - bn.sel w17, w17, w11, C - bn.mov w29, w16 - bn.mov w30, w17 + bn.sel w29, w16, w10, C + bn.sel w30, w17, w11, C ret +/** + * Convert projective coordinates of a P-384 curve point to affine coordinates + * + * returns P = (x_a, y_a) = (x/z mod p, y/z mod p) + * where P is a valid P-384 curve point, + * x_a and y_a are the resulting affine coordinates of the + * curve point, + * x,y and z are a set of projective coordinates of the + * point and + * p is the modulus of the P-384 underlying finite field. + * + * This routine computes the affine coordinates for a set of projective + * coordinates of a valid P-384 curve point. The routine performs the required + * divisions by computing the multiplicative modular inverse of the + * projective z-coordinate in the underlying finite field of the P-384 curve. + * For inverse computation Fermat's little theorem is used, i.e. + * we compute z^-1 = z^(p-2) mod p. + * For exponentiation a 16 step addition chain is used. + * Source of the addition chain is the addchain project: + * https://github.com/mmcloughlin/addchain/ + * + * Flags: Flags have no meaning beyond the scope of this subroutine. + * + * @param[in] [w26,w25]: x, x-coordinate of curve point (projective). 
+ * @param[in] [w28,w27]: y, y-coordinate of curve point (projective). + * @param[in] [w30,w29]: z, z-coordinate of curve point (projective). + * @param[in] [w13, w12]: p, modulus of P-384. + * @param[in] w31: all-zero. + * @param[out] [w1, w0]: z^-1. inverse of z-coordinate of curve point. + * @param[out] [w26, w25]: x_a, affine x-coordinate of resulting point. + * @param[out] [w28, w27]: y_a, affine y-coordinate of resulting point. + * + * clobbered registers: w0 to w28 + * clobbered flag groups: FG0 + */ + .globl proj_to_affine_p384 +proj_to_affine_p384: + + /* Exp: 0b10 = 2*0b1 + Val: r10 = z^2 mod p + [w17,w16] <= [w30,w29]^2 mod [w13,w12] */ + bn.mov w10, w29 + bn.mov w11, w30 + bn.mov w16, w29 + bn.mov w17, w30 + jal x1, p384_mulmod_p + + /* Exp: 0b11 = 0b1+0b10 + Val: r11 <= z*r10 mod p + [w17,w16] <= [w30,w29]*[w17,w16] mod [w13,w12] */ + bn.mov w10, w29 + bn.mov w11, w30 + jal x1, p384_mulmod_p + + /* Exp: 0b110 = 2*0b11 + Val: r110 = r11^2 mod p + [w17,w16] <= [w17,w16]^2 mod [w13,w12] */ + bn.mov w10, w16 + bn.mov w11, w17 + jal x1, p384_mulmod_p + + /* Exp: 0b111 = 0b1+0b110 + Val: r111 <= z*r110 mod p + [w1,w0] = [w17,w16] <= [w30,w29]*[w17,w16] mod [w13,w12] */ + bn.mov w10, w29 + bn.mov w11, w30 + jal x1, p384_mulmod_p + bn.mov w0, w16 + bn.mov w1, w17 + + /* Exp: 0b111000 = 0b111<<3 + Val: r111000 <= r111^(2^3) mod p + [w17,w16] <= [w17,w16]^(2^3) mod [w13,w12] */ + loopi 3, 4 + bn.mov w10, w16 + bn.mov w11, w17 + jal x1, p384_mulmod_p + nop + + /* Exp: 0b111111 = 0b111+0b111000 + Val: r111111 <= r111*r111000 mod p + [w3,w2] = [w17,w16] <= [w1,w0]*[w17,w16] mod [w13,w12] */ + bn.mov w10, w0 + bn.mov w11, w1 + jal x1, p384_mulmod_p + bn.mov w2, w16 + bn.mov w3, w17 + + /* Exp: 2^12-1 = (0b111111<<6)+0b111111 + Val: r_12_1 <= r111111^(2^6)*r111111 mod p + [w5,w4] = [w17,w16] <= [w17,w16]^(2^6)*[w3,w2] mod [w13,w12] */ + loopi 6, 4 + bn.mov w10, w16 + bn.mov w11, w17 + jal x1, p384_mulmod_p + nop + bn.mov w10, w2 + bn.mov w11, w3 + jal x1,
p384_mulmod_p + bn.mov w4, w16 + bn.mov w5, w17 + + /* Exp: 2^24-1 = ((2^12-1)<<12)+(2^12-1) + Val: r_24_1 <= r_12_1^(2^12)*r12_1 mod p + [w17,w16] <= [w17,w16]^(2^12)*[w5,w4] mod [w13,w12] */ + loopi 12, 4 + bn.mov w10, w16 + bn.mov w11, w17 + jal x1, p384_mulmod_p + nop + bn.mov w10, w4 + bn.mov w11, w5 + jal x1, p384_mulmod_p + + /* Exp: 2^30-1 = ((2^24-1)<<6)+0b111111 + Val: r_30_1 <= r_24_1^(2^6)*r111111 mod p + [w3, w2] = [w17,w16] <= [w17,w16]^(2^6)*[w3,w2] mod [w13,w12] */ + loopi 6, 4 + bn.mov w10, w16 + bn.mov w11, w17 + jal x1, p384_mulmod_p + nop + bn.mov w10, w2 + bn.mov w11, w3 + jal x1, p384_mulmod_p + bn.mov w2, w16 + bn.mov w3, w17 + + /* Exp: 2^31-1 <= (2^30-1)*2+0b1 + Val: r_31_1 <= r30_1^2*z mod p + [w7,w6] = [w17,w16] <= [w17,w16]^2*[w30,w29] mod [w13,w12] */ + bn.mov w10, w16 + bn.mov w11, w17 + jal x1, p384_mulmod_p + bn.mov w10, w29 + bn.mov w11, w30 + jal x1, p384_mulmod_p + bn.mov w6, w16 + bn.mov w7, w17 + + /* Exp: 2^32-1 <= (2^31-1)*2+0b1 + Val: r_32_1 <= r31_1^2*z mod p + [w8,w9] = [w17,w16] <= [w17,w16]^2*[w30,w29] mod [w13,w12] */ + bn.mov w10, w16 + bn.mov w11, w17 + jal x1, p384_mulmod_p + bn.mov w10, w29 + bn.mov w11, w30 + jal x1, p384_mulmod_p + bn.mov w9, w16 + bn.mov w8, w17 + + /* Exp: 2^63-1 <= ((2^32-1)<<31)+(2^31-1) + Val: r_63_1 <= r_32_1^(2^31)*r_31_1 mod p + [w7,w6] = [w17,w16] <= [w17,w16]^(2^31)*[w7,w6] mod [w13,w12] */ + loopi 31, 4 + bn.mov w10, w16 + bn.mov w11, w17 + jal x1, p384_mulmod_p + nop + bn.mov w10, w6 + bn.mov w11, w7 + jal x1, p384_mulmod_p + bn.mov w6, w16 + bn.mov w7,w17 + + /* Exp: 2^126-1 = ((2^63-1)<<63) + (2^63-1) + Val: r_126_1 <= r_63_1^(2^63)*r_63_1 mod p + [w7,w6] = [w17,w16] <= [w17,w16]^(2^63)*[w7,w6] mod [w13,w12] */ + loopi 63, 4 + bn.mov w10, w16 + bn.mov w11, w17 + jal x1, p384_mulmod_p + nop + bn.mov w10, w6 + bn.mov w11, w7 + jal x1, p384_mulmod_p + bn.mov w6, w16 + bn.mov w7, w17 + + /* Exp: 2^252-1 = ((2^126-1)<<126)+(2^126-1) + Val: r_252_1 <= r_126_1^(2^126)*r_126_1 mod p +
[w17,w16] <= [w17,w16]^(2^126)*[w7,w6] mod [w13,w12] */ + loopi 126, 4 + bn.mov w10, w16 + bn.mov w11, w17 + jal x1, p384_mulmod_p + nop + bn.mov w10, w6 + bn.mov w11, w7 + jal x1, p384_mulmod_p + + /* Exp: 2^255-1 = ((2^252-1)<<3)+0b111 + Val: r_255_1 <= r_252_1^(2^3)*r111 mod p + [w17,w16] <= [w17,w16]^(2^3)*[w1,w0] mod [w13,w12] */ + loopi 3, 4 + bn.mov w10, w16 + bn.mov w11, w17 + jal x1, p384_mulmod_p + nop + bn.mov w10, w0 + bn.mov w11, w1 + jal x1, p384_mulmod_p + + /* Exp: p-2 = ((((((2^255-1)<<33)+(2^32-1))<<94)+(2^30-1))<<2)+0b1 + Val: z_inv <=((r_255_1^(2^33)*r_32_1)^(2^94)*r_30_1)^(2^2)*z mod p + [w17,w16] <= (([w17,w16]^(2^33)*[w8,w9])^(2^94)*[w3,w2])^(2^2) + *[w30,w29] mod [w13,w12] */ + loopi 33, 4 + bn.mov w10, w16 + bn.mov w11, w17 + jal x1, p384_mulmod_p + nop + bn.mov w10, w9 + bn.mov w11, w8 + jal x1, p384_mulmod_p + loopi 94, 4 + bn.mov w10, w16 + bn.mov w11, w17 + jal x1, p384_mulmod_p + nop + bn.mov w10, w2 + bn.mov w11, w3 + jal x1, p384_mulmod_p + loopi 2, 4 + bn.mov w10, w16 + bn.mov w11, w17 + jal x1, p384_mulmod_p + nop + bn.mov w10, w29 + bn.mov w11, w30 + jal x1, p384_mulmod_p + + /* store inverse [w1,w0] <= [w17,w16] = z_inv*/ + bn.mov w0, w16 + bn.mov w1, w17 + + /* convert x-coordinate to affine space + [w26,w25] <= [w17,w16] = x_a <= x/z = x*z_inv = [w26,w25]*[w1,w0] mod p */ + bn.mov w10, w25 + bn.mov w11, w26 + jal x1, p384_mulmod_p + bn.mov w25, w16 + bn.mov w26, w17 + + /* convert y-coordinate to affine space + [w28,w27] <= [w17,w16] = y_a <= y/z = y*z_inv = [w28,w27]*[w1,w0] mod p */ + bn.mov w10, w27 + bn.mov w11, w28 + bn.mov w16, w0 + bn.mov w17, w1 + jal x1, p384_mulmod_p + bn.mov w27, w16 + bn.mov w28, w17 + + ret .section .data +.balign 32 + /* P-384 domain parameter b */ .globl p384_b p384_b: diff --git a/sw/otbn/crypto/p384_base_mult.s b/sw/otbn/crypto/p384_base_mult.s new file mode 100644 index 0000000000000..55f96294199ca --- /dev/null +++ b/sw/otbn/crypto/p384_base_mult.s @@ -0,0 +1,149 @@ +/* Copyright lowRISC
contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ +/* + * P-384 specific routines for constant-time base point multiplication. + */ + + .section .text + +/** + * Externally callable routine for P-384 base point multiplication + * + * returns Q = d (*) G + * where Q is a resulting valid P-384 curve point in affine + * coordinates, + * G is the base point of curve P-384, and + * d is a 384-bit scalar. + * + * Sets up context and calls the internal scalar multiplication routine. + * This routine runs in constant time. + * + * @param[in] dmem[0]: dptr_d0, pointer to location in dmem containing + * 1st private key share d0 + * @param[in] dmem[4]: dptr_d1, pointer to location in dmem containing + * 2nd private key share d1 + * @param[in] dmem[20]: dptr_x, pointer to result buffer for x-coordinate + * @param[in] dmem[24]: dptr_y, pointer to result buffer for y-coordinate + * + * 384-bit quantities have to be provided in dmem in little-endian format, + * 512 bit aligned, with the highest 128 bit set to zero. + * + * Flags: When leaving this subroutine, the M, L and Z flags of FG0 correspond + * to the computed affine y-coordinate. 
+ * + * clobbered registers: x2, x3, x9 to x13, x18 to x21, x26 to x30 + * w0 to w30 + * clobbered flag groups: FG0 + */ +.globl p384_base_mult +p384_base_mult: + + /* set dmem pointer to x-coordinate of base point*/ + la x20, p384_gx + + /* set dmem pointer to y-coordinate of base point */ + la x21, p384_gy + + /* set dmem pointer to 1st scalar share d0 */ + la x17, dptr_d0 + lw x17, 0(x17) + + /* set dmem pointer to 2nd scalar share d1 */ + la x19, dptr_d1 + lw x19, 0(x19) + + /* set dmem pointer to domain parameter b */ + la x28, p384_b + + /* set dmem pointer to scratchpad */ + la x30, scratchpad + + /* load domain parameter n (order of base point) + [w11, w10] = n = dmem[p384_n] */ + li x2, 10 + la x3, p384_n + bn.lid x2++, 0(x3) + bn.lid x2++, 32(x3) + + /* load domain parameter p (modulus) + [w13, w12] = p = dmem[p384_p] */ + la x3, p384_p + bn.lid x2++, 0(x3) + bn.lid x2++, 32(x3) + + /* init all-zero reg */ + bn.xor w31, w31, w31 + + /* scalar multiplication in projective space + [w30:w25] <= (x, y, z) = d * G */ + jal x1, scalar_mult_int_p384 + + /* conversion into affine space + [w28:w25] <= (x, y) */ + jal x1, proj_to_affine_p384 + + /* set dmem pointer to point x-coordinate */ + la x20, dptr_x + lw x20, 0(x20) + + /* set dmem pointer to point y-coordinate */ + la x21, dptr_y + lw x21, 0(x21) + + /* store result in dmem */ + li x2, 25 + bn.sid x2++, 0(x20) + bn.sid x2++, 32(x20) + bn.sid x2++, 0(x21) + bn.sid x2++, 32(x21) + + ret + +/* pointers and scratchpad memory */ +.section .data + +.balign 32 + + /* pointer to k0 (dptr_k0) */ +.globl dptr_k0 +.weak dptr_k0 +dptr_k0: + .zero 4 + +/* pointer to k1 (dptr_k1) */ +.globl dptr_k1 +.weak dptr_k1 +dptr_k1: + .zero 4 + +/* pointer to d0 (dptr_d0) */ +.globl dptr_d0 +.weak dptr_d0 +dptr_d0: + .zero 4 + +/* pointer to d1 (dptr_d1) */ +.globl dptr_d1 +.weak dptr_d1 +dptr_d1: + .zero 4 + +/* pointer to X (dptr_x) */ +.globl dptr_x +.weak dptr_x +dptr_x: + .zero 4 + +/* pointer to Y (dptr_y) */ +.globl dptr_y 
+.weak dptr_y +dptr_y: + .zero 4 + +/* 704 bytes of scratchpad memory */ +.balign 32 +.globl scratchpad +.weak scratchpad +scratchpad: + .zero 704 diff --git a/sw/otbn/crypto/p384_curve_point_valid.s b/sw/otbn/crypto/p384_curve_point_valid.s new file mode 100644 index 0000000000000..b0d57e4b84134 --- /dev/null +++ b/sw/otbn/crypto/p384_curve_point_valid.s @@ -0,0 +1,58 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/** + * Curve point validation for curve P-384. + * + * Checks if a given curve point (e.g. public key for ECDH shared key + * generation) is a valid curve point on the P-384 curve. + * + * The check is successful when the binary execution completes without + * error. In case of an invalid point, a software error is raised and execution + * is halted. + */ + +.section .text.start +start: + /* Init all-zero register. */ + bn.xor w31, w31, w31 + + jal x1, validate_point + + /* Should not be reached (validate_point ends with ecall); fail. */ + unimp + unimp + unimp + +validate_point: + /* Call curve point validation function */ + jal x1, p384_curve_point_valid + + ecall + +.data + +/* pointer to x-coordinate (dptr_x) */ +.globl dptr_x +.balign 4 +dptr_x: + .zero 4 + +/* pointer to y-coordinate (dptr_y) */ +.globl dptr_y +.balign 4 +dptr_y: + .zero 4 + +/* Public key x-coordinate. */ +.globl x +.balign 32 +x: + .zero 64 + +/* Public key y-coordinate. */ +.globl y +.balign 32 +y: + .zero 64 diff --git a/sw/otbn/crypto/p384_ecdh.s b/sw/otbn/crypto/p384_ecdh.s new file mode 100644 index 0000000000000..0fb8fd42271b5 --- /dev/null +++ b/sw/otbn/crypto/p384_ecdh.s @@ -0,0 +1,217 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/** + * Elliptic-curve Diffie-Hellman (ECDH) on curve P-384. + * + * This binary has the following modes of operation: + * 1.
MODE_KEYPAIR_RANDOM: generate a random keypair + * 2. MODE_SHARED_KEY: compute shared key - !!! Attention !!! - before + * shared key computation p384_curve_point_valid + * binary has to be executed to check if the provided + * public key is valid. + */ + + /** + * Mode magic values generated with + * $ ./util/design/sparse-fsm-encode.py -d 6 -m 4 -n 11 \ + * --avoid-zero -s 3660400884 + * + * Call the same utility with the same arguments and a higher -m to generate + * additional value(s) without changing the others or sacrificing mutual HD. + * + * TODO(#17727): in some places the OTBN assembler support for .equ directives + * is lacking, so they cannot be used in bignum instructions or pseudo-ops such + * as `li`. If support is added, we could use 32-bit values here instead of + * 11-bit. + */ +.equ MODE_SHARED_KEY, 0x5ec +.equ MODE_KEYPAIR_RANDOM, 0x3f1 + +.section .text.start +start: + /* Init all-zero register. */ + bn.xor w31, w31, w31 + + /* Read the mode and tail-call the requested operation. */ + la x2, mode + lw x2, 0(x2) + + addi x3, x0, MODE_KEYPAIR_RANDOM + beq x2, x3, keypair_random + + addi x3, x0, MODE_SHARED_KEY + beq x2, x3, shared_key + + /* Unsupported mode; fail. */ + unimp + unimp + unimp + +/** + * Generate a fresh random keypair. + * + * Returns secret key d in 448-bit shares d0, d1. + * + * Returns public key Q = d*G in affine coordinates (x, y). + * + * This routine runs in constant time (except potentially waiting for entropy + * from RND).
+ * + * @param[in] w31: all-zero + * @param[in] dmem[0]: dptr_d0, pointer to location in dmem containing + * 1st private key share d0 + * @param[in] dmem[4]: dptr_d1, pointer to location in dmem containing + * 2nd private key share d1 + * @param[in] dmem[20]: dptr_x, pointer to result buffer for x-coordinate + * @param[in] dmem[24]: dptr_y, pointer to result buffer for y-coordinate + * @param[out] dmem[d0]: 1st private key share d0 + * @param[out] dmem[d1]: 2nd private key share d1 + * @param[out] dmem[x]: Public key x-coordinate + * @param[out] dmem[y]: Public key y-coordinate + * + * clobbered registers: x2, x3, x9 to x13, x18 to x21, x26 to x30, w0 to w30 + * clobbered flag groups: FG0 + */ +keypair_random: + /* Generate secret key d in shares. + dmem[d0] <= d0 + dmem[d1] <= d1 */ + jal x1, p384_generate_random_key + + /* Generate public key d*G. + dmem[x] <= (d*G).x + dmem[y] <= (d*G).y */ + jal x1, p384_base_mult + + ecall + +/** + * Generate a shared key from a secret and public key. + * + * Returns the shared key, which is the affine x-coordinate of (d*Q). The + * shared key is expressed in boolean shares x0, x1 such that the key is (x0 ^ + * x1). + * + * This routine runs in constant time. + * + * !!! Attention !!! - before shared key computation p384_curve_point_valid + * binary has to be executed to check if the provided public key is valid. + * + * @param[in] w31: all-zero + * @param[in] dmem[0]: dptr_k0, pointer to location in dmem containing + * 1st private key share d0/k0 + * @param[in] dmem[4]: dptr_k1, pointer to location in dmem containing + * 2nd private key share d1/k1 + * @param[in] dmem[20]: dptr_x, pointer to result buffer for x-coordinate + * @param[in] dmem[24]: dptr_y, pointer to result buffer for y-coordinate + * @param[out] dmem[x]: x0, first share of shared key. + * @param[out] dmem[y]: x1, second share of shared key.
+ * + * clobbered registers: x2 to x4, x9 to x13, x18 to x21, x26 to x30, w0 to w30 + * clobbered flag groups: FG0 + */ +shared_key: + /* Generate arithmetically masked shared key d*Q. + dmem[x] <= (d*Q).x - m_x mod p + dmem[y] <= m_x */ + jal x1, p384_scalar_mult + + /* Arithmetic-to-boolean conversion*/ + /* load result to WDRs for a2b conversion. + [w12,w11] <= dmem[x] = x_m + [w19,w18] <= dmem[y] = m */ + li x2, 11 + la x3, x + bn.lid x2++, 0(x3) + bn.lid x2++, 32(x3) + li x2, 18 + la x3, y + bn.lid x2++, 0(x3) + bn.lid x2, 32(x3) + + /* Load domain parameter. + [w14,w13] = dmem[p384_p] */ + li x2, 13 + la x4, p384_p + bn.lid x2++, 0(x4) + bn.lid x2++, 32(x4) + + jal x1, p384_arithmetic_to_boolean_mod + + /* dmem[x] <= [w21,w20] = x' (384 bit, so both words are stored) */ + li x3, 20 + la x4, x + bn.sid x3++, 0(x4) + bn.sid x3, 32(x4) + + ecall + +.data + +/* Operational mode. */ +.globl mode +.balign 4 +mode: + .zero 4 + +/* pointer to x-coordinate (dptr_x) */ +.globl dptr_x +.balign 4 +dptr_x: + .zero 4 + +/* pointer to y-coordinate (dptr_y) */ +.globl dptr_y +.balign 4 +dptr_y: + .zero 4 + +/* Public key x-coordinate. */ +.globl x +.balign 32 +x: + .zero 64 + +/* Public key y-coordinate. */ +.globl y +.balign 32 +y: + .zero 64 + +/* Secret key (d) in two shares: d = (d0 + d1) mod n. + + Note: This is also labeled k0, k1 because the `p384_scalar_mult` algorithm + is also used for ECDSA signing and reads from those labels; in the case of + ECDH, the scalar in `p384_scalar_mult` is always the private key (d).
*/ + +/* pointer to d0 (dptr_d0) */ +.globl dptr_k0 +.globl dptr_d0 +.balign 4 +dptr_d0: + .zero 4 + +/* pointer to d1 (dptr_d1) */ +.globl dptr_k1 +.globl dptr_d1 +.balign 4 +dptr_d1: + .zero 4 + +.globl d0 +.globl k0 +.balign 32 +d0: +k0: + .zero 64 + +.globl d1 +.globl k1 +.balign 32 +d1: +k1: + .zero 64 + +.balign 32 diff --git a/sw/otbn/crypto/p384_ecdsa_keygen.s b/sw/otbn/crypto/p384_ecdsa_keygen.s new file mode 100644 index 0000000000000..e7d282b2a03c8 --- /dev/null +++ b/sw/otbn/crypto/p384_ecdsa_keygen.s @@ -0,0 +1,106 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/** + * Entrypoint for P-384 ECDSA key generation operations. + * + * This binary generates a new keypair. + */ + +.section .text.start +.globl start +start: + /* Init all-zero register. */ + bn.xor w31, w31, w31 + + jal x1, random_keygen + + /* Invalid mode; fail. */ + unimp + unimp + unimp + +/** + * Generate a fresh random keypair. + * + * Returns secret key d in 448-bit shares d0, d1. + * Returns public key Q = d*G in affine coordinates (x, y). + * + * @param[in] w31: all-zero + * @param[in] dmem[0]: dptr_d0, pointer to location in dmem containing + * 1st private key share d0 + * @param[in] dmem[4]: dptr_d1, pointer to location in dmem containing + * 2nd private key share d1 + * @param[in] dmem[20]: dptr_x, pointer to result buffer for x-coordinate + * @param[in] dmem[24]: dptr_y, pointer to result buffer for y-coordinate + * @param[out] dmem[d0]: 1st private key share d0 + * @param[out] dmem[d1]: 2nd private key share d1 + * @param[out] dmem[x]: Public key x-coordinate + * @param[out] dmem[y]: Public key y-coordinate + */ +random_keygen: + /* Generate secret key d in shares. + dmem[d0] <= d0 + dmem[d1] <= d1 */ + jal x1, p384_generate_random_key + + /* Generate public key d*G. 
+ dmem[x] <= (d*G).x + dmem[y] <= (d*G).y */ + jal x1, p384_base_mult + + ecall + +.bss + +/* pointer to k0 (dptr_k0) */ +.globl dptr_k0 +dptr_k0: + .zero 4 + +/* pointer to k1 (dptr_k1) */ +.globl dptr_k1 +dptr_k1: + .zero 4 + +/* pointer to d0 (dptr_d0) */ +.globl dptr_d0 +dptr_d0: + .zero 4 + +/* pointer to d1 (dptr_d1) */ +.globl dptr_d1 +dptr_d1: + .zero 4 + +/* random scalar first share */ +.globl k0 +.balign 32 +k0: + .zero 64 + +/* random scalar second share */ +.globl k1 +.balign 32 +k1: + .zero 64 + +/* private key first share */ +.globl d0 +.balign 32 +d0: + .zero 64 + +/* private key second share */ +.globl d1 +.balign 32 +d1: + .zero 64 + +/* 704 bytes of scratchpad memory + defined globally to save dmem */ +.balign 32 +.globl scratchpad +scratchpad: + .zero 704 diff --git a/sw/otbn/crypto/p384_ecdsa_sca.s b/sw/otbn/crypto/p384_ecdsa_sca.s index fbe766b144ef0..77ec9f800cec6 100644 --- a/sw/otbn/crypto/p384_ecdsa_sca.s +++ b/sw/otbn/crypto/p384_ecdsa_sca.s @@ -26,7 +26,7 @@ start: .text p384_ecdsa_sign: - jal x1, p384_ecdsa_setup_rand + jal x1, p384_ecdsa_setup jal x1, p384_sign ecall @@ -37,23 +37,40 @@ p384_ecdsa_verify: /** * Populate the variables rnd and k with randomness, and setup data pointers. */ -p384_ecdsa_setup_rand: - /* Obtain the blinding constant from URND, and write it to `rnd` in DMEM. */ - /* bn.wsrr w0, 0x2 */ /* URND */ - la x10, rnd - /* bn.sid x0, 0(x10) */ - - /* Point dptr_rnd to rnd. */ - la x11, dptr_rnd +p384_ecdsa_setup: + /* Point dptr_k0 to k0. */ + la x10, k0 + la x11, dptr_k0 sw x10, 0(x11) - /* Obtain the nonce (k) from RND. */ - /*bn.wsrr w0, 0x1 *//* RND */ - la x10, k - /*bn.sid x0, 0(x10)*/ + /* Point dptr_k1 to k1. */ + la x10, k1 + la x11, dptr_k1 + sw x10, 0(x11) + + /* Point dptr_d0 to d0. */ + la x10, d0 + la x11, dptr_d0 + sw x10, 0(x11) + + /* Point dptr_d1 to d1. */ + la x10, d1 + la x11, dptr_d1 + sw x10, 0(x11) - /* Point dptr_k to k. */ - la x11, dptr_k + /* Point dptr_msg to msg. 
*/ + la x10, msg + la x11, dptr_msg + sw x10, 0(x11) + + /* Point dptr_r to sig_r. */ + la x10, r + la x11, dptr_r + sw x10, 0(x11) + + /* Point dptr_s to sig_s. */ + la x10, s + la x11, dptr_s sw x10, 0(x11) ret @@ -70,15 +87,21 @@ mode: /* All constants below must be 256b-aligned. */ -/* random scalar k */ -.global k +/* random scalar k0*/ +.global k0 .balign 64 -k: +k0: .zero 64 -/* randomness for blinding */ +/* random scalar k1*/ +.global k1 .balign 64 +k1: + .zero 64 + +/* randomness for blinding */ .global rnd +.balign 64 rnd: .zero 64 @@ -112,10 +135,16 @@ x: y: .zero 64 -/* private key d */ -.globl d +/* private key d0 */ +.globl d0 .balign 64 -d: +d0: + .zero 64 + +/* private key d1 */ +.globl d1 +.balign 64 +d1: .zero 64 /* verification result x_r (aka x_1) */ @@ -123,3 +152,53 @@ d: .balign 64 x_r: .zero 64 + +/* pointer to rnd (dptr_rnd) */ +.globl dptr_rnd +dptr_rnd: + .zero 4 + +/* pointer to k0 (dptr_k0) */ +.globl dptr_k0 +dptr_k0: + .zero 4 + +/* pointer to k1 (dptr_k1) */ +.globl dptr_k1 +dptr_k1: + .zero 4 + +/* pointer to msg (dptr_msg) */ +.globl dptr_msg +dptr_msg: + .zero 4 + +/* pointer to R (dptr_r) */ +.globl dptr_r +dptr_r: + .zero 4 + +/* pointer to S (dptr_s) */ +.globl dptr_s +dptr_s: + .zero 4 + +/* pointer to X (dptr_x) */ +.globl dptr_x +dptr_x: + .zero 4 + +/* pointer to Y (dptr_y) */ +.globl dptr_y +dptr_y: + .zero 4 + +/* pointer to d0 (dptr_d0) */ +.globl dptr_d0 +dptr_d0: + .zero 4 + +/* pointer to d1 (dptr_d1) */ +.globl dptr_d1 +dptr_d1: + .zero 4 diff --git a/sw/otbn/crypto/p384_ecdsa_sign.s b/sw/otbn/crypto/p384_ecdsa_sign.s new file mode 100644 index 0000000000000..c063c15bff2b4 --- /dev/null +++ b/sw/otbn/crypto/p384_ecdsa_sign.s @@ -0,0 +1,163 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/** + * Entrypoint for P-384 ECDSA signing operations. 
+ * + * This binary generates a signature using a caller-provided secret key. + */ + +.section .text.start +.globl start +start: + /* Init all-zero register. */ + bn.xor w31, w31, w31 + + jal x1, ecdsa_sign + + /* Invalid mode; fail. */ + unimp + unimp + unimp + +/** + * P-384 ECDSA signature generation. + * Generate the secret scalar k from a random seed. + * + * @param[in] dmem[0]: dptr_k0, pointer to location in dmem containing + * 1st scalar share k0 + * @param[in] dmem[4]: dptr_k1, pointer to location in dmem containing + * 2nd scalar share k1 + * @param[in] dmem[8]: dptr_msg, pointer to the message to be signed in dmem + * @param[in] dmem[12]: dptr_r, pointer to dmem location where r component + * of signature will be placed + * @param[in] dmem[16]: dptr_s, pointer to dmem location where s component + * of signature will be placed + * @param[in] dmem[28]: dptr_d0, pointer to location in dmem containing + * 1st private key share d0 + * @param[in] dmem[32]: dptr_d1, pointer to location in dmem containing + * 2nd private key share d1 + * @param[out] dmem[r]: r component of signature + * @param[out] dmem[s]: s component of signature + */ +ecdsa_sign: + /* Generate a fresh random scalar for signing. + dmem[k0] <= first share of k + dmem[k1] <= second share of k */ + jal x1, p384_generate_k + + /* Generate the signature. 
*/ + jal x1, p384_sign + + ecall + +.bss + +/* pointer to x-coordinate (dptr_x) */ +.globl dptr_x +.balign 4 +dptr_x: + .zero 4 + +/* pointer to y-coordinate (dptr_y) */ +.globl dptr_y +.balign 4 +dptr_y: + .zero 4 + +/* pointer to k0 (dptr_k0) */ +.globl dptr_k0 +dptr_k0: + .zero 4 + +/* pointer to k1 (dptr_k1) */ +.globl dptr_k1 +dptr_k1: + .zero 4 + +/* pointer to d0 (dptr_d0) */ +.globl dptr_d0 +dptr_d0: + .zero 4 + +/* pointer to d1 (dptr_d1) */ +.globl dptr_d1 +dptr_d1: + .zero 4 + +/* pointer to msg (dptr_msg) */ +.globl dptr_msg +dptr_msg: + .zero 4 + +/* pointer to R (dptr_r) */ +.globl dptr_r +dptr_r: + .zero 4 + +/* pointer to S (dptr_s) */ +.globl dptr_s +dptr_s: + .zero 4 + +/* x-coordinate. */ +.globl x +.balign 32 +x: + .zero 64 + +/* y-coordinate. */ +.globl y +.balign 32 +y: + .zero 64 + +/* random scalar first share */ +.globl k0 +.balign 32 +k0: + .zero 64 + +/* random scalar second share */ +.globl k1 +.balign 32 +k1: + .zero 64 + +/* private key first share */ +.globl d0 +.balign 32 +d0: + .zero 64 + +/* private key second share */ +.globl d1 +.balign 32 +d1: + .zero 64 + +/* hash message to sign/verify */ +.globl msg +.balign 32 +msg: + .zero 64 + +/* r part of signature */ +.globl r +.balign 32 +r: + .zero 64 + +/* s part of signature */ +.globl s +.balign 32 +s: + .zero 64 + +/* 704 bytes of scratchpad memory + defined globally to save dmem */ +.balign 32 +.globl scratchpad +scratchpad: + .zero 704 diff --git a/sw/otbn/crypto/p384_ecdsa_verify.s b/sw/otbn/crypto/p384_ecdsa_verify.s new file mode 100644 index 0000000000000..577dcc2184bd5 --- /dev/null +++ b/sw/otbn/crypto/p384_ecdsa_verify.s @@ -0,0 +1,130 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/** + * Entrypoint for P-384 ECDSA verifying operations. + * + * This binary verifies a signature. - !!! Attention !!! 
- before + * signature verification p384_curve_point_valid + * binary has to be executed to check if the provided + * public key is valid. + */ + +.section .text.start +.globl start +start: + /* Init all-zero register. */ + bn.xor w31, w31, w31 + + jal x1, ecdsa_verify + + /* Invalid mode; fail. */ + unimp + unimp + unimp + + +/** + * P-384 ECDSA signature verification + * + * The routine computes the x1 coordinate and places it in dmem. x1 will be + * reduced (mod n), however, the final comparison has to be performed on the + * host side. The signature is valid if x1 == r. + * This routine runs in variable time. + * + * @param[in] dmem[4]: dptr_rnd, pointer to dmem location where the reduced + * affine x1-coordinate will be stored + * @param[in] dmem[8]: dptr_msg, pointer to the message to be verified in dmem + * @param[in] dmem[12]: dptr_r, pointer to r of signature in dmem + * @param[in] dmem[16]: dptr_s, pointer to s of signature in dmem + * @param[in] dmem[20]: dptr_x, pointer to x-coordinate of public key in dmem + * @param[in] dmem[24]: dptr_y, pointer to y-coordinate of public key in dmem + * @param[out] dmem[rnd]: x1 coordinate to be compared to r + * + * !!! Attention !!! - before signature verification p384_curve_point_valid + * binary has to be executed to check if the provided public key is valid. + * + */ +ecdsa_verify: + /* Verify the signature (compute x1). */ + jal x1, p384_verify + + ecall + +.bss + +/* pointer to x-coordinate (dptr_x) */ +.globl dptr_x +.balign 4 +dptr_x: + .zero 4 + +/* pointer to y-coordinate (dptr_y) */ +.globl dptr_y +.balign 4 +dptr_y: + .zero 4 + +/* pointer to rnd (dptr_rnd) */ +.globl dptr_rnd +dptr_rnd: + .zero 4 + +/* pointer to msg (dptr_msg) */ +.globl dptr_msg +dptr_msg: + .zero 4 + +/* pointer to R (dptr_r) */ +.globl dptr_r +dptr_r: + .zero 4 + +/* pointer to S (dptr_s) */ +.globl dptr_s +dptr_s: + .zero 4 + +/* Public key x-coordinate. */ +.globl x +.balign 32 +x: + .zero 64 + +/* Public key y-coordinate. 
*/ +.globl y +.balign 32 +y: + .zero 64 + +/* result of verify (x1 coordinate) */ +.globl rnd +.balign 32 +rnd: + .zero 64 + +/* hash message to sign/verify */ +.globl msg +.balign 32 +msg: + .zero 64 + +/* r part of signature */ +.globl r +.balign 32 +r: + .zero 64 + +/* s part of signature */ +.globl s +.balign 32 +s: + .zero 64 + +/* 896 bytes of scratchpad memory + defined globally to save dmem. */ +.balign 32 +.globl scratchpad +scratchpad: + .zero 896 diff --git a/sw/otbn/crypto/p384_internal_mult.s b/sw/otbn/crypto/p384_internal_mult.s new file mode 100644 index 0000000000000..409404937696d --- /dev/null +++ b/sw/otbn/crypto/p384_internal_mult.s @@ -0,0 +1,374 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ +/* + * P-384 specific routines for internal scalar multiplication of curve points. + */ + + .section .text + +/** + * Fetch curve point from dmem, randomize z-coordinate and store point in dmem + * + * returns P = (x, y, z) = (x_a*z, y_a*z, z) + * with P being a valid P-384 curve point in projective coordinates + * x_a and y_a being the affine coordinates as fetched from dmem + * z being a randomized z-coordinate + * + * This routines fetches the affine x- and y-coordinates of a curve point from + * dmem and computes a valid set of projective coordinates. The z-coordinate is + * randomized and x and y are scaled appropriately. The resulting projective + * coordinates are stored at dmem[dptr_p_p] using 6 consecutive 256-bit cells, + * i.e. each coordinate is stored 512 bit aligned, little endian. + * This routine runs in constant time. 
+ * + * @param[in] x20: dptr_x, pointer to dmem location containing affine + * x-coordinate of input point + * @param[in] x21: dptr_y, pointer to dmem location containing affine + * y-coordinate of input point + * @param[in] [w15, w14]: u[383:0] lower 384 bit of Barrett constant u for + * modulus p + * @param[in] [w13, w12]: p, modulus of P-384 underlying finite field + * @param[in] w31: all-zero + * @param[in] x18: dptr_p_p, pointer to dmem location to store resulting point + * in projective space + * + * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on + * the upper limb of projective y-coordinate. + * + * clobbered registers: x10, x11 to x13 + * w2, w3, w8 to w11, w16 to w24, w29, w30 + * clobbered flag groups: FG0 + */ + .globl store_proj_randomize +store_proj_randomize: + + /* get a 384-bit random number from URND + [w3, w2] = random(384) */ + bn.wsrr w2, 2 + bn.wsrr w3, 2 + bn.rshi w3, w31, w3 >> 128 + + /* reduce random number (single conditional subtraction of p) + [w3, w2] = z <= [w3, w2] mod p */ + bn.sub w10, w2, w12 + bn.subb w11, w3, w13 + bn.sel w2, w2, w10, C + bn.sel w3, w3, w11, C + + bn.mov w10, w2 + bn.mov w11, w3 + + /* store z-coordinate + [dmem[x18+128], dmem[x18+160]] = [w11, w10] = z */ + li x10, 10 + li x11, 11 + bn.sid x10, 128(x18) + bn.sid x11, 160(x18) + + /* fetch x-coordinate from dmem + [w17, w16] = x <= [dmem[dptr_x], dmem[dptr_x+32]] */ + li x12, 16 + li x13, 17 + bn.lid x12, 0(x20) + bn.lid x13, 32(x20) + + /* scale and store x-coordinate + [dmem[dptr_p_p], dmem[dptr_p_p+32]] = [w17, w16] = + x_p <= [w11, w10] * [w17, w16] = z*x mod p */ + + jal x1, p384_mulmod_p + bn.sid x12, 0(x18) + bn.sid x13, 32(x18) + + /* fetch y-coordinate from dmem (x12/x13 still select w16/w17) + [w17, w16] = y <= [dmem[dptr_y], dmem[dptr_y+32]] */ + bn.lid x12, 0(x21) + bn.lid x13, 32(x21) + + /* scale and store y-coordinate; z is copied back from [w3, w2] into the + multiplier operand [w11, w10] first + [dmem[dptr_p_p+64], dmem[dptr_p_p+96]] = [w17, w16] = + y_p <= [w11, w10] * [w17, w16] = z*y mod p */ + bn.mov w10, w2 + bn.mov w11, w3 + jal x1, p384_mulmod_p + bn.sid x12, 64(x18) + bn.sid 
x13, 96(x18) + + ret + +/** + * P-384 scalar point multiplication in projective space + * + * returns R = k*P = k*(x_p, y_p) + * where P is a valid P-384 curve point in affine coordinates, + * k is a 384-bit scalar, + * R is a valid P-384 curve point in projective coordinates. + * + * This routine performs scalar multiplication based on the group laws + * of Weierstrass curves. + * A constant time double-and-add algorithm (sometimes referred to as + * double-and-add-always) is used. + * Due to the P-384 optimized implementations of the internally called routines + * for point addition and doubling, this routine is limited to P-384 curves. + * The routine makes use of blinding by additive splitting the + * exponent/scalar d into two shares. The double-and-add loop operates on both + * shares in parallel applying the Strauss-Shamir trick: + * The routine receives the scalar in two shares k0, k1 such that + * k = (k0 + k1) mod n + * The loop operates on both shares in parallel, computing (k0 + k1) * P as + * follows: + * Q = (0, 1, 0) # origin + * for i in 448..0: + * Q = 2 * Q + * A = if (k0[i] ^ k1[i]) then P else 2P + * B = Q + A + * Q = if (k0[i] | k1[i]) then B else Q + * + * Each share k0/k1 is 448 bits, even though it represents a 384-bit value. + * This is a side-channel protection measure. 
+ * + * @param[in] x17: dptr_k0, pointer to first share k0 of scalar k + * (0 < k < n) in dmem (448-bit) + * @param[in] x19: dptr_k1, pointer to second share k1 of scalar k + * (0 < k < n) in dmem (448-bit) + * @param[in] x20: dptr_x, pointer to affine x-coordinate in dmem + * @param[in] x21: dptr_y, pointer to affine y-coordinate in dmem + * @param[in] x28: dptr_b, pointer to domain parameter b of P-384 in dmem + * @param[in] x30: dptr_sp, pointer to 704 bytes of scratchpad memory in dmem + * @param[in] [w13, w12]: p, modulus of P-384 underlying finite field + * @param[in] [w11, w10]: n, domain parameter of P-384 curve + * (order of base point G) + * @param[in] w31: all-zero + * @param[out] [w26,w25]: x, x-coordinate of resulting point R (projective). + * @param[out] [w28,w27]: y, y-coordinate of resulting point R (projective). + * @param[out] [w30,w29]: z, z-coordinate of resulting point R (projective). + * + * Scratchpad memory layout: + * The routine expects at least 704 bytes of scratchpad memory at dmem + * location 'scratchpad' (sp). Internally the scratchpad is used as follows: + * dptr_sp .. dptr_sp+191: point P, projective + * dptr_sp+192 .. dptr_sp+255: s0, 1st share of scalar + * dptr_sp+256 .. dptr_sp+447: point 2P, projective + * dptr_sp+448 .. dptr_sp+511: s1, 2nd share of scalar + * dptr_sp+512 .. dptr_sp+703: point Q, projective + * + * Projective coordinates of a point are kept in dmem in little endian format + * with the individual coordinates 512 bit aligned. The coordinates are stored + * in x,y,z order (i.e. x at lowest, z at highest address). Thus, a 384 bit + * curve point occupies 6 consecutive 256-bit dmem cells. + * + * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on + * the computed affine y-coordinate. 
* + * clobbered registers: x2, x10, x11 to x13, x18, x26, x27, w0 to w30 + * clobbered flag groups: FG0 + */ + .globl scalar_mult_int_p384 +scalar_mult_int_p384: + + /* set regfile pointers to in/out regs of Barrett routine. Set here to avoid + resetting in every call to point addition routine */ + li x22, 10 + li x23, 11 + li x24, 16 + li x25, 17 + + /* fetch 1st share of scalar from dmem + s0 = [w1, w0] <= dmem[dptr_k0] = [dmem[x17], dmem[x17+32]] = k0 */ + li x2, 0 + bn.lid x2++, 0(x17) + bn.lid x2++, 32(x17) + + /* fetch 2nd share of scalar from dmem + s1 = [w3, w2] <= dmem[dptr_k1] = [dmem[x19], dmem[x19+32]] = k1 */ + bn.lid x2++, 0(x19) + bn.lid x2++, 32(x19) + + /* left align both shares for probing of MSB in loop body */ + bn.rshi w1, w1, w0 >> 192 + bn.rshi w0, w0, w31 >> 192 + bn.rshi w3, w3, w2 >> 192 + bn.rshi w2, w2, w31 >> 192 + + /* store shares in scratchpad + dmem[dptr_sp+192] <= s0, dmem[dptr_sp+448] <= s1 */ + li x2, 0 + bn.sid x2++, 192(x30) + bn.sid x2++, 224(x30) + bn.sid x2++, 448(x30) + bn.sid x2++, 480(x30) + + /* get randomized projective coordinates of curve point + P = (x_p, y_p, z_p) = dmem[dptr_sp] = (x*z mod p, y*z mod p, z) */ + add x18, x30, 0 + jal x1, store_proj_randomize + + /* double point P (both operand pointers x26 and x27 point at P) + 2P = ([w30,w29], [w28,w27], [w26, w25]) <= 2*P */ + add x27, x30, x0 + add x26, x30, x0 + jal x1, proj_add_p384 + + /* store point 2P in scratchpad @x30+256 + dmem[dptr_sp+256] = [w30:w25] = 2P */ + li x2, 25 + bn.sid x2++, 256(x30) + bn.sid x2++, 288(x30) + bn.sid x2++, 320(x30) + bn.sid x2++, 352(x30) + bn.sid x2++, 384(x30) + bn.sid x2++, 416(x30) + + /* init point Q = (0,1,0) for double-and-add in scratchpad */ + /* dmem[x26] = dmem[dptr_sp+512] = Q = (0,1,0) */ + addi x26, x30, 512 + li x2, 30 + bn.addi w30, w31, 1 + bn.sid x2++, 64(x26) + bn.sid x2, 0(x26) + bn.sid x2, 32(x26) + bn.sid x2, 96(x26) + bn.sid x2, 128(x26) + bn.sid x2, 160(x26) + + /* double-and-add loop with decreasing index */ + loopi 448, 85 + + /* double point Q + Q = ([w30,w29], [w28,w27], [w26, w25]) <= Q + 
dmem[x27] */ + add x27, x26, x0 + jal x1, proj_add_p384 + + /* store Q in dmem + dmem[x26] = dmem[dptr_sp+512] <= [w30:w25] */ + li x2, 25 + bn.sid x2++, 0(x26) + bn.sid x2++, 32(x26) + bn.sid x2++, 64(x26) + bn.sid x2++, 96(x26) + bn.sid x2++, 128(x26) + bn.sid x2++, 160(x26) + + /* Probe if MSb of either of the two scalar shares (s0 or s1) but not both + is 1. + If only one MSb is set, select P for addition. + If both MSbs are set, select 2P for addition. + (If neither MSB is set, 2P will be selected but result discarded.) + Only the upper 256-bit halves of the shares are loaded: + w0 <= upper half of s0, w1 <= upper half of s1 */ + li x2, 0 + bn.lid x2++, 224(x30) + bn.lid x2, 480(x30) + bn.xor w8, w0, w1 + /* Create conditional offset into scratchpad from the flags set by bn.xor. + if (s0[511] xor s1[511]) x27 <= x30 else x27 <= x30+256 */ + csrrs x3, FG0, x0 + xori x3, x3, -1 + andi x3, x3, 2 + slli x27, x3, 7 + add x27, x27, x30 + + /* Reload randomized projective coordinates for curve point P. + P = (x_p, y_p, z_p) = dmem[dptr_sp] <= (x*z mod p, y*z mod p, z) */ + jal x1, store_proj_randomize + + /* Add points Q+P or Q+2P depending on offset in x27. 
+ Q_a = ([w30,w29], [w28,w27], [w26, w25]) <= Q + dmem[x27] */ + jal x1, proj_add_p384 + + /* load shares from scratchpad + [w1, w0] = s0; [w3, w2] = s1 */ + li x2, 0 + bn.lid x2++, 192(x30) + bn.lid x2++, 224(x30) + bn.lid x2++, 448(x30) + bn.lid x2++, 480(x30) + + /* M = s0[511] | s1[511] + (w8 is only a scratch destination; the M flag is what is used below) */ + bn.or w8, w1, w3 + + /* load q from scratchpad + Q = ([w9,w8], [w7,w6], [w5,w4]) <= dmem[x26] */ + li x2, 4 + bn.lid x2++, 0(x26) + bn.lid x2++, 32(x26) + bn.lid x2++, 64(x26) + bn.lid x2++, 96(x26) + bn.lid x2++, 128(x26) + bn.lid x2++, 160(x26) + + /* select either Q or Q_a + if M: Q = ([w30,w29], [w28,w27], [w26, w25]) <= Q_a (sum is kept) + else: Q <= Q (copy reloaded into [w9:w4], sum is discarded) */ + bn.sel w25, w25, w4, M + bn.sel w26, w26, w5, M + bn.sel w27, w27, w6, M + bn.sel w28, w28, w7, M + bn.sel w29, w29, w8, M + bn.sel w30, w30, w9, M + + /* store Q in dmem + dmem[x26] = dmem[dptr_sp+512] <= [w30:w25] */ + li x2, 25 + bn.sid x2++, 0(x26) + bn.sid x2++, 32(x26) + bn.sid x2++, 64(x26) + bn.sid x2++, 96(x26) + bn.sid x2++, 128(x26) + bn.sid x2++, 160(x26) + + /* left shift both shares + s0 <= s0 << 1 ; s1 <= s1 << 1 */ + bn.add w0, w0, w0 + bn.addc w1, w1, w1 + bn.add w2, w2, w2 + bn.addc w3, w3, w3 + /* store both shares in scratchpad */ + li x2, 0 + bn.sid x2++, 192(x30) + bn.sid x2++, 224(x30) + bn.sid x2++, 448(x30) + bn.sid x2++, 480(x30) + + + /* Get a fresh random number from URND and scale the coordinates of 2P. + (scaling each proj. 
coordinate by same factor results in same point) */ + + /* get a 384-bit random number from URND */ + bn.wsrr w2, 2 + bn.wsrr w3, 2 + bn.rshi w3, w31, w3 >> 128 + + /* reduce random number + [w2, w3] = z <= [w2, w3] mod p */ + bn.sub w10, w2, w12 + bn.subb w11, w3, w13 + bn.sel w2, w2, w10, C + bn.sel w3, w3, w11, C + + /* scale all coordinates in scratchpad */ + li x2, 16 + li x3, 17 + /* x-coordinate */ + bn.mov w10, w2 + bn.mov w11, w3 + bn.lid x2, 256(x30) + bn.lid x3, 288(x30) + jal x1, p384_mulmod_p + bn.sid x2, 256(x30) + bn.sid x3, 288(x30) + /* y-coordinate */ + bn.mov w10, w2 + bn.mov w11, w3 + bn.lid x2, 320(x30) + bn.lid x3, 352(x30) + jal x1, p384_mulmod_p + bn.sid x2, 320(x30) + bn.sid x3, 352(x30) + /* z-coordinate */ + bn.mov w10, w2 + bn.mov w11, w3 + bn.lid x2, 384(x30) + bn.lid x3, 416(x30) + jal x1, p384_mulmod_p + bn.sid x2, 384(x30) + bn.sid x3, 416(x30) + + ret diff --git a/sw/otbn/crypto/p384_isoncurve.s b/sw/otbn/crypto/p384_isoncurve.s new file mode 100644 index 0000000000000..ef8bb215dbd75 --- /dev/null +++ b/sw/otbn/crypto/p384_isoncurve.s @@ -0,0 +1,316 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ +/* + * P-384 specific routines for ECDSA signature verification and curve point + * test. + */ + + .section .text + +/** + * Checks if a point is a valid curve point on curve P-384 + * + * Returns rhs = x^3 + ax + b mod p + * and lhs = y^2 mod p + * where x,y are the affine coordinates of the curve point and + * a, b and p being the domain parameters of curve P-384. + * + * This routine checks if a point with given x- and y-coordinate is a valid + * curve point on P-384. + * The routine checks whether the coordinates are a solution of the + * Weierstrass equation y^2 = x^3 + ax + b mod p. 
+ * The routine makes use of the property that the domain parameter 'a' can be + * written as a=-3 for the P-384 curve, hence the routine is limited to P-384. + * The routine does not return a boolean result but computes the left side + * and the right sight of the Weierstrass equation and leaves the final + * comparison to the caller. + * The routine runs in constant time. + * + * Flags: Flags have no meaning beyond the scope of this subroutine. + * + * @param[in] dmem[12]: dptr_rhs, pointer to dmem location where right + * side result will be stored + * @param[in] dmem[16]: dptr_lhs, pointer to dmem location where left side + * result will be stored + * @param[in] dmem[20]: dptr_x, pointer to dmem location containing affine + * x-coordinate of input point + * @param[in] dmem[24]: dptr_y, pointer to dmem location containing affine + * y-coordinate of input point + * + * clobbered registers: x2, x3, w0 to w5, w10 to w17 + * clobbered flag groups: FG0 + */ + .globl p384_isoncurve +p384_isoncurve: + + /* setup all-zero reg */ + bn.xor w31, w31, w31 + + /* load affine x-coordinate of curve point from dmem + [w1, w0] <= dmem[dptr_x] = dmem[20] */ + la x3, dptr_x + lw x3, 0(x3) + li x2, 0 + bn.lid x2++, 0(x3) + bn.lid x2++, 32(x3) + + /* load affine y-coordinate of curve point from dmem + [w3, w2] <= dmem[dptr_y] = dmem[24] */ + la x3, dptr_y + lw x3, 0(x3) + bn.lid x2++, 0(x3) + bn.lid x2, 32(x3) + + /* load domain parameter p (modulus) from dmem + [w13, w12] = p = dmem[p384_p] */ + li x2, 12 + la x3, p384_p + bn.lid x2++, 0(x3) + bn.lid x2++, 32(x3) + + /* load domain parameter b from dmem + [w5, w4] = b = dmem[p384_b] */ + li x2, 4 + la x3, p384_b + bn.lid x2++, 0(x3) + bn.lid x2++, 32(x3) + + /* y^2 = [w17,w16] <= y*y = [w3,w2]*[w3,w2] */ + bn.mov w10, w2 + bn.mov w11, w3 + bn.mov w16, w2 + bn.mov w17, w3 + jal x1, p384_mulmod_p + + /* store result (left side): dmem[dptr_lhs] <= y^2 = [w17,w16] */ + la x3, dptr_lhs + lw x3, 0(x3) + li x2, 16 + bn.sid x2++, 0(x3) + 
bn.sid x2++, 32(x3) + + /* x^3 = [w17,w16] <= (x*x)*x = ([w1,w0]*[w1,w0])*[w1,w0] */ + bn.mov w10, w0 + bn.mov w11, w1 + bn.mov w16, w0 + bn.mov w17, w1 + jal x1, p384_mulmod_p + bn.mov w10, w0 + bn.mov w11, w1 + jal x1, p384_mulmod_p + + /* for curve P-384, 'a' can be written as a = -3, therefore we subtract + x three times from x^3. + x^3 + ax mod p = [w17,w16] <= x^3 - 3x mod p + = [w17,w16] - [w1,w0] - [w1,w0] - [w1,w0] mod [w13,w12] */ + loopi 3, 6 + bn.sub w16, w16, w0 + bn.subb w17, w17, w1 + bn.add w10, w16, w12 + bn.addc w11, w17, w13 + bn.sel w16, w10, w16, C + bn.sel w17, w11, w17, C + + /* add domain parameter b + x^3 + ax + b mod p = [w17,w16] <= [w17,w16] + [w5,w4] mod [w13,w12] */ + bn.add w16, w16, w4 + bn.addc w17, w17, w5 + bn.sub w10, w16, w12 + bn.subb w11, w17, w13 + bn.sel w16, w16, w10, C + bn.sel w17, w17, w11, C + + /* store result (right side) + dmem[dptr_rhs] <= x^3 + ax + b mod p = [w17,w16] */ + la x3, dptr_rhs + lw x3, 0(x3) + li x2, 16 + bn.sid x2++, 0(x3) + bn.sid x2++, 32(x3) + + ret + +/** + * Check if a provided curve point is valid. + * + * For a given curve point (x, y), check that: + * - x and y are both fully reduced mod p + * - (x, y) is on the P-384 curve. + * + * Note that, because the point is in affine form, it is not possible that (x, + * y) is the point at infinity. In some other forms such as projective + * coordinates, we would need to check for this also. + * + * This routine raises a software error and halts operation if the curve point + * is invalid. 
+ * + * @param[in] dmem[12]: dptr_rhs, pointer to dmem location where right hand + * side result rhs will be stored + * @param[in] dmem[16]: dptr_lhs, pointer to dmem location where left hand + * side result lhs will be stored + * @param[in] dmem[20]: dptr_x, pointer to dmem location containing affine + * x-coordinate of input point + * @param[in] dmem[24]: dptr_y, pointer to dmem location containing affine + * y-coordinate of input point + * + * Flags: Flags have no meaning beyond the scope of this subroutine. + * + * clobbered registers: x2, x3, x20 to x23, w0 to w17 + * clobbered flag groups: FG0 + */ + .globl p384_curve_point_valid +p384_curve_point_valid: + /* Init all-zero register. */ + bn.xor w31, w31, w31 + + /* load domain parameter p (modulus) + [w13, w12] = p = dmem[p384_p] */ + li x2, 12 + la x3, p384_p + bn.lid x2++, 0(x3) + bn.lid x2++, 32(x3) + + /* Load public key x-coordinate. + [w11, w10] <= dmem[x] = x */ + la x20, dptr_x + lw x20, 0(x20) + li x2, 10 + bn.lid x2++, 0(x20) + bn.lid x2, 32(x20) + + /* Compare x to p (w0 is only a scratch destination). + FG0.C <= (x < p) */ + bn.sub w0, w10, w12 + bn.subb w0, w11, w13 + + /* Trigger a fault if FG0.C is false. */ + csrrs x2, FG0, x0 + andi x2, x2, 1 + bne x2, x0, _x_valid + unimp + + _x_valid: + + /* Load public key y-coordinate. + [w9, w8] <= dmem[y] = y */ + la x21, dptr_y + lw x21, 0(x21) + li x2, 8 + bn.lid x2++, 0(x21) + bn.lid x2, 32(x21) + + /* Compare y to p (w0 is only a scratch destination). + FG0.C <= (y < p) */ + bn.sub w0, w8, w12 + bn.subb w0, w9, w13 + + /* Trigger a fault if FG0.C is false. */ + csrrs x2, FG0, x0 + andi x2, x2, 1 + bne x2, x0, _y_valid + unimp + + _y_valid: + + /* Compute both sides of the Weierstrass equation. + dmem[rhs] <= (x^3 + ax + b) mod p + dmem[lhs] <= (y^2) mod p */ + jal x1, p384_isoncurve + + /* Load both sides of the equation. 
+ [w7, w6] <= dmem[rhs] + [w5, w4] <= dmem[lhs] */ + la x22, dptr_rhs + lw x22, 0(x22) + li x2, 6 + bn.lid x2++, 0(x22) + bn.lid x2, 32(x22) + la x23, dptr_lhs + lw x23, 0(x23) + li x2, 4 + bn.lid x2++, 0(x23) + bn.lid x2, 32(x23) + + /* Compare the two sides of the equation. + [w1, w0] <= lhs - rhs; each limb is then compared to zero separately: + FG0.Z <= (y^2) mod p == (x^3 + ax + b) mod p */ + bn.sub w0, w4, w6 + bn.subb w1, w5, w7 + + bn.cmp w0, w31 + + /* Trigger a fault if FG0.Z is false. */ + csrrs x2, FG0, x0 + srli x2, x2, 3 + andi x2, x2, 1 + bne x2, x0, _pt_1st_reg_valid + unimp + unimp + unimp + + _pt_1st_reg_valid: + + bn.cmp w1, w31 + + /* Trigger a fault if FG0.Z is false. */ + csrrs x2, FG0, x0 + srli x2, x2, 3 + andi x2, x2, 1 + bne x2, x0, _pt_valid + unimp + unimp + unimp + + _pt_valid: + + ret + +.data + +/* Right side of Weierstrass equation */ +.globl rhs +.balign 32 +rhs: + .zero 64 + +/* Left side of Weierstrass equation */ +.globl lhs +.balign 32 +lhs: + .zero 64 + +/* Curve point x-coordinate. */ +.globl x +.weak x +.balign 32 +x: + .zero 64 + +/* Curve point y-coordinate. */ +.globl y +.weak y +.balign 32 +y: + .zero 64 + +/* pointer to rhs (dptr_rhs) */ +.globl dptr_rhs +dptr_rhs: + .zero 4 + +/* pointer to lhs (dptr_lhs) */ +.globl dptr_lhs +dptr_lhs: + .zero 4 + +/* pointer to X (dptr_x) */ +.globl dptr_x +.weak dptr_x +dptr_x: + .zero 4 + +/* pointer to Y (dptr_y) */ +.globl dptr_y +.weak dptr_y +dptr_y: + .zero 4 diff --git a/sw/otbn/crypto/p384_keygen.s b/sw/otbn/crypto/p384_keygen.s new file mode 100644 index 0000000000000..b608ea8af9f5b --- /dev/null +++ b/sw/otbn/crypto/p384_keygen.s @@ -0,0 +1,256 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ +/* + * This library contains: + * - P-384 specific routines to generate random values for keys and scalars + */ + +.section .text + +/** + * Generate a nonzero random value in the scalar field. 
+ * + * Returns t, a random value that is nonzero mod n, in shares. + * + * This follows a modified version of the method in FIPS 186-4 sections B.4.1 + * and B.5.1 for generation of secret scalar values d and k. The computation + * in FIPS 186-4 is: + * seed = RBG(seedlen) // seedlen >= 448 + * return (seed mod (n-1)) + 1 + * + * The important features here are that (a) the seed is at least 64 bits longer + * than n in order to minimize bias after the reduction and (b) the resulting + * scalar is guaranteed to be nonzero. + * + * We deviate from FIPS a little bit here because for side-channel protection, + * we do not want to fully reduce the seed modulo (n-1) or combine the shares. + * Instead, we do the following: + * seed0 = RBG(448) + * seed1 = RBG(448) + * x = URND(127) + 1 // random value for masking + * if (seed0 * x + seed1 * x) mod n == 0: + * retry + * return seed0, seed1 + * + * Essentially, we get two independent seeds and interpret these as additive + * shares of the scalar t = (seed0 + seed1) mod n. Then, we need to ensure t is + * nonzero. Multiplying each share with a random masking parameter allows us to + * safely add them, and then check if this result is 0; if it is, then t must + * be 0 mod n and we need to retry. + * + * Flags: Flags have no meaning beyond the scope of this subroutine. + * + * @param[in] w31: all-zero + * @param[in] dmem[p384_n]: Curve order n + * @param[out] [w7,w6]: first share of secret scalar t (448 bits) + * @param[out] [w9,w8]: second share of secret scalar t (448 bits) + * + * clobbered registers: x2, x3, w4 to w11, w14, w16 to w28 + * clobbered flag groups: FG0 + */ +p384_random_scalar: + /* Load the curve order n. + [w13,w12] <= dmem[p384_n] = n */ + li x2, 12 + la x3, p384_n + bn.lid x2++, 0(x3) + bn.lid x2++, 32(x3) + + random_scalar_retry: + /* Obtain 1024 bits of randomness from RND. 
*/ + bn.wsrr w6, RND + bn.wsrr w7, RND + bn.wsrr w8, RND + bn.wsrr w9, RND + + /* XOR with bits from URND, just in case there's any vulnerability in EDN + that lets the attacker recover bits before they reach OTBN. */ + bn.wsrr w5, URND + bn.xor w6, w6, w5 + bn.wsrr w5, URND + bn.xor w7, w7, w5 + bn.wsrr w5, URND + bn.xor w8, w8, w5 + bn.wsrr w5, URND + bn.xor w9, w9, w5 + + /* Shift bits to get 448-bit seeds (256 low bits + 192 high bits each). + seed0 = [w7,w6], seed1 = [w9,w8] + w7 <= w7[255:64] + w9 <= w9[255:64] */ + bn.rshi w7, w31, w7 >> 64 + bn.rshi w9, w31, w9 >> 64 + + /* Compute Solinas constant k for modulus n (we know it is only 191 bits, so + no need to compute the high part): + w14 <= 2^256 - n[255:0] = (2^384 - n) mod (2^256) = 2^384 - n */ + bn.sub w14, w31, w12 + + /* Generate a random 127-bit number. + w4 <= URND()[255:129] */ + bn.wsrr w4, URND + bn.rshi w4, w31, w4 >> 129 + + /* Add 1 to get a 128-bit nonzero scalar for masking. + w4 <= w4 + 1 = x */ + bn.addi w4, w4, 1 + + /* [w26,w25] <= ([w7,w6] * w4) mod n = (seed0 * x) mod n */ + bn.mov w16, w4 + bn.mov w10, w6 + bn.mov w11, w7 + jal x1, p384_mulmod448x128_n + bn.mov w25, w16 + bn.mov w26, w17 + + /* [w28,w27] <= ([w9,w8] * w4) mod n = (seed1 * x) mod n */ + bn.mov w16, w4 + bn.mov w10, w8 + bn.mov w11, w9 + jal x1, p384_mulmod448x128_n + bn.mov w27, w16 + bn.mov w28, w17 + + /* Compute (seed * x) mod n = (seed0 * x + seed1 * x) mod n + [w17,w16] <= seed * x = [w26,w25] + [w28,w27] mod n */ + bn.add w18, w27, w25 + bn.addc w19, w28, w26 + bn.mov w20, w31 + jal x1, p384_reduce_n + + /* Compare w16 to 0. */ + bn.cmp w16, w31 + + /* Read the FG0.Z flag (position 3). + x2 <= 8 if FG0.Z else 0 */ + csrrw x2, FG0, x0 + andi x2, x2, 8 + + /* Compare w17 to 0. */ + bn.cmp w17, w31 + + /* Read the FG0.Z flag (position 3). + x3 <= 8 if FG0.Z else 0 */ + csrrw x3, FG0, x0 + andi x3, x3, 8 + + /* Check if both registers w16 and w17 are equal to 0. 
+ x2 AND x3 == 0 <=> [w17,w16] != 0, x2 AND x3 != 0 <=> [w17,w16] == 0 */ + and x2, x2, x3 /* both limbs must be zero for [w17,w16] == 0 */ + + /* Retry if x2 != 0. */ + bne x2, x0, random_scalar_retry + + /* If we get here, then (seed0 + seed1) mod n is nonzero mod n; return. */ + + ret + +/** + * Generate the secret key d from a random seed. + * + * Flags: Flags have no meaning beyond the scope of this subroutine. + * + * @param[in] dmem[0]: dptr_d0, pointer to location in dmem containing + * 1st private key share d0 + * @param[in] dmem[4]: dptr_d1, pointer to location in dmem containing + * 2nd private key share d1 + * + * clobbered registers: x2, x3, x20, w4 to w14, w16 to w28 + * clobbered flag groups: FG0 + */ +.globl p384_generate_random_key +p384_generate_random_key: + /* Init all-zero register. */ + bn.xor w31, w31, w31 + + /* Generate a random scalar in two 448-bit shares. + [w7,w6] <= d0 + [w9,w8] <= d1 */ + jal x1, p384_random_scalar + + /* Write first share to DMEM. + dmem[d0] <= [w7,w6] = d0 */ + la x20, dptr_d0 + lw x20, 0(x20) + li x2, 6 + bn.sid x2++, 0(x20) + bn.sid x2++, 32(x20) + + /* Write second share to DMEM (x2 has advanced to select w8, w9). + dmem[d1] <= [w9,w8] = d1 */ + la x20, dptr_d1 + lw x20, 0(x20) + bn.sid x2++, 0(x20) + bn.sid x2++, 32(x20) + + ret + +/** + * Generate the secret scalar k from a random seed. + * + * Flags: Flags have no meaning beyond the scope of this subroutine. + * + * @param[in] dmem[0]: dptr_k0, pointer to location in dmem containing + * 1st scalar share k0 + * @param[in] dmem[4]: dptr_k1, pointer to location in dmem containing + * 2nd scalar share k1 + * + * clobbered registers: x2, x3, x20, w4 to w14, w16 to w28 + * clobbered flag groups: FG0 + */ +.globl p384_generate_k +p384_generate_k: + /* Init all-zero register. */ + bn.xor w31, w31, w31 + + /* Generate a random scalar in two 448-bit shares. + [w7,w6] <= k0 + [w9,w8] <= k1 */ + jal x1, p384_random_scalar + + /* Write first share to DMEM. 
+ dmem[k0] <= [w7,w6] = k0 */ + la x20, dptr_k0 + lw x20, 0(x20) + li x2, 6 + bn.sid x2++, 0(x20) + bn.sid x2++, 32(x20) + + /* Write second share to DMEM. + dmem[k1] <= [w9,w8] = k1 */ + la x20, dptr_k1 + lw x20, 0(x20) + bn.sid x2++, 0(x20) + bn.sid x2++, 32(x20) + + ret + +/* pointers */ +.section .data + +.balign 32 + +/* pointer to k0 (dptr_k0) */ +.globl dptr_k0 +.weak dptr_k0 +dptr_k0: + .zero 4 + +/* pointer to k1 (dptr_k1) */ +.globl dptr_k1 +.weak dptr_k1 +dptr_k1: + .zero 4 + +/* pointer to d0 (dptr_d0) */ +.globl dptr_d0 +.weak dptr_d0 +dptr_d0: + .zero 4 + +/* pointer to d1 (dptr_d1) */ +.globl dptr_d1 +.weak dptr_d1 +dptr_d1: + .zero 4 diff --git a/sw/otbn/crypto/p384_modinv.s b/sw/otbn/crypto/p384_modinv.s new file mode 100644 index 0000000000000..7d1712b8239c9 --- /dev/null +++ b/sw/otbn/crypto/p384_modinv.s @@ -0,0 +1,87 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ +/* + * P-384 specific routines for ECC modular inverse computation. + */ + + .section .text + +/** + * Variable-time modular multiplicative inverse computation + * + * returns x_inv = x^-1 mod m + * + * This routine computes the modular multiplicative inverse for any x < m in + * the finite field GF(m) where m is prime. + * + * For inverse computation, Fermat's little theorem is used, i.e. + * we compute x^-1 = x^(m-2) mod m. + * For exponentiation we use a standard, variable-time (!) square and multiply + * algorithm. + * + * This routine is mainly intended to be used for inversion of scalars in + * context of the P-384 curve. In theory, it can be used with any 384-bit + * modulus m with a corresponding 385-bit Barrett constant u, + * where u[383:192] = 0. + * + * Note: When used for P-384 scalar inversion, the routine will need 672 calls + * to the multiplication routine. 
By using an adder chain this could be reduced + * to ~433 multiplications, however, at the cost of a significant code size + * increase. + * + * Note: This routine runs in variable-time w.r.t. the modulus. It should only + * be used with a non-secret modulus. + * + * @param[in] [w13, w12]: m, 384 bit modulus + * @param[in] w14: k, Solinas constant (2^384 - m) (max. length 191 bits). + * @param[in] [w30, w29]: x, 384 bit operand + * @param[in] w31, all-zero + * @param[out] [w17, w16]: x_inv, modular multiplicative inverse + * + * Flags: Flags have no meaning beyond the scope of this subroutine. + * + * clobbered registers: x2, w2, w3, w10, w11, w16 to w24 + * clobbered flag groups: FG0 + */ + .globl mod_inv_n_p384 +mod_inv_n_p384: + + /* subtract 2 from modulus for Fermat's little theorem + [w3,w2] <= m - 2 = [w13,w12]-2 (left aligned) */ + bn.subi w2, w12, 2 + bn.subb w3, w13, w31 + bn.rshi w3, w3, w2 >> 128 + bn.rshi w2, w2, w31 >> 128 + + /* init square and multiply: [w17,w16] = 1 */ + bn.addi w16, w31, 1 + bn.mov w17, w31 + + /* square and multiply loop */ + loopi 384, 12 + + /* square: [w17,w16] <= [w17, w16]*[w11,w10] mod [w13, w12] */ + bn.mov w10, w16 + bn.mov w11, w17 + jal x1, p384_mulmod_n + + /* shift MSB into carry flag + [w3,w2] = 2*[w3,w2] = [w3,w2] << 1 */ + bn.add w2, w2, w2 + bn.addc w3, w3, w3 + + /* skip multiplication if FG0.C (bit 0 of CSR 0x7c0) is not set */ + csrrs x2, 0x7c0, x0 + andi x2, x2, 1 + beq x2, x0, nomul + + /* multiply: [w17,w16] <= [w17, w16]*[w30,w29] mod [w13, w12] */ + bn.mov w10, w29 + bn.mov w11, w30 + jal x1, p384_mulmod_n + + nomul: + nop + + ret diff --git a/sw/otbn/crypto/p384_scalar_mult.s b/sw/otbn/crypto/p384_scalar_mult.s new file mode 100644 index 0000000000000..ab8700a4ff96e --- /dev/null +++ b/sw/otbn/crypto/p384_scalar_mult.s @@ -0,0 +1,225 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. 
*/ +/* SPDX-License-Identifier: Apache-2.0 */ +/* + * P-384 specific routines for constant-time scalar multiplication. + */ + + .section .text + +/** + * Externally callable wrapper for P-384 scalar point multiplication + * + * Calculates R = k*P = k*(x_p, y_p) + * where R, P are valid P-384 curve points in affine coordinates, + * k is a 384-bit scalar. + * The x coordinate of R is arithmetically masked. + * Returns the masked x coordinate of R and the corresponding mask. + * + * Sets up context and calls the internal scalar multiplication routine. + * This routine runs in constant time. + * + * @param[in] dmem[0]: dptr_k0, pointer to location in dmem containing + * 1st scalar share k0 + * @param[in] dmem[4]: dptr_k1, pointer to location in dmem containing + * 2nd scalar share k1 + * @param[in] dmem[20]: dptr_x, pointer to affine x-coordinate in dmem + * @param[in] dmem[22]: dptr_y, pointer to affine y-coordinate in dmem + * @param[out] dmem[x]: masked x coordinate of R + * @param[out] dmem[y]: corresponding mask + * + * 384-bit quantities have to be provided in dmem in little-endian format, + * 512 bit aligned, with the highest 128 bit set to zero. + * + * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on + * the computed affine y-coordinate. + * + * clobbered registers: x2 to x4, x9 to x13, x17 to x21, x26 to x30 + * w0 to w31 + * clobbered flag groups: FG0 + */ +.globl p384_scalar_mult +p384_scalar_mult: + + /* Init all-zero register. 
*/ + bn.xor w31, w31, w31 + + /* set dmem pointer to point x-coordinate */ + la x20, dptr_x + lw x20, 0(x20) + + /* set dmem pointer to point y-coordinate */ + la x21, dptr_y + lw x21, 0(x21) + + /* set dmem pointer to 1st scalar share k0 */ + la x17, dptr_k0 + lw x17, 0(x17) + + /* set dmem pointer to 2nd scalar share k1 */ + la x19, dptr_k1 + lw x19, 0(x19) + + /* set dmem pointer to domain parameter b */ + la x28, p384_b + + /* set dmem pointer to scratchpad */ + la x30, scratchpad + + /* load domain parameter p (modulus) + [w13, w12] = p = dmem[p384_p] */ + li x2, 12 + la x3, p384_p + bn.lid x2++, 0(x3) + bn.lid x2++, 32(x3) + + /* load domain parameter n (order of base point) + [w11, w10] = n = dmem[p384_n] */ + li x2, 10 + la x3, p384_n + bn.lid x2++, 0(x3) + bn.lid x2++, 32(x3) + + /* scalar multiplication in projective space + [w30:w25] <= (x, y, z) = k * P */ + jal x1, scalar_mult_int_p384 + + /* Arithmetic masking: + 1. Generate a random mask r + 2. Subtract masks from projective x coordinate + (x, y, z) -> ((x - r) mod p, + y, + z) + 3. Convert masked curve point back to affine + form. + 4. Multiply mask with z^-1 for use in + affine space. */ + + /* Load domain parameter. + [w13,w12] = dmem[p384_p] */ + li x2, 12 + la x4, p384_p + bn.lid x2++, 0(x4) + bn.lid x2++, 32(x4) + + /* Fetch fresh randomness for the mask. + [w20, w19, w18] <= URND() = r */ + bn.wsrr w18, URND + bn.wsrr w19, URND + bn.wsrr w20, URND + + /* Reduce r mod p + [w19, w18] <= [w20, w19, w18] mod [w13, w12] = r mod p */ + jal x1, p384_reduce_p + bn.mov w18, w16 + bn.mov w19, w17 + + /* Arithmetic masking. 
+ [w26,w25] = A <= [w26,w25] - [w19,w18] mod [w13,w12] = x - r mod p */ + + /* [w9,w8] = A1 <= [w26,w25] - [w19,w18] = x - r */ + bn.sub w8, w25, w18 + bn.subb w9, w26, w19 + + /* [w7,w6] = A2 <= [w9,w8] + [w13,w12] = A1 + p = x - r + p */ + bn.add w6, w8, w12 + bn.addc w7, w9, w13 + + /* If x < r: [w26,w25] <= A2, else: [w26,w25] <= A1 */ + bn.sub w0, w25, w18 + bn.subb w1, w26, w19 + bn.sel w25, w6, w8, FG0.C + bn.sel w26, w7, w9, FG0.C + + /* Store mask to dmem for later use. + y coordinate is not required afterwards and therefore can be used + for the mask. */ + li x2, 18 + bn.sid x2++, 0(x21) + bn.sid x2, 32(x21) + + /* conversion into affine space + [w1, w0] <= z^-1 + [w28:w25] <= (x, y) */ + jal x1, proj_to_affine_p384 + + /* Get modular inverse z^-1 of projective z coordinate + and multiply the random masks with z^-1 to + also convert them into affine space. */ + + /* Load domain parameter. + [w13,w12] = dmem[p384_p] */ + li x2, 12 + la x4, p384_p + bn.lid x2++, 0(x4) + bn.lid x2++, 32(x4) + + /* Move previously stored mask r and z^-1 into input WDRs + for multiplication. */ + li x2, 10 + bn.lid x2++, 0(x21) + bn.lid x2, 32(x21) + bn.mov w16, w0 + bn.mov w17, w1 + + /* Compute affine mask by modular multiplication of r and z^-1. + [w17, w16] = r_a = r * z^-1 mod p */ + jal x1, p384_mulmod_p + + /* Store result in dmem. 
+ y coordinate is not required afterwards and + is therefore replaced by the affine mask r_a*/ + li x2, 25 + bn.sid x2++, 0(x20) + bn.sid x2, 32(x20) + li x2, 16 + bn.sid x2++, 0(x21) + bn.sid x2, 32(x21) + + ret + +/* pointers and scratchpad memory */ +.section .data + +.balign 32 + + /* pointer to k0 (dptr_k0) */ +.globl dptr_k0 +.weak dptr_k0 +dptr_k0: + .zero 4 + +/* pointer to k1 (dptr_k1) */ +.globl dptr_k1 +.weak dptr_k1 +dptr_k1: + .zero 4 + +/* pointer to d0 (dptr_d0) */ +.globl dptr_d0 +.weak dptr_d0 +dptr_d0: + .zero 4 + +/* pointer to d1 (dptr_d1) */ +.globl dptr_d1 +.weak dptr_d1 +dptr_d1: + .zero 4 + +/* pointer to X (dptr_x) */ +.globl dptr_x +.weak dptr_x +dptr_x: + .zero 4 + +/* pointer to Y (dptr_y) */ +.globl dptr_y +.weak dptr_y +dptr_y: + .zero 4 + +/* 704 bytes of scratchpad memory */ +.balign 32 +scratchpad: + .zero 704 diff --git a/sw/otbn/crypto/p384_sign.s b/sw/otbn/crypto/p384_sign.s index 21aaeff1fec9d..6422ba62c7884 100644 --- a/sw/otbn/crypto/p384_sign.s +++ b/sw/otbn/crypto/p384_sign.s @@ -2,884 +2,11 @@ /* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ /* SPDX-License-Identifier: Apache-2.0 */ /* - * P-384 specific routines for ECDSA signature generation and constant-time - * scalar multiplication. + * P-384 specific routines for ECDSA signature generation. */ .section .text -/** - * Convert projective coordinates of a P-384 curve point to affine coordinates - * - * returns P = (x_a, y_a) = (x/z mod p, y/z mod p) - * where P is a valid P-384 curve point, - * x_a and y_a are the resulting affine coordinates of the - * curve point, - * x,y and z are a set of projective coordinates of the - * point and - * p is the modulus of the P-384 underlying finite field. - * - * This routine computes the affine coordinates for a set of projective - * coordinates of a valid P-384 curve point. 
The routine performs the required - * divisions by computing the multiplicative modular inverse of the - * projective z-coordinate in the underlying finite field of the P-384 curve. - * For inverse computation Fermat's little theorem is used, i.e. - * we compute z^-1 = z^(p-2) mod p. - * For exponentiation a 16 step addition chain is used. - * Source of the addition chain is the addchain project: - * https://github.com/mmcloughlin/addchain/ - * - * Flags: Flags have no meaning beyond the scope of this subroutine. - * - * @param[in] [w26,w25]: x, x-coordinate of curve point (projective). - * @param[in] [w26,w25]: y, y-coordinate of curve point (projective). - * @param[in] [w30,w29]: z, z-coordinate of curve point (projective). - * @param[in] [w13, w12]: p, modulus of P-384. - * @param[in] w31: all-zero. - * @param[out] [w26, w25]: x_a, affine x-coordinate of resulting point. - * @param[out] [w28, w27]: y_a, affine y-coordinate of resulting point. - * - * clobbered registers: w0 to w28 - * clobbered flag groups: FG0 - */ -proj_to_affine_p384: - - /* Exp: 0b10 = 2*0b1 - Val: r10 = z^2 mod p - [w17,w16] <= [w30,w29]^2 mod [w13,w12] */ - bn.mov w10, w29 - bn.mov w11, w30 - bn.mov w16, w29 - bn.mov w17, w30 - jal x1, p384_mulmod_p - - /* Exp: 0b11 = 0b1+0b10 - Val: r11 <= z*r10 mod p - [w17,w16] <= [w30,w29]*[w17,w16] mod [w13,w12] */ - bn.mov w10, w29 - bn.mov w11, w30 - jal x1, p384_mulmod_p - - /* Exp: 0b110 = 2*0b11 - Val: r110 = r11^2 mod p - [w17,w16] <= [w17,w16]^2 mod [w13,w12] */ - bn.mov w10, w16 - bn.mov w11, w17 - jal x1, p384_mulmod_p - - /* Exp: 0b111 = 0b1+0b110 - Val: r111 <= z*r110 mod p - [w1,w0] = [w17,w16] <= [w30,w29]*[w17,w16] mod [w13,w12] */ - bn.mov w10, w29 - bn.mov w11, w30 - jal x1, p384_mulmod_p - bn.mov w0, w16 - bn.mov w1, w17 - - /* Exp: 0b111000 = 0b111<<3 - Val: r111000 <= r111^(2^3) mod p - [w17,w16] <= [w17,w16]^(2^3) mod [w13,w12] */ - loopi 3, 4 - bn.mov w10, w16 - bn.mov w11, w17 - jal x1, p384_mulmod_p - nop - - /* Exp: 0b1111111 = 
0b111+0b111000 - Val: r1111111 <= r111*r111000 mod p - [w3,w2] = [w17,w16] <= [w1,w0]*[w17,w16] mod [w13,w12] */ - bn.mov w10, w0 - bn.mov w11, w1 - jal x1, p384_mulmod_p - bn.mov w2, w16 - bn.mov w3, w17 - - /* Exp: 2^12-1 = (0b1111111<<6)+0b111111 - Val: r_12_1 <= r111111^(2^6)*r111111 mod p - [w5,w4] = [w17,w16] <= [w17,w16]^(2^6)*[w17,w16] mod [w13,w12] */ - loopi 6, 4 - bn.mov w10, w16 - bn.mov w11, w17 - jal x1, p384_mulmod_p - nop - bn.mov w10, w2 - bn.mov w11, w3 - jal x1, p384_mulmod_p - bn.mov w4, w16 - bn.mov w5, w17 - - /* Exp: 2^24-1 = ((2^12-1)<<12)+(2^12-1) - Val: r_24_1 <= r_12_1^(2^12)*r12_1 mod p - [w17,w16] <= [w17,w16]^(2^12)*[w5,w4] mod [w13,w12] */ - loopi 12, 4 - bn.mov w10, w16 - bn.mov w11, w17 - jal x1, p384_mulmod_p - nop - bn.mov w10, w4 - bn.mov w11, w5 - jal x1, p384_mulmod_p - - /* Exp: 2^30-1 = ((2^24-1)<<6)+0b111111 - Val: r_30_1 <= r_24_1^(2^6)*r111111 mod p - [w3, w2] = [w17,w16] <= [w17,w16]^(2^6)*[w3,w2] mod [w13,w12] */ - loopi 6, 4 - bn.mov w10, w16 - bn.mov w11, w17 - jal x1, p384_mulmod_p - nop - bn.mov w10, w2 - bn.mov w11, w3 - jal x1, p384_mulmod_p - bn.mov w2, w16 - bn.mov w3, w17 - - /* Exp: 2^31-1 <= (2^30-1)*2+0b1 - Val: r_31_1 <= r30_1^2*z mod p - [w7,w6] = [w17,w16] <= [w17,w16]^2*[w30,w29] mod [w13,w12] */ - bn.mov w10, w16 - bn.mov w11, w17 - jal x1, p384_mulmod_p - bn.mov w10, w29 - bn.mov w11, w30 - jal x1, p384_mulmod_p - bn.mov w6, w16 - bn.mov w7, w17 - - /* Exp: 2^32-1 <= (2^30-1)*2+0b1 - Val: r_32_1 <= r31_1^2*z mod p - [w9,w8] = [w17,w16] <= [w17,w16]^2*[w30,w29] mod [w13,w12] */ - bn.mov w10, w16 - bn.mov w11, w17 - jal x1, p384_mulmod_p - bn.mov w10, w29 - bn.mov w11, w30 - jal x1, p384_mulmod_p - bn.mov w9, w16 - bn.mov w8, w17 - - /* Exp: 2^63-1 <= ((2^32-1)<<31)+(2^31-1) - Val: r_63_1 <= r_32_1^(2^31)*r_31_1 mod p - [w7,w6] = [w17,w16] <= [w17,w16]^(2^31)*[w7,w6] mod [w13,w12] */ - loopi 31, 4 - bn.mov w10, w16 - bn.mov w11, w17 - jal x1, p384_mulmod_p - nop - bn.mov w10, w6 - bn.mov w11, w7 - jal x1, 
p384_mulmod_p - bn.mov w6, w16 - bn.mov w7,w17 - - /* Exp: 2^126-1 = ((2^63-1)<<63) + (2^63-1) - Val: r_126_1 <= r_63_1^(2^63)*r_63_1 mod p - [w7,w6] = [w17,w16] <= [w17,w16]^(2^63)*[w7,w6] mod [w13,w12] */ - loopi 63, 4 - bn.mov w10, w16 - bn.mov w11, w17 - jal x1, p384_mulmod_p - nop - bn.mov w10, w6 - bn.mov w11, w7 - jal x1, p384_mulmod_p - bn.mov w6, w16 - bn.mov w7, w17 - - /* Exp: 2^252-1 = ((2^126-1)<<126)+(2^126-1) - Val: r_252_1 <= r_126_1^(2^63)*r_126_1 mod p - [w17,w16] <= [w17,w16]^(2^126)*[w7,w6] mod [w13,w12] */ - loopi 126, 4 - bn.mov w10, w16 - bn.mov w11, w17 - jal x1, p384_mulmod_p - nop - bn.mov w10, w6 - bn.mov w11, w7 - jal x1, p384_mulmod_p - - /* Exp: 2^255-1 = ((2^252-1)<<3)+0b111 - Val: r_255_1 <= r_252_1^(2^3)*r111 mod p - [w17,w16] <= [w17,w16]^(2^3)*[w1,w0] mod [w13,w12] */ - loopi 3, 4 - bn.mov w10, w16 - bn.mov w11, w17 - jal x1, p384_mulmod_p - nop - bn.mov w10, w0 - bn.mov w11, w1 - jal x1, p384_mulmod_p - - /* Exp: p-2 = ((((((2^255-1)<<33)+(2^32-1))<<94)+(2^30-1))<<2)+0b1 - Val: x_inv <=((r_255_1^(2^33)*r_32_1)^(2^94)*r_30_1)^(2^2)*z mod p - [w17,w16] <= (([w17,w16]^(2^33)*[w9,w8])^(2^94)*[w3,w2])^(2^2) - *[w30,w29] mod [w13,w12] */ - loopi 33, 4 - bn.mov w10, w16 - bn.mov w11, w17 - jal x1, p384_mulmod_p - nop - bn.mov w10, w9 - bn.mov w11, w8 - jal x1, p384_mulmod_p - loopi 94, 4 - bn.mov w10, w16 - bn.mov w11, w17 - jal x1, p384_mulmod_p - nop - bn.mov w10, w2 - bn.mov w11, w3 - jal x1, p384_mulmod_p - loopi 2, 4 - bn.mov w10, w16 - bn.mov w11, w17 - jal x1, p384_mulmod_p - nop - bn.mov w10, w29 - bn.mov w11, w30 - jal x1, p384_mulmod_p - - /* store inverse [w1,w0] <= [w17,w16] = z_inv*/ - bn.mov w0, w16 - bn.mov w1, w17 - - /* convert x-coordinate to affine space - [w26,w25] <= [w17,w16] = x_a <= x/z = x*z_inv = [w26,w25]*[w1,w0] mod p */ - bn.mov w10, w25 - bn.mov w11, w26 - jal x1, p384_mulmod_p - bn.mov w25, w16 - bn.mov w26, w17 - - /* convert y-coordinate to affine space - [w28,w27] <= [w17,w16] = y_a <= y/z = y*z_inv = 
[w28,w27]*[w1,w0] mod p */ - bn.mov w10, w27 - bn.mov w11, w28 - bn.mov w16, w0 - bn.mov w17, w1 - jal x1, p384_mulmod_p - bn.mov w27, w16 - bn.mov w28, w17 - - ret - - -/** - * Fetch curve point from dmem, randomize z-coordinate and store point in dmem - * - * returns P = (x, y, z) = (x_a*z, y_a*z, z) - * with P being a valid P-384 curve point in projective coordinates - * x_a and y_a being the affine coordinates as fetched from dmem - * z being a randomized z-coordinate - * - * This routines fetches the affine x- and y-coordinates of a curve point from - * dmem and computes a valid set of projective coordinates. The z-coordinate is - * randomized and x and y are scaled appropriately. The resulting projective - * coordinates are stored at dmem[dptr_p_p] using 6 consecutive 256-bit cells, - * i.e. each coordinate is stored 512 bit aligned, little endian. - * This routine runs in constant time. - * - * @param[in] x20: dptr_x, pointer to dmem location containing affine - * x-coordinate of input point - * @param[in] x21: dptr_y, pointer to dmem location containing affine - * y-coordinate of input point - * @param[in] [w15, w14]: u[383:0] lower 384 bit of Barrett constant u for - * modulus p - * @param[in] [w13, w12]: p, modulus of P-384 underlying finite field - * @param[in] w31: all-zero - * @param[in] x18: dptr_p_p, pointer to dmem location to store resulting point - * in projective space - * - * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on - * the upper limb of projective y-coordinate. 
- * - * clobbered registers: x10, x11 to x13 - * w2, w3, w8 to w11, w16 to w24, w29, w30 - * clobbered flag groups: FG0 - */ -store_proj_randomize: - - /* get a 384-bit random number from URND - [w3, w2] = random(384) */ - bn.wsrr w2, 2 - bn.wsrr w3, 2 - bn.rshi w3, w31, w3 >> 128 - - /* reduce random number - [w2, w3] = z <= [w2, w3] mod p */ - bn.sub w10, w2, w12 - bn.subb w11, w3, w13 - bn.sel w2, w2, w10, C - bn.sel w3, w3, w11, C - - bn.mov w10, w2 - bn.mov w11, w3 - - /* store z-coordinate - dmem[x20+128] = [w10, w11] */ - li x10, 10 - li x11, 11 - bn.sid x10, 128(x18) - bn.sid x11, 160(x18) - - /* fetch x-coordinate from dmem - [w16, w17] = x <= [dmem[dptr_x], dmem[dptr_x+32]] */ - li x12, 16 - li x13, 17 - bn.lid x12, 0(x20) - bn.lid x13, 32(x20) - - /* scale and store x-coordinate - [dmem[dptr_p_p], dmem[dptr_p_p+32]] = [w17, w16] = - x_p <= [w11, w10] * [w17, w16] = z*x mod p */ - - jal x1, p384_mulmod_p - bn.sid x12, 0(x18) - bn.sid x13, 32(x18) - - /* fetch y-coordinate from dmem - [w11, w10] = x <= [dmem[dptr_y], dmem[dptr_y+32]] */ - bn.lid x12, 0(x21) - bn.lid x13, 32(x21) - - /* scale and store y-coordinate - [dmem[dptr_p_p+64], dmem[dptr_p_p+96]] = [w17, w16] = - y_p <= [w11, w10] * [w17, w16] = z*y mod p */ - bn.mov w10, w2 - bn.mov w11, w3 - jal x1, p384_mulmod_p - bn.sid x12, 64(x18) - bn.sid x13, 96(x18) - - ret - - -/** - * P-384 scalar point multiplication in affine space - * - * returns R = k*P = k*(x_p, y_p) - * where R, P are valid P-384 curve points in affine coordinates, - * k is a 384-bit scalar. - * - * This routine performs scalar multiplication based on the group laws - * of Weierstrass curves. - * A constant time double-and-add algorithm (sometimes referred to as - * double-and-add-always) is used. - * Due to the P-384 optimized implementations of the internally called routines - * for point addition and doubling, this routine is limited to P-384 curves. 
- * The routine makes use of blinding by additive splitting the - * exponent/scalar d into two shares. The double-and-add loop operates on both - * shares in parallel applying Shamir's trick. - * - * @param[in] x9: dptr_rnd, pointer to location in dmem containing random - * number to be used for additive splitting of scalar - * @param[in] x19: dptr_k, pointer to scalar k (0 < k < n) in dmem - * @param[in] x20: dptr_x, pointer to affine x-coordinate in dmem - * @param[in] x21: dptr_y, pointer to affine y-coordinate in dmem - * @param[in] x28: dptr_b, pointer to domain parameter b of P-384 in dmem - * @param[in] x30: dptr_sp, pointer to 704 bytes of scratchpad memory in dmem - * @param[in] [w13, w12]: p, modulus of P-384 underlying finite field - * @param[in] [w11, w10]: n, domain parameter of P-384 curve - * (order of base point G) - * @param[in] w31: all-zero - * @param[out] [w26, w25]: x_a, affine x-coordinate of resulting point R. - * @param[out] [w28, w26]: y_a, affine y-coordinate of resulting point R. - * - * Scratchpad memory layout: - * The routine expects at least 704 bytes of scratchpad memory at dmem - * location 'scratchpad' (sp). Internally the scratchpad is used as follows: - * dptr_sp .. dptr_sp+191: point P, projective - * dptr_sp+192 .. dptr_sp+255: s0, 1st share of scalar - * dptr_sp+256 .. dptr_sp+447: point 2P, projective - * dptr_sp+448 .. dptr_sp+511: s1, 2nd share of scalar - * dptr_sp+512 .. dptr_sp+703: point Q, projective - * - * Projective coordinates of a point are kept in dmem in little endian format - * with the individual coordinates 512 bit aligned. The coordinates are stored - * in x,y,z order (i.e. x at lowest, z at highest address). Thus, a 384 bit - * curve point occupies 6 consecutive 256-bit dmem cells. - * - * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on - * the computed affine y-coordinate. 
- * - * clobbered registers: x2, x10, x11 to x13, x18, x26, x27, w0 to w30 - * clobbered flag groups: FG0 - */ -scalar_mult_int_p384: - - /* set regfile pointers to in/out regs of Barrett routine. Set here to avoid - resetting in very call to point addition routine */ - li x22, 10 - li x23, 11 - li x24, 16 - li x25, 17 - - /* fetch externally supplied random number from dmem - [w1, w0] = dmem[dptr_rnd] = [dmem[x9], dmem[x9+32]] = rnd */ - li x2, 0 - bn.lid x2++, 0(x9) - bn.lid x2++, 32(x9) - - /* 1st share (reduced rnd) - s0 = [w1, w0] <= rnd mod n = [w1, w0] mod [w11, w10] */ - bn.sub w9, w0, w10 - bn.subb w8, w1, w11 - bn.sel w0, w0, w9, C - bn.sel w1, w1, w8, C - - /* load scalar k from dmem - [w3, w2] = k <= dmem[dptr_k] = [dmem[x19], dmem[x19+32]] */ - bn.lid x2++, 0(x19) - bn.lid x2, 32(x19) - - /* 2nd share (k-s0) - s1 = [w3, w2] <= k - s0 mod n = [w2, w3] - [w1, w0] mod [w11, w10] */ - bn.sub w2, w2, w0 - bn.subb w3, w3, w1 - bn.add w8, w2, w10 - bn.addc w9, w3, w11 - bn.sel w2, w8, w2, C - bn.sel w3, w9, w3, C - - /* left align both shares for probing of MSB in loop body */ - bn.rshi w1, w1, w0 >> 128 - bn.rshi w0, w0, w31 >> 128 - bn.rshi w3, w3, w2 >> 128 - bn.rshi w2, w2, w31 >> 128 - - /* store shares in scratchpad */ - li x2, 0 - bn.sid x2++, 192(x30) - bn.sid x2++, 224(x30) - bn.sid x2++, 448(x30) - bn.sid x2++, 480(x30) - - /* get randomized projective coodinates of curve point - P = (x_p, y_p, z_p) = dmem[dptr_sp] = (x*z mod p, y*z mod p, z) */ - add x18, x30, 0 - jal x1, store_proj_randomize - - /* double point P - 2P = ([w30,w29], [w28,w27], [w26, w25]) <= 2*P */ - add x27, x30, x0 - add x26, x30, x0 - jal x1, proj_add_p384 - - /* store point 2P in scratchpad @w30+256 - dmem[dptr_sc+256] = [w30:w25] = 2P */ - li x2, 25 - bn.sid x2++, 256(x30) - bn.sid x2++, 288(x30) - bn.sid x2++, 320(x30) - bn.sid x2++, 352(x30) - bn.sid x2++, 384(x30) - bn.sid x2++, 416(x30) - - /* init point Q = (0,1,0) for double-and-add in scratchpad */ - /* dmem[x26] = 
dmem[dptr_sc+512] = Q = (0,1,0) */ - addi x26, x30, 512 - li x2, 30 - bn.addi w30, w31, 1 - bn.sid x2++, 64(x26) - bn.sid x2, 0(x26) - bn.sid x2, 32(x26) - bn.sid x2, 96(x26) - bn.sid x2, 128(x26) - bn.sid x2, 160(x26) - - /* double-and-add loop with decreasing index */ - loopi 384, 85 - - /* double point Q - Q = ([w30,w29], [w28,w27], [w26, w25]) <= Q + dmem[x27] */ - add x27, x26, x0 - jal x1, proj_add_p384 - - /* store Q in dmem - dmem[x26] = dmem[dptr_sc+512] <= [w30:w25] */ - li x2, 25 - bn.sid x2++, 0(x26) - bn.sid x2++, 32(x26) - bn.sid x2++, 64(x26) - bn.sid x2++, 96(x26) - bn.sid x2++, 128(x26) - bn.sid x2++, 160(x26) - - /* Probe if MSb of either of the two scalars (rnd or d-rnd) but not both - is 1. - If only one MSb is set, select P for addition. - If both MSbs are set, select 2P for addition. - (If neither MSB is set, 2P will be selected but result discarded.) */ - li x2, 0 - bn.lid x2++, 224(x30) - bn.lid x2, 480(x30) - bn.xor w8, w0, w1 - /* Create conditional offeset into scratchpad. - if (s0[512] xor s1[512]) x27 <= x30 else x27 <= x30+256 */ - csrrs x3, 0x7c0, x0 - xori x3, x3, -1 - andi x3, x3, 2 - slli x27, x3, 7 - add x27, x27, x30 - - /* Reload randomized projective coodinates for curve point P. - P = (x_p, y_p, z_p) = dmem[dptr_sp] <= (x*z mod p, y*z mod p, z) */ - jal x1, store_proj_randomize - - /* Add points Q+P or Q+2P depending on offset in x27. 
- Q_a = ([w30,w29], [w28,w27], [w26, w25]) <= Q + dmem[x27] */ - jal x1, proj_add_p384 - - /* load shares from scratchpad - [w1, w0] = s0; [w3, w2] = s1 */ - li x2, 0 - bn.lid x2++, 192(x30) - bn.lid x2++, 224(x30) - bn.lid x2++, 448(x30) - bn.lid x2++, 480(x30) - - /* M = s0[511] | s1[511] */ - bn.or w8, w1, w3 - - /* load q from scratchpad - Q = ([w9,w8], [w7,w6], [w5,w4]) <= dmem[x26] */ - li x2, 4 - bn.lid x2++, 0(x26) - bn.lid x2++, 32(x26) - bn.lid x2++, 64(x26) - bn.lid x2++, 96(x26) - bn.lid x2++, 128(x26) - bn.lid x2++, 160(x26) - - /* select either Q or Q_a - if M: Q = ([w30,w29], [w28,w27], [w26, w25]) <= Q else: Q <= Q_a */ - bn.sel w25, w25, w4, M - bn.sel w26, w26, w5, M - bn.sel w27, w27, w6, M - bn.sel w28, w28, w7, M - bn.sel w29, w29, w8, M - bn.sel w30, w30, w9, M - - /* store Q in dmem - dmem[x26] = dmem[dptr_sc+512] <= [w30:w25] */ - li x2, 25 - bn.sid x2++, 0(x26) - bn.sid x2++, 32(x26) - bn.sid x2++, 64(x26) - bn.sid x2++, 96(x26) - bn.sid x2++, 128(x26) - bn.sid x2++, 160(x26) - - /* left shift both shares - s0 <= s0 << 1 ; s1 <= s1 << 1 */ - bn.add w0, w0, w0 - bn.addc w1, w1, w1 - bn.add w2, w2, w2 - bn.addc w3, w3, w3 - /* store both shares in scratchpad */ - li x2, 0 - bn.sid x2++, 192(x30) - bn.sid x2++, 224(x30) - bn.sid x2++, 448(x30) - bn.sid x2++, 480(x30) - - - /* Get a fresh random number from URND and scale the coordinates of 2P. - (scaling each proj. 
coordinate by same factor results in same point) */ - - /* get a 384-bit random number from URND */ - bn.wsrr w2, 2 - bn.wsrr w3, 2 - bn.rshi w3, w31, w3 >> 128 - - /* reduce random number - [w2, w3] = z <= [w2, w3] mod p */ - bn.sub w10, w2, w12 - bn.subb w11, w3, w13 - bn.sel w2, w2, w10, C - bn.sel w3, w3, w11, C - - /* scale all coordinates in scratchpad */ - li x2, 16 - li x3, 17 - /* x-coordinate */ - bn.mov w10, w2 - bn.mov w11, w3 - bn.lid x2, 256(x30) - bn.lid x3, 288(x30) - jal x1, p384_mulmod_p - bn.sid x2, 256(x30) - bn.sid x3, 288(x30) - /* y-coordinate */ - bn.mov w10, w2 - bn.mov w11, w3 - bn.lid x2, 320(x30) - bn.lid x3, 352(x30) - jal x1, p384_mulmod_p - bn.sid x2, 320(x30) - bn.sid x3, 352(x30) - /* z-coordinate */ - bn.mov w10, w2 - bn.mov w11, w3 - bn.lid x2, 384(x30) - bn.lid x3, 416(x30) - jal x1, p384_mulmod_p - bn.sid x2, 384(x30) - bn.sid x3, 416(x30) - - /* convert coordinates to affine space */ - jal x1, proj_to_affine_p384 - - ret - - -/** - * Externally callable wrapper for P-384 scalar point multiplication - * - * returns R = k*P = k*(x_p, y_p) - * where R, P are valid P-384 curve points in affine coordinates, - * k is a 384-bit scalar.. - * - * Sets up context and calls the internal scalar multiplication routine. - * This routine runs in constant time. - * - * @param[in] dmem[0]: dK, pointer to location in dmem containing scalar k - * @param[in] dmem[4]: dRnd, pointer to location in dmem containing random - * number for blinding - * @param[in] dmem[20]: dptr_x, pointer to affine x-coordinate in dmem - * @param[in] dmem[22]: dptr_y, pointer to affine y-coordinate in dmem - * - * 384-bit quantities have to be provided in dmem in little-endian format, - * 512 bit aligned, with the highest 128 bit set to zero. - * - * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on - * the computed affine y-coordinate. 
- * - * clobbered registers: x2, x3, x9 to x13, x18 to x21, x26 to x30 - * w0 to w30 - * clobbered flag groups: FG0 - */ -.globl scalar_mult_p384 -scalar_mult_p384: - - /* set dmem pointer to point x-coordinate */ - la x20, dptr_x - lw x20, 0(x20) - - /* set dmem pointer to point y-coordinate */ - la x21, dptr_y - lw x21, 0(x21) - - /* set dmem pointer to scalar k */ - la x19, dptr_k - lw x19, 0(x19) - - /* set pointer to blinding parameter */ - la x9, dptr_rnd - lw x9, 0(x9) - - /* set dmem pointer to domain parameter b */ - la x28, p384_b - - /* set dmem pointer to scratchpad */ - la x30, scratchpad - - /* load domain parameter p (modulus) - [w13, w12] = p = dmem[p384_p] */ - li x2, 12 - la x3, p384_p - bn.lid x2++, 0(x3) - bn.lid x2++, 32(x3) - - /* load domain parameter n (order of base point) - [w11, w10] = n = dmem[p384_n] */ - li x2, 10 - la x3, p384_n - bn.lid x2++, 0(x3) - bn.lid x2++, 32(x3) - - /* init all-zero reg */ - bn.xor w31, w31, w31 - - jal x1, scalar_mult_int_p384 - - /* store result in dmem */ - li x2, 25 - bn.sid x2++, 0(x20) - bn.sid x2++, 32(x20) - bn.sid x2++, 0(x21) - bn.sid x2++, 32(x21) - - ret - -/** - * Externally callable routine for P-384 base point multiplication - * - * returns Q = d (*) G - * where Q is a resulting valid P-384 curve point in affine - * coordinates, - * G is the base point of curve P-384, and - * d is a 384-bit scalar. - * - * Sets up context and calls the internal scalar multiplication routine. - * This routine runs in constant time. - * - * @param[in] dmem[0]: dptr_d, pointer to location in dmem containing - * scalar d. - * @param[in] dmem[20]: dptr_x, pointer to result buffer for x-coordinate - * @param[in] dmem[24]: dptr_y, pointer to result buffer for y-coordinate - * @param[in] dmem[28]: dptr_rnd, pointer to location in dmem containing - * random number for blinding. - * - * 384-bit quantities have to be provided in dmem in little-endian format, - * 512 bit aligned, with the highest 128 bit set to zero. 
- * - * Flags: When leaving this subroutine, the M, L and Z flags of FG0 correspond - * to the computed affine y-coordinate. - * - * clobbered registers: x2, x3, x9 to x13, x18 to x21, x26 to x30 - * w0 to w30 - * clobbered flag groups: FG0 - */ -.globl p384_base_mult -p384_base_mult: - - /* set dmem pointer to x-coordinate of base point*/ - la x20, p384_gx - - /* set dmem pointer to y-coordinate of base point */ - la x21, p384_gy - - /* set dmem pointer to scalar d */ - la x19, dptr_d - lw x19, 0(x19) - - /* set pointer to blinding parameter */ - la x9, dptr_rnd - lw x9, 0(x9) - - /* set dmem pointer to domain parameter b */ - la x28, p384_b - - /* set dmem pointer to scratchpad */ - la x30, scratchpad - - /* load domain parameter p (modulus) - [w13, w12] = p = dmem[p384_p] */ - li x2, 12 - la x3, p384_p - bn.lid x2++, 0(x3) - bn.lid x2++, 32(x3) - - /* load domain parameter n (order of base point) - [w11, w10] = n = dmem[p384_n] */ - li x2, 10 - la x3, p384_n - bn.lid x2++, 0(x3) - bn.lid x2++, 32(x3) - - /* init all-zero reg */ - bn.xor w31, w31, w31 - - jal x1, scalar_mult_int_p384 - - /* set dmem pointer to point x-coordinate */ - la x20, dptr_x - lw x20, 0(x20) - - /* set dmem pointer to point y-coordinate */ - la x21, dptr_y - lw x21, 0(x21) - - /* store result in dmem */ - li x2, 25 - bn.sid x2++, 0(x20) - bn.sid x2++, 32(x20) - bn.sid x2++, 0(x21) - bn.sid x2++, 32(x21) - - ret - - -/** - * Variable-time modular multiplicative inverse computation - * - * returns x_inv = x^-1 mod m - * - * This routine computes the modular multiplicative inverse for any x < m in - * the finite field GF(m) where m is prime. - * - * For inverse computation, Fermat's little theorem is used, i.e. - * we compute x^-1 = x^(m-2) mod m. - * For exponentiation we use a standard, variable-time (!) square and multiply - * algorithm. - * - * This routine is mainly intended to be used for inversion of scalars in - * context of the P-384 curve. 
In theory, it can be used with any 384-bit - * modulus m with a corresponding 385-bit Barrett constant u, - * where u[383:192] = 0. - * - * Note: When used for P-384 scalar inversion, the routine will need 672 calls - * to the multiplication routine. By using an adder chain this could be reduced - * to ~433 multiplications, however, at the cost of a significant codes size - * increase. - * - * Note: This routine runs in variable-time w.r.t. the modulus. It should only - * be used with a non-secret modulus. - * - * @param[in] [w13, w12]: m, 384 bit modulus - * @param[in] w14: k, Solinas constant (2^384 - m) (max. length 191 bits). - * @param[in] [w30, w29]: x, 384 bit operand - * @param[in] w31, all-zero - * @param[out] [w17, w16]: x_inv, modular multiplicative inverse - * - * Flags: Flags have no meaning beyond the scope of this subroutine. - * - * clobbered registers: x2, w2, w3, w10, w11, w16 to w24 - * clobbered flag groups: FG0 - */ -mod_inv_n_p384: - - /* subtract 2 from modulus for Fermat's little theorem - [w13,w12] <= m - 2 = [w11,w10]-2 (left aligned) */ - bn.subi w2, w12, 2 - bn.subb w3, w13, w31 - bn.rshi w3, w3, w2 >> 128 - bn.rshi w2, w2, w31 >> 128 - - /* init square and multiply: [w17,w16] = 1 */ - bn.addi w16, w31, 1 - bn.mov w17, w31 - - /* square and multiply loop */ - loopi 384, 12 - - /* square: [w17,w16] <= [w17, w16]*[w11,w10] mod [w13, w12] */ - bn.mov w10, w16 - bn.mov w11, w17 - jal x1, p384_mulmod_n - - /* shift MSB into carry flag - [w3,w2] = 2*[w3,w2] = [w3,w2] << 1 */ - bn.add w2, w2, w2 - bn.addc w3, w3, w3 - - /* skip multiplication if C flag not set */ - csrrs x2, 0x7c0, x0 - andi x2, x2, 1 - beq x2, x0, nomul - - /* multiply: [w17,w16] <= [w17, w16]*[w30,w29] mod [w13, w12] */ - bn.mov w10, w29 - bn.mov w11, w30 - jal x1, p384_mulmod_n - - nomul: - nop - - ret - - /** * P-384 ECDSA signature generation * @@ -895,19 +22,23 @@ mod_inv_n_p384: * * This routine runs in constant time. 
* - * @param[in] dmem[0]: dptr_k, pointer to a 384 bit random secret in dmem - * @param[in] dmem[4]: dptr_rnd, pointer to location in dmem containing - * a 384-bit random number for blinding + * @param[in] dmem[0]: dptr_k0, pointer to location in dmem containing + * 1st scalar share k0 + * @param[in] dmem[4]: dptr_k1, pointer to location in dmem containing + * 2nd scalar share k1 * @param[in] dmem[8]: dptr_msg, pointer to the message to be signed in dmem * @param[in] dmem[12]: dptr_r, pointer to dmem location where s component * of signature will be placed * @param[in] dmem[16]: dptr_s, pointer to dmem location where r component * of signature will be placed - * @param[in] dmem[28]: dptr_d, pointer to private key d in dmem + * @param[in] dmem[28]: dptr_d0, pointer to location in dmem containing + * 1st private key share d0 + * @param[in] dmem[32]: dptr_d1, pointer to location in dmem containing + * 2nd private key share d1 * * Flags: Flags have no meaning beyond the scope of this subroutine. 
* - * clobbered registers: x2, x3, x9 to x13, x18 to x28, x30 + * clobbered registers: x2 to x6, x9 to x15, x17 to x28, x30 * w0 to w31 * clobbered flag groups: FG0 */ @@ -925,13 +56,33 @@ p384_sign: /* set dmem pointer to base point y-coordinate */ la x21, p384_gy - /* set dmem pointer to secret random scalar k */ - la x19, dptr_k + /* set dmem pointer to 1st scalar share k0 */ + la x17, dptr_k0 + lw x17, 0(x17) + + /* set dmem pointer to 2nd scalar share k1 */ + la x19, dptr_k1 lw x19, 0(x19) - /* set pointer to blinding parameter */ - la x9, dptr_rnd - lw x9, 0(x9) + /* set dmem pointer to 1st private key share d0 */ + la x4, dptr_d0 + lw x4, 0(x4) + + /* set dmem pointer to 2nd private key share d1 */ + la x5, dptr_d1 + lw x5, 0(x5) + + /* set dmem pointer to message msg */ + la x6, dptr_msg + lw x6, 0(x6) + + /* set dmem pointer to signature r */ + la x14, dptr_r + lw x14, 0(x14) + + /* set dmem pointer to signature s */ + la x15, dptr_s + lw x15, 0(x15) /* set dmem pointer to scratchpad */ la x30, scratchpad @@ -950,25 +101,19 @@ p384_sign: bn.lid x2++, 0(x3) bn.lid x2++, 32(x3) - /* scalar multiplication with base point - [w28:w25] <= (x_1, y_1) = k*G */ + /* scalar multiplication with base point and + conversion of projective coordinates to affine space + [w28:w25] <= (x_1, y_1) = (k*alpha) * G */ jal x1, scalar_mult_int_p384 + jal x1, proj_to_affine_p384 /* store r of signature in dmem: dmem[dptr_r] <= r = [w26,w25] */ li x2, 25 - la x3, dptr_r - lw x3, 0(x3) - bn.sid x2++, 0(x3) - bn.sid x2++, 32(x3) - - /* load secret random number k from dmem - [w30,w29] <= k = dmem[dptr_k] */ - li x2, 29 - bn.lid x2++, 0(x19) - bn.lid x2++, 32(x19) + bn.sid x2++, 0(x14) + bn.sid x2++, 32(x14) /* load domain parameter n (order of base point) - [w13, w12] <= p = dmem[p384_n] */ + [w13, w12] <= n = dmem[p384_n] */ li x2, 12 la x3, p384_n bn.lid x2++, 0(x3) @@ -979,40 +124,126 @@ p384_sign: w14 <= 2^256 - n[255:0] = (2^384 - n) mod (2^256) = 2^384 - n */ bn.sub w14, w31, 
w12 + /* Multiplicative masking of shares k0 and k1 */ + + /* Generate a random 127-bit number. + w4 <= URND()[255:129] */ + bn.wsrr w4, URND + bn.rshi w4, w31, w4 >> 129 + + /* Add 1 to get a 128-bit nonzero scalar for masking. + w4 <= w4 + 1 = alpha */ + bn.addi w4, w4, 1 + + /* load 1st share k0 from dmem + [w11,w10] <= k0 = dmem[dptr_k0] */ + li x2, 10 + bn.lid x2++, 0(x17) + bn.lid x2++, 32(x17) + + /* [w26,w25] <= ([w11,w10] * w4) mod n = (k0 * alpha) mod n */ + bn.mov w16, w4 + jal x1, p384_mulmod448x128_n + bn.mov w25, w16 + bn.mov w26, w17 + + /* load 2nd share k1 from dmem + [w11,w10] <= k1 = dmem[dptr_k1] */ + li x2, 10 + bn.lid x2++, 0(x19) + bn.lid x2++, 32(x19) + + /* [w28,w27] <= ([w11,w10] * w4) mod n = (k1 * alpha) mod n */ + bn.mov w16, w4 + jal x1, p384_mulmod448x128_n + bn.mov w27, w16 + bn.mov w28, w17 + + /* Multiplicative masking of shares d0 and d1 */ + + /* load 1st share d0 from dmem + [w11,w10] <= d0 = dmem[dptr_d0] */ + li x2, 10 + bn.lid x2++, 0(x4) + bn.lid x2++, 32(x4) + + /* [w7,w6] <= ([w11,w10] * w4) mod n = (d0 * alpha) mod n */ + bn.mov w16, w4 + jal x1, p384_mulmod448x128_n + bn.mov w6, w16 + bn.mov w7, w17 + + /* load 2nd share d1 from dmem + [w11,w10] <= d1 = dmem[dptr_d1] */ + li x2, 10 + bn.lid x2++, 0(x5) + bn.lid x2++, 32(x5) + + /* [w9,w8] <= ([w11,w10] * w4) mod n = (d1 * alpha) mod n */ + bn.mov w16, w4 + jal x1, p384_mulmod448x128_n + bn.mov w8, w16 + bn.mov w9, w17 + + /* Multiplicative masking of message msg */ + + /* load message from dmem + [w11, w10] <= msg = dmem[dptr_msg] */ + li x2, 10 + bn.lid x2++, 0(x6) + bn.lid x2++, 32(x6) + + /* [w1,w0] <= ([w11,w10] * w4) mod n = (msg * alpha) mod n */ + bn.mov w16, w4 + jal x1, p384_mulmod448x128_n + bn.mov w0, w16 + bn.mov w1, w17 + + /* Compute (k*alpha) mod n = (k0*alpha + k1*alpha) mod n + [w17,w16] <= k*alpha = [w26,w25] + [w28,w27] mod n */ + bn.add w18, w27, w25 + bn.addc w19, w28, w26 + bn.mov w20, w31 + jal x1, p384_reduce_n + /* modular multiplicative inverse 
of k - [w3, w2] <= [w17, w16] <= k^(-1) mod n */ + [w3, w2] <= [w17, w16] <= (k*alpha)^(-1) mod n */ + bn.mov w29, w16 + bn.mov w30, w17 jal x1, mod_inv_n_p384 bn.mov w2, w16 bn.mov w3, w17 - /* load private key d from dmem - [w11,w10] <= d = dmem[dptr_d] */ - li x2, 10 - la x3, dptr_d - lw x3, 0(x3) - bn.lid x2++, 0(x3) - bn.lid x2++, 32(x3) + /* Compute (d*alpha) mod n = (d0*alpha + d1*alpha) mod n + [w17,w16] <= d*alpha = [w7,w6] + [w9,w8] mod n */ + bn.add w18, w8, w6 + bn.addc w19, w9, w7 + bn.mov w20, w31 + jal x1, p384_reduce_n - /* [w17, w16] <= k^(-1)*d mod n = [w17, w16] * [w11, w10] mod [w13, w12] */ + /* [w17, w16] <= (k*alpha)^(-1)*d*alpha mod n = [w3, w2] * [w17, w16] mod [w13, w12] */ + bn.mov w10, w2 + bn.mov w11, w3 jal x1, p384_mulmod_n + /* load r of signature from dmem + [w11,w10] <= r = dmem[dptr_r] */ + li x2, 10 + bn.lid x2++, 0(x14) + bn.lid x2++, 32(x14) + /* [w5, w4] <= [w17, w16] - <= r * (k^(-1)*d) mod n = [w26, w25] * [w17, w16] mod [w13, w12] */ - bn.mov w10, w25 - bn.mov w11, w26 + <= r * (k^(-1)*d) mod n = r * ((k*alpha)^(-1)*d*alpha) mod n = + = [w11, w10] * [w17, w16] mod [w13, w12] */ jal x1, p384_mulmod_n bn.mov w4, w16 bn.mov w5, w17 - /* load message from dmem - [w11, w10] <= msg = dmem[dptr_msg] */ - li x2, 10 - la x3, dptr_msg - lw x3, 0(x3) - bn.lid x2++, 0(x3) - bn.lid x2++, 32(x3) - - /* [w17, w16] <= k^(-1) * msg = [w3, w2]*[w17, w16] mod n */ + /* [w17, w16] <= k^(-1) * msg = + = (k*alpha)^(-1) * msg*alpha = + = [w3, w2]*[w1, w0] mod n */ + bn.mov w10, w0 + bn.mov w11, w1 bn.mov w16, w2 bn.mov w17, w3 jal x1, p384_mulmod_n @@ -1029,10 +260,8 @@ p384_sign: /* store s of signature in dmem: dmem[dptr_s] <= s = [w28, w27] */ li x2, 27 - la x3, dptr_s - lw x3, 0(x3) - bn.sid x2++, 0(x3) - bn.sid x2++, 32(x3) + bn.sid x2++, 0(x15) + bn.sid x2++, 32(x15) ret @@ -1040,46 +269,65 @@ p384_sign: /* pointers and scratchpad memory */ .section .data -/* pointer to k (dptr_k) */ -.globl dptr_k -dptr_k: +.balign 32 + +/* pointer to k0 
(dptr_k0) */ +.globl dptr_k0 +.weak dptr_k0 +dptr_k0: .zero 4 -/* pointer to rnd (dptr_rnd) */ -.globl dptr_rnd -dptr_rnd: +/* pointer to k1 (dptr_k1) */ +.globl dptr_k1 +.weak dptr_k1 +dptr_k1: .zero 4 /* pointer to msg (dptr_msg) */ .globl dptr_msg +.weak dptr_msg dptr_msg: .zero 4 /* pointer to R (dptr_r) */ .globl dptr_r +.weak dptr_r dptr_r: .zero 4 /* pointer to S (dptr_s) */ .globl dptr_s +.weak dptr_s dptr_s: .zero 4 /* pointer to X (dptr_x) */ .globl dptr_x +.weak dptr_x dptr_x: .zero 4 /* pointer to Y (dptr_y) */ .globl dptr_y +.weak dptr_y dptr_y: .zero 4 -/* pointer to D (dptr_d) */ -.globl dptr_d -dptr_d: +/* pointer to d0 (dptr_d0) */ +.globl dptr_d0 +.weak dptr_d0 +dptr_d0: + .zero 4 + +/* pointer to d1 (dptr_d1) */ +.globl dptr_d1 +.weak dptr_d1 +dptr_d1: .zero 4 /* 704 bytes of scratchpad memory */ +.balign 32 +.globl scratchpad +.weak scratchpad scratchpad: .zero 704 diff --git a/sw/otbn/crypto/p384_verify.s b/sw/otbn/crypto/p384_verify.s index 8c0a1adf67bcc..a4ab44ebe3584 100644 --- a/sw/otbn/crypto/p384_verify.s +++ b/sw/otbn/crypto/p384_verify.s @@ -8,294 +8,6 @@ .section .text -/** - * Checks if a point is a valid curve point on curve P-384 - * - * Returns r = x^3 + ax + b mod p - * and s = y^2 mod p - * where x,y are the affine coordinates of the curve point and - * a, b and p being the domain parameters of curve P-384. - * - * This routine checks if a point with given x- and y-coordinate is a valid - * curve point on P-384. - * The routine checks whether the coordinates are a solution of the - * Weierstrass equation y^2 = x^3 + ax + b mod p. - * The routine makes use of the property that the domain parameter 'a' can be - * written as a=-3 for the P-384 curve, hence the routine is limited to P-384. - * The routine does not return a boolean result but computes the left side - * and the right sight of the Weierstrass equation and leaves the final - * comparison to the caller. - * The routine runs in constant time. 
- * - * Flags: Flags have no meaning beyond the scope of this subroutine. - * - * @param[in] dmem[12]: dptr_r, pointer to dmem location where right - * side result r will be stored - * @param[in] dmem[16]: dptr_s, pointer to dmem location where left side - * result s will be stored - * @param[in] dmem[20]: dptr_x, pointer to dmem location containing affine - * x-coordinate of input point - * @param[in] dmem[24]: dptr_y, pointer to dmem location containing affine - * y-coordinate of input point - * - * clobbered registers: x2, x3, w0 to w5, w10 to w17 - * clobbered flag groups: FG0 - */ - .globl p384_isoncurve -p384_isoncurve: - - /* setup all-zero reg */ - bn.xor w31, w31, w31 - - /* load affine x-coordinate of curve point from dmem - [w1, w0] <= dmem[dptr_x] = dmem[20] */ - la x3, dptr_x - lw x3, 0(x3) - li x2, 0 - bn.lid x2++, 0(x3) - bn.lid x2++, 32(x3) - - /* load affine y-coordinate of curve point from dmem - [w3, w2] <= dmem[dptr_y] = dmem[24] */ - la x3, dptr_y - lw x3, 0(x3) - bn.lid x2++, 0(x3) - bn.lid x2, 32(x3) - - /* load domain parameter p (modulus) from dmem - [w13, w12] = p = dmem[p384_p] */ - li x2, 12 - la x3, p384_p - bn.lid x2++, 0(x3) - bn.lid x2++, 32(x3) - - /* load domain parameter b from dmem - [w4, w5] = b = dmem[p384_b] */ - li x2, 4 - la x3, p384_b - bn.lid x2++, 0(x3) - bn.lid x2++, 32(x3) - - /* y^2 = [w17,w16] <= y*y = [w3,w2]*w[w3,w2] */ - bn.mov w10, w2 - bn.mov w11, w3 - bn.mov w16, w2 - bn.mov w17, w3 - jal x1, p384_mulmod_p - - /* store result (left side): dmem[dptr_s] <= y^2 = [w17,w16] */ - la x3, dptr_s - lw x3, 0(x3) - li x2, 16 - bn.sid x2++, 0(x3) - bn.sid x2++, 32(x3) - - /* x^3 = [w17,w16] <= (x*x)*x = ([w1,w0]*(w1,w0])*[w1,w0] */ - bn.mov w10, w0 - bn.mov w11, w1 - bn.mov w16, w0 - bn.mov w17, w1 - jal x1, p384_mulmod_p - bn.mov w10, w0 - bn.mov w11, w1 - jal x1, p384_mulmod_p - - /* for curve P-384, 'a' can be written as a = -3, therefore we subtract - x three times from x^3. 
- x^3 + ax mod p = [w17,w16] <= x^3 -3 x mod p - = [w17,w16] - [w1,w0] - [w1,w0] - [w1,w0] mod [w13,w12] */ - loopi 3, 6 - bn.sub w16, w16, w0 - bn.subb w17, w17, w1 - bn.add w10, w16, w12 - bn.addc w11, w17, w13 - bn.sel w16, w10, w16, C - bn.sel w17, w11, w17, C - - /* add domain parameter b - x^3 + ax + b mod p = [w17,w16] <= [w17,w16] + [w5,w4] mod [w13,w12] */ - bn.add w16, w16, w4 - bn.addc w17, w17, w5 - bn.sub w10, w16, w12 - bn.subb w11, w17, w13 - bn.sel w16, w16, w10, C - bn.sel w17, w17, w11, C - - /* store result (right side) - dmem[dptr_r] <= x^3 + ax + b mod p = [w17,w16] */ - la x3, dptr_r - lw x3, 0(x3) - li x2, 16 - bn.sid x2++, 0(x3) - bn.sid x2++, 32(x3) - - ret - - -/** - * 384-bit variable time modular multiplicative inverse computation - * - * Returns c <= a^(-1) mod m - * where 'a' is a bigint of length 384 bit with a < m - * 'm' is the modulus with a length of 384 bit - * 'c' is a 384-bit result - * - * This routine implements the computation of the modular multiplicative - * inverse based on the binary GCD or Stein's algorithm. - * The implemented variant is based on the "right-shift binary extended GCD" - * as it is described in section 3.1 of [1] (Algorithm 1). - * [1] https://doi.org/10.1155/ES/2006/32192 - * - * Note that this is a variable time implementation. I.e. this routine will - * show a data-dependent timing and execution profile. Only use where a - * full white-box scenario is acceptable. - * - * Flags: Flags have no meaning beyond the scope of this subroutine. 
- * - * @param[in] [w30, w29]: a, 384-bit operand - * @param[in] [w13, w12]: m, modulus - * @param[in] w31: all-zero - * @param[out] [w17,w16]: result c - * - * clobbered registers: x2, w2, w4 to w11, w16 to w19 - * clobbered flag groups: FG0 - */ -mod_inv_var: - /* [w5,w4] = r <= 0 */ - bn.xor w4, w4, w4 - bn.xor w5, w5, w5 - - /* [w7,w6] = s <= 1 */ - bn.addi w6, w31, 1 - bn.xor w7, w7, w7 - - /* [w9,w8] = u <= m = [w13, w12]*/ - bn.mov w8, w12 - bn.mov w9, w13 - - /* [w11,w10] = v <= [w30, w29] */ - bn.mov w10, w29 - bn.mov w11, w30 - - ebgcd_loop: - /* test if u is odd */ - bn.or w8, w8, w8 - csrrs x2, 0x7c0, x0 - andi x2, x2, 4 - bne x2, x0, ebgcd_u_odd - - /* u is even: */ - /* [w9,w8] = u <= u/2 = [w9,w8] >> 1 */ - bn.rshi w8, w9, w8 >> 1 - bn.rshi w9, w31, w9 >> 1 - - /* test if r is odd */ - bn.or w4, w4, w4 - csrrs x2, 0x7c0, x0 - andi x2, x2, 4 - bne x2, x0, ebgcd_r_odd - - /* r is even: */ - /* [w5,w4] = r <= r/2 = [w5,w4] >> 1 */ - bn.rshi w4, w5, w4 >> 1 - bn.rshi w5, w31, w5 >> 1 - jal x0, ebgcd_loop - - ebgcd_r_odd: - /* [w5,w4] = r <= (r + m)/2 = ([w5,w4] + [w13,w12]) >> 1 */ - bn.add w4, w4, w12 - bn.addc w5, w5, w13 - bn.rshi w4, w5, w4 >> 1 - bn.rshi w5, w31, w5 >> 1 - jal x0, ebgcd_loop - - ebgcd_u_odd: - /* test if v is odd */ - bn.or w10, w10, w10 - csrrs x2, 0x7c0, x0 - andi x2, x2, 4 - bne x2, x0, ebgcd_uv_odd - - /* v is even: */ - /* [w11,w10] = v <= v/2 = [w11,w10] >> 1 */ - bn.rshi w10, w11, w10 >> 1 - bn.rshi w11, w31, w11 >> 1 - - /* test if s is odd */ - bn.or w6, w6, w6 - csrrs x2, 0x7c0, x0 - andi x2, x2, 4 - bne x2, x0, ebgcd_s_odd - - /* s is even: */ - /* [w7,w6] = s <= s/2 = [w7,w6] >> 1 */ - bn.rshi w6, w7, w6 >> 1 - bn.rshi w7, w31, w7 >> 1 - jal x0, ebgcd_loop - - ebgcd_s_odd: - /* [w7,w6] = s <= (s + m)/2 = ([w7,w6] + [w13,w12]) >> 1 */ - bn.add w6, w6, w12 - bn.addc w7, w7, w13 - bn.rshi w6, w7, w6 >> 1 - bn.rshi w7, w31, w7 >> 1 - jal x0, ebgcd_loop - - ebgcd_uv_odd: - /* test if v >= u */ - bn.cmp w10, w8 - bn.cmpb w11, 
w9 - csrrs x2, 0x7c0, x0 - andi x2, x2, 1 - beq x2, x0, ebgcd_v_gte_u - - /* u > v: */ - /* [w5,w4] = r <= r - s = [w5,w4] - [w7,w6]; if (r < 0): r <= r + m */ - bn.sub w4, w4, w6 - bn.subb w5, w5, w7 - bn.add w18, w4, w12 - bn.addc w19, w5, w13 - bn.sel w4, w18, w4, C - bn.sel w5, w19, w5, C - - /* [w9,w8] = u <= u - v = [w9,w8] - [w11,w10] */ - bn.sub w8, w8, w10 - bn.subb w9, w9, w11 - jal x0, ebgcd_loop - - ebgcd_v_gte_u: - /* [w7,w6] = s <= s - r = [w7,w6] - [w5,w4]; if (s < 0) s <= s + m */ - bn.sub w6, w6, w4 - bn.subb w7, w7, w5 - bn.add w18, w6, w12 - bn.addc w19, w7, w13 - bn.sel w6, w18, w6, C - bn.sel w7, w19, w7, C - - /* [w11,w10] = v <= v - u = [w11,w10] - [w9,w8] */ - bn.sub w10, w10, w8 - bn.subb w11, w11, w9 - - /* if v > 0 go back to start of loop */ - bn.cmp w31, w10 - bn.cmpb w31, w11 - csrrs x2, 0x7c0, x0 - andi x2, x2, 1 - bne x2, x0, ebgcd_loop - - /* v <= 0: */ - /* if (r > m): [w17,w16] = a <= r - m = [w5,w4] - [w13,w12] - else: [w17,w16] = a <= r = [w5,w4] */ - bn.sub w18, w4, w12 - bn.subb w19, w5, w13 - bn.cmp w12, w4 - bn.cmpb w13, w5 - bn.sel w16, w18, w4, C - bn.sel w17, w19, w5, C - - ret - - /** * Store curve point in projective coordinates (non randomized) * @@ -339,7 +51,6 @@ store_aff_proj: ret - /** * Store curve point in projective coordinates (non randomized) * @@ -394,7 +105,8 @@ store_proj: * * Scratchpad memory layout: * The routine expects at least 896 bytes of scratchpad memory at dmem - * location 'scratchpad' (sp). Internally the scratchpad is used as follows: + * location 'scratchpad' (sp). + * Internally the scratchpad is used as follows: * dptr_sp .. dptr_sp+191: point C, projective * dptr_sp+192 .. dptr_sp+383: point G, projective * dptr_sp+384 .. 
dptr_sp+575: point Q, projective @@ -431,30 +143,30 @@ p384_verify: /* goto 'fail' if [w30,w29] == [w31, w31] <=> s == 0 */ bn.cmp w31, w29 bn.cmpb w31, w30 - csrrs x2, 0x7c0, x0 + csrrs x2, FG0, x0 andi x2, x2, 1 beq x2, x0, fail /* goto 'fail' if [w30,w29] >= [w12,w13] <=> s >= n */ bn.cmp w29, w12 bn.cmpb w30, w13 - csrrs x2, 0x7c0, x0 + csrrs x2, FG0, x0 andi x2, x2, 1 beq x2, x0, fail + /* Compute Solinas constant k for modulus n (we know it is only 191 bits, so + no need to compute the high part): + w14 <= 2^256 - n[255:0] = (2^384 - n) mod (2^256) = 2^384 - n */ + bn.sub w14, w31, w12 + /* Compute modular inverse of S Note: This can be replaced by the 'mod_inv_n_p384' subroutine at the cost of ~60k cycles if reduced code size is targeted */ /* [w9,w8] <= [w17,w16] <= s^-1 mod n = [w30,w29]^-1 mod [w13,w12] */ - jal x1, mod_inv_var + jal x1, mod_inv_n_p384 bn.mov w8, w16 bn.mov w9, w17 - /* Compute Solinas constant k for modulus n (we know it is only 191 bits, so - no need to compute the high part): - w14 <= 2^256 - n[255:0] = (2^384 - n) mod (2^256) = 2^384 - n */ - bn.sub w14, w31, w12 - /* set regfile pointers to in/out regs of Barrett routine */ li x22, 10 li x23, 11 @@ -472,14 +184,14 @@ p384_verify: /* goto 'fail' if [w11, w10] == [w31, w31] <=> r == 0 */ bn.cmp w31, w10 bn.cmpb w31, w11 - csrrs x2, 0x7c0, x0 + csrrs x2, FG0, x0 andi x2, x2, 1 beq x2, x0, fail /* goto 'fail' if [w11,w10] >= [w12,w13] <=> r >= n */ bn.cmp w10, w12 bn.cmpb w11, w13 - csrrs x2, 0x7c0, x0 + csrrs x2, FG0, x0 andi x2, x2, 1 beq x2, x0, fail @@ -590,7 +302,7 @@ p384_verify: bn.addc w1, w1, w1 /* keep MSB/carry bit in x3: x3 <= u1[i] */ - csrrs x3, 0x7c0, x0 + csrrs x3, FG0, x0 andi x3, x3, 1 /* left shift u2 = [w3,w2] <= [w3,w2] << 1 */ @@ -598,7 +310,7 @@ p384_verify: bn.addc w3, w3, w3 /* keep MSB/carry bit in x3: x4 <= u2[i] */ - csrrs x4, 0x7c0, x0 + csrrs x4, FG0, x0 andi x4, x4, 1 li x2, 0 @@ -651,8 +363,20 @@ p384_verify: jal x1, store_proj nop - /* compute inverse of 
z-coordinate: [w1,w0] <= z_c^-1 mod p */ - jal x1, mod_inv_var + /* load domain parameter p (order of finite field) + [w13, w12] <= p = dmem[p384_p] */ + li x2, 12 + la x3, p384_p + bn.lid x2++, 0(x3) + bn.lid x2++, 32(x3) + + /* Compute Solinas constant k for modulus p (we know it is only 191 bits, so + no need to compute the high part): + w14 <= 2^256 - p[255:0] = (2^384 - p) mod (2^256) = 2^384 - p */ + bn.sub w14, w31, w12 + + /* compute inverse of z-coordinate: [w17,w16] <= z_c^-1 mod p */ + jal x1, mod_inv_n_p384 /* convert x-coordinate of C back to affine: x1 = x_c * z_c^-1 mod p */ bn.mov w10, w25 @@ -687,47 +411,46 @@ p384_verify: /* pointers and scratchpad memory */ .section .data -/* pointer to k (dptr_k) */ -.globl dptr_k -dptr_k: - .zero 4 - /* pointer to rnd (dptr_rnd) used for result here */ .globl dptr_rnd +.weak dptr_rnd dptr_rnd: .zero 4 /* pointer to msg (dptr_msg) */ .globl dptr_msg +.weak dptr_msg dptr_msg: .zero 4 -/* pointer to R (dptr_r) */ -.globl dptr_r -dptr_r: - .zero 4 - -/* pointer to S (dptr_s) */ -.globl dptr_s -dptr_s: - .zero 4 - /* pointer to X (dptr_x) */ .globl dptr_x +.weak dptr_x dptr_x: .zero 4 /* pointer to Y (dptr_y) */ .globl dptr_y +.weak dptr_y dptr_y: .zero 4 -/* pointer to D (dptr_d) */ -.globl dptr_d -dptr_d: +/* pointer to R (dptr_r) */ +.globl dptr_r +.weak dptr_r +dptr_r: + .zero 4 + +/* pointer to S (dptr_s) */ +.globl dptr_s +.weak dptr_s +dptr_s: .zero 4 /* Scratchpad memory */ +.balign 32 +.globl scratchpad +.weak scratchpad scratchpad: .zero 896 diff --git a/sw/otbn/crypto/primality.s b/sw/otbn/crypto/primality.s index afc2599b05255..9d51bdeef59d2 100644 --- a/sw/otbn/crypto/primality.s +++ b/sw/otbn/crypto/primality.s @@ -33,7 +33,7 @@ * @param[in] x10: t, number of Miller-Rabin rounds (security parameter) * @param[in] x14: dptr_b, pointer to temporary working buffer in dmem (n*32 bytes) * @param[in] x15: dptr_z, pointer to temporary working buffer in dmem (n*32 bytes) - * @param[in] x16: dptr_w, pointer to 
candidate prime w in dmem + * @param[in] x16: dptr_w, pointer to candidate prime w in dmem, w mod 4 = 3 * @param[in] x17: dptr_m0inv, pointer to Montgomery constant m0' (for w) in dmem * @param[in] x18: dptr_rr, pointer to Montgomery constant RR = R^2 mod w in dmem * @param[in] x30: n, number of limbs for all bignums (wlen / 256; n <= 16) @@ -56,7 +56,7 @@ miller_rabin: bn.cmp w31, w21 /* x2 <= CSRs[FG0][0] = FG0.C */ - csrrs x2, 0x7c0, x0 + csrrs x2, FG0, x0 andi x2, x2, 1 /* Skip the rest of the loop if w is composite (x2 == 0). We can't exit @@ -95,7 +95,7 @@ miller_rabin: * * @param[in] x14: dptr_b, pointer to temporary working buffer in dmem (n*32 bytes) * @param[in] x15: dptr_z, pointer to temporary working buffer in dmem (n*32 bytes) - * @param[in] x16: dptr_w, pointer to candidate prime w in dmem + * @param[in] x16: dptr_w, pointer to candidate prime w in dmem, w mod 4 = 3 * @param[in] x17: dptr_m0inv, pointer to Montgomery constant m0' (for w) in dmem * @param[in] x18: dptr_rr, pointer to Montgomery constant RR = R^2 mod w in dmem * @param[in] x30: n, number of limbs for all bignums (wlen / 256; n <= 16) @@ -116,9 +116,9 @@ miller_rabin_round: addi x2, x14, 0 loop x30, 4 /* w22 <= URND() */ - bn.wsrr w22, 0x1 + bn.wsrr w22, URND /* w23 <= RND() */ - bn.wsrr w23, 0x2 + bn.wsrr w23, RND /* w23 <= w22 ^ w23 */ bn.xor w23, w22, w23 /* b[i] <= w23 */ @@ -142,7 +142,7 @@ miller_rabin_round: /* Extract FG0.C into a small register and jump back to the start if it is 0. x2 <= CSRs[FG0][0] = FG0.C */ - csrrs x2, 0x7c0, x0 + csrrs x2, FG0, x0 andi x2, x2, 1 beq x2, x0, miller_rabin_round @@ -164,7 +164,7 @@ miller_rabin_round: /* Extract FG0.C into a small register and retry if it is 0. x2 <= CSRs[FG0][0] = FG0.C */ - csrrs x2, 0x7c0, x0 + csrrs x2, FG0, x0 andi x2, x2, 1 beq x2, x0, miller_rabin_round @@ -198,56 +198,27 @@ miller_rabin_round: * 4.7 Continue. * 5. Return PROBABLY PRIME. * - * This routine corresponds to steps 4.3 through 4.7. 
- * - * This loop needs to be constant-time relative to w if w is possibly prime (if - * we find proof that w is composite we are permitted to break early). To make - * the algorithm constant-time, we need to: - * - Compute b^(w-1) mod w in one loop, rather than separate into b^m mod w - * and a squaring loop for trailing zeroes. - * - Keep track of whether we have already reached a "step 4.7" condition, - * meaning we should return that w is possibly prime regardless of the rest - * of the loop. - * - * For each bit of (w-1), if we are in the case where all the remaining bits - * are 0 (steps 4.4-4.5 of the FIPS procedure), then we have four possible - * cases: - * 1. If z == w - 1, then b is a witness to the primality of w regardless of - * what happens in the rest of the loop (step 4.4/step 4.5.2). - * 2. If z == 1 and the current bit of (w - 1) is 1, then b is a witness to - * the primality of w regardless of what happens in the rest of the loop - * (step 4.4). - * 3. If z == 1 and the current bit of (w - 1) is 0, then w is composite and - * we can exit early (optional and not currently implemented). - * 4. If none of the above, we should continue the exponentiation. - * - * In pseudocode, the constant-time variant of steps 4.3-4.7 above looks like: - * z = 1 - * possibly_prime = false // 0 represents "composite" - * for i=wlen-1 down to 0 { - * // Perform the next step of modular exponentiation. - * wi = ((w - 1) >> i) & 1 - * z = wi ? (z^2 * b) mod w : (z^2) mod w - * - * // Get the lower bits (to see if they're all zero). - * w_low = (w - 1) mod (2^i) - * - * // Determine if b is a witness to the primality of w. - * possibly_prime |= ((w_low == 0) && (z == w-1)) - * possibly_prime |= ((w_low == 0) && (wi == 1) && (z == 1)) - * } - * return possibly_prime + * If we specialize to the case that w mod 4 = 3, the routine becomes much + * simpler and easier to make constant-time, because a in step 1 is always 1. 
+ * In pseudocode, the modified version of steps 4.3 through 4.7 is: + * 4.3 Compute z = b^((w-1)/2) mod w. + * 4.4 If ((z = 1) or (z = w - 1)), then go to step 4.7. + * 4.5 No-op. + * 4.6 Return COMPOSITE. + * 4.7 Continue. * * Expects the Montgomery constants for w to be precomputed before entry. For * this routine, R = 2^(n*256) and R/2 < w < R. None of the input buffers may * overlap in DMEM. This routine runs in constant time relative to w if w is * possibly prime. * + * This routine is constant-time relative to w if w is possibly prime. + * * Flags: Flags have no meaning beyond the scope of this subroutine. * * @param[in] x14: dptr_b, pointer to randomly-generated witness to use for testing * @param[in] x15: dptr_z, pointer to temporary working buffer in dmem (n*32 bytes) - * @param[in] x16: dptr_w, pointer to candidate prime w in dmem + * @param[in] x16: dptr_w, pointer to candidate prime w in dmem, w mod 4 = 3 * @param[in] x17: dptr_m0inv, pointer to Montgomery constant m0' (for w) in dmem * @param[in] x18: dptr_rr, pointer to Montgomery constant RR = R^2 mod w in dmem * @param[in] x30: n, number of limbs for all bignums (wlen / 256; n <= 16) @@ -276,207 +247,137 @@ test_witness: bn.sid x8, 0(x21++) addi x8, x8, 1 - /* Initialize work buffer to R mod w (1 in Montgomery form). - dmem[dptr_z:dptr_z+n*32] <= montmul(1, RR) = R mod w */ - addi x19, x18, 0 - addi x21, x15, 0 - jal x1, montmul_mul1 - - /* Initialize the "possibly prime" tracking register to 0. - w21 <= 0 */ - bn.mov w21, w31 - /* Initialize wide-register pointers. */ li x23, 23 li x25, 25 - /* Initialize loop counter. - x26 <= n */ - addi x26, x30, 0 - - /* Loop through the limbs of (w - 1), most significant first. - - Throughout the loop we maintain a mask that is 0 until all remaining bits - of (w - 1) are 0. - - We have not yet reached the part of the loop where the remaining bits - of (w-1) are all 0 (i.e. 
step 4.5), OR - - We have already discovered that b is a witness to the primality of w - - Loop invariants at start of loop for iteration i (i=n-1..0): - x4 = 0 if w has already been found to be composite, all 1s otherwise - x16 = dptr_w - x26 = i+1 - w21 = all 1s if b is already a witness to the primality of w, otherwise 0 - dmem[dptr_z:dptr_z+n*32] <= (b^((w - 1) >> (i*256)) * R) mod w - */ - loop x30, 12 - /* w20 <= 2^256 - 1 */ - bn.not w20, w31 - - /* Set flags in preparation for loop. - FG0.C <= 1 - FG0.Z <= 1 */ - bn.addi w25, w20, 1 - - /* Compute limb i of (w-1) and set the mask (w20) based on whether - the lower limbs are all-zero. */ - addi x3, x16, 0 - loop x26, 3 - /* Select mask based on whether the previous limb was 0. - w20 <= FG0.Z ? w20 : w31 */ - bn.sel w20, w20, w31, FG0.Z - /* w25 <= next limb of w */ - bn.lid x25, 0(x3++) - /* w22 <= (w24 - FG0.C) mod 2^256 = next limb of (w - 1) */ - bn.subb w22, w25, w31 - - /* Loop through the bits of this limb. The code is separated in order to - make it more readable and to make loop instruction counting easier, even - though this is the only call site. We use unconditional branches instead - of jal/ret to avoid consuming the call stack unnecessarily. */ - loopi 256, 2 - jal x0, test_witness_step -_test_witness_step_done: - nop - - /* Update the loop counter. - x26 <= x26 - 1 = i - 1 */ - addi x3, x0, 1 - sub x26, x26, x3 - - /* TODO: add an FI check here to ensure we completed all loop iterations if - the result register is all 1s. */ + /* Ensure the lowest 2 bits of the candidate prime are set so that w mod 4 = 3. + This is a precondition of the subroutine, but re-setting the bits here + provides further protection from e.g. fault injection attacks. */ + bn.lid x25, 0(x16) + bn.addi w23, w31, 3 + bn.or w25, w25, w23 + bn.sid x25, 0(x16) - ret - -/** - * Inner loop body for the Miller-Rabin primality test.
- * - * This subroutine expects and maintains the following loop invariants, for - * loop counter j=0..255: - * x9 = 3 - * x10 = 4 - * x11 = 2 - * x23 = 23 - * x25 = 25 - * x30 = n - * x31 = n-1 - * w21 = all 1s if b is already a witness to the primality of w, otherwise 0 - * w22 = ((w - 1)[i] << j) mod 2^256 - * dmem[dptr_z:dptr_z+n*32] <= (b^((w - 1) >> (i*256+j)) * R) mod w - * - * See `test_witness` for more explanation. - * - * Flags: Flags have no meaning beyond the scope of this subroutine. - * - * @param[in] x9: 3, constant - * @param[in] x10: 4, constant - * @param[in] x11: 2, constant - * @param[in] x14: dptr_b, pointer to randomly-generated witness to use for testing - * @param[in] x15: dptr_z, pointer to temporary working buffer in dmem (n*32 bytes) - * @param[in] x16: dptr_w, pointer to candidate prime w in dmem - * @param[in] x17: dptr_m0inv, pointer to Montgomery constant m0' (for w) in dmem - * @param[in] x23: 23, constant - * @param[in] x25: 25, constant - * @param[in] x30: n, number of limbs - * @param[in] x31: n-1 - * @param[in] w31: all-zero - * @param[in,out] w21: 2^256-1 if w is possibly prime, 0 otherwise - * @param[in,out] w22: current limb of exponent, shifted (see invariant) - * @param[in,out] dmem[dptr_z:dptr_z+n*32]: intermediate value (see invariant) - * - * clobbered registers: x2, x3, x5 to x8, x10, x12, x13, x19 to x22, - * w2, w3, w4..w[4+(n-1)], w21 to w30 - * clobbered flag groups: FG0, FG1 - */ -test_witness_step: - /* Perform the next squaring step of modular exponentiation. - w4..w[4+(n-1)] = montmul(z, z) */ - addi x19, x15, 0 - addi x20, x15, 0 - jal x1, montmul + /* Clear carry flag. + FG0.C <= 0 */ + bn.sub w31, w31, w31 - /* Store squaring result in work buffer. - dmem[dptr_z:dptr_z+n*32] <= w4..w[4+(n-1)] */ + /* Initialize work buffer to (R - w) mod w (1 in Montgomery form). 
+ dmem[dptr_z:dptr_z+n*32] <= (0 - w) mod R = R - w = R mod w */ + addi x20, x16, 0 addi x21, x15, 0 - loop x30, 2 - bn.sid x8, 0(x21++) - addi x8, x8, 1 + loop x30, 3 + bn.lid x23, 0(x20++) + bn.subb w23, w31, w23 + bn.sid x23, 0(x21++) - /* Perform the next multiplication step of modular exponentiation. - w4..w[4+(n-1)] = montmul(z, b) */ - addi x19, x14, 0 - addi x20, x15, 0 - jal x1, montmul + /* Initialize loop counter and high limb. + x26 <= n - 1 + w20 <= 0 */ + addi x26, x31, 0 + bn.sub w20, w20, w20 - /* Shift the exponent and update flags; FG0.C will now be the next bit of - the exponent, and FG0.Z will be 1 if the remaining bits in this limb - are zero. - w22 <= (w22 << 1) mod 2^256 - FG0.C <= w22[255] - FG0.Z <= w22 mod 2^255 =? 0 */ - bn.add w22, w22, w22 - - /* Select either squared or squared+multiplied result based on FG0.C. - dmem[dptr_z:dptr_z+n*32] <= - FG0.C ? w4..w[4+(n-1)] : dmem[dptr_z:dptr_z+n*32] */ - addi x2, x15, 0 - li x8, 4 - loop x30, 4 - /* w23 <= dmem[dptr_z+i*32] */ - bn.lid x23, 0(x2) - /* w25 <= w[4+i] */ - bn.movr x25, x8++ - /* w23 = FG0.C ? w[4+i] : dmem[dptr_z+i*32] */ - bn.sel w23, w25, w23, FG0.C - /* dmem[dptr_z+i*32] <= w23 */ - bn.sid x23, 0(x2++) + /* Perform modular exponentiation to compute b^((w-1)/2). - /* Select a mask that is all 1s if all the remaining bits of (w-1) are 0. - That means BOTH: - - the lower limbs are 0 (w20 == 2^256 - 1), AND - - the rest of the current limb is 0 (FG0.Z == 1) + Loop through the limbs, most significant first, then iterate through each + bit of each limb. - w3 <= FG0.Z ? w20 : w31 - = all 1s if w mod 2^(i*256+j) is 0, otherwise 0 */ - bn.sel w3, w20, w31, FG0.Z + Loop invariants (i=n-1 to 0): + x15 = dptr_z + x16 = dptr_w + x26 = i + w20 = w[i+1] (or 0 if i=n-1) + dmem[dptr_z:dptr_z+n*32] <= (b^((w - 1) >> (i*256)) * R) mod w */ + loop x30, 27 + /* Get the ith limb of w. 
+ w25 <= dmem[dptr_w + (i << 5)] = w[i] */ + slli x13, x26, 5 + add x13, x13, x16 + bn.lid x25, 0(x13) + + /* Get limb i of ((w-1) / 2). Since we know w is odd, we can simply + concatenate with the limb above and shift right by 1. + w22 <= (w20[0] << 255) | (w[i] >> 1) = (w >> 1)[i] */ + bn.rshi w22, w20, w25 >> 1 + + /* Save the ith limb for the next iteration. + w20 <= w[i] */ + bn.mov w20, w25 + + /* Loop through the bits of this limb and multiply/accumulate. */ + loopi 256, 19 + /* Perform the next squaring step of modular exponentiation. + w4..w[4+(n-1)] = montmul(z, z) */ + addi x19, x15, 0 + addi x20, x15, 0 + jal x1, montmul + + /* Store squaring result in work buffer. + dmem[dptr_z:dptr_z+n*32] <= w4..w[4+(n-1)] */ + addi x21, x15, 0 + loop x30, 2 + bn.sid x8, 0(x21++) + addi x8, x8, 1 + + /* Perform the next multiplication step of modular exponentiation. + w4..w[4+(n-1)] = montmul(z, b) */ + addi x19, x14, 0 + addi x20, x15, 0 + jal x1, montmul + + /* Shift the exponent and update flags; FG0.C will now be the next bit of + the exponent. + w22 <= (w22 << 1) mod 2^256 + FG0.C <= w22[255] */ + bn.add w22, w22, w22 + + /* Select either squared or squared+multiplied result based on FG0.C. + dmem[dptr_z:dptr_z+n*32] <= + FG0.C ? w4..w[4+(n-1)] : dmem[dptr_z:dptr_z+n*32] */ + addi x2, x15, 0 + li x8, 4 + loop x30, 4 + /* w23 <= dmem[dptr_z+i*32] */ + bn.lid x23, 0(x2) + /* w25 <= w[4+i] */ + bn.movr x25, x8++ + /* w23 = FG0.C ? w[4+i] : dmem[dptr_z+i*32] */ + bn.sel w23, w25, w23, FG0.C + /* dmem[dptr_z+i*32] <= w23 */ + bn.sid x23, 0(x2++) + + /* End of inner loop. */ + nop - /* Capture FG0.C, the current bit of (w - 1), as a mask. - w24 <= (0 - FG0.C) mod 2^256 = FG0.C ? 2^256 - 1 : 0 */ - bn.subb w24, w31, w31 + /* Update loop counter. + x26 <= x26 - 1 = i - 1 */ + li x3, 1 + sub x26, x26, x3 + /* End of outer loop. */ /* Fully reduce mod w. The `montmul` routine does not guarantee that the result is < w, only < R. 
dmem[dptr_z:dptr_z+n*32] <= dmem[dptr_z:dptr_z+n*32] mod w */ jal x1, reduce_modw + /* Check if the intermediate result represents 1 in Montgomery form. + w22 <= all 1s if dmem[x15:x15+n*32] is R mod w, otherwise 0 */ + jal x1, is_mont1 + bn.mov w22, w26 + /* Check if the work buffer is (-R) mod w, which is the Montgomery form representation of (-1) mod w = w - 1. w26 <= all 1s if dmem[x15:x15+n*32] is (-R) mod w, otherwise 0 */ jal x1, is_mont_minus1 - /* If the intermediate result is w - 1 (w26) AND the remaining bits of w - are all-zero (w3), then b is a witness to the primality of w. This - corresponds to steps 4.4 and 4.5.2 of the FIPS procedure. - w21 <= w21 | (w3 & w26) */ - bn.and w2, w3, w26 - bn.or w21, w21, w2 + /* If either check returned all-ones, then the input is possibly prime. */ + bn.or w21, w26, w22 - /* Check if the intermediate result represents 1 in Montgomery form. - w26 <= all 1s if dmem[x15:x15+n*32] is R mod w, otherwise 0 */ - jal x1, is_mont1 - - /* If the intermediate result is 1 (w26) AND the remaining bits of w are - all-zero (w3) AND the current bit of w is 1 (w24), then b is a - witness to the primality of w. This corresponds to step 4.4 in the - FIPS procedure. - w21 <= w21 | (w3 & w24 & w26) */ - bn.and w2, w3, w26 - bn.and w2, w2, w24 - bn.or w21, w21, w2 + /* TODO: add an FI check here to ensure we completed all loop iterations if + the result register is all 1s. */ - /* Unconditional branch back to `test_witness`. */ - jal x0, _test_witness_step_done + ret /** * Fully reduce modulo a candidate prime w. @@ -592,6 +493,8 @@ is_mont1: * specialized and sensitive to the range of w (for some w, 3w - R could also * be equivalent to w - 1). * + * WARNING: this routine clobbers its input in DMEM (dmem[dptr_x..dptr_x+n*32]). + * * Flags: Flags have no meaning beyond the scope of this subroutine. 
 * * @param[in] x15: dptr_x, pointer to input buffer x in dmem @@ -627,21 +530,4 @@ is_mont_minus1: w26 <= all 1s if dmem[dptr_x:dptr_x+n*32] == (-R) mod w, otherwise 0 */ jal x1, is_mont1 - /* Clear flags. */ - bn.sub w31, w31, w31 - - /* Negate the input back to its previous form. - dmem[dptr_x:dptr_x+n*32] <= w - dmem[dptr_x:dptr_x+n*32] */ - addi x2, x15, 0 - addi x3, x16, 0 - loop x30, 4 - /* w23 <= x[i] */ - bn.lid x23, 0(x2) - /* w25 <= w[i] */ - bn.lid x25, 0(x3++) - /* w23 <= w[i] - out[i] - FG0.C */ - bn.subb w23, w25, w23 - /* out[i] <= w23 */ - bn.sid x23, 0(x2++) - ret diff --git a/sw/otbn/crypto/rsa_keygen.s b/sw/otbn/crypto/rsa_keygen.s index 03959f28fb99d..4483c112d628f 100644 --- a/sw/otbn/crypto/rsa_keygen.s +++ b/sw/otbn/crypto/rsa_keygen.s @@ -4,6 +4,7 @@ /* Public interface. */ .globl rsa_keygen +.globl rsa_key_from_cofactor /* Exposed for testing purposes only. */ .globl relprime_f4 @@ -14,6 +15,10 @@ /** * Generate a random RSA key pair. * + * The public key is the pair (n, e), where n is the modulus and e is the + * public exponent, and the private key is the pair (n, d), where n is the same + * modulus as in the public key and d is the private exponent. + * * For the official specification, see FIPS 186-5 section A.1.3. For the * purposes of this implementation, the RSA public exponent e is always 65537 * (aka the Fermat number "F4", 2^16 + 1). @@ -28,13 +33,13 @@ * * Flags: Flags have no meaning beyond the scope of this subroutine. 
- * @param[in] x30: number of 256-bit limbs for p and q (key size in bits / 512) + * @param[in] x30: plen, number of 256-bit limbs for p and q * @param[in] w31: all-zero - * @param[out] dmem[rsa_n..rsa_n+(n*2*32)] RSA public key modulus (n) - * @param[out] dmem[rsa_d..rsa_d+(n*2*32)] RSA private exponent (d) + * @param[out] dmem[rsa_n..rsa_n+(plen*2*32)] RSA public key modulus (n) + * @param[out] dmem[rsa_d..rsa_d+(plen*2*32)] RSA private exponent (d) * - * clobbered registers: x2 to x15, x17 to x26, x31, - * w2, w3, w4..w[4+(n-1)], w20 to w30 + * clobbered registers: x2 to x26, x31, + * w2, w3, w4..w[4+(plen-1)], w20 to w30 * clobbered flag groups: FG0, FG1 */ rsa_keygen: @@ -50,21 +55,26 @@ rsa_keygen: li x21, 21 /* Generate the first prime, p. - dmem[rsa_p..rsa_p+(n*32)] <= p */ + dmem[rsa_p..rsa_p+(plen*32)] <= p */ jal x1, generate_p /* Generate the second prime, q. - dmem[rsa_q..rsa_q+(n*32)] <= q */ + dmem[rsa_q..rsa_q+(plen*32)] <= q */ jal x1, generate_q /* Multiply p and q to get the public modulus n. - dmem[rsa_n..rsa_n+(n*2*32)] <= p * q */ + dmem[rsa_n..rsa_n+(plen*2*32)] <= p * q */ la x10, rsa_p la x11, rsa_q la x12, rsa_n jal x1, bignum_mul - /* Derive the private exponent d from p and q (tail-call). */ - jal x0, derive_d + /* Derive the private exponent d from p and q. + dmem[rsa_d..rsa_d+(plen*2*32)] <= d */ + jal x1, derive_d + + /* Check that d is large enough (tail-call). If d is not large enough, + then `check_d` will restart the key-generation process. */ + jal x0, check_d /** * Derive the private RSA exponent d. @@ -72,18 +82,21 @@ rsa_keygen: * Returns d = (65537^-1) mod LCM(p-1, q-1). * * This function overwrites p and q, and requires that they are continuous in - * memory (specifically, it expects to be able to use 512 bytes of space - * following the label `rsa_pq`). + * memory. Specifically, it expects to be able to use 512 bytes of space + * following the label `rsa_pq`. * - * Flags: Flags are not set in this subroutine. 
+ * Important: This routine uses `rsa_cofactor` as a second 512-byte work buffer + * and clobbers the contents. * - * @param[in] dmem[rsa_p..rsa_p+(n*32)]: first prime p - * @param[in] dmem[rsa_q..rsa_q+(n*32)]: second prime q + * Flags: Flags have no meaning beyond the scope of this subroutine. + * + * @param[in] dmem[rsa_p..rsa_p+(plen*32)]: first prime p + * @param[in] dmem[rsa_q..rsa_q+(plen*32)]: second prime q * @param[in] x20: 20, constant * @param[in] x21: 21, constant - * @param[in] x30: number of 256-bit limbs for p and q + * @param[in] x30: plen, number of 256-bit limbs for p and q * @param[in] w31: all-zero - * @param[out] dmem[rsa_d..rsa_d+(n*2*32)]: result, private exponent d + * @param[out] dmem[rsa_d..rsa_d+(plen*2*32)]: result, private exponent d * * clobbered registers: x2 to x8, x10 to x15, x20 to x26, x31, w20 to w28 * clobbered flag groups: FG0, FG1 @@ -94,13 +107,13 @@ derive_d: la x11, rsa_q /* Subtract 1 from p in-place (no carry from lowest limb since p is odd). - dmem[rsa_p..rsa_p+(n*32)] <= p - 1 */ + dmem[rsa_p..rsa_p+(plen*32)] <= p - 1 */ bn.lid x20, 0(x10) bn.subi w20, w20, 1 bn.sid x20, 0(x10) /* Subtract 1 from q in-place (no carry from lowest limb since p is odd). - dmem[rsa_q..rsa_q+(n*32)] <= q - 1 */ + dmem[rsa_q..rsa_q+(plen*32)] <= q - 1 */ bn.lid x20, 0(x11) bn.subi w20, w20, 1 bn.sid x20, 0(x11) @@ -111,52 +124,152 @@ derive_d: jal x1, lcm /* Update the number of limbs for modinv. - x30 <= n*2 */ + x30 <= plen*2 */ add x30, x30, x30 /* Compute d = (65537^-1) mod LCM(p-1,q-1). The modular inverse - routine requires two working buffers, which we construct from `tmp_data` - and the required-contiguous `rsa_p` and `rsa_q` buffers. - dmem[rsa_d..rsa_d+(n*2*32)] <= (65537^-1) mod dmem[x12..x12+(n*2*32)] */ + routine requires two working buffers, which we construct from + `rsa_cofactor` and the required-contiguous `rsa_p` and `rsa_q` buffers. 
+ dmem[rsa_d..rsa_d+(plen*2*32)] <= (65537^-1) mod dmem[x12..x12+(plen*2*32)] */ + la x12, tmp_scratchpad la x13, rsa_d - la x14, tmp_data + la x14, rsa_cofactor la x15, rsa_pq jal x1, modinv_f4 - /* x30 <= (n*2) >> 1 = n */ + /* Reset the limb count. + x30 <= (plen*2) >> 1 = plen */ srli x30, x30, 1 + ret - /* Get a pointer to the nth limb of d (halfway through the number). - x3 <= rsa_d + n*32 */ +/** + * Check the private RSA exponent d. + * + * Calls `rsa_keygen` if d is too small, otherwise returns. Designed to be + * tail-called by `rsa_keygen`. + * + * Flags: Flags have no meaning beyond the scope of this subroutine. + * + * @param[in] x20: 20, constant + * @param[in] x30: plen, number of 256-bit limbs for p and q + * @param[in] w31: all-zero + * @param[in] dmem[rsa_d..rsa_d+(plen*2*32)]: private exponent d to check + * + * clobbered registers: x2, x3, w20, w23 + * clobbered flag groups: FG0, FG1 + */ +check_d: + /* Get a pointer to the second half of d. + x3 <= rsa_d + plen*32 */ slli x2, x30, 5 la x3, rsa_d add x3, x3, x2 - /* Check that d > 2^(n*256), i.e. that the highest n limbs are nonzero. We + /* Check that d > 2^(plen*256), i.e. that the highest plen limbs are nonzero. We need to retry if it's too small (see FIPS 186-5 section A.1.1), although - in practice this is unlikely. We do this by ORing the n highest limbs. - FG0.Z <= (d >> (n*256)) == 0 */ + in practice this is unlikely. We do this by ORing the plen highest limbs. + FG0.Z <= (d >> (plen*256)) == 0 */ bn.mov w23, w31 loop x30, 2 /* w20 <= d[n+i] */ - bn.lid x20, 0(x3++) + bn.lid x20, 0(x3++) /* w23 <= w23 | w20 */ bn.or w23, w23, w20 /* Get the FG0.Z flag into a register. x2 <= (CSRs[FG0] >> 3) & 1 = FG0.Z */ - csrrs x2, 0x7c0, x0 + csrrs x2, FG0, x0 srli x2, x2, 3 andi x2, x2, 1 - /* If the flag is set, the high limbs are zero and we should start from - scratch, generating a new p and q. Note that x30 MUST be set to n here, - not n*2, to meet the rsa_keygen preconditions. 
*/ + /* If x2 != 0, then d is too small and we need to restart key generation from + scratch. */ bne x2, x0, rsa_keygen - /* If we get here, d is OK; return. */ ret +/** + * Construct an RSA key pair from a modulus and cofactor. + * + * This routine does not check the validity of the RSA key pair; it does not + * ensure that the factors are prime or check any other properties, simply + * divides the modulus by the cofactor and derives the private exponent. The + * only public exponent supported is e=65537. + * + * This routine will recompute the public modulus n after deriving the factors; + * the caller may want to check that the value matches. If the modulus is not + * in fact divisible by the cofactor, or the cofactor is much too small, it + * will not match. + * + * Flags: Flags have no meaning beyond the scope of this subroutine. + * + * @param[in] x30: plen, number of 256-bit limbs for p and q + * @param[in] w31: all-zero + * @param[in] dmem[rsa_n..rsa_n+(plen*2*32)] RSA public key modulus (n) + * @param[in] dmem[rsa_cofactor..rsa_cofactor+(plen*32)] Cofactor (p or q) + * @param[out] dmem[rsa_n..rsa_n+(plen*2*32)] Recomputed public key modulus (n) + * @param[out] dmem[rsa_d..rsa_d+(plen*2*32)] RSA private exponent (d) + * + * clobbered registers: x2 to x8, x10 to x15, x20 to x26, x31, w3, w20 to w28 + * clobbered flag groups: FG0, FG1 + */ +rsa_key_from_cofactor: + /* Initialize wide-register pointers. + x20 <= 20 + x21 <= 21 */ + li x20, 20 + li x21, 21 + + /* Get a pointer to the end of the cofactor. + x2 <= rsa_cofactor + plen*32 */ + slli x2, x30, 5 + la x3, rsa_cofactor + add x2, x2, x3 + + /* Set the second half of the cofactor buffer to zero, so the cofactor is the + same size as the modulus for division. + dmem[rsa_cofactor+plen*32..rsa_cofactor+plen*2*32] <= 0 */ + li x3, 31 + loop x30, 1 + bn.sid x3, 0(x2++) + + /* Update the number of limbs for division. 
+ x30 <= plen*2 */ + add x30, x30, x30 + + /* Compute (n / cofactor) and store the result in `rsa_pq`. The quotient will + only occupy the first half (`rsa_p`) if the input is valid. + dmem[rsa_n..rsa_n+plen*2*32] <= n % cofactor + dmem[rsa_pq..rsa_pq+plen*2*32] <= n / cofactor */ + la x10, rsa_n + la x11, rsa_cofactor + la x12, rsa_pq + jal x1, div + + /* Reset the limb count. + x30 <= (plen*2) >> 1 = plen */ + srli x30, x30, 1 + + /* Copy the original cofactor into `rsa_q` and compute + the private exponent. + dmem[rsa_q..rsa_q+plen*32] <= dmem[rsa_cofactor..rsa_cofactor+plen*32] */ + la x11, rsa_cofactor + la x2, rsa_q + li x3, 3 + loop x30, 2 + bn.lid x3, 0(x11++) + bn.sid x3, 0(x2++) + + /* Multiply p and q to get the public modulus n. + dmem[rsa_n..rsa_n+(plen*2*32)] <= p * q */ + la x10, rsa_p + la x11, rsa_q + la x12, rsa_n + jal x1, bignum_mul + + /* Derive the private exponent d from p and q (tail-call). */ + jal x0, derive_d + /** * Compute the inverse of 65537 modulo a given number. * @@ -275,17 +388,17 @@ derive_d: * @param[in] x15: dptr_v, pointer to a temporary buffer in DMEM (n limbs) * @param[in] x20: 20, constant * @param[in] x21: 21, constant - * @param[in] x30: n, number of 256-bit limbs for modulus m and result d + * @param[in] x30: nlen, number of 256-bit limbs for modulus m and result d * @param[in] w31: all-zero - * @param[out] dmem[dptr_A..dptr_A+(n*32)]: result, modular inverse d + * @param[out] dmem[dptr_A..dptr_A+(nlen*32)]: result, modular inverse d * * clobbered registers: MOD, x2 to x4, x31, w20 to w28 * clobbered flag groups: FG0, FG1 */ modinv_f4: /* Zero the intermediate buffers. - dmem[dptr_A..dptr_A+(n*32)] <= 0 - dmem[dptr_C..dptr_C+(n*32)] <= 0 */ + dmem[dptr_A..dptr_A+(nlen*32)] <= 0 + dmem[dptr_C..dptr_C+(nlen*32)] <= 0 */ li x2, 31 addi x3, x13, 0 addi x4, x14, 0 @@ -307,7 +420,7 @@ modinv_f4: bn.addi w28, w31, 1 /* Copy the modulus to the buffer for v. 
- dmem[dptr_v..dptr_v+(n*32)] <= m */ + dmem[dptr_v..dptr_v+(nlen*32)] <= m */ addi x3, x12, 0 addi x4, x15, 0 loop x30, 2 @@ -320,7 +433,7 @@ modinv_f4: bn.add w22, w23, w23 << 16 /* MOD <= 65537 */ - bn.wsrw 0x0, w22 + bn.wsrw MOD, w22 /* Calculate number of loop iterations = bitlen(m) + bitlen(65537). x31 <= (x30 << 8) + 17 = 256*n + 17 */ @@ -370,7 +483,7 @@ modinv_f4: bn.sub w22, w22, w20 /* Conditionally subtract u from v. - dmem[dptr_v..dptr_v+(n*32)] <= v - (u & w25) */ + dmem[dptr_v..dptr_v+(nlen*32)] <= v - (u & w25) */ bn.and w23, w22, w25 addi x2, x15, 0 loop x30, 4 @@ -433,7 +546,7 @@ modinv_f4: /* Update A if we updated u in the previous steps (w24 == 2^256-1). We additionally subtract the modulus if *both* w24,w26 == 2^256-1. - dmem[dptr_A..dptr_A+(n*32)] <= (w24 == 2^256-1) ? (A + C) mod m : A */ + dmem[dptr_A..dptr_A+(nlen*32)] <= (w24 == 2^256-1) ? (A + C) mod m : A */ addi x2, x12, 0 addi x3, x13, 0 addi x4, x14, 0 @@ -459,7 +572,7 @@ modinv_f4: /* Update C if we updated v in the previous steps (w25 == 2^256-1). We additionally subtract the modulus if *both* w25,w26 == 2^256-1. - dmem[dptr_C..dptr_C+(n*32)] <= (w25 == 2^256-1) ? (A + C) mod m : C */ + dmem[dptr_C..dptr_C+(nlen*32)] <= (w25 == 2^256-1) ? (A + C) mod m : C */ addi x2, x12, 0 addi x3, x13, 0 addi x4, x14, 0 @@ -509,7 +622,7 @@ modinv_f4: /* Conditionally add to B. w27 <= B + (65537 & w23) */ - bn.wsrr w24, 0x0 /* MOD */ + bn.wsrr w24, MOD bn.and w24, w24, w23 bn.add w27, w27, w24 @@ -522,7 +635,7 @@ modinv_f4: bn.sub w31, w31, w31 /* Conditionally add m to A. - dmem[dptr_A..dptr_A+(n+32)] <= (!u[0] && (A[0] | B[0])) ? A + m : A */ + dmem[dptr_A..dptr_A+(nlen*32)] <= (!u[0] && (A[0] | B[0])) ? A + m : A */ addi x2, x12, 0 addi x3, x13, 0 loop x30, 5 @@ -542,7 +655,7 @@ modinv_f4: bn.addc w23, w31, w31 /* Shift A to the right 1 if FG1.L is unset. - dmem[dptr_A..dptr_A+(n+32)] <= FG1.L ? A : A >> 1 */ + dmem[dptr_A..dptr_A+(nlen*32)] <= FG1.L ? 
A : A >> 1 */ addi x3, x13, 0 jal x1, bignum_rshift1_if_not_fg1L @@ -552,7 +665,7 @@ modinv_f4: bn.or w20, w20, w31, FG1 /* Shift v to the right 1 if FG1.L is unset. - dmem[dptr_v..dptr_v+(n+32)] <= FG1.L ? v : v >> 1 */ + dmem[dptr_v..dptr_v+(nlen*32)] <= FG1.L ? v : v >> 1 */ addi x3, x15, 0 bn.mov w23, w31 jal x1, bignum_rshift1_if_not_fg1L @@ -574,7 +687,7 @@ modinv_f4: /* Conditionally add to D. w28 <= D + (65537 & w23) */ - bn.wsrr w24, 0x0 /* MOD */ + bn.wsrr w24, MOD bn.and w24, w24, w23 bn.add w28, w28, w24 @@ -587,7 +700,7 @@ modinv_f4: bn.sub w31, w31, w31 /* Conditionally add m to C. - dmem[dptr_C..dptr_C+(n+32)] <= (!v[0] && (C[0] | D[0])) ? C + m : C */ + dmem[dptr_C..dptr_C+(nlen*32)] <= (!v[0] && (C[0] | D[0])) ? C + m : C */ addi x2, x12, 0 addi x3, x14, 0 loop x30, 5 @@ -607,7 +720,7 @@ modinv_f4: bn.addc w23, w31, w31 /* Shift C to the right 1 if FG1.L is unset. - dmem[dptr_C..dptr_C+(n+32)] <= FG1.L ? C : C >> 1 */ + dmem[dptr_C..dptr_C+(nlen*32)] <= FG1.L ? C : C >> 1 */ addi x3, x14, 0 jal x1, bignum_rshift1_if_not_fg1L @@ -620,7 +733,7 @@ modinv_f4: /* Get the FG0.Z flag into a register. 
x2 <= (CSRs[FG0] >> 3) & 1 = FG0.Z */ - csrrs x2, 0x7c0, x0 + csrrs x2, FG0, x0 srli x2, x2, 3 andi x2, x2, 1 @@ -647,10 +760,10 @@ _modinv_f4_u_ok: * @param[in] x3: dptr_A, pointer to input A in DMEM * @param[in] x20: 20, constant * @param[in] x21: 21, constant - * @param[in] x30: n, number of 256-bit limbs for input A + * @param[in] x30: alen, number of 256-bit limbs for input A * @param[in] w23: value to use as the msb * @param[in] w31: all-zero - * @param[out] dmem[dptr_A..dptr_A+n*32]: A', result + * @param[out] dmem[dptr_A..dptr_A+alen*32]: A', result * * clobbered registers: x2, x3, x4, w20, w21 * clobbered flag groups: FG0 @@ -698,19 +811,19 @@ bignum_rshift1_if_not_fg1L: * * @param[in] x20: 20, constant * @param[in] x21: 21, constant - * @param[in] x30: n, number of 256-bit limbs in the candidate prime + * @param[in] x30: plen, number of 256-bit limbs in the candidate prime * @param[in] x31: n-1, constant * @param[in] w31: all-zero - * @param[out] dmem[rsa_p..rsa_p+(n*32)]: result, probable prime p + * @param[out] dmem[rsa_p..rsa_p+(plen*32)]: result, probable prime p * - * clobbered registers: x2 to x13, x17 to x19, x22 to x26, - * w2, w3, w4..w[4+(n-1)], w20 to w30 + * clobbered registers: x2 to x13, x16 to x19, x22 to x26, + * w2, w3, w4..w[4+(plen-1)], w20 to w30 * clobbered flag groups: FG0, FG1 */ generate_p: /* Compute nlen, the bit-length of the RSA modulus based on the number of limbs for p. - x4 <= n << 9 = n*256*2 = nlen */ + x4 <= n << 9 = plen*256*2 = nlen */ slli x4, x30, 9 /* Initialize counter for # of attempts. @@ -731,7 +844,7 @@ _generate_p_counter_nonzero: sub x4, x4, x5 /* Generate a new random value for p. - dmem[rsa_p] <= */ + dmem[rsa_p] <= */ la x16, rsa_p jal x1, generate_prime_candidate @@ -746,7 +859,7 @@ _generate_p_counter_nonzero: /* Get the FG0.Z flag into a register. 
x2 <= (CSRs[FG0] >> 3) & 1 = FG0.Z */ - csrrs x2, 0x7c0, x0 + csrrs x2, FG0, x0 srli x2, x2, 3 andi x2, x2, 1 @@ -770,19 +883,19 @@ _generate_p_counter_nonzero: * * @param[in] x20: 20, constant * @param[in] x21: 21, constant - * @param[in] x30: n, number of 256-bit limbs in the candidate prime + * @param[in] x30: plen, number of 256-bit limbs in the candidate prime * @param[in] x31: n-1, constant * @param[in] w31: all-zero - * @param[out] dmem[rsa_p..rsa_p+(n*32)]: result, probable prime p + * @param[out] dmem[rsa_p..rsa_p+(plen*32)]: result, probable prime p * - * clobbered registers: x2 to x13, x17 to x19, x22 to x26, - * w2, w3, w4..w[4+(n-1)], w20 to w30 + * clobbered registers: x2 to x13, x16 to x19, x22 to x26, + * w2, w3, w4..w[4+(plen-1)], w20 to w30 * clobbered flag groups: FG0, FG1 */ generate_q: /* Compute nlen, the bit-length of the RSA modulus based on the number of limbs for q. - x4 <= n << 9 = n*256*2 = nlen */ + x4 <= n << 9 = plen*256*2 = nlen */ slli x4, x30, 9 /* Initialize counter for # of attempts. @@ -804,7 +917,7 @@ _generate_q_counter_nonzero: sub x4, x4, x5 /* Generate a new random value for q. - dmem[rsa_q] <= */ + dmem[rsa_q] <= */ la x16, rsa_q jal x1, generate_prime_candidate @@ -819,7 +932,7 @@ _generate_q_counter_nonzero: /* Get the FG0.Z flag into a register. x2 <= (CSRs[FG0] >> 3) & 1 = FG0.Z */ - csrrs x2, 0x7c0, x0 + csrrs x2, FG0, x0 srli x2, x2, 3 andi x2, x2, 1 @@ -835,13 +948,14 @@ _generate_q_counter_nonzero: * Returns all 1s if the check passess, and 0 if it fails. * * For the candidate value p, this check passes only if: - * * p >= sqrt(2)*(2^(nlen/2 - 1)), where nlen = RSA public key length, and * * GCD(p-1, 65537) = 1, and * * p passes 5 rounds of the Miller-Rabin primality test. * * Assumes that the input is an odd number (this is a precondition for the - * primality test). Before using this to check untrusted or imported keys, the - * caller must check to ensure p is odd. 
+ * primality test) and that p >= sqrt(2)*(2^(nlen/2 - 1)), where nlen = RSA + * public key length. Internally, `generate_prime_candidate` guarantees these + * conditions. The caller must ensure them before using this routine to check + * untrusted or imported keys. * * See FIPS 186-5 section A.1.3 for the official spec. See this comment in * BoringSSL's implementation for a detailed description of how to choose the @@ -856,13 +970,13 @@ _generate_q_counter_nonzero: * @param[in] x16: dptr_p, address of the candidate prime in DMEM * @param[in] x20: 20, constant * @param[in] x21: 21, constant - * @param[in] x30: n, number of 256-bit limbs in the candidate prime - * @param[in] x31: n-1, constant + * @param[in] x30: plen, number of 256-bit limbs in the candidate prime + * @param[in] x31: plen-1, constant * @param[in] w31: all-zero * @param[out] w24: result, all 1s if the check passed and 0 otherwise * * clobbered registers: x2, x3, x5 to x13, x17 to x19, x22 to x26, - * w2, w3, w4..w[4+(n-1)], w20 to w30 + * w2, w3, w4..w[4+(plen-1)], w20 to w30 * clobbered flag groups: FG0, FG1 */ check_p: @@ -870,46 +984,6 @@ check_p: w24 <= 2^256 - 1 */ bn.not w24, w31 - /* Get a pointer to the precomputed constant sqrt(2)*2^2047. */ - la x2, sqrt2_rsa4k - - /* For RSA-2048 and RSA-3072, we will need to shift the lower bound right to - get sqrt(2)*2^1535 and sqrt(2)*2^1023, respectively. We can do this by - simply adjusting the pointer to skip the lower limbs. - x2 <= x2 + ((8 - x30) << 5) = sqrt2_rsa4k + ((8 - n) * 32) */ - li x3, 8 - sub x3, x3, x30 - slli x3, x3, 5 - add x2, x2, x3 - - /* Clear flags. */ - bn.sub w31, w31, w31 - - /* Now, the value at dmem[x2] is n limbs long and represents the lower bound - for p. Compare the two values. 
*/ - addi x3, x16, 0 - loop x30, 3 - /* w20 <= dmem[x2] = lower_bound[i] */ - bn.lid x20, 0(x2++) - /* w21 <= dmem[x3] = p[i] */ - bn.lid x21, 0(x3++) - /* FG0.C <= p[i] > 3) & 1 = FG0.Z */ - csrrs x2, 0x7c0, x0 + csrrs x2, FG0, x0 srli x2, x2, 3 andi x2, x2, 1 @@ -1021,15 +1095,15 @@ _check_prime_fail: * * @param[in] x20: 20, constant * @param[in] x21: 21, constant - * @param[in] x30: n, number of 256-bit limbs in the candidate prime - * @param[in] x31: n-1, constant + * @param[in] x30: plen, number of 256-bit limbs in the candidate prime + * @param[in] x31: plen-1, constant * @param[in] w31: all-zero - * @param[in] dmem[rsa_p..rsa_p+(n*32)]: value for p - * @param[in] dmem[rsa_q..rsa_q+(n*32)]: candidate value for q + * @param[in] dmem[rsa_p..rsa_p+(plen*32)]: value for p + * @param[in] dmem[rsa_q..rsa_q+(plen*32)]: candidate value for q * @param[out] w24: result, all 1s if the check passed and 0 otherwise * * clobbered registers: x2, x3, x5 to x13, x17 to x19, x22 to x26, - * w2, w3, w4..w[4+(n-1)], w20 to w30 + * w2, w3, w4..w[4+(plen-1)], w20 to w30 * clobbered flag groups: FG0, FG1 */ check_q: @@ -1067,7 +1141,7 @@ check_q: /* Get the FG0.Z flag into a register. x2 <= (CSRs[FG0] >> 3) & 1 = FG0.Z */ - csrrs x2, 0x7c0, x0 + csrrs x2, FG0, x0 srli x2, x2, 3 andi x2, x2, 1 @@ -1082,43 +1156,45 @@ check_q: /** * Generate a candidate prime (can be used for either p or q). * - * Fixes the lowest and highest bits to 1, so the number is always odd and >= - * 2^(256*n). All other bits are fully random. + * Fixes the lowest 3 bits to 1 and the highest 2 bits to 1, so the number is + * always equivalent to 7 mod 8 and is always >= 2^(256*n - 1) * 1.5. This + * implies that the prime candidate is always in range, i.e. it is greater than + * sqrt(2) * (2^(256*n - 1)), because sqrt(2) < 1.5. All other bits are fully + * random. 
This follows FIPS 186-5 section A.1.3, which allows generating prime + * candidates with a specific value mod 8 and allows the highest 2 bits to be + * set arbitrarily. * * Flags: Flags have no meaning beyond the scope of this subroutine. * * @param[in] x16: dptr_result, address of the result buffer in DMEM * @param[in] x20: 20, constant - * @param[in] x30: n, number of 256-bit limbs for the result - * @param[in] x31: n-1, constant + * @param[in] x30: plen, number of 256-bit limbs for the result + * @param[in] x31: plen-1, constant * @param[in] w31: all-zero - * @param[out] dmem[x16..x16+(n*32)]: random candidate prime + * @param[out] dmem[x16..x16+(plen*32)]: random candidate prime * * clobbered registers: x2, x3, w20, w21 * clobbered flag groups: FG0 */ generate_prime_candidate: /* Generate random 256-bit limbs. - dmem[x16..x16+(n*32)] <= RND(n*32) ^ URND(n*32) */ + dmem[x16..x16+(plen*32)] <= RND(n*32) ^ URND(n*32) */ addi x2, x16, 0 loop x30, 4 /* w20 <= RND() */ - bn.wsrr w20, 0x1 /* RND */ + bn.wsrr w20, RND /* w21 <= URND() */ - bn.wsrr w21, 0x2 /* URND */ + bn.wsrr w21, URND /* w20 <= w20 ^ w21 */ bn.xor w20, w20, w21 /* dmem[x2] <= w20 */ bn.sid x20, 0(x2++) - /* Create an all-ones mask. - w21 <= 2^256 - 1 */ - bn.not w21, w31 - - /* Fix the lowest bit to 1 so the number is always odd. - dmem[x16] <= (dmem[x16] << 1) mod 2^256 | 1 */ + /* Fix the lowest 3 bits to 1 so the number is always 7 mod 8. + dmem[x16] <= dmem[x16] | 7 */ bn.lid x20, 0(x16) - bn.rshi w20, w20, w21 >> 255 + bn.addi w21, w31, 7 + bn.or w20, w20, w21 bn.sid x20, 0(x16) /* Get a pointer to the last limb. @@ -1126,12 +1202,11 @@ generate_prime_candidate: slli x3, x31, 5 add x2, x16, x3 - /* Fix the highest bit to 1 so the number is always at least 2^(256*n-1). - This is implied by the lower bound and setting the bit is explicitly - permitted by FIPS 186-5. - dmem[x2] <= 1 << 255 | (dmem[x2] >> 1) */ + /* Fix the highest 2 bits to 1. 
+ dmem[x2] <= dmem[x2] | (3 << 6) << 248 = dmem[x2] | 3 << 254 */ bn.lid x20, 0(x2) - bn.rshi w20, w21, w20 >> 1 + bn.addi w21, w31, 192 + bn.or w20, w20, w21 << 248 bn.sid x20, 0(x2) ret @@ -1175,7 +1250,7 @@ generate_prime_candidate: * Flags: Flags have no meaning beyond the scope of this subroutine. * * @param[in] x16: dptr_x, pointer to first limb of x in dmem - * @param[in] x30: n, number of 256-bit limbs for x + * @param[in] x30: plen, number of 256-bit limbs for x * @param[in] w31: all-zero * @param[out] w22: result, 0 only if x is not relatively prime to F4 * @@ -1187,7 +1262,7 @@ relprime_f4: MOD <= 2^16 + 1 */ bn.addi w22, w31, 1 bn.add w22, w22, w22 << 16 - bn.wsrw 0x0, w22 + bn.wsrw MOD, w22 /* Initialize constants for loop. */ li x22, 22 @@ -1282,7 +1357,8 @@ relprime_f4: /* Extra label marking the start of p || q in memory. The `derive_d` function uses this to get a 512-byte working buffer, which means p and q must be - continuous in memory (but it's OK if their order is reversed). */ + continuous in memory. In addition, `rsa_key_from_cofactor` uses the + larger buffer for division and depends on the order of `p` and `q`. */ .balign 32 rsa_pq: @@ -1301,7 +1377,7 @@ rsa_q: tmp_scratchpad: .zero 512 -.section .data +.bss /* RSA modulus n = p*q (up to 4096 bits). */ .balign 32 @@ -1315,9 +1391,11 @@ rsa_n: rsa_d: .zero 512 -/* Temporary working buffer (4096 bits). */ +/* Prime cofactor for n for `rsa_key_from_cofactor`; also used as a temporary + * work buffer. */ .balign 32 -tmp_data: +.globl rsa_cofactor +rsa_cofactor: .zero 512 /* Montgomery constant m0' (256 bits). 
*/ @@ -1329,77 +1407,3 @@ mont_m0inv: .balign 32 mont_rr: .zero 256 - -/* Precomputed value for sqrt(2)*(2^2047), such that - (sqrt2_rsa4k^2 < 2**4095 < (sqrt2_rsa4k+1)^2 - - This number was taken from BoringSSL's implementation and has enough - precision to be exact for RSA-4096 and smaller: - https://boringssl.googlesource.com/boringssl/+/dcabfe2d8940529a69e007660fa7bf6c15954ecc/crypto/fipsmodule/rsa/rsa_impl.c#1006 -*/ -.balign 32 -sqrt2_rsa4k: - .word 0xe633e3e1 - .word 0x4d7c60a5 - .word 0xca3ea33b - .word 0x5fcf8f7b - .word 0x92957023 - .word 0xc246785e - .word 0x797f2805 - .word 0xf9acce41 - .word 0xd3b1f780 - .word 0xfdfe170f - .word 0x3facb882 - .word 0xd24f4a76 - .word 0xaff5f3b2 - .word 0x18838a2e - .word 0xa2f7dc33 - .word 0xc1fcbdde - .word 0xf7aa81c2 - .word 0xdea06241 - .word 0xca221307 - .word 0xf6a1be3f - .word 0x7bda1ebf - .word 0x332a5e9f - .word 0xfe32352f - .word 0x0104dc01 - .word 0x6f8236c7 - .word 0xb8cf341b - .word 0xd528b651 - .word 0x4264dabc - .word 0xebc93e0c - .word 0xf4d3a02c - .word 0xd8fd0efd - .word 0x81394ab6 - .word 0x9040ca4a - .word 0xeaa4a089 - .word 0x836e582e - .word 0xf52f120f - .word 0x31f3c84d - .word 0xcb2a6343 - .word 0x8bb7e9dc - .word 0xc6d5a8a3 - .word 0x2f7c4e33 - .word 0x460abc72 - .word 0x1688458a - .word 0xcab1bc91 - .word 0x11bc337b - .word 0x53059c60 - .word 0x42af1f4e - .word 0xd2202e87 - .word 0x3dfa2768 - .word 0x78048736 - .word 0x439c7b4a - .word 0x0f74a85e - .word 0xdc83db39 - .word 0xa8b1fe6f - .word 0x3ab8a2c3 - .word 0x4afc8304 - .word 0x83339915 - .word 0xed17ac85 - .word 0x893ba84c - .word 0x1d6f60ba - .word 0x754abe9f - .word 0x597d89b3 - .word 0xf9de6484 - .word 0xb504f333 diff --git a/sw/otbn/crypto/rsa_verify.s b/sw/otbn/crypto/rsa_verify.s index 50de37ff7d3c3..e9b0ee3618a37 100644 --- a/sw/otbn/crypto/rsa_verify.s +++ b/sw/otbn/crypto/rsa_verify.s @@ -132,7 +132,7 @@ cmp_dmem_reg_buf: /* compare limbs and store comparison result in x3 */ bn.cmp w2, w3, FG1 - csrrs x3, 0x7c1, x0 + csrrs x3, FG1, 
x0 /* leave loop if lowest limb was reached */ beq x8, x7, cmp_end @@ -230,7 +230,7 @@ compute_rr: /* In case of final carry in doubling procedure substract modulus */ /* Jump to 'rr_sub' if FG1.C == 1 */ - csrrs x3, 0x7c1, x0 + csrrs x3, FG1, x0 andi x3, x3, 1 bne x3, x0, rr_sub @@ -239,7 +239,7 @@ compute_rr: bn.lid x10, 0(x17) bn.movr x11, x9 bn.cmp w2, w3, FG1 - csrrs x3, 0x7c1, x0 + csrrs x3, FG1, x0 /* If the highest limbs of buf and mod are equal we have to run a multi-limb comparison. This is very unlikely to happen. If this @@ -501,7 +501,7 @@ mont_loop: bn.movr x10++, x13 /* No subtracion if carry bit of addition of carry words not set. */ - csrrs x2, 0x7c1, x0 + csrrs x2, FG1, x0 andi x2, x2, 1 beq x2, x0, mont_loop_no_sub @@ -698,7 +698,7 @@ modexp_var: bn.lid x9, 0(x16++) bn.subb w2, w2, w3 bn.movr x17++, x11 - csrrs x2, 0x7c0, x0 + csrrs x2, FG0, x0 /* TODO: currently we subtract the modulus if out_buf == M. This should never happen in an RSA context. We could catch this and raise an alert. */ diff --git a/sw/otbn/crypto/rsa_verify_3072_m0inv.s b/sw/otbn/crypto/rsa_verify_3072_m0inv.s index 6807f9a7074df..650ec0519eb61 100644 --- a/sw/otbn/crypto/rsa_verify_3072_m0inv.s +++ b/sw/otbn/crypto/rsa_verify_3072_m0inv.s @@ -68,7 +68,7 @@ check_eq_w6w27: /* Get value from flag register. x3 <= (b < a) */ - csrrs x3, 0x7c0, x0 + csrrs x3, FG0, x0 andi x3, x3, 1 /* Check if a < b. */ @@ -76,7 +76,7 @@ check_eq_w6w27: /* Get value from flag register. x4 <= (a < b) */ - csrrs x4, 0x7c0, x0 + csrrs x4, FG0, x0 andi x4, x4, 1 /* If b < a or a < b, then a != b; otherwise a = b. diff --git a/sw/otbn/crypto/rsa_verify_3072_rr.s b/sw/otbn/crypto/rsa_verify_3072_rr.s index 072432c6773a5..b6145bbb51ac0 100644 --- a/sw/otbn/crypto/rsa_verify_3072_rr.s +++ b/sw/otbn/crypto/rsa_verify_3072_rr.s @@ -98,13 +98,13 @@ double_mod_var: /* Extract final carry bit from flags register. 
x2 <= aa[3072] */ - csrrs x2, 0x7c0, x0 + csrrs x2, FG0, x0 andi x2, x2, 1 jal x1, subtract_modulus_var /* Extract final borrow bit from flags register. */ - csrrs x3, 0x7c0, x0 + csrrs x3, FG0, x0 andi x3, x3, 1 /** diff --git a/sw/otbn/crypto/run_rsa_keygen.s b/sw/otbn/crypto/run_rsa_keygen.s index a82bfcf816ea5..7544ba7ed853c 100644 --- a/sw/otbn/crypto/run_rsa_keygen.s +++ b/sw/otbn/crypto/run_rsa_keygen.s @@ -4,11 +4,17 @@ /** * RSA key generation. + * + * This binary can be called in two different modes: + * - `GEN` mode generates a new, random keypair + * - `COFACTOR` mode constructs a keypair from n, e, d, and either p or q. + * + * Both modes support three sizes: RSA-2048, RSA-3072, and RSA-4096. */ /** * Mode magic values generated with - * $ ./util/design/sparse-fsm-encode.py -d 6 -m 4 -n 11 \ + * $ ./util/design/sparse-fsm-encode.py -d 6 -m 6 -n 11 \ * --avoid-zero -s 561689407 * * Call the same utility with the same arguments and a higher -m to generate @@ -19,9 +25,12 @@ * as `li`. If support is added, we could use 32-bit values here instead of * 11-bit. */ -.equ MODE_RSA_2048, 0x3b7 -.equ MODE_RSA_3072, 0x4fa -.equ MODE_RSA_4096, 0x74d +.equ MODE_GEN_RSA_2048, 0x137 +.equ MODE_GEN_RSA_3072, 0x4e5 +.equ MODE_GEN_RSA_4096, 0x63a +.equ MODE_COFACTOR_RSA_2048, 0x34e +.equ MODE_COFACTOR_RSA_3072, 0x0db +.equ MODE_COFACTOR_RSA_4096, 0x794 .section .text.start start: @@ -32,15 +41,24 @@ start: la x2, mode lw x2, 0(x2) - addi x3, x0, MODE_RSA_2048 + addi x3, x0, MODE_GEN_RSA_2048 beq x2, x3, rsa_keygen_2048 - addi x3, x0, MODE_RSA_3072 + addi x3, x0, MODE_GEN_RSA_3072 beq x2, x3, rsa_keygen_3072 - addi x3, x0, MODE_RSA_4096 + addi x3, x0, MODE_GEN_RSA_4096 beq x2, x3, rsa_keygen_4096 + addi x3, x0, MODE_COFACTOR_RSA_2048 + beq x2, x3, rsa_key_from_cofactor_2048 + + addi x3, x0, MODE_COFACTOR_RSA_3072 + beq x2, x3, rsa_key_from_cofactor_3072 + + addi x3, x0, MODE_COFACTOR_RSA_4096 + beq x2, x3, rsa_key_from_cofactor_4096 + /* Unsupported mode; fail. 
*/ unimp unimp @@ -70,10 +88,34 @@ rsa_keygen_4096: jal x1, rsa_keygen ecall +rsa_key_from_cofactor_2048: + /* Set the number of limbs for the primes (2048 / 2 / 256). */ + li x30, 4 + + /* Generate a key (results in dmem[rsa_n] and dmem[rsa_d]). */ + jal x1, rsa_key_from_cofactor + ecall + +rsa_key_from_cofactor_3072: + /* Set the number of limbs for the primes (3072 / 2 / 256). */ + li x30, 6 + + /* Generate a key (results in dmem[rsa_n] and dmem[rsa_d]). */ + jal x1, rsa_key_from_cofactor + ecall + +rsa_key_from_cofactor_4096: + /* Set the number of limbs for the primes (4096 / 2 / 256). */ + li x30, 8 + + /* Generate a key (results in dmem[rsa_n] and dmem[rsa_d]). */ + jal x1, rsa_key_from_cofactor + ecall + .bss /* Operational mode. */ .globl mode .balign 4 mode: - .zero 4 +.zero 4 diff --git a/sw/otbn/crypto/tests/BUILD b/sw/otbn/crypto/tests/BUILD index 3044ad623373d..10172edecd588 100644 --- a/sw/otbn/crypto/tests/BUILD +++ b/sw/otbn/crypto/tests/BUILD @@ -2,7 +2,7 @@ # Licensed under the Apache License, Version 2.0, see LICENSE for details. 
# SPDX-License-Identifier: Apache-2.0 -load("//rules:otbn.bzl", "otbn_consttime_test", "otbn_sim_test") +load("//rules:otbn.bzl", "otbn_consttime_test", "otbn_library", "otbn_sim_test") package(default_visibility = ["//visibility:public"]) @@ -200,7 +200,7 @@ otbn_sim_test( ], exp = "p256_base_mult_test.exp", deps = [ - "//sw/otbn/crypto:p256", + "//sw/otbn/crypto:p256_base", ], ) @@ -235,7 +235,18 @@ otbn_sim_test( ], exp = "p256_key_from_seed_test.exp", deps = [ - "//sw/otbn/crypto:p256", + "//sw/otbn/crypto:p256_base", + ], +) + +otbn_sim_test( + name = "p256_mul_modp_test", + srcs = [ + "p256_mul_modp_test.s", + ], + exp = "p256_mul_modp_test.exp", + deps = [ + "//sw/otbn/crypto:p256_base", ], ) @@ -248,10 +259,10 @@ otbn_consttime_test( ) otbn_consttime_test( - name = "p256_scalar_mult_consttime", - subroutine = "p256_scalar_mult", + name = "p256_shared_key_consttime", + subroutine = "p256_shared_key", deps = [ - "//sw/otbn/crypto:p256_ecdsa", + "//sw/otbn/crypto:p256_ecdh", ], ) @@ -276,7 +287,8 @@ otbn_sim_test( ], exp = "p256_ecdsa_sign_test.exp", deps = [ - "//sw/otbn/crypto:p256", + "//sw/otbn/crypto:p256_base", + "//sw/otbn/crypto:p256_sign", ], ) @@ -287,7 +299,8 @@ otbn_sim_test( ], exp = "p256_ecdsa_verify_test.exp", deps = [ - "//sw/otbn/crypto:p256", + "//sw/otbn/crypto:p256_base", + "//sw/otbn/crypto:p256_verify", ], ) @@ -298,7 +311,8 @@ otbn_sim_test( ], exp = "p256_isoncurve_test.exp", deps = [ - "//sw/otbn/crypto:p256", + "//sw/otbn/crypto:p256_base", + "//sw/otbn/crypto:p256_isoncurve", ], ) @@ -309,7 +323,7 @@ otbn_sim_test( ], exp = "p256_proj_add_test.exp", deps = [ - "//sw/otbn/crypto:p256", + "//sw/otbn/crypto:p256_base", ], ) @@ -320,7 +334,43 @@ otbn_sim_test( ], exp = "p256_scalar_mult_test.exp", deps = [ - "//sw/otbn/crypto:p256", + "//sw/otbn/crypto:p256_base", + ], +) + +otbn_sim_test( + name = "p256_ecdh_shared_key_test", + srcs = [ + "p256_ecdh_shared_key_test.s", + ], + exp = "p256_ecdh_shared_key_test.exp", + deps = [ + 
"//sw/otbn/crypto:p256_base", + "//sw/otbn/crypto:p256_shared_key", + ], +) + +otbn_sim_test( + name = "p256_arithmetic_to_boolean_test", + srcs = [ + "p256_arithmetic_to_boolean_test.s", + ], + exp = "p256_arithmetic_to_boolean_test.exp", + deps = [ + "//sw/otbn/crypto:p256_base", + "//sw/otbn/crypto:p256_shared_key", + ], +) + +otbn_sim_test( + name = "p256_arithmetic_to_boolean_mod_test", + srcs = [ + "p256_arithmetic_to_boolean_mod_test.s", + ], + exp = "p256_arithmetic_to_boolean_mod_test.exp", + deps = [ + "//sw/otbn/crypto:p256_base", + "//sw/otbn/crypto:p256_shared_key", ], ) @@ -332,7 +382,46 @@ otbn_sim_test( exp = "p384_base_mult_test.exp", deps = [ "//sw/otbn/crypto:p384_base", - "//sw/otbn/crypto:p384_sign", + "//sw/otbn/crypto:p384_base_mult", + "//sw/otbn/crypto:p384_internal_mult", + ], +) + +otbn_sim_test( + name = "p384_arithmetic_to_boolean_test", + srcs = [ + "p384_arithmetic_to_boolean_test.s", + ], + exp = "p384_arithmetic_to_boolean_test.exp", + deps = [ + "//sw/otbn/crypto:p384_a2b", + "//sw/otbn/crypto:p384_base", + ], +) + +otbn_sim_test( + name = "p384_arithmetic_to_boolean_mod_test", + srcs = [ + "p384_arithmetic_to_boolean_mod_test.s", + ], + exp = "p384_arithmetic_to_boolean_mod_test.exp", + deps = [ + "//sw/otbn/crypto:p384_a2b", + "//sw/otbn/crypto:p384_base", + ], +) + +otbn_sim_test( + name = "p384_ecdh_shared_key_test", + srcs = [ + "p384_ecdh_shared_key_test.s", + ], + exp = "p384_ecdh_shared_key_test.exp", + deps = [ + "//sw/otbn/crypto:p384_a2b", + "//sw/otbn/crypto:p384_base", + "//sw/otbn/crypto:p384_internal_mult", + "//sw/otbn/crypto:p384_scalar_mult", ], ) @@ -344,6 +433,8 @@ otbn_sim_test( exp = "p384_ecdsa_sign_test.exp", deps = [ "//sw/otbn/crypto:p384_base", + "//sw/otbn/crypto:p384_internal_mult", + "//sw/otbn/crypto:p384_modinv", "//sw/otbn/crypto:p384_sign", ], ) @@ -356,6 +447,8 @@ otbn_sim_test( exp = "p384_ecdsa_verify_test.exp", deps = [ "//sw/otbn/crypto:p384_base", + "//sw/otbn/crypto:p384_isoncurve", + 
"//sw/otbn/crypto:p384_modinv", "//sw/otbn/crypto:p384_verify", ], ) @@ -368,7 +461,31 @@ otbn_sim_test( exp = "p384_isoncurve_test.exp", deps = [ "//sw/otbn/crypto:p384_base", - "//sw/otbn/crypto:p384_verify", + "//sw/otbn/crypto:p384_isoncurve", + ], +) + +otbn_sim_test( + name = "p384_curve_point_valid_test", + srcs = [ + "p384_curve_point_valid_test.s", + ], + exp = "p384_curve_point_valid_test.exp", + deps = [ + "//sw/otbn/crypto:p384_base", + "//sw/otbn/crypto:p384_isoncurve", + ], +) + +otbn_sim_test( + name = "p384_keygen_test", + srcs = [ + "p384_keygen_test.s", + ], + exp = "p384_keygen_test.exp", + deps = [ + "//sw/otbn/crypto:p384_base", + "//sw/otbn/crypto:p384_keygen", ], ) @@ -391,7 +508,19 @@ otbn_sim_test( exp = "p384_scalar_mult_test.exp", deps = [ "//sw/otbn/crypto:p384_base", - "//sw/otbn/crypto:p384_sign", + "//sw/otbn/crypto:p384_internal_mult", + "//sw/otbn/crypto:p384_scalar_mult", + ], +) + +otbn_sim_test( + name = "p384_mulmod448x128_test", + srcs = [ + "p384_mulmod448x128_test.s", + ], + exp = "p384_mulmod448x128_test.exp", + deps = [ + "//sw/otbn/crypto:p384_base", ], ) @@ -399,7 +528,15 @@ otbn_consttime_test( name = "p384_base_mult_consttime", subroutine = "p384_base_mult", deps = [ - ":p384_ecdsa_sign_test", + ":p384_base_mult_test", + ], +) + +otbn_consttime_test( + name = "p384_scalar_mult_consttime", + subroutine = "p384_scalar_mult", + deps = [ + ":p384_scalar_mult_test", ], ) @@ -447,11 +584,10 @@ otbn_consttime_test( ], ) -otbn_consttime_test( - name = "scalar_mult_p384_consttime", - subroutine = "scalar_mult_p384", - deps = [ - ":p384_ecdsa_sign_test", +otbn_library( + name = "fake_primality", + srcs = [ + "fake_primality.s", ], ) @@ -618,15 +754,99 @@ otbn_consttime_test( ], ) +otbn_library( + name = "rsa_keygen_checkpq_test_data", + srcs = [ + "rsa_keygen_checkpq_test_data.s", + ], +) + otbn_sim_test( - name = "rsa_keygen_checkpq_test", - # This test is very long because it runs multiple primality tests. 
- timeout = "eternal", + name = "rsa_keygen_checkp_good_test", + timeout = "long", # runs a primality test + srcs = [ + "rsa_keygen_checkp_good_test.s", + ], + exp = "rsa_keygen_checkp_good_test.exp", + tags = ["nightly"], # slow, do not run in CI + deps = [ + ":rsa_keygen_checkpq_test_data", + "//sw/otbn/crypto:div", + "//sw/otbn/crypto:gcd", + "//sw/otbn/crypto:lcm", + "//sw/otbn/crypto:montmul", + "//sw/otbn/crypto:mul", + "//sw/otbn/crypto:primality", + "//sw/otbn/crypto:rsa_keygen", + ], +) + +otbn_sim_test( + name = "rsa_keygen_checkp_not_relprime_test", + srcs = [ + "rsa_keygen_checkp_not_relprime_test.s", + ], + exp = "rsa_keygen_checkp_not_relprime_test.exp", + deps = [ + ":fake_primality", + ":rsa_keygen_checkpq_test_data", + "//sw/otbn/crypto:div", + "//sw/otbn/crypto:gcd", + "//sw/otbn/crypto:lcm", + "//sw/otbn/crypto:montmul", + "//sw/otbn/crypto:mul", + "//sw/otbn/crypto:rsa_keygen", + ], +) + +otbn_sim_test( + name = "rsa_keygen_checkp_not_prime_test", + timeout = "long", # runs a primality test + srcs = [ + "rsa_keygen_checkp_not_prime_test.s", + ], + exp = "rsa_keygen_checkp_not_prime_test.exp", + deps = [ + ":rsa_keygen_checkpq_test_data", + "//sw/otbn/crypto:div", + "//sw/otbn/crypto:gcd", + "//sw/otbn/crypto:lcm", + "//sw/otbn/crypto:montmul", + "//sw/otbn/crypto:mul", + "//sw/otbn/crypto:primality", + "//sw/otbn/crypto:rsa_keygen", + ], +) + +otbn_sim_test( + name = "rsa_keygen_checkq_good_test", + timeout = "long", # runs a primality test + srcs = [ + "rsa_keygen_checkq_good_test.s", + ], + exp = "rsa_keygen_checkq_good_test.exp", + tags = ["nightly"], # slow, do not run in CI + deps = [ + ":rsa_keygen_checkpq_test_data", + "//sw/otbn/crypto:div", + "//sw/otbn/crypto:gcd", + "//sw/otbn/crypto:lcm", + "//sw/otbn/crypto:montmul", + "//sw/otbn/crypto:mul", + "//sw/otbn/crypto:primality", + "//sw/otbn/crypto:rsa_keygen", + ], +) + +otbn_sim_test( + name = "rsa_keygen_checkq_not_prime_test", + timeout = "long", # runs a primality test srcs = [ - 
"rsa_keygen_checkpq_test.s", + "rsa_keygen_checkq_not_prime_test.s", ], - exp = "rsa_keygen_checkpq_test.exp", + exp = "rsa_keygen_checkq_not_prime_test.exp", deps = [ + ":rsa_keygen_checkpq_test_data", "//sw/otbn/crypto:div", "//sw/otbn/crypto:gcd", "//sw/otbn/crypto:lcm", @@ -637,6 +857,42 @@ otbn_sim_test( ], ) +otbn_sim_test( + name = "rsa_keygen_checkq_not_relprime_test", + srcs = [ + "rsa_keygen_checkq_not_relprime_test.s", + ], + exp = "rsa_keygen_checkq_not_relprime_test.exp", + deps = [ + ":fake_primality", + ":rsa_keygen_checkpq_test_data", + "//sw/otbn/crypto:div", + "//sw/otbn/crypto:gcd", + "//sw/otbn/crypto:lcm", + "//sw/otbn/crypto:montmul", + "//sw/otbn/crypto:mul", + "//sw/otbn/crypto:rsa_keygen", + ], +) + +otbn_sim_test( + name = "rsa_keygen_checkq_too_close_test", + srcs = [ + "rsa_keygen_checkq_too_close_test.s", + ], + exp = "rsa_keygen_checkq_too_close_test.exp", + deps = [ + ":fake_primality", + ":rsa_keygen_checkpq_test_data", + "//sw/otbn/crypto:div", + "//sw/otbn/crypto:gcd", + "//sw/otbn/crypto:lcm", + "//sw/otbn/crypto:montmul", + "//sw/otbn/crypto:mul", + "//sw/otbn/crypto:rsa_keygen", + ], +) + otbn_sim_test( name = "rsa_1024_dec_test", timeout = "long", @@ -662,6 +918,69 @@ otbn_sim_test( ], ) +otbn_sim_test( + name = "rsa_2048_dec_test", + timeout = "eternal", + srcs = [ + "rsa_2048_dec_test.s", + ], + exp = "rsa_2048_dec_test.exp", + deps = [ + "//sw/otbn/crypto:modexp", + "//sw/otbn/crypto:montmul", + ], +) + +otbn_sim_test( + name = "rsa_2048_enc_test", + srcs = [ + "rsa_2048_enc_test.s", + ], + exp = "rsa_2048_enc_test.exp", + deps = [ + "//sw/otbn/crypto:modexp", + "//sw/otbn/crypto:montmul", + ], +) + +otbn_sim_test( + name = "rsa_3072_dec_test", + timeout = "eternal", + srcs = [ + "rsa_3072_dec_test.s", + ], + exp = "rsa_3072_dec_test.exp", + tags = ["nightly"], # slow, do not run in CI + deps = [ + "//sw/otbn/crypto:modexp", + "//sw/otbn/crypto:montmul", + ], +) + +otbn_sim_test( + name = "rsa_3072_enc_test", + srcs = [ + 
"rsa_3072_enc_test.s", + ], + exp = "rsa_3072_enc_test.exp", + deps = [ + "//sw/otbn/crypto:modexp", + "//sw/otbn/crypto:montmul", + ], +) + +otbn_sim_test( + name = "rsa_4096_enc_test", + srcs = [ + "rsa_4096_enc_test.s", + ], + exp = "rsa_4096_enc_test.exp", + deps = [ + "//sw/otbn/crypto:modexp", + "//sw/otbn/crypto:montmul", + ], +) + otbn_sim_test( name = "rsa_verify_test", srcs = [ @@ -764,11 +1083,23 @@ otbn_sim_test( ) otbn_sim_test( - name = "x25519_test", + name = "x25519_test1", + srcs = [ + "x25519_test1.s", + ], + exp = "x25519_test1.exp", + deps = [ + "//sw/otbn/crypto:field25519", + "//sw/otbn/crypto:x25519", + ], +) + +otbn_sim_test( + name = "x25519_test2", srcs = [ - "x25519_test.s", + "x25519_test2.s", ], - exp = "x25519_test.exp", + exp = "x25519_test2.exp", deps = [ "//sw/otbn/crypto:field25519", "//sw/otbn/crypto:x25519", @@ -779,6 +1110,6 @@ otbn_consttime_test( name = "x25519_consttime", subroutine = "X25519", deps = [ - ":x25519_test", + ":x25519_test1", ], ) diff --git a/sw/otbn/crypto/tests/ed25519_ext_add_test.s b/sw/otbn/crypto/tests/ed25519_ext_add_test.s index f280d18a3a6b5..cfbb901374bba 100644 --- a/sw/otbn/crypto/tests/ed25519_ext_add_test.s +++ b/sw/otbn/crypto/tests/ed25519_ext_add_test.s @@ -23,7 +23,7 @@ main: li x2, 2 la x3, modulus bn.lid x2, 0(x3) - bn.wsrw 0x0, w2 + bn.wsrw MOD, w2 /* w19 <= 19 */ bn.addi w19, w31, 19 diff --git a/sw/otbn/crypto/tests/fake_primality.s b/sw/otbn/crypto/tests/fake_primality.s new file mode 100644 index 0000000000000..857ee20b48e05 --- /dev/null +++ b/sw/otbn/crypto/tests/fake_primality.s @@ -0,0 +1,15 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/** + * Fake primality-test routine. 
+ * + * This will cause an error if called; it should be used for tests where + * calling a full primality test indicates failure (such as a test in which the + * candidate prime should fail earlier checks before being evaluated for + * primality). + */ +.globl miller_rabin +miller_rabin: + unimp diff --git a/sw/otbn/crypto/tests/field25519_test.s b/sw/otbn/crypto/tests/field25519_test.s index e05aa848d3f10..7bf7ea55d7e69 100644 --- a/sw/otbn/crypto/tests/field25519_test.s +++ b/sw/otbn/crypto/tests/field25519_test.s @@ -19,7 +19,7 @@ main: li x2, 2 la x3, modulus bn.lid x2, 0(x3) - bn.wsrw 0x0, w2 + bn.wsrw MOD, w2 /* w19 <= 19 */ bn.addi w19, w31, 19 diff --git a/sw/otbn/crypto/tests/p256_arithmetic_to_boolean_mod_test.exp b/sw/otbn/crypto/tests/p256_arithmetic_to_boolean_mod_test.exp new file mode 100644 index 0000000000000..35dacd69b5f07 --- /dev/null +++ b/sw/otbn/crypto/tests/p256_arithmetic_to_boolean_mod_test.exp @@ -0,0 +1,2 @@ +# Expected values: +w0 = 0x0000000000000000000000000000000000000000000000000000000000000000 diff --git a/sw/otbn/crypto/tests/p256_arithmetic_to_boolean_mod_test.s b/sw/otbn/crypto/tests/p256_arithmetic_to_boolean_mod_test.s new file mode 100644 index 0000000000000..57de22159e168 --- /dev/null +++ b/sw/otbn/crypto/tests/p256_arithmetic_to_boolean_mod_test.s @@ -0,0 +1,79 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/** + * Standalone elliptic curve P-256 arithmetic-to-boolean masking test + * + * Uses OTBN ECC P-256 lib to perform arithmetic-to-boolean conversion of + * a given masked curve value with a random mask. Afterwards it unmasks the + * result and compares it with the initial value from DMEM. + */ + +.section .text.start + +p256_arithmetic_to_boolean_test: + + /* init all-zero register */ + bn.xor w31, w31, w31 + + /* Load domain parameter. 
+ w29 = dmem[p256_p] */ + li x2, 29 + la x4, p256_p + bn.lid x2, 0(x4) + + /* Set MOD to p */ + bn.wsrw MOD, w29 + + /* Load values into WDRs */ + + /* w11 <= dmem[x] mod p */ + li x3, 11 + la x4, x + bn.lid x3, 0(x4) + bn.addm w11, w11, w31 + + /* w19 <= URND mod p */ + bn.wsrr w19, URND + bn.addm w19, w19, w31 + + /* Arithmetic masking */ + + /* w11 = A <= w11 - w19 = x - r */ + bn.subm w11, w11, w19 + + /* Arithmetic to boolean conversion */ + jal x1, arithmetic_to_boolean_mod + + /* Unmask and compare values + after conversion */ + + /* w20 <= w20 ^ w19 = x' ^ r = x */ + bn.xor w20, w20, w19 + + /* w10 <= dmem[x] mod p */ + li x3, 10 + la x4, x + bn.lid x3, 0(x4) + bn.addm w10, w10, w31 + + /* w0 <= w20 - w10 */ + bn.sub w0, w20, w10 + + ecall + + +.data + +.globl x +.balign 32 +x: + .word 0x2ab77ca0 + .word 0x8031ceb8 + .word 0xff3e1afa + .word 0x353ec814 + .word 0x22fe027b + .word 0x8a29dc16 + .word 0xf7109d54 + .word 0x762c5d06 diff --git a/sw/otbn/crypto/tests/p256_arithmetic_to_boolean_test.exp b/sw/otbn/crypto/tests/p256_arithmetic_to_boolean_test.exp new file mode 100644 index 0000000000000..879f5d55ea82c --- /dev/null +++ b/sw/otbn/crypto/tests/p256_arithmetic_to_boolean_test.exp @@ -0,0 +1,3 @@ +# Expected values: +w0 = 0x0000000000000000000000000000000000000000000000000000000000000000 +w1 = 0x0000000000000000000000000000000000000000000000000000000000000000 diff --git a/sw/otbn/crypto/tests/p256_arithmetic_to_boolean_test.s b/sw/otbn/crypto/tests/p256_arithmetic_to_boolean_test.s new file mode 100644 index 0000000000000..19d4d8a07bec5 --- /dev/null +++ b/sw/otbn/crypto/tests/p256_arithmetic_to_boolean_test.s @@ -0,0 +1,108 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details.
*/ +/* SPDX-License-Identifier: Apache-2.0 */ + +/** + * Standalone 257-bit arithmetic-to-boolean masking test + * + * Uses OTBN ECC P-256 lib to perform arithmetic-to-boolean conversion of + * a given masked 257-bit value with a random mask. Afterwards it unmasks the + * result and compares it with the initial value from DMEM. + */ + +.section .text.start + +p256_arithmetic_to_boolean_test: + + /* init all-zero register */ + bn.xor w31, w31, w31 + + /* Load domain parameter. + w29 = dmem[p256_p] */ + li x2, 29 + la x4, p256_p + bn.lid x2, 0(x4) + + /* Set MOD to p */ + bn.wsrw MOD, w29 + + /* Load values into WDRs */ + + /* w11 <= dmem[x_l] */ + li x3, 11 + la x4, x_l + bn.lid x3, 0(x4) + + /* w12 <= dmem[x_u] */ + li x3, 12 + la x4, x_u + bn.lid x3, 0(x4) + + /* w18 <= URND + w19 <= URND (1 bit) */ + bn.wsrr w18, URND + bn.wsrr w19, URND + bn.rshi w19, w31, w19 >> 255 + + /* Arithmetic masking */ + + /* [w12,w11] = A <= [w12,w11] - [w19,w18] mod 2^257 = x - r mod 2^257 + This may result in bits above 2^257, but these will be stripped off. 
*/ + bn.sub w11, w11, w18 + bn.subb w12, w12, w19 + bn.rshi w12, w12, w31 >> 1 + bn.rshi w12, w31, w12 >> 255 + + /* Arithmetic to boolean conversion */ + jal x1, arithmetic_to_boolean + + /* Unmask and compare values + after conversion */ + + /* w20 <= w20 ^ w18 = x' ^ r + w21 <= w21 ^ w19 = x' ^ r */ + bn.xor w20, w20, w18 + bn.xor w21, w21, w19 + + /* w11 <= dmem[x_l] */ + li x3, 11 + la x4, x_l + bn.lid x3, 0(x4) + + /* w12 <= dmem[x_u] */ + li x3, 12 + la x4, x_u + bn.lid x3, 0(x4) + + /* [w1,w0] <= [w12,w11] - [w21,w20] */ + bn.sub w0, w11, w20 + bn.subb w1, w12, w21 + + ecall + + +.data + +.globl x_u +.balign 32 +x_u: + .word 0x00000001 + .word 0x00000000 + .word 0x00000000 + .word 0x00000000 + .word 0x00000000 + .word 0x00000000 + .word 0x00000000 + .word 0x00000000 + +.globl x_l +.balign 32 +x_l: + .word 0x2ab77ca0 + .word 0x8031ceb8 + .word 0xff3e1afa + .word 0x353ec814 + .word 0x22fe027b + .word 0x8a29dc16 + .word 0xf7109d54 + .word 0x762c5d06 diff --git a/sw/otbn/crypto/tests/p256_ecdh_shared_key_test.exp b/sw/otbn/crypto/tests/p256_ecdh_shared_key_test.exp new file mode 100644 index 0000000000000..361cbdd59ac67 --- /dev/null +++ b/sw/otbn/crypto/tests/p256_ecdh_shared_key_test.exp @@ -0,0 +1,2 @@ +# Expected value for shared key: +w11 = 0x5f33d746a326640a739a9490ec15c10372869f3de675b2e85742271d18c9eb82 diff --git a/sw/otbn/crypto/tests/p256_ecdh_shared_key_test.s b/sw/otbn/crypto/tests/p256_ecdh_shared_key_test.s new file mode 100644 index 0000000000000..8e462ee3fe06a --- /dev/null +++ b/sw/otbn/crypto/tests/p256_ecdh_shared_key_test.s @@ -0,0 +1,102 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/** + * Standalone elliptic curve P-256 ECDH shared key generation test + * + * Uses OTBN ECC P-256 lib to perform a scalar multiplication with a valid + * example curve point and an example scalar. 
Both scalar and coordinates of + * the curve point are contained in the .data section below. + * The x coordinate of the resulting curve point is masked arithmetically + * with a random value. As the x coordinate represents the actual + * shared key, the x coordinate and its mask are then converted from an + * arithmetic to a boolean masking scheme. + * + * The result of arithmetical unmasking as well as the result of boolean + * unmasking are compared with an expected value. + */ + +.section .text.start + +p256_ecdh_shared_key_test: + + /* Call P-256 shared key generation to get a boolean-masked key. + dmem[x] <= x0 + dmem[y] <= x1 */ + jal x1, p256_shared_key + + /* Load the two shares. + w11 <= dmem[x] = x0 + w12 <= dmem[y] = x1 */ + li x3, 11 + la x4, x + bn.lid x3++, 0(x4) + la x4, y + bn.lid x3, 0(x4) + + /* Unmask the shared key, x. + w11 <= x0 ^ x1 = x */ + bn.xor w11, w11, w12 + + ecall + + +.data + +/* Secret key d in arithmetic shares. */ +.globl d0 +.balign 32 +d0: + .word 0xfe6d1071 + .word 0x21d0a016 + .word 0xb0b2c781 + .word 0x9590ef5d + .word 0x3fdfa379 + .word 0x1b76ebe8 + .word 0x74210263 + .word 0x1420fc41 + .word 0x00000000 + .word 0x00000000 + .word 0x00000000 + .word 0x00000000 + .word 0x00000000 + .word 0x00000000 + .word 0x00000000 + .word 0x00000000 +.globl d1 +.balign 32 +d1: + .zero 64 + +/* example curve point x-coordinate */ +.globl x +.balign 32 +x: + .word 0xbfa8c334 + .word 0x9773b7b3 + .word 0xf36b0689 + .word 0x6ec0c0b2 + .word 0xdb6c8bf3 + .word 0x1628ce58 + .word 0xfacdc546 + .word 0xb5511a6a + +/* example curve point y-coordinate */ +.globl y +.balign 32 +y: + .word 0x9e008c2e + .word 0xa8707058 + .word 0xab9c6924 + .word 0x7f7a11d0 + .word 0xb53a17fa + .word 0x43dd09ea + .word 0x1f31c143 + .word 0x42a1c697 + +/* affine x-coordinate value before A2B */ +.globl x_a +.balign 32 +x_a: + .zero 32 diff --git a/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s b/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s index
56735db6132f3..01e20cc1dd822 100644 --- a/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s +++ b/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s @@ -71,7 +71,7 @@ randomize_share: /* Get a 63-bit pseudorandom number. w0 <= URND()[255:193] = r */ - bn.wsrr w0, 0x2 /* URND*/ + bn.wsrr w0, URND bn.rshi w0, w31, w0 >> 193 /* Load the curve order n. diff --git a/sw/otbn/crypto/tests/p256_isoncurve_test.exp b/sw/otbn/crypto/tests/p256_isoncurve_test.exp index a35217fd56467..65d365eb27705 100644 --- a/sw/otbn/crypto/tests/p256_isoncurve_test.exp +++ b/sw/otbn/crypto/tests/p256_isoncurve_test.exp @@ -1,3 +1,3 @@ -# Expected values (w0=R, w1=S): -w0 = 0xb103b614b389c6b8e1a08330a6ce0b9c4b3726ec0bf61f6bdd66af03a4af5660 -w1 = 0xb103b614b389c6b8e1a08330a6ce0b9c4b3726ec0bf61f6bdd66af03a4af5660 +# Expected values (w18=lhs, w19=rhs): +w18 = 0xb103b614b389c6b8e1a08330a6ce0b9c4b3726ec0bf61f6bdd66af03a4af5660 +w19 = 0xb103b614b389c6b8e1a08330a6ce0b9c4b3726ec0bf61f6bdd66af03a4af5660 diff --git a/sw/otbn/crypto/tests/p256_isoncurve_test.s b/sw/otbn/crypto/tests/p256_isoncurve_test.s index 4e8dad49cd4e9..78336bb666455 100644 --- a/sw/otbn/crypto/tests/p256_isoncurve_test.s +++ b/sw/otbn/crypto/tests/p256_isoncurve_test.s @@ -13,17 +13,14 @@ .section .text.start p256_oncurve_test: + /* Initialize all-zero register. */ + bn.xor w31, w31, w31 - /* call curve point test routine in P-256 lib */ + /* Compute both sides of the Weierstrass equation.
+ w18 <= lhs + w19 <= rhs */ jal x1, p256_isoncurve - /* load result to WDRs for comparison with reference */ - li x2, 0 - la x3, r - bn.lid x2++, 0(x3) - la x3, s - bn.lid x2, 0(x3) - ecall diff --git a/sw/otbn/crypto/tests/p256_key_from_seed_test.exp b/sw/otbn/crypto/tests/p256_key_from_seed_test.exp index 0b6ff554a2321..5e74e684e24a0 100644 --- a/sw/otbn/crypto/tests/p256_key_from_seed_test.exp +++ b/sw/otbn/crypto/tests/p256_key_from_seed_test.exp @@ -2,6 +2,6 @@ w20 = 0x9def3b61bc577b4b45c0f8b23ed867e3302b5143e9e71859e3ef3615df0ace13 w21 = 0xe46bcaf84b3890e1 -# [w23, w22]: d1 -w22 = 0x17bcfeef551f77d199dd9f5af7d1a8736f2f939abeb67c9e2df4bec0225596d6 -w23 = 0x63e2e86d4e67f1f7 +# [w10, w11]: d1 +w10 = 0x17bcfeef551f77d199dd9f5af7d1a8736f2f939abeb67c9e2df4bec0225596d6 +w11 = 0x63e2e86d4e67f1f7 diff --git a/sw/otbn/crypto/tests/p256_key_from_seed_test.s b/sw/otbn/crypto/tests/p256_key_from_seed_test.s index 69bbcbefa894e..813028b654ff8 100644 --- a/sw/otbn/crypto/tests/p256_key_from_seed_test.s +++ b/sw/otbn/crypto/tests/p256_key_from_seed_test.s @@ -13,17 +13,16 @@ key_from_seed_test: bn.xor w31, w31, w31 /* Load shares of seed from DMEM. - [w21,w20] <= dmem[seed0] - [w23,w33] <= dmem[seed1] */ + [w20,w21] <= dmem[seed0] + [w10,w11] <= dmem[seed1] */ li x2, 20 la x3, seed0 - bn.lid x2, 0(x3++) - li x2, 21 bn.lid x2++, 0(x3) + bn.lid x2, 32(x3) + li x2, 10 la x3, seed1 - bn.lid x2, 0(x3++) - li x2, 23 - bn.lid x2, 0(x3) + bn.lid x2++, 0(x3) + bn.lid x2, 32(x3) /* Generate the derived secret key. 
*/ jal x1, p256_key_from_seed diff --git a/sw/otbn/crypto/tests/p256_mul_modp_test.exp b/sw/otbn/crypto/tests/p256_mul_modp_test.exp new file mode 100644 index 0000000000000..9675959613abe --- /dev/null +++ b/sw/otbn/crypto/tests/p256_mul_modp_test.exp @@ -0,0 +1 @@ +w19 = 0x3cc57c50d0f2d26fc7bff844a3cdcf866f47b074f3171d5711bacbe3045443a6 diff --git a/sw/otbn/crypto/tests/p256_mul_modp_test.s b/sw/otbn/crypto/tests/p256_mul_modp_test.s new file mode 100644 index 0000000000000..7ef7f6e879708 --- /dev/null +++ b/sw/otbn/crypto/tests/p256_mul_modp_test.s @@ -0,0 +1,70 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/** + * Standalone test for P-256 field multiplication. + */ + +.section .text.start +start: + /* Initialize all-zero register. */ + bn.xor w31, w31, w31 + + /* Load the modulus, p. + MOD <= w29 <= dmem[p256_p] = p */ + li x2, 29 + la x3, p256_p + bn.lid x2, 0(x3) + bn.wsrw MOD, w29 + + /* Compute the constant r256 for reduction modulo p. + w28 <= 2^256 - p = r256 */ + bn.sub w28, w31, w29 + + /* Load the constant for reduction modulo p. + w29 <= dmem[p256_r448] = r448 */ + li x2, 29 + la x3, p256_r448 + bn.lid x2, 0(x3) + + /* Load the operands. + w24 <= dmem[value_a] = a + w25 <= dmem[value_b] = b */ + li x2, 24 + la x3, value_a + bn.lid x2++, 0(x3) + la x3, value_b + bn.lid x2, 0(x3) + + /* Run modular multiplication. + w19 <= (w24 * w25) mod p */ + jal x1, mul_modp + + ecall + +.data + +/* First operand, a. + = 0xa8da539ffce03337030a5a44bcd3266608a32b364bb3295cace17a9da3175abc */ +value_a: +.word 0xa3175abc +.word 0xace17a9d +.word 0x4bb3295c +.word 0x08a32b36 +.word 0xbcd32666 +.word 0x030a5a44 +.word 0xfce03337 +.word 0xa8da539f + +/* Second operand, b. 
+ = 0x72c7c6bec94cf13ab2a1c47c60cb522e04a0e4330df8714c96a2db313c873171 */ +value_b: +.word 0x3c873171 +.word 0x96a2db31 +.word 0x0df8714c +.word 0x04a0e433 +.word 0x60cb522e +.word 0xb2a1c47c +.word 0xc94cf13a +.word 0x72c7c6be diff --git a/sw/otbn/crypto/tests/p256_proj_add_test.s b/sw/otbn/crypto/tests/p256_proj_add_test.s index cdfb295b4d8b1..98d662e7dcb6f 100644 --- a/sw/otbn/crypto/tests/p256_proj_add_test.s +++ b/sw/otbn/crypto/tests/p256_proj_add_test.s @@ -37,20 +37,24 @@ p256_proj_add_test: la x3, p256_b bn.lid x2, 0(x3) - /* load lower 256 bit of Barrett constant u for modulus p from dmem - w28 <= u = dmem[p256_u_p] */ - li x2, 28 - la x3, p256_u_p - bn.lid x2, 0(x3) - /* load field modulus p from dmem - w29 <= p = dmem[p256_p] */ + MOD <= w29 <= p = dmem[p256_p] */ li x2, 29 la x3, p256_p bn.lid x2, 0(x3) /* store modulus to MOD WSR */ - bn.wsrw 0, w29 + bn.wsrw MOD, w29 + + /* Compute the constant r256 for reduction modulo p. + w28 <= 2^256 - p = r256 */ + bn.sub w28, w31, w29 + + /* Load the other constant for reduction modulo p. 
+ w29 <= dmem[p256_r448] = r448 */ + li x2, 29 + la x3, p256_r448 + bn.lid x2, 0(x3) /* init all-zero reg */ bn.xor w31, w31, w31 diff --git a/sw/otbn/crypto/tests/p256_scalar_mult_test.exp b/sw/otbn/crypto/tests/p256_scalar_mult_test.exp index 8e66d56142ece..9edc88acb23b2 100644 --- a/sw/otbn/crypto/tests/p256_scalar_mult_test.exp +++ b/sw/otbn/crypto/tests/p256_scalar_mult_test.exp @@ -1,3 +1,3 @@ -# Expected values (w0=X, w1=Y): -w0 = 0x5f33d746a326640a739a9490ec15c10372869f3de675b2e85742271d18c9eb82 -w1 = 0xb5ebbd1e4ac99c9e3d70a862e41fe23ace6ab34f7ac9f99a4c403defb76c462d +# Expected values (w11=X, w12=Y): +w11 = 0x5f33d746a326640a739a9490ec15c10372869f3de675b2e85742271d18c9eb82 +w12 = 0xb5ebbd1e4ac99c9e3d70a862e41fe23ace6ab34f7ac9f99a4c403defb76c462d diff --git a/sw/otbn/crypto/tests/p256_scalar_mult_test.s b/sw/otbn/crypto/tests/p256_scalar_mult_test.s index a4e594077c9bf..2457b36d72d5a 100644 --- a/sw/otbn/crypto/tests/p256_scalar_mult_test.s +++ b/sw/otbn/crypto/tests/p256_scalar_mult_test.s @@ -9,23 +9,43 @@ * example curve point and an example scalar. Both scalar and coordinates of * the curve point are contained in the .data section below. * - * x and y cordinates of the resulting curve points are copied to wide - * registers. See comment at the end of the file for expected values. + * x coordinate of the resulting curve points is copied to a wide + * register. */ .section .text.start scalar_mult_test: + /* Init all-zero register. */ + bn.xor w31, w31, w31 - /* call scalar point multiplication routine in P-256 lib */ - jal x1, p256_scalar_mult + /* Load first share of scalar k from dmem. + w0,w1 = dmem[k0] */ + la x16, k0 + li x2, 0 + bn.lid x2, 0(x16++) + li x2, 1 + bn.lid x2, 0(x16) - /* copy result to wide reg file */ - li x2, 0 - la x3, x - bn.lid x2++, 0(x3) - la x3, y - bn.lid x2, 0(x3) + /* Load second share of scalar k from dmem. 
+ w2,w3 = dmem[k1] */ + la x16, k1 + li x2, 2 + bn.lid x2, 0(x16++) + li x2, 3 + bn.lid x2, 0(x16) + + /* Call internal scalar multiplication routine. + Returns point in projective coordinates. + (w8, w9, w10) <= (X, Y, Z) = k*(x,y) */ + la x21, x + la x22, y + jal x1, scalar_mult_int + + /* Convert to affine coordinates. + w11 <= x + w12 <= y */ + jal x1, proj_to_affine ecall diff --git a/sw/otbn/crypto/tests/p384_arithmetic_to_boolean_mod_test.exp b/sw/otbn/crypto/tests/p384_arithmetic_to_boolean_mod_test.exp new file mode 100644 index 0000000000000..879f5d55ea82c --- /dev/null +++ b/sw/otbn/crypto/tests/p384_arithmetic_to_boolean_mod_test.exp @@ -0,0 +1,3 @@ +# Expected values: +w0 = 0x0000000000000000000000000000000000000000000000000000000000000000 +w1 = 0x0000000000000000000000000000000000000000000000000000000000000000 diff --git a/sw/otbn/crypto/tests/p384_arithmetic_to_boolean_mod_test.s b/sw/otbn/crypto/tests/p384_arithmetic_to_boolean_mod_test.s new file mode 100644 index 0000000000000..363a053ee1616 --- /dev/null +++ b/sw/otbn/crypto/tests/p384_arithmetic_to_boolean_mod_test.s @@ -0,0 +1,133 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/** + * Standalone elliptic curve P-384 arithmetic-to-boolean masking test + * + * Uses OTBN ECC P-384 lib to perform arithmetic-to-boolean conversion of + * a given masked curve value with a random mask. Afterwards it unmasks the + * result and compares it with the initial value from DMEM. + */ + +.section .text.start + +p256_arithmetic_to_boolean_test: + + /* init all-zero register */ + bn.xor w31, w31, w31 + + /* Load domain parameter. 
+ [w13,w12] = dmem[p384_p] */ + li x2, 12 + la x4, p384_p + bn.lid x2++, 0(x4) + bn.lid x2++, 32(x4) + + /* Load values into WDRs */ + + /* [w20,w19,w18] <= dmem[x] */ + li x3, 18 + la x4, x + bn.lid x3++, 0(x4) + bn.lid x3++, 32(x4) + bn.mov w20, w31 + + /* Reduce x mod p + [w5,w4] <= [w20,w19,w18] mod [w13,w12] = x mod p + dmem[x] <= [w31,w5,w4] = x mod p */ + jal x1, p384_reduce_p + bn.mov w4, w16 + bn.mov w5, w17 + li x3, 4 + la x4, x + bn.sid x3++, 0(x4) + bn.sid x3++, 32(x4) + li x3, 31 + bn.sid x3, 64(x4) + + /* [w20,w19,w18] <= URND = r */ + bn.wsrr w18, URND + bn.wsrr w19, URND + bn.wsrr w20, URND + + /* Reduce r mod p + [w7,w6] <= [w20,w19,w18] mod [w13,w12] = r mod p */ + jal x1, p384_reduce_p + bn.mov w6, w16 + bn.mov w7, w17 + + /* Arithmetic masking. + [w12,w11] = A <= [w5,w4] - [w7,w6] mod [w13,w12] = x - r mod p */ + + /* [w19,w18] = A1 <= [w5,w4] - [w7,w6] = x - r */ + bn.sub w18, w4, w6 + bn.subb w19, w5, w7 + + /* [w17,w16] = A2 <= [w19,w18] + [w13,w12] = A1 + p = x - r + p */ + bn.add w16, w18, w12 + bn.addc w17, w19, w13 + + /* If x >= r: [w12,w11] <= A1, else: [w12,w11] <= A2 */ + bn.sub w0, w4, w6 + bn.subb w1, w5, w7 + bn.sel w11, w16, w18, FG0.C + bn.sel w12, w17, w19, FG0.C + + /* Load domain parameter. + [w14,w13] = dmem[p384_p] */ + li x2, 13 + la x4, p384_p + bn.lid x2++, 0(x4) + bn.lid x2++, 32(x4) + + /* Move mask r to input registers. 
+ [w19,w18] <= [w7,w6] = r */ + bn.mov w18, w6 + bn.mov w19, w7 + + /* Arithmetic to boolean conversion */ + jal x1, p384_arithmetic_to_boolean_mod + + /* Unmask and compare values + after conversion */ + + /* w20 <= w20 ^ w18 = x' ^ r + w21 <= w21 ^ w19 = x' ^ r */ + bn.xor w20, w20, w18 + bn.xor w21, w21, w19 + + /* [w5,w4] <= dmem[x] = x mod p */ + li x3, 4 + la x4, x + bn.lid x3++, 0(x4) + bn.lid x3++, 32(x4) + + /* [w1,w0] <= [w5,w4] - [w21,w20] */ + bn.sub w0, w4, w20 + bn.subb w1, w5, w21 + + ecall + + +.data + +.globl x +.balign 32 +x: + .word 0xab0f7698 + .word 0xc85b787e + .word 0x9d9c9644 + .word 0x9f740ded + .word 0xa1b6fca8 + .word 0x8cd4a7b3 + .word 0x9f7fdc63 + .word 0x74013528 + .word 0x2ab77ca0 + .word 0x8031ceb8 + .word 0xff3e1afa + .word 0x353ec814 + .word 0x22fe027b + .word 0x8a29dc16 + .word 0xf7109d54 + .word 0x762c5d06 diff --git a/sw/otbn/crypto/tests/p384_arithmetic_to_boolean_test.exp b/sw/otbn/crypto/tests/p384_arithmetic_to_boolean_test.exp new file mode 100644 index 0000000000000..879f5d55ea82c --- /dev/null +++ b/sw/otbn/crypto/tests/p384_arithmetic_to_boolean_test.exp @@ -0,0 +1,3 @@ +# Expected values: +w0 = 0x0000000000000000000000000000000000000000000000000000000000000000 +w1 = 0x0000000000000000000000000000000000000000000000000000000000000000 diff --git a/sw/otbn/crypto/tests/p384_arithmetic_to_boolean_test.s b/sw/otbn/crypto/tests/p384_arithmetic_to_boolean_test.s new file mode 100644 index 0000000000000..f7dac6a265187 --- /dev/null +++ b/sw/otbn/crypto/tests/p384_arithmetic_to_boolean_test.s @@ -0,0 +1,99 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/** + * Standalone 385-bit arithmetic-to-boolean masking test + * + * Uses OTBN ECC P-384 lib to perform arithmetic-to-boolean conversion of + * a given masked 385-bit value with a random mask.
Afterwards it unmasks the + * result and compares it with the initial value from DMEM. + */ + +.section .text.start + +p384_arithmetic_to_boolean_test: + + /* init all-zero register */ + bn.xor w31, w31, w31 + + /* Load values into WDRs */ + + /* w11 <= dmem[x_l] */ + li x3, 11 + la x4, x_l + bn.lid x3, 0(x4) + + /* w12 <= dmem[x_u] */ + li x3, 12 + la x4, x_u + bn.lid x3, 0(x4) + + /* w18 <= URND + w19 <= URND (129 bits) */ + bn.wsrr w18, URND + bn.wsrr w19, URND + bn.rshi w19, w31, w19 >> 127 + + /* Arithmetic masking */ + + /* [w12,w11] = A <= [w12,w11] - [w19,w18] mod 2^385 = x - r mod 2^385 + This may result in bits above 2^385, but these will be stripped off (-> mod 2^385). */ + bn.sub w11, w11, w18 + bn.subb w12, w12, w19 + bn.rshi w12, w12, w31 >> 129 + bn.rshi w12, w31, w12 >> 127 + + /* Arithmetic to boolean conversion */ + jal x1, p384_arithmetic_to_boolean + + /* Unmask and compare values + after conversion */ + + /* w20 <= w20 ^ w18 = x' ^ r + w21 <= w21 ^ w19 = x' ^ r */ + bn.xor w20, w20, w18 + bn.xor w21, w21, w19 + + /* w11 <= dmem[x_l] */ + li x3, 11 + la x4, x_l + bn.lid x3, 0(x4) + + /* w12 <= dmem[x_u] */ + li x3, 12 + la x4, x_u + bn.lid x3, 0(x4) + + /* [w1,w0] <= [w12,w11] - [w21,w20] */ + bn.sub w0, w11, w20 + bn.subb w1, w12, w21 + + ecall + + +.data + +.globl x_u +.balign 32 +x_u: + .word 0xab0f7698 + .word 0xc85b787e + .word 0x9d9c9644 + .word 0x9f740ded + .word 0x00000001 + .word 0x00000000 + .word 0x00000000 + .word 0x00000000 + +.globl x_l +.balign 32 +x_l: + .word 0x2ab77ca0 + .word 0x8031ceb8 + .word 0xff3e1afa + .word 0x353ec814 + .word 0x22fe027b + .word 0x8a29dc16 + .word 0xf7109d54 + .word 0x762c5d06 diff --git a/sw/otbn/crypto/tests/p384_base_mult_test.s b/sw/otbn/crypto/tests/p384_base_mult_test.s index 682f599f28268..7a314a558ffb0 100644 --- a/sw/otbn/crypto/tests/p384_base_mult_test.s +++ b/sw/otbn/crypto/tests/p384_base_mult_test.s @@ -16,14 +16,14 @@ p384_base_mult_test: - /* set dmem pointer to point to scalar (private 
key) d */ - la x2, scalar - la x3, dptr_d + /* set dmem pointer to point to 1st scalar share d0 (private key) */ + la x2, d0 + la x3, dptr_d0 sw x2, 0(x3) - /* set dmem pointer to point to blinding parameter */ - la x2, blinding_param - la x3, dptr_rnd + /* set dmem pointer to point to 2nd scalar share d1 (private key) */ + la x2, d1 + la x3, dptr_d1 sw x2, 0(x3) /* set dmem pointer to point to x-coordinate */ @@ -53,7 +53,43 @@ p384_base_mult_test: .section .data -/* scalar d */ +/* 1st scalar share d0 (448-bit) */ +d0: + .word 0x5c832a51 + .word 0x3eb17c27 + .word 0x9a0c1b76 + .word 0x6e001281 + .word 0x4de8344e + .word 0x5b7d3b0f + .word 0x96d2f9e0 + .word 0x1e9d19e7 + .word 0x16f5c1ee + .word 0x800a4c94 + .word 0xe14cd8df + .word 0xadb9ce1b + .word 0x8677a5f2 + .word 0x32f9e2b0 + .zero 8 + +/* 2nd scalar share d1 (448-bit) */ +d1: + .word 0x33eae098 + .word 0xd31b18d5 + .word 0x507568cd + .word 0xab8fb14d + .word 0x9ef51898 + .word 0x44676e61 + .word 0x9cb814d9 + .word 0x4ad22b6e + .word 0x8930f243 + .word 0xb706d682 + .word 0xa9da1611 + .word 0x13e7014a + .word 0x9ec9b430 + .word 0x9e5dc598 + .zero 8 + +/* scalar d = (d0 + d1) mod n (384-bit) */ scalar: .word 0xe8791ba3 .word 0xf549e1f7 @@ -69,22 +105,6 @@ scalar: .word 0xc1a0cf66 .zero 16 - /* blinding parameter rnd */ - blinding_param: - .word 0xa82c85b0 - .word 0x163ce1c8 - .word 0x32518fd7 - .word 0xf8a428cd - .word 0xf5b9d867 - .word 0x00906f5f - .word 0x7387b4f2 - .word 0xa2d3da7a - .word 0xebe0a647 - .word 0xfb2ef7ca - .word 0x74249432 - .word 0x230e5ff6 - .zero 16 - /* result buffer x-coordinate */ p1_x: .zero 64 diff --git a/sw/otbn/crypto/tests/p384_curve_point_valid_test.exp b/sw/otbn/crypto/tests/p384_curve_point_valid_test.exp new file mode 100644 index 0000000000000..cb88c083b8417 --- /dev/null +++ b/sw/otbn/crypto/tests/p384_curve_point_valid_test.exp @@ -0,0 +1,2 @@ +# This test doesn't require expected WDR values, +# it just needs to complete without fault. 
diff --git a/sw/otbn/crypto/tests/p384_curve_point_valid_test.s b/sw/otbn/crypto/tests/p384_curve_point_valid_test.s new file mode 100644 index 0000000000000..dca9ed3ec4a20 --- /dev/null +++ b/sw/otbn/crypto/tests/p384_curve_point_valid_test.s @@ -0,0 +1,88 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/** + * Standalone elliptic curve P-384 curve point validity test + * + * Uses OTBN ECC P-384 lib to run the curve point validity check on an + * example curve point. Both coordinates of the curve point are contained + * in the .data section below. + * + * The test stores the addresses of the x and y coordinates in dptr_x and + * dptr_y and then calls p384_curve_point_valid. + * + * No expected register values are checked for this test (see the .exp + * file); it only needs to run through to the final ecall without + * triggering a fault. + */ + +.section .text.start + +p384_curve_point_valid_test: + /* Set pointer to x coordinate */ + la x3, dptr_x + la x4, x + sw x4, 0(x3) + + /* Set pointer to y coordinate */ + la x3, dptr_y + la x4, y + sw x4, 0(x3) + + /* Init all-zero register. */ + bn.xor w31, w31, w31 + + jal x1, p384_curve_point_valid + + ecall + +.data + +/* pointer to x-coordinate (dptr_x) */ +.globl dptr_x +.balign 4 +dptr_x: + .zero 4 + +/* pointer to y-coordinate (dptr_y) */ +.globl dptr_y +.balign 4 +dptr_y: + .zero 4 + +/* Curve point x-coordinate. */ +.globl x +.balign 32 +x: + .word 0x4877f3d1 + .word 0x7b829460 + .word 0xb1cac609 + .word 0x5869de54 + .word 0xee0e2beb + .word 0x6c30f2d8 + .word 0x47e80661 + .word 0x394d8b70 + .word 0xcf60d89e + .word 0x1a9ea916 + .word 0xb439d701 + .word 0xca230836 + .zero 16 + +/* Curve point y-coordinate.
*/ +.globl y +.balign 32 +y: + .word 0xc181f90f + .word 0xc31ef079 + .word 0xbf3aff6e + .word 0xc7e55880 + .word 0xec18818c + .word 0xcea028a9 + .word 0x928c3e92 + .word 0x82b63bf3 + .word 0xd65e905d + .word 0x68eef2d1 + .word 0x03afe2c2 + .word 0xaaafcad2 + .zero 16 diff --git a/sw/otbn/crypto/tests/p384_ecdh_shared_key_test.exp b/sw/otbn/crypto/tests/p384_ecdh_shared_key_test.exp new file mode 100644 index 0000000000000..88b391064b20c --- /dev/null +++ b/sw/otbn/crypto/tests/p384_ecdh_shared_key_test.exp @@ -0,0 +1,4 @@ +# Expected values: +# [w1, w0] is unmasked shared key +w0 = 0x6c5d59dbafa8ecbaf0b2d3c1e818325403634e3b86956e6ead6739217b702c4a +w1 = 0x00000000000000000000000000000000d177aa22a7c535a28cae00d420c4cd27 diff --git a/sw/otbn/crypto/tests/p384_ecdh_shared_key_test.s b/sw/otbn/crypto/tests/p384_ecdh_shared_key_test.s new file mode 100644 index 0000000000000..b950b76b620d1 --- /dev/null +++ b/sw/otbn/crypto/tests/p384_ecdh_shared_key_test.s @@ -0,0 +1,165 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/** + * Standalone elliptic curve P-384 ECDH shared key generation test + * + * Uses OTBN ECC P-384 lib to perform a scalar multiplication with a valid + * example curve point and an example scalar. Both scalar and coordinates of + * the curve point are contained in the .data section below. + * The x coordinate of the resulting curve point is masked arithmetically + * with a random value. As the x coorodinate represents the actual + * shared key, the x coordinate and its mask are then converted from an + * arithmetic to a boolean masking scheme. + * + * The result of boolean unmasking is then compared with the expected shared + * key value. 
+ */ + +.section .text.start + +p384_ecdh_shared_key_test: + /* init all-zero register */ + bn.xor w31, w31, w31 + + /* set dmem pointer to point to x-coordinate */ + la x2, p1_x + la x3, dptr_x + sw x2, 0(x3) + + /* set dmem pointer to point to y-coordinate */ + la x2, p1_y + la x3, dptr_y + sw x2, 0(x3) + + /* set dmem pointer to point to 1st scalar share k0 */ + la x2, k0 + la x3, dptr_k0 + sw x2, 0(x3) + + /* set dmem pointer to point to 2nd scalar share k1 */ + la x2, k1 + la x3, dptr_k1 + sw x2, 0(x3) + + /* call scalar point multiplication routine in P-384 lib */ + jal x1, p384_scalar_mult + + /* load result to WDRs for unmasking and comparison with reference + [w12,w11] <= dmem[p1_x] = x_m + [w19,w18] <= dmem[p1_y] = m */ + li x2, 11 + la x3, p1_x + bn.lid x2++, 0(x3) + bn.lid x2++, 32(x3) + li x2, 18 + la x3, p1_y + bn.lid x2++, 0(x3) + bn.lid x2, 32(x3) + + /* Load domain parameter. + [w14,w13] = dmem[p384_p] */ + li x2, 13 + la x4, p384_p + bn.lid x2++, 0(x4) + bn.lid x2++, 32(x4) + + /* Arithmetic to boolean conversion */ + jal x1, p384_arithmetic_to_boolean_mod + + /* Boolean unmasking of result value + [w21,w20] <= [w21,w20] ^ [w19,w18] */ + bn.xor w0, w20, w18 + bn.xor w1, w21, w19 + + ecall + + +.data + +.balign 32 + +/* point 1 x-cooridante p1_x */ +p1_x: + .word 0x1a11808b + .word 0x02e3d5a9 + .word 0x440d8db6 + .word 0x5ef02be3 + .word 0x2a35de10 + .word 0xdbdb132e + .word 0xf84e7899 + .word 0x7dff4c2b + .word 0x24705317 + .word 0x30eda4ab + .word 0xb44ba799 + .word 0x3af8f1c5 + .zero 16 + +/* point 1 y-cooridante p1_y*/ +p1_y: + .word 0xa9f8b96e + .word 0x82f268be + .word 0x8e51c662 + .word 0x92b9c4bb + .word 0x757d4493 + .word 0x26b4d3c6 + .word 0xf491007e + .word 0x92a5c72a + .word 0x8d8d8641 + .word 0x87498a20 + .word 0x0fe7dbde + .word 0x841e4949 + .zero 16 + +/* 1st scalar share k0 (448-bit) */ +k0: + .word 0x5c832a51 + .word 0x3eb17c27 + .word 0x9a0c1b76 + .word 0x6e001281 + .word 0x4de8344e + .word 0x5b7d3b0f + .word 0x96d2f9e0 + .word 
0x1e9d19e7 + .word 0x16f5c1ee + .word 0x800a4c94 + .word 0xe14cd8df + .word 0xadb9ce1b + .word 0x8677a5f2 + .word 0x32f9e2b0 + .zero 8 + +/* 2nd scalar share k1 (448-bit) */ +k1: + .word 0x33eae098 + .word 0xd31b18d5 + .word 0x507568cd + .word 0xab8fb14d + .word 0x9ef51898 + .word 0x44676e61 + .word 0x9cb814d9 + .word 0x4ad22b6e + .word 0x8930f243 + .word 0xb706d682 + .word 0xa9da1611 + .word 0x13e7014a + .word 0x9ec9b430 + .word 0x9e5dc598 + .zero 8 + +/* scalar k = (k0 + k1) mod n (384-bit)*/ +scalar: + .word 0xe8791ba3 + .word 0xf549e1f7 + .word 0x893be358 + .word 0x100794fe + .word 0xbc9db95d + .word 0xfd7ed624 + .word 0xc60ebab6 + .word 0x97ba9586 + .word 0xa026b431 + .word 0x37112316 + .word 0x8b26eef1 + .word 0xc1a0cf66 + .zero 16 diff --git a/sw/otbn/crypto/tests/p384_ecdsa_sign_test.s b/sw/otbn/crypto/tests/p384_ecdsa_sign_test.s index 04d80611a2b21..9b9df3359f897 100644 --- a/sw/otbn/crypto/tests/p384_ecdsa_sign_test.s +++ b/sw/otbn/crypto/tests/p384_ecdsa_sign_test.s @@ -15,29 +15,29 @@ p384_ecdsa_sign_test: - /* set dmem pointer to nonce k */ - la x2, nonce_k - la x3, dptr_k + /* set dmem pointer to point to 1st scalar share k0 */ + la x2, k0 + la x3, dptr_k0 sw x2, 0(x3) - /* set dmem pointer to point to blinding parameter */ - la x2, blinding_param - la x3, dptr_rnd + /* set dmem pointer to point to 2nd scalar share k1 */ + la x2, k1 + la x3, dptr_k1 sw x2, 0(x3) - /* set dmem pointer to point to message */ - la x2, msg - la x3, dptr_msg + /* set dmem pointer to point to 1st scalar share d0 (private key) */ + la x2, d0 + la x3, dptr_d0 sw x2, 0(x3) - /* set dmem pointer to point to nonce k */ - la x2, nonce_k - la x3, dptr_k + /* set dmem pointer to point to 2nd scalar share d1 (private key) */ + la x2, d1 + la x3, dptr_d1 sw x2, 0(x3) - /* set dmem pointer to point to private key d */ - la x2, priv_key_d - la x3, dptr_d + /* set dmem pointer to point to message */ + la x2, msg + la x3, dptr_msg sw x2, 0(x3) /* set dmem pointer to point to signature 
*/ @@ -66,7 +66,43 @@ p384_ecdsa_sign_test: .data -/* nonce k */ +/* 1st scalar share k0 (448-bit) */ +k0: + .word 0x5c832a51 + .word 0x3eb17c27 + .word 0x9a0c1b76 + .word 0x6e001281 + .word 0x4de8344e + .word 0x5b7d3b0f + .word 0x96d2f9e0 + .word 0x1e9d19e7 + .word 0x16f5c1ee + .word 0x800a4c94 + .word 0xe14cd8df + .word 0xadb9ce1b + .word 0x8677a5f2 + .word 0x32f9e2b0 + .zero 8 + +/* 2nd scalar share k1 (448-bit) */ +k1: + .word 0xe50b5e8e + .word 0x776ad076 + .word 0x60d31f0e + .word 0x3521b5e8 + .word 0x7bf0f8d5 + .word 0xe08231d6 + .word 0x7042f3bb + .word 0x4cb12f81 + .word 0x82a3d7ab + .word 0x198f4d05 + .word 0xb84cc0ba + .word 0xebdfcb7d + .word 0x9ec9b42f + .word 0x9e5dc598 + .zero 8 + +/* nonce k = k0 + k1 mod n (n: curve order) */ nonce_k: .word 0x99999999 .word 0x99999999 @@ -82,20 +118,56 @@ nonce_k: .word 0x99999999 .zero 16 -/* blinding parameter rnd */ - blinding_param: - .word 0xa82c85b0 - .word 0x163ce1c8 - .word 0x32518fd7 - .word 0xf8a428cd - .word 0xf5b9d867 - .word 0x00906f5f - .word 0x7387b4f2 - .word 0xa2d3da7a - .word 0xebe0a647 - .word 0xfb2ef7ca - .word 0x74249432 - .word 0x230e5ff6 +/* 1st private key share d0 (448-bit) */ +d0: + .word 0x5c832a51 + .word 0x3eb17c27 + .word 0x9a0c1b76 + .word 0x6e001281 + .word 0x4de8344e + .word 0x5b7d3b0f + .word 0x96d2f9e0 + .word 0x1e9d19e7 + .word 0x16f5c1ee + .word 0x800a4c94 + .word 0xe14cd8df + .word 0xadb9ce1b + .word 0x8677a5f2 + .word 0x32f9e2b0 + .zero 8 + +/* 2nd private key share d1 (448-bit) */ +d1: + .word 0x33eae098 + .word 0xd31b18d5 + .word 0x507568cd + .word 0xab8fb14d + .word 0x9ef51898 + .word 0x44676e61 + .word 0x9cb814d9 + .word 0x4ad22b6e + .word 0x8930f243 + .word 0xb706d682 + .word 0xa9da1611 + .word 0x13e7014a + .word 0x9ec9b430 + .word 0x9e5dc598 + .zero 8 + +/* private key d = d0 + d1 mod n (n: curve order) */ +priv_key_d: + .word 0xe8791ba3 + .word 0xf549e1f7 + .word 0x893be358 + .word 0x100794fe + .word 0xbc9db95d + .word 0xfd7ed624 + .word 0xc60ebab6 + .word 0x97ba9586 + 
.word 0xa026b431 + .word 0x37112316 + .word 0x8b26eef1 + .word 0xc1a0cf66 .zero 16 /* message */ @@ -114,22 +186,6 @@ msg: .word 0x55555555 .zero 16 -/* private key d */ -priv_key_d: - .word 0xe8791ba3 - .word 0xf549e1f7 - .word 0x893be358 - .word 0x100794fe - .word 0xbc9db95d - .word 0xfd7ed624 - .word 0xc60ebab6 - .word 0x97ba9586 - .word 0xa026b431 - .word 0x37112316 - .word 0x8b26eef1 - .word 0xc1a0cf66 - .zero 16 - /* signature R */ sig_r: .zero 64 diff --git a/sw/otbn/crypto/tests/p384_isoncurve_test.s b/sw/otbn/crypto/tests/p384_isoncurve_test.s index 3b6e7f919cce6..198d2f0f30eb6 100644 --- a/sw/otbn/crypto/tests/p384_isoncurve_test.s +++ b/sw/otbn/crypto/tests/p384_isoncurve_test.s @@ -15,18 +15,18 @@ p384_oncurve_test: /* set dmem to result */ - la x2, res_r - la x3, dptr_r + la x2, rhs + la x3, dptr_rhs sw x2, 0(x3) - la x2, res_l - la x3, dptr_s + la x2, lhs + la x3, dptr_lhs sw x2, 0(x3) /* set dmem pointer to point to cuve point */ - la x2, point_x + la x2, x la x3, dptr_x sw x2, 0(x3) - la x2, point_y + la x2, y la x3, dptr_y sw x2, 0(x3) @@ -35,10 +35,10 @@ p384_oncurve_test: /* load result to WDRs for comparison with reference */ li x2, 0 - la x3, res_r + la x3, rhs bn.lid x2++, 0(x3) bn.lid x2++, 32(x3) - la x3, res_l + la x3, lhs bn.lid x2++, 0(x3) bn.lid x2++, 32(x3) @@ -48,15 +48,15 @@ p384_oncurve_test: .data /* buffer for right side result of Weierstrass equation */ -res_r: +rhs: .zero 64 /* buffer for left side result of Weierstrass equation */ -res_l: +lhs: .zero 64 /* point affine x-coordinate */ -point_x: +x: .word 0x4877f3d1 .word 0x7b829460 .word 0xb1cac609 @@ -72,7 +72,7 @@ point_x: .zero 16 /* point affine y-coordinate */ -point_y: +y: .word 0xc181f90f .word 0xc31ef079 .word 0xbf3aff6e diff --git a/sw/otbn/crypto/tests/p384_keygen_test.exp b/sw/otbn/crypto/tests/p384_keygen_test.exp new file mode 100644 index 0000000000000..879f5d55ea82c --- /dev/null +++ b/sw/otbn/crypto/tests/p384_keygen_test.exp @@ -0,0 +1,3 @@ +# Expected values: 
+w0 = 0x0000000000000000000000000000000000000000000000000000000000000000 +w1 = 0x0000000000000000000000000000000000000000000000000000000000000000 diff --git a/sw/otbn/crypto/tests/p384_keygen_test.s b/sw/otbn/crypto/tests/p384_keygen_test.s new file mode 100644 index 0000000000000..0d937e1a4f083 --- /dev/null +++ b/sw/otbn/crypto/tests/p384_keygen_test.s @@ -0,0 +1,362 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/** + * Standalone test for P-384 key/scalar generation + * + * Performs generation of a P-384 random secret key and scalar. + * + * This test does not test if the randomness of the generated values is + * properly distributed or if the entropy is large enough etc. + * It only checks if a few generated values are distinct and if the + * associated shares don't add up to zero (mod n). + * + * Actual randomness testing has to be done vial statistical analysis + * of generated values, but this is not possible for simulator based + * automated testing. + */ + +.section .text.start + +p384_keygen_test: + + /* Init all-zero register. 
*/ + bn.xor w31, w31, w31 + + /* set dmem pointer to point to 1st scalar share k0 */ + la x2, k0 + la x3, dptr_k0 + sw x2, 0(x3) + + /* set dmem pointer to point to 2nd scalar share k1 */ + la x2, k1 + la x3, dptr_k1 + sw x2, 0(x3) + + /* set dmem pointer to point to 1st scalar share d0 (private key) */ + la x2, d0 + la x3, dptr_d0 + sw x2, 0(x3) + + /* set dmem pointer to point to 2nd scalar share d1 (private key) */ + la x2, d1 + la x3, dptr_d1 + sw x2, 0(x3) + + /* generate 4 random 448-bit values and write them to d0, d1, k0, k1 */ + jal x1, p384_generate_random_key + jal x1, p384_generate_k + + /* load generated values into WDRs for range/distinctiveness check */ + li x2, 4 + + /* [w5,w4] <= d0 */ + la x3, dptr_d0 + lw x4, 0(x3) + bn.lid x2++, 0(x4) + bn.lid x2++, 32(x4) + + /* [w7,w6] <= d1 */ + la x3, dptr_d1 + lw x4, 0(x3) + bn.lid x2++, 0(x4) + bn.lid x2++, 32(x4) + + /* [w9,w8] <= k0 */ + la x3, dptr_k0 + lw x4, 0(x3) + bn.lid x2++, 0(x4) + bn.lid x2++, 32(x4) + + /* [w11,w10] <= k1 */ + la x3, dptr_k1 + lw x4, 0(x3) + bn.lid x2++, 0(x4) + bn.lid x2++, 32(x4) + + /* Load the curve order n. + [w13,w12] <= dmem[p384_n] = n */ + li x2, 12 + la x3, p384_n + bn.lid x2++, 0(x3) + bn.lid x2++, 32(x3) + + /* Compute Solinas constant k for modulus n (we know it is only 191 bits, so + no need to compute the high part): + w14 <= 2^256 - n[255:0] = (2^384 - n) mod (2^256) = 2^384 - n */ + bn.sub w14, w31, w12 + + /* initialize w0 <= 0 and w1 <= 0 */ + bn.mov w0, w31 + bn.mov w1, w31 + + /* Check if modular addition of shares d0 and d1, as well as k0 and k1 is non-zero. */ + + /* [w17,w16] <= d = [w5,w4] + [w7,w6] mod n = d0 + d1 mod n */ + bn.add w18, w4, w6 + bn.addc w19, w5, w7 + bn.mov w20, w31 + jal x1, p384_reduce_n + + /* Compare w16 to 0. */ + bn.cmp w16, w31 + + /* Read the FG0.Z flag (position 3). + x2 <= 8 if FG0.Z else 0 */ + csrrw x2, FG0, x0 + andi x2, x2, 8 + + /* Compare w17 to 0. */ + bn.cmp w17, w31 + + /* Read the FG0.Z flag (position 3). 
+ x3 <= 8 if FG0.Z else 0 */ + csrrw x3, FG0, x0 + andi x3, x3, 8 + + /* Check if both registers w16 and w17 are equal to 0. + x2 AND x3 == 0 <=> [w17,w16] != 0, x2 AND x3 != 0 <=> [w17,w16] == 0 */ + and x2, x2, x3 + + /* If x2 != 0: w0 <= w0 + 1, else: w0 <= w0 + 0 */ + beq x2, x0, keep_w0_1 + bn.addi w0, w0, 1 + keep_w0_1: + + /* [w17,w16] <= k = [w9,w8] + [w11,w10] mod n = k0 + k1 mod n */ + bn.add w18, w8, w10 + bn.addc w19, w9, w11 + bn.mov w20, w31 + jal x1, p384_reduce_n + + /* Compare w16 to 0. */ + bn.cmp w16, w31 + + /* Read the FG0.Z flag (position 3). + x2 <= 8 if FG0.Z else 0 */ + csrrw x2, FG0, x0 + andi x2, x2, 8 + + /* Compare w17 to 0. */ + bn.cmp w17, w31 + + /* Read the FG0.Z flag (position 3). + x3 <= 8 if FG0.Z else 0 */ + csrrw x3, FG0, x0 + andi x3, x3, 8 + + /* Check if both registers w16 and w17 are equal to 0. + x2 AND x3 == 0 <=> [w17,w16] != 0, x2 AND x3 != 0 <=> [w17,w16] == 0 */ + and x2, x2, x3 + + /* If x2 != 0: w0 <= w0 + 1, else: w0 <= w0 + 0 */ + beq x2, x0, keep_w0_2 + bn.addi w0, w0, 1 + keep_w0_2: + + /* Compare the values and check if they are distinct to each other. + If one value pair is equal, then the zero flag will be set. + In case of an equal pair w1 > 0, otherwise w1 == 0. */ + + /* [w21,w20] <= [w5,w4] - [w7,w6] = d0 - d1 + if d0 - d1 == 0: w1 <= w1 + w3 = w1 + 1, else: w1 <= w1 + w31 = w1 + 0 */ + bn.sub w20, w4, w6 + bn.subb w21, w5, w7 + + /* Compare w20 to 0. */ + bn.cmp w20, w31 + + /* Read the FG0.Z flag (position 3). + x2 <= 8 if FG0.Z else 0 */ + csrrw x2, FG0, x0 + andi x2, x2, 8 + + /* Compare w21 to 0. */ + bn.cmp w21, w31 + + /* Read the FG0.Z flag (position 3). + x3 <= 8 if FG0.Z else 0 */ + csrrw x3, FG0, x0 + andi x3, x3, 8 + + /* Check if both registers w20 and w21 are equal to 0.
+ x2 AND x3 == 0 <=> [w21,w20] != 0, x2 AND x3 != 0 <=> [w21,w20] == 0 */ + and x2, x2, x3 + + /* If x2 != 0: w1 <= w1 + 1, else: w1 <= w1 + 0 */ + beq x2, x0, keep_w1_1 + bn.addi w1, w1, 1 + keep_w1_1: + + /* [w21,w20] <= [w5,w4] - [w9,w8] = d0 - k0 + if d0 - k0 == 0: w1 <= w1 + w3 = w1 + 1, else: w1 <= w1 + w31 = w1 + 0 */ + bn.sub w20, w4, w8 + bn.subb w21, w5, w9 + + /* Compare w20 to 0. */ + bn.cmp w20, w31 + + /* Read the FG0.Z flag (position 3). + x2 <= 8 if FG0.Z else 0 */ + csrrw x2, FG0, x0 + andi x2, x2, 8 + + /* Compare w21 to 0. */ + bn.cmp w21, w31 + + /* Read the FG0.Z flag (position 3). + x3 <= 8 if FG0.Z else 0 */ + csrrw x3, FG0, x0 + andi x3, x3, 8 + + /* Check if both registers w20 and w21 are equal to 0. + x2 AND x3 == 0 <=> [w21,w20] != 0, x2 AND x3 != 0 <=> [w21,w20] == 0 */ + and x2, x2, x3 + + /* If x2 != 0: w1 <= w1 + 1, else: w1 <= w1 + 0 */ + beq x2, x0, keep_w1_2 + bn.addi w1, w1, 1 + keep_w1_2: + + /* [w21,w20] <= [w5,w4] - [w11,w10] = d0 - k1 + if d0 - k1 == 0: w1 <= w1 + w3 = w1 + 1, else: w1 <= w1 + w31 = w1 + 0 */ + bn.sub w20, w4, w10 + bn.subb w21, w5, w11 + + /* Compare w20 to 0. */ + bn.cmp w20, w31 + + /* Read the FG0.Z flag (position 3). + x2 <= 8 if FG0.Z else 0 */ + csrrw x2, FG0, x0 + andi x2, x2, 8 + + /* Compare w21 to 0. */ + bn.cmp w21, w31 + + /* Read the FG0.Z flag (position 3). + x3 <= 8 if FG0.Z else 0 */ + csrrw x3, FG0, x0 + andi x3, x3, 8 + + /* Check if both registers w20 and w21 are equal to 0. + x2 AND x3 == 0 <=> [w21,w20] != 0, x2 AND x3 != 0 <=> [w21,w20] == 0 */ + and x2, x2, x3 + + /* If x2 != 0: w1 <= w1 + 1, else: w1 <= w1 + 0 */ + beq x2, x0, keep_w1_3 + bn.addi w1, w1, 1 + keep_w1_3: + + /* [w21,w20] <= [w7,w6] - [w9,w8] = d1 - k0 + if d1 - k0 == 0: w1 <= w1 + w3 = w1 + 1, else: w1 <= w1 + w31 = w1 + 0 */ + bn.sub w20, w6, w8 + bn.subb w21, w7, w9 + + /* Compare w20 to 0. */ + bn.cmp w20, w31 + + /* Read the FG0.Z flag (position 3). 
+ x2 <= 8 if FG0.Z else 0 */ + csrrw x2, FG0, x0 + andi x2, x2, 8 + + /* Compare w21 to 0. */ + bn.cmp w21, w31 + + /* Read the FG0.Z flag (position 3). + x3 <= 8 if FG0.Z else 0 */ + csrrw x3, FG0, x0 + andi x3, x3, 8 + + /* Check if both registers w20 and w21 are equal to 0. + x2 AND x3 == 0 <=> [w21,w20] != 0, x2 AND x3 != 0 <=> [w21,w20] == 0 */ + and x2, x2, x3 + + /* If x2 != 0: w1 <= w1 + 1, else: w1 <= w1 + 0 */ + beq x2, x0, keep_w1_4 + bn.addi w1, w1, 1 + keep_w1_4: + + /* [w21,w20] <= [w7,w6] - [w11,w10] = d1 - k1 + if d1 - k1 == 0: w1 <= w1 + w3 = w1 + 1, else: w1 <= w1 + w31 = w1 + 0 */ + bn.sub w20, w6, w10 + bn.subb w21, w7, w11 + + /* Compare w20 to 0. */ + bn.cmp w20, w31 + + /* Read the FG0.Z flag (position 3). + x2 <= 8 if FG0.Z else 0 */ + csrrw x2, FG0, x0 + andi x2, x2, 8 + + /* Compare w21 to 0. */ + bn.cmp w21, w31 + + /* Read the FG0.Z flag (position 3). + x3 <= 8 if FG0.Z else 0 */ + csrrw x3, FG0, x0 + andi x3, x3, 8 + + /* Check if both registers w20 and w21 are equal to 0. + x2 AND x3 == 0 <=> [w21,w20] != 0, x2 AND x3 != 0 <=> [w21,w20] == 0 */ + and x2, x2, x3 + + /* If x2 != 0: w1 <= w1 + 1, else: w1 <= w1 + 0 */ + beq x2, x0, keep_w1_5 + bn.addi w1, w1, 1 + keep_w1_5: + + /* [w21,w20] <= [w9,w8] - [w11,w10] = k0 - k1 + if k0 - k1 == 0: w1 <= w1 + w3 = w1 + 1, else: w1 <= w1 + w31 = w1 + 0 */ + bn.sub w20, w8, w10 + bn.subb w21, w9, w11 + + /* Compare w20 to 0. */ + bn.cmp w20, w31 + + /* Read the FG0.Z flag (position 3). + x2 <= 8 if FG0.Z else 0 */ + csrrw x2, FG0, x0 + andi x2, x2, 8 + + /* Compare w21 to 0. */ + bn.cmp w21, w31 + + /* Read the FG0.Z flag (position 3). + x3 <= 8 if FG0.Z else 0 */ + csrrw x3, FG0, x0 + andi x3, x3, 8 + + /* Check if both registers w20 and w21 are equal to 0. 
+ x2 AND x3 == 0 <=> [w21,w20] != 0, x2 AND x3 != 0 <=> [w21,w20] == 0 */ + and x2, x2, x3 + + /* If x2 != 0: w1 <= w1 + 1, else: w1 <= w1 + 0 */ + beq x2, x0, keep_w1_6 + bn.addi w1, w1, 1 + keep_w1_6: + + ecall + +.section .data + +.balign 32 + +/* 1st private key share d0 (448-bit) */ +d0: + .zero 64 + +/* 2nd private key share d1 (448-bit) */ +d1: + .zero 64 + +/* 1st scalar share k0 (448-bit) */ +k0: + .zero 64 + +/* 2nd scalar share k1 (448-bit) */ +k1: + .zero 64 diff --git a/sw/otbn/crypto/tests/p384_mulmod448x128_test.exp b/sw/otbn/crypto/tests/p384_mulmod448x128_test.exp new file mode 100644 index 0000000000000..521e386310259 --- /dev/null +++ b/sw/otbn/crypto/tests/p384_mulmod448x128_test.exp @@ -0,0 +1,3 @@ +# Expected values (result of modular multiplication) +w0 = 0xb1c0a5d4079771ccbf1e21a89602d8636f771e8cdc5e0e904c5152463aa12b0d +w1 = 0x00000000000000000000000000000000872ed6de74b2c551e0d591aa03cd081d diff --git a/sw/otbn/crypto/tests/p384_mulmod448x128_test.s b/sw/otbn/crypto/tests/p384_mulmod448x128_test.s new file mode 100644 index 0000000000000..9b6502cba626e --- /dev/null +++ b/sw/otbn/crypto/tests/p384_mulmod448x128_test.s @@ -0,0 +1,79 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/** + * Standalone test for P-384 448x128-bit modular multiplication + * + * Computes a * b mod n (n: order of the P-384 base point) for the operands + * a and b contained in the .data section. + * + * See the accompanying .exp file for the expected value of the result.
+ */ + +.section .text.start + +p384_mulmod448x128_test: + + /* init all-zero reg */ + bn.xor w31, w31, w31 + + /* load multiplication input into WDRs + [w11,w10] <= a + w16 <= b */ + li x2, 10 + la x3, a + bn.lid x2++, 0(x3) + bn.lid x2, 32(x3) + li x2, 16 + la x3, b + bn.lid x2, 0(x3) + + /* load domain parameter n (order of base point) + [w13, w12] <= n = dmem[dptr_n] */ + li x2, 12 + la x3, p384_n + bn.lid x2++, 0(x3) + bn.lid x2++, 32(x3) + + /* Compute Solinas constant k for modulus n (we know it is only 191 bits, so + no need to compute the high part): + w14 <= 2^256 - n[255:0] = (2^384 - n) mod (2^256) = 2^384 - n */ + bn.sub w14, w31, w12 + + /* Compute a * b mod n + [w17,w16] <= [w11,w10] * w16 mod [w13,w12] = a * b mod n */ + jal x1, p384_mulmod448x128_n + + /* move result to different WDRs for comparison */ + bn.mov w0, w16 + bn.mov w1, w17 + + ecall + + +.data + +a: + .word 0x5c832a51 + .word 0x3eb17c27 + .word 0x9a0c1b76 + .word 0x6e001281 + .word 0x4de8344e + .word 0x5b7d3b0f + .word 0x96d2f9e0 + .word 0x1e9d19e7 + .word 0x16f5c1ee + .word 0x800a4c94 + .word 0xe14cd8df + .word 0xadb9ce1b + .word 0x8677a5f2 + .word 0x32f9e2b0 + .zero 8 + +b: + .word 0x5c832a51 + .word 0x3eb17c27 + .word 0x9a0c1b76 + .word 0x6e001281 + .zero 48 diff --git a/sw/otbn/crypto/tests/p384_scalar_mult_test.exp b/sw/otbn/crypto/tests/p384_scalar_mult_test.exp index be9d0206a3be8..6f2f96470e94d 100644 --- a/sw/otbn/crypto/tests/p384_scalar_mult_test.exp +++ b/sw/otbn/crypto/tests/p384_scalar_mult_test.exp @@ -1,7 +1,4 @@ # Expected values (x- and y-coordinates of result): -# [w1, w0] is affine x-coordinate of resulting point, -# [w3, w2] is affine y-coordinate of resulting point. 
+# [w1, w0] is affine x-coordinate of resulting point w0 = 0x6c5d59dbafa8ecbaf0b2d3c1e818325403634e3b86956e6ead6739217b702c4a w1 = 0x00000000000000000000000000000000d177aa22a7c535a28cae00d420c4cd27 -w2 = 0x607c6c698fc5c15cbfadf94e322fa2fa5ff6cf915fe9ad62f538701f1add78ec -w3 = 0x000000000000000000000000000000009e18fa893348fb1d44f40dbedcb5e36c diff --git a/sw/otbn/crypto/tests/p384_scalar_mult_test.s b/sw/otbn/crypto/tests/p384_scalar_mult_test.s index 7fdd588fa70ba..68f70dbab95dc 100644 --- a/sw/otbn/crypto/tests/p384_scalar_mult_test.s +++ b/sw/otbn/crypto/tests/p384_scalar_mult_test.s @@ -17,6 +17,9 @@ p384_scalar_mult_test: + /* Init all-zero register. */ + bn.xor w31, w31, w31 + /* set dmem pointer to point to x-coordinate */ la x2, p1_x la x3, dptr_x @@ -27,18 +30,18 @@ p384_scalar_mult_test: la x3, dptr_y sw x2, 0(x3) - /* set dmem pointer to point to scalar k */ - la x2, scalar - la x3, dptr_k + /* set dmem pointer to point to 1st scalar share k0 */ + la x2, k0 + la x3, dptr_k0 sw x2, 0(x3) - /* set dmem pointer to point to blinding parameter */ - la x2, blinding_param - la x3, dptr_rnd + /* set dmem pointer to point to 2nd scalar share k1 */ + la x2, k1 + la x3, dptr_k1 sw x2, 0(x3) /* call scalar point multiplication routine in P-384 lib */ - jal x1, scalar_mult_p384 + jal x1, p384_scalar_mult /* load result to WDRs for comparison with reference */ li x2, 0 @@ -49,11 +52,31 @@ p384_scalar_mult_test: bn.lid x2++, 0(x3) bn.lid x2, 32(x3) + /* load domain parameter p (modulus) + [w13, w12] = p = dmem[p384_p] */ + li x2, 12 + la x3, p384_p + bn.lid x2++, 0(x3) + bn.lid x2++, 32(x3) + + /* unmask x coordinate x = x_m + m mod p = x-coord. + y-coord. 
mod p */ + bn.add w0, w0, w2 + bn.addc w1, w1, w3 + + bn.mov w18, w0 + bn.mov w19, w1 + bn.mov w20, w31 + jal x1, p384_reduce_p + bn.mov w0, w16 + bn.mov w1, w17 + ecall .section .data +.balign 32 + /* point 1 x-cooridante p1_x */ p1_x: .word 0x1a11808b @@ -86,7 +109,43 @@ p1_y: .word 0x841e4949 .zero 16 -/* scalar k */ +/* 1st scalar share k0 (448-bit) */ +k0: + .word 0x5c832a51 + .word 0x3eb17c27 + .word 0x9a0c1b76 + .word 0x6e001281 + .word 0x4de8344e + .word 0x5b7d3b0f + .word 0x96d2f9e0 + .word 0x1e9d19e7 + .word 0x16f5c1ee + .word 0x800a4c94 + .word 0xe14cd8df + .word 0xadb9ce1b + .word 0x8677a5f2 + .word 0x32f9e2b0 + .zero 8 + +/* 2nd scalar share k1 (448-bit) */ +k1: + .word 0x33eae098 + .word 0xd31b18d5 + .word 0x507568cd + .word 0xab8fb14d + .word 0x9ef51898 + .word 0x44676e61 + .word 0x9cb814d9 + .word 0x4ad22b6e + .word 0x8930f243 + .word 0xb706d682 + .word 0xa9da1611 + .word 0x13e7014a + .word 0x9ec9b430 + .word 0x9e5dc598 + .zero 8 + +/* scalar k = (k0 + k1) mod n (384-bit)*/ scalar: .word 0xe8791ba3 .word 0xf549e1f7 @@ -101,19 +160,3 @@ scalar: .word 0x8b26eef1 .word 0xc1a0cf66 .zero 16 - - /* blinding parameter rnd */ - blinding_param: - .word 0xa82c85b0 - .word 0x163ce1c8 - .word 0x32518fd7 - .word 0xf8a428cd - .word 0xf5b9d867 - .word 0x00906f5f - .word 0x7387b4f2 - .word 0xa2d3da7a - .word 0xebe0a647 - .word 0xfb2ef7ca - .word 0x74249432 - .word 0x230e5ff6 - .zero 16 diff --git a/sw/otbn/crypto/tests/primality_test.s b/sw/otbn/crypto/tests/primality_test.s index 8af539b7896d1..a174c7d5f2260 100644 --- a/sw/otbn/crypto/tests/primality_test.s +++ b/sw/otbn/crypto/tests/primality_test.s @@ -58,26 +58,26 @@ main: .data /* Candidate prime (randomly generated using pycryptodome) = -0x9ac5b6d69aa1d91c418d9bf315ba72595488aabddbd435dafe630ba818e3d4ef03ab9bf93147a781cc45f6219f8bc92fc500c92dc8539832055036f6537320a1 +0x83f4fb7ca746b70dd7e37ce93847ed7995ccf101bb7a9c628ebcffeeaa0114efd346ddfb53c1d31d51ab13bbcb0b2346d6689cd78210bfe05f458233d8e58e1b */ .balign 
32 input: -.word 0x537320a1 -.word 0x055036f6 -.word 0xc8539832 -.word 0xc500c92d -.word 0x9f8bc92f -.word 0xcc45f621 -.word 0x3147a781 -.word 0x03ab9bf9 -.word 0x18e3d4ef -.word 0xfe630ba8 -.word 0xdbd435da -.word 0x5488aabd -.word 0x15ba7259 -.word 0x418d9bf3 -.word 0x9aa1d91c -.word 0x9ac5b6d6 +.word 0xd8e58e1b +.word 0x5f458233 +.word 0x8210bfe0 +.word 0xd6689cd7 +.word 0xcb0b2346 +.word 0x51ab13bb +.word 0x53c1d31d +.word 0xd346ddfb +.word 0xaa0114ef +.word 0x8ebcffee +.word 0xbb7a9c62 +.word 0x95ccf101 +.word 0x3847ed79 +.word 0xd7e37ce9 +.word 0xa746b70d +.word 0x83f4fb7c .section .scratchpad diff --git a/sw/otbn/crypto/tests/primality_test_witness_negative_test.exp b/sw/otbn/crypto/tests/primality_test_witness_negative_test.exp index 3720ca37665bf..411aa2c8d793f 100644 --- a/sw/otbn/crypto/tests/primality_test_witness_negative_test.exp +++ b/sw/otbn/crypto/tests/primality_test_witness_negative_test.exp @@ -1,7 +1,2 @@ -# For this particular composite/witness pair, we don't hit an early-exit case; -# expect to finish the loop so w0, w1 == (b^(w-1) * R) % w -w0 = 0xb470f524ca68f2455a1a85b8dc006872131ceedf6d07883f0f010e0bd222c0e3 -w1 = 0x65a498fbb4f35d9919fea51aaf2e83256c5f624f37bfc26e63a42a3c74f15a65 - # Result from witness test: 0 (indicating "composite") w21 = 0 diff --git a/sw/otbn/crypto/tests/primality_test_witness_negative_test.s b/sw/otbn/crypto/tests/primality_test_witness_negative_test.s index 3093e9fa7abd3..e1736e4581289 100644 --- a/sw/otbn/crypto/tests/primality_test_witness_negative_test.s +++ b/sw/otbn/crypto/tests/primality_test_witness_negative_test.s @@ -30,22 +30,12 @@ main: la x18, mont_rr jal x1, test_witness - /* Load the value from the working buffer into registers. This buffer holds - the witness raised to some portion of the exponent; we can check it to - ensure that w was found to be composite at exactly the point we expected. 
- w0,w1 <= dmem[tmp:tmp+n*32] */ - li x8, 0 - la x15, tmp - loop x30, 2 - bn.lid x8, 0(x15++) - addi x8, x8, 1 - ecall .data /* Candidate prime (composite, randomly generated) = -0xf7b5cc32e3c7c3ff6f220312fe4be4af76c9e51e8c17648c863751d70359bab17c1d7b4844e01d1ec0cd695ff3bae05dc51d95a001ab7b69f66d0c056c2dec39 +0xf7b5cc32e3c7c3ff6f220312fe4be4af76c9e51e8c17648c863751d70359bab17c1d7b4844e01d1ec0cd695ff3bae05dc51d95a001ab7b69f66d0c056c2dec3b */ .balign 32 input: @@ -89,34 +79,34 @@ witness: /* Precomputed Montgomery constant m0' (256 bits). */ .balign 32 mont_m0inv: -.word 0xd0a3bdf7 -.word 0x7dde1093 -.word 0xf7fe594f -.word 0x8f66b353 -.word 0x03a1c874 -.word 0x3c4a0e42 -.word 0x0d02fb70 -.word 0x2cf2f731 +.word 0xbb5df30d +.word 0xf47b30a4 +.word 0x45c4b2af +.word 0xb6e86212 +.word 0xacafa4f9 +.word 0x6e5afd69 +.word 0x9ae7984c +.word 0xce44dadc /* Precomputed Montgomery constant RR (512 bits). */ .balign 32 mont_rr: -.word 0xd04011c2 -.word 0x8ef6bac2 -.word 0x2c87d164 -.word 0x5f60cb7a -.word 0x5e64a3f6 -.word 0xe9f883b0 -.word 0xa802122b -.word 0xf910bf58 -.word 0x94680653 -.word 0x3dadc1f1 -.word 0x4adf397f -.word 0xa87c8a2a -.word 0x0576494c -.word 0x5ce4999d -.word 0x8188e572 -.word 0x0911fc89 +.word 0xc1e31e17 +.word 0x6f9be028 +.word 0xcd184ada +.word 0xbbd4bbb9 +.word 0x10d84741 +.word 0xa11300bd +.word 0x4e5c6583 +.word 0x50805ac8 +.word 0x78f6cf41 +.word 0x163b312e +.word 0x126593d5 +.word 0x03cc62ac +.word 0x23cbc231 +.word 0xa53b2634 +.word 0x5d9d6071 +.word 0xdf10ee86 .section .scratchpad diff --git a/sw/otbn/crypto/tests/primality_test_witness_test.exp b/sw/otbn/crypto/tests/primality_test_witness_test.exp index 0be310c8ecbfa..16c7f2246c059 100644 --- a/sw/otbn/crypto/tests/primality_test_witness_test.exp +++ b/sw/otbn/crypto/tests/primality_test_witness_test.exp @@ -1,6 +1,6 @@ -# w0, w1 <= (b^(w-1) * R) % w = (1 * R) % w -w0 = 0xf156a85066fa88460f3223454c58b0b878c560be590bf156363935bed6c123f5 -w1 = 
0x72e3f25e319a983962943197fd7f7e1a7df76977ec8eb6b6e3d91b8199fb9c6c +# w0, w1 <= (b^((w-1) / 2) * R) % w = (-1 * R) % w +w0 = 0x1d52af5f320aef73e19bb975674e9e8f0e753e834de81d53938d9482527db816 +w1 = 0x1a381b439ccacf8d3ad79cd0050103cb04112d1026e29292384dc8fccc08c726 # Result from witness test: all 1s (indicating "possibly prime") w21 = 0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff diff --git a/sw/otbn/crypto/tests/rsa_1024_enc_test.s b/sw/otbn/crypto/tests/rsa_1024_enc_test.s index c9419ecd11340..b041f1da476f4 100644 --- a/sw/otbn/crypto/tests/rsa_1024_enc_test.s +++ b/sw/otbn/crypto/tests/rsa_1024_enc_test.s @@ -13,7 +13,7 @@ * .data segment in this file. * * Copies the encrypted message to wide registers for comparison (starting at - * w0). See comment at the end of the file for expected values. + * w0). */ run_rsa_1024_enc: /* Init all-zero register. */ diff --git a/sw/otbn/crypto/tests/rsa_2048_dec_test.exp b/sw/otbn/crypto/tests/rsa_2048_dec_test.exp new file mode 100644 index 0000000000000..51e2039287ca1 --- /dev/null +++ b/sw/otbn/crypto/tests/rsa_2048_dec_test.exp @@ -0,0 +1,10 @@ +# Expected value: +# 0x6add9548af50f1bea3cb921205a5bb92ee325e01d160e3738a09aa0df7050e6051d693440f0d00cdd56cee5a748ff3b48b1df7be05808ad20068ad387b8b5e4c25c79bba9f87ef971da926f644c26d4273829fd69db71f9eded2cd1a33c367578550346ada160daa272940dd6fc10dae4a0facef437ece40130301c1b847203cc0defd3620ce89d96fa21d30ee63e458b0198adc842f68af8b462df6014955ab68f663a9b5e77caf15a517ab0931308bf9591cecc7691780a2f3bd99d3ce25433d31537e7cab1b4c07d99199e9517132188150d38d633c2b3ef6ba6fb40504e800fca580beb7a19f2315adb451be690fc4f87ea5914d28d5562dc1dce115a852 +w0 = 0x00fca580beb7a19f2315adb451be690fc4f87ea5914d28d5562dc1dce115a852 +w1 = 0x3d31537e7cab1b4c07d99199e9517132188150d38d633c2b3ef6ba6fb40504e8 +w2 = 0x68f663a9b5e77caf15a517ab0931308bf9591cecc7691780a2f3bd99d3ce2543 +w3 = 0xc0defd3620ce89d96fa21d30ee63e458b0198adc842f68af8b462df6014955ab +w4 = 
0x8550346ada160daa272940dd6fc10dae4a0facef437ece40130301c1b847203c +w5 = 0x25c79bba9f87ef971da926f644c26d4273829fd69db71f9eded2cd1a33c36757 +w6 = 0x51d693440f0d00cdd56cee5a748ff3b48b1df7be05808ad20068ad387b8b5e4c +w7 = 0x6add9548af50f1bea3cb921205a5bb92ee325e01d160e3738a09aa0df7050e60 diff --git a/sw/otbn/crypto/tests/rsa_2048_dec_test.s b/sw/otbn/crypto/tests/rsa_2048_dec_test.s new file mode 100644 index 0000000000000..4dcd14b0cacbb --- /dev/null +++ b/sw/otbn/crypto/tests/rsa_2048_dec_test.s @@ -0,0 +1,272 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + + +.section .text.start + +/** + * Standalone RSA-2048 modexp with secret exponent (decryption/signing). + */ +main: + /* Init all-zero register. */ + bn.xor w31, w31, w31 + + /* Load number of limbs. */ + li x30, 8 + + /* Load pointers to modulus and Montgomery constant buffers. */ + la x16, modulus + la x17, m0inv + la x18, RR + + /* Compute Montgomery constants. */ + jal x1, modload + + /* Run exponentiation. 
+ dmem[result] = dmem[base]^dmem[exp] mod dmem[modulus] */ + la x14, base + la x15, exp + la x2, result + jal x1, modexp + + /* copy all limbs of result to wide reg file */ + la x21, result + li x8, 0 + loop x30, 2 + bn.lid x8, 0(x21++) + addi x8, x8, 1 + + ecall + + +.data + +/* Modulus n = + +0xb5ed720fe7e1b4a65494e8e9421df94910811d23854cb07b08a34508b682b188b16fa70e4804b4c4f54a54ae2a10848abc9253ac7c6085e5b9abcbcd48515db1626b01df4e7f5f1c85b9ce1b4c8d0f77f3854c8bc4f350ad4d993a6815d0d62ac83b47a257adb40023e1acf003d27953f19c5cbede1af58e42ef12ad9907c20ca428f8b7dbb6f3434936b1108d17ee343d7127f8885ff2513eb834c17bf1c4ddec0d61cc26f5f683c10c0e48676608811e9341f2898f690bc9fafd3b7e46d375e2178a141faf0d637767da550de4c5b9939af133ceba7cd2734df4ad269c166180afd8c35060de8ac302ca911aa3f92d139ed1595523a7f6c201cfafed4c17b5 + */ +.balign 32 +modulus: + .word 0xed4c17b5 + .word 0xc201cfaf + .word 0x5523a7f6 + .word 0x139ed159 + .word 0x1aa3f92d + .word 0xc302ca91 + .word 0x5060de8a + .word 0x80afd8c3 + .word 0x269c1661 + .word 0x734df4ad + .word 0xceba7cd2 + .word 0x939af133 + .word 0x0de4c5b9 + .word 0x7767da55 + .word 0x1faf0d63 + .word 0xe2178a14 + .word 0x7e46d375 + .word 0xc9fafd3b + .word 0x898f690b + .word 0x1e9341f2 + .word 0x67660881 + .word 0xc10c0e48 + .word 0x26f5f683 + .word 0xec0d61cc + .word 0x7bf1c4dd + .word 0x3eb834c1 + .word 0x885ff251 + .word 0x3d7127f8 + .word 0x8d17ee34 + .word 0x4936b110 + .word 0xdbb6f343 + .word 0xa428f8b7 + .word 0x9907c20c + .word 0x42ef12ad + .word 0xde1af58e + .word 0xf19c5cbe + .word 0x03d27953 + .word 0x23e1acf0 + .word 0x57adb400 + .word 0xc83b47a2 + .word 0x15d0d62a + .word 0x4d993a68 + .word 0xc4f350ad + .word 0xf3854c8b + .word 0x4c8d0f77 + .word 0x85b9ce1b + .word 0x4e7f5f1c + .word 0x626b01df + .word 0x48515db1 + .word 0xb9abcbcd + .word 0x7c6085e5 + .word 0xbc9253ac + .word 0x2a10848a + .word 0xf54a54ae + .word 0x4804b4c4 + .word 0xb16fa70e + .word 0xb682b188 + .word 0x08a34508 + .word 0x854cb07b + .word 0x10811d23 + .word 0x421df949 + 
.word 0x5494e8e9 + .word 0xe7e1b4a6 + .word 0xb5ed720f + +/* Base for exponentiation (corresponds to ciphertext for decryption or + message for signing). + + Raw hex value = +0x95fb986cd4aeee4b013effc1d183670380a9e2133ecc6a38dbbfff3f8ef20e1923a5e3741eac8772ee80f28994968fcabd6d454b7791263872bc68d97b6f4fbb76cee24f205d812ad36f2fcb6c11145943009a051c39c18c45b53ee19e51df0254b31eb991783718fb35c51dec249956bceb0276eaee88d8ecdeae2c08ac62a0018408af3923206e911a7ecf6ad786255fa69d63d333e6f44ebd3f5e6ebb7c82443c694d913e200492c89f046943f2dc7d8cf9951c6a33fa721558d1956fb552349ded082714be6a8bff775fd05162744d229fc9fac72509476bdc6434e5187bf3a1cc426cc13f0a10dcf0d15f28abcecfe5674782f232464b1a890d42b6fdd0 + */ +.balign 32 +base: + .word 0x42b6fdd0 + .word 0x4b1a890d + .word 0x82f23246 + .word 0xcfe56747 + .word 0x5f28abce + .word 0x10dcf0d1 + .word 0x6cc13f0a + .word 0xf3a1cc42 + .word 0x34e5187b + .word 0x476bdc64 + .word 0xfac72509 + .word 0x4d229fc9 + .word 0xd0516274 + .word 0x8bff775f + .word 0x2714be6a + .word 0x349ded08 + .word 0x956fb552 + .word 0x721558d1 + .word 0x1c6a33fa + .word 0x7d8cf995 + .word 0x6943f2dc + .word 0x92c89f04 + .word 0x913e2004 + .word 0x443c694d + .word 0x6ebb7c82 + .word 0x4ebd3f5e + .word 0xd333e6f4 + .word 0x5fa69d63 + .word 0x6ad78625 + .word 0x911a7ecf + .word 0x3923206e + .word 0x018408af + .word 0x08ac62a0 + .word 0xecdeae2c + .word 0xeaee88d8 + .word 0xbceb0276 + .word 0xec249956 + .word 0xfb35c51d + .word 0x91783718 + .word 0x54b31eb9 + .word 0x9e51df02 + .word 0x45b53ee1 + .word 0x1c39c18c + .word 0x43009a05 + .word 0x6c111459 + .word 0xd36f2fcb + .word 0x205d812a + .word 0x76cee24f + .word 0x7b6f4fbb + .word 0x72bc68d9 + .word 0x77912638 + .word 0xbd6d454b + .word 0x94968fca + .word 0xee80f289 + .word 0x1eac8772 + .word 0x23a5e374 + .word 0x8ef20e19 + .word 0xdbbfff3f + .word 0x3ecc6a38 + .word 0x80a9e213 + .word 0xd1836703 + .word 0x013effc1 + .word 0xd4aeee4b + .word 0x95fb986c + +/* Private exponent d = 
+0x51a84a52295a7da34ac3abe746edfd3e7651fdaa3be2b8340124878fe99bafe4130072934e700e537965ebac60e51918cc9b4143627050a95435703cac011974cd200aaf18a4c3242241cbe924eb0bce6357a98bf2d2e39b660128de1f2ca5747e7b5d23d906f68c398ec9f8d13e5f86f623a0dd6b03dec403f71b03207502fbb6c7d812f391e010cbed264655d11ab63c262a803196a128df72ecf1c65ed7f742371e4c4ee355f44cfae81ec0a256da9aa3eb1935fc509d366de08c7edb522411670cd7ee0053bb9395ac4cbe0af6f3cdd1c24e225ee47aa4f381764cfab389db993fed537f397fbff31362a85872993bc467dde42b66894f4cb3ce712b2ee1 + */ +.balign 32 +exp: + .word 0x712b2ee1 + .word 0x4f4cb3ce + .word 0xe42b6689 + .word 0x3bc467dd + .word 0xa8587299 + .word 0xbff31362 + .word 0x537f397f + .word 0xdb993fed + .word 0x4cfab389 + .word 0xa4f38176 + .word 0x225ee47a + .word 0xcdd1c24e + .word 0xbe0af6f3 + .word 0x9395ac4c + .word 0xee0053bb + .word 0x11670cd7 + .word 0x7edb5224 + .word 0x366de08c + .word 0x35fc509d + .word 0x9aa3eb19 + .word 0xc0a256da + .word 0x4cfae81e + .word 0x4ee355f4 + .word 0x42371e4c + .word 0xc65ed7f7 + .word 0xdf72ecf1 + .word 0x3196a128 + .word 0x3c262a80 + .word 0x55d11ab6 + .word 0xcbed2646 + .word 0xf391e010 + .word 0xb6c7d812 + .word 0x207502fb + .word 0x03f71b03 + .word 0x6b03dec4 + .word 0xf623a0dd + .word 0xd13e5f86 + .word 0x398ec9f8 + .word 0xd906f68c + .word 0x7e7b5d23 + .word 0x1f2ca574 + .word 0x660128de + .word 0xf2d2e39b + .word 0x6357a98b + .word 0x24eb0bce + .word 0x2241cbe9 + .word 0x18a4c324 + .word 0xcd200aaf + .word 0xac011974 + .word 0x5435703c + .word 0x627050a9 + .word 0xcc9b4143 + .word 0x60e51918 + .word 0x7965ebac + .word 0x4e700e53 + .word 0x13007293 + .word 0xe99bafe4 + .word 0x0124878f + .word 0x3be2b834 + .word 0x7651fdaa + .word 0x46edfd3e + .word 0x4ac3abe7 + .word 0x295a7da3 + .word 0x51a84a52 + +/* output buffer */ +.balign 32 +result: +.zero 256 + +/* buffer for Montgomery constant RR */ +.balign 32 +RR: +.zero 256 + +/* buffer for Montgomery constant m0inv */ +.balign 32 +m0inv: +.zero 32 diff --git 
a/sw/otbn/crypto/tests/rsa_2048_enc_test.exp b/sw/otbn/crypto/tests/rsa_2048_enc_test.exp new file mode 100644 index 0000000000000..a8f658f900318 --- /dev/null +++ b/sw/otbn/crypto/tests/rsa_2048_enc_test.exp @@ -0,0 +1,10 @@ +# Expected result (base ^ 65537) mod n = +# 0x95fb986cd4aeee4b013effc1d183670380a9e2133ecc6a38dbbfff3f8ef20e1923a5e3741eac8772ee80f28994968fcabd6d454b7791263872bc68d97b6f4fbb76cee24f205d812ad36f2fcb6c11145943009a051c39c18c45b53ee19e51df0254b31eb991783718fb35c51dec249956bceb0276eaee88d8ecdeae2c08ac62a0018408af3923206e911a7ecf6ad786255fa69d63d333e6f44ebd3f5e6ebb7c82443c694d913e200492c89f046943f2dc7d8cf9951c6a33fa721558d1956fb552349ded082714be6a8bff775fd05162744d229fc9fac72509476bdc6434e5187bf3a1cc426cc13f0a10dcf0d15f28abcecfe5674782f232464b1a890d42b6fdd0 +w0 = 0xf3a1cc426cc13f0a10dcf0d15f28abcecfe5674782f232464b1a890d42b6fdd0 +w1 = 0x349ded082714be6a8bff775fd05162744d229fc9fac72509476bdc6434e5187b +w2 = 0x443c694d913e200492c89f046943f2dc7d8cf9951c6a33fa721558d1956fb552 +w3 = 0x018408af3923206e911a7ecf6ad786255fa69d63d333e6f44ebd3f5e6ebb7c82 +w4 = 0x54b31eb991783718fb35c51dec249956bceb0276eaee88d8ecdeae2c08ac62a0 +w5 = 0x76cee24f205d812ad36f2fcb6c11145943009a051c39c18c45b53ee19e51df02 +w6 = 0x23a5e3741eac8772ee80f28994968fcabd6d454b7791263872bc68d97b6f4fbb +w7 = 0x95fb986cd4aeee4b013effc1d183670380a9e2133ecc6a38dbbfff3f8ef20e19 diff --git a/sw/otbn/crypto/tests/rsa_2048_enc_test.s b/sw/otbn/crypto/tests/rsa_2048_enc_test.s new file mode 100644 index 0000000000000..6721228f4a518 --- /dev/null +++ b/sw/otbn/crypto/tests/rsa_2048_enc_test.s @@ -0,0 +1,201 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + + +.section .text.start + +/** + * Standalone RSA-2048 modexp with e=65537 (encryption/verification). + */ +main: + /* Init all-zero register. */ + bn.xor w31, w31, w31 + + /* Load number of limbs. 
*/ + li x30, 8 + + /* Load pointers to modulus and Montgomery constant buffers. */ + la x16, modulus + la x17, m0inv + la x18, RR + + /* Compute Montgomery constants. */ + jal x1, modload + + /* Run exponentiation. + dmem[result] = dmem[base]^65537 mod dmem[modulus] */ + la x14, base + la x2, result + jal x1, modexp_65537 + + /* copy all limbs of result to wide reg file */ + la x21, result + li x8, 0 + loop x30, 2 + bn.lid x8, 0(x21++) + addi x8, x8, 1 + + ecall + +.data + +/* Modulus n = + +0xb5ed720fe7e1b4a65494e8e9421df94910811d23854cb07b08a34508b682b188b16fa70e4804b4c4f54a54ae2a10848abc9253ac7c6085e5b9abcbcd48515db1626b01df4e7f5f1c85b9ce1b4c8d0f77f3854c8bc4f350ad4d993a6815d0d62ac83b47a257adb40023e1acf003d27953f19c5cbede1af58e42ef12ad9907c20ca428f8b7dbb6f3434936b1108d17ee343d7127f8885ff2513eb834c17bf1c4ddec0d61cc26f5f683c10c0e48676608811e9341f2898f690bc9fafd3b7e46d375e2178a141faf0d637767da550de4c5b9939af133ceba7cd2734df4ad269c166180afd8c35060de8ac302ca911aa3f92d139ed1595523a7f6c201cfafed4c17b5 + */ +.balign 32 +modulus: + .word 0xed4c17b5 + .word 0xc201cfaf + .word 0x5523a7f6 + .word 0x139ed159 + .word 0x1aa3f92d + .word 0xc302ca91 + .word 0x5060de8a + .word 0x80afd8c3 + .word 0x269c1661 + .word 0x734df4ad + .word 0xceba7cd2 + .word 0x939af133 + .word 0x0de4c5b9 + .word 0x7767da55 + .word 0x1faf0d63 + .word 0xe2178a14 + .word 0x7e46d375 + .word 0xc9fafd3b + .word 0x898f690b + .word 0x1e9341f2 + .word 0x67660881 + .word 0xc10c0e48 + .word 0x26f5f683 + .word 0xec0d61cc + .word 0x7bf1c4dd + .word 0x3eb834c1 + .word 0x885ff251 + .word 0x3d7127f8 + .word 0x8d17ee34 + .word 0x4936b110 + .word 0xdbb6f343 + .word 0xa428f8b7 + .word 0x9907c20c + .word 0x42ef12ad + .word 0xde1af58e + .word 0xf19c5cbe + .word 0x03d27953 + .word 0x23e1acf0 + .word 0x57adb400 + .word 0xc83b47a2 + .word 0x15d0d62a + .word 0x4d993a68 + .word 0xc4f350ad + .word 0xf3854c8b + .word 0x4c8d0f77 + .word 0x85b9ce1b + .word 0x4e7f5f1c + .word 0x626b01df + .word 0x48515db1 + .word 0xb9abcbcd + 
.word 0x7c6085e5 + .word 0xbc9253ac + .word 0x2a10848a + .word 0xf54a54ae + .word 0x4804b4c4 + .word 0xb16fa70e + .word 0xb682b188 + .word 0x08a34508 + .word 0x854cb07b + .word 0x10811d23 + .word 0x421df949 + .word 0x5494e8e9 + .word 0xe7e1b4a6 + .word 0xb5ed720f + + +/* Base for exponentiation (corresponds to plaintext for encryption or + signature for verification). + + Raw hex value (randomly generated) = +0x6add9548af50f1bea3cb921205a5bb92ee325e01d160e3738a09aa0df7050e6051d693440f0d00cdd56cee5a748ff3b48b1df7be05808ad20068ad387b8b5e4c25c79bba9f87ef971da926f644c26d4273829fd69db71f9eded2cd1a33c367578550346ada160daa272940dd6fc10dae4a0facef437ece40130301c1b847203cc0defd3620ce89d96fa21d30ee63e458b0198adc842f68af8b462df6014955ab68f663a9b5e77caf15a517ab0931308bf9591cecc7691780a2f3bd99d3ce25433d31537e7cab1b4c07d99199e9517132188150d38d633c2b3ef6ba6fb40504e800fca580beb7a19f2315adb451be690fc4f87ea5914d28d5562dc1dce115a852 + */ +.balign 32 +base: + .word 0xe115a852 + .word 0x562dc1dc + .word 0x914d28d5 + .word 0xc4f87ea5 + .word 0x51be690f + .word 0x2315adb4 + .word 0xbeb7a19f + .word 0x00fca580 + .word 0xb40504e8 + .word 0x3ef6ba6f + .word 0x8d633c2b + .word 0x188150d3 + .word 0xe9517132 + .word 0x07d99199 + .word 0x7cab1b4c + .word 0x3d31537e + .word 0xd3ce2543 + .word 0xa2f3bd99 + .word 0xc7691780 + .word 0xf9591cec + .word 0x0931308b + .word 0x15a517ab + .word 0xb5e77caf + .word 0x68f663a9 + .word 0x014955ab + .word 0x8b462df6 + .word 0x842f68af + .word 0xb0198adc + .word 0xee63e458 + .word 0x6fa21d30 + .word 0x20ce89d9 + .word 0xc0defd36 + .word 0xb847203c + .word 0x130301c1 + .word 0x437ece40 + .word 0x4a0facef + .word 0x6fc10dae + .word 0x272940dd + .word 0xda160daa + .word 0x8550346a + .word 0x33c36757 + .word 0xded2cd1a + .word 0x9db71f9e + .word 0x73829fd6 + .word 0x44c26d42 + .word 0x1da926f6 + .word 0x9f87ef97 + .word 0x25c79bba + .word 0x7b8b5e4c + .word 0x0068ad38 + .word 0x05808ad2 + .word 0x8b1df7be + .word 0x748ff3b4 + .word 0xd56cee5a + .word 0x0f0d00cd + 
.word 0x51d69344 + .word 0xf7050e60 + .word 0x8a09aa0d + .word 0xd160e373 + .word 0xee325e01 + .word 0x05a5bb92 + .word 0xa3cb9212 + .word 0xaf50f1be + .word 0x6add9548 + +/* output buffer */ +.balign 32 +result: +.zero 256 + +/* buffer for Montgomery constant RR */ +.balign 32 +RR: +.zero 256 + +/* buffer for Montgomery constant m0inv */ +.balign 32 +m0inv: +.zero 32 diff --git a/sw/otbn/crypto/tests/rsa_3072_dec_test.exp b/sw/otbn/crypto/tests/rsa_3072_dec_test.exp new file mode 100644 index 0000000000000..55a478ea26c50 --- /dev/null +++ b/sw/otbn/crypto/tests/rsa_3072_dec_test.exp @@ -0,0 +1,14 @@ +# Expected value: +# 0x77d133acf99844910deadefd84b95fc010959a01e040c559c691ac8ff0410b369453478a7ca56f74e3f6a1ea1fef9ef490d8a9c0bd385c49e7b3934e93a52e44d49a7737b8153b295d9baf4ef032d00c61609458ddeeaf73a243670ce7fb188e20fb15b6c01c08c825d5f67547c679a1693dd04360813be3cd28c6e5a0d1dca66b410977470710a1f0b3463659be0e6d5946a4adccfae5e555a9360f44dec7b2a311ea186a6bc574fe00b89dc254481c78db835a1971ae2b22ce2caa06dee69a6b25fbef290e351a3aafc3850265ed51dc3237ea918727f9419aa4c335ba80f69a5205d277ff71b47b939780366179f7471ba6b451c21c2d4c288daa2ffc9fc4349e498c2d869021dc9214406c51ee9735d0341225efbb549f3e7b2939e90d211ebeaf5a2711926d53a32c790616502d02c483f3b357d23b958d554e478246175a12b90c2970c8ed47e9d376923812f8913cda3a6d88bd93f576cb143072c473156ae1e3925977b3b76bc804f2a5feeec49499c54463b55921e4c0d24e0bb41d +w0 = 0x156ae1e3925977b3b76bc804f2a5feeec49499c54463b55921e4c0d24e0bb41d +w1 = 0x5a12b90c2970c8ed47e9d376923812f8913cda3a6d88bd93f576cb143072c473 +w2 = 0x1ebeaf5a2711926d53a32c790616502d02c483f3b357d23b958d554e47824617 +w3 = 0x349e498c2d869021dc9214406c51ee9735d0341225efbb549f3e7b2939e90d21 +w4 = 0x9a5205d277ff71b47b939780366179f7471ba6b451c21c2d4c288daa2ffc9fc4 +w5 = 0x6b25fbef290e351a3aafc3850265ed51dc3237ea918727f9419aa4c335ba80f6 +w6 = 0xa311ea186a6bc574fe00b89dc254481c78db835a1971ae2b22ce2caa06dee69a +w7 = 0x6b410977470710a1f0b3463659be0e6d5946a4adccfae5e555a9360f44dec7b2 +w8 = 
0x20fb15b6c01c08c825d5f67547c679a1693dd04360813be3cd28c6e5a0d1dca6 +w9 = 0xd49a7737b8153b295d9baf4ef032d00c61609458ddeeaf73a243670ce7fb188e +w10 = 0x9453478a7ca56f74e3f6a1ea1fef9ef490d8a9c0bd385c49e7b3934e93a52e44 +w11 = 0x77d133acf99844910deadefd84b95fc010959a01e040c559c691ac8ff0410b36 diff --git a/sw/otbn/crypto/tests/rsa_3072_dec_test.s b/sw/otbn/crypto/tests/rsa_3072_dec_test.s new file mode 100644 index 0000000000000..4e6a599729ca6 --- /dev/null +++ b/sw/otbn/crypto/tests/rsa_3072_dec_test.s @@ -0,0 +1,368 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + + +.section .text.start + +/** + * Standalone RSA-3072 modexp with secret exponent (decryption/signing). + */ +main: + /* Init all-zero register. */ + bn.xor w31, w31, w31 + + /* Load number of limbs. */ + li x30, 12 + + /* Load pointers to modulus and Montgomery constant buffers. */ + la x16, modulus + la x17, m0inv + la x18, RR + + /* Compute Montgomery constants. */ + jal x1, modload + + /* Run exponentiation. 
+ dmem[result] = dmem[base]^dmem[exp] mod dmem[modulus] */ + la x14, base + la x15, exp + la x2, result + jal x1, modexp + + /* copy all limbs of result to wide reg file */ + la x21, result + li x8, 0 + loop x30, 2 + bn.lid x8, 0(x21++) + addi x8, x8, 1 + + ecall + + +.data + +/* Modulus n = +0xb2e73fd1e1dce003def2f2795a1400f2514256a70fe83d64ae8464f114839c94d975c89f97b3598b48de7a560b867b4967ae92d3552f0b204c000b0841f5fac3ef0ba000acfb517a995cf708e46c670a885626d7865ebc5bccc509bc562a4ffc956eb3b859e43bc83debe4888e3e6a55de852c027a874b9c803598a78f4196800db785d91730e8708b8cef986c6d326c9a003201737cb3f5e42cd601c47d74898105671d446b9a5c8a835286f419682fc4b69e79a8d2f9f6aabca5b0c311dabe6fb19d3e03045a729b3107f21370935c6de2316876afae55aeb4da07b8a04aafc1f7717f8d571f47c1a0f395e4ce78ed581db853bda1cb6f224fc4b2c6244611d416b2e729c54ef638d7bd94483b11b56b7b613c06b2564c08de82ef33dff23892e183fd6e96713bfaf76b792c4e8071f3dd5ad695e3748179bebb97140efaabce02687b401b93a513b80b5ec334d38c0b331f90d1454c9b8f3b87017b5174f1d2b75c27fff6e89a3ae099fb0455b5cc9d3bd4840baf510e4d80dbbac4049efb + */ +.balign 32 +modulus: + .word 0xc4049efb + .word 0x4d80dbba + .word 0x0baf510e + .word 0x9d3bd484 + .word 0x0455b5cc + .word 0x3ae099fb + .word 0xfff6e89a + .word 0xd2b75c27 + .word 0x7b5174f1 + .word 0x8f3b8701 + .word 0xd1454c9b + .word 0x0b331f90 + .word 0xc334d38c + .word 0x13b80b5e + .word 0x401b93a5 + .word 0xce02687b + .word 0x140efaab + .word 0x79bebb97 + .word 0x95e37481 + .word 0xf3dd5ad6 + .word 0x2c4e8071 + .word 0xfaf76b79 + .word 0x6e96713b + .word 0x92e183fd + .word 0x33dff238 + .word 0x08de82ef + .word 0x06b2564c + .word 0x6b7b613c + .word 0x483b11b5 + .word 0x38d7bd94 + .word 0x29c54ef6 + .word 0xd416b2e7 + .word 0xc6244611 + .word 0x224fc4b2 + .word 0xbda1cb6f + .word 0x581db853 + .word 0xe4ce78ed + .word 0xc1a0f395 + .word 0x8d571f47 + .word 0xc1f7717f + .word 0xb8a04aaf + .word 0xaeb4da07 + .word 0x76afae55 + .word 0x6de23168 + .word 0x1370935c + .word 0x9b3107f2 + .word 0x03045a72 + .word 
0x6fb19d3e + .word 0xc311dabe + .word 0xaabca5b0 + .word 0xa8d2f9f6 + .word 0xc4b69e79 + .word 0xf419682f + .word 0x8a835286 + .word 0x446b9a5c + .word 0x8105671d + .word 0xc47d7489 + .word 0xe42cd601 + .word 0x737cb3f5 + .word 0x9a003201 + .word 0x6c6d326c + .word 0x8b8cef98 + .word 0x1730e870 + .word 0x0db785d9 + .word 0x8f419680 + .word 0x803598a7 + .word 0x7a874b9c + .word 0xde852c02 + .word 0x8e3e6a55 + .word 0x3debe488 + .word 0x59e43bc8 + .word 0x956eb3b8 + .word 0x562a4ffc + .word 0xccc509bc + .word 0x865ebc5b + .word 0x885626d7 + .word 0xe46c670a + .word 0x995cf708 + .word 0xacfb517a + .word 0xef0ba000 + .word 0x41f5fac3 + .word 0x4c000b08 + .word 0x552f0b20 + .word 0x67ae92d3 + .word 0x0b867b49 + .word 0x48de7a56 + .word 0x97b3598b + .word 0xd975c89f + .word 0x14839c94 + .word 0xae8464f1 + .word 0x0fe83d64 + .word 0x514256a7 + .word 0x5a1400f2 + .word 0xdef2f279 + .word 0xe1dce003 + .word 0xb2e73fd1 + +/* Base for exponentiation (corresponds to ciphertext for decryption or + message for signing). 
+ + Raw hex value = +0x1273e84d4509b08a748a9bf81808f0a2387616159d3b04b32ab172285440f09f69f53e37a7cb6a6fc3fb0626a4dad7b64417570b4e112843bd3c82030fb4a01ba9ba2c194e98d51372b9a63153d7279c62452592d597c85dc493d167735918d89c4aa86d20073a0d6ce2a3bf7dc603d73efb38c5ff6fb191db117f8cf4cb0b46e995bfa0e4cee3a055fc44e496989f7248e95c6e3f4879d2c38118f51a0910d37030ddf0a39a0e6e0e3e4be9b0d12e8d25f337657e7f752fa63defebd91b2c107a00475fde78b38da4ca12c17160a8d68f6eecb60481e6ae3f3a0dc03ebb82f327dfe85f850d05215af5f7c07cd5932aaec3d17339e75b2ec803d231188942231edd8c43a6bd9b7be638da9911604a3308117ad41cf697273550765bac72a499cf21d4c4406668f569e4b002d75de22d3e5e7d5c7b33131389cffad951b362865106352780474b3c79378169c6998388e5d2782557c228f0fcf1e132e0fc2d14fe03cca340568cfb5c07a00b052304fa0923f1dfcf627a58e0ea093a97af836 + */ +.balign 32 +base: + .word 0xa97af836 + .word 0x8e0ea093 + .word 0xfcf627a5 + .word 0xa0923f1d + .word 0xb052304f + .word 0xb5c07a00 + .word 0x340568cf + .word 0x4fe03cca + .word 0x2e0fc2d1 + .word 0x0fcf1e13 + .word 0x557c228f + .word 0x8e5d2782 + .word 0x9c699838 + .word 0xc7937816 + .word 0x780474b3 + .word 0x65106352 + .word 0x951b3628 + .word 0x389cffad + .word 0xc7b33131 + .word 0xd3e5e7d5 + .word 0x2d75de22 + .word 0x569e4b00 + .word 0x4406668f + .word 0x9cf21d4c + .word 0xbac72a49 + .word 0x73550765 + .word 0x41cf6972 + .word 0x308117ad + .word 0x911604a3 + .word 0xbe638da9 + .word 0x3a6bd9b7 + .word 0x31edd8c4 + .word 0x11889422 + .word 0xec803d23 + .word 0x339e75b2 + .word 0xaaec3d17 + .word 0x07cd5932 + .word 0x15af5f7c + .word 0xf850d052 + .word 0x327dfe85 + .word 0x03ebb82f + .word 0xe3f3a0dc + .word 0x60481e6a + .word 0x68f6eecb + .word 0x17160a8d + .word 0xda4ca12c + .word 0xfde78b38 + .word 0x07a00475 + .word 0xbd91b2c1 + .word 0xfa63defe + .word 0x57e7f752 + .word 0xd25f3376 + .word 0x9b0d12e8 + .word 0xe0e3e4be + .word 0x0a39a0e6 + .word 0x37030ddf + .word 0x51a0910d + .word 0x2c38118f + .word 0xe3f4879d + .word 0x248e95c6 + .word 0x496989f7 + .word 0x055fc44e + 
.word 0x0e4cee3a + .word 0x6e995bfa + .word 0xcf4cb0b4 + .word 0x1db117f8 + .word 0x5ff6fb19 + .word 0x73efb38c + .word 0xf7dc603d + .word 0xd6ce2a3b + .word 0xd20073a0 + .word 0x89c4aa86 + .word 0x7735918d + .word 0xdc493d16 + .word 0x2d597c85 + .word 0xc6245259 + .word 0x153d7279 + .word 0x372b9a63 + .word 0x94e98d51 + .word 0xba9ba2c1 + .word 0x30fb4a01 + .word 0x3bd3c820 + .word 0xb4e11284 + .word 0x64417570 + .word 0x6a4dad7b + .word 0xfc3fb062 + .word 0x7a7cb6a6 + .word 0xf69f53e3 + .word 0x85440f09 + .word 0x32ab1722 + .word 0x59d3b04b + .word 0x23876161 + .word 0x81808f0a + .word 0xa748a9bf + .word 0xd4509b08 + .word 0x01273e84 + +/* Private exponent d = +0x1bf6782bb27d670843db3e5a0861d30a0cf86cf9dccb24796daba4e96796f0acf5566b1ec2c3d62da69c9b8b826ea92b7e88b34b53e7affa02d708e26808ee029d04f8a3d265cfc4f55eaa001a4ff54518ad3a91fa5f295ac1e55451bb380edb8071d6a66c6a778ba35e1110e506cd711180483234fb9bae60fdbf980514afd4e10ffbdc443b314192165bc6bbbfcf9f58ecc9e41f2c7126705d2fb00409c5e2ce274d882e0f1188006069504dac00f4626f56d2d637efb905d3c9a418c15c2a9f1b2f1d3fca1461d2b483d3ce354e56f24ebbea9197c2359af199d89cdaf737668626719923e8718ee4f5085ecb1b09aed5f539795ef462f173451e18d04939b2b090fdc6e75bd438be26cc7b0b8244810176d366e6f1b38144510d956f5ed8f5f3f51e50092b54945cf6ecc0a6f317cc44e487dd38f8b3e0f42841ff538d87b75d592fdca3ee5f1eedc81f0d9b2652b5058a3e50b9ab7d266eb0c681f6f829daec744b0cbf7d22d099e96cd3d1e29cb675ecaef5a7d99d35b84ca4d35c6b8d + */ +.balign 32 +exp: + .word 0xd35c6b8d + .word 0x35b84ca4 + .word 0xf5a7d99d + .word 0xb675ecae + .word 0xd3d1e29c + .word 0xd099e96c + .word 0x0cbf7d22 + .word 0xdaec744b + .word 0x81f6f829 + .word 0x266eb0c6 + .word 0x50b9ab7d + .word 0xb5058a3e + .word 0x0d9b2652 + .word 0x1eedc81f + .word 0xdca3ee5f + .word 0xb75d592f + .word 0xff538d87 + .word 0xe0f42841 + .word 0xdd38f8b3 + .word 0xcc44e487 + .word 0xc0a6f317 + .word 0x945cf6ec + .word 0x50092b54 + .word 0xf5f3f51e + .word 0x956f5ed8 + .word 0x8144510d + .word 0x66e6f1b3 + .word 0x810176d3 + 
.word 0x7b0b8244 + .word 0x38be26cc + .word 0xc6e75bd4 + .word 0xb2b090fd + .word 0x18d04939 + .word 0xf173451e + .word 0x795ef462 + .word 0xaed5f539 + .word 0x5ecb1b09 + .word 0x8ee4f508 + .word 0x9923e871 + .word 0x66862671 + .word 0x9cdaf737 + .word 0x9af199d8 + .word 0x9197c235 + .word 0xf24ebbea + .word 0xce354e56 + .word 0xd2b483d3 + .word 0x3fca1461 + .word 0x9f1b2f1d + .word 0x18c15c2a + .word 0x05d3c9a4 + .word 0xd637efb9 + .word 0x626f56d2 + .word 0x4dac00f4 + .word 0x00606950 + .word 0x2e0f1188 + .word 0xce274d88 + .word 0x0409c5e2 + .word 0x705d2fb0 + .word 0x1f2c7126 + .word 0x58ecc9e4 + .word 0xbbbfcf9f + .word 0x92165bc6 + .word 0x443b3141 + .word 0xe10ffbdc + .word 0x0514afd4 + .word 0x60fdbf98 + .word 0x34fb9bae + .word 0x11804832 + .word 0xe506cd71 + .word 0xa35e1110 + .word 0x6c6a778b + .word 0x8071d6a6 + .word 0xbb380edb + .word 0xc1e55451 + .word 0xfa5f295a + .word 0x18ad3a91 + .word 0x1a4ff545 + .word 0xf55eaa00 + .word 0xd265cfc4 + .word 0x9d04f8a3 + .word 0x6808ee02 + .word 0x02d708e2 + .word 0x53e7affa + .word 0x7e88b34b + .word 0x826ea92b + .word 0xa69c9b8b + .word 0xc2c3d62d + .word 0xf5566b1e + .word 0x6796f0ac + .word 0x6daba4e9 + .word 0xdccb2479 + .word 0x0cf86cf9 + .word 0x0861d30a + .word 0x43db3e5a + .word 0xb27d6708 + .word 0x1bf6782b + + +/* output buffer */ +.balign 32 +result: +.zero 384 + +/* buffer for Montgomery constant RR */ +.balign 32 +RR: +.zero 384 + +/* buffer for Montgomery constant m0inv */ +.balign 32 +m0inv: +.zero 32 diff --git a/sw/otbn/crypto/tests/rsa_3072_enc_test.exp b/sw/otbn/crypto/tests/rsa_3072_enc_test.exp new file mode 100644 index 0000000000000..6ef8409ea67ed --- /dev/null +++ b/sw/otbn/crypto/tests/rsa_3072_enc_test.exp @@ -0,0 +1,14 @@ +# Expected value: +# 
0x1273e84d4509b08a748a9bf81808f0a2387616159d3b04b32ab172285440f09f69f53e37a7cb6a6fc3fb0626a4dad7b64417570b4e112843bd3c82030fb4a01ba9ba2c194e98d51372b9a63153d7279c62452592d597c85dc493d167735918d89c4aa86d20073a0d6ce2a3bf7dc603d73efb38c5ff6fb191db117f8cf4cb0b46e995bfa0e4cee3a055fc44e496989f7248e95c6e3f4879d2c38118f51a0910d37030ddf0a39a0e6e0e3e4be9b0d12e8d25f337657e7f752fa63defebd91b2c107a00475fde78b38da4ca12c17160a8d68f6eecb60481e6ae3f3a0dc03ebb82f327dfe85f850d05215af5f7c07cd5932aaec3d17339e75b2ec803d231188942231edd8c43a6bd9b7be638da9911604a3308117ad41cf697273550765bac72a499cf21d4c4406668f569e4b002d75de22d3e5e7d5c7b33131389cffad951b362865106352780474b3c79378169c6998388e5d2782557c228f0fcf1e132e0fc2d14fe03cca340568cfb5c07a00b052304fa0923f1dfcf627a58e0ea093a97af836 +w0 = 0x4fe03cca340568cfb5c07a00b052304fa0923f1dfcf627a58e0ea093a97af836 +w1 = 0x65106352780474b3c79378169c6998388e5d2782557c228f0fcf1e132e0fc2d1 +w2 = 0x9cf21d4c4406668f569e4b002d75de22d3e5e7d5c7b33131389cffad951b3628 +w3 = 0x31edd8c43a6bd9b7be638da9911604a3308117ad41cf697273550765bac72a49 +w4 = 0x327dfe85f850d05215af5f7c07cd5932aaec3d17339e75b2ec803d2311889422 +w5 = 0x07a00475fde78b38da4ca12c17160a8d68f6eecb60481e6ae3f3a0dc03ebb82f +w6 = 0x37030ddf0a39a0e6e0e3e4be9b0d12e8d25f337657e7f752fa63defebd91b2c1 +w7 = 0x6e995bfa0e4cee3a055fc44e496989f7248e95c6e3f4879d2c38118f51a0910d +w8 = 0x89c4aa86d20073a0d6ce2a3bf7dc603d73efb38c5ff6fb191db117f8cf4cb0b4 +w9 = 0xba9ba2c194e98d51372b9a63153d7279c62452592d597c85dc493d167735918d +w10 = 0xf69f53e37a7cb6a6fc3fb0626a4dad7b64417570b4e112843bd3c82030fb4a01 +w11 = 0x01273e84d4509b08a748a9bf81808f0a2387616159d3b04b32ab172285440f09 diff --git a/sw/otbn/crypto/tests/rsa_3072_enc_test.s b/sw/otbn/crypto/tests/rsa_3072_enc_test.s new file mode 100644 index 0000000000000..374fff917c52c --- /dev/null +++ b/sw/otbn/crypto/tests/rsa_3072_enc_test.s @@ -0,0 +1,264 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. 
*/ +/* SPDX-License-Identifier: Apache-2.0 */ + + +.section .text.start + +/** + * Standalone RSA-3072 modexp with e=65537 (encryption/verification). + */ +main: + /* Init all-zero register. */ + bn.xor w31, w31, w31 + + /* Load number of limbs. */ + li x30, 12 + + /* Load pointers to modulus and Montgomery constant buffers. */ + la x16, modulus + la x17, m0inv + la x18, RR + + /* Compute Montgomery constants. */ + jal x1, modload + + /* Run exponentiation. + dmem[result] = dmem[base]^65537 mod dmem[modulus] */ + la x14, base + la x2, result + jal x1, modexp_65537 + + /* copy all limbs of result to wide reg file */ + la x21, result + li x8, 0 + loop x30, 2 + bn.lid x8, 0(x21++) + addi x8, x8, 1 + + ecall + +.data + +/* Modulus n = +0xb2e73fd1e1dce003def2f2795a1400f2514256a70fe83d64ae8464f114839c94d975c89f97b3598b48de7a560b867b4967ae92d3552f0b204c000b0841f5fac3ef0ba000acfb517a995cf708e46c670a885626d7865ebc5bccc509bc562a4ffc956eb3b859e43bc83debe4888e3e6a55de852c027a874b9c803598a78f4196800db785d91730e8708b8cef986c6d326c9a003201737cb3f5e42cd601c47d74898105671d446b9a5c8a835286f419682fc4b69e79a8d2f9f6aabca5b0c311dabe6fb19d3e03045a729b3107f21370935c6de2316876afae55aeb4da07b8a04aafc1f7717f8d571f47c1a0f395e4ce78ed581db853bda1cb6f224fc4b2c6244611d416b2e729c54ef638d7bd94483b11b56b7b613c06b2564c08de82ef33dff23892e183fd6e96713bfaf76b792c4e8071f3dd5ad695e3748179bebb97140efaabce02687b401b93a513b80b5ec334d38c0b331f90d1454c9b8f3b87017b5174f1d2b75c27fff6e89a3ae099fb0455b5cc9d3bd4840baf510e4d80dbbac4049efb + */ +.balign 32 +modulus: + .word 0xc4049efb + .word 0x4d80dbba + .word 0x0baf510e + .word 0x9d3bd484 + .word 0x0455b5cc + .word 0x3ae099fb + .word 0xfff6e89a + .word 0xd2b75c27 + .word 0x7b5174f1 + .word 0x8f3b8701 + .word 0xd1454c9b + .word 0x0b331f90 + .word 0xc334d38c + .word 0x13b80b5e + .word 0x401b93a5 + .word 0xce02687b + .word 0x140efaab + .word 0x79bebb97 + .word 0x95e37481 + .word 0xf3dd5ad6 + .word 0x2c4e8071 + .word 0xfaf76b79 + .word 0x6e96713b + .word 0x92e183fd 
+ .word 0x33dff238 + .word 0x08de82ef + .word 0x06b2564c + .word 0x6b7b613c + .word 0x483b11b5 + .word 0x38d7bd94 + .word 0x29c54ef6 + .word 0xd416b2e7 + .word 0xc6244611 + .word 0x224fc4b2 + .word 0xbda1cb6f + .word 0x581db853 + .word 0xe4ce78ed + .word 0xc1a0f395 + .word 0x8d571f47 + .word 0xc1f7717f + .word 0xb8a04aaf + .word 0xaeb4da07 + .word 0x76afae55 + .word 0x6de23168 + .word 0x1370935c + .word 0x9b3107f2 + .word 0x03045a72 + .word 0x6fb19d3e + .word 0xc311dabe + .word 0xaabca5b0 + .word 0xa8d2f9f6 + .word 0xc4b69e79 + .word 0xf419682f + .word 0x8a835286 + .word 0x446b9a5c + .word 0x8105671d + .word 0xc47d7489 + .word 0xe42cd601 + .word 0x737cb3f5 + .word 0x9a003201 + .word 0x6c6d326c + .word 0x8b8cef98 + .word 0x1730e870 + .word 0x0db785d9 + .word 0x8f419680 + .word 0x803598a7 + .word 0x7a874b9c + .word 0xde852c02 + .word 0x8e3e6a55 + .word 0x3debe488 + .word 0x59e43bc8 + .word 0x956eb3b8 + .word 0x562a4ffc + .word 0xccc509bc + .word 0x865ebc5b + .word 0x885626d7 + .word 0xe46c670a + .word 0x995cf708 + .word 0xacfb517a + .word 0xef0ba000 + .word 0x41f5fac3 + .word 0x4c000b08 + .word 0x552f0b20 + .word 0x67ae92d3 + .word 0x0b867b49 + .word 0x48de7a56 + .word 0x97b3598b + .word 0xd975c89f + .word 0x14839c94 + .word 0xae8464f1 + .word 0x0fe83d64 + .word 0x514256a7 + .word 0x5a1400f2 + .word 0xdef2f279 + .word 0xe1dce003 + .word 0xb2e73fd1 + + +/* Base for exponentiation (corresponds to plaintext for encryption or + signature for verification). 
+ + Raw hex value (randomly generated) = +0x77d133acf99844910deadefd84b95fc010959a01e040c559c691ac8ff0410b369453478a7ca56f74e3f6a1ea1fef9ef490d8a9c0bd385c49e7b3934e93a52e44d49a7737b8153b295d9baf4ef032d00c61609458ddeeaf73a243670ce7fb188e20fb15b6c01c08c825d5f67547c679a1693dd04360813be3cd28c6e5a0d1dca66b410977470710a1f0b3463659be0e6d5946a4adccfae5e555a9360f44dec7b2a311ea186a6bc574fe00b89dc254481c78db835a1971ae2b22ce2caa06dee69a6b25fbef290e351a3aafc3850265ed51dc3237ea918727f9419aa4c335ba80f69a5205d277ff71b47b939780366179f7471ba6b451c21c2d4c288daa2ffc9fc4349e498c2d869021dc9214406c51ee9735d0341225efbb549f3e7b2939e90d211ebeaf5a2711926d53a32c790616502d02c483f3b357d23b958d554e478246175a12b90c2970c8ed47e9d376923812f8913cda3a6d88bd93f576cb143072c473156ae1e3925977b3b76bc804f2a5feeec49499c54463b55921e4c0d24e0bb41d + */ +.balign 32 +base: + .word 0x4e0bb41d + .word 0x21e4c0d2 + .word 0x4463b559 + .word 0xc49499c5 + .word 0xf2a5feee + .word 0xb76bc804 + .word 0x925977b3 + .word 0x156ae1e3 + .word 0x3072c473 + .word 0xf576cb14 + .word 0x6d88bd93 + .word 0x913cda3a + .word 0x923812f8 + .word 0x47e9d376 + .word 0x2970c8ed + .word 0x5a12b90c + .word 0x47824617 + .word 0x958d554e + .word 0xb357d23b + .word 0x02c483f3 + .word 0x0616502d + .word 0x53a32c79 + .word 0x2711926d + .word 0x1ebeaf5a + .word 0x39e90d21 + .word 0x9f3e7b29 + .word 0x25efbb54 + .word 0x35d03412 + .word 0x6c51ee97 + .word 0xdc921440 + .word 0x2d869021 + .word 0x349e498c + .word 0x2ffc9fc4 + .word 0x4c288daa + .word 0x51c21c2d + .word 0x471ba6b4 + .word 0x366179f7 + .word 0x7b939780 + .word 0x77ff71b4 + .word 0x9a5205d2 + .word 0x35ba80f6 + .word 0x419aa4c3 + .word 0x918727f9 + .word 0xdc3237ea + .word 0x0265ed51 + .word 0x3aafc385 + .word 0x290e351a + .word 0x6b25fbef + .word 0x06dee69a + .word 0x22ce2caa + .word 0x1971ae2b + .word 0x78db835a + .word 0xc254481c + .word 0xfe00b89d + .word 0x6a6bc574 + .word 0xa311ea18 + .word 0x44dec7b2 + .word 0x55a9360f + .word 0xccfae5e5 + .word 0x5946a4ad + .word 0x59be0e6d + 
.word 0xf0b34636 + .word 0x470710a1 + .word 0x6b410977 + .word 0xa0d1dca6 + .word 0xcd28c6e5 + .word 0x60813be3 + .word 0x693dd043 + .word 0x47c679a1 + .word 0x25d5f675 + .word 0xc01c08c8 + .word 0x20fb15b6 + .word 0xe7fb188e + .word 0xa243670c + .word 0xddeeaf73 + .word 0x61609458 + .word 0xf032d00c + .word 0x5d9baf4e + .word 0xb8153b29 + .word 0xd49a7737 + .word 0x93a52e44 + .word 0xe7b3934e + .word 0xbd385c49 + .word 0x90d8a9c0 + .word 0x1fef9ef4 + .word 0xe3f6a1ea + .word 0x7ca56f74 + .word 0x9453478a + .word 0xf0410b36 + .word 0xc691ac8f + .word 0xe040c559 + .word 0x10959a01 + .word 0x84b95fc0 + .word 0x0deadefd + .word 0xf9984491 + .word 0x77d133ac + +/* output buffer */ +.balign 32 +result: +.zero 384 + +/* buffer for Montgomery constant RR */ +.balign 32 +RR: +.zero 384 + +/* buffer for Montgomery constant m0inv (one 32-byte word) */ +.balign 32 +m0inv: +.zero 32 diff --git a/sw/otbn/crypto/tests/rsa_4096_enc_test.exp b/sw/otbn/crypto/tests/rsa_4096_enc_test.exp new file mode 100644 index 0000000000000..c4b751cb8b87e --- /dev/null +++ b/sw/otbn/crypto/tests/rsa_4096_enc_test.exp @@ -0,0 +1,18 @@ +# Expected result: +# 
0x74798a179a6112dcfabdd7a1b39dab868d30bf4bcd359bc6eaf3c9b3626089ecd0851b9d077cb5deafe302f71035e179b528af0b9bfc0055caee605fd12f7f1b2251bde06b292a7c1a69227ba00945bacf857252571fb71a94bd353ea9d0a94d0068060b44c7f5bd416be032723581c824799f44ed947eba9008d1cf0c1a21a139f0125494b07540be6f55b53bcdcd51ea2b8fd0af22d08a3a74d0ad55b7e2481dc14fec2bc701276175f9eaa84298536d21de5d92b970760fd8898a2b9212145ce94070d2e5d75a85cbed16c660ecab4e71bac82519d367336e5975676e58cc08208d24dce920812f607713415ba58038bd9745d4aba1de2b11788baaa5146e0ef5f8da023c7049dabfc84434f02c9bb5d488e0caaa2794374396420f3c29456fd16448c13a3da797e741879aa4b55b6eb69313c02366393bc7e64b0220fb46a0eb481afb669af2192964a13e37b8050bf5472456905fb224ef27d4e86684024766be8859d7ebc910b35af5ef334497929e2120afa5f0b46539a9b58a9e725a84db8c290f547733085ae8970cda95069c67064461c368c38e10c5e3f4ae84f7a87ae5850c73cde81c3021d50b0aa1af472dd0b08d5d983ec8fdec9366ef9c52ddf7de6f81bd22e1bb4f7356fa1d40d0384b46c2997ce8d85d6bb594624b8e97f5e6134ee2bd795d89b996000f2c2e49c13d808b1b46f0fde6a8e6715403e9ad +w0 = 0xf5e6134ee2bd795d89b996000f2c2e49c13d808b1b46f0fde6a8e6715403e9ad +w1 = 0xddf7de6f81bd22e1bb4f7356fa1d40d0384b46c2997ce8d85d6bb594624b8e97 +w2 = 0xa87ae5850c73cde81c3021d50b0aa1af472dd0b08d5d983ec8fdec9366ef9c52 +w3 = 0x84db8c290f547733085ae8970cda95069c67064461c368c38e10c5e3f4ae84f7 +w4 = 0x4766be8859d7ebc910b35af5ef334497929e2120afa5f0b46539a9b58a9e725a +w5 = 0xa0eb481afb669af2192964a13e37b8050bf5472456905fb224ef27d4e8668402 +w6 = 0x6fd16448c13a3da797e741879aa4b55b6eb69313c02366393bc7e64b0220fb46 +w7 = 0x0ef5f8da023c7049dabfc84434f02c9bb5d488e0caaa2794374396420f3c2945 +w8 = 0x08208d24dce920812f607713415ba58038bd9745d4aba1de2b11788baaa5146e +w9 = 0x5ce94070d2e5d75a85cbed16c660ecab4e71bac82519d367336e5975676e58cc +w10 = 0x1dc14fec2bc701276175f9eaa84298536d21de5d92b970760fd8898a2b921214 +w11 = 0x39f0125494b07540be6f55b53bcdcd51ea2b8fd0af22d08a3a74d0ad55b7e248 +w12 = 0x0068060b44c7f5bd416be032723581c824799f44ed947eba9008d1cf0c1a21a1 +w13 = 
0x2251bde06b292a7c1a69227ba00945bacf857252571fb71a94bd353ea9d0a94d +w14 = 0xd0851b9d077cb5deafe302f71035e179b528af0b9bfc0055caee605fd12f7f1b +w15 = 0x74798a179a6112dcfabdd7a1b39dab868d30bf4bcd359bc6eaf3c9b3626089ec diff --git a/sw/otbn/crypto/tests/rsa_4096_enc_test.s b/sw/otbn/crypto/tests/rsa_4096_enc_test.s new file mode 100644 index 0000000000000..d42ea3204cc67 --- /dev/null +++ b/sw/otbn/crypto/tests/rsa_4096_enc_test.s @@ -0,0 +1,329 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + + +.section .text.start + +/** + * Standalone RSA-4096 modexp with e=65537 (encryption/verification). + */ +main: + /* Init all-zero register. */ + bn.xor w31, w31, w31 + + /* Load number of limbs. */ + li x30, 16 + + /* Load pointers to modulus and Montgomery constant buffers. */ + la x16, modulus + la x17, m0inv + la x18, RR + + /* Compute Montgomery constants. */ + jal x1, modload + + /* Run exponentiation. 
+ dmem[result] = dmem[base]^65537 mod dmem[modulus] */ + la x14, base + la x2, result + jal x1, modexp_65537 + + /* copy all limbs of result to wide reg file */ + la x21, result + li x8, 0 + loop x30, 2 + bn.lid x8, 0(x21++) + addi x8, x8, 1 + + ecall + +.data + +/* Modulus n = +0xb25ab5439b6a703a7ba169f099a766944a86466bd18324b2149a23564261af44f1087a7df201eb2dc9583de79d9db60edd4a17aee8ed7b9384de837d70f5f99ad91695d9c780dde5401f160ce02a6135df0ea2339617b962250cf810a2be45acd43b602eddf1be6321d236e6338272e5bd5cda251a896d1d65eb10e2308f9ba8bcf4fb0836a5439c8a86394acdf2a2a3d0b4ae41b75d52894a8d79adfd1cb8db977d42d4865cd9a426bf1156b86e541469ac5a54bc06231da1db901d548cf53f3f003f7cdeee9b1b9ca7b4049b0e36b8cc7fc6d62967ffbffa593aa5cfbb41c68df57003911cf3ba2516378eaa9ee36da6ce4b09d71f072a79615d5619c8132c5467b56eae8a5e2aaa56ac4aa5dc9f696f89dd0cd0f818cdc8b58c938b336f87179cbb52a6a2965a7fcd619a5b315d370bdefeca9cbd6ea39e853f39d39c14f797ca5c31535c89f883cdfbb3bb1934490b136e46f99d8e5411a2a8b73b2519f43d78ee5cc675dcbcfeac8ef693c09a1aa87785cb5713298fa2edfcc67497cc6dbbc5d911edf7b1b5a735f14ab1870b481cd35279c932c74902faf5f047d84e6bedb88c28fced24b3728c5d9dc1114c46bfded6531873e718372dad28aae0a3c4f06dd81542cb9192783a9107a0263c8add0f23b250472f50b18f0e7719a3ba58ba38bc9ab906f86d0507a44690aba5ee96ef1083c237f2f004bff60bc4ecfb99 + */ +.balign 32 +modulus: + .word 0xc4ecfb99 + .word 0x04bff60b + .word 0xc237f2f0 + .word 0x96ef1083 + .word 0x90aba5ee + .word 0x0507a446 + .word 0xb906f86d + .word 0xba38bc9a + .word 0x19a3ba58 + .word 0xb18f0e77 + .word 0x50472f50 + .word 0xdd0f23b2 + .word 0xa0263c8a + .word 0x783a9107 + .word 0x42cb9192 + .word 0xf06dd815 + .word 0xaae0a3c4 + .word 0x372dad28 + .word 0x1873e718 + .word 0xbfded653 + .word 0xc1114c46 + .word 0x728c5d9d + .word 0xfced24b3 + .word 0xedb88c28 + .word 0x47d84e6b + .word 0x02faf5f0 + .word 0xc932c749 + .word 0x1cd35279 + .word 0xb1870b48 + .word 0xa735f14a + .word 0xedf7b1b5 + .word 0xbbc5d911 + .word 0x7497cc6d + .word 0xa2edfcc6 + .word 
0x5713298f + .word 0xa87785cb + .word 0x93c09a1a + .word 0xfeac8ef6 + .word 0xc675dcbc + .word 0x3d78ee5c + .word 0x3b2519f4 + .word 0x11a2a8b7 + .word 0xf99d8e54 + .word 0x0b136e46 + .word 0xbb193449 + .word 0x83cdfbb3 + .word 0x535c89f8 + .word 0x97ca5c31 + .word 0xd39c14f7 + .word 0x9e853f39 + .word 0x9cbd6ea3 + .word 0x0bdefeca + .word 0x5b315d37 + .word 0x7fcd619a + .word 0xa6a2965a + .word 0x179cbb52 + .word 0x8b336f87 + .word 0xc8b58c93 + .word 0xd0f818cd + .word 0x6f89dd0c + .word 0xa5dc9f69 + .word 0xaa56ac4a + .word 0xae8a5e2a + .word 0x5467b56e + .word 0x19c8132c + .word 0x79615d56 + .word 0xd71f072a + .word 0xa6ce4b09 + .word 0xaa9ee36d + .word 0x2516378e + .word 0x911cf3ba + .word 0x8df57003 + .word 0xcfbb41c6 + .word 0xfa593aa5 + .word 0x2967ffbf + .word 0xcc7fc6d6 + .word 0x9b0e36b8 + .word 0x9ca7b404 + .word 0xdeee9b1b + .word 0x3f003f7c + .word 0x548cf53f + .word 0xa1db901d + .word 0xbc06231d + .word 0x69ac5a54 + .word 0xb86e5414 + .word 0x26bf1156 + .word 0x865cd9a4 + .word 0x977d42d4 + .word 0xfd1cb8db + .word 0x4a8d79ad + .word 0xb75d5289 + .word 0xd0b4ae41 + .word 0xcdf2a2a3 + .word 0x8a86394a + .word 0x36a5439c + .word 0xbcf4fb08 + .word 0x308f9ba8 + .word 0x65eb10e2 + .word 0x1a896d1d + .word 0xbd5cda25 + .word 0x338272e5 + .word 0x21d236e6 + .word 0xddf1be63 + .word 0xd43b602e + .word 0xa2be45ac + .word 0x250cf810 + .word 0x9617b962 + .word 0xdf0ea233 + .word 0xe02a6135 + .word 0x401f160c + .word 0xc780dde5 + .word 0xd91695d9 + .word 0x70f5f99a + .word 0x84de837d + .word 0xe8ed7b93 + .word 0xdd4a17ae + .word 0x9d9db60e + .word 0xc9583de7 + .word 0xf201eb2d + .word 0xf1087a7d + .word 0x4261af44 + .word 0x149a2356 + .word 0xd18324b2 + .word 0x4a86466b + .word 0x99a76694 + .word 0x7ba169f0 + .word 0x9b6a703a + .word 0xb25ab543 + + +/* Base for exponentiation (corresponds to plaintext for encryption or + signature for verification). 
+ + Raw hex value (randomly generated) = +0x9e67bf21cfb170bd70edb7b9ffb99fbfe6a681f9e17bc8a966bf55d54794b95f9c4ff3657f3eef86433035ec3cc1fd4c092498a59f3fb5ac0b29c1a7a429130509229a001a86f72182354886779211a3f38ae8b864d094f875cc30bfce8df4a999cd6e43ab25c786ebb4d78bd6f439b278937d6d092be28d986564faab071878f0b4982b70af87c2261a0fc4d58b4c5d227cd880b40af25828988a730746b711cd6aaeec67f07b40df881cc8b784f944f4dc9fccac096631baf8ec17201fbacab0f09cbd2e816495820f6a5d7263ab5dd72cee1c1145327e2696066b6103304206c29ace7f13d92b3a7edf3cb9dc3fe5d2da7c22d16319f2fcdf44a9cf14de57cc75f9395b0d1ebd90c107b74ca88d8c99be1e5a4a41d1fa2285ccd8580f4a6206fb4d0cae5945bcd33b5f1308025f660bf96e3e448216b02da98b86d8e9d633e311b2f19fce4dbaa6317d04aaf360ea9245bd0bb70811e64d87accf8ab6339b063ba26b085c85e369f37c2c62a485fab7f2b22edd5f4f6365c3e47fae372ea3e530796473835384e77187ac856b9ebbc3c10f1a0394e9a9c25a8e635a55ac907a011119aa5d00edc26f0b64e9972391ba545a03e003624e0624f824c22710237e8f97a07ffcff0106684f0c17b8df6f975bdd5f286a95f7635416b3e9129aa81e4cee9932dfe177f7f33897412fdde0e8b87f6cc0c54ab2c8f022dcb7fef768 + */ +.balign 32 +base: + .word 0xb7fef768 + .word 0xc8f022dc + .word 0xc0c54ab2 + .word 0xe8b87f6c + .word 0x412fdde0 + .word 0xf7f33897 + .word 0x32dfe177 + .word 0x1e4cee99 + .word 0xe9129aa8 + .word 0x635416b3 + .word 0x286a95f7 + .word 0x975bdd5f + .word 0x17b8df6f + .word 0x06684f0c + .word 0x7ffcff01 + .word 0x7e8f97a0 + .word 0xc2271023 + .word 0x0624f824 + .word 0xe003624e + .word 0xba545a03 + .word 0xe9972391 + .word 0xc26f0b64 + .word 0xaa5d00ed + .word 0x7a011119 + .word 0x5a55ac90 + .word 0xc25a8e63 + .word 0x0394e9a9 + .word 0xc3c10f1a + .word 0x856b9ebb + .word 0xe77187ac + .word 0x73835384 + .word 0xe5307964 + .word 0xae372ea3 + .word 0x65c3e47f + .word 0xdd5f4f63 + .word 0xb7f2b22e + .word 0x62a485fa + .word 0x69f37c2c + .word 0x085c85e3 + .word 0x063ba26b + .word 0x8ab6339b + .word 0x4d87accf + .word 0xb70811e6 + .word 0x9245bd0b + .word 0xaaf360ea + .word 0xa6317d04 + .word 0x9fce4dba + .word 
0xe311b2f1 + .word 0xd8e9d633 + .word 0x2da98b86 + .word 0x448216b0 + .word 0x0bf96e3e + .word 0x08025f66 + .word 0xd33b5f13 + .word 0xae5945bc + .word 0x06fb4d0c + .word 0x580f4a62 + .word 0x2285ccd8 + .word 0x4a41d1fa + .word 0x99be1e5a + .word 0x4ca88d8c + .word 0x90c107b7 + .word 0x5b0d1ebd + .word 0xcc75f939 + .word 0xcf14de57 + .word 0xfcdf44a9 + .word 0xd16319f2 + .word 0xd2da7c22 + .word 0xb9dc3fe5 + .word 0x3a7edf3c + .word 0x7f13d92b + .word 0x06c29ace + .word 0x61033042 + .word 0x2696066b + .word 0x1145327e + .word 0xd72cee1c + .word 0x7263ab5d + .word 0x820f6a5d + .word 0x2e816495 + .word 0xb0f09cbd + .word 0x201fbaca + .word 0xbaf8ec17 + .word 0xac096631 + .word 0xf4dc9fcc + .word 0xb784f944 + .word 0xdf881cc8 + .word 0x67f07b40 + .word 0xcd6aaeec + .word 0x0746b711 + .word 0x28988a73 + .word 0xb40af258 + .word 0x227cd880 + .word 0xd58b4c5d + .word 0x261a0fc4 + .word 0x70af87c2 + .word 0xf0b4982b + .word 0xab071878 + .word 0x986564fa + .word 0x092be28d + .word 0x78937d6d + .word 0xd6f439b2 + .word 0xebb4d78b + .word 0xab25c786 + .word 0x99cd6e43 + .word 0xce8df4a9 + .word 0x75cc30bf + .word 0x64d094f8 + .word 0xf38ae8b8 + .word 0x779211a3 + .word 0x82354886 + .word 0x1a86f721 + .word 0x09229a00 + .word 0xa4291305 + .word 0x0b29c1a7 + .word 0x9f3fb5ac + .word 0x092498a5 + .word 0x3cc1fd4c + .word 0x433035ec + .word 0x7f3eef86 + .word 0x9c4ff365 + .word 0x4794b95f + .word 0x66bf55d5 + .word 0xe17bc8a9 + .word 0xe6a681f9 + .word 0xffb99fbf + .word 0x70edb7b9 + .word 0xcfb170bd + .word 0x9e67bf21 + + +/* output buffer */ +.balign 32 +result: +.zero 512 + +/* buffer for Montgomery constant RR */ +.balign 32 +RR: +.zero 512 + +/* buffer for Montgomery constant m0inv */ +.balign 32 +m0inv: +.zero 32 diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkp_good_test.exp b/sw/otbn/crypto/tests/rsa_keygen_checkp_good_test.exp new file mode 100644 index 0000000000000..250028a7d63f7 --- /dev/null +++ b/sw/otbn/crypto/tests/rsa_keygen_checkp_good_test.exp @@ -0,0 +1,2 
@@ +# Expect 2^256 - 1 (check passed). +w24 = 0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkp_good_test.s b/sw/otbn/crypto/tests/rsa_keygen_checkp_good_test.s new file mode 100644 index 0000000000000..19d5c6f5bc7f0 --- /dev/null +++ b/sw/otbn/crypto/tests/rsa_keygen_checkp_good_test.s @@ -0,0 +1,31 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/** + * Ensure that a good value for p passes RSA keygen checks. + * + * Uses the test data from `rsa_keygen_checkpq_test_data`, which is sized for + * RSA-2048. + */ + +.section .text.start + +main: + /* Init all-zero register. */ + bn.xor w31, w31, w31 + + /* Load the number of limbs for this test. */ + li x30, 4 + li x31, 3 + + /* Load required constants. */ + li x20, 20 + li x21, 21 + + /* Check an acceptable value of p. + w24 <= 2^256-1 if the check passed, otherwise 0 */ + la x16, good_p + jal x1, check_p + + ecall diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkp_not_prime_test.exp b/sw/otbn/crypto/tests/rsa_keygen_checkp_not_prime_test.exp new file mode 100644 index 0000000000000..75275f176e56d --- /dev/null +++ b/sw/otbn/crypto/tests/rsa_keygen_checkp_not_prime_test.exp @@ -0,0 +1,2 @@ +# Expect 0 (check failed). +w24 = 0 diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkp_not_prime_test.s b/sw/otbn/crypto/tests/rsa_keygen_checkp_not_prime_test.s new file mode 100644 index 0000000000000..658faaf11550f --- /dev/null +++ b/sw/otbn/crypto/tests/rsa_keygen_checkp_not_prime_test.s @@ -0,0 +1,31 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/** + * Ensure that a nonprime value for p fails RSA keygen checks. + * + * Uses the test data from `rsa_keygen_checkpq_test_data`, which is sized for + * RSA-2048. 
+ */ + +.section .text.start + +main: + /* Init all-zero register. */ + bn.xor w31, w31, w31 + + /* Load the number of limbs for this test. */ + li x30, 4 + li x31, 3 + + /* Load required constants. */ + li x20, 20 + li x21, 21 + + /* Check a value of p that is nonprime. + w24 <= 2^256-1 if the check passed, otherwise 0 */ + la x16, not_prime + jal x1, check_p + + ecall diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkp_not_relprime_test.exp b/sw/otbn/crypto/tests/rsa_keygen_checkp_not_relprime_test.exp new file mode 100644 index 0000000000000..75275f176e56d --- /dev/null +++ b/sw/otbn/crypto/tests/rsa_keygen_checkp_not_relprime_test.exp @@ -0,0 +1,2 @@ +# Expect 0 (check failed). +w24 = 0 diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkp_not_relprime_test.s b/sw/otbn/crypto/tests/rsa_keygen_checkp_not_relprime_test.s new file mode 100644 index 0000000000000..fc2023840c22c --- /dev/null +++ b/sw/otbn/crypto/tests/rsa_keygen_checkp_not_relprime_test.s @@ -0,0 +1,31 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/** + * Ensure that a multiple of F4 fails RSA keygen checks for p. + * + * Uses the test data from `rsa_keygen_checkpq_test_data`, which is sized for + * RSA-2048. + */ + +.section .text.start + +main: + /* Init all-zero register. */ + bn.xor w31, w31, w31 + + /* Load the number of limbs for this test. */ + li x30, 4 + li x31, 3 + + /* Load required constants. */ + li x20, 20 + li x21, 21 + + /* Check a value of p that is not relatively prime to F4. 
+ w24 <= 2^256-1 if the check passed, otherwise 0 */ + la x16, not_relprime + jal x1, check_p + + ecall diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkpq_test.exp b/sw/otbn/crypto/tests/rsa_keygen_checkpq_test.exp deleted file mode 100644 index 9fc6e72ba9642..0000000000000 --- a/sw/otbn/crypto/tests/rsa_keygen_checkpq_test.exp +++ /dev/null @@ -1,13 +0,0 @@ -# Expected values are the "good" p and q. - -# p = 0xe85547c5336579f83a2d50a611f489a4f2c3a918d2027fbc3f25c2de2dd36cdedc8901266de144a223b2c78a5a11024488a4aa2f4ef71f0fb93dfdbb2280b4d99dc9b3b77b039fd9fefcc3fe439e2bcb3db3ee3c0378a4d1297c1a5eebcd0d4ab3c0b50eb1511605c7c0907af31564ec5cc635e3de465e99cf6169c933ca0ab5 -w0 = 0xb3c0b50eb1511605c7c0907af31564ec5cc635e3de465e99cf6169c933ca0ab5 -w1 = 0x9dc9b3b77b039fd9fefcc3fe439e2bcb3db3ee3c0378a4d1297c1a5eebcd0d4a -w2 = 0xdc8901266de144a223b2c78a5a11024488a4aa2f4ef71f0fb93dfdbb2280b4d9 -w3 = 0xe85547c5336579f83a2d50a611f489a4f2c3a918d2027fbc3f25c2de2dd36cde - -# q = 0xb863a172d3d5562b582f38e251e540b424d4cbadd5da0ce64cb755227227b9535e0ab2437c1522415a70211eaa1dc4b4192b33148b1226da2ed107b64beeac72b112d99b960df54e21336a13aef97b5ec8646752af38385314a81a531bced7da5a781f6b19d119805941c47777a7aa9580a35b9f75c7dd97545d70790d7e8e9d -w4 = 0x5a781f6b19d119805941c47777a7aa9580a35b9f75c7dd97545d70790d7e8e9d -w5 = 0xb112d99b960df54e21336a13aef97b5ec8646752af38385314a81a531bced7da -w6 = 0x5e0ab2437c1522415a70211eaa1dc4b4192b33148b1226da2ed107b64beeac72 -w7 = 0xb863a172d3d5562b582f38e251e540b424d4cbadd5da0ce64cb755227227b953 diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkpq_test.s b/sw/otbn/crypto/tests/rsa_keygen_checkpq_test.s deleted file mode 100644 index e977a97bd4274..0000000000000 --- a/sw/otbn/crypto/tests/rsa_keygen_checkpq_test.s +++ /dev/null @@ -1,531 +0,0 @@ -/* Copyright lowRISC contributors. */ -/* Licensed under the Apache License, Version 2.0, see LICENSE for details. 
*/ -/* SPDX-License-Identifier: Apache-2.0 */ - -/** - * Standalone test for checks on RSA keygen p and q values. - * - * See FIPS 186-5 section A.1.3 for the full specification of requirements on p - * and q. The value for p must satisfy: - * - p % 2 = 1 - * - p >= sqrt(2)*(2^(nlen/2 - 1)), where nlen = RSA public key length - * - GCD(p-1,65537) = 1 - * - p is probably prime - * - * For q, we need to satisfy the same requirements as p plus one more: q must - * not be too close to p. Specifically, we need to reject the value if: - * |p-q| < 2^(nlen/2 - 100). - * - * We don't test the oddness requirement here, since the `check_*` routines - * require oddness as a precondition. However, all other requirements are - * tested. - * - * For we use 4-limb (1024-bit) values for p and q in this test, which - * correspond to RSA-2048. - */ - -.section .text.start - -main: - /* Init all-zero register. */ - bn.xor w31, w31, w31 - - /* Load the number of limbs for this test. */ - li x30, 4 - li x31, 3 - - /* Load required constants. */ - li x20, 20 - li x21, 21 - - /* Zeroize the buffer for q so that, if we never get to checking it and - writing any real data there, we don't get DMEM integrity errors when we - try to load it to registers. */ - la x2, zero - jal x1, copy_to_rsa_q - - /* Check a value of p that is too small. */ - la x16, too_small - jal x1, test_bad_p - - /* Check a value of p such that GCD(p-1, 65537) != 1. */ - la x16, not_relprime - jal x1, test_bad_p - - /* Check a value of p that is not prime. */ - la x16, not_prime - jal x1, test_bad_p - - /* Check a value of p that is acceptable. */ - la x16, good_p - jal x1, check_p - jal x1, last_check_to_x2 - - /* Copy the good value of p into dmem[rsa_p] for the q checks. */ - la x3, rsa_p - loop x30, 2 - bn.lid x20, 0(x16++) - bn.sid x20, 0(x3++) - - /* If x2 != 0, the check failed; point to zeroes and exit. */ - la x16, zero - bne x2, x0, _program_exit - - /* Check a value of q that is too small. 
*/ - la x2, too_small - jal x1, test_bad_q - - /* Check a value of q that is too close to p. */ - la x2, too_close - jal x1, test_bad_q - - /* Check a value of q that is acceptable. */ - la x2, good_q - jal x1, copy_to_rsa_q - jal x1, check_q - jal x1, last_check_to_x2 - - /* If x2 == 0, the check passed; jump to exit without zeroing q. */ - beq x2, x0, _program_exit_load_p - - /* If we get here, the good value of q failed; zeroize rsa_q. */ - la x2, zero - jal x1, copy_to_rsa_q - -_program_exit_load_p: - /* This jump point sets x16=rsa_p so p is loaded from that buffer instead of - whatever's in x16. */ - la x16, rsa_p - -_program_exit: - /* Load the selected value of p (or bad value) into registers. - w0,w1,w2,w3 <= dmem[x16..x16+(4*32)] */ - li x3, 0 - loop x30, 2 - bn.lid x3, 0(x16++) - addi x3, x3, 1 - - /* Load the selected value of q into registers. - w4,w5,w6,w7 <= dmem[rsa_q..rsa_q+(4*32)] */ - la x2, rsa_q - loop x30, 2 - bn.lid x3, 0(x2++) - addi x3, x3, 1 - - ecall - -/** - * Copy the value to dmem[rsa_q]. - * - * @param[in] x2: pointer to value to copy - * @param[in] x30: number of limbs - */ -copy_to_rsa_q: - la x3, rsa_q - loop x30, 2 - bn.lid x20, 0(x2++) - bn.sid x20, 0(x3++) - ret - -/** - * Test a bad value for p. - * - * @param[in] x16: pointer to value for test - */ -test_bad_p: - /* Run checks and ensure they failed. */ - jal x1, check_p - jal x1, last_check_to_x2 - - /* If x2 == 0, the check passed, so jump to the exit sequence. */ - beq x2, x0, _program_exit - - /* If we get here, all is well; return to the caller. */ - ret - -/** - * Test a bad value for q. - * - * @param[in] x2: pointer to value for test - */ -test_bad_q: - /* Copy the test value into dmem[rsa_q]. */ - jal x1, copy_to_rsa_q - - /* Run checks and ensure they failed. */ - jal x1, check_q - jal x1, last_check_to_x2 - - /* If x2 == 0, the check passed, so jump to the exit sequence. */ - beq x2, x0, _program_exit_load_p - - /* If we get here, all is well; return to the caller. 
*/ - ret - -/** - * Get the result of the last check in a register. - * - * The result is nonzero if the check FAILED, and zero if it passed. - * - * @param[in] w24: result of last check (all-1 or all-0). - * @param[in] w31: all-zero. - * @param[out] x2: 0 if w24 == 0, otherwise nonzero - */ -last_check_to_x2: - /* Compare the result of the check to zero. - FG0.Z <= (w24 == 0) */ - bn.cmp w24, w31 - - /* Get the FG0.Z flag into a register. - x2 <= CSRs[FG0] & 8 = FG0.Z << 3 */ - csrrs x2, 0x7c0, x0 - andi x2, x2, 8 - ret - -.data - -/* Note: Some of the Python scripts shown below reference the lower bound for - p/q as a Python variable called lower_bound. This value was generated and - checked for RSA-4096 as specified in BoringSSL: - https://boringssl.googlesource.com/boringssl/+/dcabfe2d8940529a69e007660fa7bf6c15954ecc/crypto/fipsmodule/rsa/rsa_impl.c#1006 - - The value for RSA-2048, as used in these tests, is simply the value for - RSA-4096 shifted right by 1024 bits. We can check it using: - >> lower_bound**2 < 2**2047 < (lower_bound+1)**2 - True - - For reference, the hex value of the RSA-2048 lower bound is: - 0xb504f333f9de6484597d89b3754abe9f1d6f60ba893ba84ced17ac85833399154afc83043ab8a2c3a8b1fe6fdc83db390f74a85e439c7b4a780487363dfa2768d2202e8742af1f4e53059c6011bc337bcab1bc911688458a460abc722f7c4e33c6d5a8a38bb7e9dccb2a634331f3c84df52f120f836e582eeaa4a0899040ca4a -*/ - -/** - * An odd 1024-bit value that is too small to be used for p or q. - * - * Specifically, this value is the highest prime number below the lower bound. 
- * - * Python script for generating the test data (using PyCryptoDome's - * Crypto.Util.number package for the primality check): -too_small = lower_bound - 1 -while True: - if math.gcd(too_small-1, 65537) != 1: - continue - if number.isPrime(too_small): - break - too_small -= 2 - * - * Hex value for reference: - * 0xb504f333f9de6484597d89b3754abe9f1d6f60ba893ba84ced17ac85833399154afc83043ab8a2c3a8b1fe6fdc83db390f74a85e439c7b4a780487363dfa2768d2202e8742af1f4e53059c6011bc337bcab1bc911688458a460abc722f7c4e33c6d5a8a38bb7e9dccb2a634331f3c84df52f120f836e582eeaa4a0899040c619 - */ -.balign 32 -too_small: - .word 0x9040c619 - .word 0xeaa4a089 - .word 0x836e582e - .word 0xf52f120f - .word 0x31f3c84d - .word 0xcb2a6343 - .word 0x8bb7e9dc - .word 0xc6d5a8a3 - .word 0x2f7c4e33 - .word 0x460abc72 - .word 0x1688458a - .word 0xcab1bc91 - .word 0x11bc337b - .word 0x53059c60 - .word 0x42af1f4e - .word 0xd2202e87 - .word 0x3dfa2768 - .word 0x78048736 - .word 0x439c7b4a - .word 0x0f74a85e - .word 0xdc83db39 - .word 0xa8b1fe6f - .word 0x3ab8a2c3 - .word 0x4afc8304 - .word 0x83339915 - .word 0xed17ac85 - .word 0x893ba84c - .word 0x1d6f60ba - .word 0x754abe9f - .word 0x597d89b3 - .word 0xf9de6484 - .word 0xb504f333 - -/** - * An 1024-bit value that doesn't satisfy relative primality with 65537. - * - * This number is selected to be larger than the lower bound and prime, so it - * doesn't fail any other checks than GCD(p-1,e)=1. 
- * - * Python script for generating the test data (using PyCryptoDome's - * Crypto.Util.number package for the primality check): -while True: - y = random.randrange(lower_bound, (1 << 1024)) - y -= (y % 65537) - if (y & 1 == 0) and number.isPrime(y+1): - break -not_relprime = y+1 - * - * Hex value for reference: - * 0xf36b245b0051285df9f46be79c821a95584a00007b907c4102578d6c8c5d459c4328a174859c703e66bc706a9224e20f387da68e80a362fb1f0f36a912df95c26dc8b40902bff546d3aff671eea79a86df507180e0fba265c0ab601e582580f9fb18a62f9ff4e92d8d698408be08d7c24507244c6d3859be3804f2a7d9f16867 - */ -.balign 32 -not_relprime: - .word 0xd9f16867 - .word 0x3804f2a7 - .word 0x6d3859be - .word 0x4507244c - .word 0xbe08d7c2 - .word 0x8d698408 - .word 0x9ff4e92d - .word 0xfb18a62f - .word 0x582580f9 - .word 0xc0ab601e - .word 0xe0fba265 - .word 0xdf507180 - .word 0xeea79a86 - .word 0xd3aff671 - .word 0x02bff546 - .word 0x6dc8b409 - .word 0x12df95c2 - .word 0x1f0f36a9 - .word 0x80a362fb - .word 0x387da68e - .word 0x9224e20f - .word 0x66bc706a - .word 0x859c703e - .word 0x4328a174 - .word 0x8c5d459c - .word 0x02578d6c - .word 0x7b907c41 - .word 0x584a0000 - .word 0x9c821a95 - .word 0xf9f46be7 - .word 0x0051285d - .word 0xf36b245b - -/** - * An 1024-bit value that passes other checks but isn't prime. 
- * - * Python script for generating the test data (using PyCryptoDome's - * Crypto.Util.number package for the primality check): -while True: - not_prime = random.randrange(lower_bound, (1 << 1024)) - not_prime |= 1 - if math.gcd(not_prime, 65537) != 1: - continue - if not number.isPrime(not_prime): - break - * - * Hex value for reference: - * 0xecbbd72477e406de8ff72a93afbe19ed4258d3dd8cfa5b2a8b5c76d22053504710a8460c30c5141fc581df484e58a2bd019c03a1acab6c7fd70f9865ac6dcdcce4cca95266e4d2dea9a408b8ded6591daa4416bb7ca78357cad5c7d527d46a06807337d6845484589c8010eb6b674194608e1b9732db4e8cee053d2572158cf5 - */ -.balign 32 -not_prime: - .word 0x72158cf5 - .word 0xee053d25 - .word 0x32db4e8c - .word 0x608e1b97 - .word 0x6b674194 - .word 0x9c8010eb - .word 0x84548458 - .word 0x807337d6 - .word 0x27d46a06 - .word 0xcad5c7d5 - .word 0x7ca78357 - .word 0xaa4416bb - .word 0xded6591d - .word 0xa9a408b8 - .word 0x66e4d2de - .word 0xe4cca952 - .word 0xac6dcdcc - .word 0xd70f9865 - .word 0xacab6c7f - .word 0x019c03a1 - .word 0x4e58a2bd - .word 0xc581df48 - .word 0x30c5141f - .word 0x10a8460c - .word 0x20535047 - .word 0x8b5c76d2 - .word 0x8cfa5b2a - .word 0x4258d3dd - .word 0xafbe19ed - .word 0x8ff72a93 - .word 0x77e406de - .word 0xecbbd724 - -/** - * An acceptable value for p. - * - * To make sure the checks on q are being tested, this value is specifically - * chosen to be far enough away from the "bad" values of q that they wouldn't - * be rejected on that basis. 
- * - * Python script for generating p (using PyCryptoDome's Crypto.Util.number - * package for the primality check): -while True: - p = random.randrange(lower_bound, 1 << 1024) - p |= 1 - if abs(p - too_small) < (1 << 924): - continue - if abs(p - not_relprime) < (1 << 924): - continue - if abs(p - not_prime) < (1 << 924): - continue - if math.gcd(p-1, 65537) != 1: - continue - if number.isPrime(p): - break - * - * Hex value for reference: - * 0xe85547c5336579f83a2d50a611f489a4f2c3a918d2027fbc3f25c2de2dd36cdedc8901266de144a223b2c78a5a11024488a4aa2f4ef71f0fb93dfdbb2280b4d99dc9b3b77b039fd9fefcc3fe439e2bcb3db3ee3c0378a4d1297c1a5eebcd0d4ab3c0b50eb1511605c7c0907af31564ec5cc635e3de465e99cf6169c933ca0ab5 - */ -.balign 32 -good_p: - .word 0x33ca0ab5 - .word 0xcf6169c9 - .word 0xde465e99 - .word 0x5cc635e3 - .word 0xf31564ec - .word 0xc7c0907a - .word 0xb1511605 - .word 0xb3c0b50e - .word 0xebcd0d4a - .word 0x297c1a5e - .word 0x0378a4d1 - .word 0x3db3ee3c - .word 0x439e2bcb - .word 0xfefcc3fe - .word 0x7b039fd9 - .word 0x9dc9b3b7 - .word 0x2280b4d9 - .word 0xb93dfdbb - .word 0x4ef71f0f - .word 0x88a4aa2f - .word 0x5a110244 - .word 0x23b2c78a - .word 0x6de144a2 - .word 0xdc890126 - .word 0x2dd36cde - .word 0x3f25c2de - .word 0xd2027fbc - .word 0xf2c3a918 - .word 0x11f489a4 - .word 0x3a2d50a6 - .word 0x336579f8 - .word 0xe85547c5 - -/** - * A value for q that is too close to p, but meets other requirements. 
- * - * Python script for generating test data (using PyCryptoDome's - * Crypto.Util.number package for the primality check): -while True: - too_close = random.randrange(p - (1 << 924), p + (1 << 924)) - if too_close & 1 == 0: - continue - if too_close < lower_bound: - continue - if math.gcd(too_close - 1, 65537) != 1: - continue - if number.isPrime(too_close): - break - * - * Hex value for reference: - * 0xe85547c5336579f83a2d50a60364d13462f8746c6177f91a902b276464b8c39d0ffeb8d77af899a932ed3198d0d3ca66948d678bf7e95f30e95014fdb0a3b13c56927a70b14191134664a3374ada1d0a3d3dfb0a8fbf3704ef0e8588eafebd9e81f0dca5b7b5cca8b753862a472ed36b8c820c618110ca8936e79789e4ec8b71 - */ -.balign 32 -too_close: - .word 0xe4ec8b71 - .word 0x36e79789 - .word 0x8110ca89 - .word 0x8c820c61 - .word 0x472ed36b - .word 0xb753862a - .word 0xb7b5cca8 - .word 0x81f0dca5 - .word 0xeafebd9e - .word 0xef0e8588 - .word 0x8fbf3704 - .word 0x3d3dfb0a - .word 0x4ada1d0a - .word 0x4664a337 - .word 0xb1419113 - .word 0x56927a70 - .word 0xb0a3b13c - .word 0xe95014fd - .word 0xf7e95f30 - .word 0x948d678b - .word 0xd0d3ca66 - .word 0x32ed3198 - .word 0x7af899a9 - .word 0x0ffeb8d7 - .word 0x64b8c39d - .word 0x902b2764 - .word 0x6177f91a - .word 0x62f8746c - .word 0x0364d134 - .word 0x3a2d50a6 - .word 0x336579f8 - .word 0xe85547c5 - -/** - * An acceptable value for q. 
- * - * Python script for generating q (using PyCryptoDome's Crypto.Util.number - * package for the primality check): -while True: - q = random.randrange(lower_bound, 1 << 1024) - q |= 1 - if abs(p - q) < (1 << 924): - continue - if math.gcd(q-1, 65537) != 1: - continue - if number.isPrime(q): - break - * - * Hex value for reference: - * 0xb863a172d3d5562b582f38e251e540b424d4cbadd5da0ce64cb755227227b9535e0ab2437c1522415a70211eaa1dc4b4192b33148b1226da2ed107b64beeac72b112d99b960df54e21336a13aef97b5ec8646752af38385314a81a531bced7da5a781f6b19d119805941c47777a7aa9580a35b9f75c7dd97545d70790d7e8e9d - */ -.balign 32 -good_q: - .word 0x0d7e8e9d - .word 0x545d7079 - .word 0x75c7dd97 - .word 0x80a35b9f - .word 0x77a7aa95 - .word 0x5941c477 - .word 0x19d11980 - .word 0x5a781f6b - .word 0x1bced7da - .word 0x14a81a53 - .word 0xaf383853 - .word 0xc8646752 - .word 0xaef97b5e - .word 0x21336a13 - .word 0x960df54e - .word 0xb112d99b - .word 0x4beeac72 - .word 0x2ed107b6 - .word 0x8b1226da - .word 0x192b3314 - .word 0xaa1dc4b4 - .word 0x5a70211e - .word 0x7c152241 - .word 0x5e0ab243 - .word 0x7227b953 - .word 0x4cb75522 - .word 0xd5da0ce6 - .word 0x24d4cbad - .word 0x51e540b4 - .word 0x582f38e2 - .word 0xd3d5562b - .word 0xb863a172 - -/** - * Zeroes to point to for the "good value failed" case. - */ -.balign 32 -zero: -.zero 128 diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkpq_test_data.s b/sw/otbn/crypto/tests/rsa_keygen_checkpq_test_data.s new file mode 100644 index 0000000000000..173267dd31afe --- /dev/null +++ b/sw/otbn/crypto/tests/rsa_keygen_checkpq_test_data.s @@ -0,0 +1,371 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/** + * Data to test checks on RSA keygen p and q values. + * + * See FIPS 186-5 section A.1.3 for the full specification of requirements on p + * and q. 
The value for p must satisfy: + * - p % 2 = 1 + * - p >= sqrt(2)*(2^(nlen/2 - 1)), where nlen = RSA public key length + * - GCD(p-1,65537) = 1 + * - p is probably prime + * + * For q, we need to satisfy the same requirements as p plus one more: q must + * not be too close to p. Specifically, we need to reject the value if: + * |p-q| < 2^(nlen/2 - 100). + * + * This test data includes values of p and q that each fail exactly one + * condition, as well as two "good" values of p and q that are compatible with + * each other. + * + * This test data uses 4-limb (1024-bit) values for p and q, which correspond + * to RSA-2048. + */ + +.data + +/* Note: Some of the Python scripts shown below reference the lower bound for + p/q as a Python variable called lower_bound. This value was generated and + checked for RSA-4096 as specified in BoringSSL: + https://boringssl.googlesource.com/boringssl/+/dcabfe2d8940529a69e007660fa7bf6c15954ecc/crypto/fipsmodule/rsa/rsa_impl.c#1006 + + The value for RSA-2048, as used in these tests, is simply the value for + RSA-4096 shifted right by 1024 bits. We can check it using: + >> lower_bound**2 < 2**2047 < (lower_bound+1)**2 + True + + For reference, the hex value of the RSA-2048 lower bound is: + 0xb504f333f9de6484597d89b3754abe9f1d6f60ba893ba84ced17ac85833399154afc83043ab8a2c3a8b1fe6fdc83db390f74a85e439c7b4a780487363dfa2768d2202e8742af1f4e53059c6011bc337bcab1bc911688458a460abc722f7c4e33c6d5a8a38bb7e9dccb2a634331f3c84df52f120f836e582eeaa4a0899040ca4a +*/ + +/** + * An odd 1024-bit value that is too small to be used for p or q. + * + * Specifically, this value is the highest prime number below the lower bound. 
+ * + * Python script for generating the test data (using PyCryptoDome's + * Crypto.Util.number package for the primality check): +too_small = lower_bound + 1 +while True: + too_small -= 2 + if math.gcd(too_small-1, 65537) != 1: + continue + if number.isPrime(too_small): + break + * + * Hex value for reference: + * 0xb504f333f9de6484597d89b3754abe9f1d6f60ba893ba84ced17ac85833399154afc83043ab8a2c3a8b1fe6fdc83db390f74a85e439c7b4a780487363dfa2768d2202e8742af1f4e53059c6011bc337bcab1bc911688458a460abc722f7c4e33c6d5a8a38bb7e9dccb2a634331f3c84df52f120f836e582eeaa4a0899040c619 + */ +.balign 32 +.globl too_small +too_small: + .word 0x9040c619 + .word 0xeaa4a089 + .word 0x836e582e + .word 0xf52f120f + .word 0x31f3c84d + .word 0xcb2a6343 + .word 0x8bb7e9dc + .word 0xc6d5a8a3 + .word 0x2f7c4e33 + .word 0x460abc72 + .word 0x1688458a + .word 0xcab1bc91 + .word 0x11bc337b + .word 0x53059c60 + .word 0x42af1f4e + .word 0xd2202e87 + .word 0x3dfa2768 + .word 0x78048736 + .word 0x439c7b4a + .word 0x0f74a85e + .word 0xdc83db39 + .word 0xa8b1fe6f + .word 0x3ab8a2c3 + .word 0x4afc8304 + .word 0x83339915 + .word 0xed17ac85 + .word 0x893ba84c + .word 0x1d6f60ba + .word 0x754abe9f + .word 0x597d89b3 + .word 0xf9de6484 + .word 0xb504f333 + +/** + * A 1024-bit value that doesn't satisfy relative primality with 65537. + * + * This number is selected to be larger than the lower bound and prime, so it + * doesn't fail any other checks than GCD(p-1,e)=1.
+ * + * Python script for generating the test data (using PyCryptoDome's + * Crypto.Util.number package for the primality check): +while True: + y = random.randrange(lower_bound, (1 << 1024)) + y -= (y % 65537) + if (y & 1 == 0) and number.isPrime(y+1): + break +not_relprime = y+1 + * + * Hex value for reference: + * 0xf36b245b0051285df9f46be79c821a95584a00007b907c4102578d6c8c5d459c4328a174859c703e66bc706a9224e20f387da68e80a362fb1f0f36a912df95c26dc8b40902bff546d3aff671eea79a86df507180e0fba265c0ab601e582580f9fb18a62f9ff4e92d8d698408be08d7c24507244c6d3859be3804f2a7d9f16867 + */ +.balign 32 +.globl not_relprime +not_relprime: + .word 0xd9f16867 + .word 0x3804f2a7 + .word 0x6d3859be + .word 0x4507244c + .word 0xbe08d7c2 + .word 0x8d698408 + .word 0x9ff4e92d + .word 0xfb18a62f + .word 0x582580f9 + .word 0xc0ab601e + .word 0xe0fba265 + .word 0xdf507180 + .word 0xeea79a86 + .word 0xd3aff671 + .word 0x02bff546 + .word 0x6dc8b409 + .word 0x12df95c2 + .word 0x1f0f36a9 + .word 0x80a362fb + .word 0x387da68e + .word 0x9224e20f + .word 0x66bc706a + .word 0x859c703e + .word 0x4328a174 + .word 0x8c5d459c + .word 0x02578d6c + .word 0x7b907c41 + .word 0x584a0000 + .word 0x9c821a95 + .word 0xf9f46be7 + .word 0x0051285d + .word 0xf36b245b + +/** + * A 1024-bit value that passes other checks but isn't prime.
+ * + * Python script for generating the test data (using PyCryptoDome's + * Crypto.Util.number package for the primality check): +while True: + not_prime = random.randrange(lower_bound, (1 << 1024)) + not_prime |= 3 + if math.gcd(not_prime, 65537) != 1: + continue + if not number.isPrime(not_prime): + break + * + * Hex value for reference: + * 0xecbbd72477e406de8ff72a93afbe19ed4258d3dd8cfa5b2a8b5c76d22053504710a8460c30c5141fc581df484e58a2bd019c03a1acab6c7fd70f9865ac6dcdcce4cca95266e4d2dea9a408b8ded6591daa4416bb7ca78357cad5c7d527d46a06807337d6845484589c8010eb6b674194608e1b9732db4e8cee053d2572158cf7 + */ +.balign 32 +.globl not_prime +not_prime: + .word 0x72158cf7 + .word 0xee053d25 + .word 0x32db4e8c + .word 0x608e1b97 + .word 0x6b674194 + .word 0x9c8010eb + .word 0x84548458 + .word 0x807337d6 + .word 0x27d46a06 + .word 0xcad5c7d5 + .word 0x7ca78357 + .word 0xaa4416bb + .word 0xded6591d + .word 0xa9a408b8 + .word 0x66e4d2de + .word 0xe4cca952 + .word 0xac6dcdcc + .word 0xd70f9865 + .word 0xacab6c7f + .word 0x019c03a1 + .word 0x4e58a2bd + .word 0xc581df48 + .word 0x30c5141f + .word 0x10a8460c + .word 0x20535047 + .word 0x8b5c76d2 + .word 0x8cfa5b2a + .word 0x4258d3dd + .word 0xafbe19ed + .word 0x8ff72a93 + .word 0x77e406de + .word 0xecbbd724 + +/** + * An acceptable value for p. + * + * To make sure the checks on q are being tested, this value is specifically + * chosen to be far enough away from the "bad" values of q that they wouldn't + * be rejected on that basis. 
+ * + * Python script for generating p (using PyCryptoDome's Crypto.Util.number + * package for the primality check): +while True: + p = random.randrange(lower_bound, 1 << 1024) + p |= 3 + if abs(p - too_small) < (1 << 924): + continue + if abs(p - not_relprime) < (1 << 924): + continue + if abs(p - not_prime) < (1 << 924): + continue + if math.gcd(p-1, 65537) != 1: + continue + if number.isPrime(p): + break + * + * Hex value for reference: + * 0xd10b3338d7d2cca85be7b76c5497f2fe89a9f9b73e613262565636dbc5901c386b1df3c7b8eb3ac8548a9062a5958b33c84dfe0fa9e2c61250d75683be1585008f926d5cfc4d3a3f003746a3beefcc71d287133768fc0268e1f84cb791be8e6dfc48b706ee0515089ff618c0a648854d6a93e9a0452552e93720ffa2021fd53b + */ +.balign 32 +.globl good_p +good_p: + .word 0x021fd53b + .word 0x3720ffa2 + .word 0x452552e9 + .word 0x6a93e9a0 + .word 0xa648854d + .word 0x9ff618c0 + .word 0xee051508 + .word 0xfc48b706 + .word 0x91be8e6d + .word 0xe1f84cb7 + .word 0x68fc0268 + .word 0xd2871337 + .word 0xbeefcc71 + .word 0x003746a3 + .word 0xfc4d3a3f + .word 0x8f926d5c + .word 0xbe158500 + .word 0x50d75683 + .word 0xa9e2c612 + .word 0xc84dfe0f + .word 0xa5958b33 + .word 0x548a9062 + .word 0xb8eb3ac8 + .word 0x6b1df3c7 + .word 0xc5901c38 + .word 0x565636db + .word 0x3e613262 + .word 0x89a9f9b7 + .word 0x5497f2fe + .word 0x5be7b76c + .word 0xd7d2cca8 + .word 0xd10b3338 + +/** + * A value for q that is too close to p, but meets other requirements. 
+ * + * Python script for generating test data (using PyCryptoDome's + * Crypto.Util.number package for the primality check): +while True: + too_close = random.randrange(p - (1 << 924), p + (1 << 924)) + too_close |= 3 + if too_close < lower_bound: + continue + if math.gcd(too_close - 1, 65537) != 1: + continue + if number.isPrime(too_close): + break + * + * Hex value for reference: + * 0xd10b3338d7d2cca85be7b76c479a213a2646058cc86df4e6fb59ec553c4e93bcf9eab3ddcf6caf42e690294667a03e9bc11a94f9b78df5311f5ea7890eb161e7067d759143ff20425120197025aac542ca2cfd1dcfe3ebddeae1f19ece50583c83597856830a0827333d1b67d6d887a16c3f8fe156d119ee6a0b2ca6ba4f62fb + */ +.balign 32 +.globl too_close +too_close: + .word 0xba4f62fb + .word 0x6a0b2ca6 + .word 0x56d119ee + .word 0x6c3f8fe1 + .word 0xd6d887a1 + .word 0x333d1b67 + .word 0x830a0827 + .word 0x83597856 + .word 0xce50583c + .word 0xeae1f19e + .word 0xcfe3ebdd + .word 0xca2cfd1d + .word 0x25aac542 + .word 0x51201970 + .word 0x43ff2042 + .word 0x067d7591 + .word 0x0eb161e7 + .word 0x1f5ea789 + .word 0xb78df531 + .word 0xc11a94f9 + .word 0x67a03e9b + .word 0xe6902946 + .word 0xcf6caf42 + .word 0xf9eab3dd + .word 0x3c4e93bc + .word 0xfb59ec55 + .word 0xc86df4e6 + .word 0x2646058c + .word 0x479a213a + .word 0x5be7b76c + .word 0xd7d2cca8 + .word 0xd10b3338 + +/** + * An acceptable value for q. 
+ * + * Python script for generating q (using PyCryptoDome's Crypto.Util.number + * package for the primality check): +while True: + q = random.randrange(lower_bound, 1 << 1024) + q |= 3 + if abs(p - q) < (1 << 924): + continue + if math.gcd(q-1, 65537) != 1: + continue + if number.isPrime(q): + break + * + * Hex value for reference: + * 0xf83da3592c89b3b8972d1a8dd1de78d7b64a0b1cce4a54ca5125bfc16105ce43ebe4bc6b5e0088e37281d264d2081cf1097671eb3299e91a6c571e4b71cdd1144ca96ad7c45bd05e8e25e371ca8e2043cf73a30ba5e9c979f259bbc9476c1ab3693136e403ebe4e47542c7a6f4164d1a7e2938e65191c9aee6a3534a87c3f1ff + */ +.balign 32 +.globl good_q +good_q: + .word 0x87c3f1ff + .word 0xe6a3534a + .word 0x5191c9ae + .word 0x7e2938e6 + .word 0xf4164d1a + .word 0x7542c7a6 + .word 0x03ebe4e4 + .word 0x693136e4 + .word 0x476c1ab3 + .word 0xf259bbc9 + .word 0xa5e9c979 + .word 0xcf73a30b + .word 0xca8e2043 + .word 0x8e25e371 + .word 0xc45bd05e + .word 0x4ca96ad7 + .word 0x71cdd114 + .word 0x6c571e4b + .word 0x3299e91a + .word 0x097671eb + .word 0xd2081cf1 + .word 0x7281d264 + .word 0x5e0088e3 + .word 0xebe4bc6b + .word 0x6105ce43 + .word 0x5125bfc1 + .word 0xce4a54ca + .word 0xb64a0b1c + .word 0xd1de78d7 + .word 0x972d1a8d + .word 0x2c89b3b8 + .word 0xf83da359 diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkq_good_test.exp b/sw/otbn/crypto/tests/rsa_keygen_checkq_good_test.exp new file mode 100644 index 0000000000000..250028a7d63f7 --- /dev/null +++ b/sw/otbn/crypto/tests/rsa_keygen_checkq_good_test.exp @@ -0,0 +1,2 @@ +# Expect 2^256 - 1 (check passed). +w24 = 0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkq_good_test.s b/sw/otbn/crypto/tests/rsa_keygen_checkq_good_test.s new file mode 100644 index 0000000000000..f40b0508f5e3d --- /dev/null +++ b/sw/otbn/crypto/tests/rsa_keygen_checkq_good_test.s @@ -0,0 +1,43 @@ +/* Copyright lowRISC contributors. 
*/ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/** + * Ensure that an acceptable value for q passes RSA keygen checks. + * + * Uses the test data from `rsa_keygen_checkpq_test_data`, which is sized for + * RSA-2048. + */ + +.section .text.start + +main: + /* Init all-zero register. */ + bn.xor w31, w31, w31 + + /* Load the number of limbs for this test. */ + li x30, 4 + li x31, 3 + + /* Load required constants. */ + li x20, 20 + li x21, 21 + + /* Copy a "good" p value into `rsa_p`. */ + la x16, good_p + la x3, rsa_p + loop x30, 2 + bn.lid x20, 0(x16++) + bn.sid x20, 0(x3++) + + /* Copy the good value into `rsa_q`. */ + la x16, good_q + la x3, rsa_q + loop x30, 2 + bn.lid x20, 0(x16++) + bn.sid x20, 0(x3++) + + /* w24 <= 2^256-1 if the check passed, otherwise 0 */ + jal x1, check_q + + ecall diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkq_not_prime_test.exp b/sw/otbn/crypto/tests/rsa_keygen_checkq_not_prime_test.exp new file mode 100644 index 0000000000000..75275f176e56d --- /dev/null +++ b/sw/otbn/crypto/tests/rsa_keygen_checkq_not_prime_test.exp @@ -0,0 +1,2 @@ +# Expect 0 (check failed). +w24 = 0 diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkq_not_prime_test.s b/sw/otbn/crypto/tests/rsa_keygen_checkq_not_prime_test.s new file mode 100644 index 0000000000000..dfd18797fbeb6 --- /dev/null +++ b/sw/otbn/crypto/tests/rsa_keygen_checkq_not_prime_test.s @@ -0,0 +1,43 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/** + * Ensure that a nonprime value for q fails RSA keygen checks. + * + * Uses the test data from `rsa_keygen_checkpq_test_data`, which is sized for + * RSA-2048. + */ + +.section .text.start + +main: + /* Init all-zero register. */ + bn.xor w31, w31, w31 + + /* Load the number of limbs for this test. 
*/ + li x30, 4 + li x31, 3 + + /* Load required constants. */ + li x20, 20 + li x21, 21 + + /* Copy a "good" p value into `rsa_p`. */ + la x16, good_p + la x3, rsa_p + loop x30, 2 + bn.lid x20, 0(x16++) + bn.sid x20, 0(x3++) + + /* Copy the nonprime value into `rsa_q`. */ + la x16, not_prime + la x3, rsa_q + loop x30, 2 + bn.lid x20, 0(x16++) + bn.sid x20, 0(x3++) + + /* w24 <= 2^256-1 if the check passed, otherwise 0 */ + jal x1, check_q + + ecall diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkq_not_relprime_test.exp b/sw/otbn/crypto/tests/rsa_keygen_checkq_not_relprime_test.exp new file mode 100644 index 0000000000000..75275f176e56d --- /dev/null +++ b/sw/otbn/crypto/tests/rsa_keygen_checkq_not_relprime_test.exp @@ -0,0 +1,2 @@ +# Expect 0 (check failed). +w24 = 0 diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkq_not_relprime_test.s b/sw/otbn/crypto/tests/rsa_keygen_checkq_not_relprime_test.s new file mode 100644 index 0000000000000..5f13a25eea47a --- /dev/null +++ b/sw/otbn/crypto/tests/rsa_keygen_checkq_not_relprime_test.s @@ -0,0 +1,43 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/** + * Ensure that a multiple of F4 fails RSA keygen checks for q. + * + * Uses the test data from `rsa_keygen_checkpq_test_data`, which is sized for + * RSA-2048. + */ + +.section .text.start + +main: + /* Init all-zero register. */ + bn.xor w31, w31, w31 + + /* Load the number of limbs for this test. */ + li x30, 4 + li x31, 3 + + /* Load required constants. */ + li x20, 20 + li x21, 21 + + /* Copy a "good" p value into `rsa_p`. */ + la x16, good_p + la x3, rsa_p + loop x30, 2 + bn.lid x20, 0(x16++) + bn.sid x20, 0(x3++) + + /* Copy the bad value into `rsa_q`. 
*/ + la x16, not_relprime + la x3, rsa_q + loop x30, 2 + bn.lid x20, 0(x16++) + bn.sid x20, 0(x3++) + + /* w24 <= 2^256-1 if the check passed, otherwise 0 */ + jal x1, check_q + + ecall diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkq_too_close_test.exp b/sw/otbn/crypto/tests/rsa_keygen_checkq_too_close_test.exp new file mode 100644 index 0000000000000..75275f176e56d --- /dev/null +++ b/sw/otbn/crypto/tests/rsa_keygen_checkq_too_close_test.exp @@ -0,0 +1,2 @@ +# Expect 0 (check failed). +w24 = 0 diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkq_too_close_test.s b/sw/otbn/crypto/tests/rsa_keygen_checkq_too_close_test.s new file mode 100644 index 0000000000000..d2c3abf5d897b --- /dev/null +++ b/sw/otbn/crypto/tests/rsa_keygen_checkq_too_close_test.s @@ -0,0 +1,43 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/** + * Ensure that a value for q which is too close to p fails RSA keygen checks. + * + * Uses the test data from `rsa_keygen_checkpq_test_data`, which is sized for + * RSA-2048. + */ + +.section .text.start + +main: + /* Init all-zero register. */ + bn.xor w31, w31, w31 + + /* Load the number of limbs for this test. */ + li x30, 4 + li x31, 3 + + /* Load required constants. */ + li x20, 20 + li x21, 21 + + /* Copy a "good" p value into `rsa_p`. */ + la x16, good_p + la x3, rsa_p + loop x30, 2 + bn.lid x20, 0(x16++) + bn.sid x20, 0(x3++) + + /* Copy the too-close value into `rsa_q`. */ + la x16, too_close + la x3, rsa_q + loop x30, 2 + bn.lid x20, 0(x16++) + bn.sid x20, 0(x3++) + + /* w24 <= 2^256-1 if the check passed, otherwise 0 */ + jal x1, check_q + + ecall diff --git a/sw/otbn/crypto/tests/x25519_test.exp b/sw/otbn/crypto/tests/x25519_test.exp deleted file mode 100644 index 40dc093fc9864..0000000000000 --- a/sw/otbn/crypto/tests/x25519_test.exp +++ /dev/null @@ -1,2 +0,0 @@ -# Test failure counter in w0 is 0. 
-w0 = 0x0 diff --git a/sw/otbn/crypto/tests/x25519_test.s b/sw/otbn/crypto/tests/x25519_test.s deleted file mode 100644 index 6f5fc1e316ca7..0000000000000 --- a/sw/otbn/crypto/tests/x25519_test.s +++ /dev/null @@ -1,166 +0,0 @@ -/* Copyright lowRISC contributors. */ -/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ -/* SPDX-License-Identifier: Apache-2.0 */ - -/** - * Standalone tests for X25519. - * - * This test will exit with the number of failures written to the w0 register; - * w0=0 means all tests succeeded. - */ - -.section .text.start - -main: - /* Initialize failure counter to 0. - w0 <= 0 */ - bn.xor w0, w0, w0 - - /* Run tests. */ - jal x1, run_test1 - jal x1, run_test2 - - ecall - -run_test1: - /* w8 <= dmem[test1_k] = enc(k) */ - li x2, 8 - la x3, test1_k - bn.lid x2, 0(x3) - - /* w9 <= dmem[test1_u] = enc(u) */ - li x2, 9 - la x3, test1_u - bn.lid x2, 0(x3) - - /* w22 <= X25519(k, u) */ - jal x1, X25519 - - /* w25 <= dmem[test1_exp_result] */ - li x2, 25 - la x3, test1_exp_result - bn.lid x2, 0(x3) - - jal x1, check_result - - ret - -run_test2: - /* w8 <= dmem[test2_k] = enc(k) */ - li x2, 8 - la x3, test2_k - bn.lid x2, 0(x3) - - /* w9 <= dmem[test2_u] = enc(u) */ - li x2, 9 - la x3, test2_u - bn.lid x2, 0(x3) - - /* w22 <= X25519(k, u) */ - jal x1, X25519 - - /* w25 <= dmem[test2_exp_result] */ - li x2, 25 - la x3, test2_exp_result - bn.lid x2, 0(x3) - - jal x1, check_result - - ret - -/** - * Increment the failure counter if expected/actual results don't match. - * - * @param[in] w25: expected result - * @param[in] w22: actual result - * @param[in,out] w0: error count - * - * clobbered registers: w0, w1 - * clobbered flag groups: FG0 - */ -check_result: - /* Increment error register if expected < actual. */ - bn.addi w1, w0, 1 - bn.cmp w22, w25 - bn.sel w0, w1, w0, C - - /* Increment error register if actual < expected. 
*/ - bn.addi w1, w0, 1 - bn.cmp w25, w22 - bn.sel w0, w1, w0, C - ret - -.data - -/* Test vector 1 from RFC 7748, section 5.2: - https://datatracker.ietf.org/doc/html/rfc7748#section-5.2 */ - -.balign 32 -test1_k: - .word 0x6be346a5 - .word 0x9d7c52f0 - .word 0x4b15163b - .word 0xdd5e4682 - .word 0x0a4c1462 - .word 0x185afcc1 - .word 0x44226a50 - .word 0xc49a44ba - -.balign 32 -test1_u: - .word 0x6768dbe6 - .word 0xdb303058 - .word 0xa4c19435 - .word 0x7c5fb124 - .word 0xec246672 - .word 0x3b35b326 - .word 0xa603a910 - .word 0x4c1cabd0 - -.balign 32 -test1_exp_result: - .word 0x3755dac3 - .word 0x90c6e99d - .word 0x4dea948e - .word 0x4f088df2 - .word 0x03cfec32 - .word 0xf7711c49 - .word 0x5507b454 - .word 0x5285a277 - - -/* Test vector 2 from RFC 7748, section 5.2: - https://datatracker.ietf.org/doc/html/rfc7748#section-5.2 */ - -.balign 32 -test2_k: - .word 0xd4e9664b - .word 0x3c67b4d1 - .word 0x9126d25a - .word 0xf56a7d95 - .word 0x21641bc1 - .word 0xd401eae0 - .word 0x9e16a42c - .word 0x0dba1879 - -.balign 32 -test2_u: - .word 0x120f21e5 - .word 0xd3116878 - .word 0x9d95b7f4 - .word 0x2cae3805 - .word 0x10e7db31 - .word 0x3e3cc06f - .word 0x49d54cfc - .word 0x93a415c7 - -.balign 32 -test2_exp_result: - .word 0x94decb95 - .word 0x7d90e876 - .word 0x5ce4ad7a - .word 0xf873b8b4 - .word 0x685a598b - .word 0x52a19f79 - .word 0x64f7f8e6 - .word 0x5779ac7a diff --git a/sw/otbn/crypto/tests/x25519_test1.exp b/sw/otbn/crypto/tests/x25519_test1.exp new file mode 100644 index 0000000000000..2ac426a6a0ecc --- /dev/null +++ b/sw/otbn/crypto/tests/x25519_test1.exp @@ -0,0 +1,2 @@ +# Expected result from the RFC. +w22 = 0x5285a2775507b454f7711c4903cfec324f088df24dea948e90c6e99d3755dac3 diff --git a/sw/otbn/crypto/tests/x25519_test1.s b/sw/otbn/crypto/tests/x25519_test1.s new file mode 100644 index 0000000000000..169f664b10fef --- /dev/null +++ b/sw/otbn/crypto/tests/x25519_test1.s @@ -0,0 +1,52 @@ +/* Copyright lowRISC contributors. 
*/ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/** + * Standalone test for X25519. + * + * Runs test vector 1 from RFC 7748, section 5.2: + * https://datatracker.ietf.org/doc/html/rfc7748#section-5.2 + */ + +.section .text.start + +main: + /* w8 <= dmem[k] = enc(k) */ + li x2, 8 + la x3, k + bn.lid x2, 0(x3) + + /* w9 <= dmem[u] = enc(u) */ + li x2, 9 + la x3, u + bn.lid x2, 0(x3) + + /* w22 <= X25519(k, u) */ + jal x1, X25519 + + ecall + +.data + +.balign 32 +k: + .word 0x6be346a5 + .word 0x9d7c52f0 + .word 0x4b15163b + .word 0xdd5e4682 + .word 0x0a4c1462 + .word 0x185afcc1 + .word 0x44226a50 + .word 0xc49a44ba + +.balign 32 +u: + .word 0x6768dbe6 + .word 0xdb303058 + .word 0xa4c19435 + .word 0x7c5fb124 + .word 0xec246672 + .word 0x3b35b326 + .word 0xa603a910 + .word 0x4c1cabd0 diff --git a/sw/otbn/crypto/tests/x25519_test2.exp b/sw/otbn/crypto/tests/x25519_test2.exp new file mode 100644 index 0000000000000..0ec3576a4eb09 --- /dev/null +++ b/sw/otbn/crypto/tests/x25519_test2.exp @@ -0,0 +1,2 @@ +# Expected result from the RFC. +w22 = 0x5779ac7a64f7f8e652a19f79685a598bf873b8b45ce4ad7a7d90e87694decb95 diff --git a/sw/otbn/crypto/tests/x25519_test2.s b/sw/otbn/crypto/tests/x25519_test2.s new file mode 100644 index 0000000000000..a0947ed22e93b --- /dev/null +++ b/sw/otbn/crypto/tests/x25519_test2.s @@ -0,0 +1,52 @@ +/* Copyright lowRISC contributors. */ +/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */ +/* SPDX-License-Identifier: Apache-2.0 */ + +/** + * Standalone test for X25519. 
+ * + * Runs test vector 2 from RFC 7748, section 5.2: + * https://datatracker.ietf.org/doc/html/rfc7748#section-5.2 + */ + +.section .text.start + +main: + /* w8 <= dmem[k] = enc(k) */ + li x2, 8 + la x3, k + bn.lid x2, 0(x3) + + /* w9 <= dmem[u] = enc(u) */ + li x2, 9 + la x3, u + bn.lid x2, 0(x3) + + /* w22 <= X25519(k, u) */ + jal x1, X25519 + + ecall + +.data + +.balign 32 +k: + .word 0xd4e9664b + .word 0x3c67b4d1 + .word 0x9126d25a + .word 0xf56a7d95 + .word 0x21641bc1 + .word 0xd401eae0 + .word 0x9e16a42c + .word 0x0dba1879 + +.balign 32 +u: + .word 0x120f21e5 + .word 0xd3116878 + .word 0x9d95b7f4 + .word 0x2cae3805 + .word 0x10e7db31 + .word 0x3e3cc06f + .word 0x49d54cfc + .word 0x93a415c7 diff --git a/sw/otbn/crypto/x25519.s b/sw/otbn/crypto/x25519.s index facf6dfc6be5f..abc2c53e26a3a 100644 --- a/sw/otbn/crypto/x25519.s +++ b/sw/otbn/crypto/x25519.s @@ -38,7 +38,7 @@ X25519: li x2, 2 la x3, modulus25519 bn.lid x2, 0(x3) - bn.wsrw 0x0, w2 + bn.wsrw MOD, w2 /* Decode scalar. From RFC 7748, section 5: diff --git a/util/topgen/templates/chiplevel.sv.tpl b/util/topgen/templates/chiplevel.sv.tpl index 8cd816a60e0bd..ca4bb3f8fae8f 100644 --- a/util/topgen/templates/chiplevel.sv.tpl +++ b/util/topgen/templates/chiplevel.sv.tpl @@ -1690,20 +1690,21 @@ module chip_${top["name"]}_${target["name"]} #( // Capture trigger. // We use the clkmgr_aon_idle signal of the IP of interest to form a precise capture trigger. - // GPIO[11:9] is used for selecting the IP of interest. The encoding is as follows (see + // GPIO[11:10] is used for selecting the IP of interest. The encoding is as follows (see // hint_names_e enum in clkmgr_pkg.sv for details). 
// - // IP - GPIO[11:9] - Index for clkmgr_aon_idle - // ------------------------------------------------------------ - // AES - 000 - 0 - // HMAC - 001 - 1 - not implemented on CW305 - // KMAC - 010 - 2 - not implemented on CW305 - // OTBN (IO_DIV4) - 011 - 3 - not implemented on CW305 - // OTBN - 100 - 4 - not implemented on CW305 + // IP - GPIO[11:10] - Index for clkmgr_aon_idle + // ------------------------------------------------------------- + // AES - 00 - 0 + // HMAC - 01 - 1 - not implemented on CW305 + // KMAC - 10 - 2 - not implemented on CW305 + // OTBN - 11 - 3 - not implemented on CW305 // - // In addition, GPIO8 is used for gating the capture trigger in software. - // Note that GPIO[11:8] are connected to LED[3:0] on the CW310. - // On the CW305, GPIO[9,8] are connected to LED[5,7]. + // GPIO9 is used for gating the selected capture trigger in software. Alternatively, GPIO8 + // can be used to implement a less precise but fully software-controlled capture trigger + // similar to what can be done on ASIC. + // + // Note that on the CW305, GPIO[9,8] are connected to LED[5(Green),7(Red)]. 
prim_mubi_pkg::mubi4_t clk_trans_idle, manual_in_io_clk_idle; @@ -1713,14 +1714,14 @@ module chip_${top["name"]}_${target["name"]} #( clkmgr_pkg::hint_names_e trigger_sel; always_comb begin : trigger_sel_mux % if top["name"] == "darjeeling": - unique case ({dio_out[DioGpioGpio11], dio_out[DioGpioGpio10], dio_out[DioGpioGpio9]}) + unique case ({dio_out[DioGpioGpio11], dio_out[DioGpioGpio10]}) % else: - unique case ({mio_out[MioOutGpioGpio11], mio_out[MioOutGpioGpio10], mio_out[MioOutGpioGpio9]}) + unique case ({mio_out[MioOutGpioGpio11], mio_out[MioOutGpioGpio10]}) % endif - 3'b000: trigger_sel = clkmgr_pkg::HintMainAes; - 3'b001: trigger_sel = clkmgr_pkg::HintMainHmac; - 3'b010: trigger_sel = clkmgr_pkg::HintMainKmac; - 3'b100: trigger_sel = clkmgr_pkg::HintMainOtbn; + 2'b00: trigger_sel = clkmgr_pkg::HintMainAes; + 2'b01: trigger_sel = clkmgr_pkg::HintMainHmac; + 2'b10: trigger_sel = clkmgr_pkg::HintMainKmac; + 2'b11: trigger_sel = clkmgr_pkg::HintMainOtbn; default: trigger_sel = clkmgr_pkg::HintMainAes; endcase; end @@ -1730,28 +1731,50 @@ module chip_${top["name"]}_${target["name"]} #( logic clk_io_div4_trigger_en, manual_in_io_clk_trigger_en; logic clk_io_div4_trigger_oe, manual_in_io_clk_trigger_oe; % if top["name"] == "darjeeling": - assign clk_io_div4_trigger_en = dio_out[DioGpioGpio8]; - assign clk_io_div4_trigger_oe = dio_oe[DioGpioGpio8]; + logic clk_io_div4_trigger_hw_en, manual_in_io_clk_trigger_hw_en; + logic clk_io_div4_trigger_hw_oe, manual_in_io_clk_trigger_hw_oe; + logic clk_io_div4_trigger_sw_en, manual_in_io_clk_trigger_sw_en; + logic clk_io_div4_trigger_sw_oe, manual_in_io_clk_trigger_sw_oe; + assign clk_io_div4_trigger_hw_en = dio_out[DioGpioGpio9]; + assign clk_io_div4_trigger_hw_oe = dio_oe[DioGpioGpio9]; + assign clk_io_div4_trigger_sw_en = dio_out[DioGpioGpio8]; + assign clk_io_div4_trigger_sw_oe = dio_oe[DioGpioGpio8]; % else: - assign clk_io_div4_trigger_en = mio_out[MioOutGpioGpio8]; - assign clk_io_div4_trigger_oe = 
mio_oe[MioOutGpioGpio8]; + logic clk_io_div4_trigger_hw_en, manual_in_io_clk_trigger_hw_en; + logic clk_io_div4_trigger_hw_oe, manual_in_io_clk_trigger_hw_oe; + logic clk_io_div4_trigger_sw_en, manual_in_io_clk_trigger_sw_en; + logic clk_io_div4_trigger_sw_oe, manual_in_io_clk_trigger_sw_oe; + assign clk_io_div4_trigger_hw_en = mio_out[MioOutGpioGpio9]; + assign clk_io_div4_trigger_hw_oe = mio_oe[MioOutGpioGpio9]; + assign clk_io_div4_trigger_sw_en = mio_out[MioOutGpioGpio8]; + assign clk_io_div4_trigger_sw_oe = mio_oe[MioOutGpioGpio8]; % endif // Synchronize signals to manual_in_io_clk. prim_flop_2sync #( - .Width ($bits(clk_trans_idle) + 2) + .Width ($bits(clk_trans_idle) + 4) ) u_sync_trigger ( .clk_i (manual_in_io_clk), .rst_ni(manual_in_por_n), - .d_i ({clk_trans_idle, clk_io_div4_trigger_en, clk_io_div4_trigger_oe}), - .q_o ({manual_in_io_clk_idle, manual_in_io_clk_trigger_en, manual_in_io_clk_trigger_oe}) + .d_i ({clk_trans_idle, + clk_io_div4_trigger_hw_en, + clk_io_div4_trigger_hw_oe, + clk_io_div4_trigger_sw_en, + clk_io_div4_trigger_sw_oe}), + .q_o ({manual_in_io_clk_idle, + manual_in_io_clk_trigger_hw_en, + manual_in_io_clk_trigger_hw_oe, + manual_in_io_clk_trigger_sw_en, + manual_in_io_clk_trigger_sw_oe}) ); - // Generate the actual trigger signal. + // Generate the actual trigger signal as trigger_sw OR trigger_hw. assign manual_attr_io_trigger = '0; - assign manual_oe_io_trigger = manual_in_io_clk_trigger_oe; - assign manual_out_io_trigger = manual_in_io_clk_trigger_en & - prim_mubi_pkg::mubi4_test_false_strict(manual_in_io_clk_idle); + assign manual_oe_io_trigger = + manual_in_io_clk_trigger_sw_oe | manual_in_io_clk_trigger_hw_oe; + assign manual_out_io_trigger = + manual_in_io_clk_trigger_sw_en | (manual_in_io_clk_trigger_hw_en & + prim_mubi_pkg::mubi4_test_false_strict(manual_in_io_clk_idle)); % endif ## This separate UART debugging output is needed for the CW305 only. % if target["name"] == "cw305":