diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 0e9f20ea6e6cd..803e1ac68354e 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -394,6 +394,29 @@ jobs:
       make -C hw/ip/otbn/util asm-check
     displayName: Assemble & link code snippets
 
+- job: otbn_crypto_tests
+  displayName: Run OTBN crypto tests
+  dependsOn: lint
+  condition: and(succeeded(), eq(dependencies.lint.outputs['DetermineBuildType.onlyCdcChanges'], '0'))
+  pool:
+    vmImage: ubuntu-20.04
+  timeoutInMinutes: 60
+  steps:
+  - template: ci/checkout-template.yml
+  - template: ci/install-package-dependencies.yml
+  - task: DownloadSecureFile@1
+    condition: eq(variables['Build.SourceBranchName'], 'master')
+    name: bazelCacheGcpKey
+    inputs:
+      secureFile: "bazel_cache_gcp_key.json"
+  # Publish the downloaded GCP key path as the bazelCacheGcpKeyPath variable (master only)
+  - bash: echo "##vso[task.setvariable variable=bazelCacheGcpKeyPath]$(bazelCacheGcpKey.secureFilePath)"
+    condition: eq(variables['Build.SourceBranchName'], 'master')
+    displayName: GCP key path
+  - bash: |
+      ci/bazelisk.sh test --test_tag_filters=-nightly //sw/otbn/crypto/...
+    displayName: Execute tests
+
 - job: chip_darjeeling_cw310
   displayName: CW310's Darjeeling Bitstream
   # Build CW310 variant of the Darjeeling toplevel design using Vivado
diff --git a/ci/azure-pipelines-nightly.yml b/ci/azure-pipelines-nightly.yml
index aafcde27f73e9..14224d13a428c 100644
--- a/ci/azure-pipelines-nightly.yml
+++ b/ci/azure-pipelines-nightly.yml
@@ -70,6 +70,29 @@ jobs:
         //sw/device/silicon_creator/rom/e2e/...
     displayName: "Run all ROM E2E tests"
 
+- job: slow_otbn_crypto_tests
+  displayName: Run slow OTBN crypto tests
+  dependsOn: lint  # NOTE(review): confirm this nightly pipeline defines a 'lint' job
+  condition: and(succeeded(), eq(dependencies.lint.outputs['DetermineBuildType.onlyCdcChanges'], '0'))
+  pool:
+    vmImage: ubuntu-20.04
+  timeoutInMinutes: 120
+  steps:
+  - template: /ci/checkout-template.yml
+  - template: /ci/install-package-dependencies.yml
+  - task: DownloadSecureFile@1
+    condition: eq(variables['Build.SourceBranchName'], 'master')
+    name: bazelCacheGcpKey
+    inputs:
+      secureFile: "bazel_cache_gcp_key.json"
+  # Publish the downloaded GCP key path as the bazelCacheGcpKeyPath variable (master only)
+  - bash: echo "##vso[task.setvariable variable=bazelCacheGcpKeyPath]$(bazelCacheGcpKey.secureFilePath)"
+    condition: eq(variables['Build.SourceBranchName'], 'master')
+    displayName: GCP key path
+  - bash: |
+      ci/bazelisk.sh test --test_tag_filters=nightly //sw/otbn/crypto/...
+    displayName: Execute tests
+
 - job: bob_spi_i2c
   displayName: "BoB: SPI and I2C Tests"
   timeoutInMinutes: 30
diff --git a/hw/ip/otbn/dv/smoke/smoke_test.s b/hw/ip/otbn/dv/smoke/smoke_test.s
index 0fd354ecd5cad..2e80a021d2fcb 100644
--- a/hw/ip/otbn/dv/smoke/smoke_test.s
+++ b/hw/ip/otbn/dv/smoke/smoke_test.s
@@ -98,24 +98,24 @@ test_label_3:
 # use mod WSR to load bignum registers with base li psuedo-instruction
 # mod = 0x78fccc06_2228e9d6_89c9b54f_887cf14e_c79af825_69be586e_9866bb3b_53769ada
 li x23, 0x78fccc06
-csrrw x0, 0x7d7, x23
+csrrw x0, mod7, x23
 li x23, 0x2228e9d6
-csrrw x0, 0x7d6, x23
+csrrw x0, mod6, x23
 li x23, 0x89c9b54f
-csrrw x0, 0x7d5, x23
+csrrw x0, mod5, x23
 li x23, 0x887cf14e
-csrrw x0, 0x7d4, x23
+csrrw x0, mod4, x23
 li x23, 0xc79af825
-csrrw x0, 0x7d3, x23
+csrrw x0, mod3, x23
 li x23, 0x69be586e
-csrrw x0, 0x7d2, x23
+csrrw x0, mod2, x23
 li x23, 0x9866bb3b
-csrrw x0, 0x7d1, x23
+csrrw x0, mod1, x23
 li x23, 0x53769ada
-csrrw x0, 0x7d0, x23
+csrrw x0, mod0, x23
 
 # x22 = 0x89c9b54f
-csrrs x23, 0x7d5, x0
+csrrs x23, mod5, x0
 
 # Note that some instructions used the fixed inputs (from w1 and w2) others use
 # results from previous instructions. When debugging an failure it is recommened
@@ -126,7 +126,7 @@ csrrs x23, 0x7d5, x0
 bn.wsrr w1, 0x0 /* MOD */
 
 # Request an RND value with a write to CSR RND_PREFETCH
-csrrw x0, 0x7d8, x0
+csrrw x0, rnd_prefetch, x0
 
 # sim environment provides a fixed value for RND (in other environment RND isn't
 # fixed so this test will have a different final state)
@@ -185,7 +185,7 @@ bn.addc w15, w10, w11, FG0
 bn.subb w17, w3, w4, FG1
 
 # x24 = {fg1, fg0} = 0x52
-csrrs x24, 0x7c8, x0
+csrrs x24, flags, x0
 
 # w18 = w1 + (w2 << 136) = 0x23a7769f_bbc28381_34745fe9_22168a4e_c79af825_69be586e_9866bb3b_53769ada
 bn.add w18, w1, w2 << 136
diff --git a/hw/top_darjeeling/rtl/autogen/chip_darjeeling_cw310.sv b/hw/top_darjeeling/rtl/autogen/chip_darjeeling_cw310.sv
index 9b28f0721dd11..fbf2308ac094b 100644
--- a/hw/top_darjeeling/rtl/autogen/chip_darjeeling_cw310.sv
+++ b/hw/top_darjeeling/rtl/autogen/chip_darjeeling_cw310.sv
@@ -1499,30 +1499,31 @@ module chip_darjeeling_cw310 #(
 
   // Capture trigger.
   // We use the clkmgr_aon_idle signal of the IP of interest to form a precise capture trigger.
-  // GPIO[11:9] is used for selecting the IP of interest. The encoding is as follows (see
+  // GPIO[11:10] is used for selecting the IP of interest. The encoding is as follows (see
   // hint_names_e enum in clkmgr_pkg.sv for details).
   //
-  // IP              - GPIO[11:9] - Index for clkmgr_aon_idle
-  // ------------------------------------------------------------
-  //  AES            -   000      -  0
-  //  HMAC           -   001      -  1 - not implemented on CW305
-  //  KMAC           -   010      -  2 - not implemented on CW305
-  //  OTBN (IO_DIV4) -   011      -  3 - not implemented on CW305
-  //  OTBN           -   100      -  4 - not implemented on CW305
+  // IP              - GPIO[11:10] - Index for clkmgr_aon_idle
+  // -------------------------------------------------------------
+  //  AES            -   00       -  0
+  //  HMAC           -   01       -  1 - not implemented on CW305
+  //  KMAC           -   10       -  2 - not implemented on CW305
+  //  OTBN           -   11       -  3 - not implemented on CW305
   //
-  // In addition, GPIO8 is used for gating the capture trigger in software.
-  // Note that GPIO[11:8] are connected to LED[3:0] on the CW310.
-  // On the CW305, GPIO[9,8] are connected to LED[5,7].
+  // GPIO9 is used for gating the selected capture trigger in software. Alternatively, GPIO8
+  // can be used to implement a less precise but fully software-controlled capture trigger
+  // similar to what can be done on ASIC.
+  //
+  // Note that on the CW305, GPIO[9,8] are connected to LED[5(Green),7(Red)].
 
   prim_mubi_pkg::mubi4_t clk_trans_idle, manual_in_io_clk_idle;
 
   clkmgr_pkg::hint_names_e trigger_sel;
   always_comb begin : trigger_sel_mux
-    unique case ({dio_out[DioGpioGpio11], dio_out[DioGpioGpio10], dio_out[DioGpioGpio9]})
-      3'b000:  trigger_sel = clkmgr_pkg::HintMainAes;
-      3'b001:  trigger_sel = clkmgr_pkg::HintMainHmac;
-      3'b010:  trigger_sel = clkmgr_pkg::HintMainKmac;
-      3'b100:  trigger_sel = clkmgr_pkg::HintMainOtbn;
+    unique case ({dio_out[DioGpioGpio11], dio_out[DioGpioGpio10]})
+      2'b00:   trigger_sel = clkmgr_pkg::HintMainAes;
+      2'b01:   trigger_sel = clkmgr_pkg::HintMainHmac;
+      2'b10:   trigger_sel = clkmgr_pkg::HintMainKmac;
+      2'b11:   trigger_sel = clkmgr_pkg::HintMainOtbn;
       default: trigger_sel = clkmgr_pkg::HintMainAes;
     endcase;
   end
@@ -1530,23 +1531,39 @@ module chip_darjeeling_cw310 #(
 
   logic clk_io_div4_trigger_en, manual_in_io_clk_trigger_en;
   logic clk_io_div4_trigger_oe, manual_in_io_clk_trigger_oe;
-  assign clk_io_div4_trigger_en = dio_out[DioGpioGpio8];
-  assign clk_io_div4_trigger_oe = dio_oe[DioGpioGpio8];
+  logic clk_io_div4_trigger_hw_en, manual_in_io_clk_trigger_hw_en;
+  logic clk_io_div4_trigger_hw_oe, manual_in_io_clk_trigger_hw_oe;
+  logic clk_io_div4_trigger_sw_en, manual_in_io_clk_trigger_sw_en;
+  logic clk_io_div4_trigger_sw_oe, manual_in_io_clk_trigger_sw_oe;
+  assign clk_io_div4_trigger_hw_en = dio_out[DioGpioGpio9];
+  assign clk_io_div4_trigger_hw_oe = dio_oe[DioGpioGpio9];
+  assign clk_io_div4_trigger_sw_en = dio_out[DioGpioGpio8];
+  assign clk_io_div4_trigger_sw_oe = dio_oe[DioGpioGpio8];
 
   // Synchronize signals to manual_in_io_clk.
   prim_flop_2sync #(
-    .Width ($bits(clk_trans_idle) + 2)
+    .Width ($bits(clk_trans_idle) + 4)
   ) u_sync_trigger (
     .clk_i (manual_in_io_clk),
     .rst_ni(manual_in_por_n),
-    .d_i   ({clk_trans_idle,        clk_io_div4_trigger_en,      clk_io_div4_trigger_oe}),
-    .q_o   ({manual_in_io_clk_idle, manual_in_io_clk_trigger_en, manual_in_io_clk_trigger_oe})
+    .d_i   ({clk_trans_idle,
+             clk_io_div4_trigger_hw_en,
+             clk_io_div4_trigger_hw_oe,
+             clk_io_div4_trigger_sw_en,
+             clk_io_div4_trigger_sw_oe}),
+    .q_o   ({manual_in_io_clk_idle,
+             manual_in_io_clk_trigger_hw_en,
+             manual_in_io_clk_trigger_hw_oe,
+             manual_in_io_clk_trigger_sw_en,
+             manual_in_io_clk_trigger_sw_oe})
   );
 
-  // Generate the actual trigger signal.
+  // Generate the actual trigger signal as trigger_sw OR trigger_hw.
   assign manual_attr_io_trigger = '0;
-  assign manual_oe_io_trigger  = manual_in_io_clk_trigger_oe;
-  assign manual_out_io_trigger = manual_in_io_clk_trigger_en &
-      prim_mubi_pkg::mubi4_test_false_strict(manual_in_io_clk_idle);
+  assign manual_oe_io_trigger  =
+      manual_in_io_clk_trigger_sw_oe | manual_in_io_clk_trigger_hw_oe;
+  assign manual_out_io_trigger =
+      manual_in_io_clk_trigger_sw_en | (manual_in_io_clk_trigger_hw_en &
+          prim_mubi_pkg::mubi4_test_false_strict(manual_in_io_clk_idle));
 
 endmodule : chip_darjeeling_cw310
diff --git a/hw/top_earlgrey/rtl/autogen/chip_earlgrey_cw310.sv b/hw/top_earlgrey/rtl/autogen/chip_earlgrey_cw310.sv
index edfad81678840..84a463c23cf8b 100644
--- a/hw/top_earlgrey/rtl/autogen/chip_earlgrey_cw310.sv
+++ b/hw/top_earlgrey/rtl/autogen/chip_earlgrey_cw310.sv
@@ -1109,30 +1109,31 @@ module chip_earlgrey_cw310 #(
 
   // Capture trigger.
   // We use the clkmgr_aon_idle signal of the IP of interest to form a precise capture trigger.
-  // GPIO[11:9] is used for selecting the IP of interest. The encoding is as follows (see
+  // GPIO[11:10] is used for selecting the IP of interest. The encoding is as follows (see
   // hint_names_e enum in clkmgr_pkg.sv for details).
   //
-  // IP              - GPIO[11:9] - Index for clkmgr_aon_idle
-  // ------------------------------------------------------------
-  //  AES            -   000      -  0
-  //  HMAC           -   001      -  1 - not implemented on CW305
-  //  KMAC           -   010      -  2 - not implemented on CW305
-  //  OTBN (IO_DIV4) -   011      -  3 - not implemented on CW305
-  //  OTBN           -   100      -  4 - not implemented on CW305
+  // IP              - GPIO[11:10] - Index for clkmgr_aon_idle
+  // -------------------------------------------------------------
+  //  AES            -   00       -  0
+  //  HMAC           -   01       -  1 - not implemented on CW305
+  //  KMAC           -   10       -  2 - not implemented on CW305
+  //  OTBN           -   11       -  3 - not implemented on CW305
   //
-  // In addition, GPIO8 is used for gating the capture trigger in software.
-  // Note that GPIO[11:8] are connected to LED[3:0] on the CW310.
-  // On the CW305, GPIO[9,8] are connected to LED[5,7].
+  // GPIO9 is used for gating the selected capture trigger in software. Alternatively, GPIO8
+  // can be used to implement a less precise but fully software-controlled capture trigger
+  // similar to what can be done on ASIC.
+  //
+  // Note that on the CW305, GPIO[9,8] are connected to LED[5(Green),7(Red)].
 
   prim_mubi_pkg::mubi4_t clk_trans_idle, manual_in_io_clk_idle;
 
   clkmgr_pkg::hint_names_e trigger_sel;
   always_comb begin : trigger_sel_mux
-    unique case ({mio_out[MioOutGpioGpio11], mio_out[MioOutGpioGpio10], mio_out[MioOutGpioGpio9]})
-      3'b000:  trigger_sel = clkmgr_pkg::HintMainAes;
-      3'b001:  trigger_sel = clkmgr_pkg::HintMainHmac;
-      3'b010:  trigger_sel = clkmgr_pkg::HintMainKmac;
-      3'b100:  trigger_sel = clkmgr_pkg::HintMainOtbn;
+    unique case ({mio_out[MioOutGpioGpio11], mio_out[MioOutGpioGpio10]})
+      2'b00:   trigger_sel = clkmgr_pkg::HintMainAes;
+      2'b01:   trigger_sel = clkmgr_pkg::HintMainHmac;
+      2'b10:   trigger_sel = clkmgr_pkg::HintMainKmac;
+      2'b11:   trigger_sel = clkmgr_pkg::HintMainOtbn;
       default: trigger_sel = clkmgr_pkg::HintMainAes;
     endcase;
   end
@@ -1140,23 +1141,39 @@ module chip_earlgrey_cw310 #(
 
   logic clk_io_div4_trigger_en, manual_in_io_clk_trigger_en;
   logic clk_io_div4_trigger_oe, manual_in_io_clk_trigger_oe;
-  assign clk_io_div4_trigger_en = mio_out[MioOutGpioGpio8];
-  assign clk_io_div4_trigger_oe = mio_oe[MioOutGpioGpio8];
+  logic clk_io_div4_trigger_hw_en, manual_in_io_clk_trigger_hw_en;
+  logic clk_io_div4_trigger_hw_oe, manual_in_io_clk_trigger_hw_oe;
+  logic clk_io_div4_trigger_sw_en, manual_in_io_clk_trigger_sw_en;
+  logic clk_io_div4_trigger_sw_oe, manual_in_io_clk_trigger_sw_oe;
+  assign clk_io_div4_trigger_hw_en = mio_out[MioOutGpioGpio9];
+  assign clk_io_div4_trigger_hw_oe = mio_oe[MioOutGpioGpio9];
+  assign clk_io_div4_trigger_sw_en = mio_out[MioOutGpioGpio8];
+  assign clk_io_div4_trigger_sw_oe = mio_oe[MioOutGpioGpio8];
 
   // Synchronize signals to manual_in_io_clk.
   prim_flop_2sync #(
-    .Width ($bits(clk_trans_idle) + 2)
+    .Width ($bits(clk_trans_idle) + 4)
   ) u_sync_trigger (
     .clk_i (manual_in_io_clk),
     .rst_ni(manual_in_por_n),
-    .d_i   ({clk_trans_idle,        clk_io_div4_trigger_en,      clk_io_div4_trigger_oe}),
-    .q_o   ({manual_in_io_clk_idle, manual_in_io_clk_trigger_en, manual_in_io_clk_trigger_oe})
+    .d_i   ({clk_trans_idle,
+             clk_io_div4_trigger_hw_en,
+             clk_io_div4_trigger_hw_oe,
+             clk_io_div4_trigger_sw_en,
+             clk_io_div4_trigger_sw_oe}),
+    .q_o   ({manual_in_io_clk_idle,
+             manual_in_io_clk_trigger_hw_en,
+             manual_in_io_clk_trigger_hw_oe,
+             manual_in_io_clk_trigger_sw_en,
+             manual_in_io_clk_trigger_sw_oe})
   );
 
-  // Generate the actual trigger signal.
+  // Generate the actual trigger signal as trigger_sw OR trigger_hw.
   assign manual_attr_io_trigger = '0;
-  assign manual_oe_io_trigger  = manual_in_io_clk_trigger_oe;
-  assign manual_out_io_trigger = manual_in_io_clk_trigger_en &
-      prim_mubi_pkg::mubi4_test_false_strict(manual_in_io_clk_idle);
+  assign manual_oe_io_trigger  =
+      manual_in_io_clk_trigger_sw_oe | manual_in_io_clk_trigger_hw_oe;
+  assign manual_out_io_trigger =
+      manual_in_io_clk_trigger_sw_en | (manual_in_io_clk_trigger_hw_en &
+          prim_mubi_pkg::mubi4_test_false_strict(manual_in_io_clk_idle));
 
 endmodule : chip_earlgrey_cw310
diff --git a/sw/device/sca/BUILD b/sw/device/sca/BUILD
index 2b3d7bcb45a63..3104aa0c539cc 100644
--- a/sw/device/sca/BUILD
+++ b/sw/device/sca/BUILD
@@ -23,6 +23,7 @@ opentitan_flash_binary(
         "//sw/device/lib/testing/test_framework:check",
         "//sw/device/lib/testing/test_framework:ottf_ld_silicon_creator_slot_a",
         "//sw/device/lib/testing/test_framework:ottf_main",
+        "//sw/device/sca/lib:aes",
         "//sw/device/sca/lib:prng",
         "//sw/device/sca/lib:sca",
         "//sw/device/sca/lib:simple_serial",
diff --git a/sw/device/sca/aes_serial.c b/sw/device/sca/aes_serial.c
index e4fc551a99a5e..1bfd13ed8dadb 100644
--- a/sw/device/sca/aes_serial.c
+++ b/sw/device/sca/aes_serial.c
@@ -5,6 +5,7 @@
 #include "sw/device/lib/testing/test_framework/check.h"
 #include "sw/device/lib/testing/test_framework/ottf_main.h"
 #include "sw/device/lib/testing/test_framework/ottf_test_config.h"
+#include "sw/device/sca/lib/aes.h"
 #include "sw/device/sca/lib/prng.h"
 #include "sw/device/sca/lib/sca.h"
 #include "sw/device/sca/lib/simple_serial.h"
@@ -26,12 +27,19 @@
  *   - Version ('v')+,
  *   - Seed PRNG ('s')+,
  *   - Batch encrypt ('b')*,
- *   - FvsR batch fixed key set ('t')*,
+ *   - FvsR batch fixed key set ('f')*,
  *   - FvsR batch generate ('g')*,
- *   - FvsR batch encrypt and generate ('f')*,
+ *   - FvsR batch encrypt and generate ('e')*,
+ *   - Batch encrypt alternative routine ('a')*,
+ *   - Batch encrypt alternative routine, initial plaintext input ('i')*,
+ *   - Set default values for AES-based data generation ('d')*,
  * Commands marked with * are implemented in this file. Those marked with + are
  * implemented in the simple serial library. Encryption is done in AES-ECB-128
  * mode. See https://wiki.newae.com/SimpleSerial for details on the protocol.
+ *
+ * Data for running batch capture is generated according to:
+ * [DTR] Test Vector Leakage Assessment (TVLA) Derived Test Requirements (DTR)
+ * with AES
  */
 
 OTTF_DEFINE_TEST_CONFIG();
@@ -45,17 +53,20 @@ enum {
    * noise during AES operations. Caution: This number should be chosen to
    * provide enough time. Otherwise, Ibex might wake up while AES is still busy
    * and disturb the capture. Currently, we use a start trigger delay of 320
-   * clock cycles and the scope captures 60 clock cycles at kClockFreqCpuHz
-   * (1200 samples).
+   * clock cycles and the scope captures 60 clock cycles at kClockFreqCpuHz.
    */
   kIbexAesSleepCycles = 680,
   /**
-   * Max number of encryption that can be captured with the scope
-   * 81 is selected for AES with CW Husky
-   * Note: Maybe it would be better if we use dynamic memory allocation but I
-   * am not sure whether we are supporting it or not.
+   * The maximum number of encryptions to do per batch. The ChipWhisperer Husky
+   * scope determines how many encryptions (capture segments) it wants to record
+   * per batch based on the number of samples per segment. As the plaintexts
+   * and keys are generated in advance for fixed-vs-random batch captures, we
+   * need to make sure the corresponding buffers are sufficiently large. Note
+   * that on both CW305 and CW310, the main SRAM has a size of 128 kBytes. So it
+   * should be fine to allocate space for 256 segments (2 * 16 Bytes * 256 = 8
+   * kBytes).
    */
-  kNumBatchOpsMax = 81,
+  kNumBatchOpsMax = 256,
   /**
    * Max number of encryptions that can be captured before we rewrite the key to
    * reset the internal block counter. Otherwise, the AES peripheral might
@@ -81,9 +92,57 @@ uint8_t batch_plaintexts[kNumBatchOpsMax][kAesTextLength];
 bool sample_fixed = true;
 
 /**
- * Fixed key for fvsr key TVLA batch capture.
+ * An array to store pre-computed round keys derived from the generation key.
+ * The generation key (key_gen) is specified in [DTR] Section 5.1.
+ * This key is used for generating all pseudo-random data for batch captures.
+ * kKeyGen[kAesKeyLength] = {0x12, 0x34, 0x56, 0x78,
+ *                           0x9a, 0xbc, 0xde, 0xf1,
+ *                           0x23, 0x45, 0x67, 0x89,
+ *                           0xab, 0xcd, 0xe0, 0xf0};
  */
-uint8_t key_fixed[kAesKeyLength];
+static const uint32_t kKeyGenRoundKeys[(kAesKeyLength / 4) * 11] = {
+    0xab239a12, 0xcd45bc34, 0xe067de56, 0xf089f178, 0xbc1734ae, 0xe12c69d5,
+    0x836304da, 0x9262eb1a, 0xcb776054, 0x9d7c5039, 0x71f29195, 0x64f6947f,
+    0xd2196e0e, 0x2bb6ca9a, 0xc4b547d6, 0x6602f460, 0x528099f7, 0xd1fa4c86,
+    0xd317a2e5, 0x452321d5, 0x92c040d9, 0x8756ace0, 0xed3e298b, 0x92d7f4d5,
+    0xfc6eaeee, 0xc84f19b5, 0x3ed3edc4, 0x2bb96e9a, 0x7a86e846, 0x99511e07,
+    0x350bd835, 0xd6fd442a, 0x3c46c028, 0x47de8f91, 0x25101bc3, 0x9f49b4f0,
+    0x29155393, 0xb8ff21ae, 0x36130318, 0x79e6af1b, 0xa68f9ac9, 0xcd758aab,
+    0x88beadae, 0x8ef711be};
+
+/**
+ * Plaintext of the fixed set of fixed-vs-random-key TVLA
+ */
+static uint8_t plaintext_fixed[kAesTextLength] = {
+    0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
+    0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa};
+/**
+ * Key of the fixed set of fixed-vs-random-key TVLA (copied as kAesKeyLength)
+ */
+static uint8_t key_fixed[kAesKeyLength] = {0x81, 0x1E, 0x37, 0x31, 0xB0, 0x12,
+                                           0x0A, 0x78, 0x42, 0x78, 0x1E, 0x22,
+                                           0xB2, 0x5C, 0xDD, 0xF9};
+/**
+ * Plaintext of the random set of fixed-vs-random-key TVLA
+ */
+static uint8_t plaintext_random[kAesTextLength] = {
+    0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+    0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc};
+/**
+ * Key of the random set of fixed-vs-random-key TVLA (copied as kAesKeyLength)
+ */
+static uint8_t key_random[kAesKeyLength] = {0x53, 0x53, 0x53, 0x53, 0x53, 0x53,
+                                            0x53, 0x53, 0x53, 0x53, 0x53, 0x53,
+                                            0x53, 0x53, 0x53, 0x53};
+/**
+ * Temp ciphertext variable
+ */
+static uint8_t ciphertext_temp[kAesTextLength];
+
+/**
+ * batch_plaintext for batch capture to initially set it using command.
+ */
+static uint8_t batch_plaintext[kAesTextLength];
 
 /**
  * Block counter variable for manually handling reseeding operations of the
@@ -141,7 +200,7 @@ static void aes_manual_trigger(void) {
 }
 
 /**
- * Simple serial 't' (key set) command handler.
+ * Simple serial 'k' (key set) command handler.
  *
  * This command is designed to set the fixed_key variable and in addition also
  * configures the key into the AES peripheral.
@@ -238,6 +297,30 @@ static void aes_serial_single_encrypt(const uint8_t *plaintext,
   aes_send_ciphertext(false);
 }
 
+/**
+ * Advances data for fvsr-key TVLA - fixed set
+ *
+ * This function updates plaintext_fixed for fvsr-key TVLA, according
+ * to DTR recommendations.
+ */
+static void aes_serial_advance_fixed(void) {
+  aes_sw_encrypt_block(plaintext_fixed, kKeyGenRoundKeys, ciphertext_temp);
+  memcpy(plaintext_fixed, ciphertext_temp, kAesTextLength);
+}
+
+/**
+ * Advances data for fvsr-key TVLA - random set
+ *
+ * This function updates plaintext_random and key_random for fvsr-key and
+ * random TVLA, according to DTR recommendations.
+ */
+static void aes_serial_advance_random(void) {
+  aes_sw_encrypt_block(plaintext_random, kKeyGenRoundKeys, ciphertext_temp);
+  memcpy(plaintext_random, ciphertext_temp, kAesTextLength);
+  aes_sw_encrypt_block(key_random, kKeyGenRoundKeys, ciphertext_temp);
+  memcpy(key_random, ciphertext_temp, kAesTextLength);
+}
+
 /**
  * Simple serial 'b' (batch encrypt) command handler.
  *
@@ -279,9 +362,8 @@ static void aes_serial_batch_encrypt(const uint8_t *data, size_t data_len) {
 
   sca_set_trigger_high();
   for (uint32_t i = 0; i < num_encryptions; ++i) {
-    uint8_t plaintext[kAesTextLength];
-    prng_rand_bytes(plaintext, kAesTextLength);
-    aes_encrypt(plaintext, kAesTextLength);
+    aes_encrypt(plaintext_random, kAesTextLength);
+    aes_serial_advance_random();
   }
   sca_set_trigger_low();
 
@@ -289,7 +371,90 @@ static void aes_serial_batch_encrypt(const uint8_t *data, size_t data_len) {
 }
 
 /**
- * Simple serial 't' (fvsr key set) command handler.
+ * Simple serial 'a' (alternative batch encrypt) command handler.
+ *
+ * This command is designed to maximize the capture rate for side-channel
+ * attacks. It uses the first supplied plaintext and repeats AES encryptions
+ * by using every ciphertext as next plaintext with a constant key. This
+ * minimizes the overhead of UART communication and significantly improves the
+ * capture rate.
+ *
+ * Packet payload must be a `uint32_t` representation of the number of
+ * encryptions to perform. Since generated plaintexts are not cached, there is
+ * no limit on the number of encryptions.
+ *
+ * The key should also be set using 'k' (key set) command.
+ *
+ * The host can verify the operation by checking the last 'r' (ciphertext)
+ * packet that is sent at the end.
+ *
+ * @param data Packet payload.
+ * @param data_len Packet payload length.
+ */
+static void aes_serial_batch_alternative_encrypt(const uint8_t *data,
+                                                 size_t data_len) {
+  // Get num_encryptions from input
+  uint32_t num_encryptions = 0;
+  SS_CHECK(data_len == sizeof(num_encryptions));
+  num_encryptions = read_32(data);
+
+  // Add to current block_ctr to check if > kBlockCtrMax
+  block_ctr += num_encryptions;
+  // Rewrite the key to reset the internal block counter. Otherwise, the AES
+  // peripheral might trigger the reseeding of the internal masking PRNG which
+  // disturbs SCA measurements.
+  if (block_ctr > kBlockCtrMax) {
+    aes_key_mask_and_config(key_fixed, kAesKeyLength);
+    block_ctr = num_encryptions;
+  }
+
+  // First plaintext has been set through command into batch_plaintext
+
+  // Set trigger high outside of loop
+  // On FPGA, the trigger is AND-ed with AES !IDLE and creates a LO-HI-LO per
+  // AES operation
+  sca_set_trigger_high();
+  dif_aes_data_t ciphertext = {.data = {0}};
+  for (uint32_t i = 0; i < num_encryptions; ++i) {
+    // Encrypt
+    aes_encrypt(batch_plaintext, kAesTextLength);
+
+    // Get ciphertext
+    bool ready = false;
+    do {
+      SS_CHECK_DIF_OK(
+          dif_aes_get_status(&aes, kDifAesStatusOutputValid, &ready));
+    } while (!ready);
+    SS_CHECK_DIF_OK(dif_aes_read_output(&aes, &ciphertext));
+
+    // Use ciphertext as next plaintext (incl. next call to this function)
+    memcpy(batch_plaintext, ciphertext.data, kAesTextLength);
+  }
+  sca_set_trigger_low();
+
+  // Send last ciphertext (all-zero if num_encryptions == 0)
+  simple_serial_send_packet('r', (uint8_t *)ciphertext.data, kAesTextLength);
+}
+
+/**
+ * Simple serial 'i' (batch plaintext) command handler.
+ *
+ * This command is designed to set the initial plaintext for
+ * aes_serial_batch_alternative_encrypt.
+ *
+ * The plaintext must be `kAesTextLength` bytes long.
+ *
+ * @param plaintext Buffer holding the initial plaintext.
+ * @param len Length of `plaintext` in bytes; must equal `kAesTextLength`.
+ */
+static void aes_serial_batch_plaintext_set(const uint8_t *plaintext,
+                                           size_t len) {
+  SS_CHECK(len == kAesTextLength);
+  memcpy(batch_plaintext, plaintext, len);
+}
+
+/**
+ * Simple serial 'f' (fvsr key set) command handler.
  *
  * This command is designed to set the fixed key which is used for fvsr key TVLA
  * captures.
@@ -339,20 +504,19 @@ static void aes_serial_fvsr_key_batch_generate(const uint8_t *data,
   for (uint32_t i = 0; i < num_encryptions; ++i) {
     if (sample_fixed) {
       memcpy(batch_keys[i], key_fixed, kAesKeyLength);
+      memcpy(batch_plaintexts[i], plaintext_fixed, kAesTextLength);
+      aes_serial_advance_fixed();
     } else {
-      prng_rand_bytes(batch_keys[i], kAesKeyLength);
+      memcpy(batch_keys[i], key_random, kAesKeyLength);
+      memcpy(batch_plaintexts[i], plaintext_random, kAesTextLength);
+      aes_serial_advance_random();
     }
-    // Note: To decrease memory usage, plaintexts may be generated before use in
-    // every encryption operation instead of generating and storing them for all
-    // encyrption operation in a batch. Also, a new method should be selected
-    // to set sample_fixed variable.
-    prng_rand_bytes(batch_plaintexts[i], kAesTextLength);
     sample_fixed = batch_plaintexts[i][0] & 0x1;
   }
 }
 
 /**
- * Simple serial 'f' (fixed vs random key batch encrypt and generate) command
+ * Simple serial 'e' (fixed vs random key batch encrypt and generate) command
  * handler.
  *
  * This command is designed to maximize the capture rate for side-channel
@@ -410,12 +574,55 @@ static void aes_serial_fvsr_key_batch_encrypt(const uint8_t *data,
  * Simple serial 'l' (seed lfsr) command handler.
  *
  * This function only supports 4-byte seeds.
+ * Enables/disables masking depending on seed value, i.e. 0 for disable.
  *
  * @param seed A buffer holding the seed.
  */
 static void aes_serial_seed_lfsr(const uint8_t *seed, size_t seed_len) {
   SS_CHECK(seed_len == sizeof(uint32_t));
-  sca_seed_lfsr(read_32(seed));
+  uint32_t seed_local = read_32(seed);
+  if (seed_local == 0) {
+    // disable masking
+    transaction.force_masks = true;
+  } else {
+    // enable masking
+    transaction.force_masks = false;
+  }
+  sca_seed_lfsr(seed_local);
+}
+
+/**
+ * Simple serial 'd' (set starting values) command handler.
+ *
+ * This function resets the plaintexts and keys used for FvsR data generation
+ * to their starting values if the received value is 1.
+ * These values are specified in DTR for AES TVLA.
+ *
+ * @param data Input command. For now only data == 1 resets values.
+ */
+static void aes_serial_set_default_values(const uint8_t *data,
+                                          size_t data_len) {
+  SS_CHECK(data_len == sizeof(uint32_t));
+  uint32_t command = 0;
+  command = read_32(data);
+  static const uint8_t kPlaintextFixedStart[kAesTextLength] = {
+      0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa,
+      0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa, 0xaa};
+  static const uint8_t kKeyFixedStart[kAesKeyLength] = {
+      0x81, 0x1E, 0x37, 0x31, 0xB0, 0x12, 0x0A, 0x78,
+      0x42, 0x78, 0x1E, 0x22, 0xB2, 0x5C, 0xDD, 0xF9};
+  static const uint8_t kPlaintextRandomStart[kAesTextLength] = {
+      0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc,
+      0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc};
+  static const uint8_t kKeyRandomStart[kAesKeyLength] = {
+      0x53, 0x53, 0x53, 0x53, 0x53, 0x53, 0x53, 0x53,
+      0x53, 0x53, 0x53, 0x53, 0x53, 0x53, 0x53, 0x53};
+  if (command == 1) {
+    memcpy(plaintext_fixed, kPlaintextFixedStart, kAesTextLength);
+    memcpy(key_fixed, kKeyFixedStart, kAesKeyLength);
+    memcpy(plaintext_random, kPlaintextRandomStart, kAesTextLength);
+    memcpy(key_random, kKeyRandomStart, kAesKeyLength);
+  }
 }
 
 /**
@@ -443,10 +650,13 @@ bool test_main(void) {
   simple_serial_register_handler('k', aes_serial_key_set);
   simple_serial_register_handler('p', aes_serial_single_encrypt);
   simple_serial_register_handler('b', aes_serial_batch_encrypt);
-  simple_serial_register_handler('t', aes_serial_fvsr_key_set);
+  simple_serial_register_handler('f', aes_serial_fvsr_key_set);
   simple_serial_register_handler('g', aes_serial_fvsr_key_batch_generate);
-  simple_serial_register_handler('f', aes_serial_fvsr_key_batch_encrypt);
+  simple_serial_register_handler('e', aes_serial_fvsr_key_batch_encrypt);
   simple_serial_register_handler('l', aes_serial_seed_lfsr);
+  simple_serial_register_handler('a', aes_serial_batch_alternative_encrypt);
+  simple_serial_register_handler('i', aes_serial_batch_plaintext_set);
+  simple_serial_register_handler('d', aes_serial_set_default_values);
 
   LOG_INFO("Initializing AES unit.");
   init_aes();
diff --git a/sw/device/sca/ecc384_serial.c b/sw/device/sca/ecc384_serial.c
index 74a3a6889634c..7894643fbb0d1 100644
--- a/sw/device/sca/ecc384_serial.c
+++ b/sw/device/sca/ecc384_serial.c
@@ -8,29 +8,29 @@
 #include "sw/device/sca/lib/sca.h"
 #include "sw/device/sca/lib/simple_serial.h"
 #include "sw/ip/entropy_src/test/utils/entropy_testutils.h"
+#include "sw/lib/sw/device/base/abs_mmio.h"
 #include "sw/lib/sw/device/base/memory.h"
 #include "sw/lib/sw/device/base/mmio.h"
 #include "sw/lib/sw/device/runtime/ibex.h"
 #include "sw/lib/sw/device/runtime/log.h"
 
 #include "hw/top_darjeeling/sw/autogen/top_darjeeling.h"
+#include "otbn_regs.h"
 
 /**
  * OpenTitan program for OTBN ECDSA-P384 side-channel analysis.
  *
  * This program implements the following simple serial commands:
- *   - Set ephemeral secret key and sign ('p')*,
+ *   - Set ephemeral secret key ('k')*,
  *   - Set private key ('d')*,
  *   - Set message ('n')*,
+ *   - Start signing ('p')*
  *   - Version ('v')+,
  *   - Seed PRNG ('s')+,
+ * Commands marked with * are implemented in this file. Those marked with + are
+ * implemented in the simple serial library.
  * See https://wiki.newae.com/SimpleSerial for details on the protocol.
  *
- * The OTBN-related code was developed based on
- * https://github.com/lowRISC/opentitan/tree/master/sw/device/lib/crypto/ecc/ecdsa_p256.c
- * and
- * https://github.com/lowRISC/opentitan/blob/master/sw/device/tests/crypto/ecdsa_p256_functest.c
- *
  */
 
 OTTF_DEFINE_TEST_CONFIG();
@@ -53,17 +53,37 @@ enum {
   kEcc384NumWords = kEcc384NumBytes / sizeof(uint32_t),
 };
 
+/**
+ * Two shares of the ephemeral secret key k
+ * k = k0 + k1
+ * k0 = ecc384_secret_k[0:11] (0x00000000...ffffffff)
+ * k1 = ecc384_secret_k[12:23] (0x00000000...00000000)
+ *
+ * The default values can be overwritten via
+ * the simpleserial command `k` (see ecc384_set_secret_key_k)
+ */
+uint32_t ecc384_secret_k[2 * kEcc384NumWords] = {
+    0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+};
+
 /**
  * Private key d
  * I took this from here: https://www.rfc-editor.org/rfc/rfc6979#page-33
  * The endianness may need to be fixed.
  *
+ * Delivered as 2 shares. The second share is set to all-zero by default.
+ *
  * The value of this variable can be overwritten via the simpleserial command
  * `d` (see ecc384_set_private_key_d)
  */
-uint32_t ecc384_private_key_d[12] = {
+uint32_t ecc384_private_key_d[2 * kEcc384NumWords] = {
     0xAD3D9D6B, 0x1C8C1B2E, 0x7598B105, 0x4D9F65B6, 0x663B3CE2, 0xBA97F27B,
     0x4077A49A, 0xD8377178, 0x4E72D596, 0x25A8704C, 0xEAC972F8, 0xF5EDD260,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
 };
 
 /**
@@ -71,7 +91,7 @@ uint32_t ecc384_private_key_d[12] = {
  * The value of this variable can be overwritten via the simpleserial command
  * `n` (see ecc384_set_msg).
  */
-uint32_t ecc384_msg[12] = {
+uint32_t ecc384_msg[kEcc384NumWords] = {
     0x48656c6c,  // 'Hell'
     0x6f204f54,  // 'o OT'
     0x424e0000,  // 'BN'
@@ -85,10 +105,10 @@ OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, dptr_r);
 OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, dptr_s);
 OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, dptr_x);
 OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, dptr_y);
-OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, dptr_d);
-OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca,
-                         dptr_rnd);  // x_r not used in p384 verify .s
-OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, dptr_k);
+OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, dptr_d0);
+OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, dptr_d1);
+OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, dptr_k0);
+OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, dptr_k1);
 
 OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, mode);
 OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, msg);
@@ -96,10 +116,10 @@ OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, r);
 OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, s);
 OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, x);
 OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, y);
-OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, d);
-OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, k);
-OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca,
-                         rnd);  // x_r not used in p384 verify .s file
+OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, d0);
+OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, d1);
+OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, k0);
+OTBN_DECLARE_SYMBOL_ADDR(p384_ecdsa_sca, k1);
 
 static const otbn_app_t kOtbnAppP384Ecdsa = OTBN_APP_T_INIT(p384_ecdsa_sca);
 
@@ -113,12 +133,14 @@ static const otbn_addr_t kOtbnVarDptrX =
     OTBN_ADDR_T_INIT(p384_ecdsa_sca, dptr_x);
 static const otbn_addr_t kOtbnVarDptrY =
     OTBN_ADDR_T_INIT(p384_ecdsa_sca, dptr_y);
-static const otbn_addr_t kOtbnVarDptrD =
-    OTBN_ADDR_T_INIT(p384_ecdsa_sca, dptr_d);
-static const otbn_addr_t kOtbnVarDptrRnd =
-    OTBN_ADDR_T_INIT(p384_ecdsa_sca, dptr_rnd);
-static const otbn_addr_t kOtbnVarDptrK =
-    OTBN_ADDR_T_INIT(p384_ecdsa_sca, dptr_k);
+static const otbn_addr_t kOtbnVarDptrD0 =
+    OTBN_ADDR_T_INIT(p384_ecdsa_sca, dptr_d0);
+static const otbn_addr_t kOtbnVarDptrD1 =
+    OTBN_ADDR_T_INIT(p384_ecdsa_sca, dptr_d1);
+static const otbn_addr_t kOtbnVarDptrK0 =
+    OTBN_ADDR_T_INIT(p384_ecdsa_sca, dptr_k0);
+static const otbn_addr_t kOtbnVarDptrK1 =
+    OTBN_ADDR_T_INIT(p384_ecdsa_sca, dptr_k1);
 
 static const otbn_addr_t kOtbnVarMode = OTBN_ADDR_T_INIT(p384_ecdsa_sca, mode);
 static const otbn_addr_t kOtbnVarMsg = OTBN_ADDR_T_INIT(p384_ecdsa_sca, msg);
@@ -126,9 +148,10 @@ static const otbn_addr_t kOtbnVarR = OTBN_ADDR_T_INIT(p384_ecdsa_sca, r);
 static const otbn_addr_t kOtbnVarS = OTBN_ADDR_T_INIT(p384_ecdsa_sca, s);
 static const otbn_addr_t kOtbnVarX = OTBN_ADDR_T_INIT(p384_ecdsa_sca, x);
 static const otbn_addr_t kOtbnVarY = OTBN_ADDR_T_INIT(p384_ecdsa_sca, y);
-static const otbn_addr_t kOtbnVarD = OTBN_ADDR_T_INIT(p384_ecdsa_sca, d);
-static const otbn_addr_t kOtbnVarRnd = OTBN_ADDR_T_INIT(p384_ecdsa_sca, rnd);
-static const otbn_addr_t kOtbnVarK = OTBN_ADDR_T_INIT(p384_ecdsa_sca, k);
+static const otbn_addr_t kOtbnVarD0 = OTBN_ADDR_T_INIT(p384_ecdsa_sca, d0);
+static const otbn_addr_t kOtbnVarD1 = OTBN_ADDR_T_INIT(p384_ecdsa_sca, d1);
+static const otbn_addr_t kOtbnVarK0 = OTBN_ADDR_T_INIT(p384_ecdsa_sca, k0);
+static const otbn_addr_t kOtbnVarK1 = OTBN_ADDR_T_INIT(p384_ecdsa_sca, k1);
 
 /**
  * Makes a single dptr in the P384 library point to where its value is stored.
@@ -158,22 +181,50 @@ static void setup_data_pointers(void) {
   setup_data_pointer(kOtbnVarDptrS, kOtbnVarS);
   setup_data_pointer(kOtbnVarDptrX, kOtbnVarX);
   setup_data_pointer(kOtbnVarDptrY, kOtbnVarY);
-  setup_data_pointer(kOtbnVarDptrD, kOtbnVarD);
-  setup_data_pointer(kOtbnVarDptrRnd, kOtbnVarRnd);
-  setup_data_pointer(kOtbnVarDptrK, kOtbnVarK);
+  setup_data_pointer(kOtbnVarDptrD0, kOtbnVarD0);
+  setup_data_pointer(kOtbnVarDptrD1, kOtbnVarD1);
+  setup_data_pointer(kOtbnVarDptrK0, kOtbnVarK0);
+  setup_data_pointer(kOtbnVarDptrK1, kOtbnVarK1);
+}
+
+/**
+ * Simple serial 'k' (set ephemeral key) command handler.
+ *
+ * This function sets both shares of the secret scalar k.
+ * The first 48 bytes (i.e., kEcc384NumBytes) are used as k0, and
+ * the last 48 bytes (i.e., kEcc384NumBytes) are used as k1.
+ *
+ * Any of the shares can be set to all zeros to simplify the SCA.
+ *
+ * As this function sets both shares,
+ * the data length must be `2*kEcc384NumBytes`.
+ *
+ * @param secret_k Key.
+ * @param secret_k_len Key length.
+ */
+static void ecc384_set_secret_key_k(const uint8_t *secret_k,
+                                    size_t secret_k_len) {
+  SS_CHECK(secret_k_len == 2 * kEcc384NumBytes);
+  memcpy(ecc384_secret_k, secret_k, secret_k_len);
 }
 
 /**
  * Simple serial 'd' (set private key) command handler.
  *
- * This function does not use key shares to simplify side-channel analysis.
- * The key must be `kEcc384NumBytes` bytes long.
+ * This function sets both shares of the private key d.
+ * The first 48 bytes (i.e., kEcc384NumBytes) are used as d0, and
+ * the last 48 bytes (i.e., kEcc384NumBytes) are used as d1.
+ *
+ * Any of the shares can be set to all zeros to simplify the SCA.
+ *
+ * As this function sets both shares,
+ * the data length must be `2*kEcc384NumBytes`.
  *
  * @param key_d Key.
  * @param key_d_len Key length.
  */
 static void ecc_384_set_private_key_d(const uint8_t *key_d, size_t key_d_len) {
-  SS_CHECK(key_d_len == kEcc384NumBytes);
+  SS_CHECK(key_d_len == 2 * kEcc384NumBytes);
   memcpy(ecc384_private_key_d, key_d, key_d_len);
 }
 
@@ -231,23 +282,21 @@ static void p384_ecdsa_sign(const uint32_t *msg, const uint32_t *private_key_d,
   setup_data_pointers();
 
   uint32_t mode = 1;  // mode 1 => sign
-  LOG_INFO("Copy data");
+  // LOG_INFO("Copy data");
   SS_CHECK_STATUS_OK(otbn_dmem_write(/*num_words=*/1, &mode, kOtbnVarMode));
   p384_dmem_write(msg, kOtbnVarMsg);
-  p384_dmem_write(private_key_d, kOtbnVarD);
+  p384_dmem_write(private_key_d, kOtbnVarD0);
+  p384_dmem_write(private_key_d + kEcc384NumWords, kOtbnVarD1);
 
-  SS_CHECK_STATUS_OK(otbn_dmem_write(kEcc384NumWords, k, kOtbnVarK));
+  SS_CHECK_STATUS_OK(otbn_dmem_write(kEcc384NumWords, k, kOtbnVarK0));
+  SS_CHECK_STATUS_OK(
+      otbn_dmem_write(kEcc384NumWords, k + kEcc384NumWords, kOtbnVarK1));
 
-  LOG_INFO("Execute");
   SS_CHECK_STATUS_OK(otbn_execute());
-  LOG_INFO("Wait for done");
   SS_CHECK_STATUS_OK(otbn_busy_wait_for_done());
 
-  LOG_INFO("Get results");
   SS_CHECK_STATUS_OK(otbn_dmem_read(kEcc384NumWords, kOtbnVarR, signature_r));
   SS_CHECK_STATUS_OK(otbn_dmem_read(kEcc384NumWords, kOtbnVarS, signature_s));
-  LOG_INFO("r[0]: 0x%02x", signature_r[0]);
-  LOG_INFO("s[0]: 0x%02x", signature_s[0]);
 }
 
 /**
@@ -263,18 +312,13 @@ static void p384_ecdsa_sign(const uint32_t *msg, const uint32_t *private_key_d,
  * UART.
  * @param secret_k_len Length of the ephemeral key.
  */
-static void ecc_384_ecdsa(const uint8_t *ecc384_secret_k_bytes,
-                          size_t secret_k_len) {
-  if (secret_k_len != kEcc384NumBytes) {
-    LOG_INFO("Invalid data length %hu", (uint8_t)secret_k_len);
-    return;
-  }
-  // Copy k to an aligned buffer.
-  uint32_t ecc384_secret_k[kEcc384NumWords];
-  memcpy(ecc384_secret_k, ecc384_secret_k_bytes, kEcc384NumBytes);
-
+static void ecc384_ecdsa(const uint8_t *ecc384_secret_k_bytes,
+                         size_t secret_k_len) {
   LOG_INFO("SSECDSA starting...");
   SS_CHECK_STATUS_OK(otbn_load_app(kOtbnAppP384Ecdsa));
+  LOG_INFO(
+      "otbn_status: 0x%08x",
+      abs_mmio_read32(TOP_DARJEELING_OTBN_BASE_ADDR + OTBN_STATUS_REG_OFFSET));
 
   uint32_t ecc384_signature_r[kEcc384NumWords];
   uint32_t ecc384_signature_s[kEcc384NumWords];
@@ -301,7 +345,7 @@ static void ecc_384_ecdsa(const uint8_t *ecc384_secret_k_bytes,
   simple_serial_send_packet('r', ecc384_signature_r_bytes, kEcc384NumBytes);
   simple_serial_send_packet('r', ecc384_signature_s_bytes, kEcc384NumBytes);
 
-  LOG_INFO("Clearing OTBN memory");
+  // Clear OTBN memory
   SS_CHECK_STATUS_OK(otbn_dmem_sec_wipe());
   SS_CHECK_STATUS_OK(otbn_imem_sec_wipe());
 }
@@ -321,11 +365,13 @@ static void simple_serial_main(void) {
   LOG_INFO("Initializing simple serial interface to capture board.");
   simple_serial_init(sca_get_uart());
 
-  SS_CHECK(simple_serial_register_handler('p', ecc_384_ecdsa) !=
+  SS_CHECK(simple_serial_register_handler('p', ecc384_ecdsa) ==
            kSimpleSerialOk);
-  SS_CHECK(simple_serial_register_handler('d', ecc_384_set_private_key_d) !=
+  SS_CHECK(simple_serial_register_handler('k', ecc384_set_secret_key_k) ==
            kSimpleSerialOk);
-  SS_CHECK(simple_serial_register_handler('n', ecc384_set_msg) !=
+  SS_CHECK(simple_serial_register_handler('d', ecc_384_set_private_key_d) ==
+           kSimpleSerialOk);
+  SS_CHECK(simple_serial_register_handler('n', ecc384_set_msg) ==
            kSimpleSerialOk);
 
   LOG_INFO("Starting simple serial packet handling.");
@@ -335,6 +381,9 @@ static void simple_serial_main(void) {
 }
 
 bool test_main(void) {
+  (void)kOtbnVarX;
+  (void)kOtbnVarY;
+
   simple_serial_main();
   return true;
 }
diff --git a/sw/device/sca/kmac_serial.c b/sw/device/sca/kmac_serial.c
index 70419708d5505..0516b66e04aaa 100644
--- a/sw/device/sca/kmac_serial.c
+++ b/sw/device/sca/kmac_serial.c
@@ -429,10 +429,13 @@ static void kmac_init(void) {
 
   dif_kmac_config_t config = (dif_kmac_config_t){
       .entropy_mode = kDifKmacEntropyModeSoftware,
+      .entropy_fast_process = kDifToggleDisabled,
       .entropy_seed = {0xaa25b4bf, 0x48ce8fff, 0x5a78282a, 0x48465647,
                        0x70410fef},
-      .entropy_fast_process = false,
-      .msg_mask = true,
+      .message_big_endian = kDifToggleDisabled,
+      .output_big_endian = kDifToggleDisabled,
+      .sideload = kDifToggleDisabled,
+      .msg_mask = kDifToggleEnabled,
   };
   SS_CHECK_DIF_OK(dif_kmac_configure(&kmac, config));
 
@@ -587,7 +590,7 @@ bool test_main(void) {
   simple_serial_register_handler('k', sha3_serial_set_key);
   simple_serial_register_handler('p', sha3_serial_single_absorb);
   simple_serial_register_handler('b', sha3_serial_batch);
-  simple_serial_register_handler('t', sha3_serial_fixed_key_set);
+  simple_serial_register_handler('f', sha3_serial_fixed_key_set);
   simple_serial_register_handler('l', sha3_serial_seed_lfsr);
 
   LOG_INFO("Initializing the KMAC peripheral.");
diff --git a/sw/device/sca/lib/BUILD b/sw/device/sca/lib/BUILD
index 6ee5934cc2136..c9ca5bc81f447 100644
--- a/sw/device/sca/lib/BUILD
+++ b/sw/device/sca/lib/BUILD
@@ -4,6 +4,15 @@
 
 package(default_visibility = ["//visibility:public"])
 
+cc_library(
+    name = "aes",
+    srcs = ["aes.c"],
+    hdrs = ["aes.h"],
+    deps = [
+        "//sw/lib/sw/device/base:memory",
+    ],
+)
+
 cc_library(
     name = "prng",
     srcs = ["prng.c"],
@@ -25,6 +34,7 @@ cc_library(
     hdrs = ["simple_serial.h"],
     deps = [
         ":prng",
+        ":sca",
         "//hw/top_darjeeling/sw/autogen:top_darjeeling",
         "//sw/ip/base/dif:base",
         "//sw/lib/sw/device/arch:device",
diff --git a/sw/device/sca/lib/aes.c b/sw/device/sca/lib/aes.c
new file mode 100644
index 0000000000000..03863b41f6dbe
--- /dev/null
+++ b/sw/device/sca/lib/aes.c
@@ -0,0 +1,212 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * NOTE: The only intended use of this code is to serve as a PRNG for generating
+ * input data for SCA experiments and penetration testing.
+ * The library is not hardened against any type of attacks, and it should not be
+ * used for any purpose other than stated.
+ *
+ * During the SCA experiments, encryptions are verified on the host side by
+ * running the same encryption using PyCryptodome package and comparing the
+ * result.
+ *
+ * Implementation of round-functions is based on a transposed-state technique
+ * for 32-bit architecture presented in:
+ *
+ * [1] Bertoni et al., Efficient Software Implementation of AES on 32-Bit
+ *     Platforms, CHES 2002.
+ *
+ *     https://link.springer.com/content/pdf/10.1007/3-540-36400-5_13.pdf
+ *
+ */
+#include "aes.h"
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "sw/lib/sw/device/base/memory.h"
+
+enum {
+  kAesNumRounds = 10,
+  kAesNumKeyBytes = 16,
+  kAesNumTextBytes = 16,
+  kAesNumStateBytes = 16,
+  kAesNumStateWords = 4
+};
+
+static void aes_add_round_key(uint32_t *state, const uint32_t *round_key) {
+  state[0] ^= round_key[0];
+  state[1] ^= round_key[1];
+  state[2] ^= round_key[2];
+  state[3] ^= round_key[3];
+}
+
+static void aes_sub_bytes(uint32_t *state) {
+  // SubBytes on a transposed state
+  // Section 3.1 of [1]
+  for (size_t i = 0; i < 4; ++i) {
+    state[i] = (uint32_t)kSbox[state[i] & 0xff] |
+               ((uint32_t)kSbox[(state[i] >> 8) & 0xff] << 8) |
+               ((uint32_t)kSbox[(state[i] >> 16) & 0xff] << 16) |
+               ((uint32_t)kSbox[(state[i] >> 24) & 0xff] << 24);
+  }
+}
+
+static uint32_t aes_mul2(uint32_t s) {
+  // Multiplication by 2 in Rijndael field.
+  // Each byte of the 32b input word is multiplied.
+  uint32_t t;
+  t = (uint32_t)kMul2[s & 0xff] | ((uint32_t)kMul2[(s >> 8) & 0xff] << 8) |
+      ((uint32_t)kMul2[(s >> 16) & 0xff] << 16) |
+      ((uint32_t)kMul2[(s >> 24) & 0xff] << 24);
+  return t;
+}
+
+static void aes_shift_rows(uint32_t *state) {
+  // ShiftRows on a transposed state
+  // Section 3.1 of [1]
+  state[1] = (state[1] >> 8) | (state[1] << 24);
+  state[2] = (state[2] >> 16) | (state[2] << 16);
+  state[3] = (state[3] >> 24) | (state[3] << 8);
+}
+
+static void aes_mix_columns(uint32_t *state) {
+  // MixColumns on a transposed state
+  // Section 3.1 of [1]
+  uint32_t temp[kAesNumStateWords];
+
+  memcpy(temp, state, kAesNumStateBytes);
+
+  state[0] = temp[1] ^ temp[2] ^ temp[3];
+  state[1] = temp[0] ^ temp[2] ^ temp[3];
+  state[2] = temp[0] ^ temp[1] ^ temp[3];
+  state[3] = temp[0] ^ temp[1] ^ temp[2];
+
+  temp[0] = aes_mul2(temp[0]);
+  temp[1] = aes_mul2(temp[1]);
+  temp[2] = aes_mul2(temp[2]);
+  temp[3] = aes_mul2(temp[3]);
+
+  state[0] ^= temp[0] ^ temp[1];
+  state[1] ^= temp[1] ^ temp[2];
+  state[2] ^= temp[2] ^ temp[3];
+  state[3] ^= temp[3] ^ temp[0];
+}
+
+static void aes_transpose_to_32(uint8_t *in_data, uint32_t *out_data) {
+  out_data[0] = (uint32_t)in_data[0] | ((uint32_t)in_data[4] << 8) |
+                ((uint32_t)in_data[8] << 16) | ((uint32_t)in_data[12] << 24);
+  out_data[1] = (uint32_t)in_data[1] | ((uint32_t)in_data[5] << 8) |
+                ((uint32_t)in_data[9] << 16) | ((uint32_t)in_data[13] << 24);
+  out_data[2] = (uint32_t)in_data[2] | ((uint32_t)in_data[6] << 8) |
+                ((uint32_t)in_data[10] << 16) | ((uint32_t)in_data[14] << 24);
+  out_data[3] = (uint32_t)in_data[3] | ((uint32_t)in_data[7] << 8) |
+                ((uint32_t)in_data[11] << 16) | ((uint32_t)in_data[15] << 24);
+}
+
+static void aes_transpose_from_32(uint32_t *in_data, uint8_t *out_data) {
+  out_data[0] = (uint8_t)(in_data[0] & 0xff);
+  out_data[1] = (uint8_t)(in_data[1] & 0xff);
+  out_data[2] = (uint8_t)(in_data[2] & 0xff);
+  out_data[3] = (uint8_t)(in_data[3] & 0xff);
+  out_data[4] = (uint8_t)(in_data[0] >> 8) & 0xff;
+  out_data[5] = (uint8_t)(in_data[1] >> 8) & 0xff;
+  out_data[6] = (uint8_t)(in_data[2] >> 8) & 0xff;
+  out_data[7] = (uint8_t)(in_data[3] >> 8) & 0xff;
+  out_data[8] = (uint8_t)(in_data[0] >> 16) & 0xff;
+  out_data[9] = (uint8_t)(in_data[1] >> 16) & 0xff;
+  out_data[10] = (uint8_t)(in_data[2] >> 16) & 0xff;
+  out_data[11] = (uint8_t)(in_data[3] >> 16) & 0xff;
+  out_data[12] = (uint8_t)(in_data[0] >> 24) & 0xff;
+  out_data[13] = (uint8_t)(in_data[1] >> 24) & 0xff;
+  out_data[14] = (uint8_t)(in_data[2] >> 24) & 0xff;
+  out_data[15] = (uint8_t)(in_data[3] >> 24) & 0xff;
+}
+
+static uint8_t aes_rcon_next(uint8_t rcon) {
+  // A valid rcon is never 0, so 0 marks the not-yet-initialized state
+  if (rcon != 0) {
+    // update round constant
+    return kMul2[rcon];
+  } else {
+    // init round constant to first-round value
+    return 0x1;
+  }
+}
+
+static void aes_key_expand(uint8_t *round_key, uint8_t *rcon) {
+  uint8_t temp[kAesNumStateWords];
+  uint8_t old_key[kAesNumKeyBytes];
+
+  // copy the current round key to old_key
+  memcpy(old_key, round_key, kAesNumKeyBytes);
+
+  // shift last word
+  temp[0] = old_key[13];
+  temp[1] = old_key[14];
+  temp[2] = old_key[15];
+  temp[3] = old_key[12];
+
+  // sub bytes in last word
+  temp[0] = kSbox[temp[0]];
+  temp[1] = kSbox[temp[1]];
+  temp[2] = kSbox[temp[2]];
+  temp[3] = kSbox[temp[3]];
+
+  // update rcon
+  *rcon = aes_rcon_next(*rcon);
+
+  // get new words
+  round_key[0] = temp[0] ^ old_key[0] ^ *rcon;
+  round_key[1] = temp[1] ^ old_key[1];
+  round_key[2] = temp[2] ^ old_key[2];
+  round_key[3] = temp[3] ^ old_key[3];
+
+  for (size_t i = 4; i < kAesNumKeyBytes; ++i) {
+    round_key[i] = round_key[i - 4] ^ old_key[i];
+  }
+}
+
+void aes_key_schedule(uint32_t *round_keys, const uint8_t *key) {
+  // Derives all round keys for AES128
+  // Each key is stored in 4 32-bit words in a transposed-state form.
+  uint8_t rcon = 0;
+  uint8_t key_temp[kAesNumKeyBytes];
+  uint32_t key_temp_32[kAesNumStateWords];
+
+  memcpy(key_temp, key, kAesNumKeyBytes);
+  aes_transpose_to_32(key_temp, key_temp_32);
+  memcpy(round_keys, key_temp_32, kAesNumKeyBytes);
+  for (size_t i = 1; i < kAesNumRounds + 1; ++i) {
+    aes_key_expand(key_temp, &rcon);
+    aes_transpose_to_32(key_temp, key_temp_32);
+    memcpy(round_keys + i * kAesNumStateWords, key_temp_32, kAesNumKeyBytes);
+  }
+}
+
+void aes_sw_encrypt_block(const uint8_t *plain_text, const uint32_t *round_keys,
+                          uint8_t *cipher_text) {
+  uint32_t state[kAesNumStateWords];
+
+  // initially transpose state
+  uint8_t pt[kAesNumTextBytes];
+  memcpy(pt, plain_text, kAesNumTextBytes);
+  aes_transpose_to_32(pt, state);
+
+  // encrypt
+  aes_add_round_key(state, round_keys);
+  for (int j = 0; j < kAesNumRounds - 1; ++j) {
+    aes_sub_bytes(state);
+    aes_shift_rows(state);
+    aes_mix_columns(state);
+    aes_add_round_key(state, round_keys + (j + 1) * kAesNumStateWords);
+  }
+  aes_sub_bytes(state);
+  aes_shift_rows(state);
+  aes_add_round_key(state, round_keys + kAesNumStateWords * kAesNumRounds);
+
+  // transpose the result back into the byte form
+  aes_transpose_from_32(state, cipher_text);
+}
diff --git a/sw/device/sca/lib/aes.h b/sw/device/sca/lib/aes.h
new file mode 100644
index 0000000000000..b2db60cae28cc
--- /dev/null
+++ b/sw/device/sca/lib/aes.h
@@ -0,0 +1,149 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+#ifndef OPENTITAN_SW_DEVICE_SCA_LIB_AES_H_
+#define OPENTITAN_SW_DEVICE_SCA_LIB_AES_H_
+
+/**
+ * NOTE: The only intended use of this code is to serve as a PRNG for generating
+ * input data for SCA experiments and penetration testing.
+ * The library is not hardened against any type of attacks, and it should not be
+ * used for any purpose other than stated.
+ *
+ * During the SCA experiments, encryptions are verified on the host side by
+ * running the same encryption using PyCryptodome package and comparing the
+ * result.
+ *
+ * Implementation of round-functions is based on a transposed-state technique
+ * for 32-bit architecture presented in:
+ *
+ * [1] Bertoni et al., Efficient Software Implementation of AES on 32-Bit
+ *     Platforms, CHES 2002.
+ *
+ *     https://link.springer.com/content/pdf/10.1007/3-540-36400-5_13.pdf
+ *
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "sw/lib/sw/device/base/memory.h"
+
+/**
+ * Encrypt one data block (16 Bytes) in ECB mode.
+ *
+ * @param       plain_text  Input block to encrypt
+ * @param       round_keys  All round keys (pre-computed)
+ * @param[out]  cipher_text Encrypted output block
+ */
+void aes_sw_encrypt_block(const uint8_t *plain_text, const uint32_t *round_keys,
+                          uint8_t *cipher_text);
+
+/**
+ * Generate all round keys for AES-128 encryption.
+ * Store keys in a transposed-state form.
+ *
+ * @param[out]  round_keys Round keys for all rounds
+ * @param  key             Encryption key
+ */
+void aes_key_schedule(uint32_t *round_keys, const uint8_t *key);
+
+static const uint8_t kSbox[256] = {
+    0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5,
+    0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76,
+
+    0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0,
+    0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0,
+
+    0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC,
+    0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15,
+
+    0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A,
+    0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75,
+
+    0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0,
+    0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84,
+
+    0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B,
+    0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF,
+
+    0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85,
+    0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8,
+
+    0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5,
+    0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2,
+
+    0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17,
+    0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73,
+
+    0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88,
+    0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB,
+
+    0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C,
+    0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79,
+
+    0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9,
+    0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08,
+
+    0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6,
+    0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A,
+
+    0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E,
+    0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E,
+
+    0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94,
+    0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF,
+
+    0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68,
+    0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16};
+
+static const uint8_t kMul2[256] = {
+    0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e,
+    0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e,
+
+    0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e,
+    0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e,
+
+    0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e,
+    0x50, 0x52, 0x54, 0x56, 0x58, 0x5a, 0x5c, 0x5e,
+
+    0x60, 0x62, 0x64, 0x66, 0x68, 0x6a, 0x6c, 0x6e,
+    0x70, 0x72, 0x74, 0x76, 0x78, 0x7a, 0x7c, 0x7e,
+
+    0x80, 0x82, 0x84, 0x86, 0x88, 0x8a, 0x8c, 0x8e,
+    0x90, 0x92, 0x94, 0x96, 0x98, 0x9a, 0x9c, 0x9e,
+
+    0xa0, 0xa2, 0xa4, 0xa6, 0xa8, 0xaa, 0xac, 0xae,
+    0xb0, 0xb2, 0xb4, 0xb6, 0xb8, 0xba, 0xbc, 0xbe,
+
+    0xc0, 0xc2, 0xc4, 0xc6, 0xc8, 0xca, 0xcc, 0xce,
+    0xd0, 0xd2, 0xd4, 0xd6, 0xd8, 0xda, 0xdc, 0xde,
+
+    0xe0, 0xe2, 0xe4, 0xe6, 0xe8, 0xea, 0xec, 0xee,
+    0xf0, 0xf2, 0xf4, 0xf6, 0xf8, 0xfa, 0xfc, 0xfe,
+
+    0x1b, 0x19, 0x1f, 0x1d, 0x13, 0x11, 0x17, 0x15,
+    0x0b, 0x09, 0x0f, 0x0d, 0x03, 0x01, 0x07, 0x05,
+
+    0x3b, 0x39, 0x3f, 0x3d, 0x33, 0x31, 0x37, 0x35,
+    0x2b, 0x29, 0x2f, 0x2d, 0x23, 0x21, 0x27, 0x25,
+
+    0x5b, 0x59, 0x5f, 0x5d, 0x53, 0x51, 0x57, 0x55,
+    0x4b, 0x49, 0x4f, 0x4d, 0x43, 0x41, 0x47, 0x45,
+
+    0x7b, 0x79, 0x7f, 0x7d, 0x73, 0x71, 0x77, 0x75,
+    0x6b, 0x69, 0x6f, 0x6d, 0x63, 0x61, 0x67, 0x65,
+
+    0x9b, 0x99, 0x9f, 0x9d, 0x93, 0x91, 0x97, 0x95,
+    0x8b, 0x89, 0x8f, 0x8d, 0x83, 0x81, 0x87, 0x85,
+
+    0xbb, 0xb9, 0xbf, 0xbd, 0xb3, 0xb1, 0xb7, 0xb5,
+    0xab, 0xa9, 0xaf, 0xad, 0xa3, 0xa1, 0xa7, 0xa5,
+
+    0xdb, 0xd9, 0xdf, 0xdd, 0xd3, 0xd1, 0xd7, 0xd5,
+    0xcb, 0xc9, 0xcf, 0xcd, 0xc3, 0xc1, 0xc7, 0xc5,
+
+    0xfb, 0xf9, 0xff, 0xfd, 0xf3, 0xf1, 0xf7, 0xf5,
+    0xeb, 0xe9, 0xef, 0xed, 0xe3, 0xe1, 0xe7, 0xe5};
+
+#endif  // OPENTITAN_SW_DEVICE_SCA_LIB_AES_H_
diff --git a/sw/device/sca/lib/sca.c b/sw/device/sca/lib/sca.c
index c22de550faf20..13210836b4764 100644
--- a/sw/device/sca/lib/sca.c
+++ b/sw/device/sca/lib/sca.c
@@ -32,21 +32,28 @@
 /**
  * Bitfield for the trigger source.
  *
- * Bits 9 to 11 are used to select the trigger source. See chiplevel.sv.tpl for
- * details.
+ * Bits 10 and 11 are used to select the trigger source. See chiplevel.sv.tpl
+ * for details.
  */
 static const bitfield_field32_t kTriggerSourceBitfield = {
-    .index = 9,
-    .mask = 0x7,
+    .index = 10,
+    .mask = 0x3,
 };
 
 enum {
   /**
-   * Bit index of the trigger gate signal for gating the trigger from software.
+   * Bit index of the hardware trigger gate signal for gating the hardware
+   * trigger from software.
    *
    * See chiplevel.sv.tpl for details.
    */
-  kTriggerGateBitIndex = 8,
+  kTriggerHwGateBitIndex = 9,
+  /**
+   * Bit index of the software trigger signal.
+   *
+   * See chiplevel.sv.tpl for details.
+   */
+  kTriggerSwBitIndex = 8,
   /**
    * RV timer settings.
    */
@@ -54,6 +61,9 @@ enum {
   kRvTimerHart = kTopDarjeelingPlicTargetIbex0,
 };
 
+// By default, we use the precise, hardware-gated capture trigger.
+static unsigned int trigger_bit_index = kTriggerHwGateBitIndex;
+
 static dif_uart_t uart0;
 static dif_gpio_t gpio;
 static dif_pinmux_t pinmux;
@@ -98,7 +108,8 @@ static void sca_init_gpio(sca_trigger_source_t trigger) {
 
   uint32_t select_mask =
       bitfield_field32_write(0, kTriggerSourceBitfield, UINT32_MAX);
-  uint32_t enable_mask = bitfield_bit32_write(0, kTriggerGateBitIndex, true);
+  uint32_t enable_mask = bitfield_bit32_write(0, kTriggerHwGateBitIndex, true);
+  enable_mask = bitfield_bit32_write(enable_mask, kTriggerSwBitIndex, true);
 
   OT_DISCARD(dif_gpio_output_set_enabled_all(&gpio, select_mask | enable_mask));
 
@@ -245,12 +256,20 @@ void sca_init(sca_trigger_source_t trigger, sca_peripherals_t enable) {
 
 const dif_uart_t *sca_get_uart(void) { return &uart0; }
 
+void sca_select_trigger_type(sca_trigger_type_t trigger_type) {
+  if (trigger_type == kScaTriggerTypeHwGated) {
+    trigger_bit_index = kTriggerHwGateBitIndex;
+  } else if (trigger_type == kScaTriggerTypeSw) {
+    trigger_bit_index = kTriggerSwBitIndex;
+  }
+}
+
 void sca_set_trigger_high(void) {
-  OT_DISCARD(dif_gpio_write(&gpio, kTriggerGateBitIndex, true));
+  OT_DISCARD(dif_gpio_write(&gpio, trigger_bit_index, true));
 }
 
 void sca_set_trigger_low(void) {
-  OT_DISCARD(dif_gpio_write(&gpio, kTriggerGateBitIndex, false));
+  OT_DISCARD(dif_gpio_write(&gpio, trigger_bit_index, false));
 }
 
 void sca_call_and_sleep(sca_callee callee, uint32_t sleep_cycles) {
diff --git a/sw/device/sca/lib/sca.h b/sw/device/sca/lib/sca.h
index 8a7805b442d0a..9b8ef77e2c80e 100644
--- a/sw/device/sca/lib/sca.h
+++ b/sw/device/sca/lib/sca.h
@@ -24,28 +24,43 @@ typedef enum sca_trigger_source {
   /**
    * Use AES for capture trigger.
    *
-   * The trigger signal will go high 40 cycles after `dif_aes_trigger()` is
+   * The trigger signal will go high 320 cycles after `dif_aes_trigger()` is
    * called and remain high until the operation is complete.
    */
-  kScaTriggerSourceAes,
+  kScaTriggerSourceAes = 0,
   /**
    * Use HMAC for capture trigger.
    */
-  kScaTriggerSourceHmac,
+  kScaTriggerSourceHmac = 1,
   /**
    * Use KMAC for capture trigger.
    */
-  kScaTriggerSourceKmac,
-  /**
-   * Use OTBN (IO_DIV4 clock) for capture trigger.
-   */
-  kScaTriggerSourceOtbnIoDiv4,
+  kScaTriggerSourceKmac = 2,
   /**
    * Use OTBN for capture trigger.
    */
-  kScaTriggerSourceOtbn,
+  kScaTriggerSourceOtbn = 3,
 } sca_trigger_source_t;
 
+/**
+ * Trigger type.
+ */
+typedef enum sca_trigger_type {
+  /**
+   * Use the precise hardware capture trigger gateable by software. If selected,
+   * the actual capture trigger is generated based on the clkmgr_aon_idle signal
+   * of the peripheral corresponding to the selected trigger source.
+   *
+   * Note that this is available on FPGA only.
+   */
+  kScaTriggerTypeHwGated = 0,
+  /**
+   * Use the fully software controlled capture trigger. If selected, the
+   * configured trigger source is not relevant.
+   */
+  kScaTriggerTypeSw = 1,
+} sca_trigger_type_t;
+
 /**
  * Peripherals.
  *
@@ -124,6 +139,13 @@ void sca_init(sca_trigger_source_t trigger, sca_peripherals_t enable);
  */
 const dif_uart_t *sca_get_uart(void);
 
+/**
+ * Select the capture trigger type.
+ *
+ * @param trigger_type The trigger type to select.
+ */
+void sca_select_trigger_type(sca_trigger_type_t trigger_type);
+
 /**
  * Sets capture trigger high.
  *
diff --git a/sw/device/sca/lib/simple_serial.c b/sw/device/sca/lib/simple_serial.c
index 564a10f5c500e..b3ae942efb914 100644
--- a/sw/device/sca/lib/simple_serial.c
+++ b/sw/device/sca/lib/simple_serial.c
@@ -5,6 +5,7 @@
 #include "sw/device/sca/lib/simple_serial.h"
 
 #include "sw/device/sca/lib/prng.h"
+#include "sw/device/sca/lib/sca.h"
 #include "sw/ip/uart/dif/dif_uart.h"
 #include "sw/lib/sw/device/arch/device.h"
 #include "sw/lib/sw/device/base/macros.h"
@@ -34,11 +35,11 @@ enum {
  * Command handlers.
  *
  * Clients can register handlers for commands 'a'-'z' using
- * `simple_serial_register_handler()` except for 'v' (version) and 's' (seed
- * PRNG), which are handled by this library. This array has an extra element
- * (27) that is initialized in `simple_serial_init()` to point to
- * `simple_serial_unknown_command()` in order to simplify handling of invalid
- * commands in `simple_serial_process_packet()`.
+ * `simple_serial_register_handler()` except for 'v' (version), 's' (seed
+ * PRNG), and 't' (select trigger type) which are handled by this library. This
+ * array has an extra element (27) that is initialized in `simple_serial_init()`
+ * to point to `simple_serial_unknown_command()` in order to simplify handling
+ * of invalid commands in `simple_serial_process_packet()`.
  */
 static simple_serial_command_handler handlers[27];
 static const dif_uart_t *uart;
@@ -161,6 +162,20 @@ static void simple_serial_seed_prng(const uint8_t *seed, size_t seed_len) {
   prng_seed(read_32(seed));
 }
 
+/**
+ * Simple serial 't' (select trigger type) command handler.
+ *
+ * The 1-byte payload is cast to `sca_trigger_type_t` without a range check.
+ *
+ * @param trigger A buffer holding the trigger type (one byte).
+ * @param trigger_len Buffer length in bytes; must be 1.
+ */
+static void simple_serial_select_trigger_type(const uint8_t *trigger,
+                                              size_t trigger_len) {
+  SS_CHECK(trigger_len == 1);
+  sca_select_trigger_type((sca_trigger_type_t)trigger[0]);
+}
+
 /**
  * Handler for uninmplemented simple serial commands.
  *
@@ -181,6 +196,8 @@ void simple_serial_init(const dif_uart_t *uart_) {
     handlers[i] = simple_serial_unknown_command;
   }
   handlers[simple_serial_get_handler_index('s')] = simple_serial_seed_prng;
+  handlers[simple_serial_get_handler_index('t')] =
+      simple_serial_select_trigger_type;
   handlers[simple_serial_get_handler_index('v')] = simple_serial_version;
 }
 
@@ -188,7 +205,7 @@ simple_serial_result_t simple_serial_register_handler(
     uint8_t cmd, simple_serial_command_handler handler) {
   if (!simple_serial_is_valid_command(cmd)) {
     return kSimpleSerialError;
-  } else if (cmd == 's' || cmd == 'v') {
+  } else if (cmd == 's' || cmd == 't' || cmd == 'v') {
     // Cannot register handlers for built-in commands.
     return kSimpleSerialError;
   } else {
diff --git a/sw/device/sca/otbn_vertical/ecc256_keygen_serial.c b/sw/device/sca/otbn_vertical/ecc256_keygen_serial.c
index 1b9b6c02a5673..bcc972f02b7ea 100644
--- a/sw/device/sca/otbn_vertical/ecc256_keygen_serial.c
+++ b/sw/device/sca/otbn_vertical/ecc256_keygen_serial.c
@@ -123,6 +123,21 @@ uint32_t ecc256_seed[kEcc256SeedNumWords] = {
     0x22b79d5f, 0x1176f31d, 0xb5ac3a51, 0x99a082d7, 0x484eb366,
 };
 
+uint32_t ecc256_C[kEcc256SeedNumWords] = {
+    0x016064e9, 0x11e3f4d6, 0xac3a6fa7, 0xaba11a1b, 0x8f9271d1,
+    0x22b79d5f, 0x1176f31d, 0xb5ac3a51, 0x99a082d7, 0x484eb366,
+};
+
+uint32_t random_number[kEcc256CoordNumWords] = {
+    0x016064e9, 0x11e3f4d6, 0xac3a6fa7, 0xaba11a1b,
+    0x22b79d5f, 0x1176f31d, 0xb5ac3a51, 0x99a082d7,
+};
+
+uint32_t ecc256_fixed_number[kEcc256CoordNumWords] = {
+    0x04030201, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000,
+};
+
 void ecc256_en_masks(const uint8_t *enable, size_t enable_len) {
   SS_CHECK(enable_len == 1);
   if (*enable) {
@@ -132,6 +147,14 @@ void ecc256_en_masks(const uint8_t *enable, size_t enable_len) {
   }
 }
 
+/**
+ * Simple serial 'x' (set seed) command handler.
+ *
+ * The seed must be `kEcc256SeedNumBytes` bytes long.
+ *
+ * @param seed Value for seed share.
+ * @param seed_len Length of seed share.
+ */
 void ecc256_set_seed(const uint8_t *seed, size_t seed_len) {
   SS_CHECK(seed_len == kEcc256SeedNumBytes);
   memcpy(ecc256_seed, seed, seed_len);
@@ -141,6 +164,19 @@ void ecc256_set_seed(const uint8_t *seed, size_t seed_len) {
   prng_rand_bytes((unsigned char *)temp, kEcc256SeedNumBytes);
 }
 
+/**
+ * Simple serial 'c' (set constant) command handler.
+ *
+ * The constant must be `kEcc256SeedNumBytes` bytes long.
+ *
+ * @param C Value of the C constant.
+ * @param len Length of the C constant.
+ */
+void ecc256_set_c(const uint8_t *C, size_t len) {
+  SS_CHECK(len == kEcc256SeedNumBytes);
+  memcpy(ecc256_C, C, len);
+}
+
 /**
  * Callback wrapper for OTBN manual trigger function.
  */
@@ -173,7 +209,7 @@ static void p256_run_keygen(uint32_t mode, const uint32_t *share0,
   sca_set_trigger_low();
 }
 
-void ecc256_ecdsa_secret_keygen_batch(const uint8_t *data, size_t data_len) {
+void ecc256_ecdsa_keygen_fvsr_seed_batch(const uint8_t *data, size_t data_len) {
   uint32_t num_traces = 0;
   uint32_t batch_digest[kEcc256SeedNumWords];
   uint8_t dummy[kEcc256SeedNumBytes];
@@ -234,6 +270,103 @@ void ecc256_ecdsa_secret_keygen_batch(const uint8_t *data, size_t data_len) {
                             kEcc256SeedNumWords * 4);
 }
 
+/**
+ * Adds two integers stored in little-endian byte arrays.
+ *
+ * Adds the integer stored in the source array to the integer stored in the
+ * destination array, in place. The caller must ensure that source_len does
+ * not exceed dest_len and that dest_len is large enough to store the result;
+ * a carry out of the last destination byte is silently dropped.
+ *
+ * @param[in,out] dest  Location of the first input array and the result.
+ * @param[in] source  Location of the second input array.
+ * @param[in] dest_len   Length of the dest array in bytes.
+ * @param[in] source_len   Length of the source array in bytes.
+ */
+static void add_arrays(uint8_t *dest, uint8_t *source, size_t dest_len,
+                       size_t source_len) {
+  uint16_t temp = 0;  // Byte sum plus carry from the previous byte.
+
+  for (size_t i = 0; i < source_len; i++) {
+    temp += (uint16_t)source[i] + dest[i];
+    dest[i] = (uint8_t)(temp & 0x00FF);
+    temp >>= 8;  // Keep only the carry.
+  }
+
+  for (size_t i = source_len; i < dest_len; i++) {  // Propagate the carry.
+    temp += (uint16_t)dest[i];
+    dest[i] = (uint8_t)(temp & 0x00FF);
+    temp >>= 8;
+  }
+}
+
+void ecc256_ecdsa_keygen_fvsr_key_batch(const uint8_t *data, size_t data_len) {
+  uint32_t num_traces = 0;
+  uint32_t batch_digest[kEcc256SeedNumWords];
+  uint8_t dummy[kEcc256SeedNumBytes];
+  SS_CHECK(data_len == sizeof(num_traces));
+  num_traces = read_32(data);
+
+  if (num_traces > kNumBatchOpsMax) {
+    LOG_ERROR("Too many traces for one batch.");
+    return;
+  }
+
+  // zero the batch digest
+  for (uint32_t j = 0; j < kEcc256SeedNumWords; ++j) {
+    batch_digest[j] = 0;
+  }
+
+  for (uint32_t i = 0; i < num_traces; ++i) {
+    if (run_fixed) {
+      memcpy(batch_share0[i], ecc256_seed, kEcc256SeedNumBytes);
+    } else {
+      // Random run: the seed share is C + r for a fresh random number r.
+      // The host should set C beforehand (simple serial 'c' command).
+      memcpy(batch_share0[i], ecc256_C, kEcc256SeedNumBytes);
+      prng_rand_bytes((unsigned char *)random_number, kEcc256CoordNumBytes);
+      add_arrays((unsigned char *)batch_share0[i],
+                 (unsigned char *)random_number, kEcc256SeedNumBytes,
+                 kEcc256CoordNumBytes);
+    }
+    if (en_masks) {
+      prng_rand_bytes((unsigned char *)batch_share1[i], kEcc256SeedNumBytes);
+    } else {
+      for (uint32_t j = 0; j < kEcc256SeedNumWords; ++j) {
+        batch_share1[i][j] = 0;
+      }
+    }
+    for (uint32_t j = 0; j < kEcc256SeedNumWords; ++j) {
+      batch_share0[i][j] ^= batch_share1[i][j];
+    }
+    // Another PRNG run to determine 'run_fixed' for the next cycle.
+    prng_rand_bytes(dummy, kEcc256SeedNumBytes);
+
+    run_fixed = dummy[0] & 0x1;
+  }
+
+  for (uint32_t i = 0; i < num_traces; ++i) {
+    p256_run_keygen(kEcc256ModePrivateKeyOnly, batch_share0[i],
+                    batch_share1[i]);
+
+    // Read results.
+    SS_CHECK_STATUS_OK(
+        otbn_dmem_read(kEcc256SeedNumWords, kOtbnVarD0, d0_batch));
+    SS_CHECK_STATUS_OK(
+        otbn_dmem_read(kEcc256SeedNumWords, kOtbnVarD1, d1_batch));
+
+    // The correctness of each batch is verified by computing and sending
+    // the batch digest. This digest is computed by XORing all d0 shares of
+    // the batch.
+    for (uint32_t j = 0; j < kEcc256SeedNumWords; ++j) {
+      batch_digest[j] ^= d0_batch[j];
+    }
+  }
+  // Send the batch digest to the host for verification.
+  simple_serial_send_packet('r', (uint8_t *)batch_digest,
+                            kEcc256SeedNumWords * 4);
+}
+
 /**
  * Generates a secret key from a masked seed.
  *
diff --git a/sw/device/sca/otbn_vertical/ecc256_keygen_serial.h b/sw/device/sca/otbn_vertical/ecc256_keygen_serial.h
index fa9b37cc4bf68..533e2e749075d 100644
--- a/sw/device/sca/otbn_vertical/ecc256_keygen_serial.h
+++ b/sw/device/sca/otbn_vertical/ecc256_keygen_serial.h
@@ -50,15 +50,46 @@ void ecc256_en_masks(const uint8_t *enable, size_t enable_len);
  */
 void ecc256_set_seed(const uint8_t *seed, size_t seed_len);
 
+/**
+ * Simple serial 'c' (set constant) command handler.
+ *
+ * The constant must be `kEcc256SeedNumBytes` bytes long.
+ *
+ * @param C Value of the C constant.
+ * @param len Length of the C constant.
+ */
+void ecc256_set_c(const uint8_t *C, size_t len);
+
+/**
+ * Simple serial 'e' (secret keygen fvsr key batch mode) command handler.
+ *
+ * Collects data for ECDSA keygen fixed-vs-random test in the KEY mode.
+ * In the KEY mode, the fixed set of measurements is generated using the fixed
+ * 320 bit seed. The random set of measurements is generated in two steps:
+ *   1. Choose a random 256 bit number r
+ *   2. Compute the seed as (C + r) where C is the fixed 320 bit constant. Note
+ * that in this case the used key is equal to (C + r) mod curve_order_n.
+ * Takes a number of traces that has to be captured in one batch as input.
+ *
+ * @param data Value for trace count.
+ * @param data_len Length of trace count input.
+ */
+void ecc256_ecdsa_keygen_fvsr_key_batch(const uint8_t *data, size_t data_len);
+
 /**
  * Simple serial 'b' (secret keygen batch mode) command handler.
  *
+ * Collects data for ECDSA keygen fixed-vs-random test in the SEED mode.
+ * In the SEED mode, the fixed set of measurements is generated using the fixed
+ * 320 bit seed. The random set of measurements is generated using a random 320
+ * bit seed. In both cases, the used key is equal to seed mod curve_order_n.
+ *
  * Takes a number of traces that has to be captured in one batch as input.
  *
  * @param data Value for trace count.
  * @param data_len Length of trace count input.
  */
-void ecc256_ecdsa_secret_keygen_batch(const uint8_t *data, size_t data_len);
+void ecc256_ecdsa_keygen_fvsr_seed_batch(const uint8_t *data, size_t data_len);
 
 /**
  * Simple serial 'k' (secret keygen) command handler.
diff --git a/sw/device/sca/otbn_vertical/otbn_vertical_serial.c b/sw/device/sca/otbn_vertical/otbn_vertical_serial.c
index fdabf1489184c..db9fdc43ea210 100644
--- a/sw/device/sca/otbn_vertical/otbn_vertical_serial.c
+++ b/sw/device/sca/otbn_vertical/otbn_vertical_serial.c
@@ -74,13 +74,17 @@ static void simple_serial_main(void) {
 
   simple_serial_init(sca_get_uart());
   SS_CHECK(simple_serial_register_handler(
-               'b', ecc256_ecdsa_secret_keygen_batch) == kSimpleSerialOk);
+               'b', ecc256_ecdsa_keygen_fvsr_seed_batch) == kSimpleSerialOk);
+  SS_CHECK(simple_serial_register_handler(
+               'e', ecc256_ecdsa_keygen_fvsr_key_batch) == kSimpleSerialOk);
   SS_CHECK(simple_serial_register_handler('k', ecc256_ecdsa_secret_keygen) ==
            kSimpleSerialOk);
   SS_CHECK(simple_serial_register_handler('p', ecc256_ecdsa_gen_keypair) ==
            kSimpleSerialOk);
   SS_CHECK(simple_serial_register_handler('x', ecc256_set_seed) ==
            kSimpleSerialOk);
+  SS_CHECK(simple_serial_register_handler('c', ecc256_set_c) ==
+           kSimpleSerialOk);
   SS_CHECK(simple_serial_register_handler('m', ecc256_en_masks) ==
            kSimpleSerialOk);
   SS_CHECK(simple_serial_register_handler('a', ecc256_app_select) ==
diff --git a/sw/device/sca/sha3_serial.c b/sw/device/sca/sha3_serial.c
index d47435b9208c1..b7b1497cc73f9 100644
--- a/sw/device/sca/sha3_serial.c
+++ b/sw/device/sca/sha3_serial.c
@@ -70,6 +70,20 @@ enum {
  */
 static dif_kmac_t kmac;
 
+/**
+ * The KMAC config, shared by kmac_init() and kmac_disable_masking().
+ */
+static dif_kmac_config_t config = (dif_kmac_config_t){
+    .entropy_mode = kDifKmacEntropyModeSoftware,
+    .entropy_fast_process = kDifToggleDisabled,
+    .entropy_seed = {0xaa25b4bf, 0x48ce8fff, 0x5a78282a, 0x48465647,
+                     0x70410fef},
+    .message_big_endian = kDifToggleDisabled,
+    .output_big_endian = kDifToggleDisabled,
+    .sideload = kDifToggleDisabled,
+    .msg_mask = kDifToggleEnabled,
+};
+
 /**
  * KMAC operation state.
  */
@@ -344,14 +358,6 @@ static void kmac_init(void) {
   SS_CHECK_DIF_OK(dif_kmac_init(
       mmio_region_from_addr(TOP_DARJEELING_KMAC_BASE_ADDR), &kmac));
 
-  dif_kmac_config_t config = (dif_kmac_config_t){
-      .entropy_mode = kDifKmacEntropyModeSoftware,
-      .entropy_seed = {0xaa25b4bf, 0x48ce8fff, 0x5a78282a, 0x48465647,
-                       0x70410fef},
-      .entropy_fast_process = false,
-      .msg_mask = true,
-  };
-
   SS_CHECK_DIF_OK(dif_kmac_configure(&kmac, config));
 
   kmac_block_until_idle();
@@ -367,14 +373,13 @@ static void kmac_disable_masking(const uint8_t *masks_off, size_t off_len) {
   SS_CHECK_DIF_OK(dif_kmac_init(
       mmio_region_from_addr(TOP_DARJEELING_KMAC_BASE_ADDR), &kmac));
 
-  dif_kmac_config_t config;
   if (masks_off[0]) {
-    config.entropy_fast_process = true;
-    config.msg_mask = false;
+    config.entropy_fast_process = kDifToggleEnabled;
+    config.msg_mask = kDifToggleDisabled;
     LOG_INFO("Initializing the KMAC peripheral with masking disabled.");
   } else {
-    config.entropy_fast_process = false;
-    config.msg_mask = true;
+    config.entropy_fast_process = kDifToggleDisabled;
+    config.msg_mask = kDifToggleEnabled;
     LOG_INFO("Initializing the KMAC peripheral with masking enabled.");
   }
   SS_CHECK_DIF_OK(dif_kmac_configure(&kmac, config));
@@ -513,7 +518,7 @@ bool test_main(void) {
   simple_serial_init(sca_get_uart());
   simple_serial_register_handler('p', sha3_serial_single_absorb);
   simple_serial_register_handler('b', sha3_serial_batch);
-  simple_serial_register_handler('t', sha3_serial_fixed_message_set);
+  simple_serial_register_handler('f', sha3_serial_fixed_message_set);
   simple_serial_register_handler('l', sha3_serial_seed_lfsr);
   simple_serial_register_handler('m', kmac_disable_masking);
 
diff --git a/sw/device/silicon_creator/lib/drivers/BUILD b/sw/device/silicon_creator/lib/drivers/BUILD
index 0cc5ba7931b4a..43b72777dda41 100644
--- a/sw/device/silicon_creator/lib/drivers/BUILD
+++ b/sw/device/silicon_creator/lib/drivers/BUILD
@@ -251,6 +251,7 @@ cc_library(
         "//hw/top_darjeeling/sw/autogen:top_darjeeling",
         "//sw/lib/sw/device/base:abs_mmio",
         "//sw/lib/sw/device/base:macros",
+        "//sw/lib/sw/device/runtime:hart",
         "//sw/lib/sw/device/silicon_creator:error",
         "//sw/lib/sw/device/silicon_creator:keymgr_binding",
         "//sw/lib/sw/device/silicon_creator/base:sec_mmio",
diff --git a/sw/device/silicon_creator/lib/drivers/keymgr.c b/sw/device/silicon_creator/lib/drivers/keymgr.c
index f08b96c67ae7a..ceb36af1d6252 100644
--- a/sw/device/silicon_creator/lib/drivers/keymgr.c
+++ b/sw/device/silicon_creator/lib/drivers/keymgr.c
@@ -8,6 +8,7 @@
 
 #include "sw/lib/sw/device/base/abs_mmio.h"
 #include "sw/lib/sw/device/base/macros.h"
+#include "sw/lib/sw/device/runtime/hart.h"
 #include "sw/lib/sw/device/silicon_creator/base/sec_mmio.h"
 
 #include "hw/top_darjeeling/sw/autogen/top_darjeeling.h"
@@ -132,3 +133,131 @@ void keymgr_advance_state(void) {
 rom_error_t keymgr_state_check(keymgr_state_t expected_state) {
   return expected_state_check(expected_state);
 }
+
+/**
+ * Fails if the keymgr is not idle.
+ *
+ * @return OK if the key manager is idle, kErrorKeymgrInternal otherwise.
+ */
+OT_WARN_UNUSED_RESULT
+static rom_error_t keymgr_is_idle(void) {
+  uint32_t reg = abs_mmio_read32(kBase + KEYMGR_OP_STATUS_REG_OFFSET);
+  uint32_t status = bitfield_field32_read(reg, KEYMGR_OP_STATUS_STATUS_FIELD);
+  if (launder32(status) == KEYMGR_OP_STATUS_STATUS_VALUE_IDLE) {
+    HARDENED_CHECK_EQ(status, KEYMGR_OP_STATUS_STATUS_VALUE_IDLE);
+    return kErrorOk;
+  }
+  return kErrorKeymgrInternal;
+}
+
+/**
+ * Wait for the key manager to finish an operation.
+ *
+ * Polls the key manager until it is no longer busy. If the operation completed
+ * successfully or the key manager was already idle, returns kErrorOk. If
+ * there was an error during the operation, reads and clears the error code
+ * and returns kErrorKeymgrInternal.
+ *
+ * @return OK or error.
+ */
+OT_WARN_UNUSED_RESULT
+static rom_error_t keymgr_wait_until_done(void) {
+  // Poll the OP_STATUS register until it is something other than "WIP".
+  uint32_t reg;
+  uint32_t status;
+  do {
+    // Read OP_STATUS and then clear by writing back the value we read.
+    reg = abs_mmio_read32(kBase + KEYMGR_OP_STATUS_REG_OFFSET);
+    abs_mmio_write32(kBase + KEYMGR_OP_STATUS_REG_OFFSET, reg);
+    status = bitfield_field32_read(reg, KEYMGR_OP_STATUS_STATUS_FIELD);
+  } while (status == KEYMGR_OP_STATUS_STATUS_VALUE_WIP);
+
+  // Check if the key manager reported errors. If it is already idle or
+  // completed an operation successfully, return an OK status. A `WIP` status
+  // should not be possible because of the check above.
+  switch (launder32(status)) {
+    case KEYMGR_OP_STATUS_STATUS_VALUE_IDLE:
+      HARDENED_CHECK_EQ(status, KEYMGR_OP_STATUS_STATUS_VALUE_IDLE);
+      return kErrorOk;
+    case KEYMGR_OP_STATUS_STATUS_VALUE_DONE_SUCCESS:
+      HARDENED_CHECK_EQ(status, KEYMGR_OP_STATUS_STATUS_VALUE_DONE_SUCCESS);
+      return kErrorOk;
+    case KEYMGR_OP_STATUS_STATUS_VALUE_DONE_ERROR: {
+      // Clear the ERR_CODE register before returning.
+      uint32_t err_code = abs_mmio_read32(kBase + KEYMGR_ERR_CODE_REG_OFFSET);
+      abs_mmio_write32(kBase + KEYMGR_ERR_CODE_REG_OFFSET, err_code);
+      return kErrorKeymgrInternal;
+    }
+  }
+
+  // Should be unreachable.
+  HARDENED_TRAP();
+  return kErrorKeymgrInternal;
+}
+
+rom_error_t keymgr_generate_attestation_key_otbn(
+    keymgr_diversification_t diversification) {
+  HARDENED_RETURN_IF_ERROR(keymgr_is_idle());
+
+  // Select OTBN as the destination.
+  uint32_t ctrl =
+      bitfield_field32_write(0, KEYMGR_CONTROL_SHADOWED_DEST_SEL_FIELD,
+                             KEYMGR_CONTROL_SHADOWED_DEST_SEL_VALUE_OTBN);
+
+  // Select the attestation CDI.
+  ctrl = bitfield_bit32_write(ctrl, KEYMGR_CONTROL_SHADOWED_CDI_SEL_BIT, true);
+
+  // Select the "generate" operation.
+  ctrl = bitfield_field32_write(
+      ctrl, KEYMGR_CONTROL_SHADOWED_OPERATION_FIELD,
+      KEYMGR_CONTROL_SHADOWED_OPERATION_VALUE_GENERATE_HW_OUTPUT);
+
+  // Write the control register.
+  abs_mmio_write32_shadowed(kBase + KEYMGR_CONTROL_SHADOWED_REG_OFFSET, ctrl);
+
+  // Set the version.
+  abs_mmio_write32(kBase + KEYMGR_KEY_VERSION_REG_OFFSET,
+                   diversification.version);
+  // Set the salt.
+  for (size_t i = 0; i < kKeymgrSaltNumWords; i++) {
+    abs_mmio_write32(kBase + KEYMGR_SALT_0_REG_OFFSET + (i * sizeof(uint32_t)),
+                     diversification.salt[i]);
+  }
+
+  // Issue the start command.
+  abs_mmio_write32(kBase + KEYMGR_START_REG_OFFSET, 1 << KEYMGR_START_EN_BIT);
+
+  // Block until keymgr is done.
+  return keymgr_wait_until_done();
+}
+
+rom_error_t keymgr_sideload_clear_otbn(void) {
+  HARDENED_RETURN_IF_ERROR(keymgr_is_idle());
+
+  // Set SIDELOAD_CLEAR to begin continuously clearing the requested slot.
+  abs_mmio_write32(
+      kBase + KEYMGR_SIDELOAD_CLEAR_REG_OFFSET,
+      bitfield_field32_write(0, KEYMGR_SIDELOAD_CLEAR_VAL_FIELD,
+                             KEYMGR_SIDELOAD_CLEAR_VAL_VALUE_OTBN));
+
+  // Read back the value (hardening measure).
+  uint32_t sideload_clear =
+      abs_mmio_read32(kBase + KEYMGR_SIDELOAD_CLEAR_REG_OFFSET);
+  if (bitfield_field32_read(sideload_clear, KEYMGR_SIDELOAD_CLEAR_VAL_FIELD) !=
+      KEYMGR_SIDELOAD_CLEAR_VAL_VALUE_OTBN) {
+    return kErrorKeymgrInternal;
+  }
+
+  // Spin for 100 microseconds.
+  // TODO(#20024): this value seems to work for tests, but it would be good to
+  // run a more principled analysis.
+  busy_spin_micros(100);
+
+  // Stop continuous clearing.
+  abs_mmio_write32(
+      kBase + KEYMGR_SIDELOAD_CLEAR_REG_OFFSET,
+      bitfield_field32_write(0, KEYMGR_SIDELOAD_CLEAR_VAL_FIELD,
+                             KEYMGR_SIDELOAD_CLEAR_VAL_VALUE_NONE));
+
+  return kErrorOk;
+}
diff --git a/sw/device/silicon_creator/lib/drivers/keymgr.h b/sw/device/silicon_creator/lib/drivers/keymgr.h
index 6b561278669f1..b01f4ffbceb60 100644
--- a/sw/device/silicon_creator/lib/drivers/keymgr.h
+++ b/sw/device/silicon_creator/lib/drivers/keymgr.h
@@ -57,6 +57,27 @@ typedef enum keymgr_state {
   kKeymgrStateNumStates,
 } keymgr_state_t;
 
+enum {
+  /**
+   * Number of 32-bit words for the salt.
+   */
+  kKeymgrSaltNumWords = 8,
+};
+
+/**
+ * Data used to differentiate a generated keymgr key.
+ */
+typedef struct keymgr_diversification {
+  /**
+   * Salt value to use for key generation.
+   */
+  uint32_t salt[kKeymgrSaltNumWords];
+  /**
+   * Version for key generation (anti-rollback protection).
+   */
+  uint32_t version;
+} keymgr_diversification_t;
+
 /**
  * The following constants represent the expected number of sec_mmio register
  * writes performed by functions in provided in this module. See
@@ -160,6 +181,31 @@ void keymgr_advance_state(void);
 OT_WARN_UNUSED_RESULT
 rom_error_t keymgr_state_check(keymgr_state_t expected_state);
 
+/**
+ * Derive a key manager key for the OTBN block.
+ *
+ * Calls the key manager to sideload a key into the OTBN hardware block and
+ * waits until the operation is complete before returning. Always uses the
+ * attestation (not sealing) CDI; call this only for attestation keys.
+ *
+ * @param diversification Salt and key version used for the key derivation.
+ * @return `kErrorOk`, or `kErrorKeymgrInternal` if not idle or on failure.
+ */
+OT_WARN_UNUSED_RESULT
+rom_error_t keymgr_generate_attestation_key_otbn(
+    const keymgr_diversification_t diversification);
+
+/**
+ * Clear OTBN's sideloaded key slot.
+ *
+ * The entropy complex needs to be initialized before calling this function, so
+ * that keymgr can use it to clear the slot.
+ *
+ * @return `kErrorOk`, or `kErrorKeymgrInternal` if not idle or readback fails.
+ */
+OT_WARN_UNUSED_RESULT
+rom_error_t keymgr_sideload_clear_otbn(void);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/sw/device/silicon_creator/lib/drivers/keymgr_unittest.cc b/sw/device/silicon_creator/lib/drivers/keymgr_unittest.cc
index 02ad25df0ea39..9916f1c4457b0 100644
--- a/sw/device/silicon_creator/lib/drivers/keymgr_unittest.cc
+++ b/sw/device/silicon_creator/lib/drivers/keymgr_unittest.cc
@@ -36,6 +36,39 @@ class KeymgrTest : public rom_test::RomTest {
 
     EXPECT_SEC_READ32(base_ + KEYMGR_WORKING_STATE_REG_OFFSET, km_state);
   }
+  void ExpectIdleCheck(uint32_t op_status) {
+    EXPECT_ABS_READ32(base_ + KEYMGR_OP_STATUS_REG_OFFSET, op_status);
+  }
+  void ExpectDiversificationWrite(keymgr_diversification_t diversification) {
+    EXPECT_ABS_WRITE32(base_ + KEYMGR_KEY_VERSION_REG_OFFSET,
+                       diversification.version);
+    EXPECT_ABS_WRITE32(base_ + KEYMGR_SALT_0_REG_OFFSET,
+                       diversification.salt[0]);
+    EXPECT_ABS_WRITE32(base_ + KEYMGR_SALT_1_REG_OFFSET,
+                       diversification.salt[1]);
+    EXPECT_ABS_WRITE32(base_ + KEYMGR_SALT_2_REG_OFFSET,
+                       diversification.salt[2]);
+    EXPECT_ABS_WRITE32(base_ + KEYMGR_SALT_3_REG_OFFSET,
+                       diversification.salt[3]);
+    EXPECT_ABS_WRITE32(base_ + KEYMGR_SALT_4_REG_OFFSET,
+                       diversification.salt[4]);
+    EXPECT_ABS_WRITE32(base_ + KEYMGR_SALT_5_REG_OFFSET,
+                       diversification.salt[5]);
+    EXPECT_ABS_WRITE32(base_ + KEYMGR_SALT_6_REG_OFFSET,
+                       diversification.salt[6]);
+    EXPECT_ABS_WRITE32(base_ + KEYMGR_SALT_7_REG_OFFSET,
+                       diversification.salt[7]);
+  }
+  void ExpectWaitUntilDone(size_t busy_cycles, uint32_t end_status) {
+    for (size_t i = 0; i < busy_cycles; i++) {
+      EXPECT_ABS_READ32(base_ + KEYMGR_OP_STATUS_REG_OFFSET,
+                        KEYMGR_OP_STATUS_STATUS_VALUE_WIP);
+      EXPECT_ABS_WRITE32(base_ + KEYMGR_OP_STATUS_REG_OFFSET,
+                         KEYMGR_OP_STATUS_STATUS_VALUE_WIP);
+    }
+    EXPECT_ABS_READ32(base_ + KEYMGR_OP_STATUS_REG_OFFSET, end_status);
+    EXPECT_ABS_WRITE32(base_ + KEYMGR_OP_STATUS_REG_OFFSET, end_status);
+  }
   uint32_t base_ = TOP_DARJEELING_KEYMGR_BASE_ADDR;
   SwBindingCfg cfg_ = {
       .max_key_ver = 0xA5A5A5A5,
@@ -162,5 +195,132 @@ TEST_F(KeymgrTest, CheckStateInvalidResponse) {
             kErrorKeymgrInternal);
 }
 
+TEST_F(KeymgrTest, GenAttestationKey) {
+  keymgr_diversification_t test_diversification = {
+      .salt = {0xf0f1f2f3, 0xf4f5f6f7, 0xf8f9fafb, 0xfcfdfeff, 0xd0d1d2d3,
+               0xd4d5d6d7, 0xd8d9dadb, 0xdcdddedf},
+      .version = cfg_.max_key_ver - 1,
+  };
+
+  ExpectIdleCheck(KEYMGR_OP_STATUS_STATUS_VALUE_IDLE);
+  EXPECT_ABS_WRITE32_SHADOWED(
+      base_ + KEYMGR_CONTROL_SHADOWED_REG_OFFSET,
+      {
+          {KEYMGR_CONTROL_SHADOWED_DEST_SEL_OFFSET,
+           KEYMGR_CONTROL_SHADOWED_DEST_SEL_VALUE_OTBN},
+          {KEYMGR_CONTROL_SHADOWED_CDI_SEL_BIT, true},
+          {KEYMGR_CONTROL_SHADOWED_OPERATION_OFFSET,
+           KEYMGR_CONTROL_SHADOWED_OPERATION_VALUE_GENERATE_HW_OUTPUT},
+      });
+  ExpectDiversificationWrite(test_diversification);
+  EXPECT_ABS_WRITE32(base_ + KEYMGR_START_REG_OFFSET,
+                     {
+                         {KEYMGR_START_EN_BIT, true},
+                     });
+  ExpectWaitUntilDone(/*busy_cycles=*/2,
+                      KEYMGR_OP_STATUS_STATUS_VALUE_DONE_SUCCESS);
+
+  EXPECT_EQ(keymgr_generate_attestation_key_otbn(test_diversification),
+            kErrorOk);
+}
+
+TEST_F(KeymgrTest, GenAttestationKeyNotIdle) {
+  keymgr_diversification_t test_diversification = {
+      .salt = {0xf0f1f2f3, 0xf4f5f6f7, 0xf8f9fafb, 0xfcfdfeff, 0xd0d1d2d3,
+               0xd4d5d6d7, 0xd8d9dadb, 0xdcdddedf},
+      .version = cfg_.max_key_ver - 1,
+  };
+
+  ExpectIdleCheck(KEYMGR_OP_STATUS_STATUS_VALUE_WIP);
+  EXPECT_EQ(keymgr_generate_attestation_key_otbn(test_diversification),
+            kErrorKeymgrInternal);
+  ExpectIdleCheck(KEYMGR_OP_STATUS_STATUS_VALUE_DONE_ERROR);
+  EXPECT_EQ(keymgr_generate_attestation_key_otbn(test_diversification),
+            kErrorKeymgrInternal);
+  ExpectIdleCheck(KEYMGR_OP_STATUS_STATUS_VALUE_DONE_SUCCESS);
+  EXPECT_EQ(keymgr_generate_attestation_key_otbn(test_diversification),
+            kErrorKeymgrInternal);
+}
+
+TEST_F(KeymgrTest, GenAttestationKeyError) {
+  keymgr_diversification_t test_diversification = {
+      .salt = {0xf0f1f2f3, 0xf4f5f6f7, 0xf8f9fafb, 0xfcfdfeff, 0xd0d1d2d3,
+               0xd4d5d6d7, 0xd8d9dadb, 0xdcdddedf},
+      .version = cfg_.max_key_ver - 1,
+  };
+  uint32_t err_code = 0x1;
+
+  ExpectIdleCheck(KEYMGR_OP_STATUS_STATUS_VALUE_IDLE);
+  EXPECT_ABS_WRITE32_SHADOWED(
+      base_ + KEYMGR_CONTROL_SHADOWED_REG_OFFSET,
+      {
+          {KEYMGR_CONTROL_SHADOWED_DEST_SEL_OFFSET,
+           KEYMGR_CONTROL_SHADOWED_DEST_SEL_VALUE_OTBN},
+          {KEYMGR_CONTROL_SHADOWED_CDI_SEL_BIT, true},
+          {KEYMGR_CONTROL_SHADOWED_OPERATION_OFFSET,
+           KEYMGR_CONTROL_SHADOWED_OPERATION_VALUE_GENERATE_HW_OUTPUT},
+      });
+  ExpectDiversificationWrite(test_diversification);
+  EXPECT_ABS_WRITE32(base_ + KEYMGR_START_REG_OFFSET,
+                     {
+                         {KEYMGR_START_EN_BIT, true},
+                     });
+  ExpectWaitUntilDone(/*busy_cycles=*/2,
+                      KEYMGR_OP_STATUS_STATUS_VALUE_DONE_ERROR);
+  EXPECT_ABS_READ32(base_ + KEYMGR_ERR_CODE_REG_OFFSET, err_code);
+  EXPECT_ABS_WRITE32(base_ + KEYMGR_ERR_CODE_REG_OFFSET, err_code);
+
+  EXPECT_EQ(keymgr_generate_attestation_key_otbn(test_diversification),
+            kErrorKeymgrInternal);
+}
+
+TEST_F(KeymgrTest, SideloadClearOtbn) {
+  ExpectIdleCheck(KEYMGR_OP_STATUS_STATUS_VALUE_IDLE);
+  EXPECT_ABS_WRITE32(base_ + KEYMGR_SIDELOAD_CLEAR_REG_OFFSET,
+                     {
+                         {KEYMGR_SIDELOAD_CLEAR_VAL_OFFSET,
+                          KEYMGR_SIDELOAD_CLEAR_VAL_VALUE_OTBN},
+                     });
+  EXPECT_ABS_READ32(base_ + KEYMGR_SIDELOAD_CLEAR_REG_OFFSET,
+                    {
+                        {KEYMGR_SIDELOAD_CLEAR_VAL_OFFSET,
+                         KEYMGR_SIDELOAD_CLEAR_VAL_VALUE_OTBN},
+                    });
+  EXPECT_ABS_WRITE32(base_ + KEYMGR_SIDELOAD_CLEAR_REG_OFFSET,
+                     {
+                         {KEYMGR_SIDELOAD_CLEAR_VAL_OFFSET,
+                          KEYMGR_SIDELOAD_CLEAR_VAL_VALUE_NONE},
+                     });
+
+  EXPECT_EQ(keymgr_sideload_clear_otbn(), kErrorOk);
+}
+
+TEST_F(KeymgrTest, SideloadClearOtbnNotIdle) {
+  ExpectIdleCheck(KEYMGR_OP_STATUS_STATUS_VALUE_WIP);
+  EXPECT_EQ(keymgr_sideload_clear_otbn(), kErrorKeymgrInternal);
+  ExpectIdleCheck(KEYMGR_OP_STATUS_STATUS_VALUE_DONE_SUCCESS);
+  EXPECT_EQ(keymgr_sideload_clear_otbn(), kErrorKeymgrInternal);
+  ExpectIdleCheck(KEYMGR_OP_STATUS_STATUS_VALUE_DONE_ERROR);
+  EXPECT_EQ(keymgr_sideload_clear_otbn(), kErrorKeymgrInternal);
+}
+
+TEST_F(KeymgrTest, SideloadClearOtbnReadbackMismatch) {
+  ExpectIdleCheck(KEYMGR_OP_STATUS_STATUS_VALUE_IDLE);
+  EXPECT_ABS_WRITE32(base_ + KEYMGR_SIDELOAD_CLEAR_REG_OFFSET,
+                     {
+                         {KEYMGR_SIDELOAD_CLEAR_VAL_OFFSET,
+                          KEYMGR_SIDELOAD_CLEAR_VAL_VALUE_OTBN},
+                     });
+
+  // Readback does not match the value written.
+  EXPECT_ABS_READ32(base_ + KEYMGR_SIDELOAD_CLEAR_REG_OFFSET,
+                    {
+                        {KEYMGR_SIDELOAD_CLEAR_VAL_OFFSET,
+                         KEYMGR_SIDELOAD_CLEAR_VAL_VALUE_AES},
+                    });
+
+  EXPECT_EQ(keymgr_sideload_clear_otbn(), kErrorKeymgrInternal);
+}
+
 }  // namespace
 }  // namespace keymgr_unittest
diff --git a/sw/device/silicon_creator/lib/drivers/otbn.h b/sw/device/silicon_creator/lib/drivers/otbn.h
index fab051a2a1f4c..f76f27dc8192d 100644
--- a/sw/device/silicon_creator/lib/drivers/otbn.h
+++ b/sw/device/silicon_creator/lib/drivers/otbn.h
@@ -210,7 +210,7 @@ typedef struct otbn_app {
   ((uint32_t)OTBN_SYMBOL_ADDR(app_name, symbol_name))
 
 /**
- * (Re-)loads the RSA application into OTBN.
+ * (Re-)loads an application into OTBN.
  *
  * Load the application image with both instruction and data segments into
  * OTBN.
diff --git a/sw/lib/sw/device/silicon_creator/BUILD b/sw/lib/sw/device/silicon_creator/BUILD
index ae85b613a7213..42bf8db1b991d 100644
--- a/sw/lib/sw/device/silicon_creator/BUILD
+++ b/sw/lib/sw/device/silicon_creator/BUILD
@@ -140,6 +140,29 @@ dual_cc_library(
     ),
 )
 
+cc_library(
+    name = "attestation",
+    hdrs = ["attestation.h"],
+)
+
+cc_library(
+    name = "otbn_boot_services",
+    srcs = ["otbn_boot_services.c"],
+    hdrs = ["otbn_boot_services.h"],
+    # This target uses OTBN pointers internally, so it cannot work host-side.
+    target_compatible_with = [OPENTITAN_CPU],
+    deps = [
+        ":attestation",
+        "//sw/device/silicon_creator/lib/drivers:hmac",
+        "//sw/device/silicon_creator/lib/drivers:keymgr",
+        "//sw/device/silicon_creator/lib/drivers:otbn",
+        "//sw/lib/sw/device/base:macros",
+        "//sw/lib/sw/device/silicon_creator:error",
+        "//sw/lib/sw/device/silicon_creator/base:sec_mmio",
+        "//sw/lib/sw/device/silicon_creator/sigverify:rsa_key",
+    ],
+)
+
 exports_files([
     "boot_data.h",
     "boot_data.c",
diff --git a/sw/lib/sw/device/silicon_creator/attestation.h b/sw/lib/sw/device/silicon_creator/attestation.h
new file mode 100644
index 0000000000000..720161d2f8212
--- /dev/null
+++ b/sw/lib/sw/device/silicon_creator/attestation.h
@@ -0,0 +1,86 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef OPENTITAN_SW_LIB_SW_DEVICE_SILICON_CREATOR_ATTESTATION_H_
+#define OPENTITAN_SW_LIB_SW_DEVICE_SILICON_CREATOR_ATTESTATION_H_
+
+#include "sw/lib/sw/device/silicon_creator/error.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+enum {
+  /**
+   * Size of the additional seed for attestation key generation in bits.
+   */
+  kAttestationSeedBits = 320,
+  /**
+   * Size of the additional seed for attestation key generation in bytes.
+   */
+  kAttestationSeedBytes = kAttestationSeedBits / 8,
+  /**
+   * Size of the additional seed for attestation key generation in 32b words.
+   */
+  kAttestationSeedWords = kAttestationSeedBytes / sizeof(uint32_t),
+  /**
+   * Size of a coordinate for an attestation public key in bits.
+   */
+  kAttestationPublicKeyCoordBits = 256,
+  /**
+   * Size of a coordinate for an attestation public key in bytes.
+   */
+  kAttestationPublicKeyCoordBytes = kAttestationPublicKeyCoordBits / 8,
+  /**
+   * Size of a coordinate for an attestation public key in 32b words.
+   */
+  kAttestationPublicKeyCoordWords =
+      kAttestationPublicKeyCoordBytes / sizeof(uint32_t),
+  /**
+   * Size of an attestation signature in bits.
+   */
+  kAttestationSignatureBits = 512,
+  /**
+   * Size of an attestation signature in bytes.
+   */
+  kAttestationSignatureBytes = kAttestationSignatureBits / 8,
+  /**
+   * Size of an attestation signature in 32b words.
+   */
+  kAttestationSignatureWords = kAttestationSignatureBytes / sizeof(uint32_t),
+};
+
+/**
+ * Holds an additional seed for use in attestation key generation.
+ */
+typedef struct attestation_seed {
+  uint32_t seed[kAttestationSeedWords];
+} attestation_seed_t;
+
+/**
+ * Holds an attestation public key (ECDSA-P256).
+ */
+typedef struct attestation_public_key {
+  /**
+   * Affine x-coordinate of the point.
+   */
+  uint32_t x[kAttestationPublicKeyCoordWords];
+  /**
+   * Affine y-coordinate of the point.
+   */
+  uint32_t y[kAttestationPublicKeyCoordWords];
+} attestation_public_key_t;
+
+/**
+ * Holds an attestation signature (ECDSA-P256).
+ */
+typedef struct attestation_signature {
+  uint32_t sig[kAttestationSignatureWords];
+} attestation_signature_t;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // OPENTITAN_SW_LIB_SW_DEVICE_SILICON_CREATOR_ATTESTATION_H_
diff --git a/sw/lib/sw/device/silicon_creator/otbn_boot_services.c b/sw/lib/sw/device/silicon_creator/otbn_boot_services.c
new file mode 100644
index 0000000000000..210fe11f83952
--- /dev/null
+++ b/sw/lib/sw/device/silicon_creator/otbn_boot_services.c
@@ -0,0 +1,190 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "sw/lib/sw/device/silicon_creator/otbn_boot_services.h"
+
+#include "sw/device/silicon_creator/lib/drivers/hmac.h"
+#include "sw/device/silicon_creator/lib/drivers/keymgr.h"
+#include "sw/device/silicon_creator/lib/drivers/otbn.h"
+#include "sw/lib/sw/device/silicon_creator/attestation.h"
+#include "sw/lib/sw/device/silicon_creator/base/sec_mmio.h"
+
+OTBN_DECLARE_APP_SYMBOLS(boot);             // The OTBN boot-services app.
+OTBN_DECLARE_SYMBOL_ADDR(boot, mode);       // Application mode.
+OTBN_DECLARE_SYMBOL_ADDR(boot, rsa_mod);    // RSA modulus.
+OTBN_DECLARE_SYMBOL_ADDR(boot, rsa_m0inv);  // RSA Montgomery constant.
+OTBN_DECLARE_SYMBOL_ADDR(boot, rsa_inout);  // RSA input/output buffer.
+OTBN_DECLARE_SYMBOL_ADDR(boot, msg);        // ECDSA message digest.
+OTBN_DECLARE_SYMBOL_ADDR(boot, x);          // ECDSA public key x-coordinate.
+OTBN_DECLARE_SYMBOL_ADDR(boot, y);          // ECDSA public key y-coordinate.
+OTBN_DECLARE_SYMBOL_ADDR(boot, r);          // ECDSA signature component r.
+OTBN_DECLARE_SYMBOL_ADDR(boot, s);          // ECDSA signature component s.
+OTBN_DECLARE_SYMBOL_ADDR(
+    boot, attestation_additional_seed);  // Additional seed for ECDSA keygen.
+
+static const otbn_app_t kOtbnAppBoot = OTBN_APP_T_INIT(boot);
+static const otbn_addr_t kOtbnVarBootMode = OTBN_ADDR_T_INIT(boot, mode);
+static const otbn_addr_t kOtbnVarBootRsaMod = OTBN_ADDR_T_INIT(boot, rsa_mod);
+static const otbn_addr_t kOtbnVarBootRsaM0inv =
+    OTBN_ADDR_T_INIT(boot, rsa_m0inv);
+static const otbn_addr_t kOtbnVarBootRsaInout =
+    OTBN_ADDR_T_INIT(boot, rsa_inout);
+static const otbn_addr_t kOtbnVarBootMsg = OTBN_ADDR_T_INIT(boot, msg);
+static const otbn_addr_t kOtbnVarBootX = OTBN_ADDR_T_INIT(boot, x);
+static const otbn_addr_t kOtbnVarBootY = OTBN_ADDR_T_INIT(boot, y);
+static const otbn_addr_t kOtbnVarBootR = OTBN_ADDR_T_INIT(boot, r);
+static const otbn_addr_t kOtbnVarBootS = OTBN_ADDR_T_INIT(boot, s);
+static const otbn_addr_t kOtbnVarBootAttestationAdditionalSeed =
+    OTBN_ADDR_T_INIT(boot, attestation_additional_seed);
+
+enum {
+  /*
+   * Mode is represented by a single word.
+   */
+  kOtbnBootModeWords = 1,
+  /*
+   * Mode to run RSA modular exponentiation.
+   *
+   * Value taken from `boot.s`.
+   */
+  kOtbnBootModeSecBootModexp = 0x7d3,
+  /*
+   * Mode to generate an attestation keypair.
+   *
+   * Value taken from `boot.s`.
+   */
+  kOtbnBootModeAttestationKeygen = 0x2bf,
+  /*
+   * Mode to endorse a message with a saved private key.
+   *
+   * Value taken from `boot.s`.
+   */
+  kOtbnBootModeAttestationEndorse = 0x5e8,
+  /*
+   * Mode to save an attestation private key.
+   *
+   * Value taken from `boot.s`.
+   */
+  kOtbnBootModeAttestationKeySave = 0x64d,
+};
+
+rom_error_t otbn_boot_app_load(void) { return otbn_load_app(kOtbnAppBoot); }
+
+rom_error_t otbn_boot_attestation_keygen(
+    const attestation_seed_t *additional_seed,
+    keymgr_diversification_t diversification,
+    attestation_public_key_t *public_key) {
+  // Trigger key manager to sideload the attestation key into OTBN.
+  HARDENED_RETURN_IF_ERROR(
+      keymgr_generate_attestation_key_otbn(diversification));
+
+  // Write the mode.
+  uint32_t mode = kOtbnBootModeAttestationKeygen;
+  HARDENED_RETURN_IF_ERROR(
+      otbn_dmem_write(kOtbnBootModeWords, &mode, kOtbnVarBootMode));
+
+  // Write the additional seed.
+  HARDENED_RETURN_IF_ERROR(
+      otbn_dmem_write(kAttestationSeedWords, additional_seed->seed,
+                      kOtbnVarBootAttestationAdditionalSeed));
+
+  // Run the OTBN program (blocks until OTBN is done).
+  HARDENED_RETURN_IF_ERROR(otbn_execute());
+  SEC_MMIO_WRITE_INCREMENT(kOtbnSecMmioExecute);
+
+  // TODO(#20023): Check the instruction count register (see `mod_exp_otbn`).
+
+  // Retrieve the public key.
+  HARDENED_RETURN_IF_ERROR(otbn_dmem_read(kAttestationPublicKeyCoordWords,
+                                          kOtbnVarBootX, public_key->x));
+  HARDENED_RETURN_IF_ERROR(otbn_dmem_read(kAttestationPublicKeyCoordWords,
+                                          kOtbnVarBootY, public_key->y));
+
+  return kErrorOk;
+}
+
+rom_error_t otbn_boot_attestation_key_save(
+    const attestation_seed_t *additional_seed,
+    keymgr_diversification_t diversification) {
+  // Trigger key manager to sideload the attestation key into OTBN.
+  HARDENED_RETURN_IF_ERROR(
+      keymgr_generate_attestation_key_otbn(diversification));
+
+  // Write the mode.
+  uint32_t mode = kOtbnBootModeAttestationKeySave;
+  HARDENED_RETURN_IF_ERROR(
+      otbn_dmem_write(kOtbnBootModeWords, &mode, kOtbnVarBootMode));
+
+  // Write the additional seed.
+  HARDENED_RETURN_IF_ERROR(
+      otbn_dmem_write(kAttestationSeedWords, additional_seed->seed,
+                      kOtbnVarBootAttestationAdditionalSeed));
+
+  // Run the OTBN program (blocks until OTBN is done).
+  HARDENED_RETURN_IF_ERROR(otbn_execute());
+  SEC_MMIO_WRITE_INCREMENT(kOtbnSecMmioExecute);
+
+  // TODO(#20023): Check the instruction count register (see `mod_exp_otbn`).
+
+  return kErrorOk;
+}
+
+rom_error_t otbn_boot_attestation_key_clear(void) {
+  return otbn_dmem_sec_wipe();
+}
+
+rom_error_t otbn_boot_attestation_endorse(const hmac_digest_t *digest,
+                                          attestation_signature_t *sig) {
+  // Write the mode.
+  uint32_t mode = kOtbnBootModeAttestationEndorse;
+  HARDENED_RETURN_IF_ERROR(
+      otbn_dmem_write(kOtbnBootModeWords, &mode, kOtbnVarBootMode));
+
+  // Write the message digest.
+  HARDENED_RETURN_IF_ERROR(
+      otbn_dmem_write(kHmacDigestNumWords, digest->digest, kOtbnVarBootMsg));
+
+  // Run the OTBN program (blocks until OTBN is done).
+  HARDENED_RETURN_IF_ERROR(otbn_execute());
+  SEC_MMIO_WRITE_INCREMENT(kOtbnSecMmioExecute);
+
+  // TODO(#20023): Check the instruction count register (see `mod_exp_otbn`).
+
+  // Retrieve the signature (in two parts, r and s).
+  size_t half_num_words = kAttestationSignatureWords / 2;
+  uint32_t *r_dest = sig->sig;
+  uint32_t *s_dest = &sig->sig[half_num_words];
+  HARDENED_RETURN_IF_ERROR(
+      otbn_dmem_read(half_num_words, kOtbnVarBootR, r_dest));
+  HARDENED_RETURN_IF_ERROR(
+      otbn_dmem_read(half_num_words, kOtbnVarBootS, s_dest));
+
+  return kErrorOk;
+}
+
+rom_error_t otbn_boot_sigverify_mod_exp(const sigverify_rsa_key_t *key,
+                                        const sigverify_rsa_buffer_t *sig,
+                                        sigverify_rsa_buffer_t *result) {
+  // Write the mode; `boot.s` dispatches on it at every execution.
+  uint32_t mode = kOtbnBootModeSecBootModexp;
+  HARDENED_RETURN_IF_ERROR(
+      otbn_dmem_write(kOtbnBootModeWords, &mode, kOtbnVarBootMode));
+  // Set the modulus (n).
+  HARDENED_RETURN_IF_ERROR(
+      otbn_dmem_write(kSigVerifyRsaNumWords, key->n.data, kOtbnVarBootRsaMod));
+  // Set the encoded message.
+  HARDENED_RETURN_IF_ERROR(
+      otbn_dmem_write(kSigVerifyRsaNumWords, sig->data, kOtbnVarBootRsaInout));
+  // Set the precomputed constant m0_inv.
+  HARDENED_RETURN_IF_ERROR(otbn_dmem_write(kOtbnWideWordNumWords, key->n0_inv,
+                                           kOtbnVarBootRsaM0inv));
+
+  // Start the OTBN routine (blocks until OTBN is done).
+  HARDENED_RETURN_IF_ERROR(otbn_execute());
+  SEC_MMIO_WRITE_INCREMENT(kOtbnSecMmioExecute);
+  // TODO(#20023): Check the instruction count register (see `mod_exp_otbn`).
+  // Read recovered message out of OTBN dmem.
+  return otbn_dmem_read(kSigVerifyRsaNumWords, kOtbnVarBootRsaInout,
+                        result->data);
+}
diff --git a/sw/lib/sw/device/silicon_creator/otbn_boot_services.h b/sw/lib/sw/device/silicon_creator/otbn_boot_services.h
new file mode 100644
index 0000000000000..d5e66759a701c
--- /dev/null
+++ b/sw/lib/sw/device/silicon_creator/otbn_boot_services.h
@@ -0,0 +1,138 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef OPENTITAN_SW_LIB_SW_DEVICE_SILICON_CREATOR_OTBN_BOOT_SERVICES_H_
+#define OPENTITAN_SW_LIB_SW_DEVICE_SILICON_CREATOR_OTBN_BOOT_SERVICES_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "sw/device/silicon_creator/lib/drivers/hmac.h"
+#include "sw/device/silicon_creator/lib/drivers/keymgr.h"
+#include "sw/lib/sw/device/silicon_creator/attestation.h"
+#include "sw/lib/sw/device/silicon_creator/sigverify/rsa_key.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+/**
+ * Loads the OTBN boot-services application.
+ *
+ * Loads the OTBN program that runs attestation and code-signature
+ * verification. The program can later be cleared by wiping OTBN's IMEM and
+ * DMEM, or by loading a different OTBN application.
+ *
+ * @return The result of the operation.
+ */
+OT_WARN_UNUSED_RESULT
+rom_error_t otbn_boot_app_load(void);
+
+/**
+ * Generate an attestation public key from a keymgr-derived secret.
+ *
+ * This routine triggers the key manager to sideload key material into OTBN,
+ * and also takes in an extra seed to XOR with the key material. The final
+ * private key is:
+ *   d = (additional_seed ^ keymgr_seed) mod n
+ * ...where n is the P256 curve order. The public key is d*G, where G is the
+ * P256 base point.
+ *
+ * The `additional_seed` is expected to be the output from a specially seeded
+ * DRBG. It must be fully independent from the key manager seed.
+ *
+ * Expects the OTBN boot-services program to already be loaded; see
+ * `otbn_boot_app_load`.
+ *
+ * @param additional_seed Seed material from DRBG.
+ * @param diversification Salt and version information for key manager.
+ * @param[out] public_key Attestation public key.
+ * @return The result of the operation.
+ */
+OT_WARN_UNUSED_RESULT
+rom_error_t otbn_boot_attestation_keygen(
+    const attestation_seed_t *additional_seed,
+    keymgr_diversification_t diversification,
+    attestation_public_key_t *public_key);
+
+/**
+ * Saves an attestation private key to OTBN's scratchpad.
+ *
+ * This routine takes the same arguments as `otbn_boot_attestation_keygen`, but
+ * instead of computing the public key, it computes only the private key and
+ * saves it to OTBN's scratchpad memory.
+ *
+ * Expects the OTBN boot-services program to already be loaded; see
+ * `otbn_boot_app_load`.
+ *
+ * @param additional_seed Seed material from DRBG.
+ * @param diversification Salt and version information for key manager.
+ * @return The result of the operation.
+ */
+OT_WARN_UNUSED_RESULT
+rom_error_t otbn_boot_attestation_key_save(
+    const attestation_seed_t *additional_seed,
+    keymgr_diversification_t diversification);
+
+/**
+ * Clears any saved attestation key from OTBN's scratchpad.
+ *
+ * This routine clears OTBN's DMEM. If called after
+ * `otbn_boot_attestation_key_save`, it will clear the saved key.
+ *
+ * @return The result of the operation.
+ */
+OT_WARN_UNUSED_RESULT
+rom_error_t otbn_boot_attestation_key_clear(void);
+
+/**
+ * Signs the message with the saved attestation key, and clears the key.
+ *
+ * Must be called when there is a saved attestation key in OTBN's scratchpad;
+ * use `otbn_boot_attestation_key_save` to store one.
+ *
+ * The intended purpose of this function is to sign the current stage's
+ * attestation certificate with the private key of the previous stage. The
+ * caller should hash the certificate with SHA-256 before calling this
+ * function.
+ *
+ * Expects the OTBN boot-services program to already be loaded; see
+ * `otbn_boot_app_load`.
+ *
+ * @param digest Digest to sign.
+ * @param[out] sig Resulting signature.
+ * @return The result of the operation.
+ */
+OT_WARN_UNUSED_RESULT
+rom_error_t otbn_boot_attestation_endorse(const hmac_digest_t *digest,
+                                          attestation_signature_t *sig);
+
+/**
+ * Computes the modular exponentiation of an RSA signature on OTBN.
+ *
+ * Given an RSA public key and sig, this function computes sig^e mod n using
+ * Montgomery multiplication, where
+ * - sig is an RSA signature,
+ * - e and n are the exponent and the modulus of the key, respectively.
+ *
+ * The key exponent is always 65537; no other exponents are supported.
+ *
+ * Expects the OTBN boot-services program to already be loaded; see
+ * `otbn_boot_app_load`.
+ *
+ * @param key An RSA public key.
+ * @param sig Buffer that holds the signature, little-endian.
+ * @param[out] result Buffer to write the result to, little-endian.
+ * @return The result of the operation.
+ */
+OT_WARN_UNUSED_RESULT
+rom_error_t otbn_boot_sigverify_mod_exp(const sigverify_rsa_key_t *key,
+                                        const sigverify_rsa_buffer_t *sig,
+                                        sigverify_rsa_buffer_t *result);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // OPENTITAN_SW_LIB_SW_DEVICE_SILICON_CREATOR_OTBN_BOOT_SERVICES_H_
diff --git a/sw/otbn/crypto/BUILD b/sw/otbn/crypto/BUILD
index aded20c8ec6d5..70783f9aeddff 100644
--- a/sw/otbn/crypto/BUILD
+++ b/sw/otbn/crypto/BUILD
@@ -6,6 +6,19 @@ load("//rules:otbn.bzl", "otbn_binary", "otbn_library")
 
 package(default_visibility = ["//visibility:public"])
 
+otbn_binary(
+    name = "boot",
+    srcs = [
+        "boot.s",
+    ],
+    deps = [
+        ":p256_base",
+        ":p256_sign",
+        ":rsa_verify_3072",
+        ":rsa_verify_3072_rr",
+    ],
+)
+
 otbn_library(
     name = "ed25519",
     srcs = [
@@ -70,9 +83,37 @@ otbn_library(
 )
 
 otbn_library(
-    name = "p256",
+    name = "p256_shared_key",
+    srcs = [
+        "p256_shared_key.s",
+    ],
+)
+
+otbn_library(
+    name = "p256_base",
     srcs = [
-        "p256.s",
+        "p256_base.s",
+    ],
+)
+
+otbn_library(
+    name = "p256_isoncurve",
+    srcs = [
+        "p256_isoncurve.s",
+    ],
+)
+
+otbn_library(
+    name = "p256_sign",
+    srcs = [
+        "p256_sign.s",
+    ],
+)
+
+otbn_library(
+    name = "p256_verify",
+    srcs = [
+        "p256_verify.s",
     ],
 )
 
@@ -82,7 +123,9 @@ otbn_binary(
         "p256_ecdh.s",
     ],
     deps = [
-        ":p256",
+        ":p256_base",
+        ":p256_isoncurve",
+        ":p256_shared_key",
     ],
 )
 
@@ -92,7 +135,10 @@ otbn_binary(
         "p256_ecdsa.s",
     ],
     deps = [
-        ":p256",
+        ":p256_base",
+        ":p256_isoncurve",
+        ":p256_sign",
+        ":p256_verify",
     ],
 )
 
@@ -103,6 +149,20 @@ otbn_library(
     ],
 )
 
+otbn_library(
+    name = "p384_a2b",
+    srcs = [
+        "p384_a2b.s",
+    ],
+)
+
+otbn_library(
+    name = "p384_isoncurve",
+    srcs = [
+        "p384_isoncurve.s",
+    ],
+)
+
 otbn_library(
     name = "p384_sign",
     srcs = [
@@ -110,6 +170,41 @@ otbn_library(
     ],
 )
 
+otbn_library(
+    name = "p384_internal_mult",
+    srcs = [
+        "p384_internal_mult.s",
+    ],
+)
+
+otbn_library(
+    name = "p384_keygen",
+    srcs = [
+        "p384_keygen.s",
+    ],
+)
+
+otbn_library(
+    name = "p384_base_mult",
+    srcs = [
+        "p384_base_mult.s",
+    ],
+)
+
+otbn_library(
+    name = "p384_modinv",
+    srcs = [
+        "p384_modinv.s",
+    ],
+)
+
+otbn_library(
+    name = "p384_scalar_mult",
+    srcs = [
+        "p384_scalar_mult.s",
+    ],
+)
+
 otbn_library(
     name = "p384_verify",
     srcs = [
@@ -226,7 +321,10 @@ otbn_binary(
         "p256_ecdsa_sca.s",
     ],
     deps = [
-        ":p256",
+        ":p256_base",
+        ":p256_isoncurve",
+        ":p256_sign",
+        ":p256_verify",
     ],
 )
 
@@ -236,7 +334,7 @@ otbn_binary(
         "p256_key_from_seed_sca.s",
     ],
     deps = [
-        ":p256",
+        ":p256_base",
     ],
 )
 
@@ -246,7 +344,7 @@ otbn_binary(
         "p256_mod_inv_sca.s",
     ],
     deps = [
-        ":p256",
+        ":p256_base",
     ],
 )
 
@@ -257,10 +355,80 @@ otbn_binary(
     ],
     deps = [
         ":p384_base",
+        ":p384_internal_mult",
+        ":p384_modinv",
+        ":p384_sign",
+    ],
+)
+
+otbn_binary(
+    name = "p384_curve_point_valid",
+    srcs = [
+        "p384_curve_point_valid.s",
+    ],
+    deps = [
+        ":p384_base",
+        ":p384_isoncurve",
+    ],
+)
+
+otbn_binary(
+    name = "p384_ecdh",
+    srcs = [
+        "p384_ecdh.s",
+    ],
+    deps = [
+        ":p384_a2b",
+        ":p384_base",
+        ":p384_base_mult",
+        ":p384_internal_mult",
+        ":p384_keygen",
+        ":p384_scalar_mult",
+    ],
+)
+
+otbn_binary(
+    name = "p384_ecdsa_keygen",
+    srcs = [
+        "p384_ecdsa_keygen.s",
+    ],
+    deps = [
+        ":p384_base",
+        ":p384_base_mult",
+        ":p384_internal_mult",
+        ":p384_keygen",
+    ],
+)
+
+otbn_binary(
+    name = "p384_ecdsa_sign",
+    srcs = [
+        "p384_ecdsa_sign.s",
+    ],
+    deps = [
+        ":p384_base",
+        ":p384_base_mult",
+        ":p384_internal_mult",
+        ":p384_keygen",
+        ":p384_modinv",
         ":p384_sign",
     ],
 )
 
+otbn_binary(
+    name = "p384_ecdsa_verify",
+    srcs = [
+        "p384_ecdsa_verify.s",
+    ],
+    deps = [
+        ":p384_base",
+        ":p384_base_mult",
+        ":p384_internal_mult",
+        ":p384_modinv",
+        ":p384_verify",
+    ],
+)
+
 otbn_library(
     name = "sha256",
     srcs = [
diff --git a/sw/otbn/crypto/boot.s b/sw/otbn/crypto/boot.s
new file mode 100644
index 0000000000000..25f6200010281
--- /dev/null
+++ b/sw/otbn/crypto/boot.s
@@ -0,0 +1,374 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Unified boot-services OTBN program.
+ *
+ * During the boot process, this program should remain loaded. This binary has
+ * the following modes:
+ *   1. MODE_SEC_BOOT_MODEXP: RSA-3072 modexp (to verify a code signature).
+ *   2. MODE_ATTESTATION_KEYGEN: Derive a new attestation keypair (ECDSA-P256).
+ *   3. MODE_ATTESTATION_ENDORSE: Sign with a saved attestation signing key.
+ *   4. MODE_ATTESTATION_KEY_SAVE: Save an attestation signing key.
+ *
+ * Ibex will run `MODE_SEC_BOOT_MODEXP` as part of checking the code
+ * signature of the next boot stage. This mode doesn't interact or interfere
+ * with any other modes, and can be called at any point.
+ *
+ * The attestation modes are more entangled with each other. Part of the
+ * purpose of this program is to store the attestation key of a particular key
+ * manager stage long enough to sign the public key of the next stage, without
+ * rebooting. At each key manager stage, Ibex should:
+ *   - Call `MODE_ATTESTATION_KEYGEN` to get the current public key
+ *   - Construct the attestation certificate for the current stage, including
+ *     the public key
+ *   - Call `MODE_ATTESTATION_ENDORSE` to sign the certificate with the stored
+ *     signing key from the *previous stage* and clear the key
+ *   - Call `MODE_ATTESTATION_KEY_SAVE` to save the current stage's signing
+ *     key, which will later endorse the next stage's certificate
+ *
+ * Of course, in the first stage there is no previous stage signing key and no
+ * certificate, so Ibex should skip the `MODE_ATTESTATION_ENDORSE` step. Ibex
+ * may clear IMEM/DMEM if it needs to run a different OTBN routine (e.g.
+ * signature verification for ownership transfer), but doing so will wipe any
+ * saved keys. This binary is designed so that it should not need to be
+ * cleared and re-loaded on a normal boot.
+ *
+ * The attestation keys are derived from a key manager seed value, which is
+ * XORed with output from a specially seeded DRBG in order to satisfy the FIPS
+ * 186-5 requirement that the seed comes from a DRBG (other FIPS documents say
+ * it is permissible to XOR DRBG output with implementation-specific values, so
+ * the key manager seed is effectively ignored for FIPS compliance).  The saved
+ * signing key is stored in OTBN's scratchpad memory, which is not accessible
+ * to Ibex over the bus.
+ */
+
+/**
+ * Mode magic values, generated with
+ * $ ./util/design/sparse-fsm-encode.py -d 6 -m 4 -n 11 --avoid-zero -s 3357382482
+ *
+ * Call the same utility with the same arguments and a higher -m to generate
+ * additional value(s) without changing the others or sacrificing mutual HD.
+ *
+ * TODO(#17727): in some places the OTBN assembler support for .equ directives
+ * is lacking, so they cannot be used in bignum instructions or pseudo-ops such
+ * as `li`. If support is added, we could use 32-bit values here instead of
+ * 11-bit.
+ */
+.equ MODE_SEC_BOOT_MODEXP, 0x7d3
+.equ MODE_ATTESTATION_KEYGEN, 0x2bf
+.equ MODE_ATTESTATION_ENDORSE, 0x5e8
+.equ MODE_ATTESTATION_KEY_SAVE, 0x64d
+
+.section .text.start
+start:
+  /* Read the mode and tail-call the requested operation. */
+  la    x2, mode
+  lw    x2, 0(x2)
+
+  addi  x3, x0, MODE_SEC_BOOT_MODEXP
+  beq   x2, x3, sec_boot_modexp
+
+  addi  x3, x0, MODE_ATTESTATION_KEYGEN
+  beq   x2, x3, attestation_keygen
+
+  addi  x3, x0, MODE_ATTESTATION_ENDORSE
+  beq   x2, x3, attestation_endorse
+
+  addi  x3, x0, MODE_ATTESTATION_KEY_SAVE
+  beq   x2, x3, attestation_key_save
+
+  /* Invalid mode; fail. */
+  unimp
+  unimp
+  unimp
+
+/**
+ * RSA-3072 modular exponentiation.
+ *
+ * Computes msg = (sig^65537) mod M, where
+ *          sig is the signature
+ *          M is the public key modulus
+ *
+ * Uses the specialized RSA-3072 OTBN modexp implementation to recover an
+ * encoded message from an input signature. Ibex needs to check that the
+ * encoded message matches the encoding of the expected message to complete
+ * signature verification.
+ *
+ * Assumes that the Montgomery constant m0_inv is provided, but computes the RR
+ * constant on the fly. The only exponent supported is e=65537.
+ *
+ * @param[in] dmem[rsa_mod]: Modulus of the RSA public key
+ * @param[in] dmem[rsa_inout]: Signature to check against
+ * @param[in] dmem[rsa_m0inv]: Montgomery constant (-(M^-1)) mod 2^256
+ * @param[out] dmem[rsa_inout]: Recovered message digest
+ */
+sec_boot_modexp:
+  /* Compute R^2 (same for both exponents): dmem[rr] <= R^2 */
+  jal      x1, compute_rr
+
+  /* Set pointers to buffers for modexp. */
+  la        x24, rsa_inout
+  la        x16, rsa_mod
+  la        x23, rsa_inout
+  la        x26, rr
+  la        x17, rsa_m0inv
+
+  /* run modular exponentiation */
+  jal      x1, modexp_var_3072_f4
+
+  ecall
+
+/**
+ * Generate an attestation keypair from a sideloaded seed.
+ *
+ * Takes two input seeds, one from the key manager in the key-sideload slots
+ * and one from DMEM that is expected to be the output of a DRBG and fully
+ * independent from the first. For both seeds, only the first 320 bits are used
+ * and the rest are ignored.
+ *
+ * @param[in]  dmem[attestation_additional_seed]: DRBG output.
+ * @param[out]  dmem[x]: Public key x-coordinate.
+ * @param[out]  dmem[y]: Public key y-coordinate.
+ */
+attestation_keygen:
+  /* Initialize all-zero register. */
+  bn.xor   w31, w31, w31
+
+  /* Generate secret key in shares.
+       w20, w21 <= d0 (first share of secret key)
+       w10, w11 <= d1 (second share of secret key) */
+  jal      x1, attestation_secret_key_from_seed
+
+  /* Call scalar multiplication with base point.
+     R = (x_p, y_p, z_p) = (w8, w9, w10) <= d*G */
+  la        x21, p256_gx
+  la        x22, p256_gy
+  jal       x1, scalar_mult_int
+
+  /* Convert masked result back to affine coordinates.
+     R = (x_a, y_a) = (w11, w12) */
+  jal       x1, proj_to_affine
+
+  /* Store public key in DMEM.
+     dmem[x] <= x_a = w11
+     dmem[y] <= y_a = w12 */
+  li        x2, 11
+  la        x21, x
+  bn.sid    x2++, 0(x21)
+  la        x22, y
+  bn.sid    x2, 0(x22)
+
+  ecall
+
+/**
+ * Sign a message using the saved signing key from the scratchpad.
+ *
+ * Clears the saved key after use, so only one signature is possible with a
+ * saved key.
+ *
+ * @param[in]  dmem[msg]: Message digest (256 bits)
+ * @param[in]   dmem[d0]: First share of private key d (320 bits)
+ * @param[in]   dmem[d1]: Second share of private key d (320 bits)
+ * @param[out]   dmem[r]: Buffer for r component of signature (256 bits)
+ * @param[out]   dmem[s]: Buffer for s component of signature (256 bits)
+ */
+attestation_endorse:
+  /* Generate a fresh random scalar for signing.
+       dmem[k0] <= first share of k
+       dmem[k1] <= second share of k */
+  jal      x1, p256_generate_k
+
+  /* Generate the signature.
+       dmem[r], dmem[s] <= signature */
+  jal      x1, p256_sign
+
+  /* Clear the saved key by overwriting with random data.
+       dmem[d0], dmem[d1] <= RND */
+  li        x20, 20
+  la        x2, d0
+  bn.wsrr   w20, RND
+  bn.sid    x20, 0(x2++)
+  bn.wsrr   w20, RND
+  bn.sid    x20, 0(x2)
+  la        x2, d1
+  bn.wsrr   w20, RND
+  bn.sid    x20, 0(x2++)
+  bn.wsrr   w20, RND
+  bn.sid    x20, 0(x2)
+
+  ecall
+
+/**
+ * Save an attestation signing key to the scratchpad.
+ *
+ * @param[in]  dmem[attestation_additional_seed]: DRBG output.
+ * @param[out]  dmem[d0]: First share of private key (320 bits).
+ * @param[out]  dmem[d1]: Second share of private key (320 bits).
+ */
+attestation_key_save:
+  /* Initialize all-zero register. */
+  bn.xor   w31, w31, w31
+
+  /* Generate secret key in shares.
+       w20, w21 <= d0 (first share of secret key)
+       w10, w11 <= d1 (second share of secret key) */
+  jal      x1, attestation_secret_key_from_seed
+
+  /* Store secret key in DMEM.
+     dmem[d0] <= w20, w21 = d0
+     dmem[d1] <= w10, w11 = d1 */
+  li        x2, 20
+  la        x3, d0
+  bn.sid    x2++, 0(x3)
+  bn.sid    x2, 32(x3)
+  li        x2, 10
+  la        x3, d1
+  bn.sid    x2++, 0(x3)
+  bn.sid    x2, 32(x3)
+
+  ecall
+
+/**
+ * Generate an attestation secret key from a sideloaded seed.
+ *
+ * Takes two input seeds, one from the key manager in the key-sideload slots
+ * and one from DMEM that is expected to be the output of a DRBG and fully
+ * independent from the first. For both seeds, only the first 320 bits are used
+ * and the rest are ignored.
+ *
+ * Returns the key in two 320-bit shares d0 and d1, such that the secret key d
+ * = (d0 + d1) mod n.
+ *
+ * @param[in]   w31: all-zero
+ * @param[in]  dmem[attestation_additional_seed]: DRBG output seed
+ * @param[out]  w20: Lower 256 bits of first share of secret key (d0)
+ * @param[out]  w21: Upper 64 bits of first share of secret key (d0)
+ * @param[out]  w10: Lower 256 bits of second share of secret key (d1)
+ * @param[out]  w11: Upper 64 bits of second share of secret key (d1)
+ *
+ * clobbered registers: x2, x3, x20, w1 to w4, w10, w11, w20 to w29
+ * clobbered flag groups: FG0
+ */
+attestation_secret_key_from_seed:
+  /* Load keymgr seeds from WSRs.
+       w20,w21 <= seed0
+       w10,w11 <= seed1 */
+  bn.wsrr  w20, KEY_S0_L
+  bn.wsrr  w10, KEY_S1_L
+  bn.wsrr  w21, KEY_S0_H
+  bn.wsrr  w11, KEY_S1_H
+
+  /* Load the additional DRBG seed from DMEM into w22, w23 and XOR it with one
+     share of the sideloaded seed (each load must precede its XOR).
+       w20, w21 <= seed0 ^ dmem[attestation_additional_seed] */
+  la       x2, attestation_additional_seed
+  li       x3, 22
+  bn.lid   x3++, 0(x2)
+  bn.xor   w20, w20, w22
+  bn.lid   x3, 32(x2)
+  bn.xor   w21, w21, w23
+
+  /* Tail-call `p256_key_from_seed` to generate secret key shares.
+       w20, w21 <= d0
+       w10, w11 <= d1 */
+  jal      x0, p256_key_from_seed
+
+.bss
+
+/* Operation mode. */
+.globl mode
+.balign 4
+mode:
+.zero 4
+
+/* Input buffer for RSA-3072 modulus. */
+.globl rsa_mod
+.balign 32
+rsa_mod:
+.zero 384
+
+/* Input buffer for precomputed RSA-3072 Montgomery constant:
+      m0' = (-(M^-1)) mod 2^256. */
+.globl rsa_m0inv
+.balign 32
+rsa_m0inv:
+.zero 32
+
+/* Input/output buffer for RSA-3072 modexp:
+     input: signature
+     output: recovered message = (signature ^ 65537) mod M */
+.globl rsa_inout
+.balign 32
+rsa_inout:
+.zero 384
+
+/* Input buffer for an ECDSA-P256 message digest. */
+.globl msg
+.balign 32
+msg:
+.zero 32
+
+/* Output buffer for the first part of an ECDSA-P256 signature. */
+.globl r
+.balign 32
+r:
+.zero 32
+
+/* Output buffer for the second part of an ECDSA-P256 signature. */
+.globl s
+.balign 32
+s:
+.zero 32
+
+/* ECDSA-P256 public key x-coordinate. */
+.globl x
+.balign 32
+x:
+.zero 32
+
+/* ECDSA-P256 public key y-coordinate. */
+.globl y
+.balign 32
+y:
+.zero 32
+
+/* DRBG output to XOR with key manager seed. */
+.globl attestation_additional_seed
+.balign 32
+attestation_additional_seed:
+.zero 64
+
+.section .scratchpad
+
+/* First share of the saved attestation ECDSA-P256 private key (d). */
+.globl d0
+.balign 32
+d0:
+.zero 64
+
+/* Second share of the saved attestation ECDSA-P256 private key (d). */
+.globl d1
+.balign 32
+d1:
+.zero 64
+
+/* First share of the per-signature ECDSA-P256 secret scalar (k). */
+.globl k0
+.balign 32
+k0:
+.zero 64
+
+/* Second share of the per-signature ECDSA-P256 secret scalar (k). */
+.globl k1
+.balign 32
+k1:
+.zero 64
+
+/* Buffer for the squared Montgomery Radix RR = (2^3072)^2 mod M.
+   Populated by the RSA-3072 implementation. */
+.balign 32
+.globl rr
+rr:
+.zero 384
diff --git a/sw/otbn/crypto/div.s b/sw/otbn/crypto/div.s
index ee9597b797840..ef9c94b0a8244 100644
--- a/sw/otbn/crypto/div.s
+++ b/sw/otbn/crypto/div.s
@@ -2,6 +2,9 @@
 /* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
 /* SPDX-License-Identifier: Apache-2.0 */
 
+/* Public interface. */
+.globl div
+
 /**
  * Shift a bignum one bit to the right.
  *
@@ -269,7 +272,6 @@ cond_sub_shifted:
  * clobbered registers: x2 to x5, x8, x23 to x25, w23 to w27
  * clobbered flag groups: FG0
  */
-.globl div
 div:
   /* Initialize quotient to zero.
        dmem[dptr_q..dptr_q+n*32] = 0  */
diff --git a/sw/otbn/crypto/ed25519_scalar.s b/sw/otbn/crypto/ed25519_scalar.s
index c3747ce145f50..135ecff866fe1 100644
--- a/sw/otbn/crypto/ed25519_scalar.s
+++ b/sw/otbn/crypto/ed25519_scalar.s
@@ -35,7 +35,7 @@ sc_init:
   li      x2, 14
   la      x3, ed25519_scalar_L
   bn.lid  x2, 0(x3)
-  bn.wsrw 0x0, w14
+  bn.wsrw MOD, w14
 
   /* Load lower half of precomputed constant mu (260 bits).
        w14 <= mu mod 2^256 */
@@ -188,7 +188,7 @@ sc_reduce:
 
   /* Load L from the MOD register.
        w11 <= WSR[0x0] = MOD = L */
-  bn.wsrr  w11, 0x0
+  bn.wsrr  w11, MOD
 
   /* Compute the value r2 = (q3 * L) mod 2^256. Since q3 has 260 bits and L has
      253, we use a 320x256-bit multiplication, but we stop after the lowest 256
diff --git a/sw/otbn/crypto/handwritten/rsa_verify_3072.s b/sw/otbn/crypto/handwritten/rsa_verify_3072.s
index d907b418a7d32..97fdea07b9604 100644
--- a/sw/otbn/crypto/handwritten/rsa_verify_3072.s
+++ b/sw/otbn/crypto/handwritten/rsa_verify_3072.s
@@ -229,7 +229,7 @@ mont_loop:
   bn.movr   x10++, x13
 
   /* No subtracion if carry bit of addition of carry words not set. */
-  csrrs     x2, 0x7c1, x0
+  csrrs     x2, FG1, x0
   andi      x2, x2, 1
   beq       x2, x0, mont_loop_no_sub
 
@@ -388,7 +388,7 @@ modexp_var_3072_f4:
     bn.lid    x9, 0(x16++)
     bn.subb   w2, w2, w3
     bn.movr   x17++, x11
-  csrrs     x2, 0x7c0, x0
+  csrrs     x2, FG0, x0
   /* TODO: currently we subtract the modulus if out_buf == M. This should
             never happen in an RSA context. We could catch this and raise an
             alert. */
diff --git a/sw/otbn/crypto/modexp.s b/sw/otbn/crypto/modexp.s
index 233b620d345db..fec8b142fca2a 100644
--- a/sw/otbn/crypto/modexp.s
+++ b/sw/otbn/crypto/modexp.s
@@ -305,3 +305,151 @@ modexp_65537:
   jal       x1, montmul_mul1
 
   ret
+
+/**
+ * Constant-time conditional bigint subtraction
+ *
+ * Returns C = A-x*B
+ *         with A being a bigint of length 256..4096 bit
+ *              B being a bigint of length 256..4096 bit
+ *              C being a bigint of length 256..4096 bit
+ *              x being a boolean value [0,1]
+ *
+ * Depending on state of FG1.C subtracts a bigint B located in dmem from
+ * another bigint A, located in the wide reg file and stores result C in dmem.
+ *
+ * Flags: When leaving this subroutine, flags of FG0 depend on a
+ *        potentially discarded value and therefore are not usable after
+ *        return. FG1 is not modified in this subroutine.
+ *
+ * @param[in]  x16: dmem pointer to first limb of subtrahend (B)
+ * @param[in]  x8: regfile pointer to first limb of minuend (input A)
+ * @param[in]  x21: dmem pointer to first limb of result (C)
+ * @param[in]  x30: N, number of limbs
+ * @param[in]  FG1.C: subtraction condition, subtract if 1 (x)
+ * @param[in]  x9: pointer to temp reg, must be set to 3
+ * @param[in]  x11: pointer to temp reg, must be set to 2
+ * @param[in]  FG0.C: needs to be set to 0
+ *
+ * clobbered registers: x8, x16, x21, w2, w3
+ * clobbered Flag Groups: FG0
+ */
+cond_sub_to_dmem:
+  /* iterate over all limbs for conditional limb-wise subtraction */
+  loop      x30, 5
+    /* load limb of subtrahend (input B): w3 = dmem[x16+i] */
+    bn.lid    x9, 0(x16++)
+
+    /* move limb from bignum buffer to w2 */
+    bn.movr   x11, x8++
+
+    /* perform subtraction for a limb: w3 = w2 - w3 - FG0.C (borrow) */
+    bn.subb   w3, w2, w3
+
+    /* conditionally select subtraction result or unmodified limb */
+    bn.sel    w2, w3, w2, FG1.C
+
+    /* store selection result in dmem */
+    bn.sid    x11, 0(x21++)
+
+  ret
+
+/**
+ * Constant-time Montgomery modular multiply by one
+ *
+ * Returns: C = montmul(1,A) = A*R^(-1) mod M
+ *
+ * Routine for back-conversion from Montgomery domain.
+ * This implements the limb-by-limb interleaved Montgomery Modular
+ * Multiplication Algorithm, with one operand fixed to 1. This is only a
+ * wrapper around the main loop body. For algorithmic implementation details
+ * see the mont_loop subroutine.
+ *
+ * Flags: The states of both FG0 and FG1 depend on intermediate values and are
+ *        not usable after return.
+ *
+ * @param[in]  x16: dmem pointer to first limb of modulus M
+ * @param[in]  x17: dptr_m0d, dmem pointer to Montgomery Constant m0'
+ * @param[in]  x19: dmem pointer to first limb of operand A
+ * @param[in]  x21: dmem pointer to first limb of result C
+ * @param[in]  x30: N, number of limbs
+ * @param[in]  x31: N-1, number of limbs minus one
+ * @param[in]  x8: pointer to temp reg, must be set to 4
+ * @param[in]  x9: pointer to temp reg, must be set to 3
+ * @param[in]  x10: pointer to temp reg, must be set to 4
+ * @param[in]  x11: pointer to temp reg, must be set to 2
+ * @param[in]  w31: all-zero
+ *
+ * clobbered registers: x6, x7, x8, x12, x13, x21, x22,
+ *                      w2, w3, w4 to w[4+N-1], w24 to w30
+ * clobbered Flag Groups: FG0, FG1
+ */
+montmul_mul1:
+  /* load Montgomery constant: w3 = dmem[x17] = dmem[dptr_m0d] = m0' */
+  bn.lid    x9, 0(x17)
+
+  /* init regfile bigint buffer with zeros */
+  bn.mov    w2, w31
+  loop      x30, 1
+    bn.movr   x10++, x11
+
+  /* w2 = 1; this is the first limb of operand B */
+  bn.xor    w2, w2, w2
+  bn.addi   w2, w2, 1
+
+  /* save dmem pointers: x6 <= modulus (x16), x7 <= operand A (x19) */
+  addi      x6, x16, 0
+  addi      x7, x19, 0
+
+  /* iterate over limbs of operand B */
+  loop      x30, 4
+
+    /* restore dmem pointers for operand A and modulus */
+    addi      x16, x6, 0
+    addi      x19, x7, 0
+
+    /* Main loop body of Montgomery Multiplication algorithm */
+    /* 1[i]*A */
+    jal       x1, mont_loop
+
+    /* all subsequent limbs of operand B are zero since B=1 */
+    bn.mov    w2, w31
+
+  /* restore dmem pointers for operand A and modulus */
+  addi      x16, x6, 0
+  addi      x19, x7, 0
+
+  /* zeroize w2 and clear flags of FG1 */
+  bn.sub    w2, w2, w2, FG1
+
+  /* iterate over all limbs of bigint buffer for limbwise comparison of
+     buffer with the Modulus. After last loop cycle, FG1.C is set if bigint
+     in buffer is larger than Modulus */
+  loop      x30, 3
+
+    /* load limb of Modulus to w3 */
+    bn.lid    x9, 0(x16++)
+
+    /* load limb from bigint buffer to w2 */
+    bn.movr   x11, x8++
+
+    /* compare limb of buffer with limb of Modulus */
+    bn.cmpb   w3, w2, FG1
+
+  /* restore pointers to bigint buffer in regfile */
+  li         x8, 4
+  li        x10, 4
+
+  /* restore dmem pointers for operand A and modulus */
+  addi      x16, x6, 0
+  addi      x19, x7, 0
+
+  /* conditionally subtract Modulus from buffer and store result in
+     dmem[x21] to dmem[x21+N] */
+  jal       x1, cond_sub_to_dmem
+
+  /* restore dmem pointers for operand A and modulus */
+  addi      x16, x6, 0
+  addi      x19, x7, 0
+
+  ret
diff --git a/sw/otbn/crypto/montmul.s b/sw/otbn/crypto/montmul.s
index 56c0da011e6e5..6ae2cc26a2857 100644
--- a/sw/otbn/crypto/montmul.s
+++ b/sw/otbn/crypto/montmul.s
@@ -11,7 +11,7 @@
 .text
 .globl modload
 .globl montmul
-.globl montmul_mul1
+.globl mont_loop
 
 /**
  * Precomputation of a constant m0' for Montgomery modular arithmetic
@@ -92,69 +92,82 @@ m0inv:
   ret
 
 /**
- * Constant time conditional subtraction of modulus from a bigint
+ * Doubles a number and reduces modulo M in-place.
  *
- * Returns C <= C-s*M
- *         with C being a bigint of length 256..4096 bit
- *              M being the modulus of length 256..4096 bit
- *              s being a boolean value [0,1]
+ *   Returns: C = (A + A) mod M
  *
- * Conditionally subtracts the modulus located in dmem from the bigint
- * located in a buffer in the wide regfile (starting at w5). The subtracted
- * value is selected when FG1.C equals 1, otherwise the unmodified value is
- * selected.
+ * Requires that A < M < 2^(256*N). Writes output to the A buffer in DMEM.
  *
- * Note that the interpretation of the subtrahend as a modulus is only
- * contextual. In theory, it can be any bigint. However, the subtrahend is
- * expected in dmem at a location that is reserved for the modulus according
- * to the calling conventions within this library.
+ * This routine runs in constant time.
  *
- * Flags: When leaving this subroutine, flags of FG0 depend on a
- *        potentially discarded value and therefore are not usable after
- *        return.
- *        FG1 is not modified in this subroutine.
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
  *
- * @param[in]  x16: dptr_m, pointer to 1st limb of modulus M
- * @param[in]  x30: N, number of 256 bit limbs in modulus and bigint
+ * @param[in]  x16: dmem pointer to first limb of modulus M
+ * @param[in]  x30: N, number of limbs
+ * @param[in]  [w4:w(4+N-1)]: operand A
  * @param[in]  w31: all-zero
- * @param[in]  FG1.C: s, selection flag
- * @param[out] [w[5+N-1]:w5]: new bigint value
- * @param[in]  FG0.C: needs to be set to 0
+ * @param[out] [w4:w(4+N-1)]: result C
  *
- * clobbered registers: x8, x10, x11, x16, w2, w3, w4, w5 to w[5+N-1]
- * clobbered flag groups: FG0
+ * clobbered registers: x2, x3, x8, x10 to x13
+ *                      w2, w3, w4 to w(4+N-1), w24, w29, w30
+ * clobbered Flag Groups: FG0, FG1
  */
-cond_sub_mod:
-
-  /* setup pointers */
-  li         x8, 5
-  li        x10, 3
-  li        x11, 2
-
-  /* reset flags for FG0 */
-  bn.add    w31, w31, w31
-
-  /* iterate over all limbs for limb-wise subtraction + conditional selection*/
+double_and_reduce:
+  /* Clear carry flags. */
+  bn.sub    w31, w31, w31
+  bn.sub    w31, w31, w31, FG1
+
+  /* Double the input and compare the sum to the modulus.
+       [w4:w(4+N-1)] <= (A+A) mod 2^(256*N)
+       FG1.C <= (A+A-M) < 0 */
+  li        x2, 2
+  li        x3, 3
+  li        x10, 4
+  addi      x11, x16, 0
   loop      x30, 5
+    /* w3 <= a[i] */
+    bn.movr   x3, x10
+    /* FG0.C, w3 <= w3 + w3 + FG0.C  */
+    bn.addc   w3, w3, w3
+    /* w2 <= M[i] */
+    bn.lid    x2, 0(x11++)
+    /* FG1.C <= (w3 - M[i] - FG1.C) < 0 */
+    bn.cmpb   w3, w2, FG1
+    /* w[4+i] <= w3 */
+    bn.movr   x10++, x3
+
+  /* Now, FG0.C is 1 if (A + A) >= 2^(256*N) and 0 otherwise, and FG1.C is 1 if
+     (A + A) mod 2^(256*N) < M. So we have the following cases:
+     1) FG0.C is 0, FG1.C is 0 : A+A < 2^(256*N) and A + A >= M
+     2) FG0.C is 0, FG1.C is 1 : A+A < 2^(256*N) and A + A < M
+     3) FG0.C is 1, FG1.C is 0 : A+A >= 2^(256*N) and (A + A) mod 2^(256*N) >= M
+     4) FG0.C is 1, FG1.C is 1 : A+A >= 2^(256*N) and (A + A) mod 2^(256*N) < M
+
+     Case (3) is impossible given the bounds on A and M, because it would
+     require that A + A >= 2^(256*N) + M. Case (2) is the only one in which we
+     don't need to subtract the modulus, since A + A < M. In cases (1) and (4)
+     we need to subtract the modulus. */
+
+  /* Clear FG0.C, and set FG1.C so that it is 1 if and only if FG0.C and FG1.C
+     match.
+       FG0.C <= 0
+       FG1.C <= (FG0.C ^ FG1.C) <? 1 */
+  bn.addc  w2, w31, w31
+  bn.addc  w3, w31, w31, FG1
+  bn.xor   w2, w2, w3
+  bn.subi  w2, w2, 1, FG1
+
+  /* Conditionally subtract M.
+      [w4:w(4+N-1)] <= [w4:w(4+N-1)] - FG1.C * M = (A + A) mod M */
+  li        x8, 4
+  addi      x10, x16, 0
+  jal       x1, cond_sub_to_reg
 
-    /* load a limb of modulus from dmem to w3 */
-    bn.lid    x10, 0(x16++)
-
-    /* load the limb of bigint buffer to w2 */
-    bn.movr   x11, x8
-
-    /* subtract the current limb of the modulus from current limb of bigint */
-    bn.subb   w4, w2, w3
-
-    /* conditionally select subtraction result or unmodified limb */
-    bn.sel    w3, w4, w2, FG1.C
-
-    /* move back result from w3 to bigint buffer */
-    bn.movr   x8++, x10
+  /* Restore modulus pointer (clobbered by cond_sub_to_reg). */
+  addi      x16, x10, 0
 
   ret
 
-
 /**
 * Compute square of Montgomery modulus
 *
@@ -171,107 +184,74 @@ cond_sub_mod:
 *        not usable after return.
 *
 * @param[in]  x16: dptr_M, pointer to first limb of modulus in dmem
+* @param[in]  x17: dptr_m0d, dmem pointer to Montgomery Constant m0'
 * @param[in]  x18: dptr_RR: dmem pointer to first limb of output buffer for RR
 * @param[in]  x30: N, number of limbs
+* @param[in]  x31: N-1, number of limbs minus 1
 * @param[in]  w31: all-zero
 * @param[out] dmem[dptr_RR+N*32:dptr_RR]: computed RR
 *
-* clobbered registers: x3, x8, x10, x11, x22
+* clobbered registers: x2, x3, x5 to x13, x19 to x22
 *                      w0, w2, w3, w4, w5 to w20 depending on N
 * clobbered flag groups: FG0, FG1
 */
 compute_rr:
-  /* save pointer to modulus */
-  addi      x22, x16, 0
-
-  /* zeroize w3 */
-  bn.xor    w3, w3, w3
-
-  /* compute full length of current bigint size in bits
-     N*w = x24 = N*256 = N*2^8 = x30 << 8 */
-  slli      x24, x30, 8
-
-  /* reg pointers */
-  li        x8, 5
-  li        x10, 3
-
-  /* zeroize w3 */
-  bn.xor    w3, w3, w3
+  /* Prepare all-zero register and clear FG0.C. */
+  bn.sub    w31, w31, w31
+
+  /* Initialize the buffer with R mod M = 2^(256*N) - M. Because of the bounds
+     on M, the subtraction will never underflow.
+       [w4:w(4+N-1)] <= (0 - M) mod 2^(256*N) = R mod M */
+  addi      x10, x16, 0
+  li        x11, 4
+  li        x3, 3
+  loop      x30, 3
+    /* w3 <= M[i] */
+    bn.lid    x3, 0(x10++)
+    /* FG0.C, w3 <= (0 - M[i] - FG0.C) */
+    bn.subb   w3, w31, w3
+    /* w[4+i] <= w3 */
+    bn.movr   x11++, x3
+
+  /* Repeatedly double R until 5 squarings is enough to get R^2; that is, we
+     compute T = (2^(256*N / 32) * R) mod M. We could use different cutoffs for
+     switching from doubling to squaring, but this cutoff is empirically
+     fastest for RSA-3072.
+       [w4:w(4+N-1)] = (2^(8*N) * [w4:w(4+N-1)]) mod M = T */
+  slli      x10, x30, 3
+  loop      x10, 2
+    jal       x1, double_and_reduce
+    nop
+
+  /* Store T in output buffer (in preparation for montmul).
+     dmem[dptr_RR] <= [w4:w(4+N-1)] = T */
+  li        x8, 4
+  addi      x21, x18, 0
+  loop      x30, 2
+    bn.sid    x8, 0(x21++)
+    addi      x8, x8, 1
 
-  /* zeroize all limbs of bigint in regfile */
-  loop      x30, 1
-    bn.movr   x8++, x10
+  /* Prepare pointers to temp regs for montmul. */
+  li        x9, 3
+  li        x10, 4
+  li        x11, 2
 
-  /* compute R-M
-     since R = 2^(N*w), this can be computed as R-M = unsigned(0-M) */
-  bn.addi w0, w31, 1
-  bn.sub    w3, w31, w0, FG1
-  addi      x16, x22, 0
-  jal       x1, cond_sub_mod
-
-  /* Compute R^2 mod M = R*2^(N*w) mod M.
-     => R^2 mod M can be computed by performing N*w duplications of R.
-     We directly perform a modulo reduction in each step such that the
-     final result will already be reduced. */
-  loop      x24, 18
-    /* reset pointer */
-    li        x8, 5
-
-    /* zeroize w3 reset flags of FG1 */
-    bn.sub    w3, w3, w3, FG1
-
-    /* Duplicate the intermediate bigint result. This can overflow such that
-       bit 2^(N*w) (represented by the carry bit after the final loop cycle)
-       is set. */
-    loop      x30, 3
-      /* copy current limb of bigint to w2 */
-      bn.movr   x11, x8
-
-      /* perform the doubling */
-      bn.addc   w2, w2, w2, FG1
-
-      /* copy result back to bigint in regfile */
-      bn.movr   x8++, x11
-
-    /* Conditionally subtract the modulus from the current bigint Y if there
-       was an overflow. Again, just considering the lowest N*w bits is
-       sufficient, since (in case of an overflow) we can write
-       2*Y as 2^(N*w) + X with M > X >= 0.
-       Then, 2*Y - M = 2^(N*w) + X - M = X + unsigned(0-M) */
-    addi      x16, x22, 0
-    jal       x1, cond_sub_mod
-
-    /* reset pointer to 1st limb of bigint in regfile */
-    li        x8, 5
-
-    /* reset pointer to modulus in dmem */
-    addi      x16, x22, 0
-
-    /* reset flags of FG1 */
-    bn.sub    w3, w3, w3, FG1
-
-    /* compare intermediate bigint y with modulus
-       subtract modulus if Y > M */
-    loop      x30, 3
-      bn.lid    x10, 0(x16++)
-      bn.movr   x11, x8++
-      bn.cmpb   w3, w2, FG1
-    addi      x16, x22, 0
-    jal       x1, cond_sub_mod
-
-    li        x0, 0
-
-  /* reset pointer to 1st limb of bigint in regfile */
-  li        x8, 5
-
-  /* reset pointer to modulus */
-  addi      x16, x22, 0
+  /* Prepare a pointer to the w4 register for storing the result. */
+  li        x8, 4
 
-  /* store computed RR in dmem */
-  addi      x3, x18, 0
-  loop      x30, 2
-    bn.sid    x8, 0(x3++)
-    addi      x8, x8, 1
+  /* Five Montgomery squarings: RR = ((2^(8*N))^(2^5) * R) mod M = R^2 mod M. */
+  loopi     5,9
+    /* [w4:w(4+N-1)] <= montmul(dmem[rr], dmem[rr]) */
+    addi      x19, x18, 0
+    addi      x20, x18, 0
+    jal       x1, montmul
+    /* Store result: dmem[rr] <= [w4:w(4+N-1)] */
+    addi      x2, x18, 0
+    addi      x3, x8, 0
+    loop      x30, 2
+      bn.sid    x3, 0(x2++)
+      addi      x3, x3, 1
+    nop
 
   ret
 
@@ -368,7 +348,7 @@ mul256_w30xw2:
  * @param[in]  x30: number of limbs
  * @param[in]  FG0.C: needs to be set to 0
  *
- * clobbered registers: x8, x16, w24, w29, w30, w[x8] to w[x8+N-1]
+ * clobbered registers: x8, x12, x13, x16, w24, w29, w30, w[x8] to w[x8+N-1]
  * clobbered Flag Groups: FG0
  */
 cond_sub_to_reg:
@@ -378,7 +358,7 @@ cond_sub_to_reg:
   li        x13, 24
 
   /* iterate over all limbs for conditional limb-wise subtraction */
-  loop      x30, 6
+  loop      x30, 5
     /* load limb of subtrahend (input B) to w24 */
     bn.lid    x13, 0(x16++)
 
@@ -388,8 +368,6 @@ cond_sub_to_reg:
     /* perform subtraction for a limb */
     bn.subb   w29, w30, w24
 
-    bn.movr   x8, x13
-
     /* conditionally select subtraction result or unmodified limb */
     bn.sel    w24, w29, w30, FG1.C
 
@@ -567,156 +545,6 @@ mont_loop:
   ret
 
 
-/**
- * Constant time conditional bigint subtraction
- *
- * Returns C = A-x*B
- *         with A being a bigint of length 256..4096 bit
- *              B being a bigint of length 256..4096 bit
- *              C being a bigint of length 256..4096 bit
- *              x being a boolean value [0,1]
- *
- * Depending on state of FG1.C subtracts a bigint B located in dmem from
- * another bigint A, located in the wide reg file and stores result C in dmem.
- *
- * Flags: When leaving this subroutine, flags of FG0 depend on a
- *        potentially discarded value and therefore are not usable after
- *        return. FG1 is not modified in this subroutine.
- *
- * @param[in]  x16: dmem pointer to first limb of subtrahend (B)
- * @param[in]  x8: regfile pointer to first limb of minuend (input A)
- * @param[in]  x21: dmem pointer to first limb of result (C)
- * @param[in]  x30: N, number of limbs
- * @param[in]  FG1.C: subtraction condition, subtract if 1 (x)
- * @param[in]  x9: pointer to temp reg, must be set to 3
- * @param[in]  x11: pointer to temp reg, must be set to 2
- * @param[in]  FG0.C: needs to be set to 0
- *
- * clobbered registers: x8, x16, x21, w2, w3
- * clobbered Flag Groups: FG0
- */
-cond_sub_to_dmem:
-  /* iterate over all limbs for conditional limb-wise subtraction */
-  loop      x30, 5
-    /* load limb of subtrahend (input B): w3 = dmem[x16+i] */
-    bn.lid    x9, 0(x16++)
-
-    /* move limb from bignum bufer to w2 */
-    bn.movr   x11, x8++
-
-    /* perform subtraction for a limb w3 = w2-1 */
-    bn.subb   w3, w2, w3
-
-    /* conditionally select subtraction result or unmodified limb */
-    bn.sel    w2, w3, w2, FG1.C
-
-    /* store selection result in dmem */
-    bn.sid    x11, 0(x21++)
-
-  ret
-
-
-/**
- * Constant-time Montgomery modular multiply by one
- *
- * Returns: C = montmul(1,A) = A*R^(-1) mod M
- *
- * Routine for back-conversion from Montgomery domain.
- * This implements the limb-by-limb interleaved Montgomery Modular
- * Multiplication Algorithm, with one operand fixed to 1. This is only a
- * wrapper around the main loop body. For algorithmic implementation details
- * see the mont_loop subroutine.
- *
- * Flags: The states of both FG0 and FG1 depend on intermediate values and are
- *        not usable after return.
- *
- * @param[in]  x16: dmem pointer to first limb of modulus M
- * @param[in]  x17: dptr_m0d, dmem pointer to Montgomery Constant m0'
- * @param[in]  x19: dmem pointer to first limb of operand A
- * @param[in]  x21: dmem pointer to first limb of result C
- * @param[in]  x30: N, number of limbs
- * @param[in]  x31: N-1, number of limbs minus one
- * @param[in]  x8: pointer to temp reg, must be set to 4
- * @param[in]  x9: pointer to temp reg, must be set to 3
- * @param[in]  x10: pointer to temp reg, must be set to 4
- * @param[in]  x11: pointer to temp reg, must be set to 2
- * @param[in]  w31: all-zero
- *
- * clobbered registers: x6, x7, x8, x12, x13, x21, x22,
- *                      w2, w3, w4 to w[4+N-1], w24 to w30
- * clobbered Flag Groups: FG0, FG1
- */
-montmul_mul1:
-  /* load Montgomery constant: w3 = dmem[x17] = dmem[dptr_m0d] = m0' */
-  bn.lid    x9, 0(x17)
-
-  /* init regfile bigint buffer with zeros */
-  bn.mov    w2, w31
-  loop      x30, 1
-    bn.movr   x10++, x11
-
-  /* w2=1 this is operand B */
-  bn.xor    w2, w2, w2
-  bn.addi   w2, w2, 1
-
-  /* save dmem pointers for operand A and modulus */
-  addi      x6, x16, 0
-  addi      x7, x19, 0
-
-  /* iterate over limbs of operand B */
-  loop      x30, 4
-
-    /* restore  dmem pointers for operand A and modulus */
-    addi      x16, x6, 0
-    addi      x19, x7, 0
-
-    /* Main loop body of Montgomery Multiplication algorithm */
-    /* 1[i]*A */
-    jal       x1, mont_loop
-
-    /* all subsequent limbs of operand B are zero since B=1 */
-    bn.mov    w2, w31
-
-  /* restore dmem pointers for operand A and modulus */
-  addi      x16, x6, 0
-  addi      x19, x7, 0
-
-  /* zeroize w2 and clear flags */
-  bn.sub    w2, w2, w2, FG1
-
-  /* iterate over all limbs of bigint buffer for limbwise comparison of
-     buffer with the Modulus. After last loop cycle, FG1.C is set if bigint
-     in buffer is larger than Modulus */
-  loop      x30, 3
-
-    /* load limb of limb of Modulus to w3 */
-    bn.lid    x9, 0(x16++)
-
-    /* load limb from bigint buffer to w2 */
-    bn.movr   x11, x8++
-
-    /* compare limb of flag with limb of Modulus */
-    bn.cmpb   w3, w2, FG1
-
-  /* restore pointers to bigint buffer in regfile */
-  li         x8, 4
-  li        x10, 4
-
-  /* restore  dmem pointers for operand A and modulus */
-  addi      x16, x6, 0
-  addi      x19, x7, 0
-
-  /* conditionally subtract Modulus from buffer and store result in
-     dmem[x21] to dmem[x21+N] */
-  jal       x1, cond_sub_to_dmem
-
-  /* restore  dmem pointers for operand A and modulus */
-  addi      x16, x6, 0
-  addi      x19, x7, 0
-
-  ret
-
-
 /**
  * Constant-time Montgomery Modular Multiplication
  *
@@ -741,7 +569,7 @@ montmul_mul1:
  * @param[in]  x11: pointer to temp reg, must be set to 2
  * @param[out] [w[4+N-1]:w4]: result C
  *
- * clobbered registers: x5, x6, x7, x8, x10, x12, x13, x20, x22
+ * clobbered registers: x5 to x9, x12, x13, x20, x22
  *                      w2, w3, w4 to w[4+N-1], w24 to w30
  * clobbered Flag Groups: FG0, FG1
  */
@@ -776,6 +604,7 @@ montmul:
   /* restore pointers */
   li        x8, 4
   li        x10, 4
+  li        x11, 2
 
   ret
 
@@ -802,6 +631,10 @@ modload:
   li       x8, 28
   bn.lid   x8, 0(x16)
 
+  /* x31 <= N - 1 */
+  li       x2, 1
+  sub      x31, x30, x2
+
   /* Compute Montgomery constant */
   jal      x1, m0inv
 
diff --git a/sw/otbn/crypto/p256.s b/sw/otbn/crypto/p256_base.s
similarity index 66%
rename from sw/otbn/crypto/p256.s
rename to sw/otbn/crypto/p256_base.s
index 857f423da9367..f9593701f7cde 100644
--- a/sw/otbn/crypto/p256.s
+++ b/sw/otbn/crypto/p256_base.s
@@ -1,5 +1,8 @@
-/* Copyright lowRISC Contributors.
- * Copyright 2016 The Chromium OS Authors. All rights reserved.
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/* Copyright 2016 The Chromium OS Authors. All rights reserved.
  * Use of this source code is governed by a BSD-style license that can be
  * found in the LICENSE.dcrypto file.
  *
@@ -7,19 +10,22 @@
  * https://chromium.googlesource.com/chromiumos/platform/ec/+/refs/heads/cr50_stab/chip/g/dcrypto/dcrypto_p256.c
  */
 
-.globl p256_isoncurve
 .globl p256_scalar_mult
 .globl p256_base_mult
-.globl p256_sign
-.globl p256_verify
 .globl p256_generate_k
 .globl p256_generate_random_key
 .globl p256_key_from_seed
+.globl trigger_fault_if_fg0_z
+.globl mul_modp
+.globl setup_modp
+.globl mod_mul_256x256
+.globl mod_mul_320x128
+.globl scalar_mult_int
+.globl proj_add
+.globl proj_to_affine
 
 /* Exposed only for testing or SCA purposes. */
-.globl proj_add
 .globl mod_inv
-.globl mod_mul_320x128
 
 .text
 
@@ -43,7 +49,7 @@
 trigger_fault_if_fg0_z:
   /* Read the FG0.Z flag (position 3).
        x2 <= FG0.Z */
-  csrrw     x2, 0x7c0, x0
+  csrrw     x2, FG0, x0
   andi      x2, x2, 8
   srli      x2, x2, 3
 
@@ -312,104 +318,214 @@ mod_mul_320x128:
   ret
 
 /**
- * Checks if a point is a valid curve point on curve P-256 (secp256r1)
- *
- * Returns r = x^3 + ax + b  mod p
- *     and s = y^2  mod p
- *         with x,y being the affine coordinates of the curve point
- *              a, b and p being the domain parameters of P-256
- *
- * This routine checks if a point with given x- and y-coordinate is a valid
- * curve point on P-256.
- * The routine checks whether the coordinates are a solution of the
- * Weierstrass equation y^2 = x^3 + ax + b  mod p.
- * The routine makes use of the property that the domain parameter 'a' can be
- * written as a=-3 for the P-256 curve, hence the routine is limited to P-256.
- * The routine does not return a boolean result but computes the left side
- * and the right sight of the Weierstrass equation and leaves the final
- * comparison to the caller.
- * The routine runs in constant time.
+ * 256-bit modular multiplication for P-256 coordinate field.
+ *
+ * Returns c = a * b mod p
+ *
+ * Uses a specialized algorithm to quickly multiply modulo the P-256 coordinate
+ * modulus p = 2^256 - 2^224 + 2^192 + 2^96 - 1.
+ *
+ * This code has been proven correct in Coq here against a simplified model of
+ * OTBN (simplified in the sense of only including the instructions and
+ * functionality that this code uses):
+ * https://gist.github.com/jadephilipoom/5c1910fd355f730238c99ce620aed98a
+ *
+ * For more details about the code and how to read the proofs above, see the PR
+ * description here: https://github.com/lowRISC/opentitan/pull/20701
  *
  * Flags: Flags have no meaning beyond the scope of this subroutine.
  *
- * @param[in]  dmem[x]: affine x-coordinate of input point
- * @param[in]  dmem[y]: affine y-coordinate of input point
- * @param[out] dmem[r]: right side result r
- * @param[out] dmem[s]: left side result s
+ * @param[in]  w24: a, first 256 bit operand (a < p)
+ * @param[in]  w25: b, second 256 bit operand (b < p)
+ * @param[in]  w28: r256, constant, 2^256 mod p = 2^256 - p
+ * @param[in]  w29: r448, constant, 2^448 mod p
+ * @param[in]  w31: all-zero
+ * @param[in]  MOD: p, modulus of P-256 underlying finite field
+ * @param[out]  w19: c, result
  *
- * clobbered registers: x2, x3, x19, x20, w0, w19 to w25
+ * clobbered registers: w19, w20, w21, w22, w23, w24, w25
  * clobbered flag groups: FG0
  */
-p256_isoncurve:
+mul_modp:
+  /* First, compute the high partial products (coefficient 2^192 or higher).
+       w19,w20.U <= 2^192*(a0b3 + a1b2 + a2b1 + a3b0)
+                    + 2^256*(a1b3 + a2b2 + a3b1)
+                    + 2^320*(a2b3 + a3b2)
+                    + 2^384*a3b3 */
+  bn.mulqacc.z          w24.0, w25.3, 64  /* a0b3 */
+  bn.mulqacc            w24.1, w25.2, 64  /* a1b2 */
+  bn.mulqacc            w24.2, w25.1, 64  /* a2b1 */
+  bn.mulqacc.so  w20.U, w24.3, w25.0, 64  /* a3b0 */
+  bn.mulqacc            w24.1, w25.3, 0   /* a1b3 */
+  bn.mulqacc            w24.2, w25.2, 0   /* a2b2 */
+  bn.mulqacc            w24.3, w25.1, 0   /* a3b1 */
+  bn.mulqacc            w24.2, w25.3, 64  /* a2b3 */
+  bn.mulqacc            w24.3, w25.2, 64  /* a3b2 */
+  bn.mulqacc.wo    w19, w24.3, w25.3, 128 /* a3b3 */
+
+  /* Now, we have:
+     a * b = a0b0 + 2^64*(a0b1 + a1b0) + 2^128*(a0b2 + a1b1 + a2b0 + w20.U)
+             + 2^256*w19
+
+     If we separate w19 into limbs t0, t1, t2, and t3, that gives us
+     a * b = a0b0 + 2^64*(a0b1 + a1b0) + 2^128*(a0b2 + a1b1 + a2b0 + w20.U)
+              + 2^256*t0 + 2^320*t1 + 2^384*t2 + 2^448*t3
+
+     This implies the modular equivalence:
+     (a * b) mod p
+       \equiv (a0b0 + 2^64*(a0b1 + a1b0) + 2^128*(a0b2 + a1b1 + a2b0 + w20.U)
+              + (2^256 mod p)*t0 + (2^448 mod p)*t3 - ((-2^320) mod p)*t1
+              - ((-2^384) mod p)*t2
+
+     The only reason above for using ((-2^320) mod p) and ((-2^384) mod p)
+     instead of (2^320 mod p) and (2^384 mod p) is that, for these specific
+     values, the positive terms are ~256 bits and the negative ones are ~224
+     bits, so the negative ones are quicker to compute.
+
+     For simplicity, let's call the additive terms u and the subtractive ones v:
+     u = a0b0 + 2^64*(a0b1 + a1b0) + 2^128*(a0b2 + a1b1 + a2b0 + w20.U)
+         + (2^256 mod p)*t0 + (2^448 mod p)*t3
+     v = ((-2^320) mod p)*t1 + ((-2^384) mod p)*t2
+     (a * b) mod p \equiv (u - v) mod p
+  */
 
-  /* setup all-zero reg */
-  bn.xor    w31, w31, w31
+  /* Compute the additive terms (u). The term in w21 is offset 128 bits to save
+     a writeback instruction.
+       w20 + w21 << 384 = u  */
+  bn.mulqacc.z          w24.0, w25.0, 0   /* a0b0 */
+  bn.mulqacc            w28.0, w19.0, 0   /* r256[0] * t0 */
+  bn.mulqacc            w29.0, w19.3, 0   /* r448[0] * t3 */
+  bn.mulqacc            w24.0, w25.1, 64  /* a0b1 */
+  bn.mulqacc            w24.1, w25.0, 64  /* a1b0 */
+  bn.mulqacc            w28.1, w19.0, 64  /* r256[1] * t0 */
+  bn.mulqacc.so  w20.L, w29.1, w19.3, 64  /* r448[1] * t3 */
+  bn.mulqacc            w24.0, w25.2, 0   /* a0b2 */
+  bn.mulqacc            w24.1, w25.1, 0   /* a1b1 */
+  bn.mulqacc            w24.2, w25.0, 0   /* a2b0 */
+  bn.mulqacc            w28.2, w19.0, 0   /* r256[2] * t0 */
+  bn.mulqacc            w29.2, w19.3, 0   /* r448[2] * t3 */
+  bn.mulqacc            w28.3, w19.0, 64  /* r256[3] * t0 */
+  bn.mulqacc.wo    w21, w29.3, w19.3, 64  /* r448[3] * t3 */
+
+  /* To fully reduce u mod p, we'll separate the low 256 bits (u0) from the
+     high 33 bits (u1) and compute:
+      u0 + (2^256 mod p)*u1 = u0 + (2^224 - 2^192 - 2^96 + 1) * u1 */
+
+  /* Rotate 128 bits to undo the offset and put u1 in the least significant
+     position.
+       w22 <= w21[127:0] << 128 | w21[255:128] */
+  bn.rshi   w22, w21, w21 >> 128
+
+  /* w21 <= (u0 + u1) mod p */
+  bn.addm   w20, w20, w31
+  bn.addm   w21, w22, w31
+  bn.addm   w21, w20, w21
+
+  /* w24 <= u1 << 223 */
+  bn.rshi   w24, w22, w31 >> 33
+
+  /* w25 <= u1 * (2^223 - 2^191 - 2^95) */
+  bn.sub    w25, w24, w24 >> 32
+  bn.sub    w25, w25, w24 >> 128
+
+  /* Note: the value in w25 is small enough for addm because u1 < 2^33, and
+     2^33*(2^223 - 2^191 - 2^95) < p.
+     w25 <= (u0 + (2^224 - 2^192 - 2^96 + 1) * u1) mod p = u mod p */
+  bn.addm   w25, w25, w25
+  bn.addm   w25, w25, w21
+
+  /* Now, compute the subtractive terms (v). We don't store constants for this
+     one; instead we transform the expression into something that is
+     computable with (the minimum number of) shifts and adds.
+       v = ((-2^320) mod p)*t1 + ((-2^384) mod p)*t2
+         = t1 * (2^224 + 2^160 + 2^128 - 2^64 - 2^32)
+           + t2 * (2^224 - 2*2^128 - 2*2^96 + 2^32 + 1)
+         = 2^224 * (t1 + t2) + (2^32 + 1) * (t1*2^128 + t2)
+           - 2^32 * (2^32 + 1) * (t1 + t2*2*2^64) */
+
+  /* First, isolate t1 and t2 using `mulqacc` and the lowest limb of r256,
+     which happens to be 1. This method is faster than using shifts.
+       w20 <= t1
+       w21 <= t2 */
+  bn.mulqacc.wo.z  w20, w28.0, w19.1, 0
+  bn.mulqacc.wo.z  w21, w28.0, w19.2, 0
+
+  /* w22 <= (2^32 + 1) * (t1*2^128 + t2) */
+  bn.add    w22, w21, w20 << 128
+  bn.add    w22, w22, w22 << 32
+
+  /* w23 <= t1 + t2 */
+  bn.add    w23, w20, w21
+
+  /* w24 <= (2^32 + 1) * (t1 + 2*2^64*t2) */
+  bn.add    w24, w20, w21 << 64
+  bn.add    w24, w24, w21 << 64
+  bn.add    w24, w24, w24 << 32
+
+  /* w21, w20 <= v */
+  bn.add    w20, w22, w23 << 224
+  bn.addc   w21, w31, w23 >> 32
+  bn.sub    w20, w20, w24 << 32
+  bn.subb   w21, w21, w31
+
+  /* The maximum value of v is 289 bits, so we can now reduce v the same way we
+     reduced u earlier. */
+
+  /* w22 <= (v0 + v1) mod p */
+  bn.addm   w22, w20, w21
+
+  /* w24 <= v1 << 223 */
+  bn.rshi   w24, w21, w31 >> 33
+
+  /* w23 <= v1 * (2^223 - 2^191 - 2^95) */
+  bn.sub    w23, w24, w24 >> 32
+  bn.sub    w23, w23, w24 >> 128
+
+  /* w23 <= (v0 + (2^224 - 2^192 - 2^96 + 1) * v1) mod p = v mod p */
+  bn.addm   w23, w23, w23
+  bn.addm   w23, w23, w22
+
+  /* w19 = (u - v) mod p = (a * b) mod p */
+  bn.subm   w19, w25, w23
 
-  /* setup modulus p and Barrett constant u
-     MOD <= w29 <= dmem[p256_p] = p; w28 <= dmem[p256_u_p] = u_p */
-  li        x2, 29
-  la        x3, p256_p
-  bn.lid    x2, 0(x3)
-  bn.wsrw   0, w29
-  li        x2, 28
-  la        x3, p256_u_p
-  bn.lid    x2, 0(x3)
+  ret
 
-  /* load domain parameter b from dmem
-     w27 <= b = dmem[p256_b] */
-  li        x2, 27
-  la        x3, p256_b
-  bn.lid    x2, 0(x3)
 
-  /* load affine y-coordinate of curve point from dmem
-     w26 <= dmem[y] */
-  la        x3, y
-  li        x2, 24
+/**
+ * Set up for coordinate field operations modulo the prime p.
+ *
+ * Loads the constants required by `mul_modp` and other coordinate-arithmetic
+ * routines.
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * @param[in]  w31: all-zero
+ * @param[out] MOD: p, modulus of P-256 underlying finite field
+ * @param[out] w28: r256, constant, 2^256 mod p = 2^256 - p
+ * @param[out] w29: r448, constant, 2^448 mod p
+ *
+ * clobbered registers: x2, x3, w28, w29
+ * clobbered flag groups: FG0
+ */
+setup_modp:
+  /* Load the modulus p from DMEM and store it in MOD.
+     MOD <= w29 <= p = dmem[p256_p] */
+  li        x2, 29
+  la        x3, p256_p
   bn.lid    x2, 0(x3)
+  bn.wsrw   MOD, w29
 
-  /* w19 <= y^2 = w24*w24 */
-  bn.mov    w25, w24
-  jal       x1, mod_mul_256x256
-
-  /* store left side result: dmem[s] <= w19 = y^2  mod p */
-  la        x20, s
-  li        x2, 19
-  bn.sid    x2, 0(x20)
+  /* Compute the constant r256 for reduction modulo p.
+       w28 <= 2^256 - p = r256 */
+  bn.sub   w28, w31, w29
 
-  /* load affine x-coordinate of curve point from dmem
-     w26 <= dmem[x] */
-  la        x3, x
-  li        x2, 26
+  /* Load the constant r448 for reduction modulo p.
+     w29 <= dmem[p256_r448] = r448 */
+  li        x2, 29
+  la        x3, p256_r448
   bn.lid    x2, 0(x3)
-
-  /* w19 <= x^2 = w26*w26 */
-  bn.mov    w25, w26
-  bn.mov    w24, w26
-  jal       x1, mod_mul_256x256
-
-  /* w19 = x^3 <= x^2 * x = w25*w24 = w26*w19 */
-  bn.mov    w25, w19
-  bn.mov    w24, w26
-  jal       x1, mod_mul_256x256
-
-  /* for curve P-256, 'a' can be written as a = -3, therefore we subtract
-     x three times from x^3.
-     w19 = x^3 + ax <= x^3 - 3x  mod p */
-  bn.subm   w19, w19, w26
-  bn.subm   w19, w19, w26
-  bn.subm   w19, w19, w26
-
-  /* w24 <= x^3 + ax + b mod p = w19 + w27 mod p */
-  bn.addm   w19, w19, w27
-
-  /* store right side result: dmem[r] <= w19 = x^3 + ax + b mod p */
-  la        x19, r
-  li        x2, 19
-  bn.sid    x2, 0(x19)
-
   ret
 
-
 /**
  * P-256 point addition in projective coordinates
  *
@@ -427,7 +543,7 @@ p256_isoncurve:
  * terminology of Algorithm 4 of [2].
  * The routine is limited to P-256 curve points due to:
  *   - fixed a=-3 domain parameter
- *   - usage of a P-256 optimized Barrett multiplication kernel
+ *   - usage of a P-256 optimized modular multiplication kernel
  * This routine runs in constant time.
  *
  * [1] https://doi.org/10.1006/jnth.1995.1088
@@ -440,9 +556,8 @@ p256_isoncurve:
  * @param[in]  w12: y_q, x-coordinate of input point Q
  * @param[in]  w13: z_q, x-coordinate of input point Q
  * @param[in]  w27: b, curve domain parameter
- * @param[in]  w29: p, modulus, 2^256 > p > 2^255.
- * @param[in]  w28: u, pre-computed Barrett constant (without u[256]/MSb
- *                           of u which is always 1 for the allowed range.
+ * @param[in]  w28: r256, constant, 2^256 mod p = 2^256 - p
+ * @param[in]  w29: r448, constant, 2^448 mod p
  * @param[in]  w31: all-zero.
  * @param[in]  MOD: p, modulus, 2^256 > p > 2^255.
  * @param[out]  w11: x_r, x-coordinate of resulting point R
@@ -462,19 +577,19 @@ proj_add:
   /* 1: w14 = t0 <= X1*X2 = w11*w8 */
   bn.mov    w24, w11
   bn.mov    w25, w8
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
   bn.mov    w14, w19
 
   /* 2: w15 = t1 <= Y1*Y2 = w12*w9 */
   bn.mov    w24, w12
   bn.mov    w25, w9
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
   bn.mov    w15, w19
 
   /* 3: w16 = t2 <= Z1*Z2 = w13*w10*/
   bn.mov    w24, w13
   bn.mov    w25, w10
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
   bn.mov    w16, w19
 
   /* 5: w17 = t4 <= X2+Y2 = w11 + w12 */
@@ -486,7 +601,7 @@ proj_add:
   /* 6: w19 = t3 <= t3*t4 = w18*w17 */
   bn.mov    w24, w17
   bn.mov    w25, w18
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
 
   /* 7: w18 = t4 <= t0+t1 = w14+w15 */
   bn.addm   w18, w14, w15
@@ -503,7 +618,7 @@ proj_add:
   /* 11: w18 = t4 <= t4 * X3 = w19 * w18 */
   bn.mov    w24, w18
   bn.mov    w25, w19
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
   bn.mov    w18, w19
 
   /* 12: w19 = X3 <= t1 + t2 = w15 + w16 */
@@ -521,7 +636,7 @@ proj_add:
   /* 16: w11 = X3 <= X3 * Y3 = w12 * w19 */
   bn.mov    w24, w19
   bn.mov    w25, w12
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
   bn.mov    w11, w19
 
   /* 17: w12 = Y3 <= t0 + t2 = w14 + w16 */
@@ -533,7 +648,7 @@ proj_add:
   /* 19: w19 = Z3 <= b * t2 =  w27 * w16 */
   bn.mov    w24, w27
   bn.mov    w25, w16
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
 
   /* 20: w11 = X3 <= Y3 -Z3 = w12 - w19 */
   bn.subm   w11, w12, w19
@@ -553,7 +668,7 @@ proj_add:
   /* 25: w19 = Y3 <= w27 * w12 = b * Y3 */
   bn.mov    w24, w27
   bn.mov    w25, w12
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
 
   /* 26: w15 = t1 <= t2 + t2 = w16 + w16 */
   bn.addm   w15, w16, w16
@@ -585,19 +700,19 @@ proj_add:
   /* 35: w15 = t1 <= t4 * Y3 = w18 * w12 */
   bn.mov    w24, w18
   bn.mov    w25, w12
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
   bn.mov    w15, w19
 
   /* 36: w16 = t2 <= t0 * Y3 = w14 * w12 */
   bn.mov    w24, w14
   bn.mov    w25, w12
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
   bn.mov    w16, w19
 
   /* 37: w12 = Y3 <= X3 * Z3 = w11 * w13 */
   bn.mov    w24, w11
   bn.mov    w25, w13
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
 
   /* 38: w12 = Y3 <= Y3 + t2 = w19 + w16 */
   bn.addm   w12, w19, w16
@@ -605,7 +720,7 @@ proj_add:
   /* 39: w19 = X3 <= t3 * X3 = w17 * w11 */
   bn.mov    w24, w17
   bn.mov    w25, w11
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
 
   /* 40: w11 = X3 <= X3 - t1 = w19 - w15 */
   bn.subm   w11, w19, w15
@@ -613,13 +728,13 @@ proj_add:
   /* 41: w13 = Z3 <= t4 * Z3 = w18 * w13 */
   bn.mov    w24, w18
   bn.mov    w25, w13
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
   bn.mov    w13, w19
 
   /* 42: w19 = t1 <= t3 * t0 = w17 * w14 */
   bn.mov    w24, w17
   bn.mov    w25, w14
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
 
   /* 43: w13 = Z3 <= Z3 + t1 = w13 + w19 */
   bn.addm   w13, w13, w19
@@ -659,12 +774,12 @@ proj_add:
  * @param[in]  w8: x, x-coordinate of curve point (projective)
  * @param[in]  w9: y, y-coordinate of curve point (projective)
  * @param[in]  w10: z, z-coordinate of curve point (projective)
- * @param[in]  w29: p, modulus, 2^256 > p > 2^255.
- * @param[in]  w28: u, pre-computed Barrett constant (without u[256]/MSb
- *                           of u which is always 1 for the allowed range.
+ * @param[in]  w28: r256, constant, 2^256 mod p = 2^256 - p
+ * @param[in]  w29: r448, constant, 2^448 mod p
  * @param[in]  MOD: p, modulus of the finite field of P-256
  * @param[out]  w11: x_a, x-coordinate of curve point (affine)
  * @param[out]  w12: y_a, y-coordinate of curve point (affine)
+ * @param[out]  w14: z^-1, modular inverse of the projective z-coordinate
  *
  * clobbered registers: w10 to w19, w24, w25
  * clobbered flag groups: FG0
@@ -677,81 +792,81 @@ proj_to_affine:
   /* 2: exp = 0x2 = 2*0x1 */
   bn.mov    w24, w10
   bn.mov    w25, w10
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
 
   /* 3: exp = 0x3 = 0x2+0x1 */
   bn.mov    w24, w19
   bn.mov    w25, w10
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
   bn.mov    w12, w19
 
   /* 4: exp = 0x6 = 2*0x3 */
   bn.mov    w24, w19
   bn.mov    w25, w19
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
 
   /* 5: exp = 0xc = 2*0x6 */
   bn.mov    w24, w19
   bn.mov    w25, w19
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
 
   /* 6: exp = 0xf = 0xc+0x3 */
   bn.mov    w24, w19
   bn.mov    w25, w12
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
   bn.mov    w13, w19
 
   /* 7: exp = 0xf0 = 16*0xf */
   loopi     4, 4
     bn.mov    w24, w19
     bn.mov    w25, w19
-    jal       x1, mod_mul_256x256
+    jal       x1, mul_modp
     nop
 
   /* 8: exp = 0xff = 0xf0+0xf */
   bn.mov    w24, w19
   bn.mov    w25, w13
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
   bn.mov    w14, w19
 
   /* 9: exp = 0xff00 = 256*0xff */
   loopi     8, 4
     bn.mov    w24, w19
     bn.mov    w25, w19
-    jal       x1, mod_mul_256x256
+    jal       x1, mul_modp
     nop
 
   /* 10: exp = 0xffff = 0xff00+0xff */
   bn.mov    w24, w19
   bn.mov    w25, w14
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
   bn.mov    w15, w19
 
   /* 11: exp = 0xffff0000 = 2^16*0xffff */
   loopi     16, 4
     bn.mov    w24, w19
     bn.mov    w25, w19
-    jal       x1, mod_mul_256x256
+    jal       x1, mul_modp
     nop
 
   /* 12: exp = 0xffffffff = 0xffff0000+0xffff */
   bn.mov    w24, w19
   bn.mov    w25, w15
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
   bn.mov    w16, w19
 
   /* 13: exp = 0xffffffff00000000 = 2^32*0xffffffff */
   loopi     32, 4
     bn.mov    w24, w19
     bn.mov    w25, w19
-    jal       x1, mod_mul_256x256
+    jal       x1, mul_modp
     nop
   bn.mov    w17, w19
 
   /* 14: exp = 0xffffffff00000001 = 0xffffffff00000000+0x1 */
   bn.mov    w24, w10
   bn.mov    w25, w19
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
 
   /* 15: exp =
            0xffffffff00000001000000000000000000000000000000000000000000000000
@@ -759,74 +874,74 @@ proj_to_affine:
   loopi     192, 4
     bn.mov    w24, w19
     bn.mov    w25, w19
-    jal       x1, mod_mul_256x256
+    jal       x1, mul_modp
     nop
   bn.mov    w18, w19
 
   /* 16: exp = 0xffffffffffffffff = 0xffffffff00000000+0xffffffff */
   bn.mov    w24, w17
   bn.mov    w25, w16
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
 
   /* 17: exp = 0xffffffffffffffff0000 = 2^16*0xffffffffffffffff */
   loopi     16, 4
     bn.mov    w24, w19
     bn.mov    w25, w19
-    jal       x1, mod_mul_256x256
+    jal       x1, mul_modp
     nop
 
   /* 18: exp = 0xffffffffffffffffffff = 0xffffffffffffffff0000+0xffff */
   bn.mov    w24, w15
   bn.mov    w25, w19
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
 
   /* 19: exp = 0xffffffffffffffffffff00 = 256*0xffffffffffffffffffff */
   loopi     8, 4
     bn.mov    w24, w19
     bn.mov    w25, w19
-    jal       x1, mod_mul_256x256
+    jal       x1, mul_modp
     nop
 
   /* 20: exp = 0xffffffffffffffffffffff = 0xffffffffffffffffffff00+0xff */
   bn.mov    w24, w14
   bn.mov    w25, w19
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
 
   /* 21: exp = 0xffffffffffffffffffffff0 = 16*0xffffffffffffffffffffff */
   loopi     4, 4
     bn.mov    w24, w19
     bn.mov    w25, w19
-    jal       x1, mod_mul_256x256
+    jal       x1, mul_modp
     nop
 
   /* 22: exp = 0xfffffffffffffffffffffff = 0xffffffffffffffffffffff0+0xf */
   bn.mov    w24, w13
   bn.mov    w25, w19
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
 
   /* 23: exp = 0x3ffffffffffffffffffffffc = 4*0xfffffffffffffffffffffff */
   loopi     2, 4
     bn.mov    w24, w19
     bn.mov    w25, w19
-    jal       x1, mod_mul_256x256
+    jal       x1, mul_modp
     nop
 
   /* 24: exp = 0x3fffffffffffffffffffffff = 0x3ffffffffffffffffffffffc+0x3 */
   bn.mov    w24, w12
   bn.mov    w25, w19
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
 
   /* 25: exp = 0xfffffffffffffffffffffffc = 4*0x3fffffffffffffffffffffff */
   loopi     2, 4
     bn.mov    w24, w19
     bn.mov    w25, w19
-    jal       x1, mod_mul_256x256
+    jal       x1, mul_modp
     nop
 
   /* 26: exp = 0xfffffffffffffffffffffffd = 0xfffffffffffffffffffffffc+0x1 */
   bn.mov    w24, w10
   bn.mov    w25, w19
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
 
   /* 27: exp = p-2
          = 0xffffffff00000001000000000000000000000000fffffffffffffffffffffffd
@@ -835,21 +950,21 @@ proj_to_affine:
      w14 = z^exp = z^(p-2) = z^-1   mod p */
   bn.mov    w24, w19
   bn.mov    w25, w18
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
   bn.mov    w14, w19
 
   /* convert x-coordinate to affine
      w11 = x_a = x/z = x * z^(-1) = w8 * w14 */
   bn.mov    w24, w8
   bn.mov    w25, w14
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
   bn.mov    w11, w19
 
   /* convert y-coordinate to affine
      w12 = y_a = y/z = y * z^(-1) = w9 * w14 */
   bn.mov    w24, w9
   bn.mov    w25, w14
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
   bn.mov    w12, w19
 
   ret
@@ -884,7 +999,7 @@ mod_inv:
 
   /* subtract 2 from modulus for Fermat's little theorem
      w2 = MOD - 2 = m - 2 */
-  bn.wsrr   w2, 0
+  bn.wsrr   w2, MOD
   bn.subi   w2, w2, 2
 
   /* init square and multiply: w1 = 1 */
@@ -905,7 +1020,7 @@ mod_inv:
 
     /* skip multiplication if C flag not set */
     bn.sel    w1, w1, w3, C
-    csrrs     x2, 0x7c0, x0
+    csrrs     x2, FG0, x0
     andi      x2, x2, 1
     beq       x2, x0, nomul
 
@@ -939,8 +1054,8 @@ mod_inv:
  *                          x-coordinate of input point
  * @param[in]  x22: dptr_y, pointer to dmem location containing affine
  *                          y-coordinate of input point
- * @param[in]  w28: u, lower 256 bit of Barrett constant for curve P-256
- * @param[in]  w29: p, modulus of P-256 underlying finite field
+ * @param[in]  w28: r256, constant, 2^256 mod p = 2^256 - p
+ * @param[in]  w29: r448, constant, 2^448 mod p
  * @param[in]  w31: all-zero
  * @param[in]  MOD: p, modulus of P-256 underlying finite field
  * @param[out] w14: x, projective x-coordinate
@@ -956,7 +1071,7 @@ mod_inv:
 fetch_proj_randomize:
 
   /* get random number from URND */
-  bn.wsrr   w16, 2 /* URND */
+  bn.wsrr   w16, URND
 
   /* reduce random number
      w16 = z <= w16 mod p */
@@ -969,7 +1084,7 @@ fetch_proj_randomize:
   /* scale x-coordinate
      w14 = x <= w24*w16 = x_a*z  mod p */
   bn.mov    w25, w16
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
   bn.mov    w14, w19
 
   /* fetch y-coordinate from dmem
@@ -979,7 +1094,7 @@ fetch_proj_randomize:
   /* scale y-coordinate
      w15 = y <= w24*w16 = y_a*z  mod p */
   bn.mov    w25, w16
-  jal       x1, mod_mul_256x256
+  jal       x1, mul_modp
   bn.mov    w15, w19
 
   ret
@@ -1000,8 +1115,8 @@ fetch_proj_randomize:
  * @param[in]  w9: y_p, y-coordinate of input point
  * @param[in]  w10: z_p, z-coordinate of input point
  * @param[in]  w27: b, curve domain parameter
- * @param[in]  w29: p, p, modulus of P-256 underlying finite field
- * @param[in]  w28: u, u, lower 256 bit of Barrett constant for curve P-256
+ * @param[in]  w28: r256, constant, 2^256 mod p = 2^256 - p
+ * @param[in]  w29: r448, constant, 2^448 mod p
  * @param[in]  w31: all-zero.
  * @param[in]  MOD: p, modulus of P-256 underlying finite field
  * @param[out]  w11: x_r, x-coordinate of resulting point
@@ -1066,8 +1181,9 @@ proj_double:
  * @param[in]  w27: b, curve domain parameter
  * @param[in]  w31: all-zero
  * @param[in]  MOD: p, modulus, 2^256 > p > 2^255.
- * @param[out]  w11: x_r, affine x-coordinate of resulting point
- * @param[out]  w12: y_r, affine y-coordinate of resulting point
+ * @param[out]  w8: x, x-coordinate of curve point (projective)
+ * @param[out]  w9: y, y-coordinate of curve point (projective)
+ * @param[out]  w10: z, z-coordinate of curve point (projective)
  *
  * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
  *        the computed affine y-coordinate.
@@ -1076,21 +1192,11 @@ proj_double:
  * clobbered flag groups: FG0
  */
 scalar_mult_int:
-
-  /* load field modulus p from dmem
-     w29 <= p = dmem[p256_p] */
-  li        x2, 29
-  la        x3, p256_p
-  bn.lid    x2, 0(x3)
-
-  /* store modulus to MOD WSR */
-  bn.wsrw   0, w29
-
-  /* load lower 256 bit of Barrett constant u for modulus p from dmem
-     w28 <= u = dmem[p256_u_p] */
-  li        x2, 28
-  la        x3, p256_u_p
-  bn.lid    x2, 0(x3)
+  /* Set up for coordinate arithmetic.
+       MOD <= p
+       w28 <= r256
+       w29 <= r448 */
+  jal       x1, setup_modp
 
   /* load domain parameter b from dmem
      w27 <= b = dmem[p256_b] */
@@ -1190,31 +1296,31 @@ scalar_mult_int:
     bn.rshi   w2, w2, w31 >> 255
 
     /* init regs with random numbers from URND */
-    bn.wsrr   w11, 2
-    bn.wsrr   w12, 2
-    bn.wsrr   w13, 2
+    bn.wsrr   w11, URND
+    bn.wsrr   w12, URND
+    bn.wsrr   w13, URND
 
     /* get a fresh random number from URND and scale the coordinates of
        2P = (w3, w4, w5) (scaling each projective coordinate with same
        factor results in same point) */
-    bn.wsrr   w7, 2
+    bn.wsrr   w7, URND
 
     /* w4 = w4 * w7 */
     bn.mov    w24, w4
     bn.mov    w25, w7
-    jal       x1, mod_mul_256x256
+    jal       x1, mul_modp
     bn.mov    w4, w19
 
     /* w5 = w5 * w7 */
     bn.mov    w24, w5
     bn.mov    w25, w7
-    jal       x1, mod_mul_256x256
+    jal       x1, mul_modp
     bn.mov    w5, w19
 
     /* w6 = w6 * w7 */
     bn.mov    w24, w6
     bn.mov    w25, w7
-    jal       x1, mod_mul_256x256
+    jal       x1, mul_modp
     bn.mov    w6, w19
 
   /* Check if the z-coordinate of Q is 0. If so, fail; this represents the
@@ -1225,244 +1331,8 @@ scalar_mult_int:
   bn.cmp    w10, w31
   jal       x1, trigger_fault_if_fg0_z
 
-  /* convert back to affine coordinates
-     R = (x_a, y_a) = (w11, w12) */
-  jal       x1, proj_to_affine
-
-  ret
-
-
-/**
- * P-256 ECDSA signature generation
- *
- * returns the signature as the pair r, s with
- *         r = x_1  mod n
- *     and s = k^(-1)(msg + r*d)  mod n
- *         with x_1 being the affine x-coordinate of the curve point k*G,
- *                  where G is the curve's base point.
- *              k being a supplied secret random number,
- *              n being the order of the base point G of P-256,
- *              msg being the msg to be signed,
- *              d being the private key.
- *
- * This routine runs in constant time.
- *
- * Note: Some versions of the ECDSA spec suggest that msg must be reduced
- * modulo n (e.g. RFC 6979, section 2.4). However, for this implementation, it
- * is sufficient that msg < 2^256, because the message is multiplied with
- * k^(-1) mod n, and our Barrett multiplication implementation accepts any
- * operands a and b such that a * b < 2^256 * p and fully reduces the result.
- *
- * This routine assumes that the secret scalars d and k are provided in two
- * shares each (d0/d1 and k0/k1 respectively), where
- *   d = (d0 + d1) mod n
- *   k = (k0 + k1) mod n
- *
- * Each share is 320 bits, which gives us 64 bits of extra redundancy modulo n
- * (256 bits). This is a protection measure against side-channel attacks.
- *
- * For s = k^-1 * (r * d + msg), we compute a random nonzero masking scalar
- * alpha, and compute s as:
- *   s = ((k * alpha)^-1 * (r * (d * alpha) + alpha * msg)) mod n
- *
- * We choose alpha to be at most 128 bits, so the product with a 320b share
- * produces fits in the same 512-bit modular reduction routine that we use for
- * 256x256-bit multiplications. It should be safe to compute e.g. k * alpha =
- * (k0 * alpha + k1 * alpha) mod n, because alpha has enough randomness to mask
- * the true value of k.
- *
- * @param[in]  dmem[k0]:  first share of secret scalar (320 bits)
- * @param[in]  dmem[k1]:  second share of secret scalar (320 bits)
- * @param[in]  dmem[msg]: message to be signed (256 bits)
- * @param[in]  dmem[r]:   dmem buffer for r component of signature (256 bits)
- * @param[in]  dmem[s]:   dmem buffer for s component of signature (256 bits)
- * @param[in]  dmem[d0]:  first share of private key d (320 bits)
- * @param[in]  dmem[d1]:  second share of private key d (320 bits)
- *
- * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
- *        the computed affine y-coordinate.
- *
- * clobbered registers: x2, x3, x16 to x23, w0 to w26
- * clobbered flag groups: FG0
- */
-p256_sign:
-
-  /* init all-zero register */
-  bn.xor    w31, w31, w31
-
-  /* load first share of secret scalar k from dmem: w0,w1 = dmem[k0] */
-  la        x16, k0
-  li        x2, 0
-  bn.lid    x2, 0(x16++)
-  li        x2, 1
-  bn.lid    x2, 0(x16)
-
-  /* load second share of secret scalar k from dmem: w2,w3 = dmem[k1] */
-  la        x16, k1
-  li        x2, 2
-  bn.lid    x2, 0(x16++)
-  li        x2, 3
-  bn.lid    x2, 0(x16)
-
-  /* setup modulus n (curve order) and Barrett constant
-     MOD <= w29 <= n = dmem[p256_n]; w28 <= u_n = dmem[p256_u_n]  */
-  li        x2, 29
-  la        x3, p256_n
-  bn.lid    x2, 0(x3)
-  bn.wsrw   0, w29
-  li        x2, 28
-  la        x3, p256_u_n
-  bn.lid    x2, 0(x3)
-
-  /* scalar multiplication with base point
-     (x_1, y_1) = (w11, w12) <= k*G = w0*(dmem[p256_gx], dmem[p256_gy]) */
-  la        x21, p256_gx
-  la        x22, p256_gy
-  jal       x1, scalar_mult_int
-
-  /* setup modulus n (curve order) and Barrett constant
-     MOD <= w29 <= n = dmem[p256_n]; w28 <= u_n = dmem[p256_u_n]  */
-  li        x2, 29
-  la        x3, p256_n
-  bn.lid    x2, 0(x3)
-  bn.wsrw   0, w29
-  li        x2, 28
-  la        x3, p256_u_n
-  bn.lid    x2, 0(x3)
-
-  /* re-load first share of secret scalar k from dmem: w0,w1 = dmem[k0] */
-  la        x16, k0
-  li        x2, 0
-  bn.lid    x2, 0(x16++)
-  li        x2, 1
-  bn.lid    x2, 0(x16)
-
-  /* re-load second share of secret scalar k from dmem: w2,w3 = dmem[k1] */
-  la        x16, k1
-  li        x2, 2
-  bn.lid    x2, 0(x16++)
-  li        x2, 3
-  bn.lid    x2, 0(x16)
-
-  /* Generate a random 127-bit number.
-       w4 <= URND()[255:129] */
-  bn.wsrr  w4, 0x2 /* URND */
-  bn.rshi  w4, w31, w4 >> 129
-
-  /* Add 1 to get a 128-bit nonzero scalar for masking.
-       w4 <= w4 + 1 = alpha */
-  bn.addi  w4, w4, 1
-
-  /* w0 <= ([w0,w1] * w4) mod n = (k0 * alpha) mod n */
-  bn.mov    w24, w0
-  bn.mov    w25, w1
-  bn.mov    w26, w4
-  jal       x1, mod_mul_320x128
-  bn.mov    w0, w19
-
-  /* w19 <= ([w2,w3] * w26) mod n = (k1 * alpha) mod n */
-  bn.mov    w24, w2
-  bn.mov    w25, w3
-  jal       x1, mod_mul_320x128
-
-  /* w0 <= (w0+w19) mod n = (k * alpha) mod n */
-  bn.addm   w0, w0, w19
-
-  /* w1 <= w0^-1 mod n = (k * alpha)^-1 mod n */
-  jal       x1, mod_inv
-
-  /* Load first share of secret key d from dmem.
-       w2,w3 = dmem[d0] */
-  la        x16, d0
-  li        x2, 2
-  bn.lid    x2, 0(x16++)
-  li        x2, 3
-  bn.lid    x2, 0(x16)
-
-  /* Load second share of secret key d from dmem.
-       w5,w6 = dmem[d1] */
-  la        x16, d1
-  li        x2, 5
-  bn.lid    x2, 0(x16++)
-  li        x2, 6
-  bn.lid    x2, 0(x16)
-
-  /* w0 <= ([w2,w3] * w4) mod n = (d0 * alpha) mod n */
-  bn.mov    w24, w2
-  bn.mov    w25, w3
-  bn.mov    w26, w4
-  jal       x1, mod_mul_320x128
-  bn.mov    w0, w19
-
-  /* w19 <= ([w5,w6] * w4) mod n = (d1 * alpha) mod n */
-  bn.mov    w24, w5
-  bn.mov    w25, w6
-  bn.mov    w26, w4
-  jal       x1, mod_mul_320x128
-
-  /* w0 <= (w0+w19) mod n = (d * alpha) mod n */
-  bn.addm   w0, w0, w19
-
-  /* Compare to 0.
-     FG0.Z <= (w0 =? w31) = ((d * alpha) mod n =? 0) */
-  bn.cmp    w0, w31
-
-  /* Trigger a fault if FG0.Z is set, aborting the computation.
-
-     Since alpha is nonzero mod n, (d * alpha) mod n = 0 means d is zero mod n,
-     which violates ECDSA private key requirements. This could technically be
-     triggered by an unlucky key manager seed, but the probability is so low (~1/n)
-     that it more likely indicates a fault attack. */
-  jal       x1, trigger_fault_if_fg0_z
-
-  /* w24 = r <= w11  mod n */
-  bn.addm   w24, w11, w31
-
-  /* Store r of signature in dmem.
-       dmem[r] <= r = w24 */
-  la        x19, r
-  li        x2, 24
-  bn.sid    x2, 0(x19)
-
-  /* w19 <= (w24 * w0) mod n = (r * d * alpha) mod n */
-  bn.mov    w25, w0
-  jal       x1, mod_mul_256x256
-
-  /* w0 <= (w1 * w19) mod n = ((k * alpha)^-1 * (r * d * alpha)) mod n
-                            = (k^-1 * r * d) mod n */
-  bn.mov    w24, w1
-  bn.mov    w25, w19
-  jal       x1, mod_mul_256x256
-  bn.mov    w0, w19
-
-  /* Load message from dmem:
-       w24 = msg <= dmem[msg] */
-  la        x18, msg
-  li        x2, 24
-  bn.lid    x2, 0(x18)
-
-  /* w19 = (w24 * w4) mod n = <= (msg * alpha)  mod n */
-  bn.mov    w25, w4
-  jal       x1, mod_mul_256x256
-
-  /* w19 = (w1 * w19) mod n = ((k * alpha)^-1 * (msg * alpha)) mod n
-                            = (k^-1 * msg) mod n */
-  bn.mov    w24, w1
-  bn.mov    w25, w19
-  jal       x1, mod_mul_256x256
-
-  /* w0 = (w0 + w19) mod n = (k^-1*r*d + k^-1*msg) mod n = s */
-  bn.addm   w0, w0, w19
-
-  /* Store s of signature in dmem.
-       dmem[s] <= s = w0 */
-  la        x20, s
-  li        x2, 0
-  bn.sid    x2, 0(x20)
-
   ret
 
-
 /**
  * P-256 scalar multiplication with base point G
  *
@@ -1514,11 +1384,15 @@ p256_base_mult:
   bn.lid    x2, 0(x16)
 
   /* call internal scalar multiplication routine
-     R = (x_a, y_a) = (w11, w12) <= d*P = (w0 + w1)*P */
+     R = (x_p, y_p, z_p) = (w8, w9, w10) <= d*P = (w0 + w1)*P */
   la        x21, p256_gx
   la        x22, p256_gy
   jal       x1, scalar_mult_int
 
+  /* Convert masked result back to affine coordinates.
+     R = (x_a, y_a) = (w11, w12) */
+  jal       x1, proj_to_affine
+
   /* store result (affine coordinates) in dmem
      dmem[x] <= x_a = w11
      dmem[y] <= y_a = w12 */
@@ -1531,440 +1405,6 @@ p256_base_mult:
   ret
 
 
-/**
- * Variable time modular multiplicative inverse computation
- *
- * Returns c <= a^(-1) mod m
- *         with a being a bigint of length 256 bit with a < m
- *              m being the modulus with a length of 256 bit
- *              c being a 256-bit result
- *
- * This routine implements the computation of the modular multiplicative
- * inverse based on the binary GCD or Stein's algorithm.
- * The implemented variant is based on the
- * "right-shift binary extended GCD" as it is described in section 3.1 of [1]
- * (Algorithm 1).
- * [1] https://doi.org/10.1155/ES/2006/32192
- *
- * Note that this is a variable time implementation. I.e. this routine will
- * show a data dependent timing and execution profile. Only use in situations
- * where a full white-box environment is acceptable.
- *
- * Flags: Flags have no meaning beyond the scope of this subroutine.
- *
- * @param[in]  w0: a, operand
- * @param[in]  MOD: m, modulus
- * @param[in]  w31: all-zero
- * @param[out]  w1: result c
- *
- * clobbered registers: x2, w2, w3, w4, w7
- * clobbered flag groups: FG0
- */
-mod_inv_var:
-
-  /* w2 = r = 0 */
-  bn.mov    w2, w31
-
-  /* w3 = s = 1 */
-  bn.addi   w3, w31, 1
-
-  /* w4 = u = MOD */
-  bn.wsrr   w4, 0
-  bn.wsrr   w7, 0
-
-  /* w5 = v = w0 */
-  bn.mov    w5, w0
-
-  ebgcd_loop:
-  /* test if u is odd */
-  bn.or     w4, w4, w4
-  csrrs     x2, 0x7c0, x0
-  andi      x2, x2, 4
-  bne       x2, x0, ebgcd_u_odd
-
-  /* u is even: */
-  /* w4 = u <= u/2 = w4 >> 1 */
-  bn.rshi   w4, w31, w4 >> 1
-
-  /* test if r is odd */
-  bn.or     w2, w2, w2
-  csrrs     x2, 0x7c0, x0
-  andi      x2, x2, 4
-  bne       x2, x0, ebgcd_r_odd
-
-  /* r is even: */
-  /* w2 = r <= r/2 = w2 >> 1 */
-  bn.rshi   w2, w31, w2 >> 1
-  jal       x0, ebgcd_loop
-
-  ebgcd_r_odd:
-  /* w2 = r <= (r + m)/2 = (w2 + w7) >> 1 */
-  bn.add    w2, w7, w2
-  bn.addc   w6, w31, w31
-  bn.rshi   w2, w6, w2 >> 1
-  jal       x0, ebgcd_loop
-
-  ebgcd_u_odd:
-  /* test if v is odd */
-  bn.or     w5, w5, w5
-  csrrs     x2, 0x7c0, x0
-  andi      x2, x2, 4
-  bne       x2, x0, ebgcd_uv_odd
-
-  /* v is even: */
-  /* w5 = v <= v/2 = w5 >> 1 */
-  bn.rshi   w5, w31, w5 >> 1
-
-  /* test if s is odd */
-  bn.or     w3, w3, w3
-  csrrs     x2, 0x7c0, x0
-  andi      x2, x2, 4
-  bne       x2, x0, ebgcd_s_odd
-
-  /* s is even: */
-  /* w3 = s <= s/2 = w3 >> 1 */
-  bn.rshi   w3, w31, w3 >> 1
-  jal       x0, ebgcd_loop
-
-  ebgcd_s_odd:
-  /* w3 = s <= (s + m)/2 = (w3 + w7) >> 1 */
-  bn.add    w3, w7, w3
-  bn.addc   w6, w31, w31
-  bn.rshi   w3, w6, w3 >> 1
-  jal       x0, ebgcd_loop
-
-  ebgcd_uv_odd:
-  /* test if v >= u */
-  bn.cmp    w5, w4
-  csrrs     x2, 0x7c0, x0
-  andi      x2, x2, 1
-  beq       x2, x0, ebgcd_v_gte_u
-
-  /* u > v: */
-  /* w2 = r <= r - s = w2 - w3; if (r < 0): r <= r + m */
-  bn.subm   w2, w2, w3
-
-  /* w4 = u <= u - v = w4 - w5 */
-  bn.sub    w4, w4, w5
-  jal       x0, ebgcd_loop
-
-  ebgcd_v_gte_u:
-  /* w3 = s <= s - r = w3 - w2; if (s < 0) s <= s + m */
-  bn.subm   w3, w3, w2
-
-  /* w5 = v <= v - u = w5 - w4 */
-  bn.sub    w5, w5, w4
-
-  /* if v > 0 go back to start of loop */
-  csrrs     x2, 0x7c0, x0
-  andi      x2, x2, 8
-  beq       x2, x0, ebgcd_loop
-
-  /* v <= 0: */
-  /* if (r > m): w1 = a = r - m = w2 - MOD else: w1 = a = r = w2 */
-  bn.addm   w1, w2, w31
-
-  ret
-
-
-/**
- * P-256 ECDSA signature verification
- *
- * returns the affine x-coordinate of
- *         (x1, y1) = u1*G + u2*Q
- *         with u1 = z*s^-1 mod n  and  u2 = r*s^-1 mod n
- *         with G being the curve's base point,
- *              z being the message
- *              r, s being the signature
- *              Q being the public key.
- *
- * The routine computes the x1 coordinate and places it in dmem. x1 will be
- * reduced (mod n), however, the final comparison has to be performed on the
- * host side. The signature is valid if x1 == r.
- * This routine runs in variable time.
- *
- * @param[in]  dmem[msg]: message to be verified (256 bits)
- * @param[in]  dmem[r]:   r component of signature (256 bits)
- * @param[in]  dmem[s]:   s component of signature (256 bits)
- * @param[in]  dmem[x]:   affine x-coordinate of public key (256 bits)
- * @param[in]  dmem[y]:   affine y-coordinate of public key (256 bits)
- * @param[out] dmem[x_r]: dmem buffer for reduced affine x_r-coordinate (x_1)
- *
- * Flags: Flags have no meaning beyond the scope of this subroutine.
- *
- * clobbered registers: x2, x3, x13, x14, x17 to x24, w0 to w25
- * clobbered flag groups: FG0
- */
-p256_verify:
-
-  /* init all-zero register */
-  bn.xor    w31, w31, w31
-
-  /* load domain parameter b from dmem
-     w27 <= b = dmem[p256_b] */
-  li        x2, 27
-  la        x3, p256_b
-  bn.lid    x2, 0(x3)
-
-  /* load r of signature from dmem: w24 = r = dmem[r] */
-  la        x19, r
-  li        x2, 11
-  bn.lid    x2, 0(x19)
-
-  /* setup modulus n (curve order) and Barrett constant
-     MOD <= w29 <= n = dmem[p256_n]; w28 <= u_n = dmem[p256_u_n]  */
-  li        x2, 29
-  la        x3, p256_n
-  bn.lid    x2, 0(x3)
-  bn.wsrw   0, w29
-  li        x2, 28
-  la        x3, p256_u_n
-  bn.lid    x2, 0(x3)
-
-  /* load s of signature from dmem: w0 = s = dmem[s] */
-  la        x20, s
-  bn.lid    x0, 0(x20)
-
-  /* goto 'fail' if w0 == w31 <=> s == 0 */
-  bn.cmp    w0, w31
-  csrrs     x2, 0x7c0, x0
-  andi      x2, x2, 8
-  bne       x2, x0, fail
-
-  /* goto 'fail' if w0 >= w29 <=> s >= n */
-  bn.cmp    w0, w29
-  csrrs     x2, 0x7c0, x0
-  andi      x2, x2, 1
-  beq       x2, x0, fail
-
-  /* w1 = s^-1  mod n */
-  jal       x1, mod_inv_var
-
-  /* load r of signature from dmem: w24 = r = dmem[r] */
-  la        x19, r
-  li        x2,  24
-  bn.lid    x2, 0(x19)
-
-  /* goto 'fail' if w24 == w31 <=> r == 0 */
-  bn.cmp    w24, w31
-  csrrs     x2, 0x7c0, x0
-  andi      x2, x2, 8
-  bne       x2, x0, fail
-
-  /* goto 'fail' if w0 >= w29 <=> r >= n */
-  bn.cmp    w24, w29
-  csrrs     x2, 0x7c0, x0
-  andi      x2, x2, 1
-  beq       x2, x0, fail
-
-  /* w25 = s^-1 = w1 */
-  bn.mov    w25, w1
-
-  /* u2 = w0 = w19 <= w24*w25 = r*s^-1 mod n */
-  jal       x1, mod_mul_256x256
-  bn.mov    w0, w19
-
-  /* load message, w24 = msg = dmem[msg] */
-  la        x18, msg
-  li        x2, 24
-  bn.lid    x2, 0(x18)
-
-  /* u1 = w1 = w19 <= w24*w25 = w24*w1 = msg*s^-1 mod n */
-  bn.mov    w25, w1
-  jal       x1, mod_mul_256x256
-  bn.mov    w1, w19
-
-  /* setup modulus p and Barrett constant */
-  li        x2, 29
-  la        x3, p256_p
-  bn.lid    x2, 0(x3)
-  bn.wsrw   0, w29
-  li        x2, 28
-  la        x3, p256_u_p
-  bn.lid    x2, 0(x3)
-
-  /* load public key Q from dmem and use in projective form (set z to 1)
-     Q = (w11, w12, w13) = (dmem[x], dmem[y], 1) */
-  li        x2, 11
-  la        x21, x
-  bn.lid    x2++, 0(x21)
-  la        x22, y
-  bn.lid    x2, 0(x22)
-  bn.addi   w13, w31, 1
-
-  /* load base point G and use in projective form (set z to 1)
-     G = (w8, w9, w10) = (x_g, y_g, 1) */
-  li        x13, 8
-  la        x23, p256_gx
-  bn.lid    x13, 0(x23)
-  li        x14, 9
-  la        x24, p256_gy
-  bn.lid    x14, 0(x24)
-  bn.addi   w10, w31, 1
-
-  /* The rest of the routine implements a variable time double-and-add
-     algorithm. For the signature verification we need to compute the point
-     C = (x1, y1) = u_1*G + u_2*Q. This can be done in a single
-     double-and-add routine by using Shamir's Trick. */
-
-  /* G+Q = (w3,w4,w5) = (w11,w12,w13) = (w8,w9,w10) (+) (w11,w12,w13) */
-  jal       x1, proj_add
-  bn.mov    w3, w11
-  bn.mov    w4, w12
-  bn.mov    w5, w13
-
-  /* w2 = u_2 & u_0 = w0 & w1*/
-  bn.and    w2, w0, w1
-
-  /* init double and add algorithm with (0, 1, 0) */
-  bn.mov    w11, w31
-  bn.addi   w12, w31, 1
-  bn.mov    w13, w31
-
-  /* main loop with dicreasing index i (i=255 downto 0) */
-  loopi     256, 31
-
-    /* always double: C = (w11,w12,w13) <= 2 (*) C = 2 (*) (w11,w12,w13) */
-    bn.mov    w8, w11
-    bn.mov    w9, w12
-    bn.mov    w10, w13
-    jal       x1, proj_add
-
-    /* if either  u_1[i] == 0 or u_2[i] == 0 jump to 'no_both' */
-    bn.add    w2, w2, w2
-    csrrs     x2, 0x7c0, x0
-    andi      x2, x2, 1
-    beq       x2, x0, no_both
-
-    /* both bits at current index (u1[i] and u2[i]) are set:
-       do C <= C + (P + Q) and jump to end */
-    bn.mov    w8, w3
-    bn.mov    w9, w4
-    bn.mov    w10, w5
-    jal       x1, proj_add
-    jal       x0, no_q
-
-    /* either u1[i] or u2[i] is set, but not both */
-    no_both:
-
-    /* if u2[i] is not set jump to 'no_g' */
-    bn.add    w6, w0, w0
-    csrrs     x2, 0x7c0, x0
-    andi      x2, x2, 1
-    beq       x2, x0, no_g
-
-    /* u2[i] is set: do C <= C + Q */
-    bn.lid    x13, 0(x21)
-    bn.lid    x14, 0(x22)
-    bn.addi   w10, w31, 1
-    jal       x1, proj_add
-
-    no_g:
-    /* if u1[i] is not set jump to 'no_q' */
-    bn.add    w6, w1, w1
-    csrrs     x2, 0x7c0, x0
-    andi      x2, x2, 1
-    beq       x2, x0, no_q
-
-    /* load base point x-coordinate
-      w8 <= g_x = dmem [p256_gx]; w9 <= g_y = dmem[p256_gy] */
-    bn.lid    x13, 0(x23)
-    bn.lid    x14, 0(x24)
-
-    /* u1[i] is set: do C <= C + G */
-    bn.addi   w10, w31, 1
-    jal       x1, proj_add
-
-    no_q:
-    /* left shift w0 and w1 to decrease index */
-    bn.add    w0, w0, w0
-    bn.add    w1, w1, w1
-
-  /* compute inverse of z-coordinate: w1 = z_c^-1  mod p */
-  bn.mov    w0, w13
-  jal       x1, mod_inv_var
-
-  /* convert x-coordinate of C back to affine: x1 = x_c * z_c^-1  mod p */
-  bn.mov    w24, w1
-  bn.mov    w25, w11
-  jal       x1, mod_mul_256x256
-
-  /* final reduction: w24 = x1 <= x1 mod n */
-  la        x3, p256_n
-  bn.lid    x0, 0(x3)
-  bn.wsrw   0, w0
-  bn.subm   w24, w19, w31
-
-  fail:
-  /* store affine x-coordinate in dmem: dmem[x_r] = w24 = x_r */
-  la        x17, x_r
-  li        x2, 24
-  bn.sid    x2, 0(x17)
-
-  ret
-
-
-/**
- * Externally callable wrapper for P-256 scalar point multiplication
- *
- * returns R = k*P = k*(x_p, y_p, z_p)
- *         with R, P being valid P-256 curve points in projective form,
- *              k being a 256 bit scalar.
- *
- * This routine assumes that the scalar k is provided in two shares, k0 and k1,
- * where:
- *   k = (k0 + k1) mod n
- *
- * Sets up context and calls internal scalar multiplication routine.
- * This routine runs in constant time.
- *
- * @param[in]      dmem[k0]:  first share of scalar k (256 bits)
- * @param[in]      dmem[k1]:  second share of scalar k (256 bits)
- * @param[in,out]  dmem[x]:   affine x-coordinate in dmem
- * @param[in,out]  dmem[y]:   affine y-coordinate in dmem
- *
- * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
- *        the computed affine y-coordinate.
- *
- * clobbered registers: x2, x3, x16, x17, x21, x22, w0 to w25
- * clobbered flag groups: FG0
- */
-p256_scalar_mult:
-
-  /* init all-zero register */
-  bn.xor    w31, w31, w31
-
-  /* Load first share of secret key k from dmem.
-       w0,w1 = dmem[k0] */
-  la        x16, k0
-  li        x2, 0
-  bn.lid    x2, 0(x16++)
-  li        x2, 1
-  bn.lid    x2, 0(x16)
-
-  /* Load second share of secret key d from dmem.
-       w2,w3 = dmem[k1] */
-  la        x16, k1
-  li        x2, 2
-  bn.lid    x2, 0(x16++)
-  li        x2, 3
-  bn.lid    x2, 0(x16)
-
-  /* call internal scalar multiplication routine
-     R = (x_a, y_a) = (w11, w12) <= k*P = w0*P */
-  la        x21, x
-  la        x22, y
-  jal       x1, scalar_mult_int
-
-  /* store result (affine coordinates) in dmem
-     dmem[x] <= x_a = w11
-     dmem[y] <= y_a = w12 */
-  li        x2, 11
-  bn.sid    x2++, 0(x21)
-  bn.sid    x2, 0(x22)
-
-  ret
-
 /**
  * Generate a nonzero random value in the scalar field.
  *
@@ -2013,7 +1453,7 @@ p256_random_scalar:
   bn.lid    x2, 0(x3)
 
   /* Copy n into the MOD register. */
-  bn.wsrw   0, w29
+  bn.wsrw   MOD, w29
 
   /* Load Barrett constant for n.
      w28 <= u_n = dmem[p256_u_n]  */
@@ -2023,18 +1463,18 @@ p256_random_scalar:
 
   random_scalar_retry:
   /* Obtain 768 bits of randomness from RND. */
-  bn.wsrr   w15, 0x1 /* RND */
-  bn.wsrr   w16, 0x1 /* RND */
-  bn.wsrr   w17, 0x1 /* RND */
+  bn.wsrr   w15, RND
+  bn.wsrr   w16, RND
+  bn.wsrr   w17, RND
 
   /* XOR with bits from URND, just in case there's any vulnerability in EDN
      that lets the attacker recover bits before they reach OTBN. */
-  bn.wsrr   w20, 0x2 /* URND */
+  bn.wsrr   w20, URND
+  bn.xor    w15, w15, w20
+  bn.wsrr   w20, URND
   bn.xor    w16, w16, w20
-  bn.wsrr   w20, 0x2 /* URND */
+  bn.wsrr   w20, URND
   bn.xor    w17, w17, w20
-  bn.wsrr   w20, 0x2 /* URND */
-  bn.xor    w18, w18, w20
 
   /* Shift bits to get 320-bit seeds.
      w18 <= w16[255:192]
@@ -2045,7 +1485,7 @@ p256_random_scalar:
 
   /* Generate a random masking parameter.
      w14 <= URND(127) + 1 = x */
-  bn.wsrr   w14, 0x2 /* URND */
+  bn.wsrr   w14, URND
   bn.addi   w14, w14, 1
 
   /* w12 <= ([w15,w16] * w14) mod n = (seed0 * x) mod n */
@@ -2071,7 +1511,7 @@ p256_random_scalar:
 
   /* Read the FG0.Z flag (position 3).
      x2 <= 8 if FG0.Z else 0 */
-  csrrw     x2, 0x7c0, x0
+  csrrw     x2, FG0, x0
   andi      x2, x2, 8
 
   /* Retry if x2 != 0. */
@@ -2187,13 +1627,20 @@ p256_generate_k:
  *
  * This routine runs in constant time.
  *
+ * We are aware that the MSB of the intermediate values here may leak one bit
+ * of the secret seed. We observed this with a formal masking analysis tool and
+ * in FPGA experiments. The algorithm runs with 64 bits of excess randomness, so
+ * we do not expect it to be possible to use that leakage to retrieve secret
+ * values. We also verified that the leakage disappeared when running the
+ * routine on 320 bits instead of 321 bits.
+ *
  * Flags: Flags have no meaning beyond the scope of this subroutine.
  *
  * @param[in]  [w21, w20]: s0, first share of seed (320 bits)
- * @param[in]  [w23, w22]: s1, second share of seed (320 bits)
+ * @param[in]  [w11, w10]: s1, second share of seed (320 bits)
  * @param[in]         w31: all-zero
  * @param[out] [w21, w20]: result x0 (321 bits)
- * @param[out] [w23, w22]: result x1 (320 bits)
+ * @param[out] [w11, w10]: result x1 (320 bits)
  *
  * clobbered registers: w1 to w5, w20 to w23
  * clobbered flag groups: FG0
@@ -2201,16 +1648,17 @@ p256_generate_k:
 boolean_to_arithmetic:
   /* Mask out excess bits from seed shares.
        [w21, w20] <= s0 mod 2^320
-       [w23, w22] <= s1 mod 2^320 = x1 */
+       [w11, w10] <= s1 mod 2^320 = x1 */
   bn.rshi   w21, w21, w31 >> 64
   bn.rshi   w21, w31, w21 >> 192
-  bn.rshi   w23, w23, w31 >> 64
-  bn.rshi   w23, w31, w23 >> 192
+  bn.rshi   w31, w31, w31 >> 192 # dummy instruction to flush ALU datapath
+  bn.rshi   w11, w11, w31 >> 64
+  bn.rshi   w11, w31, w11 >> 192
 
   /* Fetch 321 bits of randomness from URND.
        [w2, w1] <= gamma */
-  bn.wsrr   w1, 2
-  bn.wsrr   w2, 2
+  bn.wsrr   w1, URND
+  bn.wsrr   w2, URND
   bn.rshi   w2, w31, w2 >> 191
 
   /* [w4, w3] <= [w21, w20] ^ [w2, w1] = s0 ^ gamma */
@@ -2222,6 +1670,7 @@ boolean_to_arithmetic:
        [w4, w3] <= [w4, w3] - [w2, w1] = ((s0 ^ gamma) - gamma) mod 2^512 */
   bn.sub    w3, w3, w1
   bn.subb   w4, w4, w2
+  bn.sub    w31, w31, w31 # dummy instruction to clear flags
 
   /* Truncate subtraction result to 321 bits.
        [w4, w3] <= [w4, w3] mod 2^321 = T */
@@ -2232,9 +1681,9 @@ boolean_to_arithmetic:
   bn.xor    w3, w3, w20
   bn.xor    w4, w4, w21
 
-  /* [w2, w1] <= [w2, w1] ^ [w23, w22] = gamma ^ s1 = G */
-  bn.xor    w1, w1, w22
-  bn.xor    w2, w2, w23
+  /* [w2, w1] <= [w2, w1] ^ [w11, w10] = gamma ^ s1 = G */
+  bn.xor    w1, w1, w10
+  bn.xor    w2, w2, w11
 
   /* [w21, w20] <= [w21, w20] ^ [w2, w1] = s0 ^ G */
   bn.xor    w20, w20, w1
@@ -2243,15 +1692,26 @@ boolean_to_arithmetic:
   /* [w21, w20] <= [w21, w20] - [w2, w1] = ((s0 ^ G) - G) mod 2^512 */
   bn.sub    w20, w20, w1
   bn.subb   w21, w21, w2
+  bn.sub    w31, w31, w31 # dummy instruction to clear flags
 
   /* [w21, w20] <= [w21, w20] mod 2^321 = A */
   bn.rshi   w21, w21, w31 >> 65
   bn.rshi   w21, w31, w21 >> 191
 
+  /* apply fresh mask to w20 and w21 before xoring with w3 and w4 */
+  bn.wsrr   w28, RND
+  bn.wsrr   w29, RND
+  bn.xor    w20, w28, w20
+  bn.xor    w21, w29, w21
+
   /* [w21, w20] <= [w21, w20] ^ [w4, w3] = A ^ T2 = x0 */
   bn.xor    w20, w20, w3
   bn.xor    w21, w21, w4
 
+  /* remove fresh mask */
+  bn.xor    w20, w28, w20
+  bn.xor    w21, w29, w21
+
   ret
 
 /**
@@ -2290,10 +1750,10 @@ boolean_to_arithmetic:
  * Flags: Flags have no meaning beyond the scope of this subroutine.
  *
  * @param[in]  [w21, w20]: seed0, first share of seed (320 bits)
- * @param[in]  [w23, w22]: seed1, second share of seed (320 bits)
+ * @param[in]  [w11, w10]: seed1, second share of seed (320 bits)
  * @param[in]         w31: all-zero
  * @param[out] [w21, w20]: d0, first share of private key d (320 bits)
- * @param[out] [w23, w22]: d1, second share of private key d (320 bits)
+ * @param[out] [w11, w10]: d1, second share of private key d (320 bits)
  *
  * clobbered registers: x2, x3, w1 to w4, w20 to w29
  * clobbered flag groups: FG0
@@ -2305,7 +1765,7 @@ p256_key_from_seed:
 
   /* At this point, we have arithmetic shares modulo 2^321:
        [w21, w20] : x0
-       [w23, w22] : x1
+       [w11, w10] : x1
 
      We know that x1=seed1, and seed and x1 are at most 320 bits. Therefore,
      the highest bit of x0 holds a carry bit modulo 2^320:
@@ -2343,15 +1803,15 @@ p256_key_from_seed:
   bn.rshi   w29, w31, w29 >> 192
 
   /* [w25,w24] <= (x1 - (n << 64)) mod 2^512 */
-  bn.sub    w24, w22, w28
-  bn.subb   w25, w23, w29
+  bn.sub    w24, w10, w28
+  bn.subb   w25, w11, w29
 
   /* Compute d1. Because 2^320 < 2 * (n << 64), a conditional subtraction is
      sufficient to reduce. Similarly to the carry bit, the conditional bit here
      is not very sensitive because the shares are large relative to n.
-       [w23,w22] <= x1 mod (n << 64) = d1 */
-  bn.sel    w22, w22, w24, FG0.C
-  bn.sel    w23, w23, w25, FG0.C
+       [w11,w10] <= x1 mod (n << 64) = d1 */
+  bn.sel    w10, w10, w24, FG0.C
+  bn.sel    w11, w11, w25, FG0.C
 
   /* Isolate the carry bit and shift it back into position.
        w25 <= x0[320] << 64 */
@@ -2418,18 +1878,18 @@ p256_p:
   .word 0x00000001
   .word 0xffffffff
 
-/* Barrett constant u for modulus p */
-.globl p256_u_p
+/* Constant ((2^448) mod p) for reduction modulo p. */
+.globl p256_r448
 .balign 32
-p256_u_p:
-  .word 0x00000003
-  .word 0x00000000
+p256_r448:
   .word 0xffffffff
   .word 0xfffffffe
   .word 0xfffffffe
-  .word 0xfffffffe
   .word 0xffffffff
   .word 0x00000000
+  .word 0x00000002
+  .word 0x00000003
+  .word 0x00000000
 
 /* P-256 domain parameter n (order of base point) */
 .globl p256_n
diff --git a/sw/otbn/crypto/p256_ecdh.s b/sw/otbn/crypto/p256_ecdh.s
index 57fff9fdcc0b9..01489fd39a46a 100644
--- a/sw/otbn/crypto/p256_ecdh.s
+++ b/sw/otbn/crypto/p256_ecdh.s
@@ -106,36 +106,15 @@ keypair_random:
  * @param[out]  dmem[y]: x1, second share of shared key.
  */
 shared_key:
-  /* Generate shared key d*Q.
-       dmem[x] <= (d*Q).x
-       dmem[y] <= (d*Q).y */
-  jal      x1, p256_scalar_mult
-
-  /* TODO: `p256_scalar_mult` and the code below briefly handle the shared key
-     in unmasked form. The best way to fixing this is likely:
-       - modify scalar_mult_int to return projective coordinates
-       - get additive arithmetic mask for x before converting it to affine
-       - multiply both shares by Z^-1 to convert to affine form
-       - run a safe arithmetic-to-boolean conversion algorithm
- */
-
-  /* Fetch a fresh random number for blinding.
-       w2 <= URND() */
-  bn.wsrr   w2, 0x2 /* URND */
+  /* Validate the public key. Halts the program if the key is invalid and jumps
+     back here if it's OK. */
+  jal      x0, check_public_key_valid
+  _pk_valid:
 
-  /* Store the random number as the second share.
-       dmem[y] <= w2 */
-  li        x2, 2
-  la        x4, y
-  bn.sid    x2, 0(x4)
-
-  /* Blind the x-coordinate.
-       dmem[x] <= dmem[x] ^ w2 */
-  li        x3, 3
-  la        x4, x
-  bn.lid    x3, 0(x4)
-  bn.xor    w3, w3, w2
-  bn.sid    x3, 0(x4)
+  /* Generate boolean-masked shared key (d*Q).x.
+       dmem[x] <= x0
+       dmem[y] <= x1 */
+  jal      x1, p256_shared_key
 
   ecall
 
@@ -212,15 +191,15 @@ shared_key_from_seed:
 secret_key_from_seed:
   /* Load keymgr seeds from WSRs.
        w20,w21 <= seed0
-       w22,w23 <= seed1 */
-  bn.wsrr  w20, 0x4 /* KEY_S0_L */
-  bn.wsrr  w21, 0x5 /* KEY_S0_H */
-  bn.wsrr  w22, 0x6 /* KEY_S1_L */
-  bn.wsrr  w23, 0x7 /* KEY_S1_H */
+       w10,w11 <= seed1 */
+  bn.wsrr  w20, KEY_S0_L
+  bn.wsrr  w21, KEY_S0_H
+  bn.wsrr  w10, KEY_S1_L
+  bn.wsrr  w11, KEY_S1_H
 
   /* Generate secret key shares.
        w20, w21 <= d0
-       w22, w23 <= d1 */
+       w10, w11 <= d1 */
   jal      x1, p256_key_from_seed
 
   /* Store secret key shares.
@@ -230,12 +209,95 @@ secret_key_from_seed:
   la       x3, d0
   bn.sid   x2++, 0(x3)
   bn.sid   x2++, 32(x3)
-  la       x3, d0
+  li       x2, 10
+  la       x3, d1
   bn.sid   x2++, 0(x3)
   bn.sid   x2, 32(x3)
 
   ret
 
+/**
+ * Check if a provided public key is valid.
+ *
+ * For a given public key (x, y), check that:
+ * - x and y are both fully reduced mod p
+ * - (x, y) is on the P-256 curve.
+ *
+ * Note that, because the point is in affine form, it is not possible that (x,
+ * y) is the point at infinity. In some other forms such as projective
+ * coordinates, we would need to check for this also.
+ *
+ * This routine raises a software error and halts operation if the public key
+ * is invalid.
+ *
+ * @param[in] dmem[x]: Public key x-coordinate.
+ * @param[in] dmem[y]: Public key y-coordinate.
+ */
+check_public_key_valid:
+  /* Init all-zero register. */
+  bn.xor   w31, w31, w31
+
+  /* Load domain parameter p.
+       w29 <= dmem[p256_p] = p */
+  li        x2, 29
+  la        x3, p256_p
+  bn.lid    x2, 0(x3)
+
+  /* Load public key x-coordinate.
+       w2 <= dmem[x] = x */
+  li        x2, 2
+  la        x3, x
+  bn.lid    x2, 0(x3)
+
+  /* Compare x to p.
+       FG0.C <= (x < p) */
+  bn.cmp    w2, w29
+
+  /* Trigger a fault if FG0.C is false. */
+  csrrs     x2, FG0, x0
+  andi      x2, x2, 1
+  bne       x2, x0, _x_valid
+  unimp
+
+  _x_valid:
+
+  /* Load public key y-coordinate.
+       w2 <= dmem[y] = y */
+  li        x2, 2
+  la        x3, y
+  bn.lid    x2, 0(x3)
+
+  /* Compare y to p.
+       FG0.C <= (y < p) */
+  bn.cmp    w2, w29
+
+  /* Trigger a fault if FG0.C is false. */
+  csrrs     x2, FG0, x0
+  andi      x2, x2, 1
+  bne       x2, x0, _y_valid
+  unimp
+
+  _y_valid:
+
+  /* Compute both sides of the Weierstrass equation.
+       w18 <= (x^3 + ax + b) mod p
+       w19 <= (y^2) mod p */
+  jal      x1, p256_isoncurve
+
+  /* Compare the two sides of the equation.
+       FG0.Z <= (y^2) mod p == (x^3 + ax + b) mod p */
+  bn.cmp    w18, w19
+
+  /* Trigger a fault if FG0.Z is false; otherwise jump back to the single call
+     site. */
+  csrrs     x2, FG0, x0
+  srli      x2, x2, 3
+  andi      x2, x2, 1
+  bne       x2, x0, _pk_valid
+  unimp
+  unimp
+  unimp
+
 .bss
 
 /* Operational mode. */
@@ -256,20 +318,14 @@ x:
 y:
   .zero 32
 
-/* Secret key (d) in two shares: d = (d0 + d1) mod n.
-
-   Note: This is also labeled k0, k1 because the `p256_scalar_mult` algorithm
-   is also used for ECDSA signing and reads from those labels; in the case of
-   ECDH, the scalar in `p256_scalar_mult` is always the private key (d). */
+/* Secret key (d) in two shares: d = (d0 + d1) mod n. */
 .globl d0
-.globl k0
 .balign 32
 d0:
 k0:
   .zero 64
 
 .globl d1
-.globl k1
 .balign 32
 d1:
 k1:
diff --git a/sw/otbn/crypto/p256_ecdsa.s b/sw/otbn/crypto/p256_ecdsa.s
index a0fb63364c0e9..7fd02dfdf3cf4 100644
--- a/sw/otbn/crypto/p256_ecdsa.s
+++ b/sw/otbn/crypto/p256_ecdsa.s
@@ -111,8 +111,9 @@ ecdsa_sign:
  * @param[out] dmem[x_r]: dmem buffer for reduced affine x_r-coordinate (x_1)
  */
 ecdsa_verify:
-  /* Validate the public key. */
-  jal      x1, check_public_key_valid
+  /* Validate the public key (jumps back here if successful). */
+  jal      x0, check_public_key_valid
+  _pk_valid:
 
   /* Verify the signature (compute x_r). */
   jal      x1, p256_verify
@@ -165,18 +166,18 @@ sideload_ecdsa_sign:
 secret_key_from_seed:
   /* Load keymgr seeds from WSRs.
        w20,w21 <= seed0
-       w22,w23 <= seed1 */
-  bn.wsrr  w20, 4 /*KEY_S0_L*/
-  bn.wsrr  w21, 5 /*KEY_S0_H*/
-  bn.wsrr  w22, 6 /*KEY_S1_L*/
-  bn.wsrr  w23, 7 /*KEY_S1_H*/
+       w10,w11 <= seed1 */
+  bn.wsrr  w20, KEY_S0_L
+  bn.wsrr  w21, KEY_S0_H
+  bn.wsrr  w10, KEY_S1_L
+  bn.wsrr  w11, KEY_S1_H
 
   /* Init all-zero register. */
   bn.xor   w31, w31, w31
 
   /* Generate secret key shares.
        w20, w21 <= d0
-       w22, w23 <= d1 */
+       w10, w11 <= d1 */
   jal      x1, p256_key_from_seed
 
   /* Store secret key shares.
@@ -186,7 +187,8 @@ secret_key_from_seed:
   la       x3, d0
   bn.sid   x2++, 0(x3)
   bn.sid   x2++, 32(x3)
-  la       x3, d0
+  li       x2, 10
+  la       x3, d1
   bn.sid   x2++, 0(x3)
   bn.sid   x2, 32(x3)
 
@@ -230,7 +232,7 @@ check_public_key_valid:
   bn.cmp    w2, w29
 
   /* Trigger a fault if FG0.C is false. */
-  csrrs     x2, 0x7c0, x0
+  csrrs     x2, FG0, x0
   andi      x2, x2, 1
   bne       x2, x0, _x_valid
   unimp
@@ -248,59 +250,31 @@ check_public_key_valid:
   bn.cmp    w2, w29
 
   /* Trigger a fault if FG0.C is false. */
-  csrrs     x2, 0x7c0, x0
+  csrrs     x2, FG0, x0
   andi      x2, x2, 1
   bne       x2, x0, _y_valid
   unimp
 
   _y_valid:
 
-  /* Save the signature values to registers.
-       w4 <= dmem[r]
-       w5 <= dmem[s] */
-  li        x2, 4
-  la        x3, r
-  bn.lid    x2++, 0(x3)
-  la        x3, s
-  bn.lid    x2, 0(x3)
-
   /* Compute both sides of the Weierstrauss equation.
-       dmem[r] <= (x^3 + ax + b) mod p
-       dmem[s] <= (y^2) mod p */
+       w18 <= (x^3 + ax + b) mod p
+       w19 <= (y^2) mod p */
   jal      x1, p256_isoncurve
 
-  /* Load both sides of the equation.
-       w2 <= dmem[r]
-       w3 <= dmem[s] */
-  li        x2, 2
-  la        x3, r
-  bn.lid    x2++, 0(x3)
-  la        x3, s
-  bn.lid    x2, 0(x3)
-
   /* Compare the two sides of the equation.
        FG0.Z <= (y^2) mod p == (x^2 + ax + b) mod p */
-  bn.cmp    w2, w3
+  bn.cmp    w18, w19
 
-  /* Trigger a fault if FG0.Z is false. */
-  csrrs     x2, 0x7c0, x0
+  /* Trigger a fault if FG0.Z is false; otherwise jump back to the single call
+     site. */
+  csrrs     x2, FG0, x0
   srli      x2, x2, 3
   andi      x2, x2, 1
   bne       x2, x0, _pk_valid
   unimp
-
-  _pk_valid:
-
-  /* Write back the saved signature values.
-       dmem[r] <= w4
-       dmem[s] <= w5 */
-  li        x2, 4
-  la        x3, r
-  bn.sid    x2++, 0(x3)
-  la        x3, s
-  bn.sid    x2, 0(x3)
-
-  ret
+  unimp
+  unimp
 
 .bss
 
diff --git a/sw/otbn/crypto/p256_isoncurve.s b/sw/otbn/crypto/p256_isoncurve.s
new file mode 100644
index 0000000000000..07d139bb5e404
--- /dev/null
+++ b/sw/otbn/crypto/p256_isoncurve.s
@@ -0,0 +1,86 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+.globl p256_isoncurve
+
+/**
+ * Checks if a point is a valid curve point on curve P-256 (secp256r1)
+ *
+ * Returns rhs = x^3 + ax + b  mod p
+ *     and lhs = y^2  mod p
+ *         with x,y being the affine coordinates of the curve point
+ *              a, b and p being the domain parameters of P-256
+ *
+ * This routine checks if a point with given x- and y-coordinate is a valid
+ * curve point on P-256.
+ * The routine checks whether the coordinates are a solution of the
+ * Weierstrass equation y^2 = x^3 + ax + b  mod p.
+ * The routine makes use of the property that the domain parameter 'a' can be
+ * written as a=-3 for the P-256 curve, hence the routine is limited to P-256.
+ * The routine does not return a boolean result but computes the left side
+ * and the right side of the Weierstrass equation and leaves the final
+ * comparison to the caller.
+ * The routine runs in constant time.
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * @param[in]      w31: all-zero
+ * @param[in]  dmem[x]: affine x-coordinate of input point
+ * @param[in]  dmem[y]: affine y-coordinate of input point
+ * @param[out]     w18: rhs, right side of equation = (x^3 + ax + b) mod p
+ * @param[out]     w19: lhs, left side of equation = y^2 mod p
+ *
+ * clobbered registers: x2, x3, x19, x20, w0, w18 to w29
+ * clobbered flag groups: FG0
+ */
+p256_isoncurve:
+  /* Set up for coordinate arithmetic.
+       MOD <= p
+       w28 <= r256
+       w29 <= r448 */
+  jal       x1, setup_modp
+
+  /* load domain parameter b from dmem
+     w27 <= b = dmem[p256_b] */
+  li        x2, 27
+  la        x3, p256_b
+  bn.lid    x2, 0(x3)
+
+  /* load affine x-coordinate of curve point from dmem
+     w26 <= dmem[x] */
+  la        x3, x
+  li        x2, 26
+  bn.lid    x2, 0(x3)
+
+  /* w19 <= x^2 = w26*w26 */
+  bn.mov    w25, w26
+  bn.mov    w24, w26
+  jal       x1, mul_modp
+
+  /* w19 = x^3 <= x^2 * x = w25*w24 = w26*w19 */
+  bn.mov    w25, w19
+  bn.mov    w24, w26
+  jal       x1, mul_modp
+
+  /* for curve P-256, 'a' can be written as a = -3, therefore we subtract
+     x three times from x^3.
+     w19 = x^3 + ax <= x^3 - 3x  mod p */
+  bn.subm   w19, w19, w26
+  bn.subm   w19, w19, w26
+  bn.subm   w19, w19, w26
+
+  /* w18 <= x^3 + ax + b mod p = w19 + w27 mod p = rhs */
+  bn.addm   w18, w19, w27
+
+  /* Load affine y-coordinate of curve point from dmem
+     w24 <= dmem[y] */
+  la        x3, y
+  li        x2, 24
+  bn.lid    x2, 0(x3)
+
+  /* w19 <= w24*w24 mod p = y^2 mod p = lhs */
+  bn.mov    w25, w24
+  jal       x1, mul_modp
+
+  ret
diff --git a/sw/otbn/crypto/p256_key_from_seed_sca.s b/sw/otbn/crypto/p256_key_from_seed_sca.s
index 5a6b7d04538d6..5429b75981d61 100644
--- a/sw/otbn/crypto/p256_key_from_seed_sca.s
+++ b/sw/otbn/crypto/p256_key_from_seed_sca.s
@@ -53,33 +53,35 @@ run_gen_secret_key:
 
   /* Load shares of seed from DMEM.
        [w21,w20] <= dmem[seed0]
-       [w23,w33] <= dmem[seed1] */
+       [w11,w10] <= dmem[seed1] */
   li        x2, 20
   la        x3, seed0
   bn.lid    x2, 0(x3++)
   li        x2, 21
-  bn.lid    x2++, 0(x3)
+  bn.lid    x2, 0(x3)
+  li        x2, 10
   la        x3, seed1
   bn.lid    x2, 0(x3++)
-  li        x2, 23
+  li        x2, 11
   bn.lid    x2, 0(x3)
 
   /* Generate the derived secret key.
        [w21,w20] <= d0
-       [w23,w33] <= d1 */
+       [w11,w10] <= d1 */
   jal       x1, p256_key_from_seed
 
   /* Write the results to DMEM.
        dmem[d0] <= [w21, w20]
-       dmem[d1] <= [w23, w22] */
+       dmem[d1] <= [w11, w10] */
   li        x2, 20
   la        x3, d0
   bn.sid    x2, 0(x3++)
   li        x2, 21
-  bn.sid    x2++, 0(x3)
+  bn.sid    x2, 0(x3)
+  li        x2, 10
   la        x3, d1
   bn.sid    x2, 0(x3++)
-  li        x2, 23
+  li        x2, 11
   bn.sid    x2, 0(x3)
 
   ret
diff --git a/sw/otbn/crypto/p256_mod_inv_sca.s b/sw/otbn/crypto/p256_mod_inv_sca.s
index ac8464dca7744..5b7c3d53b0fcf 100644
--- a/sw/otbn/crypto/p256_mod_inv_sca.s
+++ b/sw/otbn/crypto/p256_mod_inv_sca.s
@@ -25,7 +25,7 @@ main:
   li        x2, 29
   la        x3, p256_n
   bn.lid    x2, 0(x3)
-  bn.wsrw   0, w29
+  bn.wsrw   MOD, w29
 
   /* Load first share of input.
        w0, w1 <= dmem[k0] */
@@ -48,7 +48,7 @@ main:
 
   /* Generate a random 127-bit number.
        w4 <= URND()[255:129] */
-  bn.wsrr  w4, 0x2 /* URND */
+  bn.wsrr  w4, URND
   bn.rshi  w4, w31, w4 >> 129
 
   /* Add 1 to get a 128-bit nonzero scalar for masking.
diff --git a/sw/otbn/crypto/p256_shared_key.s b/sw/otbn/crypto/p256_shared_key.s
new file mode 100644
index 0000000000000..122f4926a2473
--- /dev/null
+++ b/sw/otbn/crypto/p256_shared_key.s
@@ -0,0 +1,358 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/* Public interface. */
+.globl p256_shared_key
+
+/* Exposed only for testing or SCA purposes. */
+.globl arithmetic_to_boolean_mod
+.globl arithmetic_to_boolean
+
+.text
+
+/**
+ * Externally callable wrapper for P-256 scalar point multiplication.
+ *
+ * Returns x0, x1 such that x0 ^ x1 = x-coordinate of (d * P).
+ *
+ * This routine is specialized for ECDH shared key generation and includes an
+ * arithmetic-to-boolean masking conversion.
+ *
+ * This routine assumes that the scalar d is provided in two arithmetic shares,
+ * d0 and d1, where d = (d0 + d1) mod n.
+ *
+ * This routine runs in constant time.
+ *
+ * @param[in]      dmem[d0]:  first share of scalar d (320 bits)
+ * @param[in]      dmem[d1]:  second share of scalar d (320 bits)
+ * @param[in]      dmem[x]:   affine x-coordinate in dmem
+ * @param[in]      dmem[y]:   affine y-coordinate in dmem
+ * @param[out]     dmem[x]:   x0, first share of x-coordinate in dmem
+ * @param[out]     dmem[y]:   x1, second share of x-coordinate in dmem
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine; FG0 is
+ *        overwritten by the final arithmetic_to_boolean_mod call.
+ *
+ * clobbered registers: x2 to x4, x16, x17, x21, x22, w0 to w27, w29
+ * clobbered flag groups: FG0
+ */
+p256_shared_key:
+  /* Init all-zero register. */
+  bn.xor    w31, w31, w31
+
+  /* Load first share of secret key d from dmem.
+       w0,w1 = dmem[d0] */
+  la        x16, d0
+  li        x2, 0
+  bn.lid    x2, 0(x16++)
+  li        x2, 1
+  bn.lid    x2, 0(x16)
+
+  /* Load second share of secret key d from dmem.
+       w2,w3 = dmem[d1] */
+  la        x16, d1
+  li        x2, 2
+  bn.lid    x2, 0(x16++)
+  li        x2, 3
+  bn.lid    x2, 0(x16)
+
+  /* Call internal scalar multiplication routine.
+     Returns point in projective coordinates.
+     R = (x, y, z) = (w8, w9, w10) <= d*P, with d in shares (w0,w1), (w2,w3) */
+  la        x21, x
+  la        x22, y
+  jal       x1, scalar_mult_int
+
+  /* Arithmetic masking:
+   1. Generate a random mask
+   2. Subtract mask from projective x coordinate
+      (x, y, z) -> ((x - m) mod p,
+                     y,
+                     z)
+   3. Convert masked curve point back to affine
+      form.
+   4. Multiply mask with z^-1 for use in
+      affine space. */
+
+  /* Fetch a fresh random number as mask.
+       w2 <= URND() */
+  bn.wsrr   w2, URND
+
+  /* Subtract random mask from x coordinate of
+     projective point.
+     The subtraction has to be done within the underlying
+     finite field -> mod p.
+     w8 = (w8 - w2) mod p */
+  bn.subm    w8, w8, w2
+
+  /* Convert masked result back to affine coordinates.
+     R = (x_a, y_a) = (w11, w12) */
+  jal       x1, proj_to_affine
+
+  /* Store result (masked affine x-coordinate) in DMEM.
+     Y-coordinate not needed, will be overwritten with
+     mask value below.
+     dmem[x] <= x_a = w11 */
+  li        x2, 11
+  bn.sid    x2, 0(x21)
+
+  /* Get modular inverse z^-1 of projective z coordinate
+     and multiply the random mask with z^-1 to
+     also convert it into affine space. */
+
+  /* Move z^-1 and x coordinate mask to mul_modp input WDRs.
+     z^-1 is still stored in w14 from previous
+     proj_to_affine call.
+     w25 <= w14 = z^-1
+     w24 <= w2 = m_x */
+  bn.mov    w25, w14
+  bn.mov    w24, w2
+
+  /* Compute modular multiplication of m_x and z^-1.
+     w19 = w24 * w25 mod p = m_x * z^-1 mod p = x1 */
+  jal       x1, mul_modp
+
+  /* Store "affine" mask to DMEM. Use the y-coordinate
+     to save memory (not needed afterwards)
+     dmem[y] <= w19 = x1 */
+  li        x2, 19
+  bn.sid    x2, 0(x22)
+
+  /* Arithmetic-to-boolean conversion (inputs: A = w11, r = w19).
+       w20 <= x ^ x1 = x0 */
+  jal       x1, arithmetic_to_boolean_mod
+
+  /* dmem[x] <= w20 = x0 (address of x is re-materialized in x4) */
+  li        x3, 20
+  la        x4, x
+  bn.sid    x3, 0(x4)
+
+  ret
+
+/**
+ * Converts arithmetic shares mod p to boolean shares.
+ *
+ * Calls the 257-bit A2B function twice, first using unmodified 256-bit shares
+ * in reduced form, and then using modified 257-bit shares in unreduced form.
+ *
+ * It then checks if the MSB (carry bit) is true or false, to decide
+ * which of the two A2B results is used. This detects and handles an
+ * underflow during the subtraction of arithmetic masking.
+ *
+ * The logic behind the carry bit handling is as follows:
+ * If x >= r, then  A = (x - r) mod p = x - r exactly.
+ * So when we add 2^257 - p and then add A and r, we get
+ * (2^257 - p + x - r + r) mod 2^257 = 2^257 - p + x.
+ * In this case, the high bit is always true since p - x <= p < 2^256,
+ * so we choose the A2B conversion without the 2^257 - p added.
+ * On the other hand, if x < r, then A = (x - r) mod p = x - r + p.
+ * When we add 2^257 - p and then add A and r, we get
+ * (2^257 - p + x - r + p + r) mod 2^257 = (2^257 + x) mod 2^257 = x.
+ * In this case, the high bit is always false since x < p < 2^256, so we
+ * choose this second A2B conversion.
+ *
+ * This routine runs in constant time.
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * @param[in]  w31: all-zero wide data register
+ * @param[in]  w19: mask r
+ * @param[in]  w11: arithmetically masked value A, such that x = A + r
+ * @param[out] w20: boolean masked value x', such that x = x' ^ r
+ *
+ * clobbered registers: w1 to w6, w11, w12, w18, w20 to w27, and w29
+ * clobbered flag groups: FG0
+ */
+arithmetic_to_boolean_mod:
+  /* First step: calculate A2B from reduced values. */
+
+  /* Save inputs for second A2B execution.
+     w24 <= w19 = r
+     w25 <= w11 = A */
+  bn.mov    w24, w19
+  bn.mov    w25, w11
+
+  /* Expand inputs r and A (w19 and w11) to 257-bit values [w19,w18]
+     and [w12,w11] and prepare input for 257-bit A2B function.
+     w18 <= w19
+     w19 <= w31
+     w11 <= w11 -> already in place, no move needed
+     w12 <= w31 */
+  bn.mov    w18, w19
+  bn.mov    w19, w31
+  bn.mov    w12, w31
+
+  /* Call 257-bit A2B function.
+     [w21,w20] <= x' */
+  jal       x1, arithmetic_to_boolean
+
+  /* Save intermediate result of reduced inputs.
+     w26 <= w20 = x' (lower part)
+     w27 <= w21 = x' (upper part) */
+  bn.mov    w26, w20
+  bn.mov    w27, w21
+
+  /* Second step: calculate A2B from unreduced values. */
+
+  /* Restore and expand inputs r and A (w19 and w11) to 257-bit
+     values [w19,w18] and [w12,w11] and prepare input for
+     257-bit A2B function.
+     w18 <= w24
+     w19 <= w31
+     w11 <= w25
+     w12 <= w31 */
+  bn.mov    w18, w24
+  bn.mov    w19, w31
+  bn.mov    w11, w25
+  bn.mov    w12, w31
+
+  /* Get field modulus p (the caller must have MOD set to p).
+     w29 <= MOD() */
+  bn.wsrr   w29, MOD
+
+  /* Convert input A ([w12,w11]) to an unreduced value
+     in the 2^257 domain. For this add (2^257 - p) to A.
+     [w12,w11] <= [w12,w11] + 2^257 - w29 = A + 2^257 - p
+     w12 <= w12 + 0x2 = A + 2^257
+            -> equal to addition of 2^257
+               (w11 doesn't need to be touched)
+     [w12,w11] <= [w12,w11] - w29 = (A + 2^257) - p */
+  bn.addi   w12, w12, 0x2
+  bn.sub    w11, w11, w29
+  bn.subb   w12, w12, w31
+
+  /* Call 257-bit A2B function.
+     [w21,w20] <= x' */
+  jal       x1, arithmetic_to_boolean
+
+  /* Restore initial mask input of w19 for consistency
+     in calling functions.
+     w19 <= w24 */
+  bn.mov    w19, w24
+
+  /* Check MSB (carry bit) of second A2B result for true or false. */
+  bn.cmp    w21, w31 /* w21 can only be 0x1 or 0x0 */
+
+  /* Return the unreduced A2B computation (second result),
+     if zero flag is set, otherwise return the reduced
+     A2B computation (first result). */
+  bn.sel    w20, w20, w26, FG0.Z
+
+  ret
+
+/**
+ * Convert arithmetic shares to boolean ones using Goubin's algorithm.
+ *
+ * We use Goubin's arithmetic-to-boolean masking algorithm to switch from
+ * an arithmetic masking scheme to a boolean one without ever unmasking the
+ * shared value. See Algorithm 2 here:
+ * https://link.springer.com/content/pdf/10.1007/3-540-44709-1_2.pdf
+ *
+ * This implementation expands the algorithm to 257 bits for carry bit
+ * handling. The carry bit can be used to detect and handle an
+ * underflow during the subtraction of arithmetic masking.
+ *
+ * This routine runs in constant time.
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * @param[in]  w31: all-zero wide data register
+ * @param[in]  w18: lower part of mask r
+ * @param[in]  w19: upper part of mask r
+ * @param[in]  w11: lower part of arithmetically masked value A,
+ *                  such that x = A + r
+ * @param[in]  w12: upper part of arithmetically masked value A,
+ *                  such that x = A + r
+ * @param[out] w20: lower part of boolean masked value x',
+ *                  such that x = x' ^ r
+ * @param[out] w21: upper part of boolean masked value x',
+ *                  such that x = x' ^ r
+ *
+ * clobbered registers: w1 - w6, w11, w12, and w18 - w21
+ * clobbered flag groups: FG0
+ */
+arithmetic_to_boolean:
+  /* Initialize inputs: in case of randomness in upper part of inputs
+     truncate to 257 bits (only bit 0 of w19 and of w12 is kept). */
+  bn.rshi   w19, w19, w31 >> 1
+  bn.rshi   w19, w31, w19 >> 255
+  bn.rshi   w12, w12, w31 >> 1
+  bn.rshi   w12, w31, w12 >> 255
+
+  /* Fetch 257 bits of randomness.
+     [w2,w1] = gamma    <= URND */
+  bn.wsrr   w1, URND
+  bn.wsrr   w2, URND
+  bn.rshi   w2, w31, w2 >> 255
+
+  /* Double gamma and truncate to 257 bits.
+     [w4,w3] = T        <= 2 * [w2,w1] = 2 * gamma */
+  bn.add    w3, w1, w1
+  bn.addc   w4, w2, w2
+  bn.rshi   w4, w4, w31 >> 1
+  bn.rshi   w4, w31, w4 >> 255
+
+  /* [w21,w20] = x'     <= [w2,w1] ^ [w19,w18] = gamma ^ r */
+  bn.xor    w20, w1, w18
+  bn.xor    w21, w2, w19
+
+  /* [w6,w5] = omega    <= [w2,w1] & [w21,w20] = gamma & x' */
+  bn.and    w5, w1, w20
+  bn.and    w6, w2, w21
+
+  /* [w21,w20] = x'     <= [w4,w3] ^ [w12,w11] = T ^ A */
+  bn.xor    w20, w3, w11
+  bn.xor    w21, w4, w12
+
+  /* [w2,w1] = gamma    <= [w2,w1] ^ [w21,w20] = gamma ^ x' */
+  bn.xor    w1, w1, w20
+  bn.xor    w2, w2, w21
+
+  /* [w2,w1] = gamma    <= [w2,w1] & [w19,w18] = gamma & r */
+  bn.and    w1, w1, w18
+  bn.and    w2, w2, w19
+
+  /* [w6,w5] = omega    <= [w6,w5] ^ [w2,w1] = omega ^ gamma */
+  bn.xor    w5, w5, w1
+  bn.xor    w6, w6, w2
+
+  /* [w2,w1] = gamma    <= [w4,w3] & [w12,w11] = T & A */
+  bn.and    w1, w3, w11
+  bn.and    w2, w4, w12
+
+  /* [w6,w5] = omega    <= [w6,w5] ^ [w2,w1] = omega ^ gamma */
+  bn.xor    w5, w5, w1
+  bn.xor    w6, w6, w2
+
+  /* Loop for k = 1 to K - 1 = 257 - 1 (256 iterations of 12 instructions) */
+  loopi     256, 12
+
+    /* [w2,w1] = gamma  <= [w4,w3] & [w19,w18] = T & r */
+    bn.and     w1, w3, w18
+    bn.and     w2, w4, w19
+
+    /* [w2,w1] = gamma  <= [w2,w1] ^ [w6,w5] = gamma ^ omega */
+    bn.xor     w1, w1, w5
+    bn.xor     w2, w2, w6
+
+    /* [w4,w3] = T      <= [w4,w3] & [w12,w11] = T & A */
+    bn.and     w3, w3, w11
+    bn.and     w4, w4, w12
+
+    /* [w2,w1] = gamma  <= [w2,w1] ^ [w4,w3] = gamma ^ T */
+    bn.xor     w1, w1, w3
+    bn.xor     w2, w2, w4
+
+    /* Double gamma and truncate to 257 bits.
+       [w4,w3] = T      <= 2 * [w2,w1] = 2 * gamma */
+    bn.add    w3, w1, w1
+    bn.addc   w4, w2, w2
+    bn.rshi   w4, w4, w31 >> 1
+    bn.rshi   w4, w31, w4 >> 255
+
+  /* [w21,w20] = x'     <= [w21,w20] ^ [w4,w3] = x' ^ T */
+  bn.xor    w20, w20, w3
+  bn.xor    w21, w21, w4
+
+  ret
diff --git a/sw/otbn/crypto/p256_sign.s b/sw/otbn/crypto/p256_sign.s
new file mode 100644
index 0000000000000..0adffc0d5b9b0
--- /dev/null
+++ b/sw/otbn/crypto/p256_sign.s
@@ -0,0 +1,289 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/* Copyright 2016 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE.dcrypto file.
+ *
+ * Derived from code in
+ * https://chromium.googlesource.com/chromiumos/platform/ec/+/refs/heads/cr50_stab/chip/g/dcrypto/dcrypto_p256.c
+ */
+
+.globl p256_sign
+
+.text
+
+ /**
+ * P-256 ECDSA signature generation
+ *
+ * returns the signature as the pair r, s with
+ *         r = x_1  mod n
+ *     and s = k^(-1)(msg + r*d)  mod n
+ *         with x_1 being the affine x-coordinate of the curve point k*G,
+ *                  where G is the curve's base point.
+ *              k being a supplied secret random number,
+ *              n being the order of the base point G of P-256,
+ *              msg being the msg to be signed,
+ *              d being the private key.
+ *
+ * This routine runs in constant time.
+ *
+ * Note: Some versions of the ECDSA spec suggest that msg must be reduced
+ * modulo n (e.g. RFC 6979, section 2.4). However, for this implementation, it
+ * is sufficient that msg < 2^256, because the message is multiplied with
+ * k^(-1) mod n, and our Barrett multiplication implementation accepts any
+ * operands a and b such that a * b < 2^256 * p and fully reduces the result.
+ *
+ * This routine assumes that the secret scalars d and k are provided in two
+ * shares each (d0/d1 and k0/k1 respectively), where
+ *   d = (d0 + d1) mod n
+ *   k = (k0 + k1) mod n
+ *
+ * Each share is 320 bits, which gives us 64 bits of extra redundancy modulo n
+ * (256 bits). This is a protection measure against side-channel attacks.
+ *
+ * For s = k^-1 * (r * d + msg), we compute a random nonzero masking scalar
+ * alpha, and compute s as:
+ *   s = ((k * alpha)^-1 * (r * (d * alpha) + alpha * msg)) mod n
+ *
+ * We choose alpha to be at most 128 bits, so the product with a 320b share
+ * fits in the same 512-bit modular reduction routine that we use for
+ * 256x256-bit multiplications. It should be safe to compute e.g. k * alpha =
+ * (k0 * alpha + k1 * alpha) mod n, because alpha has enough randomness to mask
+ * the true value of k.
+ *
+ * @param[in]  dmem[k0]:  first share of secret scalar (320 bits)
+ * @param[in]  dmem[k1]:  second share of secret scalar (320 bits)
+ * @param[in]  dmem[msg]: message to be signed (256 bits)
+ * @param[out] dmem[r]:   dmem buffer for r component of signature (256 bits)
+ * @param[out] dmem[s]:   dmem buffer for s component of signature (256 bits)
+ * @param[in]  dmem[d0]:  first share of private key d (320 bits)
+ * @param[in]  dmem[d1]:  second share of private key d (320 bits)
+ *
+ * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
+ *        the computed affine y-coordinate.
+ *
+ * clobbered registers: x2, x3, x16 to x23, w0 to w26, w28, w29
+ * clobbered flag groups: FG0
+ */
+p256_sign:
+
+  /* init all-zero register */
+  bn.xor    w31, w31, w31
+
+  /* load first share of secret scalar k from dmem: w0,w1 = dmem[k0] */
+  la        x16, k0
+  li        x2, 0
+  bn.lid    x2, 0(x16++)
+  li        x2, 1
+  bn.lid    x2, 0(x16)
+
+  /* load second share of secret scalar k from dmem: w2,w3 = dmem[k1] */
+  la        x16, k1
+  li        x2, 2
+  bn.lid    x2, 0(x16++)
+  li        x2, 3
+  bn.lid    x2, 0(x16)
+
+  /* setup modulus n (curve order) and Barrett constant
+     MOD <= w29 <= n = dmem[p256_n]; w28 <= u_n = dmem[p256_u_n]  */
+  li        x2, 29
+  la        x3, p256_n
+  bn.lid    x2, 0(x3)
+  bn.wsrw   MOD, w29
+  li        x2, 28
+  la        x3, p256_u_n
+  bn.lid    x2, 0(x3)
+
+  /* scalar multiplication with base point (projective)
+     (x_1, y_1, z_1) = (w8, w9, w10) <= k*G = w0*(dmem[p256_gx], dmem[p256_gy]) */
+  la        x21, p256_gx
+  la        x22, p256_gy
+  jal       x1, scalar_mult_int
+
+  /* Convert result back to affine coordinates.
+     R = (x_a, y_a) = (w11, w12) */
+  jal       x1, proj_to_affine
+
+  /* setup modulus n (curve order) and Barrett constant
+     MOD <= w29 <= n = dmem[p256_n]; w28 <= u_n = dmem[p256_u_n]  */
+  li        x2, 29
+  la        x3, p256_n
+  bn.lid    x2, 0(x3)
+  bn.wsrw   MOD, w29
+  li        x2, 28
+  la        x3, p256_u_n
+  bn.lid    x2, 0(x3)
+
+  /* re-load first share of secret scalar k from dmem: w0,w1 = dmem[k0] */
+  la        x16, k0
+  li        x2, 0
+  bn.lid    x2, 0(x16++)
+  li        x2, 1
+  bn.lid    x2, 0(x16)
+
+  /* re-load second share of secret scalar k from dmem: w2,w3 = dmem[k1] */
+  la        x16, k1
+  li        x2, 2
+  bn.lid    x2, 0(x16++)
+  li        x2, 3
+  bn.lid    x2, 0(x16)
+
+  /* Generate a random 127-bit number.
+       w4 <= URND()[255:129] */
+  bn.wsrr  w4, URND
+  bn.rshi  w4, w31, w4 >> 129
+
+  /* Add 1 to get a 128-bit nonzero scalar for masking.
+       w4 <= w4 + 1 = alpha */
+  bn.addi  w4, w4, 1
+
+  /* w0 <= ([w0,w1] * w4) mod n = (k0 * alpha) mod n */
+  bn.mov    w24, w0
+  bn.mov    w25, w1
+  bn.mov    w26, w4
+  jal       x1, mod_mul_320x128
+  bn.mov    w0, w19
+
+  /* w19 <= ([w2,w3] * w26) mod n = (k1 * alpha) mod n */
+  bn.mov    w24, w2
+  bn.mov    w25, w3
+  jal       x1, mod_mul_320x128
+
+  /* w0 <= (w0+w19) mod n = (k * alpha) mod n */
+  bn.addm   w0, w0, w19
+
+  /* w1 <= w0^-1 mod n = (k * alpha)^-1 mod n */
+  jal       x1, mod_inv
+
+  /* Load first share of secret key d from dmem.
+       w2,w3 = dmem[d0] */
+  la        x16, d0
+  li        x2, 2
+  bn.lid    x2, 0(x16++)
+  li        x2, 3
+  bn.lid    x2, 0(x16)
+
+  /* Load second share of secret key d from dmem.
+       w5,w6 = dmem[d1] */
+  la        x16, d1
+  li        x2, 5
+  bn.lid    x2, 0(x16++)
+  li        x2, 6
+  bn.lid    x2, 0(x16)
+
+  /* w0 <= ([w2,w3] * w4) mod n = (d0 * alpha) mod n */
+  bn.mov    w24, w2
+  bn.mov    w25, w3
+  bn.mov    w26, w4
+  jal       x1, mod_mul_320x128
+  bn.mov    w0, w19
+
+  /* w19 <= ([w5,w6] * w4) mod n = (d1 * alpha) mod n */
+  bn.mov    w24, w5
+  bn.mov    w25, w6
+  bn.mov    w26, w4
+  jal       x1, mod_mul_320x128
+
+  /* w0 <= (w0+w19) mod n = (d * alpha) mod n */
+  bn.addm   w0, w0, w19
+
+  /* Compare to 0.
+     FG0.Z <= (w0 =? w31) = ((d * alpha) mod n =? 0) */
+  bn.cmp    w0, w31
+
+  /* Trigger a fault if FG0.Z is set, aborting the computation.
+
+     Since alpha is nonzero mod n, (d * alpha) mod n = 0 means d is zero mod n,
+     which violates ECDSA private key requirements. This could technically be
+     triggered by an unlucky key manager seed, but the probability is so low (~1/n)
+     that it more likely indicates a fault attack. */
+  jal       x1, trigger_fault_if_fg0_z
+
+  /* w24 = r <= w11  mod n */
+  bn.addm   w24, w11, w31
+
+  /* Store r of signature in dmem.
+       dmem[r] <= r = w24 */
+  la        x19, r
+  li        x2, 24
+  bn.sid    x2, 0(x19)
+
+  /* w19 <= (w24 * w0) mod n = (r * d * alpha) mod n */
+  bn.mov    w25, w0
+  jal       x1, mod_mul_256x256
+
+  /* w0 <= (w1 * w19) mod n = ((k * alpha)^-1 * (r * d * alpha)) mod n
+                            = (k^-1 * r * d) mod n */
+  bn.mov    w24, w1
+  bn.mov    w25, w19
+  jal       x1, mod_mul_256x256
+  bn.mov    w0, w19
+
+  /* Load message from dmem:
+       w24 = msg <= dmem[msg] */
+  la        x18, msg
+  li        x2, 24
+  bn.lid    x2, 0(x18)
+
+  /* w19 <= (w24 * w4) mod n = (msg * alpha) mod n */
+  bn.mov    w25, w4
+  jal       x1, mod_mul_256x256
+
+  /* w19 <= (w1 * w19) mod n = ((k * alpha)^-1 * (msg * alpha)) mod n
+                             = (k^-1 * msg) mod n */
+  bn.mov    w24, w1
+  bn.mov    w25, w19
+  jal       x1, mod_mul_256x256
+
+  /* w0 = (w0 + w19) mod n = (k^-1*r*d + k^-1*msg) mod n = s */
+  bn.addm   w0, w0, w19
+
+  /* Store s of signature in dmem.
+       dmem[s] <= s = w0 */
+  la        x20, s
+  li        x2, 0
+  bn.sid    x2, 0(x20)
+
+  ret
+
+.section .bss
+
+/* random scalar k (in two 320b shares, 64 bytes reserved per share) */
+.balign 32
+.weak k0
+k0:
+  .zero 64
+.balign 32
+.weak k1
+k1:
+  .zero 64
+
+/* message digest (32 bytes) */
+.balign 32
+.weak msg
+msg:
+  .zero 32
+
+/* signature R (32 bytes, written by p256_sign) */
+.balign 32
+.weak r
+r:
+  .zero 32
+
+/* signature S (32 bytes, written by p256_sign) */
+.balign 32
+.weak s
+s:
+  .zero 32
+
+/* private key d (in two 320b shares, 64 bytes reserved per share) */
+.balign 32
+.weak d0
+d0:
+  .zero 64
+.balign 32
+.weak d1
+d1:
+  .zero 64
diff --git a/sw/otbn/crypto/p256_verify.s b/sw/otbn/crypto/p256_verify.s
new file mode 100644
index 0000000000000..78315f7e51319
--- /dev/null
+++ b/sw/otbn/crypto/p256_verify.s
@@ -0,0 +1,417 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/* Copyright 2016 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE.dcrypto file.
+ *
+ * Derived from code in
+ * https://chromium.googlesource.com/chromiumos/platform/ec/+/refs/heads/cr50_stab/chip/g/dcrypto/dcrypto_p256.c
+ */
+
+.globl p256_verify
+
+.text
+
+ /**
+ * P-256 ECDSA signature verification
+ *
+ * returns the affine x-coordinate of
+ *         (x1, y1) = u1*G + u2*Q
+ *         with u1 = z*s^-1 mod n  and  u2 = r*s^-1 mod n
+ *         with G being the curve's base point,
+ *              z being the message
+ *              r, s being the signature
+ *              Q being the public key.
+ *
+ * The routine computes x1, reduces it (mod n) and places it in dmem; the host
+ * performs the final comparison (valid if x1 == r). On the 'fail' path
+ * (r or s outside [1, n-1]) the current content of w24 is stored instead.
+ * This routine runs in variable time.
+ *
+ * @param[in]  dmem[msg]: message to be verified (256 bits)
+ * @param[in]  dmem[r]:   r component of signature (256 bits)
+ * @param[in]  dmem[s]:   s component of signature (256 bits)
+ * @param[in]  dmem[x]:   affine x-coordinate of public key (256 bits)
+ * @param[in]  dmem[y]:   affine y-coordinate of public key (256 bits)
+ * @param[out] dmem[x_r]: dmem buffer for reduced affine x_r-coordinate (x_1)
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * clobbered registers: x2, x3, x13, x14, x17 to x24, w0 to w25, w27 to w29
+ * clobbered flag groups: FG0
+ */
+p256_verify:
+
+  /* init all-zero register */
+  bn.xor    w31, w31, w31
+
+  /* load domain parameter b from dmem
+     w27 <= b = dmem[p256_b] */
+  li        x2, 27
+  la        x3, p256_b
+  bn.lid    x2, 0(x3)
+
+  /* setup modulus n (curve order) and Barrett constant
+     MOD <= w29 <= n = dmem[p256_n]; w28 <= u_n = dmem[p256_u_n]  */
+  li        x2, 29
+  la        x3, p256_n
+  bn.lid    x2, 0(x3)
+  bn.wsrw   MOD, w29
+  li        x2, 28
+  la        x3, p256_u_n
+  bn.lid    x2, 0(x3)
+
+  /* load s of signature from dmem: w0 = s = dmem[s] */
+  la        x20, s
+  bn.lid    x0, 0(x20)
+
+  /* goto 'fail' if w0 == w31 <=> s == 0 */
+  bn.cmp    w0, w31
+  csrrs     x2, FG0, x0
+  andi      x2, x2, 8
+  bne       x2, x0, fail
+
+  /* goto 'fail' if w0 >= w29 <=> s >= n */
+  bn.cmp    w0, w29
+  csrrs     x2, FG0, x0
+  andi      x2, x2, 1
+  beq       x2, x0, fail
+
+  /* w1 = s^-1  mod n */
+  jal       x1, mod_inv_var
+
+  /* load r of signature from dmem: w24 = r = dmem[r] */
+  la        x19, r
+  li        x2,  24
+  bn.lid    x2, 0(x19)
+
+  /* goto 'fail' if w24 == w31 <=> r == 0 */
+  bn.cmp    w24, w31
+  csrrs     x2, FG0, x0
+  andi      x2, x2, 8
+  bne       x2, x0, fail
+
+  /* goto 'fail' if w24 >= w29 <=> r >= n */
+  bn.cmp    w24, w29
+  csrrs     x2, FG0, x0
+  andi      x2, x2, 1
+  beq       x2, x0, fail
+
+  /* w25 = s^-1 = w1 */
+  bn.mov    w25, w1
+
+  /* u2 = w0 = w19 <= w24*w25 = r*s^-1 mod n */
+  jal       x1, mod_mul_256x256
+  bn.mov    w0, w19
+
+  /* load message, w24 = msg = dmem[msg] */
+  la        x18, msg
+  li        x2, 24
+  bn.lid    x2, 0(x18)
+
+  /* u1 = w1 = w19 <= w24*w25 = w24*w1 = msg*s^-1 mod n */
+  bn.mov    w25, w1
+  jal       x1, mod_mul_256x256
+  bn.mov    w1, w19
+
+  /* Set up for coordinate arithmetic.
+       MOD <= p
+       w28 <= r256
+       w29 <= r448 */
+  jal       x1, setup_modp
+
+  /* load public key Q from dmem and use in projective form (set z to 1)
+     Q = (w11, w12, w13) = (dmem[x], dmem[y], 1) */
+  li        x2, 11
+  la        x21, x
+  bn.lid    x2++, 0(x21)
+  la        x22, y
+  bn.lid    x2, 0(x22)
+  bn.addi   w13, w31, 1
+
+  /* load base point G and use in projective form (set z to 1)
+     G = (w8, w9, w10) = (x_g, y_g, 1) */
+  li        x13, 8
+  la        x23, p256_gx
+  bn.lid    x13, 0(x23)
+  li        x14, 9
+  la        x24, p256_gy
+  bn.lid    x14, 0(x24)
+  bn.addi   w10, w31, 1
+
+  /* The rest of the routine implements a variable time double-and-add
+     algorithm. For the signature verification we need to compute the point
+     C = (x1, y1) = u_1*G + u_2*Q. This can be done in a single
+     double-and-add routine by using Shamir's Trick. */
+
+  /* G+Q = (w3,w4,w5) = (w11,w12,w13) = (w8,w9,w10) (+) (w11,w12,w13) */
+  jal       x1, proj_add
+  bn.mov    w3, w11
+  bn.mov    w4, w12
+  bn.mov    w5, w13
+
+  /* w2 = u_2 & u_1 = w0 & w1 */
+  bn.and    w2, w0, w1
+
+  /* init double and add algorithm with (0, 1, 0) */
+  bn.mov    w11, w31
+  bn.addi   w12, w31, 1
+  bn.mov    w13, w31
+
+  /* main loop with decreasing index i (i=255 downto 0) */
+  loopi     256, 31
+
+    /* always double: C = (w11,w12,w13) <= 2 (*) C = 2 (*) (w11,w12,w13) */
+    bn.mov    w8, w11
+    bn.mov    w9, w12
+    bn.mov    w10, w13
+    jal       x1, proj_add
+
+    /* if either  u_1[i] == 0 or u_2[i] == 0 jump to 'no_both' */
+    bn.add    w2, w2, w2
+    csrrs     x2, FG0, x0
+    andi      x2, x2, 1
+    beq       x2, x0, no_both
+
+    /* both bits at current index (u1[i] and u2[i]) are set:
+       do C <= C + (G + Q) and jump to end */
+    bn.mov    w8, w3
+    bn.mov    w9, w4
+    bn.mov    w10, w5
+    jal       x1, proj_add
+    jal       x0, no_q
+
+    /* at most one of u1[i] and u2[i] is set */
+    no_both:
+
+    /* if u2[i] is not set jump to 'no_g' */
+    bn.add    w6, w0, w0
+    csrrs     x2, FG0, x0
+    andi      x2, x2, 1
+    beq       x2, x0, no_g
+
+    /* u2[i] is set: do C <= C + Q */
+    bn.lid    x13, 0(x21)
+    bn.lid    x14, 0(x22)
+    bn.addi   w10, w31, 1
+    jal       x1, proj_add
+
+    no_g:
+    /* if u1[i] is not set jump to 'no_q' */
+    bn.add    w6, w1, w1
+    csrrs     x2, FG0, x0
+    andi      x2, x2, 1
+    beq       x2, x0, no_q
+
+    /* load base point coordinates
+      w8 <= g_x = dmem[p256_gx]; w9 <= g_y = dmem[p256_gy] */
+    bn.lid    x13, 0(x23)
+    bn.lid    x14, 0(x24)
+
+    /* u1[i] is set: do C <= C + G */
+    bn.addi   w10, w31, 1
+    jal       x1, proj_add
+
+    no_q:
+    /* left shift w0 and w1 to decrease index */
+    bn.add    w0, w0, w0
+    bn.add    w1, w1, w1
+
+  /* compute inverse of z-coordinate: w1 = z_c^-1  mod p */
+  bn.mov    w0, w13
+  jal       x1, mod_inv_var
+
+  /* convert x-coordinate of C back to affine: x1 = x_c * z_c^-1  mod p */
+  bn.mov    w24, w1
+  bn.mov    w25, w11
+  jal       x1, mul_modp
+
+  /* final reduction: w24 = x1 <= x1 mod n (addm subtracts n if x1 >= n) */
+  la        x3, p256_n
+  bn.lid    x0, 0(x3)
+  bn.wsrw   MOD, w0
+  bn.addm   w24, w19, w31
+
+  fail:
+  /* Store w24 to dmem[x_r]. Via 'fail', w24 is stale or the rejected r. */
+  la        x17, x_r
+  li        x2, 24
+  bn.sid    x2, 0(x17)
+
+  ret
+
+
+/**
+ * Variable time modular multiplicative inverse computation
+ *
+ * Returns c <= a^(-1) mod m
+ *         with a being a bigint of length 256 bit with a < m
+ *              m being the modulus with a length of 256 bit
+ *              c being a 256-bit result
+ *
+ * This routine implements the computation of the modular multiplicative
+ * inverse based on the binary GCD or Stein's algorithm.
+ * The implemented variant is based on the
+ * "right-shift binary extended GCD" as it is described in section 3.1 of [1]
+ * (Algorithm 1).
+ * [1] https://doi.org/10.1155/ES/2006/32192
+ *
+ * Note that this is a variable time implementation. I.e. this routine will
+ * show a data dependent timing and execution profile. Only use in situations
+ * where a full white-box environment is acceptable.
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * @param[in]  w0: a, operand
+ * @param[in]  MOD: m, modulus
+ * @param[in]  w31: all-zero
+ * @param[out]  w1: result c
+ *
+ * clobbered registers: x2, w2 to w7
+ * clobbered flag groups: FG0
+ */
+mod_inv_var:
+
+  /* w2 = r = 0 */
+  bn.mov    w2, w31
+
+  /* w3 = s = 1 */
+  bn.addi   w3, w31, 1
+
+  /* w4 = u = MOD; w7 = m = MOD (unmodified copy of the modulus) */
+  bn.wsrr   w4, MOD
+  bn.wsrr   w7, MOD
+
+  /* w5 = v = w0 */
+  bn.mov    w5, w0
+
+  ebgcd_loop:
+  /* test if u is odd */
+  bn.or     w4, w4, w4
+  csrrs     x2, FG0, x0
+  andi      x2, x2, 4
+  bne       x2, x0, ebgcd_u_odd
+
+  /* u is even: */
+  /* w4 = u <= u/2 = w4 >> 1 */
+  bn.rshi   w4, w31, w4 >> 1
+
+  /* test if r is odd */
+  bn.or     w2, w2, w2
+  csrrs     x2, FG0, x0
+  andi      x2, x2, 4
+  bne       x2, x0, ebgcd_r_odd
+
+  /* r is even: */
+  /* w2 = r <= r/2 = w2 >> 1 */
+  bn.rshi   w2, w31, w2 >> 1
+  jal       x0, ebgcd_loop
+
+  ebgcd_r_odd:
+  /* w2 = r <= (r + m)/2 = (w2 + w7) >> 1; w6 holds the carry of r + m */
+  bn.add    w2, w7, w2
+  bn.addc   w6, w31, w31
+  bn.rshi   w2, w6, w2 >> 1
+  jal       x0, ebgcd_loop
+
+  ebgcd_u_odd:
+  /* test if v is odd */
+  bn.or     w5, w5, w5
+  csrrs     x2, FG0, x0
+  andi      x2, x2, 4
+  bne       x2, x0, ebgcd_uv_odd
+
+  /* v is even: */
+  /* w5 = v <= v/2 = w5 >> 1 */
+  bn.rshi   w5, w31, w5 >> 1
+
+  /* test if s is odd */
+  bn.or     w3, w3, w3
+  csrrs     x2, FG0, x0
+  andi      x2, x2, 4
+  bne       x2, x0, ebgcd_s_odd
+
+  /* s is even: */
+  /* w3 = s <= s/2 = w3 >> 1 */
+  bn.rshi   w3, w31, w3 >> 1
+  jal       x0, ebgcd_loop
+
+  ebgcd_s_odd:
+  /* w3 = s <= (s + m)/2 = (w3 + w7) >> 1; w6 holds the carry of s + m */
+  bn.add    w3, w7, w3
+  bn.addc   w6, w31, w31
+  bn.rshi   w3, w6, w3 >> 1
+  jal       x0, ebgcd_loop
+
+  ebgcd_uv_odd:
+  /* test if v >= u */
+  bn.cmp    w5, w4
+  csrrs     x2, FG0, x0
+  andi      x2, x2, 1
+  beq       x2, x0, ebgcd_v_gte_u
+
+  /* u > v: */
+  /* w2 = r <= r - s = w2 - w3; if (r < 0): r <= r + m */
+  bn.subm   w2, w2, w3
+
+  /* w4 = u <= u - v = w4 - w5 */
+  bn.sub    w4, w4, w5
+  jal       x0, ebgcd_loop
+
+  ebgcd_v_gte_u:
+  /* w3 = s <= s - r = w3 - w2; if (s < 0) s <= s + m */
+  bn.subm   w3, w3, w2
+
+  /* w5 = v <= v - u = w5 - w4 */
+  bn.sub    w5, w5, w4
+
+  /* if v > 0 go back to start of loop */
+  csrrs     x2, FG0, x0
+  andi      x2, x2, 8
+  beq       x2, x0, ebgcd_loop
+
+  /* v <= 0: */
+  /* if (r >= m): w1 = c = r - m = w2 - MOD else: w1 = c = r = w2 */
+  bn.addm   w1, w2, w31
+
+  ret
+
+.section .bss
+
+/* message digest (32 bytes) */
+.balign 32
+.weak msg
+msg:
+  .zero 32
+
+/* signature R (32 bytes) */
+.balign 32
+.weak r
+r:
+  .zero 32
+
+/* signature S (32 bytes) */
+.balign 32
+.weak s
+s:
+  .zero 32
+
+/* public key x-coordinate (32 bytes) */
+.balign 32
+.weak x
+x:
+  .zero 32
+
+/* public key y-coordinate (32 bytes) */
+.balign 32
+.weak y
+y:
+  .zero 32
+
+/* verification result x_r (aka x_1), 32 bytes, written by p256_verify */
+.balign 32
+.weak x_r
+x_r:
+  .zero 32
diff --git a/sw/otbn/crypto/p384_a2b.s b/sw/otbn/crypto/p384_a2b.s
new file mode 100644
index 0000000000000..8c851829af907
--- /dev/null
+++ b/sw/otbn/crypto/p384_a2b.s
@@ -0,0 +1,211 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+.globl p384_arithmetic_to_boolean_mod
+.globl p384_arithmetic_to_boolean
+
+.text
+
+/**
+ * Converts arithmetic shares mod p to boolean shares.
+ *
+ * Calls the 385-bit A2B function twice, first using unmodified 384-bit shares
+ * in reduced form, and then using modified 385-bit shares in unreduced form.
+ *
+ * It then checks if the MSB (carry bit, bit 384) is true or false, to decide
+ * which of the two A2B results is used. This detects and handles an
+ * underflow during the subtraction of arithmetic masking.
+ *
+ * The logic behind the carry bit handling is as follows:
+ * If x >= r, then  A = (x - r) mod p = x - r exactly.
+ * So when we add 2^385 - p to A and then add A and r, we get
+ * (2^385 - p + x - r + r) mod 2^385 = 2^385 - p + x.
+ * In this case, the high bit is always true since p - x <= p < 2^384,
+ * so we choose the A2B conversion without the 2^385 - p added.
+ * On the other hand, if x < r, then A = (x - r) mod p = x - r + p.
+ * When we add 2^385 - p to A and then add A and r, we get
+ * (2^385 - p + x - r + p + r) mod 2^385 = (2^385 + x) mod 2^385 = x.
+ * In this case, the high bit is always false since x < p < 2^384, so we
+ * choose this second A2B conversion.
+ *
+ * This routine runs in constant time.
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * @param[in]        w31: all-zero wide data register
+ * @param[in]  [w14,w13]: field modulus p
+ * @param[in]  [w19,w18]: mask r (restored to its input value on return)
+ * @param[in]  [w12,w11]: arithmetically masked value A = (x - r) mod p
+ * @param[out] [w21,w20]: boolean masked value x', such that x = x' ^ r
+ *
+ * clobbered registers: w1 to w6, w10 to w12, w20, w21, w23 to w28
+ * clobbered flag groups: FG0
+ */
+p384_arithmetic_to_boolean_mod:
+  /* First step: calculate A2B from reduced values. */
+
+  /* Save inputs for second A2B execution.
+     [w24,w23] <= [w19,w18] = r
+     [w26,w25] <= [w12,w11] = A */
+  bn.mov    w23, w18
+  bn.mov    w24, w19
+  bn.mov    w25, w11
+  bn.mov    w26, w12
+
+  /* Call 385-bit A2B function on the reduced shares.
+     [w21,w20] <= x' */
+  jal       x1, p384_arithmetic_to_boolean
+
+  /* Save intermediate result of reduced inputs.
+     [w28,w27] <= [w21,w20] = x' */
+  bn.mov    w27, w20
+  bn.mov    w28, w21
+
+  /* Second step: calculate A2B from unreduced values. */
+
+  /* Restore inputs r and A values [w19,w18] and [w12,w11] and
+     prepare input for 385-bit A2B function. */
+  bn.mov    w18, w23
+  bn.mov    w19, w24
+  bn.mov    w11, w25
+  bn.mov    w12, w26
+
+  /* Convert A to an unreduced value in the 2^385 domain by adding (2^385 - p).
+     w10 <= 2, so (w10 << 128) added to the upper word w12 is 2^(256+129).
+     [w12,w11] <= [w12,w11] + 2^385 - [w14,w13] = A + 2^385 - p */
+  bn.addi   w10, w31, 0x2
+  bn.add    w12, w12, w10 << 128
+  bn.sub    w11, w11, w13
+  bn.subb   w12, w12, w14
+
+  /* Call 385-bit A2B function on the unreduced shares.
+     [w21,w20] <= x' */
+  jal       x1, p384_arithmetic_to_boolean
+
+  /* Restore initial mask input [w19,w18] for consistency
+     in calling functions.
+     w18 <= w23
+     w19 <= w24 */
+  bn.mov    w18, w23
+  bn.mov    w19, w24
+
+  /* Check MSB (bit 128 of w21): FG0.Z is set iff (w21 >> 128) is zero. */
+  bn.cmp    w31, w21 >> 128
+
+  /* Return the unreduced A2B computation (second result),
+     if zero flag is set, otherwise return the reduced
+     A2B computation (first result, saved in [w28,w27]). */
+  bn.sel    w20, w20, w27, FG0.Z
+  bn.sel    w21, w21, w28, FG0.Z
+
+  ret
+
+/**
+ * Convert arithmetic shares to boolean ones using Goubin's algorithm.
+ *
+ * We use Goubin's arithmetic-to-boolean masking algorithm to switch from
+ * an arithmetic masking scheme to a boolean one without ever unmasking the
+ * shared value. See Algorithm 2 here:
+ * https://link.springer.com/content/pdf/10.1007/3-540-44709-1_2.pdf
+ *
+ * This implementation expands the algorithm to 385 bits for carry bit
+ * handling. The carry bit can be used to detect and handle an
+ * underflow during the subtraction of arithmetic masking.
+ *
+ * This routine runs in constant time.
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * @param[in]  w31: all-zero wide data register
+ * @param[in]  w18: lower part of mask r
+ * @param[in]  w19: upper part of mask r
+ * @param[in]  w11: lower part of arithmetically masked value A,
+ *                  such that x = A + r
+ * @param[in]  w12: upper part of arithmetically masked value A,
+ *                  such that x = A + r
+ * @param[out] w20: lower part of boolean masked value x',
+ *                  such that x = x' ^ r
+ * @param[out] w21: upper part of boolean masked value x',
+ *                  such that x = x' ^ r
+ *
+ * clobbered registers: w1 to w6, w11, w12, and w18 to w21
+ * clobbered flag groups: FG0
+ */
+p384_arithmetic_to_boolean:
+  /* Fetch 385 bits of randomness; w2 is shifted down to its top 129 bits.
+     [w2,w1] = gamma    <= URND */
+  bn.wsrr   w1, 2
+  bn.wsrr   w2, 2
+  bn.rshi   w2, w31, w2 >> 127
+
+  /* Double gamma and truncate to 385 bits (keep the low 129 bits of w4).
+     [w4,w3] = T        <= 2 * [w2,w1] = 2 * gamma */
+  bn.add    w3, w1, w1
+  bn.addc   w4, w2, w2
+  bn.rshi   w4, w4, w31 >> 129
+  bn.rshi   w4, w31, w4 >> 127
+
+  /* [w21,w20] = x'     <= [w2,w1] ^ [w19,w18] = gamma ^ r */
+  bn.xor    w20, w1, w18
+  bn.xor    w21, w2, w19
+
+  /* [w6,w5] = omega    <= [w2,w1] & [w21,w20] = gamma & x' */
+  bn.and    w5, w1, w20
+  bn.and    w6, w2, w21
+
+  /* [w21,w20] = x'     <= [w4,w3] ^ [w12,w11] = T ^ A */
+  bn.xor    w20, w3, w11
+  bn.xor    w21, w4, w12
+
+  /* [w2,w1] = gamma    <= [w2,w1] ^ [w21,w20] = gamma ^ x' */
+  bn.xor    w1, w1, w20
+  bn.xor    w2, w2, w21
+
+  /* [w2,w1] = gamma    <= [w2,w1] & [w19,w18] = gamma & r */
+  bn.and    w1, w1, w18
+  bn.and    w2, w2, w19
+
+  /* [w6,w5] = omega    <= [w6,w5] ^ [w2,w1] = omega ^ gamma */
+  bn.xor    w5, w5, w1
+  bn.xor    w6, w6, w2
+
+  /* [w2,w1] = gamma    <= [w4,w3] & [w12,w11] = T & A */
+  bn.and    w1, w3, w11
+  bn.and    w2, w4, w12
+
+  /* [w6,w5] = omega    <= [w6,w5] ^ [w2,w1] = omega ^ gamma */
+  bn.xor    w5, w5, w1
+  bn.xor    w6, w6, w2
+
+  /* Loop for k = 1 to K - 1 = 385 - 1 (384 iterations, 12 instructions) */
+  loopi     384, 12
+
+    /* [w2,w1] = gamma  <= [w4,w3] & [w19,w18] = T & r */
+    bn.and     w1, w3, w18
+    bn.and     w2, w4, w19
+
+    /* [w2,w1] = gamma  <= [w2,w1] ^ [w6,w5] = gamma ^ omega */
+    bn.xor     w1, w1, w5
+    bn.xor     w2, w2, w6
+
+    /* [w4,w3] = T      <= [w4,w3] & [w12,w11] = T & A */
+    bn.and     w3, w3, w11
+    bn.and     w4, w4, w12
+
+    /* [w2,w1] = gamma  <= [w2,w1] ^ [w4,w3] = gamma ^ T */
+    bn.xor     w1, w1, w3
+    bn.xor     w2, w2, w4
+
+    /* Double gamma and truncate to 385 bits (keep the low 129 bits of w4).
+       [w4,w3] = T      <= 2 * [w2,w1] = 2 * gamma */
+    bn.add    w3, w1, w1
+    bn.addc   w4, w2, w2
+    bn.rshi   w4, w4, w31 >> 129
+    bn.rshi   w4, w31, w4 >> 127
+
+  /* [w21,w20] = x'     <= [w21,w20] ^ [w4,w3] = x' ^ T */
+  bn.xor    w20, w20, w3
+  bn.xor    w21, w21, w4
+
+  ret
diff --git a/sw/otbn/crypto/p384_base.s b/sw/otbn/crypto/p384_base.s
index 41e272ced942a..ded113c92c658 100644
--- a/sw/otbn/crypto/p384_base.s
+++ b/sw/otbn/crypto/p384_base.s
@@ -5,6 +5,7 @@
  *   This library contains:
  *   - P-384 specific routines for point addition in projective space
  *   - P-384 domain parameters
+ *   - P-384 specific routines for multiplication and reduction of large values
  */
 
  .section .text
@@ -66,11 +67,46 @@ mul384:
 
   ret
 
+/**
+ * Unrolled 576=448x128 bit multiplication.
+ *
+ * Returns c = a x b.
+ *
+ * This routine runs in constant time.
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * @param[in] [w11, w10]: a, first operand, max. length 448 bit.
+ * @param[in] w16: b, second operand, max. length 128 bit.
+ * @param[in] w31: all-zero.
+ * @param[out] [w20:w18]: c, result, max. length 576 bit.
+ *
+ * Clobbered registers: w18 to w20
+ * Clobbered flag groups: FG0
+ */
+mul448x128:
+  bn.mulqacc.z          w10.0, w16.0, 0
+  bn.mulqacc            w10.0, w16.1, 64
+  bn.mulqacc.so  w18.L, w10.1, w16.0, 64
+  bn.mulqacc            w10.1, w16.1, 0
+  bn.mulqacc            w10.2, w16.0, 0
+  bn.mulqacc            w10.2, w16.1, 64
+  bn.mulqacc.so  w18.U, w10.3, w16.0, 64
+  bn.mulqacc            w10.3, w16.1, 0
+  bn.mulqacc            w11.0, w16.0, 0
+  bn.mulqacc            w11.1, w16.0, 64
+  bn.mulqacc.so  w19.L, w11.0, w16.1, 64
+  bn.mulqacc            w11.2, w16.0, 0
+  bn.mulqacc            w11.1, w16.1, 0
+  bn.mulqacc.so  w19.U, w11.2, w16.1, 64
+  bn.mulqacc.wo    w20, w31.0, w31.0, 0
+
+  ret
 
 /**
- * 384-bit modular multiplication based on Solinas reduction algorithm.
+ * Solinas reduction algorithm.
  *
- * Returns c = a x b % p.
+ * Returns c = a mod m = (x + 2^384 * y) mod m.
  *
  * This subroutine is specialized to the coordinate field of P-384 and cannot
  * be used for other moduli.
@@ -90,8 +126,7 @@ mul384:
  *
  * Flags: Flags have no meaning beyond the scope of this subroutine.
  *
- * @param[in] [w11, w10]: a, first operand, max. length 384 bit, b < m.
- * @param[in] [w17, w16]: b, second operand, max. length 384 bit, b < m.
+ * @param[in] [w20:w18]: a, input to reduce, max. length 768 bit.
  * @param[in] [w13, w12]: m, modulus, 2^383 <= m < 2^384.
  * @param[in] w31: all-zero.
  * @param[out] [w17, w16]: c, result, max. length 384 bit.
@@ -99,12 +134,8 @@ mul384:
  * Clobbered registers: w16 to w24
  * Clobbered flag groups: FG0
  */
-.globl p384_mulmod_p
-p384_mulmod_p:
-  /* Compute the raw 768-bit product:
-       ab = [w20:w18] <= a * b */
-  jal     x1, mul384
-
+.globl p384_reduce_p
+p384_reduce_p:
   /* Solinas reduction step. Based on the observation that:
      (x + 2^384 * y) mod (2^384 - K) = (x + K * y) mod (2^384 - K).
 
@@ -196,13 +227,12 @@ p384_mulmod_p:
   bn.sel w16, w18, w16, C
   bn.sel w17, w19, w17, C
 
-  /* return result: c =[w17, w16] =  a * b % m. */
   ret
 
 /**
- * 384-bit modular multiplication based on Solinas reduction algorithm.
+ * Solinas reduction algorithm.
  *
- * Returns c = a x b % m.
+ * Returns c = a mod m = (x + 2^384 * y) mod m.
  *
  * This subroutine is intended for use with the group order (n) of P-384, but
  * will work for any modulus m such that 2^384 - 2^191 < m < 2^384.
@@ -220,8 +250,7 @@ p384_mulmod_p:
  *
  * Flags: Flags have no meaning beyond the scope of this subroutine.
  *
- * @param[in] [w11, w10]: a, first operand, max. length 384 bit, b < m.
- * @param[in] [w17, w16]: b, second operand, max. length 384 bit, b < m.
+ * @param[in] [w20:w18]: a, input to reduce, max. length 768 bit.
  * @param[in] [w13, w12]: m, modulus, 2^383 <= m < 2^384.
  * @param[in] w14: k, Solinas constant (2^384 - modulus), max. length 191 bit.
  * @param[in] w31: all-zero.
@@ -230,15 +259,8 @@ p384_mulmod_p:
  * Clobbered registers: w16 to w24
  * Clobbered flag groups: FG0
  */
-.globl p384_mulmod_n
-p384_mulmod_n:
-  /* Compute the raw 768-bit product:
-       ab = [w20:w18] <= a * b */
-  jal     x1, mul384
-
-  /* Solinas reduction step. Based on the observation that:
-     (x + 2^384 * y) mod (2^384 - K) = (x + K * y) mod (2^384 - K). */
-
+.globl p384_reduce_n
+p384_reduce_n:
   /* Extract the high 128 bits from the middle term and the low 128 bits from
      the high term:
        w21 <= ab[639:384] */
@@ -336,9 +358,106 @@ p384_mulmod_n:
   bn.sel w16, w18, w16, C
   bn.sel w17, w19, w17, C
 
-  /* return result: c =[w17, w16] =  a * b % m. */
   ret
 
+/**
+ * 384-bit modular multiplication based on Solinas reduction algorithm.
+ *
+ * Returns c = a x b % p.
+ *
+ * This subroutine is specialized to the coordinate field of P-384 and cannot
+ * be used for other moduli.
+ *
+ * For more information on the reduction algorithm, see 'p384_reduce_p'.
+ *
+ * This routine runs in constant time.
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * @param[in] [w11, w10]: a, first operand, max. length 384 bit, a < m.
+ * @param[in] [w17, w16]: b, second operand, max. length 384 bit, b < m.
+ * @param[in] [w13, w12]: m, modulus, 2^383 <= m < 2^384.
+ * @param[in] w31: all-zero.
+ * @param[out] [w17, w16]: c, result, max. length 384 bit.
+ *
+ * Clobbered registers: w16 to w24
+ * Clobbered flag groups: FG0
+ */
+.globl p384_mulmod_p
+p384_mulmod_p:
+  /* Compute the raw 768-bit product:
+       ab = [w20:w18] <= a * b */
+  jal     x1, mul384
+
+  /* jump (no link) so its ret returns to our caller: [w17, w16] = ab mod m */
+  jal     x0, p384_reduce_p
+
+/**
+ * 384-bit modular multiplication based on Solinas reduction algorithm.
+ *
+ * Returns c = a * b mod m.
+ *
+ * This subroutine is intended for use with the group order (n) of P-384, but
+ * will work for any modulus m such that 2^384 - 2^191 < m < 2^384.
+ *
+ * For more information on the reduction algorithm, see 'p384_reduce_n'.
+ *
+ * This routine runs in constant time.
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * @param[in] [w11, w10]: a, first operand, max. length 384 bit, a < m.
+ * @param[in] [w17, w16]: b, second operand, max. length 384 bit, b < m.
+ * @param[in] [w13, w12]: m, modulus, 2^383 <= m < 2^384.
+ * @param[in] w14: k, Solinas constant (2^384 - modulus), max. length 191 bit.
+ * @param[in] w31: all-zero.
+ * @param[out] [w17, w16]: c, result, max. length 384 bit.
+ *
+ * Clobbered registers: w16 to w24
+ * Clobbered flag groups: FG0
+ */
+.globl p384_mulmod_n
+p384_mulmod_n:
+  /* Compute the raw 768-bit product:
+       ab = [w20:w18] <= a * b */
+  jal     x1, mul384
+
+  /* jump (no link) so its ret returns to our caller: [w17, w16] = ab mod m */
+  jal     x0, p384_reduce_n
+
+/**
+ * 448x128=576-bit modular multiplication based on Solinas reduction algorithm.
+ *
+ * Returns c = a * b mod m.
+ *
+ * This subroutine is intended for use with the group order (n) of P-384, but
+ * will work for any modulus m such that 2^384 - 2^191 < m < 2^384.
+ *
+ * For more information on the reduction algorithm, see 'p384_reduce_n'.
+ *
+ * This routine runs in constant time.
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * @param[in] [w11, w10]: a, first operand, max. length 448 bit.
+ * @param[in] w16: b, second operand, max. length 128 bit, b < m.
+ * @param[in] [w13, w12]: m, modulus, 2^383 <= m < 2^384.
+ * @param[in] w14: k, Solinas constant (2^384 - modulus), max. length 191 bit.
+ * @param[in] w31: all-zero.
+ * @param[out] [w17, w16]: c, result, max. length 384 bit.
+ *
+ * Clobbered registers: w16 to w24
+ * Clobbered flag groups: FG0
+ */
+.globl p384_mulmod448x128_n
+p384_mulmod448x128_n:
+  /* Compute the raw 576-bit product:
+       ab = [w20:w18] <= a * b */
+  jal     x1, mul448x128
+
+  /* jump (no link) so its ret returns to our caller: [w17, w16] = ab mod m */
+  jal     x0, p384_reduce_n
+
 /**
  * P-384 point addition in projective space
  *
@@ -422,10 +541,8 @@ proj_add_p384:
   bn.addc   w17, w11, w17
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w6, w16
-  bn.mov    w7, w17
+  bn.sel    w6, w16, w10, C
+  bn.sel    w7, w17, w11, C
 
   /* 5: [w9, w8] = t4 <= X2+Y2 = dmem[x27+0]+dmem[x27+64] */
   bn.lid    x22, 0(x27)
@@ -436,10 +553,8 @@ proj_add_p384:
   bn.addc   w17, w11, w17
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w8, w16
-  bn.mov    w9, w17
+  bn.sel    w8, w16, w10, C
+  bn.sel    w9, w17, w11, C
 
   /* 6: [w7, w6] = t3 <= t3*t4 = [w7, w6]*[w9, w8] */
   bn.mov    w10, w6
@@ -455,20 +570,16 @@ proj_add_p384:
   bn.addc   w17, w1, w3
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w8, w16
-  bn.mov    w9, w17
+  bn.sel    w8, w16, w10, C
+  bn.sel    w9, w17, w11, C
 
   /* 8: [w7, w6] = t3 <= t3-t4 = [w7, w6]-[w9, w8] */
   bn.sub    w16, w6, w8
   bn.subb   w17, w7, w9
   bn.add    w10, w16, w12
   bn.addc   w11, w17, w13
-  bn.sel    w16, w10, w16, C
-  bn.sel    w17, w11, w17, C
-  bn.mov    w6, w16
-  bn.mov    w7, w17
+  bn.sel    w6, w10, w16, C
+  bn.sel    w7, w11, w17, C
 
   /* 9: [w9, w8] = t4 <= Y1+Z1 = dmem[x26+64]+dmem[x26+128] */
   bn.lid    x22, 64(x26)
@@ -479,10 +590,8 @@ proj_add_p384:
   bn.addc   w17, w11, w17
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w8, w16
-  bn.mov    w9, w17
+  bn.sel    w8, w16, w10, C
+  bn.sel    w9, w17, w11, C
 
   /* 10: [w26, w25] = X3 <= Y2+Z2 = dmem[x27+64]+dmem[x27+128] */
   bn.lid    x22, 64(x27)
@@ -493,10 +602,8 @@ proj_add_p384:
   bn.addc   w17, w11, w17
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w25, w16
-  bn.mov    w26, w17
+  bn.sel    w25, w16, w10, C
+  bn.sel    w26, w17, w11, C
 
   /* 11: [w9, w8] = t4 <= t4*X3 = [w9, w8]*[w26, w25] */
   bn.mov    w10, w8
@@ -512,20 +619,16 @@ proj_add_p384:
   bn.addc   w17, w3, w5
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w25, w16
-  bn.mov    w26, w17
+  bn.sel    w25, w16, w10, C
+  bn.sel    w26, w17, w11, C
 
   /* 13: [w9, w8] = t4 <= t4-X3 = [w9, w8]-[w26, w25] */
   bn.sub    w16, w8, w25
   bn.subb   w17, w9, w26
   bn.add    w10, w16, w12
   bn.addc   w11, w17, w13
-  bn.sel    w16, w10, w16, C
-  bn.sel    w17, w11, w17, C
-  bn.mov    w8, w16
-  bn.mov    w9, w17
+  bn.sel    w8, w10, w16, C
+  bn.sel    w9, w11, w17, C
 
   /* 14: [w26, w25] = X3 <= X1+Z1 = dmem[x26+0]+dmem[x26+128] */
   bn.lid    x22, 0(x26)
@@ -536,10 +639,8 @@ proj_add_p384:
   bn.addc   w17, w11, w17
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w25, w16
-  bn.mov    w26, w17
+  bn.sel    w25, w16, w10, C
+  bn.sel    w26, w17, w11, C
 
   /* 15: [w28, w27] = Y3 <= X2+Z2 = dmem[x27+0]+dmem[x27+128] */
   bn.lid    x22, 0(x27)
@@ -550,10 +651,8 @@ proj_add_p384:
   bn.addc   w17, w11, w17
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w27, w16
-  bn.mov    w28, w17
+  bn.sel    w27, w16, w10, C
+  bn.sel    w28, w17, w11, C
 
   /* 16: [w26, w25] = X3 <= X3*Y3 = [w26, w25]*[w28, w27] */
   bn.mov    w10, w25
@@ -569,20 +668,16 @@ proj_add_p384:
   bn.addc   w17, w1, w5
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w27, w16
-  bn.mov    w28, w17
+  bn.sel    w27, w16, w10, C
+  bn.sel    w28, w17, w11, C
 
   /* 18: [w28, w27] = Y3 <= X3-Y3 = [w26, w25]-[w28, w27] */
   bn.sub    w16, w25, w27
   bn.subb   w17, w26, w28
   bn.add    w10, w16, w12
   bn.addc   w11, w17, w13
-  bn.sel    w16, w10, w16, C
-  bn.sel    w17, w11, w17, C
-  bn.mov    w27, w16
-  bn.mov    w28, w17
+  bn.sel    w27, w10, w16, C
+  bn.sel    w28, w11, w17, C
 
   /* 19: [w30, w29] = Z3 <= b*t2 = dmem[x28+0]*[w5, w4] */
   bn.lid    x22, 0(x28)
@@ -598,50 +693,40 @@ proj_add_p384:
   bn.subb   w17, w28, w30
   bn.add    w10, w16, w12
   bn.addc   w11, w17, w13
-  bn.sel    w16, w10, w16, C
-  bn.sel    w17, w11, w17, C
-  bn.mov    w25, w16
-  bn.mov    w26, w17
+  bn.sel    w25, w10, w16, C
+  bn.sel    w26, w11, w17, C
 
   /* 21: [w30, w29] = Z3 <= X3+X3 = [w26, w25]+[w26, w25] */
   bn.add    w16, w25, w25
   bn.addc   w17, w26, w26
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w29, w16
-  bn.mov    w30, w17
+  bn.sel    w29, w16, w10, C
+  bn.sel    w30, w17, w11, C
 
   /* 22: [w26, w25] = X3 <= X3+Z3 = [w26, w25]+[w30, w29] */
   bn.add    w16, w25, w29
   bn.addc   w17, w26, w30
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w25, w16
-  bn.mov    w26, w17
+  bn.sel    w25, w16, w10, C
+  bn.sel    w26, w17, w11, C
 
   /* 23: [w30, w29] = Z3 <= t1-X3 = [w3, w2]-[w26, w25] */
   bn.sub    w16, w2, w25
   bn.subb   w17, w3, w26
   bn.add    w10, w16, w12
   bn.addc   w11, w17, w13
-  bn.sel    w16, w10, w16, C
-  bn.sel    w17, w11, w17, C
-  bn.mov    w29, w16
-  bn.mov    w30, w17
+  bn.sel    w29, w10, w16, C
+  bn.sel    w30, w11, w17, C
 
   /* 24: [w26, w25] = X3 <= t1+X3 = [w3, w2]+[w26, w25] */
   bn.add    w16, w2, w25
   bn.addc   w17, w3, w26
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w25, w16
-  bn.mov    w26, w17
+  bn.sel    w25, w16, w10, C
+  bn.sel    w26, w17, w11, C
 
   /* 25: [w28, w27] = Y3 <= b*Y3 = dmem[x28+0]*[w28, w27] */
   bn.lid    x22, 0(x28)
@@ -657,90 +742,72 @@ proj_add_p384:
   bn.addc   w17, w5, w5
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w2, w16
-  bn.mov    w3, w17
+  bn.sel    w2, w16, w10, C
+  bn.sel    w3, w17, w11, C
 
   /* 27: [w5, w4] = t2 <= t1+t2 = [w3, w2]+[w5, w4] */
   bn.add    w16, w2, w4
   bn.addc   w17, w3, w5
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w4, w16
-  bn.mov    w5, w17
+  bn.sel    w4, w16, w10, C
+  bn.sel    w5, w17, w11, C
 
   /* 28: [w28, w27] = Y3 <= Y3-t2 = [w28, w27]-[w5, w4] */
   bn.sub    w16, w27, w4
   bn.subb   w17, w28, w5
   bn.add    w10, w16, w12
   bn.addc   w11, w17, w13
-  bn.sel    w16, w10, w16, C
-  bn.sel    w17, w11, w17, C
-  bn.mov    w27, w16
-  bn.mov    w28, w17
+  bn.sel    w27, w10, w16, C
+  bn.sel    w28, w11, w17, C
 
   /* 29: [w28, w27] = Y3 <= Y3-t0 = [w28, w27]-[w1, w0] */
   bn.sub    w16, w27, w0
   bn.subb   w17, w28, w1
   bn.add    w10, w16, w12
   bn.addc   w11, w17, w13
-  bn.sel    w16, w10, w16, C
-  bn.sel    w17, w11, w17, C
-  bn.mov    w27, w16
-  bn.mov    w28, w17
+  bn.sel    w27, w10, w16, C
+  bn.sel    w28, w11, w17, C
 
   /* 30: [w3, w2] = t1 <= Y3+Y3 = [w28, w27]+[w28, w27] */
   bn.add    w16, w27, w27
   bn.addc   w17, w28, w28
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w2, w16
-  bn.mov    w3, w17
+  bn.sel    w2, w16, w10, C
+  bn.sel    w3, w17, w11, C
 
   /* 31: [w28, w27] = Y3 <= t1+Y3 = [w3, w2]+[w28, w27] */
   bn.add    w16, w2, w27
   bn.addc   w17, w3, w28
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w27, w16
-  bn.mov    w28, w17
+  bn.sel    w27, w16, w10, C
+  bn.sel    w28, w17, w11, C
 
   /* 32: [w3, w2] = t1 <= t0+t0 = [w1, w0]+[w1, w0] */
   bn.add    w16, w0, w0
   bn.addc   w17, w1, w1
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w2, w16
-  bn.mov    w3, w17
+  bn.sel    w2, w16, w10, C
+  bn.sel    w3, w17, w11, C
 
   /* 33: [w1, w0] = t0 <= t1+t0 = [w3, w2]+[w1, w0] */
   bn.add    w16, w2, w0
   bn.addc   w17, w3, w1
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w0, w16
-  bn.mov    w1, w17
+  bn.sel    w0, w16, w10, C
+  bn.sel    w1, w17, w11, C
 
   /* 34: [w1, w0] = t0 <= t0-t2 = [w1, w0]-[w5, w4] */
   bn.sub    w16, w0, w4
   bn.subb   w17, w1, w5
   bn.add    w10, w16, w12
   bn.addc   w11, w17, w13
-  bn.sel    w16, w10, w16, C
-  bn.sel    w17, w11, w17, C
-  bn.mov    w0, w16
-  bn.mov    w1, w17
+  bn.sel    w0, w10, w16, C
+  bn.sel    w1, w11, w17, C
 
   /* 35: [w3, w2] = t1 <= t4*Y3 = [w9, w8]*[w28, w27] */
   bn.mov    w10, w8
@@ -774,10 +841,8 @@ proj_add_p384:
   bn.addc   w17, w28, w5
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w27, w16
-  bn.mov    w28, w17
+  bn.sel    w27, w16, w10, C
+  bn.sel    w28, w17, w11, C
 
   /* 39: [w26, w25] = X3 <= t3*X3 = [w7, w6]*[w26, w25] */
   bn.mov    w10, w6
@@ -793,10 +858,8 @@ proj_add_p384:
   bn.subb   w17, w26, w3
   bn.add    w10, w16, w12
   bn.addc   w11, w17, w13
-  bn.sel    w16, w10, w16, C
-  bn.sel    w17, w11, w17, C
-  bn.mov    w25, w16
-  bn.mov    w26, w17
+  bn.sel    w25, w10, w16, C
+  bn.sel    w26, w11, w17, C
 
   /* 41: [w30, w29] = Z3 <= t4*Z3 = [w9, w8]*[w30, w29] */
   bn.mov    w10, w8
@@ -821,16 +884,272 @@ proj_add_p384:
   bn.addc   w17, w30, w3
   bn.sub    w10, w16, w12
   bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-  bn.mov    w29, w16
-  bn.mov    w30, w17
+  bn.sel    w29, w16, w10, C
+  bn.sel    w30, w17, w11, C
 
   ret
 
+/**
+ * Convert projective coordinates of a P-384 curve point to affine coordinates
+ *
+ * returns P = (x_a, y_a) = (x/z mod p, y/z mod p)
+ *              where P is a valid P-384 curve point,
+ *                    x_a and y_a are the resulting affine coordinates of the
+ *                      curve point,
+ *                    x,y and z are a set of projective coordinates of the
+ *                      point and
+ *                    p is the modulus of the P-384 underlying finite field.
+ *
+ * This routine computes the affine coordinates for a set of projective
+ * coordinates of a valid P-384 curve point. The routine performs the required
+ * divisions by computing the multiplicative modular inverse of the
+ * projective z-coordinate in the underlying finite field of the P-384 curve.
+ * For inverse computation Fermat's little theorem is used, i.e.
+ * we compute z^-1 = z^(p-2) mod p.
+ * For exponentiation a 16 step addition chain is used.
+ * Source of the addition chain is the addchain project:
+ * https://github.com/mmcloughlin/addchain/
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * @param[in]  [w26,w25]: x, x-coordinate of curve point (projective).
+ * @param[in]  [w28,w27]: y, y-coordinate of curve point (projective).
+ * @param[in]  [w30,w29]: z, z-coordinate of curve point (projective).
+ * @param[in]  [w13, w12]: p, modulus of P-384.
+ * @param[in]  w31: all-zero.
+ * @param[out] [w1, w0]: z^-1. inverse of z-coordinate of curve point.
+ * @param[out] [w26, w25]: x_a, affine x-coordinate of resulting point.
+ * @param[out] [w28, w27]: y_a, affine y-coordinate of resulting point.
+ *
+ * clobbered registers: w0 to w28
+ * clobbered flag groups: FG0
+ */
+ .globl proj_to_affine_p384
+proj_to_affine_p384:
+
+  /* Exp: 0b10 = 2*0b1
+     Val: r10 = z^2 mod p
+          [w17,w16] <= [w30,w29]^2 mod [w13,w12] */
+  bn.mov    w10, w29
+  bn.mov    w11, w30
+  bn.mov    w16, w29
+  bn.mov    w17, w30
+  jal       x1, p384_mulmod_p
+
+  /* Exp: 0b11 = 0b1+0b10
+     Val: r11 <= z*r10 mod p
+          [w17,w16] <= [w30,w29]*[w17,w16] mod [w13,w12] */
+  bn.mov    w10, w29
+  bn.mov    w11, w30
+  jal       x1, p384_mulmod_p
+
+  /* Exp: 0b110 = 2*0b11
+     Val: r110 = r11^2 mod p
+          [w17,w16] <= [w17,w16]^2 mod [w13,w12] */
+  bn.mov    w10, w16
+  bn.mov    w11, w17
+  jal       x1, p384_mulmod_p
+
+  /* Exp: 0b111 = 0b1+0b110
+     Val: r111 <= z*r110  mod p
+          [w1,w0] = [w17,w16] <= [w30,w29]*[w17,w16] mod [w13,w12] */
+  bn.mov    w10, w29
+  bn.mov    w11, w30
+  jal       x1, p384_mulmod_p
+  bn.mov    w0, w16
+  bn.mov    w1, w17
+
+  /* Exp: 0b111000 = 0b111<<3
+     Val: r111000 <= r111^(2^3)  mod p
+          [w17,w16] <= [w17,w16]^(2^3) mod [w13,w12] */
+  loopi     3, 4
+    bn.mov    w10, w16
+    bn.mov    w11, w17
+    jal       x1, p384_mulmod_p
+    nop
+
+  /* Exp: 0b111111 = 0b111+0b111000
+     Val: r111111 <= r111*r111000 mod p
+          [w3,w2] = [w17,w16] <= [w1,w0]*[w17,w16] mod [w13,w12] */
+  bn.mov    w10, w0
+  bn.mov    w11, w1
+  jal       x1, p384_mulmod_p
+  bn.mov    w2, w16
+  bn.mov    w3, w17
+
+  /* Exp: 2^12-1 = (0b111111<<6)+0b111111
+     Val: r_12_1 <= r111111^(2^6)*r111111 mod p
+          [w5,w4] = [w17,w16] <= [w17,w16]^(2^6)*[w3,w2] mod [w13,w12] */
+  loopi     6, 4
+    bn.mov    w10, w16
+    bn.mov    w11, w17
+    jal       x1, p384_mulmod_p
+    nop
+  bn.mov    w10, w2
+  bn.mov    w11, w3
+  jal       x1, p384_mulmod_p
+  bn.mov    w4, w16
+  bn.mov    w5, w17
+
+  /* Exp: 2^24-1 = ((2^12-1)<<12)+(2^12-1)
+     Val: r_24_1 <= r_12_1^(2^12)*r_12_1 mod p
+          [w17,w16] <= [w17,w16]^(2^12)*[w5,w4] mod [w13,w12] */
+  loopi     12, 4
+    bn.mov    w10, w16
+    bn.mov    w11, w17
+    jal       x1, p384_mulmod_p
+    nop
+  bn.mov    w10, w4
+  bn.mov    w11, w5
+  jal       x1, p384_mulmod_p
+
+  /* Exp: 2^30-1 = ((2^24-1)<<6)+0b111111
+     Val: r_30_1 <= r_24_1^(2^6)*r111111 mod p
+          [w3, w2] = [w17,w16] <= [w17,w16]^(2^6)*[w3,w2] mod [w13,w12] */
+  loopi     6, 4
+    bn.mov    w10, w16
+    bn.mov    w11, w17
+    jal       x1, p384_mulmod_p
+    nop
+  bn.mov    w10, w2
+  bn.mov    w11, w3
+  jal       x1, p384_mulmod_p
+  bn.mov    w2, w16
+  bn.mov    w3, w17
+
+  /* Exp: 2^31-1 <= (2^30-1)*2+0b1
+     Val: r_31_1 <= r_30_1^2*z mod p
+          [w7,w6] = [w17,w16] <= [w17,w16]^2*[w30,w29] mod [w13,w12] */
+  bn.mov    w10, w16
+  bn.mov    w11, w17
+  jal       x1, p384_mulmod_p
+  bn.mov    w10, w29
+  bn.mov    w11, w30
+  jal       x1, p384_mulmod_p
+  bn.mov    w6, w16
+  bn.mov    w7, w17
+
+  /* Exp: 2^32-1 <= (2^31-1)*2+0b1
+     Val: r_32_1 <= r_31_1^2*z mod p   (note: stored with w9 = low, w8 = high)
+          [w8,w9] = [w17,w16] <= [w17,w16]^2*[w30,w29] mod [w13,w12] */
+  bn.mov    w10, w16
+  bn.mov    w11, w17
+  jal       x1, p384_mulmod_p
+  bn.mov    w10, w29
+  bn.mov    w11, w30
+  jal       x1, p384_mulmod_p
+  bn.mov    w9, w16
+  bn.mov    w8, w17
+
+  /* Exp: 2^63-1 <= ((2^32-1)<<31)+(2^31-1)
+     Val: r_63_1 <= r_32_1^(2^31)*r_31_1 mod p
+          [w7,w6] = [w17,w16] <= [w17,w16]^(2^31)*[w7,w6] mod [w13,w12] */
+  loopi     31, 4
+    bn.mov    w10, w16
+    bn.mov    w11, w17
+    jal       x1, p384_mulmod_p
+    nop
+  bn.mov    w10, w6
+  bn.mov    w11, w7
+  jal       x1, p384_mulmod_p
+  bn.mov    w6, w16
+  bn.mov    w7,w17
+
+  /* Exp: 2^126-1 = ((2^63-1)<<63) + (2^63-1)
+     Val: r_126_1 <= r_63_1^(2^63)*r_63_1 mod p
+          [w7,w6] = [w17,w16] <= [w17,w16]^(2^63)*[w7,w6] mod [w13,w12] */
+  loopi     63, 4
+    bn.mov    w10, w16
+    bn.mov    w11, w17
+    jal       x1, p384_mulmod_p
+    nop
+  bn.mov    w10, w6
+  bn.mov    w11, w7
+  jal       x1, p384_mulmod_p
+  bn.mov    w6, w16
+  bn.mov    w7, w17
+
+  /* Exp: 2^252-1 = ((2^126-1)<<126)+(2^126-1)
+     Val: r_252_1 <= r_126_1^(2^126)*r_126_1 mod p
+          [w17,w16] <= [w17,w16]^(2^126)*[w7,w6] mod [w13,w12] */
+  loopi     126, 4
+    bn.mov    w10, w16
+    bn.mov    w11, w17
+    jal       x1, p384_mulmod_p
+    nop
+  bn.mov    w10, w6
+  bn.mov    w11, w7
+  jal       x1, p384_mulmod_p
+
+  /* Exp: 2^255-1 = ((2^252-1)<<3)+0b111
+     Val: r_255_1 <= r_252_1^(2^3)*r111 mod p
+          [w17,w16] <= [w17,w16]^(2^3)*[w1,w0] mod [w13,w12] */
+  loopi     3, 4
+    bn.mov    w10, w16
+    bn.mov    w11, w17
+    jal       x1, p384_mulmod_p
+    nop
+  bn.mov    w10, w0
+  bn.mov    w11, w1
+  jal       x1, p384_mulmod_p
+
+  /* Exp: p-2 = ((((((2^255-1)<<33)+(2^32-1))<<94)+(2^30-1))<<2)+0b1
+     Val: z_inv <=((r_255_1^(2^33)*r_32_1)^(2^94)*r_30_1)^(2^2)*z mod p
+          [w17,w16] <= (([w17,w16]^(2^33)*[w8,w9])^(2^94)*[w3,w2])^(2^2)
+                       *[w30,w29] mod [w13,w12] */
+  loopi     33, 4
+    bn.mov    w10, w16
+    bn.mov    w11, w17
+    jal       x1, p384_mulmod_p
+    nop
+  bn.mov    w10, w9
+  bn.mov    w11, w8
+  jal       x1, p384_mulmod_p
+  loopi     94, 4
+    bn.mov    w10, w16
+    bn.mov    w11, w17
+    jal       x1, p384_mulmod_p
+    nop
+  bn.mov    w10, w2
+  bn.mov    w11, w3
+  jal       x1, p384_mulmod_p
+  loopi     2, 4
+    bn.mov    w10, w16
+    bn.mov    w11, w17
+    jal       x1, p384_mulmod_p
+    nop
+  bn.mov    w10, w29
+  bn.mov    w11, w30
+  jal       x1, p384_mulmod_p
+
+  /* store inverse [w1,w0] <= [w17,w16] = z_inv */
+  bn.mov w0, w16
+  bn.mov w1, w17
+
+  /* convert x-coordinate to affine space ([w17,w16] still holds z_inv)
+     [w26,w25] <= [w17,w16] = x_a <= x/z = x*z_inv = [w26,w25]*[w1,w0] mod p */
+  bn.mov    w10, w25
+  bn.mov    w11, w26
+  jal       x1, p384_mulmod_p
+  bn.mov    w25, w16
+  bn.mov    w26, w17
+
+  /* convert y-coordinate to affine space (reload z_inv into [w17,w16] first)
+     [w28,w27] <= [w17,w16] = y_a <= y/z = y*z_inv = [w28,w27]*[w1,w0] mod p */
+  bn.mov    w10, w27
+  bn.mov    w11, w28
+  bn.mov    w16, w0
+  bn.mov    w17, w1
+  jal       x1, p384_mulmod_p
+  bn.mov    w27, w16
+  bn.mov    w28, w17
+
+  ret
 
 .section .data
 
+.balign 32
+
 /* P-384 domain parameter b */
 .globl p384_b
 p384_b:
diff --git a/sw/otbn/crypto/p384_base_mult.s b/sw/otbn/crypto/p384_base_mult.s
new file mode 100644
index 0000000000000..55f96294199ca
--- /dev/null
+++ b/sw/otbn/crypto/p384_base_mult.s
@@ -0,0 +1,149 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+/*
+ *   P-384 specific routines for constant-time base point multiplication.
+ */
+
+ .section .text
+
+/**
+ * Externally callable routine for P-384 base point multiplication
+ *
+ * returns Q = d (*) G
+ *         where Q is a resulting valid P-384 curve point in affine
+ *                   coordinates,
+ *               G is the base point of curve P-384, and
+ *               d is a 384-bit scalar, read as two shares (d0, d1).
+ *
+ * Sets up pointers and domain parameters, then calls scalar_mult_int_p384
+ * and proj_to_affine_p384. This routine runs in constant time.
+ *
+ * @param[in]  dmem[0]: dptr_d0, pointer to location in dmem containing
+ *                      1st private key share d0
+ * @param[in]  dmem[4]: dptr_d1, pointer to location in dmem containing
+ *                      2nd private key share d1
+ * @param[in]  dmem[20]: dptr_x, pointer to result buffer for x-coordinate
+ * @param[in]  dmem[24]: dptr_y, pointer to result buffer for y-coordinate
+ *
+ * 384-bit quantities have to be provided in dmem in little-endian format,
+ * 512 bit aligned, with the highest 128 bit set to zero.
+ *
+ * Flags: When leaving this subroutine, the M, L and Z flags of FG0 correspond
+ *        to the computed affine y-coordinate.
+ *
+ * clobbered registers: x2, x3, x9 to x13, x17 to x30
+ *                      w0 to w30 (w31 is set to zero)
+ * clobbered flag groups: FG0
+ */
+.globl p384_base_mult
+p384_base_mult:
+
+  /* set dmem pointer to x-coordinate of base point */
+  la        x20, p384_gx
+
+  /* set dmem pointer to y-coordinate of base point */
+  la        x21, p384_gy
+
+  /* set dmem pointer to 1st scalar share d0 (read through dptr_d0) */
+  la        x17, dptr_d0
+  lw        x17, 0(x17)
+
+  /* set dmem pointer to 2nd scalar share d1 (read through dptr_d1) */
+  la        x19, dptr_d1
+  lw        x19, 0(x19)
+
+  /* set dmem pointer to domain parameter b */
+  la        x28, p384_b
+
+  /* set dmem pointer to scratchpad */
+  la        x30, scratchpad
+
+  /* load domain parameter n (order of base point)
+     [w11, w10] = n = dmem[p384_n] */
+  li        x2, 10
+  la        x3, p384_n
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2++, 32(x3)
+
+  /* load domain parameter p (modulus); x2 continues at 12 from above
+     [w13, w12] = p = dmem[p384_p] */
+  la        x3, p384_p
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2++, 32(x3)
+
+  /* init all-zero reg */
+  bn.xor    w31, w31, w31
+
+  /* scalar multiplication in projective space
+     [w30:w25] <= (x, y, z) = d * G */
+  jal       x1, scalar_mult_int_p384
+
+  /* conversion into affine space
+     [w28:w25] <= (x, y) */
+  jal       x1, proj_to_affine_p384
+
+  /* set dmem pointer to result buffer for x (read through dptr_x) */
+  la        x20, dptr_x
+  lw        x20, 0(x20)
+
+  /* set dmem pointer to result buffer for y (read through dptr_y) */
+  la        x21, dptr_y
+  lw        x21, 0(x21)
+
+  /* store result in dmem: x <= [w26,w25], y <= [w28,w27] */
+  li        x2, 25
+  bn.sid    x2++, 0(x20)
+  bn.sid    x2++, 32(x20)
+  bn.sid    x2++, 0(x21)
+  bn.sid    x2++, 32(x21)
+
+  ret
+
+/* pointers and scratchpad memory (all symbols below are declared weak) */
+.section .data
+
+.balign 32
+
+/* pointer to k0 (dptr_k0) */
+.globl dptr_k0
+.weak dptr_k0
+dptr_k0:
+  .zero 4
+
+/* pointer to k1 (dptr_k1) */
+.globl dptr_k1
+.weak dptr_k1
+dptr_k1:
+  .zero 4
+
+/* pointer to d0 (dptr_d0) */
+.globl dptr_d0
+.weak dptr_d0
+dptr_d0:
+  .zero 4
+
+/* pointer to d1 (dptr_d1) */
+.globl dptr_d1
+.weak dptr_d1
+dptr_d1:
+  .zero 4
+
+/* pointer to X (dptr_x) */
+.globl dptr_x
+.weak dptr_x
+dptr_x:
+  .zero 4
+
+/* pointer to Y (dptr_y) */
+.globl dptr_y
+.weak dptr_y
+dptr_y:
+  .zero 4
+
+/* 704 bytes of scratchpad memory, aligned to 32 bytes */
+.balign 32
+.globl scratchpad
+.weak scratchpad
+scratchpad:
+  .zero 704
diff --git a/sw/otbn/crypto/p384_curve_point_valid.s b/sw/otbn/crypto/p384_curve_point_valid.s
new file mode 100644
index 0000000000000..b0d57e4b84134
--- /dev/null
+++ b/sw/otbn/crypto/p384_curve_point_valid.s
@@ -0,0 +1,58 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Curve point validation for curve P-384.
+ *
+ * Checks if a given curve point (e.g. public key for ECDH shared key
+ * generation) is a valid curve point on the P-384 curve.
+ *
+ * The check is successful when the binary execution completes without
+ * error. In case of an invalid point, a software error is raised and execution
+ * is halted.
+ */
+
+.section .text.start
+start:
+  /* Init all-zero register. */
+  bn.xor    w31, w31, w31
+
+  jal       x1, validate_point
+
+  /* Not expected to be reached (validate_point ends in ecall); fail. */
+  unimp
+  unimp
+  unimp
+
+validate_point:
+  /* Call curve point validation function */
+  jal       x1, p384_curve_point_valid
+
+  ecall
+
+.data
+
+/* pointer to x-coordinate (dptr_x) */
+.globl dptr_x
+.balign 4
+dptr_x:
+  .zero 4
+
+/* pointer to y-coordinate (dptr_y) */
+.globl dptr_y
+.balign 4
+dptr_y:
+  .zero 4
+
+/* Public key x-coordinate (64-byte buffer). */
+.globl x
+.balign 32
+x:
+  .zero 64
+
+/* Public key y-coordinate (64-byte buffer). */
+.globl y
+.balign 32
+y:
+  .zero 64
diff --git a/sw/otbn/crypto/p384_ecdh.s b/sw/otbn/crypto/p384_ecdh.s
new file mode 100644
index 0000000000000..0fb8fd42271b5
--- /dev/null
+++ b/sw/otbn/crypto/p384_ecdh.s
@@ -0,0 +1,217 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Elliptic-curve Diffie-Hellman (ECDH) on curve P-384.
+ *
+ * This binary has the following modes of operation:
+ * 1. MODE_KEYPAIR_RANDOM: generate a random keypair
+ * 2. MODE_SHARED_KEY: compute shared key - !!! Attention !!! - before
+ *                     shared key computation p384_curve_point_valid
+ *                     binary has to be executed to check if the provided
+ *                     public key is valid.
+ */
+
+/**
+ * Mode magic values generated with
+ * $ ./util/design/sparse-fsm-encode.py -d 6 -m 4 -n 11 \
+ *    --avoid-zero -s 3660400884
+ *
+ * Call the same utility with the same arguments and a higher -m to generate
+ * additional value(s) without changing the others or sacrificing mutual HD.
+ *
+ * TODO(#17727): in some places the OTBN assembler support for .equ directives
+ * is lacking, so they cannot be used in bignum instructions or pseudo-ops such
+ * as `li`. If support is added, we could use 32-bit values here instead of
+ * 11-bit.
+ */
+.equ MODE_SHARED_KEY, 0x5ec
+.equ MODE_KEYPAIR_RANDOM, 0x3f1
+
+.section .text.start
+start:
+  /* Init all-zero register. */
+  bn.xor    w31, w31, w31
+
+  /* Read the mode word from dmem and branch to the requested operation. */
+  la        x2, mode
+  lw        x2, 0(x2)
+
+  addi      x3, x0, MODE_KEYPAIR_RANDOM
+  beq       x2, x3, keypair_random
+
+  addi      x3, x0, MODE_SHARED_KEY
+  beq       x2, x3, shared_key
+
+  /* Mode matched neither magic value: unsupported mode; fail. */
+  unimp
+  unimp
+  unimp
+
+/**
+ * Generate a fresh random keypair.
+ *
+ * Returns secret key d in 448-bit shares d0, d1.
+ *
+ * Returns public key Q = d*G in affine coordinates (x, y).
+ *
+ * This routine runs in constant time (except potentially waiting for entropy
+ * from RND).
+ *
+ * @param[in]       w31: all-zero
+ * @param[in]   dmem[0]: dptr_d0, pointer to location in dmem containing
+ *                       1st private key share d0
+ * @param[in]   dmem[4]: dptr_d1, pointer to location in dmem containing
+ *                       2nd private key share d1
+ * @param[in]  dmem[20]: dptr_x, pointer to result buffer for x-coordinate
+ * @param[in]  dmem[24]: dptr_y, pointer to result buffer for y-coordinate
+ * @param[out] dmem[d0]: 1st private key share d0
+ * @param[out] dmem[d1]: 2nd private key share d1
+ * @param[out]  dmem[x]: Public key x-coordinate
+ * @param[out]  dmem[y]: Public key y-coordinate
+ *
+ * clobbered registers: x2, x3, x9 to x13, x17 to x30, w0 to w30
+ * clobbered flag groups: FG0
+ */
+keypair_random:
+  /* Generate secret key d in shares.
+       dmem[d0] <= d0
+       dmem[d1] <= d1 */
+  jal       x1, p384_generate_random_key
+
+  /* Generate public key d*G from the shares just written.
+       dmem[x] <= (d*G).x
+       dmem[y] <= (d*G).y */
+  jal       x1, p384_base_mult
+
+  ecall
+
+/**
+ * Generate a shared key from a secret and public key.
+ *
+ * Returns the shared key, which is the affine x-coordinate of (d*Q). The
+ * shared key is expressed in boolean shares x0, x1 such that the key is (x0 ^
+ * x1).
+ *
+ * This routine runs in constant time.
+ *
+ * !!! Attention !!! - before shared key computation p384_curve_point_valid
+ * binary has to be executed to check if the provided public key is valid.
+ *
+ * @param[in]       w31: all-zero
+ * @param[in]   dmem[0]: dptr_k0, pointer to location in dmem containing
+ *                       1st private key share d0/k0
+ * @param[in]   dmem[4]: dptr_k1, pointer to location in dmem containing
+ *                       2nd private key share d1/k1
+ * @param[in]  dmem[20]: dptr_x, pointer to result buffer for x-coordinate
+ * @param[in]  dmem[24]: dptr_y, pointer to result buffer for y-coordinate
+ * @param[out]  dmem[x]: x0, first share of shared key.
+ * @param[out]  dmem[y]: x1, second share of shared key.
+ *
+ * clobbered registers: x2 to x4, x9 to x13, x18 to x21, x26 to x30, w0 to w30
+ * clobbered flag groups: FG0
+ */
+shared_key:
+    /* Generate arithmetically masked shared key d*Q.
+       dmem[x] <= (d*Q).x - m_x mod p
+       dmem[y] <= m_x */
+  jal       x1, p384_scalar_mult
+
+  /* Arithmetic-to-boolean conversion: load the masked result into WDRs.
+     [w12,w11] <= dmem[x] = x_m
+     [w19,w18] <= dmem[y] = m */
+  li        x2, 11
+  la        x3, x
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2++, 32(x3)
+  li        x2, 18
+  la        x3, y
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2, 32(x3)
+
+  /* Load domain parameter.
+     [w14,w13] = dmem[p384_p] */
+  li        x2, 13
+  la        x4, p384_p
+  bn.lid    x2++, 0(x4)
+  bn.lid    x2++, 32(x4)
+
+  jal       x1, p384_arithmetic_to_boolean_mod
+
+  /* dmem[x] <= [w21,w20] = x' (both 256-bit words of the 384-bit share;
+     assumes the callee returns the high word in w21 - TODO confirm). */
+  li        x3, 20
+  la        x4, x
+  bn.sid    x3++, 0(x4)
+  bn.sid    x3, 32(x4)
+
+  ecall
+
+.data
+
+/* Operational mode. */
+.globl mode
+.balign 4
+mode:
+  .zero 4
+
+/* pointer to x-coordinate (dptr_x) */
+.globl dptr_x
+.balign 4
+dptr_x:
+  .zero 4
+
+/* pointer to y-coordinate (dptr_y) */
+.globl dptr_y
+.balign 4
+dptr_y:
+  .zero 4
+
+/* Public key x-coordinate. */
+.globl x
+.balign 32
+x:
+  .zero 64
+
+/* Public key y-coordinate. */
+.globl y
+.balign 32
+y:
+  .zero 64
+
+/* Secret key (d) in two shares: d = (d0 + d1) mod n.
+   The buffers are also labeled k0, k1 because `p384_scalar_mult` is shared
+   with ECDSA signing and reads from those labels; for ECDH its scalar is
+   always the private key (d). NOTE(review): dptr_k0/dptr_k1 are exported
+   below but get no label in this file - confirm that is intended. */
+
+/* pointer to d0 (dptr_d0) */
+.globl dptr_k0
+.globl dptr_d0
+.balign 4
+dptr_d0:
+  .zero 4
+
+/* pointer to d1 (dptr_d1) */
+.globl dptr_k1
+.globl dptr_d1
+.balign 4
+dptr_d1:
+  .zero 4
+
+.globl d0
+.globl k0
+.balign 32
+d0:
+k0:
+  .zero 64
+
+.globl d1
+.globl k1
+.balign 32
+d1:
+k1:
+  .zero 64
+
+.balign 32
diff --git a/sw/otbn/crypto/p384_ecdsa_keygen.s b/sw/otbn/crypto/p384_ecdsa_keygen.s
new file mode 100644
index 0000000000000..e7d282b2a03c8
--- /dev/null
+++ b/sw/otbn/crypto/p384_ecdsa_keygen.s
@@ -0,0 +1,106 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Entrypoint for P-384 ECDSA key generation operations.
+ *
+ * This binary generates a new keypair.
+ */
+
+.section .text.start
+.globl start
+start:
+  /* Init all-zero register. */
+  bn.xor    w31, w31, w31
+
+  jal       x1, random_keygen
+
+  /* Not expected to be reached (random_keygen ends in ecall); fail. */
+  unimp
+  unimp
+  unimp
+
+/**
+ * Generate a fresh random keypair.
+ *
+ * Returns secret key d in 448-bit shares d0, d1.
+ * Returns public key Q = d*G in affine coordinates (x, y).
+ *
+ * @param[in]       w31: all-zero
+ * @param[in]   dmem[0]: dptr_d0, pointer to location in dmem containing
+ *                       1st private key share d0
+ * @param[in]   dmem[4]: dptr_d1, pointer to location in dmem containing
+ *                       2nd private key share d1
+ * @param[in]  dmem[20]: dptr_x, pointer to result buffer for x-coordinate
+ * @param[in]  dmem[24]: dptr_y, pointer to result buffer for y-coordinate
+ * @param[out] dmem[d0]: 1st private key share d0
+ * @param[out] dmem[d1]: 2nd private key share d1
+ * @param[out]  dmem[x]: Public key x-coordinate
+ * @param[out]  dmem[y]: Public key y-coordinate
+ */
+random_keygen:
+  /* Generate secret key d in shares.
+       dmem[d0] <= d0
+       dmem[d1] <= d1 */
+  jal       x1, p384_generate_random_key
+
+  /* Generate public key d*G from the shares just written.
+       dmem[x] <= (d*G).x
+       dmem[y] <= (d*G).y */
+  jal       x1, p384_base_mult
+
+  ecall
+
+.bss
+
+/* pointer to k0 (dptr_k0) */
+.globl dptr_k0
+dptr_k0:
+  .zero 4
+
+/* pointer to k1 (dptr_k1) */
+.globl dptr_k1
+dptr_k1:
+  .zero 4
+
+/* pointer to d0 (dptr_d0) */
+.globl dptr_d0
+dptr_d0:
+  .zero 4
+
+/* pointer to d1 (dptr_d1) */
+.globl dptr_d1
+dptr_d1:
+  .zero 4
+
+/* random scalar first share (64-byte buffer) */
+.globl k0
+.balign 32
+k0:
+  .zero 64
+
+/* random scalar second share (64-byte buffer) */
+.globl k1
+.balign 32
+k1:
+  .zero 64
+
+/* private key first share (64-byte buffer) */
+.globl d0
+.balign 32
+d0:
+  .zero 64
+
+/* private key second share (64-byte buffer) */
+.globl d1
+.balign 32
+d1:
+  .zero 64
+
+/* 704 bytes of scratchpad memory
+  defined globally to save dmem */
+.balign 32
+.globl scratchpad
+scratchpad:
+  .zero 704
diff --git a/sw/otbn/crypto/p384_ecdsa_sca.s b/sw/otbn/crypto/p384_ecdsa_sca.s
index fbe766b144ef0..77ec9f800cec6 100644
--- a/sw/otbn/crypto/p384_ecdsa_sca.s
+++ b/sw/otbn/crypto/p384_ecdsa_sca.s
@@ -26,7 +26,7 @@ start:
 
 .text
 p384_ecdsa_sign:
-  jal      x1, p384_ecdsa_setup_rand
+  jal      x1, p384_ecdsa_setup
   jal      x1, p384_sign
   ecall
 
@@ -37,23 +37,40 @@ p384_ecdsa_verify:
 /**
  * Populate the variables rnd and k with randomness, and setup data pointers.
  */
-p384_ecdsa_setup_rand:
-  /* Obtain the blinding constant from URND, and write it to `rnd` in DMEM. */
-  /* bn.wsrr   w0, 0x2 */ /* URND */
-  la        x10, rnd
-  /* bn.sid    x0, 0(x10) */
-
-  /* Point dptr_rnd to rnd. */
-  la        x11, dptr_rnd
+p384_ecdsa_setup:
+  /* Point dptr_k0 to k0. */
+  la        x10, k0
+  la        x11, dptr_k0
   sw        x10, 0(x11)
 
-  /* Obtain the nonce (k) from RND. */
-  /*bn.wsrr   w0, 0x1 *//* RND */
-  la        x10, k
-  /*bn.sid    x0, 0(x10)*/
+  /* Point dptr_k1 to k1. */
+  la        x10, k1
+  la        x11, dptr_k1
+  sw        x10, 0(x11)
+
+  /* Point dptr_d0 to d0. */
+  la        x10, d0
+  la        x11, dptr_d0
+  sw        x10, 0(x11)
+
+  /* Point dptr_d1 to d1. */
+  la        x10, d1
+  la        x11, dptr_d1
+  sw        x10, 0(x11)
 
-  /* Point dptr_k to k. */
-  la        x11, dptr_k
+  /* Point dptr_msg to msg. */
+  la        x10, msg
+  la        x11, dptr_msg
+  sw        x10, 0(x11)
+
+  /* Point dptr_r to r. */
+  la        x10, r
+  la        x11, dptr_r
+  sw        x10, 0(x11)
+
+  /* Point dptr_s to s. */
+  la        x10, s
+  la        x11, dptr_s
   sw        x10, 0(x11)
 
   ret
@@ -70,15 +87,21 @@ mode:
 
 /* All constants below must be 256b-aligned. */
 
-/* random scalar k */
-.global k
+/* random scalar k, first share (k0) */
+.global k0
 .balign 64
-k:
+k0:
   .zero 64
 
-/* randomness for blinding */
+/* random scalar k, second share (k1) */
+.global k1
 .balign 64
+k1:
+  .zero 64
+
+/* randomness for blinding */
 .global rnd
+.balign 64
 rnd:
   .zero 64
 
@@ -112,10 +135,16 @@ x:
 y:
   .zero 64
 
-/* private key d */
-.globl d
+/* private key d, first share (d0) */
+.globl d0
 .balign 64
-d:
+d0:
+  .zero 64
+
+/* private key d, second share (d1) */
+.globl d1
+.balign 64
+d1:
   .zero 64
 
 /* verification result x_r (aka x_1) */
@@ -123,3 +152,53 @@ d:
 .balign 64
 x_r:
   .zero 64
+
+/* pointer to rnd (dptr_rnd) */
+.globl dptr_rnd
+dptr_rnd:
+  .zero 4
+
+/* pointer to k0 (dptr_k0) */
+.globl dptr_k0
+dptr_k0:
+  .zero 4
+
+/* pointer to k1 (dptr_k1) */
+.globl dptr_k1
+dptr_k1:
+  .zero 4
+
+/* pointer to msg (dptr_msg) */
+.globl dptr_msg
+dptr_msg:
+  .zero 4
+
+/* pointer to R (dptr_r) */
+.globl dptr_r
+dptr_r:
+  .zero 4
+
+/* pointer to S (dptr_s) */
+.globl dptr_s
+dptr_s:
+  .zero 4
+
+/* pointer to X (dptr_x) */
+.globl dptr_x
+dptr_x:
+  .zero 4
+
+/* pointer to Y (dptr_y) */
+.globl dptr_y
+dptr_y:
+  .zero 4
+
+/* pointer to d0 (dptr_d0) */
+.globl dptr_d0
+dptr_d0:
+  .zero 4
+
+/* pointer to d1 (dptr_d1) */
+.globl dptr_d1
+dptr_d1:
+  .zero 4
diff --git a/sw/otbn/crypto/p384_ecdsa_sign.s b/sw/otbn/crypto/p384_ecdsa_sign.s
new file mode 100644
index 0000000000000..c063c15bff2b4
--- /dev/null
+++ b/sw/otbn/crypto/p384_ecdsa_sign.s
@@ -0,0 +1,163 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Entrypoint for P-384 ECDSA signing operations.
+ *
+ * This binary generates a signature using a caller-provided secret key.
+ */
+
+.section .text.start
+.globl start
+start:
+  /* Init all-zero register. */
+  bn.xor    w31, w31, w31
+
+  jal       x1, ecdsa_sign
+
+  /* Not expected to be reached (ecdsa_sign ends in ecall); fail. */
+  unimp
+  unimp
+  unimp
+
+/**
+ * P-384 ECDSA signature generation.
+ * Generate the secret scalar k from a random seed.
+ *
+ * @param[in]  dmem[0]: dptr_k0, pointer to location in dmem containing
+ *                      1st scalar share k0
+ * @param[in]  dmem[4]: dptr_k1, pointer to location in dmem containing
+ *                      2nd scalar share k1
+ * @param[in]  dmem[8]: dptr_msg, pointer to the message to be signed in dmem
+ * @param[in]  dmem[12]: dptr_r, pointer to dmem location where r component
+ *                               of signature will be placed
+ * @param[in]  dmem[16]: dptr_s, pointer to dmem location where s component
+ *                               of signature will be placed
+ * @param[in]  dmem[28]: dptr_d0, pointer to location in dmem containing
+ *                      1st private key share d0
+ * @param[in]  dmem[32]: dptr_d1, pointer to location in dmem containing
+ *                      2nd private key share d1
+ * @param[out] dmem[r]: r component of signature
+ * @param[out] dmem[s]: s component of signature
+ */
+ecdsa_sign:
+  /* Generate a fresh random scalar for signing.
+       dmem[k0] <= first share of k
+       dmem[k1] <= second share of k */
+  jal      x1, p384_generate_k
+
+  /* Generate the signature. */
+  jal      x1, p384_sign
+
+  ecall
+
+.bss
+
+/* pointer to x-coordinate (dptr_x) */
+.globl dptr_x
+.balign 4
+dptr_x:
+  .zero 4
+
+/* pointer to y-coordinate (dptr_y) */
+.globl dptr_y
+.balign 4
+dptr_y:
+  .zero 4
+
+/* pointer to k0 (dptr_k0) */
+.globl dptr_k0
+dptr_k0:
+  .zero 4
+
+/* pointer to k1 (dptr_k1) */
+.globl dptr_k1
+dptr_k1:
+  .zero 4
+
+/* pointer to d0 (dptr_d0) */
+.globl dptr_d0
+dptr_d0:
+  .zero 4
+
+/* pointer to d1 (dptr_d1) */
+.globl dptr_d1
+dptr_d1:
+  .zero 4
+
+/* pointer to msg (dptr_msg) */
+.globl dptr_msg
+dptr_msg:
+  .zero 4
+
+/* pointer to signature component r (dptr_r) */
+.globl dptr_r
+dptr_r:
+  .zero 4
+
+/* pointer to signature component s (dptr_s) */
+.globl dptr_s
+dptr_s:
+  .zero 4
+
+/* x-coordinate. */
+.globl x
+.balign 32
+x:
+  .zero 64
+
+/* y-coordinate. */
+.globl y
+.balign 32
+y:
+  .zero 64
+
+/* random scalar first share */
+.globl k0
+.balign 32
+k0:
+  .zero 64
+
+/* random scalar second share */
+.globl k1
+.balign 32
+k1:
+  .zero 64
+
+/* private key first share */
+.globl d0
+.balign 32
+d0:
+  .zero 64
+
+/* private key second share */
+.globl d1
+.balign 32
+d1:
+  .zero 64
+
+/* hash message to sign */
+.globl msg
+.balign 32
+msg:
+  .zero 64
+
+/* r part of signature */
+.globl r
+.balign 32
+r:
+  .zero 64
+
+/* s part of signature */
+.globl s
+.balign 32
+s:
+  .zero 64
+
+/* 704 bytes of scratchpad memory
+  defined globally to save dmem */
+.balign 32
+.globl scratchpad
+scratchpad:
+  .zero 704
diff --git a/sw/otbn/crypto/p384_ecdsa_verify.s b/sw/otbn/crypto/p384_ecdsa_verify.s
new file mode 100644
index 0000000000000..577dcc2184bd5
--- /dev/null
+++ b/sw/otbn/crypto/p384_ecdsa_verify.s
@@ -0,0 +1,130 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Entrypoint for P-384 ECDSA verifying operations.
+ *
+ * This binary verifies a signature. - !!! Attention !!! - before
+ * signature verification p384_curve_point_valid
+ * binary has to be executed to check if the provided
+ * public key is valid.
+ */
+
+.section .text.start
+.globl start
+start:
+  /* Init all-zero register. */
+  bn.xor    w31, w31, w31
+
+  jal       x1, ecdsa_verify
+
+  /* Not expected to be reached (ecdsa_verify ends in ecall); fail. */
+  unimp
+  unimp
+  unimp
+
+
+/**
+ * P-384 ECDSA signature verification
+ *
+ * The routine computes the x1 coordinate and places it in dmem. x1 will be
+ * reduced (mod n), however, the final comparison has to be performed on the
+ * host side. The signature is valid if x1 == r.
+ * This routine runs in variable time.
+ *
+ * @param[in]  dmem[4]: dptr_rnd, pointer to dmem location where the reduced
+ *                           affine x1-coordinate will be stored
+ * @param[in]  dmem[8]: dptr_msg, pointer to the message to be verified in dmem
+ * @param[in]  dmem[12]: dptr_r, pointer to r of signature in dmem
+ * @param[in]  dmem[16]: dptr_s, pointer to s of signature in dmem
+ * @param[in]  dmem[20]: dptr_x, pointer to x-coordinate of public key in dmem
+ * @param[in]  dmem[24]: dptr_y, pointer to y-coordinate of public key in dmem
+ * @param[out] dmem[rnd]: x1 coordinate to be compared to r
+ *
+ * !!! Attention !!! - before signature verification p384_curve_point_valid
+ * binary has to be executed to check if the provided public key is valid.
+ *
+ */
+ecdsa_verify:
+  /* Verify the signature (compute x1). */
+  jal      x1, p384_verify
+
+  ecall
+
+.bss
+
+/* pointer to x-coordinate (dptr_x) */
+.globl dptr_x
+.balign 4
+dptr_x:
+  .zero 4
+
+/* pointer to y-coordinate (dptr_y) */
+.globl dptr_y
+.balign 4
+dptr_y:
+  .zero 4
+
+/* pointer to rnd (dptr_rnd) */
+.globl dptr_rnd
+dptr_rnd:
+  .zero 4
+
+/* pointer to msg (dptr_msg) */
+.globl dptr_msg
+dptr_msg:
+  .zero 4
+
+/* pointer to signature component r (dptr_r) */
+.globl dptr_r
+dptr_r:
+  .zero 4
+
+/* pointer to signature component s (dptr_s) */
+.globl dptr_s
+dptr_s:
+  .zero 4
+
+/* Public key x-coordinate. */
+.globl x
+.balign 32
+x:
+  .zero 64
+
+/* Public key y-coordinate. */
+.globl y
+.balign 32
+y:
+  .zero 64
+
+/* result of verify (x1 coordinate) */
+.globl rnd
+.balign 32
+rnd:
+  .zero 64
+
+/* hash message to verify */
+.globl msg
+.balign 32
+msg:
+  .zero 64
+
+/* r part of signature */
+.globl r
+.balign 32
+r:
+  .zero 64
+
+/* s part of signature */
+.globl s
+.balign 32
+s:
+  .zero 64
+
+/* 896 bytes of scratchpad memory
+  defined globally to save dmem. */
+.balign 32
+.globl scratchpad
+scratchpad:
+  .zero 896
diff --git a/sw/otbn/crypto/p384_internal_mult.s b/sw/otbn/crypto/p384_internal_mult.s
new file mode 100644
index 0000000000000..409404937696d
--- /dev/null
+++ b/sw/otbn/crypto/p384_internal_mult.s
@@ -0,0 +1,374 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+/*
+ *   P-384 specific routines for internal scalar multiplication of curve points.
+ */
+
+ .section .text
+
+/**
+ * Fetch curve point from dmem, randomize z-coordinate and store point in dmem
+ *
+ * returns P = (x, y, z) = (x_a*z, y_a*z, z)
+ *         with P being a valid P-384 curve point in projective coordinates
+ *              x_a and y_a being the affine coordinates as fetched from dmem
+ *              z being a randomized z-coordinate
+ *
+ * This routine fetches the affine x- and y-coordinates of a curve point from
+ * dmem and computes a valid set of projective coordinates. The z-coordinate is
+ * randomized and x and y are scaled appropriately. The resulting projective
+ * coordinates are stored at dmem[dptr_p_p] using 6 consecutive 256-bit cells,
+ * i.e. each coordinate is stored 512 bit aligned, little endian.
+ * This routine runs in constant time.
+ *
+ * @param[in]  x20: dptr_x, pointer to dmem location containing affine
+ *                          x-coordinate of input point
+ * @param[in]  x21: dptr_y, pointer to dmem location containing affine
+ *                          y-coordinate of input point
+ * @param[in]  [w15, w14]: u[383:0] lower 384 bit of Barrett constant u for
+ *                                    modulus p
+ * @param[in]  [w13, w12]: p, modulus of P-384 underlying finite field
+ * @param[in]  w31: all-zero
+ * @param[in]  x18: dptr_p_p, pointer to dmem location to store resulting point
+ *                            in projective space
+ *
+ * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
+ *        the upper limb of projective y-coordinate.
+ *
+ * clobbered registers: x10, x11 to x13
+ *                      w2, w3, w8 to w11, w16 to w24, w29, w30
+ * clobbered flag groups: FG0
+ */
+ .globl store_proj_randomize
+store_proj_randomize:
+
+  /* get a 384-bit random number from URND (upper 128 bits of w3 cleared)
+    [w3, w2] = random(384) */
+  bn.wsrr   w2, 2
+  bn.wsrr   w3, 2
+  bn.rshi   w3, w31, w3 >> 128
+
+  /* reduce random number with one conditional subtraction of p
+     [w3, w2] = z <= [w3, w2] mod p */
+  bn.sub   w10, w2, w12
+  bn.subb  w11, w3, w13
+  bn.sel   w2, w2, w10, C
+  bn.sel   w3, w3, w11, C
+
+  bn.mov w10, w2
+  bn.mov w11, w3
+
+  /* store z-coordinate
+     dmem[x18+128] = [w11, w10] */
+  li        x10, 10
+  li        x11, 11
+  bn.sid    x10, 128(x18)
+  bn.sid    x11, 160(x18)
+
+  /* fetch x-coordinate from dmem
+     [w17, w16] = x <= [dmem[dptr_x+32], dmem[dptr_x]] */
+  li x12, 16
+  li x13, 17
+  bn.lid    x12,  0(x20)
+  bn.lid    x13, 32(x20)
+
+  /* scale and store x-coordinate
+     [dmem[dptr_p_p+32], dmem[dptr_p_p]] = [w17, w16] =
+       x_p <= [w11, w10] * [w17, w16] = z*x  mod p */
+
+  jal       x1, p384_mulmod_p
+  bn.sid    x12,  0(x18)
+  bn.sid    x13, 32(x18)
+
+  /* fetch y-coordinate from dmem
+     [w17, w16] = y <= [dmem[dptr_y+32], dmem[dptr_y]] */
+  bn.lid    x12,  0(x21)
+  bn.lid    x13, 32(x21)
+
+  /* scale and store y-coordinate (z is copied back to [w11, w10] first)
+     [dmem[dptr_p_p+96], dmem[dptr_p_p+64]] = [w17, w16] =
+       y_p <= [w11, w10] * [w17, w16] = z*y  mod p */
+  bn.mov w10, w2
+  bn.mov w11, w3
+  jal       x1, p384_mulmod_p
+  bn.sid    x12, 64(x18)
+  bn.sid    x13, 96(x18)
+
+  ret
+
+/**
+ * P-384 scalar point multiplication in projective space
+ *
+ * returns R = k*P = k*(x_p, y_p)
+ *         where P is a valid P-384 curve point in affine coordinates,
+ *               k is a 384-bit scalar,
+ *               R is a valid P-384 curve point in projective coordinates.
+ *
+ * This routine performs scalar multiplication based on the group laws
+ * of Weierstrass curves.
+ * A constant time double-and-add algorithm (sometimes referred to as
+ * double-and-add-always) is used.
+ * Due to the P-384 optimized implementations of the internally called routines
+ * for point addition and doubling, this routine is limited to P-384 curves.
+ * The routine makes use of blinding by additive splitting the
+ * exponent/scalar d into two shares. The double-and-add loop operates on both
+ * shares in parallel applying the Strauss-Shamir trick:
+ * The routine receives the scalar in two shares k0, k1 such that
+ *   k = (k0 + k1) mod n
+ * The loop operates on both shares in parallel, computing (k0 + k1) * P as
+ * follows:
+ *  Q = (0, 1, 0) # origin
+ *  for i in 448..0:
+ *    Q = 2 * Q
+ *    A = if (k0[i] ^ k1[i]) then P else 2P
+ *    B = Q + A
+ *    Q = if (k0[i] | k1[i]) then B else Q
+ *
+ * Each share k0/k1 is 448 bits, even though it represents a 384-bit value.
+ * This is a side-channel protection measure.
+ *
+ * @param[in]  x17: dptr_k0, pointer to first share k0 of scalar k
+ *                           (0 < k < n) in dmem (448-bit)
+ * @param[in]  x19: dptr_k1, pointer to second share k1 of scalar k
+ *                           (0 < k < n) in dmem (448-bit)
+ * @param[in]  x20: dptr_x, pointer to affine x-coordinate in dmem
+ * @param[in]  x21: dptr_y, pointer to affine y-coordinate in dmem
+ * @param[in]  x28: dptr_b, pointer to domain parameter b of P-384 in dmem
+ * @param[in]  x30: dptr_sp, pointer to 704 bytes of scratchpad memory in dmem
+ * @param[in]  [w13, w12]: p, modulus of P-384 underlying finite field
+ * @param[in]  [w11, w10]: n, domain parameter of P-384 curve
+ *                            (order of base point G)
+ * @param[in]  w31: all-zero
+ * @param[out]  [w26,w25]: x, x-coordinate of resulting point R (projective).
+ * @param[out]  [w28,w27]: y, y-coordinate of resulting point R (projective).
+ * @param[out]  [w30,w29]: z, z-coordinate of resulting point R (projective).
+ *
+ * Scratchpad memory layout:
+ * The routine expects at least 704 bytes of scratchpad memory at dmem
+ * location 'scratchpad' (sp). Internally the scratchpad is used as follows:
+ * dptr_sp     .. dptr_sp+191: point P, projective
+ * dptr_sp+192 .. dptr_sp+255: s0, 1st share of scalar
+ * dptr_sp+256 .. dptr_sp+447: point 2P, projective
+ * dptr_sp+448 .. dptr_sp+511: s1, 2nd share of scalar
+ * dptr_sp+512 .. dptr_sp+703: point Q, projective
+ *
+ * Projective coordinates of a point are kept in dmem in little endian format
+ * with the individual coordinates 512 bit aligned. The coordinates are stored
+ * in x,y,z order (i.e. x at lowest, z at highest address). Thus, a 384 bit
+ * curve point occupies 6 consecutive 256-bit dmem cells.
+ *
+ * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
+ *        the computed affine y-coordinate.
+ *
+ * clobbered registers: x2, x10, x11 to x13, x18, x26, x27, w0 to w30
+ * clobbered flag groups: FG0
+ */
+ .globl scalar_mult_int_p384
+scalar_mult_int_p384:
+
+  /* set regfile pointers to in/out regs of Barrett routine. Set here to avoid
+     resetting in every call to point addition routine */
+  li        x22, 10
+  li        x23, 11
+  li        x24, 16
+  li        x25, 17
+
+  /* fetch 1st share of scalar from dmem
+     s0 = [w1, w0] <= dmem[dptr_k0] = [dmem[x17], dmem[x17+32]] = k0 */
+  li        x2, 0
+  bn.lid    x2++, 0(x17)
+  bn.lid    x2++, 32(x17)
+
+  /* fetch 2nd share of scalar from dmem
+     s1 = [w3, w2] <= dmem[dptr_k1] = [dmem[x19], dmem[x19+32]] = k1 */
+  bn.lid    x2++, 0(x19)
+  bn.lid    x2++, 32(x19)
+
+  /* left align both shares (shift left by 64 bit) for probing of MSB in loop
+     body */
+  bn.rshi   w1, w1, w0 >> 192
+  bn.rshi   w0, w0, w31 >> 192
+  bn.rshi   w3, w3, w2 >> 192
+  bn.rshi   w2, w2, w31 >> 192
+
+  /* store shares in scratchpad: s0 @x30+192, s1 @x30+448 */
+  li        x2, 0
+  bn.sid    x2++, 192(x30)
+  bn.sid    x2++, 224(x30)
+  bn.sid    x2++, 448(x30)
+  bn.sid    x2++, 480(x30)
+
+  /* get randomized projective coordinates of curve point
+     P = (x_p, y_p, z_p) = dmem[dptr_sp] = (x*z mod p, y*z mod p, z) */
+  add       x18, x30, 0
+  jal       x1, store_proj_randomize
+
+  /* double point P
+     2P = ([w30,w29], [w28,w27], [w26, w25]) <= 2*P */
+  add       x27, x30, x0
+  add       x26, x30, x0
+  jal       x1, proj_add_p384
+
+  /* store point 2P in scratchpad @x30+256
+     dmem[dptr_sc+256] = [w30:w25] = 2P */
+  li        x2, 25
+  bn.sid    x2++, 256(x30)
+  bn.sid    x2++, 288(x30)
+  bn.sid    x2++, 320(x30)
+  bn.sid    x2++, 352(x30)
+  bn.sid    x2++, 384(x30)
+  bn.sid    x2++, 416(x30)
+
+  /* init point Q = (0,1,0) for double-and-add in scratchpad */
+  /* dmem[x26] = dmem[dptr_sc+512] = Q = (0,1,0); w30 = 1, then x2 -> w31 = 0 */
+  addi      x26, x30, 512
+  li        x2, 30
+  bn.addi   w30, w31, 1
+  bn.sid    x2++, 64(x26)
+  bn.sid    x2, 0(x26)
+  bn.sid    x2, 32(x26)
+  bn.sid    x2, 96(x26)
+  bn.sid    x2, 128(x26)
+  bn.sid    x2, 160(x26)
+
+  /* double-and-add loop with decreasing index (448 iterations, 85 instr.) */
+  loopi     448, 85
+
+    /* double point Q (x27 = x26, so Q is added to itself)
+       Q = ([w30,w29], [w28,w27], [w26, w25]) <= Q + dmem[x27] */
+    add       x27, x26, x0
+    jal       x1, proj_add_p384
+
+    /* store Q in dmem
+     dmem[x26] = dmem[dptr_sc+512] <= [w30:w25] */
+    li        x2, 25
+    bn.sid    x2++, 0(x26)
+    bn.sid    x2++, 32(x26)
+    bn.sid    x2++, 64(x26)
+    bn.sid    x2++, 96(x26)
+    bn.sid    x2++, 128(x26)
+    bn.sid    x2++, 160(x26)
+
+    /* Probe if MSb of either of the two scalars (rnd or d-rnd) but not both
+       is 1.
+       If only one MSb is set, select P for addition.
+       If both MSbs are set, select 2P for addition.
+       (If neither MSB is set, 2P will be selected but result discarded.) */
+    li        x2, 0
+    bn.lid    x2++, 224(x30)
+    bn.lid    x2, 480(x30)
+    bn.xor    w8, w0, w1
+    /* Create conditional offset into scratchpad from the inverted FG0.M flag.
+       if (s0[511] xor s1[511]) x27 <= x30 else x27 <= x30+256 */
+    csrrs     x3, FG0, x0
+    xori      x3, x3, -1
+    andi      x3, x3, 2
+    slli      x27, x3, 7
+    add       x27, x27, x30
+
+    /* Reload randomized projective coordinates for curve point P.
+       P = (x_p, y_p, z_p) = dmem[dptr_sp] <= (x*z mod p, y*z mod p, z) */
+    jal       x1, store_proj_randomize
+
+    /* Add points Q+P or Q+2P depending on offset in x27.
+       Q_a = ([w30,w29], [w28,w27], [w26, w25]) <= Q + dmem[x27] */
+    jal       x1, proj_add_p384
+
+    /* load shares from scratchpad
+       [w1, w0] = s0; [w3, w2] = s1 */
+    li        x2, 0
+    bn.lid    x2++, 192(x30)
+    bn.lid    x2++, 224(x30)
+    bn.lid    x2++, 448(x30)
+    bn.lid    x2++, 480(x30)
+
+    /* M = s0[511] | s1[511] (only the FG0.M flag is used, w8 is overwritten) */
+    bn.or     w8, w1, w3
+
+    /* load q from scratchpad
+        Q = ([w9,w8], [w7,w6], [w5,w4]) <= dmem[x26] */
+    li        x2, 4
+    bn.lid    x2++, 0(x26)
+    bn.lid    x2++, 32(x26)
+    bn.lid    x2++, 64(x26)
+    bn.lid    x2++, 96(x26)
+    bn.lid    x2++, 128(x26)
+    bn.lid    x2++, 160(x26)
+
+    /* select either Q or Q_a
+       if M: Q = ([w30,w29], [w28,w27], [w26, w25]) <= Q_a else: Q <= Q */
+    bn.sel    w25, w25, w4, M
+    bn.sel    w26, w26, w5, M
+    bn.sel    w27, w27, w6, M
+    bn.sel    w28, w28, w7, M
+    bn.sel    w29, w29, w8, M
+    bn.sel    w30, w30, w9, M
+
+    /* store Q in dmem
+     dmem[x26] = dmem[dptr_sc+512] <= [w30:w25] */
+    li        x2, 25
+    bn.sid    x2++, 0(x26)
+    bn.sid    x2++, 32(x26)
+    bn.sid    x2++, 64(x26)
+    bn.sid    x2++, 96(x26)
+    bn.sid    x2++, 128(x26)
+    bn.sid    x2++, 160(x26)
+
+    /* left shift both shares
+       s0 <= s0 << 1 ; s1 <= s1 << 1 */
+    bn.add    w0, w0, w0
+    bn.addc   w1, w1, w1
+    bn.add    w2, w2, w2
+    bn.addc   w3, w3, w3
+    /* store both shares in scratchpad */
+    li        x2, 0
+    bn.sid    x2++, 192(x30)
+    bn.sid    x2++, 224(x30)
+    bn.sid    x2++, 448(x30)
+    bn.sid    x2++, 480(x30)
+
+
+    /* Get a fresh random number from URND and scale the coordinates of 2P.
+       (scaling each proj. coordinate by same factor results in same point) */
+
+    /* get a 384-bit random number from URND (WSR 2), upper 128 bit cleared */
+    bn.wsrr   w2, 2
+    bn.wsrr   w3, 2
+    bn.rshi   w3, w31, w3 >> 128
+
+    /* reduce random number by one conditional subtraction of p
+      [w3, w2] = z <= [w3, w2] mod p */
+    bn.sub    w10, w2, w12
+    bn.subb   w11, w3, w13
+    bn.sel    w2, w2, w10, C
+    bn.sel    w3, w3, w11, C
+
+    /* scale all coordinates of 2P in scratchpad (@x30+256) via [w17,w16] */
+    li        x2, 16
+    li        x3, 17
+    /* x-coordinate */
+    bn.mov    w10, w2
+    bn.mov    w11, w3
+    bn.lid    x2, 256(x30)
+    bn.lid    x3, 288(x30)
+    jal       x1, p384_mulmod_p
+    bn.sid    x2, 256(x30)
+    bn.sid    x3, 288(x30)
+    /* y-coordinate */
+    bn.mov    w10, w2
+    bn.mov    w11, w3
+    bn.lid    x2, 320(x30)
+    bn.lid    x3, 352(x30)
+    jal       x1, p384_mulmod_p
+    bn.sid    x2, 320(x30)
+    bn.sid    x3, 352(x30)
+    /* z-coordinate */
+    bn.mov    w10, w2
+    bn.mov    w11, w3
+    bn.lid    x2, 384(x30)
+    bn.lid    x3, 416(x30)
+    jal       x1, p384_mulmod_p
+    bn.sid    x2, 384(x30)
+    bn.sid    x3, 416(x30)
+
+  ret
diff --git a/sw/otbn/crypto/p384_isoncurve.s b/sw/otbn/crypto/p384_isoncurve.s
new file mode 100644
index 0000000000000..ef8bb215dbd75
--- /dev/null
+++ b/sw/otbn/crypto/p384_isoncurve.s
@@ -0,0 +1,316 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+/*
+ *   P-384 specific routines for ECDSA signature verification and curve point
+ *   test.
+ */
+
+ .section .text
+
+/**
+ * Checks if a point is a valid curve point on curve P-384
+ *
+ * Returns rhs = x^3 + ax + b  mod p
+ *     and lhs = y^2  mod p
+ *         where x,y are the affine coordinates of the curve point and
+ *              a, b and p being the domain parameters of curve P-384.
+ *
+ * This routine checks if a point with given x- and y-coordinate is a valid
+ * curve point on P-384.
+ * The routine checks whether the coordinates are a solution of the
+ * Weierstrass equation y^2 = x^3 + ax + b  mod p.
+ * The routine makes use of the property that the domain parameter 'a' can be
+ * written as a=-3 for the P-384 curve, hence the routine is limited to P-384.
+ * The routine does not return a boolean result but computes the left side
+ * and the right side of the Weierstrass equation and leaves the final
+ * comparison to the caller.
+ * The routine runs in constant time.
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * @param[in]  dmem[12]: dptr_rhs, pointer to dmem location where right
+ *                               side result will be stored
+ * @param[in]  dmem[16]: dptr_lhs, pointer to dmem location where left side
+ *                               result will be stored
+ * @param[in]  dmem[20]: dptr_x, pointer to dmem location containing affine
+ *                               x-coordinate of input point
+ * @param[in]  dmem[24]: dptr_y, pointer to dmem location containing affine
+ *                               y-coordinate of input point
+ *
+ * clobbered registers: x2, x3, w0 to w5, w10 to w17
+ * clobbered flag groups: FG0
+ */
+ .globl p384_isoncurve
+p384_isoncurve:
+
+  /* setup all-zero reg */
+  bn.xor    w31, w31, w31
+
+  /* load affine x-coordinate of curve point from dmem
+     [w1, w0] <= dmem[dptr_x] = dmem[20] */
+  la        x3, dptr_x
+  lw        x3, 0(x3)
+  li        x2, 0
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2++, 32(x3)
+
+  /* load affine y-coordinate of curve point from dmem
+     [w3, w2] <= dmem[dptr_y] = dmem[24] */
+  la        x3, dptr_y
+  lw        x3, 0(x3)
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2, 32(x3)
+
+  /* load domain parameter p (modulus) from dmem
+     [w13, w12] = p = dmem[p384_p] */
+  li        x2, 12
+  la        x3, p384_p
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2++, 32(x3)
+
+  /* load domain parameter b from dmem
+     [w5, w4] = b = dmem[p384_b] */
+  li        x2, 4
+  la        x3, p384_b
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2++, 32(x3)
+
+  /* y^2 = [w17,w16] <= y*y = [w3,w2]*[w3,w2] */
+  bn.mov    w10, w2
+  bn.mov    w11, w3
+  bn.mov    w16, w2
+  bn.mov    w17, w3
+  jal       x1, p384_mulmod_p
+
+  /* store result (left side): dmem[dptr_lhs] <= y^2 = [w17,w16] */
+  la        x3, dptr_lhs
+  lw        x3, 0(x3)
+  li        x2, 16
+  bn.sid    x2++, 0(x3)
+  bn.sid    x2++, 32(x3)
+
+  /*  x^3 = [w17,w16] <= (x*x)*x = ([w1,w0]*[w1,w0])*[w1,w0] */
+  bn.mov    w10, w0
+  bn.mov    w11, w1
+  bn.mov    w16, w0
+  bn.mov    w17, w1
+  jal       x1, p384_mulmod_p
+  bn.mov    w10, w0
+  bn.mov    w11, w1
+  jal       x1, p384_mulmod_p
+
+  /* for curve P-384, 'a' can be written as a = -3, therefore we subtract
+     x three times from x^3.
+     x^3 + ax  mod p = [w17,w16] <= x^3 -3 x mod p
+                     = [w17,w16] - [w1,w0] - [w1,w0] - [w1,w0] mod [w13,w12] */
+  loopi     3, 6
+    bn.sub    w16, w16, w0
+    bn.subb   w17, w17, w1
+    bn.add    w10, w16, w12
+    bn.addc   w11, w17, w13
+    bn.sel    w16, w10, w16, C
+    bn.sel    w17, w11, w17, C
+
+  /* add domain parameter b
+     x^3 + ax + b mod p = [w17,w16] <= [w17,w16] + [w5,w4] mod [w13,w12] */
+  bn.add    w16, w16, w4
+  bn.addc   w17, w17, w5
+  bn.sub    w10, w16, w12
+  bn.subb   w11, w17, w13
+  bn.sel    w16, w16, w10, C
+  bn.sel    w17, w17, w11, C
+
+  /* store result (right side)
+     dmem[dptr_rhs] <= x^3 + ax + b mod p = [w17,w16] */
+  la        x3, dptr_rhs
+  lw        x3, 0(x3)
+  li        x2, 16
+  bn.sid    x2++, 0(x3)
+  bn.sid    x2++, 32(x3)
+
+  ret
+
+/**
+ * Check if a provided curve point is valid.
+ *
+ * For a given curve point (x, y), check that:
+ * - x and y are both fully reduced mod p
+ * - (x, y) is on the P-384 curve.
+ *
+ * Note that, because the point is in affine form, it is not possible that (x,
+ * y) is the point at infinity. In some other forms such as projective
+ * coordinates, we would need to check for this also.
+ *
+ * This routine raises a software error and halts operation if the curve point
+ * is invalid.
+ *
+ * @param[in]  dmem[12]: dptr_rhs, pointer to dmem location where right hand
+ *                               side result rhs will be stored
+ * @param[in]  dmem[16]: dptr_lhs, pointer to dmem location where left hand
+ *                               side result lhs will be stored
+ * @param[in]  dmem[20]: dptr_x, pointer to dmem location containing affine
+ *                               x-coordinate of input point
+ * @param[in]  dmem[24]: dptr_y, pointer to dmem location containing affine
+ *                               y-coordinate of input point
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * clobbered registers: x2, x3, x20 to x23, w0 to w17
+ * clobbered flag groups: FG0
+ */
+ .globl p384_curve_point_valid
+p384_curve_point_valid:
+  /* Init all-zero register. */
+  bn.xor    w31, w31, w31
+
+  /* load domain parameter p (modulus)
+     [w13, w12] = p = dmem[p384_p] */
+  li        x2, 12
+  la        x3, p384_p
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2++, 32(x3)
+
+  /* Load public key x-coordinate.
+     [w11, w10] <= dmem[x] = x */
+  la        x20, dptr_x
+  lw        x20, 0(x20)
+  li        x2, 10
+  bn.lid    x2++, 0(x20)
+  bn.lid    x2, 32(x20)
+
+  /* Compare x to p (the borrow of x - p ends up in the carry flag).
+       FG0.C <= (x < p) */
+  bn.sub    w0, w10, w12
+  bn.subb   w0, w11, w13
+
+  /* Trigger a fault if FG0.C (bit 0 of FG0) is false. */
+  csrrs     x2, FG0, x0
+  andi      x2, x2, 1
+  bne       x2, x0, _x_valid
+  unimp
+
+  _x_valid:
+
+  /* Load public key y-coordinate.
+       [w9, w8] <= dmem[y] = y */
+  la        x21, dptr_y
+  lw        x21, 0(x21)
+  li        x2, 8
+  bn.lid    x2++, 0(x21)
+  bn.lid    x2, 32(x21)
+
+  /* Compare y to p (the borrow of y - p ends up in the carry flag).
+       FG0.C <= (y < p) */
+  bn.sub    w0, w8, w12
+  bn.subb   w0, w9, w13
+
+  /* Trigger a fault if FG0.C (bit 0 of FG0) is false. */
+  csrrs     x2, FG0, x0
+  andi      x2, x2, 1
+  bne       x2, x0, _y_valid
+  unimp
+
+  _y_valid:
+
+  /* Compute both sides of the Weierstrass equation.
+       dmem[rhs] <= (x^3 + ax + b) mod p
+       dmem[lhs] <= (y^2) mod p */
+  jal       x1, p384_isoncurve
+
+  /* Load both sides of the equation.
+       [w7, w6] <= dmem[rhs]
+       [w5, w4] <= dmem[lhs] */
+  la        x22, dptr_rhs
+  lw        x22, 0(x22)
+  li        x2, 6
+  bn.lid    x2++, 0(x22)
+  bn.lid    x2, 32(x22)
+  la        x23, dptr_lhs
+  lw        x23, 0(x23)
+  li        x2, 4
+  bn.lid    x2++, 0(x23)
+  bn.lid    x2, 32(x23)
+
+  /* Compare the two sides of the equation; [w1, w0] <= lhs - rhs.
+       FG0.Z <= (y^2) mod p == (x^3 + ax + b) mod p, checked per 256-bit half */
+  bn.sub    w0, w4, w6
+  bn.subb   w1, w5, w7
+
+  bn.cmp    w0, w31
+
+  /* Trigger a fault if FG0.Z (bit 3 of FG0) is false for the lower half. */
+  csrrs     x2, FG0, x0
+  srli      x2, x2, 3
+  andi      x2, x2, 1
+  bne       x2, x0, _pt_1st_reg_valid
+  unimp
+  unimp
+  unimp
+
+  _pt_1st_reg_valid:
+
+  bn.cmp    w1, w31
+
+  /* Trigger a fault if FG0.Z (bit 3 of FG0) is false for the upper half. */
+  csrrs     x2, FG0, x0
+  srli      x2, x2, 3
+  andi      x2, x2, 1
+  bne       x2, x0, _pt_valid
+  unimp
+  unimp
+  unimp
+
+  _pt_valid:
+
+  ret
+
+.data
+
+/* Right side of Weierstrass equation (64 bytes, 32-byte aligned) */
+.globl rhs
+.balign 32
+rhs:
+  .zero 64
+
+/* Left side of Weierstrass equation (64 bytes, 32-byte aligned) */
+.globl lhs
+.balign 32
+lhs:
+  .zero 64
+
+/* Curve point x-coordinate. */
+.globl x
+.weak x
+.balign 32
+x:
+  .zero 64
+
+/* Curve point y-coordinate. */
+.globl y
+.weak y
+.balign 32
+y:
+  .zero 64
+
+/* pointer to rhs (dptr_rhs) */
+.globl dptr_rhs
+dptr_rhs:
+  .zero 4
+
+/* pointer to lhs (dptr_lhs) */
+.globl dptr_lhs
+dptr_lhs:
+  .zero 4
+
+/* pointer to X (dptr_x) */
+.globl dptr_x
+.weak dptr_x
+dptr_x:
+  .zero 4
+
+/* pointer to Y (dptr_y) */
+.globl dptr_y
+.weak dptr_y
+dptr_y:
+  .zero 4
diff --git a/sw/otbn/crypto/p384_keygen.s b/sw/otbn/crypto/p384_keygen.s
new file mode 100644
index 0000000000000..b608ea8af9f5b
--- /dev/null
+++ b/sw/otbn/crypto/p384_keygen.s
@@ -0,0 +1,256 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+/*
+ *   This library contains:
+ *   - P-384 specific routines to generate random values for keys and scalars
+ */
+
+.section .text
+
+/**
+ * Generate a nonzero random value in the scalar field.
+ *
+ * Returns t, a random value that is nonzero mod n, in shares.
+ *
+ * This follows a modified version of the method in FIPS 186-4 sections B.4.1
+ * and B.5.1 for generation of secret scalar values d and k. The computation
+ * in FIPS 186-4 is:
+ *   seed = RBG(seedlen) // seedlen >= 448
+ *   return (seed mod (n-1)) + 1
+ *
+ * The important features here are that (a) the seed is at least 64 bits longer
+ * than n in order to minimize bias after the reduction and (b) the resulting
+ * scalar is guaranteed to be nonzero.
+ *
+ * We deviate from FIPS a little bit here because for side-channel protection,
+ * we do not want to fully reduce the seed modulo (n-1) or combine the shares.
+ * Instead, we do the following:
+ *   seed0 = RBG(448)
+ *   seed1 = RBG(448)
+ *   x = URND(127) + 1 // random value for masking
+ *   if (seed0 * x + seed1 * x) mod n == 0:
+ *     retry
+ *   return seed0, seed1
+ *
+ * Essentially, we get two independent seeds and interpret these as additive
+ * shares of the scalar t = (seed0 + seed1) mod n. Then, we need to ensure t is
+ * nonzero. Multiplying each share with a random masking parameter allows us to
+ * safely add them, and then check if this result is 0; if it is, then t must
+ * be 0 mod n and we need to retry.
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * @param[in]           w31:  all-zero
+ * @param[in]  dmem[p384_n]:  Curve order n
+ * @param[out]      [w7,w6]:  first share of secret scalar t (448 bits)
+ * @param[out]      [w9,w8]:  second share of secret scalar t (448 bits)
+ *
+ * clobbered registers: x2, x3, w4 to w14, w16 to w28
+ * clobbered flag groups: FG0
+ */
+p384_random_scalar:
+  /* Load the curve order n.
+     [w13,w12] <= dmem[p384_n] = n */
+  li        x2, 12
+  la        x3, p384_n
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2++, 32(x3)
+
+  random_scalar_retry:
+  /* Obtain 1024 bits of randomness from RND. */
+  bn.wsrr   w6, RND
+  bn.wsrr   w7, RND
+  bn.wsrr   w8, RND
+  bn.wsrr   w9, RND
+
+  /* XOR with bits from URND, just in case there's any vulnerability in EDN
+     that lets the attacker recover bits before they reach OTBN. */
+  bn.wsrr   w5, URND
+  bn.xor    w6, w6, w5
+  bn.wsrr   w5, URND
+  bn.xor    w7, w7, w5
+  bn.wsrr   w5, URND
+  bn.xor    w8, w8, w5
+  bn.wsrr   w5, URND
+  bn.xor    w9, w9, w5
+
+  /* Shift bits to get 448-bit seeds.
+     seed0 = [w7,w6], seed1 = [w9,w8]
+     w7 <= w7[255:64]
+     w9 <= w9[255:64] */
+  bn.rshi   w7, w31, w7 >> 64
+  bn.rshi   w9, w31, w9 >> 64
+
+  /* Compute Solinas constant k for modulus n (we know it is only 191 bits, so
+     no need to compute the high part):
+     w14 <= 2^256 - n[255:0] = (2^384 - n) mod (2^256) = 2^384 - n */
+  bn.sub    w14, w31, w12
+
+  /* Generate a random 127-bit number.
+     w4 <= URND()[255:129] */
+  bn.wsrr   w4, URND
+  bn.rshi   w4, w31, w4 >> 129
+
+  /* Add 1 to get a 128-bit nonzero scalar for masking.
+     w4 <= w4 + 1 = x */
+  bn.addi   w4, w4, 1
+
+  /* [w26,w25] <= ([w7,w6] * w4) mod n = (seed0 * x) mod n */
+  bn.mov    w16, w4
+  bn.mov    w10, w6
+  bn.mov    w11, w7
+  jal       x1, p384_mulmod448x128_n
+  bn.mov    w25, w16
+  bn.mov    w26, w17
+
+  /* [w28,w27] <= ([w9,w8] * w4) mod n = (seed1 * x) mod n */
+  bn.mov    w16, w4
+  bn.mov    w10, w8
+  bn.mov    w11, w9
+  jal       x1, p384_mulmod448x128_n
+  bn.mov    w27, w16
+  bn.mov    w28, w17
+
+  /* Compute (seed * x) mod n = (seed0 * x + seed1 * x) mod n
+     [w17,w16] <= seed * x = [w26,w25] + [w28,w27] mod n */
+  bn.add    w18, w27, w25
+  bn.addc   w19, w28, w26
+  bn.mov    w20, w31
+  jal       x1, p384_reduce_n
+
+  /* Compare w16 to 0. */
+  bn.cmp    w16, w31
+
+  /* Read the FG0.Z flag (position 3).
+     x2 <= 8 if FG0.Z else 0 */
+  csrrw     x2, FG0, x0
+  andi      x2, x2, 8
+
+  /* Compare w17 to 0. */
+  bn.cmp    w17, w31
+
+  /* Read the FG0.Z flag (position 3).
+     x3 <= 8 if FG0.Z else 0 */
+  csrrw     x3, FG0, x0
+  andi      x3, x3, 8
+
+  /* Check if both registers w16 and w17 are equal to 0.
+     x2 AND x3 == 0 <=> [w17,w16] != 0, x2 AND x3 != 0 <=> [w17,w16] == 0 */
+  and       x2, x2, x3
+
+  /* Retry if x2 != 0. */
+  bne       x2, x0, random_scalar_retry
+
+  /* If we get here, then (seed0 + seed1) mod n is nonzero mod n; return. */
+
+  ret
+
+/**
+ * Generate the secret key d from a random seed.
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * @param[in]  dmem[0]: dptr_d0, pointer to location in dmem where the
+ *                      1st private key share d0 is written
+ * @param[in]  dmem[4]: dptr_d1, pointer to location in dmem where the
+ *                      2nd private key share d1 is written
+ *
+ * clobbered registers: x2, x3, x20, w4 to w14, w16 to w28
+ * clobbered flag groups: FG0
+ */
+.globl p384_generate_random_key
+p384_generate_random_key:
+  /* Init all-zero register. */
+  bn.xor    w31, w31, w31
+
+  /* Generate a random scalar in two 448-bit shares.
+     [w7,w6] <= d0
+     [w9,w8] <= d1 */
+  jal  x1, p384_random_scalar
+
+  /* Write first share to DMEM (x2 walks over w6, w7).
+     dmem[d0] <= [w7,w6] = d0 */
+  la        x20, dptr_d0
+  lw        x20, 0(x20)
+  li        x2, 6
+  bn.sid    x2++, 0(x20)
+  bn.sid    x2++, 32(x20)
+
+  /* Write second share to DMEM (x2 continues with w8, w9).
+     dmem[d1] <= [w9,w8] = d1 */
+  la        x20, dptr_d1
+  lw        x20, 0(x20)
+  bn.sid    x2++, 0(x20)
+  bn.sid    x2++, 32(x20)
+
+  ret
+
+/**
+ * Generate the secret scalar k from a random seed.
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * @param[in]  dmem[0]: dptr_k0, pointer to location in dmem where the
+ *                      1st scalar share k0 is written
+ * @param[in]  dmem[4]: dptr_k1, pointer to location in dmem where the
+ *                      2nd scalar share k1 is written
+ *
+ * clobbered registers: x2, x3, x20, w4 to w14, w16 to w28
+ * clobbered flag groups: FG0
+ */
+.globl p384_generate_k
+p384_generate_k:
+  /* Init all-zero register. */
+  bn.xor    w31, w31, w31
+
+  /* Generate a random scalar in two 448-bit shares.
+     [w7,w6] <= k0
+     [w9,w8] <= k1 */
+  jal  x1, p384_random_scalar
+
+  /* Write first share to DMEM (x2 walks over w6, w7).
+     dmem[k0] <= [w7,w6] = k0 */
+  la        x20, dptr_k0
+  lw        x20, 0(x20)
+  li        x2, 6
+  bn.sid    x2++, 0(x20)
+  bn.sid    x2++, 32(x20)
+
+  /* Write second share to DMEM (x2 continues with w8, w9).
+     dmem[k1] <= [w9,w8] = k1 */
+  la        x20, dptr_k1
+  lw        x20, 0(x20)
+  bn.sid    x2++, 0(x20)
+  bn.sid    x2++, 32(x20)
+
+  ret
+
+/* 4-byte dmem slots holding pointers to the scalar and key shares */
+.section .data
+
+.balign 32
+
+/* pointer to k0 (dptr_k0) */
+.globl dptr_k0
+.weak dptr_k0
+dptr_k0:
+  .zero 4
+
+/* pointer to k1 (dptr_k1) */
+.globl dptr_k1
+.weak dptr_k1
+dptr_k1:
+  .zero 4
+
+/* pointer to d0 (dptr_d0) */
+.globl dptr_d0
+.weak dptr_d0
+dptr_d0:
+  .zero 4
+
+/* pointer to d1 (dptr_d1) */
+.globl dptr_d1
+.weak dptr_d1
+dptr_d1:
+  .zero 4
diff --git a/sw/otbn/crypto/p384_modinv.s b/sw/otbn/crypto/p384_modinv.s
new file mode 100644
index 0000000000000..7d1712b8239c9
--- /dev/null
+++ b/sw/otbn/crypto/p384_modinv.s
@@ -0,0 +1,87 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+/*
+ *   P-384 specific routines for ECC modular inverse computation.
+ */
+
+ .section .text
+
+/**
+ * Variable-time modular multiplicative inverse computation
+ *
+ * returns x_inv = x^-1 mod m
+ *
+ * This routine computes the modular multiplicative inverse for any x < m in
+ * the finite field GF(m) where m is prime.
+ *
+ * For inverse computation, Fermat's little theorem is used, i.e.
+ * we compute x^-1 = x^(m-2) mod m.
+ * For exponentiation we use a standard, variable-time (!) square and multiply
+ * algorithm.
+ *
+ * This routine is mainly intended to be used for inversion of scalars in
+ * context of the P-384 curve. In theory, it can be used with any 384-bit
+ * modulus m whose Solinas constant k = 2^384 - m is at most 191 bits long
+ * (see parameter w14 below).
+ *
+ * Note: When used for P-384 scalar inversion, the routine will need 672 calls
+ * to the multiplication routine. By using an adder chain this could be reduced
+ * to ~433 multiplications, however, at the cost of a significant code size
+ * increase.
+ *
+ * Note: This routine runs in variable-time w.r.t. the modulus. It should only
+ * be used with a non-secret modulus.
+ *
+ * @param[in]  [w13, w12]: m, 384 bit modulus
+ * @param[in]  w14: k, Solinas constant (2^384 - m) (max. length 191 bits).
+ * @param[in]  [w30, w29]: x, 384 bit operand
+ * @param[in]  w31, all-zero
+ * @param[out] [w17, w16]: x_inv, modular multiplicative inverse
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * clobbered registers: x2, w2, w3, w10, w11, w16 to w24
+ * clobbered flag groups: FG0
+ */
+ .globl mod_inv_n_p384
+mod_inv_n_p384:
+
+  /* subtract 2 from modulus for Fermat's little theorem
+     [w3,w2] <= m - 2 = [w13,w12]-2 (left aligned, i.e. shifted left by 128) */
+  bn.subi   w2, w12, 2
+  bn.subb   w3, w13, w31
+  bn.rshi   w3, w3, w2 >> 128
+  bn.rshi   w2, w2, w31 >> 128
+
+  /* init square and multiply: [w17,w16] = 1 */
+  bn.addi   w16, w31, 1
+  bn.mov    w17, w31
+
+  /* square and multiply loop: 384 iterations, 12 instructions per body */
+  loopi     384, 12
+
+    /* square: [w17,w16] <= [w17, w16]*[w11,w10] mod [w13, w12] */
+    bn.mov    w10, w16
+    bn.mov    w11, w17
+    jal       x1, p384_mulmod_n
+
+    /* shift MSB of the exponent into carry flag
+       [w3,w2] = 2*[w3,w2] = [w3,w2] << 1 */
+    bn.add    w2, w2, w2
+    bn.addc   w3, w3, w3
+
+    /* skip multiplication if C flag (bit 0 of CSR 0x7c0) not set */
+    csrrs     x2, 0x7c0, x0
+    andi      x2, x2, 1
+    beq       x2, x0, nomul
+
+    /* multiply: [w17,w16] <= [w17, w16]*[w30,w29] mod [w13, w12] */
+    bn.mov    w10, w29
+    bn.mov    w11, w30
+    jal       x1, p384_mulmod_n
+
+    nomul:
+    nop
+
+  ret
diff --git a/sw/otbn/crypto/p384_scalar_mult.s b/sw/otbn/crypto/p384_scalar_mult.s
new file mode 100644
index 0000000000000..ab8700a4ff96e
--- /dev/null
+++ b/sw/otbn/crypto/p384_scalar_mult.s
@@ -0,0 +1,225 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+/*
+ *   P-384 specific routines for constant-time scalar multiplication.
+ */
+
+ .section .text
+
+/**
+ * Externally callable wrapper for P-384 scalar point multiplication
+ *
+ * Calculates R = k*P = k*(x_p, y_p)
+ *         where R, P are valid P-384 curve points in affine coordinates,
+ *         k is a 384-bit scalar.
+ *         The x coordinate of R is arithmetically masked.
+ * Returns the masked x coordinate of R and the corresponding mask.
+ *
+ * Sets up context and calls the internal scalar multiplication routine.
+ * This routine runs in constant time.
+ *
+ * @param[in]   dmem[0]: dptr_k0, pointer to location in dmem containing
+ *                       1st scalar share k0
+ * @param[in]   dmem[4]: dptr_k1, pointer to location in dmem containing
+ *                       2nd scalar share k1
+ * @param[in]  dmem[20]: dptr_x, pointer to affine x-coordinate in dmem
+ * @param[in]  dmem[24]: dptr_y, pointer to affine y-coordinate in dmem
+ * @param[out]  dmem[x]: masked x coordinate of R
+ * @param[out]  dmem[y]: corresponding mask
+ *
+ * 384-bit quantities have to be provided in dmem in little-endian format,
+ * 512 bit aligned, with the highest 128 bit set to zero.
+ *
+ * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
+ *        the computed affine y-coordinate.
+ *
+ * clobbered registers: x2 to x4, x9 to x13, x17 to x21, x26 to x30
+ *                      w0 to w30
+ * clobbered flag groups: FG0
+ */
+.globl p384_scalar_mult
+p384_scalar_mult:
+
+  /* Init all-zero register. */
+  bn.xor  w31, w31, w31
+
+  /* set dmem pointer to point x-coordinate */
+  la        x20, dptr_x
+  lw        x20, 0(x20)
+
+  /* set dmem pointer to point y-coordinate */
+  la        x21, dptr_y
+  lw        x21, 0(x21)
+
+  /* set dmem pointer to 1st scalar share k0 */
+  la        x17, dptr_k0
+  lw        x17, 0(x17)
+
+  /* set dmem pointer to 2nd scalar share k1 */
+  la        x19, dptr_k1
+  lw        x19, 0(x19)
+
+  /* set dmem pointer to domain parameter b */
+  la        x28, p384_b
+
+  /* set dmem pointer to scratchpad */
+  la        x30, scratchpad
+
+  /* load domain parameter p (modulus)
+     [w13, w12] = p = dmem[p384_p] */
+  li        x2, 12
+  la        x3, p384_p
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2++, 32(x3)
+
+  /* load domain parameter n (order of base point)
+     [w11, w10] = n = dmem[p384_n] */
+  li        x2, 10
+  la        x3, p384_n
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2++, 32(x3)
+
+  /* scalar multiplication in projective space
+     [w30:w25] <= (x, y, z) = k * P */
+  jal       x1, scalar_mult_int_p384
+
+  /* Arithmetic masking:
+   1. Generate a random mask r
+   2. Subtract masks from projective x coordinate
+      (x, y, z) -> ((x - r) mod p,
+                     y,
+                     z)
+   3. Convert masked curve point back to affine
+      form.
+   4. Multiply mask with z^-1 for use in
+      affine space. */
+
+  /* Load domain parameter.
+     [w13,w12] = dmem[p384_p] */
+  li        x2, 12
+  la        x4, p384_p
+  bn.lid    x2++, 0(x4)
+  bn.lid    x2++, 32(x4)
+
+  /* Fetch a fresh randomness for mask.
+     [w20, w19, w18] <= URND() = r */
+  bn.wsrr   w18, URND
+  bn.wsrr   w19, URND
+  bn.wsrr   w20, URND
+
+  /* Reduce r mod p (result is returned in [w17, w16] and moved back)
+     [w19, w18] <= [w20, w19, w18] mod [w13, w12] = r mod p */
+  jal       x1, p384_reduce_p
+  bn.mov    w18, w16
+  bn.mov    w19, w17
+
+  /* Arithmetic masking.
+     [w26,w25] = A <= [w26,w25] - [w19,w18] mod [w13,w12] = x - r mod p */
+
+  /* [w9,w8] = A1 <= [w26,w25] - [w19,w18] = x - r */
+  bn.sub    w8, w25, w18
+  bn.subb   w9, w26, w19
+
+  /* [w7,w6] = A2 <= [w9,w8] + [w13,w12] = A1 + p = x - r + p */
+  bn.add    w6, w8, w12
+  bn.addc   w7, w9, w13
+
+  /* If x < r: [w26,w25] <= A2, else: [w26,w25] <= A1
+     (the subtraction is repeated only to set FG0.C to the borrow of x - r) */
+  bn.sub    w0, w25, w18
+  bn.subb   w1, w26, w19
+  bn.sel    w25, w6, w8, FG0.C
+  bn.sel    w26, w7, w9, FG0.C
+
+  /* Store mask to dmem for later use.
+     y coordinate is not required afterwards and therefore can be used
+     for the mask. */
+  li        x2, 18
+  bn.sid    x2++, 0(x21)
+  bn.sid    x2, 32(x21)
+
+  /* conversion into affine space
+     [w1, w0] <= z^-1
+     [w28:w25] <= (x, y) */
+  jal       x1, proj_to_affine_p384
+
+  /* Get modular inverse z^-1 of projective z coordinate
+     and multiply the random masks with z^-1 to
+     also convert them into affine space. */
+
+  /* Load domain parameter.
+     [w13,w12] = dmem[p384_p] */
+  li        x2, 12
+  la        x4, p384_p
+  bn.lid    x2++, 0(x4)
+  bn.lid    x2++, 32(x4)
+
+  /* Move previously stored mask r and z^-1 into input WDRs
+     for multiplication: [w11, w10] <= r, [w17, w16] <= z^-1 */
+  li        x2, 10
+  bn.lid    x2++, 0(x21)
+  bn.lid    x2, 32(x21)
+  bn.mov    w16, w0
+  bn.mov    w17, w1
+
+  /* Compute affine mask by modular multiplication of r and z^-1.
+     [w17, w16] = r_a = r * z^-1 mod p */
+  jal       x1, p384_mulmod_p
+
+  /* Store result in dmem: masked x [w26, w25] to dmem[x], r_a to dmem[y].
+     y coordinate is not required afterwards and
+     is therefore replaced by the affine mask r_a */
+  li        x2, 25
+  bn.sid    x2++, 0(x20)
+  bn.sid    x2, 32(x20)
+  li        x2, 16
+  bn.sid    x2++, 0(x21)
+  bn.sid    x2, 32(x21)
+
+  ret
+
+/* pointers (4 bytes each) and scratchpad memory */
+.section .data
+
+.balign 32
+
+/* pointer to k0 (dptr_k0) */
+.globl dptr_k0
+.weak dptr_k0
+dptr_k0:
+  .zero 4
+
+/* pointer to k1 (dptr_k1) */
+.globl dptr_k1
+.weak dptr_k1
+dptr_k1:
+  .zero 4
+
+/* pointer to d0 (dptr_d0) */
+.globl dptr_d0
+.weak dptr_d0
+dptr_d0:
+  .zero 4
+
+/* pointer to d1 (dptr_d1) */
+.globl dptr_d1
+.weak dptr_d1
+dptr_d1:
+  .zero 4
+
+/* pointer to X (dptr_x) */
+.globl dptr_x
+.weak dptr_x
+dptr_x:
+  .zero 4
+
+/* pointer to Y (dptr_y) */
+.globl dptr_y
+.weak dptr_y
+dptr_y:
+  .zero 4
+
+/* 704 bytes of scratchpad memory (32-byte aligned) */
+.balign 32
+scratchpad:
+  .zero 704
diff --git a/sw/otbn/crypto/p384_sign.s b/sw/otbn/crypto/p384_sign.s
index 21aaeff1fec9d..6422ba62c7884 100644
--- a/sw/otbn/crypto/p384_sign.s
+++ b/sw/otbn/crypto/p384_sign.s
@@ -2,884 +2,11 @@
 /* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
 /* SPDX-License-Identifier: Apache-2.0 */
 /*
- *   P-384 specific routines for ECDSA signature generation and constant-time
- *   scalar multiplication.
+ *   P-384 specific routines for ECDSA signature generation.
  */
 
  .section .text
 
-/**
- * Convert projective coordinates of a P-384 curve point to affine coordinates
- *
- * returns P = (x_a, y_a) = (x/z mod p, y/z mod p)
- *              where P is a valid P-384 curve point,
- *                    x_a and y_a are the resulting affine coordinates of the
- *                      curve point,
- *                    x,y and z are a set of projective coordinates of the
- *                      point and
- *                    p is the modulus of the P-384 underlying finite field.
- *
- * This routine computes the affine coordinates for a set of projective
- * coordinates of a valid P-384 curve point. The routine performs the required
- * divisions by computing the multiplicative modular inverse of the
- * projective z-coordinate in the underlying finite field of the P-384 curve.
- * For inverse computation Fermat's little theorem is used, i.e.
- * we compute z^-1 = z^(p-2) mod p.
- * For exponentiation a 16 step addition chain is used.
- * Source of the addition chain is the addchain project:
- * https://github.com/mmcloughlin/addchain/
- *
- * Flags: Flags have no meaning beyond the scope of this subroutine.
- *
- * @param[in]  [w26,w25]: x, x-coordinate of curve point (projective).
- * @param[in]  [w26,w25]: y, y-coordinate of curve point (projective).
- * @param[in]  [w30,w29]: z, z-coordinate of curve point (projective).
- * @param[in]  [w13, w12]: p, modulus of P-384.
- * @param[in]  w31: all-zero.
- * @param[out] [w26, w25]: x_a, affine x-coordinate of resulting point.
- * @param[out] [w28, w27]: y_a, affine y-coordinate of resulting point.
- *
- * clobbered registers: w0 to w28
- * clobbered flag groups: FG0
- */
-proj_to_affine_p384:
-
-  /* Exp: 0b10 = 2*0b1
-     Val: r10 = z^2 mod p
-          [w17,w16] <= [w30,w29]^2 mod [w13,w12] */
-  bn.mov    w10, w29
-  bn.mov    w11, w30
-  bn.mov    w16, w29
-  bn.mov    w17, w30
-  jal       x1, p384_mulmod_p
-
-  /* Exp: 0b11 = 0b1+0b10
-     Val: r11 <= z*r10 mod p
-          [w17,w16] <= [w30,w29]*[w17,w16] mod [w13,w12] */
-  bn.mov    w10, w29
-  bn.mov    w11, w30
-  jal       x1, p384_mulmod_p
-
-  /* Exp: 0b110 = 2*0b11
-     Val: r110 = r11^2 mod p
-          [w17,w16] <= [w17,w16]^2 mod [w13,w12] */
-  bn.mov    w10, w16
-  bn.mov    w11, w17
-  jal       x1, p384_mulmod_p
-
-  /* Exp: 0b111 = 0b1+0b110
-     Val: r111 <= z*r110  mod p
-          [w1,w0] = [w17,w16] <= [w30,w29]*[w17,w16] mod [w13,w12] */
-  bn.mov    w10, w29
-  bn.mov    w11, w30
-  jal       x1, p384_mulmod_p
-  bn.mov    w0, w16
-  bn.mov    w1, w17
-
-  /* Exp: 0b111000 = 0b111<<3
-     Val: r111000 <= r111^(2^3)  mod p
-          [w17,w16] <= [w17,w16]^(2^3) mod [w13,w12] */
-  loopi     3, 4
-    bn.mov    w10, w16
-    bn.mov    w11, w17
-    jal       x1, p384_mulmod_p
-    nop
-
-  /* Exp: 0b1111111 = 0b111+0b111000
-     Val: r1111111 <= r111*r111000 mod p
-          [w3,w2] = [w17,w16] <= [w1,w0]*[w17,w16] mod [w13,w12] */
-  bn.mov    w10, w0
-  bn.mov    w11, w1
-  jal       x1, p384_mulmod_p
-  bn.mov    w2, w16
-  bn.mov    w3, w17
-
-  /* Exp: 2^12-1 = (0b1111111<<6)+0b111111
-     Val: r_12_1 <= r111111^(2^6)*r111111 mod p
-          [w5,w4] = [w17,w16] <= [w17,w16]^(2^6)*[w17,w16] mod [w13,w12] */
-  loopi     6, 4
-    bn.mov    w10, w16
-    bn.mov    w11, w17
-    jal       x1, p384_mulmod_p
-    nop
-  bn.mov    w10, w2
-  bn.mov    w11, w3
-  jal       x1, p384_mulmod_p
-  bn.mov    w4, w16
-  bn.mov    w5, w17
-
-  /* Exp: 2^24-1 = ((2^12-1)<<12)+(2^12-1)
-     Val: r_24_1 <= r_12_1^(2^12)*r12_1 mod p
-          [w17,w16] <= [w17,w16]^(2^12)*[w5,w4] mod [w13,w12] */
-  loopi     12, 4
-    bn.mov    w10, w16
-    bn.mov    w11, w17
-    jal       x1, p384_mulmod_p
-    nop
-  bn.mov    w10, w4
-  bn.mov    w11, w5
-  jal       x1, p384_mulmod_p
-
-  /* Exp: 2^30-1 = ((2^24-1)<<6)+0b111111
-     Val: r_30_1 <= r_24_1^(2^6)*r111111 mod p
-          [w3, w2] = [w17,w16] <= [w17,w16]^(2^6)*[w3,w2] mod [w13,w12] */
-  loopi     6, 4
-    bn.mov    w10, w16
-    bn.mov    w11, w17
-    jal       x1, p384_mulmod_p
-    nop
-  bn.mov    w10, w2
-  bn.mov    w11, w3
-  jal       x1, p384_mulmod_p
-  bn.mov    w2, w16
-  bn.mov    w3, w17
-
-  /* Exp: 2^31-1 <= (2^30-1)*2+0b1
-     Val: r_31_1 <= r30_1^2*z mod p
-          [w7,w6] = [w17,w16] <= [w17,w16]^2*[w30,w29] mod [w13,w12] */
-  bn.mov    w10, w16
-  bn.mov    w11, w17
-  jal       x1, p384_mulmod_p
-  bn.mov    w10, w29
-  bn.mov    w11, w30
-  jal       x1, p384_mulmod_p
-  bn.mov    w6, w16
-  bn.mov    w7, w17
-
-  /* Exp: 2^32-1 <= (2^30-1)*2+0b1
-     Val: r_32_1 <= r31_1^2*z mod p
-          [w9,w8] = [w17,w16] <= [w17,w16]^2*[w30,w29] mod [w13,w12] */
-  bn.mov    w10, w16
-  bn.mov    w11, w17
-  jal       x1, p384_mulmod_p
-  bn.mov    w10, w29
-  bn.mov    w11, w30
-  jal       x1, p384_mulmod_p
-  bn.mov    w9, w16
-  bn.mov    w8, w17
-
-  /* Exp: 2^63-1 <= ((2^32-1)<<31)+(2^31-1)
-     Val: r_63_1 <= r_32_1^(2^31)*r_31_1 mod p
-          [w7,w6] = [w17,w16] <= [w17,w16]^(2^31)*[w7,w6] mod [w13,w12] */
-  loopi     31, 4
-    bn.mov    w10, w16
-    bn.mov    w11, w17
-    jal       x1, p384_mulmod_p
-    nop
-  bn.mov    w10, w6
-  bn.mov    w11, w7
-  jal       x1, p384_mulmod_p
-  bn.mov    w6, w16
-  bn.mov    w7,w17
-
-  /* Exp: 2^126-1 = ((2^63-1)<<63) + (2^63-1)
-     Val: r_126_1 <= r_63_1^(2^63)*r_63_1 mod p
-          [w7,w6] = [w17,w16] <= [w17,w16]^(2^63)*[w7,w6] mod [w13,w12] */
-  loopi     63, 4
-    bn.mov    w10, w16
-    bn.mov    w11, w17
-    jal       x1, p384_mulmod_p
-    nop
-  bn.mov    w10, w6
-  bn.mov    w11, w7
-  jal       x1, p384_mulmod_p
-  bn.mov    w6, w16
-  bn.mov    w7, w17
-
-  /* Exp: 2^252-1 = ((2^126-1)<<126)+(2^126-1)
-     Val: r_252_1 <= r_126_1^(2^63)*r_126_1 mod p
-          [w17,w16] <= [w17,w16]^(2^126)*[w7,w6] mod [w13,w12] */
-  loopi     126, 4
-    bn.mov    w10, w16
-    bn.mov    w11, w17
-    jal       x1, p384_mulmod_p
-    nop
-  bn.mov    w10, w6
-  bn.mov    w11, w7
-  jal       x1, p384_mulmod_p
-
-  /* Exp: 2^255-1 = ((2^252-1)<<3)+0b111
-     Val: r_255_1 <= r_252_1^(2^3)*r111 mod p
-          [w17,w16] <= [w17,w16]^(2^3)*[w1,w0] mod [w13,w12] */
-  loopi     3, 4
-    bn.mov    w10, w16
-    bn.mov    w11, w17
-    jal       x1, p384_mulmod_p
-    nop
-  bn.mov    w10, w0
-  bn.mov    w11, w1
-  jal       x1, p384_mulmod_p
-
-  /* Exp: p-2 = ((((((2^255-1)<<33)+(2^32-1))<<94)+(2^30-1))<<2)+0b1
-     Val: x_inv <=((r_255_1^(2^33)*r_32_1)^(2^94)*r_30_1)^(2^2)*z mod p
-          [w17,w16] <= (([w17,w16]^(2^33)*[w9,w8])^(2^94)*[w3,w2])^(2^2)
-                       *[w30,w29] mod [w13,w12] */
-  loopi     33, 4
-    bn.mov    w10, w16
-    bn.mov    w11, w17
-    jal       x1, p384_mulmod_p
-    nop
-  bn.mov    w10, w9
-  bn.mov    w11, w8
-  jal       x1, p384_mulmod_p
-  loopi     94, 4
-    bn.mov    w10, w16
-    bn.mov    w11, w17
-    jal       x1, p384_mulmod_p
-    nop
-  bn.mov    w10, w2
-  bn.mov    w11, w3
-  jal       x1, p384_mulmod_p
-  loopi     2, 4
-    bn.mov    w10, w16
-    bn.mov    w11, w17
-    jal       x1, p384_mulmod_p
-    nop
-  bn.mov    w10, w29
-  bn.mov    w11, w30
-  jal       x1, p384_mulmod_p
-
-  /* store inverse [w1,w0] <= [w17,w16] = z_inv*/
-  bn.mov w0, w16
-  bn.mov w1, w17
-
-  /* convert x-coordinate to affine space
-     [w26,w25] <= [w17,w16] = x_a <= x/z = x*z_inv = [w26,w25]*[w1,w0] mod p */
-  bn.mov    w10, w25
-  bn.mov    w11, w26
-  jal       x1, p384_mulmod_p
-  bn.mov    w25, w16
-  bn.mov    w26, w17
-
-  /* convert y-coordinate to affine space
-     [w28,w27] <= [w17,w16] = y_a <= y/z = y*z_inv = [w28,w27]*[w1,w0] mod p */
-  bn.mov    w10, w27
-  bn.mov    w11, w28
-  bn.mov    w16, w0
-  bn.mov    w17, w1
-  jal       x1, p384_mulmod_p
-  bn.mov    w27, w16
-  bn.mov    w28, w17
-
-  ret
-
-
-/**
- * Fetch curve point from dmem, randomize z-coordinate and store point in dmem
- *
- * returns P = (x, y, z) = (x_a*z, y_a*z, z)
- *         with P being a valid P-384 curve point in projective coordinates
- *              x_a and y_a being the affine coordinates as fetched from dmem
- *              z being a randomized z-coordinate
- *
- * This routines fetches the affine x- and y-coordinates of a curve point from
- * dmem and computes a valid set of projective coordinates. The z-coordinate is
- * randomized and x and y are scaled appropriately. The resulting projective
- * coordinates are stored at dmem[dptr_p_p] using 6 consecutive 256-bit cells,
- * i.e. each coordinate is stored 512 bit aligned, little endian.
- * This routine runs in constant time.
- *
- * @param[in]  x20: dptr_x, pointer to dmem location containing affine
- *                          x-coordinate of input point
- * @param[in]  x21: dptr_y, pointer to dmem location containing affine
- *                          y-coordinate of input point
- * @param[in]  [w15, w14]: u[383:0] lower 384 bit of Barrett constant u for
- *                                    modulus p
- * @param[in]  [w13, w12]: p, modulus of P-384 underlying finite field
- * @param[in]  w31: all-zero
- * @param[in]  x18: dptr_p_p, pointer to dmem location to store resulting point
- *                            in projective space
- *
- * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
- *        the upper limb of projective y-coordinate.
- *
- * clobbered registers: x10, x11 to x13
-  *                     w2, w3, w8 to w11, w16 to w24, w29, w30
- * clobbered flag groups: FG0
- */
-store_proj_randomize:
-
-  /* get a 384-bit random number from URND
-    [w3, w2] = random(384) */
-  bn.wsrr   w2, 2
-  bn.wsrr   w3, 2
-  bn.rshi   w3, w31, w3 >> 128
-
-  /* reduce random number
-     [w2, w3] = z <= [w2, w3] mod p */
-  bn.sub   w10, w2, w12
-  bn.subb  w11, w3, w13
-  bn.sel   w2, w2, w10, C
-  bn.sel   w3, w3, w11, C
-
-  bn.mov w10, w2
-  bn.mov w11, w3
-
-  /* store z-coordinate
-     dmem[x20+128] = [w10, w11] */
-  li        x10, 10
-  li        x11, 11
-  bn.sid    x10, 128(x18)
-  bn.sid    x11, 160(x18)
-
-  /* fetch x-coordinate from dmem
-     [w16, w17] = x <= [dmem[dptr_x], dmem[dptr_x+32]] */
-  li x12, 16
-  li x13, 17
-  bn.lid    x12,  0(x20)
-  bn.lid    x13, 32(x20)
-
-  /* scale and store x-coordinate
-     [dmem[dptr_p_p], dmem[dptr_p_p+32]] = [w17, w16] =
-       x_p <= [w11, w10] * [w17, w16] = z*x  mod p */
-
-  jal       x1, p384_mulmod_p
-  bn.sid    x12,  0(x18)
-  bn.sid    x13, 32(x18)
-
-  /* fetch y-coordinate from dmem
-     [w11, w10] = x <= [dmem[dptr_y], dmem[dptr_y+32]] */
-  bn.lid    x12,  0(x21)
-  bn.lid    x13, 32(x21)
-
-  /* scale and store y-coordinate
-     [dmem[dptr_p_p+64], dmem[dptr_p_p+96]] = [w17, w16] =
-       y_p <= [w11, w10] * [w17, w16] = z*y  mod p */
-  bn.mov w10, w2
-  bn.mov w11, w3
-  jal       x1, p384_mulmod_p
-  bn.sid    x12, 64(x18)
-  bn.sid    x13, 96(x18)
-
-  ret
-
-
-/**
- * P-384 scalar point multiplication in affine space
- *
- * returns R = k*P = k*(x_p, y_p)
- *         where R, P are valid P-384 curve points in affine coordinates,
- *               k is a 384-bit scalar.
- *
- * This routine performs scalar multiplication based on the group laws
- * of Weierstrass curves.
- * A constant time double-and-add algorithm (sometimes referred to as
- * double-and-add-always) is used.
- * Due to the P-384 optimized implementations of the internally called routines
- * for point addition and doubling, this routine is limited to P-384 curves.
- * The routine makes use of blinding by additive splitting the
- * exponent/scalar d into two shares. The double-and-add loop operates on both
- * shares in parallel applying Shamir's trick.
- *
- * @param[in]  x9: dptr_rnd, pointer to location in dmem containing random
- *                           number to be used for additive splitting of scalar
- * @param[in]  x19: dptr_k, pointer to scalar k (0 < k < n) in dmem
- * @param[in]  x20: dptr_x, pointer to affine x-coordinate in dmem
- * @param[in]  x21: dptr_y, pointer to affine y-coordinate in dmem
- * @param[in]  x28: dptr_b, pointer to domain parameter b of P-384 in dmem
- * @param[in]  x30: dptr_sp, pointer to 704 bytes of scratchpad memory in dmem
- * @param[in]  [w13, w12]: p, modulus of P-384 underlying finite field
- * @param[in]  [w11, w10]: n, domain parameter of P-384 curve
- *                            (order of base point G)
- * @param[in]  w31: all-zero
- * @param[out] [w26, w25]: x_a, affine x-coordinate of resulting point R.
- * @param[out] [w28, w26]: y_a, affine y-coordinate of resulting point R.
- *
- * Scratchpad memory layout:
- * The routine expects at least 704 bytes of scratchpad memory at dmem
- * location 'scratchpad' (sp). Internally the scratchpad is used as follows:
- * dptr_sp     .. dptr_sp+191: point P, projective
- * dptr_sp+192 .. dptr_sp+255: s0, 1st share of scalar
- * dptr_sp+256 .. dptr_sp+447: point 2P, projective
- * dptr_sp+448 .. dptr_sp+511: s1, 2nd share of scalar
- * dptr_sp+512 .. dptr_sp+703: point Q, projective
- *
- * Projective coordinates of a point are kept in dmem in little endian format
- * with the individual coordinates 512 bit aligned. The coordinates are stored
- * in x,y,z order (i.e. x at lowest, z at highest address). Thus, a 384 bit
- * curve point occupies 6 consecutive 256-bit dmem cells.
- *
- * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
- *        the computed affine y-coordinate.
- *
- * clobbered registers: x2, x10, x11 to x13, x18, x26, x27, w0 to w30
- * clobbered flag groups: FG0
- */
-scalar_mult_int_p384:
-
-  /* set regfile pointers to in/out regs of Barrett routine. Set here to avoid
-     resetting in very call to point addition routine */
-  li        x22, 10
-  li        x23, 11
-  li        x24, 16
-  li        x25, 17
-
-  /* fetch externally supplied random number from dmem
-     [w1, w0] = dmem[dptr_rnd] = [dmem[x9], dmem[x9+32]] = rnd */
-  li        x2, 0
-  bn.lid    x2++, 0(x9)
-  bn.lid    x2++, 32(x9)
-
-  /* 1st share (reduced rnd)
-     s0 = [w1, w0] <= rnd mod n = [w1, w0] mod [w11, w10] */
-  bn.sub    w9, w0, w10
-  bn.subb   w8, w1, w11
-  bn.sel    w0, w0, w9, C
-  bn.sel    w1, w1, w8, C
-
-  /* load scalar k from dmem
-     [w3, w2] = k <= dmem[dptr_k] = [dmem[x19], dmem[x19+32]] */
-  bn.lid    x2++, 0(x19)
-  bn.lid    x2, 32(x19)
-
-  /* 2nd share (k-s0)
-     s1 = [w3, w2] <= k - s0 mod n = [w2, w3] - [w1, w0] mod [w11, w10] */
-  bn.sub    w2, w2, w0
-  bn.subb   w3, w3, w1
-  bn.add    w8, w2, w10
-  bn.addc   w9, w3, w11
-  bn.sel    w2, w8, w2, C
-  bn.sel    w3, w9, w3, C
-
-  /* left align both shares for probing of MSB in loop body */
-  bn.rshi   w1, w1, w0 >> 128
-  bn.rshi   w0, w0, w31 >> 128
-  bn.rshi   w3, w3, w2 >> 128
-  bn.rshi   w2, w2, w31 >> 128
-
-   /* store shares in scratchpad */
-  li        x2, 0
-  bn.sid    x2++, 192(x30)
-  bn.sid    x2++, 224(x30)
-  bn.sid    x2++, 448(x30)
-  bn.sid    x2++, 480(x30)
-
-  /* get randomized projective coodinates of curve point
-     P = (x_p, y_p, z_p) = dmem[dptr_sp] = (x*z mod p, y*z mod p, z) */
-  add       x18, x30, 0
-  jal       x1, store_proj_randomize
-
-  /* double point P
-     2P = ([w30,w29], [w28,w27], [w26, w25]) <= 2*P */
-  add       x27, x30, x0
-  add       x26, x30, x0
-  jal       x1, proj_add_p384
-
-  /* store point 2P in scratchpad @w30+256
-     dmem[dptr_sc+256] = [w30:w25] = 2P */
-  li        x2, 25
-  bn.sid    x2++, 256(x30)
-  bn.sid    x2++, 288(x30)
-  bn.sid    x2++, 320(x30)
-  bn.sid    x2++, 352(x30)
-  bn.sid    x2++, 384(x30)
-  bn.sid    x2++, 416(x30)
-
-  /* init point Q = (0,1,0) for double-and-add in scratchpad */
-  /* dmem[x26] = dmem[dptr_sc+512] = Q = (0,1,0) */
-  addi      x26, x30, 512
-  li        x2, 30
-  bn.addi   w30, w31, 1
-  bn.sid    x2++, 64(x26)
-  bn.sid    x2, 0(x26)
-  bn.sid    x2, 32(x26)
-  bn.sid    x2, 96(x26)
-  bn.sid    x2, 128(x26)
-  bn.sid    x2, 160(x26)
-
-  /* double-and-add loop with decreasing index */
-  loopi     384, 85
-
-    /* double point Q
-       Q = ([w30,w29], [w28,w27], [w26, w25]) <= Q + dmem[x27] */
-    add       x27, x26, x0
-    jal       x1, proj_add_p384
-
-    /* store Q in dmem
-     dmem[x26] = dmem[dptr_sc+512] <= [w30:w25] */
-    li        x2, 25
-    bn.sid    x2++, 0(x26)
-    bn.sid    x2++, 32(x26)
-    bn.sid    x2++, 64(x26)
-    bn.sid    x2++, 96(x26)
-    bn.sid    x2++, 128(x26)
-    bn.sid    x2++, 160(x26)
-
-    /* Probe if MSb of either of the two scalars (rnd or d-rnd) but not both
-       is 1.
-       If only one MSb is set, select P for addition.
-       If both MSbs are set, select 2P for addition.
-       (If neither MSB is set, 2P will be selected but result discarded.) */
-    li        x2, 0
-    bn.lid    x2++, 224(x30)
-    bn.lid    x2, 480(x30)
-    bn.xor    w8, w0, w1
-    /* Create conditional offeset into scratchpad.
-       if (s0[512] xor s1[512]) x27 <= x30 else x27 <= x30+256 */
-    csrrs     x3, 0x7c0, x0
-    xori      x3, x3, -1
-    andi      x3, x3, 2
-    slli      x27, x3, 7
-    add       x27, x27, x30
-
-    /* Reload randomized projective coodinates for curve point P.
-       P = (x_p, y_p, z_p) = dmem[dptr_sp] <= (x*z mod p, y*z mod p, z) */
-    jal       x1, store_proj_randomize
-
-    /* Add points Q+P or Q+2P depending on offset in x27.
-       Q_a = ([w30,w29], [w28,w27], [w26, w25]) <= Q + dmem[x27] */
-    jal       x1, proj_add_p384
-
-    /* load shares from scratchpad
-       [w1, w0] = s0; [w3, w2] = s1 */
-    li        x2, 0
-    bn.lid    x2++, 192(x30)
-    bn.lid    x2++, 224(x30)
-    bn.lid    x2++, 448(x30)
-    bn.lid    x2++, 480(x30)
-
-    /* M = s0[511] | s1[511] */
-    bn.or     w8, w1, w3
-
-    /* load q from scratchpad
-        Q = ([w9,w8], [w7,w6], [w5,w4]) <= dmem[x26] */
-    li        x2, 4
-    bn.lid    x2++, 0(x26)
-    bn.lid    x2++, 32(x26)
-    bn.lid    x2++, 64(x26)
-    bn.lid    x2++, 96(x26)
-    bn.lid    x2++, 128(x26)
-    bn.lid    x2++, 160(x26)
-
-    /* select either Q or Q_a
-       if M: Q = ([w30,w29], [w28,w27], [w26, w25]) <= Q else: Q <= Q_a */
-    bn.sel    w25, w25, w4, M
-    bn.sel    w26, w26, w5, M
-    bn.sel    w27, w27, w6, M
-    bn.sel    w28, w28, w7, M
-    bn.sel    w29, w29, w8, M
-    bn.sel    w30, w30, w9, M
-
-    /* store Q in dmem
-     dmem[x26] = dmem[dptr_sc+512] <= [w30:w25] */
-    li        x2, 25
-    bn.sid    x2++, 0(x26)
-    bn.sid    x2++, 32(x26)
-    bn.sid    x2++, 64(x26)
-    bn.sid    x2++, 96(x26)
-    bn.sid    x2++, 128(x26)
-    bn.sid    x2++, 160(x26)
-
-    /* left shift both shares
-       s0 <= s0 << 1 ; s1 <= s1 << 1 */
-    bn.add    w0, w0, w0
-    bn.addc   w1, w1, w1
-    bn.add    w2, w2, w2
-    bn.addc   w3, w3, w3
-    /* store both shares in scratchpad */
-    li        x2, 0
-    bn.sid    x2++, 192(x30)
-    bn.sid    x2++, 224(x30)
-    bn.sid    x2++, 448(x30)
-    bn.sid    x2++, 480(x30)
-
-
-    /* Get a fresh random number from URND and scale the coordinates of 2P.
-       (scaling each proj. coordinate by same factor results in same point) */
-
-    /* get a 384-bit random number from URND */
-    bn.wsrr   w2, 2
-    bn.wsrr   w3, 2
-    bn.rshi   w3, w31, w3 >> 128
-
-    /* reduce random number
-      [w2, w3] = z <= [w2, w3] mod p */
-    bn.sub    w10, w2, w12
-    bn.subb   w11, w3, w13
-    bn.sel    w2, w2, w10, C
-    bn.sel    w3, w3, w11, C
-
-    /* scale all coordinates in scratchpad */
-    li        x2, 16
-    li        x3, 17
-    /* x-coordinate */
-    bn.mov    w10, w2
-    bn.mov    w11, w3
-    bn.lid    x2, 256(x30)
-    bn.lid    x3, 288(x30)
-    jal       x1, p384_mulmod_p
-    bn.sid    x2, 256(x30)
-    bn.sid    x3, 288(x30)
-    /* y-coordinate */
-    bn.mov    w10, w2
-    bn.mov    w11, w3
-    bn.lid    x2, 320(x30)
-    bn.lid    x3, 352(x30)
-    jal       x1, p384_mulmod_p
-    bn.sid    x2, 320(x30)
-    bn.sid    x3, 352(x30)
-    /* z-coordinate */
-    bn.mov    w10, w2
-    bn.mov    w11, w3
-    bn.lid    x2, 384(x30)
-    bn.lid    x3, 416(x30)
-    jal       x1, p384_mulmod_p
-    bn.sid    x2, 384(x30)
-    bn.sid    x3, 416(x30)
-
-  /* convert coordinates to affine space */
-  jal       x1, proj_to_affine_p384
-
-  ret
-
-
-/**
- * Externally callable wrapper for P-384 scalar point multiplication
- *
- * returns R = k*P = k*(x_p, y_p)
- *         where R, P are valid P-384 curve points in affine coordinates,
- *               k is a 384-bit scalar..
- *
- * Sets up context and calls the internal scalar multiplication routine.
- * This routine runs in constant time.
- *
- * @param[in]  dmem[0]: dK, pointer to location in dmem containing scalar k
- * @param[in]  dmem[4]: dRnd, pointer to location in dmem containing random
- *                        number for blinding
- * @param[in]  dmem[20]: dptr_x, pointer to affine x-coordinate in dmem
- * @param[in]  dmem[22]: dptr_y, pointer to affine y-coordinate in dmem
- *
- * 384-bit quantities have to be provided in dmem in little-endian format,
- * 512 bit aligned, with the highest 128 bit set to zero.
- *
- * Flags: When leaving this subroutine, the M, L and Z flags of FG0 depend on
- *        the computed affine y-coordinate.
- *
- * clobbered registers: x2, x3, x9 to x13, x18 to x21, x26 to x30
- *                      w0 to w30
- * clobbered flag groups: FG0
- */
-.globl scalar_mult_p384
-scalar_mult_p384:
-
-  /* set dmem pointer to point x-coordinate */
-  la        x20, dptr_x
-  lw        x20, 0(x20)
-
-  /* set dmem pointer to point y-coordinate */
-  la        x21, dptr_y
-  lw        x21, 0(x21)
-
-  /* set dmem pointer to scalar k */
-  la        x19, dptr_k
-  lw        x19, 0(x19)
-
-  /* set pointer to blinding parameter */
-  la        x9, dptr_rnd
-  lw        x9, 0(x9)
-
-  /* set dmem pointer to domain parameter b */
-  la        x28, p384_b
-
-  /* set dmem pointer to scratchpad */
-  la        x30, scratchpad
-
-  /* load domain parameter p (modulus)
-     [w13, w12] = p = dmem[p384_p] */
-  li        x2, 12
-  la        x3, p384_p
-  bn.lid    x2++, 0(x3)
-  bn.lid    x2++, 32(x3)
-
-  /* load domain parameter n (order of base point)
-     [w11, w10] = n = dmem[p384_n] */
-  li        x2, 10
-  la        x3, p384_n
-  bn.lid    x2++, 0(x3)
-  bn.lid    x2++, 32(x3)
-
-  /* init all-zero reg */
-  bn.xor    w31, w31, w31
-
-  jal       x1, scalar_mult_int_p384
-
-  /* store result in dmem */
-  li        x2, 25
-  bn.sid    x2++, 0(x20)
-  bn.sid    x2++, 32(x20)
-  bn.sid    x2++, 0(x21)
-  bn.sid    x2++, 32(x21)
-
-  ret
-
-/**
- * Externally callable routine for P-384 base point multiplication
- *
- * returns Q = d (*) G
- *         where Q is a resulting valid P-384 curve point in affine
- *                   coordinates,
- *               G is the base point of curve P-384, and
- *               d is a 384-bit scalar.
- *
- * Sets up context and calls the internal scalar multiplication routine.
- * This routine runs in constant time.
- *
- * @param[in]  dmem[0]: dptr_d, pointer to location in dmem containing
- *                      scalar d.
- * @param[in]  dmem[20]: dptr_x, pointer to result buffer for x-coordinate
- * @param[in]  dmem[24]: dptr_y, pointer to result buffer for y-coordinate
- * @param[in]  dmem[28]: dptr_rnd, pointer to location in dmem containing
- *                       random number for blinding.
- *
- * 384-bit quantities have to be provided in dmem in little-endian format,
- * 512 bit aligned, with the highest 128 bit set to zero.
- *
- * Flags: When leaving this subroutine, the M, L and Z flags of FG0 correspond
- *        to the computed affine y-coordinate.
- *
- * clobbered registers: x2, x3, x9 to x13, x18 to x21, x26 to x30
- *                      w0 to w30
- * clobbered flag groups: FG0
- */
-.globl p384_base_mult
-p384_base_mult:
-
-  /* set dmem pointer to x-coordinate of base point*/
-  la        x20, p384_gx
-
-  /* set dmem pointer to y-coordinate of base point */
-  la        x21, p384_gy
-
-  /* set dmem pointer to scalar d */
-  la        x19, dptr_d
-  lw        x19, 0(x19)
-
-  /* set pointer to blinding parameter */
-  la        x9, dptr_rnd
-  lw        x9, 0(x9)
-
-  /* set dmem pointer to domain parameter b */
-  la        x28, p384_b
-
-  /* set dmem pointer to scratchpad */
-  la        x30, scratchpad
-
-  /* load domain parameter p (modulus)
-     [w13, w12] = p = dmem[p384_p] */
-  li        x2, 12
-  la        x3, p384_p
-  bn.lid    x2++, 0(x3)
-  bn.lid    x2++, 32(x3)
-
-  /* load domain parameter n (order of base point)
-     [w11, w10] = n = dmem[p384_n] */
-  li        x2, 10
-  la        x3, p384_n
-  bn.lid    x2++, 0(x3)
-  bn.lid    x2++, 32(x3)
-
-  /* init all-zero reg */
-  bn.xor    w31, w31, w31
-
-  jal       x1, scalar_mult_int_p384
-
-  /* set dmem pointer to point x-coordinate */
-  la        x20, dptr_x
-  lw        x20, 0(x20)
-
-  /* set dmem pointer to point y-coordinate */
-  la        x21, dptr_y
-  lw        x21, 0(x21)
-
-  /* store result in dmem */
-  li        x2, 25
-  bn.sid    x2++, 0(x20)
-  bn.sid    x2++, 32(x20)
-  bn.sid    x2++, 0(x21)
-  bn.sid    x2++, 32(x21)
-
-  ret
-
-
-/**
- * Variable-time modular multiplicative inverse computation
- *
- * returns x_inv = x^-1 mod m
- *
- * This routine computes the modular multiplicative inverse for any x < m in
- * the finite field GF(m) where m is prime.
- *
- * For inverse computation, Fermat's little theorem is used, i.e.
- * we compute x^-1 = x^(m-2) mod m.
- * For exponentiation we use a standard, variable-time (!) square and multiply
- * algorithm.
- *
- * This routine is mainly intended to be used for inversion of scalars in
- * context of the P-384 curve. In theory, it can be used with any 384-bit
- * modulus m with a corresponding 385-bit Barrett constant u,
- * where u[383:192] = 0.
- *
- * Note: When used for P-384 scalar inversion, the routine will need 672 calls
- * to the multiplication routine. By using an adder chain this could be reduced
- * to ~433 multiplications, however, at the cost of a significant codes size
- * increase.
- *
- * Note: This routine runs in variable-time w.r.t. the modulus. It should only
- * be used with a non-secret modulus.
- *
- * @param[in]  [w13, w12]: m, 384 bit modulus
- * @param[in]  w14: k, Solinas constant (2^384 - m) (max. length 191 bits).
- * @param[in]  [w30, w29]: x, 384 bit operand
- * @param[in]  w31, all-zero
- * @param[out] [w17, w16]: x_inv, modular multiplicative inverse
- *
- * Flags: Flags have no meaning beyond the scope of this subroutine.
- *
- * clobbered registers: x2, w2, w3, w10, w11, w16 to w24
- * clobbered flag groups: FG0
- */
-mod_inv_n_p384:
-
-  /* subtract 2 from modulus for Fermat's little theorem
-     [w13,w12] <= m - 2 = [w11,w10]-2 (left aligned) */
-  bn.subi   w2, w12, 2
-  bn.subb   w3, w13, w31
-  bn.rshi   w3, w3, w2 >> 128
-  bn.rshi   w2, w2, w31 >> 128
-
-  /* init square and multiply: [w17,w16] = 1 */
-  bn.addi   w16, w31, 1
-  bn.mov    w17, w31
-
-  /* square and multiply loop */
-  loopi     384, 12
-
-    /* square: [w17,w16] <= [w17, w16]*[w11,w10] mod [w13, w12] */
-    bn.mov    w10, w16
-    bn.mov    w11, w17
-    jal       x1, p384_mulmod_n
-
-    /* shift MSB into carry flag
-       [w3,w2] = 2*[w3,w2] = [w3,w2] << 1 */
-    bn.add    w2, w2, w2
-    bn.addc   w3, w3, w3
-
-    /* skip multiplication if C flag not set */
-    csrrs     x2, 0x7c0, x0
-    andi      x2, x2, 1
-    beq       x2, x0, nomul
-
-    /* multiply: [w17,w16] <= [w17, w16]*[w30,w29] mod [w13, w12] */
-    bn.mov    w10, w29
-    bn.mov    w11, w30
-    jal       x1, p384_mulmod_n
-
-    nomul:
-    nop
-
-  ret
-
-
 /**
  * P-384 ECDSA signature generation
  *
@@ -895,19 +22,23 @@ mod_inv_n_p384:
  *
  * This routine runs in constant time.
  *
- * @param[in]  dmem[0]: dptr_k, pointer to a 384 bit random secret in dmem
- * @param[in]  dmem[4]: dptr_rnd, pointer to location in dmem containing
- *                       a 384-bit random number for blinding
+ * @param[in]  dmem[0]: dptr_k0, pointer to location in dmem containing
+ *                      1st scalar share k0
+ * @param[in]  dmem[4]: dptr_k1, pointer to location in dmem containing
+ *                      2nd scalar share k1
  * @param[in]  dmem[8]: dptr_msg, pointer to the message to be signed in dmem
  * @param[in]  dmem[12]: dptr_r, pointer to dmem location where s component
  *                               of signature will be placed
  * @param[in]  dmem[16]: dptr_s, pointer to dmem location where r component
  *                               of signature will be placed
- * @param[in]  dmem[28]: dptr_d, pointer to private key d in dmem
+ * @param[in]  dmem[28]: dptr_d0, pointer to location in dmem containing
+ *                      1st private key share d0
+ * @param[in]  dmem[32]: dptr_d1, pointer to location in dmem containing
+ *                      2nd private key share d1
  *
  * Flags: Flags have no meaning beyond the scope of this subroutine.
  *
- * clobbered registers: x2, x3, x9 to x13, x18 to x28, x30
+ * clobbered registers: x2 to x6, x9 to x15, x17 to x28, x30
  *                      w0 to w31
  * clobbered flag groups: FG0
  */
@@ -925,13 +56,33 @@ p384_sign:
   /* set dmem pointer to base point y-coordinate */
   la        x21, p384_gy
 
-  /* set dmem pointer to secret random scalar k */
-  la        x19, dptr_k
+  /* set dmem pointer to 1st scalar share k0 */
+  la        x17, dptr_k0
+  lw        x17, 0(x17)
+
+  /* set dmem pointer to 2nd scalar share k1 */
+  la        x19, dptr_k1
   lw        x19, 0(x19)
 
-  /* set pointer to blinding parameter */
-  la        x9, dptr_rnd
-  lw        x9, 0(x9)
+  /* set dmem pointer to 1st private key share d0 */
+  la        x4, dptr_d0
+  lw        x4, 0(x4)
+
+  /* set dmem pointer to 2nd private key share d1 */
+  la        x5, dptr_d1
+  lw        x5, 0(x5)
+
+  /* set dmem pointer to message msg */
+  la        x6, dptr_msg
+  lw        x6, 0(x6)
+
+  /* set dmem pointer to signature r */
+  la        x14, dptr_r
+  lw        x14, 0(x14)
+
+  /* set dmem pointer to signature s */
+  la        x15, dptr_s
+  lw        x15, 0(x15)
 
   /* set dmem pointer to scratchpad */
   la        x30, scratchpad
@@ -950,25 +101,19 @@ p384_sign:
   bn.lid    x2++, 0(x3)
   bn.lid    x2++, 32(x3)
 
-  /* scalar multiplication with base point
-     [w28:w25] <= (x_1, y_1) = k*G */
+  /* scalar multiplication with base point and
+     conversion of projective coordinates to affine space
+     [w28:w25] <= (x_1, y_1) = k * G (mask alpha is applied later) */
   jal       x1, scalar_mult_int_p384
+  jal       x1, proj_to_affine_p384
 
   /* store r of signature in dmem: dmem[dptr_r] <= r = [w26,w25] */
   li        x2, 25
-  la        x3, dptr_r
-  lw        x3, 0(x3)
-  bn.sid    x2++, 0(x3)
-  bn.sid    x2++, 32(x3)
-
-  /* load secret random number k from dmem
-     [w30,w29] <= k = dmem[dptr_k] */
-  li        x2, 29
-  bn.lid    x2++, 0(x19)
-  bn.lid    x2++, 32(x19)
+  bn.sid    x2++, 0(x14)
+  bn.sid    x2++, 32(x14)
 
   /* load domain parameter n (order of base point)
-     [w13, w12] <= p = dmem[p384_n] */
+     [w13, w12] <= n = dmem[p384_n] */
   li        x2, 12
   la        x3, p384_n
   bn.lid    x2++, 0(x3)
@@ -979,40 +124,126 @@ p384_sign:
      w14 <= 2^256 - n[255:0] = (2^384 - n) mod (2^256) = 2^384 - n */
   bn.sub    w14, w31, w12
 
+  /* Multiplicative masking of shares k0 and k1 */
+
+  /* Generate a random 127-bit number.
+     w4 <= URND()[255:129] */
+  bn.wsrr   w4, URND
+  bn.rshi   w4, w31, w4 >> 129
+
+  /* Add 1 to get a 128-bit nonzero scalar for masking.
+     w4 <= w4 + 1 = alpha */
+  bn.addi   w4, w4, 1
+
+  /* load 1st share k0 from dmem
+     [w11,w10] <= k0 = dmem[dptr_k0] */
+  li        x2, 10
+  bn.lid    x2++, 0(x17)
+  bn.lid    x2++, 32(x17)
+
+  /* [w26,w25] <= ([w11,w10] * w4) mod n = (k0 * alpha) mod n */
+  bn.mov    w16, w4
+  jal       x1, p384_mulmod448x128_n
+  bn.mov    w25, w16
+  bn.mov    w26, w17
+
+  /* load 2nd share k1 from dmem
+     [w11,w10] <= k1 = dmem[dptr_k1] */
+  li        x2, 10
+  bn.lid    x2++, 0(x19)
+  bn.lid    x2++, 32(x19)
+
+  /* [w28,w27] <= ([w11,w10] * w4) mod n = (k1 * alpha) mod n */
+  bn.mov    w16, w4
+  jal       x1, p384_mulmod448x128_n
+  bn.mov    w27, w16
+  bn.mov    w28, w17
+
+  /* Multiplicative masking of shares d0 and d1 */
+
+  /* load 1st share d0 from dmem
+     [w11,w10] <= d0 = dmem[dptr_d0] */
+  li        x2, 10
+  bn.lid    x2++, 0(x4)
+  bn.lid    x2++, 32(x4)
+
+  /* [w7,w6] <= ([w11,w10] * w4) mod n = (d0 * alpha) mod n */
+  bn.mov    w16, w4
+  jal       x1, p384_mulmod448x128_n
+  bn.mov    w6, w16
+  bn.mov    w7, w17
+
+  /* load 2nd share d1 from dmem
+     [w11,w10] <= d1 = dmem[dptr_d1] */
+  li        x2, 10
+  bn.lid    x2++, 0(x5)
+  bn.lid    x2++, 32(x5)
+
+  /* [w9,w8] <= ([w11,w10] * w4) mod n = (d1 * alpha) mod n */
+  bn.mov    w16, w4
+  jal       x1, p384_mulmod448x128_n
+  bn.mov    w8, w16
+  bn.mov    w9, w17
+
+  /* Multiplicative masking of message msg */
+
+  /* load message from dmem
+     [w11, w10] <= msg = dmem[dptr_msg] */
+  li        x2, 10
+  bn.lid    x2++, 0(x6)
+  bn.lid    x2++, 32(x6)
+
+  /* [w1,w0] <= ([w11,w10] * w4) mod n = (msg * alpha) mod n */
+  bn.mov    w16, w4
+  jal       x1, p384_mulmod448x128_n
+  bn.mov    w0, w16
+  bn.mov    w1, w17
+
+  /* Compute (k*alpha) mod n = (k0*alpha + k1*alpha) mod n
+     [w17,w16] <= k*alpha = [w26,w25] + [w28,w27] mod n */
+  bn.add    w18, w27, w25
+  bn.addc   w19, w28, w26
+  bn.mov    w20, w31
+  jal       x1, p384_reduce_n
+
   /* modular multiplicative inverse of k
-     [w3, w2] <= [w17, w16] <= k^(-1) mod n */
+     [w3, w2] <= [w17, w16] <= (k*alpha)^(-1) mod n */
+  bn.mov    w29, w16
+  bn.mov    w30, w17
   jal       x1, mod_inv_n_p384
   bn.mov    w2, w16
   bn.mov    w3, w17
 
-  /* load private key d from dmem
-     [w11,w10] <= d = dmem[dptr_d] */
-  li        x2, 10
-  la        x3, dptr_d
-  lw        x3, 0(x3)
-  bn.lid    x2++, 0(x3)
-  bn.lid    x2++, 32(x3)
+  /* Compute (d*alpha) mod n = (d0*alpha + d1*alpha) mod n
+     [w17,w16] <= d*alpha = [w7,w6] + [w9,w8] mod n */
+  bn.add    w18, w8, w6
+  bn.addc   w19, w9, w7
+  bn.mov    w20, w31
+  jal       x1, p384_reduce_n
 
-  /* [w17, w16] <= k^(-1)*d mod n = [w17, w16] * [w11, w10] mod [w13, w12] */
+  /* [w17, w16] <= (k*alpha)^(-1)*d*alpha mod n = [w3, w2] * [w17, w16] mod [w13, w12] */
+  bn.mov    w10, w2
+  bn.mov    w11, w3
   jal       x1, p384_mulmod_n
 
+  /* load r of signature from dmem
+     [w11,w10] <= r = dmem[dptr_r] */
+  li        x2, 10
+  bn.lid    x2++, 0(x14)
+  bn.lid    x2++, 32(x14)
+
   /*  [w5, w4] <= [w17, w16]
-        <= r * (k^(-1)*d) mod n = [w26, w25] * [w17, w16] mod [w13, w12] */
-  bn.mov    w10, w25
-  bn.mov    w11, w26
+        <= r * (k^(-1)*d) mod n = r * ((k*alpha)^(-1)*d*alpha) mod n =
+           = [w11, w10] * [w17, w16] mod [w13, w12] */
   jal       x1, p384_mulmod_n
   bn.mov    w4, w16
   bn.mov    w5, w17
 
-  /* load message from dmem
-     [w11, w10] <= msg = dmem[dptr_msg] */
-  li        x2, 10
-  la        x3, dptr_msg
-  lw        x3, 0(x3)
-  bn.lid    x2++, 0(x3)
-  bn.lid    x2++, 32(x3)
-
-  /* [w17, w16] <= k^(-1) * msg = [w3, w2]*[w17, w16] mod n */
+  /* [w17, w16] <= k^(-1) * msg =
+                   = (k*alpha)^(-1) * msg*alpha =
+                   = [w3, w2]*[w1, w0] mod n */
+  bn.mov    w10, w0
+  bn.mov    w11, w1
   bn.mov    w16, w2
   bn.mov    w17, w3
   jal       x1, p384_mulmod_n
@@ -1029,10 +260,8 @@ p384_sign:
 
   /* store s of signature in dmem: dmem[dptr_s] <= s = [w28, w27] */
   li        x2, 27
-  la        x3, dptr_s
-  lw        x3, 0(x3)
-  bn.sid    x2++, 0(x3)
-  bn.sid    x2++, 32(x3)
+  bn.sid    x2++, 0(x15)
+  bn.sid    x2++, 32(x15)
 
   ret
 
@@ -1040,46 +269,65 @@ p384_sign:
 /* pointers and scratchpad memory */
 .section .data
 
-/* pointer to k (dptr_k) */
-.globl dptr_k
-dptr_k:
+.balign 32
+
+/* pointer to k0 (dptr_k0) */
+.globl dptr_k0
+.weak dptr_k0
+dptr_k0:
   .zero 4
 
-/* pointer to rnd (dptr_rnd) */
-.globl dptr_rnd
-dptr_rnd:
+/* pointer to k1 (dptr_k1) */
+.globl dptr_k1
+.weak dptr_k1
+dptr_k1:
   .zero 4
 
 /* pointer to msg (dptr_msg) */
 .globl dptr_msg
+.weak dptr_msg
 dptr_msg:
   .zero 4
 
 /* pointer to R (dptr_r) */
 .globl dptr_r
+.weak dptr_r
 dptr_r:
   .zero 4
 
 /* pointer to S (dptr_s) */
 .globl dptr_s
+.weak dptr_s
 dptr_s:
   .zero 4
 
 /* pointer to X (dptr_x) */
 .globl dptr_x
+.weak dptr_x
 dptr_x:
   .zero 4
 
 /* pointer to Y (dptr_y) */
 .globl dptr_y
+.weak dptr_y
 dptr_y:
   .zero 4
 
-/* pointer to D (dptr_d) */
-.globl dptr_d
-dptr_d:
+/* pointer to d0 (dptr_d0) */
+.globl dptr_d0
+.weak dptr_d0
+dptr_d0:
+  .zero 4
+
+/* pointer to d1 (dptr_d1) */
+.globl dptr_d1
+.weak dptr_d1
+dptr_d1:
   .zero 4
 
 /* 704 bytes of scratchpad memory */
+.balign 32
+.globl scratchpad
+.weak scratchpad
 scratchpad:
   .zero 704
diff --git a/sw/otbn/crypto/p384_verify.s b/sw/otbn/crypto/p384_verify.s
index 8c0a1adf67bcc..a4ab44ebe3584 100644
--- a/sw/otbn/crypto/p384_verify.s
+++ b/sw/otbn/crypto/p384_verify.s
@@ -8,294 +8,6 @@
 
  .section .text
 
-/**
- * Checks if a point is a valid curve point on curve P-384
- *
- * Returns r = x^3 + ax + b  mod p
- *     and s = y^2  mod p
- *         where x,y are the affine coordinates of the curve point and
- *              a, b and p being the domain parameters of curve P-384.
- *
- * This routine checks if a point with given x- and y-coordinate is a valid
- * curve point on P-384.
- * The routine checks whether the coordinates are a solution of the
- * Weierstrass equation y^2 = x^3 + ax + b  mod p.
- * The routine makes use of the property that the domain parameter 'a' can be
- * written as a=-3 for the P-384 curve, hence the routine is limited to P-384.
- * The routine does not return a boolean result but computes the left side
- * and the right sight of the Weierstrass equation and leaves the final
- * comparison to the caller.
- * The routine runs in constant time.
- *
- * Flags: Flags have no meaning beyond the scope of this subroutine.
- *
- * @param[in]  dmem[12]: dptr_r, pointer to dmem location where right
- *                               side result r will be stored
- * @param[in]  dmem[16]: dptr_s, pointer to dmem location where left side
- *                               result s will be stored
- * @param[in]  dmem[20]: dptr_x, pointer to dmem location containing affine
- *                               x-coordinate of input point
- * @param[in]  dmem[24]: dptr_y, pointer to dmem location containing affine
- *                               y-coordinate of input point
- *
- * clobbered registers: x2, x3, w0 to w5, w10 to w17
- * clobbered flag groups: FG0
- */
- .globl p384_isoncurve
-p384_isoncurve:
-
-  /* setup all-zero reg */
-  bn.xor    w31, w31, w31
-
-  /* load affine x-coordinate of curve point from dmem
-     [w1, w0] <= dmem[dptr_x] = dmem[20] */
-  la        x3, dptr_x
-  lw        x3, 0(x3)
-  li        x2, 0
-  bn.lid    x2++, 0(x3)
-  bn.lid    x2++, 32(x3)
-
-  /* load affine y-coordinate of curve point from dmem
-     [w3, w2] <= dmem[dptr_y] = dmem[24] */
-  la        x3, dptr_y
-  lw        x3, 0(x3)
-  bn.lid    x2++, 0(x3)
-  bn.lid    x2, 32(x3)
-
-  /* load domain parameter p (modulus) from dmem
-     [w13, w12] = p = dmem[p384_p] */
-  li        x2, 12
-  la        x3, p384_p
-  bn.lid    x2++, 0(x3)
-  bn.lid    x2++, 32(x3)
-
-  /* load domain parameter b from dmem
-     [w4, w5] = b = dmem[p384_b] */
-  li        x2, 4
-  la        x3, p384_b
-  bn.lid    x2++, 0(x3)
-  bn.lid    x2++, 32(x3)
-
-  /* y^2 = [w17,w16] <= y*y = [w3,w2]*w[w3,w2] */
-  bn.mov    w10, w2
-  bn.mov    w11, w3
-  bn.mov    w16, w2
-  bn.mov    w17, w3
-  jal       x1, p384_mulmod_p
-
-  /* store result (left side): dmem[dptr_s] <= y^2 = [w17,w16] */
-  la        x3, dptr_s
-  lw        x3, 0(x3)
-  li        x2, 16
-  bn.sid    x2++, 0(x3)
-  bn.sid    x2++, 32(x3)
-
-  /*  x^3 = [w17,w16] <= (x*x)*x = ([w1,w0]*(w1,w0])*[w1,w0] */
-  bn.mov    w10, w0
-  bn.mov    w11, w1
-  bn.mov    w16, w0
-  bn.mov    w17, w1
-  jal       x1, p384_mulmod_p
-  bn.mov    w10, w0
-  bn.mov    w11, w1
-  jal       x1, p384_mulmod_p
-
-  /* for curve P-384, 'a' can be written as a = -3, therefore we subtract
-     x three times from x^3.
-     x^3 + ax  mod p = [w17,w16] <= x^3 -3 x mod p
-                     = [w17,w16] - [w1,w0] - [w1,w0] - [w1,w0] mod [w13,w12] */
-  loopi     3, 6
-    bn.sub    w16, w16, w0
-    bn.subb   w17, w17, w1
-    bn.add    w10, w16, w12
-    bn.addc   w11, w17, w13
-    bn.sel    w16, w10, w16, C
-    bn.sel    w17, w11, w17, C
-
-  /* add domain parameter b
-     x^3 + ax + b mod p = [w17,w16] <= [w17,w16] + [w5,w4] mod [w13,w12] */
-  bn.add    w16, w16, w4
-  bn.addc   w17, w17, w5
-  bn.sub    w10, w16, w12
-  bn.subb   w11, w17, w13
-  bn.sel    w16, w16, w10, C
-  bn.sel    w17, w17, w11, C
-
-  /* store result (right side)
-     dmem[dptr_r] <= x^3 + ax + b mod p = [w17,w16] */
-  la        x3, dptr_r
-  lw        x3, 0(x3)
-  li        x2, 16
-  bn.sid    x2++, 0(x3)
-  bn.sid    x2++, 32(x3)
-
-  ret
-
-
-/**
- * 384-bit variable time modular multiplicative inverse computation
- *
- * Returns c <= a^(-1) mod m
- *         where 'a' is a bigint of length 384 bit with a < m
- *               'm' is the modulus with a length of 384 bit
- *               'c' is a 384-bit result
- *
- * This routine implements the computation of the modular multiplicative
- * inverse based on the binary GCD or Stein's algorithm.
- * The implemented variant is based on the "right-shift binary extended GCD"
- * as it is described in section 3.1 of [1] (Algorithm 1).
- * [1] https://doi.org/10.1155/ES/2006/32192
- *
- * Note that this is a variable time implementation. I.e. this routine will
- * show a data-dependent timing and execution profile. Only use where a
- * full white-box scenario is acceptable.
- *
- * Flags: Flags have no meaning beyond the scope of this subroutine.
- *
- * @param[in]  [w30, w29]: a, 384-bit operand
- * @param[in]  [w13, w12]: m, modulus
- * @param[in]  w31: all-zero
- * @param[out] [w17,w16]: result c
- *
- * clobbered registers: x2, w2, w4 to w11, w16 to w19
- * clobbered flag groups: FG0
- */
-mod_inv_var:
-  /* [w5,w4] = r <= 0 */
-  bn.xor    w4, w4, w4
-  bn.xor    w5, w5, w5
-
-  /* [w7,w6] = s <= 1 */
-  bn.addi   w6, w31, 1
-  bn.xor    w7, w7, w7
-
-  /* [w9,w8] = u <= m = [w13, w12]*/
-  bn.mov    w8, w12
-  bn.mov    w9, w13
-
-  /* [w11,w10] = v <= [w30, w29] */
-  bn.mov    w10, w29
-  bn.mov    w11, w30
-
-  ebgcd_loop:
-  /* test if u is odd */
-  bn.or     w8, w8, w8
-  csrrs     x2, 0x7c0, x0
-  andi      x2, x2, 4
-  bne       x2, x0, ebgcd_u_odd
-
-  /* u is even: */
-  /* [w9,w8] = u <= u/2 = [w9,w8] >> 1 */
-  bn.rshi   w8, w9, w8 >> 1
-  bn.rshi   w9, w31, w9 >> 1
-
-  /* test if r is odd */
-  bn.or     w4, w4, w4
-  csrrs     x2, 0x7c0, x0
-  andi      x2, x2, 4
-  bne       x2, x0, ebgcd_r_odd
-
-  /* r is even: */
-  /* [w5,w4] = r <= r/2 = [w5,w4] >> 1 */
-  bn.rshi   w4, w5, w4 >> 1
-  bn.rshi   w5, w31, w5 >> 1
-  jal       x0, ebgcd_loop
-
-  ebgcd_r_odd:
-  /* [w5,w4] = r <= (r + m)/2 = ([w5,w4] + [w13,w12]) >> 1 */
-  bn.add    w4, w4, w12
-  bn.addc   w5, w5, w13
-  bn.rshi   w4, w5, w4 >> 1
-  bn.rshi   w5, w31, w5 >> 1
-  jal       x0, ebgcd_loop
-
-  ebgcd_u_odd:
-  /* test if v is odd */
-  bn.or     w10, w10, w10
-  csrrs     x2, 0x7c0, x0
-  andi      x2, x2, 4
-  bne       x2, x0, ebgcd_uv_odd
-
-  /* v is even: */
-  /* [w11,w10] = v <= v/2 = [w11,w10] >> 1 */
-  bn.rshi   w10, w11, w10 >> 1
-  bn.rshi   w11, w31, w11 >> 1
-
-  /* test if s is odd */
-  bn.or     w6, w6, w6
-  csrrs     x2, 0x7c0, x0
-  andi      x2, x2, 4
-  bne       x2, x0, ebgcd_s_odd
-
-  /* s is even: */
-  /* [w7,w6] = s <= s/2 = [w7,w6] >> 1 */
-  bn.rshi   w6, w7, w6 >> 1
-  bn.rshi   w7, w31, w7 >> 1
-  jal       x0, ebgcd_loop
-
-  ebgcd_s_odd:
-  /* [w7,w6] = s <= (s + m)/2 = ([w7,w6] + [w13,w12]) >> 1 */
-  bn.add    w6, w6, w12
-  bn.addc   w7, w7, w13
-  bn.rshi   w6, w7, w6 >> 1
-  bn.rshi   w7, w31, w7 >> 1
-  jal       x0, ebgcd_loop
-
-  ebgcd_uv_odd:
-  /* test if v >= u */
-  bn.cmp    w10, w8
-  bn.cmpb   w11, w9
-  csrrs     x2, 0x7c0, x0
-  andi      x2, x2, 1
-  beq       x2, x0, ebgcd_v_gte_u
-
-  /* u > v: */
-  /* [w5,w4] = r <= r - s = [w5,w4] - [w7,w6]; if (r < 0): r <= r + m */
-  bn.sub    w4, w4, w6
-  bn.subb   w5, w5, w7
-  bn.add    w18, w4, w12
-  bn.addc   w19, w5, w13
-  bn.sel    w4, w18, w4, C
-  bn.sel    w5, w19, w5, C
-
-  /* [w9,w8] = u <= u - v = [w9,w8] - [w11,w10] */
-  bn.sub    w8, w8, w10
-  bn.subb   w9, w9, w11
-  jal       x0, ebgcd_loop
-
-  ebgcd_v_gte_u:
-  /* [w7,w6] = s <= s - r = [w7,w6] - [w5,w4]; if (s < 0) s <= s + m */
-  bn.sub    w6, w6, w4
-  bn.subb   w7, w7, w5
-  bn.add    w18, w6, w12
-  bn.addc   w19, w7, w13
-  bn.sel    w6, w18, w6, C
-  bn.sel    w7, w19, w7, C
-
-  /* [w11,w10] = v <= v - u = [w11,w10] - [w9,w8] */
-  bn.sub    w10, w10, w8
-  bn.subb   w11, w11, w9
-
-  /* if v > 0 go back to start of loop */
-  bn.cmp    w31, w10
-  bn.cmpb   w31, w11
-  csrrs     x2, 0x7c0, x0
-  andi      x2, x2, 1
-  bne       x2, x0, ebgcd_loop
-
-  /* v <= 0: */
-  /* if (r > m): [w17,w16] = a <= r - m = [w5,w4] - [w13,w12]
-     else: [w17,w16] = a <= r = [w5,w4] */
-  bn.sub    w18, w4, w12
-  bn.subb   w19, w5, w13
-  bn.cmp    w12, w4
-  bn.cmpb   w13, w5
-  bn.sel    w16, w18, w4, C
-  bn.sel    w17, w19, w5, C
-
-  ret
-
-
 /**
  * Store curve point in projective coordinates (non randomized)
  *
@@ -339,7 +51,6 @@ store_aff_proj:
 
   ret
 
-
 /**
  * Store curve point in projective coordinates (non randomized)
  *
@@ -394,7 +105,8 @@ store_proj:
  *
  * Scratchpad memory layout:
  * The routine expects at least 896 bytes of scratchpad memory at dmem
- * location 'scratchpad' (sp). Internally the scratchpad is used as follows:
+ * location 'scratchpad' (sp).
+ * Internally the scratchpad is used as follows:
  * dptr_sp     .. dptr_sp+191: point C, projective
  * dptr_sp+192 .. dptr_sp+383: point G, projective
  * dptr_sp+384 .. dptr_sp+575: point Q, projective
@@ -431,30 +143,30 @@ p384_verify:
   /* goto 'fail' if [w30,w29] == [w31, w31] <=> s == 0 */
   bn.cmp    w31, w29
   bn.cmpb   w31, w30
-  csrrs     x2, 0x7c0, x0
+  csrrs     x2, FG0, x0
   andi      x2, x2, 1
   beq       x2, x0, fail
 
   /* goto 'fail' if [w30,w29] >= [w12,w13] <=> s >= n */
   bn.cmp    w29, w12
   bn.cmpb   w30, w13
-  csrrs     x2, 0x7c0, x0
+  csrrs     x2, FG0, x0
   andi      x2, x2, 1
   beq       x2, x0, fail
 
+  /* Compute Solinas constant k for modulus n (we know it is only 191 bits, so
+     no need to compute the high part):
+     w14 <= 2^256 - n[255:0] = (2^384 - n) mod (2^256) = 2^384 - n */
+  bn.sub    w14, w31, w12
+
   /* Compute modular inverse of S
      Note: This can be replaced by the 'mod_inv_n_p384' subroutine at the
            cost of ~60k cycles if reduced code size is targeted */
   /* [w9,w8] <= [w17,w16] <= s^-1  mod n = [w30,w29]^-1 mod [w13,w12] */
-  jal       x1, mod_inv_var
+  jal       x1, mod_inv_n_p384
   bn.mov    w8, w16
   bn.mov    w9, w17
 
-  /* Compute Solinas constant k for modulus n (we know it is only 191 bits, so
-     no need to compute the high part):
-     w14 <= 2^256 - n[255:0] = (2^384 - n) mod (2^256) = 2^384 - n */
-  bn.sub    w14, w31, w12
-
   /* set regfile pointers to in/out regs of Barrett routine */
   li        x22, 10
   li        x23, 11
@@ -472,14 +184,14 @@ p384_verify:
   /* goto 'fail' if [w11, w10] == [w31, w31] <=> r == 0 */
   bn.cmp    w31, w10
   bn.cmpb   w31, w11
-  csrrs     x2, 0x7c0, x0
+  csrrs     x2, FG0, x0
   andi      x2, x2, 1
   beq       x2, x0, fail
 
   /* goto 'fail' if [w11,w10] >= [w12,w13] <=> r >= n */
   bn.cmp    w10, w12
   bn.cmpb   w11, w13
-  csrrs     x2, 0x7c0, x0
+  csrrs     x2, FG0, x0
   andi      x2, x2, 1
   beq       x2, x0, fail
 
@@ -590,7 +302,7 @@ p384_verify:
     bn.addc   w1, w1, w1
 
     /* keep MSB/carry bit in x3: x3 <= u1[i] */
-    csrrs     x3, 0x7c0, x0
+    csrrs     x3, FG0, x0
     andi      x3, x3, 1
 
     /* left shift u2 = [w3,w2] <= [w3,w2] << 1 */
@@ -598,7 +310,7 @@ p384_verify:
     bn.addc   w3, w3, w3
 
     /* keep MSB/carry bit in x3: x4 <= u2[i] */
-    csrrs     x4, 0x7c0, x0
+    csrrs     x4, FG0, x0
     andi      x4, x4, 1
     li        x2, 0
 
@@ -651,8 +363,20 @@ p384_verify:
     jal       x1, store_proj
     nop
 
-  /* compute inverse of z-coordinate: [w1,w0] <= z_c^-1  mod p */
-  jal       x1, mod_inv_var
+  /* load domain parameter p (order of finite field)
+     [w13, w12] <= p = dmem[p384_p] */
+  li        x2, 12
+  la        x3, p384_p
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2++, 32(x3)
+
+  /* Compute Solinas constant k for modulus p (we know it is only 129 bits, so
+     no need to compute the high part):
+     w14 <= 2^256 - p[255:0] = (2^384 - p) mod (2^256) = 2^384 - p */
+  bn.sub    w14, w31, w12
+
+  /* compute inverse of z-coordinate: [w17,w16] <= z_c^-1  mod p */
+  jal       x1, mod_inv_n_p384
 
   /* convert x-coordinate of C back to affine: x1 = x_c * z_c^-1  mod p */
   bn.mov    w10, w25
@@ -687,47 +411,46 @@ p384_verify:
 /* pointers and scratchpad memory */
 .section .data
 
-/* pointer to k (dptr_k) */
-.globl dptr_k
-dptr_k:
-  .zero 4
-
 /* pointer to rnd (dptr_rnd)
    used for result here */
 .globl dptr_rnd
+.weak dptr_rnd
 dptr_rnd:
   .zero 4
 
 /* pointer to msg (dptr_msg) */
 .globl dptr_msg
+.weak dptr_msg
 dptr_msg:
   .zero 4
 
-/* pointer to R (dptr_r) */
-.globl dptr_r
-dptr_r:
-  .zero 4
-
-/* pointer to S (dptr_s) */
-.globl dptr_s
-dptr_s:
-  .zero 4
-
 /* pointer to X (dptr_x) */
 .globl dptr_x
+.weak dptr_x
 dptr_x:
   .zero 4
 
 /* pointer to Y (dptr_y) */
 .globl dptr_y
+.weak dptr_y
 dptr_y:
   .zero 4
 
-/* pointer to D (dptr_d) */
-.globl dptr_d
-dptr_d:
+/* pointer to R (dptr_r) */
+.globl dptr_r
+.weak dptr_r
+dptr_r:
+  .zero 4
+
+/* pointer to S (dptr_s) */
+.globl dptr_s
+.weak dptr_s
+dptr_s:
   .zero 4
 
 /* Scratchpad memory */
+.balign 32
+.globl scratchpad
+.weak scratchpad
 scratchpad:
   .zero 896
diff --git a/sw/otbn/crypto/primality.s b/sw/otbn/crypto/primality.s
index afc2599b05255..9d51bdeef59d2 100644
--- a/sw/otbn/crypto/primality.s
+++ b/sw/otbn/crypto/primality.s
@@ -33,7 +33,7 @@
  * @param[in] x10: t, number of Miller-Rabin rounds (security parameter)
  * @param[in] x14: dptr_b, pointer to temporary working buffer in dmem (n*32 bytes)
  * @param[in] x15: dptr_z, pointer to temporary working buffer in dmem (n*32 bytes)
- * @param[in] x16: dptr_w, pointer to candidate prime w in dmem
+ * @param[in] x16: dptr_w, pointer to candidate prime w in dmem, w mod 4 = 3
  * @param[in] x17: dptr_m0inv, pointer to Montgomery constant m0' (for w) in dmem
  * @param[in] x18: dptr_rr, pointer to Montgomery constant RR = R^2 mod w in dmem
  * @param[in] x30: n, number of limbs for all bignums (wlen / 256; n <= 16)
@@ -56,7 +56,7 @@ miller_rabin:
     bn.cmp   w31, w21
 
     /* x2 <= CSRs[FG0][0] = FG0.C */
-    csrrs    x2, 0x7c0, x0
+    csrrs    x2, FG0, x0
     andi     x2, x2, 1
 
     /* Skip the rest of the loop if w is composite (x2 == 0). We can't exit
@@ -95,7 +95,7 @@ miller_rabin:
  *
  * @param[in] x14: dptr_b, pointer to temporary working buffer in dmem (n*32 bytes)
  * @param[in] x15: dptr_z, pointer to temporary working buffer in dmem (n*32 bytes)
- * @param[in] x16: dptr_w, pointer to candidate prime w in dmem
+ * @param[in] x16: dptr_w, pointer to candidate prime w in dmem, w mod 4 = 3
  * @param[in] x17: dptr_m0inv, pointer to Montgomery constant m0' (for w) in dmem
  * @param[in] x18: dptr_rr, pointer to Montgomery constant RR = R^2 mod w in dmem
  * @param[in] x30: n, number of limbs for all bignums (wlen / 256; n <= 16)
@@ -116,9 +116,9 @@ miller_rabin_round:
   addi     x2, x14, 0
   loop     x30, 4
     /* w22 <= URND() */
-    bn.wsrr  w22, 0x1
+    bn.wsrr  w22, URND
     /* w23 <= RND() */
-    bn.wsrr  w23, 0x2
+    bn.wsrr  w23, RND
     /* w23 <= w22 ^ w23 */
     bn.xor   w23, w22, w23
     /* b[i] <= w23 */
@@ -142,7 +142,7 @@ miller_rabin_round:
 
   /* Extract FG0.C into a small register and jump back to the start if it is 0.
        x2 <= CSRs[FG0][0] = FG0.C */
-  csrrs    x2, 0x7c0, x0
+  csrrs    x2, FG0, x0
   andi     x2, x2, 1
   beq      x2, x0, miller_rabin_round
 
@@ -164,7 +164,7 @@ miller_rabin_round:
 
   /* Extract FG0.C into a small register and retry if it is 0.
        x2 <= CSRs[FG0][0] = FG0.C */
-  csrrs    x2, 0x7c0, x0
+  csrrs    x2, FG0, x0
   andi     x2, x2, 1
   beq      x2, x0, miller_rabin_round
 
@@ -198,56 +198,27 @@ miller_rabin_round:
  *     4.7 Continue.
  *   5. Return PROBABLY PRIME.
  *
- * This routine corresponds to steps 4.3 through 4.7.
- *
- * This loop needs to be constant-time relative to w if w is possibly prime (if
- * we find proof that w is composite we are permitted to break early). To make
- * the algorithm constant-time, we need to:
- *   - Compute b^(w-1) mod w in one loop, rather than separate into b^m mod w
- *     and a squaring loop for trailing zeroes.
- *   - Keep track of whether we have already reached a "step 4.7" condition,
- *     meaning we should return that w is possibly prime regardless of the rest
- *     of the loop.
- *
- * For each bit of (w-1), if we are in the case where all the remaining bits
- * are 0 (steps 4.4-4.5 of the FIPS procedure), then we have four possible
- * cases:
- *   1. If z == w - 1, then b is a witness to the primality of w regardless of
- *      what happens in the rest of the loop (step 4.4/step 4.5.2).
- *   2. If z == 1 and the current bit of (w - 1) is 1, then b is a witness to
- *      the primality of w regardless of what happens in the rest of the loop
- *      (step 4.4).
- *   3. If z == 1 and the current bit of (w - 1) is 0, then w is composite and
- *      we can exit early (optional and not currently implemented).
- *   4. If none of the above, we should continue the exponentiation.
- *
- * In pseudocode, the constant-time variant of steps 4.3-4.7 above looks like:
- *   z = 1
- *   possibly_prime = false // 0 represents "composite"
- *   for i=wlen-1 down to 0 {
- *     // Perform the next step of modular exponentiation.
- *     wi = ((w - 1) >> i) & 1
- *     z = wi ? (z^2 * b) mod w : (z^2) mod w
- *
- *     // Get the lower bits (to see if they're all zero).
- *     w_low = (w - 1) mod (2^i)
- *
- *     // Determine if b is a witness to the primality of w.
- *     possibly_prime |= ((w_low == 0) && (z == w-1))
- *     possibly_prime |= ((w_low == 0) && (wi == 1) && (z == 1))
- *   }
- *   return possibly_prime
+ * If we specialize to the case that w mod 4 = 3, the routine becomes much
+ * simpler and easier to make constant-time, because a in step 1 is always 1.
+ * In pseudocode, the modified version of steps 4.3 through 4.7 is:
+ *   4.3 Compute z = b^((w-1)/2) mod w.
+ *   4.4 If ((z = 1) or (z = w - 1)), then go to step 4.7.
+ *   4.5 No-op.
+ *   4.6 Return COMPOSITE.
+ *   4.7 Continue.
  *
  * Expects the Montgomery constants for w to be precomputed before entry. For
  * this routine, R = 2^(n*256) and R/2 < w < R. None of the input buffers may
  * overlap in DMEM. This routine runs in constant time relative to w if w is
  * possibly prime.
  *
+ * This routine is constant-time relative to w if w is possibly prime.
+ *
  * Flags: Flags have no meaning beyond the scope of this subroutine.
  *
  * @param[in] x14: dptr_b, pointer to randomly-generated witness to use for testing
  * @param[in] x15: dptr_z, pointer to temporary working buffer in dmem (n*32 bytes)
- * @param[in] x16: dptr_w, pointer to candidate prime w in dmem
+ * @param[in] x16: dptr_w, pointer to candidate prime w in dmem, w mod 4 = 3
  * @param[in] x17: dptr_m0inv, pointer to Montgomery constant m0' (for w) in dmem
  * @param[in] x18: dptr_rr, pointer to Montgomery constant RR = R^2 mod w in dmem
  * @param[in] x30: n, number of limbs for all bignums (wlen / 256; n <= 16)
@@ -276,207 +247,137 @@ test_witness:
     bn.sid    x8, 0(x21++)
     addi      x8, x8, 1
 
-  /* Initialize work buffer to R mod w (1 in Montgomery form).
-       dmem[dptr_z:dptr_z+n*32] <= montmul(1, RR) = R mod w */
-  addi      x19, x18, 0
-  addi      x21, x15, 0
-  jal       x1, montmul_mul1
-
-  /* Initialize the "possibly prime" tracking register to 0.
-       w21 <= 0 */
-  bn.mov    w21, w31
-
   /* Initialize wide-register pointers. */
   li        x23, 23
   li        x25, 25
 
-  /* Initialize loop counter.
-       x26 <= n */
-  addi      x26, x30, 0
-
-  /* Loop through the limbs of (w - 1), most significant first.
-
-     Throughout the loop we maintain a mask that is 0 until all remaining bits
-     of (w - 1) are 0.
-       - We have not yet reached the part of the loop where the remaining bits
-         of (w-1) are all 0 (i.e. step 4.5), OR
-       - We have already discovered that b is a witness to the primality of w
-
-     Loop invariants at start of loop for iteration i (i=n-1..0):
-       x4  = 0 if w has already been found to be composite, all 1s otherwise
-       x16 = dptr_w
-       x26 = i+1
-       w21 = all 1s if b is already a witness to the primality of w, otherwise 0
-       dmem[dptr_z:dptr_z+n*32] <= (b^((w - 1) >> (i*256)) * R) mod w
-  */
-  loop    x30, 12
-    /* w20 <= 2^256 - 1 */
-    bn.not  w20, w31
-
-    /* Set flags in preparation for loop.
-         FG0.C <= 1
-         FG0.Z <= 1 */
-    bn.addi  w25, w20, 1
-
-    /* Compute limb i of (w-1) and set the mask (w20) based on whether
-       the lower limbs are all-zero. */
-    addi     x3, x16, 0
-    loop     x26, 3
-      /* Select mask based on whether the previous limb was 0.
-           w20 <= FG0.Z ? w20 : w31 */
-      bn.sel   w20, w20, w31, FG0.Z
-      /* w25 <= next limb of w */
-      bn.lid   x25, 0(x3++)
-      /* w22 <= (w24 - FG0.C) mod 2^256 = next limb of (w - 1) */
-      bn.subb  w22, w25, w31
-
-   /* Loop through the bits of this limb. The code is separated in order to
-      make it more readable and to make loop instruction counting easier, even
-      though this is the only call site. We use unconditional branches instead
-      of jal/ret to avoid consuming the call stack unnecessarily. */
-    loopi   256, 2
-      jal      x0, test_witness_step
-_test_witness_step_done:
-      nop
-
-    /* Update the loop counter.
-         x26 <= x26 - 1 = i - 1 */
-    addi     x3, x0, 1
-    sub      x26, x26, x3
-
-  /* TODO: add an FI check here to ensure we completed all loop iterations if
-     the result register is all 1s. */
+  /* Ensure the lowest 2 bits of the candidate prime are set so that w mod 4 = 3.
+     This is a precondition of the subroutine, but re-setting the bits here
+     provides further protection from e.g. fault injection attacks. */
+  bn.lid    x25, 0(x16)
+  bn.addi   w23, w31, 3
+  bn.or    w25, w25, w23
+  bn.sid    x25, 0(x16)
 
-  ret
-
-/**
- * Inner loop body for the Miller-Rabin primality test.
- *
- * This subroutine expects and maintains the following loop invariants, for
- * loop counter j=0..255:
- *   x9 = 3
- *   x10 = 4
- *   x11 = 2
- *   x23 = 23
- *   x25 = 25
- *   x30 = n
- *   x31 = n-1
- *   w21 = all 1s if b is already a witness to the primality of w, otherwise 0
- *   w22 = ((w - 1)[i] << j) mod 2^256
- *   dmem[dptr_z:dptr_z+n*32] <= (b^((w - 1) >> (i*256+j)) * R) mod w
- *
- * See `test_witness` for more explanation.
- *
- * Flags: Flags have no meaning beyond the scope of this subroutine.
- *
- * @param[in]  x9: 3, constant
- * @param[in] x10: 4, constant
- * @param[in] x11: 2, constant
- * @param[in] x14: dptr_b, pointer to randomly-generated witness to use for testing
- * @param[in] x15: dptr_z, pointer to temporary working buffer in dmem (n*32 bytes)
- * @param[in] x16: dptr_w, pointer to candidate prime w in dmem
- * @param[in] x17: dptr_m0inv, pointer to Montgomery constant m0' (for w) in dmem
- * @param[in] x23: 23, constant
- * @param[in] x25: 25, constant
- * @param[in] x30: n, number of limbs
- * @param[in] x31: n-1
- * @param[in] w31: all-zero
- * @param[in,out] w21: 2^256-1 if w is possibly prime, 0 otherwise
- * @param[in,out] w22: current limb of exponent, shifted (see invariant)
- * @param[in,out] dmem[dptr_z:dptr_z+n*32]: intermediate value (see invariant)
- *
- * clobbered registers: x2, x3, x5 to x8, x10, x12, x13, x19 to x22,
- *                      w2, w3, w4..w[4+(n-1)], w21 to w30
- * clobbered flag groups: FG0, FG1
- */
-test_witness_step:
-  /* Perform the next squaring step of modular exponentiation.
-       w4..w[4+(n-1)] = montmul(z, z) */
-  addi      x19, x15, 0
-  addi      x20, x15, 0
-  jal       x1, montmul
+  /* Clear carry flag.
+       FG0.C <= 0 */
+  bn.sub    w31, w31, w31
 
-  /* Store squaring result in work buffer.
-       dmem[dptr_z:dptr_z+n*32] <= w4..w[4+(n-1)] */
+  /* Initialize work buffer to (R - w) mod w (1 in Montgomery form).
+       dmem[dptr_z:dptr_z+n*32] <= (0 - w) mod R = R - w = R mod w */
+  addi      x20, x16, 0
   addi      x21, x15, 0
-  loop      x30, 2
-    bn.sid    x8, 0(x21++)
-    addi      x8, x8, 1
+  loop      x30, 3
+    bn.lid    x23, 0(x20++)
+    bn.subb   w23, w31, w23
+    bn.sid    x23, 0(x21++)
 
-  /* Perform the next multiplication step of modular exponentiation.
-       w4..w[4+(n-1)] = montmul(z, b) */
-  addi      x19, x14, 0
-  addi      x20, x15, 0
-  jal       x1, montmul
+  /* Initialize loop counter and high limb.
+       x26 <= n - 1
+       w20 <= 0 */
+  addi      x26, x31, 0
+  bn.sub    w20, w20, w20
 
-  /* Shift the exponent and update flags; FG0.C will now be the next bit of
-     the exponent, and FG0.Z will be 1 if the remaining bits in this limb
-     are zero.
-       w22 <= (w22 << 1) mod 2^256
-       FG0.C <= w22[255]
-       FG0.Z <= w22 mod 2^255 =? 0 */
-  bn.add    w22, w22, w22
-
-  /* Select either squared or squared+multiplied result based on FG0.C.
-       dmem[dptr_z:dptr_z+n*32] <=
-         FG0.C ? w4..w[4+(n-1)] : dmem[dptr_z:dptr_z+n*32] */
-  addi      x2, x15, 0
-  li        x8, 4
-  loop      x30, 4
-    /* w23 <= dmem[dptr_z+i*32] */
-    bn.lid    x23, 0(x2)
-    /* w25 <= w[4+i] */
-    bn.movr   x25, x8++
-    /* w23 = FG0.C ? w[4+i] : dmem[dptr_z+i*32] */
-    bn.sel    w23, w25, w23, FG0.C
-    /* dmem[dptr_z+i*32] <= w23 */
-    bn.sid    x23, 0(x2++)
+  /* Perform modular exponentiation to compute b^((w-1)/2).
 
-  /* Select a mask that is all 1s if all the remaining bits of (w-1) are 0.
-     That means BOTH:
-       - the lower limbs are 0 (w20 == 2^256 - 1), AND
-       - the rest of the current limb is 0 (FG0.Z == 1)
+     Loop through the limbs, most significant first, then iterate through each
+     bit of each limb.
 
-       w3  <= FG0.Z ? w20 : w31
-            = all 1s if w mod 2^(i*256+j) is 0, otherwise 0 */
-  bn.sel    w3, w20, w31, FG0.Z
+     Loop invariants (i=n-1 to 0):
+       x15 = dptr_z
+       x16 = dptr_w
+       x26 = i
+       w20 = w[i+1] (or 0 if i=n-1)
+       dmem[dptr_z:dptr_z+n*32] <= (b^((w - 1) >> (i*256)) * R) mod w */
+  loop    x30, 27
+    /* Get the ith limb of w.
+         w25 <= dmem[dptr_w + (i << 5)] = w[i] */
+    slli      x13, x26, 5
+    add       x13, x13, x16
+    bn.lid    x25, 0(x13)
+
+    /* Get limb i of ((w-1) / 2). Since we know w is odd, we can simply
+       concatenate with the limb above and shift right by 1.
+         w22 <= (w20[0] << 255) | (w[i] >> 1) = (w >> 1)[i] */
+    bn.rshi   w22, w20, w25 >> 1
+
+    /* Save the ith limb for the next iteration.
+         w20 <= w[i] */
+    bn.mov    w20, w25
+
+    /* Loop through the bits of this limb and multiply/accumulate. */
+    loopi   256, 19
+      /* Perform the next squaring step of modular exponentiation.
+           w4..w[4+(n-1)] = montmul(z, z) */
+      addi      x19, x15, 0
+      addi      x20, x15, 0
+      jal       x1, montmul
+
+      /* Store squaring result in work buffer.
+           dmem[dptr_z:dptr_z+n*32] <= w4..w[4+(n-1)] */
+      addi      x21, x15, 0
+      loop      x30, 2
+        bn.sid    x8, 0(x21++)
+        addi      x8, x8, 1
+
+      /* Perform the next multiplication step of modular exponentiation.
+           w4..w[4+(n-1)] = montmul(z, b) */
+      addi      x19, x14, 0
+      addi      x20, x15, 0
+      jal       x1, montmul
+
+      /* Shift the exponent and update flags; FG0.C will now be the next bit of
+         the exponent.
+           w22 <= (w22 << 1) mod 2^256
+           FG0.C <= w22[255] */
+      bn.add    w22, w22, w22
+
+      /* Select either squared or squared+multiplied result based on FG0.C.
+           dmem[dptr_z:dptr_z+n*32] <=
+             FG0.C ? w4..w[4+(n-1)] : dmem[dptr_z:dptr_z+n*32] */
+      addi      x2, x15, 0
+      li        x8, 4
+      loop      x30, 4
+        /* w23 <= dmem[dptr_z+i*32] */
+        bn.lid    x23, 0(x2)
+        /* w25 <= w[4+i] */
+        bn.movr   x25, x8++
+        /* w23 = FG0.C ? w[4+i] : dmem[dptr_z+i*32] */
+        bn.sel    w23, w25, w23, FG0.C
+        /* dmem[dptr_z+i*32] <= w23 */
+        bn.sid    x23, 0(x2++)
+
+      /* End of inner loop. */
+      nop
 
-  /* Capture FG0.C, the current bit of (w - 1), as a mask.
-       w24 <= (0 - FG0.C) mod 2^256 = FG0.C ? 2^256 - 1 : 0 */
-  bn.subb   w24, w31, w31
+    /* Update loop counter.
+         x26 <= x26 - 1 = i - 1 */
+    li        x3, 1
+    sub       x26, x26, x3
+    /* End of outer loop. */
 
   /* Fully reduce mod w. The `montmul` routine does not guarantee that the
      result is < w, only < R.
        dmem[dptr_z:dptr_z+n*32] <= dmem[dptr_z:dptr_z+n*32] mod w */
   jal       x1, reduce_modw
 
+  /* Check if the intermediate result represents 1 in Montgomery form.
+       w22 <= all 1s if dmem[x15:x15+n*32] is R mod w, otherwise 0 */
+  jal      x1, is_mont1
+  bn.mov   w22, w26
+
   /* Check if the work buffer is (-R) mod w, which is the Montgomery form
      representation of (-1) mod w = w - 1.
         w26 <= all 1s if dmem[x15:x15+n*32] is (-R) mod w, otherwise 0 */
   jal      x1, is_mont_minus1
 
-  /* If the intermediate result is w - 1 (w26) AND the remaining bits of w
-     are all-zero (w3), then b is a witness to the primality of w. This
-     corresponds to steps 4.4 and 4.5.2 of the FIPS procedure.
-       w21 <= w21 | (w3 & w26) */
-  bn.and   w2, w3, w26
-  bn.or    w21, w21, w2
+  /* If either check returned all-ones, then the input is possibly prime. */
+  bn.or    w21, w26, w22
 
-  /* Check if the intermediate result represents 1 in Montgomery form.
-       w26 <= all 1s if dmem[x15:x15+n*32] is R mod w, otherwise 0 */
-  jal      x1, is_mont1
-
-  /* If the intermediate result is 1 (w26) AND the remaining bits of w are
-     all-zero (w3) AND the current bit of w is 1 (w24), then b is a
-     witness to the primality of w. This corresponds to step 4.4 in the
-     FIPS procedure.
-       w21 <= w21 | (w3 & w24 & w26) */
-  bn.and   w2, w3, w26
-  bn.and   w2, w2, w24
-  bn.or    w21, w21, w2
+  /* TODO: add an FI check here to ensure we completed all loop iterations if
+     the result register is all 1s. */
 
-  /* Unconditional branch back to `test_witness`. */
-  jal      x0, _test_witness_step_done
+  ret
 
 /**
  * Fully reduce modulo a candidate prime w.
@@ -592,6 +493,8 @@ is_mont1:
  * specialized and sensitive to the range of w (for some w, 3w - R could also
  * be equivalent to w - 1).
  *
+ * WARNING: this routine clobbers its input in DMEM (dmem[dptr_x..dptr_x+n*32]).
+ *
  * Flags: Flags have no meaning beyond the scope of this subroutine.
  *
  * @param[in] x15: dptr_x, pointer to input buffer x in dmem
@@ -627,21 +530,4 @@ is_mont_minus1:
        w26 <= all 1s if dmem[dptr_x:dptr_x+n*32] == (-R) mod w, otherwise 0 */
   jal      x1, is_mont1
 
-  /* Clear flags. */
-  bn.sub   w31, w31, w31
-
-  /* Negate the input back to its previous form.
-       dmem[dptr_x:dptr_x+n*32] <= w - dmem[dptr_x:dptr_x+n*32] */
-  addi     x2, x15, 0
-  addi     x3, x16, 0
-  loop     x30, 4
-    /* w23 <= x[i] */
-    bn.lid   x23, 0(x2)
-    /* w25 <= w[i] */
-    bn.lid   x25, 0(x3++)
-    /* w23 <= w[i] - out[i] - FG0.C */
-    bn.subb  w23, w25, w23
-    /* out[i] <= w23 */
-    bn.sid   x23, 0(x2++)
-
   ret
diff --git a/sw/otbn/crypto/rsa_keygen.s b/sw/otbn/crypto/rsa_keygen.s
index 03959f28fb99d..4483c112d628f 100644
--- a/sw/otbn/crypto/rsa_keygen.s
+++ b/sw/otbn/crypto/rsa_keygen.s
@@ -4,6 +4,7 @@
 
 /* Public interface. */
 .globl rsa_keygen
+.globl rsa_key_from_cofactor
 
 /* Exposed for testing purposes only. */
 .globl relprime_f4
@@ -14,6 +15,10 @@
 /**
  * Generate a random RSA key pair.
  *
+ * The public key is the pair (n, e), where n is the modulus and e is the
+ * public exponent, and the private key is the pair (n, d), where n is the same
+ * modulus as in the public key and d is the private exponent.
+ *
  * For the official specification, see FIPS 186-5 section A.1.3. For the
  * purposes of this implementation, the RSA public exponent e is always 65537
  * (aka the Fermat number "F4", 2^16 + 1).
@@ -28,13 +33,13 @@
  *
  * Flags: Flags have no meaning beyond the scope of this subroutine.
  *
- * @param[in]  x30: number of 256-bit limbs for p and q (key size in bits / 512)
+ * @param[in]  x30: plen, number of 256-bit limbs for p and q
  * @param[in]  w31: all-zero
- * @param[out] dmem[rsa_n..rsa_n+(n*2*32)] RSA public key modulus (n)
- * @param[out] dmem[rsa_d..rsa_d+(n*2*32)] RSA private exponent (d)
+ * @param[out] dmem[rsa_n..rsa_n+(plen*2*32)] RSA public key modulus (n)
+ * @param[out] dmem[rsa_d..rsa_d+(plen*2*32)] RSA private exponent (d)
  *
- * clobbered registers: x2 to x15, x17 to x26, x31,
- *                      w2, w3, w4..w[4+(n-1)], w20 to w30
+ * clobbered registers: x2 to x26, x31,
+ *                      w2, w3, w4..w[4+(plen-1)], w20 to w30
  * clobbered flag groups: FG0, FG1
  */
 rsa_keygen:
@@ -50,21 +55,26 @@ rsa_keygen:
   li       x21, 21
 
   /* Generate the first prime, p.
-       dmem[rsa_p..rsa_p+(n*32)] <= p */
+       dmem[rsa_p..rsa_p+(plen*32)] <= p */
   jal      x1, generate_p
   /* Generate the second prime, q.
-       dmem[rsa_q..rsa_q+(n*32)] <= q */
+       dmem[rsa_q..rsa_q+(plen*32)] <= q */
   jal      x1, generate_q
 
   /* Multiply p and q to get the public modulus n.
-       dmem[rsa_n..rsa_n+(n*2*32)] <= p * q */
+       dmem[rsa_n..rsa_n+(plen*2*32)] <= p * q */
   la       x10, rsa_p
   la       x11, rsa_q
   la       x12, rsa_n
   jal      x1, bignum_mul
 
-  /* Derive the private exponent d from p and q (tail-call). */
-  jal      x0, derive_d
+  /* Derive the private exponent d from p and q.
+       dmem[rsa_d..rsa_d+(plen*2*32)] <= d */
+  jal      x1, derive_d
+
+  /* Check that d is large enough (tail-call). If d is not large enough,
+     then `check_d` will restart the key-generation process. */
+  jal      x0, check_d
 
 /**
  * Derive the private RSA exponent d.
@@ -72,18 +82,21 @@ rsa_keygen:
  * Returns d = (65537^-1) mod LCM(p-1, q-1).
  *
  * This function overwrites p and q, and requires that they are continuous in
- * memory (specifically, it expects to be able to use 512 bytes of space
- * following the label `rsa_pq`).
+ * memory. Specifically, it expects to be able to use 512 bytes of space
+ * following the label `rsa_pq`.
  *
- * Flags: Flags are not set in this subroutine.
+ * Important: This routine uses `rsa_cofactor` as a second 512-byte work buffer
+ * and clobbers the contents.
  *
- * @param[in] dmem[rsa_p..rsa_p+(n*32)]: first prime p
- * @param[in] dmem[rsa_q..rsa_q+(n*32)]: second prime q
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * @param[in] dmem[rsa_p..rsa_p+(plen*32)]: first prime p
+ * @param[in] dmem[rsa_q..rsa_q+(plen*32)]: second prime q
  * @param[in]  x20: 20, constant
  * @param[in]  x21: 21, constant
- * @param[in]  x30: number of 256-bit limbs for p and q
+ * @param[in]  x30: plen, number of 256-bit limbs for p and q
  * @param[in]  w31: all-zero
- * @param[out] dmem[rsa_d..rsa_d+(n*2*32)]: result, private exponent d
+ * @param[out] dmem[rsa_d..rsa_d+(plen*2*32)]: result, private exponent d
  *
  * clobbered registers: x2 to x8, x10 to x15, x20 to x26, x31, w20 to w28
  * clobbered flag groups: FG0, FG1
@@ -94,13 +107,13 @@ derive_d:
   la       x11, rsa_q
 
   /* Subtract 1 from p in-place (no carry from lowest limb since p is odd).
-       dmem[rsa_p..rsa_p+(n*32)] <= p - 1 */
+       dmem[rsa_p..rsa_p+(plen*32)] <= p - 1 */
   bn.lid   x20, 0(x10)
   bn.subi  w20, w20, 1
   bn.sid   x20, 0(x10)
 
   /* Subtract 1 from q in-place (no carry from lowest limb since p is odd).
-       dmem[rsa_q..rsa_q+(n*32)] <= q - 1 */
+       dmem[rsa_q..rsa_q+(plen*32)] <= q - 1 */
   bn.lid   x20, 0(x11)
   bn.subi  w20, w20, 1
   bn.sid   x20, 0(x11)
@@ -111,52 +124,152 @@ derive_d:
   jal      x1, lcm
 
   /* Update the number of limbs for modinv.
-       x30 <= n*2 */
+       x30 <= plen*2 */
   add      x30, x30, x30
 
   /* Compute d = (65537^-1) mod LCM(p-1,q-1). The modular inverse
-     routine requires two working buffers, which we construct from `tmp_data`
-     and the required-contiguous `rsa_p` and `rsa_q` buffers.
-       dmem[rsa_d..rsa_d+(n*2*32)] <= (65537^-1) mod dmem[x12..x12+(n*2*32)] */
+     routine requires two working buffers, which we construct from
+     `rsa_cofactor` and the required-contiguous `rsa_p` and `rsa_q` buffers.
+       dmem[rsa_d..rsa_d+(plen*2*32)] <= (65537^-1) mod dmem[x12..x12+(plen*2*32)] */
+  la       x12, tmp_scratchpad
   la       x13, rsa_d
-  la       x14, tmp_data
+  la       x14, rsa_cofactor
   la       x15, rsa_pq
   jal      x1, modinv_f4
 
-  /* x30 <= (n*2) >> 1 = n */
+  /* Reset the limb count.
+       x30 <= (plen*2) >> 1 = plen */
   srli     x30, x30, 1
+  ret
 
-  /* Get a pointer to the nth limb of d (halfway through the number).
-       x3 <= rsa_d + n*32 */
+/**
+ * Check the private RSA exponent d.
+ *
+ * Calls `rsa_keygen` if d is too small, otherwise returns. Designed to be
+ * tail-called by `rsa_keygen`.
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * @param[in]  x20: 20, constant
+ * @param[in]  x30: plen, number of 256-bit limbs for p and q
+ * @param[in]  w31: all-zero
+ * @param[in]  dmem[rsa_d..rsa_d+(plen*2*32)]: private exponent d to check
+ *
+ * clobbered registers: x2, x3, w20, w23
+ * clobbered flag groups: FG0, FG1
+ */
+check_d:
+  /* Get a pointer to the second half of d.
+       x3 <= rsa_d + plen*32 */
   slli     x2, x30, 5
   la       x3, rsa_d
   add      x3, x3, x2
 
-  /* Check that d > 2^(n*256), i.e. that the highest n limbs are nonzero. We
+  /* Check that d > 2^(plen*256), i.e. that the highest plen limbs are nonzero. We
      need to retry if it's too small (see FIPS 186-5 section A.1.1), although
-     in practice this is unlikely. We do this by ORing the n highest limbs.
-       FG0.Z <= (d >> (n*256)) == 0 */
+     in practice this is unlikely. We do this by ORing the plen highest limbs.
+       FG0.Z <= (d >> (plen*256)) == 0 */
   bn.mov   w23, w31
   loop     x30, 2
     /* w20 <= d[n+i] */
-    bn.lid   x20, 0(x3++)
+    bn.lid  x20, 0(x3++)
     /* w23 <= w23 | w20 */
     bn.or   w23, w23, w20
 
   /* Get the FG0.Z flag into a register.
        x2 <= (CSRs[FG0] >> 3) & 1 = FG0.Z */
-  csrrs    x2, 0x7c0, x0
+  csrrs    x2, FG0, x0
   srli     x2, x2, 3
   andi     x2, x2, 1
 
-  /* If the flag is set, the high limbs are zero and we should start from
-     scratch, generating a new p and q. Note that x30 MUST be set to n here,
-     not n*2, to meet the rsa_keygen preconditions. */
+  /* If x2 != 0, then d is too small and we need to restart key generation from
+     scratch. */
   bne      x2, x0, rsa_keygen
 
-  /* If we get here, d is OK; return. */
   ret
 
+/**
+ * Construct an RSA key pair from a modulus and cofactor.
+ *
+ * This routine does not check the validity of the RSA key pair; it does not
+ * ensure that the factors are prime or check any other properties; it simply
+ * divides the modulus by the cofactor and derives the private exponent. The
+ * only public exponent supported is e=65537.
+ *
+ * This routine will recompute the public modulus n after deriving the factors;
+ * the caller may want to check that the value matches. If the modulus is not
+ * in fact divisible by the cofactor, or the cofactor is much too small, it
+ * will not match.
+ *
+ * Flags: Flags have no meaning beyond the scope of this subroutine.
+ *
+ * @param[in]  x30: plen, number of 256-bit limbs for p and q
+ * @param[in]  w31: all-zero
+ * @param[in] dmem[rsa_n..rsa_n+(plen*2*32)] RSA public key modulus (n)
+ * @param[in] dmem[rsa_cofactor..rsa_cofactor+(plen*32)] Cofactor (p or q)
+ * @param[out] dmem[rsa_n..rsa_n+(plen*2*32)] Recomputed public key modulus (n)
+ * @param[out] dmem[rsa_d..rsa_d+(plen*2*32)] RSA private exponent (d)
+ *
+ * clobbered registers: x2 to x8, x10 to x15, x20 to x26, x31, w3, w20 to w28
+ * clobbered flag groups: FG0, FG1
+ */
+rsa_key_from_cofactor:
+  /* Initialize wide-register pointers.
+       x20 <= 20
+       x21 <= 21 */
+  li       x20, 20
+  li       x21, 21
+
+  /* Get a pointer to the end of the cofactor.
+       x2 <= rsa_cofactor + plen*32 */
+  slli     x2, x30, 5
+  la       x3, rsa_cofactor
+  add      x2, x2, x3
+
+  /* Set the second half of the cofactor buffer to zero, so the cofactor is the
+     same size as the modulus for division (x3 = 31 selects all-zero w31).
+      dmem[rsa_cofactor+plen*32..rsa_cofactor+plen*2*32] <= 0 */
+  li       x3, 31
+  loop     x30, 1
+    bn.sid   x3, 0(x2++)
+
+  /* Update the number of limbs for division.
+       x30 <= plen*2 */
+  add     x30, x30, x30
+
+  /* Compute (n / cofactor) and store the result in `rsa_pq`. The quotient will
+     only occupy the first half (`rsa_p`) if the input is valid.
+       dmem[rsa_n..rsa_n+plen*2*32] <= n % cofactor
+       dmem[rsa_pq..rsa_pq+plen*2*32] <= n / cofactor */
+  la       x10, rsa_n
+  la       x11, rsa_cofactor
+  la       x12, rsa_pq
+  jal      x1, div
+
+  /* Reset the limb count.
+       x30 <= (plen*2) >> 1 = plen */
+  srli     x30, x30, 1
+
+  /* Copy the original cofactor into `rsa_q`, one limb at a time through w3
+     (x3 = 3), so that both factors are available in `rsa_p` and `rsa_q`.
+      dmem[rsa_q..rsa_q+plen*32] <= dmem[rsa_cofactor..rsa_cofactor+plen*32] */
+  la       x11, rsa_cofactor
+  la       x2, rsa_q
+  li       x3, 3
+  loop     x30, 2
+    bn.lid   x3, 0(x11++)
+    bn.sid   x3, 0(x2++)
+
+  /* Multiply p and q to get the public modulus n.
+       dmem[rsa_n..rsa_n+(plen*2*32)] <= p * q */
+  la       x10, rsa_p
+  la       x11, rsa_q
+  la       x12, rsa_n
+  jal      x1, bignum_mul
+
+  /* Derive the private exponent d from p and q (tail-call). */
+  jal      x0, derive_d
+
 /**
  * Compute the inverse of 65537 modulo a given number.
  *
@@ -275,17 +388,17 @@ derive_d:
  * @param[in]  x15: dptr_v, pointer to a temporary buffer in DMEM (n limbs)
  * @param[in]  x20: 20, constant
  * @param[in]  x21: 21, constant
- * @param[in]  x30: n, number of 256-bit limbs for modulus m and result d
+ * @param[in]  x30: nlen, number of 256-bit limbs for modulus m and result d
  * @param[in]  w31: all-zero
- * @param[out] dmem[dptr_A..dptr_A+(n*32)]: result, modular inverse d
+ * @param[out] dmem[dptr_A..dptr_A+(nlen*32)]: result, modular inverse d
  *
  * clobbered registers: MOD, x2 to x4, x31, w20 to w28
  * clobbered flag groups: FG0, FG1
  */
 modinv_f4:
   /* Zero the intermediate buffers.
-       dmem[dptr_A..dptr_A+(n*32)] <= 0
-       dmem[dptr_C..dptr_C+(n*32)] <= 0 */
+       dmem[dptr_A..dptr_A+(nlen*32)] <= 0
+       dmem[dptr_C..dptr_C+(nlen*32)] <= 0 */
   li       x2, 31
   addi     x3, x13, 0
   addi     x4, x14, 0
@@ -307,7 +420,7 @@ modinv_f4:
   bn.addi  w28, w31, 1
 
   /* Copy the modulus to the buffer for v.
-       dmem[dptr_v..dptr_v+(n*32)] <= m */
+       dmem[dptr_v..dptr_v+(nlen*32)] <= m */
   addi     x3, x12, 0
   addi     x4, x15, 0
   loop     x30, 2
@@ -320,7 +433,7 @@ modinv_f4:
   bn.add   w22, w23, w23 << 16
 
   /* MOD <= 65537 */
-  bn.wsrw  0x0, w22
+  bn.wsrw  MOD, w22
 
   /* Calculate number of loop iterations = bitlen(m) + bitlen(65537).
        x31 <= (x30 << 8) + 17 = 256*n + 17 */
@@ -370,7 +483,7 @@ modinv_f4:
     bn.sub   w22, w22, w20
 
     /* Conditionally subtract u from v.
-         dmem[dptr_v..dptr_v+(n*32)] <= v - (u & w25) */
+         dmem[dptr_v..dptr_v+(nlen*32)] <= v - (u & w25) */
     bn.and   w23, w22, w25
     addi     x2, x15, 0
     loop     x30, 4
@@ -433,7 +546,7 @@ modinv_f4:
 
     /* Update A if we updated u in the previous steps (w24 == 2^256-1). We
        additionally subtract the modulus if *both* w24,w26 == 2^256-1.
-         dmem[dptr_A..dptr_A+(n*32)] <= (w24 == 2^256-1) ? (A + C) mod m : A */
+         dmem[dptr_A..dptr_A+(nlen*32)] <= (w24 == 2^256-1) ? (A + C) mod m : A */
     addi     x2, x12, 0
     addi     x3, x13, 0
     addi     x4, x14, 0
@@ -459,7 +572,7 @@ modinv_f4:
 
     /* Update C if we updated v in the previous steps (w25 == 2^256-1). We
        additionally subtract the modulus if *both* w25,w26 == 2^256-1.
-         dmem[dptr_C..dptr_C+(n*32)] <= (w25 == 2^256-1) ? (A + C) mod m : C */
+         dmem[dptr_C..dptr_C+(nlen*32)] <= (w25 == 2^256-1) ? (A + C) mod m : C */
     addi     x2, x12, 0
     addi     x3, x13, 0
     addi     x4, x14, 0
@@ -509,7 +622,7 @@ modinv_f4:
 
     /* Conditionally add to B.
          w27 <= B + (65537 & w23) */
-    bn.wsrr  w24, 0x0 /* MOD */
+    bn.wsrr  w24, MOD
     bn.and   w24, w24, w23
     bn.add   w27, w27, w24
 
@@ -522,7 +635,7 @@ modinv_f4:
     bn.sub   w31, w31, w31
 
     /* Conditionally add m to A.
-         dmem[dptr_A..dptr_A+(n+32)] <= (!u[0] && (A[0] | B[0])) ? A + m : A */
+         dmem[dptr_A..dptr_A+(nlen*32)] <= (!u[0] && (A[0] | B[0])) ? A + m : A */
     addi     x2, x12, 0
     addi     x3, x13, 0
     loop     x30, 5
@@ -542,7 +655,7 @@ modinv_f4:
     bn.addc  w23, w31, w31
 
     /* Shift A to the right 1 if FG1.L is unset.
-         dmem[dptr_A..dptr_A+(n+32)] <= FG1.L ? A : A >> 1 */
+         dmem[dptr_A..dptr_A+(nlen*32)] <= FG1.L ? A : A >> 1 */
     addi     x3, x13, 0
     jal      x1, bignum_rshift1_if_not_fg1L
 
@@ -552,7 +665,7 @@ modinv_f4:
     bn.or    w20, w20, w31, FG1
 
     /* Shift v to the right 1 if FG1.L is unset.
-         dmem[dptr_v..dptr_v+(n+32)] <= FG1.L ? v : v >> 1 */
+         dmem[dptr_v..dptr_v+(nlen*32)] <= FG1.L ? v : v >> 1 */
     addi     x3, x15, 0
     bn.mov   w23, w31
     jal      x1, bignum_rshift1_if_not_fg1L
@@ -574,7 +687,7 @@ modinv_f4:
 
     /* Conditionally add to D.
          w28 <= D + (65537 & w23) */
-    bn.wsrr  w24, 0x0 /* MOD */
+    bn.wsrr  w24, MOD
     bn.and   w24, w24, w23
     bn.add   w28, w28, w24
 
@@ -587,7 +700,7 @@ modinv_f4:
     bn.sub   w31, w31, w31
 
     /* Conditionally add m to C.
-         dmem[dptr_C..dptr_C+(n+32)] <= (!v[0] && (C[0] | D[0])) ? C + m : C */
+         dmem[dptr_C..dptr_C+(nlen*32)] <= (!v[0] && (C[0] | D[0])) ? C + m : C */
     addi     x2, x12, 0
     addi     x3, x14, 0
     loop     x30, 5
@@ -607,7 +720,7 @@ modinv_f4:
     bn.addc  w23, w31, w31
 
     /* Shift C to the right 1 if FG1.L is unset.
-         dmem[dptr_C..dptr_C+(n+32)] <= FG1.L ? C : C >> 1 */
+         dmem[dptr_C..dptr_C+(nlen*32)] <= FG1.L ? C : C >> 1 */
     addi     x3, x14, 0
     jal      x1, bignum_rshift1_if_not_fg1L
 
@@ -620,7 +733,7 @@ modinv_f4:
 
   /* Get the FG0.Z flag into a register.
        x2 <= (CSRs[FG0] >> 3) & 1 = FG0.Z */
-  csrrs    x2, 0x7c0, x0
+  csrrs    x2, FG0, x0
   srli     x2, x2, 3
   andi     x2, x2, 1
 
@@ -647,10 +760,10 @@ _modinv_f4_u_ok:
  * @param[in]   x3: dptr_A, pointer to input A in DMEM
  * @param[in]  x20: 20, constant
  * @param[in]  x21: 21, constant
- * @param[in]  x30: n, number of 256-bit limbs for input A
+ * @param[in]  x30: alen, number of 256-bit limbs for input A
  * @param[in]   w23: value to use as the msb
  * @param[in]   w31: all-zero
- * @param[out] dmem[dptr_A..dptr_A+n*32]: A', result
+ * @param[out] dmem[dptr_A..dptr_A+alen*32]: A', result
  *
  * clobbered registers: x2, x3, x4, w20, w21
  * clobbered flag groups: FG0
@@ -698,19 +811,19 @@ bignum_rshift1_if_not_fg1L:
  *
  * @param[in]  x20: 20, constant
  * @param[in]  x21: 21, constant
- * @param[in]  x30: n, number of 256-bit limbs in the candidate prime
+ * @param[in]  x30: plen, number of 256-bit limbs in the candidate prime
  * @param[in]  x31: n-1, constant
  * @param[in]  w31: all-zero
- * @param[out] dmem[rsa_p..rsa_p+(n*32)]: result, probable prime p
+ * @param[out] dmem[rsa_p..rsa_p+(plen*32)]: result, probable prime p
  *
- * clobbered registers: x2 to x13, x17 to x19, x22 to x26,
- *                      w2, w3, w4..w[4+(n-1)], w20 to w30
+ * clobbered registers: x2 to x13, x16 to x19, x22 to x26,
+ *                      w2, w3, w4..w[4+(plen-1)], w20 to w30
  * clobbered flag groups: FG0, FG1
  */
 generate_p:
   /* Compute nlen, the bit-length of the RSA modulus based on the number of
      limbs for p.
-       x4 <= n << 9 = n*256*2 = nlen */
+       x4 <= plen << 9 = plen*256*2 = nlen */
   slli     x4, x30, 9
 
   /* Initialize counter for # of attempts.
@@ -731,7 +844,7 @@ _generate_p_counter_nonzero:
   sub      x4, x4, x5
 
   /* Generate a new random value for p.
-       dmem[rsa_p] <= <random n*256-bit odd value> */
+       dmem[rsa_p] <= <random plen*256-bit odd value> */
   la       x16, rsa_p
   jal      x1, generate_prime_candidate
 
@@ -746,7 +859,7 @@ _generate_p_counter_nonzero:
 
   /* Get the FG0.Z flag into a register.
        x2 <= (CSRs[FG0] >> 3) & 1 = FG0.Z */
-  csrrs    x2, 0x7c0, x0
+  csrrs    x2, FG0, x0
   srli     x2, x2, 3
   andi     x2, x2, 1
 
@@ -770,19 +883,19 @@ _generate_p_counter_nonzero:
  *
  * @param[in]  x20: 20, constant
  * @param[in]  x21: 21, constant
- * @param[in]  x30: n, number of 256-bit limbs in the candidate prime
+ * @param[in]  x30: plen, number of 256-bit limbs in the candidate prime
  * @param[in]  x31: n-1, constant
  * @param[in]  w31: all-zero
- * @param[out] dmem[rsa_p..rsa_p+(n*32)]: result, probable prime p
+ * @param[out] dmem[rsa_q..rsa_q+(plen*32)]: result, probable prime q
  *
- * clobbered registers: x2 to x13, x17 to x19, x22 to x26,
- *                      w2, w3, w4..w[4+(n-1)], w20 to w30
+ * clobbered registers: x2 to x13, x16 to x19, x22 to x26,
+ *                      w2, w3, w4..w[4+(plen-1)], w20 to w30
  * clobbered flag groups: FG0, FG1
  */
 generate_q:
   /* Compute nlen, the bit-length of the RSA modulus based on the number of
      limbs for q.
-       x4 <= n << 9 = n*256*2 = nlen */
+       x4 <= plen << 9 = plen*256*2 = nlen */
   slli     x4, x30, 9
 
   /* Initialize counter for # of attempts.
@@ -804,7 +917,7 @@ _generate_q_counter_nonzero:
   sub      x4, x4, x5
 
   /* Generate a new random value for q.
-       dmem[rsa_q] <= <random n*256-bit odd value> */
+       dmem[rsa_q] <= <random plen*256-bit odd value> */
   la       x16, rsa_q
   jal      x1, generate_prime_candidate
 
@@ -819,7 +932,7 @@ _generate_q_counter_nonzero:
 
   /* Get the FG0.Z flag into a register.
        x2 <= (CSRs[FG0] >> 3) & 1 = FG0.Z */
-  csrrs    x2, 0x7c0, x0
+  csrrs    x2, FG0, x0
   srli     x2, x2, 3
   andi     x2, x2, 1
 
@@ -835,13 +948,14 @@ _generate_q_counter_nonzero:
  * Returns all 1s if the check passess, and 0 if it fails.
  *
  * For the candidate value p, this check passes only if:
- *   * p >= sqrt(2)*(2^(nlen/2 - 1)), where nlen = RSA public key length, and
  *   * GCD(p-1, 65537) = 1, and
  *   * p passes 5 rounds of the Miller-Rabin primality test.
  *
  * Assumes that the input is an odd number (this is a precondition for the
- * primality test). Before using this to check untrusted or imported keys, the
- * caller must check to ensure p is odd.
+ * primality test) and that p >= sqrt(2)*(2^(nlen/2 - 1)), where nlen = RSA
+ * public key length. Internally, `generate_prime_candidate` guarantees these
+ * conditions. The caller must ensure them before using this routine to check
+ * untrusted or imported keys.
  *
  * See FIPS 186-5 section A.1.3 for the official spec. See this comment in
  * BoringSSL's implementation for a detailed description of how to choose the
@@ -856,13 +970,13 @@ _generate_q_counter_nonzero:
  * @param[in]  x16: dptr_p, address of the candidate prime in DMEM
  * @param[in]  x20: 20, constant
  * @param[in]  x21: 21, constant
- * @param[in]  x30: n, number of 256-bit limbs in the candidate prime
- * @param[in]  x31: n-1, constant
+ * @param[in]  x30: plen, number of 256-bit limbs in the candidate prime
+ * @param[in]  x31: plen-1, constant
  * @param[in]  w31: all-zero
  * @param[out] w24: result, all 1s if the check passed and 0 otherwise
  *
  * clobbered registers: x2, x3, x5 to x13, x17 to x19, x22 to x26,
- *                      w2, w3, w4..w[4+(n-1)], w20 to w30
+ *                      w2, w3, w4..w[4+(plen-1)], w20 to w30
  * clobbered flag groups: FG0, FG1
  */
 check_p:
@@ -870,46 +984,6 @@ check_p:
        w24 <= 2^256 - 1 */
   bn.not   w24, w31
 
-  /* Get a pointer to the precomputed constant sqrt(2)*2^2047. */
-  la       x2, sqrt2_rsa4k
-
-  /* For RSA-2048 and RSA-3072, we will need to shift the lower bound right to
-     get sqrt(2)*2^1535 and sqrt(2)*2^1023, respectively. We can do this by
-     simply adjusting the pointer to skip the lower limbs.
-       x2 <= x2 + ((8 - x30) << 5) = sqrt2_rsa4k + ((8 - n) * 32) */
-  li       x3, 8
-  sub      x3, x3, x30
-  slli     x3, x3, 5
-  add      x2, x2, x3
-
-  /* Clear flags. */
-  bn.sub   w31, w31, w31
-
-  /* Now, the value at dmem[x2] is n limbs long and represents the lower bound
-     for p. Compare the two values. */
-  addi  x3, x16, 0
-  loop  x30, 3
-    /* w20 <= dmem[x2] = lower_bound[i] */
-    bn.lid    x20, 0(x2++)
-    /* w21 <= dmem[x3] = p[i] */
-    bn.lid    x21, 0(x3++)
-    /* FG0.C <= p[i] <? lower_bound[i] + FG0.C */
-    bn.cmpb   w21, w20
-
-  /* If FG0.C is set, p is smaller than the lower bound; set the result to
-     "checks failed" (0).
-       w24 <= FG0.C ? 0 : w24 */
-  bn.sel    w24, w31, w24, FG0.C
-
-  /* Get the FG0.C flag into a register.
-       x2 <= CSRs[FG0][0] = FG0.C */
-  csrrs    x2, 0x7c0, x0
-  andi     x2, x2, 1
-
-  /* If the flag is set, then the check failed and we can skip the remaining
-     checks. */
-  bne      x2, x0, _check_prime_fail
-
   /* Subtract 1 from the lowest limb in-place.
        dmem[x16] <= dmem[x16] - 1 = p - 1 */
   bn.lid   x20, 0(x16)
@@ -928,7 +1002,7 @@ check_p:
 
   /* Get the FG0.Z flag into a register.
        x2 <= (CSRs[FG0] >> 3) & 1 = FG0.Z */
-  csrrs    x2, 0x7c0, x0
+  csrrs    x2, FG0, x0
   srli     x2, x2, 3
   andi     x2, x2, 1
 
@@ -1021,15 +1095,15 @@ _check_prime_fail:
  *
  * @param[in]  x20: 20, constant
  * @param[in]  x21: 21, constant
- * @param[in]  x30: n, number of 256-bit limbs in the candidate prime
- * @param[in]  x31: n-1, constant
+ * @param[in]  x30: plen, number of 256-bit limbs in the candidate prime
+ * @param[in]  x31: plen-1, constant
  * @param[in]  w31: all-zero
- * @param[in]  dmem[rsa_p..rsa_p+(n*32)]: value for p
- * @param[in]  dmem[rsa_q..rsa_q+(n*32)]: candidate value for q
+ * @param[in]  dmem[rsa_p..rsa_p+(plen*32)]: value for p
+ * @param[in]  dmem[rsa_q..rsa_q+(plen*32)]: candidate value for q
  * @param[out] w24: result, all 1s if the check passed and 0 otherwise
  *
  * clobbered registers: x2, x3, x5 to x13, x17 to x19, x22 to x26,
- *                      w2, w3, w4..w[4+(n-1)], w20 to w30
+ *                      w2, w3, w4..w[4+(plen-1)], w20 to w30
  * clobbered flag groups: FG0, FG1
  */
 check_q:
@@ -1067,7 +1141,7 @@ check_q:
 
   /* Get the FG0.Z flag into a register.
        x2 <= (CSRs[FG0] >> 3) & 1 = FG0.Z */
-  csrrs    x2, 0x7c0, x0
+  csrrs    x2, FG0, x0
   srli     x2, x2, 3
   andi     x2, x2, 1
 
@@ -1082,43 +1156,45 @@ check_q:
 /**
  * Generate a candidate prime (can be used for either p or q).
  *
- * Fixes the lowest and highest bits to 1, so the number is always odd and >=
- * 2^(256*n). All other bits are fully random.
+ * Fixes the lowest 3 bits to 1 and the highest 2 bits to 1, so the number is
+ * always equivalent to 7 mod 8 and is always >= 2^(256*plen - 1) * 1.5. This
+ * implies that the prime candidate is always in range, i.e. it is greater than
+ * sqrt(2) * (2^(256*plen - 1)), because sqrt(2) < 1.5. All other bits are fully
+ * random. This follows FIPS 186-5 section A.1.3, which allows generating prime
+ * candidates with a specific value mod 8 and allows the highest 2 bits to be
+ * set arbitrarily.
  *
  * Flags: Flags have no meaning beyond the scope of this subroutine.
  *
  * @param[in]  x16: dptr_result, address of the result buffer in DMEM
  * @param[in]  x20: 20, constant
- * @param[in]  x30: n, number of 256-bit limbs for the result
- * @param[in]  x31: n-1, constant
+ * @param[in]  x30: plen, number of 256-bit limbs for the result
+ * @param[in]  x31: plen-1, constant
  * @param[in]  w31: all-zero
- * @param[out] dmem[x16..x16+(n*32)]: random candidate prime
+ * @param[out] dmem[x16..x16+(plen*32)]: random candidate prime
  *
  * clobbered registers: x2, x3, w20, w21
  * clobbered flag groups: FG0
  */
 generate_prime_candidate:
   /* Generate random 256-bit limbs.
-       dmem[x16..x16+(n*32)] <= RND(n*32) ^ URND(n*32)  */
+       dmem[x16..x16+(plen*32)] <= RND(plen*32) ^ URND(plen*32)  */
   addi     x2, x16, 0
   loop     x30, 4
     /* w20 <= RND() */
-    bn.wsrr  w20, 0x1 /* RND */
+    bn.wsrr  w20, RND
     /* w21 <= URND() */
-    bn.wsrr  w21, 0x2 /* URND */
+    bn.wsrr  w21, URND
     /* w20 <= w20 ^ w21 */
     bn.xor   w20, w20, w21
     /* dmem[x2] <= w20 */
     bn.sid   x20, 0(x2++)
 
-  /* Create an all-ones mask.
-       w21 <= 2^256 - 1 */
-  bn.not   w21, w31
-
-  /* Fix the lowest bit to 1 so the number is always odd.
-       dmem[x16] <= (dmem[x16] << 1) mod 2^256 | 1 */
+  /* Fix the lowest 3 bits to 1 so the number is always 7 mod 8.
+       dmem[x16] <= dmem[x16] | 7 */
   bn.lid   x20, 0(x16)
-  bn.rshi  w20, w20, w21 >> 255
+  bn.addi  w21, w31, 7
+  bn.or    w20, w20, w21
   bn.sid   x20, 0(x16)
 
   /* Get a pointer to the last limb.
@@ -1126,12 +1202,11 @@ generate_prime_candidate:
   slli     x3, x31, 5
   add      x2, x16, x3
 
-  /* Fix the highest bit to 1 so the number is always at least 2^(256*n-1).
-     This is implied by the lower bound and setting the bit is explicitly
-     permitted by FIPS 186-5.
-       dmem[x2] <= 1 << 255 | (dmem[x2] >> 1) */
+  /* Fix the highest 2 bits to 1.
+       dmem[x2] <= dmem[x2] | (3 << 6) << 248 = dmem[x2] | 3 << 254 */
   bn.lid   x20, 0(x2)
-  bn.rshi  w20, w21, w20 >> 1
+  bn.addi  w21, w31, 192
+  bn.or    w20, w20, w21 << 248
   bn.sid   x20, 0(x2)
 
   ret
@@ -1175,7 +1250,7 @@ generate_prime_candidate:
  * Flags: Flags have no meaning beyond the scope of this subroutine.
  *
  * @param[in]  x16: dptr_x, pointer to first limb of x in dmem
- * @param[in]  x30: n, number of 256-bit limbs for x
+ * @param[in]  x30: plen, number of 256-bit limbs for x
  * @param[in]  w31: all-zero
  * @param[out] w22: result, 0 only if x is not relatively prime to F4
  *
@@ -1187,7 +1262,7 @@ relprime_f4:
        MOD <= 2^16 + 1 */
   bn.addi  w22, w31, 1
   bn.add   w22, w22, w22 << 16
-  bn.wsrw  0x0, w22
+  bn.wsrw  MOD, w22
 
   /* Initialize constants for loop. */
   li      x22, 22
@@ -1282,7 +1357,8 @@ relprime_f4:
 
 /* Extra label marking the start of p || q in memory. The `derive_d` function
    uses this to get a 512-byte working buffer, which means p and q must be
-   continuous in memory (but it's OK if their order is reversed). */
+   contiguous in memory. In addition, `rsa_key_from_cofactor` uses the
+   larger buffer for division and depends on the order of `p` and `q`. */
 .balign 32
 rsa_pq:
 
@@ -1301,7 +1377,7 @@ rsa_q:
 tmp_scratchpad:
 .zero 512
 
-.section .data
+.bss
 
 /* RSA modulus n = p*q (up to 4096 bits). */
 .balign 32
@@ -1315,9 +1391,11 @@ rsa_n:
 rsa_d:
 .zero 512
 
-/* Temporary working buffer (4096 bits). */
+/* Prime cofactor for n for `rsa_key_from_cofactor`; also used as a temporary
+ * work buffer. */
 .balign 32
-tmp_data:
+.globl rsa_cofactor
+rsa_cofactor:
 .zero 512
 
 /* Montgomery constant m0' (256 bits). */
@@ -1329,77 +1407,3 @@ mont_m0inv:
 .balign 32
 mont_rr:
 .zero 256
-
-/* Precomputed value for sqrt(2)*(2^2047), such that
-     (sqrt2_rsa4k^2 < 2**4095 < (sqrt2_rsa4k+1)^2
-
-   This number was taken from BoringSSL's implementation and has enough
-   precision to be exact for RSA-4096 and smaller:
-     https://boringssl.googlesource.com/boringssl/+/dcabfe2d8940529a69e007660fa7bf6c15954ecc/crypto/fipsmodule/rsa/rsa_impl.c#1006
-*/
-.balign 32
-sqrt2_rsa4k:
-  .word 0xe633e3e1
-  .word 0x4d7c60a5
-  .word 0xca3ea33b
-  .word 0x5fcf8f7b
-  .word 0x92957023
-  .word 0xc246785e
-  .word 0x797f2805
-  .word 0xf9acce41
-  .word 0xd3b1f780
-  .word 0xfdfe170f
-  .word 0x3facb882
-  .word 0xd24f4a76
-  .word 0xaff5f3b2
-  .word 0x18838a2e
-  .word 0xa2f7dc33
-  .word 0xc1fcbdde
-  .word 0xf7aa81c2
-  .word 0xdea06241
-  .word 0xca221307
-  .word 0xf6a1be3f
-  .word 0x7bda1ebf
-  .word 0x332a5e9f
-  .word 0xfe32352f
-  .word 0x0104dc01
-  .word 0x6f8236c7
-  .word 0xb8cf341b
-  .word 0xd528b651
-  .word 0x4264dabc
-  .word 0xebc93e0c
-  .word 0xf4d3a02c
-  .word 0xd8fd0efd
-  .word 0x81394ab6
-  .word 0x9040ca4a
-  .word 0xeaa4a089
-  .word 0x836e582e
-  .word 0xf52f120f
-  .word 0x31f3c84d
-  .word 0xcb2a6343
-  .word 0x8bb7e9dc
-  .word 0xc6d5a8a3
-  .word 0x2f7c4e33
-  .word 0x460abc72
-  .word 0x1688458a
-  .word 0xcab1bc91
-  .word 0x11bc337b
-  .word 0x53059c60
-  .word 0x42af1f4e
-  .word 0xd2202e87
-  .word 0x3dfa2768
-  .word 0x78048736
-  .word 0x439c7b4a
-  .word 0x0f74a85e
-  .word 0xdc83db39
-  .word 0xa8b1fe6f
-  .word 0x3ab8a2c3
-  .word 0x4afc8304
-  .word 0x83339915
-  .word 0xed17ac85
-  .word 0x893ba84c
-  .word 0x1d6f60ba
-  .word 0x754abe9f
-  .word 0x597d89b3
-  .word 0xf9de6484
-  .word 0xb504f333
diff --git a/sw/otbn/crypto/rsa_verify.s b/sw/otbn/crypto/rsa_verify.s
index 50de37ff7d3c3..e9b0ee3618a37 100644
--- a/sw/otbn/crypto/rsa_verify.s
+++ b/sw/otbn/crypto/rsa_verify.s
@@ -132,7 +132,7 @@ cmp_dmem_reg_buf:
 
   /* compare limbs and store comparison result in x3 */
   bn.cmp    w2, w3, FG1
-  csrrs     x3, 0x7c1, x0
+  csrrs     x3, FG1, x0
 
   /* leave loop if lowest limb was reached */
   beq       x8, x7, cmp_end
@@ -230,7 +230,7 @@ compute_rr:
 
     /* In case of final carry in doubling procedure substract modulus */
     /* Jump to 'rr_sub' if FG1.C == 1 */
-    csrrs     x3, 0x7c1, x0
+    csrrs     x3, FG1, x0
     andi      x3, x3, 1
     bne       x3, x0, rr_sub
 
@@ -239,7 +239,7 @@ compute_rr:
     bn.lid    x10, 0(x17)
     bn.movr   x11, x9
     bn.cmp    w2, w3, FG1
-    csrrs     x3, 0x7c1, x0
+    csrrs     x3, FG1, x0
 
     /* If the highest limbs of buf and mod are equal we have to run a
        multi-limb comparison. This is very unlikely to happen. If this
@@ -501,7 +501,7 @@ mont_loop:
   bn.movr   x10++, x13
 
   /* No subtracion if carry bit of addition of carry words not set. */
-  csrrs     x2, 0x7c1, x0
+  csrrs     x2, FG1, x0
   andi      x2, x2, 1
   beq       x2, x0, mont_loop_no_sub
 
@@ -698,7 +698,7 @@ modexp_var:
     bn.lid    x9, 0(x16++)
     bn.subb   w2, w2, w3
     bn.movr   x17++, x11
-  csrrs     x2, 0x7c0, x0
+  csrrs     x2, FG0, x0
   /* TODO: currently we subtract the modulus if out_buf == M. This should
             never happen in an RSA context. We could catch this and raise an
             alert. */
diff --git a/sw/otbn/crypto/rsa_verify_3072_m0inv.s b/sw/otbn/crypto/rsa_verify_3072_m0inv.s
index 6807f9a7074df..650ec0519eb61 100644
--- a/sw/otbn/crypto/rsa_verify_3072_m0inv.s
+++ b/sw/otbn/crypto/rsa_verify_3072_m0inv.s
@@ -68,7 +68,7 @@ check_eq_w6w27:
 
     /* Get value from flag register.
          x3 <= (b < a) */
-    csrrs     x3, 0x7c0, x0
+    csrrs     x3, FG0, x0
     andi      x3, x3, 1
 
     /* Check if a < b. */
@@ -76,7 +76,7 @@ check_eq_w6w27:
 
     /* Get value from flag register.
          x4 <= (a < b) */
-    csrrs     x4, 0x7c0, x0
+    csrrs     x4, FG0, x0
     andi      x4, x4, 1
 
     /* If b < a or a < b, then a != b; otherwise a = b.
diff --git a/sw/otbn/crypto/rsa_verify_3072_rr.s b/sw/otbn/crypto/rsa_verify_3072_rr.s
index 072432c6773a5..b6145bbb51ac0 100644
--- a/sw/otbn/crypto/rsa_verify_3072_rr.s
+++ b/sw/otbn/crypto/rsa_verify_3072_rr.s
@@ -98,13 +98,13 @@ double_mod_var:
 
   /* Extract final carry bit from flags register.
        x2 <= aa[3072] */
-  csrrs     x2, 0x7c0, x0
+  csrrs     x2, FG0, x0
   andi      x2, x2, 1
 
   jal       x1, subtract_modulus_var
 
   /* Extract final borrow bit from flags register. */
-  csrrs     x3, 0x7c0, x0
+  csrrs     x3, FG0, x0
   andi      x3, x3, 1
 
   /**
diff --git a/sw/otbn/crypto/run_rsa_keygen.s b/sw/otbn/crypto/run_rsa_keygen.s
index a82bfcf816ea5..7544ba7ed853c 100644
--- a/sw/otbn/crypto/run_rsa_keygen.s
+++ b/sw/otbn/crypto/run_rsa_keygen.s
@@ -4,11 +4,17 @@
 
 /**
  * RSA key generation.
+ *
+ * This binary can be called in two different modes:
+ * - `GEN` mode generates a new, random keypair
+ * - `COFACTOR` mode constructs a keypair from n, e, d, and either p or q.
+ *
+ * Both modes support three sizes: RSA-2048, RSA-3072, and RSA-4096.
  */
 
 /**
  * Mode magic values generated with
- * $ ./util/design/sparse-fsm-encode.py -d 6 -m 4 -n 11 \
+ * $ ./util/design/sparse-fsm-encode.py -d 6 -m 6 -n 11 \
  *    --avoid-zero -s 561689407
  *
  * Call the same utility with the same arguments and a higher -m to generate
@@ -19,9 +25,12 @@
  * as `li`. If support is added, we could use 32-bit values here instead of
  * 11-bit.
  */
-.equ MODE_RSA_2048, 0x3b7
-.equ MODE_RSA_3072, 0x4fa
-.equ MODE_RSA_4096, 0x74d
+.equ MODE_GEN_RSA_2048, 0x137
+.equ MODE_GEN_RSA_3072, 0x4e5
+.equ MODE_GEN_RSA_4096, 0x63a
+.equ MODE_COFACTOR_RSA_2048, 0x34e
+.equ MODE_COFACTOR_RSA_3072, 0x0db
+.equ MODE_COFACTOR_RSA_4096, 0x794
 
 .section .text.start
 start:
@@ -32,15 +41,24 @@ start:
   la      x2, mode
   lw      x2, 0(x2)
 
-  addi    x3, x0, MODE_RSA_2048
+  addi    x3, x0, MODE_GEN_RSA_2048
   beq     x2, x3, rsa_keygen_2048
 
-  addi    x3, x0, MODE_RSA_3072
+  addi    x3, x0, MODE_GEN_RSA_3072
   beq     x2, x3, rsa_keygen_3072
 
-  addi    x3, x0, MODE_RSA_4096
+  addi    x3, x0, MODE_GEN_RSA_4096
   beq     x2, x3, rsa_keygen_4096
 
+  addi    x3, x0, MODE_COFACTOR_RSA_2048
+  beq     x2, x3, rsa_key_from_cofactor_2048
+
+  addi    x3, x0, MODE_COFACTOR_RSA_3072
+  beq     x2, x3, rsa_key_from_cofactor_3072
+
+  addi    x3, x0, MODE_COFACTOR_RSA_4096
+  beq     x2, x3, rsa_key_from_cofactor_4096
+
   /* Unsupported mode; fail. */
   unimp
   unimp
@@ -70,10 +88,34 @@ rsa_keygen_4096:
   jal     x1, rsa_keygen
   ecall
 
+rsa_key_from_cofactor_2048:
+  /* Set the number of limbs for the primes (2048 / 2 / 256). */
+  li      x30, 4
+
+  /* Generate a key (results in dmem[rsa_n] and dmem[rsa_d]). */
+  jal     x1, rsa_key_from_cofactor
+  ecall
+
+rsa_key_from_cofactor_3072:
+  /* Set the number of limbs for the primes (3072 / 2 / 256). */
+  li      x30, 6
+
+  /* Generate a key (results in dmem[rsa_n] and dmem[rsa_d]). */
+  jal     x1, rsa_key_from_cofactor
+  ecall
+
+rsa_key_from_cofactor_4096:
+  /* Set the number of limbs for the primes (4096 / 2 / 256). */
+  li      x30, 8
+
+  /* Generate a key (results in dmem[rsa_n] and dmem[rsa_d]). */
+  jal     x1, rsa_key_from_cofactor
+  ecall
+
 .bss
 
 /* Operational mode. */
 .globl mode
 .balign 4
 mode:
-  .zero 4
+.zero 4
diff --git a/sw/otbn/crypto/tests/BUILD b/sw/otbn/crypto/tests/BUILD
index 3044ad623373d..10172edecd588 100644
--- a/sw/otbn/crypto/tests/BUILD
+++ b/sw/otbn/crypto/tests/BUILD
@@ -2,7 +2,7 @@
 # Licensed under the Apache License, Version 2.0, see LICENSE for details.
 # SPDX-License-Identifier: Apache-2.0
 
-load("//rules:otbn.bzl", "otbn_consttime_test", "otbn_sim_test")
+load("//rules:otbn.bzl", "otbn_consttime_test", "otbn_library", "otbn_sim_test")
 
 package(default_visibility = ["//visibility:public"])
 
@@ -200,7 +200,7 @@ otbn_sim_test(
     ],
     exp = "p256_base_mult_test.exp",
     deps = [
-        "//sw/otbn/crypto:p256",
+        "//sw/otbn/crypto:p256_base",
     ],
 )
 
@@ -235,7 +235,18 @@ otbn_sim_test(
     ],
     exp = "p256_key_from_seed_test.exp",
     deps = [
-        "//sw/otbn/crypto:p256",
+        "//sw/otbn/crypto:p256_base",
+    ],
+)
+
+otbn_sim_test(
+    name = "p256_mul_modp_test",
+    srcs = [
+        "p256_mul_modp_test.s",
+    ],
+    exp = "p256_mul_modp_test.exp",
+    deps = [
+        "//sw/otbn/crypto:p256_base",
     ],
 )
 
@@ -248,10 +259,10 @@ otbn_consttime_test(
 )
 
 otbn_consttime_test(
-    name = "p256_scalar_mult_consttime",
-    subroutine = "p256_scalar_mult",
+    name = "p256_shared_key_consttime",
+    subroutine = "p256_shared_key",
     deps = [
-        "//sw/otbn/crypto:p256_ecdsa",
+        "//sw/otbn/crypto:p256_ecdh",
     ],
 )
 
@@ -276,7 +287,8 @@ otbn_sim_test(
     ],
     exp = "p256_ecdsa_sign_test.exp",
     deps = [
-        "//sw/otbn/crypto:p256",
+        "//sw/otbn/crypto:p256_base",
+        "//sw/otbn/crypto:p256_sign",
     ],
 )
 
@@ -287,7 +299,8 @@ otbn_sim_test(
     ],
     exp = "p256_ecdsa_verify_test.exp",
     deps = [
-        "//sw/otbn/crypto:p256",
+        "//sw/otbn/crypto:p256_base",
+        "//sw/otbn/crypto:p256_verify",
     ],
 )
 
@@ -298,7 +311,8 @@ otbn_sim_test(
     ],
     exp = "p256_isoncurve_test.exp",
     deps = [
-        "//sw/otbn/crypto:p256",
+        "//sw/otbn/crypto:p256_base",
+        "//sw/otbn/crypto:p256_isoncurve",
     ],
 )
 
@@ -309,7 +323,7 @@ otbn_sim_test(
     ],
     exp = "p256_proj_add_test.exp",
     deps = [
-        "//sw/otbn/crypto:p256",
+        "//sw/otbn/crypto:p256_base",
     ],
 )
 
@@ -320,7 +334,43 @@ otbn_sim_test(
     ],
     exp = "p256_scalar_mult_test.exp",
     deps = [
-        "//sw/otbn/crypto:p256",
+        "//sw/otbn/crypto:p256_base",
+    ],
+)
+
+otbn_sim_test(
+    name = "p256_ecdh_shared_key_test",
+    srcs = [
+        "p256_ecdh_shared_key_test.s",
+    ],
+    exp = "p256_ecdh_shared_key_test.exp",
+    deps = [
+        "//sw/otbn/crypto:p256_base",
+        "//sw/otbn/crypto:p256_shared_key",
+    ],
+)
+
+otbn_sim_test(
+    name = "p256_arithmetic_to_boolean_test",
+    srcs = [
+        "p256_arithmetic_to_boolean_test.s",
+    ],
+    exp = "p256_arithmetic_to_boolean_test.exp",
+    deps = [
+        "//sw/otbn/crypto:p256_base",
+        "//sw/otbn/crypto:p256_shared_key",
+    ],
+)
+
+otbn_sim_test(
+    name = "p256_arithmetic_to_boolean_mod_test",
+    srcs = [
+        "p256_arithmetic_to_boolean_mod_test.s",
+    ],
+    exp = "p256_arithmetic_to_boolean_mod_test.exp",
+    deps = [
+        "//sw/otbn/crypto:p256_base",
+        "//sw/otbn/crypto:p256_shared_key",
     ],
 )
 
@@ -332,7 +382,46 @@ otbn_sim_test(
     exp = "p384_base_mult_test.exp",
     deps = [
         "//sw/otbn/crypto:p384_base",
-        "//sw/otbn/crypto:p384_sign",
+        "//sw/otbn/crypto:p384_base_mult",
+        "//sw/otbn/crypto:p384_internal_mult",
+    ],
+)
+
+otbn_sim_test(
+    name = "p384_arithmetic_to_boolean_test",
+    srcs = [
+        "p384_arithmetic_to_boolean_test.s",
+    ],
+    exp = "p384_arithmetic_to_boolean_test.exp",
+    deps = [
+        "//sw/otbn/crypto:p384_a2b",
+        "//sw/otbn/crypto:p384_base",
+    ],
+)
+
+otbn_sim_test(
+    name = "p384_arithmetic_to_boolean_mod_test",
+    srcs = [
+        "p384_arithmetic_to_boolean_mod_test.s",
+    ],
+    exp = "p384_arithmetic_to_boolean_mod_test.exp",
+    deps = [
+        "//sw/otbn/crypto:p384_a2b",
+        "//sw/otbn/crypto:p384_base",
+    ],
+)
+
+otbn_sim_test(
+    name = "p384_ecdh_shared_key_test",
+    srcs = [
+        "p384_ecdh_shared_key_test.s",
+    ],
+    exp = "p384_ecdh_shared_key_test.exp",
+    deps = [
+        "//sw/otbn/crypto:p384_a2b",
+        "//sw/otbn/crypto:p384_base",
+        "//sw/otbn/crypto:p384_internal_mult",
+        "//sw/otbn/crypto:p384_scalar_mult",
     ],
 )
 
@@ -344,6 +433,8 @@ otbn_sim_test(
     exp = "p384_ecdsa_sign_test.exp",
     deps = [
         "//sw/otbn/crypto:p384_base",
+        "//sw/otbn/crypto:p384_internal_mult",
+        "//sw/otbn/crypto:p384_modinv",
         "//sw/otbn/crypto:p384_sign",
     ],
 )
@@ -356,6 +447,8 @@ otbn_sim_test(
     exp = "p384_ecdsa_verify_test.exp",
     deps = [
         "//sw/otbn/crypto:p384_base",
+        "//sw/otbn/crypto:p384_isoncurve",
+        "//sw/otbn/crypto:p384_modinv",
         "//sw/otbn/crypto:p384_verify",
     ],
 )
@@ -368,7 +461,31 @@ otbn_sim_test(
     exp = "p384_isoncurve_test.exp",
     deps = [
         "//sw/otbn/crypto:p384_base",
-        "//sw/otbn/crypto:p384_verify",
+        "//sw/otbn/crypto:p384_isoncurve",
+    ],
+)
+
+otbn_sim_test(
+    name = "p384_curve_point_valid_test",
+    srcs = [
+        "p384_curve_point_valid_test.s",
+    ],
+    exp = "p384_curve_point_valid_test.exp",
+    deps = [
+        "//sw/otbn/crypto:p384_base",
+        "//sw/otbn/crypto:p384_isoncurve",
+    ],
+)
+
+otbn_sim_test(
+    name = "p384_keygen_test",
+    srcs = [
+        "p384_keygen_test.s",
+    ],
+    exp = "p384_keygen_test.exp",
+    deps = [
+        "//sw/otbn/crypto:p384_base",
+        "//sw/otbn/crypto:p384_keygen",
     ],
 )
 
@@ -391,7 +508,19 @@ otbn_sim_test(
     exp = "p384_scalar_mult_test.exp",
     deps = [
         "//sw/otbn/crypto:p384_base",
-        "//sw/otbn/crypto:p384_sign",
+        "//sw/otbn/crypto:p384_internal_mult",
+        "//sw/otbn/crypto:p384_scalar_mult",
+    ],
+)
+
+otbn_sim_test(
+    name = "p384_mulmod448x128_test",
+    srcs = [
+        "p384_mulmod448x128_test.s",
+    ],
+    exp = "p384_mulmod448x128_test.exp",
+    deps = [
+        "//sw/otbn/crypto:p384_base",
     ],
 )
 
@@ -399,7 +528,15 @@ otbn_consttime_test(
     name = "p384_base_mult_consttime",
     subroutine = "p384_base_mult",
     deps = [
-        ":p384_ecdsa_sign_test",
+        ":p384_base_mult_test",
+    ],
+)
+
+otbn_consttime_test(
+    name = "p384_scalar_mult_consttime",
+    subroutine = "p384_scalar_mult",
+    deps = [
+        ":p384_scalar_mult_test",
     ],
 )
 
@@ -447,11 +584,10 @@ otbn_consttime_test(
     ],
 )
 
-otbn_consttime_test(
-    name = "scalar_mult_p384_consttime",
-    subroutine = "scalar_mult_p384",
-    deps = [
-        ":p384_ecdsa_sign_test",
+otbn_library(
+    name = "fake_primality",
+    srcs = [
+        "fake_primality.s",
     ],
 )
 
@@ -618,15 +754,99 @@ otbn_consttime_test(
     ],
 )
 
+otbn_library(
+    name = "rsa_keygen_checkpq_test_data",
+    srcs = [
+        "rsa_keygen_checkpq_test_data.s",
+    ],
+)
+
 otbn_sim_test(
-    name = "rsa_keygen_checkpq_test",
-    # This test is very long because it runs multiple primality tests.
-    timeout = "eternal",
+    name = "rsa_keygen_checkp_good_test",
+    timeout = "long",  # runs a primality test
+    srcs = [
+        "rsa_keygen_checkp_good_test.s",
+    ],
+    exp = "rsa_keygen_checkp_good_test.exp",
+    tags = ["nightly"],  # slow, do not run in CI
+    deps = [
+        ":rsa_keygen_checkpq_test_data",
+        "//sw/otbn/crypto:div",
+        "//sw/otbn/crypto:gcd",
+        "//sw/otbn/crypto:lcm",
+        "//sw/otbn/crypto:montmul",
+        "//sw/otbn/crypto:mul",
+        "//sw/otbn/crypto:primality",
+        "//sw/otbn/crypto:rsa_keygen",
+    ],
+)
+
+otbn_sim_test(
+    name = "rsa_keygen_checkp_not_relprime_test",
+    srcs = [
+        "rsa_keygen_checkp_not_relprime_test.s",
+    ],
+    exp = "rsa_keygen_checkp_not_relprime_test.exp",
+    deps = [
+        ":fake_primality",
+        ":rsa_keygen_checkpq_test_data",
+        "//sw/otbn/crypto:div",
+        "//sw/otbn/crypto:gcd",
+        "//sw/otbn/crypto:lcm",
+        "//sw/otbn/crypto:montmul",
+        "//sw/otbn/crypto:mul",
+        "//sw/otbn/crypto:rsa_keygen",
+    ],
+)
+
+otbn_sim_test(
+    name = "rsa_keygen_checkp_not_prime_test",
+    timeout = "long",  # runs a primality test
+    srcs = [
+        "rsa_keygen_checkp_not_prime_test.s",
+    ],
+    exp = "rsa_keygen_checkp_not_prime_test.exp",
+    deps = [
+        ":rsa_keygen_checkpq_test_data",
+        "//sw/otbn/crypto:div",
+        "//sw/otbn/crypto:gcd",
+        "//sw/otbn/crypto:lcm",
+        "//sw/otbn/crypto:montmul",
+        "//sw/otbn/crypto:mul",
+        "//sw/otbn/crypto:primality",
+        "//sw/otbn/crypto:rsa_keygen",
+    ],
+)
+
+otbn_sim_test(
+    name = "rsa_keygen_checkq_good_test",
+    timeout = "long",  # runs a primality test
+    srcs = [
+        "rsa_keygen_checkq_good_test.s",
+    ],
+    exp = "rsa_keygen_checkq_good_test.exp",
+    tags = ["nightly"],  # slow, do not run in CI
+    deps = [
+        ":rsa_keygen_checkpq_test_data",
+        "//sw/otbn/crypto:div",
+        "//sw/otbn/crypto:gcd",
+        "//sw/otbn/crypto:lcm",
+        "//sw/otbn/crypto:montmul",
+        "//sw/otbn/crypto:mul",
+        "//sw/otbn/crypto:primality",
+        "//sw/otbn/crypto:rsa_keygen",
+    ],
+)
+
+otbn_sim_test(
+    name = "rsa_keygen_checkq_not_prime_test",
+    timeout = "long",  # runs a primality test
     srcs = [
-        "rsa_keygen_checkpq_test.s",
+        "rsa_keygen_checkq_not_prime_test.s",
     ],
-    exp = "rsa_keygen_checkpq_test.exp",
+    exp = "rsa_keygen_checkq_not_prime_test.exp",
     deps = [
+        ":rsa_keygen_checkpq_test_data",
         "//sw/otbn/crypto:div",
         "//sw/otbn/crypto:gcd",
         "//sw/otbn/crypto:lcm",
@@ -637,6 +857,42 @@ otbn_sim_test(
     ],
 )
 
+otbn_sim_test(
+    name = "rsa_keygen_checkq_not_relprime_test",
+    srcs = [
+        "rsa_keygen_checkq_not_relprime_test.s",
+    ],
+    exp = "rsa_keygen_checkq_not_relprime_test.exp",
+    deps = [
+        ":fake_primality",
+        ":rsa_keygen_checkpq_test_data",
+        "//sw/otbn/crypto:div",
+        "//sw/otbn/crypto:gcd",
+        "//sw/otbn/crypto:lcm",
+        "//sw/otbn/crypto:montmul",
+        "//sw/otbn/crypto:mul",
+        "//sw/otbn/crypto:rsa_keygen",
+    ],
+)
+
+otbn_sim_test(
+    name = "rsa_keygen_checkq_too_close_test",
+    srcs = [
+        "rsa_keygen_checkq_too_close_test.s",
+    ],
+    exp = "rsa_keygen_checkq_too_close_test.exp",
+    deps = [
+        ":fake_primality",
+        ":rsa_keygen_checkpq_test_data",
+        "//sw/otbn/crypto:div",
+        "//sw/otbn/crypto:gcd",
+        "//sw/otbn/crypto:lcm",
+        "//sw/otbn/crypto:montmul",
+        "//sw/otbn/crypto:mul",
+        "//sw/otbn/crypto:rsa_keygen",
+    ],
+)
+
 otbn_sim_test(
     name = "rsa_1024_dec_test",
     timeout = "long",
@@ -662,6 +918,69 @@ otbn_sim_test(
     ],
 )
 
+otbn_sim_test(
+    name = "rsa_2048_dec_test",
+    timeout = "eternal",
+    srcs = [
+        "rsa_2048_dec_test.s",
+    ],
+    exp = "rsa_2048_dec_test.exp",
+    deps = [
+        "//sw/otbn/crypto:modexp",
+        "//sw/otbn/crypto:montmul",
+    ],
+)
+
+otbn_sim_test(
+    name = "rsa_2048_enc_test",
+    srcs = [
+        "rsa_2048_enc_test.s",
+    ],
+    exp = "rsa_2048_enc_test.exp",
+    deps = [
+        "//sw/otbn/crypto:modexp",
+        "//sw/otbn/crypto:montmul",
+    ],
+)
+
+otbn_sim_test(
+    name = "rsa_3072_dec_test",
+    timeout = "eternal",
+    srcs = [
+        "rsa_3072_dec_test.s",
+    ],
+    exp = "rsa_3072_dec_test.exp",
+    tags = ["nightly"],  # slow, do not run in CI
+    deps = [
+        "//sw/otbn/crypto:modexp",
+        "//sw/otbn/crypto:montmul",
+    ],
+)
+
+otbn_sim_test(
+    name = "rsa_3072_enc_test",
+    srcs = [
+        "rsa_3072_enc_test.s",
+    ],
+    exp = "rsa_3072_enc_test.exp",
+    deps = [
+        "//sw/otbn/crypto:modexp",
+        "//sw/otbn/crypto:montmul",
+    ],
+)
+
+otbn_sim_test(
+    name = "rsa_4096_enc_test",
+    srcs = [
+        "rsa_4096_enc_test.s",
+    ],
+    exp = "rsa_4096_enc_test.exp",
+    deps = [
+        "//sw/otbn/crypto:modexp",
+        "//sw/otbn/crypto:montmul",
+    ],
+)
+
 otbn_sim_test(
     name = "rsa_verify_test",
     srcs = [
@@ -764,11 +1083,23 @@ otbn_sim_test(
 )
 
 otbn_sim_test(
-    name = "x25519_test",
+    name = "x25519_test1",
+    srcs = [
+        "x25519_test1.s",
+    ],
+    exp = "x25519_test1.exp",
+    deps = [
+        "//sw/otbn/crypto:field25519",
+        "//sw/otbn/crypto:x25519",
+    ],
+)
+
+otbn_sim_test(
+    name = "x25519_test2",
     srcs = [
-        "x25519_test.s",
+        "x25519_test2.s",
     ],
-    exp = "x25519_test.exp",
+    exp = "x25519_test2.exp",
     deps = [
         "//sw/otbn/crypto:field25519",
         "//sw/otbn/crypto:x25519",
@@ -779,6 +1110,6 @@ otbn_consttime_test(
     name = "x25519_consttime",
     subroutine = "X25519",
     deps = [
-        ":x25519_test",
+        ":x25519_test1",
     ],
 )
diff --git a/sw/otbn/crypto/tests/ed25519_ext_add_test.s b/sw/otbn/crypto/tests/ed25519_ext_add_test.s
index f280d18a3a6b5..cfbb901374bba 100644
--- a/sw/otbn/crypto/tests/ed25519_ext_add_test.s
+++ b/sw/otbn/crypto/tests/ed25519_ext_add_test.s
@@ -23,7 +23,7 @@ main:
   li      x2, 2
   la      x3, modulus
   bn.lid  x2, 0(x3)
-  bn.wsrw 0x0, w2
+  bn.wsrw MOD, w2
 
   /* w19 <= 19 */
   bn.addi w19, w31, 19
diff --git a/sw/otbn/crypto/tests/fake_primality.s b/sw/otbn/crypto/tests/fake_primality.s
new file mode 100644
index 0000000000000..857ee20b48e05
--- /dev/null
+++ b/sw/otbn/crypto/tests/fake_primality.s
@@ -0,0 +1,15 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Fake primality-test routine.
+ *
+ * This will cause an error if called; it should be used for tests where
+ * calling a full primality test indicates failure (such as a test in which the
+ * candidate prime should fail earlier checks before being evaluated for
+ * primality).
+ */
+.globl miller_rabin
+miller_rabin:
+  unimp
diff --git a/sw/otbn/crypto/tests/field25519_test.s b/sw/otbn/crypto/tests/field25519_test.s
index e05aa848d3f10..7bf7ea55d7e69 100644
--- a/sw/otbn/crypto/tests/field25519_test.s
+++ b/sw/otbn/crypto/tests/field25519_test.s
@@ -19,7 +19,7 @@ main:
   li      x2, 2
   la      x3, modulus
   bn.lid  x2, 0(x3)
-  bn.wsrw 0x0, w2
+  bn.wsrw MOD, w2
 
   /* w19 <= 19 */
   bn.addi w19, w31, 19
diff --git a/sw/otbn/crypto/tests/p256_arithmetic_to_boolean_mod_test.exp b/sw/otbn/crypto/tests/p256_arithmetic_to_boolean_mod_test.exp
new file mode 100644
index 0000000000000..35dacd69b5f07
--- /dev/null
+++ b/sw/otbn/crypto/tests/p256_arithmetic_to_boolean_mod_test.exp
@@ -0,0 +1,2 @@
+# Expected values:
+w0 = 0x0000000000000000000000000000000000000000000000000000000000000000
diff --git a/sw/otbn/crypto/tests/p256_arithmetic_to_boolean_mod_test.s b/sw/otbn/crypto/tests/p256_arithmetic_to_boolean_mod_test.s
new file mode 100644
index 0000000000000..57de22159e168
--- /dev/null
+++ b/sw/otbn/crypto/tests/p256_arithmetic_to_boolean_mod_test.s
@@ -0,0 +1,79 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Standalone elliptic curve P-256 arithmetic-to-boolean masking test
+ *
+ * Uses OTBN ECC P-256 lib to perform arithmetic-to-boolean conversion of
+ * a given masked curve value with a random mask. Afterwards it unmasks the
+ * result and compares it with the initial value from DMEM.
+ */
+
+.section .text.start
+
+p256_arithmetic_to_boolean_test:
+
+  /* init all-zero register */
+  bn.xor    w31, w31, w31
+
+  /* Load domain parameter.
+     w29 = dmem[p256_p] */
+  li        x2, 29
+  la        x4, p256_p
+  bn.lid    x2, 0(x4)
+
+  /* Set MOD to p */
+  bn.wsrw   MOD, w29
+
+  /* Load values into WDRs */
+
+  /* w11 <= dmem[x] mod p */
+  li        x3, 11
+  la        x4, x
+  bn.lid    x3, 0(x4)
+  bn.addm   w11, w11, w31
+
+  /* w19 <= URND mod p */
+  bn.wsrr   w19, URND
+  bn.addm   w19, w19, w31
+
+  /* Arithmetic masking */
+
+  /* w11 = A <= w11 - w19 = x - r */
+  bn.subm    w11, w11, w19
+
+  /* Arithmetic to boolean conversion */
+  jal       x1, arithmetic_to_boolean_mod
+
+  /* Unmask and compare values
+     after conversion */
+
+  /* w20 <= w20 ^ w19 = x' ^ r = x */
+  bn.xor    w20, w20, w19
+
+  /* w10 <= dmem[x] mod p */
+  li        x3, 10
+  la        x4, x
+  bn.lid    x3, 0(x4)
+  bn.addm   w10, w10, w31
+
+  /* w0 <= w20 - w10 */
+  bn.sub    w0, w20, w10
+
+  ecall
+
+
+.data
+
+.globl x
+.balign 32
+x:
+  .word 0x2ab77ca0
+  .word 0x8031ceb8
+  .word 0xff3e1afa
+  .word 0x353ec814
+  .word 0x22fe027b
+  .word 0x8a29dc16
+  .word 0xf7109d54
+  .word 0x762c5d06
diff --git a/sw/otbn/crypto/tests/p256_arithmetic_to_boolean_test.exp b/sw/otbn/crypto/tests/p256_arithmetic_to_boolean_test.exp
new file mode 100644
index 0000000000000..879f5d55ea82c
--- /dev/null
+++ b/sw/otbn/crypto/tests/p256_arithmetic_to_boolean_test.exp
@@ -0,0 +1,3 @@
+# Expected values:
+w0 = 0x0000000000000000000000000000000000000000000000000000000000000000
+w1 = 0x0000000000000000000000000000000000000000000000000000000000000000
diff --git a/sw/otbn/crypto/tests/p256_arithmetic_to_boolean_test.s b/sw/otbn/crypto/tests/p256_arithmetic_to_boolean_test.s
new file mode 100644
index 0000000000000..19d4d8a07bec5
--- /dev/null
+++ b/sw/otbn/crypto/tests/p256_arithmetic_to_boolean_test.s
@@ -0,0 +1,108 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Standalone 257-bit arithmetic-to-boolean masking test
+ *
+ * Uses OTBN ECC P-256 lib to perform arithmetic-to-boolean conversion of
+ * a given masked 257-bit value with a random mask. Afterwards it unmasks the
+ * result and compares it with the initial value from DMEM.
+ */
+
+.section .text.start
+
+p256_arithmetic_to_boolean_test:
+
+  /* init all-zero register */
+  bn.xor    w31, w31, w31
+
+  /* Load domain parameter.
+     w29 = dmem[p256_p] */
+  li        x2, 29
+  la        x4, p256_p
+  bn.lid    x2, 0(x4)
+
+  /* Set MOD to p */
+  bn.wsrw   MOD, w29
+
+  /* Load values into WDRs */
+
+  /* w11 <= dmem[x_l] */
+  li        x3, 11
+  la        x4, x_l
+  bn.lid    x3, 0(x4)
+
+  /* w12 <= dmem[x_u] */
+  li        x3, 12
+  la        x4, x_u
+  bn.lid    x3, 0(x4)
+
+  /* w18 <= URND
+     w19 <= URND (1 bit) */
+  bn.wsrr   w18, URND
+  bn.wsrr   w19, URND
+  bn.rshi   w19, w31, w19 >> 255
+
+  /* Arithmetic masking */
+
+  /* [w12,w11] = A <= [w12,w11] - [w19,w18] mod 2^257 = x - r mod 2^257
+     This may result in bits above 2^257, but these will be stripped off. */
+  bn.sub    w11, w11, w18
+  bn.subb   w12, w12, w19
+  bn.rshi   w12, w12, w31 >> 1
+  bn.rshi   w12, w31, w12 >> 255
+
+  /* Arithmetic to boolean conversion */
+  jal       x1, arithmetic_to_boolean
+
+  /* Unmask and compare values
+     after conversion */
+
+  /* w20 <= w20 ^ w18 = x' ^ r
+     w21 <= w21 ^ w19 = x' ^ r */
+  bn.xor    w20, w20, w18
+  bn.xor    w21, w21, w19
+
+  /* w11 <= dmem[x_l] */
+  li        x3, 11
+  la        x4, x_l
+  bn.lid    x3, 0(x4)
+
+  /* w12 <= dmem[x_u] */
+  li        x3, 12
+  la        x4, x_u
+  bn.lid    x3, 0(x4)
+
+  /* [w1,w0] <= [w12,w11] - [w21,w20] */
+  bn.sub    w0, w11, w20
+  bn.subb   w1, w12, w21
+
+  ecall
+
+
+.data
+
+.globl x_u
+.balign 32
+x_u:
+  .word 0x00000001
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+
+.globl x_l
+.balign 32
+x_l:
+  .word 0x2ab77ca0
+  .word 0x8031ceb8
+  .word 0xff3e1afa
+  .word 0x353ec814
+  .word 0x22fe027b
+  .word 0x8a29dc16
+  .word 0xf7109d54
+  .word 0x762c5d06
diff --git a/sw/otbn/crypto/tests/p256_ecdh_shared_key_test.exp b/sw/otbn/crypto/tests/p256_ecdh_shared_key_test.exp
new file mode 100644
index 0000000000000..361cbdd59ac67
--- /dev/null
+++ b/sw/otbn/crypto/tests/p256_ecdh_shared_key_test.exp
@@ -0,0 +1,2 @@
+# Expected value for shared key:
+w11 = 0x5f33d746a326640a739a9490ec15c10372869f3de675b2e85742271d18c9eb82
diff --git a/sw/otbn/crypto/tests/p256_ecdh_shared_key_test.s b/sw/otbn/crypto/tests/p256_ecdh_shared_key_test.s
new file mode 100644
index 0000000000000..8e462ee3fe06a
--- /dev/null
+++ b/sw/otbn/crypto/tests/p256_ecdh_shared_key_test.s
@@ -0,0 +1,102 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Standalone elliptic curve P-256 ECDH shared key generation test
+ *
+ * Uses OTBN ECC P-256 lib to perform a scalar multiplication with a valid
+ * example curve point and an example scalar. Both scalar and coordinates of
+ * the curve point are contained in the .data section below.
+ * The x coordinate of the resulting curve point is masked arithmetically
+ * with a random value. As the x coordinate represents the actual
+ * shared key, the x coordinate and its mask are then converted from an
+ * arithmetic to a boolean masking scheme.
+ *
+ * The result of arithmetical unmasking as well as the result of boolean
+ * unmasking are compared with an expected value.
+ */
+
+.section .text.start
+
+p256_ecdh_shared_key_test:
+
+  /* Call P-256 shared key generation to get a boolean-masked key.
+       dmem[x] <= x0
+       dmem[y] <= x1 */
+  jal      x1, p256_shared_key
+
+  /* Load the two shares.
+       w11 <= dmem[x] = x0
+       w12 <= dmem[y] = x1 */
+  li        x3, 11
+  la        x4, x
+  bn.lid    x3++, 0(x4)
+  la        x4, y
+  bn.lid    x3, 0(x4)
+
+  /* Unmask the shared key, x.
+       w11 <= x0 ^ x1 = x */
+  bn.xor    w11, w11, w12
+
+  ecall
+
+
+.data
+
+/* Secret key d in arithmetic shares. */
+.globl d0
+.balign 32
+d0:
+  .word 0xfe6d1071
+  .word 0x21d0a016
+  .word 0xb0b2c781
+  .word 0x9590ef5d
+  .word 0x3fdfa379
+  .word 0x1b76ebe8
+  .word 0x74210263
+  .word 0x1420fc41
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+.globl d1
+.balign 32
+d1:
+  .zero 64
+
+/* example curve point x-coordinate */
+.globl x
+.balign 32
+x:
+  .word 0xbfa8c334
+  .word 0x9773b7b3
+  .word 0xf36b0689
+  .word 0x6ec0c0b2
+  .word 0xdb6c8bf3
+  .word 0x1628ce58
+  .word 0xfacdc546
+  .word 0xb5511a6a
+
+/* example curve point y-coordinate */
+.globl y
+.balign 32
+y:
+  .word 0x9e008c2e
+  .word 0xa8707058
+  .word 0xab9c6924
+  .word 0x7f7a11d0
+  .word 0xb53a17fa
+  .word 0x43dd09ea
+  .word 0x1f31c143
+  .word 0x42a1c697
+
+/* affine x-coordinate value before A2B */
+.globl x_a
+.balign 32
+x_a:
+  .zero 32
diff --git a/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s b/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s
index 56735db6132f3..01e20cc1dd822 100644
--- a/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s
+++ b/sw/otbn/crypto/tests/p256_ecdsa_sign_test.s
@@ -71,7 +71,7 @@ randomize_share:
 
   /* Get a 63-bit pseudorandom number.
        w0 <= URND()[255:193] = r */
-  bn.wsrr  w0, 0x2 /* URND*/
+  bn.wsrr  w0, URND
   bn.rshi  w0, w31, w0 >> 193
 
   /* Load the curve order n.
diff --git a/sw/otbn/crypto/tests/p256_isoncurve_test.exp b/sw/otbn/crypto/tests/p256_isoncurve_test.exp
index a35217fd56467..65d365eb27705 100644
--- a/sw/otbn/crypto/tests/p256_isoncurve_test.exp
+++ b/sw/otbn/crypto/tests/p256_isoncurve_test.exp
@@ -1,3 +1,3 @@
-# Expected values (w0=R, w1=S):
-w0 = 0xb103b614b389c6b8e1a08330a6ce0b9c4b3726ec0bf61f6bdd66af03a4af5660
-w1 = 0xb103b614b389c6b8e1a08330a6ce0b9c4b3726ec0bf61f6bdd66af03a4af5660
+# Expected values (w18=lhs, w19=rhs):
+w18 = 0xb103b614b389c6b8e1a08330a6ce0b9c4b3726ec0bf61f6bdd66af03a4af5660
+w19 = 0xb103b614b389c6b8e1a08330a6ce0b9c4b3726ec0bf61f6bdd66af03a4af5660
diff --git a/sw/otbn/crypto/tests/p256_isoncurve_test.s b/sw/otbn/crypto/tests/p256_isoncurve_test.s
index 4e8dad49cd4e9..78336bb666455 100644
--- a/sw/otbn/crypto/tests/p256_isoncurve_test.s
+++ b/sw/otbn/crypto/tests/p256_isoncurve_test.s
@@ -13,17 +13,14 @@
 .section .text.start
 
 p256_oncurve_test:
+  /* Initialize all-zero register. */
+  bn.xor   w31, w31, w31
 
-  /* call curve point test routine in P-256 lib */
+  /* Compute both sides of the Weierstrass equation.
+       w18 <= lhs
+       w19 <= rhs */
   jal      x1, p256_isoncurve
 
-  /* load result to WDRs for comparison with reference */
-  li        x2, 0
-  la        x3, r
-  bn.lid    x2++, 0(x3)
-  la        x3, s
-  bn.lid    x2, 0(x3)
-
   ecall
 
 
diff --git a/sw/otbn/crypto/tests/p256_key_from_seed_test.exp b/sw/otbn/crypto/tests/p256_key_from_seed_test.exp
index 0b6ff554a2321..5e74e684e24a0 100644
--- a/sw/otbn/crypto/tests/p256_key_from_seed_test.exp
+++ b/sw/otbn/crypto/tests/p256_key_from_seed_test.exp
@@ -2,6 +2,6 @@
 w20 = 0x9def3b61bc577b4b45c0f8b23ed867e3302b5143e9e71859e3ef3615df0ace13
 w21 = 0xe46bcaf84b3890e1
 
-# [w23, w22]: d1
-w22 = 0x17bcfeef551f77d199dd9f5af7d1a8736f2f939abeb67c9e2df4bec0225596d6
-w23 = 0x63e2e86d4e67f1f7
+# [w10, w11]: d1
+w10 = 0x17bcfeef551f77d199dd9f5af7d1a8736f2f939abeb67c9e2df4bec0225596d6
+w11 = 0x63e2e86d4e67f1f7
diff --git a/sw/otbn/crypto/tests/p256_key_from_seed_test.s b/sw/otbn/crypto/tests/p256_key_from_seed_test.s
index 69bbcbefa894e..813028b654ff8 100644
--- a/sw/otbn/crypto/tests/p256_key_from_seed_test.s
+++ b/sw/otbn/crypto/tests/p256_key_from_seed_test.s
@@ -13,17 +13,16 @@ key_from_seed_test:
   bn.xor    w31, w31, w31
 
   /* Load shares of seed from DMEM.
-       [w21,w20] <= dmem[seed0]
-       [w23,w33] <= dmem[seed1] */
+       [w20,w21] <= dmem[seed0]
+       [w10,w11] <= dmem[seed1] */
   li        x2, 20
   la        x3, seed0
-  bn.lid    x2, 0(x3++)
-  li        x2, 21
   bn.lid    x2++, 0(x3)
+  bn.lid    x2, 32(x3)
+  li        x2, 10
   la        x3, seed1
-  bn.lid    x2, 0(x3++)
-  li        x2, 23
-  bn.lid    x2, 0(x3)
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2, 32(x3)
 
   /* Generate the derived secret key. */
   jal       x1, p256_key_from_seed
diff --git a/sw/otbn/crypto/tests/p256_mul_modp_test.exp b/sw/otbn/crypto/tests/p256_mul_modp_test.exp
new file mode 100644
index 0000000000000..9675959613abe
--- /dev/null
+++ b/sw/otbn/crypto/tests/p256_mul_modp_test.exp
@@ -0,0 +1 @@
+w19 = 0x3cc57c50d0f2d26fc7bff844a3cdcf866f47b074f3171d5711bacbe3045443a6
diff --git a/sw/otbn/crypto/tests/p256_mul_modp_test.s b/sw/otbn/crypto/tests/p256_mul_modp_test.s
new file mode 100644
index 0000000000000..7ef7f6e879708
--- /dev/null
+++ b/sw/otbn/crypto/tests/p256_mul_modp_test.s
@@ -0,0 +1,70 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Standalone test for P-256 field multiplication.
+ */
+
+.section .text.start
+start:
+  /* Initialize all-zero register (used as the zero operand below). */
+  bn.xor    w31, w31, w31
+
+  /* Load the modulus, p, and copy it into the MOD WSR.
+     MOD <= w29 <= dmem[p256_p] = p */
+  li        x2, 29
+  la        x3, p256_p
+  bn.lid    x2, 0(x3)
+  bn.wsrw   MOD, w29
+
+  /* Compute the constant r256 for reduction modulo p (w31 is zero here).
+       w28 <= 0 - p mod 2^256 = 2^256 - p = r256 */
+  bn.sub   w28, w31, w29
+
+  /* Load the other reduction constant, reusing w29 (p is already in MOD).
+     w29 <= dmem[p256_r448] = r448 */
+  li        x2, 29
+  la        x3, p256_r448
+  bn.lid    x2, 0(x3)
+
+  /* Load the operands from this file's .data section.
+       w24 <= dmem[value_a] = a
+       w25 <= dmem[value_b] = b */
+  li        x2, 24
+  la        x3, value_a
+  bn.lid    x2++, 0(x3)
+  la        x3, value_b
+  bn.lid    x2, 0(x3)
+
+  /* Run modular multiplication; w19 is the register checked by the test.
+       w19 <= (w24 * w25) mod p */
+  jal       x1, mul_modp
+
+  ecall
+
+.data
+
+/* First operand, a.
+   = 0xa8da539ffce03337030a5a44bcd3266608a32b364bb3295cace17a9da3175abc */
+value_a:
+.word 0xa3175abc
+.word 0xace17a9d
+.word 0x4bb3295c
+.word 0x08a32b36
+.word 0xbcd32666
+.word 0x030a5a44
+.word 0xfce03337
+.word 0xa8da539f
+
+/* Second operand, b.
+   = 0x72c7c6bec94cf13ab2a1c47c60cb522e04a0e4330df8714c96a2db313c873171 */
+value_b:
+.word 0x3c873171
+.word 0x96a2db31
+.word 0x0df8714c
+.word 0x04a0e433
+.word 0x60cb522e
+.word 0xb2a1c47c
+.word 0xc94cf13a
+.word 0x72c7c6be
diff --git a/sw/otbn/crypto/tests/p256_proj_add_test.s b/sw/otbn/crypto/tests/p256_proj_add_test.s
index cdfb295b4d8b1..98d662e7dcb6f 100644
--- a/sw/otbn/crypto/tests/p256_proj_add_test.s
+++ b/sw/otbn/crypto/tests/p256_proj_add_test.s
@@ -37,20 +37,24 @@ p256_proj_add_test:
   la        x3, p256_b
   bn.lid    x2, 0(x3)
 
-  /* load lower 256 bit of Barrett constant u for modulus p from dmem
-     w28 <= u = dmem[p256_u_p] */
-  li        x2, 28
-  la        x3, p256_u_p
-  bn.lid    x2, 0(x3)
-
   /* load field modulus p from dmem
-     w29 <= p = dmem[p256_p] */
+     MOD <= w29 <= p = dmem[p256_p] */
   li        x2, 29
   la        x3, p256_p
   bn.lid    x2, 0(x3)
 
   /* store modulus to MOD WSR */
-  bn.wsrw   0, w29
+  bn.wsrw   MOD, w29
+
+  /* Compute r256 for reduction modulo p: w28 <= 2^256 - p = r256.
+     NOTE(review): needs w31 == 0, yet w31 is cleared only below; confirm. */
+  bn.sub   w28, w31, w29
+
+  /* Load the other constant for reduction modulo p.
+     w29 <= dmem[p256_r448] = r448 */
+  li        x2, 29
+  la        x3, p256_r448
+  bn.lid    x2, 0(x3)
 
   /* init all-zero reg */
   bn.xor   w31, w31, w31
diff --git a/sw/otbn/crypto/tests/p256_scalar_mult_test.exp b/sw/otbn/crypto/tests/p256_scalar_mult_test.exp
index 8e66d56142ece..9edc88acb23b2 100644
--- a/sw/otbn/crypto/tests/p256_scalar_mult_test.exp
+++ b/sw/otbn/crypto/tests/p256_scalar_mult_test.exp
@@ -1,3 +1,3 @@
-# Expected values (w0=X, w1=Y):
-w0 = 0x5f33d746a326640a739a9490ec15c10372869f3de675b2e85742271d18c9eb82
-w1 = 0xb5ebbd1e4ac99c9e3d70a862e41fe23ace6ab34f7ac9f99a4c403defb76c462d
+# Expected values (w11=X, w12=Y):
+w11 = 0x5f33d746a326640a739a9490ec15c10372869f3de675b2e85742271d18c9eb82
+w12 = 0xb5ebbd1e4ac99c9e3d70a862e41fe23ace6ab34f7ac9f99a4c403defb76c462d
diff --git a/sw/otbn/crypto/tests/p256_scalar_mult_test.s b/sw/otbn/crypto/tests/p256_scalar_mult_test.s
index a4e594077c9bf..2457b36d72d5a 100644
--- a/sw/otbn/crypto/tests/p256_scalar_mult_test.s
+++ b/sw/otbn/crypto/tests/p256_scalar_mult_test.s
@@ -9,23 +9,43 @@
  * example curve point and an example scalar. Both scalar and coordinates of
  * the curve point are contained in the .data section below.
  *
- * x and y cordinates of the resulting curve points are copied to wide
- * registers. See comment at the end of the file for expected values.
+ * x and y coordinates of the resulting curve point are converted to
+ * affine form and left in wide registers (w11 and w12).
  */
 
 .section .text.start
 
 scalar_mult_test:
+  /* Init all-zero register. */
+  bn.xor    w31, w31, w31
 
-  /* call scalar point multiplication routine in P-256 lib */
-  jal      x1, p256_scalar_mult
+  /* Load first share of scalar k from dmem.
+       w0,w1 = dmem[k0] */
+  la        x16, k0
+  li        x2, 0
+  bn.lid    x2, 0(x16++)
+  li        x2, 1
+  bn.lid    x2, 0(x16)
 
-  /* copy result to wide reg file */
-  li       x2, 0
-  la       x3, x
-  bn.lid   x2++, 0(x3)
-  la       x3, y
-  bn.lid   x2, 0(x3)
+  /* Load second share of scalar k from dmem.
+       w2,w3 = dmem[k1] */
+  la        x16, k1
+  li        x2, 2
+  bn.lid    x2, 0(x16++)
+  li        x2, 3
+  bn.lid    x2, 0(x16)
+
+  /* Call internal scalar multiplication routine.
+     Returns point in projective coordinates.
+     (w8, w9, w10) <= (X, Y, Z) = k*(x,y) */
+  la        x21, x
+  la        x22, y
+  jal       x1, scalar_mult_int
+
+  /* Convert to affine coordinates.
+       w11 <= x
+       w12 <= y */
+  jal       x1, proj_to_affine
 
   ecall
 
diff --git a/sw/otbn/crypto/tests/p384_arithmetic_to_boolean_mod_test.exp b/sw/otbn/crypto/tests/p384_arithmetic_to_boolean_mod_test.exp
new file mode 100644
index 0000000000000..879f5d55ea82c
--- /dev/null
+++ b/sw/otbn/crypto/tests/p384_arithmetic_to_boolean_mod_test.exp
@@ -0,0 +1,3 @@
+# Expected values:
+w0 = 0x0000000000000000000000000000000000000000000000000000000000000000
+w1 = 0x0000000000000000000000000000000000000000000000000000000000000000
diff --git a/sw/otbn/crypto/tests/p384_arithmetic_to_boolean_mod_test.s b/sw/otbn/crypto/tests/p384_arithmetic_to_boolean_mod_test.s
new file mode 100644
index 0000000000000..363a053ee1616
--- /dev/null
+++ b/sw/otbn/crypto/tests/p384_arithmetic_to_boolean_mod_test.s
@@ -0,0 +1,133 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Standalone elliptic curve P-384 arithmetic-to-boolean masking test
+ *
+ * Uses OTBN ECC P-384 lib to perform arithmetic-to-boolean conversion of
+ * a given masked curve value with a random mask. Afterwards it unmasks the
+ * result and compares it with the initial value from DMEM.
+ */
+
+.section .text.start
+
+p384_arithmetic_to_boolean_mod_test:
+
+  /* init all-zero register */
+  bn.xor    w31, w31, w31
+
+  /* Load domain parameter.
+     [w13,w12] = dmem[p384_p] */
+  li        x2, 12
+  la        x4, p384_p
+  bn.lid    x2++, 0(x4)
+  bn.lid    x2++, 32(x4)
+
+  /* Load values into WDRs */
+
+  /* [w20,w19,w18] <= dmem[x] (512 bits from DMEM, upper word w20 zeroed) */
+  li        x3, 18
+  la        x4, x
+  bn.lid    x3++, 0(x4)
+  bn.lid    x3++, 32(x4)
+  bn.mov    w20, w31
+
+  /* Reduce x mod p and write the reduced value back for the final compare.
+     [w5,w4] <= [w20,w19,w18] mod [w13,w12] = x mod p
+     dmem[x] <= [w31,w5,w4] = x mod p */
+  jal       x1, p384_reduce_p
+  bn.mov    w4, w16
+  bn.mov    w5, w17
+  li        x3, 4
+  la        x4, x
+  bn.sid    x3++, 0(x4)
+  bn.sid    x3++, 32(x4)
+  li        x3, 31
+  bn.sid    x3, 64(x4)      /* NOTE(review): x is 64 bytes here; this writes past it - confirm */
+
+  /* [w20,w19,w18] <= URND = r */
+  bn.wsrr   w18, URND
+  bn.wsrr   w19, URND
+  bn.wsrr   w20, URND
+
+  /* Reduce r mod p
+     [w7,w6] <= [w20,w19,w18] mod [w13,w12] = r mod p */
+  jal       x1, p384_reduce_p
+  bn.mov    w6, w16
+  bn.mov    w7, w17
+
+  /* Arithmetic masking.
+     [w12,w11] = A <= [w5,w4] - [w7,w6] mod [w13,w12] = x - r mod p */
+
+  /* [w19,w18] = A1 <= [w5,w4] - [w7,w6] = x - r */
+  bn.sub    w18, w4, w6
+  bn.subb   w19, w5, w7
+
+  /* [w17,w16] = A2 <= [w19,w18] + [w13,w12] = A1 + p = x - r + p */
+  bn.add    w16, w18, w12
+  bn.addc   w17, w19, w13
+
+  /* If x >= r: [w12,w11] <= A1, else: [w12,w11] <= A2 */
+  bn.sub    w0, w4, w6      /* w0, w1 are scratch; only the flags matter */
+  bn.subb   w1, w5, w7      /* FG0.C set <=> x < r */
+  bn.sel    w11, w16, w18, FG0.C
+  bn.sel    w12, w17, w19, FG0.C
+
+  /* Load domain parameter again (w12 now holds part of A).
+     [w14,w13] = dmem[p384_p] */
+  li        x2, 13
+  la        x4, p384_p
+  bn.lid    x2++, 0(x4)
+  bn.lid    x2++, 32(x4)
+
+  /* Move mask r to input registers.
+     [w19,w18] <= [w7,w6] = r */
+  bn.mov    w18, w6
+  bn.mov    w19, w7
+
+  /* Arithmetic to boolean conversion */
+  jal       x1, p384_arithmetic_to_boolean_mod
+
+  /* Unmask and compare values
+     after conversion */
+
+  /* w20 <= w20 ^ w18 = x' ^ r
+     w21 <= w21 ^ w19 = x' ^ r */
+  bn.xor    w20, w20, w18
+  bn.xor    w21, w21, w19
+
+  /* [w5,w4] <= dmem[x] = x mod p */
+  li        x3, 4
+  la        x4, x
+  bn.lid    x3++, 0(x4)
+  bn.lid    x3++, 32(x4)
+
+  /* [w1,w0] <= [w5,w4] - [w21,w20] (the test expects zero in both) */
+  bn.sub    w0, w4, w20
+  bn.subb   w1, w5, w21
+
+  ecall
+
+
+.data
+
+.globl x
+.balign 32
+x:
+  .word 0xab0f7698
+  .word 0xc85b787e
+  .word 0x9d9c9644
+  .word 0x9f740ded
+  .word 0xa1b6fca8
+  .word 0x8cd4a7b3
+  .word 0x9f7fdc63
+  .word 0x74013528
+  .word 0x2ab77ca0
+  .word 0x8031ceb8
+  .word 0xff3e1afa
+  .word 0x353ec814
+  .word 0x22fe027b
+  .word 0x8a29dc16
+  .word 0xf7109d54
+  .word 0x762c5d06
diff --git a/sw/otbn/crypto/tests/p384_arithmetic_to_boolean_test.exp b/sw/otbn/crypto/tests/p384_arithmetic_to_boolean_test.exp
new file mode 100644
index 0000000000000..879f5d55ea82c
--- /dev/null
+++ b/sw/otbn/crypto/tests/p384_arithmetic_to_boolean_test.exp
@@ -0,0 +1,3 @@
+# Expected values:
+w0 = 0x0000000000000000000000000000000000000000000000000000000000000000
+w1 = 0x0000000000000000000000000000000000000000000000000000000000000000
diff --git a/sw/otbn/crypto/tests/p384_arithmetic_to_boolean_test.s b/sw/otbn/crypto/tests/p384_arithmetic_to_boolean_test.s
new file mode 100644
index 0000000000000..f7dac6a265187
--- /dev/null
+++ b/sw/otbn/crypto/tests/p384_arithmetic_to_boolean_test.s
@@ -0,0 +1,99 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Standalone 385-bit arithmetic-to-boolean masking test
+ *
+ * Uses OTBN ECC P-384 lib to perform arithmetic-to-boolean conversion of
+ * a given masked 385-bit value with a random mask. Afterwards it unmasks the
+ * result and compares it with the initial value from DMEM.
+ */
+
+.section .text.start
+
+p384_arithmetic_to_boolean_test:
+
+  /* init all-zero register */
+  bn.xor    w31, w31, w31
+
+  /* Load the 385-bit test value x into WDRs */
+
+  /* w11 <= dmem[x_l] (lower 256 bits of x) */
+  li        x3, 11
+  la        x4, x_l
+  bn.lid    x3, 0(x4)
+
+  /* w12 <= dmem[x_u] (upper 129 bits of x) */
+  li        x3, 12
+  la        x4, x_u
+  bn.lid    x3, 0(x4)
+
+  /* Draw the random mask r = [w19,w18].
+     w18 <= URND, w19 <= URND >> 127 (129 bits) */
+  bn.wsrr   w18, URND
+  bn.wsrr   w19, URND
+  bn.rshi   w19, w31, w19 >> 127
+
+  /* Arithmetic masking */
+
+  /* [w12,w11] = A <= [w12,w11] - [w19,w18] mod 2^385 = x - r mod 2^385
+     This may result in bits above 2^385, but these will be stripped off (-> mod 2^385). */
+  bn.sub    w11, w11, w18
+  bn.subb   w12, w12, w19
+  bn.rshi   w12, w12, w31 >> 129      /* w12 <= w12 << 127 (drops the top 127 bits) */
+  bn.rshi   w12, w31, w12 >> 127      /* w12 <= w12 >> 127 (low 129 bits remain) */
+
+  /* Arithmetic to boolean conversion */
+  jal       x1, p384_arithmetic_to_boolean
+
+  /* Unmask and compare values
+     after conversion */
+
+  /* w20 <= w20 ^ w18 = x' ^ r (lower part)
+     w21 <= w21 ^ w19 = x' ^ r (upper part) */
+  bn.xor    w20, w20, w18
+  bn.xor    w21, w21, w19
+
+  /* Reload the original value: w11 <= dmem[x_l] */
+  li        x3, 11
+  la        x4, x_l
+  bn.lid    x3, 0(x4)
+
+  /* w12 <= dmem[x_u] */
+  li        x3, 12
+  la        x4, x_u
+  bn.lid    x3, 0(x4)
+
+  /* [w1,w0] <= [w12,w11] - [w21,w20] (the test expects zero in both) */
+  bn.sub    w0, w11, w20
+  bn.subb   w1, w12, w21
+
+  ecall
+
+
+.data
+
+.globl x_u
+.balign 32
+x_u:
+  .word 0xab0f7698
+  .word 0xc85b787e
+  .word 0x9d9c9644
+  .word 0x9f740ded
+  .word 0x00000001
+  .word 0x00000000
+  .word 0x00000000
+  .word 0x00000000
+
+.globl x_l
+.balign 32
+x_l:
+  .word 0x2ab77ca0
+  .word 0x8031ceb8
+  .word 0xff3e1afa
+  .word 0x353ec814
+  .word 0x22fe027b
+  .word 0x8a29dc16
+  .word 0xf7109d54
+  .word 0x762c5d06
diff --git a/sw/otbn/crypto/tests/p384_base_mult_test.s b/sw/otbn/crypto/tests/p384_base_mult_test.s
index 682f599f28268..7a314a558ffb0 100644
--- a/sw/otbn/crypto/tests/p384_base_mult_test.s
+++ b/sw/otbn/crypto/tests/p384_base_mult_test.s
@@ -16,14 +16,14 @@
 
 p384_base_mult_test:
 
-  /* set dmem pointer to point to scalar (private key) d */
-  la       x2, scalar
-  la       x3, dptr_d
+  /* set dmem pointer to point to 1st scalar share d0 (private key) */
+  la       x2, d0
+  la       x3, dptr_d0
   sw       x2, 0(x3)
 
-  /* set dmem pointer to point to blinding parameter */
-  la       x2, blinding_param
-  la       x3, dptr_rnd
+  /* set dmem pointer to point to 2nd scalar share d1 (private key) */
+  la       x2, d1
+  la       x3, dptr_d1
   sw       x2, 0(x3)
 
   /* set dmem pointer to point to x-coordinate */
@@ -53,7 +53,43 @@ p384_base_mult_test:
 
 .section .data
 
-/* scalar d */
+/* 1st scalar share d0 (448-bit) */
+d0:
+  .word 0x5c832a51
+  .word 0x3eb17c27
+  .word 0x9a0c1b76
+  .word 0x6e001281
+  .word 0x4de8344e
+  .word 0x5b7d3b0f
+  .word 0x96d2f9e0
+  .word 0x1e9d19e7
+  .word 0x16f5c1ee
+  .word 0x800a4c94
+  .word 0xe14cd8df
+  .word 0xadb9ce1b
+  .word 0x8677a5f2
+  .word 0x32f9e2b0
+  .zero 8
+
+/* 2nd scalar share d1 (448-bit) */
+d1:
+  .word 0x33eae098
+  .word 0xd31b18d5
+  .word 0x507568cd
+  .word 0xab8fb14d
+  .word 0x9ef51898
+  .word 0x44676e61
+  .word 0x9cb814d9
+  .word 0x4ad22b6e
+  .word 0x8930f243
+  .word 0xb706d682
+  .word 0xa9da1611
+  .word 0x13e7014a
+  .word 0x9ec9b430
+  .word 0x9e5dc598
+  .zero 8
+
+/* scalar d = (d0 + d1) mod n (384-bit) */
 scalar:
   .word 0xe8791ba3
   .word 0xf549e1f7
@@ -69,22 +105,6 @@ scalar:
   .word 0xc1a0cf66
   .zero 16
 
-   /* blinding parameter rnd */
- blinding_param:
-  .word 0xa82c85b0
-  .word 0x163ce1c8
-  .word 0x32518fd7
-  .word 0xf8a428cd
-  .word 0xf5b9d867
-  .word 0x00906f5f
-  .word 0x7387b4f2
-  .word 0xa2d3da7a
-  .word 0xebe0a647
-  .word 0xfb2ef7ca
-  .word 0x74249432
-  .word 0x230e5ff6
-  .zero 16
-
 /* result buffer x-coordinate */
 p1_x:
   .zero 64
diff --git a/sw/otbn/crypto/tests/p384_curve_point_valid_test.exp b/sw/otbn/crypto/tests/p384_curve_point_valid_test.exp
new file mode 100644
index 0000000000000..cb88c083b8417
--- /dev/null
+++ b/sw/otbn/crypto/tests/p384_curve_point_valid_test.exp
@@ -0,0 +1,2 @@
+# This test doesn't require expected WDR values,
+# it just needs to complete without fault.
diff --git a/sw/otbn/crypto/tests/p384_curve_point_valid_test.s b/sw/otbn/crypto/tests/p384_curve_point_valid_test.s
new file mode 100644
index 0000000000000..dca9ed3ec4a20
--- /dev/null
+++ b/sw/otbn/crypto/tests/p384_curve_point_valid_test.s
@@ -0,0 +1,88 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Standalone elliptic curve P-384 curve point validity test
+ *
+ * Uses OTBN ECC P-384 lib to run the curve point validity check on an
+ * example curve point. The coordinates of the curve point are contained
+ * in the .data section below.
+ *
+ * The test stores the addresses of the coordinates in dptr_x and dptr_y
+ * and then calls p384_curve_point_valid.
+ *
+ * No expected WDR values are compared afterwards: the test only
+ * has to run through to the final ecall without raising a
+ * fault.
+ */
+
+.section .text.start
+
+p384_curve_point_valid_test:
+  /* Set pointer to x coordinate: dmem[dptr_x] <= address of x */
+  la        x3, dptr_x
+  la        x4, x
+  sw        x4, 0(x3)
+
+  /* Set pointer to y coordinate: dmem[dptr_y] <= address of y */
+  la        x3, dptr_y
+  la        x4, y
+  sw        x4, 0(x3)
+
+  /* Init all-zero register. */
+  bn.xor    w31, w31, w31
+
+  jal       x1, p384_curve_point_valid
+
+  ecall
+
+.data
+
+/* pointer to x-coordinate (dptr_x) */
+.globl dptr_x
+.balign 4
+dptr_x:
+  .zero 4
+
+/* pointer to y-coordinate (dptr_y) */
+.globl dptr_y
+.balign 4
+dptr_y:
+  .zero 4
+
+/* Curve point x-coordinate. */
+.globl x
+.balign 32
+x:
+  .word 0x4877f3d1
+  .word 0x7b829460
+  .word 0xb1cac609
+  .word 0x5869de54
+  .word 0xee0e2beb
+  .word 0x6c30f2d8
+  .word 0x47e80661
+  .word 0x394d8b70
+  .word 0xcf60d89e
+  .word 0x1a9ea916
+  .word 0xb439d701
+  .word 0xca230836
+  .zero 16
+
+/* Curve point y-coordinate. */
+.globl y
+.balign 32
+y:
+  .word 0xc181f90f
+  .word 0xc31ef079
+  .word 0xbf3aff6e
+  .word 0xc7e55880
+  .word 0xec18818c
+  .word 0xcea028a9
+  .word 0x928c3e92
+  .word 0x82b63bf3
+  .word 0xd65e905d
+  .word 0x68eef2d1
+  .word 0x03afe2c2
+  .word 0xaaafcad2
+  .zero 16
diff --git a/sw/otbn/crypto/tests/p384_ecdh_shared_key_test.exp b/sw/otbn/crypto/tests/p384_ecdh_shared_key_test.exp
new file mode 100644
index 0000000000000..88b391064b20c
--- /dev/null
+++ b/sw/otbn/crypto/tests/p384_ecdh_shared_key_test.exp
@@ -0,0 +1,4 @@
+# Expected values:
+# [w1, w0] is unmasked shared key
+w0  = 0x6c5d59dbafa8ecbaf0b2d3c1e818325403634e3b86956e6ead6739217b702c4a
+w1  = 0x00000000000000000000000000000000d177aa22a7c535a28cae00d420c4cd27
diff --git a/sw/otbn/crypto/tests/p384_ecdh_shared_key_test.s b/sw/otbn/crypto/tests/p384_ecdh_shared_key_test.s
new file mode 100644
index 0000000000000..b950b76b620d1
--- /dev/null
+++ b/sw/otbn/crypto/tests/p384_ecdh_shared_key_test.s
@@ -0,0 +1,165 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Standalone elliptic curve P-384 ECDH shared key generation test
+ *
+ * Uses OTBN ECC P-384 lib to perform a scalar multiplication with a valid
+ * example curve point and an example scalar. Both scalar and coordinates of
+ * the curve point are contained in the .data section below.
+ * The x coordinate of the resulting curve point is masked arithmetically
+ * with a random value. As the x coordinate represents the actual
+ * shared key, the x coordinate and its mask are then converted from an
+ * arithmetic to a boolean masking scheme.
+ *
+ * The result of boolean unmasking is then compared with the expected shared
+ * key value.
+ */
+
+.section .text.start
+
+p384_ecdh_shared_key_test:
+  /* init all-zero register */
+  bn.xor    w31, w31, w31
+
+  /* set dmem pointer to point to x-coordinate */
+  la       x2, p1_x
+  la       x3, dptr_x
+  sw       x2, 0(x3)
+
+  /* set dmem pointer to point to y-coordinate */
+  la       x2, p1_y
+  la       x3, dptr_y
+  sw       x2, 0(x3)
+
+  /* set dmem pointer to point to 1st scalar share k0 */
+  la       x2, k0
+  la       x3, dptr_k0
+  sw       x2, 0(x3)
+
+  /* set dmem pointer to point to 2nd scalar share k1 */
+  la       x2, k1
+  la       x3, dptr_k1
+  sw       x2, 0(x3)
+
+  /* call scalar point multiplication routine in P-384 lib */
+  jal      x1, p384_scalar_mult
+
+  /* load result to WDRs for unmasking and comparison with reference
+     [w12,w11] <= dmem[p1_x] = x_m
+     [w19,w18] <= dmem[p1_y] = m */
+  li        x2, 11
+  la        x3, p1_x
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2++, 32(x3)
+  li        x2, 18
+  la        x3, p1_y
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2, 32(x3)
+
+  /* Load domain parameter.
+     [w14,w13] = dmem[p384_p] */
+  li        x2, 13
+  la        x4, p384_p
+  bn.lid    x2++, 0(x4)
+  bn.lid    x2++, 32(x4)
+
+  /* Arithmetic to boolean conversion */
+  jal       x1, p384_arithmetic_to_boolean_mod
+
+  /* Boolean unmasking of result value into the registers the test checks
+     [w1,w0] <= [w21,w20] ^ [w19,w18] */
+  bn.xor    w0, w20, w18
+  bn.xor    w1, w21, w19
+
+  ecall
+
+
+.data
+
+.balign 32
+
+/* point 1 x-coordinate p1_x */
+p1_x:
+  .word 0x1a11808b
+  .word 0x02e3d5a9
+  .word 0x440d8db6
+  .word 0x5ef02be3
+  .word 0x2a35de10
+  .word 0xdbdb132e
+  .word 0xf84e7899
+  .word 0x7dff4c2b
+  .word 0x24705317
+  .word 0x30eda4ab
+  .word 0xb44ba799
+  .word 0x3af8f1c5
+  .zero 16
+
+/* point 1 y-coordinate p1_y */
+p1_y:
+  .word 0xa9f8b96e
+  .word 0x82f268be
+  .word 0x8e51c662
+  .word 0x92b9c4bb
+  .word 0x757d4493
+  .word 0x26b4d3c6
+  .word 0xf491007e
+  .word 0x92a5c72a
+  .word 0x8d8d8641
+  .word 0x87498a20
+  .word 0x0fe7dbde
+  .word 0x841e4949
+  .zero 16
+
+/* 1st scalar share k0 (448-bit) */
+k0:
+  .word 0x5c832a51
+  .word 0x3eb17c27
+  .word 0x9a0c1b76
+  .word 0x6e001281
+  .word 0x4de8344e
+  .word 0x5b7d3b0f
+  .word 0x96d2f9e0
+  .word 0x1e9d19e7
+  .word 0x16f5c1ee
+  .word 0x800a4c94
+  .word 0xe14cd8df
+  .word 0xadb9ce1b
+  .word 0x8677a5f2
+  .word 0x32f9e2b0
+  .zero 8
+
+/* 2nd scalar share k1 (448-bit) */
+k1:
+  .word 0x33eae098
+  .word 0xd31b18d5
+  .word 0x507568cd
+  .word 0xab8fb14d
+  .word 0x9ef51898
+  .word 0x44676e61
+  .word 0x9cb814d9
+  .word 0x4ad22b6e
+  .word 0x8930f243
+  .word 0xb706d682
+  .word 0xa9da1611
+  .word 0x13e7014a
+  .word 0x9ec9b430
+  .word 0x9e5dc598
+  .zero 8
+
+/* scalar k = (k0 + k1) mod n (384-bit)*/
+scalar:
+  .word 0xe8791ba3
+  .word 0xf549e1f7
+  .word 0x893be358
+  .word 0x100794fe
+  .word 0xbc9db95d
+  .word 0xfd7ed624
+  .word 0xc60ebab6
+  .word 0x97ba9586
+  .word 0xa026b431
+  .word 0x37112316
+  .word 0x8b26eef1
+  .word 0xc1a0cf66
+  .zero 16
diff --git a/sw/otbn/crypto/tests/p384_ecdsa_sign_test.s b/sw/otbn/crypto/tests/p384_ecdsa_sign_test.s
index 04d80611a2b21..9b9df3359f897 100644
--- a/sw/otbn/crypto/tests/p384_ecdsa_sign_test.s
+++ b/sw/otbn/crypto/tests/p384_ecdsa_sign_test.s
@@ -15,29 +15,29 @@
 
 p384_ecdsa_sign_test:
 
-  /* set dmem pointer to nonce k */
-  la       x2, nonce_k
-  la       x3, dptr_k
+  /* set dmem pointer to point to 1st scalar share k0 */
+  la       x2, k0
+  la       x3, dptr_k0
   sw       x2, 0(x3)
 
-  /* set dmem pointer to point to blinding parameter */
-  la       x2, blinding_param
-  la       x3, dptr_rnd
+  /* set dmem pointer to point to 2nd scalar share k1 */
+  la       x2, k1
+  la       x3, dptr_k1
   sw       x2, 0(x3)
 
-  /* set dmem pointer to point to message */
-  la       x2, msg
-  la       x3, dptr_msg
+  /* set dmem pointer to point to 1st scalar share d0 (private key) */
+  la       x2, d0
+  la       x3, dptr_d0
   sw       x2, 0(x3)
 
-  /* set dmem pointer to point to nonce k */
-  la       x2, nonce_k
-  la       x3, dptr_k
+  /* set dmem pointer to point to 2nd scalar share d1 (private key) */
+  la       x2, d1
+  la       x3, dptr_d1
   sw       x2, 0(x3)
 
-  /* set dmem pointer to point to private key d */
-  la       x2, priv_key_d
-  la       x3, dptr_d
+  /* set dmem pointer to point to message */
+  la       x2, msg
+  la       x3, dptr_msg
   sw       x2, 0(x3)
 
   /* set dmem pointer to point to signature */
@@ -66,7 +66,43 @@ p384_ecdsa_sign_test:
 
 .data
 
-/* nonce k */
+/* 1st scalar share k0 (448-bit) */
+k0:
+  .word 0x5c832a51
+  .word 0x3eb17c27
+  .word 0x9a0c1b76
+  .word 0x6e001281
+  .word 0x4de8344e
+  .word 0x5b7d3b0f
+  .word 0x96d2f9e0
+  .word 0x1e9d19e7
+  .word 0x16f5c1ee
+  .word 0x800a4c94
+  .word 0xe14cd8df
+  .word 0xadb9ce1b
+  .word 0x8677a5f2
+  .word 0x32f9e2b0
+  .zero 8
+
+/* 2nd scalar share k1 (448-bit) */
+k1:
+  .word 0xe50b5e8e
+  .word 0x776ad076
+  .word 0x60d31f0e
+  .word 0x3521b5e8
+  .word 0x7bf0f8d5
+  .word 0xe08231d6
+  .word 0x7042f3bb
+  .word 0x4cb12f81
+  .word 0x82a3d7ab
+  .word 0x198f4d05
+  .word 0xb84cc0ba
+  .word 0xebdfcb7d
+  .word 0x9ec9b42f
+  .word 0x9e5dc598
+  .zero 8
+
+/* nonce k = k0 + k1 mod n (n: curve order) */
 nonce_k:
   .word 0x99999999
   .word 0x99999999
@@ -82,20 +118,56 @@ nonce_k:
   .word 0x99999999
   .zero 16
 
-/* blinding parameter rnd */
- blinding_param:
-  .word 0xa82c85b0
-  .word 0x163ce1c8
-  .word 0x32518fd7
-  .word 0xf8a428cd
-  .word 0xf5b9d867
-  .word 0x00906f5f
-  .word 0x7387b4f2
-  .word 0xa2d3da7a
-  .word 0xebe0a647
-  .word 0xfb2ef7ca
-  .word 0x74249432
-  .word 0x230e5ff6
+/* 1st private key share d0 (448-bit) */
+d0:
+  .word 0x5c832a51
+  .word 0x3eb17c27
+  .word 0x9a0c1b76
+  .word 0x6e001281
+  .word 0x4de8344e
+  .word 0x5b7d3b0f
+  .word 0x96d2f9e0
+  .word 0x1e9d19e7
+  .word 0x16f5c1ee
+  .word 0x800a4c94
+  .word 0xe14cd8df
+  .word 0xadb9ce1b
+  .word 0x8677a5f2
+  .word 0x32f9e2b0
+  .zero 8
+
+/* 2nd private key share d1 (448-bit) */
+d1:
+  .word 0x33eae098
+  .word 0xd31b18d5
+  .word 0x507568cd
+  .word 0xab8fb14d
+  .word 0x9ef51898
+  .word 0x44676e61
+  .word 0x9cb814d9
+  .word 0x4ad22b6e
+  .word 0x8930f243
+  .word 0xb706d682
+  .word 0xa9da1611
+  .word 0x13e7014a
+  .word 0x9ec9b430
+  .word 0x9e5dc598
+  .zero 8
+
+/* private key d = d0 + d1 mod n (n: curve order) */
+priv_key_d:
+  .word 0xe8791ba3
+  .word 0xf549e1f7
+  .word 0x893be358
+  .word 0x100794fe
+  .word 0xbc9db95d
+  .word 0xfd7ed624
+  .word 0xc60ebab6
+  .word 0x97ba9586
+  .word 0xa026b431
+  .word 0x37112316
+  .word 0x8b26eef1
+  .word 0xc1a0cf66
   .zero 16
 
 /* message */
@@ -114,22 +186,6 @@ msg:
   .word 0x55555555
   .zero 16
 
-/* private key d */
-priv_key_d:
-  .word 0xe8791ba3
-  .word 0xf549e1f7
-  .word 0x893be358
-  .word 0x100794fe
-  .word 0xbc9db95d
-  .word 0xfd7ed624
-  .word 0xc60ebab6
-  .word 0x97ba9586
-  .word 0xa026b431
-  .word 0x37112316
-  .word 0x8b26eef1
-  .word 0xc1a0cf66
-  .zero 16
-
 /* signature R */
 sig_r:
   .zero 64
diff --git a/sw/otbn/crypto/tests/p384_isoncurve_test.s b/sw/otbn/crypto/tests/p384_isoncurve_test.s
index 3b6e7f919cce6..198d2f0f30eb6 100644
--- a/sw/otbn/crypto/tests/p384_isoncurve_test.s
+++ b/sw/otbn/crypto/tests/p384_isoncurve_test.s
@@ -15,18 +15,18 @@
 p384_oncurve_test:
 
   /* set dmem to result */
-  la       x2, res_r
-  la       x3, dptr_r
+  la       x2, rhs
+  la       x3, dptr_rhs
   sw       x2, 0(x3)
-  la       x2, res_l
-  la       x3, dptr_s
+  la       x2, lhs
+  la       x3, dptr_lhs
   sw       x2, 0(x3)
 
   /* set dmem pointer to point to cuve point */
-  la       x2, point_x
+  la       x2, x
   la       x3, dptr_x
   sw       x2, 0(x3)
-  la       x2, point_y
+  la       x2, y
   la       x3, dptr_y
   sw       x2, 0(x3)
 
@@ -35,10 +35,10 @@ p384_oncurve_test:
 
   /* load result to WDRs for comparison with reference */
   li        x2, 0
-  la        x3, res_r
+  la        x3, rhs
   bn.lid    x2++, 0(x3)
   bn.lid    x2++, 32(x3)
-  la        x3, res_l
+  la        x3, lhs
   bn.lid    x2++, 0(x3)
   bn.lid    x2++, 32(x3)
 
@@ -48,15 +48,15 @@ p384_oncurve_test:
 .data
 
 /* buffer for right side result of Weierstrass equation */
-res_r:
+rhs:
   .zero 64
 
 /* buffer for left side result of Weierstrass equation */
-res_l:
+lhs:
   .zero 64
 
 /* point affine x-coordinate */
-point_x:
+x:
   .word 0x4877f3d1
   .word 0x7b829460
   .word 0xb1cac609
@@ -72,7 +72,7 @@ point_x:
   .zero 16
 
 /* point affine y-coordinate */
-point_y:
+y:
   .word 0xc181f90f
   .word 0xc31ef079
   .word 0xbf3aff6e
diff --git a/sw/otbn/crypto/tests/p384_keygen_test.exp b/sw/otbn/crypto/tests/p384_keygen_test.exp
new file mode 100644
index 0000000000000..879f5d55ea82c
--- /dev/null
+++ b/sw/otbn/crypto/tests/p384_keygen_test.exp
@@ -0,0 +1,3 @@
+# Expected values:
+w0 = 0x0000000000000000000000000000000000000000000000000000000000000000
+w1 = 0x0000000000000000000000000000000000000000000000000000000000000000
diff --git a/sw/otbn/crypto/tests/p384_keygen_test.s b/sw/otbn/crypto/tests/p384_keygen_test.s
new file mode 100644
index 0000000000000..0d937e1a4f083
--- /dev/null
+++ b/sw/otbn/crypto/tests/p384_keygen_test.s
@@ -0,0 +1,362 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Standalone test for P-384 key/scalar generation
+ *
+ * Performs generation of a P-384 random secret key and scalar.
+ *
+ * This test does not test if the randomness of the generated values is
+ * properly distributed or if the entropy is large enough etc.
+ * It only checks if a few generated values are distinct and if the
+ * associated shares don't add up to zero (mod n).
+ *
+ * Actual randomness testing has to be done via statistical analysis
+ * of generated values, but this is not possible for simulator based
+ * automated testing.
+ */
+
+.section .text.start
+
+p384_keygen_test:
+
+  /* Init all-zero register. */
+  bn.xor    w31, w31, w31
+
+  /* set dmem pointer to point to 1st scalar share k0 */
+  la        x2, k0
+  la        x3, dptr_k0
+  sw        x2, 0(x3)
+
+  /* set dmem pointer to point to 2nd scalar share k1 */
+  la        x2, k1
+  la        x3, dptr_k1
+  sw        x2, 0(x3)
+
+  /* set dmem pointer to point to 1st scalar share d0 (private key) */
+  la        x2, d0
+  la        x3, dptr_d0
+  sw        x2, 0(x3)
+
+  /* set dmem pointer to point to 2nd scalar share d1 (private key) */
+  la        x2, d1
+  la        x3, dptr_d1
+  sw        x2, 0(x3)
+
+  /* generate 4 random 448-bit values and write them to d0, d1, k0, k1 */
+  jal       x1, p384_generate_random_key
+  jal       x1, p384_generate_k
+
+  /* load generated values into WDRs w4..w11 for the checks below */
+  li        x2, 4
+
+  /* [w5,w4] <= d0 */
+  la        x3, dptr_d0
+  lw        x4, 0(x3)
+  bn.lid    x2++, 0(x4)
+  bn.lid    x2++, 32(x4)
+
+  /* [w7,w6] <= d1 */
+  la        x3, dptr_d1
+  lw        x4, 0(x3)
+  bn.lid    x2++, 0(x4)
+  bn.lid    x2++, 32(x4)
+
+  /* [w9,w8] <= k0 */
+  la        x3, dptr_k0
+  lw        x4, 0(x3)
+  bn.lid    x2++, 0(x4)
+  bn.lid    x2++, 32(x4)
+
+  /* [w11,w10] <= k1 */
+  la        x3, dptr_k1
+  lw        x4, 0(x3)
+  bn.lid    x2++, 0(x4)
+  bn.lid    x2++, 32(x4)
+
+  /* Load the curve order n.
+     [w13,w12] <= dmem[p384_n] = n */
+  li        x2, 12
+  la        x3, p384_n
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2++, 32(x3)
+
+  /* Compute Solinas constant k for modulus n (we know it is only 191 bits, so
+     no need to compute the high part):
+     w14 <= 2^256 - n[255:0] = (2^384 - n) mod (2^256) = 2^384 - n */
+  bn.sub    w14, w31, w12
+
+  /* initialize the failure counters: w0 <= 0 and w1 <= 0 */
+  bn.mov    w0, w31
+  bn.mov    w1, w31
+
+  /* Check if modular addition of shares d0 and d1, as well as k0 and k1 is non-zero. */
+
+  /* [w17,w16] <= d = [w5,w4] + [w7,w6] mod n = d0 + d1 mod n */
+  bn.add    w18, w4, w6
+  bn.addc   w19, w5, w7
+  bn.mov    w20, w31
+  jal       x1, p384_reduce_n
+
+  /* Compare w16 to 0. */
+  bn.cmp    w16, w31
+
+  /* Read the FG0.Z flag (position 3).
+     x2 <= 8 if FG0.Z else 0 */
+  csrrw     x2, FG0, x0
+  andi      x2, x2, 8
+
+  /* Compare w17 to 0. */
+  bn.cmp    w17, w31
+
+  /* Read the FG0.Z flag (position 3).
+     x3 <= 8 if FG0.Z else 0 */
+  csrrw     x3, FG0, x0
+  andi      x3, x3, 8
+
+  /* Check if both registers w16 and w17 are equal to 0 (both Z bits set).
+     x2 AND x3 == 0 <=> [w17,w16] != 0, x2 AND x3 != 0 <=> [w17,w16] == 0 */
+  and       x2, x2, x3
+
+  /* If x2 != 0: w0 <= w0 + 1, else: w0 <= w0 + 0 */
+  beq       x2, x0, keep_w0_1
+  bn.addi   w0, w0, 1
+  keep_w0_1:
+
+  /* [w17,w16] <= k = [w9,w8] + [w11,w10] mod n = k0 + k1 mod n */
+  bn.add    w18, w8, w10
+  bn.addc   w19, w9, w11
+  bn.mov    w20, w31
+  jal       x1, p384_reduce_n
+
+  /* Compare w16 to 0. */
+  bn.cmp    w16, w31
+
+  /* Read the FG0.Z flag (position 3).
+     x2 <= 8 if FG0.Z else 0 */
+  csrrw     x2, FG0, x0
+  andi      x2, x2, 8
+
+  /* Compare w17 to 0. */
+  bn.cmp    w17, w31
+
+  /* Read the FG0.Z flag (position 3).
+     x3 <= 8 if FG0.Z else 0 */
+  csrrw     x3, FG0, x0
+  andi      x3, x3, 8
+
+  /* Check if both registers w16 and w17 are equal to 0 (both Z bits set).
+     x2 AND x3 == 0 <=> [w17,w16] != 0, x2 AND x3 != 0 <=> [w17,w16] == 0 */
+  and       x2, x2, x3
+
+  /* If x2 != 0: w0 <= w0 + 1, else: w0 <= w0 + 0 */
+  beq       x2, x0, keep_w0_2
+  bn.addi   w0, w0, 1
+  keep_w0_2:
+
+  /* Compare the values and check if they are distinct to each other.
+     If one value pair is equal, then the zero flag will be set.
+     In case of an equal pair w1 > 0, otherwise w1 == 0. */
+
+  /* [w21,w20] <= [w5,w4] - [w7,w6] = d0 - d1
+     if d0 - d1 == 0: w1 <= w1 + 1, else: w1 stays unchanged */
+  bn.sub    w20, w4, w6
+  bn.subb   w21, w5, w7
+
+  /* Compare w20 to 0. */
+  bn.cmp    w20, w31
+
+  /* Read the FG0.Z flag (position 3).
+     x2 <= 8 if FG0.Z else 0 */
+  csrrw     x2, FG0, x0
+  andi      x2, x2, 8
+
+  /* Compare w21 to 0. */
+  bn.cmp    w21, w31
+
+  /* Read the FG0.Z flag (position 3).
+     x3 <= 8 if FG0.Z else 0 */
+  csrrw     x3, FG0, x0
+  andi      x3, x3, 8
+
+  /* Check if both registers w20 and w21 are equal to 0.
+     x2 AND x3 == 0 <=> [w21,w20] != 0, x2 AND x3 != 0 <=> [w21,w20] == 0 */
+  and       x2, x2, x3
+
+  /* If x2 != 0: w1 <= w1 + 1, else: w1 <= w1 + 0 */
+  beq       x2, x0, keep_w1_1
+  bn.addi   w1, w1, 1
+  keep_w1_1:
+
+  /* [w21,w20] <= [w5,w4] - [w9,w8] = d0 - k0
+     if d0 - k0 == 0: w1 <= w1 + 1, else: w1 stays unchanged */
+  bn.sub    w20, w4, w8
+  bn.subb   w21, w5, w9
+
+  /* Compare w20 to 0. */
+  bn.cmp    w20, w31
+
+  /* Read the FG0.Z flag (position 3).
+     x2 <= 8 if FG0.Z else 0 */
+  csrrw     x2, FG0, x0
+  andi      x2, x2, 8
+
+  /* Compare w21 to 0. */
+  bn.cmp    w21, w31
+
+  /* Read the FG0.Z flag (position 3).
+     x3 <= 8 if FG0.Z else 0 */
+  csrrw     x3, FG0, x0
+  andi      x3, x3, 8
+
+  /* Check if both registers w20 and w21 are equal to 0.
+     x2 AND x3 == 0 <=> [w21,w20] != 0, x2 AND x3 != 0 <=> [w21,w20] == 0 */
+  and       x2, x2, x3
+
+  /* If x2 != 0: w1 <= w1 + 1, else: w1 <= w1 + 0 */
+  beq       x2, x0, keep_w1_2
+  bn.addi   w1, w1, 1
+  keep_w1_2:
+
+  /* [w21,w20] <= [w5,w4] - [w11,w10] = d0 - k1
+     if d0 - k1 == 0: w1 <= w1 + 1, else: w1 stays unchanged */
+  bn.sub    w20, w4, w10
+  bn.subb   w21, w5, w11
+
+  /* Compare w20 to 0. */
+  bn.cmp    w20, w31
+
+  /* Read the FG0.Z flag (position 3).
+     x2 <= 8 if FG0.Z else 0 */
+  csrrw     x2, FG0, x0
+  andi      x2, x2, 8
+
+  /* Compare w21 to 0. */
+  bn.cmp    w21, w31
+
+  /* Read the FG0.Z flag (position 3).
+     x3 <= 8 if FG0.Z else 0 */
+  csrrw     x3, FG0, x0
+  andi      x3, x3, 8
+
+  /* Check if both registers w20 and w21 are equal to 0.
+     x2 AND x3 == 0 <=> [w21,w20] != 0, x2 AND x3 != 0 <=> [w21,w20] == 0 */
+  and       x2, x2, x3
+
+  /* If x2 != 0: w1 <= w1 + 1, else: w1 <= w1 + 0 */
+  beq       x2, x0, keep_w1_3
+  bn.addi   w1, w1, 1
+  keep_w1_3:
+
+  /* [w21,w20] <= [w7,w6] - [w9,w8] = d1 - k0
+     if d1 - k0 == 0: w1 <= w1 + 1, else: w1 stays unchanged */
+  bn.sub    w20, w6, w8
+  bn.subb   w21, w7, w9
+
+  /* Compare w20 to 0. */
+  bn.cmp    w20, w31
+
+  /* Read the FG0.Z flag (position 3).
+     x2 <= 8 if FG0.Z else 0 */
+  csrrw     x2, FG0, x0
+  andi      x2, x2, 8
+
+  /* Compare w21 to 0. */
+  bn.cmp    w21, w31
+
+  /* Read the FG0.Z flag (position 3).
+     x3 <= 8 if FG0.Z else 0 */
+  csrrw     x3, FG0, x0
+  andi      x3, x3, 8
+
+  /* Check if both registers w20 and w21 are equal to 0.
+     x2 AND x3 == 0 <=> [w21,w20] != 0, x2 AND x3 != 0 <=> [w21,w20] == 0 */
+  and       x2, x2, x3
+
+  /* If x2 != 0: w1 <= w1 + 1, else: w1 <= w1 + 0 */
+  beq       x2, x0, keep_w1_4
+  bn.addi   w1, w1, 1
+  keep_w1_4:
+
+  /* [w21,w20] <= [w7,w6] - [w11,w10] = d1 - k1
+     if d1 - k1 == 0: w1 <= w1 + 1, else: w1 stays unchanged */
+  bn.sub    w20, w6, w10
+  bn.subb   w21, w7, w11
+
+  /* Compare w20 to 0. */
+  bn.cmp    w20, w31
+
+  /* Read the FG0.Z flag (position 3).
+     x2 <= 8 if FG0.Z else 0 */
+  csrrw     x2, FG0, x0
+  andi      x2, x2, 8
+
+  /* Compare w21 to 0. */
+  bn.cmp    w21, w31
+
+  /* Read the FG0.Z flag (position 3).
+     x3 <= 8 if FG0.Z else 0 */
+  csrrw     x3, FG0, x0
+  andi      x3, x3, 8
+
+  /* Check if both registers w20 and w21 are equal to 0.
+     x2 AND x3 == 0 <=> [w21,w20] != 0, x2 AND x3 != 0 <=> [w21,w20] == 0 */
+  and       x2, x2, x3
+
+  /* If x2 != 0: w1 <= w1 + 1, else: w1 <= w1 + 0 */
+  beq       x2, x0, keep_w1_5
+  bn.addi   w1, w1, 1
+  keep_w1_5:
+
+  /* [w21,w20] <= [w9,w8] - [w11,w10] = k0 - k1
+     if k0 - k1 == 0: w1 <= w1 + 1, else: w1 stays unchanged */
+  bn.sub    w20, w8, w10
+  bn.subb   w21, w9, w11
+
+  /* Compare w20 to 0. */
+  bn.cmp    w20, w31
+
+  /* Read the FG0.Z flag (position 3).
+     x2 <= 8 if FG0.Z else 0 */
+  csrrw     x2, FG0, x0
+  andi      x2, x2, 8
+
+  /* Compare w21 to 0. */
+  bn.cmp    w21, w31
+
+  /* Read the FG0.Z flag (position 3).
+     x3 <= 8 if FG0.Z else 0 */
+  csrrw     x3, FG0, x0
+  andi      x3, x3, 8
+
+  /* Check if both registers w20 and w21 are equal to 0.
+     x2 AND x3 == 0 <=> [w21,w20] != 0, x2 AND x3 != 0 <=> [w21,w20] == 0 */
+  and       x2, x2, x3
+
+  /* If x2 != 0: w1 <= w1 + 1, else: w1 <= w1 + 0 */
+  beq       x2, x0, keep_w1_6
+  bn.addi   w1, w1, 1
+  keep_w1_6:
+
+  ecall
+
+.section .data
+
+.balign 32
+
+/* 1st private key share d0 (448-bit) */
+d0:
+  .zero 64
+
+/* 2nd private key share d1 (448-bit) */
+d1:
+  .zero 64
+
+/* 1st scalar share k0 (448-bit) */
+k0:
+  .zero 64
+
+/* 2nd scalar share k1 (448-bit) */
+k1:
+  .zero 64
diff --git a/sw/otbn/crypto/tests/p384_mulmod448x128_test.exp b/sw/otbn/crypto/tests/p384_mulmod448x128_test.exp
new file mode 100644
index 0000000000000..521e386310259
--- /dev/null
+++ b/sw/otbn/crypto/tests/p384_mulmod448x128_test.exp
@@ -0,0 +1,3 @@
+# Expected values (result of modular multiplication)
+w0 = 0xb1c0a5d4079771ccbf1e21a89602d8636f771e8cdc5e0e904c5152463aa12b0d
+w1 = 0x00000000000000000000000000000000872ed6de74b2c551e0d591aa03cd081d
diff --git a/sw/otbn/crypto/tests/p384_mulmod448x128_test.s b/sw/otbn/crypto/tests/p384_mulmod448x128_test.s
new file mode 100644
index 0000000000000..9b6502cba626e
--- /dev/null
+++ b/sw/otbn/crypto/tests/p384_mulmod448x128_test.s
@@ -0,0 +1,79 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Standalone test for P-384 modular multiplication (448-bit x 128-bit)
+ *
+ * Computes a * b mod n for the 448-bit value a and the 128-bit value b
+ * contained in the .data section, where n is the P-384 curve order.
+ *
+ * The result is left in [w1,w0] for comparison with the expected values.
+ */
+
+.section .text.start
+
+p384_mulmod448x128_test:
+
+  /* Init all-zero register w31. */
+  bn.xor    w31, w31, w31
+
+  /* Load the multiplication inputs from dmem into WDRs:
+     [w11,w10] <= dmem[a] (448-bit operand)
+     w16 <= dmem[b] (128-bit operand) */
+  li        x2, 10
+  la        x3, a
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2, 32(x3)
+  li        x2, 16
+  la        x3, b
+  bn.lid    x2, 0(x3)
+
+  /* load domain parameter n (order of base point)
+     [w13, w12] <= n = dmem[p384_n] */
+  li        x2, 12
+  la        x3, p384_n
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2++, 32(x3)
+
+  /* Compute Solinas constant k for modulus n (we know it is only 191 bits, so
+     no need to compute the high part):
+     w14 <= 2^256 - n[255:0] = (2^384 - n) mod (2^256) = 2^384 - n */
+  bn.sub    w14, w31, w12
+
+  /* Compute a * b mod n
+     [w17,w16] <= [w11,w10] * w16 mod [w13,w12] = a * b mod n */
+  jal       x1, p384_mulmod448x128_n
+
+  /* Move the result to [w1,w0] for comparison with the expected values. */
+  bn.mov    w0, w16
+  bn.mov    w1, w17
+
+  ecall
+
+
+.data
+
+a:
+  .word 0x5c832a51
+  .word 0x3eb17c27
+  .word 0x9a0c1b76
+  .word 0x6e001281
+  .word 0x4de8344e
+  .word 0x5b7d3b0f
+  .word 0x96d2f9e0
+  .word 0x1e9d19e7
+  .word 0x16f5c1ee
+  .word 0x800a4c94
+  .word 0xe14cd8df
+  .word 0xadb9ce1b
+  .word 0x8677a5f2
+  .word 0x32f9e2b0
+  .zero 8
+
+b:
+  .word 0x5c832a51
+  .word 0x3eb17c27
+  .word 0x9a0c1b76
+  .word 0x6e001281
+  .zero 48
diff --git a/sw/otbn/crypto/tests/p384_scalar_mult_test.exp b/sw/otbn/crypto/tests/p384_scalar_mult_test.exp
index be9d0206a3be8..6f2f96470e94d 100644
--- a/sw/otbn/crypto/tests/p384_scalar_mult_test.exp
+++ b/sw/otbn/crypto/tests/p384_scalar_mult_test.exp
@@ -1,7 +1,4 @@
 # Expected values (x- and y-coordinates of result):
-# [w1, w0] is affine x-coordinate of resulting point,
-# [w3, w2] is affine y-coordinate of resulting point.
+# [w1, w0] is affine x-coordinate of resulting point
 w0  = 0x6c5d59dbafa8ecbaf0b2d3c1e818325403634e3b86956e6ead6739217b702c4a
 w1  = 0x00000000000000000000000000000000d177aa22a7c535a28cae00d420c4cd27
-w2  = 0x607c6c698fc5c15cbfadf94e322fa2fa5ff6cf915fe9ad62f538701f1add78ec
-w3  = 0x000000000000000000000000000000009e18fa893348fb1d44f40dbedcb5e36c
diff --git a/sw/otbn/crypto/tests/p384_scalar_mult_test.s b/sw/otbn/crypto/tests/p384_scalar_mult_test.s
index 7fdd588fa70ba..68f70dbab95dc 100644
--- a/sw/otbn/crypto/tests/p384_scalar_mult_test.s
+++ b/sw/otbn/crypto/tests/p384_scalar_mult_test.s
@@ -17,6 +17,9 @@
 
 p384_scalar_mult_test:
 
+  /* Init all-zero register. */
+  bn.xor  w31, w31, w31
+
   /* set dmem pointer to point to x-coordinate */
   la       x2, p1_x
   la       x3, dptr_x
@@ -27,18 +30,18 @@ p384_scalar_mult_test:
   la       x3, dptr_y
   sw       x2, 0(x3)
 
-  /* set dmem pointer to point to scalar k */
-  la       x2, scalar
-  la       x3, dptr_k
+  /* set dmem pointer to point to 1st scalar share k0 */
+  la       x2, k0
+  la       x3, dptr_k0
   sw       x2, 0(x3)
 
-  /* set dmem pointer to point to blinding parameter */
-  la       x2, blinding_param
-  la       x3, dptr_rnd
+  /* set dmem pointer to point to 2nd scalar share k1 */
+  la       x2, k1
+  la       x3, dptr_k1
   sw       x2, 0(x3)
 
   /* call scalar point multiplication routine in P-384 lib */
-  jal      x1, scalar_mult_p384
+  jal      x1, p384_scalar_mult
 
   /* load result to WDRs for comparison with reference */
   li        x2, 0
@@ -49,11 +52,31 @@ p384_scalar_mult_test:
   bn.lid    x2++, 0(x3)
   bn.lid    x2, 32(x3)
 
+  /* load domain parameter p (modulus)
+     [w13, w12] = p = dmem[p384_p] */
+  li        x2, 12
+  la        x3, p384_p
+  bn.lid    x2++, 0(x3)
+  bn.lid    x2++, 32(x3)
+
+  /* unmask x coordinate x = x_m + m mod p = x-coord. + y-coord. mod p */
+  bn.add    w0, w0, w2
+  bn.addc   w1, w1, w3
+
+  bn.mov    w18, w0
+  bn.mov    w19, w1
+  bn.mov    w20, w31
+  jal       x1, p384_reduce_p
+  bn.mov    w0, w16
+  bn.mov    w1, w17
+
   ecall
 
 
 .section .data
 
+.balign 32
+
 /* point 1 x-cooridante p1_x */
 p1_x:
   .word 0x1a11808b
@@ -86,7 +109,43 @@ p1_y:
   .word 0x841e4949
   .zero 16
 
-/* scalar k */
+/* 1st scalar share k0 (448-bit) */
+k0:
+  .word 0x5c832a51
+  .word 0x3eb17c27
+  .word 0x9a0c1b76
+  .word 0x6e001281
+  .word 0x4de8344e
+  .word 0x5b7d3b0f
+  .word 0x96d2f9e0
+  .word 0x1e9d19e7
+  .word 0x16f5c1ee
+  .word 0x800a4c94
+  .word 0xe14cd8df
+  .word 0xadb9ce1b
+  .word 0x8677a5f2
+  .word 0x32f9e2b0
+  .zero 8
+
+/* 2nd scalar share k1 (448-bit) */
+k1:
+  .word 0x33eae098
+  .word 0xd31b18d5
+  .word 0x507568cd
+  .word 0xab8fb14d
+  .word 0x9ef51898
+  .word 0x44676e61
+  .word 0x9cb814d9
+  .word 0x4ad22b6e
+  .word 0x8930f243
+  .word 0xb706d682
+  .word 0xa9da1611
+  .word 0x13e7014a
+  .word 0x9ec9b430
+  .word 0x9e5dc598
+  .zero 8
+
+/* scalar k = (k0 + k1) mod n (384-bit)*/
 scalar:
   .word 0xe8791ba3
   .word 0xf549e1f7
@@ -101,19 +160,3 @@ scalar:
   .word 0x8b26eef1
   .word 0xc1a0cf66
   .zero 16
-
-   /* blinding parameter rnd */
- blinding_param:
-  .word 0xa82c85b0
-  .word 0x163ce1c8
-  .word 0x32518fd7
-  .word 0xf8a428cd
-  .word 0xf5b9d867
-  .word 0x00906f5f
-  .word 0x7387b4f2
-  .word 0xa2d3da7a
-  .word 0xebe0a647
-  .word 0xfb2ef7ca
-  .word 0x74249432
-  .word 0x230e5ff6
-  .zero 16
diff --git a/sw/otbn/crypto/tests/primality_test.s b/sw/otbn/crypto/tests/primality_test.s
index 8af539b7896d1..a174c7d5f2260 100644
--- a/sw/otbn/crypto/tests/primality_test.s
+++ b/sw/otbn/crypto/tests/primality_test.s
@@ -58,26 +58,26 @@ main:
 .data
 
 /* Candidate prime (randomly generated using pycryptodome) =
-0x9ac5b6d69aa1d91c418d9bf315ba72595488aabddbd435dafe630ba818e3d4ef03ab9bf93147a781cc45f6219f8bc92fc500c92dc8539832055036f6537320a1
+0x83f4fb7ca746b70dd7e37ce93847ed7995ccf101bb7a9c628ebcffeeaa0114efd346ddfb53c1d31d51ab13bbcb0b2346d6689cd78210bfe05f458233d8e58e1b
 */
 .balign 32
 input:
-.word 0x537320a1
-.word 0x055036f6
-.word 0xc8539832
-.word 0xc500c92d
-.word 0x9f8bc92f
-.word 0xcc45f621
-.word 0x3147a781
-.word 0x03ab9bf9
-.word 0x18e3d4ef
-.word 0xfe630ba8
-.word 0xdbd435da
-.word 0x5488aabd
-.word 0x15ba7259
-.word 0x418d9bf3
-.word 0x9aa1d91c
-.word 0x9ac5b6d6
+.word 0xd8e58e1b
+.word 0x5f458233
+.word 0x8210bfe0
+.word 0xd6689cd7
+.word 0xcb0b2346
+.word 0x51ab13bb
+.word 0x53c1d31d
+.word 0xd346ddfb
+.word 0xaa0114ef
+.word 0x8ebcffee
+.word 0xbb7a9c62
+.word 0x95ccf101
+.word 0x3847ed79
+.word 0xd7e37ce9
+.word 0xa746b70d
+.word 0x83f4fb7c
 
 .section .scratchpad
 
diff --git a/sw/otbn/crypto/tests/primality_test_witness_negative_test.exp b/sw/otbn/crypto/tests/primality_test_witness_negative_test.exp
index 3720ca37665bf..411aa2c8d793f 100644
--- a/sw/otbn/crypto/tests/primality_test_witness_negative_test.exp
+++ b/sw/otbn/crypto/tests/primality_test_witness_negative_test.exp
@@ -1,7 +1,2 @@
-# For this particular composite/witness pair, we don't hit an early-exit case;
-# expect to finish the loop so w0, w1 == (b^(w-1) * R) % w
-w0 = 0xb470f524ca68f2455a1a85b8dc006872131ceedf6d07883f0f010e0bd222c0e3
-w1 = 0x65a498fbb4f35d9919fea51aaf2e83256c5f624f37bfc26e63a42a3c74f15a65
-
 # Result from witness test: 0 (indicating "composite")
 w21 = 0
diff --git a/sw/otbn/crypto/tests/primality_test_witness_negative_test.s b/sw/otbn/crypto/tests/primality_test_witness_negative_test.s
index 3093e9fa7abd3..e1736e4581289 100644
--- a/sw/otbn/crypto/tests/primality_test_witness_negative_test.s
+++ b/sw/otbn/crypto/tests/primality_test_witness_negative_test.s
@@ -30,22 +30,12 @@ main:
   la         x18, mont_rr
   jal        x1, test_witness
 
-  /* Load the value from the working buffer into registers. This buffer holds
-     the witness raised to some portion of the exponent; we can check it to
-     ensure that w was found to be composite at exactly the point we expected.
-       w0,w1 <= dmem[tmp:tmp+n*32] */
-  li         x8, 0
-  la         x15, tmp
-  loop       x30, 2
-    bn.lid     x8, 0(x15++)
-    addi       x8, x8, 1
-
   ecall
 
 .data
 
 /* Candidate prime (composite, randomly generated) =
-0xf7b5cc32e3c7c3ff6f220312fe4be4af76c9e51e8c17648c863751d70359bab17c1d7b4844e01d1ec0cd695ff3bae05dc51d95a001ab7b69f66d0c056c2dec39
+0xf7b5cc32e3c7c3ff6f220312fe4be4af76c9e51e8c17648c863751d70359bab17c1d7b4844e01d1ec0cd695ff3bae05dc51d95a001ab7b69f66d0c056c2dec3b
 */
 .balign 32
 input:
@@ -89,34 +79,34 @@ witness:
 /* Precomputed Montgomery constant m0' (256 bits). */
 .balign 32
 mont_m0inv:
-.word 0xd0a3bdf7
-.word 0x7dde1093
-.word 0xf7fe594f
-.word 0x8f66b353
-.word 0x03a1c874
-.word 0x3c4a0e42
-.word 0x0d02fb70
-.word 0x2cf2f731
+.word 0xbb5df30d
+.word 0xf47b30a4
+.word 0x45c4b2af
+.word 0xb6e86212
+.word 0xacafa4f9
+.word 0x6e5afd69
+.word 0x9ae7984c
+.word 0xce44dadc
 
 /* Precomputed Montgomery constant RR (512 bits). */
 .balign 32
 mont_rr:
-.word 0xd04011c2
-.word 0x8ef6bac2
-.word 0x2c87d164
-.word 0x5f60cb7a
-.word 0x5e64a3f6
-.word 0xe9f883b0
-.word 0xa802122b
-.word 0xf910bf58
-.word 0x94680653
-.word 0x3dadc1f1
-.word 0x4adf397f
-.word 0xa87c8a2a
-.word 0x0576494c
-.word 0x5ce4999d
-.word 0x8188e572
-.word 0x0911fc89
+.word 0xc1e31e17
+.word 0x6f9be028
+.word 0xcd184ada
+.word 0xbbd4bbb9
+.word 0x10d84741
+.word 0xa11300bd
+.word 0x4e5c6583
+.word 0x50805ac8
+.word 0x78f6cf41
+.word 0x163b312e
+.word 0x126593d5
+.word 0x03cc62ac
+.word 0x23cbc231
+.word 0xa53b2634
+.word 0x5d9d6071
+.word 0xdf10ee86
 
 .section .scratchpad
 
diff --git a/sw/otbn/crypto/tests/primality_test_witness_test.exp b/sw/otbn/crypto/tests/primality_test_witness_test.exp
index 0be310c8ecbfa..16c7f2246c059 100644
--- a/sw/otbn/crypto/tests/primality_test_witness_test.exp
+++ b/sw/otbn/crypto/tests/primality_test_witness_test.exp
@@ -1,6 +1,6 @@
-# w0, w1 <= (b^(w-1) * R) % w =  (1 * R) % w
-w0 = 0xf156a85066fa88460f3223454c58b0b878c560be590bf156363935bed6c123f5
-w1 = 0x72e3f25e319a983962943197fd7f7e1a7df76977ec8eb6b6e3d91b8199fb9c6c
+# w0, w1 <= (b^((w-1) / 2) * R) % w =  (-1 * R) % w
+w0 = 0x1d52af5f320aef73e19bb975674e9e8f0e753e834de81d53938d9482527db816
+w1 = 0x1a381b439ccacf8d3ad79cd0050103cb04112d1026e29292384dc8fccc08c726
 
 # Result from witness test: all 1s (indicating "possibly prime")
 w21 = 0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
diff --git a/sw/otbn/crypto/tests/rsa_1024_enc_test.s b/sw/otbn/crypto/tests/rsa_1024_enc_test.s
index c9419ecd11340..b041f1da476f4 100644
--- a/sw/otbn/crypto/tests/rsa_1024_enc_test.s
+++ b/sw/otbn/crypto/tests/rsa_1024_enc_test.s
@@ -13,7 +13,7 @@
  * .data segment in this file.
  *
  * Copies the encrypted message to wide registers for comparison (starting at
- * w0). See comment at the end of the file for expected values.
+ * w0).
  */
 run_rsa_1024_enc:
   /* Init all-zero register. */
diff --git a/sw/otbn/crypto/tests/rsa_2048_dec_test.exp b/sw/otbn/crypto/tests/rsa_2048_dec_test.exp
new file mode 100644
index 0000000000000..51e2039287ca1
--- /dev/null
+++ b/sw/otbn/crypto/tests/rsa_2048_dec_test.exp
@@ -0,0 +1,10 @@
+# Expected value:
+# 0x6add9548af50f1bea3cb921205a5bb92ee325e01d160e3738a09aa0df7050e6051d693440f0d00cdd56cee5a748ff3b48b1df7be05808ad20068ad387b8b5e4c25c79bba9f87ef971da926f644c26d4273829fd69db71f9eded2cd1a33c367578550346ada160daa272940dd6fc10dae4a0facef437ece40130301c1b847203cc0defd3620ce89d96fa21d30ee63e458b0198adc842f68af8b462df6014955ab68f663a9b5e77caf15a517ab0931308bf9591cecc7691780a2f3bd99d3ce25433d31537e7cab1b4c07d99199e9517132188150d38d633c2b3ef6ba6fb40504e800fca580beb7a19f2315adb451be690fc4f87ea5914d28d5562dc1dce115a852
+w0 = 0x00fca580beb7a19f2315adb451be690fc4f87ea5914d28d5562dc1dce115a852
+w1 = 0x3d31537e7cab1b4c07d99199e9517132188150d38d633c2b3ef6ba6fb40504e8
+w2 = 0x68f663a9b5e77caf15a517ab0931308bf9591cecc7691780a2f3bd99d3ce2543
+w3 = 0xc0defd3620ce89d96fa21d30ee63e458b0198adc842f68af8b462df6014955ab
+w4 = 0x8550346ada160daa272940dd6fc10dae4a0facef437ece40130301c1b847203c
+w5 = 0x25c79bba9f87ef971da926f644c26d4273829fd69db71f9eded2cd1a33c36757
+w6 = 0x51d693440f0d00cdd56cee5a748ff3b48b1df7be05808ad20068ad387b8b5e4c
+w7 = 0x6add9548af50f1bea3cb921205a5bb92ee325e01d160e3738a09aa0df7050e60
diff --git a/sw/otbn/crypto/tests/rsa_2048_dec_test.s b/sw/otbn/crypto/tests/rsa_2048_dec_test.s
new file mode 100644
index 0000000000000..4dcd14b0cacbb
--- /dev/null
+++ b/sw/otbn/crypto/tests/rsa_2048_dec_test.s
@@ -0,0 +1,272 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+
+.section .text.start
+
+/**
+ * Standalone RSA-2048 modexp with secret exponent (decryption/signing).
+ */
+main:
+  /* Init all-zero register. */
+  bn.xor  w31, w31, w31
+
+  /* Load number of 256-bit limbs (8 * 256 = 2048 bits). */
+  li    x30, 8
+
+  /* Load pointers to modulus and Montgomery constant buffers. */
+  la    x16, modulus
+  la    x17, m0inv
+  la    x18, RR
+
+  /* Compute Montgomery constants. */
+  jal      x1, modload
+
+  /* Run exponentiation.
+       dmem[result] = dmem[base]^dmem[exp] mod dmem[modulus] */
+  la       x14, base
+  la       x15, exp
+  la       x2, result
+  jal      x1, modexp
+
+  /* Copy all x30 limbs of the result into WDRs: w0..w7 <= dmem[result] */
+  la       x21, result
+  li       x8, 0
+  loop     x30, 2
+    bn.lid   x8, 0(x21++)
+    addi     x8, x8, 1
+
+  ecall
+
+
+.data
+
+/* Modulus n =
+
+0xb5ed720fe7e1b4a65494e8e9421df94910811d23854cb07b08a34508b682b188b16fa70e4804b4c4f54a54ae2a10848abc9253ac7c6085e5b9abcbcd48515db1626b01df4e7f5f1c85b9ce1b4c8d0f77f3854c8bc4f350ad4d993a6815d0d62ac83b47a257adb40023e1acf003d27953f19c5cbede1af58e42ef12ad9907c20ca428f8b7dbb6f3434936b1108d17ee343d7127f8885ff2513eb834c17bf1c4ddec0d61cc26f5f683c10c0e48676608811e9341f2898f690bc9fafd3b7e46d375e2178a141faf0d637767da550de4c5b9939af133ceba7cd2734df4ad269c166180afd8c35060de8ac302ca911aa3f92d139ed1595523a7f6c201cfafed4c17b5
+ */
+.balign 32
+modulus:
+  .word 0xed4c17b5
+  .word 0xc201cfaf
+  .word 0x5523a7f6
+  .word 0x139ed159
+  .word 0x1aa3f92d
+  .word 0xc302ca91
+  .word 0x5060de8a
+  .word 0x80afd8c3
+  .word 0x269c1661
+  .word 0x734df4ad
+  .word 0xceba7cd2
+  .word 0x939af133
+  .word 0x0de4c5b9
+  .word 0x7767da55
+  .word 0x1faf0d63
+  .word 0xe2178a14
+  .word 0x7e46d375
+  .word 0xc9fafd3b
+  .word 0x898f690b
+  .word 0x1e9341f2
+  .word 0x67660881
+  .word 0xc10c0e48
+  .word 0x26f5f683
+  .word 0xec0d61cc
+  .word 0x7bf1c4dd
+  .word 0x3eb834c1
+  .word 0x885ff251
+  .word 0x3d7127f8
+  .word 0x8d17ee34
+  .word 0x4936b110
+  .word 0xdbb6f343
+  .word 0xa428f8b7
+  .word 0x9907c20c
+  .word 0x42ef12ad
+  .word 0xde1af58e
+  .word 0xf19c5cbe
+  .word 0x03d27953
+  .word 0x23e1acf0
+  .word 0x57adb400
+  .word 0xc83b47a2
+  .word 0x15d0d62a
+  .word 0x4d993a68
+  .word 0xc4f350ad
+  .word 0xf3854c8b
+  .word 0x4c8d0f77
+  .word 0x85b9ce1b
+  .word 0x4e7f5f1c
+  .word 0x626b01df
+  .word 0x48515db1
+  .word 0xb9abcbcd
+  .word 0x7c6085e5
+  .word 0xbc9253ac
+  .word 0x2a10848a
+  .word 0xf54a54ae
+  .word 0x4804b4c4
+  .word 0xb16fa70e
+  .word 0xb682b188
+  .word 0x08a34508
+  .word 0x854cb07b
+  .word 0x10811d23
+  .word 0x421df949
+  .word 0x5494e8e9
+  .word 0xe7e1b4a6
+  .word 0xb5ed720f
+
+/* Base for exponentiation (corresponds to ciphertext for decryption or
+   message for signing).
+
+   Raw hex value =
+0x95fb986cd4aeee4b013effc1d183670380a9e2133ecc6a38dbbfff3f8ef20e1923a5e3741eac8772ee80f28994968fcabd6d454b7791263872bc68d97b6f4fbb76cee24f205d812ad36f2fcb6c11145943009a051c39c18c45b53ee19e51df0254b31eb991783718fb35c51dec249956bceb0276eaee88d8ecdeae2c08ac62a0018408af3923206e911a7ecf6ad786255fa69d63d333e6f44ebd3f5e6ebb7c82443c694d913e200492c89f046943f2dc7d8cf9951c6a33fa721558d1956fb552349ded082714be6a8bff775fd05162744d229fc9fac72509476bdc6434e5187bf3a1cc426cc13f0a10dcf0d15f28abcecfe5674782f232464b1a890d42b6fdd0
+ */
+.balign 32
+base:
+  .word 0x42b6fdd0
+  .word 0x4b1a890d
+  .word 0x82f23246
+  .word 0xcfe56747
+  .word 0x5f28abce
+  .word 0x10dcf0d1
+  .word 0x6cc13f0a
+  .word 0xf3a1cc42
+  .word 0x34e5187b
+  .word 0x476bdc64
+  .word 0xfac72509
+  .word 0x4d229fc9
+  .word 0xd0516274
+  .word 0x8bff775f
+  .word 0x2714be6a
+  .word 0x349ded08
+  .word 0x956fb552
+  .word 0x721558d1
+  .word 0x1c6a33fa
+  .word 0x7d8cf995
+  .word 0x6943f2dc
+  .word 0x92c89f04
+  .word 0x913e2004
+  .word 0x443c694d
+  .word 0x6ebb7c82
+  .word 0x4ebd3f5e
+  .word 0xd333e6f4
+  .word 0x5fa69d63
+  .word 0x6ad78625
+  .word 0x911a7ecf
+  .word 0x3923206e
+  .word 0x018408af
+  .word 0x08ac62a0
+  .word 0xecdeae2c
+  .word 0xeaee88d8
+  .word 0xbceb0276
+  .word 0xec249956
+  .word 0xfb35c51d
+  .word 0x91783718
+  .word 0x54b31eb9
+  .word 0x9e51df02
+  .word 0x45b53ee1
+  .word 0x1c39c18c
+  .word 0x43009a05
+  .word 0x6c111459
+  .word 0xd36f2fcb
+  .word 0x205d812a
+  .word 0x76cee24f
+  .word 0x7b6f4fbb
+  .word 0x72bc68d9
+  .word 0x77912638
+  .word 0xbd6d454b
+  .word 0x94968fca
+  .word 0xee80f289
+  .word 0x1eac8772
+  .word 0x23a5e374
+  .word 0x8ef20e19
+  .word 0xdbbfff3f
+  .word 0x3ecc6a38
+  .word 0x80a9e213
+  .word 0xd1836703
+  .word 0x013effc1
+  .word 0xd4aeee4b
+  .word 0x95fb986c
+
+/* Private exponent d =
+0x51a84a52295a7da34ac3abe746edfd3e7651fdaa3be2b8340124878fe99bafe4130072934e700e537965ebac60e51918cc9b4143627050a95435703cac011974cd200aaf18a4c3242241cbe924eb0bce6357a98bf2d2e39b660128de1f2ca5747e7b5d23d906f68c398ec9f8d13e5f86f623a0dd6b03dec403f71b03207502fbb6c7d812f391e010cbed264655d11ab63c262a803196a128df72ecf1c65ed7f742371e4c4ee355f44cfae81ec0a256da9aa3eb1935fc509d366de08c7edb522411670cd7ee0053bb9395ac4cbe0af6f3cdd1c24e225ee47aa4f381764cfab389db993fed537f397fbff31362a85872993bc467dde42b66894f4cb3ce712b2ee1
+ */
+.balign 32
+exp:
+  .word 0x712b2ee1
+  .word 0x4f4cb3ce
+  .word 0xe42b6689
+  .word 0x3bc467dd
+  .word 0xa8587299
+  .word 0xbff31362
+  .word 0x537f397f
+  .word 0xdb993fed
+  .word 0x4cfab389
+  .word 0xa4f38176
+  .word 0x225ee47a
+  .word 0xcdd1c24e
+  .word 0xbe0af6f3
+  .word 0x9395ac4c
+  .word 0xee0053bb
+  .word 0x11670cd7
+  .word 0x7edb5224
+  .word 0x366de08c
+  .word 0x35fc509d
+  .word 0x9aa3eb19
+  .word 0xc0a256da
+  .word 0x4cfae81e
+  .word 0x4ee355f4
+  .word 0x42371e4c
+  .word 0xc65ed7f7
+  .word 0xdf72ecf1
+  .word 0x3196a128
+  .word 0x3c262a80
+  .word 0x55d11ab6
+  .word 0xcbed2646
+  .word 0xf391e010
+  .word 0xb6c7d812
+  .word 0x207502fb
+  .word 0x03f71b03
+  .word 0x6b03dec4
+  .word 0xf623a0dd
+  .word 0xd13e5f86
+  .word 0x398ec9f8
+  .word 0xd906f68c
+  .word 0x7e7b5d23
+  .word 0x1f2ca574
+  .word 0x660128de
+  .word 0xf2d2e39b
+  .word 0x6357a98b
+  .word 0x24eb0bce
+  .word 0x2241cbe9
+  .word 0x18a4c324
+  .word 0xcd200aaf
+  .word 0xac011974
+  .word 0x5435703c
+  .word 0x627050a9
+  .word 0xcc9b4143
+  .word 0x60e51918
+  .word 0x7965ebac
+  .word 0x4e700e53
+  .word 0x13007293
+  .word 0xe99bafe4
+  .word 0x0124878f
+  .word 0x3be2b834
+  .word 0x7651fdaa
+  .word 0x46edfd3e
+  .word 0x4ac3abe7
+  .word 0x295a7da3
+  .word 0x51a84a52
+
+/* output buffer */
+.balign 32
+result:
+.zero 256
+
+/* buffer for Montgomery constant RR */
+.balign 32
+RR:
+.zero 256
+
+/* buffer for Montgomery constant m0inv */
+.balign 32
+m0inv:
+.zero 32
diff --git a/sw/otbn/crypto/tests/rsa_2048_enc_test.exp b/sw/otbn/crypto/tests/rsa_2048_enc_test.exp
new file mode 100644
index 0000000000000..a8f658f900318
--- /dev/null
+++ b/sw/otbn/crypto/tests/rsa_2048_enc_test.exp
@@ -0,0 +1,10 @@
+# Expected result (base ^ 65537) mod n =
+# 0x95fb986cd4aeee4b013effc1d183670380a9e2133ecc6a38dbbfff3f8ef20e1923a5e3741eac8772ee80f28994968fcabd6d454b7791263872bc68d97b6f4fbb76cee24f205d812ad36f2fcb6c11145943009a051c39c18c45b53ee19e51df0254b31eb991783718fb35c51dec249956bceb0276eaee88d8ecdeae2c08ac62a0018408af3923206e911a7ecf6ad786255fa69d63d333e6f44ebd3f5e6ebb7c82443c694d913e200492c89f046943f2dc7d8cf9951c6a33fa721558d1956fb552349ded082714be6a8bff775fd05162744d229fc9fac72509476bdc6434e5187bf3a1cc426cc13f0a10dcf0d15f28abcecfe5674782f232464b1a890d42b6fdd0
+w0 = 0xf3a1cc426cc13f0a10dcf0d15f28abcecfe5674782f232464b1a890d42b6fdd0
+w1 = 0x349ded082714be6a8bff775fd05162744d229fc9fac72509476bdc6434e5187b
+w2 = 0x443c694d913e200492c89f046943f2dc7d8cf9951c6a33fa721558d1956fb552
+w3 = 0x018408af3923206e911a7ecf6ad786255fa69d63d333e6f44ebd3f5e6ebb7c82
+w4 = 0x54b31eb991783718fb35c51dec249956bceb0276eaee88d8ecdeae2c08ac62a0
+w5 = 0x76cee24f205d812ad36f2fcb6c11145943009a051c39c18c45b53ee19e51df02
+w6 = 0x23a5e3741eac8772ee80f28994968fcabd6d454b7791263872bc68d97b6f4fbb
+w7 = 0x95fb986cd4aeee4b013effc1d183670380a9e2133ecc6a38dbbfff3f8ef20e19
diff --git a/sw/otbn/crypto/tests/rsa_2048_enc_test.s b/sw/otbn/crypto/tests/rsa_2048_enc_test.s
new file mode 100644
index 0000000000000..6721228f4a518
--- /dev/null
+++ b/sw/otbn/crypto/tests/rsa_2048_enc_test.s
@@ -0,0 +1,201 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+
+.section .text.start
+
+/**
+ * Standalone RSA-2048 modexp test with e=65537 (encryption/verification).
+ */
+main:
+  /* Clear w31 by XORing it with itself (all-zero wide register). */
+  bn.xor  w31, w31, w31
+
+  /* x30 = number of 256-bit limbs (8 * 256 = 2048 bits). */
+  li    x30, 8
+
+  /* x16 -> modulus, x17 -> m0inv buffer, x18 -> RR buffer. */
+  la    x16, modulus
+  la    x17, m0inv
+  la    x18, RR
+
+  /* Compute the Montgomery constants for this modulus. */
+  jal      x1, modload
+
+  /* Run exponentiation with the fixed public exponent (no exp buffer):
+       dmem[result] = dmem[base]^65537 mod dmem[modulus] */
+  la       x14, base
+  la       x2, result
+  jal      x1, modexp_65537
+
+  /* Copy result limb i into wide register w<i>, for i = 0..x30-1. */
+  la       x21, result
+  li       x8, 0
+  loop     x30, 2
+    bn.lid   x8, 0(x21++)
+    addi     x8, x8, 1
+
+  ecall
+
+.data
+
+/* Modulus n =
+
+0xb5ed720fe7e1b4a65494e8e9421df94910811d23854cb07b08a34508b682b188b16fa70e4804b4c4f54a54ae2a10848abc9253ac7c6085e5b9abcbcd48515db1626b01df4e7f5f1c85b9ce1b4c8d0f77f3854c8bc4f350ad4d993a6815d0d62ac83b47a257adb40023e1acf003d27953f19c5cbede1af58e42ef12ad9907c20ca428f8b7dbb6f3434936b1108d17ee343d7127f8885ff2513eb834c17bf1c4ddec0d61cc26f5f683c10c0e48676608811e9341f2898f690bc9fafd3b7e46d375e2178a141faf0d637767da550de4c5b9939af133ceba7cd2734df4ad269c166180afd8c35060de8ac302ca911aa3f92d139ed1595523a7f6c201cfafed4c17b5
+ */
+.balign 32
+modulus:
+  .word 0xed4c17b5
+  .word 0xc201cfaf
+  .word 0x5523a7f6
+  .word 0x139ed159
+  .word 0x1aa3f92d
+  .word 0xc302ca91
+  .word 0x5060de8a
+  .word 0x80afd8c3
+  .word 0x269c1661
+  .word 0x734df4ad
+  .word 0xceba7cd2
+  .word 0x939af133
+  .word 0x0de4c5b9
+  .word 0x7767da55
+  .word 0x1faf0d63
+  .word 0xe2178a14
+  .word 0x7e46d375
+  .word 0xc9fafd3b
+  .word 0x898f690b
+  .word 0x1e9341f2
+  .word 0x67660881
+  .word 0xc10c0e48
+  .word 0x26f5f683
+  .word 0xec0d61cc
+  .word 0x7bf1c4dd
+  .word 0x3eb834c1
+  .word 0x885ff251
+  .word 0x3d7127f8
+  .word 0x8d17ee34
+  .word 0x4936b110
+  .word 0xdbb6f343
+  .word 0xa428f8b7
+  .word 0x9907c20c
+  .word 0x42ef12ad
+  .word 0xde1af58e
+  .word 0xf19c5cbe
+  .word 0x03d27953
+  .word 0x23e1acf0
+  .word 0x57adb400
+  .word 0xc83b47a2
+  .word 0x15d0d62a
+  .word 0x4d993a68
+  .word 0xc4f350ad
+  .word 0xf3854c8b
+  .word 0x4c8d0f77
+  .word 0x85b9ce1b
+  .word 0x4e7f5f1c
+  .word 0x626b01df
+  .word 0x48515db1
+  .word 0xb9abcbcd
+  .word 0x7c6085e5
+  .word 0xbc9253ac
+  .word 0x2a10848a
+  .word 0xf54a54ae
+  .word 0x4804b4c4
+  .word 0xb16fa70e
+  .word 0xb682b188
+  .word 0x08a34508
+  .word 0x854cb07b
+  .word 0x10811d23
+  .word 0x421df949
+  .word 0x5494e8e9
+  .word 0xe7e1b4a6
+  .word 0xb5ed720f
+
+
+/* Base for exponentiation (corresponds to plaintext for encryption or
+   signature for verification).
+
+   Raw hex value (randomly generated) =
+0x6add9548af50f1bea3cb921205a5bb92ee325e01d160e3738a09aa0df7050e6051d693440f0d00cdd56cee5a748ff3b48b1df7be05808ad20068ad387b8b5e4c25c79bba9f87ef971da926f644c26d4273829fd69db71f9eded2cd1a33c367578550346ada160daa272940dd6fc10dae4a0facef437ece40130301c1b847203cc0defd3620ce89d96fa21d30ee63e458b0198adc842f68af8b462df6014955ab68f663a9b5e77caf15a517ab0931308bf9591cecc7691780a2f3bd99d3ce25433d31537e7cab1b4c07d99199e9517132188150d38d633c2b3ef6ba6fb40504e800fca580beb7a19f2315adb451be690fc4f87ea5914d28d5562dc1dce115a852
+ */
+.balign 32
+base:
+  .word 0xe115a852
+  .word 0x562dc1dc
+  .word 0x914d28d5
+  .word 0xc4f87ea5
+  .word 0x51be690f
+  .word 0x2315adb4
+  .word 0xbeb7a19f
+  .word 0x00fca580
+  .word 0xb40504e8
+  .word 0x3ef6ba6f
+  .word 0x8d633c2b
+  .word 0x188150d3
+  .word 0xe9517132
+  .word 0x07d99199
+  .word 0x7cab1b4c
+  .word 0x3d31537e
+  .word 0xd3ce2543
+  .word 0xa2f3bd99
+  .word 0xc7691780
+  .word 0xf9591cec
+  .word 0x0931308b
+  .word 0x15a517ab
+  .word 0xb5e77caf
+  .word 0x68f663a9
+  .word 0x014955ab
+  .word 0x8b462df6
+  .word 0x842f68af
+  .word 0xb0198adc
+  .word 0xee63e458
+  .word 0x6fa21d30
+  .word 0x20ce89d9
+  .word 0xc0defd36
+  .word 0xb847203c
+  .word 0x130301c1
+  .word 0x437ece40
+  .word 0x4a0facef
+  .word 0x6fc10dae
+  .word 0x272940dd
+  .word 0xda160daa
+  .word 0x8550346a
+  .word 0x33c36757
+  .word 0xded2cd1a
+  .word 0x9db71f9e
+  .word 0x73829fd6
+  .word 0x44c26d42
+  .word 0x1da926f6
+  .word 0x9f87ef97
+  .word 0x25c79bba
+  .word 0x7b8b5e4c
+  .word 0x0068ad38
+  .word 0x05808ad2
+  .word 0x8b1df7be
+  .word 0x748ff3b4
+  .word 0xd56cee5a
+  .word 0x0f0d00cd
+  .word 0x51d69344
+  .word 0xf7050e60
+  .word 0x8a09aa0d
+  .word 0xd160e373
+  .word 0xee325e01
+  .word 0x05a5bb92
+  .word 0xa3cb9212
+  .word 0xaf50f1be
+  .word 0x6add9548
+
+/* output buffer */
+.balign 32
+result:
+.zero 256
+
+/* buffer for Montgomery constant RR */
+.balign 32
+RR:
+.zero 256
+
+/* buffer for Montgomery constant m0inv */
+.balign 32
+m0inv:
+.zero 32
diff --git a/sw/otbn/crypto/tests/rsa_3072_dec_test.exp b/sw/otbn/crypto/tests/rsa_3072_dec_test.exp
new file mode 100644
index 0000000000000..55a478ea26c50
--- /dev/null
+++ b/sw/otbn/crypto/tests/rsa_3072_dec_test.exp
@@ -0,0 +1,14 @@
+# Expected value:
+# 0x77d133acf99844910deadefd84b95fc010959a01e040c559c691ac8ff0410b369453478a7ca56f74e3f6a1ea1fef9ef490d8a9c0bd385c49e7b3934e93a52e44d49a7737b8153b295d9baf4ef032d00c61609458ddeeaf73a243670ce7fb188e20fb15b6c01c08c825d5f67547c679a1693dd04360813be3cd28c6e5a0d1dca66b410977470710a1f0b3463659be0e6d5946a4adccfae5e555a9360f44dec7b2a311ea186a6bc574fe00b89dc254481c78db835a1971ae2b22ce2caa06dee69a6b25fbef290e351a3aafc3850265ed51dc3237ea918727f9419aa4c335ba80f69a5205d277ff71b47b939780366179f7471ba6b451c21c2d4c288daa2ffc9fc4349e498c2d869021dc9214406c51ee9735d0341225efbb549f3e7b2939e90d211ebeaf5a2711926d53a32c790616502d02c483f3b357d23b958d554e478246175a12b90c2970c8ed47e9d376923812f8913cda3a6d88bd93f576cb143072c473156ae1e3925977b3b76bc804f2a5feeec49499c54463b55921e4c0d24e0bb41d
+w0 = 0x156ae1e3925977b3b76bc804f2a5feeec49499c54463b55921e4c0d24e0bb41d
+w1 = 0x5a12b90c2970c8ed47e9d376923812f8913cda3a6d88bd93f576cb143072c473
+w2 = 0x1ebeaf5a2711926d53a32c790616502d02c483f3b357d23b958d554e47824617
+w3 = 0x349e498c2d869021dc9214406c51ee9735d0341225efbb549f3e7b2939e90d21
+w4 = 0x9a5205d277ff71b47b939780366179f7471ba6b451c21c2d4c288daa2ffc9fc4
+w5 = 0x6b25fbef290e351a3aafc3850265ed51dc3237ea918727f9419aa4c335ba80f6
+w6 = 0xa311ea186a6bc574fe00b89dc254481c78db835a1971ae2b22ce2caa06dee69a
+w7 = 0x6b410977470710a1f0b3463659be0e6d5946a4adccfae5e555a9360f44dec7b2
+w8 = 0x20fb15b6c01c08c825d5f67547c679a1693dd04360813be3cd28c6e5a0d1dca6
+w9 = 0xd49a7737b8153b295d9baf4ef032d00c61609458ddeeaf73a243670ce7fb188e
+w10 = 0x9453478a7ca56f74e3f6a1ea1fef9ef490d8a9c0bd385c49e7b3934e93a52e44
+w11 = 0x77d133acf99844910deadefd84b95fc010959a01e040c559c691ac8ff0410b36
diff --git a/sw/otbn/crypto/tests/rsa_3072_dec_test.s b/sw/otbn/crypto/tests/rsa_3072_dec_test.s
new file mode 100644
index 0000000000000..4e6a599729ca6
--- /dev/null
+++ b/sw/otbn/crypto/tests/rsa_3072_dec_test.s
@@ -0,0 +1,368 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+
+.section .text.start
+
+/**
+ * Standalone RSA-3072 modexp test with secret exponent (decryption/signing).
+ */
+main:
+  /* Clear w31 by XORing it with itself (all-zero wide register). */
+  bn.xor  w31, w31, w31
+
+  /* x30 = number of 256-bit limbs (12 * 256 = 3072 bits). */
+  li    x30, 12
+
+  /* x16 -> modulus, x17 -> m0inv buffer, x18 -> RR buffer. */
+  la    x16, modulus
+  la    x17, m0inv
+  la    x18, RR
+
+  /* Compute the Montgomery constants for this modulus. */
+  jal      x1, modload
+
+  /* Run exponentiation with the private exponent stored at exp:
+       dmem[result] = dmem[base]^dmem[exp] mod dmem[modulus] */
+  la       x14, base
+  la       x15, exp
+  la       x2, result
+  jal      x1, modexp
+
+  /* Copy result limb i into wide register w<i>, for i = 0..x30-1. */
+  la       x21, result
+  li       x8, 0
+  loop     x30, 2
+    bn.lid   x8, 0(x21++)
+    addi     x8, x8, 1
+
+  ecall
+
+
+.data
+
+/* Modulus n =
+0xb2e73fd1e1dce003def2f2795a1400f2514256a70fe83d64ae8464f114839c94d975c89f97b3598b48de7a560b867b4967ae92d3552f0b204c000b0841f5fac3ef0ba000acfb517a995cf708e46c670a885626d7865ebc5bccc509bc562a4ffc956eb3b859e43bc83debe4888e3e6a55de852c027a874b9c803598a78f4196800db785d91730e8708b8cef986c6d326c9a003201737cb3f5e42cd601c47d74898105671d446b9a5c8a835286f419682fc4b69e79a8d2f9f6aabca5b0c311dabe6fb19d3e03045a729b3107f21370935c6de2316876afae55aeb4da07b8a04aafc1f7717f8d571f47c1a0f395e4ce78ed581db853bda1cb6f224fc4b2c6244611d416b2e729c54ef638d7bd94483b11b56b7b613c06b2564c08de82ef33dff23892e183fd6e96713bfaf76b792c4e8071f3dd5ad695e3748179bebb97140efaabce02687b401b93a513b80b5ec334d38c0b331f90d1454c9b8f3b87017b5174f1d2b75c27fff6e89a3ae099fb0455b5cc9d3bd4840baf510e4d80dbbac4049efb
+ */
+.balign 32
+modulus:
+  .word 0xc4049efb
+  .word 0x4d80dbba
+  .word 0x0baf510e
+  .word 0x9d3bd484
+  .word 0x0455b5cc
+  .word 0x3ae099fb
+  .word 0xfff6e89a
+  .word 0xd2b75c27
+  .word 0x7b5174f1
+  .word 0x8f3b8701
+  .word 0xd1454c9b
+  .word 0x0b331f90
+  .word 0xc334d38c
+  .word 0x13b80b5e
+  .word 0x401b93a5
+  .word 0xce02687b
+  .word 0x140efaab
+  .word 0x79bebb97
+  .word 0x95e37481
+  .word 0xf3dd5ad6
+  .word 0x2c4e8071
+  .word 0xfaf76b79
+  .word 0x6e96713b
+  .word 0x92e183fd
+  .word 0x33dff238
+  .word 0x08de82ef
+  .word 0x06b2564c
+  .word 0x6b7b613c
+  .word 0x483b11b5
+  .word 0x38d7bd94
+  .word 0x29c54ef6
+  .word 0xd416b2e7
+  .word 0xc6244611
+  .word 0x224fc4b2
+  .word 0xbda1cb6f
+  .word 0x581db853
+  .word 0xe4ce78ed
+  .word 0xc1a0f395
+  .word 0x8d571f47
+  .word 0xc1f7717f
+  .word 0xb8a04aaf
+  .word 0xaeb4da07
+  .word 0x76afae55
+  .word 0x6de23168
+  .word 0x1370935c
+  .word 0x9b3107f2
+  .word 0x03045a72
+  .word 0x6fb19d3e
+  .word 0xc311dabe
+  .word 0xaabca5b0
+  .word 0xa8d2f9f6
+  .word 0xc4b69e79
+  .word 0xf419682f
+  .word 0x8a835286
+  .word 0x446b9a5c
+  .word 0x8105671d
+  .word 0xc47d7489
+  .word 0xe42cd601
+  .word 0x737cb3f5
+  .word 0x9a003201
+  .word 0x6c6d326c
+  .word 0x8b8cef98
+  .word 0x1730e870
+  .word 0x0db785d9
+  .word 0x8f419680
+  .word 0x803598a7
+  .word 0x7a874b9c
+  .word 0xde852c02
+  .word 0x8e3e6a55
+  .word 0x3debe488
+  .word 0x59e43bc8
+  .word 0x956eb3b8
+  .word 0x562a4ffc
+  .word 0xccc509bc
+  .word 0x865ebc5b
+  .word 0x885626d7
+  .word 0xe46c670a
+  .word 0x995cf708
+  .word 0xacfb517a
+  .word 0xef0ba000
+  .word 0x41f5fac3
+  .word 0x4c000b08
+  .word 0x552f0b20
+  .word 0x67ae92d3
+  .word 0x0b867b49
+  .word 0x48de7a56
+  .word 0x97b3598b
+  .word 0xd975c89f
+  .word 0x14839c94
+  .word 0xae8464f1
+  .word 0x0fe83d64
+  .word 0x514256a7
+  .word 0x5a1400f2
+  .word 0xdef2f279
+  .word 0xe1dce003
+  .word 0xb2e73fd1
+
+/* Base for exponentiation (corresponds to ciphertext for decryption or
+   message for signing).
+
+   Raw hex value =
+0x1273e84d4509b08a748a9bf81808f0a2387616159d3b04b32ab172285440f09f69f53e37a7cb6a6fc3fb0626a4dad7b64417570b4e112843bd3c82030fb4a01ba9ba2c194e98d51372b9a63153d7279c62452592d597c85dc493d167735918d89c4aa86d20073a0d6ce2a3bf7dc603d73efb38c5ff6fb191db117f8cf4cb0b46e995bfa0e4cee3a055fc44e496989f7248e95c6e3f4879d2c38118f51a0910d37030ddf0a39a0e6e0e3e4be9b0d12e8d25f337657e7f752fa63defebd91b2c107a00475fde78b38da4ca12c17160a8d68f6eecb60481e6ae3f3a0dc03ebb82f327dfe85f850d05215af5f7c07cd5932aaec3d17339e75b2ec803d231188942231edd8c43a6bd9b7be638da9911604a3308117ad41cf697273550765bac72a499cf21d4c4406668f569e4b002d75de22d3e5e7d5c7b33131389cffad951b362865106352780474b3c79378169c6998388e5d2782557c228f0fcf1e132e0fc2d14fe03cca340568cfb5c07a00b052304fa0923f1dfcf627a58e0ea093a97af836
+ */
+.balign 32
+base:
+  .word 0xa97af836
+  .word 0x8e0ea093
+  .word 0xfcf627a5
+  .word 0xa0923f1d
+  .word 0xb052304f
+  .word 0xb5c07a00
+  .word 0x340568cf
+  .word 0x4fe03cca
+  .word 0x2e0fc2d1
+  .word 0x0fcf1e13
+  .word 0x557c228f
+  .word 0x8e5d2782
+  .word 0x9c699838
+  .word 0xc7937816
+  .word 0x780474b3
+  .word 0x65106352
+  .word 0x951b3628
+  .word 0x389cffad
+  .word 0xc7b33131
+  .word 0xd3e5e7d5
+  .word 0x2d75de22
+  .word 0x569e4b00
+  .word 0x4406668f
+  .word 0x9cf21d4c
+  .word 0xbac72a49
+  .word 0x73550765
+  .word 0x41cf6972
+  .word 0x308117ad
+  .word 0x911604a3
+  .word 0xbe638da9
+  .word 0x3a6bd9b7
+  .word 0x31edd8c4
+  .word 0x11889422
+  .word 0xec803d23
+  .word 0x339e75b2
+  .word 0xaaec3d17
+  .word 0x07cd5932
+  .word 0x15af5f7c
+  .word 0xf850d052
+  .word 0x327dfe85
+  .word 0x03ebb82f
+  .word 0xe3f3a0dc
+  .word 0x60481e6a
+  .word 0x68f6eecb
+  .word 0x17160a8d
+  .word 0xda4ca12c
+  .word 0xfde78b38
+  .word 0x07a00475
+  .word 0xbd91b2c1
+  .word 0xfa63defe
+  .word 0x57e7f752
+  .word 0xd25f3376
+  .word 0x9b0d12e8
+  .word 0xe0e3e4be
+  .word 0x0a39a0e6
+  .word 0x37030ddf
+  .word 0x51a0910d
+  .word 0x2c38118f
+  .word 0xe3f4879d
+  .word 0x248e95c6
+  .word 0x496989f7
+  .word 0x055fc44e
+  .word 0x0e4cee3a
+  .word 0x6e995bfa
+  .word 0xcf4cb0b4
+  .word 0x1db117f8
+  .word 0x5ff6fb19
+  .word 0x73efb38c
+  .word 0xf7dc603d
+  .word 0xd6ce2a3b
+  .word 0xd20073a0
+  .word 0x89c4aa86
+  .word 0x7735918d
+  .word 0xdc493d16
+  .word 0x2d597c85
+  .word 0xc6245259
+  .word 0x153d7279
+  .word 0x372b9a63
+  .word 0x94e98d51
+  .word 0xba9ba2c1
+  .word 0x30fb4a01
+  .word 0x3bd3c820
+  .word 0xb4e11284
+  .word 0x64417570
+  .word 0x6a4dad7b
+  .word 0xfc3fb062
+  .word 0x7a7cb6a6
+  .word 0xf69f53e3
+  .word 0x85440f09
+  .word 0x32ab1722
+  .word 0x59d3b04b
+  .word 0x23876161
+  .word 0x81808f0a
+  .word 0xa748a9bf
+  .word 0xd4509b08
+  .word 0x01273e84
+
+/* Private exponent d =
+0x1bf6782bb27d670843db3e5a0861d30a0cf86cf9dccb24796daba4e96796f0acf5566b1ec2c3d62da69c9b8b826ea92b7e88b34b53e7affa02d708e26808ee029d04f8a3d265cfc4f55eaa001a4ff54518ad3a91fa5f295ac1e55451bb380edb8071d6a66c6a778ba35e1110e506cd711180483234fb9bae60fdbf980514afd4e10ffbdc443b314192165bc6bbbfcf9f58ecc9e41f2c7126705d2fb00409c5e2ce274d882e0f1188006069504dac00f4626f56d2d637efb905d3c9a418c15c2a9f1b2f1d3fca1461d2b483d3ce354e56f24ebbea9197c2359af199d89cdaf737668626719923e8718ee4f5085ecb1b09aed5f539795ef462f173451e18d04939b2b090fdc6e75bd438be26cc7b0b8244810176d366e6f1b38144510d956f5ed8f5f3f51e50092b54945cf6ecc0a6f317cc44e487dd38f8b3e0f42841ff538d87b75d592fdca3ee5f1eedc81f0d9b2652b5058a3e50b9ab7d266eb0c681f6f829daec744b0cbf7d22d099e96cd3d1e29cb675ecaef5a7d99d35b84ca4d35c6b8d
+ */
+.balign 32
+exp:
+  .word 0xd35c6b8d
+  .word 0x35b84ca4
+  .word 0xf5a7d99d
+  .word 0xb675ecae
+  .word 0xd3d1e29c
+  .word 0xd099e96c
+  .word 0x0cbf7d22
+  .word 0xdaec744b
+  .word 0x81f6f829
+  .word 0x266eb0c6
+  .word 0x50b9ab7d
+  .word 0xb5058a3e
+  .word 0x0d9b2652
+  .word 0x1eedc81f
+  .word 0xdca3ee5f
+  .word 0xb75d592f
+  .word 0xff538d87
+  .word 0xe0f42841
+  .word 0xdd38f8b3
+  .word 0xcc44e487
+  .word 0xc0a6f317
+  .word 0x945cf6ec
+  .word 0x50092b54
+  .word 0xf5f3f51e
+  .word 0x956f5ed8
+  .word 0x8144510d
+  .word 0x66e6f1b3
+  .word 0x810176d3
+  .word 0x7b0b8244
+  .word 0x38be26cc
+  .word 0xc6e75bd4
+  .word 0xb2b090fd
+  .word 0x18d04939
+  .word 0xf173451e
+  .word 0x795ef462
+  .word 0xaed5f539
+  .word 0x5ecb1b09
+  .word 0x8ee4f508
+  .word 0x9923e871
+  .word 0x66862671
+  .word 0x9cdaf737
+  .word 0x9af199d8
+  .word 0x9197c235
+  .word 0xf24ebbea
+  .word 0xce354e56
+  .word 0xd2b483d3
+  .word 0x3fca1461
+  .word 0x9f1b2f1d
+  .word 0x18c15c2a
+  .word 0x05d3c9a4
+  .word 0xd637efb9
+  .word 0x626f56d2
+  .word 0x4dac00f4
+  .word 0x00606950
+  .word 0x2e0f1188
+  .word 0xce274d88
+  .word 0x0409c5e2
+  .word 0x705d2fb0
+  .word 0x1f2c7126
+  .word 0x58ecc9e4
+  .word 0xbbbfcf9f
+  .word 0x92165bc6
+  .word 0x443b3141
+  .word 0xe10ffbdc
+  .word 0x0514afd4
+  .word 0x60fdbf98
+  .word 0x34fb9bae
+  .word 0x11804832
+  .word 0xe506cd71
+  .word 0xa35e1110
+  .word 0x6c6a778b
+  .word 0x8071d6a6
+  .word 0xbb380edb
+  .word 0xc1e55451
+  .word 0xfa5f295a
+  .word 0x18ad3a91
+  .word 0x1a4ff545
+  .word 0xf55eaa00
+  .word 0xd265cfc4
+  .word 0x9d04f8a3
+  .word 0x6808ee02
+  .word 0x02d708e2
+  .word 0x53e7affa
+  .word 0x7e88b34b
+  .word 0x826ea92b
+  .word 0xa69c9b8b
+  .word 0xc2c3d62d
+  .word 0xf5566b1e
+  .word 0x6796f0ac
+  .word 0x6daba4e9
+  .word 0xdccb2479
+  .word 0x0cf86cf9
+  .word 0x0861d30a
+  .word 0x43db3e5a
+  .word 0xb27d6708
+  .word 0x1bf6782b
+
+
+/* output buffer */
+.balign 32
+result:
+.zero 384
+
+/* buffer for Montgomery constant RR */
+.balign 32
+RR:
+.zero 384
+
+/* buffer for Montgomery constant m0inv */
+.balign 32
+m0inv:
+.zero 32
diff --git a/sw/otbn/crypto/tests/rsa_3072_enc_test.exp b/sw/otbn/crypto/tests/rsa_3072_enc_test.exp
new file mode 100644
index 0000000000000..6ef8409ea67ed
--- /dev/null
+++ b/sw/otbn/crypto/tests/rsa_3072_enc_test.exp
@@ -0,0 +1,14 @@
+# Expected value:
+# 0x1273e84d4509b08a748a9bf81808f0a2387616159d3b04b32ab172285440f09f69f53e37a7cb6a6fc3fb0626a4dad7b64417570b4e112843bd3c82030fb4a01ba9ba2c194e98d51372b9a63153d7279c62452592d597c85dc493d167735918d89c4aa86d20073a0d6ce2a3bf7dc603d73efb38c5ff6fb191db117f8cf4cb0b46e995bfa0e4cee3a055fc44e496989f7248e95c6e3f4879d2c38118f51a0910d37030ddf0a39a0e6e0e3e4be9b0d12e8d25f337657e7f752fa63defebd91b2c107a00475fde78b38da4ca12c17160a8d68f6eecb60481e6ae3f3a0dc03ebb82f327dfe85f850d05215af5f7c07cd5932aaec3d17339e75b2ec803d231188942231edd8c43a6bd9b7be638da9911604a3308117ad41cf697273550765bac72a499cf21d4c4406668f569e4b002d75de22d3e5e7d5c7b33131389cffad951b362865106352780474b3c79378169c6998388e5d2782557c228f0fcf1e132e0fc2d14fe03cca340568cfb5c07a00b052304fa0923f1dfcf627a58e0ea093a97af836
+w0 = 0x4fe03cca340568cfb5c07a00b052304fa0923f1dfcf627a58e0ea093a97af836
+w1 = 0x65106352780474b3c79378169c6998388e5d2782557c228f0fcf1e132e0fc2d1
+w2 = 0x9cf21d4c4406668f569e4b002d75de22d3e5e7d5c7b33131389cffad951b3628
+w3 = 0x31edd8c43a6bd9b7be638da9911604a3308117ad41cf697273550765bac72a49
+w4 = 0x327dfe85f850d05215af5f7c07cd5932aaec3d17339e75b2ec803d2311889422
+w5 = 0x07a00475fde78b38da4ca12c17160a8d68f6eecb60481e6ae3f3a0dc03ebb82f
+w6 = 0x37030ddf0a39a0e6e0e3e4be9b0d12e8d25f337657e7f752fa63defebd91b2c1
+w7 = 0x6e995bfa0e4cee3a055fc44e496989f7248e95c6e3f4879d2c38118f51a0910d
+w8 = 0x89c4aa86d20073a0d6ce2a3bf7dc603d73efb38c5ff6fb191db117f8cf4cb0b4
+w9 = 0xba9ba2c194e98d51372b9a63153d7279c62452592d597c85dc493d167735918d
+w10 = 0xf69f53e37a7cb6a6fc3fb0626a4dad7b64417570b4e112843bd3c82030fb4a01
+w11 = 0x01273e84d4509b08a748a9bf81808f0a2387616159d3b04b32ab172285440f09
diff --git a/sw/otbn/crypto/tests/rsa_3072_enc_test.s b/sw/otbn/crypto/tests/rsa_3072_enc_test.s
new file mode 100644
index 0000000000000..374fff917c52c
--- /dev/null
+++ b/sw/otbn/crypto/tests/rsa_3072_enc_test.s
@@ -0,0 +1,264 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+
+.section .text.start
+
+/**
+ * Standalone RSA-3072 modexp test with e=65537 (encryption/verification).
+ */
+main:
+  /* Clear w31 by XORing it with itself (all-zero wide register). */
+  bn.xor  w31, w31, w31
+
+  /* x30 = number of 256-bit limbs (12 * 256 = 3072 bits). */
+  li    x30, 12
+
+  /* x16 -> modulus, x17 -> m0inv buffer, x18 -> RR buffer. */
+  la    x16, modulus
+  la    x17, m0inv
+  la    x18, RR
+
+  /* Compute the Montgomery constants for this modulus. */
+  jal      x1, modload
+
+  /* Run exponentiation with the fixed public exponent (no exp buffer):
+       dmem[result] = dmem[base]^65537 mod dmem[modulus] */
+  la       x14, base
+  la       x2, result
+  jal      x1, modexp_65537
+
+  /* Copy result limb i into wide register w<i>, for i = 0..x30-1. */
+  la       x21, result
+  li       x8, 0
+  loop     x30, 2
+    bn.lid   x8, 0(x21++)
+    addi     x8, x8, 1
+
+  ecall
+
+.data
+
+/* Modulus n =
+0xb2e73fd1e1dce003def2f2795a1400f2514256a70fe83d64ae8464f114839c94d975c89f97b3598b48de7a560b867b4967ae92d3552f0b204c000b0841f5fac3ef0ba000acfb517a995cf708e46c670a885626d7865ebc5bccc509bc562a4ffc956eb3b859e43bc83debe4888e3e6a55de852c027a874b9c803598a78f4196800db785d91730e8708b8cef986c6d326c9a003201737cb3f5e42cd601c47d74898105671d446b9a5c8a835286f419682fc4b69e79a8d2f9f6aabca5b0c311dabe6fb19d3e03045a729b3107f21370935c6de2316876afae55aeb4da07b8a04aafc1f7717f8d571f47c1a0f395e4ce78ed581db853bda1cb6f224fc4b2c6244611d416b2e729c54ef638d7bd94483b11b56b7b613c06b2564c08de82ef33dff23892e183fd6e96713bfaf76b792c4e8071f3dd5ad695e3748179bebb97140efaabce02687b401b93a513b80b5ec334d38c0b331f90d1454c9b8f3b87017b5174f1d2b75c27fff6e89a3ae099fb0455b5cc9d3bd4840baf510e4d80dbbac4049efb
+ */
+.balign 32
+modulus:
+  .word 0xc4049efb
+  .word 0x4d80dbba
+  .word 0x0baf510e
+  .word 0x9d3bd484
+  .word 0x0455b5cc
+  .word 0x3ae099fb
+  .word 0xfff6e89a
+  .word 0xd2b75c27
+  .word 0x7b5174f1
+  .word 0x8f3b8701
+  .word 0xd1454c9b
+  .word 0x0b331f90
+  .word 0xc334d38c
+  .word 0x13b80b5e
+  .word 0x401b93a5
+  .word 0xce02687b
+  .word 0x140efaab
+  .word 0x79bebb97
+  .word 0x95e37481
+  .word 0xf3dd5ad6
+  .word 0x2c4e8071
+  .word 0xfaf76b79
+  .word 0x6e96713b
+  .word 0x92e183fd
+  .word 0x33dff238
+  .word 0x08de82ef
+  .word 0x06b2564c
+  .word 0x6b7b613c
+  .word 0x483b11b5
+  .word 0x38d7bd94
+  .word 0x29c54ef6
+  .word 0xd416b2e7
+  .word 0xc6244611
+  .word 0x224fc4b2
+  .word 0xbda1cb6f
+  .word 0x581db853
+  .word 0xe4ce78ed
+  .word 0xc1a0f395
+  .word 0x8d571f47
+  .word 0xc1f7717f
+  .word 0xb8a04aaf
+  .word 0xaeb4da07
+  .word 0x76afae55
+  .word 0x6de23168
+  .word 0x1370935c
+  .word 0x9b3107f2
+  .word 0x03045a72
+  .word 0x6fb19d3e
+  .word 0xc311dabe
+  .word 0xaabca5b0
+  .word 0xa8d2f9f6
+  .word 0xc4b69e79
+  .word 0xf419682f
+  .word 0x8a835286
+  .word 0x446b9a5c
+  .word 0x8105671d
+  .word 0xc47d7489
+  .word 0xe42cd601
+  .word 0x737cb3f5
+  .word 0x9a003201
+  .word 0x6c6d326c
+  .word 0x8b8cef98
+  .word 0x1730e870
+  .word 0x0db785d9
+  .word 0x8f419680
+  .word 0x803598a7
+  .word 0x7a874b9c
+  .word 0xde852c02
+  .word 0x8e3e6a55
+  .word 0x3debe488
+  .word 0x59e43bc8
+  .word 0x956eb3b8
+  .word 0x562a4ffc
+  .word 0xccc509bc
+  .word 0x865ebc5b
+  .word 0x885626d7
+  .word 0xe46c670a
+  .word 0x995cf708
+  .word 0xacfb517a
+  .word 0xef0ba000
+  .word 0x41f5fac3
+  .word 0x4c000b08
+  .word 0x552f0b20
+  .word 0x67ae92d3
+  .word 0x0b867b49
+  .word 0x48de7a56
+  .word 0x97b3598b
+  .word 0xd975c89f
+  .word 0x14839c94
+  .word 0xae8464f1
+  .word 0x0fe83d64
+  .word 0x514256a7
+  .word 0x5a1400f2
+  .word 0xdef2f279
+  .word 0xe1dce003
+  .word 0xb2e73fd1
+
+
+/* Base for exponentiation (corresponds to plaintext for encryption or
+   signature for verification).
+
+   Raw hex value (randomly generated) =
+0x77d133acf99844910deadefd84b95fc010959a01e040c559c691ac8ff0410b369453478a7ca56f74e3f6a1ea1fef9ef490d8a9c0bd385c49e7b3934e93a52e44d49a7737b8153b295d9baf4ef032d00c61609458ddeeaf73a243670ce7fb188e20fb15b6c01c08c825d5f67547c679a1693dd04360813be3cd28c6e5a0d1dca66b410977470710a1f0b3463659be0e6d5946a4adccfae5e555a9360f44dec7b2a311ea186a6bc574fe00b89dc254481c78db835a1971ae2b22ce2caa06dee69a6b25fbef290e351a3aafc3850265ed51dc3237ea918727f9419aa4c335ba80f69a5205d277ff71b47b939780366179f7471ba6b451c21c2d4c288daa2ffc9fc4349e498c2d869021dc9214406c51ee9735d0341225efbb549f3e7b2939e90d211ebeaf5a2711926d53a32c790616502d02c483f3b357d23b958d554e478246175a12b90c2970c8ed47e9d376923812f8913cda3a6d88bd93f576cb143072c473156ae1e3925977b3b76bc804f2a5feeec49499c54463b55921e4c0d24e0bb41d
+ */
+.balign 32
+base:
+  .word 0x4e0bb41d
+  .word 0x21e4c0d2
+  .word 0x4463b559
+  .word 0xc49499c5
+  .word 0xf2a5feee
+  .word 0xb76bc804
+  .word 0x925977b3
+  .word 0x156ae1e3
+  .word 0x3072c473
+  .word 0xf576cb14
+  .word 0x6d88bd93
+  .word 0x913cda3a
+  .word 0x923812f8
+  .word 0x47e9d376
+  .word 0x2970c8ed
+  .word 0x5a12b90c
+  .word 0x47824617
+  .word 0x958d554e
+  .word 0xb357d23b
+  .word 0x02c483f3
+  .word 0x0616502d
+  .word 0x53a32c79
+  .word 0x2711926d
+  .word 0x1ebeaf5a
+  .word 0x39e90d21
+  .word 0x9f3e7b29
+  .word 0x25efbb54
+  .word 0x35d03412
+  .word 0x6c51ee97
+  .word 0xdc921440
+  .word 0x2d869021
+  .word 0x349e498c
+  .word 0x2ffc9fc4
+  .word 0x4c288daa
+  .word 0x51c21c2d
+  .word 0x471ba6b4
+  .word 0x366179f7
+  .word 0x7b939780
+  .word 0x77ff71b4
+  .word 0x9a5205d2
+  .word 0x35ba80f6
+  .word 0x419aa4c3
+  .word 0x918727f9
+  .word 0xdc3237ea
+  .word 0x0265ed51
+  .word 0x3aafc385
+  .word 0x290e351a
+  .word 0x6b25fbef
+  .word 0x06dee69a
+  .word 0x22ce2caa
+  .word 0x1971ae2b
+  .word 0x78db835a
+  .word 0xc254481c
+  .word 0xfe00b89d
+  .word 0x6a6bc574
+  .word 0xa311ea18
+  .word 0x44dec7b2
+  .word 0x55a9360f
+  .word 0xccfae5e5
+  .word 0x5946a4ad
+  .word 0x59be0e6d
+  .word 0xf0b34636
+  .word 0x470710a1
+  .word 0x6b410977
+  .word 0xa0d1dca6
+  .word 0xcd28c6e5
+  .word 0x60813be3
+  .word 0x693dd043
+  .word 0x47c679a1
+  .word 0x25d5f675
+  .word 0xc01c08c8
+  .word 0x20fb15b6
+  .word 0xe7fb188e
+  .word 0xa243670c
+  .word 0xddeeaf73
+  .word 0x61609458
+  .word 0xf032d00c
+  .word 0x5d9baf4e
+  .word 0xb8153b29
+  .word 0xd49a7737
+  .word 0x93a52e44
+  .word 0xe7b3934e
+  .word 0xbd385c49
+  .word 0x90d8a9c0
+  .word 0x1fef9ef4
+  .word 0xe3f6a1ea
+  .word 0x7ca56f74
+  .word 0x9453478a
+  .word 0xf0410b36
+  .word 0xc691ac8f
+  .word 0xe040c559
+  .word 0x10959a01
+  .word 0x84b95fc0
+  .word 0x0deadefd
+  .word 0xf9984491
+  .word 0x77d133ac
+
+/* output buffer */
+.balign 32
+result:
+.zero 384
+
+/* buffer for Montgomery constant RR */
+.balign 32
+RR:
+.zero 384
+
+/* buffer for Montgomery constant m0inv (32 bytes, as in the other RSA tests) */
+.balign 32
+m0inv:
+.zero 32
diff --git a/sw/otbn/crypto/tests/rsa_4096_enc_test.exp b/sw/otbn/crypto/tests/rsa_4096_enc_test.exp
new file mode 100644
index 0000000000000..c4b751cb8b87e
--- /dev/null
+++ b/sw/otbn/crypto/tests/rsa_4096_enc_test.exp
@@ -0,0 +1,18 @@
+# Expected result:
+# 0x74798a179a6112dcfabdd7a1b39dab868d30bf4bcd359bc6eaf3c9b3626089ecd0851b9d077cb5deafe302f71035e179b528af0b9bfc0055caee605fd12f7f1b2251bde06b292a7c1a69227ba00945bacf857252571fb71a94bd353ea9d0a94d0068060b44c7f5bd416be032723581c824799f44ed947eba9008d1cf0c1a21a139f0125494b07540be6f55b53bcdcd51ea2b8fd0af22d08a3a74d0ad55b7e2481dc14fec2bc701276175f9eaa84298536d21de5d92b970760fd8898a2b9212145ce94070d2e5d75a85cbed16c660ecab4e71bac82519d367336e5975676e58cc08208d24dce920812f607713415ba58038bd9745d4aba1de2b11788baaa5146e0ef5f8da023c7049dabfc84434f02c9bb5d488e0caaa2794374396420f3c29456fd16448c13a3da797e741879aa4b55b6eb69313c02366393bc7e64b0220fb46a0eb481afb669af2192964a13e37b8050bf5472456905fb224ef27d4e86684024766be8859d7ebc910b35af5ef334497929e2120afa5f0b46539a9b58a9e725a84db8c290f547733085ae8970cda95069c67064461c368c38e10c5e3f4ae84f7a87ae5850c73cde81c3021d50b0aa1af472dd0b08d5d983ec8fdec9366ef9c52ddf7de6f81bd22e1bb4f7356fa1d40d0384b46c2997ce8d85d6bb594624b8e97f5e6134ee2bd795d89b996000f2c2e49c13d808b1b46f0fde6a8e6715403e9ad
+w0 = 0xf5e6134ee2bd795d89b996000f2c2e49c13d808b1b46f0fde6a8e6715403e9ad
+w1 = 0xddf7de6f81bd22e1bb4f7356fa1d40d0384b46c2997ce8d85d6bb594624b8e97
+w2 = 0xa87ae5850c73cde81c3021d50b0aa1af472dd0b08d5d983ec8fdec9366ef9c52
+w3 = 0x84db8c290f547733085ae8970cda95069c67064461c368c38e10c5e3f4ae84f7
+w4 = 0x4766be8859d7ebc910b35af5ef334497929e2120afa5f0b46539a9b58a9e725a
+w5 = 0xa0eb481afb669af2192964a13e37b8050bf5472456905fb224ef27d4e8668402
+w6 = 0x6fd16448c13a3da797e741879aa4b55b6eb69313c02366393bc7e64b0220fb46
+w7 = 0x0ef5f8da023c7049dabfc84434f02c9bb5d488e0caaa2794374396420f3c2945
+w8 = 0x08208d24dce920812f607713415ba58038bd9745d4aba1de2b11788baaa5146e
+w9 = 0x5ce94070d2e5d75a85cbed16c660ecab4e71bac82519d367336e5975676e58cc
+w10 = 0x1dc14fec2bc701276175f9eaa84298536d21de5d92b970760fd8898a2b921214
+w11 = 0x39f0125494b07540be6f55b53bcdcd51ea2b8fd0af22d08a3a74d0ad55b7e248
+w12 = 0x0068060b44c7f5bd416be032723581c824799f44ed947eba9008d1cf0c1a21a1
+w13 = 0x2251bde06b292a7c1a69227ba00945bacf857252571fb71a94bd353ea9d0a94d
+w14 = 0xd0851b9d077cb5deafe302f71035e179b528af0b9bfc0055caee605fd12f7f1b
+w15 = 0x74798a179a6112dcfabdd7a1b39dab868d30bf4bcd359bc6eaf3c9b3626089ec
diff --git a/sw/otbn/crypto/tests/rsa_4096_enc_test.s b/sw/otbn/crypto/tests/rsa_4096_enc_test.s
new file mode 100644
index 0000000000000..d42ea3204cc67
--- /dev/null
+++ b/sw/otbn/crypto/tests/rsa_4096_enc_test.s
@@ -0,0 +1,329 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+
+.section .text.start
+
+/**
+ * Standalone RSA-4096 modexp with e=65537 (encryption/verification).
+ */
+main:
+  /* Init all-zero register (w31 <= w31 ^ w31 = 0). */
+  bn.xor  w31, w31, w31
+
+  /* Load number of 256-bit limbs (16 * 256 = 4096 bits). */
+  li    x30, 16
+
+  /* Load pointers to modulus and Montgomery constant buffers. */
+  la    x16, modulus
+  la    x17, m0inv
+  la    x18, RR
+
+  /* Compute Montgomery constants (presumably into dmem[m0inv], dmem[RR]). */
+  jal      x1, modload
+
+  /* Run exponentiation with the fixed public exponent e = 65537.
+       dmem[result] = dmem[base]^65537 mod dmem[modulus] */
+  la       x14, base
+  la       x2, result
+  jal      x1, modexp_65537
+
+  /* Copy all 16 limbs of the result into w0..w15 (x8 selects the WDR). */
+  la       x21, result
+  li       x8, 0
+  loop     x30, 2
+    bn.lid   x8, 0(x21++)
+    addi     x8, x8, 1
+
+  ecall
+
+.data
+
+/* Modulus n =
+0xb25ab5439b6a703a7ba169f099a766944a86466bd18324b2149a23564261af44f1087a7df201eb2dc9583de79d9db60edd4a17aee8ed7b9384de837d70f5f99ad91695d9c780dde5401f160ce02a6135df0ea2339617b962250cf810a2be45acd43b602eddf1be6321d236e6338272e5bd5cda251a896d1d65eb10e2308f9ba8bcf4fb0836a5439c8a86394acdf2a2a3d0b4ae41b75d52894a8d79adfd1cb8db977d42d4865cd9a426bf1156b86e541469ac5a54bc06231da1db901d548cf53f3f003f7cdeee9b1b9ca7b4049b0e36b8cc7fc6d62967ffbffa593aa5cfbb41c68df57003911cf3ba2516378eaa9ee36da6ce4b09d71f072a79615d5619c8132c5467b56eae8a5e2aaa56ac4aa5dc9f696f89dd0cd0f818cdc8b58c938b336f87179cbb52a6a2965a7fcd619a5b315d370bdefeca9cbd6ea39e853f39d39c14f797ca5c31535c89f883cdfbb3bb1934490b136e46f99d8e5411a2a8b73b2519f43d78ee5cc675dcbcfeac8ef693c09a1aa87785cb5713298fa2edfcc67497cc6dbbc5d911edf7b1b5a735f14ab1870b481cd35279c932c74902faf5f047d84e6bedb88c28fced24b3728c5d9dc1114c46bfded6531873e718372dad28aae0a3c4f06dd81542cb9192783a9107a0263c8add0f23b250472f50b18f0e7719a3ba58ba38bc9ab906f86d0507a44690aba5ee96ef1083c237f2f004bff60bc4ecfb99
+ */
+.balign 32
+modulus:
+  .word 0xc4ecfb99
+  .word 0x04bff60b
+  .word 0xc237f2f0
+  .word 0x96ef1083
+  .word 0x90aba5ee
+  .word 0x0507a446
+  .word 0xb906f86d
+  .word 0xba38bc9a
+  .word 0x19a3ba58
+  .word 0xb18f0e77
+  .word 0x50472f50
+  .word 0xdd0f23b2
+  .word 0xa0263c8a
+  .word 0x783a9107
+  .word 0x42cb9192
+  .word 0xf06dd815
+  .word 0xaae0a3c4
+  .word 0x372dad28
+  .word 0x1873e718
+  .word 0xbfded653
+  .word 0xc1114c46
+  .word 0x728c5d9d
+  .word 0xfced24b3
+  .word 0xedb88c28
+  .word 0x47d84e6b
+  .word 0x02faf5f0
+  .word 0xc932c749
+  .word 0x1cd35279
+  .word 0xb1870b48
+  .word 0xa735f14a
+  .word 0xedf7b1b5
+  .word 0xbbc5d911
+  .word 0x7497cc6d
+  .word 0xa2edfcc6
+  .word 0x5713298f
+  .word 0xa87785cb
+  .word 0x93c09a1a
+  .word 0xfeac8ef6
+  .word 0xc675dcbc
+  .word 0x3d78ee5c
+  .word 0x3b2519f4
+  .word 0x11a2a8b7
+  .word 0xf99d8e54
+  .word 0x0b136e46
+  .word 0xbb193449
+  .word 0x83cdfbb3
+  .word 0x535c89f8
+  .word 0x97ca5c31
+  .word 0xd39c14f7
+  .word 0x9e853f39
+  .word 0x9cbd6ea3
+  .word 0x0bdefeca
+  .word 0x5b315d37
+  .word 0x7fcd619a
+  .word 0xa6a2965a
+  .word 0x179cbb52
+  .word 0x8b336f87
+  .word 0xc8b58c93
+  .word 0xd0f818cd
+  .word 0x6f89dd0c
+  .word 0xa5dc9f69
+  .word 0xaa56ac4a
+  .word 0xae8a5e2a
+  .word 0x5467b56e
+  .word 0x19c8132c
+  .word 0x79615d56
+  .word 0xd71f072a
+  .word 0xa6ce4b09
+  .word 0xaa9ee36d
+  .word 0x2516378e
+  .word 0x911cf3ba
+  .word 0x8df57003
+  .word 0xcfbb41c6
+  .word 0xfa593aa5
+  .word 0x2967ffbf
+  .word 0xcc7fc6d6
+  .word 0x9b0e36b8
+  .word 0x9ca7b404
+  .word 0xdeee9b1b
+  .word 0x3f003f7c
+  .word 0x548cf53f
+  .word 0xa1db901d
+  .word 0xbc06231d
+  .word 0x69ac5a54
+  .word 0xb86e5414
+  .word 0x26bf1156
+  .word 0x865cd9a4
+  .word 0x977d42d4
+  .word 0xfd1cb8db
+  .word 0x4a8d79ad
+  .word 0xb75d5289
+  .word 0xd0b4ae41
+  .word 0xcdf2a2a3
+  .word 0x8a86394a
+  .word 0x36a5439c
+  .word 0xbcf4fb08
+  .word 0x308f9ba8
+  .word 0x65eb10e2
+  .word 0x1a896d1d
+  .word 0xbd5cda25
+  .word 0x338272e5
+  .word 0x21d236e6
+  .word 0xddf1be63
+  .word 0xd43b602e
+  .word 0xa2be45ac
+  .word 0x250cf810
+  .word 0x9617b962
+  .word 0xdf0ea233
+  .word 0xe02a6135
+  .word 0x401f160c
+  .word 0xc780dde5
+  .word 0xd91695d9
+  .word 0x70f5f99a
+  .word 0x84de837d
+  .word 0xe8ed7b93
+  .word 0xdd4a17ae
+  .word 0x9d9db60e
+  .word 0xc9583de7
+  .word 0xf201eb2d
+  .word 0xf1087a7d
+  .word 0x4261af44
+  .word 0x149a2356
+  .word 0xd18324b2
+  .word 0x4a86466b
+  .word 0x99a76694
+  .word 0x7ba169f0
+  .word 0x9b6a703a
+  .word 0xb25ab543
+
+
+/* Base for exponentiation (corresponds to plaintext for encryption or
+   signature for verification).
+
+   Raw hex value (randomly generated) =
+0x9e67bf21cfb170bd70edb7b9ffb99fbfe6a681f9e17bc8a966bf55d54794b95f9c4ff3657f3eef86433035ec3cc1fd4c092498a59f3fb5ac0b29c1a7a429130509229a001a86f72182354886779211a3f38ae8b864d094f875cc30bfce8df4a999cd6e43ab25c786ebb4d78bd6f439b278937d6d092be28d986564faab071878f0b4982b70af87c2261a0fc4d58b4c5d227cd880b40af25828988a730746b711cd6aaeec67f07b40df881cc8b784f944f4dc9fccac096631baf8ec17201fbacab0f09cbd2e816495820f6a5d7263ab5dd72cee1c1145327e2696066b6103304206c29ace7f13d92b3a7edf3cb9dc3fe5d2da7c22d16319f2fcdf44a9cf14de57cc75f9395b0d1ebd90c107b74ca88d8c99be1e5a4a41d1fa2285ccd8580f4a6206fb4d0cae5945bcd33b5f1308025f660bf96e3e448216b02da98b86d8e9d633e311b2f19fce4dbaa6317d04aaf360ea9245bd0bb70811e64d87accf8ab6339b063ba26b085c85e369f37c2c62a485fab7f2b22edd5f4f6365c3e47fae372ea3e530796473835384e77187ac856b9ebbc3c10f1a0394e9a9c25a8e635a55ac907a011119aa5d00edc26f0b64e9972391ba545a03e003624e0624f824c22710237e8f97a07ffcff0106684f0c17b8df6f975bdd5f286a95f7635416b3e9129aa81e4cee9932dfe177f7f33897412fdde0e8b87f6cc0c54ab2c8f022dcb7fef768
+ */
+.balign 32
+base:
+  .word 0xb7fef768
+  .word 0xc8f022dc
+  .word 0xc0c54ab2
+  .word 0xe8b87f6c
+  .word 0x412fdde0
+  .word 0xf7f33897
+  .word 0x32dfe177
+  .word 0x1e4cee99
+  .word 0xe9129aa8
+  .word 0x635416b3
+  .word 0x286a95f7
+  .word 0x975bdd5f
+  .word 0x17b8df6f
+  .word 0x06684f0c
+  .word 0x7ffcff01
+  .word 0x7e8f97a0
+  .word 0xc2271023
+  .word 0x0624f824
+  .word 0xe003624e
+  .word 0xba545a03
+  .word 0xe9972391
+  .word 0xc26f0b64
+  .word 0xaa5d00ed
+  .word 0x7a011119
+  .word 0x5a55ac90
+  .word 0xc25a8e63
+  .word 0x0394e9a9
+  .word 0xc3c10f1a
+  .word 0x856b9ebb
+  .word 0xe77187ac
+  .word 0x73835384
+  .word 0xe5307964
+  .word 0xae372ea3
+  .word 0x65c3e47f
+  .word 0xdd5f4f63
+  .word 0xb7f2b22e
+  .word 0x62a485fa
+  .word 0x69f37c2c
+  .word 0x085c85e3
+  .word 0x063ba26b
+  .word 0x8ab6339b
+  .word 0x4d87accf
+  .word 0xb70811e6
+  .word 0x9245bd0b
+  .word 0xaaf360ea
+  .word 0xa6317d04
+  .word 0x9fce4dba
+  .word 0xe311b2f1
+  .word 0xd8e9d633
+  .word 0x2da98b86
+  .word 0x448216b0
+  .word 0x0bf96e3e
+  .word 0x08025f66
+  .word 0xd33b5f13
+  .word 0xae5945bc
+  .word 0x06fb4d0c
+  .word 0x580f4a62
+  .word 0x2285ccd8
+  .word 0x4a41d1fa
+  .word 0x99be1e5a
+  .word 0x4ca88d8c
+  .word 0x90c107b7
+  .word 0x5b0d1ebd
+  .word 0xcc75f939
+  .word 0xcf14de57
+  .word 0xfcdf44a9
+  .word 0xd16319f2
+  .word 0xd2da7c22
+  .word 0xb9dc3fe5
+  .word 0x3a7edf3c
+  .word 0x7f13d92b
+  .word 0x06c29ace
+  .word 0x61033042
+  .word 0x2696066b
+  .word 0x1145327e
+  .word 0xd72cee1c
+  .word 0x7263ab5d
+  .word 0x820f6a5d
+  .word 0x2e816495
+  .word 0xb0f09cbd
+  .word 0x201fbaca
+  .word 0xbaf8ec17
+  .word 0xac096631
+  .word 0xf4dc9fcc
+  .word 0xb784f944
+  .word 0xdf881cc8
+  .word 0x67f07b40
+  .word 0xcd6aaeec
+  .word 0x0746b711
+  .word 0x28988a73
+  .word 0xb40af258
+  .word 0x227cd880
+  .word 0xd58b4c5d
+  .word 0x261a0fc4
+  .word 0x70af87c2
+  .word 0xf0b4982b
+  .word 0xab071878
+  .word 0x986564fa
+  .word 0x092be28d
+  .word 0x78937d6d
+  .word 0xd6f439b2
+  .word 0xebb4d78b
+  .word 0xab25c786
+  .word 0x99cd6e43
+  .word 0xce8df4a9
+  .word 0x75cc30bf
+  .word 0x64d094f8
+  .word 0xf38ae8b8
+  .word 0x779211a3
+  .word 0x82354886
+  .word 0x1a86f721
+  .word 0x09229a00
+  .word 0xa4291305
+  .word 0x0b29c1a7
+  .word 0x9f3fb5ac
+  .word 0x092498a5
+  .word 0x3cc1fd4c
+  .word 0x433035ec
+  .word 0x7f3eef86
+  .word 0x9c4ff365
+  .word 0x4794b95f
+  .word 0x66bf55d5
+  .word 0xe17bc8a9
+  .word 0xe6a681f9
+  .word 0xffb99fbf
+  .word 0x70edb7b9
+  .word 0xcfb170bd
+  .word 0x9e67bf21
+
+
+/* Output buffer for the modexp result (512 bytes = 4096 bits). */
+.balign 32
+result:
+.zero 512
+
+/* Buffer for Montgomery constant RR (512 bytes = 4096 bits). */
+.balign 32
+RR:
+.zero 512
+
+/* Buffer for Montgomery constant m0inv (32 bytes = 256 bits). */
+.balign 32
+m0inv:
+.zero 32
diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkp_good_test.exp b/sw/otbn/crypto/tests/rsa_keygen_checkp_good_test.exp
new file mode 100644
index 0000000000000..250028a7d63f7
--- /dev/null
+++ b/sw/otbn/crypto/tests/rsa_keygen_checkp_good_test.exp
@@ -0,0 +1,2 @@
+# Expect 2^256 - 1 (check passed).
+w24 = 0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkp_good_test.s b/sw/otbn/crypto/tests/rsa_keygen_checkp_good_test.s
new file mode 100644
index 0000000000000..19d5c6f5bc7f0
--- /dev/null
+++ b/sw/otbn/crypto/tests/rsa_keygen_checkp_good_test.s
@@ -0,0 +1,31 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Ensure that a good value for p passes RSA keygen checks.
+ *
+ * Uses the test data from `rsa_keygen_checkpq_test_data`, which is sized for
+ * RSA-2048.
+ */
+
+.section .text.start
+
+main:
+  /* Init all-zero register (w31 <= 0). */
+  bn.xor    w31, w31, w31
+
+  /* Load the number of limbs for this test (x30 = 4, x31 = 3 = limbs - 1). */
+  li        x30, 4
+  li        x31, 3
+
+  /* Load required constants. NOTE(review): presumably WDR indices for check_p. */
+  li        x20, 20
+  li        x21, 21
+
+  /* Check an acceptable value of p (dmem[good_p]).
+       w24 <= 2^256-1 if the check passed, otherwise 0 */
+  la        x16, good_p
+  jal       x1, check_p
+
+  ecall
diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkp_not_prime_test.exp b/sw/otbn/crypto/tests/rsa_keygen_checkp_not_prime_test.exp
new file mode 100644
index 0000000000000..75275f176e56d
--- /dev/null
+++ b/sw/otbn/crypto/tests/rsa_keygen_checkp_not_prime_test.exp
@@ -0,0 +1,2 @@
+# Expect 0 (check failed).
+w24 = 0
diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkp_not_prime_test.s b/sw/otbn/crypto/tests/rsa_keygen_checkp_not_prime_test.s
new file mode 100644
index 0000000000000..658faaf11550f
--- /dev/null
+++ b/sw/otbn/crypto/tests/rsa_keygen_checkp_not_prime_test.s
@@ -0,0 +1,31 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Ensure that a nonprime value for p fails RSA keygen checks.
+ *
+ * Uses the test data from `rsa_keygen_checkpq_test_data`, which is sized for
+ * RSA-2048.
+ */
+
+.section .text.start
+
+main:
+  /* Init all-zero register (w31 <= 0). */
+  bn.xor    w31, w31, w31
+
+  /* Load the number of limbs for this test (x30 = 4, x31 = 3 = limbs - 1). */
+  li        x30, 4
+  li        x31, 3
+
+  /* Load required constants. NOTE(review): presumably WDR indices for check_p. */
+  li        x20, 20
+  li        x21, 21
+
+  /* Check a value of p that is nonprime (dmem[not_prime]).
+       w24 <= 2^256-1 if the check passed, otherwise 0 */
+  la        x16, not_prime
+  jal       x1, check_p
+
+  ecall
diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkp_not_relprime_test.exp b/sw/otbn/crypto/tests/rsa_keygen_checkp_not_relprime_test.exp
new file mode 100644
index 0000000000000..75275f176e56d
--- /dev/null
+++ b/sw/otbn/crypto/tests/rsa_keygen_checkp_not_relprime_test.exp
@@ -0,0 +1,2 @@
+# Expect 0 (check failed).
+w24 = 0
diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkp_not_relprime_test.s b/sw/otbn/crypto/tests/rsa_keygen_checkp_not_relprime_test.s
new file mode 100644
index 0000000000000..fc2023840c22c
--- /dev/null
+++ b/sw/otbn/crypto/tests/rsa_keygen_checkp_not_relprime_test.s
@@ -0,0 +1,31 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Ensure that a value of p with GCD(p-1, F4) != 1 fails RSA keygen checks.
+ *
+ * Uses the test data from `rsa_keygen_checkpq_test_data`, which is sized for
+ * RSA-2048.
+ */
+
+.section .text.start
+
+main:
+  /* Init all-zero register (w31 <= 0). */
+  bn.xor    w31, w31, w31
+
+  /* Load the number of limbs for this test (x30 = 4, x31 = 3 = limbs - 1). */
+  li        x30, 4
+  li        x31, 3
+
+  /* Load required constants. NOTE(review): presumably WDR indices for check_p. */
+  li        x20, 20
+  li        x21, 21
+
+  /* Check a value of p such that p-1 is not relatively prime to F4 (65537).
+       w24 <= 2^256-1 if the check passed, otherwise 0 */
+  la        x16, not_relprime
+  jal       x1, check_p
+
+  ecall
diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkpq_test.exp b/sw/otbn/crypto/tests/rsa_keygen_checkpq_test.exp
deleted file mode 100644
index 9fc6e72ba9642..0000000000000
--- a/sw/otbn/crypto/tests/rsa_keygen_checkpq_test.exp
+++ /dev/null
@@ -1,13 +0,0 @@
-# Expected values are the "good" p and q.
-
-# p = 0xe85547c5336579f83a2d50a611f489a4f2c3a918d2027fbc3f25c2de2dd36cdedc8901266de144a223b2c78a5a11024488a4aa2f4ef71f0fb93dfdbb2280b4d99dc9b3b77b039fd9fefcc3fe439e2bcb3db3ee3c0378a4d1297c1a5eebcd0d4ab3c0b50eb1511605c7c0907af31564ec5cc635e3de465e99cf6169c933ca0ab5
-w0 = 0xb3c0b50eb1511605c7c0907af31564ec5cc635e3de465e99cf6169c933ca0ab5
-w1 = 0x9dc9b3b77b039fd9fefcc3fe439e2bcb3db3ee3c0378a4d1297c1a5eebcd0d4a
-w2 = 0xdc8901266de144a223b2c78a5a11024488a4aa2f4ef71f0fb93dfdbb2280b4d9
-w3 = 0xe85547c5336579f83a2d50a611f489a4f2c3a918d2027fbc3f25c2de2dd36cde
-
-# q = 0xb863a172d3d5562b582f38e251e540b424d4cbadd5da0ce64cb755227227b9535e0ab2437c1522415a70211eaa1dc4b4192b33148b1226da2ed107b64beeac72b112d99b960df54e21336a13aef97b5ec8646752af38385314a81a531bced7da5a781f6b19d119805941c47777a7aa9580a35b9f75c7dd97545d70790d7e8e9d
-w4 = 0x5a781f6b19d119805941c47777a7aa9580a35b9f75c7dd97545d70790d7e8e9d
-w5 = 0xb112d99b960df54e21336a13aef97b5ec8646752af38385314a81a531bced7da
-w6 = 0x5e0ab2437c1522415a70211eaa1dc4b4192b33148b1226da2ed107b64beeac72
-w7 = 0xb863a172d3d5562b582f38e251e540b424d4cbadd5da0ce64cb755227227b953
diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkpq_test.s b/sw/otbn/crypto/tests/rsa_keygen_checkpq_test.s
deleted file mode 100644
index e977a97bd4274..0000000000000
--- a/sw/otbn/crypto/tests/rsa_keygen_checkpq_test.s
+++ /dev/null
@@ -1,531 +0,0 @@
-/* Copyright lowRISC contributors. */
-/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
-/* SPDX-License-Identifier: Apache-2.0 */
-
-/**
- * Standalone test for checks on RSA keygen p and q values.
- *
- * See FIPS 186-5 section A.1.3 for the full specification of requirements on p
- * and q. The value for p must satisfy:
- *   - p % 2 = 1
- *   - p >= sqrt(2)*(2^(nlen/2 - 1)), where nlen = RSA public key length
- *   - GCD(p-1,65537) = 1
- *   - p is probably prime
- *
- * For q, we need to satisfy the same requirements as p plus one more: q must
- * not be too close to p.  Specifically, we need to reject the value if:
- *   |p-q| < 2^(nlen/2 - 100).
- *
- * We don't test the oddness requirement here, since the `check_*` routines
- * require oddness as a precondition. However, all other requirements are
- * tested.
- *
- * For we use 4-limb (1024-bit) values for p and q in this test, which
- * correspond to RSA-2048.
- */
-
-.section .text.start
-
-main:
-  /* Init all-zero register. */
-  bn.xor    w31, w31, w31
-
-  /* Load the number of limbs for this test. */
-  li        x30, 4
-  li        x31, 3
-
-  /* Load required constants. */
-  li        x20, 20
-  li        x21, 21
-
-  /* Zeroize the buffer for q so that, if we never get to checking it and
-     writing any real data there, we don't get DMEM integrity errors when we
-     try to load it to registers. */
-  la        x2, zero
-  jal       x1, copy_to_rsa_q
-
-  /* Check a value of p that is too small. */
-  la        x16, too_small
-  jal       x1, test_bad_p
-
-  /* Check a value of p such that GCD(p-1, 65537) != 1. */
-  la        x16, not_relprime
-  jal       x1, test_bad_p
-
-  /* Check a value of p that is not prime. */
-  la        x16, not_prime
-  jal       x1, test_bad_p
-
-  /* Check a value of p that is acceptable. */
-  la        x16, good_p
-  jal       x1, check_p
-  jal       x1, last_check_to_x2
-
-  /* Copy the good value of p into dmem[rsa_p] for the q checks. */
-  la        x3, rsa_p
-  loop      x30, 2
-    bn.lid   x20, 0(x16++)
-    bn.sid   x20, 0(x3++)
-
-  /* If x2 != 0, the check failed; point to zeroes and exit. */
-  la        x16, zero
-  bne       x2, x0, _program_exit
-
-  /* Check a value of q that is too small. */
-  la        x2, too_small
-  jal       x1, test_bad_q
-
-  /* Check a value of q that is too close to p. */
-  la        x2, too_close
-  jal       x1, test_bad_q
-
-  /* Check a value of q that is acceptable. */
-  la        x2, good_q
-  jal       x1, copy_to_rsa_q
-  jal       x1, check_q
-  jal       x1, last_check_to_x2
-
-  /* If x2 == 0, the check passed; jump to exit without zeroing q. */
-  beq      x2, x0, _program_exit_load_p
-
-  /* If we get here, the good value of q failed; zeroize rsa_q. */
-  la        x2, zero
-  jal       x1, copy_to_rsa_q
-
-_program_exit_load_p:
-  /* This jump point sets x16=rsa_p so p is loaded from that buffer instead of
-     whatever's in x16. */
-  la        x16, rsa_p
-
-_program_exit:
-  /* Load the selected value of p (or bad value) into registers.
-       w0,w1,w2,w3 <= dmem[x16..x16+(4*32)] */
-  li         x3, 0
-  loop       x30, 2
-    bn.lid     x3, 0(x16++)
-    addi       x3, x3, 1
-
-  /* Load the selected value of q into registers.
-       w4,w5,w6,w7 <= dmem[rsa_q..rsa_q+(4*32)] */
-  la         x2, rsa_q
-  loop       x30, 2
-    bn.lid     x3, 0(x2++)
-    addi       x3, x3, 1
-
-  ecall
-
-/**
- * Copy the value to dmem[rsa_q].
- *
- * @param[in] x2: pointer to value to copy
- * @param[in] x30: number of limbs
- */
-copy_to_rsa_q:
-  la        x3, rsa_q
-  loop      x30, 2
-    bn.lid   x20, 0(x2++)
-    bn.sid   x20, 0(x3++)
-  ret
-
-/**
- * Test a bad value for p.
- *
- * @param[in] x16: pointer to value for test
- */
-test_bad_p:
-  /* Run checks and ensure they failed. */
-  jal       x1, check_p
-  jal       x1, last_check_to_x2
-
-  /* If x2 == 0, the check passed, so jump to the exit sequence. */
-  beq      x2, x0, _program_exit
-
-  /* If we get here, all is well; return to the caller. */
-  ret
-
-/**
- * Test a bad value for q.
- *
- * @param[in] x2: pointer to value for test
- */
-test_bad_q:
-  /* Copy the test value into dmem[rsa_q]. */
-  jal       x1, copy_to_rsa_q
-
-  /* Run checks and ensure they failed. */
-  jal       x1, check_q
-  jal       x1, last_check_to_x2
-
-  /* If x2 == 0, the check passed, so jump to the exit sequence. */
-  beq      x2, x0, _program_exit_load_p
-
-  /* If we get here, all is well; return to the caller. */
-  ret
-
-/**
- * Get the result of the last check in a register.
- *
- * The result is nonzero if the check FAILED, and zero if it passed.
- *
- * @param[in] w24: result of last check (all-1 or all-0).
- * @param[in] w31: all-zero.
- * @param[out] x2: 0 if w24 == 0, otherwise nonzero
- */
-last_check_to_x2:
-  /* Compare the result of the check to zero.
-       FG0.Z <= (w24 == 0) */
-  bn.cmp   w24, w31
-
-  /* Get the FG0.Z flag into a register.
-       x2 <= CSRs[FG0] & 8 = FG0.Z << 3 */
-  csrrs    x2, 0x7c0, x0
-  andi     x2, x2, 8
-  ret
-
-.data
-
-/* Note: Some of the Python scripts shown below reference the lower bound for
-   p/q as a Python variable called lower_bound. This value was generated and
-   checked for RSA-4096 as specified in BoringSSL:
-     https://boringssl.googlesource.com/boringssl/+/dcabfe2d8940529a69e007660fa7bf6c15954ecc/crypto/fipsmodule/rsa/rsa_impl.c#1006
-
-   The value for RSA-2048, as used in these tests, is simply the value for
-   RSA-4096 shifted right by 1024 bits. We can check it using:
-   >> lower_bound**2 < 2**2047 < (lower_bound+1)**2
-   True
-
-   For reference, the hex value of the RSA-2048 lower bound is:
-   0xb504f333f9de6484597d89b3754abe9f1d6f60ba893ba84ced17ac85833399154afc83043ab8a2c3a8b1fe6fdc83db390f74a85e439c7b4a780487363dfa2768d2202e8742af1f4e53059c6011bc337bcab1bc911688458a460abc722f7c4e33c6d5a8a38bb7e9dccb2a634331f3c84df52f120f836e582eeaa4a0899040ca4a
-*/
-
-/**
- * An odd 1024-bit value that is too small to be used for p or q.
- *
- * Specifically, this value is the highest prime number below the lower bound.
- *
- * Python script for generating the test data (using PyCryptoDome's
- * Crypto.Util.number package for the primality check):
-too_small = lower_bound - 1
-while True:
-  if math.gcd(too_small-1, 65537) != 1:
-    continue
-  if number.isPrime(too_small):
-    break
-  too_small -= 2
- *
- * Hex value for reference:
- * 0xb504f333f9de6484597d89b3754abe9f1d6f60ba893ba84ced17ac85833399154afc83043ab8a2c3a8b1fe6fdc83db390f74a85e439c7b4a780487363dfa2768d2202e8742af1f4e53059c6011bc337bcab1bc911688458a460abc722f7c4e33c6d5a8a38bb7e9dccb2a634331f3c84df52f120f836e582eeaa4a0899040c619
- */
-.balign 32
-too_small:
-  .word 0x9040c619
-  .word 0xeaa4a089
-  .word 0x836e582e
-  .word 0xf52f120f
-  .word 0x31f3c84d
-  .word 0xcb2a6343
-  .word 0x8bb7e9dc
-  .word 0xc6d5a8a3
-  .word 0x2f7c4e33
-  .word 0x460abc72
-  .word 0x1688458a
-  .word 0xcab1bc91
-  .word 0x11bc337b
-  .word 0x53059c60
-  .word 0x42af1f4e
-  .word 0xd2202e87
-  .word 0x3dfa2768
-  .word 0x78048736
-  .word 0x439c7b4a
-  .word 0x0f74a85e
-  .word 0xdc83db39
-  .word 0xa8b1fe6f
-  .word 0x3ab8a2c3
-  .word 0x4afc8304
-  .word 0x83339915
-  .word 0xed17ac85
-  .word 0x893ba84c
-  .word 0x1d6f60ba
-  .word 0x754abe9f
-  .word 0x597d89b3
-  .word 0xf9de6484
-  .word 0xb504f333
-
-/**
- * An 1024-bit value that doesn't satisfy relative primality with 65537.
- *
- * This number is selected to be larger than the lower bound and prime, so it
- * doesn't fail any other checks than GCD(p-1,e)=1.
- *
- * Python script for generating the test data (using PyCryptoDome's
- * Crypto.Util.number package for the primality check):
-while True:
-  y = random.randrange(lower_bound, (1 << 1024))
-  y -= (y % 65537)
-  if (y & 1 == 0) and number.isPrime(y+1):
-    break
-not_relprime = y+1
- *
- * Hex value for reference:
- * 0xf36b245b0051285df9f46be79c821a95584a00007b907c4102578d6c8c5d459c4328a174859c703e66bc706a9224e20f387da68e80a362fb1f0f36a912df95c26dc8b40902bff546d3aff671eea79a86df507180e0fba265c0ab601e582580f9fb18a62f9ff4e92d8d698408be08d7c24507244c6d3859be3804f2a7d9f16867
- */
-.balign 32
-not_relprime:
-  .word 0xd9f16867
-  .word 0x3804f2a7
-  .word 0x6d3859be
-  .word 0x4507244c
-  .word 0xbe08d7c2
-  .word 0x8d698408
-  .word 0x9ff4e92d
-  .word 0xfb18a62f
-  .word 0x582580f9
-  .word 0xc0ab601e
-  .word 0xe0fba265
-  .word 0xdf507180
-  .word 0xeea79a86
-  .word 0xd3aff671
-  .word 0x02bff546
-  .word 0x6dc8b409
-  .word 0x12df95c2
-  .word 0x1f0f36a9
-  .word 0x80a362fb
-  .word 0x387da68e
-  .word 0x9224e20f
-  .word 0x66bc706a
-  .word 0x859c703e
-  .word 0x4328a174
-  .word 0x8c5d459c
-  .word 0x02578d6c
-  .word 0x7b907c41
-  .word 0x584a0000
-  .word 0x9c821a95
-  .word 0xf9f46be7
-  .word 0x0051285d
-  .word 0xf36b245b
-
-/**
- * An 1024-bit value that passes other checks but isn't prime.
- *
- * Python script for generating the test data (using PyCryptoDome's
- * Crypto.Util.number package for the primality check):
-while True:
-  not_prime = random.randrange(lower_bound, (1 << 1024))
-  not_prime |= 1
-  if math.gcd(not_prime, 65537) != 1:
-    continue
-  if not number.isPrime(not_prime):
-    break
- *
- * Hex value for reference:
- * 0xecbbd72477e406de8ff72a93afbe19ed4258d3dd8cfa5b2a8b5c76d22053504710a8460c30c5141fc581df484e58a2bd019c03a1acab6c7fd70f9865ac6dcdcce4cca95266e4d2dea9a408b8ded6591daa4416bb7ca78357cad5c7d527d46a06807337d6845484589c8010eb6b674194608e1b9732db4e8cee053d2572158cf5
- */
-.balign 32
-not_prime:
-  .word 0x72158cf5
-  .word 0xee053d25
-  .word 0x32db4e8c
-  .word 0x608e1b97
-  .word 0x6b674194
-  .word 0x9c8010eb
-  .word 0x84548458
-  .word 0x807337d6
-  .word 0x27d46a06
-  .word 0xcad5c7d5
-  .word 0x7ca78357
-  .word 0xaa4416bb
-  .word 0xded6591d
-  .word 0xa9a408b8
-  .word 0x66e4d2de
-  .word 0xe4cca952
-  .word 0xac6dcdcc
-  .word 0xd70f9865
-  .word 0xacab6c7f
-  .word 0x019c03a1
-  .word 0x4e58a2bd
-  .word 0xc581df48
-  .word 0x30c5141f
-  .word 0x10a8460c
-  .word 0x20535047
-  .word 0x8b5c76d2
-  .word 0x8cfa5b2a
-  .word 0x4258d3dd
-  .word 0xafbe19ed
-  .word 0x8ff72a93
-  .word 0x77e406de
-  .word 0xecbbd724
-
-/**
- * An acceptable value for p.
- *
- * To make sure the checks on q are being tested, this value is specifically
- * chosen to be far enough away from the "bad" values of q that they wouldn't
- * be rejected on that basis.
- *
- * Python script for generating p (using PyCryptoDome's Crypto.Util.number
- * package for the primality check):
-while True:
-  p = random.randrange(lower_bound, 1 << 1024)
-  p |= 1
-  if abs(p - too_small) < (1 << 924):
-    continue
-  if abs(p - not_relprime) < (1 << 924):
-    continue
-  if abs(p - not_prime) < (1 << 924):
-    continue
-  if math.gcd(p-1, 65537) != 1:
-    continue
-  if number.isPrime(p):
-    break
- *
- * Hex value for reference:
- * 0xe85547c5336579f83a2d50a611f489a4f2c3a918d2027fbc3f25c2de2dd36cdedc8901266de144a223b2c78a5a11024488a4aa2f4ef71f0fb93dfdbb2280b4d99dc9b3b77b039fd9fefcc3fe439e2bcb3db3ee3c0378a4d1297c1a5eebcd0d4ab3c0b50eb1511605c7c0907af31564ec5cc635e3de465e99cf6169c933ca0ab5
- */
-.balign 32
-good_p:
-  .word 0x33ca0ab5
-  .word 0xcf6169c9
-  .word 0xde465e99
-  .word 0x5cc635e3
-  .word 0xf31564ec
-  .word 0xc7c0907a
-  .word 0xb1511605
-  .word 0xb3c0b50e
-  .word 0xebcd0d4a
-  .word 0x297c1a5e
-  .word 0x0378a4d1
-  .word 0x3db3ee3c
-  .word 0x439e2bcb
-  .word 0xfefcc3fe
-  .word 0x7b039fd9
-  .word 0x9dc9b3b7
-  .word 0x2280b4d9
-  .word 0xb93dfdbb
-  .word 0x4ef71f0f
-  .word 0x88a4aa2f
-  .word 0x5a110244
-  .word 0x23b2c78a
-  .word 0x6de144a2
-  .word 0xdc890126
-  .word 0x2dd36cde
-  .word 0x3f25c2de
-  .word 0xd2027fbc
-  .word 0xf2c3a918
-  .word 0x11f489a4
-  .word 0x3a2d50a6
-  .word 0x336579f8
-  .word 0xe85547c5
-
-/**
- * A value for q that is too close to p, but meets other requirements.
- *
- * Python script for generating test data (using PyCryptoDome's
- * Crypto.Util.number package for the primality check):
-while True:
-  too_close = random.randrange(p - (1 << 924), p + (1 << 924))
-  if too_close & 1 == 0:
-    continue
-  if too_close < lower_bound:
-    continue
-  if math.gcd(too_close - 1, 65537) != 1:
-    continue
-  if number.isPrime(too_close):
-    break
- *
- * Hex value for reference:
- * 0xe85547c5336579f83a2d50a60364d13462f8746c6177f91a902b276464b8c39d0ffeb8d77af899a932ed3198d0d3ca66948d678bf7e95f30e95014fdb0a3b13c56927a70b14191134664a3374ada1d0a3d3dfb0a8fbf3704ef0e8588eafebd9e81f0dca5b7b5cca8b753862a472ed36b8c820c618110ca8936e79789e4ec8b71
- */
-.balign 32
-too_close:
-  .word 0xe4ec8b71
-  .word 0x36e79789
-  .word 0x8110ca89
-  .word 0x8c820c61
-  .word 0x472ed36b
-  .word 0xb753862a
-  .word 0xb7b5cca8
-  .word 0x81f0dca5
-  .word 0xeafebd9e
-  .word 0xef0e8588
-  .word 0x8fbf3704
-  .word 0x3d3dfb0a
-  .word 0x4ada1d0a
-  .word 0x4664a337
-  .word 0xb1419113
-  .word 0x56927a70
-  .word 0xb0a3b13c
-  .word 0xe95014fd
-  .word 0xf7e95f30
-  .word 0x948d678b
-  .word 0xd0d3ca66
-  .word 0x32ed3198
-  .word 0x7af899a9
-  .word 0x0ffeb8d7
-  .word 0x64b8c39d
-  .word 0x902b2764
-  .word 0x6177f91a
-  .word 0x62f8746c
-  .word 0x0364d134
-  .word 0x3a2d50a6
-  .word 0x336579f8
-  .word 0xe85547c5
-
-/**
- * An acceptable value for q.
- *
- * Python script for generating q (using PyCryptoDome's Crypto.Util.number
- * package for the primality check):
-while True:
-  q = random.randrange(lower_bound, 1 << 1024)
-  q |= 1
-  if abs(p - q) < (1 << 924):
-    continue
-  if math.gcd(q-1, 65537) != 1:
-    continue
-  if number.isPrime(q):
-    break
- *
- * Hex value for reference:
- * 0xb863a172d3d5562b582f38e251e540b424d4cbadd5da0ce64cb755227227b9535e0ab2437c1522415a70211eaa1dc4b4192b33148b1226da2ed107b64beeac72b112d99b960df54e21336a13aef97b5ec8646752af38385314a81a531bced7da5a781f6b19d119805941c47777a7aa9580a35b9f75c7dd97545d70790d7e8e9d
- */
-.balign 32
-good_q:
-  .word 0x0d7e8e9d
-  .word 0x545d7079
-  .word 0x75c7dd97
-  .word 0x80a35b9f
-  .word 0x77a7aa95
-  .word 0x5941c477
-  .word 0x19d11980
-  .word 0x5a781f6b
-  .word 0x1bced7da
-  .word 0x14a81a53
-  .word 0xaf383853
-  .word 0xc8646752
-  .word 0xaef97b5e
-  .word 0x21336a13
-  .word 0x960df54e
-  .word 0xb112d99b
-  .word 0x4beeac72
-  .word 0x2ed107b6
-  .word 0x8b1226da
-  .word 0x192b3314
-  .word 0xaa1dc4b4
-  .word 0x5a70211e
-  .word 0x7c152241
-  .word 0x5e0ab243
-  .word 0x7227b953
-  .word 0x4cb75522
-  .word 0xd5da0ce6
-  .word 0x24d4cbad
-  .word 0x51e540b4
-  .word 0x582f38e2
-  .word 0xd3d5562b
-  .word 0xb863a172
-
-/**
- * Zeroes to point to for the "good value failed" case.
- */
-.balign 32
-zero:
-.zero 128
diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkpq_test_data.s b/sw/otbn/crypto/tests/rsa_keygen_checkpq_test_data.s
new file mode 100644
index 0000000000000..173267dd31afe
--- /dev/null
+++ b/sw/otbn/crypto/tests/rsa_keygen_checkpq_test_data.s
@@ -0,0 +1,371 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Data to test checks on RSA keygen p and q values.
+ *
+ * See FIPS 186-5 section A.1.3 for the full specification of requirements on p
+ * and q. The value for p must satisfy:
+ *   - p % 2 = 1
+ *   - p >= sqrt(2)*(2^(nlen/2 - 1)), where nlen = RSA public key length
+ *   - GCD(p-1,65537) = 1
+ *   - p is probably prime
+ *
+ * For q, we need to satisfy the same requirements as p plus one more: q must
+ * not be too close to p.  Specifically, we need to reject the value if:
+ *   |p-q| < 2^(nlen/2 - 100).
+ *
+ * This test data includes values of p and q that each fail exactly one
+ * condition, as well as two "good" values of p and q that are compatible with
+ * each other.
+ *
+ * This test data uses 4-limb (1024-bit) values for p and q, which correspond
+ * to RSA-2048.
+ */
+
+.data
+
+/* Note: Some of the Python scripts shown below reference the lower bound for
+   p/q as a Python variable called lower_bound. This value was generated and
+   checked for RSA-4096 as specified in BoringSSL:
+     https://boringssl.googlesource.com/boringssl/+/dcabfe2d8940529a69e007660fa7bf6c15954ecc/crypto/fipsmodule/rsa/rsa_impl.c#1006
+
+   The value for RSA-2048, as used in these tests, is simply the value for
+   RSA-4096 shifted right by 1024 bits. We can check it using:
+   >> lower_bound**2 < 2**2047 < (lower_bound+1)**2
+   True
+
+   For reference, the hex value of the RSA-2048 lower bound is:
+   0xb504f333f9de6484597d89b3754abe9f1d6f60ba893ba84ced17ac85833399154afc83043ab8a2c3a8b1fe6fdc83db390f74a85e439c7b4a780487363dfa2768d2202e8742af1f4e53059c6011bc337bcab1bc911688458a460abc722f7c4e33c6d5a8a38bb7e9dccb2a634331f3c84df52f120f836e582eeaa4a0899040ca4a
+*/
+
+/**
+ * An odd 1024-bit value that is too small to be used for p or q.
+ *
+ * Specifically, this value is the highest prime number below the lower bound.
+ *
+ * Python script for generating the test data (using PyCryptoDome's
+ * Crypto.Util.number package for the primality check):
+too_small = lower_bound + 1
+while True:
+  too_small -= 2
+  if math.gcd(too_small-1, 65537) != 1:
+    continue
+  if number.isPrime(too_small):
+    break
+ *
+ * Hex value for reference:
+ * 0xb504f333f9de6484597d89b3754abe9f1d6f60ba893ba84ced17ac85833399154afc83043ab8a2c3a8b1fe6fdc83db390f74a85e439c7b4a780487363dfa2768d2202e8742af1f4e53059c6011bc337bcab1bc911688458a460abc722f7c4e33c6d5a8a38bb7e9dccb2a634331f3c84df52f120f836e582eeaa4a0899040c619
+ */
+.balign 32
+.globl too_small
+too_small:
+  .word 0x9040c619
+  .word 0xeaa4a089
+  .word 0x836e582e
+  .word 0xf52f120f
+  .word 0x31f3c84d
+  .word 0xcb2a6343
+  .word 0x8bb7e9dc
+  .word 0xc6d5a8a3
+  .word 0x2f7c4e33
+  .word 0x460abc72
+  .word 0x1688458a
+  .word 0xcab1bc91
+  .word 0x11bc337b
+  .word 0x53059c60
+  .word 0x42af1f4e
+  .word 0xd2202e87
+  .word 0x3dfa2768
+  .word 0x78048736
+  .word 0x439c7b4a
+  .word 0x0f74a85e
+  .word 0xdc83db39
+  .word 0xa8b1fe6f
+  .word 0x3ab8a2c3
+  .word 0x4afc8304
+  .word 0x83339915
+  .word 0xed17ac85
+  .word 0x893ba84c
+  .word 0x1d6f60ba
+  .word 0x754abe9f
+  .word 0x597d89b3
+  .word 0xf9de6484
+  .word 0xb504f333
+
+/**
+ * A 1024-bit value that doesn't satisfy relative primality with 65537.
+ *
+ * This number is selected to be larger than the lower bound and prime, so it
+ * doesn't fail any other checks than GCD(p-1,e)=1.
+ *
+ * Python script for generating the test data (using PyCryptoDome's
+ * Crypto.Util.number package for the primality check):
+while True:
+  y = random.randrange(lower_bound, (1 << 1024))
+  y -= (y % 65537)
+  if (y & 1 == 0) and number.isPrime(y+1):
+    break
+not_relprime = y+1
+ *
+ * Hex value for reference:
+ * 0xf36b245b0051285df9f46be79c821a95584a00007b907c4102578d6c8c5d459c4328a174859c703e66bc706a9224e20f387da68e80a362fb1f0f36a912df95c26dc8b40902bff546d3aff671eea79a86df507180e0fba265c0ab601e582580f9fb18a62f9ff4e92d8d698408be08d7c24507244c6d3859be3804f2a7d9f16867
+ */
+.balign 32
+.globl not_relprime
+not_relprime:
+  .word 0xd9f16867
+  .word 0x3804f2a7
+  .word 0x6d3859be
+  .word 0x4507244c
+  .word 0xbe08d7c2
+  .word 0x8d698408
+  .word 0x9ff4e92d
+  .word 0xfb18a62f
+  .word 0x582580f9
+  .word 0xc0ab601e
+  .word 0xe0fba265
+  .word 0xdf507180
+  .word 0xeea79a86
+  .word 0xd3aff671
+  .word 0x02bff546
+  .word 0x6dc8b409
+  .word 0x12df95c2
+  .word 0x1f0f36a9
+  .word 0x80a362fb
+  .word 0x387da68e
+  .word 0x9224e20f
+  .word 0x66bc706a
+  .word 0x859c703e
+  .word 0x4328a174
+  .word 0x8c5d459c
+  .word 0x02578d6c
+  .word 0x7b907c41
+  .word 0x584a0000
+  .word 0x9c821a95
+  .word 0xf9f46be7
+  .word 0x0051285d
+  .word 0xf36b245b
+
+/**
+ * A 1024-bit value that passes other checks but isn't prime.
+ *
+ * Python script for generating the test data (using PyCryptoDome's
+ * Crypto.Util.number package for the primality check):
+while True:
+  not_prime = random.randrange(lower_bound, (1 << 1024))
+  not_prime |= 3
+  if math.gcd(not_prime-1, 65537) != 1:
+    continue
+  if not number.isPrime(not_prime):
+    break
+ *
+ * Hex value for reference:
+ * 0xecbbd72477e406de8ff72a93afbe19ed4258d3dd8cfa5b2a8b5c76d22053504710a8460c30c5141fc581df484e58a2bd019c03a1acab6c7fd70f9865ac6dcdcce4cca95266e4d2dea9a408b8ded6591daa4416bb7ca78357cad5c7d527d46a06807337d6845484589c8010eb6b674194608e1b9732db4e8cee053d2572158cf7
+ */
+.balign 32
+.globl not_prime
+not_prime:
+  .word 0x72158cf7
+  .word 0xee053d25
+  .word 0x32db4e8c
+  .word 0x608e1b97
+  .word 0x6b674194
+  .word 0x9c8010eb
+  .word 0x84548458
+  .word 0x807337d6
+  .word 0x27d46a06
+  .word 0xcad5c7d5
+  .word 0x7ca78357
+  .word 0xaa4416bb
+  .word 0xded6591d
+  .word 0xa9a408b8
+  .word 0x66e4d2de
+  .word 0xe4cca952
+  .word 0xac6dcdcc
+  .word 0xd70f9865
+  .word 0xacab6c7f
+  .word 0x019c03a1
+  .word 0x4e58a2bd
+  .word 0xc581df48
+  .word 0x30c5141f
+  .word 0x10a8460c
+  .word 0x20535047
+  .word 0x8b5c76d2
+  .word 0x8cfa5b2a
+  .word 0x4258d3dd
+  .word 0xafbe19ed
+  .word 0x8ff72a93
+  .word 0x77e406de
+  .word 0xecbbd724
+
+/**
+ * An acceptable value for p.
+ *
+ * To make sure the checks on q are being tested, this value is specifically
+ * chosen to be far enough away from the "bad" values of q that they wouldn't
+ * be rejected on that basis.
+ *
+ * Python script for generating p (using PyCryptoDome's Crypto.Util.number
+ * package for the primality check):
+while True:
+  p = random.randrange(lower_bound, 1 << 1024)
+  p |= 3
+  if abs(p - too_small) < (1 << 924):
+    continue
+  if abs(p - not_relprime) < (1 << 924):
+    continue
+  if abs(p - not_prime) < (1 << 924):
+    continue
+  if math.gcd(p-1, 65537) != 1:
+    continue
+  if number.isPrime(p):
+    break
+ *
+ * Hex value for reference:
+ * 0xd10b3338d7d2cca85be7b76c5497f2fe89a9f9b73e613262565636dbc5901c386b1df3c7b8eb3ac8548a9062a5958b33c84dfe0fa9e2c61250d75683be1585008f926d5cfc4d3a3f003746a3beefcc71d287133768fc0268e1f84cb791be8e6dfc48b706ee0515089ff618c0a648854d6a93e9a0452552e93720ffa2021fd53b
+ */
+.balign 32
+.globl good_p
+good_p:
+  .word 0x021fd53b
+  .word 0x3720ffa2
+  .word 0x452552e9
+  .word 0x6a93e9a0
+  .word 0xa648854d
+  .word 0x9ff618c0
+  .word 0xee051508
+  .word 0xfc48b706
+  .word 0x91be8e6d
+  .word 0xe1f84cb7
+  .word 0x68fc0268
+  .word 0xd2871337
+  .word 0xbeefcc71
+  .word 0x003746a3
+  .word 0xfc4d3a3f
+  .word 0x8f926d5c
+  .word 0xbe158500
+  .word 0x50d75683
+  .word 0xa9e2c612
+  .word 0xc84dfe0f
+  .word 0xa5958b33
+  .word 0x548a9062
+  .word 0xb8eb3ac8
+  .word 0x6b1df3c7
+  .word 0xc5901c38
+  .word 0x565636db
+  .word 0x3e613262
+  .word 0x89a9f9b7
+  .word 0x5497f2fe
+  .word 0x5be7b76c
+  .word 0xd7d2cca8
+  .word 0xd10b3338
+
+/**
+ * A value for q that is too close to p, but meets other requirements.
+ *
+ * Python script for generating test data (using PyCryptoDome's
+ * Crypto.Util.number package for the primality check):
+while True:
+  too_close = random.randrange(p - (1 << 924), p + (1 << 924))
+  too_close |= 3
+  if too_close < lower_bound:
+    continue
+  if math.gcd(too_close - 1, 65537) != 1:
+    continue
+  if number.isPrime(too_close):
+    break
+ *
+ * Hex value for reference:
+ * 0xd10b3338d7d2cca85be7b76c479a213a2646058cc86df4e6fb59ec553c4e93bcf9eab3ddcf6caf42e690294667a03e9bc11a94f9b78df5311f5ea7890eb161e7067d759143ff20425120197025aac542ca2cfd1dcfe3ebddeae1f19ece50583c83597856830a0827333d1b67d6d887a16c3f8fe156d119ee6a0b2ca6ba4f62fb
+ */
+.balign 32
+.globl too_close
+too_close:
+  .word 0xba4f62fb
+  .word 0x6a0b2ca6
+  .word 0x56d119ee
+  .word 0x6c3f8fe1
+  .word 0xd6d887a1
+  .word 0x333d1b67
+  .word 0x830a0827
+  .word 0x83597856
+  .word 0xce50583c
+  .word 0xeae1f19e
+  .word 0xcfe3ebdd
+  .word 0xca2cfd1d
+  .word 0x25aac542
+  .word 0x51201970
+  .word 0x43ff2042
+  .word 0x067d7591
+  .word 0x0eb161e7
+  .word 0x1f5ea789
+  .word 0xb78df531
+  .word 0xc11a94f9
+  .word 0x67a03e9b
+  .word 0xe6902946
+  .word 0xcf6caf42
+  .word 0xf9eab3dd
+  .word 0x3c4e93bc
+  .word 0xfb59ec55
+  .word 0xc86df4e6
+  .word 0x2646058c
+  .word 0x479a213a
+  .word 0x5be7b76c
+  .word 0xd7d2cca8
+  .word 0xd10b3338
+
+/**
+ * An acceptable value for q.
+ *
+ * Python script for generating q (using PyCryptoDome's Crypto.Util.number
+ * package for the primality check):
+while True:
+  q = random.randrange(lower_bound, 1 << 1024)
+  q |= 3
+  if abs(p - q) < (1 << 924):
+    continue
+  if math.gcd(q-1, 65537) != 1:
+    continue
+  if number.isPrime(q):
+    break
+ *
+ * Hex value for reference:
+ * 0xf83da3592c89b3b8972d1a8dd1de78d7b64a0b1cce4a54ca5125bfc16105ce43ebe4bc6b5e0088e37281d264d2081cf1097671eb3299e91a6c571e4b71cdd1144ca96ad7c45bd05e8e25e371ca8e2043cf73a30ba5e9c979f259bbc9476c1ab3693136e403ebe4e47542c7a6f4164d1a7e2938e65191c9aee6a3534a87c3f1ff
+ */
+.balign 32
+.globl good_q
+good_q:
+  .word 0x87c3f1ff
+  .word 0xe6a3534a
+  .word 0x5191c9ae
+  .word 0x7e2938e6
+  .word 0xf4164d1a
+  .word 0x7542c7a6
+  .word 0x03ebe4e4
+  .word 0x693136e4
+  .word 0x476c1ab3
+  .word 0xf259bbc9
+  .word 0xa5e9c979
+  .word 0xcf73a30b
+  .word 0xca8e2043
+  .word 0x8e25e371
+  .word 0xc45bd05e
+  .word 0x4ca96ad7
+  .word 0x71cdd114
+  .word 0x6c571e4b
+  .word 0x3299e91a
+  .word 0x097671eb
+  .word 0xd2081cf1
+  .word 0x7281d264
+  .word 0x5e0088e3
+  .word 0xebe4bc6b
+  .word 0x6105ce43
+  .word 0x5125bfc1
+  .word 0xce4a54ca
+  .word 0xb64a0b1c
+  .word 0xd1de78d7
+  .word 0x972d1a8d
+  .word 0x2c89b3b8
+  .word 0xf83da359
diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkq_good_test.exp b/sw/otbn/crypto/tests/rsa_keygen_checkq_good_test.exp
new file mode 100644
index 0000000000000..250028a7d63f7
--- /dev/null
+++ b/sw/otbn/crypto/tests/rsa_keygen_checkq_good_test.exp
@@ -0,0 +1,2 @@
+# Expect 2^256 - 1 (check passed).
+w24 = 0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkq_good_test.s b/sw/otbn/crypto/tests/rsa_keygen_checkq_good_test.s
new file mode 100644
index 0000000000000..f40b0508f5e3d
--- /dev/null
+++ b/sw/otbn/crypto/tests/rsa_keygen_checkq_good_test.s
@@ -0,0 +1,43 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Ensure that an acceptable value for q passes RSA keygen checks.
+ *
+ * Uses the test data from `rsa_keygen_checkpq_test_data`, which is sized for
+ * RSA-2048.
+ */
+
+.section .text.start
+
+main:
+  /* Init all-zero register. */
+  bn.xor    w31, w31, w31
+
+  /* Load the number of limbs for this test. */
+  li        x30, 4
+  li        x31, 3
+
+  /* Load required constants. */
+  li        x20, 20
+  li        x21, 21
+
+  /* Copy a "good" p value into `rsa_p`. */
+  la        x16, good_p
+  la        x3, rsa_p
+  loop      x30, 2
+    bn.lid   x20, 0(x16++)
+    bn.sid   x20, 0(x3++)
+
+  /* Copy the good value into `rsa_q`. */
+  la        x16, good_q
+  la        x3, rsa_q
+  loop      x30, 2
+    bn.lid   x20, 0(x16++)
+    bn.sid   x20, 0(x3++)
+
+  /* w24 <= 2^256-1 if the check passed, otherwise 0 */
+  jal       x1, check_q
+
+  ecall
diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkq_not_prime_test.exp b/sw/otbn/crypto/tests/rsa_keygen_checkq_not_prime_test.exp
new file mode 100644
index 0000000000000..75275f176e56d
--- /dev/null
+++ b/sw/otbn/crypto/tests/rsa_keygen_checkq_not_prime_test.exp
@@ -0,0 +1,2 @@
+# Expect 0 (check failed).
+w24 = 0
diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkq_not_prime_test.s b/sw/otbn/crypto/tests/rsa_keygen_checkq_not_prime_test.s
new file mode 100644
index 0000000000000..dfd18797fbeb6
--- /dev/null
+++ b/sw/otbn/crypto/tests/rsa_keygen_checkq_not_prime_test.s
@@ -0,0 +1,43 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Ensure that a nonprime value for q fails RSA keygen checks.
+ *
+ * Uses the test data from `rsa_keygen_checkpq_test_data`, which is sized for
+ * RSA-2048.
+ */
+
+.section .text.start
+
+main:
+  /* Init all-zero register. */
+  bn.xor    w31, w31, w31
+
+  /* Load the number of limbs for this test. */
+  li        x30, 4
+  li        x31, 3
+
+  /* Load required constants. */
+  li        x20, 20
+  li        x21, 21
+
+  /* Copy a "good" p value into `rsa_p`. */
+  la        x16, good_p
+  la        x3, rsa_p
+  loop      x30, 2
+    bn.lid   x20, 0(x16++)
+    bn.sid   x20, 0(x3++)
+
+  /* Copy the nonprime value into `rsa_q`. */
+  la        x16, not_prime
+  la        x3, rsa_q
+  loop      x30, 2
+    bn.lid   x20, 0(x16++)
+    bn.sid   x20, 0(x3++)
+
+  /* w24 <= 2^256-1 if the check passed, otherwise 0 */
+  jal       x1, check_q
+
+  ecall
diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkq_not_relprime_test.exp b/sw/otbn/crypto/tests/rsa_keygen_checkq_not_relprime_test.exp
new file mode 100644
index 0000000000000..75275f176e56d
--- /dev/null
+++ b/sw/otbn/crypto/tests/rsa_keygen_checkq_not_relprime_test.exp
@@ -0,0 +1,2 @@
+# Expect 0 (check failed).
+w24 = 0
diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkq_not_relprime_test.s b/sw/otbn/crypto/tests/rsa_keygen_checkq_not_relprime_test.s
new file mode 100644
index 0000000000000..5f13a25eea47a
--- /dev/null
+++ b/sw/otbn/crypto/tests/rsa_keygen_checkq_not_relprime_test.s
@@ -0,0 +1,43 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Ensure that a q for which q-1 is a multiple of F4 fails RSA keygen checks.
+ *
+ * Uses the test data from `rsa_keygen_checkpq_test_data`, which is sized for
+ * RSA-2048.
+ */
+
+.section .text.start
+
+main:
+  /* Init all-zero register. */
+  bn.xor    w31, w31, w31
+
+  /* Load the number of limbs for this test. */
+  li        x30, 4
+  li        x31, 3
+
+  /* Load required constants. */
+  li        x20, 20
+  li        x21, 21
+
+  /* Copy a "good" p value into `rsa_p`. */
+  la        x16, good_p
+  la        x3, rsa_p
+  loop      x30, 2
+    bn.lid   x20, 0(x16++)
+    bn.sid   x20, 0(x3++)
+
+  /* Copy the bad value into `rsa_q`. */
+  la        x16, not_relprime
+  la        x3, rsa_q
+  loop      x30, 2
+    bn.lid   x20, 0(x16++)
+    bn.sid   x20, 0(x3++)
+
+  /* w24 <= 2^256-1 if the check passed, otherwise 0 */
+  jal       x1, check_q
+
+  ecall
diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkq_too_close_test.exp b/sw/otbn/crypto/tests/rsa_keygen_checkq_too_close_test.exp
new file mode 100644
index 0000000000000..75275f176e56d
--- /dev/null
+++ b/sw/otbn/crypto/tests/rsa_keygen_checkq_too_close_test.exp
@@ -0,0 +1,2 @@
+# Expect 0 (check failed).
+w24 = 0
diff --git a/sw/otbn/crypto/tests/rsa_keygen_checkq_too_close_test.s b/sw/otbn/crypto/tests/rsa_keygen_checkq_too_close_test.s
new file mode 100644
index 0000000000000..d2c3abf5d897b
--- /dev/null
+++ b/sw/otbn/crypto/tests/rsa_keygen_checkq_too_close_test.s
@@ -0,0 +1,43 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Ensure that a value for q which is too close to p fails RSA keygen checks.
+ *
+ * Uses the test data from `rsa_keygen_checkpq_test_data`, which is sized for
+ * RSA-2048.
+ */
+
+.section .text.start
+
+main:
+  /* Init all-zero register. */
+  bn.xor    w31, w31, w31
+
+  /* Load the number of limbs for this test. */
+  li        x30, 4
+  li        x31, 3
+
+  /* Load required constants. */
+  li        x20, 20
+  li        x21, 21
+
+  /* Copy a "good" p value into `rsa_p`. */
+  la        x16, good_p
+  la        x3, rsa_p
+  loop      x30, 2
+    bn.lid   x20, 0(x16++)
+    bn.sid   x20, 0(x3++)
+
+  /* Copy the too-close value into `rsa_q`. */
+  la        x16, too_close
+  la        x3, rsa_q
+  loop      x30, 2
+    bn.lid   x20, 0(x16++)
+    bn.sid   x20, 0(x3++)
+
+  /* w24 <= 2^256-1 if the check passed, otherwise 0 */
+  jal       x1, check_q
+
+  ecall
diff --git a/sw/otbn/crypto/tests/x25519_test.exp b/sw/otbn/crypto/tests/x25519_test.exp
deleted file mode 100644
index 40dc093fc9864..0000000000000
--- a/sw/otbn/crypto/tests/x25519_test.exp
+++ /dev/null
@@ -1,2 +0,0 @@
-# Test failure counter in w0 is 0.
-w0 = 0x0
diff --git a/sw/otbn/crypto/tests/x25519_test.s b/sw/otbn/crypto/tests/x25519_test.s
deleted file mode 100644
index 6f5fc1e316ca7..0000000000000
--- a/sw/otbn/crypto/tests/x25519_test.s
+++ /dev/null
@@ -1,166 +0,0 @@
-/* Copyright lowRISC contributors. */
-/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
-/* SPDX-License-Identifier: Apache-2.0 */
-
-/**
- * Standalone tests for X25519.
- *
- * This test will exit with the number of failures written to the w0 register;
- * w0=0 means all tests succeeded.
- */
-
-.section .text.start
-
-main:
-  /* Initialize failure counter to 0.
-       w0 <= 0 */
-  bn.xor  w0, w0, w0
-
-  /* Run tests. */
-  jal  x1, run_test1
-  jal  x1, run_test2
-
-  ecall
-
-run_test1:
-  /* w8 <= dmem[test1_k] = enc(k) */
-  li      x2, 8
-  la      x3, test1_k
-  bn.lid  x2, 0(x3)
-
-  /* w9 <= dmem[test1_u] = enc(u) */
-  li      x2, 9
-  la      x3, test1_u
-  bn.lid  x2, 0(x3)
-
-  /* w22 <= X25519(k, u) */
-  jal     x1, X25519
-
-  /* w25 <= dmem[test1_exp_result] */
-  li      x2, 25
-  la      x3, test1_exp_result
-  bn.lid  x2, 0(x3)
-
-  jal     x1, check_result
-
-  ret
-
-run_test2:
-  /* w8 <= dmem[test2_k] = enc(k) */
-  li      x2, 8
-  la      x3, test2_k
-  bn.lid  x2, 0(x3)
-
-  /* w9 <= dmem[test2_u] = enc(u) */
-  li      x2, 9
-  la      x3, test2_u
-  bn.lid  x2, 0(x3)
-
-  /* w22 <= X25519(k, u) */
-  jal     x1, X25519
-
-  /* w25 <= dmem[test2_exp_result] */
-  li      x2, 25
-  la      x3, test2_exp_result
-  bn.lid  x2, 0(x3)
-
-  jal     x1, check_result
-
-  ret
-
-/**
- * Increment the failure counter if expected/actual results don't match.
- *
- * @param[in] w25: expected result
- * @param[in] w22: actual result
- * @param[in,out] w0: error count
- *
- * clobbered registers: w0, w1
- * clobbered flag groups: FG0
- */
-check_result:
-  /* Increment error register if expected < actual. */
-  bn.addi w1, w0, 1
-  bn.cmp  w22, w25
-  bn.sel  w0, w1, w0, C
-
-  /* Increment error register if actual < expected. */
-  bn.addi w1, w0, 1
-  bn.cmp  w25, w22
-  bn.sel  w0, w1, w0, C
-  ret
-
-.data
-
-/* Test vector 1 from RFC 7748, section 5.2:
-     https://datatracker.ietf.org/doc/html/rfc7748#section-5.2 */
-
-.balign 32
-test1_k:
-  .word 0x6be346a5
-  .word 0x9d7c52f0
-  .word 0x4b15163b
-  .word 0xdd5e4682
-  .word 0x0a4c1462
-  .word 0x185afcc1
-  .word 0x44226a50
-  .word 0xc49a44ba
-
-.balign 32
-test1_u:
-  .word 0x6768dbe6
-  .word 0xdb303058
-  .word 0xa4c19435
-  .word 0x7c5fb124
-  .word 0xec246672
-  .word 0x3b35b326
-  .word 0xa603a910
-  .word 0x4c1cabd0
-
-.balign 32
-test1_exp_result:
-  .word 0x3755dac3
-  .word 0x90c6e99d
-  .word 0x4dea948e
-  .word 0x4f088df2
-  .word 0x03cfec32
-  .word 0xf7711c49
-  .word 0x5507b454
-  .word 0x5285a277
-
-
-/* Test vector 2 from RFC 7748, section 5.2:
-     https://datatracker.ietf.org/doc/html/rfc7748#section-5.2 */
-
-.balign 32
-test2_k:
-  .word 0xd4e9664b
-  .word 0x3c67b4d1
-  .word 0x9126d25a
-  .word 0xf56a7d95
-  .word 0x21641bc1
-  .word 0xd401eae0
-  .word 0x9e16a42c
-  .word 0x0dba1879
-
-.balign 32
-test2_u:
-  .word 0x120f21e5
-  .word 0xd3116878
-  .word 0x9d95b7f4
-  .word 0x2cae3805
-  .word 0x10e7db31
-  .word 0x3e3cc06f
-  .word 0x49d54cfc
-  .word 0x93a415c7
-
-.balign 32
-test2_exp_result:
-  .word 0x94decb95
-  .word 0x7d90e876
-  .word 0x5ce4ad7a
-  .word 0xf873b8b4
-  .word 0x685a598b
-  .word 0x52a19f79
-  .word 0x64f7f8e6
-  .word 0x5779ac7a
diff --git a/sw/otbn/crypto/tests/x25519_test1.exp b/sw/otbn/crypto/tests/x25519_test1.exp
new file mode 100644
index 0000000000000..2ac426a6a0ecc
--- /dev/null
+++ b/sw/otbn/crypto/tests/x25519_test1.exp
@@ -0,0 +1,2 @@
+# Expected X25519(k, u) output for test vector 1 of RFC 7748, section 5.2.
+w22 = 0x5285a2775507b454f7711c4903cfec324f088df24dea948e90c6e99d3755dac3
diff --git a/sw/otbn/crypto/tests/x25519_test1.s b/sw/otbn/crypto/tests/x25519_test1.s
new file mode 100644
index 0000000000000..169f664b10fef
--- /dev/null
+++ b/sw/otbn/crypto/tests/x25519_test1.s
@@ -0,0 +1,52 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Standalone test for X25519.
+ *
+ * Runs test vector 1 from RFC 7748, section 5.2:
+ *   https://datatracker.ietf.org/doc/html/rfc7748#section-5.2
+ */
+
+.section .text.start
+
+main:
+  /* w8 <= dmem[k] = enc(k) */
+  li      x2, 8
+  la      x3, k
+  bn.lid  x2, 0(x3)
+
+  /* w9 <= dmem[u] = enc(u) */
+  li      x2, 9
+  la      x3, u
+  bn.lid  x2, 0(x3)
+
+  /* w22 <= X25519(k, u) */
+  jal     x1, X25519
+
+  ecall
+
+.data
+
+.balign 32
+k:
+  .word 0x6be346a5
+  .word 0x9d7c52f0
+  .word 0x4b15163b
+  .word 0xdd5e4682
+  .word 0x0a4c1462
+  .word 0x185afcc1
+  .word 0x44226a50
+  .word 0xc49a44ba
+
+.balign 32
+u:
+  .word 0x6768dbe6
+  .word 0xdb303058
+  .word 0xa4c19435
+  .word 0x7c5fb124
+  .word 0xec246672
+  .word 0x3b35b326
+  .word 0xa603a910
+  .word 0x4c1cabd0
diff --git a/sw/otbn/crypto/tests/x25519_test2.exp b/sw/otbn/crypto/tests/x25519_test2.exp
new file mode 100644
index 0000000000000..0ec3576a4eb09
--- /dev/null
+++ b/sw/otbn/crypto/tests/x25519_test2.exp
@@ -0,0 +1,2 @@
+# Expected X25519(k, u) output for test vector 2 of RFC 7748, section 5.2.
+w22 = 0x5779ac7a64f7f8e652a19f79685a598bf873b8b45ce4ad7a7d90e87694decb95
diff --git a/sw/otbn/crypto/tests/x25519_test2.s b/sw/otbn/crypto/tests/x25519_test2.s
new file mode 100644
index 0000000000000..a0947ed22e93b
--- /dev/null
+++ b/sw/otbn/crypto/tests/x25519_test2.s
@@ -0,0 +1,52 @@
+/* Copyright lowRISC contributors. */
+/* Licensed under the Apache License, Version 2.0, see LICENSE for details. */
+/* SPDX-License-Identifier: Apache-2.0 */
+
+/**
+ * Standalone test for X25519.
+ *
+ * Runs test vector 2 from RFC 7748, section 5.2:
+ *   https://datatracker.ietf.org/doc/html/rfc7748#section-5.2
+ */
+
+.section .text.start
+
+main:
+  /* w8 <= dmem[k] = enc(k) */
+  li      x2, 8
+  la      x3, k
+  bn.lid  x2, 0(x3)
+
+  /* w9 <= dmem[u] = enc(u) */
+  li      x2, 9
+  la      x3, u
+  bn.lid  x2, 0(x3)
+
+  /* w22 <= X25519(k, u) */
+  jal     x1, X25519
+
+  ecall
+
+.data
+
+.balign 32
+k:
+  .word 0xd4e9664b
+  .word 0x3c67b4d1
+  .word 0x9126d25a
+  .word 0xf56a7d95
+  .word 0x21641bc1
+  .word 0xd401eae0
+  .word 0x9e16a42c
+  .word 0x0dba1879
+
+.balign 32
+u:
+  .word 0x120f21e5
+  .word 0xd3116878
+  .word 0x9d95b7f4
+  .word 0x2cae3805
+  .word 0x10e7db31
+  .word 0x3e3cc06f
+  .word 0x49d54cfc
+  .word 0x93a415c7
diff --git a/sw/otbn/crypto/x25519.s b/sw/otbn/crypto/x25519.s
index facf6dfc6be5f..abc2c53e26a3a 100644
--- a/sw/otbn/crypto/x25519.s
+++ b/sw/otbn/crypto/x25519.s
@@ -38,7 +38,7 @@ X25519:
   li      x2, 2
   la      x3, modulus25519
   bn.lid  x2, 0(x3)
-  bn.wsrw 0x0, w2
+  bn.wsrw MOD, w2
 
   /* Decode scalar. From RFC 7748, section 5:
 
diff --git a/util/topgen/templates/chiplevel.sv.tpl b/util/topgen/templates/chiplevel.sv.tpl
index 8cd816a60e0bd..ca4bb3f8fae8f 100644
--- a/util/topgen/templates/chiplevel.sv.tpl
+++ b/util/topgen/templates/chiplevel.sv.tpl
@@ -1690,20 +1690,21 @@ module chip_${top["name"]}_${target["name"]} #(
 
   // Capture trigger.
   // We use the clkmgr_aon_idle signal of the IP of interest to form a precise capture trigger.
-  // GPIO[11:9] is used for selecting the IP of interest. The encoding is as follows (see
+  // GPIO[11:10] is used for selecting the IP of interest. The encoding is as follows (see
   // hint_names_e enum in clkmgr_pkg.sv for details).
   //
-  // IP              - GPIO[11:9] - Index for clkmgr_aon_idle
-  // ------------------------------------------------------------
-  //  AES            -   000      -  0
-  //  HMAC           -   001      -  1 - not implemented on CW305
-  //  KMAC           -   010      -  2 - not implemented on CW305
-  //  OTBN (IO_DIV4) -   011      -  3 - not implemented on CW305
-  //  OTBN           -   100      -  4 - not implemented on CW305
+  // IP              - GPIO[11:10] - Index for clkmgr_aon_idle
+  // -------------------------------------------------------------
+  //  AES            -   00       -  0
+  //  HMAC           -   01       -  1 - not implemented on CW305
+  //  KMAC           -   10       -  2 - not implemented on CW305
+  //  OTBN           -   11       -  3 - not implemented on CW305
   //
-  // In addition, GPIO8 is used for gating the capture trigger in software.
-  // Note that GPIO[11:8] are connected to LED[3:0] on the CW310.
-  // On the CW305, GPIO[9,8] are connected to LED[5,7].
+  // GPIO9 is used for gating the selected capture trigger in software. Alternatively, GPIO8
+  // can be used to implement a less precise but fully software-controlled capture trigger
+  // similar to what can be done on ASIC.
+  //
+  // Note that on the CW305, GPIO[9,8] are connected to LED[5(Green),7(Red)].
 
   prim_mubi_pkg::mubi4_t clk_trans_idle, manual_in_io_clk_idle;
 
@@ -1713,14 +1714,14 @@ module chip_${top["name"]}_${target["name"]} #(
   clkmgr_pkg::hint_names_e trigger_sel;
   always_comb begin : trigger_sel_mux
     % if top["name"] == "darjeeling":
-    unique case ({dio_out[DioGpioGpio11], dio_out[DioGpioGpio10], dio_out[DioGpioGpio9]})
+    unique case ({dio_out[DioGpioGpio11], dio_out[DioGpioGpio10]})
     % else:
-    unique case ({mio_out[MioOutGpioGpio11], mio_out[MioOutGpioGpio10], mio_out[MioOutGpioGpio9]})
+    unique case ({mio_out[MioOutGpioGpio11], mio_out[MioOutGpioGpio10]})
     % endif
-      3'b000:  trigger_sel = clkmgr_pkg::HintMainAes;
-      3'b001:  trigger_sel = clkmgr_pkg::HintMainHmac;
-      3'b010:  trigger_sel = clkmgr_pkg::HintMainKmac;
-      3'b100:  trigger_sel = clkmgr_pkg::HintMainOtbn;
+      2'b00:   trigger_sel = clkmgr_pkg::HintMainAes;
+      2'b01:   trigger_sel = clkmgr_pkg::HintMainHmac;
+      2'b10:   trigger_sel = clkmgr_pkg::HintMainKmac;
+      2'b11:   trigger_sel = clkmgr_pkg::HintMainOtbn;
       default: trigger_sel = clkmgr_pkg::HintMainAes;
     endcase;
   end
@@ -1730,28 +1731,50 @@ module chip_${top["name"]}_${target["name"]} #(
   logic clk_io_div4_trigger_en, manual_in_io_clk_trigger_en;
   logic clk_io_div4_trigger_oe, manual_in_io_clk_trigger_oe;
   % if top["name"] == "darjeeling":
-  assign clk_io_div4_trigger_en = dio_out[DioGpioGpio8];
-  assign clk_io_div4_trigger_oe = dio_oe[DioGpioGpio8];
+  logic clk_io_div4_trigger_hw_en, manual_in_io_clk_trigger_hw_en;
+  logic clk_io_div4_trigger_hw_oe, manual_in_io_clk_trigger_hw_oe;
+  logic clk_io_div4_trigger_sw_en, manual_in_io_clk_trigger_sw_en;
+  logic clk_io_div4_trigger_sw_oe, manual_in_io_clk_trigger_sw_oe;
+  assign clk_io_div4_trigger_hw_en = dio_out[DioGpioGpio9];
+  assign clk_io_div4_trigger_hw_oe = dio_oe[DioGpioGpio9];
+  assign clk_io_div4_trigger_sw_en = dio_out[DioGpioGpio8];
+  assign clk_io_div4_trigger_sw_oe = dio_oe[DioGpioGpio8];
   % else:
-  assign clk_io_div4_trigger_en = mio_out[MioOutGpioGpio8];
-  assign clk_io_div4_trigger_oe = mio_oe[MioOutGpioGpio8];
+  logic clk_io_div4_trigger_hw_en, manual_in_io_clk_trigger_hw_en;
+  logic clk_io_div4_trigger_hw_oe, manual_in_io_clk_trigger_hw_oe;
+  logic clk_io_div4_trigger_sw_en, manual_in_io_clk_trigger_sw_en;
+  logic clk_io_div4_trigger_sw_oe, manual_in_io_clk_trigger_sw_oe;
+  assign clk_io_div4_trigger_hw_en = mio_out[MioOutGpioGpio9];
+  assign clk_io_div4_trigger_hw_oe = mio_oe[MioOutGpioGpio9];
+  assign clk_io_div4_trigger_sw_en = mio_out[MioOutGpioGpio8];
+  assign clk_io_div4_trigger_sw_oe = mio_oe[MioOutGpioGpio8];
   % endif
 
   // Synchronize signals to manual_in_io_clk.
   prim_flop_2sync #(
-    .Width ($bits(clk_trans_idle) + 2)
+    .Width ($bits(clk_trans_idle) + 4)
   ) u_sync_trigger (
     .clk_i (manual_in_io_clk),
     .rst_ni(manual_in_por_n),
-    .d_i   ({clk_trans_idle,        clk_io_div4_trigger_en,      clk_io_div4_trigger_oe}),
-    .q_o   ({manual_in_io_clk_idle, manual_in_io_clk_trigger_en, manual_in_io_clk_trigger_oe})
+    .d_i   ({clk_trans_idle,
+             clk_io_div4_trigger_hw_en,
+             clk_io_div4_trigger_hw_oe,
+             clk_io_div4_trigger_sw_en,
+             clk_io_div4_trigger_sw_oe}),
+    .q_o   ({manual_in_io_clk_idle,
+             manual_in_io_clk_trigger_hw_en,
+             manual_in_io_clk_trigger_hw_oe,
+             manual_in_io_clk_trigger_sw_en,
+             manual_in_io_clk_trigger_sw_oe})
   );
 
-  // Generate the actual trigger signal.
+  // Generate the actual trigger signal as trigger_sw OR trigger_hw.
   assign manual_attr_io_trigger = '0;
-  assign manual_oe_io_trigger  = manual_in_io_clk_trigger_oe;
-  assign manual_out_io_trigger = manual_in_io_clk_trigger_en &
-      prim_mubi_pkg::mubi4_test_false_strict(manual_in_io_clk_idle);
+  assign manual_oe_io_trigger  =
+      manual_in_io_clk_trigger_sw_oe | manual_in_io_clk_trigger_hw_oe;
+  assign manual_out_io_trigger =
+      manual_in_io_clk_trigger_sw_en | (manual_in_io_clk_trigger_hw_en &
+          prim_mubi_pkg::mubi4_test_false_strict(manual_in_io_clk_idle));
 % endif
 ## This separate UART debugging output is needed for the CW305 only.
 % if target["name"] == "cw305":