Merge pull request #94 from jargh/main

64-bit SIMD regs in ARM model, better BOUNDER_RULE, slow-ARM field optimizations
awslabs · Nov 17, 2023 · 06781d2 · 06781d2
2 parents f629458 + cb13eb8
commit 06781d2
Show file tree

Hide file tree

Showing 49 changed files with 1,183 additions and 499 deletions.
diff --git a/arm/allowed_asm b/arm/allowed_asm
@@ -41,4 +41,5 @@
 : subs$
 : tst$
 : umulh$
+: umull$
 : $
diff --git a/arm/curve25519/Makefile b/arm/curve25519/Makefile
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0 OR ISC
 #############################################################################
 
-# If actually on an ARM8 machine, just use the GNU assmbler (as). Otherwise
+# If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise
 # use a cross-assembling version so that the code can still be assembled
 # and the proofs checked against the object files (though you won't be able
 # to run code without additional emulation infrastructure). The aarch64

diff --git a/arm/curve25519/bignum_inv_p25519.S b/arm/curve25519/bignum_inv_p25519.S
@@ -1106,7 +1106,7 @@ midloop:
 // since we know (for in-scope cases) that f is either +1 or -1.
 // We don't explicitly shift right by 59 either, but looking at
 // bit 63 (or any bit >= 60) of the unshifted result is enough
-// to distiguish -1 from +1; this is then made into a mask.
+// to distinguish -1 from +1; this is then made into a mask.
 
         ldr     x0, [f]
         ldr     x1, [g]

diff --git a/arm/curve25519/edwards25519_scalarmulbase.S b/arm/curve25519/edwards25519_scalarmulbase.S
@@ -577,7 +577,7 @@ S2N_BN_SYMBOL(edwards25519_scalarmulbase):
 // (X,Y,Z,T), representing an affine point on the edwards25519 curve
 // (x,y) via x = X/Z, y = Y/Z and x * y = T/Z (so X * Y = T * Z).
 // In comments B means the standard basepoint (x,4/5) =
-// (0x216....f25d51a,0x0x6666..666658).
+// (0x216....f25d51a,0x6666..666658).
 //
 // Initialize accumulator "acc" to either 0 or 2^251 * B depending on
 // bit 251 of the (reduced) scalar. That leaves bits 0..250 to handle.

diff --git a/arm/curve25519/edwards25519_scalarmulbase_alt.S b/arm/curve25519/edwards25519_scalarmulbase_alt.S
@@ -419,7 +419,7 @@ S2N_BN_SYMBOL(edwards25519_scalarmulbase_alt):
 // (X,Y,Z,T), representing an affine point on the edwards25519 curve
 // (x,y) via x = X/Z, y = Y/Z and x * y = T/Z (so X * Y = T * Z).
 // In comments B means the standard basepoint (x,4/5) =
-// (0x216....f25d51a,0x0x6666..666658).
+// (0x216....f25d51a,0x6666..666658).
 //
 // Initialize accumulator "acc" to either 0 or 2^251 * B depending on
 // bit 251 of the (reduced) scalar. That leaves bits 0..250 to handle.

diff --git a/arm/curve25519/edwards25519_scalarmuldouble.S b/arm/curve25519/edwards25519_scalarmuldouble.S
@@ -1514,7 +1514,7 @@ edwards25519_scalarmuldouble_loop:
 // form amounts to swapping the first two fields and negating the third.
 // The negation does not always fully reduce even mod 2^256-38 in the zero
 // case, instead giving -0 = 2^256-38. But that is fine since the result is
-// always fed to a multipliction inside the "pepadd" function below that
+// always fed to a multiplication inside the "pepadd" function below that
 // handles any 256-bit input.
 
         cmp     cf, xzr

diff --git a/arm/curve25519/edwards25519_scalarmuldouble_alt.S b/arm/curve25519/edwards25519_scalarmuldouble_alt.S
@@ -1298,7 +1298,7 @@ edwards25519_scalarmuldouble_alt_loop:
 // form amounts to swapping the first two fields and negating the third.
 // The negation does not always fully reduce even mod 2^256-38 in the zero
 // case, instead giving -0 = 2^256-38. But that is fine since the result is
-// always fed to a multipliction inside the "pepadd" function below that
+// always fed to a multiplication inside the "pepadd" function below that
 // handles any 256-bit input.
 
         cmp     cf, xzr

diff --git a/arm/fastmul/Makefile b/arm/fastmul/Makefile
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0 OR ISC
 #############################################################################
 
-# If actually on an ARM8 machine, just use the GNU assmbler (as). Otherwise
+# If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise
 # use a cross-assembling version so that the code can still be assembled
 # and the proofs checked against the object files (though you won't be able
 # to run code without additional emulation infrastructure). The aarch64

diff --git a/arm/generic/Makefile b/arm/generic/Makefile
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0 OR ISC
 #############################################################################
 
-# If actually on an ARM8 machine, just use the GNU assmbler (as). Otherwise
+# If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise
 # use a cross-assembling version so that the code can still be assembled
 # and the proofs checked against the object files (though you won't be able
 # to run code without additional emulation infrastructure). The aarch64

diff --git a/arm/generic/bignum_copy_row_from_table_8n_neon.S b/arm/generic/bignum_copy_row_from_table_8n_neon.S
@@ -3,7 +3,7 @@
 
 // ----------------------------------------------------------------------------
 // Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1]
-// into z[0..width-1]. width must be a mutiple of 8.
+// into z[0..width-1]. width must be a multiple of 8.
 // This function is constant-time with respect to the value of `idx`. This is
 // achieved by reading the whole table and using the bit-masking to get the
 // `idx`-th row.

diff --git a/arm/generic/word_divstep59.S b/arm/generic/word_divstep59.S
@@ -202,7 +202,7 @@ S2N_BN_SYMBOL(word_divstep59):
         tst     grs, #1
 
 // Split the last divsteps into two blocks of 10 and 9 to insert the matrix
-// multiplication in betwen them. The first ten iterations:
+// multiplication in between them. The first ten iterations:
 
 .set i, 0
 .rep 10

diff --git a/arm/p256/Makefile b/arm/p256/Makefile
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0 OR ISC
 #############################################################################
 
-# If actually on an ARM8 machine, just use the GNU assmbler (as). Otherwise
+# If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise
 # use a cross-assembling version so that the code can still be assembled
 # and the proofs checked against the object files (though you won't be able
 # to run code without additional emulation infrastructure). The aarch64

diff --git a/arm/p256/bignum_montsqr_p256.S b/arm/p256/bignum_montsqr_p256.S
@@ -102,6 +102,10 @@
 #define s2 x17
 #define s3 x1
 
+#define a0short w2
+#define a1short w3
+#define d1short w11
+
 S2N_BN_SYMBOL(bignum_montsqr_p256):
 
 // Load in all words of the input
@@ -110,21 +114,29 @@ S2N_BN_SYMBOL(bignum_montsqr_p256):
         ldp     a2, a3, [x1, #16]
 
 // Square the low half, getting a result in [s3;s2;s1;s0]
-
-        mul     s0, a0, a0
-        mul     s2, a1, a1
-        mul     d1, a0, a1
-
-        umulh   s1, a0, a0
-        umulh   s3, a1, a1
-        umulh   d2, a0, a1
-
-        adds    s1, s1, d1
-        adcs    s2, s2, d2
+// This uses 32x32->64 multiplications to reduce the number of UMULHs
+
+        umull   s0, a0short, a0short
+        lsr     d1, a0, #32
+        umull   s1, d1short, d1short
+        umull   d1, a0short, d1short
+        adds    s0, s0, d1, lsl #33
+        lsr     d1, d1, #31
+        adc     s1, s1, d1
+        umull   s2, a1short, a1short
+        lsr     d1, a1, #32
+        umull   s3, d1short, d1short
+        umull   d1, a1short, d1short
+        mul     d2, a0, a1
+        umulh   d3, a0, a1
+        adds    s2, s2, d1, lsl #33
+        lsr     d1, d1, #31
+        adc     s3, s3, d1
+        adds    d2, d2, d2
+        adcs    d3, d3, d3
         adc     s3, s3, xzr
-
-        adds    s1, s1, d1
-        adcs    s2, s2, d2
+        adds    s1, s1, d2
+        adcs    s2, s2, d3
         adc     s3, s3, xzr
 
 // Perform two "short" Montgomery steps on the low square
@@ -185,11 +197,6 @@ S2N_BN_SYMBOL(bignum_montsqr_p256):
 #define r3 x6
 #define c x7
 
-// Remind ourselves what else we can't destroy
-
-#define a2 x4
-#define a3 x5
-
 // So we can have these as temps
 
 #define t1 x11
@@ -225,30 +232,27 @@ S2N_BN_SYMBOL(bignum_montsqr_p256):
 // B^2 c <= z + (B - 1) * p < B * p + (B - 1) * p < 2 * B * p,
 // so the top half is certainly < 2 * p. If c = 1 already, we know
 // subtracting p will give the reduced modulus. But now we do a
-// comparison to catch cases where the residue is >= p.
-// First set [t2;0;t1;-1] = p_256 since we use the constants twice.
+// subtraction-comparison to catch cases where the residue is >= p.
+// The constants are such that [t3;0;t1;-1] = p_256.
 
-        mov     t1, #0x00000000ffffffff
-        mov     t2, #0xffffffff00000001
+#define t0      x5
 
 // Set CF (because of inversion) iff (0,p_256) <= (c,r3,r2,r1,r0)
 
-        subs    xzr, r0, #-1
-        sbcs    xzr, r1, t1
-        sbcs    xzr, r2, xzr
-        sbcs    xzr, r3, t2
+        mov     t1, #0x00000000ffffffff
+        subs    t0, r0, #-1
+        sbcs    t1, r1, t1
+        mov     t3, #0xffffffff00000001
+        sbcs    t2, r2, xzr
+        sbcs    t3, r3, t3
         sbcs    xzr, c, xzr
 
-// Convert the condition into a bitmask and do a masked subtraction
-
-        csetm   c, cs
+// Select final output accordingly
 
-        subs    r0, r0, c
-        and     t1, t1, c
-        sbcs    r1, r1, t1
-        sbcs    r2, r2, xzr
-        and     t2, t2, c
-        sbc     r3, r3, t2
+        csel    r0, t0, r0, cs
+        csel    r1, t1, r1, cs
+        csel    r2, t2, r2, cs
+        csel    r3, t3, r3, cs
 
 // Store things back in place
 

diff --git a/arm/p384/Makefile b/arm/p384/Makefile
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0 OR ISC
 #############################################################################
 
-# If actually on an ARM8 machine, just use the GNU assmbler (as). Otherwise
+# If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise
 # use a cross-assembling version so that the code can still be assembled
 # and the proofs checked against the object files (though you won't be able
 # to run code without additional emulation infrastructure). The aarch64

diff --git a/arm/p521/Makefile b/arm/p521/Makefile
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0 OR ISC
 #############################################################################
 
-# If actually on an ARM8 machine, just use the GNU assmbler (as). Otherwise
+# If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise
 # use a cross-assembling version so that the code can still be assembled
 # and the proofs checked against the object files (though you won't be able
 # to run code without additional emulation infrastructure). The aarch64

diff --git a/arm/proofs/arm.ml b/arm/proofs/arm.ml
@@ -162,6 +162,41 @@ let WREG_EXPAND_CLAUSES = prove
    W30 = X30 :> zerotop_32`,
   REWRITE_TAC(!component_alias_thms));;
 
+let DREG_EXPAND_CLAUSES = prove
+ (`D0 = Q0 :> zerotop_64 /\
+   D1 = Q1 :> zerotop_64 /\
+   D2 = Q2 :> zerotop_64 /\
+   D3 = Q3 :> zerotop_64 /\
+   D4 = Q4 :> zerotop_64 /\
+   D5 = Q5 :> zerotop_64 /\
+   D6 = Q6 :> zerotop_64 /\
+   D7 = Q7 :> zerotop_64 /\
+   D8 = Q8 :> zerotop_64 /\
+   D9 = Q9 :> zerotop_64 /\
+   D10 = Q10 :> zerotop_64 /\
+   D11 = Q11 :> zerotop_64 /\
+   D12 = Q12 :> zerotop_64 /\
+   D13 = Q13 :> zerotop_64 /\
+   D14 = Q14 :> zerotop_64 /\
+   D15 = Q15 :> zerotop_64 /\
+   D16 = Q16 :> zerotop_64 /\
+   D17 = Q17 :> zerotop_64 /\
+   D18 = Q18 :> zerotop_64 /\
+   D19 = Q19 :> zerotop_64 /\
+   D20 = Q20 :> zerotop_64 /\
+   D21 = Q21 :> zerotop_64 /\
+   D22 = Q22 :> zerotop_64 /\
+   D23 = Q23 :> zerotop_64 /\
+   D24 = Q24 :> zerotop_64 /\
+   D25 = Q25 :> zerotop_64 /\
+   D26 = Q26 :> zerotop_64 /\
+   D27 = Q27 :> zerotop_64 /\
+   D28 = Q28 :> zerotop_64 /\
+   D29 = Q29 :> zerotop_64 /\
+   D30 = Q30 :> zerotop_64 /\
+   D31 = Q31 :> zerotop_64`,
+  REWRITE_TAC(!component_alias_thms));;
+
 let READ_SHIFTEDREG_CLAUSES = prove
  (`read (Shiftedreg Rn LSL n) s = word_shl (read Rn s) n /\
    read (Shiftedreg Rn LSR n) s = word_ushr (read Rn s) n /\
@@ -203,9 +238,11 @@ let ARM_EXEC_CONV =
     GEN_REWRITE_CONV ONCE_DEPTH_CONV [CONJUNCT2 SEQ_ID]) ORELSEC
    (GEN_REWRITE_CONV I ARM_OPERATION_CLAUSES THENC
     REWRITE_CONV [condition_semantics])) THENC
-  REWRITE_CONV[WREG_EXPAND_CLAUSES] THENC
-  REWRITE_CONV[READ_RVALUE; ASSIGN_ZEROTOP_32; ARM_ZERO_REGISTER;
+  REWRITE_CONV[WREG_EXPAND_CLAUSES; DREG_EXPAND_CLAUSES] THENC
+  REWRITE_CONV[READ_RVALUE; ARM_ZERO_REGISTER;
+               ASSIGN_ZEROTOP_32; ASSIGN_ZEROTOP_64;
                READ_ZEROTOP_32; WRITE_ZEROTOP_32;
+               READ_ZEROTOP_64; WRITE_ZEROTOP_64;
                READ_SHIFTEDREG_CLAUSES; READ_EXTENDEDREG_CLAUSES] THENC
   DEPTH_CONV(WORD_NUM_RED_CONV ORELSEC WORD_WORD_OPERATION_CONV);;
 

diff --git a/arm/proofs/base.ml b/arm/proofs/base.ml
@@ -37,7 +37,7 @@ loadt "common/interval.ml";;
 loadt "common/elf.ml";;
 
 (* ------------------------------------------------------------------------- *)
-(* Support for additional SHA instrinsics (from Carl Kwan)                   *)
+(* Support for additional SHA intrinsics (from Carl Kwan)                    *)
 (* ------------------------------------------------------------------------- *)
 
 loadt "arm/proofs/sha256.ml";;

diff --git a/arm/proofs/bignum_cmadd.ml b/arm/proofs/bignum_cmadd.ml
@@ -85,7 +85,7 @@ let BIGNUM_CMADD_CORRECT = prove
   X_GEN_TAC `pc:num` THEN REWRITE_TAC[NONOVERLAPPING_CLAUSES] THEN
   REWRITE_TAC[C_ARGUMENTS; C_RETURN; SOME_FLAGS] THEN DISCH_TAC THEN
 
-  (*** Target the end label, removing redudancy in conclusion ***)
+  (*** Target the end label, removing redundancy in conclusion ***)
 
   BIGNUM_TERMRANGE_TAC `n:num` `a:num` THEN
   ENSURES_SEQUENCE_TAC `pc + 0x98`

diff --git a/arm/proofs/bignum_cmnegadd.ml b/arm/proofs/bignum_cmnegadd.ml
@@ -95,7 +95,7 @@ let BIGNUM_CMNEGADD_CORRECT = prove
   X_GEN_TAC `pc:num` THEN REWRITE_TAC[NONOVERLAPPING_CLAUSES] THEN
   REWRITE_TAC[C_ARGUMENTS; C_RETURN; SOME_FLAGS] THEN DISCH_TAC THEN
 
-  (*** Target the end label, removing redudancy in conclusion ***)
+  (*** Target the end label, removing redundancy in conclusion ***)
 
   BIGNUM_TERMRANGE_TAC `n:num` `a:num` THEN
   ENSURES_SEQUENCE_TAC `pc + 0xb8`

diff --git a/arm/proofs/bignum_cmul.ml b/arm/proofs/bignum_cmul.ml
@@ -77,7 +77,7 @@ let BIGNUM_CMUL_CORRECT = prove
   X_GEN_TAC `pc:num` THEN REWRITE_TAC[NONOVERLAPPING_CLAUSES] THEN
   REWRITE_TAC[C_ARGUMENTS; C_RETURN; SOME_FLAGS] THEN DISCH_TAC THEN
 
-  (*** Target the end label, removing redudancy in conclusion ***)
+  (*** Target the end label, removing redundancy in conclusion ***)
 
   BIGNUM_TERMRANGE_TAC `n:num` `a:num` THEN
   ENSURES_SEQUENCE_TAC `pc + 0x80`

diff --git a/arm/proofs/bignum_emontredc_8n_neon.ml b/arm/proofs/bignum_emontredc_8n_neon.ml
@@ -2675,7 +2675,7 @@ let BIGNUM_EMONTREDC_8N_NEON_CORRECT = time prove
     CONV_TAC (LAND_CONV (ONCE_DEPTH_CONV NUM_REDUCE_CONV)) THEN
     REWRITE_TAC[WORD_ADD_0;ARITH_RULE `x+0=x`;VAL_WORD_GALOIS;DIMINDEX_64; BIGDIGIT_BOUND] THEN STRIP_TAC] THEN
 
-  (* From assupmtion
+  (* From assumption
         `bignum_from_memory (z',((k + 4) - 4 * i)) s0 = highdigits a (4 * i),
      make
         `bignum_from_memory (z'+32,k - 4*i) s0 = highdigits a (4 * (i + 1))

diff --git a/arm/proofs/bignum_ksqr_32_64.ml b/arm/proofs/bignum_ksqr_32_64.ml
@@ -843,7 +843,7 @@ let bignum_ksqr_32_64_mc =
 let BIGNUM_KSQR_32_64_EXEC = ARM_MK_EXEC_RULE bignum_ksqr_32_64_mc;;
 
 (* ------------------------------------------------------------------------- *)
-(* Proof for the inner-level 8->16 squring.                                  *)
+(* Proof for the inner-level 8->16 squaring.                                 *)
 (* ------------------------------------------------------------------------- *)
 
 let lemma1 = prove

diff --git a/arm/proofs/bignum_ksqr_32_64_neon.ml b/arm/proofs/bignum_ksqr_32_64_neon.ml
@@ -917,7 +917,7 @@ let bignum_ksqr_32_64_neon_mc =
 let BIGNUM_KSQR_32_64_NEON_EXEC = ARM_MK_EXEC_RULE bignum_ksqr_32_64_neon_mc;;
 
 (* ------------------------------------------------------------------------- *)
-(* Proof for the inner-level 8->16 squring.                                  *)
+(* Proof for the inner-level 8->16 squaring.                                 *)
 (* ------------------------------------------------------------------------- *)
 
 let lemma1 = prove
-Original file line number
+Diff line change
@@ Expand Up / @@ -41,4 +41,5 @@ @@
     : subs$
     : tst$
     : umulh$
+    : umull$
     : $