Skip to content

Commit

Permalink
Merge pull request #94 from jargh/main
Browse files Browse the repository at this point in the history
64-bit SIMD regs in ARM model, better BOUNDER_RULE, slow-ARM field optimizations
  • Loading branch information
aqjune-aws authored Nov 17, 2023
2 parents f629458 + cb13eb8 commit 06781d2
Show file tree
Hide file tree
Showing 49 changed files with 1,183 additions and 499 deletions.
1 change: 1 addition & 0 deletions arm/allowed_asm
Original file line number Diff line number Diff line change
Expand Up @@ -41,4 +41,5 @@
: subs$
: tst$
: umulh$
: umull$
: $
2 changes: 1 addition & 1 deletion arm/curve25519/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0 OR ISC
#############################################################################

# If actually on an ARM8 machine, just use the GNU assmbler (as). Otherwise
# If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise
# use a cross-assembling version so that the code can still be assembled
# and the proofs checked against the object files (though you won't be able
# to run code without additional emulation infrastructure). The aarch64
Expand Down
2 changes: 1 addition & 1 deletion arm/curve25519/bignum_inv_p25519.S
Original file line number Diff line number Diff line change
Expand Up @@ -1106,7 +1106,7 @@ midloop:
// since we know (for in-scope cases) that f is either +1 or -1.
// We don't explicitly shift right by 59 either, but looking at
// bit 63 (or any bit >= 60) of the unshifted result is enough
// to distiguish -1 from +1; this is then made into a mask.
// to distinguish -1 from +1; this is then made into a mask.

ldr x0, [f]
ldr x1, [g]
Expand Down
2 changes: 1 addition & 1 deletion arm/curve25519/edwards25519_scalarmulbase.S
Original file line number Diff line number Diff line change
Expand Up @@ -577,7 +577,7 @@ S2N_BN_SYMBOL(edwards25519_scalarmulbase):
// (X,Y,Z,T), representing an affine point on the edwards25519 curve
// (x,y) via x = X/Z, y = Y/Z and x * y = T/Z (so X * Y = T * Z).
// In comments B means the standard basepoint (x,4/5) =
// (0x216....f25d51a,0x0x6666..666658).
// (0x216....f25d51a,0x6666..666658).
//
// Initialize accumulator "acc" to either 0 or 2^251 * B depending on
// bit 251 of the (reduced) scalar. That leaves bits 0..250 to handle.
Expand Down
2 changes: 1 addition & 1 deletion arm/curve25519/edwards25519_scalarmulbase_alt.S
Original file line number Diff line number Diff line change
Expand Up @@ -419,7 +419,7 @@ S2N_BN_SYMBOL(edwards25519_scalarmulbase_alt):
// (X,Y,Z,T), representing an affine point on the edwards25519 curve
// (x,y) via x = X/Z, y = Y/Z and x * y = T/Z (so X * Y = T * Z).
// In comments B means the standard basepoint (x,4/5) =
// (0x216....f25d51a,0x0x6666..666658).
// (0x216....f25d51a,0x6666..666658).
//
// Initialize accumulator "acc" to either 0 or 2^251 * B depending on
// bit 251 of the (reduced) scalar. That leaves bits 0..250 to handle.
Expand Down
2 changes: 1 addition & 1 deletion arm/curve25519/edwards25519_scalarmuldouble.S
Original file line number Diff line number Diff line change
Expand Up @@ -1514,7 +1514,7 @@ edwards25519_scalarmuldouble_loop:
// form amounts to swapping the first two fields and negating the third.
// The negation does not always fully reduce even mod 2^256-38 in the zero
// case, instead giving -0 = 2^256-38. But that is fine since the result is
// always fed to a multipliction inside the "pepadd" function below that
// always fed to a multiplication inside the "pepadd" function below that
// handles any 256-bit input.

cmp cf, xzr
Expand Down
2 changes: 1 addition & 1 deletion arm/curve25519/edwards25519_scalarmuldouble_alt.S
Original file line number Diff line number Diff line change
Expand Up @@ -1298,7 +1298,7 @@ edwards25519_scalarmuldouble_alt_loop:
// form amounts to swapping the first two fields and negating the third.
// The negation does not always fully reduce even mod 2^256-38 in the zero
// case, instead giving -0 = 2^256-38. But that is fine since the result is
// always fed to a multipliction inside the "pepadd" function below that
// always fed to a multiplication inside the "pepadd" function below that
// handles any 256-bit input.

cmp cf, xzr
Expand Down
2 changes: 1 addition & 1 deletion arm/fastmul/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0 OR ISC
#############################################################################

# If actually on an ARM8 machine, just use the GNU assmbler (as). Otherwise
# If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise
# use a cross-assembling version so that the code can still be assembled
# and the proofs checked against the object files (though you won't be able
# to run code without additional emulation infrastructure). The aarch64
Expand Down
2 changes: 1 addition & 1 deletion arm/generic/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0 OR ISC
#############################################################################

# If actually on an ARM8 machine, just use the GNU assmbler (as). Otherwise
# If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise
# use a cross-assembling version so that the code can still be assembled
# and the proofs checked against the object files (though you won't be able
# to run code without additional emulation infrastructure). The aarch64
Expand Down
2 changes: 1 addition & 1 deletion arm/generic/bignum_copy_row_from_table_8n_neon.S
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

// ----------------------------------------------------------------------------
// Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1]
// into z[0..width-1]. width must be a mutiple of 8.
// into z[0..width-1]. width must be a multiple of 8.
// This function is constant-time with respect to the value of `idx`. This is
// achieved by reading the whole table and using the bit-masking to get the
// `idx`-th row.
Expand Down
2 changes: 1 addition & 1 deletion arm/generic/word_divstep59.S
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ S2N_BN_SYMBOL(word_divstep59):
tst grs, #1

// Split the last divsteps into two blocks of 10 and 9 to insert the matrix
// multiplication in betwen them. The first ten iterations:
// multiplication in between them. The first ten iterations:

.set i, 0
.rep 10
Expand Down
2 changes: 1 addition & 1 deletion arm/p256/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0 OR ISC
#############################################################################

# If actually on an ARM8 machine, just use the GNU assmbler (as). Otherwise
# If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise
# use a cross-assembling version so that the code can still be assembled
# and the proofs checked against the object files (though you won't be able
# to run code without additional emulation infrastructure). The aarch64
Expand Down
76 changes: 40 additions & 36 deletions arm/p256/bignum_montsqr_p256.S
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,10 @@
#define s2 x17
#define s3 x1

#define a0short w2
#define a1short w3
#define d1short w11

S2N_BN_SYMBOL(bignum_montsqr_p256):

// Load in all words of the input
Expand All @@ -110,21 +114,29 @@ S2N_BN_SYMBOL(bignum_montsqr_p256):
ldp a2, a3, [x1, #16]

// Square the low half, getting a result in [s3;s2;s1;s0]

mul s0, a0, a0
mul s2, a1, a1
mul d1, a0, a1

umulh s1, a0, a0
umulh s3, a1, a1
umulh d2, a0, a1

adds s1, s1, d1
adcs s2, s2, d2
// This uses 32x32->64 multiplications to reduce the number of UMULHs

umull s0, a0short, a0short
lsr d1, a0, #32
umull s1, d1short, d1short
umull d1, a0short, d1short
adds s0, s0, d1, lsl #33
lsr d1, d1, #31
adc s1, s1, d1
umull s2, a1short, a1short
lsr d1, a1, #32
umull s3, d1short, d1short
umull d1, a1short, d1short
mul d2, a0, a1
umulh d3, a0, a1
adds s2, s2, d1, lsl #33
lsr d1, d1, #31
adc s3, s3, d1
adds d2, d2, d2
adcs d3, d3, d3
adc s3, s3, xzr

adds s1, s1, d1
adcs s2, s2, d2
adds s1, s1, d2
adcs s2, s2, d3
adc s3, s3, xzr

// Perform two "short" Montgomery steps on the low square
Expand Down Expand Up @@ -185,11 +197,6 @@ S2N_BN_SYMBOL(bignum_montsqr_p256):
#define r3 x6
#define c x7

// Remind ourselves what else we can't destroy

#define a2 x4
#define a3 x5

// So we can have these as temps

#define t1 x11
Expand Down Expand Up @@ -225,30 +232,27 @@ S2N_BN_SYMBOL(bignum_montsqr_p256):
// B^2 c <= z + (B - 1) * p < B * p + (B - 1) * p < 2 * B * p,
// so the top half is certainly < 2 * p. If c = 1 already, we know
// subtracting p will give the reduced modulus. But now we do a
// comparison to catch cases where the residue is >= p.
// First set [t2;0;t1;-1] = p_256 since we use the constants twice.
// subtraction-comparison to catch cases where the residue is >= p.
// The constants are such that [t3;0;t1;-1] = p_256.

mov t1, #0x00000000ffffffff
mov t2, #0xffffffff00000001
#define t0 x5

// Set CF (because of inversion) iff (0,p_256) <= (c,r3,r2,r1,r0)

subs xzr, r0, #-1
sbcs xzr, r1, t1
sbcs xzr, r2, xzr
sbcs xzr, r3, t2
mov t1, #0x00000000ffffffff
subs t0, r0, #-1
sbcs t1, r1, t1
mov t3, #0xffffffff00000001
sbcs t2, r2, xzr
sbcs t3, r3, t3
sbcs xzr, c, xzr

// Convert the condition into a bitmask and do a masked subtraction

csetm c, cs
// Select final output accordingly

subs r0, r0, c
and t1, t1, c
sbcs r1, r1, t1
sbcs r2, r2, xzr
and t2, t2, c
sbc r3, r3, t2
csel r0, t0, r0, cs
csel r1, t1, r1, cs
csel r2, t2, r2, cs
csel r3, t3, r3, cs

// Store things back in place

Expand Down
2 changes: 1 addition & 1 deletion arm/p384/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0 OR ISC
#############################################################################

# If actually on an ARM8 machine, just use the GNU assmbler (as). Otherwise
# If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise
# use a cross-assembling version so that the code can still be assembled
# and the proofs checked against the object files (though you won't be able
# to run code without additional emulation infrastructure). The aarch64
Expand Down
2 changes: 1 addition & 1 deletion arm/p521/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0 OR ISC
#############################################################################

# If actually on an ARM8 machine, just use the GNU assmbler (as). Otherwise
# If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise
# use a cross-assembling version so that the code can still be assembled
# and the proofs checked against the object files (though you won't be able
# to run code without additional emulation infrastructure). The aarch64
Expand Down
41 changes: 39 additions & 2 deletions arm/proofs/arm.ml
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,41 @@ let WREG_EXPAND_CLAUSES = prove
W30 = X30 :> zerotop_32`,
REWRITE_TAC(!component_alias_thms));;

let DREG_EXPAND_CLAUSES = prove
(`D0 = Q0 :> zerotop_64 /\
D1 = Q1 :> zerotop_64 /\
D2 = Q2 :> zerotop_64 /\
D3 = Q3 :> zerotop_64 /\
D4 = Q4 :> zerotop_64 /\
D5 = Q5 :> zerotop_64 /\
D6 = Q6 :> zerotop_64 /\
D7 = Q7 :> zerotop_64 /\
D8 = Q8 :> zerotop_64 /\
D9 = Q9 :> zerotop_64 /\
D10 = Q10 :> zerotop_64 /\
D11 = Q11 :> zerotop_64 /\
D12 = Q12 :> zerotop_64 /\
D13 = Q13 :> zerotop_64 /\
D14 = Q14 :> zerotop_64 /\
D15 = Q15 :> zerotop_64 /\
D16 = Q16 :> zerotop_64 /\
D17 = Q17 :> zerotop_64 /\
D18 = Q18 :> zerotop_64 /\
D19 = Q19 :> zerotop_64 /\
D20 = Q20 :> zerotop_64 /\
D21 = Q21 :> zerotop_64 /\
D22 = Q22 :> zerotop_64 /\
D23 = Q23 :> zerotop_64 /\
D24 = Q24 :> zerotop_64 /\
D25 = Q25 :> zerotop_64 /\
D26 = Q26 :> zerotop_64 /\
D27 = Q27 :> zerotop_64 /\
D28 = Q28 :> zerotop_64 /\
D29 = Q29 :> zerotop_64 /\
D30 = Q30 :> zerotop_64 /\
D31 = Q31 :> zerotop_64`,
REWRITE_TAC(!component_alias_thms));;

let READ_SHIFTEDREG_CLAUSES = prove
(`read (Shiftedreg Rn LSL n) s = word_shl (read Rn s) n /\
read (Shiftedreg Rn LSR n) s = word_ushr (read Rn s) n /\
Expand Down Expand Up @@ -203,9 +238,11 @@ let ARM_EXEC_CONV =
GEN_REWRITE_CONV ONCE_DEPTH_CONV [CONJUNCT2 SEQ_ID]) ORELSEC
(GEN_REWRITE_CONV I ARM_OPERATION_CLAUSES THENC
REWRITE_CONV [condition_semantics])) THENC
REWRITE_CONV[WREG_EXPAND_CLAUSES] THENC
REWRITE_CONV[READ_RVALUE; ASSIGN_ZEROTOP_32; ARM_ZERO_REGISTER;
REWRITE_CONV[WREG_EXPAND_CLAUSES; DREG_EXPAND_CLAUSES] THENC
REWRITE_CONV[READ_RVALUE; ARM_ZERO_REGISTER;
ASSIGN_ZEROTOP_32; ASSIGN_ZEROTOP_64;
READ_ZEROTOP_32; WRITE_ZEROTOP_32;
READ_ZEROTOP_64; WRITE_ZEROTOP_64;
READ_SHIFTEDREG_CLAUSES; READ_EXTENDEDREG_CLAUSES] THENC
DEPTH_CONV(WORD_NUM_RED_CONV ORELSEC WORD_WORD_OPERATION_CONV);;

Expand Down
2 changes: 1 addition & 1 deletion arm/proofs/base.ml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ loadt "common/interval.ml";;
loadt "common/elf.ml";;

(* ------------------------------------------------------------------------- *)
(* Support for additional SHA instrinsics (from Carl Kwan) *)
(* Support for additional SHA intrinsics (from Carl Kwan) *)
(* ------------------------------------------------------------------------- *)

loadt "arm/proofs/sha256.ml";;
Expand Down
2 changes: 1 addition & 1 deletion arm/proofs/bignum_cmadd.ml
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ let BIGNUM_CMADD_CORRECT = prove
X_GEN_TAC `pc:num` THEN REWRITE_TAC[NONOVERLAPPING_CLAUSES] THEN
REWRITE_TAC[C_ARGUMENTS; C_RETURN; SOME_FLAGS] THEN DISCH_TAC THEN

(*** Target the end label, removing redudancy in conclusion ***)
(*** Target the end label, removing redundancy in conclusion ***)

BIGNUM_TERMRANGE_TAC `n:num` `a:num` THEN
ENSURES_SEQUENCE_TAC `pc + 0x98`
Expand Down
2 changes: 1 addition & 1 deletion arm/proofs/bignum_cmnegadd.ml
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ let BIGNUM_CMNEGADD_CORRECT = prove
X_GEN_TAC `pc:num` THEN REWRITE_TAC[NONOVERLAPPING_CLAUSES] THEN
REWRITE_TAC[C_ARGUMENTS; C_RETURN; SOME_FLAGS] THEN DISCH_TAC THEN

(*** Target the end label, removing redudancy in conclusion ***)
(*** Target the end label, removing redundancy in conclusion ***)

BIGNUM_TERMRANGE_TAC `n:num` `a:num` THEN
ENSURES_SEQUENCE_TAC `pc + 0xb8`
Expand Down
2 changes: 1 addition & 1 deletion arm/proofs/bignum_cmul.ml
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ let BIGNUM_CMUL_CORRECT = prove
X_GEN_TAC `pc:num` THEN REWRITE_TAC[NONOVERLAPPING_CLAUSES] THEN
REWRITE_TAC[C_ARGUMENTS; C_RETURN; SOME_FLAGS] THEN DISCH_TAC THEN

(*** Target the end label, removing redudancy in conclusion ***)
(*** Target the end label, removing redundancy in conclusion ***)

BIGNUM_TERMRANGE_TAC `n:num` `a:num` THEN
ENSURES_SEQUENCE_TAC `pc + 0x80`
Expand Down
2 changes: 1 addition & 1 deletion arm/proofs/bignum_emontredc_8n_neon.ml
Original file line number Diff line number Diff line change
Expand Up @@ -2675,7 +2675,7 @@ let BIGNUM_EMONTREDC_8N_NEON_CORRECT = time prove
CONV_TAC (LAND_CONV (ONCE_DEPTH_CONV NUM_REDUCE_CONV)) THEN
REWRITE_TAC[WORD_ADD_0;ARITH_RULE `x+0=x`;VAL_WORD_GALOIS;DIMINDEX_64; BIGDIGIT_BOUND] THEN STRIP_TAC] THEN

(* From assupmtion
(* From assumption
`bignum_from_memory (z',((k + 4) - 4 * i)) s0 = highdigits a (4 * i),
make
`bignum_from_memory (z'+32,k - 4*i) s0 = highdigits a (4 * (i + 1))
Expand Down
2 changes: 1 addition & 1 deletion arm/proofs/bignum_ksqr_32_64.ml
Original file line number Diff line number Diff line change
Expand Up @@ -843,7 +843,7 @@ let bignum_ksqr_32_64_mc =
let BIGNUM_KSQR_32_64_EXEC = ARM_MK_EXEC_RULE bignum_ksqr_32_64_mc;;

(* ------------------------------------------------------------------------- *)
(* Proof for the inner-level 8->16 squring. *)
(* Proof for the inner-level 8->16 squaring. *)
(* ------------------------------------------------------------------------- *)

let lemma1 = prove
Expand Down
2 changes: 1 addition & 1 deletion arm/proofs/bignum_ksqr_32_64_neon.ml
Original file line number Diff line number Diff line change
Expand Up @@ -917,7 +917,7 @@ let bignum_ksqr_32_64_neon_mc =
let BIGNUM_KSQR_32_64_NEON_EXEC = ARM_MK_EXEC_RULE bignum_ksqr_32_64_neon_mc;;

(* ------------------------------------------------------------------------- *)
(* Proof for the inner-level 8->16 squring. *)
(* Proof for the inner-level 8->16 squaring. *)
(* ------------------------------------------------------------------------- *)

let lemma1 = prove
Expand Down
Loading

0 comments on commit 06781d2

Please sign in to comment.