From 65f046ecbfa1842806720f352d3181b7caf99ce3 Mon Sep 17 00:00:00 2001 From: Juneyoung Lee Date: Thu, 6 Jun 2024 12:02:20 -0500 Subject: [PATCH] Add `bignum_mont{sqr,mul}_p521_neon` This patch adds `bignum_mont{sqr,mul}_p521_neon`. ``` bignum_montsqr_p521 : 114.7 ns each (var 0.2%, corr 0.06) = 8720010 ops/sec bignum_montsqr_p521_neon : 83.8 ns each (var 0.4%, corr -0.04) = 11926387 ops/sec bignum_montmul_p521 : 130.8 ns each (var 0.2%, corr -0.00) = 7644702 ops/sec bignum_montmul_p521_neon : 111.4 ns each (var 0.2%, corr 0.04) = 8978421 ops/sec ``` The new subroutine specs are added to specifications.txt, and the tests and benchmarks are updated. Modular squaring/multiplication functions are not included in this patch. This patch also contains the following updates: - A tactic for showing equivalence of loops is added (the tactic is not used yet). - Definitions for input state equivalence are canonicalized as `.. /\ (?a. read c1 s = a /\ read c1 s' = a /\ (?b. read c2 s = b /\ read c2 s' = b /\ ( ... 
)))` - Minor buggy behaviors in equiv tactics are fixed and performance improvements done --- arm/Makefile | 2 + arm/p521/Makefile | 2 + arm/p521/bignum_montmul_p521_neon.S | 1415 ++++++++++++++++++++++++ arm/p521/bignum_montsqr_p521_neon.S | 1124 +++++++++++++++++++ arm/proofs/arm.ml | 55 +- arm/proofs/bignum_montmul_p256_neon.ml | 48 +- arm/proofs/bignum_montmul_p384_neon.ml | 49 +- arm/proofs/bignum_montmul_p521.ml | 91 +- arm/proofs/bignum_montmul_p521_neon.ml | 1216 ++++++++++++++++++++ arm/proofs/bignum_montsqr_p256_neon.ml | 69 +- arm/proofs/bignum_montsqr_p384_neon.ml | 50 +- arm/proofs/bignum_montsqr_p521.ml | 70 +- arm/proofs/bignum_montsqr_p521_neon.ml | 1032 +++++++++++++++++ arm/proofs/bignum_mul_8_16_neon.ml | 2 +- arm/proofs/bignum_sqr_8_16_neon.ml | 2 +- arm/proofs/equiv.ml | 318 ++++-- arm/proofs/neon_helper.ml | 481 +++++++- arm/proofs/specifications.txt | 2 + benchmarks/benchmark.c | 8 +- common/misc.ml | 112 ++ common/relational2.ml | 275 ++++- include/s2n-bignum-c89.h | 2 + include/s2n-bignum.h | 2 + tests/test.c | 104 +- tools/diff.py | 6 +- 25 files changed, 6129 insertions(+), 408 deletions(-) create mode 100644 arm/p521/bignum_montmul_p521_neon.S create mode 100644 arm/p521/bignum_montsqr_p521_neon.S create mode 100644 arm/proofs/bignum_montmul_p521_neon.ml create mode 100644 arm/proofs/bignum_montsqr_p521_neon.ml diff --git a/arm/Makefile b/arm/Makefile index deffe316..4ceaf99c 100644 --- a/arm/Makefile +++ b/arm/Makefile @@ -302,8 +302,10 @@ BIGNUM_OBJ = curve25519/bignum_add_p25519.o \ p521/bignum_mod_p521_9.o \ p521/bignum_montmul_p521.o \ p521/bignum_montmul_p521_alt.o \ + p521/bignum_montmul_p521_neon.o \ p521/bignum_montsqr_p521.o \ p521/bignum_montsqr_p521_alt.o \ + p521/bignum_montsqr_p521_neon.o \ p521/bignum_mul_p521.o \ p521/bignum_mul_p521_alt.o \ p521/bignum_neg_p521.o \ diff --git a/arm/p521/Makefile b/arm/p521/Makefile index 7980bdd9..64db0725 100644 --- a/arm/p521/Makefile +++ b/arm/p521/Makefile @@ -32,8 +32,10 @@ OBJ = 
bignum_add_p521.o \ bignum_mod_p521_9.o \ bignum_montmul_p521.o \ bignum_montmul_p521_alt.o \ + bignum_montmul_p521_neon.o \ bignum_montsqr_p521.o \ bignum_montsqr_p521_alt.o \ + bignum_montsqr_p521_neon.o \ bignum_mul_p521.o \ bignum_mul_p521_alt.o \ bignum_neg_p521.o \ diff --git a/arm/p521/bignum_montmul_p521_neon.S b/arm/p521/bignum_montmul_p521_neon.S new file mode 100644 index 00000000..9586339f --- /dev/null +++ b/arm/p521/bignum_montmul_p521_neon.S @@ -0,0 +1,1415 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^576) mod p_521 +// Inputs x[9], y[9]; output z[9] +// +// extern void bignum_montmul_p521_neon +// (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]); +// +// Does z := (x * y / 2^576) mod p_521, assuming x < p_521, y < p_521. This +// means the Montgomery base is the "native size" 2^{9*64} = 2^576; since +// p_521 is a Mersenne prime the basic modular multiplication bignum_mul_p521 +// can be considered a Montgomery operation to base 2^521. +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + +// bignum_montmul_p521_neon is functionally equivalent to bignum_montmul_p521. +// It is written in a way that +// 1. A subset of scalar multiplications in bignum_montmul_p521 are carefully +// chosen and vectorized +// 2. The vectorized assembly is rescheduled using the SLOTHY superoptimizer. +// https://github.com/slothy-optimizer/slothy +// +// The output program of step 1. is as follows: +// +// stp x19, x20, [sp, #-16]! +// stp x21, x22, [sp, #-16]! +// stp x23, x24, [sp, #-16]! +// stp x25, x26, [sp, #-16]! 
+// sub sp, sp, #80 +// ldp x14, x7, [x1] +// ldp x3, x25, [x1, #16] +// ldp x10, x24, [x2] +// ldr q0, [x1] +// ldr q25, [x2] +// ldp x12, x6, [x2, #16] +// movi v18.2D, #0x00000000ffffffff +// uzp2 v3.4S, v25.4S, v25.4S +// xtn v26.2S, v0.2D +// xtn v22.2S, v25.2D +// rev64 v24.4S, v25.4S +// umull v19.2D, v26.2S, v22.2S +// umull v25.2D, v26.2S, v3.2S +// uzp2 v20.4S, v0.4S, v0.4S +// mul v0.4S, v24.4S, v0.4S +// usra v25.2D, v19.2D, #32 +// umull v6.2D, v20.2S, v3.2S +// uaddlp v0.2D, v0.4S +// and v18.16B, v25.16B, v18.16B +// umlal v18.2D, v20.2S, v22.2S +// shl v0.2D, v0.2D, #32 +// usra v6.2D, v25.2D, #32 +// umlal v0.2D, v26.2S, v22.2S +// usra v6.2D, v18.2D, #32 +// mov x23, v0.d[0] +// mov x16, v0.d[1] +// mul x5, x3, x12 +// mul x21, x25, x6 +// mov x19, v6.d[0] +// adds x16, x16, x19 +// mov x19, v6.d[1] +// adcs x5, x5, x19 +// umulh x19, x3, x12 +// adcs x21, x21, x19 +// umulh x19, x25, x6 +// adc x19, x19, xzr +// adds x8, x16, x23 +// adcs x16, x5, x16 +// adcs x5, x21, x5 +// adcs x21, x19, x21 +// adc x19, xzr, x19 +// adds x11, x16, x23 +// adcs x15, x5, x8 +// adcs x16, x21, x16 +// adcs x5, x19, x5 +// adcs x21, xzr, x21 +// adc x19, xzr, x19 +// subs x20, x3, x25 +// cneg x20, x20, cc +// csetm x9, cc +// subs x13, x6, x12 +// cneg x13, x13, cc +// mul x26, x20, x13 +// umulh x20, x20, x13 +// cinv x9, x9, cc +// cmn x9, #0x1 +// eor x13, x26, x9 +// adcs x5, x5, x13 +// eor x20, x20, x9 +// adcs x21, x21, x20 +// adc x19, x19, x9 +// subs x20, x14, x7 +// cneg x20, x20, cc +// csetm x9, cc +// subs x13, x24, x10 +// cneg x13, x13, cc +// mul x26, x20, x13 +// umulh x20, x20, x13 +// cinv x9, x9, cc +// cmn x9, #0x1 +// eor x13, x26, x9 +// adcs x8, x8, x13 +// eor x20, x20, x9 +// adcs x11, x11, x20 +// adcs x15, x15, x9 +// adcs x16, x16, x9 +// adcs x5, x5, x9 +// adcs x21, x21, x9 +// adc x19, x19, x9 +// subs x20, x7, x25 +// cneg x20, x20, cc +// csetm x9, cc +// subs x13, x6, x24 +// cneg x13, x13, cc +// mul x26, x20, x13 +// umulh 
x20, x20, x13 +// cinv x9, x9, cc +// cmn x9, #0x1 +// eor x13, x26, x9 +// adcs x16, x16, x13 +// eor x20, x20, x9 +// adcs x5, x5, x20 +// adcs x21, x21, x9 +// adc x19, x19, x9 +// subs x20, x14, x3 +// cneg x20, x20, cc +// csetm x9, cc +// subs x13, x12, x10 +// cneg x13, x13, cc +// mul x26, x20, x13 +// umulh x20, x20, x13 +// cinv x9, x9, cc +// cmn x9, #0x1 +// eor x13, x26, x9 +// adcs x11, x11, x13 +// eor x20, x20, x9 +// adcs x15, x15, x20 +// adcs x16, x16, x9 +// adcs x5, x5, x9 +// adcs x21, x21, x9 +// adc x19, x19, x9 +// subs x25, x14, x25 +// cneg x25, x25, cc +// csetm x20, cc +// subs x10, x6, x10 +// cneg x10, x10, cc +// mul x6, x25, x10 +// umulh x25, x25, x10 +// cinv x10, x20, cc +// cmn x10, #0x1 +// eor x6, x6, x10 +// adcs x6, x15, x6 +// eor x25, x25, x10 +// adcs x25, x16, x25 +// adcs x16, x5, x10 +// adcs x5, x21, x10 +// adc x10, x19, x10 +// subs x7, x7, x3 +// cneg x7, x7, cc +// csetm x3, cc +// subs x24, x12, x24 +// cneg x24, x24, cc +// mul x12, x7, x24 +// umulh x7, x7, x24 +// cinv x3, x3, cc +// cmn x3, #0x1 +// eor x24, x12, x3 +// adcs x24, x6, x24 +// eor x7, x7, x3 +// adcs x7, x25, x7 +// adcs x25, x16, x3 +// adcs x12, x5, x3 +// adc x3, x10, x3 +// lsl x10, x23, #9 +// extr x6, x8, x23, #55 +// extr x23, x11, x8, #55 +// extr x16, x24, x11, #55 +// lsr x24, x24, #55 +// stp x7, x25, [sp] // @slothy:writes=stack0 +// stp x12, x3, [sp, #16] // @slothy:writes=stack16 +// stp x10, x6, [sp, #32] // @slothy:writes=stack32 +// stp x23, x16, [sp, #48] // @slothy:writes=stack48 +// str x24, [sp, #64] // @slothy:writes=stack64 +// ldp x7, x3, [x1, #32] +// ldr q0, [x1, #32] +// ldp x25, x10, [x1, #48] +// ldp x24, x12, [x2, #32] +// ldr q25, [x2, #32] +// ldp x6, x23, [x2, #48] +// ldr q18, [x1, #48] +// ldr q3, [x2, #48] +// uzp1 v26.4S, v25.4S, v0.4S +// rev64 v25.4S, v25.4S +// uzp1 v22.4S, v0.4S, v0.4S +// mul v0.4S, v25.4S, v0.4S +// uaddlp v0.2D, v0.4S +// shl v0.2D, v0.2D, #32 +// umlal v0.2D, v22.2S, v26.2S +// mov 
x16, v0.d[0] +// mov x5, v0.d[1] +// movi v0.2D, #0x00000000ffffffff +// uzp2 v25.4S, v3.4S, v3.4S +// xtn v26.2S, v18.2D +// xtn v22.2S, v3.2D +// rev64 v24.4S, v3.4S +// umull v19.2D, v26.2S, v22.2S +// umull v3.2D, v26.2S, v25.2S +// uzp2 v20.4S, v18.4S, v18.4S +// mul v18.4S, v24.4S, v18.4S +// usra v3.2D, v19.2D, #32 +// umull v6.2D, v20.2S, v25.2S +// uaddlp v25.2D, v18.4S +// and v0.16B, v3.16B, v0.16B +// umlal v0.2D, v20.2S, v22.2S +// shl v25.2D, v25.2D, #32 +// usra v6.2D, v3.2D, #32 +// umlal v25.2D, v26.2S, v22.2S +// usra v6.2D, v0.2D, #32 +// mov x21, v25.d[0] +// mov x19, v25.d[1] +// umulh x8, x7, x24 +// adds x5, x5, x8 +// umulh x8, x3, x12 +// adcs x21, x21, x8 +// mov x8, v6.d[0] +// adcs x19, x19, x8 +// mov x8, v6.d[1] +// adc x8, x8, xzr +// adds x11, x5, x16 +// adcs x5, x21, x5 +// adcs x21, x19, x21 +// adcs x19, x8, x19 +// adc x8, xzr, x8 +// adds x15, x5, x16 +// adcs x20, x21, x11 +// adcs x5, x19, x5 +// adcs x21, x8, x21 +// adcs x19, xzr, x19 +// adc x8, xzr, x8 +// subs x9, x25, x10 +// cneg x9, x9, cc +// csetm x13, cc +// subs x26, x23, x6 +// cneg x26, x26, cc +// mul x22, x9, x26 +// umulh x9, x9, x26 +// cinv x13, x13, cc +// cmn x13, #0x1 +// eor x26, x22, x13 +// adcs x21, x21, x26 +// eor x9, x9, x13 +// adcs x19, x19, x9 +// adc x8, x8, x13 +// subs x9, x7, x3 +// cneg x9, x9, cc +// csetm x13, cc +// subs x26, x12, x24 +// cneg x26, x26, cc +// mul x22, x9, x26 +// umulh x9, x9, x26 +// cinv x13, x13, cc +// cmn x13, #0x1 +// eor x26, x22, x13 +// adcs x11, x11, x26 +// eor x9, x9, x13 +// adcs x15, x15, x9 +// adcs x20, x20, x13 +// adcs x5, x5, x13 +// adcs x21, x21, x13 +// adcs x19, x19, x13 +// adc x8, x8, x13 +// subs x9, x3, x10 +// cneg x9, x9, cc +// csetm x13, cc +// subs x26, x23, x12 +// cneg x26, x26, cc +// mul x22, x9, x26 +// umulh x9, x9, x26 +// cinv x13, x13, cc +// cmn x13, #0x1 +// eor x26, x22, x13 +// adcs x5, x5, x26 +// eor x9, x9, x13 +// adcs x14, x21, x9 +// adcs x21, x19, x13 +// adc x19, x8, 
x13 +// subs x9, x7, x25 +// cneg x8, x9, cc +// csetm x9, cc +// subs x13, x6, x24 +// cneg x13, x13, cc +// mul x26, x8, x13 +// umulh x8, x8, x13 +// cinv x9, x9, cc +// cmn x9, #0x1 +// eor x13, x26, x9 +// adcs x15, x15, x13 +// eor x8, x8, x9 +// adcs x8, x20, x8 +// adcs x5, x5, x9 +// adcs x20, x14, x9 +// adcs x21, x21, x9 +// adc x19, x19, x9 +// subs x9, x7, x10 +// cneg x9, x9, cc +// csetm x13, cc +// subs x26, x23, x24 +// cneg x26, x26, cc +// mul x22, x9, x26 +// umulh x9, x9, x26 +// cinv x13, x13, cc +// cmn x13, #0x1 +// eor x26, x22, x13 +// adcs x8, x8, x26 +// eor x9, x9, x13 +// adcs x5, x5, x9 +// adcs x20, x20, x13 +// adcs x21, x21, x13 +// adc x19, x19, x13 +// subs x9, x3, x25 +// cneg x9, x9, cc +// csetm x13, cc +// subs x26, x6, x12 +// cneg x26, x26, cc +// mul x22, x9, x26 +// umulh x9, x9, x26 +// cinv x13, x13, cc +// cmn x13, #0x1 +// eor x26, x22, x13 +// adcs x8, x8, x26 +// eor x9, x9, x13 +// adcs x5, x5, x9 +// adcs x20, x20, x13 +// adcs x21, x21, x13 +// adc x19, x19, x13 +// ldp x9, x13, [sp] // @slothy:reads=stack0 +// adds x16, x16, x9 +// adcs x11, x11, x13 +// stp x16, x11, [sp] // @slothy:writes=stack0 +// ldp x16, x11, [sp, #16] // @slothy:reads=stack16 +// adcs x16, x15, x16 +// adcs x8, x8, x11 +// stp x16, x8, [sp, #16] // @slothy:writes=stack16 +// ldp x16, x8, [sp, #32] // @slothy:reads=stack32 +// adcs x16, x5, x16 +// adcs x5, x20, x8 +// stp x16, x5, [sp, #32] // @slothy:writes=stack32 +// ldp x16, x5, [sp, #48] // @slothy:reads=stack48 +// adcs x16, x21, x16 +// adcs x5, x19, x5 +// stp x16, x5, [sp, #48] // @slothy:writes=stack48 +// ldr x16, [sp, #64] // @slothy:reads=stack64 +// adc x16, x16, xzr +// str x16, [sp, #64] // @slothy:writes=stack64 +// ldp x16, x5, [x1] +// subs x7, x7, x16 +// sbcs x3, x3, x5 +// ldp x16, x5, [x1, #16] +// sbcs x25, x25, x16 +// sbcs x10, x10, x5 +// csetm x16, cc +// ldp x5, x21, [x2] +// subs x24, x5, x24 +// sbcs x12, x21, x12 +// ldp x5, x19, [x2, #16] +// sbcs x6, x5, 
x6 +// sbcs x23, x19, x23 +// csetm x5, cc +// eor x7, x7, x16 +// subs x7, x7, x16 +// eor x3, x3, x16 +// sbcs x3, x3, x16 +// eor x25, x25, x16 +// sbcs x25, x25, x16 +// eor x10, x10, x16 +// sbc x10, x10, x16 +// eor x24, x24, x5 +// subs x24, x24, x5 +// eor x12, x12, x5 +// sbcs x12, x12, x5 +// eor x6, x6, x5 +// sbcs x6, x6, x5 +// eor x23, x23, x5 +// sbc x23, x23, x5 +// eor x16, x5, x16 +// mul x21, x7, x24 +// mul x5, x3, x12 +// mul x19, x25, x6 +// mul x8, x10, x23 +// umulh x11, x7, x24 +// adds x5, x5, x11 +// umulh x11, x3, x12 +// adcs x19, x19, x11 +// umulh x11, x25, x6 +// adcs x8, x8, x11 +// umulh x11, x10, x23 +// adc x11, x11, xzr +// adds x15, x5, x21 +// adcs x5, x19, x5 +// adcs x19, x8, x19 +// adcs x8, x11, x8 +// adc x11, xzr, x11 +// adds x20, x5, x21 +// adcs x9, x19, x15 +// adcs x5, x8, x5 +// adcs x19, x11, x19 +// adcs x8, xzr, x8 +// adc x11, xzr, x11 +// subs x13, x25, x10 +// cneg x13, x13, cc +// csetm x26, cc +// subs x22, x23, x6 +// cneg x22, x22, cc +// mul x4, x13, x22 +// umulh x13, x13, x22 +// cinv x26, x26, cc +// cmn x26, #0x1 +// eor x22, x4, x26 +// adcs x19, x19, x22 +// eor x13, x13, x26 +// adcs x8, x8, x13 +// adc x11, x11, x26 +// subs x13, x7, x3 +// cneg x13, x13, cc +// csetm x26, cc +// subs x22, x12, x24 +// cneg x22, x22, cc +// mul x4, x13, x22 +// umulh x13, x13, x22 +// cinv x26, x26, cc +// cmn x26, #0x1 +// eor x22, x4, x26 +// adcs x15, x15, x22 +// eor x13, x13, x26 +// adcs x20, x20, x13 +// adcs x9, x9, x26 +// adcs x5, x5, x26 +// adcs x19, x19, x26 +// adcs x8, x8, x26 +// adc x11, x11, x26 +// subs x13, x3, x10 +// cneg x13, x13, cc +// csetm x26, cc +// subs x22, x23, x12 +// cneg x22, x22, cc +// mul x4, x13, x22 +// umulh x13, x13, x22 +// cinv x26, x26, cc +// cmn x26, #0x1 +// eor x22, x4, x26 +// adcs x5, x5, x22 +// eor x13, x13, x26 +// adcs x19, x19, x13 +// adcs x8, x8, x26 +// adc x11, x11, x26 +// subs x13, x7, x25 +// cneg x13, x13, cc +// csetm x26, cc +// subs x22, x6, x24 
+// cneg x22, x22, cc +// mul x4, x13, x22 +// umulh x13, x13, x22 +// cinv x26, x26, cc +// cmn x26, #0x1 +// eor x22, x4, x26 +// adcs x20, x20, x22 +// eor x13, x13, x26 +// adcs x9, x9, x13 +// adcs x5, x5, x26 +// adcs x19, x19, x26 +// adcs x8, x8, x26 +// adc x11, x11, x26 +// subs x7, x7, x10 +// cneg x7, x7, cc +// csetm x10, cc +// subs x24, x23, x24 +// cneg x24, x24, cc +// mul x23, x7, x24 +// umulh x7, x7, x24 +// cinv x10, x10, cc +// cmn x10, #0x1 +// eor x24, x23, x10 +// adcs x24, x9, x24 +// eor x7, x7, x10 +// adcs x7, x5, x7 +// adcs x23, x19, x10 +// adcs x5, x8, x10 +// adc x10, x11, x10 +// subs x3, x3, x25 +// cneg x3, x3, cc +// csetm x25, cc +// subs x12, x6, x12 +// cneg x12, x12, cc +// mul x6, x3, x12 +// umulh x3, x3, x12 +// cinv x25, x25, cc +// cmn x25, #0x1 +// eor x12, x6, x25 +// adcs x24, x24, x12 +// eor x3, x3, x25 +// adcs x7, x7, x3 +// adcs x3, x23, x25 +// adcs x12, x5, x25 +// adc x25, x10, x25 +// ldp x10, x6, [sp] // @slothy:reads=stack0 +// ldp x23, x5, [sp, #16] // @slothy:reads=stack16 +// eor x21, x21, x16 +// adds x21, x21, x10 +// eor x19, x15, x16 +// adcs x19, x19, x6 +// eor x8, x20, x16 +// adcs x8, x8, x23 +// eor x24, x24, x16 +// adcs x24, x24, x5 +// eor x7, x7, x16 +// ldp x11, x15, [sp, #32] // @slothy:reads=stack32 +// ldp x20, x9, [sp, #48] // @slothy:reads=stack48 +// ldr x13, [sp, #64] // @slothy:reads=stack64 +// adcs x7, x7, x11 +// eor x3, x3, x16 +// adcs x3, x3, x15 +// eor x12, x12, x16 +// adcs x12, x12, x20 +// eor x25, x25, x16 +// adcs x25, x25, x9 +// adc x26, x13, xzr +// adds x7, x7, x10 +// adcs x3, x3, x6 +// adcs x10, x12, x23 +// adcs x25, x25, x5 +// and x12, x16, #0x1ff +// lsl x6, x21, #9 +// orr x12, x6, x12 +// adcs x12, x11, x12 +// extr x6, x19, x21, #55 +// adcs x6, x15, x6 +// extr x23, x8, x19, #55 +// adcs x23, x20, x23 +// extr x16, x24, x8, #55 +// adcs x16, x9, x16 +// lsr x24, x24, #55 +// adc x24, x24, x13 +// ldr x5, [x2, #64] +// ldp x21, x19, [x1] +// and x8, x21, 
#0xfffffffffffff +// mul x8, x5, x8 +// ldr x11, [x1, #64] +// ldp x15, x20, [x2] +// and x9, x15, #0xfffffffffffff +// mul x9, x11, x9 +// add x8, x8, x9 +// extr x21, x19, x21, #52 +// and x21, x21, #0xfffffffffffff +// mul x21, x5, x21 +// extr x15, x20, x15, #52 +// and x15, x15, #0xfffffffffffff +// mul x15, x11, x15 +// add x21, x21, x15 +// lsr x15, x8, #52 +// add x21, x21, x15 +// lsl x8, x8, #12 +// extr x8, x21, x8, #12 +// adds x7, x7, x8 +// ldp x8, x15, [x1, #16] +// ldp x9, x13, [x2, #16] +// extr x19, x8, x19, #40 +// and x19, x19, #0xfffffffffffff +// mul x19, x5, x19 +// extr x20, x9, x20, #40 +// and x20, x20, #0xfffffffffffff +// mul x20, x11, x20 +// add x19, x19, x20 +// lsr x20, x21, #52 +// add x19, x19, x20 +// lsl x21, x21, #12 +// extr x21, x19, x21, #24 +// adcs x3, x3, x21 +// extr x21, x15, x8, #28 +// and x21, x21, #0xfffffffffffff +// mul x21, x5, x21 +// extr x8, x13, x9, #28 +// and x8, x8, #0xfffffffffffff +// mul x8, x11, x8 +// add x21, x21, x8 +// lsr x8, x19, #52 +// add x21, x21, x8 +// lsl x19, x19, #12 +// extr x19, x21, x19, #36 +// adcs x10, x10, x19 +// and x19, x3, x10 +// ldp x8, x20, [x1, #32] +// ldp x9, x22, [x2, #32] +// extr x15, x8, x15, #16 +// and x15, x15, #0xfffffffffffff +// mul x4, x5, x15 +// extr x15, x9, x13, #16 +// and x15, x15, #0xfffffffffffff +// mul x15, x11, x15 +// add x15, x4, x15 +// lsl x13, x26, #48 +// add x15, x15, x13 +// lsr x13, x21, #52 +// add x15, x15, x13 +// lsl x21, x21, #12 +// extr x21, x15, x21, #48 +// adcs x25, x25, x21 +// and x21, x19, x25 +// lsr x19, x8, #4 +// and x19, x19, #0xfffffffffffff +// mul x19, x5, x19 +// lsr x26, x9, #4 +// and x13, x26, #0xfffffffffffff +// mul x26, x11, x13 +// add x19, x19, x26 +// lsr x13, x15, #52 +// add x19, x19, x13 +// lsl x15, x15, #12 +// extr x15, x19, x15, #60 +// extr x8, x20, x8, #56 +// and x8, x8, #0xfffffffffffff +// mul x8, x5, x8 +// extr x9, x22, x9, #56 +// and x9, x9, #0xfffffffffffff +// mul x9, x11, x9 +// add x8, x8, 
x9 +// lsr x19, x19, #52 +// add x19, x8, x19 +// lsl x8, x15, #8 +// extr x8, x19, x8, #8 +// adcs x12, x12, x8 +// and x21, x21, x12 +// ldp x1, x8, [x1, #48] +// ldp x2, x15, [x2, #48] +// extr x20, x1, x20, #44 +// and x20, x20, #0xfffffffffffff +// mul x20, x5, x20 +// extr x9, x2, x22, #44 +// and x9, x9, #0xfffffffffffff +// mul x9, x11, x9 +// add x20, x20, x9 +// lsr x9, x19, #52 +// add x22, x20, x9 +// lsl x19, x19, #12 +// extr x19, x22, x19, #20 +// adcs x6, x6, x19 +// and x21, x21, x6 +// extr x1, x8, x1, #32 +// and x1, x1, #0xfffffffffffff +// mul x1, x5, x1 +// extr x2, x15, x2, #32 +// and x2, x2, #0xfffffffffffff +// mul x2, x11, x2 +// add x2, x1, x2 +// lsr x1, x22, #52 +// add x2, x2, x1 +// lsl x1, x22, #12 +// extr x1, x2, x1, #32 +// adcs x23, x23, x1 +// and x21, x21, x23 +// lsr x1, x8, #20 +// mul x1, x5, x1 +// lsr x19, x15, #20 +// mul x19, x11, x19 +// add x1, x1, x19 +// lsr x19, x2, #52 +// add x19, x1, x19 +// lsl x2, x2, #12 +// extr x2, x19, x2, #44 +// adcs x16, x16, x2 +// and x2, x21, x16 +// mul x5, x5, x11 +// lsr x1, x19, #44 +// add x5, x5, x1 +// adc x24, x24, x5 +// lsr x5, x24, #9 +// orr x24, x24, #0xfffffffffffffe00 +// cmp xzr, xzr +// adcs xzr, x7, x5 +// adcs xzr, x2, xzr +// adcs xzr, x24, xzr +// adcs x7, x7, x5 +// adcs x2, x3, xzr +// adcs x10, x10, xzr +// adcs x25, x25, xzr +// adcs x12, x12, xzr +// adcs x6, x6, xzr +// adcs x23, x23, xzr +// adcs x16, x16, xzr +// adc x3, x24, xzr +// stp x2, x10, [x0] // @slothy:writes=buffer0 +// stp x25, x12, [x0, #16] // @slothy:writes=buffer16 +// stp x6, x23, [x0, #32] // @slothy:writes=buffer32 +// lsl x25, x7, #9 +// and x3, x3, #0x1ff +// orr x3, x3, x25 +// stp x16, x3, [x0, #48] // @slothy:writes=buffer48 +// lsr x14, x7, #55 +// str x14, [x0, #64] // @slothy:writes=buffer64 +// add sp, sp, #80 +// ldp x25, x26, [sp], #16 +// ldp x23, x24, [sp], #16 +// ldp x21, x22, [sp], #16 +// ldp x19, x20, [sp], #16 +// ret +// +// The bash script used for step 2 is as 
follows: +// +// # Store the assembly instructions except the last 'ret', +// # callee-register store/loads and add/sub sp #80 as, say, 'input.S'. +// export OUTPUTS="[hint_buffer0,hint_buffer16,hint_buffer32,hint_buffer48,hint_buffer64]" +// export RESERVED_REGS="[x18,x27,x28,x29,x30,sp,q8,q9,q10,q11,q12,q13,q14,q15,v8,v9,v10,v11,v12,v13,v14,v15]" +// /tools/external/slothy.sh input.S my_out_dir +// # my_out_dir/3.opt.s is the optimized assembly. Its output may differ +// # from this file since the sequence is non-deterministically chosen. +// # Please add 'ret' at the end of the output assembly. + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p521_neon) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p521_neon) + .text + .balign 4 + +S2N_BN_SYMBOL(bignum_montmul_p521_neon): + +// Save registers and make space for the temporary buffer + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + sub sp, sp, #80 + + ldr q24, [x2] + ldr q21, [x1] + ldr q1, [x2, #48] + ldp x23, x20, [x1, #16] + movi v18.2D, #0x00000000ffffffff + ldp x19, x17, [x2, #16] + uzp2 v3.4S, v24.4S, v24.4S + xtn v6.2S, v21.2D + ldp x11, x22, [x1] + rev64 v5.4S, v24.4S + xtn v24.2S, v24.2D + subs x16, x23, x20 + umull v29.2D, v6.2S, v3.2S + rev64 v31.4S, v1.4S + cneg x26, x16, cc + umull v27.2D, v6.2S, v24.2S + ldr q19, [x1, #48] + csetm x12, cc + mul x15, x20, x17 + mul v26.4S, v5.4S, v21.4S + uzp2 v28.4S, v21.4S, v21.4S + subs x6, x17, x19 + xtn v7.2S, v1.2D + cinv x10, x12, cc + cneg x3, x6, cc + uzp2 v21.4S, v1.4S, v1.4S + umull v1.2D, v28.2S, v3.2S + mul x12, x26, x3 + usra v29.2D, v27.2D, #32 + mul v25.4S, v31.4S, v19.4S + usra v1.2D, v29.2D, #32 + uaddlp v31.2D, v26.4S + umulh x14, x26, x3 + eor x12, x12, x10 + and v26.16B, v29.16B, v18.16B + uaddlp v2.2D, v25.4S + subs x16, x11, x22 + shl v0.2D, v31.2D, #32 + xtn v31.2S, v19.2D + cneg x6, x16, cc + shl v16.2D, v2.2D, #32 + umlal v26.2D, v28.2S, v24.2S + umlal v0.2D, v6.2S, v24.2S + 
uzp2 v30.4S, v19.4S, v19.4S + umulh x26, x20, x17 + umull v22.2D, v31.2S, v21.2S + umull v29.2D, v30.2S, v21.2S + usra v1.2D, v26.2D, #32 + mul x13, x23, x19 + eor x9, x14, x10 + ldr q5, [x2, #32] + umull v26.2D, v31.2S, v7.2S + ldp x21, x4, [x2] + csetm x8, cc + mov x16, v0.d[1] + ldr q6, [x1, #32] + umlal v16.2D, v31.2S, v7.2S + mov x3, v0.d[0] + umulh x14, x23, x19 + mov x25, v1.d[1] + mov x5, v1.d[0] + usra v22.2D, v26.2D, #32 + rev64 v3.4S, v5.4S + adds x16, x16, x5 + uzp1 v24.4S, v5.4S, v6.4S + movi v26.2D, #0x00000000ffffffff + adcs x7, x13, x25 + uzp1 v0.4S, v6.4S, v6.4S + mul v5.4S, v3.4S, v6.4S + adcs x25, x15, x14 + adc x13, x26, xzr + adds x26, x16, x3 + and v6.16B, v22.16B, v26.16B + usra v29.2D, v22.2D, #32 + adcs x16, x7, x16 + adcs x14, x25, x7 + umlal v6.2D, v30.2S, v7.2S + adcs x7, x13, x25 + uaddlp v7.2D, v5.4S + adc x13, xzr, x13 + adds x25, x16, x3 + adcs x24, x14, x26 + shl v1.2D, v7.2D, #32 + adcs x5, x7, x16 + usra v29.2D, v6.2D, #32 + adcs x16, x13, x14 + umlal v1.2D, v0.2S, v24.2S + adcs x14, xzr, x7 + adc x13, xzr, x13 + subs x7, x4, x21 + cneg x7, x7, cc + mul x15, x6, x7 + umulh x7, x6, x7 + cinv x6, x8, cc + cmn x10, #0x1 + adcs x16, x16, x12 + eor x8, x15, x6 + adcs x14, x14, x9 + adc x9, x13, x10 + subs x13, x22, x20 + cneg x13, x13, cc + csetm x10, cc + subs x12, x17, x4 + cinv x15, x10, cc + cneg x10, x12, cc + cmn x6, #0x1 + umulh x12, x13, x10 + eor x7, x7, x6 + adcs x26, x26, x8 + adcs x7, x25, x7 + adcs x8, x24, x6 + adcs x24, x5, x6 + adcs x25, x16, x6 + mul x5, x13, x10 + adcs x13, x14, x6 + adc x14, x9, x6 + subs x10, x11, x23 + csetm x16, cc + cneg x9, x10, cc + subs x6, x19, x21 + cinv x10, x16, cc + cneg x16, x6, cc + eor x5, x5, x15 + subs x20, x11, x20 + mul x6, x9, x16 + csetm x11, cc + cneg x20, x20, cc + subs x17, x17, x21 + cneg x17, x17, cc + cinv x11, x11, cc + umulh x9, x9, x16 + eor x16, x12, x15 + subs x21, x22, x23 + cneg x22, x21, cc + eor x12, x6, x10 + csetm x6, cc + cmn x15, #0x1 + eor x9, x9, x10 + adcs 
x5, x24, x5 + umulh x23, x20, x17 + lsl x24, x3, #9 + adcs x25, x25, x16 + adcs x21, x13, x15 + adc x16, x14, x15 + subs x13, x19, x4 + cneg x14, x13, cc + cinv x15, x6, cc + cmn x10, #0x1 + mul x13, x20, x17 + extr x17, x26, x3, #55 + adcs x12, x7, x12 + adcs x8, x8, x9 + eor x19, x23, x11 + adcs x6, x5, x10 + eor x13, x13, x11 + mov x5, v29.d[0] + adcs x25, x25, x10 + extr x26, x12, x26, #55 + mul x4, x22, x14 + adcs x7, x21, x10 + stp x24, x17, [sp, #32] + ldp x20, x21, [x1, #48] + adc x24, x16, x10 + cmn x11, #0x1 + mov x16, v16.d[0] + umulh x17, x22, x14 + adcs x13, x8, x13 + eor x9, x4, x15 + adcs x10, x6, x19 + ldp x22, x23, [x1, #32] + adcs x3, x25, x11 + ldp x4, x19, [x2, #32] + eor x17, x17, x15 + adcs x7, x7, x11 + adc x14, x24, x11 + subs x6, x20, x21 + csetm x11, cc + cneg x8, x6, cc + cmn x15, #0x1 + umulh x25, x22, x4 + adcs x24, x13, x9 + adcs x10, x10, x17 + extr x13, x24, x12, #55 + adcs x9, x3, x15 + ldp x17, x3, [x2, #48] + umulh x6, x23, x19 + adcs x7, x7, x15 + adc x14, x14, x15 + subs x12, x22, x23 + stp x10, x9, [sp] + mov x9, v1.d[1] + csetm x10, cc + stp x7, x14, [sp, #16] + cneg x12, x12, cc + subs x14, x3, x17 + mov x7, v16.d[1] + cneg x15, x14, cc + mov x14, v29.d[1] + cinv x11, x11, cc + adds x9, x9, x25 + mul x25, x8, x15 + stp x26, x13, [sp, #48] + lsr x24, x24, #55 + adcs x26, x16, x6 + mov x13, v1.d[0] + str x24, [sp, #64] + adcs x7, x7, x5 + adc x5, x14, xzr + umulh x6, x8, x15 + eor x15, x25, x11 + subs x25, x19, x4 + cinv x16, x10, cc + cneg x10, x25, cc + eor x6, x6, x11 + adds x8, x9, x13 + adcs x14, x26, x9 + mul x9, x12, x10 + adcs x24, x7, x26 + adcs x7, x5, x7 + umulh x25, x12, x10 + adc x12, xzr, x5 + adds x26, x14, x13 + eor x10, x9, x16 + adcs x9, x24, x8 + adcs x5, x7, x14 + adcs x14, x12, x24 + adcs x7, xzr, x7 + adc x12, xzr, x12 + eor x24, x25, x16 + cmn x11, #0x1 + adcs x25, x14, x15 + adcs x14, x7, x6 + adc x11, x12, x11 + subs x12, x23, x21 + csetm x15, cc + cneg x7, x12, cc + subs x12, x3, x19 + cneg x12, x12, 
cc + cinv x15, x15, cc + cmn x16, #0x1 + adcs x6, x8, x10 + mul x10, x7, x12 + adcs x26, x26, x24 + adcs x9, x9, x16 + umulh x24, x7, x12 + eor x8, x10, x15 + adcs x5, x5, x16 + adcs x25, x25, x16 + adcs x7, x14, x16 + adc x16, x11, x16 + subs x11, x22, x20 + cneg x11, x11, cc + csetm x14, cc + subs x10, x17, x4 + cinv x14, x14, cc + cneg x10, x10, cc + cmn x15, #0x1 + eor x12, x24, x15 + adcs x5, x5, x8 + mul x24, x11, x10 + adcs x8, x25, x12 + adcs x25, x7, x15 + adc x16, x16, x15 + subs x12, x22, x21 + umulh x10, x11, x10 + cneg x15, x12, cc + csetm x11, cc + subs x12, x3, x4 + cneg x12, x12, cc + cinv x7, x11, cc + mul x11, x15, x12 + eor x24, x24, x14 + cmn x14, #0x1 + eor x10, x10, x14 + adcs x24, x26, x24 + eor x26, x11, x7 + adcs x10, x9, x10 + ldp x11, x9, [x1, #16] + umulh x15, x15, x12 + adcs x5, x5, x14 + adcs x8, x8, x14 + adcs x25, x25, x14 + adc x12, x16, x14 + cmn x7, #0x1 + adcs x16, x10, x26 + eor x14, x15, x7 + adcs x26, x5, x14 + ldp x5, x10, [x1] + adcs x14, x8, x7 + adcs x15, x25, x7 + adc x7, x12, x7 + subs x25, x23, x20 + cneg x25, x25, cc + csetm x8, cc + subs x22, x22, x5 + sbcs x10, x23, x10 + ldp x23, x12, [x2] + sbcs x20, x20, x11 + sbcs x21, x21, x9 + csetm x9, cc + subs x11, x17, x19 + cneg x5, x11, cc + cinv x11, x8, cc + subs x23, x23, x4 + sbcs x19, x12, x19 + eor x20, x20, x9 + ldp x12, x4, [x2, #16] + eor x21, x21, x9 + umulh x8, x25, x5 + eor x22, x22, x9 + eor x10, x10, x9 + sbcs x17, x12, x17 + sbcs x3, x4, x3 + mul x25, x25, x5 + csetm x12, cc + subs x22, x22, x9 + eor x4, x23, x12 + sbcs x23, x10, x9 + eor x10, x3, x12 + sbcs x20, x20, x9 + eor x5, x8, x11 + eor x3, x19, x12 + sbc x21, x21, x9 + subs x4, x4, x12 + eor x25, x25, x11 + sbcs x19, x3, x12 + eor x3, x17, x12 + sbcs x17, x3, x12 + umulh x8, x23, x19 + sbc x3, x10, x12 + cmn x11, #0x1 + adcs x25, x16, x25 + adcs x26, x26, x5 + ldp x10, x5, [sp] + adcs x16, x14, x11 + mul x14, x22, x4 + adcs x15, x15, x11 + adc x7, x7, x11 + adds x11, x13, x10 + umulh x10, x21, x3 + 
adcs x13, x6, x5 + ldp x6, x5, [sp, #16] + stp x11, x13, [sp] + eor x13, x12, x9 + mul x9, x23, x19 + adcs x6, x24, x6 + ldp x11, x24, [sp, #32] + mul x12, x20, x17 + adcs x25, x25, x5 + stp x6, x25, [sp, #16] + ldp x6, x25, [sp, #48] + umulh x5, x20, x17 + adcs x11, x26, x11 + ldr x26, [sp, #64] + adcs x16, x16, x24 + stp x11, x16, [sp, #32] + adcs x11, x15, x6 + umulh x24, x22, x4 + adcs x25, x7, x25 + adc x7, x26, xzr + stp x11, x25, [sp, #48] + subs x26, x20, x21 + csetm x15, cc + cneg x25, x26, cc + str x7, [sp, #64] + mul x11, x21, x3 + subs x6, x22, x23 + cneg x6, x6, cc + csetm x16, cc + subs x26, x3, x17 + cneg x26, x26, cc + cinv x7, x15, cc + adds x24, x9, x24 + adcs x8, x12, x8 + umulh x12, x25, x26 + adcs x5, x11, x5 + adc x11, x10, xzr + subs x15, x19, x4 + cinv x9, x16, cc + mul x26, x25, x26 + eor x25, x12, x7 + cneg x12, x15, cc + adds x16, x24, x14 + eor x15, x26, x7 + umulh x26, x6, x12 + adcs x10, x8, x24 + adcs x8, x5, x8 + adcs x24, x11, x5 + adc x5, xzr, x11 + adds x11, x10, x14 + mul x12, x6, x12 + adcs x6, x8, x16 + eor x14, x14, x13 + adcs x10, x24, x10 + adcs x8, x5, x8 + adcs x24, xzr, x24 + adc x5, xzr, x5 + cmn x7, #0x1 + adcs x15, x8, x15 + adcs x24, x24, x25 + eor x25, x26, x9 + adc x8, x5, x7 + eor x5, x12, x9 + subs x26, x23, x21 + cneg x12, x26, cc + csetm x26, cc + subs x7, x3, x19 + cneg x7, x7, cc + cinv x26, x26, cc + cmn x9, #0x1 + adcs x5, x16, x5 + mul x16, x12, x7 + adcs x25, x11, x25 + umulh x7, x12, x7 + adcs x12, x6, x9 + eor x11, x16, x26 + adcs x6, x10, x9 + adcs x10, x15, x9 + adcs x24, x24, x9 + adc x8, x8, x9 + subs x15, x22, x20 + cneg x15, x15, cc + csetm x9, cc + subs x16, x17, x4 + cneg x16, x16, cc + cinv x9, x9, cc + subs x21, x22, x21 + mul x22, x15, x16 + eor x7, x7, x26 + cneg x21, x21, cc + umulh x16, x15, x16 + csetm x15, cc + subs x4, x3, x4 + cneg x3, x4, cc + eor x4, x22, x9 + cinv x15, x15, cc + cmn x26, #0x1 + eor x22, x5, x13 + adcs x5, x6, x11 + adcs x6, x10, x7 + adcs x10, x24, x26 + eor x11, 
x16, x9 + adc x8, x8, x26 + subs x16, x23, x20 + cneg x7, x16, cc + csetm x23, cc + cmn x9, #0x1 + adcs x16, x25, x4 + mul x4, x21, x3 + adcs x24, x12, x11 + eor x11, x16, x13 + adcs x26, x5, x9 + adcs x16, x6, x9 + umulh x20, x21, x3 + adcs x6, x10, x9 + ldp x3, x10, [x1] + adc x12, x8, x9 + subs x21, x17, x19 + cneg x8, x21, cc + eor x25, x20, x15 + eor x20, x4, x15 + mul x19, x7, x8 + cinv x17, x23, cc + cmn x15, #0x1 + adcs x4, x24, x20 + extr x21, x10, x3, #52 + umulh x9, x7, x8 + and x24, x21, #0xfffffffffffff + adcs x26, x26, x25 + eor x7, x19, x17 + adcs x5, x16, x15 + and x23, x3, #0xfffffffffffff + eor x9, x9, x17 + adcs x21, x6, x15 + adc x6, x12, x15 + cmn x17, #0x1 + adcs x25, x4, x7 + and x4, x13, #0x1ff + ldp x16, x8, [sp] + adcs x20, x26, x9 + adcs x12, x5, x17 + ldp x3, x5, [sp, #16] + eor x15, x12, x13 + adcs x12, x21, x17 + adc x9, x6, x17 + adds x21, x14, x16 + lsl x7, x21, #9 + eor x26, x12, x13 + ldp x19, x17, [sp, #32] + orr x4, x7, x4 + eor x14, x25, x13 + adcs x7, x22, x8 + adcs x12, x11, x3 + eor x11, x20, x13 + ldp x6, x25, [sp, #48] + eor x20, x9, x13 + adcs x22, x14, x5 + ldr x14, [x2, #64] + adcs x9, x11, x19 + ldr x11, [sp, #64] + adcs x13, x15, x17 + adcs x26, x26, x6 + adcs x20, x20, x25 + adc x15, x11, xzr + adds x16, x9, x16 + mul x9, x14, x23 + adcs x23, x13, x8 + extr x13, x7, x21, #55 + adcs x21, x26, x3 + ldp x3, x26, [x1, #16] + extr x8, x22, x12, #55 + adcs x20, x20, x5 + adcs x19, x19, x4 + mul x4, x14, x24 + ldp x5, x24, [x2] + adcs x17, x17, x13 + extr x13, x26, x3, #28 + extr x10, x3, x10, #40 + extr x7, x12, x7, #55 + and x12, x13, #0xfffffffffffff + adcs x3, x6, x7 + ldr x6, [x1, #64] + extr x7, x24, x5, #52 + and x5, x5, #0xfffffffffffff + mul x12, x14, x12 + adcs x13, x25, x8 + and x7, x7, #0xfffffffffffff + ldp x8, x25, [x2, #16] + mul x5, x6, x5 + extr x24, x8, x24, #40 + and x24, x24, #0xfffffffffffff + add x9, x9, x5 + lsr x5, x22, #55 + mul x7, x6, x7 + extr x22, x25, x8, #28 + and x10, x10, #0xfffffffffffff + 
mul x10, x14, x10 + lsr x8, x9, #52 + lsl x9, x9, #12 + add x7, x4, x7 + adc x4, x5, x11 + ldp x11, x5, [x2, #32] + add x8, x7, x8 + and x7, x22, #0xfffffffffffff + extr x22, x8, x9, #12 + lsl x9, x15, #48 + mul x15, x6, x24 + add x10, x10, x15 + lsr x15, x8, #52 + extr x25, x11, x25, #16 + and x25, x25, #0xfffffffffffff + mul x24, x6, x7 + add x7, x10, x15 + lsr x10, x7, #52 + lsl x8, x8, #12 + extr x8, x7, x8, #24 + adds x22, x16, x22 + ldp x16, x15, [x1, #32] + adcs x23, x23, x8 + extr x8, x5, x11, #56 + mul x25, x6, x25 + add x24, x12, x24 + add x12, x24, x10 + lsr x10, x16, #4 + lsl x7, x7, #12 + extr x24, x12, x7, #36 + and x10, x10, #0xfffffffffffff + extr x26, x16, x26, #16 + mul x10, x14, x10 + and x8, x8, #0xfffffffffffff + adcs x21, x21, x24 + and x7, x26, #0xfffffffffffff + mul x7, x14, x7 + lsr x24, x11, #4 + and x24, x24, #0xfffffffffffff + extr x11, x15, x16, #56 + lsl x26, x12, #12 + and x16, x11, #0xfffffffffffff + mul x11, x6, x24 + lsr x12, x12, #52 + ldp x2, x24, [x2, #48] + add x25, x7, x25 + add x25, x25, x9 + and x9, x23, x21 + mul x8, x6, x8 + add x12, x25, x12 + add x25, x10, x11 + extr x11, x12, x26, #48 + ldp x7, x26, [x1, #48] + extr x5, x2, x5, #44 + lsr x1, x12, #52 + mul x10, x14, x16 + lsr x16, x24, #20 + add x10, x10, x8 + extr x8, x26, x7, #32 + and x8, x8, #0xfffffffffffff + extr x24, x24, x2, #32 + mul x2, x6, x16 + add x1, x25, x1 + lsr x25, x26, #20 + and x26, x24, #0xfffffffffffff + and x24, x5, #0xfffffffffffff + extr x16, x7, x15, #44 + mul x7, x6, x24 + adcs x11, x20, x11 + and x20, x16, #0xfffffffffffff + lsl x5, x12, #12 + and x15, x9, x11 + mul x24, x14, x20 + lsr x9, x1, #52 + add x20, x10, x9 + extr x12, x1, x5, #60 + lsl x9, x20, #12 + lsl x5, x12, #8 + mul x10, x14, x8 + extr x12, x20, x5, #8 + lsr x1, x20, #52 + add x7, x24, x7 + adcs x8, x19, x12 + and x5, x15, x8 + add x7, x7, x1 + mul x20, x6, x26 + extr x24, x7, x9, #20 + lsr x19, x7, #52 + mul x25, x14, x25 + lsl x16, x7, #12 + add x20, x10, x20 + adcs x12, 
x17, x24 + add x19, x20, x19 + lsr x26, x19, #52 + mul x24, x14, x6 + and x5, x5, x12 + add x6, x25, x2 + lsl x17, x19, #12 + add x14, x6, x26 + extr x16, x19, x16, #32 + lsr x6, x14, #44 + extr x19, x14, x17, #44 + add x9, x24, x6 + adcs x17, x3, x16 + adcs x2, x13, x19 + and x7, x5, x17 + adc x15, x4, x9 + cmp xzr, xzr + orr x1, x15, #0xfffffffffffffe00 + lsr x3, x15, #9 + adcs xzr, x22, x3 + and x15, x7, x2 + adcs xzr, x15, xzr + adcs xzr, x1, xzr + adcs x7, x22, x3 + lsl x3, x7, #9 + lsr x15, x7, #55 + str x15, [x0, #64] + adcs x13, x23, xzr + adcs x16, x21, xzr + stp x13, x16, [x0] + adcs x13, x11, xzr + adcs x16, x8, xzr + stp x13, x16, [x0, #16] + adcs x19, x12, xzr + adcs x16, x17, xzr + adcs x13, x2, xzr + stp x19, x16, [x0, #32] + adc x16, x1, xzr + and x16, x16, #0x1ff + orr x16, x16, x3 + stp x13, x16, [x0, #48] + +// Restore regs and return + + add sp, sp, #80 + ldp x25, x26, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/arm/p521/bignum_montsqr_p521_neon.S b/arm/p521/bignum_montsqr_p521_neon.S new file mode 100644 index 00000000..c4d11731 --- /dev/null +++ b/arm/p521/bignum_montsqr_p521_neon.S @@ -0,0 +1,1124 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^576) mod p_521 +// Input x[9]; output z[9] +// +// extern void bignum_montsqr_p521_neon +// (uint64_t z[static 9], uint64_t x[static 9]); +// +// Does z := (x^2 / 2^576) mod p_521, assuming x < p_521. This means the +// Montgomery base is the "native size" 2^{9*64} = 2^576; since p_521 is +// a Mersenne prime the basic modular squaring bignum_sqr_p521 can be +// considered a Montgomery operation to base 2^521. 
+// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + +// bignum_montsqr_p521_neon is functionally equivalent to bignum_montsqr_p521. +// It is written in a way that +// 1. A subset of scalar multiplications in bignum_montsqr_p521 are carefully +// chosen and vectorized +// 2. The vectorized assembly is rescheduled using the SLOTHY superoptimizer. +// https://github.com/slothy-optimizer/slothy +// +// The output program of step 1. is as follows: +// +// stp x19, x20, [sp, #-16]! +// stp x21, x22, [sp, #-16]! +// stp x23, x24, [sp, #-16]! +// ldp x16, x8, [x1] +// ldr q18, [x1] +// ldr q5, [x1] +// ldr q20, [x1] +// ldp x17, x13, [x1, #16] +// ldr q17, [x1, #16] +// ldr q1, [x1, #16] +// ldr q28, [x1, #16] +// ldp x9, x15, [x1, #32] +// ldr q27, [x1] +// ldr q29, [x1, #32] +// ldp x23, x2, [x1, #48] +// ldr q6, [x1, #48] +// ldr q4, [x1, #48] +// mul x24, x9, x23 +// mul x11, x15, x2 +// umulh x20, x9, x23 +// subs x4, x9, x15 +// cneg x22, x4, cc +// csetm x12, cc +// subs x4, x2, x23 +// cneg x4, x4, cc +// mul x19, x22, x4 +// umulh x4, x22, x4 +// cinv x7, x12, cc +// eor x14, x19, x7 +// eor x22, x4, x7 +// adds x12, x24, x20 +// adc x19, x20, xzr +// umulh x4, x15, x2 +// adds x12, x12, x11 +// adcs x19, x19, x4 +// adc x4, x4, xzr +// adds x19, x19, x11 +// adc x4, x4, xzr +// cmn x7, #0x1 +// adcs x12, x12, x14 +// adcs x19, x19, x22 +// adc x4, x4, x7 +// adds x11, x24, x24 +// adcs x20, x12, x12 +// adcs x10, x19, x19 +// adcs x3, x4, x4 +// adc x5, xzr, xzr +// ldr q30, [x1, #32] +// umull v0.2D, v30.2S, v30.2S +// umull2 v2.2D, v30.4S, v30.4S +// xtn v24.2S, v30.2D +// uzp2 v30.4S, v30.4S, v30.4S +// umull v30.2D, v30.2S, v24.2S +// mov x7, v0.d[0] +// mov x14, v0.d[1] +// mov x19, v2.d[0] +// mov x22, v2.d[1] +// mov x4, v30.d[0] +// mov x12, v30.d[1] +// adds x21, x7, x4, lsl #33 +// lsr x4, x4, #31 +// adc x14, x14, x4 +// adds x19, x19, x12, lsl
#33 +// lsr x4, x12, #31 +// adc x22, x22, x4 +// mul x4, x9, x15 +// umulh x12, x9, x15 +// adds x24, x14, x4, lsl #1 +// extr x4, x12, x4, #63 +// adcs x19, x19, x4 +// lsr x4, x12, #63 +// adc x4, x22, x4 +// adds x11, x11, x19 +// adcs x20, x20, x4 +// adcs x10, x10, xzr +// adcs x3, x3, xzr +// adc x6, x5, xzr +// movi v3.2D, #0x00000000ffffffff +// uzp2 v16.4S, v4.4S, v4.4S +// xtn v25.2S, v6.2D +// xtn v23.2S, v4.2D +// rev64 v30.4S, v4.4S +// umull v24.2D, v25.2S, v23.2S +// umull v0.2D, v25.2S, v16.2S +// uzp2 v2.4S, v6.4S, v6.4S +// mul v30.4S, v30.4S, v6.4S +// usra v0.2D, v24.2D, #32 +// umull v19.2D, v2.2S, v16.2S +// uaddlp v30.2D, v30.4S +// and v24.16B, v0.16B, v3.16B +// umlal v24.2D, v2.2S, v23.2S +// shl v30.2D, v30.2D, #32 +// usra v19.2D, v0.2D, #32 +// umlal v30.2D, v25.2S, v23.2S +// usra v19.2D, v24.2D, #32 +// mov x5, v30.d[0] +// mov x7, v30.d[1] +// mul x14, x23, x2 +// mov x19, v19.d[0] +// mov x4, v19.d[1] +// umulh x22, x23, x2 +// adds x12, x19, x14 +// adcs x19, x7, x22 +// adc x4, x4, xzr +// adds x12, x12, x14 +// adcs x19, x19, x22 +// adc x4, x4, xzr +// adds x7, x5, x10 +// adcs x3, x12, x3 +// adcs x14, x19, x6 +// adc x10, x4, xzr +// ldr x4, [x1, #64] +// add x6, x4, x4 +// mul x5, x4, x4 +// and x4, x16, #0xfffffffffffff +// mul x22, x6, x4 +// extr x4, x8, x16, #52 +// and x4, x4, #0xfffffffffffff +// mul x19, x6, x4 +// lsr x4, x22, #52 +// add x12, x19, x4 +// lsl x4, x22, #12 +// extr x4, x12, x4, #12 +// adds x21, x21, x4 +// extr x4, x17, x8, #40 +// and x4, x4, #0xfffffffffffff +// mul x19, x6, x4 +// lsr x4, x12, #52 +// add x22, x19, x4 +// lsl x4, x12, #12 +// extr x4, x22, x4, #24 +// adcs x24, x24, x4 +// extr x4, x13, x17, #28 +// and x4, x4, #0xfffffffffffff +// mul x19, x6, x4 +// lsr x4, x22, #52 +// add x12, x19, x4 +// lsl x4, x22, #12 +// extr x4, x12, x4, #36 +// adcs x11, x11, x4 +// extr x4, x9, x13, #16 +// and x4, x4, #0xfffffffffffff +// mul x19, x6, x4 +// lsr x4, x12, #52 +// add x22, x19, x4 +// 
lsl x4, x12, #12 +// extr x4, x22, x4, #48 +// adcs x20, x20, x4 +// lsr x4, x9, #4 +// and x4, x4, #0xfffffffffffff +// mul x19, x6, x4 +// lsr x4, x22, #52 +// add x12, x19, x4 +// lsl x4, x22, #12 +// extr x22, x12, x4, #60 +// extr x4, x15, x9, #56 +// and x4, x4, #0xfffffffffffff +// mul x19, x6, x4 +// lsr x4, x12, #52 +// add x12, x19, x4 +// lsl x4, x22, #8 +// extr x4, x12, x4, #8 +// adcs x7, x7, x4 +// extr x4, x23, x15, #44 +// and x4, x4, #0xfffffffffffff +// mul x19, x6, x4 +// lsr x4, x12, #52 +// add x22, x19, x4 +// lsl x4, x12, #12 +// extr x4, x22, x4, #20 +// adcs x1, x3, x4 +// extr x4, x2, x23, #32 +// and x4, x4, #0xfffffffffffff +// mul x19, x6, x4 +// lsr x4, x22, #52 +// add x12, x19, x4 +// lsl x4, x22, #12 +// extr x4, x12, x4, #32 +// adcs x14, x14, x4 +// lsr x4, x2, #20 +// mul x19, x6, x4 +// lsr x4, x12, #52 +// add x19, x19, x4 +// lsl x4, x12, #12 +// extr x4, x19, x4, #44 +// adcs x22, x10, x4 +// lsr x4, x19, #44 +// adc x12, x5, x4 +// extr x19, x24, x21, #9 +// extr x4, x11, x24, #9 +// stp x19, x4, [x0] // @slothy:writes=buffer0 +// extr x19, x20, x11, #9 +// extr x4, x7, x20, #9 +// stp x19, x4, [x0, #16] // @slothy:writes=buffer16 +// extr x19, x1, x7, #9 +// extr x4, x14, x1, #9 +// stp x19, x4, [x0, #32] // @slothy:writes=buffer32 +// extr x19, x22, x14, #9 +// extr x4, x12, x22, #9 +// stp x19, x4, [x0, #48] // @slothy:writes=buffer48 +// and x19, x21, #0x1ff +// lsr x4, x12, #9 +// add x4, x19, x4 +// str x4, [x0, #64] +// uzp1 v2.4S, v28.4S, v18.4S +// rev64 v30.4S, v28.4S +// uzp1 v24.4S, v18.4S, v18.4S +// mul v30.4S, v30.4S, v18.4S +// uaddlp v30.2D, v30.4S +// shl v30.2D, v30.2D, #32 +// umlal v30.2D, v24.2S, v2.2S +// mov x11, v30.d[0] +// mov x20, v30.d[1] +// umulh x7, x16, x17 +// subs x4, x16, x8 +// cneg x22, x4, cc +// csetm x12, cc +// subs x4, x13, x17 +// cneg x4, x4, cc +// mul x19, x22, x4 +// umulh x4, x22, x4 +// cinv x1, x12, cc +// eor x14, x19, x1 +// eor x22, x4, x1 +// adds x12, x11, x7 +// adc 
x19, x7, xzr +// umulh x4, x8, x13 +// adds x12, x12, x20 +// adcs x19, x19, x4 +// adc x4, x4, xzr +// adds x19, x19, x20 +// adc x4, x4, xzr +// cmn x1, #0x1 +// adcs x12, x12, x14 +// adcs x19, x19, x22 +// adc x4, x4, x1 +// adds x21, x11, x11 +// adcs x24, x12, x12 +// adcs x11, x19, x19 +// adcs x20, x4, x4 +// adc x7, xzr, xzr +// movi v3.2D, #0x00000000ffffffff +// uzp2 v16.4S, v20.4S, v20.4S +// xtn v25.2S, v5.2D +// xtn v23.2S, v20.2D +// rev64 v30.4S, v20.4S +// umull v24.2D, v25.2S, v23.2S +// umull v0.2D, v25.2S, v16.2S +// uzp2 v2.4S, v5.4S, v5.4S +// mul v30.4S, v30.4S, v5.4S +// usra v0.2D, v24.2D, #32 +// umull v19.2D, v2.2S, v16.2S +// uaddlp v30.2D, v30.4S +// and v24.16B, v0.16B, v3.16B +// umlal v24.2D, v2.2S, v23.2S +// shl v30.2D, v30.2D, #32 +// usra v19.2D, v0.2D, #32 +// umlal v30.2D, v25.2S, v23.2S +// usra v19.2D, v24.2D, #32 +// mov x10, v30.d[0] +// mov x1, v30.d[1] +// mul x14, x16, x8 +// mov x19, v19.d[0] +// mov x4, v19.d[1] +// umulh x22, x16, x8 +// adds x12, x19, x14 +// adcs x19, x1, x22 +// adc x4, x4, xzr +// adds x3, x12, x14 +// adcs x19, x19, x22 +// adc x4, x4, xzr +// adds x5, x21, x19 +// adcs x21, x24, x4 +// adcs x24, x11, xzr +// adcs x11, x20, xzr +// adc x20, x7, xzr +// movi v3.2D, #0x00000000ffffffff +// uzp2 v16.4S, v1.4S, v1.4S +// xtn v25.2S, v17.2D +// xtn v23.2S, v1.2D +// rev64 v30.4S, v1.4S +// umull v24.2D, v25.2S, v23.2S +// umull v0.2D, v25.2S, v16.2S +// uzp2 v2.4S, v17.4S, v17.4S +// mul v30.4S, v30.4S, v17.4S +// usra v0.2D, v24.2D, #32 +// umull v19.2D, v2.2S, v16.2S +// uaddlp v30.2D, v30.4S +// and v24.16B, v0.16B, v3.16B +// umlal v24.2D, v2.2S, v23.2S +// shl v30.2D, v30.2D, #32 +// usra v19.2D, v0.2D, #32 +// umlal v30.2D, v25.2S, v23.2S +// usra v19.2D, v24.2D, #32 +// mov x7, v30.d[0] +// mov x1, v30.d[1] +// mul x14, x17, x13 +// mov x19, v19.d[0] +// mov x4, v19.d[1] +// umulh x22, x17, x13 +// adds x12, x19, x14 +// adcs x19, x1, x22 +// adc x4, x4, xzr +// adds x12, x12, x14 +// adcs x19, 
x19, x22 +// adc x4, x4, xzr +// adds x1, x7, x24 +// adcs x14, x12, x11 +// adcs x22, x19, x20 +// adc x12, x4, xzr +// ldp x19, x4, [x0] // @slothy:reads=buffer0 +// adds x19, x19, x10 +// adcs x4, x4, x3 +// stp x19, x4, [x0] // @slothy:writes=buffer0 +// ldp x19, x4, [x0, #16] // @slothy:reads=buffer16 +// adcs x19, x19, x5 +// adcs x4, x4, x21 +// stp x19, x4, [x0, #16] // @slothy:writes=buffer16 +// ldp x19, x4, [x0, #32] // @slothy:reads=buffer32 +// adcs x19, x19, x1 +// adcs x4, x4, x14 +// stp x19, x4, [x0, #32] // @slothy:writes=buffer32 +// ldp x19, x4, [x0, #48] // @slothy:reads=buffer48 +// adcs x19, x19, x22 +// adcs x4, x4, x12 +// stp x19, x4, [x0, #48] // @slothy:writes=buffer48 +// ldr x4, [x0, #64] +// adc x4, x4, xzr +// str x4, [x0, #64] +// movi v3.2D, #0x00000000ffffffff +// uzp2 v2.4S, v29.4S, v29.4S +// xtn v16.2S, v27.2D +// xtn v25.2S, v29.2D +// rev64 v30.4S, v29.4S +// umull v24.2D, v16.2S, v25.2S +// umull v23.2D, v16.2S, v2.2S +// uzp2 v0.4S, v27.4S, v27.4S +// mul v30.4S, v30.4S, v27.4S +// usra v23.2D, v24.2D, #32 +// umull v2.2D, v0.2S, v2.2S +// uaddlp v30.2D, v30.4S +// and v24.16B, v23.16B, v3.16B +// umlal v24.2D, v0.2S, v25.2S +// shl v30.2D, v30.2D, #32 +// usra v2.2D, v23.2D, #32 +// umlal v30.2D, v16.2S, v25.2S +// usra v2.2D, v24.2D, #32 +// mov x6, v30.d[0] +// mov x22, v30.d[1] +// mul x12, x17, x23 +// mul x19, x13, x2 +// mov x4, v2.d[0] +// adds x22, x22, x4 +// mov x4, v2.d[1] +// adcs x12, x12, x4 +// umulh x4, x17, x23 +// adcs x19, x19, x4 +// umulh x4, x13, x2 +// adc x4, x4, xzr +// adds x21, x22, x6 +// adcs x22, x12, x22 +// adcs x12, x19, x12 +// adcs x19, x4, x19 +// adc x4, xzr, x4 +// adds x24, x22, x6 +// adcs x11, x12, x21 +// adcs x20, x19, x22 +// adcs x1, x4, x12 +// adcs x14, xzr, x19 +// adc x7, xzr, x4 +// subs x4, x17, x13 +// cneg x12, x4, cc +// csetm x22, cc +// subs x4, x2, x23 +// cneg x19, x4, cc +// mul x4, x12, x19 +// umulh x12, x12, x19 +// cinv x19, x22, cc +// cmn x19, #0x1 +// eor 
x4, x4, x19 +// adcs x1, x1, x4 +// eor x4, x12, x19 +// adcs x14, x14, x4 +// adc x7, x7, x19 +// subs x4, x16, x8 +// cneg x12, x4, cc +// csetm x22, cc +// subs x4, x15, x9 +// cneg x19, x4, cc +// mul x4, x12, x19 +// umulh x12, x12, x19 +// cinv x19, x22, cc +// cmn x19, #0x1 +// eor x4, x4, x19 +// adcs x10, x21, x4 +// eor x4, x12, x19 +// adcs x24, x24, x4 +// adcs x11, x11, x19 +// adcs x20, x20, x19 +// adcs x1, x1, x19 +// adcs x14, x14, x19 +// adc x7, x7, x19 +// subs x4, x8, x13 +// cneg x12, x4, cc +// csetm x22, cc +// subs x4, x2, x15 +// cneg x19, x4, cc +// mul x4, x12, x19 +// umulh x12, x12, x19 +// cinv x19, x22, cc +// cmn x19, #0x1 +// eor x4, x4, x19 +// adcs x20, x20, x4 +// eor x4, x12, x19 +// adcs x1, x1, x4 +// adcs x14, x14, x19 +// adc x7, x7, x19 +// subs x4, x16, x17 +// cneg x12, x4, cc +// csetm x22, cc +// subs x4, x23, x9 +// cneg x19, x4, cc +// mul x4, x12, x19 +// umulh x12, x12, x19 +// cinv x19, x22, cc +// cmn x19, #0x1 +// eor x4, x4, x19 +// adcs x24, x24, x4 +// eor x4, x12, x19 +// adcs x11, x11, x4 +// adcs x20, x20, x19 +// adcs x1, x1, x19 +// adcs x14, x14, x19 +// adc x7, x7, x19 +// subs x4, x16, x13 +// cneg x12, x4, cc +// csetm x22, cc +// subs x4, x2, x9 +// cneg x19, x4, cc +// mul x4, x12, x19 +// umulh x12, x12, x19 +// cinv x19, x22, cc +// cmn x19, #0x1 +// eor x4, x4, x19 +// adcs x11, x11, x4 +// eor x4, x12, x19 +// adcs x20, x20, x4 +// adcs x1, x1, x19 +// adcs x14, x14, x19 +// adc x7, x7, x19 +// subs x4, x8, x17 +// cneg x12, x4, cc +// csetm x22, cc +// subs x4, x23, x15 +// cneg x19, x4, cc +// mul x4, x12, x19 +// umulh x12, x12, x19 +// cinv x19, x22, cc +// cmn x19, #0x1 +// eor x4, x4, x19 +// adcs x3, x11, x4 +// eor x4, x12, x19 +// adcs x5, x20, x4 +// adcs x1, x1, x19 +// adcs x14, x14, x19 +// adc x22, x7, x19 +// ldp x12, x19, [x0] // @slothy:reads=buffer0 +// extr x4, x1, x5, #8 +// adds x11, x4, x12 +// extr x4, x14, x1, #8 +// adcs x20, x4, x19 +// ldp x19, x12, [x0, #16] // 
@slothy:reads=buffer16 +// extr x4, x22, x14, #8 +// adcs x7, x4, x19 +// and x19, x20, x7 +// lsr x4, x22, #8 +// adcs x1, x4, x12 +// and x22, x19, x1 +// ldp x19, x12, [x0, #32] // @slothy:reads=buffer32 +// lsl x4, x6, #1 +// adcs x14, x4, x19 +// and x19, x22, x14 +// extr x4, x10, x6, #63 +// adcs x21, x4, x12 +// and x22, x19, x21 +// ldp x19, x12, [x0, #48] // @slothy:reads=buffer48 +// extr x4, x24, x10, #63 +// adcs x2, x4, x19 +// and x19, x22, x2 +// extr x4, x3, x24, #63 +// adcs x24, x4, x12 +// and x12, x19, x24 +// ldr x19, [x0, #64] +// extr x4, x5, x3, #63 +// and x4, x4, #0x1ff +// adc x4, x19, x4 +// lsr x19, x4, #9 +// orr x4, x4, #0xfffffffffffffe00 +// cmp xzr, xzr +// adcs xzr, x11, x19 +// adcs xzr, x12, xzr +// adcs xzr, x4, xzr +// adcs x11, x11, x19 +// adcs x20, x20, xzr +// adcs x7, x7, xzr +// adcs x1, x1, xzr +// adcs x14, x14, xzr +// adcs x22, x21, xzr +// adcs x12, x2, xzr +// adcs x24, x24, xzr +// adc x4, x4, xzr +// and x19, x4, #0x1ff +// lsl x4, x11, #9 +// extr x11, x20, x11, #55 +// extr x20, x7, x20, #55 +// extr x7, x1, x7, #55 +// extr x1, x14, x1, #55 +// orr x4, x19, x4 +// extr x14, x22, x14, #55 +// extr x22, x12, x22, #55 +// extr x12, x24, x12, #55 +// extr x19, x4, x24, #55 +// lsr x4, x4, #55 +// stp x11, x20, [x0] // @slothy:writes=buffer0 +// stp x7, x1, [x0, #16] // @slothy:writes=buffer16 +// stp x14, x22, [x0, #32] // @slothy:writes=buffer32 +// stp x12, x19, [x0, #48] // @slothy:writes=buffer48 +// str x4, [x0, #64] +// ldp x23, x24, [sp], #16 +// ldp x21, x22, [sp], #16 +// ldp x19, x20, [sp], #16 +// ret +// +// The bash script used for step 2 is as follows: +// +// # Store the assembly instructions except the last 'ret', +// # callee-register store/loads and add/sub sp #80 as, say, 'input.S'. 
+// export OUTPUTS="[hint_buffer0,hint_buffer16,hint_buffer32,hint_buffer48,hint_buffer64]" +// export RESERVED_REGS="[x18,x25,x26,x27,x28,x29,x30,sp,q8,q9,q10,q11,q12,q13,q14,q15,v8,v9,v10,v11,v12,v13,v14,v15]" +// /tools/external/slothy.sh input.S my_out_dir +// # my_out_dir/3.opt.s is the optimized assembly. Its output may differ +// # from this file since the sequence is non-deterministically chosen. +// # Please add 'ret' at the end of the output assembly. + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p521_neon) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p521_neon) + .text + .balign 4 + +S2N_BN_SYMBOL(bignum_montsqr_p521_neon): + +// Save registers + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + +// The optimized body + + ldr q31, [x1, #48] + ldp x9, x15, [x1, #32] + ldp x23, x2, [x1, #48] + ldr q0, [x1, #48] + ldr q29, [x1, #32] + rev64 v21.4S, v31.4S + umulh x13, x9, x23 + mul v23.4S, v21.4S, v0.4S + xtn v21.2S, v0.2D + uzp2 v19.4S, v31.4S, v31.4S + xtn v2.2S, v29.2D + xtn v30.2S, v31.2D + uzp2 v3.4S, v29.4S, v29.4S + umull v6.2D, v21.2S, v19.2S + mul x10, x9, x23 + uaddlp v23.2D, v23.4S + umull v22.2D, v21.2S, v30.2S + adds x22, x10, x13 + mul x17, x9, x15 + movi v25.2D, #0x00000000ffffffff + uzp2 v1.4S, v0.4S, v0.4S + adc x8, x13, xzr + subs x19, x9, x15 + umull v28.2D, v3.2S, v2.2S + shl v31.2D, v23.2D, #32 + csetm x5, cc + cneg x3, x19, cc + umull v19.2D, v1.2S, v19.2S + ldr q4, [x1, #16] + subs x24, x2, x23 + mul x6, x15, x2 + usra v6.2D, v22.2D, #32 + ldr q23, [x1] + cneg x13, x24, cc + umulh x24, x15, x2 + umull v5.2D, v29.2S, v29.2S + rev64 v3.4S, v4.4S + cinv x19, x5, cc + adds x16, x22, x6 + mov x14, v28.d[1] + umlal v31.2D, v21.2S, v30.2S + umull2 v17.2D, v29.4S, v29.4S + mov x20, v28.d[0] + mul v29.4S, v3.4S, v23.4S + and v22.16B, v6.16B, v25.16B + mul x5, x3, x13 + mov x4, v5.d[1] + mov x7, v5.d[0] + adcs x11, x8, x24 + ldr q5, [x1] + ldr q0, [x1] + adc x22, x24, xzr + adds x8, x11, x6 + usra 
v19.2D, v6.2D, #32 + umlal v22.2D, v1.2S, v30.2S + adc x11, x22, xzr + adds x21, x7, x20, lsl #33 + mov x24, v17.d[1] + mov x22, v17.d[0] + lsr x12, x20, #31 + uzp1 v2.4S, v4.4S, v23.4S + uzp1 v20.4S, v23.4S, v23.4S + usra v19.2D, v22.2D, #32 + adc x4, x4, x12 + lsr x6, x14, #31 + adds x20, x22, x14, lsl #33 + ldr q17, [x1, #16] + uzp2 v22.4S, v0.4S, v0.4S + eor x12, x5, x19 + umulh x7, x3, x13 + xtn v23.2S, v0.2D + adc x5, x24, x6 + cmn x19, #0x1 + xtn v25.2S, v5.2D + ldr q27, [x1] + adcs x16, x16, x12 + uaddlp v1.2D, v29.4S + umulh x3, x9, x15 + eor x13, x7, x19 + adcs x24, x8, x13 + adc x11, x11, x19 + adds x12, x10, x10 + adcs x13, x16, x16 + mul x19, x23, x2 + umull v21.2D, v25.2S, v23.2S + adcs x7, x24, x24 + ldp x16, x8, [x1] + umull v3.2D, v25.2S, v22.2S + uzp2 v6.4S, v5.4S, v5.4S + adcs x10, x11, x11 + ldr q29, [x1, #32] + adc x14, xzr, xzr + adds x24, x4, x17, lsl #1 + mov x4, v31.d[1] + shl v30.2D, v1.2D, #32 + lsr x6, x3, #63 + extr x11, x3, x17, #63 + ldr q1, [x1, #16] + mov x22, v19.d[1] + adcs x20, x20, x11 + umulh x3, x23, x2 + movi v4.2D, #0x00000000ffffffff + usra v3.2D, v21.2D, #32 + adc x5, x5, x6 + adds x11, x12, x20 + mov x6, v19.d[0] + umull v19.2D, v6.2S, v22.2S + adcs x20, x13, x5 + rev64 v22.4S, v0.4S + ldr x5, [x1, #64] + ldp x17, x13, [x1, #16] + adcs x7, x7, xzr + umlal v30.2D, v20.2S, v2.2S + adcs x12, x10, xzr + and x1, x16, #0xfffffffffffff + mul v22.4S, v22.4S, v5.4S + adc x14, x14, xzr + adds x6, x6, x19 + xtn v5.2S, v1.2D + adcs x10, x4, x3 + mov x4, v31.d[0] + adc x22, x22, xzr + adds x19, x6, x19 + add x6, x5, x5 + and v21.16B, v3.16B, v4.16B + adcs x10, x10, x3 + extr x3, x8, x16, #52 + mul x1, x6, x1 + usra v19.2D, v3.2D, #32 + adc x22, x22, xzr + adds x7, x4, x7 + umlal v21.2D, v6.2S, v23.2S + and x4, x3, #0xfffffffffffff + adcs x3, x19, x12 + uzp2 v28.4S, v1.4S, v1.4S + extr x19, x17, x8, #40 + mul x12, x6, x4 + adcs x14, x10, x14 + rev64 v4.4S, v1.4S + mul x5, x5, x5 + lsr x4, x9, #4 + adc x10, x22, xzr + lsl x22, x1, #12 + 
lsr x1, x1, #52 + add x12, x12, x1 + and x1, x19, #0xfffffffffffff + extr x19, x12, x22, #12 + mul x1, x6, x1 + extr x22, x13, x17, #28 + adds x21, x21, x19 + mul v31.4S, v4.4S, v17.4S + and x19, x22, #0xfffffffffffff + lsr x22, x12, #52 + lsl x12, x12, #12 + mul x19, x6, x19 + add x22, x1, x22 + extr x1, x22, x12, #24 + and x4, x4, #0xfffffffffffff + adcs x12, x24, x1 + extr x1, x9, x13, #16 + mul x24, x6, x4 + and x1, x1, #0xfffffffffffff + lsr x4, x22, #52 + add x4, x19, x4 + lsl x22, x22, #12 + mul x1, x6, x1 + extr x22, x4, x22, #36 + adcs x11, x11, x22 + extr x22, x11, x12, #9 + extr x19, x12, x21, #9 + uaddlp v3.2D, v22.4S + lsl x12, x4, #12 + stp x19, x22, [x0] + umulh x19, x16, x17 + uaddlp v31.2D, v31.4S + lsr x22, x4, #52 + extr x4, x15, x9, #56 + usra v19.2D, v21.2D, #32 + add x22, x1, x22 + extr x1, x23, x15, #44 + shl v4.2D, v31.2D, #32 + extr x12, x22, x12, #48 + and x4, x4, #0xfffffffffffff + uzp2 v7.4S, v17.4S, v17.4S + adcs x20, x20, x12 + xtn v17.2S, v17.2D + lsl x12, x22, #12 + lsr x22, x22, #52 + mul x4, x6, x4 + add x22, x24, x22 + and x24, x1, #0xfffffffffffff + extr x1, x2, x23, #32 + extr x12, x22, x12, #60 + lsl x12, x12, #8 + lsr x22, x22, #52 + mul x24, x6, x24 + add x4, x4, x22 + and x22, x1, #0xfffffffffffff + extr x12, x4, x12, #8 + lsl x1, x4, #12 + lsr x4, x4, #52 + adcs x7, x7, x12 + mul x12, x6, x22 + add x24, x24, x4 + extr x1, x24, x1, #20 + extr x22, x20, x11, #9 + extr x20, x7, x20, #9 + lsr x11, x2, #20 + mul x6, x6, x11 + lsr x4, x24, #52 + add x4, x12, x4 + lsl x12, x24, #12 + adcs x3, x3, x1 + extr x24, x4, x12, #32 + lsr x11, x4, #52 + adcs x12, x14, x24 + umull v31.2D, v17.2S, v28.2S + add x24, x6, x11 + lsl x1, x4, #12 + extr x7, x3, x7, #9 + rev64 v6.4S, v29.4S + umull v22.2D, v17.2S, v5.2S + extr x11, x12, x3, #9 + extr x14, x24, x1, #44 + umlal v4.2D, v17.2S, v5.2S + adcs x3, x10, x14 + umulh x10, x8, x13 + lsr x14, x24, #44 + adc x24, x5, x14 + subs x5, x16, x8 + stp x22, x20, [x0, #16] + csetm x1, cc + shl v21.2D, 
v3.2D, #32 + movi v17.2D, #0x00000000ffffffff + cneg x20, x5, cc + subs x5, x13, x17 + usra v31.2D, v22.2D, #32 + cneg x14, x5, cc + lsr x6, x24, #9 + and x22, x21, #0x1ff + mov x4, v30.d[0] + add x6, x22, x6 + stp x7, x11, [x0, #32] + umulh x22, x20, x14 + mov x5, v30.d[1] + str x6, [x0, #64] + extr x12, x3, x12, #9 + umull v28.2D, v7.2S, v28.2S + mul x11, x20, x14 + mul v6.4S, v6.4S, v27.4S + and v1.16B, v31.16B, v17.16B + cinv x21, x1, cc + adds x6, x4, x19 + uzp2 v22.4S, v27.4S, v27.4S + adc x20, x19, xzr + adds x6, x6, x5 + umlal v1.2D, v7.2S, v5.2S + xtn v20.2S, v29.2D + eor x22, x22, x21 + adcs x7, x20, x10 + usra v28.2D, v31.2D, #32 + eor x20, x11, x21 + usra v28.2D, v1.2D, #32 + xtn v0.2S, v27.2D + adc x10, x10, xzr + adds x1, x7, x5 + umlal v21.2D, v25.2S, v23.2S + uzp2 v29.4S, v29.4S, v29.4S + adc x19, x10, xzr + cmn x21, #0x1 + umull v3.2D, v0.2S, v20.2S + adcs x5, x6, x20 + extr x10, x24, x3, #9 + umull v31.2D, v0.2S, v29.2S + adcs x1, x1, x22 + stp x12, x10, [x0, #48] + mul x24, x16, x8 + mov x3, v28.d[1] + usra v31.2D, v3.2D, #32 + adc x10, x19, x21 + adds x7, x4, x4 + umulh x14, x16, x8 + uaddlp v3.2D, v6.4S + mov x4, v28.d[0] + adcs x12, x5, x5 + mov x5, v19.d[0] + movi v23.2D, #0x00000000ffffffff + adcs x20, x1, x1 + mov x19, v21.d[1] + mov x1, v19.d[1] + adcs x22, x10, x10 + and v17.16B, v31.16B, v23.16B + adc x6, xzr, xzr + umlal v17.2D, v22.2S, v20.2S + adds x10, x5, x24 + mul x11, x17, x13 + mov x5, v21.d[0] + umull v28.2D, v22.2S, v29.2S + adcs x19, x19, x14 + shl v5.2D, v3.2D, #32 + adc x21, x1, xzr + adds x10, x10, x24 + adcs x1, x19, x14 + umulh x14, x17, x13 + adc x19, x21, xzr + adds x7, x7, x1 + adcs x1, x12, x19 + adcs x24, x20, xzr + mov x20, v4.d[1] + usra v28.2D, v31.2D, #32 + mov x21, v4.d[0] + adcs x19, x22, xzr + adc x6, x6, xzr + adds x4, x4, x11 + adcs x20, x20, x14 + adc x22, x3, xzr + adds x12, x4, x11 + umulh x11, x13, x2 + adcs x3, x20, x14 + adc x20, x22, xzr + adds x21, x21, x24 + ldp x22, x24, [x0] + adcs x4, x12, x19 + 
ldp x19, x14, [x0, #16] + usra v28.2D, v17.2D, #32 + adcs x3, x3, x6 + umlal v5.2D, v0.2S, v20.2S + adc x6, x20, xzr + umulh x20, x17, x23 + adds x12, x22, x5 + ldp x22, x5, [x0, #32] + adcs x10, x24, x10 + adcs x19, x19, x7 + stp x12, x10, [x0] + ldp x12, x7, [x0, #48] + adcs x10, x14, x1 + mul x14, x13, x2 + ldr x24, [x0, #64] + adcs x22, x22, x21 + adcs x5, x5, x4 + mov x21, v28.d[1] + stp x22, x5, [x0, #32] + mul x1, x17, x23 + adcs x3, x12, x3 + mov x4, v28.d[0] + mov x12, v5.d[1] + stp x19, x10, [x0, #16] + adcs x19, x7, x6 + mov x6, v5.d[0] + adc x10, x24, xzr + subs x7, x16, x8 + cneg x5, x7, cc + csetm x24, cc + subs x7, x15, x9 + cneg x22, x7, cc + cinv x7, x24, cc + adds x12, x12, x4 + umulh x4, x5, x22 + adcs x1, x1, x21 + stp x3, x19, [x0, #48] + str x10, [x0, #64] + adcs x20, x14, x20 + adc x21, x11, xzr + subs x14, x17, x13 + cneg x10, x14, cc + csetm x3, cc + subs x19, x2, x23 + cneg x19, x19, cc + cinv x11, x3, cc + adds x14, x12, x6 + mul x24, x5, x22 + adcs x22, x1, x12 + eor x3, x4, x7 + mul x4, x10, x19 + adcs x1, x20, x1 + adcs x12, x21, x20 + adc x5, xzr, x21 + umulh x19, x10, x19 + adds x20, x22, x6 + eor x10, x24, x7 + adcs x21, x1, x14 + eor x24, x4, x11 + adcs x4, x12, x22 + adcs x1, x5, x1 + adcs x12, xzr, x12 + adc x22, xzr, x5 + eor x5, x19, x11 + cmn x11, #0x1 + adcs x19, x1, x24 + adcs x5, x12, x5 + adc x24, x22, x11 + subs x1, x8, x13 + cneg x22, x1, cc + csetm x1, cc + subs x11, x2, x15 + cinv x1, x1, cc + cneg x12, x11, cc + cmn x7, #0x1 + adcs x10, x14, x10 + mul x14, x22, x12 + adcs x20, x20, x3 + eor x11, x14, x1 + adcs x3, x21, x7 + umulh x21, x22, x12 + adcs x22, x4, x7 + adcs x4, x19, x7 + adcs x12, x5, x7 + adc x7, x24, x7 + subs x14, x16, x17 + csetm x5, cc + cneg x19, x14, cc + subs x24, x23, x9 + cneg x14, x24, cc + cinv x5, x5, cc + cmn x1, #0x1 + mul x24, x19, x14 + adcs x22, x22, x11 + eor x11, x21, x1 + eor x24, x24, x5 + umulh x19, x19, x14 + adcs x4, x4, x11 + adcs x14, x12, x1 + adc x1, x7, x1 + subs x17, x8, x17 
+ cneg x12, x17, cc + csetm x17, cc + subs x16, x16, x13 + cneg x11, x16, cc + csetm x16, cc + subs x23, x23, x15 + cinv x7, x17, cc + cneg x13, x23, cc + mul x15, x12, x13 + subs x23, x2, x9 + cinv x8, x16, cc + cneg x17, x23, cc + eor x16, x19, x5 + mul x23, x11, x17 + cmn x5, #0x1 + adcs x20, x20, x24 + eor x15, x15, x7 + adcs x3, x3, x16 + adcs x2, x22, x5 + umulh x16, x11, x17 + adcs x19, x4, x5 + ldp x4, x22, [x0, #48] + extr x21, x10, x6, #63 + adcs x24, x14, x5 + eor x23, x23, x8 + adc x1, x1, x5 + cmn x8, #0x1 + umulh x9, x12, x13 + eor x14, x16, x8 + adcs x3, x3, x23 + ldp x11, x5, [x0, #16] + ldp x13, x16, [x0] + adcs x23, x2, x14 + adcs x14, x19, x8 + extr x19, x20, x10, #63 + lsl x12, x6, #1 + adcs x17, x24, x8 + adc x1, x1, x8 + cmn x7, #0x1 + adcs x24, x3, x15 + eor x9, x9, x7 + ldp x15, x3, [x0, #32] + adcs x9, x23, x9 + ldr x8, [x0, #64] + extr x20, x24, x20, #63 + adcs x23, x14, x7 + extr x2, x9, x24, #63 + adcs x14, x17, x7 + and x24, x2, #0x1ff + extr x9, x23, x9, #8 + extr x6, x14, x23, #8 + adc x23, x1, x7 + adds x10, x9, x13 + adcs x13, x6, x16 + extr x1, x23, x14, #8 + lsr x23, x23, #8 + adcs x7, x1, x11 + adcs x2, x23, x5 + and x23, x13, x7 + adcs x16, x12, x15 + and x23, x23, x2 + adcs x14, x21, x3 + and x23, x23, x16 + adcs x5, x19, x4 + and x23, x23, x14 + adcs x22, x20, x22 + and x23, x23, x5 + and x1, x23, x22 + adc x9, x8, x24 + lsr x23, x9, #9 + cmp xzr, xzr + orr x17, x9, #0xfffffffffffffe00 + adcs xzr, x10, x23 + adcs xzr, x1, xzr + adcs xzr, x17, xzr + adcs x23, x10, x23 + adcs x9, x13, xzr + lsl x4, x23, #9 + adcs x1, x7, xzr + extr x23, x9, x23, #55 + extr x9, x1, x9, #55 + adcs x10, x2, xzr + extr x1, x10, x1, #55 + stp x23, x9, [x0] + adcs x19, x16, xzr + adcs x9, x14, xzr + extr x23, x19, x10, #55 + adcs x10, x5, xzr + stp x1, x23, [x0, #16] + extr x5, x9, x19, #55 + adcs x1, x22, xzr + extr x23, x10, x9, #55 + adc x9, x17, xzr + stp x5, x23, [x0, #32] + extr x10, x1, x10, #55 + and x23, x9, #0x1ff + orr x23, x23, x4 + extr 
x9, x23, x1, #55 + lsr x23, x23, #55 + stp x10, x9, [x0, #48] + str x23, [x0, #64] + +// Restore regs and return + + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/arm/proofs/arm.ml b/arm/proofs/arm.ml index 5fff5c32..f36e94a9 100644 --- a/arm/proofs/arm.ml +++ b/arm/proofs/arm.ml @@ -969,40 +969,49 @@ let ARM_ADD_RETURN_STACK_TAC = let ARM_SUB_LIST_OF_MC_TAC (correct_th:thm) (program_sub_mc_def:thm) (length_ths:thm list): tactic = W (fun (asl,g) -> - let vars,pc = - let xs = fst (strip_forall g) in - butlast xs, last xs in let begin_ofs,n = let rhs = snd (dest_eq (concl program_sub_mc_def)) in dest_pair (rand(rator rhs)) in + let correct_th_vars = + let xs = fst (strip_forall g) in + if List.exists (fun t -> t = `pc:num`) xs + then + List.map + (fun t -> if t = `pc:num` then mk_binary "+" (t,begin_ofs) else t) xs + else + let xs,pc = butlast xs, last xs in + xs @ [mk_binary "+" (pc,begin_ofs)] in if !arm_print_log then begin Printf.printf "ARM_SUB_LIST_OF_MC_TAC: begin_ofs: %s, n: %s\n" (string_of_term begin_ofs) (string_of_term n); - Printf.printf "\tvars: %s, pc: %s\n" - (String.concat "," (map string_of_term vars)) - (string_of_term pc) + Printf.printf "\tvars: %s\n" + (String.concat "," (map string_of_term correct_th_vars)) end else (); REPEAT STRIP_TAC THEN - MP_TAC (ISPECL (vars @ [mk_binary "+" (pc,begin_ofs)]) correct_th) THEN + MP_TAC (ISPECL correct_th_vars correct_th) THEN (* Prove antedecent of correct_th *) ANTS_TAC THENL [ - POP_ASSUM MP_TAC THEN + (REPEAT (POP_ASSUM MP_TAC) THEN REWRITE_TAC(length_ths @ [ALL;NONOVERLAPPING_CLAUSES]) THEN - STRIP_TAC THEN ASM_REWRITE_TAC[] THEN NONOVERLAPPING_TAC; - ALL_TAC - ] THEN - - MATCH_MP_TAC ENSURES_SUBLEMMA_THM THEN - REWRITE_TAC[] THEN - REPEAT CONJ_TAC THENL [ - REPEAT STRIP_TAC THEN ASM_REWRITE_TAC[ADD_0] THEN - REWRITE_TAC[program_sub_mc_def;WORD_ADD] THEN - 
IMP_REWRITE_TAC(CONJUNCTS ALIGNED_BYTES_LOADED_SUB_LIST) THEN - CONV_TAC NUM_DIVIDES_CONV; - - SUBSUMED_MAYCHANGE_TAC; - - MESON_TAC[ADD_ASSOC;ADD_0] + REPEAT STRIP_TAC THEN + ASM_REWRITE_TAC[] THEN NONOVERLAPPING_TAC) ORELSE ALL_TAC; + (* Leave it to user *) + + MATCH_MP_TAC ENSURES_SUBLEMMA_THM THEN + REWRITE_TAC[] THEN + REPEAT CONJ_TAC THENL [ + REPEAT STRIP_TAC THEN ASM_REWRITE_TAC[ADD_0] THEN + REWRITE_TAC[program_sub_mc_def;WORD_ADD] THEN + IMP_REWRITE_TAC(CONJUNCTS ALIGNED_BYTES_LOADED_SUB_LIST) THEN + CONV_TAC NUM_DIVIDES_CONV; + + SUBSUMED_MAYCHANGE_TAC; + + MESON_TAC[ADD_ASSOC;ADD_0] ORELSE + FAIL_TAC ("MESON_TAC could not prove the third precondition of " ^ + "ENSURES_SUBLEMMA_THM " ^ + "`(!s s'. P s /\ Q' s' /\ R' s s' ==> Q s')`") + ] ]);; (* ------------------------------------------------------------------------- *) diff --git a/arm/proofs/bignum_montmul_p256_neon.ml b/arm/proofs/bignum_montmul_p256_neon.ml index 7f1d3280..18ec3491 100644 --- a/arm/proofs/bignum_montmul_p256_neon.ml +++ b/arm/proofs/bignum_montmul_p256_neon.ml @@ -250,13 +250,12 @@ let bignum_montmul_p256_interm1_core_mc_def, let equiv_input_states = new_definition `!s1 s1' x y z. (equiv_input_states:(armstate#armstate)->int64->int64->int64->bool) (s1,s1') x y z <=> - (?a b. - C_ARGUMENTS [z; x; y] s1 /\ + (C_ARGUMENTS [z; x; y] s1 /\ C_ARGUMENTS [z; x; y] s1' /\ - bignum_from_memory (x,4) s1 = a /\ - bignum_from_memory (x,4) s1' = a /\ - bignum_from_memory (y,4) s1 = b /\ - bignum_from_memory (y,4) s1' = b)`;; + ?a. bignum_from_memory (x,4) s1 = a /\ + bignum_from_memory (x,4) s1' = a /\ + (?b. bignum_from_memory (y,4) s1 = b /\ + bignum_from_memory (y,4) s1' = b))`;; let equiv_output_states = new_definition `!s1 s1' z. 
@@ -329,7 +328,7 @@ let BIGNUM_MONTMUL_P256_CORE_EQUIV1 = prove(equiv_goal1, ASM_REWRITE_TAC[equiv_output_states;mk_equiv_regs;mk_equiv_bool_regs; BIGNUM_EXPAND_CONV `bignum_from_memory (ptr,4) state`; C_ARGUMENTS] THEN - REPEAT (HINT_EXISTS_REFL_TAC THEN ASM_REWRITE_TAC[]); + REPEAT (TRY HINT_EXISTS_REFL_TAC THEN ASM_REWRITE_TAC[]); (** SUBGOAL 2. Maychange left **) DISCARD_ASSUMPTIONS_TAC (fun th -> free_in `s0':armstate` (concl th)) THEN @@ -420,7 +419,7 @@ let BIGNUM_MONTMUL_P256_CORE_EQUIV2 = prove( ASM_REWRITE_TAC[equiv_output_states;mk_equiv_regs;mk_equiv_bool_regs; BIGNUM_EXPAND_CONV `bignum_from_memory (ptr,4) state`; C_ARGUMENTS] THEN - REPEAT (HINT_EXISTS_REFL_TAC THEN ASM_REWRITE_TAC[]); + REPEAT (TRY HINT_EXISTS_REFL_TAC THEN ASM_REWRITE_TAC[]); (** SUBGOAL 2. Maychange left **) DISCARD_ASSUMPTIONS_TAC (fun th -> free_in `s0':armstate` (concl th)) THEN @@ -488,12 +487,7 @@ let BIGNUM_MONTMUL_P256_CORE_EQUIV = prove(equiv_goal, ] THEN STRIP_TAC THEN - FIRST_X_ASSUM (fun th -> ASSUME_TAC (SPEC_ALL (MATCH_MP BIGNUM_MONTMUL_P256_CORE_EQUIV1 th))) THEN - FIRST_X_ASSUM (fun th -> ASSUME_TAC (SPEC_ALL (MATCH_MP BIGNUM_MONTMUL_P256_CORE_EQUIV2 th))) THEN - FIRST_X_ASSUM (fun c1 -> - FIRST_X_ASSUM (fun c2 -> - MP_TAC (REWRITE_RULE [] (MATCH_MP ENSURES2_CONJ2 (CONJ c1 c2))) - )) THEN + ENSURES2_TRANS_TAC BIGNUM_MONTMUL_P256_CORE_EQUIV1 BIGNUM_MONTMUL_P256_CORE_EQUIV2 THEN (* break 'ALL nonoverlapping' in assumptions *) RULE_ASSUM_TAC (REWRITE_RULE[ @@ -519,17 +513,15 @@ let BIGNUM_MONTMUL_P256_CORE_EQUIV = prove(equiv_goal, REWRITE_TAC[equiv_input_states;C_ARGUMENTS;BIGNUM_FROM_MEMORY_BYTES; fst BIGNUM_MONTMUL_P256_INTERM1_CORE_EXEC] THEN STRIP_TAC THEN ASM_REWRITE_TAC[] THEN - MAP_EVERY EXISTS_TAC [`a:num`;`b:num`] THEN - REWRITE_TAC[] THEN - PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTMUL_P256_INTERM1_CORE_EXEC; + REPEAT (TRY HINT_EXISTS_REFL_TAC THEN + PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTMUL_P256_INTERM1_CORE_EXEC); UNDISCH_TAC `equiv_input_states (s,s') x y z` 
THEN REWRITE_TAC[equiv_input_states;C_ARGUMENTS;BIGNUM_FROM_MEMORY_BYTES; fst BIGNUM_MONTMUL_P256_INTERM1_CORE_EXEC] THEN STRIP_TAC THEN ASM_REWRITE_TAC[] THEN - MAP_EVERY EXISTS_TAC [`a:num`;`b:num`] THEN - REWRITE_TAC[] THEN - PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTMUL_P256_INTERM1_CORE_EXEC + REPEAT (TRY HINT_EXISTS_REFL_TAC THEN + PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTMUL_P256_INTERM1_CORE_EXEC); ]; REPEAT GEN_TAC THEN STRIP_TAC THEN @@ -633,7 +625,7 @@ let BIGNUM_MONTMUL_P256_NEON_CORE_CORRECT = prove( [ FIRST_ASSUM (fun th -> MP_TAC th THEN REWRITE_TAC[DIVIDES_4_VAL_WORD_64;aligned_bytes_loaded_word] THEN METIS_TAC[]) THEN NO_TAC; ALL_TAC ] THEN - ASM_REWRITE_TAC[equiv_input_states] THEN + ASM_REWRITE_TAC[equiv_input_states;C_ARGUMENTS] THEN EXISTS_TAC `write (memory :> bytelist (word pc,LENGTH (APPEND bignum_montmul_p256_core_mc barrier_inst_bytes))) @@ -643,10 +635,9 @@ let BIGNUM_MONTMUL_P256_NEON_CORE_CORRECT = prove( REPEAT CONJ_TAC THEN TRY (PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTMUL_P256_CORE_EXEC) THEN (* Now has only one subgoal: the equivalence! *) - REWRITE_TAC[C_ARGUMENTS;BIGNUM_FROM_MEMORY_BYTES] THEN - MAP_EVERY EXISTS_TAC [`a:num`;`b:num`] THEN - REPEAT CONJ_TAC THEN - TRY (PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTMUL_P256_CORE_EXEC); + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN + REPEAT (TRY HINT_EXISTS_REFL_TAC THEN + PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTMUL_P256_CORE_EXEC); (** SUBGOAL 2. 
Postcond **) MESON_TAC[equiv_output_states;BIGNUM_FROM_MEMORY_BYTES; @@ -762,7 +753,7 @@ let BIGNUM_AMONTMUL_P256_NEON_CORE_CORRECT = prove( [ FIRST_ASSUM (fun th -> MP_TAC th THEN REWRITE_TAC[DIVIDES_4_VAL_WORD_64;aligned_bytes_loaded_word] THEN METIS_TAC[]) THEN NO_TAC; ALL_TAC ] THEN - ASM_REWRITE_TAC[equiv_input_states] THEN + ASM_REWRITE_TAC[equiv_input_states;C_ARGUMENTS] THEN EXISTS_TAC `write (memory :> bytelist (word pc,LENGTH (APPEND bignum_montmul_p256_core_mc barrier_inst_bytes))) @@ -773,9 +764,8 @@ let BIGNUM_AMONTMUL_P256_NEON_CORE_CORRECT = prove( TRY (PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTMUL_P256_CORE_EXEC) THEN (* Now has only one subgoal: the equivalence! *) REWRITE_TAC[C_ARGUMENTS;BIGNUM_FROM_MEMORY_BYTES] THEN - MAP_EVERY EXISTS_TAC [`a:num`;`b:num`] THEN - REPEAT CONJ_TAC THEN - TRY (PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTMUL_P256_CORE_EXEC); + REPEAT (TRY HINT_EXISTS_REFL_TAC THEN + PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTMUL_P256_CORE_EXEC); (** SUBGOAL 2. Postcond **) MESON_TAC[equiv_output_states;BIGNUM_FROM_MEMORY_BYTES; diff --git a/arm/proofs/bignum_montmul_p384_neon.ml b/arm/proofs/bignum_montmul_p384_neon.ml index 7b9156a6..930b7baf 100644 --- a/arm/proofs/bignum_montmul_p384_neon.ml +++ b/arm/proofs/bignum_montmul_p384_neon.ml @@ -458,13 +458,12 @@ let bignum_montmul_p384_interm1_core_mc_def, let equiv_input_states = new_definition `!s1 s1' x y z. (equiv_input_states:(armstate#armstate)->int64->int64->int64->bool) (s1,s1') x y z <=> - (?a b. - C_ARGUMENTS [z; x; y] s1 /\ - C_ARGUMENTS [z; x; y] s1' /\ - bignum_from_memory (x,6) s1 = a /\ - bignum_from_memory (x,6) s1' = a /\ - bignum_from_memory (y,6) s1 = b /\ - bignum_from_memory (y,6) s1' = b)`;; + (C_ARGUMENTS [z; x; y] s1 /\ + C_ARGUMENTS [z; x; y] s1' /\ + ?a. bignum_from_memory (x,6) s1 = a /\ + bignum_from_memory (x,6) s1' = a /\ + (?b. bignum_from_memory (y,6) s1 = b /\ + bignum_from_memory (y,6) s1' = b))`;; let equiv_output_states = new_definition `!s1 s1' z. 
@@ -700,15 +699,9 @@ let BIGNUM_MONTMUL_P384_CORE_EQUIV = time prove(equiv_goal, ALL_TAC ] THEN - DISCH_THEN (CHOOSE_THEN (DESTRUCT_TAC "h1 h2 h3 h4")) THEN + STRIP_TAC THEN - - FIRST_X_ASSUM (fun th -> ASSUME_TAC (SPEC_ALL (MATCH_MP BIGNUM_MONTMUL_P384_CORE_EQUIV1 th))) THEN - FIRST_X_ASSUM (fun th -> ASSUME_TAC (SPEC_ALL (MATCH_MP BIGNUM_MONTMUL_P384_CORE_EQUIV2 th))) THEN - FIRST_X_ASSUM (fun c1 -> - FIRST_X_ASSUM (fun c2 -> - MP_TAC (REWRITE_RULE [] (MATCH_MP ENSURES2_CONJ2 (CONJ c1 c2))) - )) THEN + ENSURES2_TRANS_TAC BIGNUM_MONTMUL_P384_CORE_EQUIV1 BIGNUM_MONTMUL_P384_CORE_EQUIV2 THEN (* break 'ALL nonoverlapping' in assumptions *) RULE_ASSUM_TAC (REWRITE_RULE[ @@ -734,17 +727,15 @@ let BIGNUM_MONTMUL_P384_CORE_EQUIV = time prove(equiv_goal, REWRITE_TAC[equiv_input_states;C_ARGUMENTS;BIGNUM_FROM_MEMORY_BYTES; fst BIGNUM_MONTMUL_P384_INTERM1_CORE_EXEC] THEN STRIP_TAC THEN ASM_REWRITE_TAC[] THEN - MAP_EVERY EXISTS_TAC [`a:num`;`b:num`] THEN - REWRITE_TAC[] THEN - PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTMUL_P384_INTERM1_CORE_EXEC; + REPEAT (TRY HINT_EXISTS_REFL_TAC THEN + PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTMUL_P384_INTERM1_CORE_EXEC); UNDISCH_TAC `equiv_input_states (s,s') x y z` THEN REWRITE_TAC[equiv_input_states;C_ARGUMENTS;BIGNUM_FROM_MEMORY_BYTES; fst BIGNUM_MONTMUL_P384_INTERM1_CORE_EXEC] THEN STRIP_TAC THEN ASM_REWRITE_TAC[] THEN - MAP_EVERY EXISTS_TAC [`a:num`;`b:num`] THEN - REWRITE_TAC[] THEN - PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTMUL_P384_INTERM1_CORE_EXEC + REPEAT (TRY HINT_EXISTS_REFL_TAC THEN + PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTMUL_P384_INTERM1_CORE_EXEC); ]; REPEAT GEN_TAC THEN STRIP_TAC THEN @@ -855,7 +846,7 @@ let BIGNUM_MONTMUL_P384_NEON_CORE_CORRECT = time prove( [ FIRST_ASSUM (fun th -> MP_TAC th THEN REWRITE_TAC[DIVIDES_4_VAL_WORD_64;aligned_bytes_loaded_word] THEN METIS_TAC[]) THEN NO_TAC; ALL_TAC ] THEN - ASM_REWRITE_TAC[equiv_input_states] THEN + ASM_REWRITE_TAC[equiv_input_states;C_ARGUMENTS] THEN EXISTS_TAC `write (memory :> 
bytelist (word pc,LENGTH (APPEND bignum_montmul_p384_core_mc barrier_inst_bytes))) @@ -865,11 +856,8 @@ let BIGNUM_MONTMUL_P384_NEON_CORE_CORRECT = time prove( REPEAT CONJ_TAC THEN TRY (PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTMUL_P384_CORE_EXEC) THEN (* Now has only one subgoal: the equivalence! *) - REWRITE_TAC[C_ARGUMENTS;BIGNUM_FROM_MEMORY_BYTES] THEN - MAP_EVERY EXISTS_TAC [`a:num`;`b:num`] THEN - REPEAT CONJ_TAC THEN - TRY (PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTMUL_P384_CORE_EXEC) THEN - NO_TAC; + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN + REPEAT (TRY HINT_EXISTS_REFL_TAC THEN PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTMUL_P384_CORE_EXEC); (** SUBGOAL 2. Postcond **) MESON_TAC[equiv_output_states;BIGNUM_FROM_MEMORY_BYTES; @@ -1002,7 +990,7 @@ let BIGNUM_AMONTMUL_P384_NEON_CORE_CORRECT = time prove( [ FIRST_ASSUM (fun th -> MP_TAC th THEN REWRITE_TAC[DIVIDES_4_VAL_WORD_64;aligned_bytes_loaded_word] THEN METIS_TAC[]) THEN NO_TAC; ALL_TAC ] THEN - ASM_REWRITE_TAC[equiv_input_states] THEN + ASM_REWRITE_TAC[equiv_input_states;C_ARGUMENTS] THEN EXISTS_TAC `write (memory :> bytelist (word pc,LENGTH (APPEND bignum_montmul_p384_core_mc barrier_inst_bytes))) @@ -1013,10 +1001,7 @@ let BIGNUM_AMONTMUL_P384_NEON_CORE_CORRECT = time prove( TRY (PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTMUL_P384_CORE_EXEC) THEN (* Now has only one subgoal: the equivalence! *) REWRITE_TAC[C_ARGUMENTS;BIGNUM_FROM_MEMORY_BYTES] THEN - MAP_EVERY EXISTS_TAC [`a:num`;`b:num`] THEN - REPEAT CONJ_TAC THEN - TRY (PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTMUL_P384_CORE_EXEC) THEN - NO_TAC; + REPEAT (TRY HINT_EXISTS_REFL_TAC THEN PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTMUL_P384_CORE_EXEC); (** SUBGOAL 2. 
Postcond **) MESON_TAC[equiv_output_states;BIGNUM_FROM_MEMORY_BYTES; diff --git a/arm/proofs/bignum_montmul_p521.ml b/arm/proofs/bignum_montmul_p521.ml index ebfd4a5e..b729defb 100644 --- a/arm/proofs/bignum_montmul_p521.ml +++ b/arm/proofs/bignum_montmul_p521.ml @@ -649,6 +649,15 @@ let bignum_montmul_p521_mc = let BIGNUM_MONTMUL_P521_EXEC = ARM_MK_EXEC_RULE bignum_montmul_p521_mc;; +(* bignum_montmul_p521_mc without callee-save register spills + ret. *) +let bignum_montmul_p521_core_mc_def, + bignum_montmul_p521_core_mc, + BIGNUM_MONTMUL_P521_CORE_EXEC = + mk_sublist_of_mc "bignum_montmul_p521_core_mc" + bignum_montmul_p521_mc + (`20`,`LENGTH bignum_montmul_p521_mc - 44`) + (fst BIGNUM_MONTMUL_P521_EXEC);; + (* ------------------------------------------------------------------------- *) (* Proof. *) (* ------------------------------------------------------------------------- *) @@ -748,20 +757,21 @@ let ADK_48_TAC = DECARRY_RULE o CONJUNCTS) THEN DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC;; -let BIGNUM_MONTMUL_P521_CORRECT = prove +let BIGNUM_MONTMUL_P521_CORE_CORRECT = prove (`!z x y a b pc stackpointer. aligned 16 stackpointer /\ ALL (nonoverlapping (stackpointer,80)) - [(word pc,0x9d8); (z,8 * 9); (x,8 * 9); (y,8 * 9)] /\ - nonoverlapping (z,8 * 9) (word pc,0x9d8) + [(word pc,LENGTH bignum_montmul_p521_core_mc); (z,8 * 9); + (x,8 * 9); (y,8 * 9)] /\ + nonoverlapping (z,8 * 9) (word pc,LENGTH bignum_montmul_p521_core_mc) ==> ensures arm - (\s. aligned_bytes_loaded s (word pc) bignum_montmul_p521_mc /\ - read PC s = word(pc + 0x14) /\ + (\s. aligned_bytes_loaded s (word pc) bignum_montmul_p521_core_mc /\ + read PC s = word(pc) /\ read SP s = stackpointer /\ C_ARGUMENTS [z; x; y] s /\ bignum_from_memory (x,9) s = a /\ bignum_from_memory (y,9) s = b) - (\s. read PC s = word (pc + 0x9c0) /\ + (\s. 
read PC s = word (pc + LENGTH bignum_montmul_p521_core_mc) /\ (a < p_521 /\ b < p_521 ==> bignum_from_memory (z,9) s = (inverse_mod p_521 (2 EXP 576) * a * b) MOD p_521)) @@ -774,14 +784,15 @@ let BIGNUM_MONTMUL_P521_CORRECT = prove MAP_EVERY X_GEN_TAC [`z:int64`; `x:int64`; `y:int64`; `a:num`; `b:num`; `pc:num`; `stackpointer:int64`] THEN - REWRITE_TAC[ALL; C_ARGUMENTS; SOME_FLAGS; NONOVERLAPPING_CLAUSES] THEN + REWRITE_TAC[ALL; C_ARGUMENTS; SOME_FLAGS; NONOVERLAPPING_CLAUSES; + fst BIGNUM_MONTMUL_P521_CORE_EXEC] THEN DISCH_THEN(REPEAT_TCL CONJUNCTS_THEN ASSUME_TAC) THEN (*** Globalize the a < p_521 /\ b < p_521 assumption for simplicity ***) ASM_CASES_TAC `a < p_521 /\ b < p_521` THENL [ASM_REWRITE_TAC[] THEN FIRST_X_ASSUM(CONJUNCTS_THEN ASSUME_TAC); - ARM_SIM_TAC BIGNUM_MONTMUL_P521_EXEC (1--619)] THEN + ARM_SIM_TAC BIGNUM_MONTMUL_P521_CORE_EXEC (1--619)] THEN (*** Digitize, deduce the bound on the top words ***) @@ -799,7 +810,7 @@ let BIGNUM_MONTMUL_P521_CORRECT = prove (*** 4x4 multiplication of the low portions and its rebasing ***) - ARM_ACCSTEPS_TAC BIGNUM_MONTMUL_P521_EXEC + ARM_ACCSTEPS_TAC BIGNUM_MONTMUL_P521_CORE_EXEC [5; 6; 7; 8; 10; 12; 14; 16; 17; 18; 19; 20; 21; 22; 23; 24; 25; 26; 27; 33; 38; 40; 41; 47; 52; 54; 55; 56; 57; 58; 59; 65; 70; 72; 73; 74; 80; 85; 87; 88; 89; 90; 91; 97; 102; 104; 105; 106; @@ -861,7 +872,7 @@ let BIGNUM_MONTMUL_P521_CORRECT = prove (*** 4x4 multiplication of the high portions ***) - ARM_ACCSTEPS_TAC BIGNUM_MONTMUL_P521_EXEC + ARM_ACCSTEPS_TAC BIGNUM_MONTMUL_P521_CORE_EXEC [138; 139; 140; 141; 143; 145; 147; 149; 150; 151; 152; 153; 154; 155; 156; 157; 158; 159; 160; 166; 171; 173; 174; 180; 185; 187; 188; 189; 190; 191; 192; 198; 203; 205; 206; 207; @@ -881,7 +892,7 @@ let BIGNUM_MONTMUL_P521_CORRECT = prove (*** Addition combining high and low parts into hl ***) - ARM_ACCSTEPS_TAC BIGNUM_MONTMUL_P521_EXEC + ARM_ACCSTEPS_TAC BIGNUM_MONTMUL_P521_CORE_EXEC [258; 259; 262; 263; 266; 267; 270; 271; 274] (257--275) THEN 
ABBREV_TAC @@ -926,7 +937,7 @@ let BIGNUM_MONTMUL_P521_CORRECT = prove (*** The sign-magnitude difference computation ***) - ARM_ACCSTEPS_TAC BIGNUM_MONTMUL_P521_EXEC + ARM_ACCSTEPS_TAC BIGNUM_MONTMUL_P521_CORE_EXEC [277; 278; 280; 281; 284; 285; 287; 288; 291; 293; 295; 297; 299; 301; 303; 305] (276--306) THEN @@ -992,7 +1003,7 @@ let BIGNUM_MONTMUL_P521_CORRECT = prove (*** One more 4x4 multiplication of the cross-terms ***) - ARM_ACCSTEPS_TAC BIGNUM_MONTMUL_P521_EXEC + ARM_ACCSTEPS_TAC BIGNUM_MONTMUL_P521_CORE_EXEC [307; 308; 309; 310; 312; 314; 316; 318; 319; 320; 321; 322; 323; 324; 325; 326; 327; 328; 329; 335; 340; 342; 343; 349; 354; 356; 357; 358; 359; 360; 361; 367; 372; 374; 375; 376; @@ -1016,7 +1027,7 @@ let BIGNUM_MONTMUL_P521_CORRECT = prove EXPAND_TAC "hl" THEN CONV_TAC(LAND_CONV(ONCE_DEPTH_CONV BIGNUM_OF_WORDLIST_DIV_CONV)) THEN DISCH_TAC THEN - ARM_ACCSTEPS_TAC BIGNUM_MONTMUL_P521_EXEC + ARM_ACCSTEPS_TAC BIGNUM_MONTMUL_P521_CORE_EXEC [429; 431; 433; 435; 440; 442; 444; 446] (426--447) THEN ABBREV_TAC @@ -1092,7 +1103,7 @@ let BIGNUM_MONTMUL_P521_CORRECT = prove DISCH_THEN(ASSUME_TAC o MATCH_MP (INTEGER_RULE `(x:int == a + y) (mod n) /\ (y' == y) (mod n) ==> (x == a + y') (mod n)`))] THEN - ARM_ACCSTEPS_TAC BIGNUM_MONTMUL_P521_EXEC + ARM_ACCSTEPS_TAC BIGNUM_MONTMUL_P521_CORE_EXEC [448; 449; 450; 451; 455; 457; 459; 461] (448--463) THEN ABBREV_TAC @@ -1170,7 +1181,7 @@ let BIGNUM_MONTMUL_P521_CORRECT = prove (*** The intricate augmentation of the product with top words ***) - ARM_ACCSTEPS_TAC BIGNUM_MONTMUL_P521_EXEC + ARM_ACCSTEPS_TAC BIGNUM_MONTMUL_P521_CORE_EXEC [484; 498; 510; 527; 551; 566; 579; 590; 595] (464--595) THEN ONCE_REWRITE_TAC[GSYM MOD_MULT_MOD2] THEN @@ -1475,7 +1486,7 @@ let BIGNUM_MONTMUL_P521_CORRECT = prove STRIP_ASSUME_TAC THENL [REWRITE_TAC[MOD_LT_EQ] THEN UNDISCH_TAC `n < 2 EXP 576` THEN ARITH_TAC; ALL_TAC] THEN - ARM_STEPS_TAC BIGNUM_MONTMUL_P521_EXEC (596--598) THEN + ARM_STEPS_TAC BIGNUM_MONTMUL_P521_CORE_EXEC (596--598) 
THEN RULE_ASSUM_TAC(REWRITE_RULE[GSYM WORD_AND_ASSOC; DIMINDEX_64; NUM_REDUCE_CONV `9 MOD 64`]) THEN MAP_EVERY ABBREV_TAC @@ -1485,7 +1496,7 @@ let BIGNUM_MONTMUL_P521_CORRECT = prove word_and sum_s498 (word_and sum_s510 (word_and sum_s527 (word_and sum_s551 (word_and sum_s566 (word_and sum_s579 sum_s590)))))`] THEN - ARM_ACCSTEPS_TAC BIGNUM_MONTMUL_P521_EXEC (599--601) (599--601) THEN + ARM_ACCSTEPS_TAC BIGNUM_MONTMUL_P521_CORE_EXEC (599--601) (599--601) THEN SUBGOAL_THEN `carry_s601 <=> 2 EXP 192 <= @@ -1497,7 +1508,7 @@ let BIGNUM_MONTMUL_P521_CORRECT = prove ACCUMULATOR_ASSUM_LIST(MP_TAC o end_itlist CONJ o DECARRY_RULE) THEN DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN BOUNDER_TAC[]; ACCUMULATOR_POP_ASSUM_LIST(K ALL_TAC)] THEN - ARM_ACCSTEPS_TAC BIGNUM_MONTMUL_P521_EXEC (602--610) (602--610) THEN + ARM_ACCSTEPS_TAC BIGNUM_MONTMUL_P521_CORE_EXEC (602--610) (602--610) THEN SUBGOAL_THEN `val(d:int64) = 2 EXP 9 * (2 EXP 55 - 1) + val(sum_s595:int64) MOD 2 EXP 9` SUBST_ALL_TAC THENL @@ -1609,7 +1620,7 @@ let BIGNUM_MONTMUL_P521_CORRECT = prove (*** The rotation to shift from the 512 position ***) - ARM_STEPS_TAC BIGNUM_MONTMUL_P521_EXEC (611--619) THEN + ARM_STEPS_TAC BIGNUM_MONTMUL_P521_CORE_EXEC (611--619) THEN ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN CONV_TAC MOD_DOWN_CONV THEN CONV_TAC SYM_CONV THEN REWRITE_TAC[MOD_UNIQUE] THEN @@ -1645,6 +1656,41 @@ let BIGNUM_MONTMUL_P521_CORRECT = prove CONV_TAC NUM_REDUCE_CONV THEN REWRITE_TAC[GSYM REAL_OF_NUM_CLAUSES] THEN REAL_INTEGER_TAC);; +let BIGNUM_MONTMUL_P521_CORRECT = time prove + (`!z x y a b pc stackpointer. + aligned 16 stackpointer /\ + ALL (nonoverlapping (stackpointer,80)) + [(word pc,LENGTH bignum_montmul_p521_mc); (z,8 * 9); + (x,8 * 9); (y,8 * 9)] /\ + nonoverlapping (z,8 * 9) (word pc,LENGTH bignum_montmul_p521_mc) + ==> ensures arm + (\s. 
aligned_bytes_loaded s (word pc) bignum_montmul_p521_mc /\ + read PC s = word(pc + 20) /\ + read SP s = stackpointer /\ + C_ARGUMENTS [z; x; y] s /\ + bignum_from_memory (x,9) s = a /\ + bignum_from_memory (y,9) s = b) + (\s. read PC s = word (pc + 20 + LENGTH bignum_montmul_p521_core_mc) /\ + (a < p_521 /\ b < p_521 + ==> bignum_from_memory (z,9) s = + (inverse_mod p_521 (2 EXP 576) * a * b) MOD p_521)) + (MAYCHANGE [PC; X3; X4; X5; X6; X7; X8; X9; + X10; X11; X12; X13; X14; X15; X16; X17; X19; + X20; X21; X22; X23; X24; X25; X26] ,, + MAYCHANGE SOME_FLAGS ,, + MAYCHANGE [memory :> bignum(z,9); + memory :> bytes(stackpointer,80)])`, + + ARM_SUB_LIST_OF_MC_TAC BIGNUM_MONTMUL_P521_CORE_CORRECT + bignum_montmul_p521_core_mc_def + [fst BIGNUM_MONTMUL_P521_CORE_EXEC;fst BIGNUM_MONTMUL_P521_EXEC] THEN + + REPEAT (POP_ASSUM MP_TAC) THEN + REWRITE_TAC([fst BIGNUM_MONTMUL_P521_CORE_EXEC;fst BIGNUM_MONTMUL_P521_EXEC;ALL; + NONOVERLAPPING_CLAUSES]) THEN + REPEAT STRIP_TAC THEN + ASM_REWRITE_TAC[] THEN NONOVERLAPPING_TAC);; + let BIGNUM_MONTMUL_P521_SUBROUTINE_CORRECT = prove (`!z x y a b pc stackpointer returnaddress. aligned 16 stackpointer /\ @@ -1666,6 +1712,9 @@ let BIGNUM_MONTMUL_P521_SUBROUTINE_CORRECT = prove (MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, MAYCHANGE [memory :> bytes(z,8 * 9); memory :> bytes(word_sub stackpointer (word 144),144)])`, + let th = CONV_RULE (ONCE_DEPTH_CONV NUM_ADD_CONV) + (REWRITE_RULE [fst BIGNUM_MONTMUL_P521_CORE_EXEC;fst BIGNUM_MONTMUL_P521_EXEC] + BIGNUM_MONTMUL_P521_CORRECT) in ARM_ADD_RETURN_STACK_TAC - BIGNUM_MONTMUL_P521_EXEC BIGNUM_MONTMUL_P521_CORRECT + BIGNUM_MONTMUL_P521_EXEC th `[X19;X20;X21;X22;X23;X24;X25;X26]` 144);; diff --git a/arm/proofs/bignum_montmul_p521_neon.ml b/arm/proofs/bignum_montmul_p521_neon.ml new file mode 100644 index 00000000..a5746203 --- /dev/null +++ b/arm/proofs/bignum_montmul_p521_neon.ml @@ -0,0 +1,1216 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(****************************************************************************** + The first program equivalence between the 'core' part of source program and + its SIMD-vectorized but not instruction-scheduled program +******************************************************************************) + +needs "arm/proofs/bignum_montmul_p521.ml";; +needs "arm/proofs/equiv.ml";; +needs "arm/proofs/neon_helper.ml";; + +(* This is the intermediate program that is equivalent to both + bignum_montmul_p521 and bignum_montmul_p521_neon. This is a vectorized + version of bignum_montmul_p521 but instructions are unscheduled. *) + +let bignum_montmul_p521_interm1_ops:int list = [ + 0xa9401c2e; (* ldp x14, x7, [x1] *) + 0xa9416423; (* ldp x3, x25, [x1, #16] *) + 0xa940604a; (* ldp x10, x24, [x2] *) + 0x3dc00020; (* ldr q0, [x1] *) + 0x3dc00059; (* ldr q25, [x2] *) + 0xa941184c; (* ldp x12, x6, [x2, #16] *) + 0x6f00e5f2; (* movi v18.2d, #0xffffffff *) + 0x4e995b23; (* uzp2 v3.4s, v25.4s, v25.4s *) + 0x0ea1281a; (* xtn v26.2s, v0.2d *) + 0x0ea12b36; (* xtn v22.2s, v25.2d *) + 0x4ea00b38; (* rev64 v24.4s, v25.4s *) + 0x2eb6c353; (* umull v19.2d, v26.2s, v22.2s *) + 0x2ea3c359; (* umull v25.2d, v26.2s, v3.2s *) + 0x4e805814; (* uzp2 v20.4s, v0.4s, v0.4s *) + 0x4ea09f00; (* mul v0.4s, v24.4s, v0.4s *) + 0x6f601679; (* usra v25.2d, v19.2d, #32 *) + 0x2ea3c286; (* umull v6.2d, v20.2s, v3.2s *) + 0x6ea02800; (* uaddlp v0.2d, v0.4s *) + 0x4e321f32; (* and v18.16b, v25.16b, v18.16b *) + 0x2eb68292; (* umlal v18.2d, v20.2s, v22.2s *) + 0x4f605400; (* shl v0.2d, v0.2d, #32 *) + 0x6f601726; (* usra v6.2d, v25.2d, #32 *) + 0x2eb68340; (* umlal v0.2d, v26.2s, v22.2s *) + 0x6f601646; (* usra v6.2d, v18.2d, #32 *) + 0x4e083c17; (* mov x23, v0.d[0] *) + 0x4e183c10; (* mov x16, v0.d[1] *) + 0x9b0c7c65; (* mul x5, x3, x12 *) + 0x9b067f35; (* mul x21, x25, x6 *) + 0x4e083cd3; (* mov x19, v6.d[0] *) + 0xab130210; (* adds x16, x16, x19 
*) + 0x4e183cd3; (* mov x19, v6.d[1] *) + 0xba1300a5; (* adcs x5, x5, x19 *) + 0x9bcc7c73; (* umulh x19, x3, x12 *) + 0xba1302b5; (* adcs x21, x21, x19 *) + 0x9bc67f33; (* umulh x19, x25, x6 *) + 0x9a1f0273; (* adc x19, x19, xzr *) + 0xab170208; (* adds x8, x16, x23 *) + 0xba1000b0; (* adcs x16, x5, x16 *) + 0xba0502a5; (* adcs x5, x21, x5 *) + 0xba150275; (* adcs x21, x19, x21 *) + 0x9a1303f3; (* adc x19, xzr, x19 *) + 0xab17020b; (* adds x11, x16, x23 *) + 0xba0800af; (* adcs x15, x5, x8 *) + 0xba1002b0; (* adcs x16, x21, x16 *) + 0xba050265; (* adcs x5, x19, x5 *) + 0xba1503f5; (* adcs x21, xzr, x21 *) + 0x9a1303f3; (* adc x19, xzr, x19 *) + 0xeb190074; (* subs x20, x3, x25 *) + 0xda942694; (* cneg x20, x20, cc // cc = lo, ul, last *) + 0xda9f23e9; (* csetm x9, cc // cc = lo, ul, last *) + 0xeb0c00cd; (* subs x13, x6, x12 *) + 0xda8d25ad; (* cneg x13, x13, cc // cc = lo, ul, last *) + 0x9b0d7e9a; (* mul x26, x20, x13 *) + 0x9bcd7e94; (* umulh x20, x20, x13 *) + 0xda892129; (* cinv x9, x9, cc // cc = lo, ul, last *) + 0xb100053f; (* cmn x9, #0x1 *) + 0xca09034d; (* eor x13, x26, x9 *) + 0xba0d00a5; (* adcs x5, x5, x13 *) + 0xca090294; (* eor x20, x20, x9 *) + 0xba1402b5; (* adcs x21, x21, x20 *) + 0x9a090273; (* adc x19, x19, x9 *) + 0xeb0701d4; (* subs x20, x14, x7 *) + 0xda942694; (* cneg x20, x20, cc // cc = lo, ul, last *) + 0xda9f23e9; (* csetm x9, cc // cc = lo, ul, last *) + 0xeb0a030d; (* subs x13, x24, x10 *) + 0xda8d25ad; (* cneg x13, x13, cc // cc = lo, ul, last *) + 0x9b0d7e9a; (* mul x26, x20, x13 *) + 0x9bcd7e94; (* umulh x20, x20, x13 *) + 0xda892129; (* cinv x9, x9, cc // cc = lo, ul, last *) + 0xb100053f; (* cmn x9, #0x1 *) + 0xca09034d; (* eor x13, x26, x9 *) + 0xba0d0108; (* adcs x8, x8, x13 *) + 0xca090294; (* eor x20, x20, x9 *) + 0xba14016b; (* adcs x11, x11, x20 *) + 0xba0901ef; (* adcs x15, x15, x9 *) + 0xba090210; (* adcs x16, x16, x9 *) + 0xba0900a5; (* adcs x5, x5, x9 *) + 0xba0902b5; (* adcs x21, x21, x9 *) + 0x9a090273; (* adc x19, 
x19, x9 *) + 0xeb1900f4; (* subs x20, x7, x25 *) + 0xda942694; (* cneg x20, x20, cc // cc = lo, ul, last *) + 0xda9f23e9; (* csetm x9, cc // cc = lo, ul, last *) + 0xeb1800cd; (* subs x13, x6, x24 *) + 0xda8d25ad; (* cneg x13, x13, cc // cc = lo, ul, last *) + 0x9b0d7e9a; (* mul x26, x20, x13 *) + 0x9bcd7e94; (* umulh x20, x20, x13 *) + 0xda892129; (* cinv x9, x9, cc // cc = lo, ul, last *) + 0xb100053f; (* cmn x9, #0x1 *) + 0xca09034d; (* eor x13, x26, x9 *) + 0xba0d0210; (* adcs x16, x16, x13 *) + 0xca090294; (* eor x20, x20, x9 *) + 0xba1400a5; (* adcs x5, x5, x20 *) + 0xba0902b5; (* adcs x21, x21, x9 *) + 0x9a090273; (* adc x19, x19, x9 *) + 0xeb0301d4; (* subs x20, x14, x3 *) + 0xda942694; (* cneg x20, x20, cc // cc = lo, ul, last *) + 0xda9f23e9; (* csetm x9, cc // cc = lo, ul, last *) + 0xeb0a018d; (* subs x13, x12, x10 *) + 0xda8d25ad; (* cneg x13, x13, cc // cc = lo, ul, last *) + 0x9b0d7e9a; (* mul x26, x20, x13 *) + 0x9bcd7e94; (* umulh x20, x20, x13 *) + 0xda892129; (* cinv x9, x9, cc // cc = lo, ul, last *) + 0xb100053f; (* cmn x9, #0x1 *) + 0xca09034d; (* eor x13, x26, x9 *) + 0xba0d016b; (* adcs x11, x11, x13 *) + 0xca090294; (* eor x20, x20, x9 *) + 0xba1401ef; (* adcs x15, x15, x20 *) + 0xba090210; (* adcs x16, x16, x9 *) + 0xba0900a5; (* adcs x5, x5, x9 *) + 0xba0902b5; (* adcs x21, x21, x9 *) + 0x9a090273; (* adc x19, x19, x9 *) + 0xeb1901d9; (* subs x25, x14, x25 *) + 0xda992739; (* cneg x25, x25, cc // cc = lo, ul, last *) + 0xda9f23f4; (* csetm x20, cc // cc = lo, ul, last *) + 0xeb0a00ca; (* subs x10, x6, x10 *) + 0xda8a254a; (* cneg x10, x10, cc // cc = lo, ul, last *) + 0x9b0a7f26; (* mul x6, x25, x10 *) + 0x9bca7f39; (* umulh x25, x25, x10 *) + 0xda94228a; (* cinv x10, x20, cc // cc = lo, ul, last *) + 0xb100055f; (* cmn x10, #0x1 *) + 0xca0a00c6; (* eor x6, x6, x10 *) + 0xba0601e6; (* adcs x6, x15, x6 *) + 0xca0a0339; (* eor x25, x25, x10 *) + 0xba190219; (* adcs x25, x16, x25 *) + 0xba0a00b0; (* adcs x16, x5, x10 *) + 0xba0a02a5; (* adcs 
x5, x21, x10 *) + 0x9a0a026a; (* adc x10, x19, x10 *) + 0xeb0300e7; (* subs x7, x7, x3 *) + 0xda8724e7; (* cneg x7, x7, cc // cc = lo, ul, last *) + 0xda9f23e3; (* csetm x3, cc // cc = lo, ul, last *) + 0xeb180198; (* subs x24, x12, x24 *) + 0xda982718; (* cneg x24, x24, cc // cc = lo, ul, last *) + 0x9b187cec; (* mul x12, x7, x24 *) + 0x9bd87ce7; (* umulh x7, x7, x24 *) + 0xda832063; (* cinv x3, x3, cc // cc = lo, ul, last *) + 0xb100047f; (* cmn x3, #0x1 *) + 0xca030198; (* eor x24, x12, x3 *) + 0xba1800d8; (* adcs x24, x6, x24 *) + 0xca0300e7; (* eor x7, x7, x3 *) + 0xba070327; (* adcs x7, x25, x7 *) + 0xba030219; (* adcs x25, x16, x3 *) + 0xba0300ac; (* adcs x12, x5, x3 *) + 0x9a030143; (* adc x3, x10, x3 *) + 0xd377daea; (* lsl x10, x23, #9 *) + 0x93d7dd06; (* extr x6, x8, x23, #55 *) + 0x93c8dd77; (* extr x23, x11, x8, #55 *) + 0x93cbdf10; (* extr x16, x24, x11, #55 *) + 0xd377ff18; (* lsr x24, x24, #55 *) + 0xa90067e7; (* stp x7, x25, [sp] *) + 0xa9010fec; (* stp x12, x3, [sp, #16] *) + 0xa9021bea; (* stp x10, x6, [sp, #32] *) + 0xa90343f7; (* stp x23, x16, [sp, #48] *) + 0xf90023f8; (* str x24, [sp, #64] *) + 0xa9420c27; (* ldp x7, x3, [x1, #32] *) + 0x3dc00820; (* ldr q0, [x1, #32] *) + 0xa9432839; (* ldp x25, x10, [x1, #48] *) + 0xa9423058; (* ldp x24, x12, [x2, #32] *) + 0x3dc00859; (* ldr q25, [x2, #32] *) + 0xa9435c46; (* ldp x6, x23, [x2, #48] *) + 0x3dc00c32; (* ldr q18, [x1, #48] *) + 0x3dc00c43; (* ldr q3, [x2, #48] *) + 0x4e801b3a; (* uzp1 v26.4s, v25.4s, v0.4s *) + 0x4ea00b39; (* rev64 v25.4s, v25.4s *) + 0x4e801816; (* uzp1 v22.4s, v0.4s, v0.4s *) + 0x4ea09f20; (* mul v0.4s, v25.4s, v0.4s *) + 0x6ea02800; (* uaddlp v0.2d, v0.4s *) + 0x4f605400; (* shl v0.2d, v0.2d, #32 *) + 0x2eba82c0; (* umlal v0.2d, v22.2s, v26.2s *) + 0x4e083c10; (* mov x16, v0.d[0] *) + 0x4e183c05; (* mov x5, v0.d[1] *) + 0x6f00e5e0; (* movi v0.2d, #0xffffffff *) + 0x4e835879; (* uzp2 v25.4s, v3.4s, v3.4s *) + 0x0ea12a5a; (* xtn v26.2s, v18.2d *) + 0x0ea12876; (* xtn v22.2s, 
v3.2d *) + 0x4ea00878; (* rev64 v24.4s, v3.4s *) + 0x2eb6c353; (* umull v19.2d, v26.2s, v22.2s *) + 0x2eb9c343; (* umull v3.2d, v26.2s, v25.2s *) + 0x4e925a54; (* uzp2 v20.4s, v18.4s, v18.4s *) + 0x4eb29f12; (* mul v18.4s, v24.4s, v18.4s *) + 0x6f601663; (* usra v3.2d, v19.2d, #32 *) + 0x2eb9c286; (* umull v6.2d, v20.2s, v25.2s *) + 0x6ea02a59; (* uaddlp v25.2d, v18.4s *) + 0x4e201c60; (* and v0.16b, v3.16b, v0.16b *) + 0x2eb68280; (* umlal v0.2d, v20.2s, v22.2s *) + 0x4f605739; (* shl v25.2d, v25.2d, #32 *) + 0x6f601466; (* usra v6.2d, v3.2d, #32 *) + 0x2eb68359; (* umlal v25.2d, v26.2s, v22.2s *) + 0x6f601406; (* usra v6.2d, v0.2d, #32 *) + 0x4e083f35; (* mov x21, v25.d[0] *) + 0x4e183f33; (* mov x19, v25.d[1] *) + 0x9bd87ce8; (* umulh x8, x7, x24 *) + 0xab0800a5; (* adds x5, x5, x8 *) + 0x9bcc7c68; (* umulh x8, x3, x12 *) + 0xba0802b5; (* adcs x21, x21, x8 *) + 0x4e083cc8; (* mov x8, v6.d[0] *) + 0xba080273; (* adcs x19, x19, x8 *) + 0x4e183cc8; (* mov x8, v6.d[1] *) + 0x9a1f0108; (* adc x8, x8, xzr *) + 0xab1000ab; (* adds x11, x5, x16 *) + 0xba0502a5; (* adcs x5, x21, x5 *) + 0xba150275; (* adcs x21, x19, x21 *) + 0xba130113; (* adcs x19, x8, x19 *) + 0x9a0803e8; (* adc x8, xzr, x8 *) + 0xab1000af; (* adds x15, x5, x16 *) + 0xba0b02b4; (* adcs x20, x21, x11 *) + 0xba050265; (* adcs x5, x19, x5 *) + 0xba150115; (* adcs x21, x8, x21 *) + 0xba1303f3; (* adcs x19, xzr, x19 *) + 0x9a0803e8; (* adc x8, xzr, x8 *) + 0xeb0a0329; (* subs x9, x25, x10 *) + 0xda892529; (* cneg x9, x9, cc // cc = lo, ul, last *) + 0xda9f23ed; (* csetm x13, cc // cc = lo, ul, last *) + 0xeb0602fa; (* subs x26, x23, x6 *) + 0xda9a275a; (* cneg x26, x26, cc // cc = lo, ul, last *) + 0x9b1a7d36; (* mul x22, x9, x26 *) + 0x9bda7d29; (* umulh x9, x9, x26 *) + 0xda8d21ad; (* cinv x13, x13, cc // cc = lo, ul, last *) + 0xb10005bf; (* cmn x13, #0x1 *) + 0xca0d02da; (* eor x26, x22, x13 *) + 0xba1a02b5; (* adcs x21, x21, x26 *) + 0xca0d0129; (* eor x9, x9, x13 *) + 0xba090273; (* adcs x19, x19, x9 
*) + 0x9a0d0108; (* adc x8, x8, x13 *) + 0xeb0300e9; (* subs x9, x7, x3 *) + 0xda892529; (* cneg x9, x9, cc // cc = lo, ul, last *) + 0xda9f23ed; (* csetm x13, cc // cc = lo, ul, last *) + 0xeb18019a; (* subs x26, x12, x24 *) + 0xda9a275a; (* cneg x26, x26, cc // cc = lo, ul, last *) + 0x9b1a7d36; (* mul x22, x9, x26 *) + 0x9bda7d29; (* umulh x9, x9, x26 *) + 0xda8d21ad; (* cinv x13, x13, cc // cc = lo, ul, last *) + 0xb10005bf; (* cmn x13, #0x1 *) + 0xca0d02da; (* eor x26, x22, x13 *) + 0xba1a016b; (* adcs x11, x11, x26 *) + 0xca0d0129; (* eor x9, x9, x13 *) + 0xba0901ef; (* adcs x15, x15, x9 *) + 0xba0d0294; (* adcs x20, x20, x13 *) + 0xba0d00a5; (* adcs x5, x5, x13 *) + 0xba0d02b5; (* adcs x21, x21, x13 *) + 0xba0d0273; (* adcs x19, x19, x13 *) + 0x9a0d0108; (* adc x8, x8, x13 *) + 0xeb0a0069; (* subs x9, x3, x10 *) + 0xda892529; (* cneg x9, x9, cc // cc = lo, ul, last *) + 0xda9f23ed; (* csetm x13, cc // cc = lo, ul, last *) + 0xeb0c02fa; (* subs x26, x23, x12 *) + 0xda9a275a; (* cneg x26, x26, cc // cc = lo, ul, last *) + 0x9b1a7d36; (* mul x22, x9, x26 *) + 0x9bda7d29; (* umulh x9, x9, x26 *) + 0xda8d21ad; (* cinv x13, x13, cc // cc = lo, ul, last *) + 0xb10005bf; (* cmn x13, #0x1 *) + 0xca0d02da; (* eor x26, x22, x13 *) + 0xba1a00a5; (* adcs x5, x5, x26 *) + 0xca0d0129; (* eor x9, x9, x13 *) + 0xba0902ae; (* adcs x14, x21, x9 *) + 0xba0d0275; (* adcs x21, x19, x13 *) + 0x9a0d0113; (* adc x19, x8, x13 *) + 0xeb1900e9; (* subs x9, x7, x25 *) + 0xda892528; (* cneg x8, x9, cc // cc = lo, ul, last *) + 0xda9f23e9; (* csetm x9, cc // cc = lo, ul, last *) + 0xeb1800cd; (* subs x13, x6, x24 *) + 0xda8d25ad; (* cneg x13, x13, cc // cc = lo, ul, last *) + 0x9b0d7d1a; (* mul x26, x8, x13 *) + 0x9bcd7d08; (* umulh x8, x8, x13 *) + 0xda892129; (* cinv x9, x9, cc // cc = lo, ul, last *) + 0xb100053f; (* cmn x9, #0x1 *) + 0xca09034d; (* eor x13, x26, x9 *) + 0xba0d01ef; (* adcs x15, x15, x13 *) + 0xca090108; (* eor x8, x8, x9 *) + 0xba080288; (* adcs x8, x20, x8 *) + 
0xba0900a5; (* adcs x5, x5, x9 *) + 0xba0901d4; (* adcs x20, x14, x9 *) + 0xba0902b5; (* adcs x21, x21, x9 *) + 0x9a090273; (* adc x19, x19, x9 *) + 0xeb0a00e9; (* subs x9, x7, x10 *) + 0xda892529; (* cneg x9, x9, cc // cc = lo, ul, last *) + 0xda9f23ed; (* csetm x13, cc // cc = lo, ul, last *) + 0xeb1802fa; (* subs x26, x23, x24 *) + 0xda9a275a; (* cneg x26, x26, cc // cc = lo, ul, last *) + 0x9b1a7d36; (* mul x22, x9, x26 *) + 0x9bda7d29; (* umulh x9, x9, x26 *) + 0xda8d21ad; (* cinv x13, x13, cc // cc = lo, ul, last *) + 0xb10005bf; (* cmn x13, #0x1 *) + 0xca0d02da; (* eor x26, x22, x13 *) + 0xba1a0108; (* adcs x8, x8, x26 *) + 0xca0d0129; (* eor x9, x9, x13 *) + 0xba0900a5; (* adcs x5, x5, x9 *) + 0xba0d0294; (* adcs x20, x20, x13 *) + 0xba0d02b5; (* adcs x21, x21, x13 *) + 0x9a0d0273; (* adc x19, x19, x13 *) + 0xeb190069; (* subs x9, x3, x25 *) + 0xda892529; (* cneg x9, x9, cc // cc = lo, ul, last *) + 0xda9f23ed; (* csetm x13, cc // cc = lo, ul, last *) + 0xeb0c00da; (* subs x26, x6, x12 *) + 0xda9a275a; (* cneg x26, x26, cc // cc = lo, ul, last *) + 0x9b1a7d36; (* mul x22, x9, x26 *) + 0x9bda7d29; (* umulh x9, x9, x26 *) + 0xda8d21ad; (* cinv x13, x13, cc // cc = lo, ul, last *) + 0xb10005bf; (* cmn x13, #0x1 *) + 0xca0d02da; (* eor x26, x22, x13 *) + 0xba1a0108; (* adcs x8, x8, x26 *) + 0xca0d0129; (* eor x9, x9, x13 *) + 0xba0900a5; (* adcs x5, x5, x9 *) + 0xba0d0294; (* adcs x20, x20, x13 *) + 0xba0d02b5; (* adcs x21, x21, x13 *) + 0x9a0d0273; (* adc x19, x19, x13 *) + 0xa94037e9; (* ldp x9, x13, [sp] *) + 0xab090210; (* adds x16, x16, x9 *) + 0xba0d016b; (* adcs x11, x11, x13 *) + 0xa9002ff0; (* stp x16, x11, [sp] *) + 0xa9412ff0; (* ldp x16, x11, [sp, #16] *) + 0xba1001f0; (* adcs x16, x15, x16 *) + 0xba0b0108; (* adcs x8, x8, x11 *) + 0xa90123f0; (* stp x16, x8, [sp, #16] *) + 0xa94223f0; (* ldp x16, x8, [sp, #32] *) + 0xba1000b0; (* adcs x16, x5, x16 *) + 0xba080285; (* adcs x5, x20, x8 *) + 0xa90217f0; (* stp x16, x5, [sp, #32] *) + 0xa94317f0; (* 
ldp x16, x5, [sp, #48] *) + 0xba1002b0; (* adcs x16, x21, x16 *) + 0xba050265; (* adcs x5, x19, x5 *) + 0xa90317f0; (* stp x16, x5, [sp, #48] *) + 0xf94023f0; (* ldr x16, [sp, #64] *) + 0x9a1f0210; (* adc x16, x16, xzr *) + 0xf90023f0; (* str x16, [sp, #64] *) + 0xa9401430; (* ldp x16, x5, [x1] *) + 0xeb1000e7; (* subs x7, x7, x16 *) + 0xfa050063; (* sbcs x3, x3, x5 *) + 0xa9411430; (* ldp x16, x5, [x1, #16] *) + 0xfa100339; (* sbcs x25, x25, x16 *) + 0xfa05014a; (* sbcs x10, x10, x5 *) + 0xda9f23f0; (* csetm x16, cc // cc = lo, ul, last *) + 0xa9405445; (* ldp x5, x21, [x2] *) + 0xeb1800b8; (* subs x24, x5, x24 *) + 0xfa0c02ac; (* sbcs x12, x21, x12 *) + 0xa9414c45; (* ldp x5, x19, [x2, #16] *) + 0xfa0600a6; (* sbcs x6, x5, x6 *) + 0xfa170277; (* sbcs x23, x19, x23 *) + 0xda9f23e5; (* csetm x5, cc // cc = lo, ul, last *) + 0xca1000e7; (* eor x7, x7, x16 *) + 0xeb1000e7; (* subs x7, x7, x16 *) + 0xca100063; (* eor x3, x3, x16 *) + 0xfa100063; (* sbcs x3, x3, x16 *) + 0xca100339; (* eor x25, x25, x16 *) + 0xfa100339; (* sbcs x25, x25, x16 *) + 0xca10014a; (* eor x10, x10, x16 *) + 0xda10014a; (* sbc x10, x10, x16 *) + 0xca050318; (* eor x24, x24, x5 *) + 0xeb050318; (* subs x24, x24, x5 *) + 0xca05018c; (* eor x12, x12, x5 *) + 0xfa05018c; (* sbcs x12, x12, x5 *) + 0xca0500c6; (* eor x6, x6, x5 *) + 0xfa0500c6; (* sbcs x6, x6, x5 *) + 0xca0502f7; (* eor x23, x23, x5 *) + 0xda0502f7; (* sbc x23, x23, x5 *) + 0xca1000b0; (* eor x16, x5, x16 *) + 0x9b187cf5; (* mul x21, x7, x24 *) + 0x9b0c7c65; (* mul x5, x3, x12 *) + 0x9b067f33; (* mul x19, x25, x6 *) + 0x9b177d48; (* mul x8, x10, x23 *) + 0x9bd87ceb; (* umulh x11, x7, x24 *) + 0xab0b00a5; (* adds x5, x5, x11 *) + 0x9bcc7c6b; (* umulh x11, x3, x12 *) + 0xba0b0273; (* adcs x19, x19, x11 *) + 0x9bc67f2b; (* umulh x11, x25, x6 *) + 0xba0b0108; (* adcs x8, x8, x11 *) + 0x9bd77d4b; (* umulh x11, x10, x23 *) + 0x9a1f016b; (* adc x11, x11, xzr *) + 0xab1500af; (* adds x15, x5, x21 *) + 0xba050265; (* adcs x5, x19, x5 *) + 
0xba130113; (* adcs x19, x8, x19 *) + 0xba080168; (* adcs x8, x11, x8 *) + 0x9a0b03eb; (* adc x11, xzr, x11 *) + 0xab1500b4; (* adds x20, x5, x21 *) + 0xba0f0269; (* adcs x9, x19, x15 *) + 0xba050105; (* adcs x5, x8, x5 *) + 0xba130173; (* adcs x19, x11, x19 *) + 0xba0803e8; (* adcs x8, xzr, x8 *) + 0x9a0b03eb; (* adc x11, xzr, x11 *) + 0xeb0a032d; (* subs x13, x25, x10 *) + 0xda8d25ad; (* cneg x13, x13, cc // cc = lo, ul, last *) + 0xda9f23fa; (* csetm x26, cc // cc = lo, ul, last *) + 0xeb0602f6; (* subs x22, x23, x6 *) + 0xda9626d6; (* cneg x22, x22, cc // cc = lo, ul, last *) + 0x9b167da4; (* mul x4, x13, x22 *) + 0x9bd67dad; (* umulh x13, x13, x22 *) + 0xda9a235a; (* cinv x26, x26, cc // cc = lo, ul, last *) + 0xb100075f; (* cmn x26, #0x1 *) + 0xca1a0096; (* eor x22, x4, x26 *) + 0xba160273; (* adcs x19, x19, x22 *) + 0xca1a01ad; (* eor x13, x13, x26 *) + 0xba0d0108; (* adcs x8, x8, x13 *) + 0x9a1a016b; (* adc x11, x11, x26 *) + 0xeb0300ed; (* subs x13, x7, x3 *) + 0xda8d25ad; (* cneg x13, x13, cc // cc = lo, ul, last *) + 0xda9f23fa; (* csetm x26, cc // cc = lo, ul, last *) + 0xeb180196; (* subs x22, x12, x24 *) + 0xda9626d6; (* cneg x22, x22, cc // cc = lo, ul, last *) + 0x9b167da4; (* mul x4, x13, x22 *) + 0x9bd67dad; (* umulh x13, x13, x22 *) + 0xda9a235a; (* cinv x26, x26, cc // cc = lo, ul, last *) + 0xb100075f; (* cmn x26, #0x1 *) + 0xca1a0096; (* eor x22, x4, x26 *) + 0xba1601ef; (* adcs x15, x15, x22 *) + 0xca1a01ad; (* eor x13, x13, x26 *) + 0xba0d0294; (* adcs x20, x20, x13 *) + 0xba1a0129; (* adcs x9, x9, x26 *) + 0xba1a00a5; (* adcs x5, x5, x26 *) + 0xba1a0273; (* adcs x19, x19, x26 *) + 0xba1a0108; (* adcs x8, x8, x26 *) + 0x9a1a016b; (* adc x11, x11, x26 *) + 0xeb0a006d; (* subs x13, x3, x10 *) + 0xda8d25ad; (* cneg x13, x13, cc // cc = lo, ul, last *) + 0xda9f23fa; (* csetm x26, cc // cc = lo, ul, last *) + 0xeb0c02f6; (* subs x22, x23, x12 *) + 0xda9626d6; (* cneg x22, x22, cc // cc = lo, ul, last *) + 0x9b167da4; (* mul x4, x13, x22 *) + 
0x9bd67dad; (* umulh x13, x13, x22 *) + 0xda9a235a; (* cinv x26, x26, cc // cc = lo, ul, last *) + 0xb100075f; (* cmn x26, #0x1 *) + 0xca1a0096; (* eor x22, x4, x26 *) + 0xba1600a5; (* adcs x5, x5, x22 *) + 0xca1a01ad; (* eor x13, x13, x26 *) + 0xba0d0273; (* adcs x19, x19, x13 *) + 0xba1a0108; (* adcs x8, x8, x26 *) + 0x9a1a016b; (* adc x11, x11, x26 *) + 0xeb1900ed; (* subs x13, x7, x25 *) + 0xda8d25ad; (* cneg x13, x13, cc // cc = lo, ul, last *) + 0xda9f23fa; (* csetm x26, cc // cc = lo, ul, last *) + 0xeb1800d6; (* subs x22, x6, x24 *) + 0xda9626d6; (* cneg x22, x22, cc // cc = lo, ul, last *) + 0x9b167da4; (* mul x4, x13, x22 *) + 0x9bd67dad; (* umulh x13, x13, x22 *) + 0xda9a235a; (* cinv x26, x26, cc // cc = lo, ul, last *) + 0xb100075f; (* cmn x26, #0x1 *) + 0xca1a0096; (* eor x22, x4, x26 *) + 0xba160294; (* adcs x20, x20, x22 *) + 0xca1a01ad; (* eor x13, x13, x26 *) + 0xba0d0129; (* adcs x9, x9, x13 *) + 0xba1a00a5; (* adcs x5, x5, x26 *) + 0xba1a0273; (* adcs x19, x19, x26 *) + 0xba1a0108; (* adcs x8, x8, x26 *) + 0x9a1a016b; (* adc x11, x11, x26 *) + 0xeb0a00e7; (* subs x7, x7, x10 *) + 0xda8724e7; (* cneg x7, x7, cc // cc = lo, ul, last *) + 0xda9f23ea; (* csetm x10, cc // cc = lo, ul, last *) + 0xeb1802f8; (* subs x24, x23, x24 *) + 0xda982718; (* cneg x24, x24, cc // cc = lo, ul, last *) + 0x9b187cf7; (* mul x23, x7, x24 *) + 0x9bd87ce7; (* umulh x7, x7, x24 *) + 0xda8a214a; (* cinv x10, x10, cc // cc = lo, ul, last *) + 0xb100055f; (* cmn x10, #0x1 *) + 0xca0a02f8; (* eor x24, x23, x10 *) + 0xba180138; (* adcs x24, x9, x24 *) + 0xca0a00e7; (* eor x7, x7, x10 *) + 0xba0700a7; (* adcs x7, x5, x7 *) + 0xba0a0277; (* adcs x23, x19, x10 *) + 0xba0a0105; (* adcs x5, x8, x10 *) + 0x9a0a016a; (* adc x10, x11, x10 *) + 0xeb190063; (* subs x3, x3, x25 *) + 0xda832463; (* cneg x3, x3, cc // cc = lo, ul, last *) + 0xda9f23f9; (* csetm x25, cc // cc = lo, ul, last *) + 0xeb0c00cc; (* subs x12, x6, x12 *) + 0xda8c258c; (* cneg x12, x12, cc // cc = lo, ul, last 
*) + 0x9b0c7c66; (* mul x6, x3, x12 *) + 0x9bcc7c63; (* umulh x3, x3, x12 *) + 0xda992339; (* cinv x25, x25, cc // cc = lo, ul, last *) + 0xb100073f; (* cmn x25, #0x1 *) + 0xca1900cc; (* eor x12, x6, x25 *) + 0xba0c0318; (* adcs x24, x24, x12 *) + 0xca190063; (* eor x3, x3, x25 *) + 0xba0300e7; (* adcs x7, x7, x3 *) + 0xba1902e3; (* adcs x3, x23, x25 *) + 0xba1900ac; (* adcs x12, x5, x25 *) + 0x9a190159; (* adc x25, x10, x25 *) + 0xa9401bea; (* ldp x10, x6, [sp] *) + 0xa94117f7; (* ldp x23, x5, [sp, #16] *) + 0xca1002b5; (* eor x21, x21, x16 *) + 0xab0a02b5; (* adds x21, x21, x10 *) + 0xca1001f3; (* eor x19, x15, x16 *) + 0xba060273; (* adcs x19, x19, x6 *) + 0xca100288; (* eor x8, x20, x16 *) + 0xba170108; (* adcs x8, x8, x23 *) + 0xca100318; (* eor x24, x24, x16 *) + 0xba050318; (* adcs x24, x24, x5 *) + 0xca1000e7; (* eor x7, x7, x16 *) + 0xa9423feb; (* ldp x11, x15, [sp, #32] *) + 0xa94327f4; (* ldp x20, x9, [sp, #48] *) + 0xf94023ed; (* ldr x13, [sp, #64] *) + 0xba0b00e7; (* adcs x7, x7, x11 *) + 0xca100063; (* eor x3, x3, x16 *) + 0xba0f0063; (* adcs x3, x3, x15 *) + 0xca10018c; (* eor x12, x12, x16 *) + 0xba14018c; (* adcs x12, x12, x20 *) + 0xca100339; (* eor x25, x25, x16 *) + 0xba090339; (* adcs x25, x25, x9 *) + 0x9a1f01ba; (* adc x26, x13, xzr *) + 0xab0a00e7; (* adds x7, x7, x10 *) + 0xba060063; (* adcs x3, x3, x6 *) + 0xba17018a; (* adcs x10, x12, x23 *) + 0xba050339; (* adcs x25, x25, x5 *) + 0x9240220c; (* and x12, x16, #0x1ff *) + 0xd377daa6; (* lsl x6, x21, #9 *) + 0xaa0c00cc; (* orr x12, x6, x12 *) + 0xba0c016c; (* adcs x12, x11, x12 *) + 0x93d5de66; (* extr x6, x19, x21, #55 *) + 0xba0601e6; (* adcs x6, x15, x6 *) + 0x93d3dd17; (* extr x23, x8, x19, #55 *) + 0xba170297; (* adcs x23, x20, x23 *) + 0x93c8df10; (* extr x16, x24, x8, #55 *) + 0xba100130; (* adcs x16, x9, x16 *) + 0xd377ff18; (* lsr x24, x24, #55 *) + 0x9a0d0318; (* adc x24, x24, x13 *) + 0xf9402045; (* ldr x5, [x2, #64] *) + 0xa9404c35; (* ldp x21, x19, [x1] *) + 0x9240cea8; (* and 
x8, x21, #0xfffffffffffff *) + 0x9b087ca8; (* mul x8, x5, x8 *) + 0xf940202b; (* ldr x11, [x1, #64] *) + 0xa940504f; (* ldp x15, x20, [x2] *) + 0x9240cde9; (* and x9, x15, #0xfffffffffffff *) + 0x9b097d69; (* mul x9, x11, x9 *) + 0x8b090108; (* add x8, x8, x9 *) + 0x93d5d275; (* extr x21, x19, x21, #52 *) + 0x9240ceb5; (* and x21, x21, #0xfffffffffffff *) + 0x9b157cb5; (* mul x21, x5, x21 *) + 0x93cfd28f; (* extr x15, x20, x15, #52 *) + 0x9240cdef; (* and x15, x15, #0xfffffffffffff *) + 0x9b0f7d6f; (* mul x15, x11, x15 *) + 0x8b0f02b5; (* add x21, x21, x15 *) + 0xd374fd0f; (* lsr x15, x8, #52 *) + 0x8b0f02b5; (* add x21, x21, x15 *) + 0xd374cd08; (* lsl x8, x8, #12 *) + 0x93c832a8; (* extr x8, x21, x8, #12 *) + 0xab0800e7; (* adds x7, x7, x8 *) + 0xa9413c28; (* ldp x8, x15, [x1, #16] *) + 0xa9413449; (* ldp x9, x13, [x2, #16] *) + 0x93d3a113; (* extr x19, x8, x19, #40 *) + 0x9240ce73; (* and x19, x19, #0xfffffffffffff *) + 0x9b137cb3; (* mul x19, x5, x19 *) + 0x93d4a134; (* extr x20, x9, x20, #40 *) + 0x9240ce94; (* and x20, x20, #0xfffffffffffff *) + 0x9b147d74; (* mul x20, x11, x20 *) + 0x8b140273; (* add x19, x19, x20 *) + 0xd374feb4; (* lsr x20, x21, #52 *) + 0x8b140273; (* add x19, x19, x20 *) + 0xd374ceb5; (* lsl x21, x21, #12 *) + 0x93d56275; (* extr x21, x19, x21, #24 *) + 0xba150063; (* adcs x3, x3, x21 *) + 0x93c871f5; (* extr x21, x15, x8, #28 *) + 0x9240ceb5; (* and x21, x21, #0xfffffffffffff *) + 0x9b157cb5; (* mul x21, x5, x21 *) + 0x93c971a8; (* extr x8, x13, x9, #28 *) + 0x9240cd08; (* and x8, x8, #0xfffffffffffff *) + 0x9b087d68; (* mul x8, x11, x8 *) + 0x8b0802b5; (* add x21, x21, x8 *) + 0xd374fe68; (* lsr x8, x19, #52 *) + 0x8b0802b5; (* add x21, x21, x8 *) + 0xd374ce73; (* lsl x19, x19, #12 *) + 0x93d392b3; (* extr x19, x21, x19, #36 *) + 0xba13014a; (* adcs x10, x10, x19 *) + 0x8a0a0073; (* and x19, x3, x10 *) + 0xa9425028; (* ldp x8, x20, [x1, #32] *) + 0xa9425849; (* ldp x9, x22, [x2, #32] *) + 0x93cf410f; (* extr x15, x8, x15, #16 *) + 
0x9240cdef; (* and x15, x15, #0xfffffffffffff *) + 0x9b0f7ca4; (* mul x4, x5, x15 *) + 0x93cd412f; (* extr x15, x9, x13, #16 *) + 0x9240cdef; (* and x15, x15, #0xfffffffffffff *) + 0x9b0f7d6f; (* mul x15, x11, x15 *) + 0x8b0f008f; (* add x15, x4, x15 *) + 0xd3503f4d; (* lsl x13, x26, #48 *) + 0x8b0d01ef; (* add x15, x15, x13 *) + 0xd374fead; (* lsr x13, x21, #52 *) + 0x8b0d01ef; (* add x15, x15, x13 *) + 0xd374ceb5; (* lsl x21, x21, #12 *) + 0x93d5c1f5; (* extr x21, x15, x21, #48 *) + 0xba150339; (* adcs x25, x25, x21 *) + 0x8a190275; (* and x21, x19, x25 *) + 0xd344fd13; (* lsr x19, x8, #4 *) + 0x9240ce73; (* and x19, x19, #0xfffffffffffff *) + 0x9b137cb3; (* mul x19, x5, x19 *) + 0xd344fd3a; (* lsr x26, x9, #4 *) + 0x9240cf4d; (* and x13, x26, #0xfffffffffffff *) + 0x9b0d7d7a; (* mul x26, x11, x13 *) + 0x8b1a0273; (* add x19, x19, x26 *) + 0xd374fded; (* lsr x13, x15, #52 *) + 0x8b0d0273; (* add x19, x19, x13 *) + 0xd374cdef; (* lsl x15, x15, #12 *) + 0x93cff26f; (* extr x15, x19, x15, #60 *) + 0x93c8e288; (* extr x8, x20, x8, #56 *) + 0x9240cd08; (* and x8, x8, #0xfffffffffffff *) + 0x9b087ca8; (* mul x8, x5, x8 *) + 0x93c9e2c9; (* extr x9, x22, x9, #56 *) + 0x9240cd29; (* and x9, x9, #0xfffffffffffff *) + 0x9b097d69; (* mul x9, x11, x9 *) + 0x8b090108; (* add x8, x8, x9 *) + 0xd374fe73; (* lsr x19, x19, #52 *) + 0x8b130113; (* add x19, x8, x19 *) + 0xd378dde8; (* lsl x8, x15, #8 *) + 0x93c82268; (* extr x8, x19, x8, #8 *) + 0xba08018c; (* adcs x12, x12, x8 *) + 0x8a0c02b5; (* and x21, x21, x12 *) + 0xa9432021; (* ldp x1, x8, [x1, #48] *) + 0xa9433c42; (* ldp x2, x15, [x2, #48] *) + 0x93d4b034; (* extr x20, x1, x20, #44 *) + 0x9240ce94; (* and x20, x20, #0xfffffffffffff *) + 0x9b147cb4; (* mul x20, x5, x20 *) + 0x93d6b049; (* extr x9, x2, x22, #44 *) + 0x9240cd29; (* and x9, x9, #0xfffffffffffff *) + 0x9b097d69; (* mul x9, x11, x9 *) + 0x8b090294; (* add x20, x20, x9 *) + 0xd374fe69; (* lsr x9, x19, #52 *) + 0x8b090296; (* add x22, x20, x9 *) + 0xd374ce73; (* 
lsl x19, x19, #12 *) + 0x93d352d3; (* extr x19, x22, x19, #20 *) + 0xba1300c6; (* adcs x6, x6, x19 *) + 0x8a0602b5; (* and x21, x21, x6 *) + 0x93c18101; (* extr x1, x8, x1, #32 *) + 0x9240cc21; (* and x1, x1, #0xfffffffffffff *) + 0x9b017ca1; (* mul x1, x5, x1 *) + 0x93c281e2; (* extr x2, x15, x2, #32 *) + 0x9240cc42; (* and x2, x2, #0xfffffffffffff *) + 0x9b027d62; (* mul x2, x11, x2 *) + 0x8b020022; (* add x2, x1, x2 *) + 0xd374fec1; (* lsr x1, x22, #52 *) + 0x8b010042; (* add x2, x2, x1 *) + 0xd374cec1; (* lsl x1, x22, #12 *) + 0x93c18041; (* extr x1, x2, x1, #32 *) + 0xba0102f7; (* adcs x23, x23, x1 *) + 0x8a1702b5; (* and x21, x21, x23 *) + 0xd354fd01; (* lsr x1, x8, #20 *) + 0x9b017ca1; (* mul x1, x5, x1 *) + 0xd354fdf3; (* lsr x19, x15, #20 *) + 0x9b137d73; (* mul x19, x11, x19 *) + 0x8b130021; (* add x1, x1, x19 *) + 0xd374fc53; (* lsr x19, x2, #52 *) + 0x8b130033; (* add x19, x1, x19 *) + 0xd374cc42; (* lsl x2, x2, #12 *) + 0x93c2b262; (* extr x2, x19, x2, #44 *) + 0xba020210; (* adcs x16, x16, x2 *) + 0x8a1002a2; (* and x2, x21, x16 *) + 0x9b0b7ca5; (* mul x5, x5, x11 *) + 0xd36cfe61; (* lsr x1, x19, #44 *) + 0x8b0100a5; (* add x5, x5, x1 *) + 0x9a050318; (* adc x24, x24, x5 *) + 0xd349ff05; (* lsr x5, x24, #9 *) + 0xb277db18; (* orr x24, x24, #0xfffffffffffffe00 *) + 0xeb1f03ff; (* cmp xzr, xzr *) + 0xba0500ff; (* adcs xzr, x7, x5 *) + 0xba1f005f; (* adcs xzr, x2, xzr *) + 0xba1f031f; (* adcs xzr, x24, xzr *) + 0xba0500e7; (* adcs x7, x7, x5 *) + 0xba1f0062; (* adcs x2, x3, xzr *) + 0xba1f014a; (* adcs x10, x10, xzr *) + 0xba1f0339; (* adcs x25, x25, xzr *) + 0xba1f018c; (* adcs x12, x12, xzr *) + 0xba1f00c6; (* adcs x6, x6, xzr *) + 0xba1f02f7; (* adcs x23, x23, xzr *) + 0xba1f0210; (* adcs x16, x16, xzr *) + 0x9a1f0303; (* adc x3, x24, xzr *) + 0xa9002802; (* stp x2, x10, [x0] *) + 0xa9013019; (* stp x25, x12, [x0, #16] *) + 0xa9025c06; (* stp x6, x23, [x0, #32] *) + 0xd377d8f9; (* lsl x25, x7, #9 *) + 0x92402063; (* and x3, x3, #0x1ff *) + 0xaa190063; 
(* orr x3, x3, x25 *) + 0xa9030c10; (* stp x16, x3, [x0, #48] *) + 0xd377fcee; (* lsr x14, x7, #55 *) + 0xf900200e; (* str x14, [x0, #64] *) +];; + +let bignum_montmul_p521_interm1_core_mc = + let charlist = List.concat_map + (fun op32 -> + [Char.chr (Int.logand op32 255); + Char.chr (Int.logand (Int.shift_right op32 8) 255); + Char.chr (Int.logand (Int.shift_right op32 16) 255); + Char.chr (Int.logand (Int.shift_right op32 24) 255)]) + bignum_montmul_p521_interm1_ops in + let byte_list = Bytes.init (List.length charlist) (fun i -> List.nth charlist i) in + define_word_list "bignum_montmul_p521_interm1_core_mc" (term_of_bytes byte_list);; + +let BIGNUM_MONTMUL_P521_INTERM1_CORE_EXEC = + ARM_MK_EXEC_RULE bignum_montmul_p521_interm1_core_mc;; + +let equiv_input_states = new_definition + `!s1 s1' x y z stackpointer. + (equiv_input_states:(armstate#armstate)->int64->int64->int64->int64->bool) + (s1,s1') x y z stackpointer <=> + (C_ARGUMENTS [z; x; y] s1 /\ + C_ARGUMENTS [z; x; y] s1' /\ + read SP s1 = stackpointer /\ + read SP s1' = stackpointer /\ + ?a. bignum_from_memory (x,9) s1 = a /\ + bignum_from_memory (x,9) s1' = a /\ + (?b. bignum_from_memory (y,9) s1 = b /\ + bignum_from_memory (y,9) s1' = b))`;; + +let equiv_output_states = new_definition + `!s1 s1' z stackpointer. + (equiv_output_states:(armstate#armstate)->int64->int64->bool) + (s1,s1') z stackpointer <=> + (?a. 
+ read SP s1 = stackpointer /\ + read SP s1' = stackpointer /\ + bignum_from_memory (z,9) s1 = a /\ + bignum_from_memory (z,9) s1' = a)`;; + +let actions = [ + ("equal", 0, 3, 0, 3); + ("insert", 3, 3, 3, 5); + ("equal", 3, 4, 5, 6); + ("replace", 4, 6, 6, 26); + ("equal", 6, 8, 26, 28); + ("replace", 8, 9, 28, 29); + ("equal", 9, 10, 29, 30); + ("replace", 10, 11, 30, 31); + ("equal", 11, 134, 31, 154); + ("insert", 134, 134, 154, 155); + ("equal", 134, 136, 155, 157); + ("insert", 136, 136, 157, 158); + ("equal", 136, 137, 158, 159); + ("replace", 137, 141, 159, 190); + ("equal", 141, 145, 190, 194); + ("replace", 145, 146, 194, 195); + ("equal", 146, 147, 195, 196); + ("replace", 147, 148, 196, 197); + ("equal", 148, 619, 197, 668); +];; + +let equiv_goal1 = mk_equiv_statement + `aligned 16 stackpointer /\ + ALL (nonoverlapping (z:int64,8 * 9)) + [(word pc,LENGTH bignum_montmul_p521_core_mc); + (word pc2,LENGTH bignum_montmul_p521_interm1_core_mc)] /\ + ALL (nonoverlapping (stackpointer, 80)) + [(word pc,LENGTH bignum_montmul_p521_core_mc); + (word pc2,LENGTH bignum_montmul_p521_interm1_core_mc); + (z,8 * 9); (x:int64,8 * 9); (y:int64,8 * 9)]` + equiv_input_states + equiv_output_states + bignum_montmul_p521_core_mc 0 + `MAYCHANGE [PC; X3; X4; X5; X6; X7; X8; X9; + X10; X11; X12; X13; X14; X15; X16; X17; X19; + X20; X21; X22; X23; X24; X25; X26] ,, + MAYCHANGE SOME_FLAGS ,, + MAYCHANGE [memory :> bignum(z,9); + memory :> bytes(stackpointer,80)]` + bignum_montmul_p521_interm1_core_mc 0 + `MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; X22; X23; X24; X25; X26] ,, + MAYCHANGE [memory :> bignum(z,9); + memory :> bytes(stackpointer,80)]`;; + +let _org_extra_word_CONV = !extra_word_CONV;; +extra_word_CONV := + [GEN_REWRITE_CONV I [WORD_BITMANIP_SIMP_LEMMAS; WORD_MUL64_LO; WORD_MUL64_HI; + WORD_SQR64_HI; WORD_SQR128_DIGIT0]] + @ (!extra_word_CONV);; + +let BIGNUM_MONTMUL_P521_CORE_EQUIV1 = time prove(equiv_goal1, + + 
REWRITE_TAC[MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI;SOME_FLAGS; + ALLPAIRS;ALL;NONOVERLAPPING_CLAUSES; + fst BIGNUM_MONTMUL_P521_CORE_EXEC; + fst BIGNUM_MONTMUL_P521_INTERM1_CORE_EXEC] THEN + REPEAT STRIP_TAC THEN + (** Initialize **) + EQUIV_INITIATE_TAC equiv_input_states THEN + REPEAT (FIRST_X_ASSUM BIGNUM_EXPAND_AND_DIGITIZE_TAC) THEN + ASM_PROPAGATE_DIGIT_EQS_FROM_EXPANDED_BIGNUM_TAC THEN + (* necessary to run ldr qs *) + COMBINE_READ_BYTES64_PAIRS_TAC THEN + + (* Start *) + EQUIV_STEPS_TAC actions + BIGNUM_MONTMUL_P521_CORE_EXEC + BIGNUM_MONTMUL_P521_INTERM1_CORE_EXEC THEN + + REPEAT_N 2 ENSURES_FINAL_STATE'_TAC THEN + (* Prove remaining clauses from the postcondition *) + ASM_REWRITE_TAC[] THEN + REPEAT CONJ_TAC THENL [ + (** SUBGOAL 1. Outputs **) + ASM_REWRITE_TAC[equiv_output_states;mk_equiv_regs;mk_equiv_bool_regs; + BIGNUM_EXPAND_CONV `bignum_from_memory (ptr,9) state`; + C_ARGUMENTS] THEN + REPEAT (HINT_EXISTS_REFL_TAC THEN ASM_REWRITE_TAC[]); + + (** SUBGOAL 2. Maychange left **) + DISCARD_ASSUMPTIONS_TAC (fun th -> free_in `s0':armstate` (concl th)) THEN + MONOTONE_MAYCHANGE_TAC; + + (** SUBGOAL 3. 
Maychange right **) + DISCARD_ASSUMPTIONS_TAC (fun th -> free_in `s0:armstate` (concl th)) THEN + MONOTONE_MAYCHANGE_TAC + ]);; + +extra_word_CONV := _org_extra_word_CONV;; + + + +(****************************************************************************** + The second program equivalence between the core part of intermediate + program and fully optimized program +******************************************************************************) + +let bignum_montmul_p521_neon_mc = + define_from_elf "bignum_montmul_p521_neon_mc" + "arm/p521/bignum_montmul_p521_neon.o";; + +let BIGNUM_MONTMUL_P521_NEON_EXEC = + ARM_MK_EXEC_RULE bignum_montmul_p521_neon_mc;; + +let bignum_montmul_p521_neon_core_mc_def, + bignum_montmul_p521_neon_core_mc, + BIGNUM_MONTMUL_P521_NEON_CORE_EXEC = + mk_sublist_of_mc "bignum_montmul_p521_neon_core_mc" + bignum_montmul_p521_neon_mc + (`20`,`LENGTH bignum_montmul_p521_neon_mc - 44`) + (fst BIGNUM_MONTMUL_P521_NEON_EXEC);; + + +let equiv_goal2 = mk_equiv_statement + `aligned 16 stackpointer /\ + ALL (nonoverlapping (z:int64,8 * 9)) + [(word pc,LENGTH bignum_montmul_p521_interm1_core_mc); + (word pc2,LENGTH bignum_montmul_p521_neon_core_mc)] /\ + ALL (nonoverlapping (stackpointer, 80)) + [(word pc,LENGTH bignum_montmul_p521_interm1_core_mc); + (word pc2,LENGTH bignum_montmul_p521_neon_core_mc); + (z,8 * 9); (x:int64,8 * 9); (y:int64,8 * 9)]` + equiv_input_states + equiv_output_states + bignum_montmul_p521_interm1_core_mc 0 + `MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; X22; X23; X24; X25; X26] ,, + MAYCHANGE [memory :> bignum(z,9); + memory :> bytes(stackpointer,80)]` + bignum_montmul_p521_neon_core_mc 0 + `MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; X22; X23; X24; X25; X26] ,, + MAYCHANGE [memory :> bignum(z,9); + memory :> bytes(stackpointer,80)]`;; + + +(* Line numbers from the fully optimized prog. to the intermediate prog. 
+ The script that prints this map is being privately maintained by aqjune-aws. *) + +let inst_map = [5;4;161;2;7;6;8;9;1;11;10;48;13;175;49;12;160;50;28;15;14;51;174;55;52;172;17;53;16;179;22;18;54;57;19;182;62;21;173;63;185;20;23;178;35;177;181;24;27;59;158;176;3;64;26;155;187;25;33;31;29;180;163;30;162;171;32;164;165;34;36;37;183;186;38;39;184;40;166;41;42;43;167;44;188;45;168;46;47;65;66;67;68;69;56;58;71;60;61;80;81;82;83;87;84;70;86;73;72;74;75;76;77;85;78;79;95;97;96;98;102;99;89;112;100;114;113;115;116;119;101;91;128;129;104;130;88;106;90;118;144;92;93;94;131;132;135;103;117;145;105;107;123;108;121;195;109;146;133;110;151;156;111;120;189;134;122;137;124;154;125;157;139;126;127;210;212;211;136;191;138;140;147;141;159;193;142;143;224;149;170;226;150;225;213;190;214;197;217;192;215;152;148;194;169;153;196;198;216;219;227;231;228;221;199;200;229;201;202;230;203;204;233;205;206;207;208;209;235;218;220;222;223;242;244;243;245;246;249;232;234;247;236;237;248;251;238;239;240;241;257;258;259;260;264;261;250;253;252;262;254;255;256;274;263;275;276;277;278;281;279;266;265;268;267;283;269;328;280;270;271;272;273;282;284;285;286;325;287;288;289;290;291;292;326;327;332;329;330;331;293;294;297;333;334;343;335;345;296;339;341;336;337;295;338;340;347;342;353;344;301;349;346;348;299;350;351;352;362;354;298;300;302;306;303;356;304;305;307;366;308;310;309;355;357;311;314;358;312;313;318;364;315;322;316;317;319;360;320;323;321;379;381;380;324;359;393;394;395;382;383;386;361;363;385;365;367;396;400;384;390;397;368;388;399;369;370;371;372;373;398;374;477;375;376;377;378;387;389;391;404;392;402;411;412;413;414;415;418;401;403;416;405;417;406;420;407;408;409;410;426;427;428;429;430;433;443;431;422;444;432;445;446;447;435;450;419;479;421;423;424;437;425;459;460;461;434;436;448;438;481;439;440;449;441;514;442;462;463;454;452;464;466;451;453;522;465;523;455;468;456;515;470;457;458;467;469;501;475;471;472;476;490;473;474;478;502;492;486;503;483;480;482;485;487;494;484;513;489;488;491;493
;495;496;497;516;498;505;499;534;509;500;504;524;518;506;548;536;507;549;508;517;525;519;550;510;526;535;520;539;540;521;511;527;551;537;538;529;531;528;512;562;530;552;532;570;541;542;543;566;567;553;544;555;545;546;533;561;547;592;568;554;556;578;557;558;579;563;580;593;559;564;565;581;582;589;574;590;583;572;603;569;571;560;594;573;584;575;602;607;585;591;632;595;617;618;620;633;586;630;621;608;604;609;576;605;587;577;606;596;597;588;613;598;619;599;611;610;600;601;612;622;614;624;631;626;623;615;625;635;641;616;634;637;636;627;642;638;643;628;639;629;644;647;646;645;648;640;649;650;651;663;667;668;652;653;660;654;655;661;656;657;658;662;659;664;665;666];; + +(* (state number, (equation, fresh var)) *) +let state_to_abbrevs: (int * thm) list ref = ref [];; + +let BIGNUM_MONTMUL_P521_CORE_EQUIV2 = time prove( + equiv_goal2, + + REWRITE_TAC[MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI;SOME_FLAGS; + ALLPAIRS;ALL;NONOVERLAPPING_CLAUSES; + fst BIGNUM_MONTMUL_P521_INTERM1_CORE_EXEC; + fst BIGNUM_MONTMUL_P521_NEON_CORE_EXEC] THEN + REPEAT STRIP_TAC THEN + (** Initialize **) + EQUIV_INITIATE_TAC equiv_input_states THEN + REPEAT (FIRST_X_ASSUM BIGNUM_EXPAND_AND_DIGITIZE_TAC) THEN + ASM_PROPAGATE_DIGIT_EQS_FROM_EXPANDED_BIGNUM_TAC THEN + (* necessary to run ldr qs *) + COMBINE_READ_BYTES64_PAIRS_TAC THEN + + (* Left *) + ARM_STEPS'_AND_ABBREV_TAC BIGNUM_MONTMUL_P521_INTERM1_CORE_EXEC + (1--(List.length inst_map)) state_to_abbrevs THEN + + (* Right *) + ARM_STEPS'_AND_REWRITE_TAC BIGNUM_MONTMUL_P521_NEON_CORE_EXEC + (1--(List.length inst_map)) inst_map state_to_abbrevs THEN + + REPEAT_N 2 ENSURES_FINAL_STATE'_TAC THEN + (* Prove remaining clauses from the postcondition *) + ASM_REWRITE_TAC[] THEN + REPEAT CONJ_TAC THENL [ + (** SUBGOAL 1. Outputs **) + ASM_REWRITE_TAC[equiv_output_states;mk_equiv_regs;mk_equiv_bool_regs; + BIGNUM_EXPAND_CONV `bignum_from_memory (ptr,9) state`; + C_ARGUMENTS] THEN + REPEAT (HINT_EXISTS_REFL_TAC THEN ASM_REWRITE_TAC[]); + + (** SUBGOAL 2. 
Maychange left **) + DISCARD_ASSUMPTIONS_TAC (fun th -> free_in `s0':armstate` (concl th)) THEN + MONOTONE_MAYCHANGE_TAC; + + (** SUBGOAL 3. Maychange right **) + DISCARD_ASSUMPTIONS_TAC (fun th -> free_in `s0:armstate` (concl th)) THEN + MONOTONE_MAYCHANGE_TAC + ]);; + + +(****************************************************************************** + Use transitivity of two program equivalences to prove end-to-end + correctness +******************************************************************************) + +let equiv_goal = mk_equiv_statement + `aligned 16 stackpointer /\ + ALL (nonoverlapping (z:int64,8 * 9)) + [(word pc,LENGTH bignum_montmul_p521_core_mc); + (word pc2,LENGTH bignum_montmul_p521_neon_core_mc)] /\ + ALL (nonoverlapping (stackpointer, 80)) + [(word pc,LENGTH bignum_montmul_p521_core_mc); + (word pc2,LENGTH bignum_montmul_p521_neon_core_mc); + (z,8 * 9); (x:int64,8 * 9); (y:int64,8 * 9)]` + equiv_input_states + equiv_output_states + bignum_montmul_p521_core_mc 0 + `MAYCHANGE [PC; X3; X4; X5; X6; X7; X8; X9; + X10; X11; X12; X13; X14; X15; X16; X17; X19; + X20; X21; X22; X23; X24; X25; X26] ,, + MAYCHANGE SOME_FLAGS ,, + MAYCHANGE [memory :> bignum(z,9); + memory :> bytes(stackpointer,80)]` + bignum_montmul_p521_neon_core_mc 0 + `MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; X22; X23; X24; X25; X26] ,, + MAYCHANGE [memory :> bignum(z,9); + memory :> bytes(stackpointer,80)]`;; + +let equiv_output_states_TRANS = prove( + `!s s2 s' z stackpointer. + equiv_output_states (s,s') z stackpointer/\ + equiv_output_states (s',s2) z stackpointer + ==> equiv_output_states (s,s2) z stackpointer`, + MESON_TAC[equiv_output_states]);; + +let BIGNUM_MONTMUL_P521_CORE_EQUIV = time prove(equiv_goal, + REPEAT STRIP_TAC THEN + (* To prove the goal, show that there exists an empty slot in the memory + which can locate bignum_montmul_p521_interm1_core_mc. *) + SUBGOAL_THEN + `?pc3. 
+ ALL (nonoverlapping (z:int64,8 * 9)) + [(word pc:int64,LENGTH bignum_montmul_p521_core_mc); + (word pc3:int64,LENGTH bignum_montmul_p521_interm1_core_mc)] /\ + ALL (nonoverlapping (z:int64,8 * 9)) + [(word pc3:int64,LENGTH bignum_montmul_p521_interm1_core_mc); + (word pc2:int64,LENGTH bignum_montmul_p521_neon_core_mc)] /\ + // Input buffers and the intermediate program don't alias + ALL (nonoverlapping + (word pc3:int64, LENGTH bignum_montmul_p521_interm1_core_mc)) + [x,8 * 9; y,8 * 9; stackpointer,80] /\ + 4 divides val (word pc3:int64)` + MP_TAC THENL [ + REPEAT (FIRST_X_ASSUM MP_TAC) THEN + ASM_REWRITE_TAC + [ALL;NONOVERLAPPING_CLAUSES; + fst BIGNUM_MONTMUL_P521_INTERM1_CORE_EXEC; + fst BIGNUM_MONTMUL_P521_NEON_CORE_EXEC; + fst BIGNUM_MONTMUL_P521_CORE_EXEC;GSYM CONJ_ASSOC] THEN + REPEAT STRIP_TAC THEN ASM_REWRITE_TAC[] THEN POP_ASSUM_LIST (K ALL_TAC) THEN + FIND_HOLE_TAC; + + ALL_TAC + ] THEN + STRIP_TAC THEN + + (* instantiate first equiv *) + ENSURES2_TRANS_TAC BIGNUM_MONTMUL_P521_CORE_EQUIV1 BIGNUM_MONTMUL_P521_CORE_EQUIV2 THEN + + (* break 'ALL nonoverlapping' in assumptions *) + RULE_ASSUM_TAC (REWRITE_RULE[ + ALLPAIRS;ALL; + fst BIGNUM_MONTMUL_P521_CORE_EXEC; + fst BIGNUM_MONTMUL_P521_NEON_CORE_EXEC; + fst BIGNUM_MONTMUL_P521_INTERM1_CORE_EXEC; + NONOVERLAPPING_CLAUSES]) THEN + REPEAT SPLIT_FIRST_CONJ_ASSUM_TAC THEN + + MATCH_MP_TAC ENSURES2_WEAKEN THEN + REWRITE_TAC[] THEN + REPEAT CONJ_TAC THENL [ + REPEAT STRIP_TAC THEN ASM_REWRITE_TAC[] THEN + REWRITE_TAC[TAUT `(p /\ q /\ r) /\ p /\ q /\ r' <=> p /\ q /\ r /\ r'`] THEN + EXISTS_TAC + `write (memory :> bytelist + (word pc3,LENGTH bignum_montmul_p521_interm1_core_mc)) + bignum_montmul_p521_interm1_core_mc + (write PC (word pc3) s')` THEN + PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTMUL_P521_INTERM1_CORE_EXEC THENL [ + UNDISCH_TAC `equiv_input_states (s,s') x y z stackpointer` THEN + REWRITE_TAC[equiv_input_states;C_ARGUMENTS;BIGNUM_FROM_MEMORY_BYTES; + fst BIGNUM_MONTMUL_P521_INTERM1_CORE_EXEC] THEN + 
STRIP_TAC THEN ASM_REWRITE_TAC[] THEN + REPEAT (TRY HINT_EXISTS_REFL_TAC THEN + PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTMUL_P521_INTERM1_CORE_EXEC); + + UNDISCH_TAC `equiv_input_states (s,s') x y z stackpointer` THEN + REWRITE_TAC[equiv_input_states;C_ARGUMENTS;BIGNUM_FROM_MEMORY_BYTES; + fst BIGNUM_MONTMUL_P521_INTERM1_CORE_EXEC] THEN + STRIP_TAC THEN ASM_REWRITE_TAC[] THEN + REPEAT (TRY HINT_EXISTS_REFL_TAC THEN + PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTMUL_P521_INTERM1_CORE_EXEC); + ]; + + REPEAT GEN_TAC THEN STRIP_TAC THEN + ASM_REWRITE_TAC[] THEN ASM_MESON_TAC[equiv_output_states_TRANS]; + + SUBSUMED_MAYCHANGE_TAC + ]);; + + + +(****************************************************************************** + Inducing BIGNUM_MONTMUL_P521_NEON_SUBROUTINE_CORRECT + from BIGNUM_MONTMUL_P521_CORE_CORRECT +******************************************************************************) + +(* Prove BIGNUM_MONTMUL_P521_CORE_CORRECT_N first *) + +let event_n_at_pc_goal = mk_eventually_n_at_pc_statement + `ALL (nonoverlapping + (word pc:int64, LENGTH + (APPEND bignum_montmul_p521_core_mc barrier_inst_bytes))) + [(z:int64,8 * 9); (stackpointer:int64,80)] /\ + aligned 16 stackpointer` + [`z:int64`;`x:int64`;`y:int64`] (*pc_mc_ofs*)0 + bignum_montmul_p521_core_mc (*pc_ofs*)0 + `\s0. C_ARGUMENTS [z;x;y] s0 /\ read SP s0 = stackpointer`;; + +let BIGNUM_MONTMUL_P521_EVENTUALLY_N_AT_PC = time prove(event_n_at_pc_goal, + + REWRITE_TAC[LENGTH_APPEND;fst BIGNUM_MONTMUL_P521_CORE_EXEC; + BARRIER_INST_BYTES_LENGTH] THEN + REWRITE_TAC[eventually_n_at_pc;ALL;NONOVERLAPPING_CLAUSES;C_ARGUMENTS] THEN + SUBGOAL_THEN `4 divides (LENGTH bignum_montmul_p521_core_mc)` + (fun th -> REWRITE_TAC[MATCH_MP aligned_bytes_loaded_append th; + fst BIGNUM_MONTMUL_P521_CORE_EXEC]) THENL [ + REWRITE_TAC[fst BIGNUM_MONTMUL_P521_CORE_EXEC] + THEN CONV_TAC NUM_DIVIDES_CONV + THEN NO_TAC; + ALL_TAC] THEN + REPEAT GEN_TAC THEN STRIP_TAC THEN + (* now start..! 
*) + X_GEN_TAC `s0:armstate` THEN GEN_TAC THEN STRIP_TAC THEN + (* eventually ==> eventually_n *) + PROVE_EVENTUALLY_IMPLIES_EVENTUALLY_N_TAC BIGNUM_MONTMUL_P521_CORE_EXEC);; + + +let BIGNUM_MONTMUL_P521_CORE_CORRECT_N = + prove_correct_n + BIGNUM_MONTMUL_P521_EXEC + BIGNUM_MONTMUL_P521_CORE_EXEC + BIGNUM_MONTMUL_P521_CORE_CORRECT + BIGNUM_MONTMUL_P521_EVENTUALLY_N_AT_PC;; + + +(* This theorem is a copy of BIGNUM_MONTMUL_P521_CORE_CORRECT with + - 'pc' replaced with 'pc2' + - LENGTH of bignum_montmul_p521_core_mc replaced with + bignum_montmul_p521_neon_core_mc + - The MAYCHANGE set replaced with the Neon version's one *) +let BIGNUM_MONTMUL_P521_NEON_CORE_CORRECT = prove + (`!z x y a b pc2 stackpointer. + aligned 16 stackpointer /\ + ALL (nonoverlapping (stackpointer,80)) + [(word pc2,LENGTH bignum_montmul_p521_neon_core_mc); (z,8 * 9); + (x,8 * 9); (y,8 * 9)] /\ + nonoverlapping (z,8 * 9) (word pc2,LENGTH bignum_montmul_p521_neon_core_mc) + ==> ensures arm + (\s. aligned_bytes_loaded s (word pc2) bignum_montmul_p521_neon_core_mc /\ + read PC s = word(pc2) /\ + read SP s = stackpointer /\ + C_ARGUMENTS [z; x; y] s /\ + bignum_from_memory (x,9) s = a /\ + bignum_from_memory (y,9) s = b) + (\s. read PC s = word (pc2 + LENGTH bignum_montmul_p521_neon_core_mc) /\ + (a < p_521 /\ b < p_521 + ==> bignum_from_memory (z,9) s = + (inverse_mod p_521 (2 EXP 576) * a * b) MOD p_521)) + (MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; X22; X23; X24; X25; X26] ,, + MAYCHANGE [memory :> bignum(z,9); + memory :> bytes(stackpointer,80)])`, + REPEAT GEN_TAC THEN + (* Prepare pc for the original program. *) + SUBGOAL_THEN + `?pc.
+ ALL (nonoverlapping (word pc, + LENGTH (APPEND bignum_montmul_p521_core_mc barrier_inst_bytes))) + [(stackpointer:int64,80);(z:int64,8*9);(x:int64,8 * 9);(y:int64,8 * 9)] /\ + 4 divides val (word pc:int64)` MP_TAC THENL [ + REWRITE_TAC[fst BIGNUM_MONTMUL_P521_CORE_EXEC; + NONOVERLAPPING_CLAUSES;ALL; + LENGTH_APPEND;BARRIER_INST_BYTES_LENGTH] THEN + time FIND_HOLE_TAC; + + (** SUBGOAL 2 **) + ALL_TAC + ] THEN + + REPEAT_N 2 STRIP_TAC THEN + + VCGEN_EQUIV_TAC BIGNUM_MONTMUL_P521_CORE_EQUIV BIGNUM_MONTMUL_P521_CORE_CORRECT_N + [fst BIGNUM_MONTMUL_P521_CORE_EXEC;fst BIGNUM_MONTMUL_P521_NEON_CORE_EXEC] THEN + + (* unfold definitions that may block tactics *) + RULE_ASSUM_TAC (REWRITE_RULE[ALL;NONOVERLAPPING_CLAUSES; + fst BIGNUM_MONTMUL_P521_EXEC; + fst BIGNUM_MONTMUL_P521_NEON_EXEC]) THEN + REPEAT SPLIT_FIRST_CONJ_ASSUM_TAC THEN + REWRITE_TAC[C_ARGUMENTS;BIGNUM_FROM_MEMORY_BYTES] THEN + REPEAT CONJ_TAC THENL [ + (** SUBGOAL 1. Precond **) + X_GEN_TAC `s2:armstate` THEN REPEAT STRIP_TAC THEN + SUBGOAL_THEN `4 divides val (word pc2:int64)` ASSUME_TAC THENL + [ FIRST_ASSUM (fun th -> + MP_TAC th THEN REWRITE_TAC[DIVIDES_4_VAL_WORD_64;aligned_bytes_loaded_word] + THEN METIS_TAC[]) THEN NO_TAC; ALL_TAC ] THEN + ASM_REWRITE_TAC[equiv_input_states;C_ARGUMENTS] THEN + EXISTS_TAC + `write (memory :> bytelist + (word pc,LENGTH (APPEND bignum_montmul_p521_core_mc barrier_inst_bytes))) + (APPEND bignum_montmul_p521_core_mc barrier_inst_bytes) + (write PC (word pc) s2)` THEN + (* Expand variables appearing in the equiv relation *) + REPEAT CONJ_TAC THEN + TRY (PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTMUL_P521_CORE_EXEC) THEN + (* Now has only one subgoal: the '?a. ...' part of input equivalence! *) + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN + REPEAT (HINT_EXISTS_REFL_TAC THEN + PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTMUL_P521_CORE_EXEC); + + (** SUBGOAL 2. 
Postcond **) + MESON_TAC[equiv_output_states;BIGNUM_FROM_MEMORY_BYTES; + fst BIGNUM_MONTMUL_P521_NEON_CORE_EXEC]; + + (** SUBGOAL 3. Frame **) + MESON_TAC[MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI] + ]);; + + +let BIGNUM_MONTMUL_P521_NEON_CORRECT = prove + (`!z x y a b pc stackpointer. + aligned 16 stackpointer /\ + ALL (nonoverlapping (stackpointer,80)) + [(word pc,LENGTH bignum_montmul_p521_neon_mc); (z,8 * 9); + (x,8 * 9); (y,8 * 9)] /\ + nonoverlapping (z,8 * 9) (word pc,LENGTH bignum_montmul_p521_neon_mc) + ==> ensures arm + (\s. aligned_bytes_loaded s (word pc) bignum_montmul_p521_neon_mc /\ + read PC s = word(pc+20) /\ + read SP s = stackpointer /\ + C_ARGUMENTS [z; x; y] s /\ + bignum_from_memory (x,9) s = a /\ + bignum_from_memory (y,9) s = b) + (\s. read PC s = word (pc + (20 + LENGTH bignum_montmul_p521_neon_core_mc)) /\ + (a < p_521 /\ b < p_521 + ==> bignum_from_memory (z,9) s = + (inverse_mod p_521 (2 EXP 576) * a * b) MOD p_521)) + (MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; X22; X23; X24; X25; X26] ,, + MAYCHANGE [memory :> bignum(z,9); + memory :> bytes(stackpointer,80)])`, + + ARM_SUB_LIST_OF_MC_TAC BIGNUM_MONTMUL_P521_NEON_CORE_CORRECT + bignum_montmul_p521_neon_core_mc_def + [fst BIGNUM_MONTMUL_P521_NEON_EXEC; + fst BIGNUM_MONTMUL_P521_NEON_CORE_EXEC]);; + + +let BIGNUM_MONTMUL_P521_NEON_SUBROUTINE_CORRECT = prove + (`!z x y a b pc stackpointer returnaddress. + aligned 16 stackpointer /\ + nonoverlapping (z,8 * 9) (word pc,LENGTH bignum_montmul_p521_neon_mc) /\ + ALL (nonoverlapping (word_sub stackpointer (word 144),144)) + [(word pc,LENGTH bignum_montmul_p521_neon_mc); (x,8 * 9); (y,8 * 9); + (z,8 * 9)] + ==> ensures arm + (\s. aligned_bytes_loaded s (word pc) bignum_montmul_p521_neon_mc /\ + read PC s = word pc /\ + read SP s = stackpointer /\ + read X30 s = returnaddress /\ + C_ARGUMENTS [z; x; y] s /\ + bignum_from_memory (x,9) s = a /\ + bignum_from_memory (y,9) s = b) + (\s. 
read PC s = returnaddress /\ + (a < p_521 /\ b < p_521 + ==> bignum_from_memory (z,9) s = + (inverse_mod p_521 (2 EXP 576) * a * b) MOD p_521)) + (MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [memory :> bytes(z,8 * 9); + memory :> bytes(word_sub stackpointer (word 144),144)])`, + let th = CONV_RULE (ONCE_DEPTH_CONV NUM_ADD_CONV) + (REWRITE_RULE [fst BIGNUM_MONTMUL_P521_NEON_CORE_EXEC; + fst BIGNUM_MONTMUL_P521_NEON_EXEC] + BIGNUM_MONTMUL_P521_NEON_CORRECT) in + REWRITE_TAC[fst BIGNUM_MONTMUL_P521_NEON_EXEC] THEN + ARM_ADD_RETURN_STACK_TAC + BIGNUM_MONTMUL_P521_NEON_EXEC th + `[X19;X20;X21;X22;X23;X24;X25;X26]` 144);; diff --git a/arm/proofs/bignum_montsqr_p256_neon.ml b/arm/proofs/bignum_montsqr_p256_neon.ml index 544d2758..dec00e8d 100644 --- a/arm/proofs/bignum_montsqr_p256_neon.ml +++ b/arm/proofs/bignum_montsqr_p256_neon.ml @@ -183,11 +183,10 @@ let bignum_montsqr_p256_interm1_core_mc_def, let equiv_input_states = new_definition `!s1 s1' x z. (equiv_input_states:(armstate#armstate)->int64->int64->bool) (s1,s1') x z <=> - (?a. - C_ARGUMENTS [z; x] s1 /\ + (C_ARGUMENTS [z; x] s1 /\ C_ARGUMENTS [z; x] s1' /\ - bignum_from_memory (x,4) s1 = a /\ - bignum_from_memory (x,4) s1' = a)`;; + ?a. bignum_from_memory (x,4) s1 = a /\ + bignum_from_memory (x,4) s1' = a)`;; let equiv_output_states = new_definition `!s1 s1' z. 
@@ -332,7 +331,7 @@ extra_word_CONV := WORD_MUL64_LO]] @ (!extra_word_CONV);; -let BIGNUM_MONTSQR_P256_CORE_EQUIV1 = prove(equiv_goal1, +let BIGNUM_MONTSQR_P256_CORE_EQUIV1 = time prove(equiv_goal1, REWRITE_TAC[MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI;SOME_FLAGS; ALLPAIRS;ALL;NONOVERLAPPING_CLAUSES; @@ -382,7 +381,7 @@ let BIGNUM_MONTSQR_P256_CORE_EQUIV1 = prove(equiv_goal1, ASM_REWRITE_TAC[equiv_output_states;mk_equiv_regs;mk_equiv_bool_regs; BIGNUM_EXPAND_CONV `bignum_from_memory (ptr,4) state`; C_ARGUMENTS] THEN - REPEAT (HINT_EXISTS_REFL_TAC THEN ASM_REWRITE_TAC[]); + REPEAT (TRY HINT_EXISTS_REFL_TAC THEN ASM_REWRITE_TAC[]); (** SUBGOAL 2. Maychange left **) DISCARD_ASSUMPTIONS_TAC (fun th -> free_in `s0':armstate` (concl th)) THEN @@ -471,7 +470,7 @@ let BIGNUM_MONTSQR_P256_CORE_EQUIV2 = prove( ASM_REWRITE_TAC[equiv_output_states;mk_equiv_regs;mk_equiv_bool_regs; BIGNUM_EXPAND_CONV `bignum_from_memory (ptr,4) state`; C_ARGUMENTS] THEN - REPEAT (HINT_EXISTS_REFL_TAC THEN ASM_REWRITE_TAC[]); + REPEAT (TRY HINT_EXISTS_REFL_TAC THEN ASM_REWRITE_TAC[]); (** SUBGOAL 2. 
Maychange left **) DISCARD_ASSUMPTIONS_TAC (fun th -> free_in `s0':armstate` (concl th)) THEN @@ -538,12 +537,7 @@ let BIGNUM_MONTSQR_P256_CORE_EQUIV = prove(equiv_goal, ] THEN STRIP_TAC THEN - FIRST_X_ASSUM (fun th -> ASSUME_TAC (SPEC_ALL (MATCH_MP BIGNUM_MONTSQR_P256_CORE_EQUIV1 th))) THEN - FIRST_X_ASSUM (fun th -> ASSUME_TAC (SPEC_ALL (MATCH_MP BIGNUM_MONTSQR_P256_CORE_EQUIV2 th))) THEN - FIRST_X_ASSUM (fun c1 -> - FIRST_X_ASSUM (fun c2 -> - MP_TAC (REWRITE_RULE [] (MATCH_MP ENSURES2_CONJ2 (CONJ c1 c2))) - )) THEN + ENSURES2_TRANS_TAC BIGNUM_MONTSQR_P256_CORE_EQUIV1 BIGNUM_MONTSQR_P256_CORE_EQUIV2 THEN (* break 'ALL nonoverlapping' in assumptions *) RULE_ASSUM_TAC (REWRITE_RULE[ @@ -564,40 +558,20 @@ let BIGNUM_MONTSQR_P256_CORE_EQUIV = prove(equiv_goal, (word pc3,LENGTH bignum_montsqr_p256_interm1_core_mc)) bignum_montsqr_p256_interm1_core_mc (write PC (word pc3) s')` THEN - REPEAT CONJ_TAC THEN (TRY ( - REPEAT COMPONENT_READ_OVER_WRITE_LHS_TAC THEN - ASM_REWRITE_TAC[] THEN NO_TAC - )) THENL [ - REWRITE_TAC[aligned_bytes_loaded;bytes_loaded] THEN - RULE_ASSUM_TAC (REWRITE_RULE[aligned_bytes_loaded]) THEN - ASM_REWRITE_TAC[] THEN - MATCH_MP_TAC READ_OVER_WRITE_MEMORY_BYTELIST THEN - REWRITE_TAC[fst BIGNUM_MONTSQR_P256_INTERM1_CORE_EXEC] THEN - ARITH_TAC; - + PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTSQR_P256_INTERM1_CORE_EXEC THENL [ UNDISCH_TAC `equiv_input_states (s,s') x z` THEN REWRITE_TAC[equiv_input_states;C_ARGUMENTS;BIGNUM_FROM_MEMORY_BYTES; fst BIGNUM_MONTSQR_P256_INTERM1_CORE_EXEC] THEN STRIP_TAC THEN ASM_REWRITE_TAC[] THEN - EXISTS_TAC `a:num` THEN - REWRITE_TAC[] THEN - REPEAT CONJ_TAC THENL [ - REPEAT COMPONENT_READ_OVER_WRITE_LHS_TAC THEN ASM_REWRITE_TAC[]; - REPEAT COMPONENT_READ_OVER_WRITE_LHS_TAC THEN ASM_REWRITE_TAC[]; - EXPAND_RHS_TAC THEN READ_OVER_WRITE_ORTHOGONAL_TAC; - ]; + REPEAT (TRY HINT_EXISTS_REFL_TAC THEN + PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTSQR_P256_INTERM1_CORE_EXEC); UNDISCH_TAC `equiv_input_states (s,s') x z` THEN 
REWRITE_TAC[equiv_input_states;C_ARGUMENTS;BIGNUM_FROM_MEMORY_BYTES; fst BIGNUM_MONTSQR_P256_INTERM1_CORE_EXEC] THEN STRIP_TAC THEN ASM_REWRITE_TAC[] THEN - EXISTS_TAC `a:num` THEN - REWRITE_TAC[] THEN - REPEAT CONJ_TAC THENL [ - REPEAT COMPONENT_READ_OVER_WRITE_LHS_TAC THEN ASM_REWRITE_TAC[]; - REPEAT COMPONENT_READ_OVER_WRITE_LHS_TAC THEN ASM_REWRITE_TAC[]; - EXPAND_RHS_TAC THEN READ_OVER_WRITE_ORTHOGONAL_TAC; - ] + REPEAT (TRY HINT_EXISTS_REFL_TAC THEN + PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTSQR_P256_INTERM1_CORE_EXEC); ]; REPEAT GEN_TAC THEN STRIP_TAC THEN @@ -621,7 +595,6 @@ let event_n_at_pc_goal = mk_eventually_n_at_pc_statement `\s0. C_ARGUMENTS [z;x] s0`;; let BIGNUM_MONTSQR_P256_EVENTUALLY_N_AT_PC = prove(event_n_at_pc_goal, - REWRITE_TAC[LENGTH_APPEND;fst BIGNUM_MONTSQR_P256_CORE_EXEC;BARRIER_INST_BYTES_LENGTH] THEN REWRITE_TAC[eventually_n_at_pc;ALL;NONOVERLAPPING_CLAUSES;C_ARGUMENTS] THEN SUBGOAL_THEN `4 divides (LENGTH bignum_montsqr_p256_core_mc)` @@ -694,7 +667,7 @@ let BIGNUM_MONTSQR_P256_NEON_CORE_CORRECT = prove( [ FIRST_ASSUM (fun th -> MP_TAC th THEN REWRITE_TAC[DIVIDES_4_VAL_WORD_64;aligned_bytes_loaded_word] THEN METIS_TAC[]) THEN NO_TAC; ALL_TAC ] THEN - ASM_REWRITE_TAC[equiv_input_states] THEN + ASM_REWRITE_TAC[equiv_input_states;C_ARGUMENTS] THEN EXISTS_TAC `write (memory :> bytelist (word pc,LENGTH (APPEND bignum_montsqr_p256_core_mc barrier_inst_bytes))) @@ -704,10 +677,9 @@ let BIGNUM_MONTSQR_P256_NEON_CORE_CORRECT = prove( REPEAT CONJ_TAC THEN TRY (PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTSQR_P256_CORE_EXEC) THEN (* Now has only one subgoal: the equivalence! *) - REWRITE_TAC[C_ARGUMENTS;BIGNUM_FROM_MEMORY_BYTES] THEN - MAP_EVERY EXISTS_TAC [`a:num`] THEN - REPEAT CONJ_TAC THEN - TRY (PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTSQR_P256_CORE_EXEC); + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN + REPEAT (TRY HINT_EXISTS_REFL_TAC THEN + PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTSQR_P256_CORE_EXEC); (** SUBGOAL 2. 
Postcond **) MESON_TAC[equiv_output_states;BIGNUM_FROM_MEMORY_BYTES; @@ -818,7 +790,7 @@ let BIGNUM_AMONTSQR_P256_NEON_CORE_CORRECT = prove( [ FIRST_ASSUM (fun th -> MP_TAC th THEN REWRITE_TAC[DIVIDES_4_VAL_WORD_64;aligned_bytes_loaded_word] THEN METIS_TAC[]) THEN NO_TAC; ALL_TAC ] THEN - ASM_REWRITE_TAC[equiv_input_states] THEN + ASM_REWRITE_TAC[equiv_input_states;C_ARGUMENTS] THEN EXISTS_TAC `write (memory :> bytelist (word pc,LENGTH (APPEND bignum_montsqr_p256_core_mc barrier_inst_bytes))) @@ -828,10 +800,9 @@ let BIGNUM_AMONTSQR_P256_NEON_CORE_CORRECT = prove( REPEAT CONJ_TAC THEN TRY (PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTSQR_P256_CORE_EXEC) THEN (* Now has only one subgoal: the equivalence! *) - REWRITE_TAC[C_ARGUMENTS;BIGNUM_FROM_MEMORY_BYTES] THEN - MAP_EVERY EXISTS_TAC [`a:num`] THEN - REPEAT CONJ_TAC THEN - TRY (PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTSQR_P256_CORE_EXEC); + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN + REPEAT (TRY HINT_EXISTS_REFL_TAC THEN + PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTSQR_P256_CORE_EXEC); (** SUBGOAL 2. Postcond **) MESON_TAC[equiv_output_states;BIGNUM_FROM_MEMORY_BYTES; diff --git a/arm/proofs/bignum_montsqr_p384_neon.ml b/arm/proofs/bignum_montsqr_p384_neon.ml index 15e98c59..1a7739b6 100644 --- a/arm/proofs/bignum_montsqr_p384_neon.ml +++ b/arm/proofs/bignum_montsqr_p384_neon.ml @@ -352,11 +352,10 @@ let bignum_montsqr_p384_interm1_core_mc_def, let equiv_input_states = new_definition `!s1 s1' x z. (equiv_input_states:(armstate#armstate)->int64->int64->bool) (s1,s1') x z <=> - (?a. - C_ARGUMENTS [z; x] s1 /\ + (C_ARGUMENTS [z; x] s1 /\ C_ARGUMENTS [z; x] s1' /\ - bignum_from_memory (x,6) s1 = a /\ - bignum_from_memory (x,6) s1' = a)`;; + ?a. bignum_from_memory (x,6) s1 = a /\ + bignum_from_memory (x,6) s1' = a)`;; let equiv_output_states = new_definition `!s1 s1' z. 
@@ -404,7 +403,7 @@ let equiv_goal1 = mk_equiv_statement let _org_extra_word_CONV = !extra_word_CONV;; extra_word_CONV := [GEN_REWRITE_CONV I [WORD_BITMANIP_SIMP_LEMMAS; WORD_MUL64_LO; WORD_MUL64_HI; - WORD_SQR64_LO2]] + WORD_SQR128_DIGIT0]] @ (!extra_word_CONV);; let BIGNUM_MONTSQR_P384_CORE_EQUIV1 = time prove(equiv_goal1, @@ -592,15 +591,9 @@ let BIGNUM_MONTSQR_P384_CORE_EQUIV = time prove(equiv_goal, ALL_TAC ] THEN - DISCH_THEN (CHOOSE_THEN (DESTRUCT_TAC "h1 h2 h3 h4")) THEN - + STRIP_TAC THEN - FIRST_X_ASSUM (fun th -> ASSUME_TAC (SPEC_ALL (MATCH_MP BIGNUM_MONTSQR_P384_CORE_EQUIV1 th))) THEN - FIRST_X_ASSUM (fun th -> ASSUME_TAC (SPEC_ALL (MATCH_MP BIGNUM_MONTSQR_P384_CORE_EQUIV2 th))) THEN - FIRST_X_ASSUM (fun c1 -> - FIRST_X_ASSUM (fun c2 -> - MP_TAC (REWRITE_RULE [] (MATCH_MP ENSURES2_CONJ2 (CONJ c1 c2))) - )) THEN + ENSURES2_TRANS_TAC BIGNUM_MONTSQR_P384_CORE_EQUIV1 BIGNUM_MONTSQR_P384_CORE_EQUIV2 THEN (* break 'ALL nonoverlapping' in assumptions *) RULE_ASSUM_TAC (REWRITE_RULE[ @@ -626,17 +619,15 @@ let BIGNUM_MONTSQR_P384_CORE_EQUIV = time prove(equiv_goal, REWRITE_TAC[equiv_input_states;C_ARGUMENTS;BIGNUM_FROM_MEMORY_BYTES; fst BIGNUM_MONTSQR_P384_INTERM1_CORE_EXEC] THEN STRIP_TAC THEN ASM_REWRITE_TAC[] THEN - MAP_EVERY EXISTS_TAC [`a:num`] THEN - REWRITE_TAC[] THEN - PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTSQR_P384_INTERM1_CORE_EXEC; + REPEAT (TRY HINT_EXISTS_REFL_TAC THEN + PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTSQR_P384_INTERM1_CORE_EXEC); UNDISCH_TAC `equiv_input_states (s,s') x z` THEN REWRITE_TAC[equiv_input_states;C_ARGUMENTS;BIGNUM_FROM_MEMORY_BYTES; fst BIGNUM_MONTSQR_P384_INTERM1_CORE_EXEC] THEN STRIP_TAC THEN ASM_REWRITE_TAC[] THEN - MAP_EVERY EXISTS_TAC [`a:num`] THEN - REWRITE_TAC[] THEN - PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTSQR_P384_INTERM1_CORE_EXEC + REPEAT (TRY HINT_EXISTS_REFL_TAC THEN + PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTSQR_P384_INTERM1_CORE_EXEC); ]; REPEAT GEN_TAC THEN STRIP_TAC THEN @@ -687,8 +678,7 @@ let 
BIGNUM_MONTSQR_P384_CORE_CORRECT_N = (* This theorem is a copy of BIGNUM_MONTSQR_P384_CORE_CORRECT, but with - 'pc' replaced with 'pc2' - - LENGTH of bignum_montsqr_p384_core_mc with - bignum_montsqr_p384_neon_core_m + - bignum_montsqr_p384_core_mc with bignum_montsqr_p384_neon_core_mc - The MAYCHANGE set replaced with the Neon version's one *) let BIGNUM_MONTSQR_P384_NEON_CORE_CORRECT = prove( @@ -744,7 +734,7 @@ let BIGNUM_MONTSQR_P384_NEON_CORE_CORRECT = prove( [ FIRST_ASSUM (fun th -> MP_TAC th THEN REWRITE_TAC[DIVIDES_4_VAL_WORD_64;aligned_bytes_loaded_word] THEN METIS_TAC[]) THEN NO_TAC; ALL_TAC ] THEN - ASM_REWRITE_TAC[equiv_input_states] THEN + ASM_REWRITE_TAC[equiv_input_states;C_ARGUMENTS] THEN EXISTS_TAC `write (memory :> bytelist (word pc,LENGTH (APPEND bignum_montsqr_p384_core_mc barrier_inst_bytes))) @@ -754,10 +744,9 @@ let BIGNUM_MONTSQR_P384_NEON_CORE_CORRECT = prove( REPEAT CONJ_TAC THEN TRY (PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTSQR_P384_CORE_EXEC) THEN (* Now has only one subgoal: the equivalence! *) - REWRITE_TAC[C_ARGUMENTS;BIGNUM_FROM_MEMORY_BYTES] THEN - MAP_EVERY EXISTS_TAC [`a:num`] THEN - REPEAT CONJ_TAC THEN - TRY (PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTSQR_P384_CORE_EXEC); + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN + REPEAT (TRY HINT_EXISTS_REFL_TAC THEN + PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTSQR_P384_CORE_EXEC); (** SUBGOAL 2. 
Postcond **) MESON_TAC[equiv_output_states;BIGNUM_FROM_MEMORY_BYTES; @@ -878,7 +867,7 @@ let BIGNUM_AMONTSQR_P384_NEON_CORE_CORRECT = prove( [ FIRST_ASSUM (fun th -> MP_TAC th THEN REWRITE_TAC[DIVIDES_4_VAL_WORD_64;aligned_bytes_loaded_word] THEN METIS_TAC[]) THEN NO_TAC; ALL_TAC ] THEN - ASM_REWRITE_TAC[equiv_input_states] THEN + ASM_REWRITE_TAC[equiv_input_states;C_ARGUMENTS] THEN EXISTS_TAC `write (memory :> bytelist (word pc,LENGTH (APPEND bignum_montsqr_p384_core_mc barrier_inst_bytes))) @@ -888,10 +877,9 @@ let BIGNUM_AMONTSQR_P384_NEON_CORE_CORRECT = prove( REPEAT CONJ_TAC THEN TRY (PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTSQR_P384_CORE_EXEC) THEN (* Now has only one subgoal: the equivalence! *) - REWRITE_TAC[C_ARGUMENTS;BIGNUM_FROM_MEMORY_BYTES] THEN - MAP_EVERY EXISTS_TAC [`a:num`] THEN - REPEAT CONJ_TAC THEN - TRY (PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTSQR_P384_CORE_EXEC); + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN + REPEAT (TRY HINT_EXISTS_REFL_TAC THEN + PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTSQR_P384_CORE_EXEC); (** SUBGOAL 2. Postcond **) MESON_TAC[equiv_output_states;BIGNUM_FROM_MEMORY_BYTES; diff --git a/arm/proofs/bignum_montsqr_p521.ml b/arm/proofs/bignum_montsqr_p521.ml index 45dedfe7..a34cfbee 100644 --- a/arm/proofs/bignum_montsqr_p521.ml +++ b/arm/proofs/bignum_montsqr_p521.ml @@ -448,6 +448,15 @@ let bignum_montsqr_p521_mc = define_assert_from_elf "bignum_montsqr_p521_mc" "ar let BIGNUM_MONTSQR_P521_EXEC = ARM_MK_EXEC_RULE bignum_montsqr_p521_mc;; +(* bignum_montsqr_p521_mc without callee-save register spills + ret. *) +let bignum_montsqr_p521_core_mc_def, + bignum_montsqr_p521_core_mc, + BIGNUM_MONTSQR_P521_CORE_EXEC = + mk_sublist_of_mc "bignum_montsqr_p521_core_mc" + bignum_montsqr_p521_mc + (`12`,`LENGTH bignum_montsqr_p521_mc - 28`) + (fst BIGNUM_MONTSQR_P521_EXEC);; + (* ------------------------------------------------------------------------- *) (* Proof. 
*) (* ------------------------------------------------------------------------- *) @@ -527,15 +536,15 @@ let lemma2 = prove ASM_SIMP_TAC[VAL_WORD_SUB_CASES; GSYM REAL_OF_NUM_SUB] THEN REAL_ARITH_TAC);; -let BIGNUM_MONTSQR_P521_CORRECT = time prove +let BIGNUM_MONTSQR_P521_CORE_CORRECT = time prove (`!z x n pc. - nonoverlapping (word pc,0x6b8) (z,8 * 9) + nonoverlapping (word pc,LENGTH bignum_montsqr_p521_core_mc) (z,8 * 9) ==> ensures arm - (\s. aligned_bytes_loaded s (word pc) bignum_montsqr_p521_mc /\ - read PC s = word(pc + 0xc) /\ + (\s. aligned_bytes_loaded s (word pc) bignum_montsqr_p521_core_mc /\ + read PC s = word(pc) /\ C_ARGUMENTS [z; x] s /\ bignum_from_memory (x,9) s = n) - (\s. read PC s = word (pc + 0x6a8) /\ + (\s. read PC s = word (pc + LENGTH bignum_montsqr_p521_core_mc) /\ (n < p_521 ==> bignum_from_memory (z,9) s = (inverse_mod p_521 (2 EXP 576) * n EXP 2) MOD p_521)) @@ -545,13 +554,14 @@ let BIGNUM_MONTSQR_P521_CORRECT = time prove MAYCHANGE [memory :> bignum(z,9)])`, MAP_EVERY X_GEN_TAC [`z:int64`; `x:int64`; `n:num`; `pc:num`] THEN - REWRITE_TAC[C_ARGUMENTS; C_RETURN; SOME_FLAGS; NONOVERLAPPING_CLAUSES] THEN + REWRITE_TAC[C_ARGUMENTS; C_RETURN; SOME_FLAGS; NONOVERLAPPING_CLAUSES; + fst BIGNUM_MONTSQR_P521_CORE_EXEC] THEN DISCH_THEN(REPEAT_TCL CONJUNCTS_THEN ASSUME_TAC) THEN (*** Globalize the n < p_521 assumption for simplicity's sake ***) ASM_CASES_TAC `n < p_521` THENL - [ASM_REWRITE_TAC[]; ARM_SIM_TAC BIGNUM_MONTSQR_P521_EXEC (1--423)] THEN + [ASM_REWRITE_TAC[]; ARM_SIM_TAC BIGNUM_MONTSQR_P521_CORE_EXEC (1--423)] THEN (*** Digitize, deduce the bound on the top word specifically ***) @@ -565,7 +575,7 @@ let BIGNUM_MONTSQR_P521_CORRECT = time prove (*** The 4x4 squaring of the top "half" ***) - ARM_ACCSTEPS_TAC BIGNUM_MONTSQR_P521_EXEC + ARM_ACCSTEPS_TAC BIGNUM_MONTSQR_P521_CORE_EXEC [5; 6; 13; 18; 19; 21; 22; 23; 24; 25; 27; 28; 29; 30; 31; 32; 33; 34; 35; 36; 37; 41; 42; 43; 44; 45; 46; 47; 48; 49; 50; 51; 52; 53; 54; 58; 59; 60; 61; 62; 
63; 64; 65; 66; 67] @@ -598,7 +608,7 @@ let BIGNUM_MONTSQR_P521_CORRECT = time prove (*** The complicated augmentation with the little word contribution ***) - ARM_ACCSTEPS_TAC BIGNUM_MONTSQR_P521_EXEC + ARM_ACCSTEPS_TAC BIGNUM_MONTSQR_P521_CORE_EXEC [70; 80; 88; 96; 104; 119; 127; 135; 142; 144] (68--144) THEN SUBGOAL_THEN @@ -778,7 +788,7 @@ let BIGNUM_MONTSQR_P521_CORRECT = time prove (*** Rotation of the high portion ***) - ARM_STEPS_TAC BIGNUM_MONTSQR_P521_EXEC (145--160) THEN + ARM_STEPS_TAC BIGNUM_MONTSQR_P521_CORE_EXEC (145--160) THEN ABBREV_TAC `htop:int64 = word_add (word_and sum_s80 (word 511)) (word_ushr sum_s144 9)` THEN @@ -850,7 +860,7 @@ let BIGNUM_MONTSQR_P521_CORRECT = time prove (*** Squaring of the lower "half" ***) - ARM_ACCSTEPS_TAC BIGNUM_MONTSQR_P521_EXEC + ARM_ACCSTEPS_TAC BIGNUM_MONTSQR_P521_CORE_EXEC [161; 162; 169; 174; 175; 177; 178; 179; 180; 181; 183; 184; 185; 186; 187; 188; 189; 190; 191; 192; 193; 197; 198; 199; 200; 201; 202; 203; 204; 205; 206; 207; 208; 209; 210; 214; 215; 216; 217; 218; 219; 220; 221; 222; 223] @@ -904,7 +914,7 @@ let BIGNUM_MONTSQR_P521_CORRECT = time prove REWRITE_TAC[LENGTH] THEN ARITH_TAC; ALL_TAC] THEN - ARM_ACCSTEPS_TAC BIGNUM_MONTSQR_P521_EXEC + ARM_ACCSTEPS_TAC BIGNUM_MONTSQR_P521_CORE_EXEC [225; 226; 229; 230; 233; 234; 237; 238; 241] (224--242) THEN SUBGOAL_THEN @@ -930,7 +940,7 @@ let BIGNUM_MONTSQR_P521_CORRECT = time prove (*** The cross-multiplication ***) - ARM_ACCSTEPS_TAC BIGNUM_MONTSQR_P521_EXEC + ARM_ACCSTEPS_TAC BIGNUM_MONTSQR_P521_CORE_EXEC [243; 244; 245; 246; 248; 250; 252; 254; 255; 256; 257; 258; 259; 260; 261; 262; 263; 264; 265; 271; 276; 278; 279; 285; 290; 292; 293; 294; 295; 296; 297; 303; 308; 310; 311; 312; @@ -968,7 +978,7 @@ let BIGNUM_MONTSQR_P521_CORRECT = time prove (*** Addition of the rotated cross-product to the running total ***) - ARM_ACCSTEPS_TAC BIGNUM_MONTSQR_P521_EXEC + ARM_ACCSTEPS_TAC BIGNUM_MONTSQR_P521_CORE_EXEC [364; 366; 369; 372; 376; 379; 383; 386; 391] 
(362--391) THEN MAP_EVERY ABBREV_TAC [`m0:int64 = word_subword @@ -1073,7 +1083,7 @@ let BIGNUM_MONTSQR_P521_CORRECT = time prove (*** Splitting up and stuffing 1 bits into the low part ***) - ARM_STEPS_TAC BIGNUM_MONTSQR_P521_EXEC (392--394) THEN + ARM_STEPS_TAC BIGNUM_MONTSQR_P521_CORE_EXEC (392--394) THEN RULE_ASSUM_TAC(REWRITE_RULE[GSYM WORD_AND_ASSOC; DIMINDEX_64; NUM_REDUCE_CONV `9 MOD 64`]) THEN REPEAT(FIRST_X_ASSUM(K ALL_TAC o check (vfree_in `h:num` o concl))) THEN @@ -1086,7 +1096,7 @@ let BIGNUM_MONTSQR_P521_CORRECT = time prove (*** The comparison in its direct condensed form ***) - ARM_ACCSTEPS_TAC BIGNUM_MONTSQR_P521_EXEC (395--397) (395--397) THEN + ARM_ACCSTEPS_TAC BIGNUM_MONTSQR_P521_CORE_EXEC (395--397) (395--397) THEN SUBGOAL_THEN `carry_s397 <=> 2 EXP 192 <= @@ -1101,7 +1111,7 @@ let BIGNUM_MONTSQR_P521_CORRECT = time prove (*** Finish the simulation before completing the mathematics ***) - ARM_ACCSTEPS_TAC BIGNUM_MONTSQR_P521_EXEC (398--406) (398--423) THEN + ARM_ACCSTEPS_TAC BIGNUM_MONTSQR_P521_CORE_EXEC (398--406) (398--423) THEN ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN (*** Deal with the final Montgomery tweak first ***) @@ -1256,6 +1266,27 @@ let BIGNUM_MONTSQR_P521_CORRECT = time prove COND_CASES_TAC THEN ASM_REWRITE_TAC[BITVAL_CLAUSES] THEN DISCH_THEN(fun th -> REWRITE_TAC[th]) THEN REAL_INTEGER_TAC);; +let BIGNUM_MONTSQR_P521_CORRECT = time prove + (`!z x n pc. + nonoverlapping (word pc,LENGTH bignum_montsqr_p521_mc) (z,8 * 9) + ==> ensures arm + (\s. aligned_bytes_loaded s (word pc) bignum_montsqr_p521_mc /\ + read PC s = word(pc + 12) /\ + C_ARGUMENTS [z; x] s /\ + bignum_from_memory (x,9) s = n) + (\s. 
read PC s = word (pc + 12 + LENGTH bignum_montsqr_p521_core_mc) /\ + (n < p_521 + ==> bignum_from_memory (z,9) s = + (inverse_mod p_521 (2 EXP 576) * n EXP 2) MOD p_521)) + (MAYCHANGE [PC; X2; X3; X4; X5; X6; X7; X8; X9; X10; X11; X12; X13; + X14; X15; X16; X17; X19; X20; X21; X22; X23; X24] ,, + MAYCHANGE SOME_FLAGS ,, + MAYCHANGE [memory :> bignum(z,9)])`, + + ARM_SUB_LIST_OF_MC_TAC BIGNUM_MONTSQR_P521_CORE_CORRECT + bignum_montsqr_p521_core_mc_def + [fst BIGNUM_MONTSQR_P521_CORE_EXEC;fst BIGNUM_MONTSQR_P521_EXEC]);; + let BIGNUM_MONTSQR_P521_SUBROUTINE_CORRECT = time prove (`!z x n pc stackpointer returnaddress. aligned 16 stackpointer /\ @@ -1277,6 +1308,9 @@ let BIGNUM_MONTSQR_P521_SUBROUTINE_CORRECT = time prove (MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, MAYCHANGE [memory :> bytes(z,8 * 9); memory :> bytes(word_sub stackpointer (word 48),48)])`, + let th = CONV_RULE (ONCE_DEPTH_CONV NUM_ADD_CONV) + (REWRITE_RULE [fst BIGNUM_MONTSQR_P521_CORE_EXEC;fst BIGNUM_MONTSQR_P521_EXEC] + BIGNUM_MONTSQR_P521_CORRECT) in ARM_ADD_RETURN_STACK_TAC - BIGNUM_MONTSQR_P521_EXEC BIGNUM_MONTSQR_P521_CORRECT + BIGNUM_MONTSQR_P521_EXEC th `[X19;X20;X21;X22;X23;X24]` 48);; diff --git a/arm/proofs/bignum_montsqr_p521_neon.ml b/arm/proofs/bignum_montsqr_p521_neon.ml new file mode 100644 index 00000000..e35a46f3 --- /dev/null +++ b/arm/proofs/bignum_montsqr_p521_neon.ml @@ -0,0 +1,1032 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(****************************************************************************** + The first program equivalence between the 'core' part of source program and + its SIMD-vectorized but instruction-unscheduled program +******************************************************************************) + +needs "arm/proofs/bignum_montsqr_p521.ml";; +needs "arm/proofs/equiv.ml";; +needs "arm/proofs/neon_helper.ml";; + +(* This is the intermediate program that is equivalent to both + bignum_montsqr_p521 and bignum_montsqr_p521_neon. This is a vectorized + version of bignum_montsqr_p521 but instructions are unscheduled. *) + +let bignum_montsqr_p521_interm1_ops:int list = [ + 0xa9402030; (* ldp x16, x8, [x1] *) + 0x3dc00032; (* ldr q18, [x1] *) + 0x3dc00025; (* ldr q5, [x1] *) + 0x3dc00034; (* ldr q20, [x1] *) + 0xa9413431; (* ldp x17, x13, [x1, #16] *) + 0x3dc00431; (* ldr q17, [x1, #16] *) + 0x3dc00421; (* ldr q1, [x1, #16] *) + 0x3dc0043c; (* ldr q28, [x1, #16] *) + 0xa9423c29; (* ldp x9, x15, [x1, #32] *) + 0x3dc0003b; (* ldr q27, [x1] *) + 0x3dc0083d; (* ldr q29, [x1, #32] *) + 0xa9430837; (* ldp x23, x2, [x1, #48] *) + 0x3dc00c26; (* ldr q6, [x1, #48] *) + 0x3dc00c24; (* ldr q4, [x1, #48] *) + 0x9b177d38; (* mul x24, x9, x23 *) + 0x9b027deb; (* mul x11, x15, x2 *) + 0x9bd77d34; (* umulh x20, x9, x23 *) + 0xeb0f0124; (* subs x4, x9, x15 *) + 0xda842496; (* cneg x22, x4, cc // cc = lo, ul, last *) + 0xda9f23ec; (* csetm x12, cc // cc = lo, ul, last *) + 0xeb170044; (* subs x4, x2, x23 *) + 0xda842484; (* cneg x4, x4, cc // cc = lo, ul, last *) + 0x9b047ed3; (* mul x19, x22, x4 *) + 0x9bc47ec4; (* umulh x4, x22, x4 *) + 0xda8c2187; (* cinv x7, x12, cc // cc = lo, ul, last *) + 0xca07026e; (* eor x14, x19, x7 *) + 0xca070096; (* eor x22, x4, x7 *) + 0xab14030c; (* adds x12, x24, x20 *) + 0x9a1f0293; (* adc x19, x20, xzr *) + 0x9bc27de4; (* umulh x4, x15, x2 *) + 0xab0b018c; (* adds x12, x12, x11 *) + 
0xba040273; (* adcs x19, x19, x4 *) + 0x9a1f0084; (* adc x4, x4, xzr *) + 0xab0b0273; (* adds x19, x19, x11 *) + 0x9a1f0084; (* adc x4, x4, xzr *) + 0xb10004ff; (* cmn x7, #0x1 *) + 0xba0e018c; (* adcs x12, x12, x14 *) + 0xba160273; (* adcs x19, x19, x22 *) + 0x9a070084; (* adc x4, x4, x7 *) + 0xab18030b; (* adds x11, x24, x24 *) + 0xba0c0194; (* adcs x20, x12, x12 *) + 0xba13026a; (* adcs x10, x19, x19 *) + 0xba040083; (* adcs x3, x4, x4 *) + 0x9a1f03e5; (* adc x5, xzr, xzr *) + 0x3dc0083e; (* ldr q30, [x1, #32] *) + 0x2ebec3c0; (* umull v0.2d, v30.2s, v30.2s *) + 0x6ebec3c2; (* umull2 v2.2d, v30.4s, v30.4s *) + 0x0ea12bd8; (* xtn v24.2s, v30.2d *) + 0x4e9e5bde; (* uzp2 v30.4s, v30.4s, v30.4s *) + 0x2eb8c3de; (* umull v30.2d, v30.2s, v24.2s *) + 0x4e083c07; (* mov x7, v0.d[0] *) + 0x4e183c0e; (* mov x14, v0.d[1] *) + 0x4e083c53; (* mov x19, v2.d[0] *) + 0x4e183c56; (* mov x22, v2.d[1] *) + 0x4e083fc4; (* mov x4, v30.d[0] *) + 0x4e183fcc; (* mov x12, v30.d[1] *) + 0xab0484f5; (* adds x21, x7, x4, lsl #33 *) + 0xd35ffc84; (* lsr x4, x4, #31 *) + 0x9a0401ce; (* adc x14, x14, x4 *) + 0xab0c8673; (* adds x19, x19, x12, lsl #33 *) + 0xd35ffd84; (* lsr x4, x12, #31 *) + 0x9a0402d6; (* adc x22, x22, x4 *) + 0x9b0f7d24; (* mul x4, x9, x15 *) + 0x9bcf7d2c; (* umulh x12, x9, x15 *) + 0xab0405d8; (* adds x24, x14, x4, lsl #1 *) + 0x93c4fd84; (* extr x4, x12, x4, #63 *) + 0xba040273; (* adcs x19, x19, x4 *) + 0xd37ffd84; (* lsr x4, x12, #63 *) + 0x9a0402c4; (* adc x4, x22, x4 *) + 0xab13016b; (* adds x11, x11, x19 *) + 0xba040294; (* adcs x20, x20, x4 *) + 0xba1f014a; (* adcs x10, x10, xzr *) + 0xba1f0063; (* adcs x3, x3, xzr *) + 0x9a1f00a6; (* adc x6, x5, xzr *) + 0x6f00e5e3; (* movi v3.2d, #0xffffffff *) + 0x4e845890; (* uzp2 v16.4s, v4.4s, v4.4s *) + 0x0ea128d9; (* xtn v25.2s, v6.2d *) + 0x0ea12897; (* xtn v23.2s, v4.2d *) + 0x4ea0089e; (* rev64 v30.4s, v4.4s *) + 0x2eb7c338; (* umull v24.2d, v25.2s, v23.2s *) + 0x2eb0c320; (* umull v0.2d, v25.2s, v16.2s *) + 0x4e8658c2; 
(* uzp2 v2.4s, v6.4s, v6.4s *) + 0x4ea69fde; (* mul v30.4s, v30.4s, v6.4s *) + 0x6f601700; (* usra v0.2d, v24.2d, #32 *) + 0x2eb0c053; (* umull v19.2d, v2.2s, v16.2s *) + 0x6ea02bde; (* uaddlp v30.2d, v30.4s *) + 0x4e231c18; (* and v24.16b, v0.16b, v3.16b *) + 0x2eb78058; (* umlal v24.2d, v2.2s, v23.2s *) + 0x4f6057de; (* shl v30.2d, v30.2d, #32 *) + 0x6f601413; (* usra v19.2d, v0.2d, #32 *) + 0x2eb7833e; (* umlal v30.2d, v25.2s, v23.2s *) + 0x6f601713; (* usra v19.2d, v24.2d, #32 *) + 0x4e083fc5; (* mov x5, v30.d[0] *) + 0x4e183fc7; (* mov x7, v30.d[1] *) + 0x9b027eee; (* mul x14, x23, x2 *) + 0x4e083e73; (* mov x19, v19.d[0] *) + 0x4e183e64; (* mov x4, v19.d[1] *) + 0x9bc27ef6; (* umulh x22, x23, x2 *) + 0xab0e026c; (* adds x12, x19, x14 *) + 0xba1600f3; (* adcs x19, x7, x22 *) + 0x9a1f0084; (* adc x4, x4, xzr *) + 0xab0e018c; (* adds x12, x12, x14 *) + 0xba160273; (* adcs x19, x19, x22 *) + 0x9a1f0084; (* adc x4, x4, xzr *) + 0xab0a00a7; (* adds x7, x5, x10 *) + 0xba030183; (* adcs x3, x12, x3 *) + 0xba06026e; (* adcs x14, x19, x6 *) + 0x9a1f008a; (* adc x10, x4, xzr *) + 0xf9402024; (* ldr x4, [x1, #64] *) + 0x8b040086; (* add x6, x4, x4 *) + 0x9b047c85; (* mul x5, x4, x4 *) + 0x9240ce04; (* and x4, x16, #0xfffffffffffff *) + 0x9b047cd6; (* mul x22, x6, x4 *) + 0x93d0d104; (* extr x4, x8, x16, #52 *) + 0x9240cc84; (* and x4, x4, #0xfffffffffffff *) + 0x9b047cd3; (* mul x19, x6, x4 *) + 0xd374fec4; (* lsr x4, x22, #52 *) + 0x8b04026c; (* add x12, x19, x4 *) + 0xd374cec4; (* lsl x4, x22, #12 *) + 0x93c43184; (* extr x4, x12, x4, #12 *) + 0xab0402b5; (* adds x21, x21, x4 *) + 0x93c8a224; (* extr x4, x17, x8, #40 *) + 0x9240cc84; (* and x4, x4, #0xfffffffffffff *) + 0x9b047cd3; (* mul x19, x6, x4 *) + 0xd374fd84; (* lsr x4, x12, #52 *) + 0x8b040276; (* add x22, x19, x4 *) + 0xd374cd84; (* lsl x4, x12, #12 *) + 0x93c462c4; (* extr x4, x22, x4, #24 *) + 0xba040318; (* adcs x24, x24, x4 *) + 0x93d171a4; (* extr x4, x13, x17, #28 *) + 0x9240cc84; (* and x4, x4, 
#0xfffffffffffff *) + 0x9b047cd3; (* mul x19, x6, x4 *) + 0xd374fec4; (* lsr x4, x22, #52 *) + 0x8b04026c; (* add x12, x19, x4 *) + 0xd374cec4; (* lsl x4, x22, #12 *) + 0x93c49184; (* extr x4, x12, x4, #36 *) + 0xba04016b; (* adcs x11, x11, x4 *) + 0x93cd4124; (* extr x4, x9, x13, #16 *) + 0x9240cc84; (* and x4, x4, #0xfffffffffffff *) + 0x9b047cd3; (* mul x19, x6, x4 *) + 0xd374fd84; (* lsr x4, x12, #52 *) + 0x8b040276; (* add x22, x19, x4 *) + 0xd374cd84; (* lsl x4, x12, #12 *) + 0x93c4c2c4; (* extr x4, x22, x4, #48 *) + 0xba040294; (* adcs x20, x20, x4 *) + 0xd344fd24; (* lsr x4, x9, #4 *) + 0x9240cc84; (* and x4, x4, #0xfffffffffffff *) + 0x9b047cd3; (* mul x19, x6, x4 *) + 0xd374fec4; (* lsr x4, x22, #52 *) + 0x8b04026c; (* add x12, x19, x4 *) + 0xd374cec4; (* lsl x4, x22, #12 *) + 0x93c4f196; (* extr x22, x12, x4, #60 *) + 0x93c9e1e4; (* extr x4, x15, x9, #56 *) + 0x9240cc84; (* and x4, x4, #0xfffffffffffff *) + 0x9b047cd3; (* mul x19, x6, x4 *) + 0xd374fd84; (* lsr x4, x12, #52 *) + 0x8b04026c; (* add x12, x19, x4 *) + 0xd378dec4; (* lsl x4, x22, #8 *) + 0x93c42184; (* extr x4, x12, x4, #8 *) + 0xba0400e7; (* adcs x7, x7, x4 *) + 0x93cfb2e4; (* extr x4, x23, x15, #44 *) + 0x9240cc84; (* and x4, x4, #0xfffffffffffff *) + 0x9b047cd3; (* mul x19, x6, x4 *) + 0xd374fd84; (* lsr x4, x12, #52 *) + 0x8b040276; (* add x22, x19, x4 *) + 0xd374cd84; (* lsl x4, x12, #12 *) + 0x93c452c4; (* extr x4, x22, x4, #20 *) + 0xba040061; (* adcs x1, x3, x4 *) + 0x93d78044; (* extr x4, x2, x23, #32 *) + 0x9240cc84; (* and x4, x4, #0xfffffffffffff *) + 0x9b047cd3; (* mul x19, x6, x4 *) + 0xd374fec4; (* lsr x4, x22, #52 *) + 0x8b04026c; (* add x12, x19, x4 *) + 0xd374cec4; (* lsl x4, x22, #12 *) + 0x93c48184; (* extr x4, x12, x4, #32 *) + 0xba0401ce; (* adcs x14, x14, x4 *) + 0xd354fc44; (* lsr x4, x2, #20 *) + 0x9b047cd3; (* mul x19, x6, x4 *) + 0xd374fd84; (* lsr x4, x12, #52 *) + 0x8b040273; (* add x19, x19, x4 *) + 0xd374cd84; (* lsl x4, x12, #12 *) + 0x93c4b264; (* extr x4, 
x19, x4, #44 *) + 0xba040156; (* adcs x22, x10, x4 *) + 0xd36cfe64; (* lsr x4, x19, #44 *) + 0x9a0400ac; (* adc x12, x5, x4 *) + 0x93d52713; (* extr x19, x24, x21, #9 *) + 0x93d82564; (* extr x4, x11, x24, #9 *) + 0xa9001013; (* stp x19, x4, [x0] *) + 0x93cb2693; (* extr x19, x20, x11, #9 *) + 0x93d424e4; (* extr x4, x7, x20, #9 *) + 0xa9011013; (* stp x19, x4, [x0, #16] *) + 0x93c72433; (* extr x19, x1, x7, #9 *) + 0x93c125c4; (* extr x4, x14, x1, #9 *) + 0xa9021013; (* stp x19, x4, [x0, #32] *) + 0x93ce26d3; (* extr x19, x22, x14, #9 *) + 0x93d62584; (* extr x4, x12, x22, #9 *) + 0xa9031013; (* stp x19, x4, [x0, #48] *) + 0x924022b3; (* and x19, x21, #0x1ff *) + 0xd349fd84; (* lsr x4, x12, #9 *) + 0x8b040264; (* add x4, x19, x4 *) + 0xf9002004; (* str x4, [x0, #64] *) + 0x4e921b82; (* uzp1 v2.4s, v28.4s, v18.4s *) + 0x4ea00b9e; (* rev64 v30.4s, v28.4s *) + 0x4e921a58; (* uzp1 v24.4s, v18.4s, v18.4s *) + 0x4eb29fde; (* mul v30.4s, v30.4s, v18.4s *) + 0x6ea02bde; (* uaddlp v30.2d, v30.4s *) + 0x4f6057de; (* shl v30.2d, v30.2d, #32 *) + 0x2ea2831e; (* umlal v30.2d, v24.2s, v2.2s *) + 0x4e083fcb; (* mov x11, v30.d[0] *) + 0x4e183fd4; (* mov x20, v30.d[1] *) + 0x9bd17e07; (* umulh x7, x16, x17 *) + 0xeb080204; (* subs x4, x16, x8 *) + 0xda842496; (* cneg x22, x4, cc // cc = lo, ul, last *) + 0xda9f23ec; (* csetm x12, cc // cc = lo, ul, last *) + 0xeb1101a4; (* subs x4, x13, x17 *) + 0xda842484; (* cneg x4, x4, cc // cc = lo, ul, last *) + 0x9b047ed3; (* mul x19, x22, x4 *) + 0x9bc47ec4; (* umulh x4, x22, x4 *) + 0xda8c2181; (* cinv x1, x12, cc // cc = lo, ul, last *) + 0xca01026e; (* eor x14, x19, x1 *) + 0xca010096; (* eor x22, x4, x1 *) + 0xab07016c; (* adds x12, x11, x7 *) + 0x9a1f00f3; (* adc x19, x7, xzr *) + 0x9bcd7d04; (* umulh x4, x8, x13 *) + 0xab14018c; (* adds x12, x12, x20 *) + 0xba040273; (* adcs x19, x19, x4 *) + 0x9a1f0084; (* adc x4, x4, xzr *) + 0xab140273; (* adds x19, x19, x20 *) + 0x9a1f0084; (* adc x4, x4, xzr *) + 0xb100043f; (* cmn x1, #0x1 *) + 
0xba0e018c; (* adcs x12, x12, x14 *) + 0xba160273; (* adcs x19, x19, x22 *) + 0x9a010084; (* adc x4, x4, x1 *) + 0xab0b0175; (* adds x21, x11, x11 *) + 0xba0c0198; (* adcs x24, x12, x12 *) + 0xba13026b; (* adcs x11, x19, x19 *) + 0xba040094; (* adcs x20, x4, x4 *) + 0x9a1f03e7; (* adc x7, xzr, xzr *) + 0x6f00e5e3; (* movi v3.2d, #0xffffffff *) + 0x4e945a90; (* uzp2 v16.4s, v20.4s, v20.4s *) + 0x0ea128b9; (* xtn v25.2s, v5.2d *) + 0x0ea12a97; (* xtn v23.2s, v20.2d *) + 0x4ea00a9e; (* rev64 v30.4s, v20.4s *) + 0x2eb7c338; (* umull v24.2d, v25.2s, v23.2s *) + 0x2eb0c320; (* umull v0.2d, v25.2s, v16.2s *) + 0x4e8558a2; (* uzp2 v2.4s, v5.4s, v5.4s *) + 0x4ea59fde; (* mul v30.4s, v30.4s, v5.4s *) + 0x6f601700; (* usra v0.2d, v24.2d, #32 *) + 0x2eb0c053; (* umull v19.2d, v2.2s, v16.2s *) + 0x6ea02bde; (* uaddlp v30.2d, v30.4s *) + 0x4e231c18; (* and v24.16b, v0.16b, v3.16b *) + 0x2eb78058; (* umlal v24.2d, v2.2s, v23.2s *) + 0x4f6057de; (* shl v30.2d, v30.2d, #32 *) + 0x6f601413; (* usra v19.2d, v0.2d, #32 *) + 0x2eb7833e; (* umlal v30.2d, v25.2s, v23.2s *) + 0x6f601713; (* usra v19.2d, v24.2d, #32 *) + 0x4e083fca; (* mov x10, v30.d[0] *) + 0x4e183fc1; (* mov x1, v30.d[1] *) + 0x9b087e0e; (* mul x14, x16, x8 *) + 0x4e083e73; (* mov x19, v19.d[0] *) + 0x4e183e64; (* mov x4, v19.d[1] *) + 0x9bc87e16; (* umulh x22, x16, x8 *) + 0xab0e026c; (* adds x12, x19, x14 *) + 0xba160033; (* adcs x19, x1, x22 *) + 0x9a1f0084; (* adc x4, x4, xzr *) + 0xab0e0183; (* adds x3, x12, x14 *) + 0xba160273; (* adcs x19, x19, x22 *) + 0x9a1f0084; (* adc x4, x4, xzr *) + 0xab1302a5; (* adds x5, x21, x19 *) + 0xba040315; (* adcs x21, x24, x4 *) + 0xba1f0178; (* adcs x24, x11, xzr *) + 0xba1f028b; (* adcs x11, x20, xzr *) + 0x9a1f00f4; (* adc x20, x7, xzr *) + 0x6f00e5e3; (* movi v3.2d, #0xffffffff *) + 0x4e815830; (* uzp2 v16.4s, v1.4s, v1.4s *) + 0x0ea12a39; (* xtn v25.2s, v17.2d *) + 0x0ea12837; (* xtn v23.2s, v1.2d *) + 0x4ea0083e; (* rev64 v30.4s, v1.4s *) + 0x2eb7c338; (* umull v24.2d, 
v25.2s, v23.2s *) + 0x2eb0c320; (* umull v0.2d, v25.2s, v16.2s *) + 0x4e915a22; (* uzp2 v2.4s, v17.4s, v17.4s *) + 0x4eb19fde; (* mul v30.4s, v30.4s, v17.4s *) + 0x6f601700; (* usra v0.2d, v24.2d, #32 *) + 0x2eb0c053; (* umull v19.2d, v2.2s, v16.2s *) + 0x6ea02bde; (* uaddlp v30.2d, v30.4s *) + 0x4e231c18; (* and v24.16b, v0.16b, v3.16b *) + 0x2eb78058; (* umlal v24.2d, v2.2s, v23.2s *) + 0x4f6057de; (* shl v30.2d, v30.2d, #32 *) + 0x6f601413; (* usra v19.2d, v0.2d, #32 *) + 0x2eb7833e; (* umlal v30.2d, v25.2s, v23.2s *) + 0x6f601713; (* usra v19.2d, v24.2d, #32 *) + 0x4e083fc7; (* mov x7, v30.d[0] *) + 0x4e183fc1; (* mov x1, v30.d[1] *) + 0x9b0d7e2e; (* mul x14, x17, x13 *) + 0x4e083e73; (* mov x19, v19.d[0] *) + 0x4e183e64; (* mov x4, v19.d[1] *) + 0x9bcd7e36; (* umulh x22, x17, x13 *) + 0xab0e026c; (* adds x12, x19, x14 *) + 0xba160033; (* adcs x19, x1, x22 *) + 0x9a1f0084; (* adc x4, x4, xzr *) + 0xab0e018c; (* adds x12, x12, x14 *) + 0xba160273; (* adcs x19, x19, x22 *) + 0x9a1f0084; (* adc x4, x4, xzr *) + 0xab1800e1; (* adds x1, x7, x24 *) + 0xba0b018e; (* adcs x14, x12, x11 *) + 0xba140276; (* adcs x22, x19, x20 *) + 0x9a1f008c; (* adc x12, x4, xzr *) + 0xa9401013; (* ldp x19, x4, [x0] *) + 0xab0a0273; (* adds x19, x19, x10 *) + 0xba030084; (* adcs x4, x4, x3 *) + 0xa9001013; (* stp x19, x4, [x0] *) + 0xa9411013; (* ldp x19, x4, [x0, #16] *) + 0xba050273; (* adcs x19, x19, x5 *) + 0xba150084; (* adcs x4, x4, x21 *) + 0xa9011013; (* stp x19, x4, [x0, #16] *) + 0xa9421013; (* ldp x19, x4, [x0, #32] *) + 0xba010273; (* adcs x19, x19, x1 *) + 0xba0e0084; (* adcs x4, x4, x14 *) + 0xa9021013; (* stp x19, x4, [x0, #32] *) + 0xa9431013; (* ldp x19, x4, [x0, #48] *) + 0xba160273; (* adcs x19, x19, x22 *) + 0xba0c0084; (* adcs x4, x4, x12 *) + 0xa9031013; (* stp x19, x4, [x0, #48] *) + 0xf9402004; (* ldr x4, [x0, #64] *) + 0x9a1f0084; (* adc x4, x4, xzr *) + 0xf9002004; (* str x4, [x0, #64] *) + 0x6f00e5e3; (* movi v3.2d, #0xffffffff *) + 0x4e9d5ba2; (* uzp2 v2.4s, 
v29.4s, v29.4s *) + 0x0ea12b70; (* xtn v16.2s, v27.2d *) + 0x0ea12bb9; (* xtn v25.2s, v29.2d *) + 0x4ea00bbe; (* rev64 v30.4s, v29.4s *) + 0x2eb9c218; (* umull v24.2d, v16.2s, v25.2s *) + 0x2ea2c217; (* umull v23.2d, v16.2s, v2.2s *) + 0x4e9b5b60; (* uzp2 v0.4s, v27.4s, v27.4s *) + 0x4ebb9fde; (* mul v30.4s, v30.4s, v27.4s *) + 0x6f601717; (* usra v23.2d, v24.2d, #32 *) + 0x2ea2c002; (* umull v2.2d, v0.2s, v2.2s *) + 0x6ea02bde; (* uaddlp v30.2d, v30.4s *) + 0x4e231ef8; (* and v24.16b, v23.16b, v3.16b *) + 0x2eb98018; (* umlal v24.2d, v0.2s, v25.2s *) + 0x4f6057de; (* shl v30.2d, v30.2d, #32 *) + 0x6f6016e2; (* usra v2.2d, v23.2d, #32 *) + 0x2eb9821e; (* umlal v30.2d, v16.2s, v25.2s *) + 0x6f601702; (* usra v2.2d, v24.2d, #32 *) + 0x4e083fc6; (* mov x6, v30.d[0] *) + 0x4e183fd6; (* mov x22, v30.d[1] *) + 0x9b177e2c; (* mul x12, x17, x23 *) + 0x9b027db3; (* mul x19, x13, x2 *) + 0x4e083c44; (* mov x4, v2.d[0] *) + 0xab0402d6; (* adds x22, x22, x4 *) + 0x4e183c44; (* mov x4, v2.d[1] *) + 0xba04018c; (* adcs x12, x12, x4 *) + 0x9bd77e24; (* umulh x4, x17, x23 *) + 0xba040273; (* adcs x19, x19, x4 *) + 0x9bc27da4; (* umulh x4, x13, x2 *) + 0x9a1f0084; (* adc x4, x4, xzr *) + 0xab0602d5; (* adds x21, x22, x6 *) + 0xba160196; (* adcs x22, x12, x22 *) + 0xba0c026c; (* adcs x12, x19, x12 *) + 0xba130093; (* adcs x19, x4, x19 *) + 0x9a0403e4; (* adc x4, xzr, x4 *) + 0xab0602d8; (* adds x24, x22, x6 *) + 0xba15018b; (* adcs x11, x12, x21 *) + 0xba160274; (* adcs x20, x19, x22 *) + 0xba0c0081; (* adcs x1, x4, x12 *) + 0xba1303ee; (* adcs x14, xzr, x19 *) + 0x9a0403e7; (* adc x7, xzr, x4 *) + 0xeb0d0224; (* subs x4, x17, x13 *) + 0xda84248c; (* cneg x12, x4, cc // cc = lo, ul, last *) + 0xda9f23f6; (* csetm x22, cc // cc = lo, ul, last *) + 0xeb170044; (* subs x4, x2, x23 *) + 0xda842493; (* cneg x19, x4, cc // cc = lo, ul, last *) + 0x9b137d84; (* mul x4, x12, x19 *) + 0x9bd37d8c; (* umulh x12, x12, x19 *) + 0xda9622d3; (* cinv x19, x22, cc // cc = lo, ul, last *) + 
0xb100067f; (* cmn x19, #0x1 *) + 0xca130084; (* eor x4, x4, x19 *) + 0xba040021; (* adcs x1, x1, x4 *) + 0xca130184; (* eor x4, x12, x19 *) + 0xba0401ce; (* adcs x14, x14, x4 *) + 0x9a1300e7; (* adc x7, x7, x19 *) + 0xeb080204; (* subs x4, x16, x8 *) + 0xda84248c; (* cneg x12, x4, cc // cc = lo, ul, last *) + 0xda9f23f6; (* csetm x22, cc // cc = lo, ul, last *) + 0xeb0901e4; (* subs x4, x15, x9 *) + 0xda842493; (* cneg x19, x4, cc // cc = lo, ul, last *) + 0x9b137d84; (* mul x4, x12, x19 *) + 0x9bd37d8c; (* umulh x12, x12, x19 *) + 0xda9622d3; (* cinv x19, x22, cc // cc = lo, ul, last *) + 0xb100067f; (* cmn x19, #0x1 *) + 0xca130084; (* eor x4, x4, x19 *) + 0xba0402aa; (* adcs x10, x21, x4 *) + 0xca130184; (* eor x4, x12, x19 *) + 0xba040318; (* adcs x24, x24, x4 *) + 0xba13016b; (* adcs x11, x11, x19 *) + 0xba130294; (* adcs x20, x20, x19 *) + 0xba130021; (* adcs x1, x1, x19 *) + 0xba1301ce; (* adcs x14, x14, x19 *) + 0x9a1300e7; (* adc x7, x7, x19 *) + 0xeb0d0104; (* subs x4, x8, x13 *) + 0xda84248c; (* cneg x12, x4, cc // cc = lo, ul, last *) + 0xda9f23f6; (* csetm x22, cc // cc = lo, ul, last *) + 0xeb0f0044; (* subs x4, x2, x15 *) + 0xda842493; (* cneg x19, x4, cc // cc = lo, ul, last *) + 0x9b137d84; (* mul x4, x12, x19 *) + 0x9bd37d8c; (* umulh x12, x12, x19 *) + 0xda9622d3; (* cinv x19, x22, cc // cc = lo, ul, last *) + 0xb100067f; (* cmn x19, #0x1 *) + 0xca130084; (* eor x4, x4, x19 *) + 0xba040294; (* adcs x20, x20, x4 *) + 0xca130184; (* eor x4, x12, x19 *) + 0xba040021; (* adcs x1, x1, x4 *) + 0xba1301ce; (* adcs x14, x14, x19 *) + 0x9a1300e7; (* adc x7, x7, x19 *) + 0xeb110204; (* subs x4, x16, x17 *) + 0xda84248c; (* cneg x12, x4, cc // cc = lo, ul, last *) + 0xda9f23f6; (* csetm x22, cc // cc = lo, ul, last *) + 0xeb0902e4; (* subs x4, x23, x9 *) + 0xda842493; (* cneg x19, x4, cc // cc = lo, ul, last *) + 0x9b137d84; (* mul x4, x12, x19 *) + 0x9bd37d8c; (* umulh x12, x12, x19 *) + 0xda9622d3; (* cinv x19, x22, cc // cc = lo, ul, last *) + 
0xb100067f; (* cmn x19, #0x1 *) + 0xca130084; (* eor x4, x4, x19 *) + 0xba040318; (* adcs x24, x24, x4 *) + 0xca130184; (* eor x4, x12, x19 *) + 0xba04016b; (* adcs x11, x11, x4 *) + 0xba130294; (* adcs x20, x20, x19 *) + 0xba130021; (* adcs x1, x1, x19 *) + 0xba1301ce; (* adcs x14, x14, x19 *) + 0x9a1300e7; (* adc x7, x7, x19 *) + 0xeb0d0204; (* subs x4, x16, x13 *) + 0xda84248c; (* cneg x12, x4, cc // cc = lo, ul, last *) + 0xda9f23f6; (* csetm x22, cc // cc = lo, ul, last *) + 0xeb090044; (* subs x4, x2, x9 *) + 0xda842493; (* cneg x19, x4, cc // cc = lo, ul, last *) + 0x9b137d84; (* mul x4, x12, x19 *) + 0x9bd37d8c; (* umulh x12, x12, x19 *) + 0xda9622d3; (* cinv x19, x22, cc // cc = lo, ul, last *) + 0xb100067f; (* cmn x19, #0x1 *) + 0xca130084; (* eor x4, x4, x19 *) + 0xba04016b; (* adcs x11, x11, x4 *) + 0xca130184; (* eor x4, x12, x19 *) + 0xba040294; (* adcs x20, x20, x4 *) + 0xba130021; (* adcs x1, x1, x19 *) + 0xba1301ce; (* adcs x14, x14, x19 *) + 0x9a1300e7; (* adc x7, x7, x19 *) + 0xeb110104; (* subs x4, x8, x17 *) + 0xda84248c; (* cneg x12, x4, cc // cc = lo, ul, last *) + 0xda9f23f6; (* csetm x22, cc // cc = lo, ul, last *) + 0xeb0f02e4; (* subs x4, x23, x15 *) + 0xda842493; (* cneg x19, x4, cc // cc = lo, ul, last *) + 0x9b137d84; (* mul x4, x12, x19 *) + 0x9bd37d8c; (* umulh x12, x12, x19 *) + 0xda9622d3; (* cinv x19, x22, cc // cc = lo, ul, last *) + 0xb100067f; (* cmn x19, #0x1 *) + 0xca130084; (* eor x4, x4, x19 *) + 0xba040163; (* adcs x3, x11, x4 *) + 0xca130184; (* eor x4, x12, x19 *) + 0xba040285; (* adcs x5, x20, x4 *) + 0xba130021; (* adcs x1, x1, x19 *) + 0xba1301ce; (* adcs x14, x14, x19 *) + 0x9a1300f6; (* adc x22, x7, x19 *) + 0xa9404c0c; (* ldp x12, x19, [x0] *) + 0x93c52024; (* extr x4, x1, x5, #8 *) + 0xab0c008b; (* adds x11, x4, x12 *) + 0x93c121c4; (* extr x4, x14, x1, #8 *) + 0xba130094; (* adcs x20, x4, x19 *) + 0xa9413013; (* ldp x19, x12, [x0, #16] *) + 0x93ce22c4; (* extr x4, x22, x14, #8 *) + 0xba130087; (* adcs x7, x4, x19 
*) + 0x8a070293; (* and x19, x20, x7 *) + 0xd348fec4; (* lsr x4, x22, #8 *) + 0xba0c0081; (* adcs x1, x4, x12 *) + 0x8a010276; (* and x22, x19, x1 *) + 0xa9423013; (* ldp x19, x12, [x0, #32] *) + 0xd37ff8c4; (* lsl x4, x6, #1 *) + 0xba13008e; (* adcs x14, x4, x19 *) + 0x8a0e02d3; (* and x19, x22, x14 *) + 0x93c6fd44; (* extr x4, x10, x6, #63 *) + 0xba0c0095; (* adcs x21, x4, x12 *) + 0x8a150276; (* and x22, x19, x21 *) + 0xa9433013; (* ldp x19, x12, [x0, #48] *) + 0x93caff04; (* extr x4, x24, x10, #63 *) + 0xba130082; (* adcs x2, x4, x19 *) + 0x8a0202d3; (* and x19, x22, x2 *) + 0x93d8fc64; (* extr x4, x3, x24, #63 *) + 0xba0c0098; (* adcs x24, x4, x12 *) + 0x8a18026c; (* and x12, x19, x24 *) + 0xf9402013; (* ldr x19, [x0, #64] *) + 0x93c3fca4; (* extr x4, x5, x3, #63 *) + 0x92402084; (* and x4, x4, #0x1ff *) + 0x9a040264; (* adc x4, x19, x4 *) + 0xd349fc93; (* lsr x19, x4, #9 *) + 0xb277d884; (* orr x4, x4, #0xfffffffffffffe00 *) + 0xeb1f03ff; (* cmp xzr, xzr *) + 0xba13017f; (* adcs xzr, x11, x19 *) + 0xba1f019f; (* adcs xzr, x12, xzr *) + 0xba1f009f; (* adcs xzr, x4, xzr *) + 0xba13016b; (* adcs x11, x11, x19 *) + 0xba1f0294; (* adcs x20, x20, xzr *) + 0xba1f00e7; (* adcs x7, x7, xzr *) + 0xba1f0021; (* adcs x1, x1, xzr *) + 0xba1f01ce; (* adcs x14, x14, xzr *) + 0xba1f02b6; (* adcs x22, x21, xzr *) + 0xba1f004c; (* adcs x12, x2, xzr *) + 0xba1f0318; (* adcs x24, x24, xzr *) + 0x9a1f0084; (* adc x4, x4, xzr *) + 0x92402093; (* and x19, x4, #0x1ff *) + 0xd377d964; (* lsl x4, x11, #9 *) + 0x93cbde8b; (* extr x11, x20, x11, #55 *) + 0x93d4dcf4; (* extr x20, x7, x20, #55 *) + 0x93c7dc27; (* extr x7, x1, x7, #55 *) + 0x93c1ddc1; (* extr x1, x14, x1, #55 *) + 0xaa040264; (* orr x4, x19, x4 *) + 0x93cedece; (* extr x14, x22, x14, #55 *) + 0x93d6dd96; (* extr x22, x12, x22, #55 *) + 0x93ccdf0c; (* extr x12, x24, x12, #55 *) + 0x93d8dc93; (* extr x19, x4, x24, #55 *) + 0xd377fc84; (* lsr x4, x4, #55 *) + 0xa900500b; (* stp x11, x20, [x0] *) + 0xa9010407; (* stp x7, x1, 
[x0, #16] *) + 0xa902580e; (* stp x14, x22, [x0, #32] *) + 0xa9034c0c; (* stp x12, x19, [x0, #48] *) + 0xf9002004; (* str x4, [x0, #64] *) +];; + +let bignum_montsqr_p521_interm1_core_mc = + let charlist = List.concat_map + (fun op32 -> + [Char.chr (Int.logand op32 255); + Char.chr (Int.logand (Int.shift_right op32 8) 255); + Char.chr (Int.logand (Int.shift_right op32 16) 255); + Char.chr (Int.logand (Int.shift_right op32 24) 255)]) + bignum_montsqr_p521_interm1_ops in + let byte_list = Bytes.init (List.length charlist) (fun i -> List.nth charlist i) in + define_word_list "bignum_montsqr_p521_interm1_core_mc" (term_of_bytes byte_list);; + +let BIGNUM_MONTSQR_P521_INTERM1_CORE_EXEC = + ARM_MK_EXEC_RULE bignum_montsqr_p521_interm1_core_mc;; + +let equiv_input_states = new_definition + `!s1 s1' x z. + (equiv_input_states:(armstate#armstate)->int64->int64->bool) (s1,s1') x z <=> + (C_ARGUMENTS [z; x] s1 /\ + C_ARGUMENTS [z; x] s1' /\ + ?a. bignum_from_memory (x,9) s1 = a /\ + bignum_from_memory (x,9) s1' = a)`;; + +let equiv_output_states = new_definition + `!s1 s1' z. + (equiv_output_states:(armstate#armstate)->int64->bool) (s1,s1') z <=> + (?a. 
+ bignum_from_memory (z,9) s1 = a /\ + bignum_from_memory (z,9) s1' = a)`;; + +let actions1 = [ + ("equal", 0, 1, 0, 1); ("insert", 1, 1, 1, 4); ("equal", 1, 2, 4, 5); + ("insert", 2, 2, 5, 8); ("equal", 2, 3, 8, 9); ("insert", 3, 3, 9, 11); + ("equal", 3, 4, 11, 12); ("insert", 4, 4, 12, 14); ("equal", 4, 34, 14, 44); + ("delete", 34, 46, 44, 44); ("insert", 46, 46, 44, 69) +];; +(* rewrite WORD_SQR128_DIGIT3;WORD_SQR128_DIGIT2;WORD_SQR128_DIGIT1; + WORD_SQR128_DIGIT0 before actions2 *) +let actions2 = [ + ("equal", 46, 51, 69, 74); ("delete", 51, 53, 74, 74); + ("insert", 53, 53, 74, 94); ("equal", 53, 54, 94, 95); + ("delete", 54, 56, 95, 95); ("insert", 56, 56, 95, 97); + ("equal", 56, 160, 97, 201); ("delete", 160, 162, 201, 201); + ("insert", 162, 162, 201, 210); ("equal", 162, 190, 210, 238); + ("delete", 190, 192, 238, 238); ("insert", 192, 192, 238, 258); + ("equal", 192, 193, 258, 259); ("delete", 193, 195, 259, 259); + ("insert", 195, 195, 259, 261); ("equal", 195, 207, 261, 273); + ("delete", 207, 209, 273, 273); ("insert", 209, 209, 273, 293); + ("equal", 209, 210, 293, 294); ("delete", 210, 212, 294, 294); + ("insert", 212, 212, 294, 296); ("equal", 212, 242, 296, 326); + ("delete", 242, 244, 326, 326); ("insert", 244, 244, 326, 346); + ("equal", 244, 246, 346, 348); ("delete", 246, 247, 348, 348); + ("insert", 247, 247, 348, 349); ("equal", 247, 248, 349, 350); + ("delete", 248, 249, 350, 350); ("insert", 249, 249, 350, 351); + ("equal", 249, 423, 351, 525) +];; + +let equiv_goal1 = mk_equiv_statement + `ALL (nonoverlapping (z:int64,8 * 9)) + [(word pc,LENGTH bignum_montsqr_p521_core_mc); + (word pc2,LENGTH bignum_montsqr_p521_interm1_core_mc)]` + equiv_input_states + equiv_output_states + bignum_montsqr_p521_core_mc 0 + `MAYCHANGE [PC; X2; X3; X4; X5; X6; X7; X8; X9; X10; X11; X12; X13; + X14; X15; X16; X17; X19; X20; X21; X22; X23; X24] ,, + MAYCHANGE SOME_FLAGS ,, + MAYCHANGE [memory :> bignum(z,9)]` + bignum_montsqr_p521_interm1_core_mc 0 + 
`MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; X22; X23; X24] ,, + MAYCHANGE [memory :> bignum(z,9)]`;; + +let _org_extra_word_CONV = !extra_word_CONV;; +extra_word_CONV := + [GEN_REWRITE_CONV I [WORD_BITMANIP_SIMP_LEMMAS; WORD_MUL64_LO; WORD_MUL64_HI; + WORD_SQR64_HI]] + @ (!extra_word_CONV);; + +let BIGNUM_MONTSQR_P521_CORE_EQUIV1 = time prove(equiv_goal1, + + REWRITE_TAC[MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI;SOME_FLAGS; + ALLPAIRS;ALL;NONOVERLAPPING_CLAUSES; + fst BIGNUM_MONTSQR_P521_CORE_EXEC; + fst BIGNUM_MONTSQR_P521_INTERM1_CORE_EXEC;bignum] THEN + REPEAT STRIP_TAC THEN + (** Initialize **) + EQUIV_INITIATE_TAC equiv_input_states THEN + REPEAT (FIRST_X_ASSUM BIGNUM_EXPAND_AND_DIGITIZE_TAC) THEN + ASM_PROPAGATE_DIGIT_EQS_FROM_EXPANDED_BIGNUM_TAC THEN + (* necessary to run ldr qs *) + COMBINE_READ_BYTES64_PAIRS_TAC THEN + + (* Start *) + EQUIV_STEPS_TAC actions1 + BIGNUM_MONTSQR_P521_CORE_EXEC + BIGNUM_MONTSQR_P521_INTERM1_CORE_EXEC THEN + + RULE_ASSUM_TAC (REWRITE_RULE[WORD_SQR128_DIGIT3; + WORD_SQR128_DIGIT2;WORD_SQR128_DIGIT1;WORD_SQR128_DIGIT0]) THEN + + EQUIV_STEPS_TAC actions2 + BIGNUM_MONTSQR_P521_CORE_EXEC + BIGNUM_MONTSQR_P521_INTERM1_CORE_EXEC THEN + + REPEAT_N 2 ENSURES_FINAL_STATE'_TAC THEN + (* Prove remaining clauses from the postcondition *) + ASM_REWRITE_TAC[] THEN + REPEAT CONJ_TAC THENL [ + (** SUBGOAL 1. Outputs **) + ASM_REWRITE_TAC[equiv_output_states;mk_equiv_regs;mk_equiv_bool_regs; + BIGNUM_EXPAND_CONV `bignum_from_memory (ptr,9) state`; + C_ARGUMENTS] THEN + REPEAT (HINT_EXISTS_REFL_TAC THEN ASM_REWRITE_TAC[]); + + (** SUBGOAL 2. Maychange left **) + DISCARD_ASSUMPTIONS_TAC (fun th -> free_in `s0':armstate` (concl th)) THEN + MONOTONE_MAYCHANGE_TAC; + + (** SUBGOAL 3. 
Maychange right **) + DISCARD_ASSUMPTIONS_TAC (fun th -> free_in `s0:armstate` (concl th)) THEN + MONOTONE_MAYCHANGE_TAC + ]);; + +extra_word_CONV := _org_extra_word_CONV;; + + + +(****************************************************************************** + The second program equivalence between the core part of intermediate + program and fully optimized program +******************************************************************************) + +let bignum_montsqr_p521_neon_mc = + define_from_elf "bignum_montsqr_p521_neon_mc" + "arm/p521/bignum_montsqr_p521_neon.o";; + +let BIGNUM_MONTSQR_P521_NEON_EXEC = + ARM_MK_EXEC_RULE bignum_montsqr_p521_neon_mc;; + +let bignum_montsqr_p521_neon_core_mc_def, + bignum_montsqr_p521_neon_core_mc, + BIGNUM_MONTSQR_P521_NEON_CORE_EXEC = + mk_sublist_of_mc "bignum_montsqr_p521_neon_core_mc" + bignum_montsqr_p521_neon_mc + (`12`,`LENGTH bignum_montsqr_p521_neon_mc - 28`) + (fst BIGNUM_MONTSQR_P521_NEON_EXEC);; + +let equiv_goal2 = mk_equiv_statement + `ALL (nonoverlapping (z:int64,8 * 9)) + [(word pc,LENGTH bignum_montsqr_p521_interm1_core_mc); + (word pc2,LENGTH bignum_montsqr_p521_neon_core_mc)]` + equiv_input_states + equiv_output_states + bignum_montsqr_p521_interm1_core_mc 0 + `MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; X22; X23; X24] ,, + MAYCHANGE [memory :> bignum(z,9)]` + bignum_montsqr_p521_neon_core_mc 0 + `MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; X22; X23; X24] ,, + MAYCHANGE [memory :> bignum(z,9)]`;; + +(* Line numbers from the fully optimized prog. to the intermediate prog. + The script that prints this map is being privately maintained by aqjune-aws. 
*) + +let inst_map = [ + 14;9;12;13;45;79;17;83;77;76;48;78;49;81;15;86;80;28;63;75;82;29;18;50;89;20;19;85;8;21;16;84;2;22;30;46;203;25;31;56;91;47;55;205;87;23;52;51;32;3;4;33;34;90;88;35;57;54;53;58;202;204;92;59;61;60;6;240;26;24;242;62;36;241;10;37;206;64;27;38;39;40;41;95;244;42;1;245;246;43;11;44;65;94;207;68;66;7;97;67;98;239;248;69;70;96;249;71;243;109;5;72;208;73;112;247;74;99;277;100;93;101;102;110;251;103;114;113;254;104;105;252;115;106;275;122;116;107;278;111;146;108;119;117;118;123;120;124;130;121;282;131;125;127;132;126;128;147;129;138;148;139;133;134;135;140;136;137;187;186;250;143;188;211;285;141;153;256;142;161;288;144;154;281;145;276;151;149;155;150;162;169;152;158;156;163;157;170;159;166;164;160;171;165;167;189;190;177;178;172;173;174;168;175;179;176;280;180;181;192;331;279;193;182;290;183;224;184;185;212;191;214;253;274;213;215;283;216;199;198;209;200;194;218;210;201;195;284;217;335;286;219;222;334;223;225;287;330;221;226;289;220;291;329;227;228;255;328;229;230;332;231;196;333;232;197;259;296;336;233;234;262;338;295;235;260;327;236;258;261;237;339;238;340;263;294;257;337;264;341;265;266;267;297;268;269;270;271;293;342;292;272;273;298;299;300;301;355;302;303;304;308;305;312;344;306;343;307;353;309;316;310;313;311;320;314;348;324;317;318;351;319;347;321;349;346;315;322;345;325;382;383;384;385;386;389;350;388;352;323;326;354;356;368;369;370;371;372;375;357;387;358;393;373;359;360;361;374;362;391;363;377;364;365;366;367;379;376;378;380;381;400;401;402;403;407;404;390;392;405;394;409;395;406;396;397;398;399;415;417;416;418;419;422;408;420;410;411;424;421;412;413;414;448;449;450;432;433;434;451;455;452;453;435;439;436;426;437;423;425;457;427;428;438;429;483;480;430;441;431;440;454;443;442;469;464;444;445;484;477;446;447;456;458;459;476;460;490;487;461;491;462;492;465;467;463;466;468;470;473;471;474;472;478;475;481;479;485;482;488;486;489;493;494;496;495;497;498;499;500;501;510;502;511;512;503;513;521;504;505;514;506;522;516;507;517;508;523;518;509;515
;519;520;524;525];; + +(* (state number, (equation, fresh var)) *) +let state_to_abbrevs: (int * thm) list ref = ref [];; + +let BIGNUM_MONTSQR_P521_CORE_EQUIV2 = time prove( + equiv_goal2, + + REWRITE_TAC[MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI;SOME_FLAGS; + ALLPAIRS;ALL;NONOVERLAPPING_CLAUSES; + fst BIGNUM_MONTSQR_P521_INTERM1_CORE_EXEC; + fst BIGNUM_MONTSQR_P521_NEON_CORE_EXEC] THEN + REPEAT STRIP_TAC THEN + (** Initialize **) + EQUIV_INITIATE_TAC equiv_input_states THEN + REPEAT (FIRST_X_ASSUM BIGNUM_EXPAND_AND_DIGITIZE_TAC) THEN + ASM_PROPAGATE_DIGIT_EQS_FROM_EXPANDED_BIGNUM_TAC THEN + (* necessary to run ldr qs *) + COMBINE_READ_BYTES64_PAIRS_TAC THEN + + (* Left *) + ARM_STEPS'_AND_ABBREV_TAC BIGNUM_MONTSQR_P521_INTERM1_CORE_EXEC + (1--(List.length inst_map)) state_to_abbrevs THEN + + (* Right *) + ARM_STEPS'_AND_REWRITE_TAC BIGNUM_MONTSQR_P521_NEON_CORE_EXEC + (1--(List.length inst_map)) inst_map state_to_abbrevs THEN + + REPEAT_N 2 ENSURES_FINAL_STATE'_TAC THEN + (* Prove remaining clauses from the postcondition *) + ASM_REWRITE_TAC[] THEN + REPEAT CONJ_TAC THENL [ + (** SUBGOAL 1. Outputs **) + ASM_REWRITE_TAC[equiv_output_states;mk_equiv_regs;mk_equiv_bool_regs; + BIGNUM_EXPAND_CONV `bignum_from_memory (ptr,9) state`; + C_ARGUMENTS] THEN + REPEAT (HINT_EXISTS_REFL_TAC THEN ASM_REWRITE_TAC[]); + + (** SUBGOAL 2. Maychange left **) + DISCARD_ASSUMPTIONS_TAC (fun th -> free_in `s0':armstate` (concl th)) THEN + MONOTONE_MAYCHANGE_TAC; + + (** SUBGOAL 3. 
Maychange right **) + DISCARD_ASSUMPTIONS_TAC (fun th -> free_in `s0:armstate` (concl th)) THEN + MONOTONE_MAYCHANGE_TAC + ]);; + + + +(****************************************************************************** + Use transitivity of two program equivalences to prove end-to-end + correctness +******************************************************************************) + +let equiv_goal = mk_equiv_statement + `ALL (nonoverlapping (z:int64,8 * 9)) + [(word pc,LENGTH bignum_montsqr_p521_core_mc); + (word pc2,LENGTH bignum_montsqr_p521_neon_core_mc)]` + equiv_input_states + equiv_output_states + bignum_montsqr_p521_core_mc 0 + `MAYCHANGE [PC; X2; X3; X4; X5; X6; X7; X8; X9; X10; X11; X12; X13; + X14; X15; X16; X17; X19; X20; X21; X22; X23; X24] ,, + MAYCHANGE SOME_FLAGS ,, + MAYCHANGE [memory :> bignum(z,9)]` + bignum_montsqr_p521_neon_core_mc 0 + `MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; X22; X23; X24] ,, + MAYCHANGE [memory :> bignum(z,9)]`;; + +let equiv_output_states_TRANS = prove( + `!s s2 s' + z. equiv_output_states (s,s') z /\ equiv_output_states (s',s2) z + ==> equiv_output_states (s,s2) z`, + MESON_TAC[equiv_output_states]);; + +let BIGNUM_MONTSQR_P521_CORE_EQUIV = time prove(equiv_goal, + + REPEAT STRIP_TAC THEN + SUBGOAL_THEN + `?pc3. 
+ ALL (nonoverlapping (z:int64,8 * 9)) + [(word pc:int64,LENGTH bignum_montsqr_p521_core_mc); + (word pc3:int64,LENGTH bignum_montsqr_p521_interm1_core_mc)] /\ + ALL (nonoverlapping (z:int64,8 * 9)) + [(word pc3:int64,LENGTH bignum_montsqr_p521_interm1_core_mc); + (word pc2:int64,LENGTH bignum_montsqr_p521_neon_core_mc)] /\ + // Input buffers and the intermediate program don't alias + ALL (nonoverlapping + (word pc3:int64, LENGTH bignum_montsqr_p521_interm1_core_mc)) + [x,8 * 9] /\ + 4 divides val (word pc3:int64)` + MP_TAC THENL [ + FIRST_X_ASSUM MP_TAC THEN + ASM_REWRITE_TAC + [ALL;NONOVERLAPPING_CLAUSES; + fst BIGNUM_MONTSQR_P521_INTERM1_CORE_EXEC; + fst BIGNUM_MONTSQR_P521_NEON_CORE_EXEC; + GSYM CONJ_ASSOC] THEN + STRIP_TAC THEN ASM_REWRITE_TAC[] THEN POP_ASSUM_LIST (K ALL_TAC) THEN + FIND_HOLE_TAC; + + ALL_TAC + ] THEN + STRIP_TAC THEN + + ENSURES2_TRANS_TAC BIGNUM_MONTSQR_P521_CORE_EQUIV1 BIGNUM_MONTSQR_P521_CORE_EQUIV2 THEN + + (* break 'ALL nonoverlapping' in assumptions *) + RULE_ASSUM_TAC (REWRITE_RULE[ + ALLPAIRS;ALL; + fst BIGNUM_MONTSQR_P521_CORE_EXEC; + fst BIGNUM_MONTSQR_P521_NEON_CORE_EXEC; + fst BIGNUM_MONTSQR_P521_INTERM1_CORE_EXEC; + NONOVERLAPPING_CLAUSES]) THEN + REPEAT SPLIT_FIRST_CONJ_ASSUM_TAC THEN + + MATCH_MP_TAC ENSURES2_WEAKEN THEN + REWRITE_TAC[] THEN + REPEAT CONJ_TAC THENL [ + REPEAT STRIP_TAC THEN ASM_REWRITE_TAC[] THEN + REWRITE_TAC[TAUT `(p /\ q /\ r) /\ p /\ q /\ r' <=> p /\ q /\ r /\ r'`] THEN + EXISTS_TAC + `write (memory :> bytelist + (word pc3,LENGTH bignum_montsqr_p521_interm1_core_mc)) + bignum_montsqr_p521_interm1_core_mc + (write PC (word pc3) s')` THEN + PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTSQR_P521_INTERM1_CORE_EXEC THENL [ + UNDISCH_TAC `equiv_input_states (s,s') x z` THEN + REWRITE_TAC[equiv_input_states;C_ARGUMENTS;BIGNUM_FROM_MEMORY_BYTES; + fst BIGNUM_MONTSQR_P521_INTERM1_CORE_EXEC] THEN + STRIP_TAC THEN ASM_REWRITE_TAC[] THEN + REPEAT (TRY HINT_EXISTS_REFL_TAC THEN + PROVE_CONJ_OF_EQ_READS_TAC 
BIGNUM_MONTSQR_P521_INTERM1_CORE_EXEC); + + UNDISCH_TAC `equiv_input_states (s,s') x z` THEN + REWRITE_TAC[equiv_input_states;C_ARGUMENTS;BIGNUM_FROM_MEMORY_BYTES; + fst BIGNUM_MONTSQR_P521_INTERM1_CORE_EXEC] THEN + STRIP_TAC THEN ASM_REWRITE_TAC[] THEN + REPEAT (TRY HINT_EXISTS_REFL_TAC THEN + PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTSQR_P521_INTERM1_CORE_EXEC); + ]; + + REPEAT GEN_TAC THEN STRIP_TAC THEN + ASM_REWRITE_TAC[] THEN ASM_MESON_TAC[equiv_output_states_TRANS]; + + SUBSUMED_MAYCHANGE_TAC + ]);; + + + +(****************************************************************************** + Inducing BIGNUM_MONTSQR_P521_NEON_SUBROUTINE_CORRECT + from BIGNUM_MONTSQR_P521_CORE_CORRECT +******************************************************************************) + +(* Prove BIGNUM_MONTSQR_P521_CORE_CORRECT_N first *) + +let event_n_at_pc_goal = mk_eventually_n_at_pc_statement + `nonoverlapping + (word pc:int64, + LENGTH (APPEND bignum_montsqr_p521_core_mc barrier_inst_bytes)) + (z:int64,8 * 9)` + [`z:int64`;`x:int64`] (*pc_mc_ofs*)0 + bignum_montsqr_p521_core_mc (*pc_ofs*)0 + `\s0. C_ARGUMENTS [z;x] s0`;; + +let BIGNUM_MONTSQR_P521_EVENTUALLY_N_AT_PC = time prove(event_n_at_pc_goal, + + REWRITE_TAC[LENGTH_APPEND;fst BIGNUM_MONTSQR_P521_CORE_EXEC;BARRIER_INST_BYTES_LENGTH] THEN + REWRITE_TAC[eventually_n_at_pc;ALL;NONOVERLAPPING_CLAUSES;C_ARGUMENTS] THEN + SUBGOAL_THEN `4 divides (LENGTH bignum_montsqr_p521_core_mc)` + (fun th -> REWRITE_TAC[MATCH_MP aligned_bytes_loaded_append th; + fst BIGNUM_MONTSQR_P521_CORE_EXEC]) THENL [ + REWRITE_TAC[fst BIGNUM_MONTSQR_P521_CORE_EXEC] THEN CONV_TAC NUM_DIVIDES_CONV; + ALL_TAC] THEN + REPEAT GEN_TAC THEN STRIP_TAC THEN + (* now start..! 
*) + X_GEN_TAC `s0:armstate` THEN GEN_TAC THEN STRIP_TAC THEN + (* eventually ==> eventually_n *) + PROVE_EVENTUALLY_IMPLIES_EVENTUALLY_N_TAC BIGNUM_MONTSQR_P521_CORE_EXEC);; + + +let BIGNUM_MONTSQR_P521_CORE_CORRECT_N = + prove_correct_n + BIGNUM_MONTSQR_P521_EXEC + BIGNUM_MONTSQR_P521_CORE_EXEC + BIGNUM_MONTSQR_P521_CORE_CORRECT + BIGNUM_MONTSQR_P521_EVENTUALLY_N_AT_PC;; + +(* This theorem is a copy of BIGNUM_MONTSQR_P521_CORE_CORRECT, but with + - 'pc' replaced with 'pc2' + - bignum_montsqr_p521_core_mc with bignum_montsqr_p521_neon_core_mc + - The MAYCHANGE set replaced with the Neon version's one *) + +let BIGNUM_MONTSQR_P521_NEON_CORE_CORRECT = time prove + (`!z x n pc2. + nonoverlapping (word pc2,LENGTH bignum_montsqr_p521_neon_core_mc) (z,8 * 9) + ==> ensures arm + (\s. aligned_bytes_loaded s (word pc2) bignum_montsqr_p521_neon_core_mc /\ + read PC s = word(pc2) /\ + C_ARGUMENTS [z; x] s /\ + bignum_from_memory (x,9) s = n) + (\s. read PC s = word (pc2 + LENGTH bignum_montsqr_p521_neon_core_mc) /\ + (n < p_521 + ==> bignum_from_memory (z,9) s = + (inverse_mod p_521 (2 EXP 576) * n EXP 2) MOD p_521)) + (MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; X22; X23; X24] ,, + MAYCHANGE [memory :> bignum(z,9)])`, + + REPEAT GEN_TAC THEN + (* Prepare pc for the original program. This is going to be used + for preparing an initial state by 'overwriting' bignum_montsqr_p521_core_mc + at pc. *) + SUBGOAL_THEN + `?pc. 
+ nonoverlapping (word pc, + LENGTH (APPEND bignum_montsqr_p521_core_mc barrier_inst_bytes)) (z:int64,8 * 9) /\ + nonoverlapping (word pc, + LENGTH (APPEND bignum_montsqr_p521_core_mc barrier_inst_bytes)) (x:int64,8 * 9) /\ + 4 divides val (word pc:int64)` MP_TAC THENL [ + REWRITE_TAC[fst BIGNUM_MONTSQR_P521_CORE_EXEC;NONOVERLAPPING_CLAUSES;ALL; + LENGTH_APPEND;BARRIER_INST_BYTES_LENGTH] THEN + FIND_HOLE_TAC; + + (** SUBGOAL 2 **) + ALL_TAC + ] THEN + + REPEAT_N 2 STRIP_TAC THEN + + VCGEN_EQUIV_TAC BIGNUM_MONTSQR_P521_CORE_EQUIV BIGNUM_MONTSQR_P521_CORE_CORRECT_N + [fst BIGNUM_MONTSQR_P521_CORE_EXEC; + fst BIGNUM_MONTSQR_P521_NEON_CORE_EXEC] THEN + + (* unfold definitions that may block tactics *) + RULE_ASSUM_TAC (REWRITE_RULE + [NONOVERLAPPING_CLAUSES;fst BIGNUM_MONTSQR_P521_EXEC; + fst BIGNUM_MONTSQR_P521_NEON_EXEC]) THEN + REWRITE_TAC[C_ARGUMENTS;BIGNUM_FROM_MEMORY_BYTES] THEN + REPEAT CONJ_TAC THENL [ + (** SUBGOAL 1. Precond **) + X_GEN_TAC `s2:armstate` THEN REPEAT STRIP_TAC THEN + SUBGOAL_THEN `4 divides val (word pc2:int64)` ASSUME_TAC THENL + [ FIRST_ASSUM (fun th -> + MP_TAC th THEN REWRITE_TAC[DIVIDES_4_VAL_WORD_64;aligned_bytes_loaded_word] + THEN METIS_TAC[]) THEN NO_TAC; ALL_TAC ] THEN + ASM_REWRITE_TAC[equiv_input_states;C_ARGUMENTS] THEN + EXISTS_TAC + `write (memory :> bytelist + (word pc,LENGTH (APPEND bignum_montsqr_p521_core_mc barrier_inst_bytes))) + (APPEND bignum_montsqr_p521_core_mc barrier_inst_bytes) + (write PC (word pc) s2)` THEN + (* Expand variables appearing in the equiv relation *) + REPEAT CONJ_TAC THEN + TRY (PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTSQR_P521_CORE_EXEC) THEN + (* Now has only one subgoal: the equivalence! *) + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES] THEN + HINT_EXISTS_REFL_TAC THEN PROVE_CONJ_OF_EQ_READS_TAC BIGNUM_MONTSQR_P521_CORE_EXEC; + + (** SUBGOAL 2. Postcond **) + MESON_TAC[equiv_output_states;BIGNUM_FROM_MEMORY_BYTES; + fst BIGNUM_MONTSQR_P521_NEON_CORE_EXEC]; + + (** SUBGOAL 3. 
Frame **) + MESON_TAC[MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI] + ]);; + +let BIGNUM_MONTSQR_P521_NEON_CORRECT = time prove + (`!z x n pc. + nonoverlapping (word pc,LENGTH bignum_montsqr_p521_neon_mc) (z,8 * 9) + ==> ensures arm + (\s. aligned_bytes_loaded s (word pc) bignum_montsqr_p521_neon_mc /\ + read PC s = word(pc + 12) /\ + C_ARGUMENTS [z; x] s /\ + bignum_from_memory (x,9) s = n) + (\s. read PC s = word (pc + 12 + LENGTH bignum_montsqr_p521_neon_core_mc) /\ + (n < p_521 + ==> bignum_from_memory (z,9) s = + (inverse_mod p_521 (2 EXP 576) * n EXP 2) MOD p_521)) + (MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19; X20; X21; X22; X23; X24] ,, + MAYCHANGE [memory :> bignum(z,9)])`, + + ARM_SUB_LIST_OF_MC_TAC + BIGNUM_MONTSQR_P521_NEON_CORE_CORRECT + bignum_montsqr_p521_neon_core_mc_def + [fst BIGNUM_MONTSQR_P521_NEON_CORE_EXEC; + fst BIGNUM_MONTSQR_P521_NEON_EXEC]);; + +let BIGNUM_MONTSQR_P521_NEON_SUBROUTINE_CORRECT = time prove + (`!z x n pc stackpointer returnaddress. + aligned 16 stackpointer /\ + nonoverlapping (z,8 * 9) (word_sub stackpointer (word 48),48) /\ + ALLPAIRS nonoverlapping + [(z,8 * 9); (word_sub stackpointer (word 48),48)] + [(word pc,LENGTH bignum_montsqr_p521_neon_mc); (x,8 * 9)] + ==> ensures arm + (\s. aligned_bytes_loaded s (word pc) bignum_montsqr_p521_neon_mc /\ + read PC s = word pc /\ + read SP s = stackpointer /\ + read X30 s = returnaddress /\ + C_ARGUMENTS [z; x] s /\ + bignum_from_memory (x,9) s = n) + (\s. 
read PC s = returnaddress /\ + (n < p_521 + ==> bignum_from_memory (z,9) s = + (inverse_mod p_521 (2 EXP 576) * n EXP 2) MOD p_521)) + (MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [memory :> bytes(z,8 * 9); + memory :> bytes(word_sub stackpointer (word 48),48)])`, + let th = CONV_RULE (ONCE_DEPTH_CONV NUM_ADD_CONV) + (REWRITE_RULE [fst BIGNUM_MONTSQR_P521_NEON_CORE_EXEC; + fst BIGNUM_MONTSQR_P521_NEON_EXEC] + BIGNUM_MONTSQR_P521_NEON_CORRECT) in + REWRITE_TAC[fst BIGNUM_MONTSQR_P521_NEON_EXEC] THEN + ARM_ADD_RETURN_STACK_TAC + BIGNUM_MONTSQR_P521_NEON_EXEC th + `[X19;X20;X21;X22;X23;X24]` 48);; diff --git a/arm/proofs/bignum_mul_8_16_neon.ml b/arm/proofs/bignum_mul_8_16_neon.ml index d8981507..b613db58 100644 --- a/arm/proofs/bignum_mul_8_16_neon.ml +++ b/arm/proofs/bignum_mul_8_16_neon.ml @@ -607,7 +607,7 @@ let actions = [ ("equal", 135, 457, 171, 493); ];; -let BIGNUM_MUL_8_16_CORE_EQUIV = prove( +let BIGNUM_MUL_8_16_CORE_EQUIV = time prove( equiv_goal, REWRITE_TAC[SOME_FLAGS;ALL;NONOVERLAPPING_CLAUSES; diff --git a/arm/proofs/bignum_sqr_8_16_neon.ml b/arm/proofs/bignum_sqr_8_16_neon.ml index 3f4dadfe..53295018 100644 --- a/arm/proofs/bignum_sqr_8_16_neon.ml +++ b/arm/proofs/bignum_sqr_8_16_neon.ml @@ -605,7 +605,7 @@ extra_word_CONV := [GEN_REWRITE_CONV I [WORD_BITMANIP_SIMP_LEMMAS; WORD_SQR64_HI; WORD_SQR64_LO; WORD_MUL64_LO; WORD_MUL64_HI]] @ (!extra_word_CONV);; -let BIGNUM_SQR_8_16_CORE_EQUIV = prove(equiv_goal, +let BIGNUM_SQR_8_16_CORE_EQUIV = time prove(equiv_goal, REWRITE_TAC[SOME_FLAGS;ALL;NONOVERLAPPING_CLAUSES; fst BIGNUM_SQR_8_16_CORE_EXEC;fst BIGNUM_SQR_8_16_NEON_CORE_EXEC] THEN diff --git a/arm/proofs/equiv.ml b/arm/proofs/equiv.ml index 4e5ca732..2347f8bb 100644 --- a/arm/proofs/equiv.ml +++ b/arm/proofs/equiv.ml @@ -567,9 +567,10 @@ let ARM_ELONGATE_STEPS_TAC:string->tactic = (Printf.sprintf "Coud not find `arm _ %s`" sname);; (* A variant of ARM_STEP_TAC for equivalence checking. 
- If 'update' is Some ref, ref will be stored a conjunction of - equalities over reads of the new state and values. *) -let ARM_STEP'_TAC (mc_length_th,decode_th) subths sname (store_update_to:thm ref option) = + If 'store_update_to' is Some ref, a list of + (`read .. = expr`) will be stored instead of added as assumptions *) +let ARM_STEP'_TAC (mc_length_th,decode_th) subths sname + (store_update_to:thm list ref option) = (*** This does the basic decoding setup ***) ARM_BASIC_STEP'2_TAC decode_th sname THEN @@ -601,20 +602,19 @@ let ARM_STEP'_TAC (mc_length_th,decode_th) subths sname (store_update_to:thm ref let thl = STATE_UPDATE_NEW_RULE th in if thl = [] then ALL_TAC else MP_TAC(end_itlist CONJ thl) THEN - ASSEMBLER_SIMPLIFY_TAC THEN - (* At this point, the LHS of the implication of goal looks like this: - `read X19 s1' = word ((val a' * val a''''') DIV 2 EXP 64) /\ - read PC s1' = word (pc2 + 136) - ==> eventually_n ...` - *) - begin match store_update_to with - | None -> ALL_TAC - | Some r -> DISCH_THEN (fun th -> r := th; MP_TAC th) - end THEN - STRIP_TAC);; + ASSEMBLER_SIMPLIFY_TAC) THEN + + begin match store_update_to with + | None -> STRIP_TAC + | Some r -> DISCH_THEN (fun th -> + r := CONJUNCTS th; + ALL_TAC) + end;; (* A variant of DISCARD_OLDSTATE_TAC which receives a list of state names - to preserve. *) + to preserve, 'ss'. + If clean_old_abbrevs is true, transitively remove assumptions that + were using the removed abbreviations *) let DISCARD_OLDSTATE'_TAC ss (clean_old_abbrevs:bool) = let vs = List.map (fun s -> mk_var(s,`:armstate`)) ss in let rec unbound_statevars_of_read bound_svars tm = @@ -630,6 +630,8 @@ let DISCARD_OLDSTATE'_TAC ss (clean_old_abbrevs:bool) = Comb(Comb(Const("read",_),_),s) -> true | _ -> false in let old_abbrevs: term list ref = ref [] in + (* Erase all 'read c s' equations from assumptions whose s does not + belong to ss. 
*) DISCARD_ASSUMPTIONS_TAC( fun thm -> let us = unbound_statevars_of_read [] (concl thm) in @@ -642,6 +644,7 @@ let DISCARD_OLDSTATE'_TAC ss (clean_old_abbrevs:bool) = else () else (); true))) THEN (if not clean_old_abbrevs then ALL_TAC else + (* Transitively remove assumptions that use variables in old_abbrevs. *) W(fun (_,_) -> MAP_EVERY (fun (old_abbrev_var:term) -> fun (asl,g) -> @@ -683,43 +686,55 @@ let ENSURES_FINAL_STATE'_TAC = NO_TAC])));; -(* Given eqs = (`read c s = e1`, `read c' s' = e2`), - prove e1 = e2 using WORD_RULE, and abbreviate e1 and e2 as a - fresh variable. +(* Given readth,readth2 = (`|- read c s = e1`, `|- read c' s' = e2`), + prove e1 = e2 using WORD_RULE, abbreviate e1 and e2 as a + fresh variable, and assumes them. + For flag reads, which are simply `|- read ...`, just assumes them. *) -let ABBREV_READS_TAC (eqs:term*term):tactic = +let ABBREV_READS_TAC (readth,readth2:thm*thm):tactic = W(fun (asl,g) -> - let eq,eq2 = eqs in - if not (is_eq eq) then ALL_TAC else - (* eq is: `read elem s = e` *) - let lhs,rhs = dest_eq eq in - (* If lhs is PC update, don't abbrevate it *) - if (can (term_match [] `read PC s`) lhs) then ALL_TAC else - (* If rhs is already a variable, don't abbreviate it again. - Don't try to prove the rhs of eq2. 
*) - if is_var rhs then ALL_TAC else - let vname = mk_fresh_temp_name() in - Printf.printf "Abbreviating `%s` (which is `%s`) as \"%s\"..\n" - (string_of_term rhs) (string_of_term lhs) vname; - - let lhs2,rhs2 = dest_eq eq2 in - (if rhs2 = rhs then ALL_TAC else - try - let r = WORD_RULE (mk_eq(rhs2,rhs)) in - Printf.printf "\t- Abbreviating `%s` as \"%s\" as well\n" - (string_of_term rhs2) vname; - RULE_ASSUM_TAC (REWRITE_RULE[r]) - with _ -> - Printf.printf "\t- Error: WORD_RULE could not prove `%s = %s`\n" - (string_of_term rhs2) (string_of_term rhs); - failwith "ABBREV_READS_TAC") THEN - let fresh_var = mk_var (vname,type_of rhs) in - ABBREV_TAC (mk_eq (fresh_var,rhs)));; + let eq,eq2 = concl readth,concl readth2 in + if not (is_eq eq) + then (* the flag reads case *) + MAP_EVERY STRIP_ASSUME_TAC [readth;readth2] + else + (* eq is: `read elem s = e` *) + let lhs,rhs = dest_eq eq in + let lhs2,rhs2 = dest_eq eq2 in + (* If lhs is PC update, don't abbreviate it. Or, if rhs is already a + variable, don't abbreviate it again. Don't try to prove the rhs of + eq2. 
*) + if (can (term_match [] `read PC s`) lhs) || is_var rhs + then MAP_EVERY STRIP_ASSUME_TAC [readth;readth2] + else + let vname = mk_fresh_temp_name() in + Printf.printf "Abbreviating `%s` (which is `%s`) as \"%s\"..\n" + (string_of_term rhs) (string_of_term lhs) vname; + + let readth2 = + (if rhs2 = rhs then readth2 else + try + let r = WORD_RULE (mk_eq(rhs2,rhs)) in + Printf.printf "\t- Abbreviating `%s` as \"%s\" as well\n" + (string_of_term rhs2) vname; + REWRITE_RULE[r] readth2 + with _ -> + Printf.printf "\t- Error: WORD_RULE could not prove `%s = %s`\n" + (string_of_term rhs2) (string_of_term rhs); + failwith "ABBREV_READS_TAC") in + (* Now introduce abbreviated writes, eventually *) + let fresh_var = mk_var (vname,type_of rhs) in + let abbrev_th = prove(mk_exists(fresh_var,mk_eq(rhs,fresh_var)), + EXISTS_TAC rhs THEN REFL_TAC) in + CHOOSE_THEN (fun abbrev_th -> + ASSUME_TAC (REWRITE_RULE[abbrev_th] readth) THEN + ASSUME_TAC (REWRITE_RULE[abbrev_th] readth2) THEN + ASSUME_TAC abbrev_th) abbrev_th);; (* ------------------------------------------------------------------------- *) -(* Tactics and definitions for proving program equivalence. *) +(* Definitions for stating program equivalence. *) (* ------------------------------------------------------------------------- *) (* A recursive function for defining a conjunction of equality clauses *) @@ -737,6 +752,12 @@ let mk_equiv_bool_regs = define ?(a:bool). read reg s1 = a /\ read reg s2 = a /\ mk_equiv_bool_regs regs (s1,s2))`;; +(* ------------------------------------------------------------------------- *) +(* Tactics for proving equivalence of two partially different programs. *) +(* Renamed registers in the input programs should not affect the behavior of *) +(* these tactics. *) +(* ------------------------------------------------------------------------- *) + (* A lock-step simulation. 
This abbreviates the new expression(s) appearing on the new state expression(s) of the right-side program, and checks whether @@ -745,8 +766,8 @@ let mk_equiv_bool_regs = define It forgets abbreviations that were used in the past. *) let ARM_LOCKSTEP_TAC = - let update_eqs_prog1: thm ref = ref (TAUT `T`) in - let update_eqs_prog2: thm ref = ref (TAUT `T`) in + let update_eqs_prog1: thm list ref = ref [] in + let update_eqs_prog2: thm list ref = ref [] in fun execth execth' (snum:int) (snum':int) (stname'_suffix:string) -> let new_stname = "s" ^ (string_of_int snum) in let new_st = mk_var (new_stname,`:armstate`) and @@ -773,21 +794,19 @@ let ARM_LOCKSTEP_TAC = (* 3. Abbreviate expressions that appear in the new state expressions created from step 2. *) W (fun (asl,g) -> - let update_eqs_prog1_list = CONJUNCTS !update_eqs_prog1 in - let update_eqs_prog2_list = CONJUNCTS !update_eqs_prog2 in + let update_eqs_prog1_list = !update_eqs_prog1 in + let update_eqs_prog2_list = !update_eqs_prog2 in if List.length update_eqs_prog1_list <> List.length update_eqs_prog2_list then (Printf.printf "Updated components mismatch:\n"; Printf.printf "\tprog1: "; - print_qterm (concl !update_eqs_prog1); + List.iter (fun th -> print_qterm (concl th)) update_eqs_prog1_list; Printf.printf "\n\tprog2: "; - print_qterm (concl !update_eqs_prog2); + List.iter (fun th -> print_qterm (concl th)) update_eqs_prog2_list; failwith "ARM_LOCKSTEP_TAC") else - let eqs = zip - (map concl update_eqs_prog1_list) - (map concl update_eqs_prog2_list) in + let eqs = zip update_eqs_prog1_list update_eqs_prog2_list in MAP_EVERY (fun (eq1,eq2) -> ABBREV_READS_TAC (eq1,eq2)) eqs);; @@ -840,8 +859,13 @@ let ARM_STUTTER_RIGHT_TAC exec_th (snames:int list) (st_suffix:string): tactic = MATCH_MP_TAC EVENTUALLY_N_SWAP THEN CLARIFY_TAC);; -(* Tactics that simulate two partially different programs. - Instructions are considered equivalent if they are alpha-equivalent. 
*) +(* EQUIV_STEPS_TAC simulates two partially different programs and makes + abbreviations of the new symbolic expressions after each step. + Instructions are considered equivalent if they are alpha-equivalent. + It takes a list of 'action's that describe how the symbolic execution + engine must be run. Each action is consumed by EQUIV_STEP_TAC and + a proper tactic is taken. +*) let EQUIV_STEP_TAC action execth1 execth2: tactic = match action with @@ -872,32 +896,42 @@ let EQUIV_STEPS_TAC actions execth1 execth2: tactic = (fun action -> EQUIV_STEP_TAC action execth1 execth2) actions;; +(* ------------------------------------------------------------------------- *) +(* Tactics for proving equivalence of two programs that have reordered *) +(* instructions. *) +(* ------------------------------------------------------------------------- *) -(* Given eq = (`read c s = rhs`), abbreviate rhs as a fresh variable. - and push this at append_to. +(* Given eqth = (`|- read c s = rhs`), abbreviate rhs as a fresh variable. + assume the abbreviated eqth, and add the abbreviation `rhs = fresh_var` + to append_to. append_to is a list of `rhs = fresh_var` equalities. - The abbreviated formula `rhs = x_fresh` is not added as assumption, - unlike ABBREV_TAC. + The abbreviated formula `rhs = x_fresh` is not added as assumption. 
*) -let ABBREV_READ_TAC (eq:term) (append_to:thm list ref):tactic = +let ABBREV_READ_TAC (eqth:thm) (append_to:thm list ref):tactic = W(fun (asl,g) -> + let eq = concl eqth in if not (is_eq eq) then (Printf.printf "ABBREV_READ_TAC: not equality, passing..: `%s`\n" (string_of_term eq); - ALL_TAC) else + ASSUME_TAC eqth) else (* eq is: `read elem s = e` *) let lhs,rhs = dest_eq eq in (* If lhs is PC update, don't abbrevate it *) - if (can (term_match [] `read PC s`) lhs) then ALL_TAC else - let vname = mk_fresh_temp_name() in - Printf.printf "Abbreviating `%s` (which is `%s`) as \"%s\"..\n" - (string_of_term rhs) (string_of_term lhs) vname; - - let fresh_var = mk_var (vname,type_of rhs) in - ABBREV_TAC (mk_eq (fresh_var,rhs)) THEN - (fun (asl,g) -> - append_to := (snd (List.hd asl))::!append_to; - ALL_TAC (List.tl asl,g)));; + if (can (term_match [] `read PC s`) lhs) + then ASSUME_TAC eqth + else + let vname = mk_fresh_temp_name() in + Printf.printf "Abbreviating `%s` (which is `%s`) as \"%s\"..\n" + (string_of_term rhs) (string_of_term lhs) vname; + + let fresh_var = mk_var (vname,type_of rhs) in + let abbrev_th = prove(mk_exists(fresh_var,mk_eq(rhs,fresh_var)), + EXISTS_TAC rhs THEN REFL_TAC) in + CHOOSE_THEN (fun abbrev_th -> + ASSUME_TAC (REWRITE_RULE[abbrev_th] eqth) THEN + (fun (asl,g) -> + append_to := abbrev_th::!append_to; + ALL_TAC(asl,g))) abbrev_th);; (* Simulate an instruction of the left program and assign fresh variables to the RHSes of new state equations (`read c s = RHS`). 
@@ -907,7 +941,7 @@ let ABBREV_READ_TAC (eq:term) (append_to:thm list ref):tactic = *) let ARM_STEP'_AND_ABBREV_TAC = - let update_eqs_prog: thm ref = ref (TAUT `T`) in + let update_eqs_prog: thm list ref = ref [] in fun execth (new_stname) (store_to:thm list ref) -> (* Stash the right program's state equations first *) (fun (asl,g) -> @@ -919,9 +953,9 @@ let ARM_STEP'_AND_ABBREV_TAC = RECOVER_ASMS_OF_READ_STATES THEN (* Abbreviate RHSes of the new state equations *) W (fun (asl,g) -> - let update_eqs_prog_list = CONJUNCTS !update_eqs_prog in + let update_eqs_prog_list = !update_eqs_prog in MAP_EVERY - (fun th -> ABBREV_READ_TAC (concl th) store_to) + (fun th -> ABBREV_READ_TAC th store_to) update_eqs_prog_list);; (* store_to is a reference to list of state numbers and abbreviations. @@ -931,6 +965,14 @@ let ARM_STEP'_AND_ABBREV_TAC = let ARM_STEPS'_AND_ABBREV_TAC execth (snums:int list) (store_to: (int * thm) list ref):tactic = W (fun (asl,g) -> store_to := []; ALL_TAC) THEN + (* Stash the right program's state equations first *) + (fun (asl,g) -> + let pat = term_match [] + `eventually_n arm n0 (\s'. eventually_n arm n1 P s0) s1` in + let _,assigns,_ = pat g in + let cur_stname = name_of + (fst (List.find (fun a,b -> b=`s0:armstate`) assigns)) in + STASH_ASMS_OF_READ_STATES [cur_stname] (asl,g)) THEN MAP_EVERY (fun n -> let stname = "s" ^ (string_of_int n) in @@ -945,7 +987,8 @@ let ARM_STEPS'_AND_ABBREV_TAC execth (snums:int list) (List.length !store_to_n) (List.length !store_to); ALL_TAC (asl,g)) THEN CLARIFY_TAC) - snums;; + snums THEN + RECOVER_ASMS_OF_READ_STATES;; let get_read_component (eq:term): term = let lhs = fst (dest_eq eq) in @@ -966,7 +1009,7 @@ let ARM_STEPS'_AND_REWRITE_TAC execth (snums:int list) (inst_map: int list) MAP_EVERY (fun n -> let stname = "s'" ^ (string_of_int n) in - let new_state_eq = ref (REFL `T`) in + let new_state_eq = ref [] in W (fun (asl,g) -> let _ = Printf.printf "Stepping to state %s.. 
(has %d remaining abbrevs)\n" stname (List.length !abbrevs_cpy) in @@ -979,23 +1022,30 @@ let ARM_STEPS'_AND_REWRITE_TAC execth (snums:int list) (inst_map: int list) let n_at_lprog = List.nth inst_map (n-1) in let abbrevs_for_st_n, leftover = List.partition (fun (n',t)->n'=n_at_lprog) !abbrevs_cpy in let _ = abbrevs_cpy := leftover in - let new_state_eqs = CONJUNCTS !new_state_eq in - (* filter out read PC *) - let new_state_eqs = List.filter - (fun th -> not (can (term_match [] `read PC s`) (fst (dest_eq (concl th))))) + (* new_state_eqs is the updated state components of the 'right' program + instruction. *) + let new_state_eqs = !new_state_eq in + (* Reading flags may not have 'read flag s = ..' form, but just + 'read flag s' or '~(read flag s)'. They don't need to be rewritten. + Also, 'read PC' should not be rewritten as well. Collect them + separately. *) + let new_state_eqs_norewrite,new_state_eqs = + List.partition + (fun th -> not (is_eq (concl th)) || + (can (term_match [] `read PC s`) (fst (dest_eq (concl th))))) new_state_eqs in if List.length abbrevs_for_st_n = List.length new_state_eqs then - (* `read c sn = rhs` <=> `read c sn = abbrev` *) - let rewrite_rules = List.filter_map + (* For each `read c sn = rhs`, replace rhs with abbrev *) + let new_state_eqs = List.filter_map (fun new_state_eq -> let rhs = snd (dest_eq (concl new_state_eq)) in + (* Find 'rhs = abbrev' from the left program's updates. 
*) match List.find_opt (fun (_,th') -> fst (dest_eq (concl th')) = rhs) abbrevs_for_st_n with | Some (_,rhs_to_abbrev) -> (try - let th' = ISPEC rhs EQ_REFL in - Some (GEN_REWRITE_RULE RAND_CONV [rhs_to_abbrev] th') + Some (GEN_REWRITE_RULE RAND_CONV [rhs_to_abbrev] new_state_eq) with _ -> (Printf.printf "Failed to proceed.\n"; Printf.printf "- rhs: `%s`\n" (string_of_term rhs); @@ -1004,7 +1054,11 @@ let ARM_STEPS'_AND_REWRITE_TAC execth (snums:int list) (inst_map: int list) | None -> (* This case happens when new_state_eq already has abbreviated RHS *) None) new_state_eqs in - RULE_ASSUM_TAC(REWRITE_RULE rewrite_rules) (asl,g) + (if !arm_print_log then begin + Printf.printf " updated new_state_eqs:\n"; + List.iter (fun t -> Printf.printf " %s\n" (string_of_thm t)) new_state_eqs + end); + MAP_EVERY ASSUME_TAC (new_state_eqs_norewrite @ new_state_eqs) (asl,g) else (Printf.printf "State number %d: length mismatch: %d <> %d\n" n (List.length new_state_eqs) (List.length abbrevs_for_st_n); @@ -1014,8 +1068,13 @@ let ARM_STEPS'_AND_REWRITE_TAC execth (snums:int list) (inst_map: int list) List.iter (fun (_,t) -> Printf.printf " %s\n" (string_of_term (concl t))) abbrevs_for_st_n; failwith "ARM_STEPS'_AND_REWRITE_TAC")) THEN CLARIFY_TAC) snums THEN - RECOVER_ASMS_OF_READ_STATES;; + RECOVER_ASMS_OF_READ_STATES THEN + CLARIFY_TAC;; +(* ------------------------------------------------------------------------- *) +(* Tactics that do not perform symbolic execution but are necessary to *) +(* initiate/finalize program equivalence proofs. *) +(* ------------------------------------------------------------------------- *) (* An ad-hoc tactic for proving a goal `read c1 s = .. /\ read c2 s = .. /\ ...`. This also accepts @@ -1023,7 +1082,7 @@ let ARM_STEPS'_AND_REWRITE_TAC execth (snums:int list) (inst_map: int list) Clauses which cannot not be proven with this tactic will remain as a goal. 
*) let PROVE_CONJ_OF_EQ_READS_TAC execth = REPEAT CONJ_TAC THEN - TRY ( + let main_tac = (* for register updates *) (REPEAT COMPONENT_READ_OVER_WRITE_LHS_TAC THEN REFL_TAC) ORELSE (* for register updates, with rhses abbreviated *) @@ -1040,9 +1099,8 @@ let PROVE_CONJ_OF_EQ_READS_TAC execth = (MATCH_MP_TAC READ_OVER_WRITE_MEMORY_APPEND_BYTELIST ORELSE MATCH_MP_TAC READ_OVER_WRITE_MEMORY_BYTELIST) THEN REWRITE_TAC[LENGTH_APPEND;fst execth;BARRIER_INST_BYTES_LENGTH] THEN - ARITH_TAC));; - - + ARITH_TAC) in + TRY (main_tac ORELSE (MATCH_MP_TAC EQ_SYM THEN main_tac));; (* Prove goals like `?pc. nonoverlapping_modulo (2 EXP 64) (pc,36) (val addr_out,32) /\ @@ -1152,7 +1210,8 @@ let FIND_HOLE_TAC: tactic = (* ------------------------------------------------------------------------- *) -(* Functions that convert a specification theorem into a different form. *) +(* Functions that convert a specification term or theorem into a different *) +(* form. *) (* ------------------------------------------------------------------------- *) let to_ensures_n (ensures_form:term) (numsteps_fn:term): term = @@ -1202,11 +1261,13 @@ let prove_correct_barrier_appended (correct_th:thm) core_exec_th: thm = MP_TAC (SPEC_ALL correct_th) THEN (* Prove antedecent of correct_th *) ANTS_TAC THENL [ - POP_ASSUM MP_TAC THEN + REPEAT (POP_ASSUM MP_TAC) THEN REWRITE_TAC[ALL;NONOVERLAPPING_CLAUSES;LENGTH_APPEND; BARRIER_INST_BYTES_LENGTH] THEN - STRIP_TAC THEN ASM_REWRITE_TAC[] THEN - (NONOVERLAPPING_TAC ORELSE (PRINT_GOAL_TAC THEN NO_TAC)); + REPEAT STRIP_TAC THEN ASM_REWRITE_TAC[] THEN + (NONOVERLAPPING_TAC ORELSE + (PRINT_TAC "prove_correct_barrier_appended failed" THEN + PRINT_GOAL_TAC THEN NO_TAC)); ALL_TAC ] THEN @@ -1271,7 +1332,8 @@ let prove_correct_n execth core_execth (correct_th:thm) CONV_TAC ( REWRITE_CONV[fst execth;fst core_execth;LENGTH_APPEND;BARRIER_INST_BYTES_LENGTH] THENC ONCE_DEPTH_CONV NUM_REDUCE_CONV) THEN - (ASM_MESON_TAC[eventually_form] ORELSE + 
(ASM_MESON_TAC[ALL;NONOVERLAPPING_CLAUSES;NONOVERLAPPING_MODULO_SYM; + eventually_form] ORELSE (PRINT_TAC ("ASM_MESON could not prove this goal. eventually_form: `" ^ (string_of_thm eventually_form) ^ "`") THEN PRINT_GOAL_TAC THEN NO_TAC)));; @@ -1508,7 +1570,9 @@ let PROVE_EVENTUALLY_IMPLIES_EVENTUALLY_N_TAC execth = REPEAT_I_N 0 n (fun i -> EVENTUALLY_TAKE_STEP_RIGHT_FORALL_TAC execth `s0:armstate` 0 i n THEN - DISCARD_OLDSTATE_TAC ("s" ^ (if i = (n-1) then "_final" else string_of_int (i+1)))) THEN + DISCARD_OLDSTATE_TAC + ("s" ^ (if i = (n-1) then "_final" else string_of_int (i+1))) THEN + CLARIFY_TAC) THEN (* match last step: utilize the barrier instruction *) ONCE_REWRITE_TAC[eventually_CASES] THEN ASM_REWRITE_TAC[] THEN @@ -1542,7 +1606,7 @@ let PROVE_EVENTUALLY_IMPLIES_EVENTUALLY_N_TAC execth = FIRST_X_ASSUM (fun th -> let res = MATCH_MP (ARITH_RULE`!x. 1+x<n ==> x<(n-1)`) th in ASSUME_TAC (CONV_RULE (ONCE_DEPTH_CONV NUM_REDUCE_CONV) res)) - ]) THEN + ] THEN CLARIFY_TAC) THEN ASM_ARITH_TAC (* last is: 'n < 0' *) ];; @@ -1574,6 +1638,23 @@ let mk_eventually_n_at_pc_statement list_mk_forall (`pc:num`::quants, (mk_imp (assum,body)));; +(* mk_equiv_statement creates a term + `!pc pc2 . + assum ==> ensures2 arm + (\(s,s2). aligned_bytes_loaded s (word (pc+pc_ofs1)) mc1 /\ + read PC s = word (pc+pc_ofs1) /\ + aligned_bytes_loaded s2 (word (pc2+pc_ofs2)) mc2 /\ + read PC s2 = word (pc2+pc_ofs2) /\ + equiv_in (s,s2)) + (\(s,s2). aligned_bytes_loaded s (word (pc+pc_ofs1)) mc1 /\ + read PC s = word (pc+(pc_ofs1+)) /\ + aligned_bytes_loaded s2 (word (pc2+pc_ofs2)) mc2 /\ + read PC s2 = word (pc2+(pc_ofs2+)) /\ + equiv_out (s,s2)) + (\(s,s2) (s',s2'). maychange1 s s' /\ maychange2 s2 s2') + (\s. ) + (\s. 
)` +*) let mk_equiv_statement (assum:term) (equiv_in:thm) (equiv_out:thm) (mc1:thm) (pc_ofs1:int) (maychange1:term) (mc2:thm) (pc_ofs2:int) (maychange2:term):term = @@ -1683,30 +1764,37 @@ let VCGEN_EQUIV_TAC equiv_th correct_n_th mc_length_ths = let conj_ensures_n_equiv = CONJ ensures_n_part equiv_part in MATCH_MP (TAUT`((P==>Q)/\(R==>S)) ==> ((P/\R)==>(Q/\S))`) conj_ensures_n_equiv) THEN - (* Prove the nonoverlapping assumptions here: - (ASSUM ==> ensures_n) ==> ensures_n *) + (* Try to prove the assumptions of equiv_th and + correct_n_th, which are in ASSUM of + (ASSUM ==> ensures_n) ==> ensures_n. + If it could not be proven, it will be left as a subgoal of this tactic. *) W (fun (asl,g) -> if !arm_print_log then PRINT_GOAL_TAC else ALL_TAC) THEN + + let maintac = + (* Conjunction of ensures2 and ensures_n *) + DISCH_THEN (fun th -> LABEL_TAC "H" + (REWRITE_RULE[] (MATCH_MP ENSURES_N_ENSURES2_CONJ th))) THEN + (* .. and apply H as a precondition of ENSURES2_ENSURES_N *) + REMOVE_THEN "H" (fun th -> + let th2 = MATCH_MP + (REWRITE_RULE [TAUT `(P/\P2/\P3==>Q) <=> P==>P2==>P3==>Q`] ENSURES2_ENSURES_N) th in + MATCH_MP_TAC (REWRITE_RULE [TAUT`(P==>Q==>R) <=> (P/\Q==>R)`] th2)) THEN + REWRITE_TAC[] in + W (fun (asl,g) -> if is_imp g then let r = ([ALL;NONOVERLAPPING_CLAUSES;LENGTH_APPEND; BARRIER_INST_BYTES_LENGTH] @ mc_length_ths) in SUBGOAL_THEN (fst (dest_imp (fst (dest_imp g)))) (fun th -> REWRITE_TAC[th]) THENL [ - REWRITE_TAC r THEN RULE_ASSUM_TAC(REWRITE_RULE r) THEN + (REWRITE_TAC r THEN RULE_ASSUM_TAC(REWRITE_RULE r) THEN REPEAT SPLIT_FIRST_CONJ_ASSUM_TAC THEN - REPEAT CONJ_TAC THEN NONOVERLAPPING_TAC; - ALL_TAC - ] - else ALL_TAC) THEN + ASM_REWRITE_TAC[] THEN + REPEAT CONJ_TAC THEN NONOVERLAPPING_TAC) ORELSE + ALL_TAC (* Leave this as a subgoal *); - (* Conjunction of ensures2 and ensures_n *) - DISCH_THEN (fun th -> LABEL_TAC "H" - (REWRITE_RULE[] (MATCH_MP ENSURES_N_ENSURES2_CONJ th))) THEN - (* .. 
and apply H as a precondition of ENSURES2_ENSURES_N *) - REMOVE_THEN "H" (fun th -> - let th2 = MATCH_MP - (REWRITE_RULE [TAUT `(P/\P2/\P3==>Q) <=> P==>P2==>P3==>Q`] ENSURES2_ENSURES_N) th in - MATCH_MP_TAC (REWRITE_RULE [TAUT`(P==>Q==>R) <=> (P/\Q==>R)`] th2)) THEN - REWRITE_TAC[];; + maintac + ] + else maintac);; diff --git a/arm/proofs/neon_helper.ml b/arm/proofs/neon_helper.ml index ed4b4593..679c1077 100644 --- a/arm/proofs/neon_helper.ml +++ b/arm/proofs/neon_helper.ml @@ -376,8 +376,8 @@ let WORD_MUL64_HI = prove(`!(x: (64)word) (y: (64)word). AP_THM_TAC THEN AP_TERM_TAC THEN ARITH_TAC);; -(* Low 64-bits of 64x64->128-bit squaring (version 2) *) -let WORD_SQR64_LO2 = prove( +(* Four 64-bit words of 128x128->256-bit squaring *) +let WORD_SQR128_DIGIT0 = prove( `!(x:(64)word). word_add (word_mul @@ -387,10 +387,25 @@ let WORD_SQR64_LO2 = prove( (word_mul (word_zx (word_subword x (0,32):(32)word):(64)word) (word_zx (word_subword x (32,32):(32)word):(64)word)) 33) = + word (0 + val x * val x) /\ + word_add + (word_mul + (word_zx (word_subword x (0,32):(32)word):(64)word) + (word_zx (word_subword x (0,32):(32)word):(64)word)) + (word_shl + (word_mul (word_zx (word_subword x (32,32):(32)word):(64)word) + (word_zx (word_subword x (0,32):(32)word):(64)word)) + 33) = word (0 + val x * val x)`, REWRITE_TAC[GSYM WORD_MUL64_LO] THEN - GEN_TAC THEN AP_TERM_TAC THEN + GEN_TAC THEN + MATCH_MP_TAC (TAUT `(P /\ (P ==> Q)) ==> P /\ Q`) THEN + CONJ_TAC THENL [ + ALL_TAC; + CONV_TAC WORD_RULE + ] THEN + AP_TERM_TAC THEN REWRITE_TAC[ARITH_RULE`33=1+32`;GSYM WORD_SHL_COMPOSE] THEN REWRITE_TAC[WORD_RULE `word_shl x 1 = word_add x x`] THEN REWRITE_TAC[WORD_BLAST `!(x:(64)word) y. @@ -404,6 +419,466 @@ let WORD_SQR64_LO2 = prove( AP_THM_TAC THEN AP_TERM_TAC THEN GEN_REWRITE_TAC LAND_CONV [WORD_MUL_SYM] THEN REFL_TAC);; +let WORD_SQR128_LEMMA = prove( + `!(x:int64) (y:int64). 
+ (word_add + (word_add + (word_mul + (word_zx (word_subword x (32,32):int32)) + (word_zx (word_subword x (32,32):int32))) + (word_ushr + (word_mul + (word_zx (word_subword x (32,32):int32)) + (word_zx (word_subword x (0,32):int32))) + 31)) + (word + (bitval + (2 EXP 64 <= + val + (word_mul + (word_zx (word_subword x (0,32):int32)) + (word_zx (word_subword x (0,32):int32)):int64) + + val + (word_shl + (word_mul + (word_zx (word_subword x (32,32):int32):int64) + (word_zx (word_subword x (0,32):int32):int64)) + 33)))):int64) = + word ((val x * val x) DIV 2 EXP 64)`, + REPEAT GEN_TAC THEN + ONCE_REWRITE_TAC[GSYM VAL_EQ] THEN + REWRITE_TAC[VAL_WORD_ADD;VAL_WORD_USHR;VAL_WORD_MUL;VAL_WORD_ZX_GEN; + VAL_WORD_SUBWORD;VAL_WORD;DIMINDEX_64;DIMINDEX_32;VAL_WORD_BITVAL; + VAL_WORD_SHL;ARITH_RULE `x DIV 2 EXP 0 = x`] THEN + CONV_TAC (ONCE_DEPTH_CONV MOD_DOWN_CONV) THEN + CONV_TAC (ONCE_DEPTH_CONV NUM_MIN_CONV) THEN + + ASSUME_TAC (ARITH_RULE `~(2 EXP 64 = 0)`) THEN + ASSUME_TAC (ARITH_RULE `~(2 EXP 32 = 0)`) THEN + + MP_TAC (SPECL [`val (x:int64)`;`2 EXP 32`] DIVISION) THEN + ANTS_TAC THENL [ARITH_TAC; ALL_TAC] THEN STRIP_TAC THEN + ABBREV_TAC `xhi = val (x:int64) DIV 2 EXP 32` THEN + SUBGOAL_THEN `xhi < 2 EXP 32` ASSUME_TAC THENL [ + EXPAND_TAC "xhi" THEN IMP_REWRITE_TAC[RDIV_LT_EQ] THEN + MP_TAC (SPEC `x:int64` VAL_BOUND_64) THEN ARITH_TAC; + ALL_TAC + ] THEN + ABBREV_TAC `xlo = val (x:int64) MOD 2 EXP 32` THEN + ASM_REWRITE_TAC[] THEN + AP_THM_TAC THEN AP_TERM_TAC THEN + SUBGOAL_THEN `xlo * xlo <2 EXP 64 /\ xhi * xlo < 2 EXP 64` MP_TAC THENL [ + REWRITE_TAC [ARITH_RULE`2 EXP 64 = 2 EXP 32 * 2 EXP 32`] THEN + IMP_REWRITE_TAC[LT_MULT2]; + + ALL_TAC + ] THEN + IMP_REWRITE_TAC[SPECL [`temp:num`;`2 EXP 32`] MOD_LT] THEN + DISCH_THEN (fun th -> MAP_EVERY (fun th' -> REWRITE_TAC [MATCH_MP MOD_LT th']) + (CONJUNCTS th)) THEN + + IMP_REWRITE_TAC[BITVAL_LE_DIV] THEN + CONJ_TAC THENL [ + ALL_TAC; + TRANS_TAC LTE_TRANS `2 EXP 32 * 2 EXP 32 + 2 EXP 32 * 2 EXP 32` THEN + CONJ_TAC THENL [ 
+ MATCH_MP_TAC LT_ADD2 THEN REWRITE_TAC[MOD_LT_EQ_LT] THEN + CONJ_TAC THENL [IMP_REWRITE_TAC [LT_MULT2]; ARITH_TAC]; + + ARITH_TAC + ] + ] THEN + + REWRITE_TAC[LEFT_ADD_DISTRIB;RIGHT_ADD_DISTRIB] THEN + REWRITE_TAC[ARITH_RULE`(x*2 EXP 32)*y*2 EXP 32 = (x*y)*2 EXP 64`;GSYM ADD_ASSOC] + THEN + IMP_REWRITE_TAC[DIV_MULT_ADD] THEN + AP_TERM_TAC THEN + + (* Use ADD_DIV_MOD_SIMP2_LEMMA *) + REWRITE_TAC[ARITH_RULE`2 EXP 33 = 2 * 2 EXP 32`] THEN + SUBGOAL_THEN `(xhi*xlo) DIV 2 EXP 31 = ((2 * (2 EXP 32)) * xhi*xlo) DIV 2 EXP 64` SUBST_ALL_TAC THENL [ + REWRITE_TAC[ARITH_RULE`2 EXP 64 = (2 * (2 EXP 32)) * 2 EXP 31`] THEN + IMP_REWRITE_TAC[DIV_MULT2] THEN ARITH_TAC; + ALL_TAC + ] THEN + IMP_REWRITE_TAC[ADD_DIV_MOD_SIMP2_LEMMA] THEN + AP_THM_TAC THEN AP_TERM_TAC THEN ARITH_TAC);; + +let WORD_SQR128_DIGIT1 = prove( + `!(x:int64) (y:int64). + word_add + (word_add + (word_add + (word_mul (word_zx (word_subword x (32,32):(32)word):(64)word) + (word_zx (word_subword x (32,32):(32)word))) + (word_ushr + (word_mul + (word_zx (word_subword x (32,32):(32)word)) + (word_zx (word_subword x (0,32):(32)word))) + 31):(64)word) + (word + (bitval + (2 EXP 64 <= + val + (word_mul (word_zx (word_subword x (0,32):(32)word)) + (word_zx (word_subword x (0,32):(32)word)) + :(64)word) + + val + (word_shl + (word_mul (word_zx (word_subword x (32,32):(32)word)) + (word_zx (word_subword x (0,32):(32)word)) + :(64)word) + 33))))) + (word_shl (word (0 + val x * val y)) 1) + = word_add + (word_add + (word ((val x * val x) DIV 2 EXP 64)) + (word (0 + val x * val y))) + (word (0 + val x * val y))`, + REWRITE_TAC [WORD_RULE `word_add (word_add x y) y = word_add x (word_shl y 1)`] THEN + REPEAT GEN_TAC THEN + AP_THM_TAC THEN AP_TERM_TAC THEN + REWRITE_TAC[WORD_SQR128_LEMMA]);; + +let WORD_SQR128_DIGIT2 = prove( + `!(x:int64) (y:int64). 
+ word_add + (word_add + (word (0 + val y * val y)) + (word_subword + (word_join + (word ((val x * val y) DIV 2 EXP 64):int64) + (word (0 + val x * val y):int64) + :int128) + (63,64):int64)) + (word + (bitval + (2 EXP 64 <= + val + (word_add + (word_add + (word_mul + (word_zx (word_subword x (32,32):int32)) + (word_zx (word_subword x (32,32):int32))) + (word_ushr + (word_mul + (word_zx (word_subword x (32,32):int32)) + (word_zx (word_subword x (0,32):int32))) + 31)) + (word + (bitval + (2 EXP 64 <= + val + (word_mul + (word_zx (word_subword x (0,32):int32)) + (word_zx (word_subword x (0,32):int32)):int64) + + val + (word_shl + (word_mul + (word_zx (word_subword x (32,32):int32):int64) + (word_zx (word_subword x (0,32):int32):int64)) + 33)))):int64 + ) + + val (word_shl (word (0 + val x * val y):int64) 1)))) = + + word_add + (word_add + (word_add + (word_add + (word (0 + val y * val y)) + (word ((val x * val y) DIV 2 EXP 64))) + (word + (bitval + (2 EXP 64 <= + val (word ((val x * val x) DIV 2 EXP 64):int64) + + val (word (0 + val x * val y):int64))))) + (word ((val x * val y) DIV 2 EXP 64))) + (word + (bitval + (2 EXP 64 <= + val + (word_add + (word ((val x * val x) DIV 2 EXP 64)) + (word (0 + val x * val y)) + :int64) + + val (word (0 + val x * val y):int64))))`, + + REWRITE_TAC[WORD_SQR128_LEMMA] THEN + ASSUME_TAC (ARITH_RULE`~(2 EXP 64 = 0)`) THEN + + REPEAT GEN_TAC THEN + ONCE_REWRITE_TAC[GSYM VAL_EQ] THEN + REWRITE_TAC[ADD_CLAUSES;VAL_WORD_ADD;VAL_WORD_USHR;VAL_WORD_MUL;VAL_WORD_ZX_GEN; + VAL_WORD_SUBWORD;VAL_WORD_JOIN;VAL_WORD;DIMINDEX_64;DIMINDEX_32; + DIMINDEX_128;VAL_WORD_BITVAL;VAL_WORD_SHL;ARITH_RULE `x DIV 2 EXP 0 = x`] THEN + IMP_REWRITE_TAC[BITVAL_LE_MOD_MOD_DIV] THEN + CONV_TAC (ONCE_DEPTH_CONV MOD_DOWN_CONV) THEN + CONV_TAC (ONCE_DEPTH_CONV NUM_MIN_CONV) THEN + + REWRITE_TAC[VAL_MUL_DIV_MOD_SIMP;DIVISION_SIMP] THEN + REWRITE_TAC[GSYM ADD_ASSOC] THEN + REWRITE_TAC[DIV_2_EXP_63;ARITH_RULE`2 EXP 1 = 2`] THEN + MATCH_MP_TAC MOD_ADD_MOD_RIGHT THEN + + 
REWRITE_TAC[ADD_MOD_MOD_REFL] THEN + IMP_REWRITE_TAC[ADD_DIV_MOD_SIMP2_LEMMA] THEN + REWRITE_TAC[ADD_ASSOC] THEN + IMP_REWRITE_TAC[ADD_DIV_MOD_SIMP2_LEMMA] THEN + TARGET_REWRITE_TAC [ADD_SYM] ADD_DIV_MOD_SIMP2_LEMMA THEN + ASM_REWRITE_TAC[] THEN + AP_THM_TAC THEN AP_TERM_TAC THEN ARITH_TAC);; + +let WORD_SQR128_DIGIT3 = prove( + `!(x:int64) (y:int64). + word_add + (word_add + (word_add + (word_add + (word_mul (word_zx (word_subword y (32,32):int32):int64) + (word_zx (word_subword y (32,32):int32))) + (word_ushr + (word_mul (word_zx (word_subword y (32,32):int32):int64) + (word_zx (word_subword y (0,32):int32))) + 31)) + (word + (bitval + (2 EXP 64 <= + val + (word_mul (word_zx (word_subword y (0,32):int32):int64) + (word_zx (word_subword y (0,32):int32))) + + val + (word_shl + (word_mul (word_zx (word_subword y (32,32):int32):int64) + (word_zx (word_subword y (0,32):int32))) + 33))))) + (word_ushr (word ((val x * val y) DIV 2 EXP 64):int64) 63)) + (word + (bitval + (2 EXP 64 <= + val (word (0 + val y * val y):int64) + + val + (word_subword + (word_join + (word ((val x * val y) DIV 2 EXP 64):int64) + (word (0 + val x * val y):int64) + :int128) + (63,64):int64) + + bitval + (2 EXP 64 <= + val + (word_add + (word_add + (word_mul (word_zx (word_subword x (32,32):int32):int64) + (word_zx (word_subword x (32,32):int32):int64)) + (word_ushr + (word_mul (word_zx (word_subword x (32,32):int32):int64) + (word_zx (word_subword x (0,32):int32):int64)) + 31)) + (word + (bitval + (2 EXP 64 <= + val + (word_mul (word_zx (word_subword x (0,32):int32):int64) + (word_zx (word_subword x (0,32):int32):int64)) + + val + (word_shl + (word_mul (word_zx (word_subword x (32,32):int32)) + (word_zx (word_subword x (0,32):int32):int64)) + 33))))) + + val (word_shl (word (0 + val x * val y):int64) 1))))) = + + word_add + (word_add + (word ((val y * val y) DIV 2 EXP 64)) + (word + (bitval + (2 EXP 64 <= + val (word (0 + val y * val y):int64) + + val (word ((val x * val y) DIV 2 EXP 64):int64) + 
+ bitval + (2 EXP 64 <= + val (word ((val x * val x) DIV 2 EXP 64):int64) + + val (word (0 + val x * val y):int64)))))) + (word + (bitval + (2 EXP 64 <= + val + (word_add + (word_add (word (0 + val y * val y):int64) + (word ((val x * val y) DIV 2 EXP 64))) + (word + (bitval + (2 EXP 64 <= + val (word ((val x * val x) DIV 2 EXP 64):int64) + + val (word (0 + val x * val y):int64))))) + + val (word ((val x * val y) DIV 2 EXP 64):int64) + + bitval + (2 EXP 64 <= + val + (word_add (word ((val x * val x) DIV 2 EXP 64):int64) + (word (0 + val x * val y))) + + val (word (0 + val x * val y):int64)))))`, + + REWRITE_TAC[WORD_SQR128_LEMMA] THEN + ASSUME_TAC (ARITH_RULE`~(2 EXP 64 = 0)`) THEN + REPEAT GEN_TAC THEN + + ONCE_REWRITE_TAC[GSYM VAL_EQ] THEN + REWRITE_TAC[ADD_CLAUSES;VAL_WORD_ADD;VAL_WORD_USHR;VAL_WORD_MUL;VAL_WORD_ZX_GEN; + VAL_WORD_SUBWORD;VAL_WORD_JOIN;VAL_WORD;DIMINDEX_64;DIMINDEX_32; + DIMINDEX_128;VAL_WORD_BITVAL;VAL_WORD_SHL;ARITH_RULE `x DIV 2 EXP 0 = x`; + ARITH_RULE`2 EXP 1 = 2`;DIV_2_EXP_63] THEN + CONV_TAC (ONCE_DEPTH_CONV MOD_DOWN_CONV) THEN + CONV_TAC (ONCE_DEPTH_CONV NUM_MIN_CONV) THEN + + ASM_SIMP_TAC[VAL_MUL_DIV_MOD_SIMP;DIVISION_SIMP; + ADD_DIV_MOD_SIMP2_LEMMA;ADD_DIV_MOD_SIMP_LEMMA] THEN + IMP_REWRITE_TAC[BITVAL_LE_MOD_MOD_DIV] THEN + CONJ_TAC THENL [ + ALL_TAC; + IMP_REWRITE_TAC [RDIV_LT_EQ;LT_MULT2;VAL_BOUND_64] + ] THEN + + ASM_SIMP_TAC[GSYM ADD_ASSOC;ADD_DIV_MOD_SIMP2_LEMMA] THEN + MATCH_MP_TAC MOD_ADD_MOD_RIGHT THEN + AP_THM_TAC THEN AP_TERM_TAC THEN + + (* High-level idea: for every 'bitval (2 EXP 64 <= X)', 0 <= x < 2 EXP 65. 
*) + ASSUME_TAC (ARITH_RULE`1 < 2 EXP 64`) THEN + IMP_REWRITE_TAC[BITVAL_LE_DIV] THEN + REWRITE_TAC[ARITH_RULE`2 * 2 EXP 64 = 2 EXP 64 + 2 EXP 64`] THEN + REPEAT CONJ_TAC THENL [ + ALL_TAC; + + MATCH_MP_TAC LT_ADD2 THEN + ASM_REWRITE_TAC[MOD_LT_EQ] THEN + IMP_REWRITE_TAC[MULT_ADD_DIV_LT;VAL_BOUND_64] THEN + ASM_SIMP_TAC[LE_LT;MOD_LT_EQ]; + + MATCH_MP_TAC LT_ADD2 THEN + ASM_REWRITE_TAC[MOD_LT_EQ] THEN + IMP_REWRITE_TAC[MULT_ADD_DIV_LT;VAL_BOUND_64] THEN + IMP_REWRITE_TAC[LE_LT;RDIV_LT_EQ] THEN + DISJ1_TAC THEN IMP_REWRITE_TAC[LT_MULT2;VAL_BOUND_64]; + + MATCH_MP_TAC LTE_ADD2 THEN + ASM_REWRITE_TAC[MOD_LT_EQ] THEN + TRANS_TAC LE_TRANS `(2 EXP 64 - 1) + 1` THEN + CONJ_TAC THENL [ + ALL_TAC; ARITH_TAC + ] THEN + MATCH_MP_TAC LE_ADD2 THEN + CONJ_TAC THENL [ + REWRITE_TAC[GSYM LT_SUC_LE] THEN + REWRITE_TAC[ARITH_RULE`SUC (2 EXP 64 - 1) = 2 EXP 64`] THEN + ASM_REWRITE_TAC[MOD_LT_EQ]; + + REWRITE_TAC[GSYM LT_SUC_LE;ARITH_RULE`SUC 1 = 1 + 1`] THEN + IMP_REWRITE_TAC[RDIV_LT_EQ;LEFT_ADD_DISTRIB;MULT_CLAUSES] THEN + MATCH_MP_TAC LT_ADD2 THEN CONJ_TAC THENL [ + IMP_REWRITE_TAC[RDIV_LT_EQ;LT_MULT2;VAL_BOUND_64]; + + IMP_REWRITE_TAC[MOD_LT_EQ] + ] + ] + ] THEN + + SUBGOAL_THEN + `!k. (val (y:int64) * val (y:int64) + k) MOD 2 EXP 64 = + ((val (y:int64) * val (y:int64)) MOD 2 EXP 64 + k) MOD 2 EXP 64` + (LABEL_TAC "H") THENL [ + REWRITE_TAC[ADD_MOD_MOD_REFL]; ALL_TAC + ] THEN + USE_THEN "H" (fun th -> REWRITE_TAC[th]) THEN + TARGET_REWRITE_TAC[ADD_AC] ADD_DIV_MOD_SIMP2_LEMMA THEN + ASM_REWRITE_TAC[] THEN + + TARGET_REWRITE_TAC[ADD_AC] ADD_DIV_MOD_SIMP2_LEMMA THEN + ASM_REWRITE_TAC[] THEN + + SUBGOAL_THEN `!(a:num) (b:num). 
+ (2 * (a * b) DIV 2 EXP 64) DIV 2 EXP 64 = + ((2 * a * b) DIV 2 EXP 64) DIV 2 EXP 64` + (fun th -> REWRITE_TAC[th]) THENL [ + ASSUME_TAC (ARITH_RULE `~(2 EXP 63 = 0)`) THEN + REPEAT STRIP_TAC THEN + ABBREV_TAC `c = a * b` THEN + MP_TAC (SPECL [`c:num`;`2 EXP 64`] DIVISION) THEN + ASM_REWRITE_TAC[] THEN + DISCH_THEN (fun th -> let t1,t2 = CONJ_PAIR th in + LABEL_TAC "HC" t1 THEN LABEL_TAC "HLT" t2) THEN + MAP_EVERY ABBREV_TAC [`ch = c DIV 2 EXP 64`;`cl = c MOD 2 EXP 64`] THEN + + MP_TAC (SPECL [`cl:num`;`2 EXP 63`] DIVISION) THEN + ANTS_TAC THENL [ARITH_TAC;ALL_TAC] THEN + DISCH_THEN (fun th -> let t1,t2 = CONJ_PAIR th in + LABEL_TAC "HCL" t1 THEN LABEL_TAC "HCLLT" t2) THEN + MAP_EVERY ABBREV_TAC [`(clh:num) = cl DIV 2 EXP 63`;`cll = cl MOD 2 EXP 63`] THEN + SUBGOAL_THEN `(clh:num) < 2` ASSUME_TAC THENL [ + EXPAND_TAC "clh" THEN IMP_REWRITE_TAC [RDIV_LT_EQ] THEN + ASM_ARITH_TAC; + ALL_TAC + ] THEN + + USE_THEN "HC" SUBST1_TAC THEN + USE_THEN "HCL" SUBST1_TAC THEN + REWRITE_TAC[LEFT_ADD_DISTRIB] THEN + TARGET_REWRITE_TAC [MULT_AC] DIV_MULT_ADD THEN + ASM_REWRITE_TAC[] THEN + IMP_REWRITE_TAC[ARITH_RULE`2*(x:num)*(2 EXP 63) = x*(2 EXP 64)`;DIV_MULT_ADD] THEN + SUBGOAL_THEN `(2 * cll:num) DIV 2 EXP 64 = 0` SUBST_ALL_TAC THENL [ + IMP_REWRITE_TAC[DIV_LT] THEN ASM_ARITH_TAC; ALL_TAC + ] THEN + REWRITE_TAC[ADD_0] THEN + + (* now either clh is 0 or 1! 
*) + MP_TAC (SPECL [`2 * ch`;`2 EXP 64`] DIVISION) THEN + ANTS_TAC THENL [ARITH_TAC;ALL_TAC] THEN + DISCH_THEN (fun th -> let t1,t2 = CONJ_PAIR th in + LABEL_TAC "HCH" t1 THEN LABEL_TAC "HCHLT" t2) THEN + MAP_EVERY ABBREV_TAC [`chh = (2 * ch) DIV 2 EXP 64`;`chl = (2 * ch) MOD 2 EXP 64`] THEN + USE_THEN "HCH" SUBST1_TAC THEN + TARGET_REWRITE_TAC [ADD_AC] DIV_MULT_ADD THEN + ASM_REWRITE_TAC[] THEN + MATCH_MP_TAC (ARITH_RULE`x=0 ==> a = a+x`) THEN + ASM_SIMP_TAC[DIV_EQ_0] THEN + + SUBGOAL_THEN `EVEN chl` MP_TAC THENL [ + MP_TAC (SPEC `ch:num` EVEN_DOUBLE) THEN + USE_THEN "HCH" SUBST1_TAC THEN + REWRITE_TAC[EVEN_ADD] THEN + REWRITE_TAC[ARITH_RULE`a*2 EXP 64 = 2*(a*2 EXP 63)`] THEN + ONCE_REWRITE_TAC[EVEN_MULT] THEN + REWRITE_TAC[ARITH_RULE`EVEN 2`]; + + ALL_TAC + ] THEN + + REWRITE_TAC[EVEN_EXISTS] THEN STRIP_TAC THEN + FIRST_X_ASSUM SUBST_ALL_TAC THEN + SUBGOAL_THEN `2 * m <= 2 EXP 64 - 2` MP_TAC THENL [ + USE_THEN "HCHLT" MP_TAC THEN REWRITE_TAC[ + ARITH_RULE`2 EXP 64 - 2 = 2 * (2 EXP 63 - 1)`; + ARITH_RULE`2 EXP 64=2*2 EXP 63`] THEN + IMP_REWRITE_TAC[LT_MULT_LCANCEL;LE_MULT_LCANCEL;ARITH_RULE`~(2=0)`] THEN + ARITH_TAC; + + ALL_TAC + ] THEN + ASM_ARITH_TAC; + + ALL_TAC + ] THEN + TARGET_REWRITE_TAC[ADD_AC] ADD_DIV_MOD_SIMP2_LEMMA THEN + ASM_REWRITE_TAC[] THEN + + TARGET_REWRITE_TAC[ADD_AC] ADD_DIV_MOD_SIMP2_LEMMA THEN + ASM_REWRITE_TAC[] THEN + + ARITH_TAC);; + + (* ------------------------------------------------------------------------- *) (* Helpful tactics *) (* ------------------------------------------------------------------------- *) diff --git a/arm/proofs/specifications.txt b/arm/proofs/specifications.txt index a14cd4b4..ebf5c1e9 100644 --- a/arm/proofs/specifications.txt +++ b/arm/proofs/specifications.txt @@ -145,6 +145,7 @@ BIGNUM_MONTMUL_P384_ALT_SUBROUTINE_CORRECT BIGNUM_MONTMUL_P384_NEON_SUBROUTINE_CORRECT BIGNUM_MONTMUL_P384_SUBROUTINE_CORRECT BIGNUM_MONTMUL_P521_ALT_SUBROUTINE_CORRECT +BIGNUM_MONTMUL_P521_NEON_SUBROUTINE_CORRECT 
BIGNUM_MONTMUL_P521_SUBROUTINE_CORRECT BIGNUM_MONTMUL_SM2_ALT_SUBROUTINE_CORRECT BIGNUM_MONTMUL_SM2_SUBROUTINE_CORRECT @@ -159,6 +160,7 @@ BIGNUM_MONTSQR_P384_ALT_SUBROUTINE_CORRECT BIGNUM_MONTSQR_P384_NEON_SUBROUTINE_CORRECT BIGNUM_MONTSQR_P384_SUBROUTINE_CORRECT BIGNUM_MONTSQR_P521_ALT_SUBROUTINE_CORRECT +BIGNUM_MONTSQR_P521_NEON_SUBROUTINE_CORRECT BIGNUM_MONTSQR_P521_SUBROUTINE_CORRECT BIGNUM_MONTSQR_SM2_ALT_SUBROUTINE_CORRECT BIGNUM_MONTSQR_SM2_SUBROUTINE_CORRECT diff --git a/benchmarks/benchmark.c b/benchmarks/benchmark.c index 5b1fda65..610f6778 100644 --- a/benchmarks/benchmark.c +++ b/benchmarks/benchmark.c @@ -798,8 +798,10 @@ void call_bignum_kmul_32_64_neon(void) {} void call_bignum_ksqr_32_64_neon(void) {} void call_bignum_montmul_p256_neon(void) {} void call_bignum_montmul_p384_neon(void) {} +void call_bignum_montmul_p521_neon(void) {} void call_bignum_montsqr_p256_neon(void) {} void call_bignum_montsqr_p384_neon(void) {} +void call_bignum_montsqr_p521_neon(void) {} void call_bignum_mul_8_16_neon(void) {} void call_bignum_sqr_8_16_neon(void) {} @@ -821,8 +823,10 @@ void call_bignum_kmul_32_64_neon(void) repeat(bignum_kmul_32_64_neon(b0,b1,b2,b3 void call_bignum_ksqr_32_64_neon(void) repeat(bignum_ksqr_32_64_neon(b0,b1,b2)) void call_bignum_montmul_p256_neon(void) repeat(bignum_montmul_p256_neon(b0,b1,b2)) void call_bignum_montmul_p384_neon(void) repeat(bignum_montmul_p384_neon(b0,b1,b2)) +void call_bignum_montmul_p521_neon(void) repeat(bignum_montmul_p521_neon(b0,b1,b2)) void call_bignum_montsqr_p256_neon(void) repeat(bignum_montsqr_p256_neon(b0,b1)) void call_bignum_montsqr_p384_neon(void) repeat(bignum_montsqr_p384_neon(b0,b1)) +void call_bignum_montsqr_p521_neon(void) repeat(bignum_montsqr_p521_neon(b0,b1)) void call_bignum_mul_8_16_neon(void) repeat(bignum_mul_8_16_neon(b0,b1,b2)) void call_bignum_sqr_8_16_neon(void) repeat(bignum_sqr_8_16_neon(b0,b1)) @@ -1065,7 +1069,8 @@ int main(int argc, char *argv[]) timingtest(arm,"bignum_montmul_p384_neon", 
call_bignum_montmul_p384_neon); timingtest(bmi,"bignum_montmul_p521",call_bignum_montmul_p521); timingtest(all,"bignum_montmul_p521_alt",call_bignum_montmul_p521_alt); - timingtest(bmi,"bignum_montmul_sm2",call_bignum_montmul_sm2); + timingtest(arm,"bignum_montmul_p521_neon", call_bignum_montmul_p521_neon); + timingtest(bmi,"bignum_montmul_sm2", call_bignum_montmul_sm2); timingtest(all,"bignum_montmul_sm2_alt",call_bignum_montmul_sm2_alt); timingtest(all,"bignum_montredc (32/16 -> 16)",call_bignum_montredc__32_16); timingtest(all,"bignum_montsqr (32 -> 32)" ,call_bignum_montsqr__32); @@ -1079,6 +1084,7 @@ int main(int argc, char *argv[]) timingtest(arm,"bignum_montsqr_p384_neon", call_bignum_montsqr_p384_neon); timingtest(bmi,"bignum_montsqr_p521",call_bignum_montsqr_p521); timingtest(all,"bignum_montsqr_p521_alt",call_bignum_montsqr_p521_alt); + timingtest(arm,"bignum_montsqr_p521_neon", call_bignum_montsqr_p521_neon); timingtest(bmi,"bignum_montsqr_sm2",call_bignum_montsqr_sm2); timingtest(all,"bignum_montsqr_sm2_alt",call_bignum_montsqr_sm2_alt); timingtest(all,"bignum_mul (4x4 -> 8)",call_bignum_mul__4_8); diff --git a/common/misc.ml b/common/misc.ml index ad9b84e4..e707b56d 100644 --- a/common/misc.ml +++ b/common/misc.ml @@ -1442,6 +1442,118 @@ let DIVIDES_MOD2 = prove( ASM_ARITH_TAC ]);; +let BITVAL_LE_DIV = prove( + `!(x:num) (m:num). 
x < 2 * m ==> bitval (m <= x) = x DIV m`, + REWRITE_TAC[bitval] THEN + REPEAT STRIP_TAC THEN + SUBGOAL_THEN `0 < m` MP_TAC THENL [ASM_ARITH_TAC; ALL_TAC] THEN + MP_TAC (SPECL [`x:num`;`m:num`] DIVISION) THEN + ANTS_TAC THENL [ASM_ARITH_TAC; ALL_TAC] THEN + DISCH_THEN (fun th -> let a,b = CONJ_PAIR th in + FIRST_X_ASSUM MP_TAC THEN MP_TAC b THEN + ASSUME_TAC a) THEN + ABBREV_TAC `xa = x DIV m` THEN + ABBREV_TAC `xb = x MOD m` THEN + REPEAT_N 2 (FIRST_X_ASSUM (K ALL_TAC)) THEN + FIRST_X_ASSUM (fun th -> ONCE_REWRITE_TAC [th]) THEN + REPEAT STRIP_TAC THEN + COND_CASES_TAC THENL [ + DISJ_CASES_THEN2 + (fun th -> SUBST_ALL_TAC th THEN ASM_ARITH_TAC) + (fun th -> MP_TAC th) + (SPEC `xa:num` num_CASES) THEN + STRIP_TAC THEN FIRST_X_ASSUM (fun th -> SUBST_ALL_TAC (REWRITE_RULE [ADD1] th)) THEN + MP_TAC (SPEC `n:num` num_CASES) THEN + DISJ_CASES_THEN2 + (fun th -> REWRITE_TAC[th] THEN ARITH_TAC) + (fun th -> MP_TAC th) + (SPEC `n:num` num_CASES) THEN + STRIP_TAC THEN FIRST_X_ASSUM (fun th -> SUBST_ALL_TAC (REWRITE_RULE [ADD1] th)) THEN + ASM_ARITH_TAC; + + DISJ_CASES_THEN2 + (fun th -> SUBST_ALL_TAC th THEN ASM_ARITH_TAC) + (fun th -> MP_TAC th) + (SPEC `xa:num` num_CASES) THEN + STRIP_TAC THEN FIRST_X_ASSUM (fun th -> SUBST_ALL_TAC (REWRITE_RULE [ADD1] th)) THEN + ASM_ARITH_TAC + ]);; + +let BITVAL_LE_MOD_MOD_DIV = prove( + `!(x1:num) (x2:num) (m:num). ~(m=0) ==> + bitval (m <= (x1 MOD m + x2 MOD m)) = (x1 MOD m + x2 MOD m) DIV m /\ + (x1 DIV m < m ==> bitval (m <= (x1 DIV m + x2 MOD m)) = (x1 DIV m + x2 MOD m) DIV m) /\ + (x2 DIV m < m ==> bitval (m <= (x1 MOD m + x2 DIV m)) = (x1 MOD m + x2 DIV m) DIV m)`, + REPEAT STRIP_TAC THEN IMP_REWRITE_TAC[BITVAL_LE_DIV] THEN + REWRITE_TAC[ARITH_RULE`2*m=m+m`] THEN + IMP_REWRITE_TAC[LT_ADD2;MOD_LT_EQ]);; + +let VAL_MUL_DIV_MOD_SIMP = prove + (`!(x:int64) (y:int64). 
+ ((val x * val y) DIV 2 EXP 64) MOD 2 EXP 64 = (val x * val y) DIV 2 EXP 64 /\ + (val x * val y) MOD 2 EXP 128 = val x * val y`, + REPEAT GEN_TAC THEN + SUBGOAL_THEN `(val (x:int64) * val (y:int64) < 2 EXP 64 * 2 EXP 64)` ASSUME_TAC THENL [ + IMP_REWRITE_TAC[LT_MULT2;VAL_BOUND_64]; ALL_TAC + ] THEN + IMP_REWRITE_TAC[MOD_LT] THEN + IMP_REWRITE_TAC[RDIV_LT_EQ] THEN + ASM_ARITH_TAC);; + +let MOD_ADD_MOD_RIGHT = prove + (`!(a:num) (b:num) (c:num). a MOD d = b MOD d ==> (c + a) MOD d = (c + b) MOD d`, + REPEAT GEN_TAC THEN + SUBST1_TAC (GSYM (fst (CONJ_PAIR (SPECL [`c:num`;`a:num`;`d:num`] ADD_MOD_MOD_REFL)))) THEN + SUBST1_TAC (GSYM (fst (CONJ_PAIR (SPECL [`c:num`;`b:num`;`d:num`] ADD_MOD_MOD_REFL)))) THEN + MESON_TAC[]);; + +let DIV_2_EXP_63 = prove( + `!(x:num). x DIV 2 EXP 63 = (2 * x) DIV 2 EXP 64`, + SIMP_TAC[ARITH_RULE`2 EXP 64 = 2*2 EXP 63`;DIV_MULT2;ARITH_RULE`~(2=0)`]);; + +let LT_SUB_LT = prove(`!(a:num) (b:num). 0 < b /\ b < a ==> a - b < a`, + ASM_ARITH_TAC);; + +let MULT_ADD_DIV_LT = prove( + `!(a:num) (b:num) (c:num) (m:num). 
+ 1 < m /\ a < m /\ b < m /\ c <= m ==> (a * b + c) DIV m < m`, + REPEAT STRIP_TAC THEN + SUBGOAL_THEN `(m:num) <= m * m - (m - 1)` ASSUME_TAC THENL [ + TRANS_TAC LE_TRANS `(m:num) * m - m` THEN CONJ_TAC THENL [ + TARGET_REWRITE_TAC [ARITH_RULE`(x:num)=x*1`] (GSYM LEFT_SUB_DISTRIB) THEN + SUBGOAL_THEN `1 <= (m:num)-1` ASSUME_TAC THENL [ ASM_ARITH_TAC; ALL_TAC] THEN + TARGET_REWRITE_TAC [ARITH_RULE`(x:num)=x*1`] LE_MULT_LCANCEL THEN + ASM_ARITH_TAC; + + ASM_ARITH_TAC + ]; + ALL_TAC + ] THEN + SUBGOAL_THEN `(a:num) * b + c < m * m` ASSUME_TAC THENL [ + TRANS_TAC LET_TRANS `(m - 1) * (m - 1) + m` THEN + CONJ_TAC THENL [ + MATCH_MP_TAC LE_ADD2 THEN + ASM_REWRITE_TAC[] THEN + MATCH_MP_TAC LE_MULT2 THEN + ASM_ARITH_TAC; + + REWRITE_TAC[LEFT_SUB_DISTRIB;RIGHT_SUB_DISTRIB] THEN + REWRITE_TAC[MULT_CLAUSES] THEN + ONCE_REWRITE_TAC[ARITH_RULE`(x:num)-a-b=x-b-a`] THEN + IMP_REWRITE_TAC[SUB_ADD] THEN + MATCH_MP_TAC LT_SUB_LT THEN + CONJ_TAC THENL [ASM_ARITH_TAC; + TRANS_TAC LET_TRANS `(m - 1) * 1` THEN + CONJ_TAC THENL [REWRITE_TAC[MULT_CLAUSES; LE_REFL]; ALL_TAC] THEN + MATCH_MP_TAC LT_MULT2 THEN ASM_ARITH_TAC]; + ]; + + ALL_TAC + ] THEN + + IMP_REWRITE_TAC[RDIV_LT_EQ] THEN + ASM_ARITH_TAC);; + (* ------------------------------------------------------------------------- *) (* Tactics for using labeled assumtions *) (* ------------------------------------------------------------------------- *) diff --git a/common/relational2.ml b/common/relational2.ml index 92609df2..6e5ecbcb 100644 --- a/common/relational2.ml +++ b/common/relational2.ml @@ -403,6 +403,14 @@ let EVENTUALLY_N_SWAP = prove( REPEAT GEN_TAC THEN REWRITE_TAC[eventually_n] THEN REPEAT STRIP_TAC THEN ASM_MESON_TAC[STEPS_NOSTUCK]);; +let EVENTUALLY_N_NESTED = prove( + `!(step:S->S->bool) (s0:S). + eventually_n step n (\s. eventually_n step n (\s2. P s s2) s0) s0 ==> + eventually_n step n (\s. 
P s s) s0`, + REWRITE_TAC[eventually_n] THEN + REPEAT STRIP_TAC THEN + ASM_MESON_TAC[]);; + let EVENTUALLY_N_COMPOSE = prove( `!(step:S->S->bool) (P:S->S->bool) (Q:S->S->bool) (R:S->S->bool) n1 m1 n2 m2 sa0 sb0. @@ -489,7 +497,7 @@ let EVENTUALLY_N_MONO = `!(step:S->S->bool) (P:S->bool) (Q:S->bool) n s. (!s'. P s' ==> Q s') ==> eventually_n step n P s ==> eventually_n step n Q s`, - REWRITE_TAC[eventually_n] THEN MESON_TAC[]) + REWRITE_TAC[eventually_n] THEN MESON_TAC[]);; let EVENTUALLY_N_EVENTUALLY = prove( @@ -583,6 +591,14 @@ let NESTED_EVENTUALLY_N_IS_PROD_OF_STEPS = prove( (* Properties of ensures_n, ensures2_n. *) (* ------------------------------------------------------------------------- *) +let SEQ_PAIR_SPLIT = prove( + `!(P:A->A->bool) (Q:A->A->bool) (R:A->A->bool) (S:A->A->bool) p1 p2 p1' p2'. + ((\(s,s2) (s',s2'). P s s' /\ Q s2 s2') ,, (\(s,s2) (s',s2'). R s s' /\ S s2 s2')) + (p1,p2) (p1',p2') + <=> + ((P ,, R) p1 p1' /\ (Q ,, S) p2 p2')`, + REWRITE_TAC[seq;EXISTS_PAIR_THM] THEN MESON_TAC[]);; + let ENSURES_N_ENSURES = prove( `!(step:S->S->bool) P Q C f_n. ensures_n step P Q C f_n ==> ensures step P Q C`, @@ -645,13 +661,13 @@ let ENSURES_N_ENSURES2_CONJ = prove( fn_1 fn_2`, REWRITE_TAC [ensures_n;ensures2;eventually_n] THEN MESON_TAC[]);; -(* Transitivity rule for ENSURES2. - Note that the num step functions of ensures2 must be a constant function. - In order to use this lemma, you will need to convert - `ensures2 step (\s. read X0 s = n /\ ...) (..) (..) (\s. read X0 s)` +(* A transitivity rule for ENSURES2. + Note that the num step functions of ensures2 must be constant functions + in this lemma. + In order to use this lemma, you will need to convert, e.g., + `ensures2 step (\s. ...) (..) (..) (\s. read X0 s)` into: - `ensures2 step (\s. read X0 s = n /\ ...) (..) (..) (\s. n)` - and apply this lemma. + `ensures2 step (\s. read X0 s = n /\ ...) (..) (..) (\s. n)`. *) let ENSURES2_TRANS = prove( `!(step:S->S->bool) P Q R C C' n1 n1' n2 n2'. 
@@ -717,14 +733,6 @@ let ENSURES2_CONJ = prove( REWRITE_TAC[ensures2;eventually_n] THEN MESON_TAC[]);; -let EVENTUALLY_N_NESTED = prove( - `!(step:S->S->bool) (s0:S). - eventually_n step n (\s. eventually_n step n (\s2. P s s2) s0) s0 ==> - eventually_n step n (\s. P s s) s0`, - REWRITE_TAC[eventually_n] THEN - REPEAT STRIP_TAC THEN - ASM_MESON_TAC[]);; - let ENSURES2_CONJ2 = prove( `!(step:S->S->bool) P Q P' Q' C1 C2 C3 fn1 fn2 fn3. ensures2 step P Q @@ -774,6 +782,205 @@ let ENSURES2_CONJ2 = prove( DISCH_THEN (fun th -> MAP_EVERY MP_TAC [th;MATCH_MP EVENTUALLY_N_STEPS th]) THEN MESON_TAC[eventually_n]);; + +let ENSURES2_TRIVIAL = prove( + `!(step:A->A->bool) P Q C. + (!s. P s ==> Q s) /\ (!s. C s s) ==> + ensures2 step P Q C (\s. 0) (\s. 0)`, + + REWRITE_TAC[ensures2;EVENTUALLY_N_TRIVIAL] THEN + REWRITE_TAC[FORALL_PAIR_THM] THEN MESON_TAC[]);; + + +(* ENSURES2_WHILE_PAUP_TAC verifies a relational hoare triple of two WHILE loops, + induction variables of which increasing from a to b - 1 (b - 1 is not included). 
+ ENSURES2_WHILE_PAUP_TAC takes the following arguments, all of which are HOL + Light terms: + - a: counter begin, has `:num` type + - b: counter end (not inclusive), has `:num` type + - pc1_head, pc1_backedge: program counter pointing to the beginning/end of + the loop of the first program, has `:num` type + - pc2_head, pc2_backedge: program counter pointing to the beginning/end of + the loop of the second program, has `:num` type + - loopinv: relational loop invariant, has `:num->S->S->bool` type where S is + the type of a program state + - flagcond1, flagcond2: `:num->S->bool` typed terms checking when the backedge is + taken, in the two programs + - f_nsteps1, f_nsteps2: `:num->num`, the number of small steps taken inside + the loop body at iteration i + - nsteps_pre1, nsteps_pre2: `:num`, the number of small steps taken to reach + from precondition to the loop header + - nsteps_post1, nsteps_post2: `:num`, the number of small steps taken to reach + from pc1_backedge,pc2_backedge to + postcondition +*) + +let ENSURES2_WHILE_PAUP_TAC = + let pth = prove( + `!a b pc1_pre pc1 pc1' pc2_pre pc2 pc2' + (loopinv:num->A->A->bool) + (flagpred1:num->A->bool) (flagpred2:num->A->bool) + (f_nsteps1:num->num) (f_nsteps2:num->num) + (nsteps_pre1:num) (nsteps_pre2:num) + (nsteps_post1:num) (nsteps_post2:num). + C ,, C = C /\ + a < b /\ + ensures2 step + (\(s,s2). program_decodes1 s /\ read pcounter s = (word pc1_pre) /\ + program_decodes2 s2 /\ read pcounter s2 = (word pc2_pre) /\ + precondition s s2) + (\(s,s2). program_decodes1 s /\ read pcounter s = (word pc1) /\ + program_decodes2 s2 /\ read pcounter s2 = (word pc2) /\ + loopinv a s s2) + C + (\s. nsteps_pre1) (\s. nsteps_pre2) /\ + (!i. a <= i /\ i < b /\ ~(i = b) /\ ~(b = 0) /\ 0 < b + ==> ensures2 step + (\(s,s2). program_decodes1 s /\ read pcounter s = word pc1 /\ + program_decodes2 s2 /\ read pcounter s2 = word pc2 /\ + loopinv i s s2) + (\(s,s2).
program_decodes1 s /\ read pcounter s = word pc1' /\ + program_decodes2 s2 /\ read pcounter s2 = word pc2' /\ + loopinv (i+1) s s2 /\ flagpred1 (i+1) s /\ + flagpred2 (i+1) s2) + C + (\s. f_nsteps1 i) (\s. f_nsteps2 i)) /\ + (!i. a < i /\ i < b /\ ~(i = b) /\ ~(i = 0) /\ ~(i = a) /\ ~(b = 0) /\ + 0 < b + ==> ensures2 step + (\(s,s2). program_decodes1 s /\ read pcounter s = word pc1' /\ + program_decodes2 s2 /\ read pcounter s2 = word pc2' /\ + loopinv i s s2 /\ flagpred1 i s /\ flagpred2 i s2) + (\(s,s2). program_decodes1 s /\ read pcounter s = word pc1 /\ + program_decodes2 s2 /\ read pcounter s2 = word pc2 /\ + loopinv i s s2) + C + // It only takes a single step to take the backedge. + (\s. 1) (\s. 1)) /\ + ensures2 step + (\(s,s2). program_decodes1 s /\ read pcounter s = word pc1' /\ + program_decodes2 s2 /\ read pcounter s2 = word pc2' /\ + loopinv b s s2 /\ flagpred1 b s /\ flagpred2 b s2) + postcondition + C + (\s. nsteps_post1) (\s. nsteps_post2) /\ + nsteps1 = nsteps_pre1 + (nsum(a..(b-1)) (\i. f_nsteps1 i) + (b-1-a)) + + nsteps_post1 /\ + nsteps2 = nsteps_pre2 + (nsum(a..(b-1)) (\i. f_nsteps2 i) + (b-1-a)) + + nsteps_post2 + ==> ensures2 step + (\(s,s2). program_decodes1 s /\ read pcounter s = word pc1_pre /\ + program_decodes2 s2 /\ read pcounter s2 = word pc2_pre /\ + precondition s s2) + postcondition + C + (\s. nsteps1) (\s. 
nsteps2)`, + + let APPLY_ENSURES2_TRANS_TAC:tactic = + USE_THEN "HCC" (fun th -> ONCE_REWRITE_TAC[GSYM th]) THEN + MATCH_MP_TAC ENSURES2_TRANS THEN META_EXISTS_TAC THEN CONJ_TAC in + REPEAT GEN_TAC THEN + INTRO_TAC "HCC HBOUND HPRE HLOOP HBACKEDGE HPOST HNSTEPS1 HNSTEPS2" THEN + REMOVE_THEN "HNSTEPS1" (fun th -> SUBST_ALL_TAC th) THEN + REMOVE_THEN "HNSTEPS2" (fun th -> SUBST_ALL_TAC th) THEN + (* precondition *) + APPLY_ENSURES2_TRANS_TAC THENL [ + USE_THEN "HPRE" (fun th -> UNIFY_ACCEPT_TAC [`Q:(A#A)->bool`] th); + ALL_TAC + ] THEN + (* postcondition *) + APPLY_ENSURES2_TRANS_TAC THENL [ + ALL_TAC; + USE_THEN "HPOST" (fun th -> UNIFY_ACCEPT_TAC [`Q:(A#A)->bool`] th) + ] THEN + (* the loop *) + REMOVE_THEN "HPRE" (K ALL_TAC) THEN + REMOVE_THEN "HPOST" (K ALL_TAC) THEN + MAP_EVERY (fun s -> REMOVE_THEN s MP_TAC) ["HBOUND";"HLOOP";"HBACKEDGE"] THEN + SPEC_TAC (`b:num`,`b:num`) THEN + INDUCT_TAC THENL [ + ARITH_TAC; + ALL_TAC + ] THEN + MAP_EVERY INTRO_TAC ["HBACKEDGE";"HLOOP";"HBOUND"] THEN + ASM_CASES_TAC `a < b` THENL [ + ALL_TAC; + (* a = b *) + SUBGOAL_THEN `a:num = b` SUBST_ALL_TAC THENL [ + ASM_ARITH_TAC; ALL_TAC + ] THEN + REWRITE_TAC[SUC_SUB1;NSUM_SING_NUMSEG;SUB_REFL;ADD1] THEN + GEN_REWRITE_TAC (RATOR_CONV o RATOR_CONV o RAND_CONV) + [(GSYM (snd (CONJ_PAIR SEQ_ID)))] THEN + MATCH_MP_TAC ENSURES2_TRANS THEN + META_EXISTS_TAC THEN + CONJ_TAC THENL [ + USE_THEN "HLOOP" (fun th -> MP_TAC (SPEC `b:num` th)) THEN + ANTS_TAC THENL [ + ARITH_TAC; + DISCH_THEN (fun th -> UNIFY_ACCEPT_TAC [`Q:(A#A)->bool`] th) + ]; + + REWRITE_TAC[ensures2;eventually_n;STEPS_TRIVIAL;ARITH_RULE`~(x<0)`] THEN MESON_TAC[] + ] + ] THEN + + SUBGOAL_THEN + `!f. nsum (a..SUC b - 1) (\i. f i) + SUC b - 1 - a = + (nsum (a..b - 1) (\i. 
f i) + b - 1 - a) + 1 + f b` + (fun th -> REWRITE_TAC[th]) THENL [ + REWRITE_TAC[ADD1;ADD_SUB;ETA_AX] THEN + REWRITE_TAC[ARITH_RULE`(x+y)+z+w=(x+w)+y+z`] THEN + IMP_REWRITE_TAC[GSYM NSUM_CLAUSES_RIGHT] THEN + ASM_ARITH_TAC; + ALL_TAC + ] THEN + APPLY_ENSURES2_TRANS_TAC THENL [ + ALL_TAC; + + APPLY_ENSURES2_TRANS_TAC THENL [ + REMOVE_THEN "HBACKEDGE" (fun th -> MP_TAC (SPEC `b:num` th)) THEN + ANTS_TAC THENL [ASM_ARITH_TAC; ALL_TAC] THEN + DISCH_THEN (UNIFY_ACCEPT_TAC [`Q:(A#A)->bool`;`Q':(A#A)->bool`]); + + REWRITE_TAC[ADD1] THEN + REMOVE_THEN "HLOOP" (fun th -> MP_TAC (SPEC `b:num` th)) THEN + ANTS_TAC THENL [ASM_ARITH_TAC;ALL_TAC] THEN + DISCH_THEN (fun th -> REWRITE_TAC[th]) + ] + ] THEN + + FIRST_X_ASSUM (fun th -> MATCH_MP_TAC(REWRITE_RULE[GSYM IMP_CONJ] th)) THEN + REPEAT CONJ_TAC THENL [ + REPEAT STRIP_TAC THEN FIRST_X_ASSUM MATCH_MP_TAC THEN ASM_ARITH_TAC; + REPEAT STRIP_TAC THEN FIRST_X_ASSUM MATCH_MP_TAC THEN ASM_ARITH_TAC; + ASM_REWRITE_TAC[] + ]) in + + fun a b pc1_head pc1_backedge pc2_head pc2_backedge + loopinv flagcond1 flagcond2 f_nsteps1 f_nsteps2 + nsteps_pre1 nsteps_pre2 + nsteps_post1 nsteps_post2 -> + MATCH_MP_TAC pth THEN + MAP_EVERY EXISTS_TAC [a;b;pc1_head;pc1_backedge;pc2_head;pc2_backedge; + loopinv;flagcond1;flagcond2; + f_nsteps1;f_nsteps2;nsteps_pre1;nsteps_pre2;nsteps_post1; + nsteps_post2] THEN + CONJ_TAC THENL [ + (* MAYCHANGE. *) + REWRITE_TAC[FUN_EQ_THM] THEN + REWRITE_TAC[FORALL_PAIR_THM] THEN + REWRITE_TAC[SEQ_PAIR_SPLIT] THEN + REWRITE_TAC[ETA_AX] THEN + REPEAT STRIP_TAC THEN + MATCH_MP_TAC (MESON[] `!(p:A->A->bool) (q:A->A->bool) r s. + ((p = r) /\ (q = s)) ==> (p x1 x2 /\ q y1 y2 <=> r x1 x2 /\ s y1 y2)`) THEN + REWRITE_TAC[ETA_AX] THEN MAYCHANGE_IDEMPOT_TAC; + + (* The remaining condition. *) + ALL_TAC + ];; + (* A relational hoare triple version of ENSURES_INIT_TAC. 
*) let ENSURES2_INIT_TAC sname sname2 = GEN_REWRITE_TAC I [ensures2] THEN @@ -786,11 +993,33 @@ let ENSURES2_INIT_TAC sname sname2 = DISCH_THEN(REPEAT_TCL CONJUNCTS_THEN ASSUME_TAC) THEN ASSUME_TAC(ISPEC svar MAYCHANGE_STARTER));; - -let SEQ_PAIR_SPLIT = prove( - `!(P:A->A->bool) (Q:A->A->bool) (R:A->A->bool) (S:A->A->bool) p1 p2 p1' p2'. - ((\(s,s2) (s',s2'). P s s' /\ Q s2 s2') ,, (\(s,s2) (s',s2'). R s s' /\ S s2 s2')) - (p1,p2) (p1',p2') - <=> - ((P ,, R) p1 p1' /\ (Q ,, S) p2 p2')`, - REWRITE_TAC[seq;EXISTS_PAIR_THM] THEN MESON_TAC[]);; +let APPLY_IF (checker:term->bool) (t:tactic) = + W (fun (asl,g) -> + if checker g then t else ALL_TAC);; + +(* Given two ensures2 theorems, combine them using ENSURES2_CONJ2 and put the + result as an antecedent. If any or both of the two theorems have an assumption + like `(assumption) ==> ensures2`, this tactic tries to prove the + assumption(s). *) +let ENSURES2_TRANS_TAC ensures2th ensures2th2 = + (* instantiate first ensures2 theorem *) + MP_TAC (SPEC_ALL (SPECL [`pc:num`;`pc3:num`] ensures2th)) THEN + APPLY_IF is_imp (ANTS_TAC THENL [ + ASM_REWRITE_TAC[] THEN REPEAT (POP_ASSUM MP_TAC) THEN + REWRITE_TAC[ALL;NONOVERLAPPING_CLAUSES] THEN + REPEAT STRIP_TAC THEN NONOVERLAPPING_TAC; + ALL_TAC]) THEN + DISCH_THEN (LABEL_TAC "_tmp_trans1") THEN + (* instantiate second ensures2 theorem *) + MP_TAC (SPEC_ALL (SPECL [`pc3:num`;`pc2:num`] ensures2th2)) THEN + APPLY_IF is_imp (ANTS_TAC THENL [ + ASM_REWRITE_TAC[] THEN REPEAT (POP_ASSUM MP_TAC) THEN + REWRITE_TAC[ALL;NONOVERLAPPING_CLAUSES] THEN + REPEAT STRIP_TAC THEN NONOVERLAPPING_TAC; + ALL_TAC]) THEN + DISCH_THEN (LABEL_TAC "_tmp_trans2") THEN + + REMOVE_THEN "_tmp_trans1" (fun c1 -> + REMOVE_THEN "_tmp_trans2" (fun c2 -> + MP_TAC (REWRITE_RULE [] (MATCH_MP ENSURES2_CONJ2 (CONJ c1 c2))) + ));; \ No newline at end of file diff --git a/include/s2n-bignum-c89.h b/include/s2n-bignum-c89.h index bafa5be0..07dd9e70 100644 --- a/include/s2n-bignum-c89.h +++ b/include/s2n-bignum-c89.h @@
-555,6 +555,7 @@ extern void bignum_montmul_p384_neon (uint64_t z[6], uint64_t x[6], uint64_t y[6 /* Inputs x[9], y[9]; output z[9] */ extern void bignum_montmul_p521 (uint64_t z[9], uint64_t x[9], uint64_t y[9]); extern void bignum_montmul_p521_alt (uint64_t z[9], uint64_t x[9], uint64_t y[9]); +extern void bignum_montmul_p521_neon (uint64_t z[9], uint64_t x[9], uint64_t y[9]); /* Montgomery multiply, z := (x * y / 2^256) mod p_sm2 */ /* Inputs x[4], y[4]; output z[4] */ @@ -590,6 +591,7 @@ extern void bignum_montsqr_p384_neon (uint64_t z[6], uint64_t x[6]); /* Input x[9]; output z[9] */ extern void bignum_montsqr_p521 (uint64_t z[9], uint64_t x[9]); extern void bignum_montsqr_p521_alt (uint64_t z[9], uint64_t x[9]); +extern void bignum_montsqr_p521_neon (uint64_t z[9], uint64_t x[9]); /* Montgomery square, z := (x^2 / 2^256) mod p_sm2 */ /* Input x[4]; output z[4] */ diff --git a/include/s2n-bignum.h b/include/s2n-bignum.h index 54f944ea..cfe65827 100644 --- a/include/s2n-bignum.h +++ b/include/s2n-bignum.h @@ -564,6 +564,7 @@ extern void bignum_montmul_p384_neon(uint64_t z[S2N_BIGNUM_STATIC 6], // Inputs x[9], y[9]; output z[9] extern void bignum_montmul_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t x[S2N_BIGNUM_STATIC 9], uint64_t y[S2N_BIGNUM_STATIC 9]); extern void bignum_montmul_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t x[S2N_BIGNUM_STATIC 9], uint64_t y[S2N_BIGNUM_STATIC 9]); +extern void bignum_montmul_p521_neon (uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t x[S2N_BIGNUM_STATIC 9], uint64_t y[S2N_BIGNUM_STATIC 9]); // Montgomery multiply, z := (x * y / 2^256) mod p_sm2 // Inputs x[4], y[4]; output z[4] @@ -600,6 +601,7 @@ extern void bignum_montsqr_p384_neon(uint64_t z[S2N_BIGNUM_STATIC 6], // Input x[9]; output z[9] extern void bignum_montsqr_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t x[S2N_BIGNUM_STATIC 9]); extern void bignum_montsqr_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t x[S2N_BIGNUM_STATIC 9]); +extern void 
bignum_montsqr_p521_neon (uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t x[S2N_BIGNUM_STATIC 9]); // Montgomery square, z := (x^2 / 2^256) mod p_sm2 // Input x[4]; output z[4] diff --git a/tests/test.c b/tests/test.c index 8e4fa5ce..08c79a90 100644 --- a/tests/test.c +++ b/tests/test.c @@ -6139,7 +6139,9 @@ int test_bignum_montmul_p384_neon(void) { #endif } -int test_bignum_montmul_p521(void) { +int test_bignum_montmul_p521_specific(const char *name, + void (*f)(uint64_t *z, uint64_t *x, + uint64_t *y)) { uint64_t t; - printf("Testing bignum_montmul_p521 with %d cases\n",tests); + printf("Testing %s with %d cases\n",name,tests); @@ -6149,7 +6151,7 @@ int test_bignum_montmul_p521(void) { reference_mod(9,b0,b2,p_521); random_bignum(9,b2); reference_mod(9,b1,b2,p_521); - bignum_montmul_p521(b4,b0,b1); + f(b4,b0,b1); reference_dmontmul(9,b3,b0,b1,p_521,i_521,b5); c = reference_compare(9,b3,9,b4); @@ -6171,36 +6173,24 @@ int test_bignum_montmul_p521(void) { return 0; } -int test_bignum_montmul_p521_alt(void) -{ uint64_t t; - printf("Testing bignum_montmul_p521_alt with %d cases\n",tests); +int test_bignum_montmul_p521(void) +{ return test_bignum_montmul_p521_specific("bignum_montmul_p521", + bignum_montmul_p521); +} - int c; - for (t = 0; t < tests; ++t) - { random_bignum(9,b2); - reference_mod(9,b0,b2,p_521); - random_bignum(9,b2); - reference_mod(9,b1,b2,p_521); - bignum_montmul_p521_alt(b4,b0,b1); - reference_dmontmul(9,b3,b0,b1,p_521,i_521,b5); +int test_bignum_montmul_p521_alt(void) +{ return test_bignum_montmul_p521_specific("bignum_montmul_p521_alt", + bignum_montmul_p521_alt); +} - c = reference_compare(9,b3,9,b4); - if (c != 0) - { printf("### Disparity: [size %4"PRIu64"] " - "2^-576 * ...0x%016"PRIx64" * ...%016"PRIx64" mod p_521 = " - "0x%016"PRIx64"...%016"PRIx64" not 0x%016"PRIx64"...%016"PRIx64"\n", - UINT64_C(9),b0[0],b1[0],b4[8],b4[0],b3[8],b3[0]); - return 1; - } - else if (VERBOSE) - { printf("OK: [size %4"PRIu64"] " - "2^-576 * ...0x%016"PRIx64" * ...%016"PRIx64" mod p_521 = " -
"0x%016"PRIx64"...%016"PRIx64"\n", - UINT64_C(9),b0[0],b1[0],b4[8],b4[0]); - } - } - printf("All OK\n"); - return 0; +int test_bignum_montmul_p521_neon(void) { +#ifdef __x86_64__ + // Do not call the neon function to avoid a linking failure error. + return 1; +#else + return test_bignum_montmul_p521_specific("bignum_montmul_p521_neon", + bignum_montmul_p521_neon); +#endif } int test_bignum_montmul_sm2(void) @@ -6523,15 +6513,16 @@ int test_bignum_montsqr_p384_neon(void) { #endif } -int test_bignum_montsqr_p521(void) -{ uint64_t t; - printf("Testing bignum_montsqr_p521 with %d cases\n",tests); +int test_bignum_montsqr_p521_specific(const char *name, + void (*f)(uint64_t *z, uint64_t *x)) { + uint64_t t; + printf("Testing %s with %d cases\n",name,tests); int c; for (t = 0; t < tests; ++t) { random_bignum(9,b2); reference_mod(9,b0,b2,p_521); - bignum_montsqr_p521(b4,b0); + f(b4,b0); reference_dmontmul(9,b3,b0,b0,p_521,i_521,b5); c = reference_compare(9,b3,9,b4); @@ -6553,34 +6544,25 @@ int test_bignum_montsqr_p521(void) return 0; } -int test_bignum_montsqr_p521_alt(void) -{ uint64_t t; - printf("Testing bignum_montsqr_p521_alt with %d cases\n",tests); +int test_bignum_montsqr_p521(void) +{ return test_bignum_montsqr_p521_specific("bignum_montsqr_p521", + bignum_montsqr_p521); +} - int c; - for (t = 0; t < tests; ++t) - { random_bignum(9,b2); - reference_mod(9,b0,b2,p_521); - bignum_montsqr_p521_alt(b4,b0); - reference_dmontmul(9,b3,b0,b0,p_521,i_521,b5); +int test_bignum_montsqr_p521_alt(void) +{ return test_bignum_montsqr_p521_specific("bignum_montsqr_p521_alt", + bignum_montsqr_p521_alt); +} - c = reference_compare(9,b3,9,b4); - if (c != 0) - { printf("### Disparity: [size %4"PRIu64"] " - "2^-576 * ...0x%016"PRIx64"^2 mod p_521 = " - "0x%016"PRIx64"...%016"PRIx64" not 0x%016"PRIx64"...%016"PRIx64"\n", - UINT64_C(9),b0[0],b4[8],b4[0],b3[8],b3[0]); - return 1; - } - else if (VERBOSE) - { printf("OK: [size %4"PRIu64"] " - "2^-576 * ...0x%016"PRIx64"^2 mod p_521 = " - 
"0x%016"PRIx64"...%016"PRIx64"\n", - UINT64_C(9),b0[0],b4[8],b4[0]); - } - } - printf("All OK\n"); - return 0; +int test_bignum_montsqr_p521_neon(void) +{ +#ifdef __x86_64__ + // Do not call the neon function to avoid a linking failure error. + return 1; +#else + return test_bignum_montsqr_p521_specific("bignum_montsqr_p521_neon", + bignum_montsqr_p521_neon); +#endif } int test_bignum_montsqr_sm2(void) @@ -12562,8 +12544,10 @@ int main(int argc, char *argv[]) functionaltest(all,"bignum_ksqr_32_64_neon",test_bignum_ksqr_32_64_neon); functionaltest(all,"bignum_montmul_p256_neon", test_bignum_montmul_p256_neon); functionaltest(all,"bignum_montmul_p384_neon", test_bignum_montmul_p384_neon); + functionaltest(all,"bignum_montmul_p521_neon", test_bignum_montmul_p521_neon); functionaltest(all,"bignum_montsqr_p256_neon", test_bignum_montsqr_p256_neon); functionaltest(all,"bignum_montsqr_p384_neon", test_bignum_montsqr_p384_neon); + functionaltest(all,"bignum_montsqr_p521_neon", test_bignum_montsqr_p521_neon); functionaltest(all,"bignum_mul_8_16_neon",test_bignum_mul_8_16_neon); functionaltest(all,"bignum_sqr_8_16_neon",test_bignum_sqr_8_16_neon); } diff --git a/tools/diff.py b/tools/diff.py index 745b14d5..a4771d09 100644 --- a/tools/diff.py +++ b/tools/diff.py @@ -5,10 +5,14 @@ import difflib import sys +if len(sys.argv) != 3: + print("usage: diff.py FILE1 FILE2") + sys.exit(1) + l1 = open(sys.argv[1], "r").readlines() l2 = open(sys.argv[2], "r").readlines() -s = difflib.SequenceMatcher(None, l1, l2) +s = difflib.SequenceMatcher(None, l1, l2, autojunk=False) # https://docs.python.org/3/library/difflib.html#sequencematcher-objects for tag,i1,i2,j1,j2 in s.get_opcodes():