diff --git a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519.S b/third_party/s2n-bignum/arm/curve25519/curve25519_x25519.S index 2a26dafc8f..b8531e18dd 100644 --- a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519.S +++ b/third_party/s2n-bignum/arm/curve25519/curve25519_x25519.S @@ -1242,7 +1242,7 @@ curve25519_x25519_scalarloop: usra v20.2d, v25.2d, #25 and v27.16b, v25.16b, v23.16b // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = x4|z5 bfi x17, x7, #32, #25 // ubignum_of_preglist 1 // INTERMEDIATE z4 - mov v5.d[0], x3 // depth 86 + mov v5.d[0], x3 mov v1.d[0], x5 // FINAL z2 usra v26.2d, v20.2d, #26 // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = x4|z5 and v28.16b, v20.16b, v30.16b // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = x4|z5 diff --git a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_alt.S b/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_alt.S index 82de375b14..518cb89555 100644 --- a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_alt.S +++ b/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_alt.S @@ -593,8 +593,7 @@ curve25519_x25519_alt_scalarloop: // Multiplex directly into (xn,zn) then do three pure doubling steps; // this accounts for the implicit zeroing of the three lowest bits -// of the scalar. On the very last doubling we *fully* reduce zn mod -// p_25519 to ease checking for degeneracy below. +// of the scalar. cmp swap, xzr mux_4(xn,xm,xn) @@ -631,12 +630,12 @@ curve25519_x25519_alt_scalarloop: orr x1, x1, 0x10000 cmadd_4(e,p,d) mul_4(xn,s,d) - mul_p25519(zn,p,e) + mul_4(zn,p,e) // The projective result of the scalar multiplication is now (xn,zn). 
-// Prepare to call the modular inverse function to get xm = 1/zn +// Prepare to call the modular inverse function to get zn' = 1/zn - add x0, xm + add x0, zn add x1, zn // Inline copy of bignum_inv_p25519, identical except for stripping out @@ -644,7 +643,7 @@ curve25519_x25519_alt_scalarloop: // and reclaiming room on the stack. For more details and explanations see // "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for // its own temporaries is 128 bytes, so it has no effect on variables -// that are needed in the rest of our computation here: res, xm and zn. +// that are needed in the rest of our computation here: res, xn and zn. mov x20, x0 mov x10, #0xffffffffffffffed @@ -1675,28 +1674,13 @@ curve25519_x25519_alt_invmidloop: stp x0, x1, [x4] stp x2, x5, [x4, #16] -// Since we eventually want to return 0 when the result is the point at -// infinity, we force xn = 0 whenever zn = 0. This avoids building in a -// dependency on the behavior of modular inverse in out-of-scope cases. - - ldp x0, x1, [zn] - ldp x2, x3, [zn+16] - orr x0, x0, x1 - orr x2, x2, x3 - orr x4, x0, x2 - cmp x4, xzr - ldp x0, x1, [xn] - csel x0, x0, xzr, ne - csel x1, x1, xzr, ne - ldp x2, x3, [xn+16] - stp x0, x1, [xn] - csel x2, x2, xzr, ne - csel x3, x3, xzr, ne - stp x2, x3, [xn+16] - // Now the result is xn * (1/zn), fully reduced modulo p. +// Note that in the degenerate case zn = 0 (mod p_25519), the +// modular inverse code above will produce 1/zn = 0, giving +// the correct overall X25519 result of zero for the point at +// infinity. 
- mul_p25519(resx,xn,xm) + mul_p25519(resx,xn,zn) // Restore stack and registers diff --git a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte.S b/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte.S index 73c27db9f8..69230ca0df 100644 --- a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte.S +++ b/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte.S @@ -1360,7 +1360,7 @@ curve25519_x25519_byte_scalarloop: usra v20.2d, v25.2d, #25 and v27.16b, v25.16b, v23.16b // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = x4|z5 bfi x17, x7, #32, #25 // ubignum_of_preglist 1 // INTERMEDIATE z4 - mov v5.d[0], x3 // depth 86 + mov v5.d[0], x3 mov v1.d[0], x5 // FINAL z2 usra v26.2d, v20.2d, #26 // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = x4|z5 and v28.16b, v20.16b, v30.16b // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = x4|z5 diff --git a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte_alt.S b/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte_alt.S index 790cb2b030..511e2960bd 100644 --- a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte_alt.S +++ b/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte_alt.S @@ -711,8 +711,7 @@ curve25519_x25519_byte_alt_scalarloop: // Multiplex directly into (xn,zn) then do three pure doubling steps; // this accounts for the implicit zeroing of the three lowest bits -// of the scalar. On the very last doubling we *fully* reduce zn mod -// p_25519 to ease checking for degeneracy below. +// of the scalar. cmp swap, xzr mux_4(xn,xm,xn) @@ -749,12 +748,12 @@ curve25519_x25519_byte_alt_scalarloop: orr x1, x1, 0x10000 cmadd_4(e,p,d) mul_4(xn,s,d) - mul_p25519(zn,p,e) + mul_4(zn,p,e) // The projective result of the scalar multiplication is now (xn,zn). 
-// Prepare to call the modular inverse function to get xm = 1/zn +// Prepare to call the modular inverse function to get zn' = 1/zn - add x0, xm + add x0, zn add x1, zn // Inline copy of bignum_inv_p25519, identical except for stripping out @@ -762,7 +761,7 @@ curve25519_x25519_byte_alt_scalarloop: // and reclaiming room on the stack. For more details and explanations see // "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for // its own temporaries is 128 bytes, so it has no effect on variables -// that are needed in the rest of our computation here: res, xm and zn. +// that are needed in the rest of our computation here: res, xn and zn. mov x20, x0 mov x10, #0xffffffffffffffed @@ -1793,28 +1792,13 @@ curve25519_x25519_byte_alt_invmidloop: stp x0, x1, [x4] stp x2, x5, [x4, #16] -// Since we eventually want to return 0 when the result is the point at -// infinity, we force xn = 0 whenever zn = 0. This avoids building in a -// dependency on the behavior of modular inverse in out-of-scope cases. - - ldp x0, x1, [zn] - ldp x2, x3, [zn+16] - orr x0, x0, x1 - orr x2, x2, x3 - orr x4, x0, x2 - cmp x4, xzr - ldp x0, x1, [xn] - csel x0, x0, xzr, ne - csel x1, x1, xzr, ne - ldp x2, x3, [xn+16] - stp x0, x1, [xn] - csel x2, x2, xzr, ne - csel x3, x3, xzr, ne - stp x2, x3, [xn+16] - // Now the result is xn * (1/zn), fully reduced modulo p. +// Note that in the degenerate case zn = 0 (mod p_25519), the +// modular inverse code above will produce 1/zn = 0, giving +// the correct overall X25519 result of zero for the point at +// infinity. 
- mul_p25519(zn,xn,xm) + mul_p25519(zn,xn,zn) ldp x10, x11, [zn] strb w10, [resx] diff --git a/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table.S b/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table.S index ba3e48d061..514df68262 100644 --- a/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table.S +++ b/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1] diff --git a/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_16_neon.S b/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_16_neon.S index 2b51db2c42..1e31f070b9 100644 --- a/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_16_neon.S +++ b/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_16_neon.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Given table: uint64_t[height*16], copy table[idx*16...(idx+1)*16-1] diff --git a/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_32_neon.S b/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_32_neon.S index dc36c0002e..773a6d5745 100644 --- a/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_32_neon.S +++ b/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_32_neon.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Given table: uint64_t[height*32], copy table[idx*32...(idx+1)*32-1] diff --git a/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_8n_neon.S b/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_8n_neon.S index 80db20d6b6..b065a70525 100644 --- a/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_8n_neon.S +++ b/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_8n_neon.S @@ -1,9 +1,9 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1] -// into z[0..width-1]. width must be a mutiple of 8. +// into z[0..width-1]. width must be a multiple of 8. // This function is constant-time with respect to the value of `idx`. This is // achieved by reading the whole table and using the bit-masking to get the // `idx`-th row. 
diff --git a/third_party/s2n-bignum/arm/p384/Makefile b/third_party/s2n-bignum/arm/p384/Makefile index 564b9dd93c..60687fb7c1 100644 --- a/third_party/s2n-bignum/arm/p384/Makefile +++ b/third_party/s2n-bignum/arm/p384/Makefile @@ -35,8 +35,10 @@ OBJ = bignum_add_p384.o \ bignum_mod_p384_6.o \ bignum_montmul_p384.o \ bignum_montmul_p384_alt.o \ + bignum_montmul_p384_neon.o \ bignum_montsqr_p384.o \ bignum_montsqr_p384_alt.o \ + bignum_montsqr_p384_neon.o \ bignum_mux_6.o \ bignum_neg_p384.o \ bignum_nonzero_6.o \ @@ -45,8 +47,11 @@ OBJ = bignum_add_p384.o \ bignum_tomont_p384.o \ bignum_triple_p384.o \ p384_montjadd.o \ + p384_montjadd_alt.o \ p384_montjdouble.o \ - p384_montjmixadd.o + p384_montjdouble_alt.o \ + p384_montjmixadd.o \ + p384_montjmixadd_alt.o %.o : %.S ; $(CC) -E -I../../include $< | $(GAS) -o $@ - diff --git a/third_party/s2n-bignum/arm/p384/bignum_montmul_p384_neon.S b/third_party/s2n-bignum/arm/p384/bignum_montmul_p384_neon.S new file mode 100644 index 0000000000..08c296bc0d --- /dev/null +++ b/third_party/s2n-bignum/arm/p384/bignum_montmul_p384_neon.S @@ -0,0 +1,885 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^384) mod p_384 +// Inputs x[6], y[6]; output z[6] +// +// extern void bignum_montmul_p384_neon +// (uint64_t z[static 6], uint64_t x[static 6], uint64_t y[static 6]); +// +// Does z := (2^{-384} * x * y) mod p_384, assuming that the inputs x and y +// satisfy x * y <= 2^384 * p_384 (in particular this is true if we are in +// the "usual" case x < p_384 and y < p_384). +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- + +// bignum_montmul_p384_neon is functionally equivalent to bignum_montmul_p384. +// It is written in a way that +// 1. 
A subset of scalar multiplications in bignum_montmul_p384 are carefully +// chosen and vectorized +// 2. The vectorized assembly is rescheduled using the SLOTHY superoptimizer. +// https://github.com/slothy-optimizer/slothy +// +// The output program of step 1. is as follows: +// +// stp x19, x20, [sp, #-16]! +// stp x21, x22, [sp, #-16]! +// stp x23, x24, [sp, #-16]! +// ldp x3, x21, [x1] +// ldr q30, [x1] +// ldp x8, x24, [x1, #16] +// ldp x5, x10, [x1, #32] +// ldp x13, x23, [x2] +// ldr q19, [x2] +// ldp x6, x14, [x2, #16] +// ldp x15, x17, [x2, #32] +// ldr q1, [x1, #32] +// ldr q28, [x2, #32] +// uzp1 v5.4S, v19.4S, v30.4S +// rev64 v19.4S, v19.4S +// uzp1 v0.4S, v30.4S, v30.4S +// mul v21.4S, v19.4S, v30.4S +// uaddlp v19.2D, v21.4S +// shl v19.2D, v19.2D, #32 +// umlal v19.2D, v0.2S, v5.2S +// mov x12, v19.d[0] +// mov x16, v19.d[1] +// mul x20, x8, x6 +// umulh x4, x3, x13 +// umulh x1, x21, x23 +// umulh x2, x8, x6 +// adds x4, x4, x16 +// adcs x19, x1, x20 +// adc x20, x2, xzr +// adds x11, x4, x12 +// adcs x16, x19, x4 +// adcs x1, x20, x19 +// adc x2, x20, xzr +// adds x7, x16, x12 +// adcs x4, x1, x4 +// adcs x9, x2, x19 +// adc x19, x20, xzr +// subs x2, x3, x21 +// cneg x20, x2, cc +// csetm x16, cc +// subs x2, x23, x13 +// cneg x2, x2, cc +// mul x1, x20, x2 +// umulh x2, x20, x2 +// cinv x16, x16, cc +// eor x1, x1, x16 +// eor x2, x2, x16 +// cmn x16, #0x1 +// adcs x11, x11, x1 +// adcs x7, x7, x2 +// adcs x4, x4, x16 +// adcs x9, x9, x16 +// adc x19, x19, x16 +// subs x2, x3, x8 +// cneg x20, x2, cc +// csetm x16, cc +// subs x2, x6, x13 +// cneg x2, x2, cc +// mul x1, x20, x2 +// umulh x2, x20, x2 +// cinv x16, x16, cc +// eor x1, x1, x16 +// eor x2, x2, x16 +// cmn x16, #0x1 +// adcs x7, x7, x1 +// adcs x4, x4, x2 +// adcs x9, x9, x16 +// adc x19, x19, x16 +// subs x2, x21, x8 +// cneg x20, x2, cc +// csetm x16, cc +// subs x2, x6, x23 +// cneg x2, x2, cc +// mul x1, x20, x2 +// umulh x2, x20, x2 +// cinv x16, x16, cc +// eor x1, x1, 
x16 +// eor x2, x2, x16 +// cmn x16, #0x1 +// adcs x4, x4, x1 +// adcs x20, x9, x2 +// adc x16, x19, x16 +// lsl x2, x12, #32 +// add x19, x2, x12 +// lsr x2, x19, #32 +// subs x1, x2, x19 +// sbc x2, x19, xzr +// extr x1, x2, x1, #32 +// lsr x2, x2, #32 +// adds x12, x2, x19 +// adc x2, xzr, xzr +// subs x1, x11, x1 +// sbcs x7, x7, x12 +// sbcs x4, x4, x2 +// sbcs x20, x20, xzr +// sbcs x16, x16, xzr +// sbc x9, x19, xzr +// lsl x2, x1, #32 +// add x19, x2, x1 +// lsr x2, x19, #32 +// subs x1, x2, x19 +// sbc x2, x19, xzr +// extr x1, x2, x1, #32 +// lsr x2, x2, #32 +// adds x12, x2, x19 +// adc x2, xzr, xzr +// subs x1, x7, x1 +// sbcs x4, x4, x12 +// sbcs x20, x20, x2 +// sbcs x16, x16, xzr +// sbcs x7, x9, xzr +// sbc x9, x19, xzr +// lsl x2, x1, #32 +// add x19, x2, x1 +// lsr x2, x19, #32 +// subs x1, x2, x19 +// sbc x2, x19, xzr +// extr x12, x2, x1, #32 +// lsr x2, x2, #32 +// adds x1, x2, x19 +// adc x2, xzr, xzr +// subs x4, x4, x12 +// sbcs x20, x20, x1 +// sbcs x16, x16, x2 +// sbcs x12, x7, xzr +// sbcs x1, x9, xzr +// sbc x2, x19, xzr +// stp x4, x20, [x0] // @slothy:writes=buffer0 +// stp x16, x12, [x0, #16] // @slothy:writes=buffer16 +// stp x1, x2, [x0, #32] // @slothy:writes=buffer32 +// mul x22, x24, x14 +// movi v31.2D, #0x00000000ffffffff +// uzp2 v16.4S, v28.4S, v28.4S +// xtn v6.2S, v1.2D +// xtn v30.2S, v28.2D +// rev64 v28.4S, v28.4S +// umull v5.2D, v6.2S, v30.2S +// umull v0.2D, v6.2S, v16.2S +// uzp2 v19.4S, v1.4S, v1.4S +// mul v20.4S, v28.4S, v1.4S +// usra v0.2D, v5.2D, #32 +// umull v1.2D, v19.2S, v16.2S +// uaddlp v24.2D, v20.4S +// and v5.16B, v0.16B, v31.16B +// umlal v5.2D, v19.2S, v30.2S +// shl v19.2D, v24.2D, #32 +// usra v1.2D, v0.2D, #32 +// umlal v19.2D, v6.2S, v30.2S +// usra v1.2D, v5.2D, #32 +// mov x20, v19.d[0] +// mov x16, v19.d[1] +// umulh x12, x24, x14 +// mov x1, v1.d[0] +// mov x2, v1.d[1] +// adds x4, x12, x20 +// adcs x20, x1, x16 +// adc x16, x2, xzr +// adds x7, x4, x22 +// adcs x12, x20, x4 +// adcs x1, 
x16, x20 +// adc x2, x16, xzr +// adds x9, x12, x22 +// adcs x19, x1, x4 +// adcs x4, x2, x20 +// adc x20, x16, xzr +// subs x2, x24, x5 +// cneg x16, x2, cc +// csetm x12, cc +// subs x2, x15, x14 +// cneg x2, x2, cc +// mul x1, x16, x2 +// umulh x2, x16, x2 +// cinv x12, x12, cc +// eor x1, x1, x12 +// eor x2, x2, x12 +// cmn x12, #0x1 +// adcs x11, x7, x1 +// adcs x9, x9, x2 +// adcs x19, x19, x12 +// adcs x4, x4, x12 +// adc x20, x20, x12 +// subs x2, x24, x10 +// cneg x16, x2, cc +// csetm x12, cc +// subs x2, x17, x14 +// cneg x2, x2, cc +// mul x1, x16, x2 +// umulh x2, x16, x2 +// cinv x12, x12, cc +// eor x1, x1, x12 +// eor x2, x2, x12 +// cmn x12, #0x1 +// adcs x7, x9, x1 +// adcs x19, x19, x2 +// adcs x4, x4, x12 +// adc x20, x20, x12 +// subs x2, x5, x10 +// cneg x16, x2, cc +// csetm x12, cc +// subs x2, x17, x15 +// cneg x2, x2, cc +// mul x1, x16, x2 +// umulh x2, x16, x2 +// cinv x16, x12, cc +// eor x1, x1, x16 +// eor x2, x2, x16 +// cmn x16, #0x1 +// adcs x19, x19, x1 +// adcs x12, x4, x2 +// adc x1, x20, x16 +// subs x2, x24, x3 +// sbcs x24, x5, x21 +// sbcs x21, x10, x8 +// ngc x5, xzr +// cmn x5, #0x1 +// eor x2, x2, x5 +// adcs x4, x2, xzr +// eor x2, x24, x5 +// adcs x20, x2, xzr +// eor x2, x21, x5 +// adc x16, x2, xzr +// subs x2, x13, x14 +// sbcs x24, x23, x15 +// sbcs x8, x6, x17 +// ngc x21, xzr +// cmn x21, #0x1 +// eor x2, x2, x21 +// adcs x15, x2, xzr +// eor x2, x24, x21 +// adcs x14, x2, xzr +// eor x2, x8, x21 +// adc x6, x2, xzr +// eor x9, x5, x21 +// ldp x21, x2, [x0] // @slothy:reads=buffer0 +// adds x10, x22, x21 +// adcs x5, x11, x2 +// ldp x21, x2, [x0, #16] // @slothy:reads=buffer16 +// adcs x24, x7, x21 +// adcs x8, x19, x2 +// ldp x21, x2, [x0, #32] // @slothy:reads=buffer32 +// adcs x21, x12, x21 +// adcs x2, x1, x2 +// adc x19, xzr, xzr +// stp x10, x5, [x0] // @slothy:writes=buffer0 +// stp x24, x8, [x0, #16] // @slothy:writes=buffer16 +// stp x21, x2, [x0, #32] // @slothy:writes=buffer32 +// mul x12, x4, x15 +// 
mul x5, x20, x14 +// mul x24, x16, x6 +// umulh x8, x4, x15 +// umulh x21, x20, x14 +// umulh x2, x16, x6 +// adds x10, x8, x5 +// adcs x5, x21, x24 +// adc x24, x2, xzr +// adds x23, x10, x12 +// adcs x8, x5, x10 +// adcs x21, x24, x5 +// adc x2, x24, xzr +// adds x13, x8, x12 +// adcs x1, x21, x10 +// adcs x10, x2, x5 +// adc x5, x24, xzr +// subs x2, x4, x20 +// cneg x24, x2, cc +// csetm x8, cc +// subs x2, x14, x15 +// cneg x2, x2, cc +// mul x21, x24, x2 +// umulh x2, x24, x2 +// cinv x8, x8, cc +// eor x21, x21, x8 +// eor x2, x2, x8 +// cmn x8, #0x1 +// adcs x23, x23, x21 +// adcs x13, x13, x2 +// adcs x1, x1, x8 +// adcs x10, x10, x8 +// adc x5, x5, x8 +// subs x2, x4, x16 +// cneg x24, x2, cc +// csetm x8, cc +// subs x2, x6, x15 +// cneg x2, x2, cc +// mul x21, x24, x2 +// umulh x2, x24, x2 +// cinv x8, x8, cc +// eor x21, x21, x8 +// eor x2, x2, x8 +// cmn x8, #0x1 +// adcs x4, x13, x21 +// adcs x13, x1, x2 +// adcs x1, x10, x8 +// adc x10, x5, x8 +// subs x2, x20, x16 +// cneg x24, x2, cc +// csetm x8, cc +// subs x2, x6, x14 +// cneg x2, x2, cc +// mul x21, x24, x2 +// umulh x2, x24, x2 +// cinv x5, x8, cc +// eor x21, x21, x5 +// eor x2, x2, x5 +// cmn x5, #0x1 +// adcs x24, x13, x21 +// adcs x8, x1, x2 +// adc x21, x10, x5 +// ldp x20, x16, [x0] // @slothy:reads=buffer0 +// ldp x17, x15, [x0, #16] // @slothy:reads=buffer16 +// ldp x14, x6, [x0, #32] // @slothy:reads=buffer32 +// cmn x9, #0x1 +// eor x2, x12, x9 +// adcs x12, x2, x20 +// eor x2, x23, x9 +// adcs x23, x2, x16 +// eor x2, x4, x9 +// adcs x13, x2, x17 +// eor x2, x24, x9 +// adcs x10, x2, x15 +// eor x2, x8, x9 +// adcs x5, x2, x14 +// eor x2, x21, x9 +// adcs x24, x2, x6 +// adcs x1, x9, x19 +// adcs x8, x9, xzr +// adcs x21, x9, xzr +// adc x2, x9, xzr +// adds x10, x10, x20 +// adcs x5, x5, x16 +// adcs x24, x24, x17 +// adcs x17, x1, x15 +// adcs x15, x8, x14 +// adcs x14, x21, x6 +// adc x6, x2, x19 +// lsl x2, x12, #32 +// add x1, x2, x12 +// lsr x2, x1, #32 +// subs x21, x2, x1 
+// sbc x2, x1, xzr +// extr x21, x2, x21, #32 +// lsr x2, x2, #32 +// adds x8, x2, x1 +// adc x2, xzr, xzr +// subs x21, x23, x21 +// sbcs x23, x13, x8 +// sbcs x10, x10, x2 +// sbcs x5, x5, xzr +// sbcs x24, x24, xzr +// sbc x13, x1, xzr +// lsl x2, x21, #32 +// add x1, x2, x21 +// lsr x2, x1, #32 +// subs x21, x2, x1 +// sbc x2, x1, xzr +// extr x21, x2, x21, #32 +// lsr x2, x2, #32 +// adds x8, x2, x1 +// adc x2, xzr, xzr +// subs x21, x23, x21 +// sbcs x10, x10, x8 +// sbcs x5, x5, x2 +// sbcs x24, x24, xzr +// sbcs x23, x13, xzr +// sbc x13, x1, xzr +// lsl x2, x21, #32 +// add x1, x2, x21 +// lsr x2, x1, #32 +// subs x21, x2, x1 +// sbc x2, x1, xzr +// extr x8, x2, x21, #32 +// lsr x2, x2, #32 +// adds x21, x2, x1 +// adc x2, xzr, xzr +// subs x10, x10, x8 +// sbcs x5, x5, x21 +// sbcs x24, x24, x2 +// sbcs x8, x23, xzr +// sbcs x21, x13, xzr +// sbc x2, x1, xzr +// adds x23, x17, x8 +// adcs x13, x15, x21 +// adcs x1, x14, x2 +// adc x2, x6, xzr +// add x8, x2, #0x1 +// lsl x2, x8, #32 +// subs x21, x8, x2 +// sbc x2, x2, xzr +// adds x10, x10, x21 +// adcs x5, x5, x2 +// adcs x24, x24, x8 +// adcs x8, x23, xzr +// adcs x21, x13, xzr +// adcs x13, x1, xzr +// csetm x1, cc +// mov x2, #0xffffffff +// and x2, x2, x1 +// adds x10, x10, x2 +// eor x2, x2, x1 +// adcs x5, x5, x2 +// mov x2, #0xfffffffffffffffe +// and x2, x2, x1 +// adcs x24, x24, x2 +// adcs x8, x8, x1 +// adcs x21, x21, x1 +// adc x2, x13, x1 +// stp x10, x5, [x0] // @slothy:writes=buffer0 +// stp x24, x8, [x0, #16] // @slothy:writes=buffer16 +// stp x21, x2, [x0, #32] // @slothy:writes=buffer32 +// ldp x23, x24, [sp], #16 +// ldp x21, x22, [sp], #16 +// ldp x19, x20, [sp], #16 +// ret +// +// The bash script used for step 2 is as follows: +// +// # Store the assembly instructions except the last 'ret' and +// # callee-register store/loads as, say, 'input.S'. 
+// export OUTPUTS="[hint_buffer0,hint_buffer16,hint_buffer32]" +// export RESERVED_REGS="[x18,x25,x26,x27,x28,x29,x30,sp,q8,q9,q10,q11,q12,q13,q14,q15,v8,v9,v10,v11,v12,v13,v14,v15]" +// /tools/external/slothy.sh input.S my_out_dir +// # my_out_dir/3.opt.s is the optimized assembly. Its output may differ +// # from this file since the sequence is non-deterministically chosen. +// # Please add 'ret' at the end of the output assembly. + + +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p384_neon) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p384_neon) + .text + .balign 4 + +S2N_BN_SYMBOL(bignum_montmul_p384_neon): + +// Save some registers + + stp x19, x20, [sp, -16]! + stp x21, x22, [sp, -16]! + stp x23, x24, [sp, -16]! + + ldr q3, [x1] + ldr q25, [x2] + ldp x13, x23, [x2] + ldp x3, x21, [x1] + rev64 v23.4S, v25.4S + uzp1 v17.4S, v25.4S, v3.4S + umulh x15, x3, x13 + mul v6.4S, v23.4S, v3.4S + uzp1 v3.4S, v3.4S, v3.4S + ldr q27, [x2, #32] + ldp x8, x24, [x1, #16] + subs x6, x3, x21 + ldr q0, [x1, #32] + movi v23.2D, #0x00000000ffffffff + csetm x10, cc + umulh x19, x21, x23 + rev64 v4.4S, v27.4S + uzp2 v25.4S, v27.4S, v27.4S + cneg x4, x6, cc + subs x7, x23, x13 + xtn v22.2S, v0.2D + xtn v24.2S, v27.2D + cneg x20, x7, cc + ldp x6, x14, [x2, #16] + mul v27.4S, v4.4S, v0.4S + uaddlp v20.2D, v6.4S + cinv x5, x10, cc + mul x16, x4, x20 + uzp2 v6.4S, v0.4S, v0.4S + umull v21.2D, v22.2S, v25.2S + shl v0.2D, v20.2D, #32 + umlal v0.2D, v3.2S, v17.2S + mul x22, x8, x6 + umull v1.2D, v6.2S, v25.2S + subs x12, x3, x8 + umull v20.2D, v22.2S, v24.2S + cneg x17, x12, cc + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc + usra v21.2D, v20.2D, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2D, v21.2D, #32 + adds x22, x15, x7 + and v26.16B, v21.16B, v23.16B + adcs x16, x12, x15 + uaddlp v25.2D, v27.4S + adcs x9, x19, x12 + umlal v26.2D, v6.2S, v24.2S + adc 
x4, x19, xzr + adds x16, x16, x7 + shl v27.2D, v25.2D, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc + cinv x10, x10, cc + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc + eor x19, x19, x10 + csetm x4, cc + subs x16, x6, x23 + cneg x16, x16, cc + umlal v27.2D, v22.2S, v24.2S + mul x15, x20, x16 + cinv x4, x4, cc + cmn x10, #0x1 + usra v1.2D, v26.2D, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [x2, #32] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [x1, #32] + lsr x1, x2, #32 + eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x16, x4, x16 + mov x4, v27.d[0] + sbcs x11, x20, x11 + sbcs x20, x9, x12 + stp x16, x11, [x0] + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [x0, #16] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc + csetm x2, cc + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc 
+ cneg x19, x19, cc + stp x9, x20, [x0, #32] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc + csetm x12, cc + subs x9, x17, x14 + cinv x12, x12, cc + cneg x9, x9, cc + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc + cneg x24, x10, cc + subs x10, x17, x15 + cinv x7, x7, cc + cneg x10, x10, cc + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + ldp x20, x9, [x0] + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [x0, #16] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [x0, #32] + adds x20, x22, x20 + mul x10, x13, x14 + adcs x11, x11, x9 + eor x9, x8, x21 + adcs x21, x19, x17 + stp x20, x11, [x0] + adcs x12, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + stp x21, x12, [x0, #16] + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [x0, #32] + cneg x3, x21, cc + csetm x24, cc + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc + csetm x16, cc + subs x21, x6, x15 + cneg x22, x21, cc + cinv 
x21, x24, cc + subs x20, x23, x13 + umulh x12, x3, x22 + cneg x23, x20, cc + csetm x24, cc + subs x20, x14, x15 + cinv x24, x24, cc + mul x22, x3, x22 + cneg x3, x20, cc + subs x13, x6, x14 + cneg x20, x13, cc + cinv x15, x16, cc + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [x0] + ldp x21, x12, [x0, #16] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [x0, #32] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x21 + eor x1, x22, x9 + adcs x24, x23, x12 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x21 + adcs x15, x17, x12 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + 
lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x14, x24, x11 + stp x22, x5, [x0] + adcs x5, x13, x23 + adcs x21, x8, x23 + stp x14, x5, [x0, #16] + adc x12, x15, x23 + stp x21, x12, [x0, #32] + +// Restore registers and return + + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/arm/p384/bignum_montsqr_p384_neon.S b/third_party/s2n-bignum/arm/p384/bignum_montsqr_p384_neon.S new file mode 100644 index 0000000000..9be6380eb4 --- /dev/null +++ b/third_party/s2n-bignum/arm/p384/bignum_montsqr_p384_neon.S @@ -0,0 +1,665 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^384) mod p_384 +// Input x[6]; output z[6] +// +// extern void bignum_montsqr_p384_neon +// (uint64_t z[static 6], uint64_t x[static 6]); +// +// Does z := (x^2 / 2^384) mod p_384, assuming x^2 <= 2^384 * p_384, which is +// guaranteed in particular if x < p_384 initially (the "intended" case). 
+// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- + +// bignum_montsqr_p384_neon is functionally equivalent to bignum_montsqr_p384. +// It is written in a way that +// 1. A subset of scalar multiplications in bignum_montsqr_p384 are carefully +// chosen and vectorized +// 2. The vectorized assembly is rescheduled using the SLOTHY superoptimizer. +// https://github.com/slothy-optimizer/slothy +// +// The output program of step 1. is as follows: +// +// ldp x9, x2, [x1] +// ldr q18, [x1] +// ldr q19, [x1] +// ldp x4, x6, [x1, #16] +// ldp x5, x10, [x1, #32] +// ldr q21, [x1, #32] +// ldr q28, [x1, #32] +// mul x12, x9, x2 +// mul x1, x9, x4 +// mul x13, x2, x4 +// movi v0.2D, #0x00000000ffffffff +// uzp2 v5.4S, v19.4S, v19.4S +// xtn v25.2S, v18.2D +// xtn v4.2S, v19.2D +// rev64 v23.4S, v19.4S +// umull v20.2D, v25.2S, v4.2S +// umull v30.2D, v25.2S, v5.2S +// uzp2 v19.4S, v18.4S, v18.4S +// mul v22.4S, v23.4S, v18.4S +// usra v30.2D, v20.2D, #32 +// umull v18.2D, v19.2S, v5.2S +// uaddlp v22.2D, v22.4S +// and v20.16B, v30.16B, v0.16B +// umlal v20.2D, v19.2S, v4.2S +// shl v19.2D, v22.2D, #32 +// usra v18.2D, v30.2D, #32 +// umlal v19.2D, v25.2S, v4.2S +// usra v18.2D, v20.2D, #32 +// mov x7, v19.d[0] +// mov x17, v19.d[1] +// mul x16, x4, x4 +// umulh x3, x9, x2 +// adds x15, x1, x3 +// umulh x1, x9, x4 +// adcs x13, x13, x1 +// umulh x1, x2, x4 +// adcs x8, x1, xzr +// mov x11, v18.d[0] +// mov x14, v18.d[1] +// umulh x1, x4, x4 +// adds x3, x12, x12 +// adcs x15, x15, x15 +// adcs x13, x13, x13 +// adcs x12, x8, x8 +// adc x1, x1, xzr +// adds x11, x11, x3 +// adcs x3, x17, x15 +// adcs x17, x14, x13 +// adcs x15, x16, x12 +// adc x13, x1, xzr +// lsl x1, x7, #32 +// add x16, x1, x7 +// lsr x1, x16, #32 +// subs x12, x1, x16 +// sbc x1, x16, xzr +// extr x12, x1, x12, #32 +// lsr x1, x1, #32 +// adds x7, x1, x16 +// adc x1, xzr, xzr +// subs x12, x11, x12 +// sbcs x11, x3, x7 +// 
sbcs x17, x17, x1 +// sbcs x15, x15, xzr +// sbcs x13, x13, xzr +// sbc x3, x16, xzr +// lsl x1, x12, #32 +// add x16, x1, x12 +// lsr x1, x16, #32 +// subs x12, x1, x16 +// sbc x1, x16, xzr +// extr x12, x1, x12, #32 +// lsr x1, x1, #32 +// adds x7, x1, x16 +// adc x1, xzr, xzr +// subs x12, x11, x12 +// sbcs x17, x17, x7 +// sbcs x15, x15, x1 +// sbcs x13, x13, xzr +// sbcs x11, x3, xzr +// sbc x3, x16, xzr +// lsl x1, x12, #32 +// add x16, x1, x12 +// lsr x1, x16, #32 +// subs x12, x1, x16 +// sbc x1, x16, xzr +// extr x7, x1, x12, #32 +// lsr x1, x1, #32 +// adds x12, x1, x16 +// adc x1, xzr, xzr +// subs x17, x17, x7 +// sbcs x15, x15, x12 +// sbcs x13, x13, x1 +// sbcs x7, x11, xzr +// sbcs x12, x3, xzr +// sbc x1, x16, xzr +// stp x17, x15, [x0] // @slothy:writes=buffer0 +// stp x13, x7, [x0, #16] // @slothy:writes=buffer16 +// stp x12, x1, [x0, #32] // @slothy:writes=buffer32 +// mul x14, x9, x6 +// mul x15, x2, x5 +// mul x13, x4, x10 +// umulh x7, x9, x6 +// umulh x12, x2, x5 +// umulh x1, x4, x10 +// adds x15, x7, x15 +// adcs x16, x12, x13 +// adc x13, x1, xzr +// adds x11, x15, x14 +// adcs x7, x16, x15 +// adcs x12, x13, x16 +// adc x1, x13, xzr +// adds x17, x7, x14 +// adcs x15, x12, x15 +// adcs x3, x1, x16 +// adc x16, x13, xzr +// subs x1, x9, x2 +// cneg x13, x1, cc +// csetm x7, cc +// subs x1, x5, x6 +// cneg x1, x1, cc +// mul x12, x13, x1 +// umulh x1, x13, x1 +// cinv x7, x7, cc +// eor x12, x12, x7 +// eor x1, x1, x7 +// cmn x7, #0x1 +// adcs x11, x11, x12 +// adcs x17, x17, x1 +// adcs x15, x15, x7 +// adcs x3, x3, x7 +// adc x16, x16, x7 +// subs x9, x9, x4 +// cneg x13, x9, cc +// csetm x7, cc +// subs x1, x10, x6 +// cneg x1, x1, cc +// mul x12, x13, x1 +// umulh x1, x13, x1 +// cinv x7, x7, cc +// eor x12, x12, x7 +// eor x1, x1, x7 +// cmn x7, #0x1 +// adcs x17, x17, x12 +// adcs x15, x15, x1 +// adcs x13, x3, x7 +// adc x7, x16, x7 +// subs x2, x2, x4 +// cneg x12, x2, cc +// csetm x1, cc +// subs x2, x10, x5 +// cneg x2, x2, cc +// 
mul x4, x12, x2 +// umulh x2, x12, x2 +// cinv x1, x1, cc +// eor x4, x4, x1 +// eor x2, x2, x1 +// cmn x1, #0x1 +// adcs x12, x15, x4 +// adcs x4, x13, x2 +// adc x2, x7, x1 +// adds x1, x14, x14 +// adcs x16, x11, x11 +// adcs x17, x17, x17 +// adcs x15, x12, x12 +// adcs x13, x4, x4 +// adcs x7, x2, x2 +// adc x12, xzr, xzr +// ldp x4, x2, [x0] // @slothy:reads=buffer0 +// adds x1, x1, x4 +// adcs x16, x16, x2 +// ldp x4, x2, [x0, #16] // @slothy:reads=buffer16 +// adcs x17, x17, x4 +// adcs x15, x15, x2 +// ldp x4, x2, [x0, #32] // @slothy:reads=buffer32 +// adcs x13, x13, x4 +// adcs x7, x7, x2 +// adc x11, x12, xzr +// lsl x2, x1, #32 +// add x12, x2, x1 +// lsr x2, x12, #32 +// subs x4, x2, x12 +// sbc x2, x12, xzr +// extr x4, x2, x4, #32 +// lsr x2, x2, #32 +// adds x1, x2, x12 +// adc x2, xzr, xzr +// subs x4, x16, x4 +// sbcs x16, x17, x1 +// sbcs x17, x15, x2 +// sbcs x15, x13, xzr +// sbcs x13, x7, xzr +// sbc x7, x12, xzr +// lsl x2, x4, #32 +// add x12, x2, x4 +// lsr x2, x12, #32 +// subs x4, x2, x12 +// sbc x2, x12, xzr +// extr x4, x2, x4, #32 +// lsr x2, x2, #32 +// adds x1, x2, x12 +// adc x2, xzr, xzr +// subs x4, x16, x4 +// sbcs x16, x17, x1 +// sbcs x17, x15, x2 +// sbcs x15, x13, xzr +// sbcs x13, x7, xzr +// sbc x7, x12, xzr +// lsl x2, x4, #32 +// add x12, x2, x4 +// lsr x2, x12, #32 +// subs x4, x2, x12 +// sbc x2, x12, xzr +// extr x1, x2, x4, #32 +// lsr x2, x2, #32 +// adds x4, x2, x12 +// adc x2, xzr, xzr +// subs x3, x16, x1 +// sbcs x17, x17, x4 +// sbcs x15, x15, x2 +// sbcs x1, x13, xzr +// sbcs x4, x7, xzr +// sbc x2, x12, xzr +// adds x13, x11, x1 +// adcs x7, x4, xzr +// adcs x12, x2, xzr +// adcs x16, xzr, xzr +// mul x2, x6, x6 +// adds x3, x3, x2 +// xtn v30.2S, v28.2D +// shrn v26.2S, v28.2D, #32 +// umull v26.2D, v30.2S, v26.2S +// shl v19.2D, v26.2D, #33 +// umlal v19.2D, v30.2S, v30.2S +// mov x1, v19.d[0] +// mov x4, v19.d[1] +// umulh x2, x6, x6 +// adcs x17, x17, x2 +// umulh x2, x5, x5 +// adcs x15, x15, x1 +// adcs 
x13, x13, x2 +// umulh x2, x10, x10 +// adcs x7, x7, x4 +// adcs x12, x12, x2 +// adc x16, x16, xzr +// dup v28.2D, x6 +// movi v0.2D, #0x00000000ffffffff +// uzp2 v5.4S, v21.4S, v21.4S +// xtn v25.2S, v28.2D +// xtn v4.2S, v21.2D +// rev64 v19.4S, v21.4S +// umull v30.2D, v25.2S, v4.2S +// umull v23.2D, v25.2S, v5.2S +// uzp2 v20.4S, v28.4S, v28.4S +// mul v19.4S, v19.4S, v28.4S +// usra v23.2D, v30.2D, #32 +// umull v18.2D, v20.2S, v5.2S +// uaddlp v19.2D, v19.4S +// and v30.16B, v23.16B, v0.16B +// umlal v30.2D, v20.2S, v4.2S +// shl v19.2D, v19.2D, #32 +// usra v18.2D, v23.2D, #32 +// umlal v19.2D, v25.2S, v4.2S +// usra v18.2D, v30.2D, #32 +// mov x6, v19.d[0] +// mov x1, v19.d[1] +// mul x4, x5, x10 +// mov x2, v18.d[0] +// adds x1, x1, x2 +// mov x2, v18.d[1] +// adcs x4, x4, x2 +// umulh x5, x5, x10 +// adc x2, x5, xzr +// adds x5, x6, x6 +// adcs x6, x1, x1 +// adcs x1, x4, x4 +// adcs x4, x2, x2 +// adc x2, xzr, xzr +// adds x17, x17, x5 +// adcs x15, x15, x6 +// adcs x13, x13, x1 +// adcs x7, x7, x4 +// adcs x12, x12, x2 +// adc x2, x16, xzr +// mov x5, #0xffffffff00000001 +// mov x6, #0xffffffff +// mov x1, #0x1 +// cmn x3, x5 +// adcs xzr, x17, x6 +// adcs xzr, x15, x1 +// adcs xzr, x13, xzr +// adcs xzr, x7, xzr +// adcs xzr, x12, xzr +// adc x2, x2, xzr +// neg x4, x2 +// and x2, x5, x4 +// adds x10, x3, x2 +// and x2, x6, x4 +// adcs x5, x17, x2 +// and x2, x1, x4 +// adcs x6, x15, x2 +// adcs x1, x13, xzr +// adcs x4, x7, xzr +// adc x2, x12, xzr +// stp x10, x5, [x0] // @slothy:writes=buffer0 +// stp x6, x1, [x0, #16] // @slothy:writes=buffer16 +// stp x4, x2, [x0, #32] // @slothy:writes=buffer32 +// ret +// +// The bash script used for step 2 is as follows: +// +// # Store the assembly instructions except the last 'ret' as, say, 'input.S'. 
+// export OUTPUTS="[hint_buffer0,hint_buffer16,hint_buffer32]" +// export RESERVED_REGS="[x18,x19,x20,x21,x22,x23,x24,x25,x26,x27,x28,x29,x30,sp,q8,q9,q10,q11,q12,q13,q14,q15,v8,v9,v10,v11,v12,v13,v14,v15]" +// /tools/external/slothy.sh input.S my_out_dir +// # my_out_dir/3.opt.s is the optimized assembly. Its output may differ +// # from this file since the sequence is non-deterministically chosen. +// # Please add 'ret' at the end of the output assembly. + + +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p384_neon) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p384_neon) + .text + .balign 4 + +S2N_BN_SYMBOL(bignum_montsqr_p384_neon): + + ldr q1, [x1] + ldp x9, x2, [x1] + ldr q0, [x1] + ldp x4, x6, [x1, #16] + rev64 v21.4S, v1.4S + uzp2 v28.4S, v1.4S, v1.4S + umulh x7, x9, x2 + xtn v17.2S, v1.2D + mul v27.4S, v21.4S, v0.4S + ldr q20, [x1, #32] + xtn v30.2S, v0.2D + ldr q1, [x1, #32] + uzp2 v31.4S, v0.4S, v0.4S + ldp x5, x10, [x1, #32] + umulh x8, x9, x4 + uaddlp v3.2D, v27.4S + umull v16.2D, v30.2S, v17.2S + mul x16, x9, x4 + umull v27.2D, v30.2S, v28.2S + shrn v0.2S, v20.2D, #32 + xtn v7.2S, v20.2D + shl v20.2D, v3.2D, #32 + umull v3.2D, v31.2S, v28.2S + mul x3, x2, x4 + umlal v20.2D, v30.2S, v17.2S + umull v22.2D, v7.2S, v0.2S + usra v27.2D, v16.2D, #32 + umulh x11, x2, x4 + movi v21.2D, #0x00000000ffffffff + uzp2 v28.4S, v1.4S, v1.4S + adds x15, x16, x7 + and v5.16B, v27.16B, v21.16B + adcs x3, x3, x8 + usra v3.2D, v27.2D, #32 + dup v29.2D, x6 + adcs x16, x11, xzr + mov x14, v20.d[0] + umlal v5.2D, v31.2S, v17.2S + mul x8, x9, x2 + mov x7, v20.d[1] + shl v19.2D, v22.2D, #33 + xtn v25.2S, v29.2D + rev64 v31.4S, v1.4S + lsl x13, x14, #32 + uzp2 v6.4S, v29.4S, v29.4S + umlal v19.2D, v7.2S, v7.2S + usra v3.2D, v5.2D, #32 + adds x1, x8, x8 + umulh x8, x4, x4 + add x12, x13, x14 + mul v17.4S, v31.4S, v29.4S + xtn v4.2S, v1.2D + adcs x14, x15, x15 + lsr x13, x12, #32 + adcs x15, x3, x3 + umull v31.2D, v25.2S, v28.2S + adcs x11, 
x16, x16 + umull v21.2D, v25.2S, v4.2S + mov x17, v3.d[0] + umull v18.2D, v6.2S, v28.2S + adc x16, x8, xzr + uaddlp v16.2D, v17.4S + movi v1.2D, #0x00000000ffffffff + subs x13, x13, x12 + usra v31.2D, v21.2D, #32 + sbc x8, x12, xzr + adds x17, x17, x1 + mul x1, x4, x4 + shl v28.2D, v16.2D, #32 + mov x3, v3.d[1] + adcs x14, x7, x14 + extr x7, x8, x13, #32 + adcs x13, x3, x15 + and v3.16B, v31.16B, v1.16B + adcs x11, x1, x11 + lsr x1, x8, #32 + umlal v3.2D, v6.2S, v4.2S + usra v18.2D, v31.2D, #32 + adc x3, x16, xzr + adds x1, x1, x12 + umlal v28.2D, v25.2S, v4.2S + adc x16, xzr, xzr + subs x15, x17, x7 + sbcs x7, x14, x1 + lsl x1, x15, #32 + sbcs x16, x13, x16 + add x8, x1, x15 + usra v18.2D, v3.2D, #32 + sbcs x14, x11, xzr + lsr x1, x8, #32 + sbcs x17, x3, xzr + sbc x11, x12, xzr + subs x13, x1, x8 + umulh x12, x4, x10 + sbc x1, x8, xzr + extr x13, x1, x13, #32 + lsr x1, x1, #32 + adds x15, x1, x8 + adc x1, xzr, xzr + subs x7, x7, x13 + sbcs x13, x16, x15 + lsl x3, x7, #32 + umulh x16, x2, x5 + sbcs x15, x14, x1 + add x7, x3, x7 + sbcs x3, x17, xzr + lsr x1, x7, #32 + sbcs x14, x11, xzr + sbc x11, x8, xzr + subs x8, x1, x7 + sbc x1, x7, xzr + extr x8, x1, x8, #32 + lsr x1, x1, #32 + adds x1, x1, x7 + adc x17, xzr, xzr + subs x13, x13, x8 + umulh x8, x9, x6 + sbcs x1, x15, x1 + sbcs x15, x3, x17 + sbcs x3, x14, xzr + mul x17, x2, x5 + sbcs x11, x11, xzr + stp x13, x1, [x0] // @slothy:writes=buffer0 + sbc x14, x7, xzr + mul x7, x4, x10 + subs x1, x9, x2 + stp x15, x3, [x0, #16] // @slothy:writes=buffer16 + csetm x15, cc + cneg x1, x1, cc + stp x11, x14, [x0, #32] // @slothy:writes=buffer32 + mul x14, x9, x6 + adds x17, x8, x17 + adcs x7, x16, x7 + adc x13, x12, xzr + subs x12, x5, x6 + cneg x3, x12, cc + cinv x16, x15, cc + mul x8, x1, x3 + umulh x1, x1, x3 + eor x12, x8, x16 + adds x11, x17, x14 + adcs x3, x7, x17 + adcs x15, x13, x7 + adc x8, x13, xzr + adds x3, x3, x14 + adcs x15, x15, x17 + adcs x17, x8, x7 + eor x1, x1, x16 + adc x13, x13, xzr + subs x9, x9, x4 + 
csetm x8, cc + cneg x9, x9, cc + subs x4, x2, x4 + cneg x4, x4, cc + csetm x7, cc + subs x2, x10, x6 + cinv x8, x8, cc + cneg x2, x2, cc + cmn x16, #0x1 + adcs x11, x11, x12 + mul x12, x9, x2 + adcs x3, x3, x1 + adcs x15, x15, x16 + umulh x9, x9, x2 + adcs x17, x17, x16 + adc x13, x13, x16 + subs x1, x10, x5 + cinv x2, x7, cc + cneg x1, x1, cc + eor x9, x9, x8 + cmn x8, #0x1 + eor x7, x12, x8 + mul x12, x4, x1 + adcs x3, x3, x7 + adcs x7, x15, x9 + adcs x15, x17, x8 + ldp x9, x17, [x0, #16] // @slothy:reads=buffer16 + umulh x4, x4, x1 + adc x8, x13, x8 + cmn x2, #0x1 + eor x1, x12, x2 + adcs x1, x7, x1 + ldp x7, x16, [x0] // @slothy:reads=buffer0 + eor x12, x4, x2 + adcs x4, x15, x12 + ldp x15, x12, [x0, #32] // @slothy:reads=buffer32 + adc x8, x8, x2 + adds x13, x14, x14 + umulh x14, x5, x10 + adcs x2, x11, x11 + adcs x3, x3, x3 + adcs x1, x1, x1 + adcs x4, x4, x4 + adcs x11, x8, x8 + adc x8, xzr, xzr + adds x13, x13, x7 + adcs x2, x2, x16 + mul x16, x5, x10 + adcs x3, x3, x9 + adcs x1, x1, x17 + umulh x5, x5, x5 + lsl x9, x13, #32 + add x9, x9, x13 + adcs x4, x4, x15 + mov x13, v28.d[1] + adcs x15, x11, x12 + lsr x7, x9, #32 + adc x11, x8, xzr + subs x7, x7, x9 + umulh x10, x10, x10 + sbc x17, x9, xzr + extr x7, x17, x7, #32 + lsr x17, x17, #32 + adds x17, x17, x9 + adc x12, xzr, xzr + subs x8, x2, x7 + sbcs x17, x3, x17 + lsl x7, x8, #32 + sbcs x2, x1, x12 + add x3, x7, x8 + sbcs x12, x4, xzr + lsr x1, x3, #32 + sbcs x7, x15, xzr + sbc x15, x9, xzr + subs x1, x1, x3 + sbc x4, x3, xzr + lsr x9, x4, #32 + extr x8, x4, x1, #32 + adds x9, x9, x3 + adc x4, xzr, xzr + subs x1, x17, x8 + lsl x17, x1, #32 + sbcs x8, x2, x9 + sbcs x9, x12, x4 + add x17, x17, x1 + mov x1, v18.d[1] + lsr x2, x17, #32 + sbcs x7, x7, xzr + mov x12, v18.d[0] + sbcs x15, x15, xzr + sbc x3, x3, xzr + subs x4, x2, x17 + sbc x2, x17, xzr + adds x12, x13, x12 + adcs x16, x16, x1 + lsr x13, x2, #32 + extr x1, x2, x4, #32 + adc x2, x14, xzr + adds x4, x13, x17 + mul x13, x6, x6 + adc x14, xzr, xzr + 
subs x1, x8, x1 + sbcs x4, x9, x4 + mov x9, v28.d[0] + sbcs x7, x7, x14 + sbcs x8, x15, xzr + sbcs x3, x3, xzr + sbc x14, x17, xzr + adds x17, x9, x9 + adcs x12, x12, x12 + mov x15, v19.d[0] + adcs x9, x16, x16 + umulh x6, x6, x6 + adcs x16, x2, x2 + adc x2, xzr, xzr + adds x11, x11, x8 + adcs x3, x3, xzr + adcs x14, x14, xzr + adcs x8, xzr, xzr + adds x13, x1, x13 + mov x1, v19.d[1] + adcs x6, x4, x6 + mov x4, #0xffffffff + adcs x15, x7, x15 + adcs x7, x11, x5 + adcs x1, x3, x1 + adcs x14, x14, x10 + adc x11, x8, xzr + adds x6, x6, x17 + adcs x8, x15, x12 + adcs x3, x7, x9 + adcs x15, x1, x16 + mov x16, #0xffffffff00000001 + adcs x14, x14, x2 + mov x2, #0x1 + adc x17, x11, xzr + cmn x13, x16 + adcs xzr, x6, x4 + adcs xzr, x8, x2 + adcs xzr, x3, xzr + adcs xzr, x15, xzr + adcs xzr, x14, xzr + adc x1, x17, xzr + neg x9, x1 + and x1, x16, x9 + adds x11, x13, x1 + and x13, x4, x9 + adcs x5, x6, x13 + and x1, x2, x9 + adcs x7, x8, x1 + stp x11, x5, [x0] // @slothy:writes=buffer0 + adcs x11, x3, xzr + adcs x2, x15, xzr + stp x7, x11, [x0, #16] // @slothy:writes=buffer16 + adc x17, x14, xzr + stp x2, x17, [x0, #32] // depth 72 // @slothy:writes=buffer32 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/arm/p384/p384_montjadd.S b/third_party/s2n-bignum/arm/p384/p384_montjadd.S index 9c0e1ecb99..3b65363162 100644 --- a/third_party/s2n-bignum/arm/p384/p384_montjadd.S +++ b/third_party/s2n-bignum/arm/p384/p384_montjadd.S @@ -49,6 +49,7 @@ #define z1sq sp, #(NUMSIZE*0) #define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) #define yd sp, #(NUMSIZE*1) #define y2a sp, #(NUMSIZE*1) @@ -62,723 +63,660 @@ #define t2 sp, #(NUMSIZE*4) #define x1a sp, #(NUMSIZE*4) #define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) #define xd sp, #(NUMSIZE*5) #define z2sq sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) #define y1a sp, #(NUMSIZE*6) #define NSPACE (NUMSIZE*7) -// Corresponds exactly to 
bignum_montmul_p384_alt +// Corresponds to bignum_montmul_p384 except x24 -> x0 #define montmul_p384(P0,P1,P2) \ ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - mul x12, x3, x5; \ - umulh x13, x3, x5; \ - mul x11, x3, x6; \ - umulh x14, x3, x6; \ - adds x13, x13, x11; \ - ldp x7, x8, [P2+16]; \ - mul x11, x3, x7; \ - umulh x15, x3, x7; \ - adcs x14, x14, x11; \ - mul x11, x3, x8; \ - umulh x16, x3, x8; \ - adcs x15, x15, x11; \ - ldp x9, x10, [P2+32]; \ - mul x11, x3, x9; \ - umulh x17, x3, x9; \ - adcs x16, x16, x11; \ - mul x11, x3, x10; \ - umulh x19, x3, x10; \ - adcs x17, x17, x11; \ - adc x19, x19, xzr; \ - mul x11, x4, x5; \ - adds x13, x13, x11; \ - mul x11, x4, x6; \ - adcs x14, x14, x11; \ - mul x11, x4, x7; \ - adcs x15, x15, x11; \ - mul x11, x4, x8; \ - adcs x16, x16, x11; \ - mul x11, x4, x9; \ - adcs x17, x17, x11; \ - mul x11, x4, x10; \ - adcs x19, x19, x11; \ - cset x20, cs; \ - umulh x11, x4, x5; \ - adds x14, x14, x11; \ - umulh x11, x4, x6; \ - adcs x15, x15, x11; \ - umulh x11, x4, x7; \ - adcs x16, x16, x11; \ - umulh x11, x4, x8; \ - adcs x17, x17, x11; \ - umulh x11, x4, x9; \ - adcs x19, x19, x11; \ - umulh x11, x4, x10; \ - adc x20, x20, x11; \ - ldp x3, x4, [P1+16]; \ - mul x11, x3, x5; \ - adds x14, x14, x11; \ - mul x11, x3, x6; \ - adcs x15, x15, x11; \ - mul x11, x3, x7; \ - adcs x16, x16, x11; \ - mul x11, x3, x8; \ - adcs x17, x17, x11; \ - mul x11, x3, x9; \ - adcs x19, x19, x11; \ - mul x11, x3, x10; \ - adcs x20, x20, x11; \ - cset x21, cs; \ - umulh x11, x3, x5; \ - adds x15, x15, x11; \ - umulh x11, x3, x6; \ - adcs x16, x16, x11; \ - umulh x11, x3, x7; \ - adcs x17, x17, x11; \ - umulh x11, x3, x8; \ - adcs x19, x19, x11; \ - umulh x11, x3, x9; \ - adcs x20, x20, x11; \ - umulh x11, x3, x10; \ - adc x21, x21, x11; \ - mul x11, x4, x5; \ - adds x15, x15, x11; \ - mul x11, x4, x6; \ - adcs x16, x16, x11; \ - mul x11, x4, x7; \ - adcs x17, x17, x11; \ - mul x11, x4, x8; \ - adcs x19, x19, x11; \ - mul x11, x4, x9; \ - adcs x20, x20, 
x11; \ - mul x11, x4, x10; \ - adcs x21, x21, x11; \ - cset x22, cs; \ - umulh x11, x4, x5; \ - adds x16, x16, x11; \ - umulh x11, x4, x6; \ - adcs x17, x17, x11; \ - umulh x11, x4, x7; \ - adcs x19, x19, x11; \ - umulh x11, x4, x8; \ - adcs x20, x20, x11; \ - umulh x11, x4, x9; \ - adcs x21, x21, x11; \ - umulh x11, x4, x10; \ - adc x22, x22, x11; \ - ldp x3, x4, [P1+32]; \ - mul x11, x3, x5; \ - adds x16, x16, x11; \ - mul x11, x3, x6; \ - adcs x17, x17, x11; \ - mul x11, x3, x7; \ - adcs x19, x19, x11; \ - mul x11, x3, x8; \ - adcs x20, x20, x11; \ - mul x11, x3, x9; \ - adcs x21, x21, x11; \ - mul x11, x3, x10; \ - adcs x22, x22, x11; \ - cset x2, cs; \ - umulh x11, x3, x5; \ - adds x17, x17, x11; \ - umulh x11, x3, x6; \ - adcs x19, x19, x11; \ - umulh x11, x3, x7; \ - adcs x20, x20, x11; \ - umulh x11, x3, x8; \ - adcs x21, x21, x11; \ - umulh x11, x3, x9; \ - adcs x22, x22, x11; \ - umulh x11, x3, x10; \ - adc x2, x2, x11; \ - mul x11, x4, x5; \ - adds x17, x17, x11; \ - mul x11, x4, x6; \ - adcs x19, x19, x11; \ - mul x11, x4, x7; \ - adcs x20, x20, x11; \ - mul x11, x4, x8; \ - adcs x21, x21, x11; \ - mul x11, x4, x9; \ - adcs x22, x22, x11; \ - mul x11, x4, x10; \ - adcs x2, x2, x11; \ - cset x1, cs; \ - umulh x11, x4, x5; \ - adds x19, x19, x11; \ - umulh x11, x4, x6; \ - adcs x20, x20, x11; \ - umulh x11, x4, x7; \ - adcs x21, x21, x11; \ - umulh x11, x4, x8; \ - adcs x22, x22, x11; \ - umulh x11, x4, x9; \ - adcs x2, x2, x11; \ - umulh x11, x4, x10; \ - adc x1, x1, x11; \ - lsl x7, x12, #32; \ - add x12, x7, x12; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x12; \ - mov x6, #0xffffffff; \ - mul x5, x6, x12; \ - umulh x6, x6, x12; \ - adds x7, x7, x5; \ - adcs x6, x6, x12; \ - adc x5, xzr, xzr; \ - subs x13, x13, x7; \ - sbcs x14, x14, x6; \ - sbcs x15, x15, x5; \ + ldp x5, x6, [P1+16]; \ + ldp x7, x8, [P1+32]; \ + ldp x9, x10, [P2]; \ + ldp x11, x12, [P2+16]; \ + ldp x13, x14, [P2+32]; \ + mul x15, x3, x9; \ + mul x21, x4, x10; \ + mul x22, x5, 
x11; \ + umulh x23, x3, x9; \ + umulh x0, x4, x10; \ + umulh x1, x5, x11; \ + adds x23, x23, x21; \ + adcs x0, x0, x22; \ + adc x1, x1, xzr; \ + adds x16, x23, x15; \ + adcs x17, x0, x23; \ + adcs x19, x1, x0; \ + adc x20, x1, xzr; \ + adds x17, x17, x15; \ + adcs x19, x19, x23; \ + adcs x20, x20, x0; \ + adc x1, x1, xzr; \ + subs x0, x3, x4; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x10, x9; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x16, x16, x21; \ + adcs x17, x17, x22; \ + adcs x19, x19, x23; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x0, x3, x5; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x11, x9; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x17, x17, x21; \ + adcs x19, x19, x22; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x0, x4, x5; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x11, x10; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x19, x19, x21; \ + adcs x20, x20, x22; \ + adc x1, x1, x23; \ + lsl x23, x15, #32; \ + add x15, x23, x15; \ + lsr x23, x15, #32; \ + subs x23, x23, x15; \ + sbc x22, x15, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x15; \ + adc x21, xzr, xzr; \ + subs x16, x16, x23; \ + sbcs x17, x17, x22; \ + sbcs x19, x19, x21; \ + sbcs x20, x20, xzr; \ + sbcs x1, x1, xzr; \ + sbc x15, x15, xzr; \ + lsl x23, x16, #32; \ + add x16, x23, x16; \ + lsr x23, x16, #32; \ + subs x23, x23, x16; \ + sbc x22, x16, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x16; \ + adc x21, xzr, xzr; \ + subs x17, x17, x23; \ + sbcs x19, x19, x22; \ + sbcs x20, x20, x21; \ + sbcs x1, x1, xzr; \ + sbcs x15, 
x15, xzr; \ + sbc x16, x16, xzr; \ + lsl x23, x17, #32; \ + add x17, x23, x17; \ + lsr x23, x17, #32; \ + subs x23, x23, x17; \ + sbc x22, x17, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x17; \ + adc x21, xzr, xzr; \ + subs x19, x19, x23; \ + sbcs x20, x20, x22; \ + sbcs x1, x1, x21; \ + sbcs x15, x15, xzr; \ sbcs x16, x16, xzr; \ - sbcs x17, x17, xzr; \ - sbc x12, x12, xzr; \ - lsl x7, x13, #32; \ - add x13, x7, x13; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x13; \ - mov x6, #0xffffffff; \ - mul x5, x6, x13; \ - umulh x6, x6, x13; \ - adds x7, x7, x5; \ - adcs x6, x6, x13; \ - adc x5, xzr, xzr; \ - subs x14, x14, x7; \ - sbcs x15, x15, x6; \ - sbcs x16, x16, x5; \ - sbcs x17, x17, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - lsl x7, x14, #32; \ - add x14, x7, x14; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x14; \ - mov x6, #0xffffffff; \ - mul x5, x6, x14; \ - umulh x6, x6, x14; \ - adds x7, x7, x5; \ - adcs x6, x6, x14; \ - adc x5, xzr, xzr; \ - subs x15, x15, x7; \ - sbcs x16, x16, x6; \ - sbcs x17, x17, x5; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbc x14, x14, xzr; \ - lsl x7, x15, #32; \ - add x15, x7, x15; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x15; \ - mov x6, #0xffffffff; \ - mul x5, x6, x15; \ - umulh x6, x6, x15; \ - adds x7, x7, x5; \ - adcs x6, x6, x15; \ - adc x5, xzr, xzr; \ - subs x16, x16, x7; \ - sbcs x17, x17, x6; \ - sbcs x12, x12, x5; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ + sbc x17, x17, xzr; \ + stp x19, x20, [P0]; \ + stp x1, x15, [P0+16]; \ + stp x16, x17, [P0+32]; \ + mul x15, x6, x12; \ + mul x21, x7, x13; \ + mul x22, x8, x14; \ + umulh x23, x6, x12; \ + umulh x0, x7, x13; \ + umulh x1, x8, x14; \ + adds x23, x23, x21; \ + adcs x0, x0, x22; \ + adc x1, x1, xzr; \ + adds x16, x23, x15; \ + adcs x17, x0, x23; \ + adcs x19, x1, x0; \ + adc x20, x1, xzr; \ + adds x17, x17, x15; \ + adcs x19, x19, x23; \ + adcs x20, x20, x0; \ + adc x1, x1, xzr; \ + 
subs x0, x6, x7; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x13, x12; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x16, x16, x21; \ + adcs x17, x17, x22; \ + adcs x19, x19, x23; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x0, x6, x8; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x14, x12; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x17, x17, x21; \ + adcs x19, x19, x22; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x0, x7, x8; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x14, x13; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x19, x19, x21; \ + adcs x20, x20, x22; \ + adc x1, x1, x23; \ + subs x6, x6, x3; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x5; \ + ngc x3, xzr; \ + cmn x3, #1; \ + eor x6, x6, x3; \ + adcs x6, x6, xzr; \ + eor x7, x7, x3; \ + adcs x7, x7, xzr; \ + eor x8, x8, x3; \ + adc x8, x8, xzr; \ + subs x9, x9, x12; \ + sbcs x10, x10, x13; \ + sbcs x11, x11, x14; \ + ngc x14, xzr; \ + cmn x14, #1; \ + eor x9, x9, x14; \ + adcs x9, x9, xzr; \ + eor x10, x10, x14; \ + adcs x10, x10, xzr; \ + eor x11, x11, x14; \ + adc x11, x11, xzr; \ + eor x14, x3, x14; \ + ldp x21, x22, [P0]; \ + adds x15, x15, x21; \ + adcs x16, x16, x22; \ + ldp x21, x22, [P0+16]; \ + adcs x17, x17, x21; \ + adcs x19, x19, x22; \ + ldp x21, x22, [P0+32]; \ + adcs x20, x20, x21; \ + adcs x1, x1, x22; \ + adc x2, xzr, xzr; \ + stp x15, x16, [P0]; \ + stp x17, x19, [P0+16]; \ + stp x20, x1, [P0+32]; \ + mul x15, x6, x9; \ + mul x21, x7, x10; \ + mul x22, x8, x11; \ + umulh x23, x6, x9; \ + umulh x0, x7, x10; \ + umulh x1, x8, x11; \ + adds x23, x23, x21; \ + adcs x0, x0, x22; \ + adc x1, x1, xzr; 
\ + adds x16, x23, x15; \ + adcs x17, x0, x23; \ + adcs x19, x1, x0; \ + adc x20, x1, xzr; \ + adds x17, x17, x15; \ + adcs x19, x19, x23; \ + adcs x20, x20, x0; \ + adc x1, x1, xzr; \ + subs x0, x6, x7; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x10, x9; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x16, x16, x21; \ + adcs x17, x17, x22; \ + adcs x19, x19, x23; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x0, x6, x8; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x11, x9; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x17, x17, x21; \ + adcs x19, x19, x22; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x0, x7, x8; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x11, x10; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x19, x19, x21; \ + adcs x20, x20, x22; \ + adc x1, x1, x23; \ + ldp x3, x4, [P0]; \ + ldp x5, x6, [P0+16]; \ + ldp x7, x8, [P0+32]; \ + cmn x14, #1; \ + eor x15, x15, x14; \ + adcs x15, x15, x3; \ + eor x16, x16, x14; \ + adcs x16, x16, x4; \ + eor x17, x17, x14; \ + adcs x17, x17, x5; \ + eor x19, x19, x14; \ + adcs x19, x19, x6; \ + eor x20, x20, x14; \ + adcs x20, x20, x7; \ + eor x1, x1, x14; \ + adcs x1, x1, x8; \ + adcs x9, x14, x2; \ + adcs x10, x14, xzr; \ + adcs x11, x14, xzr; \ + adc x12, x14, xzr; \ + adds x19, x19, x3; \ + adcs x20, x20, x4; \ + adcs x1, x1, x5; \ + adcs x9, x9, x6; \ + adcs x10, x10, x7; \ + adcs x11, x11, x8; \ + adc x12, x12, x2; \ + lsl x23, x15, #32; \ + add x15, x23, x15; \ + lsr x23, x15, #32; \ + subs x23, x23, x15; \ + sbc x22, x15, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x15; \ + adc x21, xzr, 
xzr; \ + subs x16, x16, x23; \ + sbcs x17, x17, x22; \ + sbcs x19, x19, x21; \ + sbcs x20, x20, xzr; \ + sbcs x1, x1, xzr; \ sbc x15, x15, xzr; \ - lsl x7, x16, #32; \ - add x16, x7, x16; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x16; \ - mov x6, #0xffffffff; \ - mul x5, x6, x16; \ - umulh x6, x6, x16; \ - adds x7, x7, x5; \ - adcs x6, x6, x16; \ - adc x5, xzr, xzr; \ - subs x17, x17, x7; \ - sbcs x12, x12, x6; \ - sbcs x13, x13, x5; \ - sbcs x14, x14, xzr; \ + lsl x23, x16, #32; \ + add x16, x23, x16; \ + lsr x23, x16, #32; \ + subs x23, x23, x16; \ + sbc x22, x16, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x16; \ + adc x21, xzr, xzr; \ + subs x17, x17, x23; \ + sbcs x19, x19, x22; \ + sbcs x20, x20, x21; \ + sbcs x1, x1, xzr; \ sbcs x15, x15, xzr; \ sbc x16, x16, xzr; \ - lsl x7, x17, #32; \ - add x17, x7, x17; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x17; \ - mov x6, #0xffffffff; \ - mul x5, x6, x17; \ - umulh x6, x6, x17; \ - adds x7, x7, x5; \ - adcs x6, x6, x17; \ - adc x5, xzr, xzr; \ - subs x12, x12, x7; \ - sbcs x13, x13, x6; \ - sbcs x14, x14, x5; \ + lsl x23, x17, #32; \ + add x17, x23, x17; \ + lsr x23, x17, #32; \ + subs x23, x23, x17; \ + sbc x22, x17, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x17; \ + adc x21, xzr, xzr; \ + subs x19, x19, x23; \ + sbcs x20, x20, x22; \ + sbcs x1, x1, x21; \ sbcs x15, x15, xzr; \ sbcs x16, x16, xzr; \ sbc x17, x17, xzr; \ - adds x12, x12, x19; \ - adcs x13, x13, x20; \ - adcs x14, x14, x21; \ - adcs x15, x15, x22; \ - adcs x16, x16, x2; \ - adcs x17, x17, x1; \ - adc x10, xzr, xzr; \ - mov x11, #0xffffffff00000001; \ - adds x19, x12, x11; \ - mov x11, #0xffffffff; \ - adcs x20, x13, x11; \ - mov x11, #0x1; \ - adcs x21, x14, x11; \ - adcs x22, x15, xzr; \ - adcs x2, x16, xzr; \ - adcs x1, x17, xzr; \ + adds x9, x9, x15; \ + adcs x10, x10, x16; \ + adcs x11, x11, x17; \ + adc x12, x12, xzr; \ + add x22, x12, #1; \ + lsl x21, x22, #32; 
\ + subs x0, x22, x21; \ + sbc x21, x21, xzr; \ + adds x19, x19, x0; \ + adcs x20, x20, x21; \ + adcs x1, x1, x22; \ + adcs x9, x9, xzr; \ adcs x10, x10, xzr; \ - csel x12, x12, x19, eq; \ - csel x13, x13, x20, eq; \ - csel x14, x14, x21, eq; \ - csel x15, x15, x22, eq; \ - csel x16, x16, x2, eq; \ - csel x17, x17, x1, eq; \ - stp x12, x13, [P0]; \ - stp x14, x15, [P0+16]; \ - stp x16, x17, [P0+32] - -// Corresponds exactly to bignum_montsqr_p384_alt + adcs x11, x11, xzr; \ + csetm x22, lo; \ + mov x23, #4294967295; \ + and x23, x23, x22; \ + adds x19, x19, x23; \ + eor x23, x23, x22; \ + adcs x20, x20, x23; \ + mov x23, #-2; \ + and x23, x23, x22; \ + adcs x1, x1, x23; \ + adcs x9, x9, x22; \ + adcs x10, x10, x22; \ + adc x11, x11, x22; \ + stp x19, x20, [P0]; \ + stp x1, x9, [P0+16]; \ + stp x10, x11, [P0+32] + +// Corresponds exactly to bignum_montsqr_p384 #define montsqr_p384(P0,P1) \ ldp x2, x3, [P1]; \ - mul x9, x2, x3; \ - umulh x10, x2, x3; \ ldp x4, x5, [P1+16]; \ - mul x8, x2, x4; \ - adds x10, x10, x8; \ - mul x11, x2, x5; \ - mul x8, x3, x4; \ - adcs x11, x11, x8; \ - umulh x12, x2, x5; \ - mul x8, x3, x5; \ - adcs x12, x12, x8; \ ldp x6, x7, [P1+32]; \ - mul x13, x2, x7; \ - mul x8, x3, x6; \ - adcs x13, x13, x8; \ - umulh x14, x2, x7; \ - mul x8, x3, x7; \ - adcs x14, x14, x8; \ - mul x15, x5, x6; \ - adcs x15, x15, xzr; \ - umulh x16, x5, x6; \ - adc x16, x16, xzr; \ - umulh x8, x2, x4; \ - adds x11, x11, x8; \ - umulh x8, x3, x4; \ - adcs x12, x12, x8; \ - umulh x8, x3, x5; \ - adcs x13, x13, x8; \ - umulh x8, x3, x6; \ - adcs x14, x14, x8; \ - umulh x8, x3, x7; \ - adcs x15, x15, x8; \ - adc x16, x16, xzr; \ - mul x8, x2, x6; \ - adds x12, x12, x8; \ - mul x8, x4, x5; \ - adcs x13, x13, x8; \ - mul x8, x4, x6; \ - adcs x14, x14, x8; \ - mul x8, x4, x7; \ - adcs x15, x15, x8; \ - mul x8, x5, x7; \ - adcs x16, x16, x8; \ - mul x17, x6, x7; \ + mul x14, x2, x3; \ + mul x15, x2, x4; \ + mul x16, x3, x4; \ + mul x8, x2, x2; \ + mul x10, x3, x3; \ + mul 
x12, x4, x4; \ + umulh x17, x2, x3; \ + adds x15, x15, x17; \ + umulh x17, x2, x4; \ + adcs x16, x16, x17; \ + umulh x17, x3, x4; \ adcs x17, x17, xzr; \ - umulh x19, x6, x7; \ - adc x19, x19, xzr; \ - umulh x8, x2, x6; \ - adds x13, x13, x8; \ - umulh x8, x4, x5; \ - adcs x14, x14, x8; \ - umulh x8, x4, x6; \ - adcs x15, x15, x8; \ - umulh x8, x4, x7; \ - adcs x16, x16, x8; \ - umulh x8, x5, x7; \ - adcs x17, x17, x8; \ - adc x19, x19, xzr; \ - adds x9, x9, x9; \ - adcs x10, x10, x10; \ - adcs x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ + umulh x9, x2, x2; \ + umulh x11, x3, x3; \ + umulh x13, x4, x4; \ + adds x14, x14, x14; \ adcs x15, x15, x15; \ adcs x16, x16, x16; \ adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - cset x20, hs; \ - umulh x8, x2, x2; \ - mul x2, x2, x2; \ - adds x9, x9, x8; \ - mul x8, x3, x3; \ - adcs x10, x10, x8; \ - umulh x8, x3, x3; \ - adcs x11, x11, x8; \ - mul x8, x4, x4; \ - adcs x12, x12, x8; \ - umulh x8, x4, x4; \ - adcs x13, x13, x8; \ - mul x8, x5, x5; \ - adcs x14, x14, x8; \ - umulh x8, x5, x5; \ - adcs x15, x15, x8; \ - mul x8, x6, x6; \ - adcs x16, x16, x8; \ - umulh x8, x6, x6; \ - adcs x17, x17, x8; \ - mul x8, x7, x7; \ - adcs x19, x19, x8; \ - umulh x8, x7, x7; \ - adc x20, x20, x8; \ - lsl x5, x2, #32; \ - add x2, x5, x2; \ - mov x5, #-4294967295; \ - umulh x5, x5, x2; \ - mov x4, #4294967295; \ - mul x3, x4, x2; \ - umulh x4, x4, x2; \ - adds x5, x5, x3; \ - adcs x4, x4, x2; \ - adc x3, xzr, xzr; \ - subs x9, x9, x5; \ - sbcs x10, x10, x4; \ - sbcs x11, x11, x3; \ + adc x13, x13, xzr; \ + adds x9, x9, x14; \ + adcs x10, x10, x15; \ + adcs x11, x11, x16; \ + adcs x12, x12, x17; \ + adc x13, x13, xzr; \ + lsl x16, x8, #32; \ + add x8, x16, x8; \ + lsr x16, x8, #32; \ + subs x16, x16, x8; \ + sbc x15, x8, xzr; \ + extr x16, x15, x16, #32; \ + lsr x15, x15, #32; \ + adds x15, x15, x8; \ + adc x14, xzr, xzr; \ + subs x9, x9, x16; \ + sbcs x10, x10, x15; \ + sbcs x11, x11, x14; \ 
sbcs x12, x12, xzr; \ sbcs x13, x13, xzr; \ - sbc x2, x2, xzr; \ - lsl x5, x9, #32; \ - add x9, x5, x9; \ - mov x5, #-4294967295; \ - umulh x5, x5, x9; \ - mov x4, #4294967295; \ - mul x3, x4, x9; \ - umulh x4, x4, x9; \ - adds x5, x5, x3; \ - adcs x4, x4, x9; \ - adc x3, xzr, xzr; \ - subs x10, x10, x5; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ + sbc x8, x8, xzr; \ + lsl x16, x9, #32; \ + add x9, x16, x9; \ + lsr x16, x9, #32; \ + subs x16, x16, x9; \ + sbc x15, x9, xzr; \ + extr x16, x15, x16, #32; \ + lsr x15, x15, #32; \ + adds x15, x15, x9; \ + adc x14, xzr, xzr; \ + subs x10, x10, x16; \ + sbcs x11, x11, x15; \ + sbcs x12, x12, x14; \ sbcs x13, x13, xzr; \ - sbcs x2, x2, xzr; \ + sbcs x8, x8, xzr; \ sbc x9, x9, xzr; \ - lsl x5, x10, #32; \ - add x10, x5, x10; \ - mov x5, #-4294967295; \ - umulh x5, x5, x10; \ - mov x4, #4294967295; \ - mul x3, x4, x10; \ - umulh x4, x4, x10; \ - adds x5, x5, x3; \ - adcs x4, x4, x10; \ - adc x3, xzr, xzr; \ - subs x11, x11, x5; \ - sbcs x12, x12, x4; \ - sbcs x13, x13, x3; \ - sbcs x2, x2, xzr; \ + lsl x16, x10, #32; \ + add x10, x16, x10; \ + lsr x16, x10, #32; \ + subs x16, x16, x10; \ + sbc x15, x10, xzr; \ + extr x16, x15, x16, #32; \ + lsr x15, x15, #32; \ + adds x15, x15, x10; \ + adc x14, xzr, xzr; \ + subs x11, x11, x16; \ + sbcs x12, x12, x15; \ + sbcs x13, x13, x14; \ + sbcs x8, x8, xzr; \ sbcs x9, x9, xzr; \ sbc x10, x10, xzr; \ - lsl x5, x11, #32; \ - add x11, x5, x11; \ - mov x5, #-4294967295; \ - umulh x5, x5, x11; \ - mov x4, #4294967295; \ - mul x3, x4, x11; \ - umulh x4, x4, x11; \ - adds x5, x5, x3; \ - adcs x4, x4, x11; \ - adc x3, xzr, xzr; \ - subs x12, x12, x5; \ - sbcs x13, x13, x4; \ - sbcs x2, x2, x3; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbc x11, x11, xzr; \ - lsl x5, x12, #32; \ - add x12, x5, x12; \ - mov x5, #-4294967295; \ - umulh x5, x5, x12; \ - mov x4, #4294967295; \ - mul x3, x4, x12; \ - umulh x4, x4, x12; \ - adds x5, x5, x3; \ - adcs x4, x4, x12; \ - adc x3, xzr, xzr; \ - 
subs x13, x13, x5; \ - sbcs x2, x2, x4; \ - sbcs x9, x9, x3; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbc x12, x12, xzr; \ - lsl x5, x13, #32; \ - add x13, x5, x13; \ - mov x5, #-4294967295; \ - umulh x5, x5, x13; \ - mov x4, #4294967295; \ - mul x3, x4, x13; \ - umulh x4, x4, x13; \ - adds x5, x5, x3; \ - adcs x4, x4, x13; \ - adc x3, xzr, xzr; \ - subs x2, x2, x5; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - adds x2, x2, x14; \ - adcs x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, x17; \ - adcs x12, x12, x19; \ - adcs x13, x13, x20; \ - adc x6, xzr, xzr; \ - mov x8, #-4294967295; \ - adds x14, x2, x8; \ - mov x8, #4294967295; \ - adcs x15, x9, x8; \ - mov x8, #1; \ - adcs x16, x10, x8; \ - adcs x17, x11, xzr; \ - adcs x19, x12, xzr; \ - adcs x20, x13, xzr; \ - adcs x6, x6, xzr; \ - csel x2, x2, x14, eq; \ - csel x9, x9, x15, eq; \ - csel x10, x10, x16, eq; \ - csel x11, x11, x17, eq; \ - csel x12, x12, x19, eq; \ - csel x13, x13, x20, eq; \ - stp x2, x9, [P0]; \ - stp x10, x11, [P0+16]; \ - stp x12, x13, [P0+32] - -// Almost-Montgomery variant which we use when an input to other muls -// with the other argument fully reduced (which is always safe). In -// fact, with the Karatsuba-based Montgomery mul here, we don't even -// *need* the restriction that the other argument is reduced. 
- -#define amontsqr_p384(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x9, x2, x3; \ - umulh x10, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x8, x2, x4; \ + stp x11, x12, [P0]; \ + stp x13, x8, [P0+16]; \ + stp x9, x10, [P0+32]; \ + mul x8, x2, x5; \ + mul x14, x3, x6; \ + mul x15, x4, x7; \ + umulh x16, x2, x5; \ + umulh x17, x3, x6; \ + umulh x1, x4, x7; \ + adds x16, x16, x14; \ + adcs x17, x17, x15; \ + adc x1, x1, xzr; \ + adds x9, x16, x8; \ + adcs x10, x17, x16; \ + adcs x11, x1, x17; \ + adc x12, x1, xzr; \ adds x10, x10, x8; \ - mul x11, x2, x5; \ - mul x8, x3, x4; \ - adcs x11, x11, x8; \ - umulh x12, x2, x5; \ - mul x8, x3, x5; \ - adcs x12, x12, x8; \ - ldp x6, x7, [P1+32]; \ - mul x13, x2, x7; \ - mul x8, x3, x6; \ - adcs x13, x13, x8; \ - umulh x14, x2, x7; \ - mul x8, x3, x7; \ - adcs x14, x14, x8; \ - mul x15, x5, x6; \ - adcs x15, x15, xzr; \ - umulh x16, x5, x6; \ - adc x16, x16, xzr; \ - umulh x8, x2, x4; \ - adds x11, x11, x8; \ - umulh x8, x3, x4; \ - adcs x12, x12, x8; \ - umulh x8, x3, x5; \ - adcs x13, x13, x8; \ - umulh x8, x3, x6; \ - adcs x14, x14, x8; \ - umulh x8, x3, x7; \ - adcs x15, x15, x8; \ - adc x16, x16, xzr; \ - mul x8, x2, x6; \ - adds x12, x12, x8; \ - mul x8, x4, x5; \ - adcs x13, x13, x8; \ - mul x8, x4, x6; \ - adcs x14, x14, x8; \ - mul x8, x4, x7; \ - adcs x15, x15, x8; \ - mul x8, x5, x7; \ - adcs x16, x16, x8; \ - mul x17, x6, x7; \ - adcs x17, x17, xzr; \ - umulh x19, x6, x7; \ - adc x19, x19, xzr; \ - umulh x8, x2, x6; \ - adds x13, x13, x8; \ - umulh x8, x4, x5; \ - adcs x14, x14, x8; \ - umulh x8, x4, x6; \ - adcs x15, x15, x8; \ - umulh x8, x4, x7; \ - adcs x16, x16, x8; \ - umulh x8, x5, x7; \ - adcs x17, x17, x8; \ - adc x19, x19, xzr; \ - adds x9, x9, x9; \ + adcs x11, x11, x16; \ + adcs x12, x12, x17; \ + adc x13, x1, xzr; \ + subs x17, x2, x3; \ + cneg x17, x17, lo; \ + csetm x14, lo; \ + subs x15, x6, x5; \ + cneg x15, x15, lo; \ + mul x16, x17, x15; \ + umulh x15, x17, x15; \ + cinv x14, x14, lo; \ + eor x16, x16, 
x14; \ + eor x15, x15, x14; \ + cmn x14, #1; \ + adcs x9, x9, x16; \ + adcs x10, x10, x15; \ + adcs x11, x11, x14; \ + adcs x12, x12, x14; \ + adc x13, x13, x14; \ + subs x17, x2, x4; \ + cneg x17, x17, lo; \ + csetm x14, lo; \ + subs x15, x7, x5; \ + cneg x15, x15, lo; \ + mul x16, x17, x15; \ + umulh x15, x17, x15; \ + cinv x14, x14, lo; \ + eor x16, x16, x14; \ + eor x15, x15, x14; \ + cmn x14, #1; \ + adcs x10, x10, x16; \ + adcs x11, x11, x15; \ + adcs x12, x12, x14; \ + adc x13, x13, x14; \ + subs x17, x3, x4; \ + cneg x17, x17, lo; \ + csetm x14, lo; \ + subs x15, x7, x6; \ + cneg x15, x15, lo; \ + mul x16, x17, x15; \ + umulh x15, x17, x15; \ + cinv x14, x14, lo; \ + eor x16, x16, x14; \ + eor x15, x15, x14; \ + cmn x14, #1; \ + adcs x11, x11, x16; \ + adcs x12, x12, x15; \ + adc x13, x13, x14; \ + adds x8, x8, x8; \ + adcs x9, x9, x9; \ adcs x10, x10, x10; \ adcs x11, x11, x11; \ adcs x12, x12, x12; \ adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - adcs x15, x15, x15; \ - adcs x16, x16, x16; \ - adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - cset x20, hs; \ - umulh x8, x2, x2; \ - mul x2, x2, x2; \ - adds x9, x9, x8; \ - mul x8, x3, x3; \ - adcs x10, x10, x8; \ - umulh x8, x3, x3; \ - adcs x11, x11, x8; \ - mul x8, x4, x4; \ - adcs x12, x12, x8; \ - umulh x8, x4, x4; \ - adcs x13, x13, x8; \ - mul x8, x5, x5; \ - adcs x14, x14, x8; \ - umulh x8, x5, x5; \ - adcs x15, x15, x8; \ - mul x8, x6, x6; \ - adcs x16, x16, x8; \ - umulh x8, x6, x6; \ - adcs x17, x17, x8; \ - mul x8, x7, x7; \ - adcs x19, x19, x8; \ - umulh x8, x7, x7; \ - adc x20, x20, x8; \ - lsl x5, x2, #32; \ - add x2, x5, x2; \ - mov x5, #-4294967295; \ - umulh x5, x5, x2; \ - mov x4, #4294967295; \ - mul x3, x4, x2; \ - umulh x4, x4, x2; \ - adds x5, x5, x3; \ - adcs x4, x4, x2; \ - adc x3, xzr, xzr; \ - subs x9, x9, x5; \ - sbcs x10, x10, x4; \ - sbcs x11, x11, x3; \ + adc x17, xzr, xzr; \ + ldp x2, x3, [P0]; \ + adds x8, x8, x2; \ + adcs x9, x9, x3; \ + ldp x2, x3, [P0+16]; \ + adcs x10, 
x10, x2; \ + adcs x11, x11, x3; \ + ldp x2, x3, [P0+32]; \ + adcs x12, x12, x2; \ + adcs x13, x13, x3; \ + adc x17, x17, xzr; \ + lsl x4, x8, #32; \ + add x8, x4, x8; \ + lsr x4, x8, #32; \ + subs x4, x4, x8; \ + sbc x3, x8, xzr; \ + extr x4, x3, x4, #32; \ + lsr x3, x3, #32; \ + adds x3, x3, x8; \ + adc x2, xzr, xzr; \ + subs x9, x9, x4; \ + sbcs x10, x10, x3; \ + sbcs x11, x11, x2; \ sbcs x12, x12, xzr; \ sbcs x13, x13, xzr; \ - sbc x2, x2, xzr; \ - lsl x5, x9, #32; \ - add x9, x5, x9; \ - mov x5, #-4294967295; \ - umulh x5, x5, x9; \ - mov x4, #4294967295; \ - mul x3, x4, x9; \ - umulh x4, x4, x9; \ - adds x5, x5, x3; \ - adcs x4, x4, x9; \ - adc x3, xzr, xzr; \ - subs x10, x10, x5; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ + sbc x8, x8, xzr; \ + lsl x4, x9, #32; \ + add x9, x4, x9; \ + lsr x4, x9, #32; \ + subs x4, x4, x9; \ + sbc x3, x9, xzr; \ + extr x4, x3, x4, #32; \ + lsr x3, x3, #32; \ + adds x3, x3, x9; \ + adc x2, xzr, xzr; \ + subs x10, x10, x4; \ + sbcs x11, x11, x3; \ + sbcs x12, x12, x2; \ sbcs x13, x13, xzr; \ - sbcs x2, x2, xzr; \ + sbcs x8, x8, xzr; \ sbc x9, x9, xzr; \ - lsl x5, x10, #32; \ - add x10, x5, x10; \ - mov x5, #-4294967295; \ - umulh x5, x5, x10; \ - mov x4, #4294967295; \ - mul x3, x4, x10; \ - umulh x4, x4, x10; \ - adds x5, x5, x3; \ - adcs x4, x4, x10; \ - adc x3, xzr, xzr; \ - subs x11, x11, x5; \ - sbcs x12, x12, x4; \ - sbcs x13, x13, x3; \ - sbcs x2, x2, xzr; \ + lsl x4, x10, #32; \ + add x10, x4, x10; \ + lsr x4, x10, #32; \ + subs x4, x4, x10; \ + sbc x3, x10, xzr; \ + extr x4, x3, x4, #32; \ + lsr x3, x3, #32; \ + adds x3, x3, x10; \ + adc x2, xzr, xzr; \ + subs x11, x11, x4; \ + sbcs x12, x12, x3; \ + sbcs x13, x13, x2; \ + sbcs x8, x8, xzr; \ sbcs x9, x9, xzr; \ sbc x10, x10, xzr; \ - lsl x5, x11, #32; \ - add x11, x5, x11; \ - mov x5, #-4294967295; \ - umulh x5, x5, x11; \ - mov x4, #4294967295; \ - mul x3, x4, x11; \ - umulh x4, x4, x11; \ - adds x5, x5, x3; \ - adcs x4, x4, x11; \ - adc x3, xzr, xzr; \ - subs 
x12, x12, x5; \ - sbcs x13, x13, x4; \ - sbcs x2, x2, x3; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbc x11, x11, xzr; \ - lsl x5, x12, #32; \ - add x12, x5, x12; \ - mov x5, #-4294967295; \ - umulh x5, x5, x12; \ - mov x4, #4294967295; \ - mul x3, x4, x12; \ - umulh x4, x4, x12; \ - adds x5, x5, x3; \ - adcs x4, x4, x12; \ - adc x3, xzr, xzr; \ - subs x13, x13, x5; \ - sbcs x2, x2, x4; \ - sbcs x9, x9, x3; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbc x12, x12, xzr; \ - lsl x5, x13, #32; \ - add x13, x5, x13; \ - mov x5, #-4294967295; \ - umulh x5, x5, x13; \ - mov x4, #4294967295; \ - mul x3, x4, x13; \ - umulh x4, x4, x13; \ - adds x5, x5, x3; \ - adcs x4, x4, x13; \ - adc x3, xzr, xzr; \ - subs x2, x2, x5; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - adds x2, x2, x14; \ - adcs x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, x17; \ - adcs x12, x12, x19; \ - adcs x13, x13, x20; \ - mov x14, #-4294967295; \ - mov x15, #4294967295; \ - csel x14, x14, xzr, cs; \ - csel x15, x15, xzr, cs; \ - cset x16, cs; \ - adds x2, x2, x14; \ - adcs x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, xzr; \ - adcs x12, x12, xzr; \ - adc x13, x13, xzr; \ - stp x2, x9, [P0]; \ - stp x10, x11, [P0+16]; \ - stp x12, x13, [P0+32] + adds x17, x17, x8; \ + adcs x8, x9, xzr; \ + adcs x9, x10, xzr; \ + adcs x10, xzr, xzr; \ + mul x1, x5, x5; \ + adds x11, x11, x1; \ + mul x14, x6, x6; \ + mul x15, x7, x7; \ + umulh x1, x5, x5; \ + adcs x12, x12, x1; \ + umulh x1, x6, x6; \ + adcs x13, x13, x14; \ + adcs x17, x17, x1; \ + umulh x1, x7, x7; \ + adcs x8, x8, x15; \ + adcs x9, x9, x1; \ + adc x10, x10, xzr; \ + mul x1, x5, x6; \ + mul x14, x5, x7; \ + mul x15, x6, x7; \ + umulh x16, x5, x6; \ + adds x14, x14, x16; \ + umulh x16, x5, x7; \ + adcs x15, x15, x16; \ + umulh x16, x6, x7; \ + adc x16, x16, xzr; \ + adds x1, x1, x1; \ + adcs x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, 
x16, x16; \ + adc x5, xzr, xzr; \ + adds x12, x12, x1; \ + adcs x13, x13, x14; \ + adcs x17, x17, x15; \ + adcs x8, x8, x16; \ + adcs x9, x9, x5; \ + adc x10, x10, xzr; \ + mov x1, #-4294967295; \ + mov x14, #4294967295; \ + mov x15, #1; \ + cmn x11, x1; \ + adcs xzr, x12, x14; \ + adcs xzr, x13, x15; \ + adcs xzr, x17, xzr; \ + adcs xzr, x8, xzr; \ + adcs xzr, x9, xzr; \ + adc x10, x10, xzr; \ + neg x10, x10; \ + and x1, x1, x10; \ + adds x11, x11, x1; \ + and x14, x14, x10; \ + adcs x12, x12, x14; \ + and x15, x15, x10; \ + adcs x13, x13, x15; \ + adcs x17, x17, xzr; \ + adcs x8, x8, xzr; \ + adc x9, x9, xzr; \ + stp x11, x12, [P0]; \ + stp x13, x17, [P0+16]; \ + stp x8, x9, [P0+32] // Corresponds exactly to bignum_sub_p384 @@ -830,8 +768,8 @@ S2N_BN_SYMBOL(p384_montjadd): // Main code, just a sequence of basic field operations // 8 * multiply + 3 * square + 7 * subtract - amontsqr_p384(z1sq,z_1) - amontsqr_p384(z2sq,z_2) + montsqr_p384(z1sq,z_1) + montsqr_p384(z2sq,z_2) montmul_p384(y1a,z_2,y_1) montmul_p384(y2a,z_1,y_2) @@ -844,26 +782,135 @@ S2N_BN_SYMBOL(p384_montjadd): sub_p384(xd,x2a,x1a) sub_p384(yd,y2a,y1a) - amontsqr_p384(zz,xd) + montsqr_p384(zz,xd) montsqr_p384(ww,yd) montmul_p384(zzx1,zz,x1a) montmul_p384(zzx2,zz,x2a) - sub_p384(x_3,ww,zzx1) + sub_p384(resx,ww,zzx1) sub_p384(t1,zzx2,zzx1) montmul_p384(xd,xd,z_1) - sub_p384(x_3,x_3,zzx2) + sub_p384(resx,resx,zzx2) - sub_p384(t2,zzx1,x_3) + sub_p384(t2,zzx1,resx) montmul_p384(t1,t1,y1a) - montmul_p384(z_3,xd,z_2) + montmul_p384(resz,xd,z_2) montmul_p384(t2,yd,t2) - sub_p384(y_3,t2,t1) + sub_p384(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "HI" <=> CF /\ ~ZF <=> P1 = 0 /\ ~(P2 = 0) +// and "LO" <=> ~CF <=> ~(P1 = 0) /\ P2 = 0 + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + ldp x4, x5, [z_1+32] + + orr x20, x0, x1 + orr x21, x2, x3 + orr x22, x4, x5 + orr x20, x20, x21 + orr x20, x20, 
x22 + cmp x20, xzr + cset x20, ne + + ldp x6, x7, [z_2] + ldp x8, x9, [z_2+16] + ldp x10, x11, [z_2+32] + + orr x21, x6, x7 + orr x22, x8, x9 + orr x23, x10, x11 + orr x21, x21, x22 + orr x21, x21, x23 + cmp x21, xzr + cset x21, ne + + cmp x21, x20 + +// Multiplex the outputs accordingly, re-using the z's in registers + + ldp x12, x13, [resz] + csel x12, x0, x12, lo + csel x13, x1, x13, lo + csel x12, x6, x12, hi + csel x13, x7, x13, hi + ldp x14, x15, [resz+16] + csel x14, x2, x14, lo + csel x15, x3, x15, lo + csel x14, x8, x14, hi + csel x15, x9, x15, hi + ldp x16, x17, [resz+32] + csel x16, x4, x16, lo + csel x17, x5, x17, lo + csel x16, x10, x16, hi + csel x17, x11, x17, hi + + ldp x20, x21, [x_1] + ldp x0, x1, [resx] + csel x0, x20, x0, lo + csel x1, x21, x1, lo + ldp x20, x21, [x_2] + csel x0, x20, x0, hi + csel x1, x21, x1, hi + + ldp x20, x21, [x_1+16] + ldp x2, x3, [resx+16] + csel x2, x20, x2, lo + csel x3, x21, x3, lo + ldp x20, x21, [x_2+16] + csel x2, x20, x2, hi + csel x3, x21, x3, hi + + ldp x20, x21, [x_1+32] + ldp x4, x5, [resx+32] + csel x4, x20, x4, lo + csel x5, x21, x5, lo + ldp x20, x21, [x_2+32] + csel x4, x20, x4, hi + csel x5, x21, x5, hi + + ldp x20, x21, [y_1] + ldp x6, x7, [resy] + csel x6, x20, x6, lo + csel x7, x21, x7, lo + ldp x20, x21, [y_2] + csel x6, x20, x6, hi + csel x7, x21, x7, hi + + ldp x20, x21, [y_1+16] + ldp x8, x9, [resy+16] + csel x8, x20, x8, lo + csel x9, x21, x9, lo + ldp x20, x21, [y_2+16] + csel x8, x20, x8, hi + csel x9, x21, x9, hi + + ldp x20, x21, [y_1+32] + ldp x10, x11, [resy+32] + csel x10, x20, x10, lo + csel x11, x21, x11, lo + ldp x20, x21, [y_2+32] + csel x10, x20, x10, hi + csel x11, x21, x11, hi + +// Finally store back the multiplexed values + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [x_3+32] + stp x6, x7, [y_3] + stp x8, x9, [y_3+16] + stp x10, x11, [y_3+32] + stp x12, x13, [z_3] + stp x14, x15, [z_3+16] + stp x16, x17, [z_3+32] // Restore stack and registers diff --git 
a/third_party/s2n-bignum/arm/p384/p384_montjadd_alt.S b/third_party/s2n-bignum/arm/p384/p384_montjadd_alt.S new file mode 100644 index 0000000000..b84065dea9 --- /dev/null +++ b/third_party/s2n-bignum/arm/p384/p384_montjadd_alt.S @@ -0,0 +1,993 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on NIST curve P-384 in Montgomery-Jacobian coordinates +// +// extern void p384_montjadd_alt +// (uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 18]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjadd_alt) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 48 + +// Stable homes for input arguments during main code sequence + +#define input_z x24 +#define input_x x25 +#define input_y x26 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE +#define z_2 input_y, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, 
#(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define x1a sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define z2sq sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define y1a sp, #(NUMSIZE*6) + +#define NSPACE (NUMSIZE*7) + +// Corresponds exactly to bignum_montmul_p384_alt + +#define montmul_p384(P0,P1,P2) \ + ldp x3, x4, [P1]; \ + ldp x5, x6, [P2]; \ + mul x12, x3, x5; \ + umulh x13, x3, x5; \ + mul x11, x3, x6; \ + umulh x14, x3, x6; \ + adds x13, x13, x11; \ + ldp x7, x8, [P2+16]; \ + mul x11, x3, x7; \ + umulh x15, x3, x7; \ + adcs x14, x14, x11; \ + mul x11, x3, x8; \ + umulh x16, x3, x8; \ + adcs x15, x15, x11; \ + ldp x9, x10, [P2+32]; \ + mul x11, x3, x9; \ + umulh x17, x3, x9; \ + adcs x16, x16, x11; \ + mul x11, x3, x10; \ + umulh x19, x3, x10; \ + adcs x17, x17, x11; \ + adc x19, x19, xzr; \ + mul x11, x4, x5; \ + adds x13, x13, x11; \ + mul x11, x4, x6; \ + adcs x14, x14, x11; \ + mul x11, x4, x7; \ + adcs x15, x15, x11; \ + mul x11, x4, x8; \ + adcs x16, x16, x11; \ + mul x11, x4, x9; \ + adcs x17, x17, x11; \ + mul x11, x4, x10; \ + adcs x19, x19, x11; \ + cset x20, cs; \ + umulh x11, x4, x5; \ + adds x14, x14, x11; \ + umulh x11, x4, x6; \ + adcs x15, x15, x11; \ + umulh x11, x4, x7; \ + adcs x16, x16, x11; \ + umulh x11, x4, x8; \ + adcs x17, x17, x11; \ + umulh x11, x4, x9; \ + adcs x19, x19, x11; \ + umulh x11, x4, x10; \ + adc x20, x20, x11; \ + ldp x3, x4, [P1+16]; \ + mul x11, x3, x5; \ + adds x14, x14, x11; \ + mul x11, x3, x6; \ + adcs x15, x15, x11; \ + mul x11, x3, x7; \ + adcs x16, x16, x11; \ + mul x11, x3, x8; \ + adcs x17, x17, x11; \ + mul x11, x3, x9; \ + adcs x19, x19, x11; \ + mul x11, x3, x10; \ + adcs x20, x20, x11; \ + cset x21, cs; \ + umulh x11, x3, x5; \ + adds x15, x15, x11; \ + umulh x11, x3, x6; \ + adcs x16, x16, x11; \ + umulh x11, x3, x7; \ + adcs x17, x17, x11; 
\ + umulh x11, x3, x8; \ + adcs x19, x19, x11; \ + umulh x11, x3, x9; \ + adcs x20, x20, x11; \ + umulh x11, x3, x10; \ + adc x21, x21, x11; \ + mul x11, x4, x5; \ + adds x15, x15, x11; \ + mul x11, x4, x6; \ + adcs x16, x16, x11; \ + mul x11, x4, x7; \ + adcs x17, x17, x11; \ + mul x11, x4, x8; \ + adcs x19, x19, x11; \ + mul x11, x4, x9; \ + adcs x20, x20, x11; \ + mul x11, x4, x10; \ + adcs x21, x21, x11; \ + cset x22, cs; \ + umulh x11, x4, x5; \ + adds x16, x16, x11; \ + umulh x11, x4, x6; \ + adcs x17, x17, x11; \ + umulh x11, x4, x7; \ + adcs x19, x19, x11; \ + umulh x11, x4, x8; \ + adcs x20, x20, x11; \ + umulh x11, x4, x9; \ + adcs x21, x21, x11; \ + umulh x11, x4, x10; \ + adc x22, x22, x11; \ + ldp x3, x4, [P1+32]; \ + mul x11, x3, x5; \ + adds x16, x16, x11; \ + mul x11, x3, x6; \ + adcs x17, x17, x11; \ + mul x11, x3, x7; \ + adcs x19, x19, x11; \ + mul x11, x3, x8; \ + adcs x20, x20, x11; \ + mul x11, x3, x9; \ + adcs x21, x21, x11; \ + mul x11, x3, x10; \ + adcs x22, x22, x11; \ + cset x2, cs; \ + umulh x11, x3, x5; \ + adds x17, x17, x11; \ + umulh x11, x3, x6; \ + adcs x19, x19, x11; \ + umulh x11, x3, x7; \ + adcs x20, x20, x11; \ + umulh x11, x3, x8; \ + adcs x21, x21, x11; \ + umulh x11, x3, x9; \ + adcs x22, x22, x11; \ + umulh x11, x3, x10; \ + adc x2, x2, x11; \ + mul x11, x4, x5; \ + adds x17, x17, x11; \ + mul x11, x4, x6; \ + adcs x19, x19, x11; \ + mul x11, x4, x7; \ + adcs x20, x20, x11; \ + mul x11, x4, x8; \ + adcs x21, x21, x11; \ + mul x11, x4, x9; \ + adcs x22, x22, x11; \ + mul x11, x4, x10; \ + adcs x2, x2, x11; \ + cset x1, cs; \ + umulh x11, x4, x5; \ + adds x19, x19, x11; \ + umulh x11, x4, x6; \ + adcs x20, x20, x11; \ + umulh x11, x4, x7; \ + adcs x21, x21, x11; \ + umulh x11, x4, x8; \ + adcs x22, x22, x11; \ + umulh x11, x4, x9; \ + adcs x2, x2, x11; \ + umulh x11, x4, x10; \ + adc x1, x1, x11; \ + lsl x7, x12, #32; \ + add x12, x7, x12; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x12; \ + mov x6, #0xffffffff; \ + 
mul x5, x6, x12; \ + umulh x6, x6, x12; \ + adds x7, x7, x5; \ + adcs x6, x6, x12; \ + adc x5, xzr, xzr; \ + subs x13, x13, x7; \ + sbcs x14, x14, x6; \ + sbcs x15, x15, x5; \ + sbcs x16, x16, xzr; \ + sbcs x17, x17, xzr; \ + sbc x12, x12, xzr; \ + lsl x7, x13, #32; \ + add x13, x7, x13; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x13; \ + mov x6, #0xffffffff; \ + mul x5, x6, x13; \ + umulh x6, x6, x13; \ + adds x7, x7, x5; \ + adcs x6, x6, x13; \ + adc x5, xzr, xzr; \ + subs x14, x14, x7; \ + sbcs x15, x15, x6; \ + sbcs x16, x16, x5; \ + sbcs x17, x17, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + lsl x7, x14, #32; \ + add x14, x7, x14; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x14; \ + mov x6, #0xffffffff; \ + mul x5, x6, x14; \ + umulh x6, x6, x14; \ + adds x7, x7, x5; \ + adcs x6, x6, x14; \ + adc x5, xzr, xzr; \ + subs x15, x15, x7; \ + sbcs x16, x16, x6; \ + sbcs x17, x17, x5; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbc x14, x14, xzr; \ + lsl x7, x15, #32; \ + add x15, x7, x15; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x15; \ + mov x6, #0xffffffff; \ + mul x5, x6, x15; \ + umulh x6, x6, x15; \ + adds x7, x7, x5; \ + adcs x6, x6, x15; \ + adc x5, xzr, xzr; \ + subs x16, x16, x7; \ + sbcs x17, x17, x6; \ + sbcs x12, x12, x5; \ + sbcs x13, x13, xzr; \ + sbcs x14, x14, xzr; \ + sbc x15, x15, xzr; \ + lsl x7, x16, #32; \ + add x16, x7, x16; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x16; \ + mov x6, #0xffffffff; \ + mul x5, x6, x16; \ + umulh x6, x6, x16; \ + adds x7, x7, x5; \ + adcs x6, x6, x16; \ + adc x5, xzr, xzr; \ + subs x17, x17, x7; \ + sbcs x12, x12, x6; \ + sbcs x13, x13, x5; \ + sbcs x14, x14, xzr; \ + sbcs x15, x15, xzr; \ + sbc x16, x16, xzr; \ + lsl x7, x17, #32; \ + add x17, x7, x17; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x17; \ + mov x6, #0xffffffff; \ + mul x5, x6, x17; \ + umulh x6, x6, x17; \ + adds x7, x7, x5; \ + adcs x6, x6, x17; \ + adc x5, xzr, xzr; \ + subs x12, x12, x7; \ 
+ sbcs x13, x13, x6; \ + sbcs x14, x14, x5; \ + sbcs x15, x15, xzr; \ + sbcs x16, x16, xzr; \ + sbc x17, x17, xzr; \ + adds x12, x12, x19; \ + adcs x13, x13, x20; \ + adcs x14, x14, x21; \ + adcs x15, x15, x22; \ + adcs x16, x16, x2; \ + adcs x17, x17, x1; \ + adc x10, xzr, xzr; \ + mov x11, #0xffffffff00000001; \ + adds x19, x12, x11; \ + mov x11, #0xffffffff; \ + adcs x20, x13, x11; \ + mov x11, #0x1; \ + adcs x21, x14, x11; \ + adcs x22, x15, xzr; \ + adcs x2, x16, xzr; \ + adcs x1, x17, xzr; \ + adcs x10, x10, xzr; \ + csel x12, x12, x19, eq; \ + csel x13, x13, x20, eq; \ + csel x14, x14, x21, eq; \ + csel x15, x15, x22, eq; \ + csel x16, x16, x2, eq; \ + csel x17, x17, x1, eq; \ + stp x12, x13, [P0]; \ + stp x14, x15, [P0+16]; \ + stp x16, x17, [P0+32] + +// Corresponds exactly to bignum_montsqr_p384_alt + +#define montsqr_p384(P0,P1) \ + ldp x2, x3, [P1]; \ + mul x9, x2, x3; \ + umulh x10, x2, x3; \ + ldp x4, x5, [P1+16]; \ + mul x8, x2, x4; \ + adds x10, x10, x8; \ + mul x11, x2, x5; \ + mul x8, x3, x4; \ + adcs x11, x11, x8; \ + umulh x12, x2, x5; \ + mul x8, x3, x5; \ + adcs x12, x12, x8; \ + ldp x6, x7, [P1+32]; \ + mul x13, x2, x7; \ + mul x8, x3, x6; \ + adcs x13, x13, x8; \ + umulh x14, x2, x7; \ + mul x8, x3, x7; \ + adcs x14, x14, x8; \ + mul x15, x5, x6; \ + adcs x15, x15, xzr; \ + umulh x16, x5, x6; \ + adc x16, x16, xzr; \ + umulh x8, x2, x4; \ + adds x11, x11, x8; \ + umulh x8, x3, x4; \ + adcs x12, x12, x8; \ + umulh x8, x3, x5; \ + adcs x13, x13, x8; \ + umulh x8, x3, x6; \ + adcs x14, x14, x8; \ + umulh x8, x3, x7; \ + adcs x15, x15, x8; \ + adc x16, x16, xzr; \ + mul x8, x2, x6; \ + adds x12, x12, x8; \ + mul x8, x4, x5; \ + adcs x13, x13, x8; \ + mul x8, x4, x6; \ + adcs x14, x14, x8; \ + mul x8, x4, x7; \ + adcs x15, x15, x8; \ + mul x8, x5, x7; \ + adcs x16, x16, x8; \ + mul x17, x6, x7; \ + adcs x17, x17, xzr; \ + umulh x19, x6, x7; \ + adc x19, x19, xzr; \ + umulh x8, x2, x6; \ + adds x13, x13, x8; \ + umulh x8, x4, x5; \ + adcs x14, 
x14, x8; \ + umulh x8, x4, x6; \ + adcs x15, x15, x8; \ + umulh x8, x4, x7; \ + adcs x16, x16, x8; \ + umulh x8, x5, x7; \ + adcs x17, x17, x8; \ + adc x19, x19, xzr; \ + adds x9, x9, x9; \ + adcs x10, x10, x10; \ + adcs x11, x11, x11; \ + adcs x12, x12, x12; \ + adcs x13, x13, x13; \ + adcs x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, x16, x16; \ + adcs x17, x17, x17; \ + adcs x19, x19, x19; \ + cset x20, hs; \ + umulh x8, x2, x2; \ + mul x2, x2, x2; \ + adds x9, x9, x8; \ + mul x8, x3, x3; \ + adcs x10, x10, x8; \ + umulh x8, x3, x3; \ + adcs x11, x11, x8; \ + mul x8, x4, x4; \ + adcs x12, x12, x8; \ + umulh x8, x4, x4; \ + adcs x13, x13, x8; \ + mul x8, x5, x5; \ + adcs x14, x14, x8; \ + umulh x8, x5, x5; \ + adcs x15, x15, x8; \ + mul x8, x6, x6; \ + adcs x16, x16, x8; \ + umulh x8, x6, x6; \ + adcs x17, x17, x8; \ + mul x8, x7, x7; \ + adcs x19, x19, x8; \ + umulh x8, x7, x7; \ + adc x20, x20, x8; \ + lsl x5, x2, #32; \ + add x2, x5, x2; \ + mov x5, #-4294967295; \ + umulh x5, x5, x2; \ + mov x4, #4294967295; \ + mul x3, x4, x2; \ + umulh x4, x4, x2; \ + adds x5, x5, x3; \ + adcs x4, x4, x2; \ + adc x3, xzr, xzr; \ + subs x9, x9, x5; \ + sbcs x10, x10, x4; \ + sbcs x11, x11, x3; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbc x2, x2, xzr; \ + lsl x5, x9, #32; \ + add x9, x5, x9; \ + mov x5, #-4294967295; \ + umulh x5, x5, x9; \ + mov x4, #4294967295; \ + mul x3, x4, x9; \ + umulh x4, x4, x9; \ + adds x5, x5, x3; \ + adcs x4, x4, x9; \ + adc x3, xzr, xzr; \ + subs x10, x10, x5; \ + sbcs x11, x11, x4; \ + sbcs x12, x12, x3; \ + sbcs x13, x13, xzr; \ + sbcs x2, x2, xzr; \ + sbc x9, x9, xzr; \ + lsl x5, x10, #32; \ + add x10, x5, x10; \ + mov x5, #-4294967295; \ + umulh x5, x5, x10; \ + mov x4, #4294967295; \ + mul x3, x4, x10; \ + umulh x4, x4, x10; \ + adds x5, x5, x3; \ + adcs x4, x4, x10; \ + adc x3, xzr, xzr; \ + subs x11, x11, x5; \ + sbcs x12, x12, x4; \ + sbcs x13, x13, x3; \ + sbcs x2, x2, xzr; \ + sbcs x9, x9, xzr; \ + sbc x10, x10, xzr; 
\ + lsl x5, x11, #32; \ + add x11, x5, x11; \ + mov x5, #-4294967295; \ + umulh x5, x5, x11; \ + mov x4, #4294967295; \ + mul x3, x4, x11; \ + umulh x4, x4, x11; \ + adds x5, x5, x3; \ + adcs x4, x4, x11; \ + adc x3, xzr, xzr; \ + subs x12, x12, x5; \ + sbcs x13, x13, x4; \ + sbcs x2, x2, x3; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbc x11, x11, xzr; \ + lsl x5, x12, #32; \ + add x12, x5, x12; \ + mov x5, #-4294967295; \ + umulh x5, x5, x12; \ + mov x4, #4294967295; \ + mul x3, x4, x12; \ + umulh x4, x4, x12; \ + adds x5, x5, x3; \ + adcs x4, x4, x12; \ + adc x3, xzr, xzr; \ + subs x13, x13, x5; \ + sbcs x2, x2, x4; \ + sbcs x9, x9, x3; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbc x12, x12, xzr; \ + lsl x5, x13, #32; \ + add x13, x5, x13; \ + mov x5, #-4294967295; \ + umulh x5, x5, x13; \ + mov x4, #4294967295; \ + mul x3, x4, x13; \ + umulh x4, x4, x13; \ + adds x5, x5, x3; \ + adcs x4, x4, x13; \ + adc x3, xzr, xzr; \ + subs x2, x2, x5; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + adds x2, x2, x14; \ + adcs x9, x9, x15; \ + adcs x10, x10, x16; \ + adcs x11, x11, x17; \ + adcs x12, x12, x19; \ + adcs x13, x13, x20; \ + adc x6, xzr, xzr; \ + mov x8, #-4294967295; \ + adds x14, x2, x8; \ + mov x8, #4294967295; \ + adcs x15, x9, x8; \ + mov x8, #1; \ + adcs x16, x10, x8; \ + adcs x17, x11, xzr; \ + adcs x19, x12, xzr; \ + adcs x20, x13, xzr; \ + adcs x6, x6, xzr; \ + csel x2, x2, x14, eq; \ + csel x9, x9, x15, eq; \ + csel x10, x10, x16, eq; \ + csel x11, x11, x17, eq; \ + csel x12, x12, x19, eq; \ + csel x13, x13, x20, eq; \ + stp x2, x9, [P0]; \ + stp x10, x11, [P0+16]; \ + stp x12, x13, [P0+32] + +// Almost-Montgomery variant which we use when an input to other muls +// with the other argument fully reduced (which is always safe). In +// fact, with the Karatsuba-based Montgomery mul here, we don't even +// *need* the restriction that the other argument is reduced. 
+ +#define amontsqr_p384(P0,P1) \ + ldp x2, x3, [P1]; \ + mul x9, x2, x3; \ + umulh x10, x2, x3; \ + ldp x4, x5, [P1+16]; \ + mul x8, x2, x4; \ + adds x10, x10, x8; \ + mul x11, x2, x5; \ + mul x8, x3, x4; \ + adcs x11, x11, x8; \ + umulh x12, x2, x5; \ + mul x8, x3, x5; \ + adcs x12, x12, x8; \ + ldp x6, x7, [P1+32]; \ + mul x13, x2, x7; \ + mul x8, x3, x6; \ + adcs x13, x13, x8; \ + umulh x14, x2, x7; \ + mul x8, x3, x7; \ + adcs x14, x14, x8; \ + mul x15, x5, x6; \ + adcs x15, x15, xzr; \ + umulh x16, x5, x6; \ + adc x16, x16, xzr; \ + umulh x8, x2, x4; \ + adds x11, x11, x8; \ + umulh x8, x3, x4; \ + adcs x12, x12, x8; \ + umulh x8, x3, x5; \ + adcs x13, x13, x8; \ + umulh x8, x3, x6; \ + adcs x14, x14, x8; \ + umulh x8, x3, x7; \ + adcs x15, x15, x8; \ + adc x16, x16, xzr; \ + mul x8, x2, x6; \ + adds x12, x12, x8; \ + mul x8, x4, x5; \ + adcs x13, x13, x8; \ + mul x8, x4, x6; \ + adcs x14, x14, x8; \ + mul x8, x4, x7; \ + adcs x15, x15, x8; \ + mul x8, x5, x7; \ + adcs x16, x16, x8; \ + mul x17, x6, x7; \ + adcs x17, x17, xzr; \ + umulh x19, x6, x7; \ + adc x19, x19, xzr; \ + umulh x8, x2, x6; \ + adds x13, x13, x8; \ + umulh x8, x4, x5; \ + adcs x14, x14, x8; \ + umulh x8, x4, x6; \ + adcs x15, x15, x8; \ + umulh x8, x4, x7; \ + adcs x16, x16, x8; \ + umulh x8, x5, x7; \ + adcs x17, x17, x8; \ + adc x19, x19, xzr; \ + adds x9, x9, x9; \ + adcs x10, x10, x10; \ + adcs x11, x11, x11; \ + adcs x12, x12, x12; \ + adcs x13, x13, x13; \ + adcs x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, x16, x16; \ + adcs x17, x17, x17; \ + adcs x19, x19, x19; \ + cset x20, hs; \ + umulh x8, x2, x2; \ + mul x2, x2, x2; \ + adds x9, x9, x8; \ + mul x8, x3, x3; \ + adcs x10, x10, x8; \ + umulh x8, x3, x3; \ + adcs x11, x11, x8; \ + mul x8, x4, x4; \ + adcs x12, x12, x8; \ + umulh x8, x4, x4; \ + adcs x13, x13, x8; \ + mul x8, x5, x5; \ + adcs x14, x14, x8; \ + umulh x8, x5, x5; \ + adcs x15, x15, x8; \ + mul x8, x6, x6; \ + adcs x16, x16, x8; \ + umulh x8, x6, x6; \ + adcs 
x17, x17, x8; \ + mul x8, x7, x7; \ + adcs x19, x19, x8; \ + umulh x8, x7, x7; \ + adc x20, x20, x8; \ + lsl x5, x2, #32; \ + add x2, x5, x2; \ + mov x5, #-4294967295; \ + umulh x5, x5, x2; \ + mov x4, #4294967295; \ + mul x3, x4, x2; \ + umulh x4, x4, x2; \ + adds x5, x5, x3; \ + adcs x4, x4, x2; \ + adc x3, xzr, xzr; \ + subs x9, x9, x5; \ + sbcs x10, x10, x4; \ + sbcs x11, x11, x3; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbc x2, x2, xzr; \ + lsl x5, x9, #32; \ + add x9, x5, x9; \ + mov x5, #-4294967295; \ + umulh x5, x5, x9; \ + mov x4, #4294967295; \ + mul x3, x4, x9; \ + umulh x4, x4, x9; \ + adds x5, x5, x3; \ + adcs x4, x4, x9; \ + adc x3, xzr, xzr; \ + subs x10, x10, x5; \ + sbcs x11, x11, x4; \ + sbcs x12, x12, x3; \ + sbcs x13, x13, xzr; \ + sbcs x2, x2, xzr; \ + sbc x9, x9, xzr; \ + lsl x5, x10, #32; \ + add x10, x5, x10; \ + mov x5, #-4294967295; \ + umulh x5, x5, x10; \ + mov x4, #4294967295; \ + mul x3, x4, x10; \ + umulh x4, x4, x10; \ + adds x5, x5, x3; \ + adcs x4, x4, x10; \ + adc x3, xzr, xzr; \ + subs x11, x11, x5; \ + sbcs x12, x12, x4; \ + sbcs x13, x13, x3; \ + sbcs x2, x2, xzr; \ + sbcs x9, x9, xzr; \ + sbc x10, x10, xzr; \ + lsl x5, x11, #32; \ + add x11, x5, x11; \ + mov x5, #-4294967295; \ + umulh x5, x5, x11; \ + mov x4, #4294967295; \ + mul x3, x4, x11; \ + umulh x4, x4, x11; \ + adds x5, x5, x3; \ + adcs x4, x4, x11; \ + adc x3, xzr, xzr; \ + subs x12, x12, x5; \ + sbcs x13, x13, x4; \ + sbcs x2, x2, x3; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbc x11, x11, xzr; \ + lsl x5, x12, #32; \ + add x12, x5, x12; \ + mov x5, #-4294967295; \ + umulh x5, x5, x12; \ + mov x4, #4294967295; \ + mul x3, x4, x12; \ + umulh x4, x4, x12; \ + adds x5, x5, x3; \ + adcs x4, x4, x12; \ + adc x3, xzr, xzr; \ + subs x13, x13, x5; \ + sbcs x2, x2, x4; \ + sbcs x9, x9, x3; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbc x12, x12, xzr; \ + lsl x5, x13, #32; \ + add x13, x5, x13; \ + mov x5, #-4294967295; \ + umulh x5, x5, x13; \ + 
mov x4, #4294967295; \ + mul x3, x4, x13; \ + umulh x4, x4, x13; \ + adds x5, x5, x3; \ + adcs x4, x4, x13; \ + adc x3, xzr, xzr; \ + subs x2, x2, x5; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + adds x2, x2, x14; \ + adcs x9, x9, x15; \ + adcs x10, x10, x16; \ + adcs x11, x11, x17; \ + adcs x12, x12, x19; \ + adcs x13, x13, x20; \ + mov x14, #-4294967295; \ + mov x15, #4294967295; \ + csel x14, x14, xzr, cs; \ + csel x15, x15, xzr, cs; \ + cset x16, cs; \ + adds x2, x2, x14; \ + adcs x9, x9, x15; \ + adcs x10, x10, x16; \ + adcs x11, x11, xzr; \ + adcs x12, x12, xzr; \ + adc x13, x13, xzr; \ + stp x2, x9, [P0]; \ + stp x10, x11, [P0+16]; \ + stp x12, x13, [P0+32] + +// Corresponds exactly to bignum_sub_p384 + +#define sub_p384(P0,P1,P2) \ + ldp x5, x6, [P1]; \ + ldp x4, x3, [P2]; \ + subs x5, x5, x4; \ + sbcs x6, x6, x3; \ + ldp x7, x8, [P1+16]; \ + ldp x4, x3, [P2+16]; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x3; \ + ldp x9, x10, [P1+32]; \ + ldp x4, x3, [P2+32]; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + csetm x3, lo; \ + mov x4, #4294967295; \ + and x4, x4, x3; \ + adds x5, x5, x4; \ + eor x4, x4, x3; \ + adcs x6, x6, x4; \ + mov x4, #-2; \ + and x4, x4, x3; \ + adcs x7, x7, x4; \ + adcs x8, x8, x3; \ + adcs x9, x9, x3; \ + adc x10, x10, x3; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32] + +S2N_BN_SYMBOL(p384_montjadd_alt): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! 
+ sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations +// 12 * multiply + 4 * square + 7 * subtract + + amontsqr_p384(z1sq,z_1) + amontsqr_p384(z2sq,z_2) + + montmul_p384(y1a,z_2,y_1) + montmul_p384(y2a,z_1,y_2) + + montmul_p384(x2a,z1sq,x_2) + montmul_p384(x1a,z2sq,x_1) + montmul_p384(y2a,z1sq,y2a) + montmul_p384(y1a,z2sq,y1a) + + sub_p384(xd,x2a,x1a) + sub_p384(yd,y2a,y1a) + + amontsqr_p384(zz,xd) + montsqr_p384(ww,yd) + + montmul_p384(zzx1,zz,x1a) + montmul_p384(zzx2,zz,x2a) + + sub_p384(resx,ww,zzx1) + sub_p384(t1,zzx2,zzx1) + + montmul_p384(xd,xd,z_1) + + sub_p384(resx,resx,zzx2) + + sub_p384(t2,zzx1,resx) + + montmul_p384(t1,t1,y1a) + montmul_p384(resz,xd,z_2) + montmul_p384(t2,yd,t2) + + sub_p384(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "HI" <=> CF /\ ~ZF <=> P1 = 0 /\ ~(P2 = 0) +// and "LO" <=> ~CF <=> ~(P1 = 0) /\ P2 = 0 + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + ldp x4, x5, [z_1+32] + + orr x20, x0, x1 + orr x21, x2, x3 + orr x22, x4, x5 + orr x20, x20, x21 + orr x20, x20, x22 + cmp x20, xzr + cset x20, ne + + ldp x6, x7, [z_2] + ldp x8, x9, [z_2+16] + ldp x10, x11, [z_2+32] + + orr x21, x6, x7 + orr x22, x8, x9 + orr x23, x10, x11 + orr x21, x21, x22 + orr x21, x21, x23 + cmp x21, xzr + cset x21, ne + + cmp x21, x20 + +// Multiplex the outputs accordingly, re-using the z's in registers + + ldp x12, x13, [resz] + csel x12, x0, x12, lo + csel x13, x1, x13, lo + csel x12, x6, x12, hi + csel x13, x7, x13, hi + ldp x14, x15, [resz+16] + csel x14, x2, x14, lo + csel x15, x3, x15, lo + csel x14, x8, x14, hi + csel x15, x9, x15, hi + ldp x16, x17, [resz+32] + csel x16, x4, x16, lo + csel x17, x5, x17, lo + csel x16, x10, x16, hi + csel x17, x11, x17, hi + + ldp x20, x21, [x_1] + ldp x0, x1, [resx] + csel x0,
x20, x0, lo + csel x1, x21, x1, lo + ldp x20, x21, [x_2] + csel x0, x20, x0, hi + csel x1, x21, x1, hi + + ldp x20, x21, [x_1+16] + ldp x2, x3, [resx+16] + csel x2, x20, x2, lo + csel x3, x21, x3, lo + ldp x20, x21, [x_2+16] + csel x2, x20, x2, hi + csel x3, x21, x3, hi + + ldp x20, x21, [x_1+32] + ldp x4, x5, [resx+32] + csel x4, x20, x4, lo + csel x5, x21, x5, lo + ldp x20, x21, [x_2+32] + csel x4, x20, x4, hi + csel x5, x21, x5, hi + + ldp x20, x21, [y_1] + ldp x6, x7, [resy] + csel x6, x20, x6, lo + csel x7, x21, x7, lo + ldp x20, x21, [y_2] + csel x6, x20, x6, hi + csel x7, x21, x7, hi + + ldp x20, x21, [y_1+16] + ldp x8, x9, [resy+16] + csel x8, x20, x8, lo + csel x9, x21, x9, lo + ldp x20, x21, [y_2+16] + csel x8, x20, x8, hi + csel x9, x21, x9, hi + + ldp x20, x21, [y_1+32] + ldp x10, x11, [resy+32] + csel x10, x20, x10, lo + csel x11, x21, x11, lo + ldp x20, x21, [y_2+32] + csel x10, x20, x10, hi + csel x11, x21, x11, hi + +// Finally store back the multiplexed values + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [x_3+32] + stp x6, x7, [y_3] + stp x8, x9, [y_3+16] + stp x10, x11, [y_3+32] + stp x12, x13, [z_3] + stp x14, x15, [z_3+16] + stp x16, x17, [z_3+32] + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x25, x26, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/p384/p384_montjdouble.S b/third_party/s2n-bignum/arm/p384/p384_montjdouble.S index 7dfd9766f2..3f92103cad 100644 --- a/third_party/s2n-bignum/arm/p384/p384_montjdouble.S +++ b/third_party/s2n-bignum/arm/p384/p384_montjdouble.S @@ -26,8 +26,8 @@ // Stable homes for input arguments during main code sequence -#define input_z x23 -#define input_x x24 +#define input_z x25 +#define input_x x26 // Pointer-offset pairs for inputs and outputs @@ -56,501 +56,652 @@ #define d sp, #(NUMSIZE*6) #define 
x4p sp, #(NUMSIZE*6) -#define NSPACE (NUMSIZE*7) +#define NSPACE #(NUMSIZE*7) -// Corresponds exactly to bignum_montmul_p384_alt +// Corresponds exactly to bignum_montmul_p384 #define montmul_p384(P0,P1,P2) \ ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - mul x12, x3, x5; \ - umulh x13, x3, x5; \ - mul x11, x3, x6; \ - umulh x14, x3, x6; \ - adds x13, x13, x11; \ - ldp x7, x8, [P2+16]; \ - mul x11, x3, x7; \ - umulh x15, x3, x7; \ - adcs x14, x14, x11; \ - mul x11, x3, x8; \ - umulh x16, x3, x8; \ - adcs x15, x15, x11; \ - ldp x9, x10, [P2+32]; \ - mul x11, x3, x9; \ - umulh x17, x3, x9; \ - adcs x16, x16, x11; \ - mul x11, x3, x10; \ - umulh x19, x3, x10; \ - adcs x17, x17, x11; \ - adc x19, x19, xzr; \ - mul x11, x4, x5; \ - adds x13, x13, x11; \ - mul x11, x4, x6; \ - adcs x14, x14, x11; \ - mul x11, x4, x7; \ - adcs x15, x15, x11; \ - mul x11, x4, x8; \ - adcs x16, x16, x11; \ - mul x11, x4, x9; \ - adcs x17, x17, x11; \ - mul x11, x4, x10; \ - adcs x19, x19, x11; \ - cset x20, cs; \ - umulh x11, x4, x5; \ - adds x14, x14, x11; \ - umulh x11, x4, x6; \ - adcs x15, x15, x11; \ - umulh x11, x4, x7; \ - adcs x16, x16, x11; \ - umulh x11, x4, x8; \ - adcs x17, x17, x11; \ - umulh x11, x4, x9; \ - adcs x19, x19, x11; \ - umulh x11, x4, x10; \ - adc x20, x20, x11; \ - ldp x3, x4, [P1+16]; \ - mul x11, x3, x5; \ - adds x14, x14, x11; \ - mul x11, x3, x6; \ - adcs x15, x15, x11; \ - mul x11, x3, x7; \ - adcs x16, x16, x11; \ - mul x11, x3, x8; \ - adcs x17, x17, x11; \ - mul x11, x3, x9; \ - adcs x19, x19, x11; \ - mul x11, x3, x10; \ - adcs x20, x20, x11; \ - cset x21, cs; \ - umulh x11, x3, x5; \ - adds x15, x15, x11; \ - umulh x11, x3, x6; \ - adcs x16, x16, x11; \ - umulh x11, x3, x7; \ - adcs x17, x17, x11; \ - umulh x11, x3, x8; \ - adcs x19, x19, x11; \ - umulh x11, x3, x9; \ - adcs x20, x20, x11; \ - umulh x11, x3, x10; \ - adc x21, x21, x11; \ - mul x11, x4, x5; \ - adds x15, x15, x11; \ - mul x11, x4, x6; \ - adcs x16, x16, x11; \ - mul x11, x4, x7; \ - adcs 
x17, x17, x11; \ - mul x11, x4, x8; \ - adcs x19, x19, x11; \ - mul x11, x4, x9; \ - adcs x20, x20, x11; \ - mul x11, x4, x10; \ - adcs x21, x21, x11; \ - cset x22, cs; \ - umulh x11, x4, x5; \ - adds x16, x16, x11; \ - umulh x11, x4, x6; \ - adcs x17, x17, x11; \ - umulh x11, x4, x7; \ - adcs x19, x19, x11; \ - umulh x11, x4, x8; \ - adcs x20, x20, x11; \ - umulh x11, x4, x9; \ - adcs x21, x21, x11; \ - umulh x11, x4, x10; \ - adc x22, x22, x11; \ - ldp x3, x4, [P1+32]; \ - mul x11, x3, x5; \ - adds x16, x16, x11; \ - mul x11, x3, x6; \ - adcs x17, x17, x11; \ - mul x11, x3, x7; \ - adcs x19, x19, x11; \ - mul x11, x3, x8; \ - adcs x20, x20, x11; \ - mul x11, x3, x9; \ - adcs x21, x21, x11; \ - mul x11, x3, x10; \ - adcs x22, x22, x11; \ - cset x2, cs; \ - umulh x11, x3, x5; \ - adds x17, x17, x11; \ - umulh x11, x3, x6; \ - adcs x19, x19, x11; \ - umulh x11, x3, x7; \ - adcs x20, x20, x11; \ - umulh x11, x3, x8; \ - adcs x21, x21, x11; \ - umulh x11, x3, x9; \ - adcs x22, x22, x11; \ - umulh x11, x3, x10; \ - adc x2, x2, x11; \ - mul x11, x4, x5; \ - adds x17, x17, x11; \ - mul x11, x4, x6; \ - adcs x19, x19, x11; \ - mul x11, x4, x7; \ - adcs x20, x20, x11; \ - mul x11, x4, x8; \ - adcs x21, x21, x11; \ - mul x11, x4, x9; \ - adcs x22, x22, x11; \ - mul x11, x4, x10; \ - adcs x2, x2, x11; \ - cset x1, cs; \ - umulh x11, x4, x5; \ - adds x19, x19, x11; \ - umulh x11, x4, x6; \ - adcs x20, x20, x11; \ - umulh x11, x4, x7; \ - adcs x21, x21, x11; \ - umulh x11, x4, x8; \ - adcs x22, x22, x11; \ - umulh x11, x4, x9; \ - adcs x2, x2, x11; \ - umulh x11, x4, x10; \ - adc x1, x1, x11; \ - lsl x7, x12, #32; \ - add x12, x7, x12; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x12; \ - mov x6, #0xffffffff; \ - mul x5, x6, x12; \ - umulh x6, x6, x12; \ - adds x7, x7, x5; \ - adcs x6, x6, x12; \ - adc x5, xzr, xzr; \ - subs x13, x13, x7; \ - sbcs x14, x14, x6; \ - sbcs x15, x15, x5; \ + ldp x5, x6, [P1+16]; \ + ldp x7, x8, [P1+32]; \ + ldp x9, x10, [P2]; \ + ldp x11, 
x12, [P2+16]; \ + ldp x13, x14, [P2+32]; \ + mul x15, x3, x9; \ + mul x21, x4, x10; \ + mul x22, x5, x11; \ + umulh x23, x3, x9; \ + umulh x24, x4, x10; \ + umulh x1, x5, x11; \ + adds x23, x23, x21; \ + adcs x24, x24, x22; \ + adc x1, x1, xzr; \ + adds x16, x23, x15; \ + adcs x17, x24, x23; \ + adcs x19, x1, x24; \ + adc x20, x1, xzr; \ + adds x17, x17, x15; \ + adcs x19, x19, x23; \ + adcs x20, x20, x24; \ + adc x1, x1, xzr; \ + subs x24, x3, x4; \ + cneg x24, x24, lo; \ + csetm x23, lo; \ + subs x22, x10, x9; \ + cneg x22, x22, lo; \ + mul x21, x24, x22; \ + umulh x22, x24, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x16, x16, x21; \ + adcs x17, x17, x22; \ + adcs x19, x19, x23; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x24, x3, x5; \ + cneg x24, x24, lo; \ + csetm x23, lo; \ + subs x22, x11, x9; \ + cneg x22, x22, lo; \ + mul x21, x24, x22; \ + umulh x22, x24, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x17, x17, x21; \ + adcs x19, x19, x22; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x24, x4, x5; \ + cneg x24, x24, lo; \ + csetm x23, lo; \ + subs x22, x11, x10; \ + cneg x22, x22, lo; \ + mul x21, x24, x22; \ + umulh x22, x24, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x19, x19, x21; \ + adcs x20, x20, x22; \ + adc x1, x1, x23; \ + lsl x23, x15, #32; \ + add x15, x23, x15; \ + lsr x23, x15, #32; \ + subs x23, x23, x15; \ + sbc x22, x15, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x15; \ + adc x21, xzr, xzr; \ + subs x16, x16, x23; \ + sbcs x17, x17, x22; \ + sbcs x19, x19, x21; \ + sbcs x20, x20, xzr; \ + sbcs x1, x1, xzr; \ + sbc x15, x15, xzr; \ + lsl x23, x16, #32; \ + add x16, x23, x16; \ + lsr x23, x16, #32; \ + subs x23, x23, x16; \ + sbc x22, x16, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x16; \ + adc 
x21, xzr, xzr; \ + subs x17, x17, x23; \ + sbcs x19, x19, x22; \ + sbcs x20, x20, x21; \ + sbcs x1, x1, xzr; \ + sbcs x15, x15, xzr; \ + sbc x16, x16, xzr; \ + lsl x23, x17, #32; \ + add x17, x23, x17; \ + lsr x23, x17, #32; \ + subs x23, x23, x17; \ + sbc x22, x17, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x17; \ + adc x21, xzr, xzr; \ + subs x19, x19, x23; \ + sbcs x20, x20, x22; \ + sbcs x1, x1, x21; \ + sbcs x15, x15, xzr; \ sbcs x16, x16, xzr; \ - sbcs x17, x17, xzr; \ - sbc x12, x12, xzr; \ - lsl x7, x13, #32; \ - add x13, x7, x13; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x13; \ - mov x6, #0xffffffff; \ - mul x5, x6, x13; \ - umulh x6, x6, x13; \ - adds x7, x7, x5; \ - adcs x6, x6, x13; \ - adc x5, xzr, xzr; \ - subs x14, x14, x7; \ - sbcs x15, x15, x6; \ - sbcs x16, x16, x5; \ - sbcs x17, x17, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - lsl x7, x14, #32; \ - add x14, x7, x14; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x14; \ - mov x6, #0xffffffff; \ - mul x5, x6, x14; \ - umulh x6, x6, x14; \ - adds x7, x7, x5; \ - adcs x6, x6, x14; \ - adc x5, xzr, xzr; \ - subs x15, x15, x7; \ - sbcs x16, x16, x6; \ - sbcs x17, x17, x5; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbc x14, x14, xzr; \ - lsl x7, x15, #32; \ - add x15, x7, x15; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x15; \ - mov x6, #0xffffffff; \ - mul x5, x6, x15; \ - umulh x6, x6, x15; \ - adds x7, x7, x5; \ - adcs x6, x6, x15; \ - adc x5, xzr, xzr; \ - subs x16, x16, x7; \ - sbcs x17, x17, x6; \ - sbcs x12, x12, x5; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ + sbc x17, x17, xzr; \ + stp x19, x20, [P0]; \ + stp x1, x15, [P0+16]; \ + stp x16, x17, [P0+32]; \ + mul x15, x6, x12; \ + mul x21, x7, x13; \ + mul x22, x8, x14; \ + umulh x23, x6, x12; \ + umulh x24, x7, x13; \ + umulh x1, x8, x14; \ + adds x23, x23, x21; \ + adcs x24, x24, x22; \ + adc x1, x1, xzr; \ + adds x16, x23, x15; \ + adcs x17, x24, x23; \ + adcs 
x19, x1, x24; \ + adc x20, x1, xzr; \ + adds x17, x17, x15; \ + adcs x19, x19, x23; \ + adcs x20, x20, x24; \ + adc x1, x1, xzr; \ + subs x24, x6, x7; \ + cneg x24, x24, lo; \ + csetm x23, lo; \ + subs x22, x13, x12; \ + cneg x22, x22, lo; \ + mul x21, x24, x22; \ + umulh x22, x24, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x16, x16, x21; \ + adcs x17, x17, x22; \ + adcs x19, x19, x23; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x24, x6, x8; \ + cneg x24, x24, lo; \ + csetm x23, lo; \ + subs x22, x14, x12; \ + cneg x22, x22, lo; \ + mul x21, x24, x22; \ + umulh x22, x24, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x17, x17, x21; \ + adcs x19, x19, x22; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x24, x7, x8; \ + cneg x24, x24, lo; \ + csetm x23, lo; \ + subs x22, x14, x13; \ + cneg x22, x22, lo; \ + mul x21, x24, x22; \ + umulh x22, x24, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x19, x19, x21; \ + adcs x20, x20, x22; \ + adc x1, x1, x23; \ + subs x6, x6, x3; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x5; \ + ngc x3, xzr; \ + cmn x3, #1; \ + eor x6, x6, x3; \ + adcs x6, x6, xzr; \ + eor x7, x7, x3; \ + adcs x7, x7, xzr; \ + eor x8, x8, x3; \ + adc x8, x8, xzr; \ + subs x9, x9, x12; \ + sbcs x10, x10, x13; \ + sbcs x11, x11, x14; \ + ngc x14, xzr; \ + cmn x14, #1; \ + eor x9, x9, x14; \ + adcs x9, x9, xzr; \ + eor x10, x10, x14; \ + adcs x10, x10, xzr; \ + eor x11, x11, x14; \ + adc x11, x11, xzr; \ + eor x14, x3, x14; \ + ldp x21, x22, [P0]; \ + adds x15, x15, x21; \ + adcs x16, x16, x22; \ + ldp x21, x22, [P0+16]; \ + adcs x17, x17, x21; \ + adcs x19, x19, x22; \ + ldp x21, x22, [P0+32]; \ + adcs x20, x20, x21; \ + adcs x1, x1, x22; \ + adc x2, xzr, xzr; \ + stp x15, x16, [P0]; \ + stp x17, x19, [P0+16]; \ + stp x20, x1, [P0+32]; \ + mul x15, x6, x9; \ + mul x21, x7, x10; \ + mul 
x22, x8, x11; \ + umulh x23, x6, x9; \ + umulh x24, x7, x10; \ + umulh x1, x8, x11; \ + adds x23, x23, x21; \ + adcs x24, x24, x22; \ + adc x1, x1, xzr; \ + adds x16, x23, x15; \ + adcs x17, x24, x23; \ + adcs x19, x1, x24; \ + adc x20, x1, xzr; \ + adds x17, x17, x15; \ + adcs x19, x19, x23; \ + adcs x20, x20, x24; \ + adc x1, x1, xzr; \ + subs x24, x6, x7; \ + cneg x24, x24, lo; \ + csetm x23, lo; \ + subs x22, x10, x9; \ + cneg x22, x22, lo; \ + mul x21, x24, x22; \ + umulh x22, x24, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x16, x16, x21; \ + adcs x17, x17, x22; \ + adcs x19, x19, x23; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x24, x6, x8; \ + cneg x24, x24, lo; \ + csetm x23, lo; \ + subs x22, x11, x9; \ + cneg x22, x22, lo; \ + mul x21, x24, x22; \ + umulh x22, x24, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x17, x17, x21; \ + adcs x19, x19, x22; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x24, x7, x8; \ + cneg x24, x24, lo; \ + csetm x23, lo; \ + subs x22, x11, x10; \ + cneg x22, x22, lo; \ + mul x21, x24, x22; \ + umulh x22, x24, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x19, x19, x21; \ + adcs x20, x20, x22; \ + adc x1, x1, x23; \ + ldp x3, x4, [P0]; \ + ldp x5, x6, [P0+16]; \ + ldp x7, x8, [P0+32]; \ + cmn x14, #1; \ + eor x15, x15, x14; \ + adcs x15, x15, x3; \ + eor x16, x16, x14; \ + adcs x16, x16, x4; \ + eor x17, x17, x14; \ + adcs x17, x17, x5; \ + eor x19, x19, x14; \ + adcs x19, x19, x6; \ + eor x20, x20, x14; \ + adcs x20, x20, x7; \ + eor x1, x1, x14; \ + adcs x1, x1, x8; \ + adcs x9, x14, x2; \ + adcs x10, x14, xzr; \ + adcs x11, x14, xzr; \ + adc x12, x14, xzr; \ + adds x19, x19, x3; \ + adcs x20, x20, x4; \ + adcs x1, x1, x5; \ + adcs x9, x9, x6; \ + adcs x10, x10, x7; \ + adcs x11, x11, x8; \ + adc x12, x12, x2; \ + lsl x23, x15, #32; \ + add x15, 
x23, x15; \ + lsr x23, x15, #32; \ + subs x23, x23, x15; \ + sbc x22, x15, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x15; \ + adc x21, xzr, xzr; \ + subs x16, x16, x23; \ + sbcs x17, x17, x22; \ + sbcs x19, x19, x21; \ + sbcs x20, x20, xzr; \ + sbcs x1, x1, xzr; \ sbc x15, x15, xzr; \ - lsl x7, x16, #32; \ - add x16, x7, x16; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x16; \ - mov x6, #0xffffffff; \ - mul x5, x6, x16; \ - umulh x6, x6, x16; \ - adds x7, x7, x5; \ - adcs x6, x6, x16; \ - adc x5, xzr, xzr; \ - subs x17, x17, x7; \ - sbcs x12, x12, x6; \ - sbcs x13, x13, x5; \ - sbcs x14, x14, xzr; \ + lsl x23, x16, #32; \ + add x16, x23, x16; \ + lsr x23, x16, #32; \ + subs x23, x23, x16; \ + sbc x22, x16, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x16; \ + adc x21, xzr, xzr; \ + subs x17, x17, x23; \ + sbcs x19, x19, x22; \ + sbcs x20, x20, x21; \ + sbcs x1, x1, xzr; \ sbcs x15, x15, xzr; \ sbc x16, x16, xzr; \ - lsl x7, x17, #32; \ - add x17, x7, x17; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x17; \ - mov x6, #0xffffffff; \ - mul x5, x6, x17; \ - umulh x6, x6, x17; \ - adds x7, x7, x5; \ - adcs x6, x6, x17; \ - adc x5, xzr, xzr; \ - subs x12, x12, x7; \ - sbcs x13, x13, x6; \ - sbcs x14, x14, x5; \ + lsl x23, x17, #32; \ + add x17, x23, x17; \ + lsr x23, x17, #32; \ + subs x23, x23, x17; \ + sbc x22, x17, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x17; \ + adc x21, xzr, xzr; \ + subs x19, x19, x23; \ + sbcs x20, x20, x22; \ + sbcs x1, x1, x21; \ sbcs x15, x15, xzr; \ sbcs x16, x16, xzr; \ sbc x17, x17, xzr; \ - adds x12, x12, x19; \ - adcs x13, x13, x20; \ - adcs x14, x14, x21; \ - adcs x15, x15, x22; \ - adcs x16, x16, x2; \ - adcs x17, x17, x1; \ - adc x10, xzr, xzr; \ - mov x11, #0xffffffff00000001; \ - adds x19, x12, x11; \ - mov x11, #0xffffffff; \ - adcs x20, x13, x11; \ - mov x11, #0x1; \ - adcs x21, x14, x11; \ - adcs x22, x15, xzr; \ - adcs 
x2, x16, xzr; \ - adcs x1, x17, xzr; \ + adds x9, x9, x15; \ + adcs x10, x10, x16; \ + adcs x11, x11, x17; \ + adc x12, x12, xzr; \ + add x22, x12, #1; \ + lsl x21, x22, #32; \ + subs x24, x22, x21; \ + sbc x21, x21, xzr; \ + adds x19, x19, x24; \ + adcs x20, x20, x21; \ + adcs x1, x1, x22; \ + adcs x9, x9, xzr; \ adcs x10, x10, xzr; \ - csel x12, x12, x19, eq; \ - csel x13, x13, x20, eq; \ - csel x14, x14, x21, eq; \ - csel x15, x15, x22, eq; \ - csel x16, x16, x2, eq; \ - csel x17, x17, x1, eq; \ - stp x12, x13, [P0]; \ - stp x14, x15, [P0+16]; \ - stp x16, x17, [P0+32] - -// Corresponds exactly to bignum_montsqr_p384_alt + adcs x11, x11, xzr; \ + csetm x22, lo; \ + mov x23, #4294967295; \ + and x23, x23, x22; \ + adds x19, x19, x23; \ + eor x23, x23, x22; \ + adcs x20, x20, x23; \ + mov x23, #-2; \ + and x23, x23, x22; \ + adcs x1, x1, x23; \ + adcs x9, x9, x22; \ + adcs x10, x10, x22; \ + adc x11, x11, x22; \ + stp x19, x20, [P0]; \ + stp x1, x9, [P0+16]; \ + stp x10, x11, [P0+32] + +// Corresponds exactly to bignum_montsqr_p384 #define montsqr_p384(P0,P1) \ ldp x2, x3, [P1]; \ - mul x9, x2, x3; \ - umulh x10, x2, x3; \ ldp x4, x5, [P1+16]; \ - mul x8, x2, x4; \ - adds x10, x10, x8; \ - mul x11, x2, x5; \ - mul x8, x3, x4; \ - adcs x11, x11, x8; \ - umulh x12, x2, x5; \ - mul x8, x3, x5; \ - adcs x12, x12, x8; \ ldp x6, x7, [P1+32]; \ - mul x13, x2, x7; \ - mul x8, x3, x6; \ - adcs x13, x13, x8; \ - umulh x14, x2, x7; \ - mul x8, x3, x7; \ - adcs x14, x14, x8; \ - mul x15, x5, x6; \ - adcs x15, x15, xzr; \ - umulh x16, x5, x6; \ - adc x16, x16, xzr; \ - umulh x8, x2, x4; \ - adds x11, x11, x8; \ - umulh x8, x3, x4; \ - adcs x12, x12, x8; \ - umulh x8, x3, x5; \ - adcs x13, x13, x8; \ - umulh x8, x3, x6; \ - adcs x14, x14, x8; \ - umulh x8, x3, x7; \ - adcs x15, x15, x8; \ - adc x16, x16, xzr; \ - mul x8, x2, x6; \ - adds x12, x12, x8; \ - mul x8, x4, x5; \ - adcs x13, x13, x8; \ - mul x8, x4, x6; \ - adcs x14, x14, x8; \ - mul x8, x4, x7; \ - adcs x15, x15, x8; 
\ - mul x8, x5, x7; \ - adcs x16, x16, x8; \ - mul x17, x6, x7; \ + mul x14, x2, x3; \ + mul x15, x2, x4; \ + mul x16, x3, x4; \ + mul x8, x2, x2; \ + mul x10, x3, x3; \ + mul x12, x4, x4; \ + umulh x17, x2, x3; \ + adds x15, x15, x17; \ + umulh x17, x2, x4; \ + adcs x16, x16, x17; \ + umulh x17, x3, x4; \ adcs x17, x17, xzr; \ - umulh x19, x6, x7; \ - adc x19, x19, xzr; \ - umulh x8, x2, x6; \ - adds x13, x13, x8; \ - umulh x8, x4, x5; \ - adcs x14, x14, x8; \ - umulh x8, x4, x6; \ - adcs x15, x15, x8; \ - umulh x8, x4, x7; \ - adcs x16, x16, x8; \ - umulh x8, x5, x7; \ - adcs x17, x17, x8; \ - adc x19, x19, xzr; \ - adds x9, x9, x9; \ - adcs x10, x10, x10; \ - adcs x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ + umulh x9, x2, x2; \ + umulh x11, x3, x3; \ + umulh x13, x4, x4; \ + adds x14, x14, x14; \ adcs x15, x15, x15; \ adcs x16, x16, x16; \ adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - cset x20, hs; \ - umulh x8, x2, x2; \ - mul x2, x2, x2; \ - adds x9, x9, x8; \ - mul x8, x3, x3; \ - adcs x10, x10, x8; \ - umulh x8, x3, x3; \ - adcs x11, x11, x8; \ - mul x8, x4, x4; \ - adcs x12, x12, x8; \ - umulh x8, x4, x4; \ - adcs x13, x13, x8; \ - mul x8, x5, x5; \ - adcs x14, x14, x8; \ - umulh x8, x5, x5; \ - adcs x15, x15, x8; \ - mul x8, x6, x6; \ - adcs x16, x16, x8; \ - umulh x8, x6, x6; \ - adcs x17, x17, x8; \ - mul x8, x7, x7; \ - adcs x19, x19, x8; \ - umulh x8, x7, x7; \ - adc x20, x20, x8; \ - lsl x5, x2, #32; \ - add x2, x5, x2; \ - mov x5, #-4294967295; \ - umulh x5, x5, x2; \ - mov x4, #4294967295; \ - mul x3, x4, x2; \ - umulh x4, x4, x2; \ - adds x5, x5, x3; \ - adcs x4, x4, x2; \ - adc x3, xzr, xzr; \ - subs x9, x9, x5; \ - sbcs x10, x10, x4; \ - sbcs x11, x11, x3; \ + adc x13, x13, xzr; \ + adds x9, x9, x14; \ + adcs x10, x10, x15; \ + adcs x11, x11, x16; \ + adcs x12, x12, x17; \ + adc x13, x13, xzr; \ + lsl x16, x8, #32; \ + add x8, x16, x8; \ + lsr x16, x8, #32; \ + subs x16, x16, x8; \ + sbc x15, x8, 
xzr; \ + extr x16, x15, x16, #32; \ + lsr x15, x15, #32; \ + adds x15, x15, x8; \ + adc x14, xzr, xzr; \ + subs x9, x9, x16; \ + sbcs x10, x10, x15; \ + sbcs x11, x11, x14; \ sbcs x12, x12, xzr; \ sbcs x13, x13, xzr; \ - sbc x2, x2, xzr; \ - lsl x5, x9, #32; \ - add x9, x5, x9; \ - mov x5, #-4294967295; \ - umulh x5, x5, x9; \ - mov x4, #4294967295; \ - mul x3, x4, x9; \ - umulh x4, x4, x9; \ - adds x5, x5, x3; \ - adcs x4, x4, x9; \ - adc x3, xzr, xzr; \ - subs x10, x10, x5; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ + sbc x8, x8, xzr; \ + lsl x16, x9, #32; \ + add x9, x16, x9; \ + lsr x16, x9, #32; \ + subs x16, x16, x9; \ + sbc x15, x9, xzr; \ + extr x16, x15, x16, #32; \ + lsr x15, x15, #32; \ + adds x15, x15, x9; \ + adc x14, xzr, xzr; \ + subs x10, x10, x16; \ + sbcs x11, x11, x15; \ + sbcs x12, x12, x14; \ sbcs x13, x13, xzr; \ - sbcs x2, x2, xzr; \ + sbcs x8, x8, xzr; \ sbc x9, x9, xzr; \ - lsl x5, x10, #32; \ - add x10, x5, x10; \ - mov x5, #-4294967295; \ - umulh x5, x5, x10; \ - mov x4, #4294967295; \ - mul x3, x4, x10; \ - umulh x4, x4, x10; \ - adds x5, x5, x3; \ - adcs x4, x4, x10; \ - adc x3, xzr, xzr; \ - subs x11, x11, x5; \ - sbcs x12, x12, x4; \ - sbcs x13, x13, x3; \ - sbcs x2, x2, xzr; \ + lsl x16, x10, #32; \ + add x10, x16, x10; \ + lsr x16, x10, #32; \ + subs x16, x16, x10; \ + sbc x15, x10, xzr; \ + extr x16, x15, x16, #32; \ + lsr x15, x15, #32; \ + adds x15, x15, x10; \ + adc x14, xzr, xzr; \ + subs x11, x11, x16; \ + sbcs x12, x12, x15; \ + sbcs x13, x13, x14; \ + sbcs x8, x8, xzr; \ sbcs x9, x9, xzr; \ sbc x10, x10, xzr; \ - lsl x5, x11, #32; \ - add x11, x5, x11; \ - mov x5, #-4294967295; \ - umulh x5, x5, x11; \ - mov x4, #4294967295; \ - mul x3, x4, x11; \ - umulh x4, x4, x11; \ - adds x5, x5, x3; \ - adcs x4, x4, x11; \ - adc x3, xzr, xzr; \ - subs x12, x12, x5; \ - sbcs x13, x13, x4; \ - sbcs x2, x2, x3; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbc x11, x11, xzr; \ - lsl x5, x12, #32; \ - add x12, x5, x12; \ - mov x5, 
#-4294967295; \ - umulh x5, x5, x12; \ - mov x4, #4294967295; \ - mul x3, x4, x12; \ - umulh x4, x4, x12; \ - adds x5, x5, x3; \ - adcs x4, x4, x12; \ - adc x3, xzr, xzr; \ - subs x13, x13, x5; \ - sbcs x2, x2, x4; \ - sbcs x9, x9, x3; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbc x12, x12, xzr; \ - lsl x5, x13, #32; \ - add x13, x5, x13; \ - mov x5, #-4294967295; \ - umulh x5, x5, x13; \ - mov x4, #4294967295; \ - mul x3, x4, x13; \ - umulh x4, x4, x13; \ - adds x5, x5, x3; \ - adcs x4, x4, x13; \ - adc x3, xzr, xzr; \ - subs x2, x2, x5; \ - sbcs x9, x9, x4; \ + stp x11, x12, [P0]; \ + stp x13, x8, [P0+16]; \ + stp x9, x10, [P0+32]; \ + mul x8, x2, x5; \ + mul x14, x3, x6; \ + mul x15, x4, x7; \ + umulh x16, x2, x5; \ + umulh x17, x3, x6; \ + umulh x1, x4, x7; \ + adds x16, x16, x14; \ + adcs x17, x17, x15; \ + adc x1, x1, xzr; \ + adds x9, x16, x8; \ + adcs x10, x17, x16; \ + adcs x11, x1, x17; \ + adc x12, x1, xzr; \ + adds x10, x10, x8; \ + adcs x11, x11, x16; \ + adcs x12, x12, x17; \ + adc x13, x1, xzr; \ + subs x17, x2, x3; \ + cneg x17, x17, lo; \ + csetm x14, lo; \ + subs x15, x6, x5; \ + cneg x15, x15, lo; \ + mul x16, x17, x15; \ + umulh x15, x17, x15; \ + cinv x14, x14, lo; \ + eor x16, x16, x14; \ + eor x15, x15, x14; \ + cmn x14, #1; \ + adcs x9, x9, x16; \ + adcs x10, x10, x15; \ + adcs x11, x11, x14; \ + adcs x12, x12, x14; \ + adc x13, x13, x14; \ + subs x17, x2, x4; \ + cneg x17, x17, lo; \ + csetm x14, lo; \ + subs x15, x7, x5; \ + cneg x15, x15, lo; \ + mul x16, x17, x15; \ + umulh x15, x17, x15; \ + cinv x14, x14, lo; \ + eor x16, x16, x14; \ + eor x15, x15, x14; \ + cmn x14, #1; \ + adcs x10, x10, x16; \ + adcs x11, x11, x15; \ + adcs x12, x12, x14; \ + adc x13, x13, x14; \ + subs x17, x3, x4; \ + cneg x17, x17, lo; \ + csetm x14, lo; \ + subs x15, x7, x6; \ + cneg x15, x15, lo; \ + mul x16, x17, x15; \ + umulh x15, x17, x15; \ + cinv x14, x14, lo; \ + eor x16, x16, x14; \ + eor x15, x15, x14; \ + cmn x14, #1; \ + adcs x11, x11, x16; 
\ + adcs x12, x12, x15; \ + adc x13, x13, x14; \ + adds x8, x8, x8; \ + adcs x9, x9, x9; \ + adcs x10, x10, x10; \ + adcs x11, x11, x11; \ + adcs x12, x12, x12; \ + adcs x13, x13, x13; \ + adc x17, xzr, xzr; \ + ldp x2, x3, [P0]; \ + adds x8, x8, x2; \ + adcs x9, x9, x3; \ + ldp x2, x3, [P0+16]; \ + adcs x10, x10, x2; \ + adcs x11, x11, x3; \ + ldp x2, x3, [P0+32]; \ + adcs x12, x12, x2; \ + adcs x13, x13, x3; \ + adc x17, x17, xzr; \ + lsl x4, x8, #32; \ + add x8, x4, x8; \ + lsr x4, x8, #32; \ + subs x4, x4, x8; \ + sbc x3, x8, xzr; \ + extr x4, x3, x4, #32; \ + lsr x3, x3, #32; \ + adds x3, x3, x8; \ + adc x2, xzr, xzr; \ + subs x9, x9, x4; \ sbcs x10, x10, x3; \ - sbcs x11, x11, xzr; \ + sbcs x11, x11, x2; \ sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - adds x2, x2, x14; \ - adcs x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, x17; \ - adcs x12, x12, x19; \ - adcs x13, x13, x20; \ - adc x6, xzr, xzr; \ - mov x8, #-4294967295; \ - adds x14, x2, x8; \ - mov x8, #4294967295; \ - adcs x15, x9, x8; \ - mov x8, #1; \ - adcs x16, x10, x8; \ - adcs x17, x11, xzr; \ - adcs x19, x12, xzr; \ - adcs x20, x13, xzr; \ - adcs x6, x6, xzr; \ - csel x2, x2, x14, eq; \ - csel x9, x9, x15, eq; \ - csel x10, x10, x16, eq; \ - csel x11, x11, x17, eq; \ - csel x12, x12, x19, eq; \ - csel x13, x13, x20, eq; \ - stp x2, x9, [P0]; \ - stp x10, x11, [P0+16]; \ - stp x12, x13, [P0+32] + sbcs x13, x13, xzr; \ + sbc x8, x8, xzr; \ + lsl x4, x9, #32; \ + add x9, x4, x9; \ + lsr x4, x9, #32; \ + subs x4, x4, x9; \ + sbc x3, x9, xzr; \ + extr x4, x3, x4, #32; \ + lsr x3, x3, #32; \ + adds x3, x3, x9; \ + adc x2, xzr, xzr; \ + subs x10, x10, x4; \ + sbcs x11, x11, x3; \ + sbcs x12, x12, x2; \ + sbcs x13, x13, xzr; \ + sbcs x8, x8, xzr; \ + sbc x9, x9, xzr; \ + lsl x4, x10, #32; \ + add x10, x4, x10; \ + lsr x4, x10, #32; \ + subs x4, x4, x10; \ + sbc x3, x10, xzr; \ + extr x4, x3, x4, #32; \ + lsr x3, x3, #32; \ + adds x3, x3, x10; \ + adc x2, xzr, xzr; \ + subs x11, x11, x4; \ + sbcs 
x12, x12, x3; \ + sbcs x13, x13, x2; \ + sbcs x8, x8, xzr; \ + sbcs x9, x9, xzr; \ + sbc x10, x10, xzr; \ + adds x17, x17, x8; \ + adcs x8, x9, xzr; \ + adcs x9, x10, xzr; \ + adcs x10, xzr, xzr; \ + mul x1, x5, x5; \ + adds x11, x11, x1; \ + mul x14, x6, x6; \ + mul x15, x7, x7; \ + umulh x1, x5, x5; \ + adcs x12, x12, x1; \ + umulh x1, x6, x6; \ + adcs x13, x13, x14; \ + adcs x17, x17, x1; \ + umulh x1, x7, x7; \ + adcs x8, x8, x15; \ + adcs x9, x9, x1; \ + adc x10, x10, xzr; \ + mul x1, x5, x6; \ + mul x14, x5, x7; \ + mul x15, x6, x7; \ + umulh x16, x5, x6; \ + adds x14, x14, x16; \ + umulh x16, x5, x7; \ + adcs x15, x15, x16; \ + umulh x16, x6, x7; \ + adc x16, x16, xzr; \ + adds x1, x1, x1; \ + adcs x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, x16, x16; \ + adc x5, xzr, xzr; \ + adds x12, x12, x1; \ + adcs x13, x13, x14; \ + adcs x17, x17, x15; \ + adcs x8, x8, x16; \ + adcs x9, x9, x5; \ + adc x10, x10, xzr; \ + mov x1, #-4294967295; \ + mov x14, #4294967295; \ + mov x15, #1; \ + cmn x11, x1; \ + adcs xzr, x12, x14; \ + adcs xzr, x13, x15; \ + adcs xzr, x17, xzr; \ + adcs xzr, x8, xzr; \ + adcs xzr, x9, xzr; \ + adc x10, x10, xzr; \ + neg x10, x10; \ + and x1, x1, x10; \ + adds x11, x11, x1; \ + and x14, x14, x10; \ + adcs x12, x12, x14; \ + and x15, x15, x10; \ + adcs x13, x13, x15; \ + adcs x17, x17, xzr; \ + adcs x8, x8, xzr; \ + adc x9, x9, xzr; \ + stp x11, x12, [P0]; \ + stp x13, x17, [P0+16]; \ + stp x8, x9, [P0+32] // Corresponds exactly to bignum_sub_p384 @@ -876,10 +1027,11 @@ S2N_BN_SYMBOL(p384_montjdouble): // Save regs and make room on stack for temporary variables - stp x19, x20, [sp, #-16]! - stp x21, x22, [sp, #-16]! - stp x23, x24, [sp, #-16]! 
- sub sp, sp, NSPACE + sub sp, sp, NSPACE+64 + stp x19, x20, [sp, NSPACE] + stp x21, x22, [sp, NSPACE+16] + stp x23, x24, [sp, NSPACE+32] + stp x25, x26, [sp, NSPACE+48] // Move the input arguments to stable places @@ -938,12 +1090,11 @@ S2N_BN_SYMBOL(p384_montjdouble): // Restore stack and registers - add sp, sp, NSPACE - - ldp x23, x24, [sp], 16 - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - + ldp x19, x20, [sp, NSPACE] + ldp x21, x22, [sp, NSPACE+16] + ldp x23, x24, [sp, NSPACE+32] + ldp x25, x26, [sp, NSPACE+48] + add sp, sp, NSPACE+64 ret #if defined(__linux__) && defined(__ELF__) diff --git a/third_party/s2n-bignum/arm/p384/p384_montjdouble_alt.S b/third_party/s2n-bignum/arm/p384/p384_montjdouble_alt.S new file mode 100644 index 0000000000..0e83ff4a98 --- /dev/null +++ b/third_party/s2n-bignum/arm/p384/p384_montjdouble_alt.S @@ -0,0 +1,951 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on NIST curve P-384 in Montgomery-Jacobian coordinates +// +// extern void p384_montjdouble_alt +// (uint64_t p3[static 18],uint64_t p1[static 18]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). 
+// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjdouble_alt) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 48 + +// Stable homes for input arguments during main code sequence + +#define input_z x23 +#define input_x x24 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z2 sp, #(NUMSIZE*0) +#define y2 sp, #(NUMSIZE*1) +#define x2p sp, #(NUMSIZE*2) +#define xy2 sp, #(NUMSIZE*3) + +#define y4 sp, #(NUMSIZE*4) +#define t2 sp, #(NUMSIZE*4) + +#define dx2 sp, #(NUMSIZE*5) +#define t1 sp, #(NUMSIZE*5) + +#define d sp, #(NUMSIZE*6) +#define x4p sp, #(NUMSIZE*6) + +#define NSPACE (NUMSIZE*7) + +// Corresponds exactly to bignum_montmul_p384_alt + +#define montmul_p384(P0,P1,P2) \ + ldp x3, x4, [P1]; \ + ldp x5, x6, [P2]; \ + mul x12, x3, x5; \ + umulh x13, x3, x5; \ + mul x11, x3, x6; \ + umulh x14, x3, x6; \ + adds x13, x13, x11; \ + ldp x7, x8, [P2+16]; \ + mul x11, x3, x7; \ + umulh x15, x3, x7; \ + adcs x14, x14, x11; \ + mul x11, x3, x8; \ + umulh x16, x3, x8; \ + adcs x15, x15, x11; \ + ldp x9, x10, [P2+32]; \ + mul x11, x3, x9; \ + umulh x17, x3, x9; \ + adcs x16, x16, x11; \ + mul x11, x3, x10; \ + umulh x19, x3, x10; \ + adcs x17, x17, x11; \ + adc x19, x19, xzr; \ + mul x11, x4, x5; \ + adds x13, x13, x11; \ + mul x11, x4, x6; \ + adcs x14, x14, x11; \ + mul x11, x4, x7; \ + adcs x15, x15, x11; \ + mul x11, x4, x8; \ + adcs x16, x16, x11; \ + mul x11, x4, x9; \ + adcs x17, x17, x11; \ + mul x11, x4, x10; \ + adcs 
x19, x19, x11; \ + cset x20, cs; \ + umulh x11, x4, x5; \ + adds x14, x14, x11; \ + umulh x11, x4, x6; \ + adcs x15, x15, x11; \ + umulh x11, x4, x7; \ + adcs x16, x16, x11; \ + umulh x11, x4, x8; \ + adcs x17, x17, x11; \ + umulh x11, x4, x9; \ + adcs x19, x19, x11; \ + umulh x11, x4, x10; \ + adc x20, x20, x11; \ + ldp x3, x4, [P1+16]; \ + mul x11, x3, x5; \ + adds x14, x14, x11; \ + mul x11, x3, x6; \ + adcs x15, x15, x11; \ + mul x11, x3, x7; \ + adcs x16, x16, x11; \ + mul x11, x3, x8; \ + adcs x17, x17, x11; \ + mul x11, x3, x9; \ + adcs x19, x19, x11; \ + mul x11, x3, x10; \ + adcs x20, x20, x11; \ + cset x21, cs; \ + umulh x11, x3, x5; \ + adds x15, x15, x11; \ + umulh x11, x3, x6; \ + adcs x16, x16, x11; \ + umulh x11, x3, x7; \ + adcs x17, x17, x11; \ + umulh x11, x3, x8; \ + adcs x19, x19, x11; \ + umulh x11, x3, x9; \ + adcs x20, x20, x11; \ + umulh x11, x3, x10; \ + adc x21, x21, x11; \ + mul x11, x4, x5; \ + adds x15, x15, x11; \ + mul x11, x4, x6; \ + adcs x16, x16, x11; \ + mul x11, x4, x7; \ + adcs x17, x17, x11; \ + mul x11, x4, x8; \ + adcs x19, x19, x11; \ + mul x11, x4, x9; \ + adcs x20, x20, x11; \ + mul x11, x4, x10; \ + adcs x21, x21, x11; \ + cset x22, cs; \ + umulh x11, x4, x5; \ + adds x16, x16, x11; \ + umulh x11, x4, x6; \ + adcs x17, x17, x11; \ + umulh x11, x4, x7; \ + adcs x19, x19, x11; \ + umulh x11, x4, x8; \ + adcs x20, x20, x11; \ + umulh x11, x4, x9; \ + adcs x21, x21, x11; \ + umulh x11, x4, x10; \ + adc x22, x22, x11; \ + ldp x3, x4, [P1+32]; \ + mul x11, x3, x5; \ + adds x16, x16, x11; \ + mul x11, x3, x6; \ + adcs x17, x17, x11; \ + mul x11, x3, x7; \ + adcs x19, x19, x11; \ + mul x11, x3, x8; \ + adcs x20, x20, x11; \ + mul x11, x3, x9; \ + adcs x21, x21, x11; \ + mul x11, x3, x10; \ + adcs x22, x22, x11; \ + cset x2, cs; \ + umulh x11, x3, x5; \ + adds x17, x17, x11; \ + umulh x11, x3, x6; \ + adcs x19, x19, x11; \ + umulh x11, x3, x7; \ + adcs x20, x20, x11; \ + umulh x11, x3, x8; \ + adcs x21, x21, x11; \ + umulh x11, 
x3, x9; \ + adcs x22, x22, x11; \ + umulh x11, x3, x10; \ + adc x2, x2, x11; \ + mul x11, x4, x5; \ + adds x17, x17, x11; \ + mul x11, x4, x6; \ + adcs x19, x19, x11; \ + mul x11, x4, x7; \ + adcs x20, x20, x11; \ + mul x11, x4, x8; \ + adcs x21, x21, x11; \ + mul x11, x4, x9; \ + adcs x22, x22, x11; \ + mul x11, x4, x10; \ + adcs x2, x2, x11; \ + cset x1, cs; \ + umulh x11, x4, x5; \ + adds x19, x19, x11; \ + umulh x11, x4, x6; \ + adcs x20, x20, x11; \ + umulh x11, x4, x7; \ + adcs x21, x21, x11; \ + umulh x11, x4, x8; \ + adcs x22, x22, x11; \ + umulh x11, x4, x9; \ + adcs x2, x2, x11; \ + umulh x11, x4, x10; \ + adc x1, x1, x11; \ + lsl x7, x12, #32; \ + add x12, x7, x12; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x12; \ + mov x6, #0xffffffff; \ + mul x5, x6, x12; \ + umulh x6, x6, x12; \ + adds x7, x7, x5; \ + adcs x6, x6, x12; \ + adc x5, xzr, xzr; \ + subs x13, x13, x7; \ + sbcs x14, x14, x6; \ + sbcs x15, x15, x5; \ + sbcs x16, x16, xzr; \ + sbcs x17, x17, xzr; \ + sbc x12, x12, xzr; \ + lsl x7, x13, #32; \ + add x13, x7, x13; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x13; \ + mov x6, #0xffffffff; \ + mul x5, x6, x13; \ + umulh x6, x6, x13; \ + adds x7, x7, x5; \ + adcs x6, x6, x13; \ + adc x5, xzr, xzr; \ + subs x14, x14, x7; \ + sbcs x15, x15, x6; \ + sbcs x16, x16, x5; \ + sbcs x17, x17, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + lsl x7, x14, #32; \ + add x14, x7, x14; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x14; \ + mov x6, #0xffffffff; \ + mul x5, x6, x14; \ + umulh x6, x6, x14; \ + adds x7, x7, x5; \ + adcs x6, x6, x14; \ + adc x5, xzr, xzr; \ + subs x15, x15, x7; \ + sbcs x16, x16, x6; \ + sbcs x17, x17, x5; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbc x14, x14, xzr; \ + lsl x7, x15, #32; \ + add x15, x7, x15; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x15; \ + mov x6, #0xffffffff; \ + mul x5, x6, x15; \ + umulh x6, x6, x15; \ + adds x7, x7, x5; \ + adcs x6, x6, x15; \ + adc x5, xzr, xzr; \ 
+ subs x16, x16, x7; \ + sbcs x17, x17, x6; \ + sbcs x12, x12, x5; \ + sbcs x13, x13, xzr; \ + sbcs x14, x14, xzr; \ + sbc x15, x15, xzr; \ + lsl x7, x16, #32; \ + add x16, x7, x16; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x16; \ + mov x6, #0xffffffff; \ + mul x5, x6, x16; \ + umulh x6, x6, x16; \ + adds x7, x7, x5; \ + adcs x6, x6, x16; \ + adc x5, xzr, xzr; \ + subs x17, x17, x7; \ + sbcs x12, x12, x6; \ + sbcs x13, x13, x5; \ + sbcs x14, x14, xzr; \ + sbcs x15, x15, xzr; \ + sbc x16, x16, xzr; \ + lsl x7, x17, #32; \ + add x17, x7, x17; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x17; \ + mov x6, #0xffffffff; \ + mul x5, x6, x17; \ + umulh x6, x6, x17; \ + adds x7, x7, x5; \ + adcs x6, x6, x17; \ + adc x5, xzr, xzr; \ + subs x12, x12, x7; \ + sbcs x13, x13, x6; \ + sbcs x14, x14, x5; \ + sbcs x15, x15, xzr; \ + sbcs x16, x16, xzr; \ + sbc x17, x17, xzr; \ + adds x12, x12, x19; \ + adcs x13, x13, x20; \ + adcs x14, x14, x21; \ + adcs x15, x15, x22; \ + adcs x16, x16, x2; \ + adcs x17, x17, x1; \ + adc x10, xzr, xzr; \ + mov x11, #0xffffffff00000001; \ + adds x19, x12, x11; \ + mov x11, #0xffffffff; \ + adcs x20, x13, x11; \ + mov x11, #0x1; \ + adcs x21, x14, x11; \ + adcs x22, x15, xzr; \ + adcs x2, x16, xzr; \ + adcs x1, x17, xzr; \ + adcs x10, x10, xzr; \ + csel x12, x12, x19, eq; \ + csel x13, x13, x20, eq; \ + csel x14, x14, x21, eq; \ + csel x15, x15, x22, eq; \ + csel x16, x16, x2, eq; \ + csel x17, x17, x1, eq; \ + stp x12, x13, [P0]; \ + stp x14, x15, [P0+16]; \ + stp x16, x17, [P0+32] + +// Corresponds exactly to bignum_montsqr_p384_alt + +#define montsqr_p384(P0,P1) \ + ldp x2, x3, [P1]; \ + mul x9, x2, x3; \ + umulh x10, x2, x3; \ + ldp x4, x5, [P1+16]; \ + mul x8, x2, x4; \ + adds x10, x10, x8; \ + mul x11, x2, x5; \ + mul x8, x3, x4; \ + adcs x11, x11, x8; \ + umulh x12, x2, x5; \ + mul x8, x3, x5; \ + adcs x12, x12, x8; \ + ldp x6, x7, [P1+32]; \ + mul x13, x2, x7; \ + mul x8, x3, x6; \ + adcs x13, x13, x8; \ + umulh x14, x2, x7; \ 
+ mul x8, x3, x7; \ + adcs x14, x14, x8; \ + mul x15, x5, x6; \ + adcs x15, x15, xzr; \ + umulh x16, x5, x6; \ + adc x16, x16, xzr; \ + umulh x8, x2, x4; \ + adds x11, x11, x8; \ + umulh x8, x3, x4; \ + adcs x12, x12, x8; \ + umulh x8, x3, x5; \ + adcs x13, x13, x8; \ + umulh x8, x3, x6; \ + adcs x14, x14, x8; \ + umulh x8, x3, x7; \ + adcs x15, x15, x8; \ + adc x16, x16, xzr; \ + mul x8, x2, x6; \ + adds x12, x12, x8; \ + mul x8, x4, x5; \ + adcs x13, x13, x8; \ + mul x8, x4, x6; \ + adcs x14, x14, x8; \ + mul x8, x4, x7; \ + adcs x15, x15, x8; \ + mul x8, x5, x7; \ + adcs x16, x16, x8; \ + mul x17, x6, x7; \ + adcs x17, x17, xzr; \ + umulh x19, x6, x7; \ + adc x19, x19, xzr; \ + umulh x8, x2, x6; \ + adds x13, x13, x8; \ + umulh x8, x4, x5; \ + adcs x14, x14, x8; \ + umulh x8, x4, x6; \ + adcs x15, x15, x8; \ + umulh x8, x4, x7; \ + adcs x16, x16, x8; \ + umulh x8, x5, x7; \ + adcs x17, x17, x8; \ + adc x19, x19, xzr; \ + adds x9, x9, x9; \ + adcs x10, x10, x10; \ + adcs x11, x11, x11; \ + adcs x12, x12, x12; \ + adcs x13, x13, x13; \ + adcs x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, x16, x16; \ + adcs x17, x17, x17; \ + adcs x19, x19, x19; \ + cset x20, hs; \ + umulh x8, x2, x2; \ + mul x2, x2, x2; \ + adds x9, x9, x8; \ + mul x8, x3, x3; \ + adcs x10, x10, x8; \ + umulh x8, x3, x3; \ + adcs x11, x11, x8; \ + mul x8, x4, x4; \ + adcs x12, x12, x8; \ + umulh x8, x4, x4; \ + adcs x13, x13, x8; \ + mul x8, x5, x5; \ + adcs x14, x14, x8; \ + umulh x8, x5, x5; \ + adcs x15, x15, x8; \ + mul x8, x6, x6; \ + adcs x16, x16, x8; \ + umulh x8, x6, x6; \ + adcs x17, x17, x8; \ + mul x8, x7, x7; \ + adcs x19, x19, x8; \ + umulh x8, x7, x7; \ + adc x20, x20, x8; \ + lsl x5, x2, #32; \ + add x2, x5, x2; \ + mov x5, #-4294967295; \ + umulh x5, x5, x2; \ + mov x4, #4294967295; \ + mul x3, x4, x2; \ + umulh x4, x4, x2; \ + adds x5, x5, x3; \ + adcs x4, x4, x2; \ + adc x3, xzr, xzr; \ + subs x9, x9, x5; \ + sbcs x10, x10, x4; \ + sbcs x11, x11, x3; \ + sbcs x12, x12, 
xzr; \ + sbcs x13, x13, xzr; \ + sbc x2, x2, xzr; \ + lsl x5, x9, #32; \ + add x9, x5, x9; \ + mov x5, #-4294967295; \ + umulh x5, x5, x9; \ + mov x4, #4294967295; \ + mul x3, x4, x9; \ + umulh x4, x4, x9; \ + adds x5, x5, x3; \ + adcs x4, x4, x9; \ + adc x3, xzr, xzr; \ + subs x10, x10, x5; \ + sbcs x11, x11, x4; \ + sbcs x12, x12, x3; \ + sbcs x13, x13, xzr; \ + sbcs x2, x2, xzr; \ + sbc x9, x9, xzr; \ + lsl x5, x10, #32; \ + add x10, x5, x10; \ + mov x5, #-4294967295; \ + umulh x5, x5, x10; \ + mov x4, #4294967295; \ + mul x3, x4, x10; \ + umulh x4, x4, x10; \ + adds x5, x5, x3; \ + adcs x4, x4, x10; \ + adc x3, xzr, xzr; \ + subs x11, x11, x5; \ + sbcs x12, x12, x4; \ + sbcs x13, x13, x3; \ + sbcs x2, x2, xzr; \ + sbcs x9, x9, xzr; \ + sbc x10, x10, xzr; \ + lsl x5, x11, #32; \ + add x11, x5, x11; \ + mov x5, #-4294967295; \ + umulh x5, x5, x11; \ + mov x4, #4294967295; \ + mul x3, x4, x11; \ + umulh x4, x4, x11; \ + adds x5, x5, x3; \ + adcs x4, x4, x11; \ + adc x3, xzr, xzr; \ + subs x12, x12, x5; \ + sbcs x13, x13, x4; \ + sbcs x2, x2, x3; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbc x11, x11, xzr; \ + lsl x5, x12, #32; \ + add x12, x5, x12; \ + mov x5, #-4294967295; \ + umulh x5, x5, x12; \ + mov x4, #4294967295; \ + mul x3, x4, x12; \ + umulh x4, x4, x12; \ + adds x5, x5, x3; \ + adcs x4, x4, x12; \ + adc x3, xzr, xzr; \ + subs x13, x13, x5; \ + sbcs x2, x2, x4; \ + sbcs x9, x9, x3; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbc x12, x12, xzr; \ + lsl x5, x13, #32; \ + add x13, x5, x13; \ + mov x5, #-4294967295; \ + umulh x5, x5, x13; \ + mov x4, #4294967295; \ + mul x3, x4, x13; \ + umulh x4, x4, x13; \ + adds x5, x5, x3; \ + adcs x4, x4, x13; \ + adc x3, xzr, xzr; \ + subs x2, x2, x5; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + adds x2, x2, x14; \ + adcs x9, x9, x15; \ + adcs x10, x10, x16; \ + adcs x11, x11, x17; \ + adcs x12, x12, x19; \ + adcs x13, x13, x20; \ 
+ adc x6, xzr, xzr; \ + mov x8, #-4294967295; \ + adds x14, x2, x8; \ + mov x8, #4294967295; \ + adcs x15, x9, x8; \ + mov x8, #1; \ + adcs x16, x10, x8; \ + adcs x17, x11, xzr; \ + adcs x19, x12, xzr; \ + adcs x20, x13, xzr; \ + adcs x6, x6, xzr; \ + csel x2, x2, x14, eq; \ + csel x9, x9, x15, eq; \ + csel x10, x10, x16, eq; \ + csel x11, x11, x17, eq; \ + csel x12, x12, x19, eq; \ + csel x13, x13, x20, eq; \ + stp x2, x9, [P0]; \ + stp x10, x11, [P0+16]; \ + stp x12, x13, [P0+32] + +// Corresponds exactly to bignum_sub_p384 + +#define sub_p384(P0,P1,P2) \ + ldp x5, x6, [P1]; \ + ldp x4, x3, [P2]; \ + subs x5, x5, x4; \ + sbcs x6, x6, x3; \ + ldp x7, x8, [P1+16]; \ + ldp x4, x3, [P2+16]; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x3; \ + ldp x9, x10, [P1+32]; \ + ldp x4, x3, [P2+32]; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + csetm x3, lo; \ + mov x4, #4294967295; \ + and x4, x4, x3; \ + adds x5, x5, x4; \ + eor x4, x4, x3; \ + adcs x6, x6, x4; \ + mov x4, #-2; \ + and x4, x4, x3; \ + adcs x7, x7, x4; \ + adcs x8, x8, x3; \ + adcs x9, x9, x3; \ + adc x10, x10, x3; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32] + +// Corresponds exactly to bignum_add_p384 + +#define add_p384(P0,P1,P2) \ + ldp x5, x6, [P1]; \ + ldp x4, x3, [P2]; \ + adds x5, x5, x4; \ + adcs x6, x6, x3; \ + ldp x7, x8, [P1+16]; \ + ldp x4, x3, [P2+16]; \ + adcs x7, x7, x4; \ + adcs x8, x8, x3; \ + ldp x9, x10, [P1+32]; \ + ldp x4, x3, [P2+32]; \ + adcs x9, x9, x4; \ + adcs x10, x10, x3; \ + adc x3, xzr, xzr; \ + mov x4, #0xffffffff; \ + cmp x5, x4; \ + mov x4, #0xffffffff00000000; \ + sbcs xzr, x6, x4; \ + mov x4, #0xfffffffffffffffe; \ + sbcs xzr, x7, x4; \ + adcs xzr, x8, xzr; \ + adcs xzr, x9, xzr; \ + adcs xzr, x10, xzr; \ + adcs x3, x3, xzr; \ + csetm x3, ne; \ + mov x4, #0xffffffff; \ + and x4, x4, x3; \ + subs x5, x5, x4; \ + eor x4, x4, x3; \ + sbcs x6, x6, x4; \ + mov x4, #0xfffffffffffffffe; \ + and x4, x4, x3; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x3; \ + sbcs x9, 
x9, x3; \ + sbc x10, x10, x3; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32] + +// P0 = 4 * P1 - P2 + +#define cmsub41_p384(P0,P1,P2) \ + ldp x1, x2, [P1]; \ + ldp x3, x4, [P1+16]; \ + ldp x5, x6, [P1+32]; \ + lsl x0, x1, #2; \ + ldp x7, x8, [P2]; \ + subs x0, x0, x7; \ + extr x1, x2, x1, #62; \ + sbcs x1, x1, x8; \ + ldp x7, x8, [P2+16]; \ + extr x2, x3, x2, #62; \ + sbcs x2, x2, x7; \ + extr x3, x4, x3, #62; \ + sbcs x3, x3, x8; \ + extr x4, x5, x4, #62; \ + ldp x7, x8, [P2+32]; \ + sbcs x4, x4, x7; \ + extr x5, x6, x5, #62; \ + sbcs x5, x5, x8; \ + lsr x6, x6, #62; \ + adc x6, x6, xzr; \ + lsl x7, x6, #32; \ + subs x8, x6, x7; \ + sbc x7, x7, xzr; \ + adds x0, x0, x8; \ + adcs x1, x1, x7; \ + adcs x2, x2, x6; \ + adcs x3, x3, xzr; \ + adcs x4, x4, xzr; \ + adcs x5, x5, xzr; \ + csetm x8, cc; \ + mov x9, #0xffffffff; \ + and x9, x9, x8; \ + adds x0, x0, x9; \ + eor x9, x9, x8; \ + adcs x1, x1, x9; \ + mov x9, #0xfffffffffffffffe; \ + and x9, x9, x8; \ + adcs x2, x2, x9; \ + adcs x3, x3, x8; \ + adcs x4, x4, x8; \ + adc x5, x5, x8; \ + stp x0, x1, [P0]; \ + stp x2, x3, [P0+16]; \ + stp x4, x5, [P0+32] + +// P0 = C * P1 - D * P2 + +#define cmsub_p384(P0,C,P1,D,P2) \ + ldp x0, x1, [P2]; \ + mov x6, #0x00000000ffffffff; \ + subs x6, x6, x0; \ + mov x7, #0xffffffff00000000; \ + sbcs x7, x7, x1; \ + ldp x0, x1, [P2+16]; \ + mov x8, #0xfffffffffffffffe; \ + sbcs x8, x8, x0; \ + mov x13, #0xffffffffffffffff; \ + sbcs x9, x13, x1; \ + ldp x0, x1, [P2+32]; \ + sbcs x10, x13, x0; \ + sbc x11, x13, x1; \ + mov x12, D; \ + mul x0, x12, x6; \ + mul x1, x12, x7; \ + mul x2, x12, x8; \ + mul x3, x12, x9; \ + mul x4, x12, x10; \ + mul x5, x12, x11; \ + umulh x6, x12, x6; \ + umulh x7, x12, x7; \ + umulh x8, x12, x8; \ + umulh x9, x12, x9; \ + umulh x10, x12, x10; \ + umulh x12, x12, x11; \ + adds x1, x1, x6; \ + adcs x2, x2, x7; \ + adcs x3, x3, x8; \ + adcs x4, x4, x9; \ + adcs x5, x5, x10; \ + mov x6, #1; \ + adc x6, x12, x6; \ + ldp x8, x9, [P1]; \ + 
ldp x10, x11, [P1+16]; \ + ldp x12, x13, [P1+32]; \ + mov x14, C; \ + mul x15, x14, x8; \ + umulh x8, x14, x8; \ + adds x0, x0, x15; \ + mul x15, x14, x9; \ + umulh x9, x14, x9; \ + adcs x1, x1, x15; \ + mul x15, x14, x10; \ + umulh x10, x14, x10; \ + adcs x2, x2, x15; \ + mul x15, x14, x11; \ + umulh x11, x14, x11; \ + adcs x3, x3, x15; \ + mul x15, x14, x12; \ + umulh x12, x14, x12; \ + adcs x4, x4, x15; \ + mul x15, x14, x13; \ + umulh x13, x14, x13; \ + adcs x5, x5, x15; \ + adc x6, x6, xzr; \ + adds x1, x1, x8; \ + adcs x2, x2, x9; \ + adcs x3, x3, x10; \ + adcs x4, x4, x11; \ + adcs x5, x5, x12; \ + adcs x6, x6, x13; \ + lsl x7, x6, #32; \ + subs x8, x6, x7; \ + sbc x7, x7, xzr; \ + adds x0, x0, x8; \ + adcs x1, x1, x7; \ + adcs x2, x2, x6; \ + adcs x3, x3, xzr; \ + adcs x4, x4, xzr; \ + adcs x5, x5, xzr; \ + csetm x6, cc; \ + mov x7, #0xffffffff; \ + and x7, x7, x6; \ + adds x0, x0, x7; \ + eor x7, x7, x6; \ + adcs x1, x1, x7; \ + mov x7, #0xfffffffffffffffe; \ + and x7, x7, x6; \ + adcs x2, x2, x7; \ + adcs x3, x3, x6; \ + adcs x4, x4, x6; \ + adc x5, x5, x6; \ + stp x0, x1, [P0]; \ + stp x2, x3, [P0+16]; \ + stp x4, x5, [P0+32] + +// A weak version of add that only guarantees sum in 6 digits + +#define weakadd_p384(P0,P1,P2) \ + ldp x5, x6, [P1]; \ + ldp x4, x3, [P2]; \ + adds x5, x5, x4; \ + adcs x6, x6, x3; \ + ldp x7, x8, [P1+16]; \ + ldp x4, x3, [P2+16]; \ + adcs x7, x7, x4; \ + adcs x8, x8, x3; \ + ldp x9, x10, [P1+32]; \ + ldp x4, x3, [P2+32]; \ + adcs x9, x9, x4; \ + adcs x10, x10, x3; \ + csetm x3, cs; \ + mov x4, #0xffffffff; \ + and x4, x4, x3; \ + subs x5, x5, x4; \ + eor x4, x4, x3; \ + sbcs x6, x6, x4; \ + mov x4, #0xfffffffffffffffe; \ + and x4, x4, x3; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x3; \ + sbcs x9, x9, x3; \ + sbc x10, x10, x3; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32] + +// P0 = 3 * P1 - 8 * P2 + +#define cmsub38_p384(P0,P1,P2) \ + ldp x0, x1, [P2]; \ + mov x6, #0x00000000ffffffff; \ + subs x6, x6, x0; 
\ + mov x7, #0xffffffff00000000; \ + sbcs x7, x7, x1; \ + ldp x0, x1, [P2+16]; \ + mov x8, #0xfffffffffffffffe; \ + sbcs x8, x8, x0; \ + mov x13, #0xffffffffffffffff; \ + sbcs x9, x13, x1; \ + ldp x0, x1, [P2+32]; \ + sbcs x10, x13, x0; \ + sbc x11, x13, x1; \ + lsl x0, x6, #3; \ + extr x1, x7, x6, #61; \ + extr x2, x8, x7, #61; \ + extr x3, x9, x8, #61; \ + extr x4, x10, x9, #61; \ + extr x5, x11, x10, #61; \ + lsr x6, x11, #61; \ + add x6, x6, #1; \ + ldp x8, x9, [P1]; \ + ldp x10, x11, [P1+16]; \ + ldp x12, x13, [P1+32]; \ + mov x14, 3; \ + mul x15, x14, x8; \ + umulh x8, x14, x8; \ + adds x0, x0, x15; \ + mul x15, x14, x9; \ + umulh x9, x14, x9; \ + adcs x1, x1, x15; \ + mul x15, x14, x10; \ + umulh x10, x14, x10; \ + adcs x2, x2, x15; \ + mul x15, x14, x11; \ + umulh x11, x14, x11; \ + adcs x3, x3, x15; \ + mul x15, x14, x12; \ + umulh x12, x14, x12; \ + adcs x4, x4, x15; \ + mul x15, x14, x13; \ + umulh x13, x14, x13; \ + adcs x5, x5, x15; \ + adc x6, x6, xzr; \ + adds x1, x1, x8; \ + adcs x2, x2, x9; \ + adcs x3, x3, x10; \ + adcs x4, x4, x11; \ + adcs x5, x5, x12; \ + adcs x6, x6, x13; \ + lsl x7, x6, #32; \ + subs x8, x6, x7; \ + sbc x7, x7, xzr; \ + adds x0, x0, x8; \ + adcs x1, x1, x7; \ + adcs x2, x2, x6; \ + adcs x3, x3, xzr; \ + adcs x4, x4, xzr; \ + adcs x5, x5, xzr; \ + csetm x6, cc; \ + mov x7, #0xffffffff; \ + and x7, x7, x6; \ + adds x0, x0, x7; \ + eor x7, x7, x6; \ + adcs x1, x1, x7; \ + mov x7, #0xfffffffffffffffe; \ + and x7, x7, x6; \ + adcs x2, x2, x7; \ + adcs x3, x3, x6; \ + adcs x4, x4, x6; \ + adc x5, x5, x6; \ + stp x0, x1, [P0]; \ + stp x2, x3, [P0+16]; \ + stp x4, x5, [P0+32] + +S2N_BN_SYMBOL(p384_montjdouble_alt): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! 
+ sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + montsqr_p384(z2,z_1) + montsqr_p384(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + weakadd_p384(t1,x_1,z2) + sub_p384(t2,x_1,z2) + montmul_p384(x2p,t1,t2) + +// t1 = y + z +// x4p = x2p^2 +// xy2 = x * y^2 + + add_p384(t1,y_1,z_1) + montsqr_p384(x4p,x2p) + montmul_p384(xy2,x_1,y2) + +// t2 = (y + z)^2 + + montsqr_p384(t2,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_p384(d,12,xy2,9,x4p) + sub_p384(t1,t2,z2) + +// y4 = y^4 + + montsqr_p384(y4,y2) + +// z_3' = 2 * y * z +// dx2 = d * x2p + + sub_p384(z_3,t1,y2) + montmul_p384(dx2,d,x2p) + +// x' = 4 * xy2 - d + + cmsub41_p384(x_3,xy2,d) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_p384(y_3,dx2,y4) + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/p384/p384_montjmixadd.S b/third_party/s2n-bignum/arm/p384/p384_montjmixadd.S index 1b0165ab8c..f340e4f5ce 100644 --- a/third_party/s2n-bignum/arm/p384/p384_montjmixadd.S +++ b/third_party/s2n-bignum/arm/p384/p384_montjmixadd.S @@ -50,6 +50,7 @@ #define zp2 sp, #(NUMSIZE*0) #define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) #define yd sp, #(NUMSIZE*1) #define y2a sp, #(NUMSIZE*1) @@ -62,720 +63,657 @@ #define t2 sp, #(NUMSIZE*4) #define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) #define xd sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) #define NSPACE (NUMSIZE*6) -// Corresponds exactly to bignum_montmul_p384_alt +// Corresponds to bignum_montmul_p384 except x24 -> x0 #define montmul_p384(P0,P1,P2) \ ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - mul x12, x3, x5; \ - umulh x13, x3, x5; \ - mul x11, x3, x6; \ - umulh 
x14, x3, x6; \ - adds x13, x13, x11; \ - ldp x7, x8, [P2+16]; \ - mul x11, x3, x7; \ - umulh x15, x3, x7; \ - adcs x14, x14, x11; \ - mul x11, x3, x8; \ - umulh x16, x3, x8; \ - adcs x15, x15, x11; \ - ldp x9, x10, [P2+32]; \ - mul x11, x3, x9; \ - umulh x17, x3, x9; \ - adcs x16, x16, x11; \ - mul x11, x3, x10; \ - umulh x19, x3, x10; \ - adcs x17, x17, x11; \ - adc x19, x19, xzr; \ - mul x11, x4, x5; \ - adds x13, x13, x11; \ - mul x11, x4, x6; \ - adcs x14, x14, x11; \ - mul x11, x4, x7; \ - adcs x15, x15, x11; \ - mul x11, x4, x8; \ - adcs x16, x16, x11; \ - mul x11, x4, x9; \ - adcs x17, x17, x11; \ - mul x11, x4, x10; \ - adcs x19, x19, x11; \ - cset x20, cs; \ - umulh x11, x4, x5; \ - adds x14, x14, x11; \ - umulh x11, x4, x6; \ - adcs x15, x15, x11; \ - umulh x11, x4, x7; \ - adcs x16, x16, x11; \ - umulh x11, x4, x8; \ - adcs x17, x17, x11; \ - umulh x11, x4, x9; \ - adcs x19, x19, x11; \ - umulh x11, x4, x10; \ - adc x20, x20, x11; \ - ldp x3, x4, [P1+16]; \ - mul x11, x3, x5; \ - adds x14, x14, x11; \ - mul x11, x3, x6; \ - adcs x15, x15, x11; \ - mul x11, x3, x7; \ - adcs x16, x16, x11; \ - mul x11, x3, x8; \ - adcs x17, x17, x11; \ - mul x11, x3, x9; \ - adcs x19, x19, x11; \ - mul x11, x3, x10; \ - adcs x20, x20, x11; \ - cset x21, cs; \ - umulh x11, x3, x5; \ - adds x15, x15, x11; \ - umulh x11, x3, x6; \ - adcs x16, x16, x11; \ - umulh x11, x3, x7; \ - adcs x17, x17, x11; \ - umulh x11, x3, x8; \ - adcs x19, x19, x11; \ - umulh x11, x3, x9; \ - adcs x20, x20, x11; \ - umulh x11, x3, x10; \ - adc x21, x21, x11; \ - mul x11, x4, x5; \ - adds x15, x15, x11; \ - mul x11, x4, x6; \ - adcs x16, x16, x11; \ - mul x11, x4, x7; \ - adcs x17, x17, x11; \ - mul x11, x4, x8; \ - adcs x19, x19, x11; \ - mul x11, x4, x9; \ - adcs x20, x20, x11; \ - mul x11, x4, x10; \ - adcs x21, x21, x11; \ - cset x22, cs; \ - umulh x11, x4, x5; \ - adds x16, x16, x11; \ - umulh x11, x4, x6; \ - adcs x17, x17, x11; \ - umulh x11, x4, x7; \ - adcs x19, x19, x11; \ - umulh x11, 
x4, x8; \ - adcs x20, x20, x11; \ - umulh x11, x4, x9; \ - adcs x21, x21, x11; \ - umulh x11, x4, x10; \ - adc x22, x22, x11; \ - ldp x3, x4, [P1+32]; \ - mul x11, x3, x5; \ - adds x16, x16, x11; \ - mul x11, x3, x6; \ - adcs x17, x17, x11; \ - mul x11, x3, x7; \ - adcs x19, x19, x11; \ - mul x11, x3, x8; \ - adcs x20, x20, x11; \ - mul x11, x3, x9; \ - adcs x21, x21, x11; \ - mul x11, x3, x10; \ - adcs x22, x22, x11; \ - cset x2, cs; \ - umulh x11, x3, x5; \ - adds x17, x17, x11; \ - umulh x11, x3, x6; \ - adcs x19, x19, x11; \ - umulh x11, x3, x7; \ - adcs x20, x20, x11; \ - umulh x11, x3, x8; \ - adcs x21, x21, x11; \ - umulh x11, x3, x9; \ - adcs x22, x22, x11; \ - umulh x11, x3, x10; \ - adc x2, x2, x11; \ - mul x11, x4, x5; \ - adds x17, x17, x11; \ - mul x11, x4, x6; \ - adcs x19, x19, x11; \ - mul x11, x4, x7; \ - adcs x20, x20, x11; \ - mul x11, x4, x8; \ - adcs x21, x21, x11; \ - mul x11, x4, x9; \ - adcs x22, x22, x11; \ - mul x11, x4, x10; \ - adcs x2, x2, x11; \ - cset x1, cs; \ - umulh x11, x4, x5; \ - adds x19, x19, x11; \ - umulh x11, x4, x6; \ - adcs x20, x20, x11; \ - umulh x11, x4, x7; \ - adcs x21, x21, x11; \ - umulh x11, x4, x8; \ - adcs x22, x22, x11; \ - umulh x11, x4, x9; \ - adcs x2, x2, x11; \ - umulh x11, x4, x10; \ - adc x1, x1, x11; \ - lsl x7, x12, #32; \ - add x12, x7, x12; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x12; \ - mov x6, #0xffffffff; \ - mul x5, x6, x12; \ - umulh x6, x6, x12; \ - adds x7, x7, x5; \ - adcs x6, x6, x12; \ - adc x5, xzr, xzr; \ - subs x13, x13, x7; \ - sbcs x14, x14, x6; \ - sbcs x15, x15, x5; \ + ldp x5, x6, [P1+16]; \ + ldp x7, x8, [P1+32]; \ + ldp x9, x10, [P2]; \ + ldp x11, x12, [P2+16]; \ + ldp x13, x14, [P2+32]; \ + mul x15, x3, x9; \ + mul x21, x4, x10; \ + mul x22, x5, x11; \ + umulh x23, x3, x9; \ + umulh x0, x4, x10; \ + umulh x1, x5, x11; \ + adds x23, x23, x21; \ + adcs x0, x0, x22; \ + adc x1, x1, xzr; \ + adds x16, x23, x15; \ + adcs x17, x0, x23; \ + adcs x19, x1, x0; \ + adc x20, x1, 
xzr; \ + adds x17, x17, x15; \ + adcs x19, x19, x23; \ + adcs x20, x20, x0; \ + adc x1, x1, xzr; \ + subs x0, x3, x4; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x10, x9; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x16, x16, x21; \ + adcs x17, x17, x22; \ + adcs x19, x19, x23; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x0, x3, x5; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x11, x9; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x17, x17, x21; \ + adcs x19, x19, x22; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x0, x4, x5; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x11, x10; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x19, x19, x21; \ + adcs x20, x20, x22; \ + adc x1, x1, x23; \ + lsl x23, x15, #32; \ + add x15, x23, x15; \ + lsr x23, x15, #32; \ + subs x23, x23, x15; \ + sbc x22, x15, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x15; \ + adc x21, xzr, xzr; \ + subs x16, x16, x23; \ + sbcs x17, x17, x22; \ + sbcs x19, x19, x21; \ + sbcs x20, x20, xzr; \ + sbcs x1, x1, xzr; \ + sbc x15, x15, xzr; \ + lsl x23, x16, #32; \ + add x16, x23, x16; \ + lsr x23, x16, #32; \ + subs x23, x23, x16; \ + sbc x22, x16, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x16; \ + adc x21, xzr, xzr; \ + subs x17, x17, x23; \ + sbcs x19, x19, x22; \ + sbcs x20, x20, x21; \ + sbcs x1, x1, xzr; \ + sbcs x15, x15, xzr; \ + sbc x16, x16, xzr; \ + lsl x23, x17, #32; \ + add x17, x23, x17; \ + lsr x23, x17, #32; \ + subs x23, x23, x17; \ + sbc x22, x17, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x17; \ 
+ adc x21, xzr, xzr; \ + subs x19, x19, x23; \ + sbcs x20, x20, x22; \ + sbcs x1, x1, x21; \ + sbcs x15, x15, xzr; \ sbcs x16, x16, xzr; \ - sbcs x17, x17, xzr; \ - sbc x12, x12, xzr; \ - lsl x7, x13, #32; \ - add x13, x7, x13; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x13; \ - mov x6, #0xffffffff; \ - mul x5, x6, x13; \ - umulh x6, x6, x13; \ - adds x7, x7, x5; \ - adcs x6, x6, x13; \ - adc x5, xzr, xzr; \ - subs x14, x14, x7; \ - sbcs x15, x15, x6; \ - sbcs x16, x16, x5; \ - sbcs x17, x17, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - lsl x7, x14, #32; \ - add x14, x7, x14; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x14; \ - mov x6, #0xffffffff; \ - mul x5, x6, x14; \ - umulh x6, x6, x14; \ - adds x7, x7, x5; \ - adcs x6, x6, x14; \ - adc x5, xzr, xzr; \ - subs x15, x15, x7; \ - sbcs x16, x16, x6; \ - sbcs x17, x17, x5; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbc x14, x14, xzr; \ - lsl x7, x15, #32; \ - add x15, x7, x15; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x15; \ - mov x6, #0xffffffff; \ - mul x5, x6, x15; \ - umulh x6, x6, x15; \ - adds x7, x7, x5; \ - adcs x6, x6, x15; \ - adc x5, xzr, xzr; \ - subs x16, x16, x7; \ - sbcs x17, x17, x6; \ - sbcs x12, x12, x5; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ + sbc x17, x17, xzr; \ + stp x19, x20, [P0]; \ + stp x1, x15, [P0+16]; \ + stp x16, x17, [P0+32]; \ + mul x15, x6, x12; \ + mul x21, x7, x13; \ + mul x22, x8, x14; \ + umulh x23, x6, x12; \ + umulh x0, x7, x13; \ + umulh x1, x8, x14; \ + adds x23, x23, x21; \ + adcs x0, x0, x22; \ + adc x1, x1, xzr; \ + adds x16, x23, x15; \ + adcs x17, x0, x23; \ + adcs x19, x1, x0; \ + adc x20, x1, xzr; \ + adds x17, x17, x15; \ + adcs x19, x19, x23; \ + adcs x20, x20, x0; \ + adc x1, x1, xzr; \ + subs x0, x6, x7; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x13, x12; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn 
x23, #1; \ + adcs x16, x16, x21; \ + adcs x17, x17, x22; \ + adcs x19, x19, x23; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x0, x6, x8; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x14, x12; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x17, x17, x21; \ + adcs x19, x19, x22; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x0, x7, x8; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x14, x13; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x19, x19, x21; \ + adcs x20, x20, x22; \ + adc x1, x1, x23; \ + subs x6, x6, x3; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x5; \ + ngc x3, xzr; \ + cmn x3, #1; \ + eor x6, x6, x3; \ + adcs x6, x6, xzr; \ + eor x7, x7, x3; \ + adcs x7, x7, xzr; \ + eor x8, x8, x3; \ + adc x8, x8, xzr; \ + subs x9, x9, x12; \ + sbcs x10, x10, x13; \ + sbcs x11, x11, x14; \ + ngc x14, xzr; \ + cmn x14, #1; \ + eor x9, x9, x14; \ + adcs x9, x9, xzr; \ + eor x10, x10, x14; \ + adcs x10, x10, xzr; \ + eor x11, x11, x14; \ + adc x11, x11, xzr; \ + eor x14, x3, x14; \ + ldp x21, x22, [P0]; \ + adds x15, x15, x21; \ + adcs x16, x16, x22; \ + ldp x21, x22, [P0+16]; \ + adcs x17, x17, x21; \ + adcs x19, x19, x22; \ + ldp x21, x22, [P0+32]; \ + adcs x20, x20, x21; \ + adcs x1, x1, x22; \ + adc x2, xzr, xzr; \ + stp x15, x16, [P0]; \ + stp x17, x19, [P0+16]; \ + stp x20, x1, [P0+32]; \ + mul x15, x6, x9; \ + mul x21, x7, x10; \ + mul x22, x8, x11; \ + umulh x23, x6, x9; \ + umulh x0, x7, x10; \ + umulh x1, x8, x11; \ + adds x23, x23, x21; \ + adcs x0, x0, x22; \ + adc x1, x1, xzr; \ + adds x16, x23, x15; \ + adcs x17, x0, x23; \ + adcs x19, x1, x0; \ + adc x20, x1, xzr; \ + adds x17, x17, x15; \ + adcs x19, x19, x23; \ + adcs x20, x20, x0; \ + adc x1, x1, xzr; \ + subs x0, x6, x7; \ + cneg x0, x0, lo; \ 
+ csetm x23, lo; \ + subs x22, x10, x9; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x16, x16, x21; \ + adcs x17, x17, x22; \ + adcs x19, x19, x23; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x0, x6, x8; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x11, x9; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x17, x17, x21; \ + adcs x19, x19, x22; \ + adcs x20, x20, x23; \ + adc x1, x1, x23; \ + subs x0, x7, x8; \ + cneg x0, x0, lo; \ + csetm x23, lo; \ + subs x22, x11, x10; \ + cneg x22, x22, lo; \ + mul x21, x0, x22; \ + umulh x22, x0, x22; \ + cinv x23, x23, lo; \ + eor x21, x21, x23; \ + eor x22, x22, x23; \ + cmn x23, #1; \ + adcs x19, x19, x21; \ + adcs x20, x20, x22; \ + adc x1, x1, x23; \ + ldp x3, x4, [P0]; \ + ldp x5, x6, [P0+16]; \ + ldp x7, x8, [P0+32]; \ + cmn x14, #1; \ + eor x15, x15, x14; \ + adcs x15, x15, x3; \ + eor x16, x16, x14; \ + adcs x16, x16, x4; \ + eor x17, x17, x14; \ + adcs x17, x17, x5; \ + eor x19, x19, x14; \ + adcs x19, x19, x6; \ + eor x20, x20, x14; \ + adcs x20, x20, x7; \ + eor x1, x1, x14; \ + adcs x1, x1, x8; \ + adcs x9, x14, x2; \ + adcs x10, x14, xzr; \ + adcs x11, x14, xzr; \ + adc x12, x14, xzr; \ + adds x19, x19, x3; \ + adcs x20, x20, x4; \ + adcs x1, x1, x5; \ + adcs x9, x9, x6; \ + adcs x10, x10, x7; \ + adcs x11, x11, x8; \ + adc x12, x12, x2; \ + lsl x23, x15, #32; \ + add x15, x23, x15; \ + lsr x23, x15, #32; \ + subs x23, x23, x15; \ + sbc x22, x15, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x15; \ + adc x21, xzr, xzr; \ + subs x16, x16, x23; \ + sbcs x17, x17, x22; \ + sbcs x19, x19, x21; \ + sbcs x20, x20, xzr; \ + sbcs x1, x1, xzr; \ sbc x15, x15, xzr; \ - lsl x7, x16, #32; \ - add x16, x7, x16; \ - mov x7, #0xffffffff00000001; \ - 
umulh x7, x7, x16; \ - mov x6, #0xffffffff; \ - mul x5, x6, x16; \ - umulh x6, x6, x16; \ - adds x7, x7, x5; \ - adcs x6, x6, x16; \ - adc x5, xzr, xzr; \ - subs x17, x17, x7; \ - sbcs x12, x12, x6; \ - sbcs x13, x13, x5; \ - sbcs x14, x14, xzr; \ + lsl x23, x16, #32; \ + add x16, x23, x16; \ + lsr x23, x16, #32; \ + subs x23, x23, x16; \ + sbc x22, x16, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x16; \ + adc x21, xzr, xzr; \ + subs x17, x17, x23; \ + sbcs x19, x19, x22; \ + sbcs x20, x20, x21; \ + sbcs x1, x1, xzr; \ sbcs x15, x15, xzr; \ sbc x16, x16, xzr; \ - lsl x7, x17, #32; \ - add x17, x7, x17; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x17; \ - mov x6, #0xffffffff; \ - mul x5, x6, x17; \ - umulh x6, x6, x17; \ - adds x7, x7, x5; \ - adcs x6, x6, x17; \ - adc x5, xzr, xzr; \ - subs x12, x12, x7; \ - sbcs x13, x13, x6; \ - sbcs x14, x14, x5; \ + lsl x23, x17, #32; \ + add x17, x23, x17; \ + lsr x23, x17, #32; \ + subs x23, x23, x17; \ + sbc x22, x17, xzr; \ + extr x23, x22, x23, #32; \ + lsr x22, x22, #32; \ + adds x22, x22, x17; \ + adc x21, xzr, xzr; \ + subs x19, x19, x23; \ + sbcs x20, x20, x22; \ + sbcs x1, x1, x21; \ sbcs x15, x15, xzr; \ sbcs x16, x16, xzr; \ sbc x17, x17, xzr; \ - adds x12, x12, x19; \ - adcs x13, x13, x20; \ - adcs x14, x14, x21; \ - adcs x15, x15, x22; \ - adcs x16, x16, x2; \ - adcs x17, x17, x1; \ - adc x10, xzr, xzr; \ - mov x11, #0xffffffff00000001; \ - adds x19, x12, x11; \ - mov x11, #0xffffffff; \ - adcs x20, x13, x11; \ - mov x11, #0x1; \ - adcs x21, x14, x11; \ - adcs x22, x15, xzr; \ - adcs x2, x16, xzr; \ - adcs x1, x17, xzr; \ + adds x9, x9, x15; \ + adcs x10, x10, x16; \ + adcs x11, x11, x17; \ + adc x12, x12, xzr; \ + add x22, x12, #1; \ + lsl x21, x22, #32; \ + subs x0, x22, x21; \ + sbc x21, x21, xzr; \ + adds x19, x19, x0; \ + adcs x20, x20, x21; \ + adcs x1, x1, x22; \ + adcs x9, x9, xzr; \ adcs x10, x10, xzr; \ - csel x12, x12, x19, eq; \ - csel x13, x13, x20, eq; \ - csel 
x14, x14, x21, eq; \ - csel x15, x15, x22, eq; \ - csel x16, x16, x2, eq; \ - csel x17, x17, x1, eq; \ - stp x12, x13, [P0]; \ - stp x14, x15, [P0+16]; \ - stp x16, x17, [P0+32] - -// Corresponds exactly to bignum_montsqr_p384_alt + adcs x11, x11, xzr; \ + csetm x22, lo; \ + mov x23, #4294967295; \ + and x23, x23, x22; \ + adds x19, x19, x23; \ + eor x23, x23, x22; \ + adcs x20, x20, x23; \ + mov x23, #-2; \ + and x23, x23, x22; \ + adcs x1, x1, x23; \ + adcs x9, x9, x22; \ + adcs x10, x10, x22; \ + adc x11, x11, x22; \ + stp x19, x20, [P0]; \ + stp x1, x9, [P0+16]; \ + stp x10, x11, [P0+32] + +// Corresponds exactly to bignum_montsqr_p384 #define montsqr_p384(P0,P1) \ ldp x2, x3, [P1]; \ - mul x9, x2, x3; \ - umulh x10, x2, x3; \ ldp x4, x5, [P1+16]; \ - mul x8, x2, x4; \ - adds x10, x10, x8; \ - mul x11, x2, x5; \ - mul x8, x3, x4; \ - adcs x11, x11, x8; \ - umulh x12, x2, x5; \ - mul x8, x3, x5; \ - adcs x12, x12, x8; \ ldp x6, x7, [P1+32]; \ - mul x13, x2, x7; \ - mul x8, x3, x6; \ - adcs x13, x13, x8; \ - umulh x14, x2, x7; \ - mul x8, x3, x7; \ - adcs x14, x14, x8; \ - mul x15, x5, x6; \ - adcs x15, x15, xzr; \ - umulh x16, x5, x6; \ - adc x16, x16, xzr; \ - umulh x8, x2, x4; \ - adds x11, x11, x8; \ - umulh x8, x3, x4; \ - adcs x12, x12, x8; \ - umulh x8, x3, x5; \ - adcs x13, x13, x8; \ - umulh x8, x3, x6; \ - adcs x14, x14, x8; \ - umulh x8, x3, x7; \ - adcs x15, x15, x8; \ - adc x16, x16, xzr; \ - mul x8, x2, x6; \ - adds x12, x12, x8; \ - mul x8, x4, x5; \ - adcs x13, x13, x8; \ - mul x8, x4, x6; \ - adcs x14, x14, x8; \ - mul x8, x4, x7; \ - adcs x15, x15, x8; \ - mul x8, x5, x7; \ - adcs x16, x16, x8; \ - mul x17, x6, x7; \ + mul x14, x2, x3; \ + mul x15, x2, x4; \ + mul x16, x3, x4; \ + mul x8, x2, x2; \ + mul x10, x3, x3; \ + mul x12, x4, x4; \ + umulh x17, x2, x3; \ + adds x15, x15, x17; \ + umulh x17, x2, x4; \ + adcs x16, x16, x17; \ + umulh x17, x3, x4; \ adcs x17, x17, xzr; \ - umulh x19, x6, x7; \ - adc x19, x19, xzr; \ - umulh x8, x2, x6; \ - 
adds x13, x13, x8; \ - umulh x8, x4, x5; \ - adcs x14, x14, x8; \ - umulh x8, x4, x6; \ - adcs x15, x15, x8; \ - umulh x8, x4, x7; \ - adcs x16, x16, x8; \ - umulh x8, x5, x7; \ - adcs x17, x17, x8; \ - adc x19, x19, xzr; \ - adds x9, x9, x9; \ - adcs x10, x10, x10; \ - adcs x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ + umulh x9, x2, x2; \ + umulh x11, x3, x3; \ + umulh x13, x4, x4; \ + adds x14, x14, x14; \ adcs x15, x15, x15; \ adcs x16, x16, x16; \ adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - cset x20, hs; \ - umulh x8, x2, x2; \ - mul x2, x2, x2; \ - adds x9, x9, x8; \ - mul x8, x3, x3; \ - adcs x10, x10, x8; \ - umulh x8, x3, x3; \ - adcs x11, x11, x8; \ - mul x8, x4, x4; \ - adcs x12, x12, x8; \ - umulh x8, x4, x4; \ - adcs x13, x13, x8; \ - mul x8, x5, x5; \ - adcs x14, x14, x8; \ - umulh x8, x5, x5; \ - adcs x15, x15, x8; \ - mul x8, x6, x6; \ - adcs x16, x16, x8; \ - umulh x8, x6, x6; \ - adcs x17, x17, x8; \ - mul x8, x7, x7; \ - adcs x19, x19, x8; \ - umulh x8, x7, x7; \ - adc x20, x20, x8; \ - lsl x5, x2, #32; \ - add x2, x5, x2; \ - mov x5, #-4294967295; \ - umulh x5, x5, x2; \ - mov x4, #4294967295; \ - mul x3, x4, x2; \ - umulh x4, x4, x2; \ - adds x5, x5, x3; \ - adcs x4, x4, x2; \ - adc x3, xzr, xzr; \ - subs x9, x9, x5; \ - sbcs x10, x10, x4; \ - sbcs x11, x11, x3; \ + adc x13, x13, xzr; \ + adds x9, x9, x14; \ + adcs x10, x10, x15; \ + adcs x11, x11, x16; \ + adcs x12, x12, x17; \ + adc x13, x13, xzr; \ + lsl x16, x8, #32; \ + add x8, x16, x8; \ + lsr x16, x8, #32; \ + subs x16, x16, x8; \ + sbc x15, x8, xzr; \ + extr x16, x15, x16, #32; \ + lsr x15, x15, #32; \ + adds x15, x15, x8; \ + adc x14, xzr, xzr; \ + subs x9, x9, x16; \ + sbcs x10, x10, x15; \ + sbcs x11, x11, x14; \ sbcs x12, x12, xzr; \ sbcs x13, x13, xzr; \ - sbc x2, x2, xzr; \ - lsl x5, x9, #32; \ - add x9, x5, x9; \ - mov x5, #-4294967295; \ - umulh x5, x5, x9; \ - mov x4, #4294967295; \ - mul x3, x4, x9; \ - umulh x4, x4, x9; \ - adds 
x5, x5, x3; \ - adcs x4, x4, x9; \ - adc x3, xzr, xzr; \ - subs x10, x10, x5; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ + sbc x8, x8, xzr; \ + lsl x16, x9, #32; \ + add x9, x16, x9; \ + lsr x16, x9, #32; \ + subs x16, x16, x9; \ + sbc x15, x9, xzr; \ + extr x16, x15, x16, #32; \ + lsr x15, x15, #32; \ + adds x15, x15, x9; \ + adc x14, xzr, xzr; \ + subs x10, x10, x16; \ + sbcs x11, x11, x15; \ + sbcs x12, x12, x14; \ sbcs x13, x13, xzr; \ - sbcs x2, x2, xzr; \ + sbcs x8, x8, xzr; \ sbc x9, x9, xzr; \ - lsl x5, x10, #32; \ - add x10, x5, x10; \ - mov x5, #-4294967295; \ - umulh x5, x5, x10; \ - mov x4, #4294967295; \ - mul x3, x4, x10; \ - umulh x4, x4, x10; \ - adds x5, x5, x3; \ - adcs x4, x4, x10; \ - adc x3, xzr, xzr; \ - subs x11, x11, x5; \ - sbcs x12, x12, x4; \ - sbcs x13, x13, x3; \ - sbcs x2, x2, xzr; \ + lsl x16, x10, #32; \ + add x10, x16, x10; \ + lsr x16, x10, #32; \ + subs x16, x16, x10; \ + sbc x15, x10, xzr; \ + extr x16, x15, x16, #32; \ + lsr x15, x15, #32; \ + adds x15, x15, x10; \ + adc x14, xzr, xzr; \ + subs x11, x11, x16; \ + sbcs x12, x12, x15; \ + sbcs x13, x13, x14; \ + sbcs x8, x8, xzr; \ sbcs x9, x9, xzr; \ sbc x10, x10, xzr; \ - lsl x5, x11, #32; \ - add x11, x5, x11; \ - mov x5, #-4294967295; \ - umulh x5, x5, x11; \ - mov x4, #4294967295; \ - mul x3, x4, x11; \ - umulh x4, x4, x11; \ - adds x5, x5, x3; \ - adcs x4, x4, x11; \ - adc x3, xzr, xzr; \ - subs x12, x12, x5; \ - sbcs x13, x13, x4; \ - sbcs x2, x2, x3; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbc x11, x11, xzr; \ - lsl x5, x12, #32; \ - add x12, x5, x12; \ - mov x5, #-4294967295; \ - umulh x5, x5, x12; \ - mov x4, #4294967295; \ - mul x3, x4, x12; \ - umulh x4, x4, x12; \ - adds x5, x5, x3; \ - adcs x4, x4, x12; \ - adc x3, xzr, xzr; \ - subs x13, x13, x5; \ - sbcs x2, x2, x4; \ - sbcs x9, x9, x3; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbc x12, x12, xzr; \ - lsl x5, x13, #32; \ - add x13, x5, x13; \ - mov x5, #-4294967295; \ - umulh x5, x5, x13; \ 
- mov x4, #4294967295; \ - mul x3, x4, x13; \ - umulh x4, x4, x13; \ - adds x5, x5, x3; \ - adcs x4, x4, x13; \ - adc x3, xzr, xzr; \ - subs x2, x2, x5; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - adds x2, x2, x14; \ - adcs x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, x17; \ - adcs x12, x12, x19; \ - adcs x13, x13, x20; \ - adc x6, xzr, xzr; \ - mov x8, #-4294967295; \ - adds x14, x2, x8; \ - mov x8, #4294967295; \ - adcs x15, x9, x8; \ - mov x8, #1; \ - adcs x16, x10, x8; \ - adcs x17, x11, xzr; \ - adcs x19, x12, xzr; \ - adcs x20, x13, xzr; \ - adcs x6, x6, xzr; \ - csel x2, x2, x14, eq; \ - csel x9, x9, x15, eq; \ - csel x10, x10, x16, eq; \ - csel x11, x11, x17, eq; \ - csel x12, x12, x19, eq; \ - csel x13, x13, x20, eq; \ - stp x2, x9, [P0]; \ - stp x10, x11, [P0+16]; \ - stp x12, x13, [P0+32] - -// Almost-Montgomery variant which we use when an input to other muls -// with the other argument fully reduced (which is always safe). In -// fact, with the Karatsuba-based Montgomery mul here, we don't even -// *need* the restriction that the other argument is reduced. 
- -#define amontsqr_p384(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x9, x2, x3; \ - umulh x10, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x8, x2, x4; \ + stp x11, x12, [P0]; \ + stp x13, x8, [P0+16]; \ + stp x9, x10, [P0+32]; \ + mul x8, x2, x5; \ + mul x14, x3, x6; \ + mul x15, x4, x7; \ + umulh x16, x2, x5; \ + umulh x17, x3, x6; \ + umulh x1, x4, x7; \ + adds x16, x16, x14; \ + adcs x17, x17, x15; \ + adc x1, x1, xzr; \ + adds x9, x16, x8; \ + adcs x10, x17, x16; \ + adcs x11, x1, x17; \ + adc x12, x1, xzr; \ adds x10, x10, x8; \ - mul x11, x2, x5; \ - mul x8, x3, x4; \ - adcs x11, x11, x8; \ - umulh x12, x2, x5; \ - mul x8, x3, x5; \ - adcs x12, x12, x8; \ - ldp x6, x7, [P1+32]; \ - mul x13, x2, x7; \ - mul x8, x3, x6; \ - adcs x13, x13, x8; \ - umulh x14, x2, x7; \ - mul x8, x3, x7; \ - adcs x14, x14, x8; \ - mul x15, x5, x6; \ - adcs x15, x15, xzr; \ - umulh x16, x5, x6; \ - adc x16, x16, xzr; \ - umulh x8, x2, x4; \ - adds x11, x11, x8; \ - umulh x8, x3, x4; \ - adcs x12, x12, x8; \ - umulh x8, x3, x5; \ - adcs x13, x13, x8; \ - umulh x8, x3, x6; \ - adcs x14, x14, x8; \ - umulh x8, x3, x7; \ - adcs x15, x15, x8; \ - adc x16, x16, xzr; \ - mul x8, x2, x6; \ - adds x12, x12, x8; \ - mul x8, x4, x5; \ - adcs x13, x13, x8; \ - mul x8, x4, x6; \ - adcs x14, x14, x8; \ - mul x8, x4, x7; \ - adcs x15, x15, x8; \ - mul x8, x5, x7; \ - adcs x16, x16, x8; \ - mul x17, x6, x7; \ - adcs x17, x17, xzr; \ - umulh x19, x6, x7; \ - adc x19, x19, xzr; \ - umulh x8, x2, x6; \ - adds x13, x13, x8; \ - umulh x8, x4, x5; \ - adcs x14, x14, x8; \ - umulh x8, x4, x6; \ - adcs x15, x15, x8; \ - umulh x8, x4, x7; \ - adcs x16, x16, x8; \ - umulh x8, x5, x7; \ - adcs x17, x17, x8; \ - adc x19, x19, xzr; \ - adds x9, x9, x9; \ + adcs x11, x11, x16; \ + adcs x12, x12, x17; \ + adc x13, x1, xzr; \ + subs x17, x2, x3; \ + cneg x17, x17, lo; \ + csetm x14, lo; \ + subs x15, x6, x5; \ + cneg x15, x15, lo; \ + mul x16, x17, x15; \ + umulh x15, x17, x15; \ + cinv x14, x14, lo; \ + eor x16, x16, 
x14; \ + eor x15, x15, x14; \ + cmn x14, #1; \ + adcs x9, x9, x16; \ + adcs x10, x10, x15; \ + adcs x11, x11, x14; \ + adcs x12, x12, x14; \ + adc x13, x13, x14; \ + subs x17, x2, x4; \ + cneg x17, x17, lo; \ + csetm x14, lo; \ + subs x15, x7, x5; \ + cneg x15, x15, lo; \ + mul x16, x17, x15; \ + umulh x15, x17, x15; \ + cinv x14, x14, lo; \ + eor x16, x16, x14; \ + eor x15, x15, x14; \ + cmn x14, #1; \ + adcs x10, x10, x16; \ + adcs x11, x11, x15; \ + adcs x12, x12, x14; \ + adc x13, x13, x14; \ + subs x17, x3, x4; \ + cneg x17, x17, lo; \ + csetm x14, lo; \ + subs x15, x7, x6; \ + cneg x15, x15, lo; \ + mul x16, x17, x15; \ + umulh x15, x17, x15; \ + cinv x14, x14, lo; \ + eor x16, x16, x14; \ + eor x15, x15, x14; \ + cmn x14, #1; \ + adcs x11, x11, x16; \ + adcs x12, x12, x15; \ + adc x13, x13, x14; \ + adds x8, x8, x8; \ + adcs x9, x9, x9; \ adcs x10, x10, x10; \ adcs x11, x11, x11; \ adcs x12, x12, x12; \ adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - adcs x15, x15, x15; \ - adcs x16, x16, x16; \ - adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - cset x20, hs; \ - umulh x8, x2, x2; \ - mul x2, x2, x2; \ - adds x9, x9, x8; \ - mul x8, x3, x3; \ - adcs x10, x10, x8; \ - umulh x8, x3, x3; \ - adcs x11, x11, x8; \ - mul x8, x4, x4; \ - adcs x12, x12, x8; \ - umulh x8, x4, x4; \ - adcs x13, x13, x8; \ - mul x8, x5, x5; \ - adcs x14, x14, x8; \ - umulh x8, x5, x5; \ - adcs x15, x15, x8; \ - mul x8, x6, x6; \ - adcs x16, x16, x8; \ - umulh x8, x6, x6; \ - adcs x17, x17, x8; \ - mul x8, x7, x7; \ - adcs x19, x19, x8; \ - umulh x8, x7, x7; \ - adc x20, x20, x8; \ - lsl x5, x2, #32; \ - add x2, x5, x2; \ - mov x5, #-4294967295; \ - umulh x5, x5, x2; \ - mov x4, #4294967295; \ - mul x3, x4, x2; \ - umulh x4, x4, x2; \ - adds x5, x5, x3; \ - adcs x4, x4, x2; \ - adc x3, xzr, xzr; \ - subs x9, x9, x5; \ - sbcs x10, x10, x4; \ - sbcs x11, x11, x3; \ + adc x17, xzr, xzr; \ + ldp x2, x3, [P0]; \ + adds x8, x8, x2; \ + adcs x9, x9, x3; \ + ldp x2, x3, [P0+16]; \ + adcs x10, 
x10, x2; \ + adcs x11, x11, x3; \ + ldp x2, x3, [P0+32]; \ + adcs x12, x12, x2; \ + adcs x13, x13, x3; \ + adc x17, x17, xzr; \ + lsl x4, x8, #32; \ + add x8, x4, x8; \ + lsr x4, x8, #32; \ + subs x4, x4, x8; \ + sbc x3, x8, xzr; \ + extr x4, x3, x4, #32; \ + lsr x3, x3, #32; \ + adds x3, x3, x8; \ + adc x2, xzr, xzr; \ + subs x9, x9, x4; \ + sbcs x10, x10, x3; \ + sbcs x11, x11, x2; \ sbcs x12, x12, xzr; \ sbcs x13, x13, xzr; \ - sbc x2, x2, xzr; \ - lsl x5, x9, #32; \ - add x9, x5, x9; \ - mov x5, #-4294967295; \ - umulh x5, x5, x9; \ - mov x4, #4294967295; \ - mul x3, x4, x9; \ - umulh x4, x4, x9; \ - adds x5, x5, x3; \ - adcs x4, x4, x9; \ - adc x3, xzr, xzr; \ - subs x10, x10, x5; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ + sbc x8, x8, xzr; \ + lsl x4, x9, #32; \ + add x9, x4, x9; \ + lsr x4, x9, #32; \ + subs x4, x4, x9; \ + sbc x3, x9, xzr; \ + extr x4, x3, x4, #32; \ + lsr x3, x3, #32; \ + adds x3, x3, x9; \ + adc x2, xzr, xzr; \ + subs x10, x10, x4; \ + sbcs x11, x11, x3; \ + sbcs x12, x12, x2; \ sbcs x13, x13, xzr; \ - sbcs x2, x2, xzr; \ + sbcs x8, x8, xzr; \ sbc x9, x9, xzr; \ - lsl x5, x10, #32; \ - add x10, x5, x10; \ - mov x5, #-4294967295; \ - umulh x5, x5, x10; \ - mov x4, #4294967295; \ - mul x3, x4, x10; \ - umulh x4, x4, x10; \ - adds x5, x5, x3; \ - adcs x4, x4, x10; \ - adc x3, xzr, xzr; \ - subs x11, x11, x5; \ - sbcs x12, x12, x4; \ - sbcs x13, x13, x3; \ - sbcs x2, x2, xzr; \ + lsl x4, x10, #32; \ + add x10, x4, x10; \ + lsr x4, x10, #32; \ + subs x4, x4, x10; \ + sbc x3, x10, xzr; \ + extr x4, x3, x4, #32; \ + lsr x3, x3, #32; \ + adds x3, x3, x10; \ + adc x2, xzr, xzr; \ + subs x11, x11, x4; \ + sbcs x12, x12, x3; \ + sbcs x13, x13, x2; \ + sbcs x8, x8, xzr; \ sbcs x9, x9, xzr; \ sbc x10, x10, xzr; \ - lsl x5, x11, #32; \ - add x11, x5, x11; \ - mov x5, #-4294967295; \ - umulh x5, x5, x11; \ - mov x4, #4294967295; \ - mul x3, x4, x11; \ - umulh x4, x4, x11; \ - adds x5, x5, x3; \ - adcs x4, x4, x11; \ - adc x3, xzr, xzr; \ - subs 
x12, x12, x5; \ - sbcs x13, x13, x4; \ - sbcs x2, x2, x3; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbc x11, x11, xzr; \ - lsl x5, x12, #32; \ - add x12, x5, x12; \ - mov x5, #-4294967295; \ - umulh x5, x5, x12; \ - mov x4, #4294967295; \ - mul x3, x4, x12; \ - umulh x4, x4, x12; \ - adds x5, x5, x3; \ - adcs x4, x4, x12; \ - adc x3, xzr, xzr; \ - subs x13, x13, x5; \ - sbcs x2, x2, x4; \ - sbcs x9, x9, x3; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbc x12, x12, xzr; \ - lsl x5, x13, #32; \ - add x13, x5, x13; \ - mov x5, #-4294967295; \ - umulh x5, x5, x13; \ - mov x4, #4294967295; \ - mul x3, x4, x13; \ - umulh x4, x4, x13; \ - adds x5, x5, x3; \ - adcs x4, x4, x13; \ - adc x3, xzr, xzr; \ - subs x2, x2, x5; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - adds x2, x2, x14; \ - adcs x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, x17; \ - adcs x12, x12, x19; \ - adcs x13, x13, x20; \ - mov x14, #-4294967295; \ - mov x15, #4294967295; \ - csel x14, x14, xzr, cs; \ - csel x15, x15, xzr, cs; \ - cset x16, cs; \ - adds x2, x2, x14; \ - adcs x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, xzr; \ - adcs x12, x12, xzr; \ - adc x13, x13, xzr; \ - stp x2, x9, [P0]; \ - stp x10, x11, [P0+16]; \ - stp x12, x13, [P0+32] + adds x17, x17, x8; \ + adcs x8, x9, xzr; \ + adcs x9, x10, xzr; \ + adcs x10, xzr, xzr; \ + mul x1, x5, x5; \ + adds x11, x11, x1; \ + mul x14, x6, x6; \ + mul x15, x7, x7; \ + umulh x1, x5, x5; \ + adcs x12, x12, x1; \ + umulh x1, x6, x6; \ + adcs x13, x13, x14; \ + adcs x17, x17, x1; \ + umulh x1, x7, x7; \ + adcs x8, x8, x15; \ + adcs x9, x9, x1; \ + adc x10, x10, xzr; \ + mul x1, x5, x6; \ + mul x14, x5, x7; \ + mul x15, x6, x7; \ + umulh x16, x5, x6; \ + adds x14, x14, x16; \ + umulh x16, x5, x7; \ + adcs x15, x15, x16; \ + umulh x16, x6, x7; \ + adc x16, x16, xzr; \ + adds x1, x1, x1; \ + adcs x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, 
x16, x16; \ + adc x5, xzr, xzr; \ + adds x12, x12, x1; \ + adcs x13, x13, x14; \ + adcs x17, x17, x15; \ + adcs x8, x8, x16; \ + adcs x9, x9, x5; \ + adc x10, x10, xzr; \ + mov x1, #-4294967295; \ + mov x14, #4294967295; \ + mov x15, #1; \ + cmn x11, x1; \ + adcs xzr, x12, x14; \ + adcs xzr, x13, x15; \ + adcs xzr, x17, xzr; \ + adcs xzr, x8, xzr; \ + adcs xzr, x9, xzr; \ + adc x10, x10, xzr; \ + neg x10, x10; \ + and x1, x1, x10; \ + adds x11, x11, x1; \ + and x14, x14, x10; \ + adcs x12, x12, x14; \ + and x15, x15, x10; \ + adcs x13, x13, x15; \ + adcs x17, x17, xzr; \ + adcs x8, x8, xzr; \ + adc x9, x9, xzr; \ + stp x11, x12, [P0]; \ + stp x13, x17, [P0+16]; \ + stp x8, x9, [P0+32] // Corresponds exactly to bignum_sub_p384 @@ -827,7 +765,7 @@ S2N_BN_SYMBOL(p384_montjmixadd): // Main code, just a sequence of basic field operations // 8 * multiply + 3 * square + 7 * subtract - amontsqr_p384(zp2,z_1) + montsqr_p384(zp2,z_1) montmul_p384(y2a,z_1,y_2) montmul_p384(x2a,zp2,x_2) @@ -836,25 +774,91 @@ S2N_BN_SYMBOL(p384_montjmixadd): sub_p384(xd,x2a,x_1) sub_p384(yd,y2a,y_1) - amontsqr_p384(zz,xd) + montsqr_p384(zz,xd) montsqr_p384(ww,yd) montmul_p384(zzx1,zz,x_1) montmul_p384(zzx2,zz,x2a) - sub_p384(x_3,ww,zzx1) + sub_p384(resx,ww,zzx1) sub_p384(t1,zzx2,zzx1) - montmul_p384(z_3,xd,z_1) + montmul_p384(resz,xd,z_1) - sub_p384(x_3,x_3,zzx2) + sub_p384(resx,resx,zzx2) - sub_p384(t2,zzx1,x_3) + sub_p384(t2,zzx1,resx) montmul_p384(t1,t1,y_1) montmul_p384(t2,yd,t2) - sub_p384(y_3,t2,t1) + sub_p384(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + ldp x4, x5, [z_1+32] + orr x6, x0, x1 + orr x7, x2, x3 + orr x8, x4, x5 + orr x6, x6, x7 + orr x6, x6, x8 + cmp x6, xzr + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. 
+// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in +// Montgomery form so not the simple constant 1 but rather 2^384 - p_384), +// hence giving 0 + p2 = p2 for the final result. + + ldp x0, x1, [resx] + ldp x19, x20, [x_2] + csel x0, x0, x19, ne + csel x1, x1, x20, ne + ldp x2, x3, [resx+16] + ldp x19, x20, [x_2+16] + csel x2, x2, x19, ne + csel x3, x3, x20, ne + ldp x4, x5, [resx+32] + ldp x19, x20, [x_2+32] + csel x4, x4, x19, ne + csel x5, x5, x20, ne + + ldp x6, x7, [resy] + ldp x19, x20, [y_2] + csel x6, x6, x19, ne + csel x7, x7, x20, ne + ldp x8, x9, [resy+16] + ldp x19, x20, [y_2+16] + csel x8, x8, x19, ne + csel x9, x9, x20, ne + ldp x10, x11, [resy+32] + ldp x19, x20, [y_2+32] + csel x10, x10, x19, ne + csel x11, x11, x20, ne + + ldp x12, x13, [resz] + mov x19, #0xffffffff00000001 + mov x20, #0x00000000ffffffff + csel x12, x12, x19, ne + csel x13, x13, x20, ne + ldp x14, x15, [resz+16] + mov x19, #1 + csel x14, x14, x19, ne + csel x15, x15, xzr, ne + ldp x16, x17, [resz+32] + csel x16, x16, xzr, ne + csel x17, x17, xzr, ne + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [x_3+32] + stp x6, x7, [y_3] + stp x8, x9, [y_3+16] + stp x10, x11, [y_3+32] + stp x12, x13, [z_3] + stp x14, x15, [z_3+16] + stp x16, x17, [z_3+32] // Restore stack and registers diff --git a/third_party/s2n-bignum/arm/p384/p384_montjmixadd_alt.S b/third_party/s2n-bignum/arm/p384/p384_montjmixadd_alt.S new file mode 100644 index 0000000000..f36301a11e --- /dev/null +++ b/third_party/s2n-bignum/arm/p384/p384_montjmixadd_alt.S @@ -0,0 +1,941 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on NIST curve P-384 in Montgomery-Jacobian coordinates +// +// extern void p384_montjmixadd_alt +// (uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjmixadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjmixadd_alt) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 48 + +// Stable homes for input arguments during main code sequence + +#define input_z x24 +#define input_x x25 +#define input_y x26 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, 
#(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define NSPACE (NUMSIZE*6) + +// Corresponds exactly to bignum_montmul_p384_alt + +#define montmul_p384(P0,P1,P2) \ + ldp x3, x4, [P1]; \ + ldp x5, x6, [P2]; \ + mul x12, x3, x5; \ + umulh x13, x3, x5; \ + mul x11, x3, x6; \ + umulh x14, x3, x6; \ + adds x13, x13, x11; \ + ldp x7, x8, [P2+16]; \ + mul x11, x3, x7; \ + umulh x15, x3, x7; \ + adcs x14, x14, x11; \ + mul x11, x3, x8; \ + umulh x16, x3, x8; \ + adcs x15, x15, x11; \ + ldp x9, x10, [P2+32]; \ + mul x11, x3, x9; \ + umulh x17, x3, x9; \ + adcs x16, x16, x11; \ + mul x11, x3, x10; \ + umulh x19, x3, x10; \ + adcs x17, x17, x11; \ + adc x19, x19, xzr; \ + mul x11, x4, x5; \ + adds x13, x13, x11; \ + mul x11, x4, x6; \ + adcs x14, x14, x11; \ + mul x11, x4, x7; \ + adcs x15, x15, x11; \ + mul x11, x4, x8; \ + adcs x16, x16, x11; \ + mul x11, x4, x9; \ + adcs x17, x17, x11; \ + mul x11, x4, x10; \ + adcs x19, x19, x11; \ + cset x20, cs; \ + umulh x11, x4, x5; \ + adds x14, x14, x11; \ + umulh x11, x4, x6; \ + adcs x15, x15, x11; \ + umulh x11, x4, x7; \ + adcs x16, x16, x11; \ + umulh x11, x4, x8; \ + adcs x17, x17, x11; \ + umulh x11, x4, x9; \ + adcs x19, x19, x11; \ + umulh x11, x4, x10; \ + adc x20, x20, x11; \ + ldp x3, x4, [P1+16]; \ + mul x11, x3, x5; \ + adds x14, x14, x11; \ + mul x11, x3, x6; \ + adcs x15, x15, x11; \ + mul x11, x3, x7; \ + adcs x16, x16, x11; \ + mul x11, x3, x8; \ + adcs x17, x17, x11; \ + mul x11, x3, x9; \ + adcs x19, x19, x11; \ + mul x11, x3, x10; \ + adcs x20, x20, x11; \ + cset x21, cs; \ + umulh x11, x3, x5; \ + adds x15, x15, x11; \ + umulh x11, x3, x6; \ + adcs x16, x16, x11; \ + umulh x11, x3, x7; \ + adcs x17, x17, x11; \ + umulh x11, x3, x8; \ + adcs x19, x19, x11; \ + umulh x11, x3, x9; \ + adcs x20, x20, x11; \ + umulh x11, x3, x10; \ + adc x21, x21, x11; \ + mul x11, x4, x5; \ + adds x15, x15, x11; \ + mul x11, x4, x6; \ + adcs x16, x16, x11; \ + mul x11, x4, x7; \ + adcs x17, x17, x11; \ + mul x11, x4, x8; \ + adcs 
x19, x19, x11; \ + mul x11, x4, x9; \ + adcs x20, x20, x11; \ + mul x11, x4, x10; \ + adcs x21, x21, x11; \ + cset x22, cs; \ + umulh x11, x4, x5; \ + adds x16, x16, x11; \ + umulh x11, x4, x6; \ + adcs x17, x17, x11; \ + umulh x11, x4, x7; \ + adcs x19, x19, x11; \ + umulh x11, x4, x8; \ + adcs x20, x20, x11; \ + umulh x11, x4, x9; \ + adcs x21, x21, x11; \ + umulh x11, x4, x10; \ + adc x22, x22, x11; \ + ldp x3, x4, [P1+32]; \ + mul x11, x3, x5; \ + adds x16, x16, x11; \ + mul x11, x3, x6; \ + adcs x17, x17, x11; \ + mul x11, x3, x7; \ + adcs x19, x19, x11; \ + mul x11, x3, x8; \ + adcs x20, x20, x11; \ + mul x11, x3, x9; \ + adcs x21, x21, x11; \ + mul x11, x3, x10; \ + adcs x22, x22, x11; \ + cset x2, cs; \ + umulh x11, x3, x5; \ + adds x17, x17, x11; \ + umulh x11, x3, x6; \ + adcs x19, x19, x11; \ + umulh x11, x3, x7; \ + adcs x20, x20, x11; \ + umulh x11, x3, x8; \ + adcs x21, x21, x11; \ + umulh x11, x3, x9; \ + adcs x22, x22, x11; \ + umulh x11, x3, x10; \ + adc x2, x2, x11; \ + mul x11, x4, x5; \ + adds x17, x17, x11; \ + mul x11, x4, x6; \ + adcs x19, x19, x11; \ + mul x11, x4, x7; \ + adcs x20, x20, x11; \ + mul x11, x4, x8; \ + adcs x21, x21, x11; \ + mul x11, x4, x9; \ + adcs x22, x22, x11; \ + mul x11, x4, x10; \ + adcs x2, x2, x11; \ + cset x1, cs; \ + umulh x11, x4, x5; \ + adds x19, x19, x11; \ + umulh x11, x4, x6; \ + adcs x20, x20, x11; \ + umulh x11, x4, x7; \ + adcs x21, x21, x11; \ + umulh x11, x4, x8; \ + adcs x22, x22, x11; \ + umulh x11, x4, x9; \ + adcs x2, x2, x11; \ + umulh x11, x4, x10; \ + adc x1, x1, x11; \ + lsl x7, x12, #32; \ + add x12, x7, x12; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x12; \ + mov x6, #0xffffffff; \ + mul x5, x6, x12; \ + umulh x6, x6, x12; \ + adds x7, x7, x5; \ + adcs x6, x6, x12; \ + adc x5, xzr, xzr; \ + subs x13, x13, x7; \ + sbcs x14, x14, x6; \ + sbcs x15, x15, x5; \ + sbcs x16, x16, xzr; \ + sbcs x17, x17, xzr; \ + sbc x12, x12, xzr; \ + lsl x7, x13, #32; \ + add x13, x7, x13; \ + mov x7, 
#0xffffffff00000001; \ + umulh x7, x7, x13; \ + mov x6, #0xffffffff; \ + mul x5, x6, x13; \ + umulh x6, x6, x13; \ + adds x7, x7, x5; \ + adcs x6, x6, x13; \ + adc x5, xzr, xzr; \ + subs x14, x14, x7; \ + sbcs x15, x15, x6; \ + sbcs x16, x16, x5; \ + sbcs x17, x17, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + lsl x7, x14, #32; \ + add x14, x7, x14; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x14; \ + mov x6, #0xffffffff; \ + mul x5, x6, x14; \ + umulh x6, x6, x14; \ + adds x7, x7, x5; \ + adcs x6, x6, x14; \ + adc x5, xzr, xzr; \ + subs x15, x15, x7; \ + sbcs x16, x16, x6; \ + sbcs x17, x17, x5; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbc x14, x14, xzr; \ + lsl x7, x15, #32; \ + add x15, x7, x15; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x15; \ + mov x6, #0xffffffff; \ + mul x5, x6, x15; \ + umulh x6, x6, x15; \ + adds x7, x7, x5; \ + adcs x6, x6, x15; \ + adc x5, xzr, xzr; \ + subs x16, x16, x7; \ + sbcs x17, x17, x6; \ + sbcs x12, x12, x5; \ + sbcs x13, x13, xzr; \ + sbcs x14, x14, xzr; \ + sbc x15, x15, xzr; \ + lsl x7, x16, #32; \ + add x16, x7, x16; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x16; \ + mov x6, #0xffffffff; \ + mul x5, x6, x16; \ + umulh x6, x6, x16; \ + adds x7, x7, x5; \ + adcs x6, x6, x16; \ + adc x5, xzr, xzr; \ + subs x17, x17, x7; \ + sbcs x12, x12, x6; \ + sbcs x13, x13, x5; \ + sbcs x14, x14, xzr; \ + sbcs x15, x15, xzr; \ + sbc x16, x16, xzr; \ + lsl x7, x17, #32; \ + add x17, x7, x17; \ + mov x7, #0xffffffff00000001; \ + umulh x7, x7, x17; \ + mov x6, #0xffffffff; \ + mul x5, x6, x17; \ + umulh x6, x6, x17; \ + adds x7, x7, x5; \ + adcs x6, x6, x17; \ + adc x5, xzr, xzr; \ + subs x12, x12, x7; \ + sbcs x13, x13, x6; \ + sbcs x14, x14, x5; \ + sbcs x15, x15, xzr; \ + sbcs x16, x16, xzr; \ + sbc x17, x17, xzr; \ + adds x12, x12, x19; \ + adcs x13, x13, x20; \ + adcs x14, x14, x21; \ + adcs x15, x15, x22; \ + adcs x16, x16, x2; \ + adcs x17, x17, x1; \ + adc x10, xzr, xzr; \ + mov x11, 
#0xffffffff00000001; \ + adds x19, x12, x11; \ + mov x11, #0xffffffff; \ + adcs x20, x13, x11; \ + mov x11, #0x1; \ + adcs x21, x14, x11; \ + adcs x22, x15, xzr; \ + adcs x2, x16, xzr; \ + adcs x1, x17, xzr; \ + adcs x10, x10, xzr; \ + csel x12, x12, x19, eq; \ + csel x13, x13, x20, eq; \ + csel x14, x14, x21, eq; \ + csel x15, x15, x22, eq; \ + csel x16, x16, x2, eq; \ + csel x17, x17, x1, eq; \ + stp x12, x13, [P0]; \ + stp x14, x15, [P0+16]; \ + stp x16, x17, [P0+32] + +// Corresponds exactly to bignum_montsqr_p384_alt + +#define montsqr_p384(P0,P1) \ + ldp x2, x3, [P1]; \ + mul x9, x2, x3; \ + umulh x10, x2, x3; \ + ldp x4, x5, [P1+16]; \ + mul x8, x2, x4; \ + adds x10, x10, x8; \ + mul x11, x2, x5; \ + mul x8, x3, x4; \ + adcs x11, x11, x8; \ + umulh x12, x2, x5; \ + mul x8, x3, x5; \ + adcs x12, x12, x8; \ + ldp x6, x7, [P1+32]; \ + mul x13, x2, x7; \ + mul x8, x3, x6; \ + adcs x13, x13, x8; \ + umulh x14, x2, x7; \ + mul x8, x3, x7; \ + adcs x14, x14, x8; \ + mul x15, x5, x6; \ + adcs x15, x15, xzr; \ + umulh x16, x5, x6; \ + adc x16, x16, xzr; \ + umulh x8, x2, x4; \ + adds x11, x11, x8; \ + umulh x8, x3, x4; \ + adcs x12, x12, x8; \ + umulh x8, x3, x5; \ + adcs x13, x13, x8; \ + umulh x8, x3, x6; \ + adcs x14, x14, x8; \ + umulh x8, x3, x7; \ + adcs x15, x15, x8; \ + adc x16, x16, xzr; \ + mul x8, x2, x6; \ + adds x12, x12, x8; \ + mul x8, x4, x5; \ + adcs x13, x13, x8; \ + mul x8, x4, x6; \ + adcs x14, x14, x8; \ + mul x8, x4, x7; \ + adcs x15, x15, x8; \ + mul x8, x5, x7; \ + adcs x16, x16, x8; \ + mul x17, x6, x7; \ + adcs x17, x17, xzr; \ + umulh x19, x6, x7; \ + adc x19, x19, xzr; \ + umulh x8, x2, x6; \ + adds x13, x13, x8; \ + umulh x8, x4, x5; \ + adcs x14, x14, x8; \ + umulh x8, x4, x6; \ + adcs x15, x15, x8; \ + umulh x8, x4, x7; \ + adcs x16, x16, x8; \ + umulh x8, x5, x7; \ + adcs x17, x17, x8; \ + adc x19, x19, xzr; \ + adds x9, x9, x9; \ + adcs x10, x10, x10; \ + adcs x11, x11, x11; \ + adcs x12, x12, x12; \ + adcs x13, x13, x13; \ + adcs 
x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, x16, x16; \ + adcs x17, x17, x17; \ + adcs x19, x19, x19; \ + cset x20, hs; \ + umulh x8, x2, x2; \ + mul x2, x2, x2; \ + adds x9, x9, x8; \ + mul x8, x3, x3; \ + adcs x10, x10, x8; \ + umulh x8, x3, x3; \ + adcs x11, x11, x8; \ + mul x8, x4, x4; \ + adcs x12, x12, x8; \ + umulh x8, x4, x4; \ + adcs x13, x13, x8; \ + mul x8, x5, x5; \ + adcs x14, x14, x8; \ + umulh x8, x5, x5; \ + adcs x15, x15, x8; \ + mul x8, x6, x6; \ + adcs x16, x16, x8; \ + umulh x8, x6, x6; \ + adcs x17, x17, x8; \ + mul x8, x7, x7; \ + adcs x19, x19, x8; \ + umulh x8, x7, x7; \ + adc x20, x20, x8; \ + lsl x5, x2, #32; \ + add x2, x5, x2; \ + mov x5, #-4294967295; \ + umulh x5, x5, x2; \ + mov x4, #4294967295; \ + mul x3, x4, x2; \ + umulh x4, x4, x2; \ + adds x5, x5, x3; \ + adcs x4, x4, x2; \ + adc x3, xzr, xzr; \ + subs x9, x9, x5; \ + sbcs x10, x10, x4; \ + sbcs x11, x11, x3; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbc x2, x2, xzr; \ + lsl x5, x9, #32; \ + add x9, x5, x9; \ + mov x5, #-4294967295; \ + umulh x5, x5, x9; \ + mov x4, #4294967295; \ + mul x3, x4, x9; \ + umulh x4, x4, x9; \ + adds x5, x5, x3; \ + adcs x4, x4, x9; \ + adc x3, xzr, xzr; \ + subs x10, x10, x5; \ + sbcs x11, x11, x4; \ + sbcs x12, x12, x3; \ + sbcs x13, x13, xzr; \ + sbcs x2, x2, xzr; \ + sbc x9, x9, xzr; \ + lsl x5, x10, #32; \ + add x10, x5, x10; \ + mov x5, #-4294967295; \ + umulh x5, x5, x10; \ + mov x4, #4294967295; \ + mul x3, x4, x10; \ + umulh x4, x4, x10; \ + adds x5, x5, x3; \ + adcs x4, x4, x10; \ + adc x3, xzr, xzr; \ + subs x11, x11, x5; \ + sbcs x12, x12, x4; \ + sbcs x13, x13, x3; \ + sbcs x2, x2, xzr; \ + sbcs x9, x9, xzr; \ + sbc x10, x10, xzr; \ + lsl x5, x11, #32; \ + add x11, x5, x11; \ + mov x5, #-4294967295; \ + umulh x5, x5, x11; \ + mov x4, #4294967295; \ + mul x3, x4, x11; \ + umulh x4, x4, x11; \ + adds x5, x5, x3; \ + adcs x4, x4, x11; \ + adc x3, xzr, xzr; \ + subs x12, x12, x5; \ + sbcs x13, x13, x4; \ + sbcs x2, x2, x3; 
\ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbc x11, x11, xzr; \ + lsl x5, x12, #32; \ + add x12, x5, x12; \ + mov x5, #-4294967295; \ + umulh x5, x5, x12; \ + mov x4, #4294967295; \ + mul x3, x4, x12; \ + umulh x4, x4, x12; \ + adds x5, x5, x3; \ + adcs x4, x4, x12; \ + adc x3, xzr, xzr; \ + subs x13, x13, x5; \ + sbcs x2, x2, x4; \ + sbcs x9, x9, x3; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbc x12, x12, xzr; \ + lsl x5, x13, #32; \ + add x13, x5, x13; \ + mov x5, #-4294967295; \ + umulh x5, x5, x13; \ + mov x4, #4294967295; \ + mul x3, x4, x13; \ + umulh x4, x4, x13; \ + adds x5, x5, x3; \ + adcs x4, x4, x13; \ + adc x3, xzr, xzr; \ + subs x2, x2, x5; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + adds x2, x2, x14; \ + adcs x9, x9, x15; \ + adcs x10, x10, x16; \ + adcs x11, x11, x17; \ + adcs x12, x12, x19; \ + adcs x13, x13, x20; \ + adc x6, xzr, xzr; \ + mov x8, #-4294967295; \ + adds x14, x2, x8; \ + mov x8, #4294967295; \ + adcs x15, x9, x8; \ + mov x8, #1; \ + adcs x16, x10, x8; \ + adcs x17, x11, xzr; \ + adcs x19, x12, xzr; \ + adcs x20, x13, xzr; \ + adcs x6, x6, xzr; \ + csel x2, x2, x14, eq; \ + csel x9, x9, x15, eq; \ + csel x10, x10, x16, eq; \ + csel x11, x11, x17, eq; \ + csel x12, x12, x19, eq; \ + csel x13, x13, x20, eq; \ + stp x2, x9, [P0]; \ + stp x10, x11, [P0+16]; \ + stp x12, x13, [P0+32] + +// Almost-Montgomery variant which we use when an input to other muls +// with the other argument fully reduced (which is always safe). In +// fact, with the Karatsuba-based Montgomery mul here, we don't even +// *need* the restriction that the other argument is reduced. 
+ +#define amontsqr_p384(P0,P1) \ + ldp x2, x3, [P1]; \ + mul x9, x2, x3; \ + umulh x10, x2, x3; \ + ldp x4, x5, [P1+16]; \ + mul x8, x2, x4; \ + adds x10, x10, x8; \ + mul x11, x2, x5; \ + mul x8, x3, x4; \ + adcs x11, x11, x8; \ + umulh x12, x2, x5; \ + mul x8, x3, x5; \ + adcs x12, x12, x8; \ + ldp x6, x7, [P1+32]; \ + mul x13, x2, x7; \ + mul x8, x3, x6; \ + adcs x13, x13, x8; \ + umulh x14, x2, x7; \ + mul x8, x3, x7; \ + adcs x14, x14, x8; \ + mul x15, x5, x6; \ + adcs x15, x15, xzr; \ + umulh x16, x5, x6; \ + adc x16, x16, xzr; \ + umulh x8, x2, x4; \ + adds x11, x11, x8; \ + umulh x8, x3, x4; \ + adcs x12, x12, x8; \ + umulh x8, x3, x5; \ + adcs x13, x13, x8; \ + umulh x8, x3, x6; \ + adcs x14, x14, x8; \ + umulh x8, x3, x7; \ + adcs x15, x15, x8; \ + adc x16, x16, xzr; \ + mul x8, x2, x6; \ + adds x12, x12, x8; \ + mul x8, x4, x5; \ + adcs x13, x13, x8; \ + mul x8, x4, x6; \ + adcs x14, x14, x8; \ + mul x8, x4, x7; \ + adcs x15, x15, x8; \ + mul x8, x5, x7; \ + adcs x16, x16, x8; \ + mul x17, x6, x7; \ + adcs x17, x17, xzr; \ + umulh x19, x6, x7; \ + adc x19, x19, xzr; \ + umulh x8, x2, x6; \ + adds x13, x13, x8; \ + umulh x8, x4, x5; \ + adcs x14, x14, x8; \ + umulh x8, x4, x6; \ + adcs x15, x15, x8; \ + umulh x8, x4, x7; \ + adcs x16, x16, x8; \ + umulh x8, x5, x7; \ + adcs x17, x17, x8; \ + adc x19, x19, xzr; \ + adds x9, x9, x9; \ + adcs x10, x10, x10; \ + adcs x11, x11, x11; \ + adcs x12, x12, x12; \ + adcs x13, x13, x13; \ + adcs x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, x16, x16; \ + adcs x17, x17, x17; \ + adcs x19, x19, x19; \ + cset x20, hs; \ + umulh x8, x2, x2; \ + mul x2, x2, x2; \ + adds x9, x9, x8; \ + mul x8, x3, x3; \ + adcs x10, x10, x8; \ + umulh x8, x3, x3; \ + adcs x11, x11, x8; \ + mul x8, x4, x4; \ + adcs x12, x12, x8; \ + umulh x8, x4, x4; \ + adcs x13, x13, x8; \ + mul x8, x5, x5; \ + adcs x14, x14, x8; \ + umulh x8, x5, x5; \ + adcs x15, x15, x8; \ + mul x8, x6, x6; \ + adcs x16, x16, x8; \ + umulh x8, x6, x6; \ + adcs 
x17, x17, x8; \ + mul x8, x7, x7; \ + adcs x19, x19, x8; \ + umulh x8, x7, x7; \ + adc x20, x20, x8; \ + lsl x5, x2, #32; \ + add x2, x5, x2; \ + mov x5, #-4294967295; \ + umulh x5, x5, x2; \ + mov x4, #4294967295; \ + mul x3, x4, x2; \ + umulh x4, x4, x2; \ + adds x5, x5, x3; \ + adcs x4, x4, x2; \ + adc x3, xzr, xzr; \ + subs x9, x9, x5; \ + sbcs x10, x10, x4; \ + sbcs x11, x11, x3; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbc x2, x2, xzr; \ + lsl x5, x9, #32; \ + add x9, x5, x9; \ + mov x5, #-4294967295; \ + umulh x5, x5, x9; \ + mov x4, #4294967295; \ + mul x3, x4, x9; \ + umulh x4, x4, x9; \ + adds x5, x5, x3; \ + adcs x4, x4, x9; \ + adc x3, xzr, xzr; \ + subs x10, x10, x5; \ + sbcs x11, x11, x4; \ + sbcs x12, x12, x3; \ + sbcs x13, x13, xzr; \ + sbcs x2, x2, xzr; \ + sbc x9, x9, xzr; \ + lsl x5, x10, #32; \ + add x10, x5, x10; \ + mov x5, #-4294967295; \ + umulh x5, x5, x10; \ + mov x4, #4294967295; \ + mul x3, x4, x10; \ + umulh x4, x4, x10; \ + adds x5, x5, x3; \ + adcs x4, x4, x10; \ + adc x3, xzr, xzr; \ + subs x11, x11, x5; \ + sbcs x12, x12, x4; \ + sbcs x13, x13, x3; \ + sbcs x2, x2, xzr; \ + sbcs x9, x9, xzr; \ + sbc x10, x10, xzr; \ + lsl x5, x11, #32; \ + add x11, x5, x11; \ + mov x5, #-4294967295; \ + umulh x5, x5, x11; \ + mov x4, #4294967295; \ + mul x3, x4, x11; \ + umulh x4, x4, x11; \ + adds x5, x5, x3; \ + adcs x4, x4, x11; \ + adc x3, xzr, xzr; \ + subs x12, x12, x5; \ + sbcs x13, x13, x4; \ + sbcs x2, x2, x3; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbc x11, x11, xzr; \ + lsl x5, x12, #32; \ + add x12, x5, x12; \ + mov x5, #-4294967295; \ + umulh x5, x5, x12; \ + mov x4, #4294967295; \ + mul x3, x4, x12; \ + umulh x4, x4, x12; \ + adds x5, x5, x3; \ + adcs x4, x4, x12; \ + adc x3, xzr, xzr; \ + subs x13, x13, x5; \ + sbcs x2, x2, x4; \ + sbcs x9, x9, x3; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbc x12, x12, xzr; \ + lsl x5, x13, #32; \ + add x13, x5, x13; \ + mov x5, #-4294967295; \ + umulh x5, x5, x13; \ + 
mov x4, #4294967295; \ + mul x3, x4, x13; \ + umulh x4, x4, x13; \ + adds x5, x5, x3; \ + adcs x4, x4, x13; \ + adc x3, xzr, xzr; \ + subs x2, x2, x5; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + adds x2, x2, x14; \ + adcs x9, x9, x15; \ + adcs x10, x10, x16; \ + adcs x11, x11, x17; \ + adcs x12, x12, x19; \ + adcs x13, x13, x20; \ + mov x14, #-4294967295; \ + mov x15, #4294967295; \ + csel x14, x14, xzr, cs; \ + csel x15, x15, xzr, cs; \ + cset x16, cs; \ + adds x2, x2, x14; \ + adcs x9, x9, x15; \ + adcs x10, x10, x16; \ + adcs x11, x11, xzr; \ + adcs x12, x12, xzr; \ + adc x13, x13, xzr; \ + stp x2, x9, [P0]; \ + stp x10, x11, [P0+16]; \ + stp x12, x13, [P0+32] + +// Corresponds exactly to bignum_sub_p384 + +#define sub_p384(P0,P1,P2) \ + ldp x5, x6, [P1]; \ + ldp x4, x3, [P2]; \ + subs x5, x5, x4; \ + sbcs x6, x6, x3; \ + ldp x7, x8, [P1+16]; \ + ldp x4, x3, [P2+16]; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x3; \ + ldp x9, x10, [P1+32]; \ + ldp x4, x3, [P2+32]; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + csetm x3, lo; \ + mov x4, #4294967295; \ + and x4, x4, x3; \ + adds x5, x5, x4; \ + eor x4, x4, x3; \ + adcs x6, x6, x4; \ + mov x4, #-2; \ + and x4, x4, x3; \ + adcs x7, x7, x4; \ + adcs x8, x8, x3; \ + adcs x9, x9, x3; \ + adc x10, x10, x3; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32] + +S2N_BN_SYMBOL(p384_montjmixadd_alt): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! 
+ sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + amontsqr_p384(zp2,z_1) + montmul_p384(y2a,z_1,y_2) + + montmul_p384(x2a,zp2,x_2) + montmul_p384(y2a,zp2,y2a) + + sub_p384(xd,x2a,x_1) + sub_p384(yd,y2a,y_1) + + amontsqr_p384(zz,xd) + montsqr_p384(ww,yd) + + montmul_p384(zzx1,zz,x_1) + montmul_p384(zzx2,zz,x2a) + + sub_p384(resx,ww,zzx1) + sub_p384(t1,zzx2,zzx1) + + montmul_p384(resz,xd,z_1) + + sub_p384(resx,resx,zzx2) + + sub_p384(t2,zzx1,resx) + + montmul_p384(t1,t1,y_1) + montmul_p384(t2,yd,t2) + + sub_p384(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + ldp x4, x5, [z_1+32] + orr x6, x0, x1 + orr x7, x2, x3 + orr x8, x4, x5 + orr x6, x6, x7 + orr x6, x6, x8 + cmp x6, xzr + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in +// Montgomery form so not the simple constant 1 but rather 2^384 - p_384), +// hence giving 0 + p2 = p2 for the final result. 
+ + ldp x0, x1, [resx] + ldp x19, x20, [x_2] + csel x0, x0, x19, ne + csel x1, x1, x20, ne + ldp x2, x3, [resx+16] + ldp x19, x20, [x_2+16] + csel x2, x2, x19, ne + csel x3, x3, x20, ne + ldp x4, x5, [resx+32] + ldp x19, x20, [x_2+32] + csel x4, x4, x19, ne + csel x5, x5, x20, ne + + ldp x6, x7, [resy] + ldp x19, x20, [y_2] + csel x6, x6, x19, ne + csel x7, x7, x20, ne + ldp x8, x9, [resy+16] + ldp x19, x20, [y_2+16] + csel x8, x8, x19, ne + csel x9, x9, x20, ne + ldp x10, x11, [resy+32] + ldp x19, x20, [y_2+32] + csel x10, x10, x19, ne + csel x11, x11, x20, ne + + ldp x12, x13, [resz] + mov x19, #0xffffffff00000001 + mov x20, #0x00000000ffffffff + csel x12, x12, x19, ne + csel x13, x13, x20, ne + ldp x14, x15, [resz+16] + mov x19, #1 + csel x14, x14, x19, ne + csel x15, x15, xzr, ne + ldp x16, x17, [resz+32] + csel x16, x16, xzr, ne + csel x17, x17, xzr, ne + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [x_3+32] + stp x6, x7, [y_3] + stp x8, x9, [y_3+16] + stp x10, x11, [y_3+32] + stp x12, x13, [z_3] + stp x14, x15, [z_3+16] + stp x16, x17, [z_3+32] + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x25, x26, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/p521/Makefile b/third_party/s2n-bignum/arm/p521/Makefile index ae0d4f8d70..3e5e0e855c 100644 --- a/third_party/s2n-bignum/arm/p521/Makefile +++ b/third_party/s2n-bignum/arm/p521/Makefile @@ -32,21 +32,28 @@ OBJ = bignum_add_p521.o \ bignum_mod_p521_9.o \ bignum_montmul_p521.o \ bignum_montmul_p521_alt.o \ + bignum_montmul_p521_neon.o \ bignum_montsqr_p521.o \ bignum_montsqr_p521_alt.o \ + bignum_montsqr_p521_neon.o \ bignum_mul_p521.o \ bignum_mul_p521_alt.o \ + bignum_mul_p521_neon.o \ bignum_neg_p521.o \ bignum_optneg_p521.o \ bignum_sqr_p521.o \ bignum_sqr_p521_alt.o \ + bignum_sqr_p521_neon.o \ 
bignum_sub_p521.o \ bignum_tolebytes_p521.o \ bignum_tomont_p521.o \ bignum_triple_p521.o \ p521_jadd.o \ + p521_jadd_alt.o \ p521_jdouble.o \ - p521_jmixadd.o + p521_jdouble_alt.o \ + p521_jmixadd.o \ + p521_jmixadd_alt.o %.o : %.S ; $(CC) -E -I../../include $< | $(GAS) -o $@ - diff --git a/third_party/s2n-bignum/arm/p521/bignum_montmul_p521_neon.S b/third_party/s2n-bignum/arm/p521/bignum_montmul_p521_neon.S new file mode 100644 index 0000000000..9586339f95 --- /dev/null +++ b/third_party/s2n-bignum/arm/p521/bignum_montmul_p521_neon.S @@ -0,0 +1,1415 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^576) mod p_521 +// Inputs x[9], y[9]; output z[9] +// +// extern void bignum_montmul_p521_neon +// (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]); +// +// Does z := (x * y / 2^576) mod p_521, assuming x < p_521, y < p_521. This +// means the Montgomery base is the "native size" 2^{9*64} = 2^576; since +// p_521 is a Mersenne prime the basic modular multiplication bignum_mul_p521 +// can be considered a Montgomery operation to base 2^521. +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + +// bignum_montmul_p521_neon is functionally equivalent to bignum_montmul_p521. +// It is written in a way that +// 1. A subset of scalar multiplications in bignum_montmul_p521 are carefully +// chosen and vectorized +// 2. The vectorized assembly is rescheduled using the SLOTHY superoptimizer. +// https://github.com/slothy-optimizer/slothy +// +// The output program of step 1. is as follows: +// +// stp x19, x20, [sp, #-16]! +// stp x21, x22, [sp, #-16]! +// stp x23, x24, [sp, #-16]! +// stp x25, x26, [sp, #-16]! 
+// sub sp, sp, #80 +// ldp x14, x7, [x1] +// ldp x3, x25, [x1, #16] +// ldp x10, x24, [x2] +// ldr q0, [x1] +// ldr q25, [x2] +// ldp x12, x6, [x2, #16] +// movi v18.2D, #0x00000000ffffffff +// uzp2 v3.4S, v25.4S, v25.4S +// xtn v26.2S, v0.2D +// xtn v22.2S, v25.2D +// rev64 v24.4S, v25.4S +// umull v19.2D, v26.2S, v22.2S +// umull v25.2D, v26.2S, v3.2S +// uzp2 v20.4S, v0.4S, v0.4S +// mul v0.4S, v24.4S, v0.4S +// usra v25.2D, v19.2D, #32 +// umull v6.2D, v20.2S, v3.2S +// uaddlp v0.2D, v0.4S +// and v18.16B, v25.16B, v18.16B +// umlal v18.2D, v20.2S, v22.2S +// shl v0.2D, v0.2D, #32 +// usra v6.2D, v25.2D, #32 +// umlal v0.2D, v26.2S, v22.2S +// usra v6.2D, v18.2D, #32 +// mov x23, v0.d[0] +// mov x16, v0.d[1] +// mul x5, x3, x12 +// mul x21, x25, x6 +// mov x19, v6.d[0] +// adds x16, x16, x19 +// mov x19, v6.d[1] +// adcs x5, x5, x19 +// umulh x19, x3, x12 +// adcs x21, x21, x19 +// umulh x19, x25, x6 +// adc x19, x19, xzr +// adds x8, x16, x23 +// adcs x16, x5, x16 +// adcs x5, x21, x5 +// adcs x21, x19, x21 +// adc x19, xzr, x19 +// adds x11, x16, x23 +// adcs x15, x5, x8 +// adcs x16, x21, x16 +// adcs x5, x19, x5 +// adcs x21, xzr, x21 +// adc x19, xzr, x19 +// subs x20, x3, x25 +// cneg x20, x20, cc +// csetm x9, cc +// subs x13, x6, x12 +// cneg x13, x13, cc +// mul x26, x20, x13 +// umulh x20, x20, x13 +// cinv x9, x9, cc +// cmn x9, #0x1 +// eor x13, x26, x9 +// adcs x5, x5, x13 +// eor x20, x20, x9 +// adcs x21, x21, x20 +// adc x19, x19, x9 +// subs x20, x14, x7 +// cneg x20, x20, cc +// csetm x9, cc +// subs x13, x24, x10 +// cneg x13, x13, cc +// mul x26, x20, x13 +// umulh x20, x20, x13 +// cinv x9, x9, cc +// cmn x9, #0x1 +// eor x13, x26, x9 +// adcs x8, x8, x13 +// eor x20, x20, x9 +// adcs x11, x11, x20 +// adcs x15, x15, x9 +// adcs x16, x16, x9 +// adcs x5, x5, x9 +// adcs x21, x21, x9 +// adc x19, x19, x9 +// subs x20, x7, x25 +// cneg x20, x20, cc +// csetm x9, cc +// subs x13, x6, x24 +// cneg x13, x13, cc +// mul x26, x20, x13 +// umulh 
x20, x20, x13 +// cinv x9, x9, cc +// cmn x9, #0x1 +// eor x13, x26, x9 +// adcs x16, x16, x13 +// eor x20, x20, x9 +// adcs x5, x5, x20 +// adcs x21, x21, x9 +// adc x19, x19, x9 +// subs x20, x14, x3 +// cneg x20, x20, cc +// csetm x9, cc +// subs x13, x12, x10 +// cneg x13, x13, cc +// mul x26, x20, x13 +// umulh x20, x20, x13 +// cinv x9, x9, cc +// cmn x9, #0x1 +// eor x13, x26, x9 +// adcs x11, x11, x13 +// eor x20, x20, x9 +// adcs x15, x15, x20 +// adcs x16, x16, x9 +// adcs x5, x5, x9 +// adcs x21, x21, x9 +// adc x19, x19, x9 +// subs x25, x14, x25 +// cneg x25, x25, cc +// csetm x20, cc +// subs x10, x6, x10 +// cneg x10, x10, cc +// mul x6, x25, x10 +// umulh x25, x25, x10 +// cinv x10, x20, cc +// cmn x10, #0x1 +// eor x6, x6, x10 +// adcs x6, x15, x6 +// eor x25, x25, x10 +// adcs x25, x16, x25 +// adcs x16, x5, x10 +// adcs x5, x21, x10 +// adc x10, x19, x10 +// subs x7, x7, x3 +// cneg x7, x7, cc +// csetm x3, cc +// subs x24, x12, x24 +// cneg x24, x24, cc +// mul x12, x7, x24 +// umulh x7, x7, x24 +// cinv x3, x3, cc +// cmn x3, #0x1 +// eor x24, x12, x3 +// adcs x24, x6, x24 +// eor x7, x7, x3 +// adcs x7, x25, x7 +// adcs x25, x16, x3 +// adcs x12, x5, x3 +// adc x3, x10, x3 +// lsl x10, x23, #9 +// extr x6, x8, x23, #55 +// extr x23, x11, x8, #55 +// extr x16, x24, x11, #55 +// lsr x24, x24, #55 +// stp x7, x25, [sp] // @slothy:writes=stack0 +// stp x12, x3, [sp, #16] // @slothy:writes=stack16 +// stp x10, x6, [sp, #32] // @slothy:writes=stack32 +// stp x23, x16, [sp, #48] // @slothy:writes=stack48 +// str x24, [sp, #64] // @slothy:writes=stack64 +// ldp x7, x3, [x1, #32] +// ldr q0, [x1, #32] +// ldp x25, x10, [x1, #48] +// ldp x24, x12, [x2, #32] +// ldr q25, [x2, #32] +// ldp x6, x23, [x2, #48] +// ldr q18, [x1, #48] +// ldr q3, [x2, #48] +// uzp1 v26.4S, v25.4S, v0.4S +// rev64 v25.4S, v25.4S +// uzp1 v22.4S, v0.4S, v0.4S +// mul v0.4S, v25.4S, v0.4S +// uaddlp v0.2D, v0.4S +// shl v0.2D, v0.2D, #32 +// umlal v0.2D, v22.2S, v26.2S +// mov 
x16, v0.d[0] +// mov x5, v0.d[1] +// movi v0.2D, #0x00000000ffffffff +// uzp2 v25.4S, v3.4S, v3.4S +// xtn v26.2S, v18.2D +// xtn v22.2S, v3.2D +// rev64 v24.4S, v3.4S +// umull v19.2D, v26.2S, v22.2S +// umull v3.2D, v26.2S, v25.2S +// uzp2 v20.4S, v18.4S, v18.4S +// mul v18.4S, v24.4S, v18.4S +// usra v3.2D, v19.2D, #32 +// umull v6.2D, v20.2S, v25.2S +// uaddlp v25.2D, v18.4S +// and v0.16B, v3.16B, v0.16B +// umlal v0.2D, v20.2S, v22.2S +// shl v25.2D, v25.2D, #32 +// usra v6.2D, v3.2D, #32 +// umlal v25.2D, v26.2S, v22.2S +// usra v6.2D, v0.2D, #32 +// mov x21, v25.d[0] +// mov x19, v25.d[1] +// umulh x8, x7, x24 +// adds x5, x5, x8 +// umulh x8, x3, x12 +// adcs x21, x21, x8 +// mov x8, v6.d[0] +// adcs x19, x19, x8 +// mov x8, v6.d[1] +// adc x8, x8, xzr +// adds x11, x5, x16 +// adcs x5, x21, x5 +// adcs x21, x19, x21 +// adcs x19, x8, x19 +// adc x8, xzr, x8 +// adds x15, x5, x16 +// adcs x20, x21, x11 +// adcs x5, x19, x5 +// adcs x21, x8, x21 +// adcs x19, xzr, x19 +// adc x8, xzr, x8 +// subs x9, x25, x10 +// cneg x9, x9, cc +// csetm x13, cc +// subs x26, x23, x6 +// cneg x26, x26, cc +// mul x22, x9, x26 +// umulh x9, x9, x26 +// cinv x13, x13, cc +// cmn x13, #0x1 +// eor x26, x22, x13 +// adcs x21, x21, x26 +// eor x9, x9, x13 +// adcs x19, x19, x9 +// adc x8, x8, x13 +// subs x9, x7, x3 +// cneg x9, x9, cc +// csetm x13, cc +// subs x26, x12, x24 +// cneg x26, x26, cc +// mul x22, x9, x26 +// umulh x9, x9, x26 +// cinv x13, x13, cc +// cmn x13, #0x1 +// eor x26, x22, x13 +// adcs x11, x11, x26 +// eor x9, x9, x13 +// adcs x15, x15, x9 +// adcs x20, x20, x13 +// adcs x5, x5, x13 +// adcs x21, x21, x13 +// adcs x19, x19, x13 +// adc x8, x8, x13 +// subs x9, x3, x10 +// cneg x9, x9, cc +// csetm x13, cc +// subs x26, x23, x12 +// cneg x26, x26, cc +// mul x22, x9, x26 +// umulh x9, x9, x26 +// cinv x13, x13, cc +// cmn x13, #0x1 +// eor x26, x22, x13 +// adcs x5, x5, x26 +// eor x9, x9, x13 +// adcs x14, x21, x9 +// adcs x21, x19, x13 +// adc x19, x8, 
x13 +// subs x9, x7, x25 +// cneg x8, x9, cc +// csetm x9, cc +// subs x13, x6, x24 +// cneg x13, x13, cc +// mul x26, x8, x13 +// umulh x8, x8, x13 +// cinv x9, x9, cc +// cmn x9, #0x1 +// eor x13, x26, x9 +// adcs x15, x15, x13 +// eor x8, x8, x9 +// adcs x8, x20, x8 +// adcs x5, x5, x9 +// adcs x20, x14, x9 +// adcs x21, x21, x9 +// adc x19, x19, x9 +// subs x9, x7, x10 +// cneg x9, x9, cc +// csetm x13, cc +// subs x26, x23, x24 +// cneg x26, x26, cc +// mul x22, x9, x26 +// umulh x9, x9, x26 +// cinv x13, x13, cc +// cmn x13, #0x1 +// eor x26, x22, x13 +// adcs x8, x8, x26 +// eor x9, x9, x13 +// adcs x5, x5, x9 +// adcs x20, x20, x13 +// adcs x21, x21, x13 +// adc x19, x19, x13 +// subs x9, x3, x25 +// cneg x9, x9, cc +// csetm x13, cc +// subs x26, x6, x12 +// cneg x26, x26, cc +// mul x22, x9, x26 +// umulh x9, x9, x26 +// cinv x13, x13, cc +// cmn x13, #0x1 +// eor x26, x22, x13 +// adcs x8, x8, x26 +// eor x9, x9, x13 +// adcs x5, x5, x9 +// adcs x20, x20, x13 +// adcs x21, x21, x13 +// adc x19, x19, x13 +// ldp x9, x13, [sp] // @slothy:reads=stack0 +// adds x16, x16, x9 +// adcs x11, x11, x13 +// stp x16, x11, [sp] // @slothy:writes=stack0 +// ldp x16, x11, [sp, #16] // @slothy:reads=stack16 +// adcs x16, x15, x16 +// adcs x8, x8, x11 +// stp x16, x8, [sp, #16] // @slothy:writes=stack16 +// ldp x16, x8, [sp, #32] // @slothy:reads=stack32 +// adcs x16, x5, x16 +// adcs x5, x20, x8 +// stp x16, x5, [sp, #32] // @slothy:writes=stack32 +// ldp x16, x5, [sp, #48] // @slothy:reads=stack48 +// adcs x16, x21, x16 +// adcs x5, x19, x5 +// stp x16, x5, [sp, #48] // @slothy:writes=stack48 +// ldr x16, [sp, #64] // @slothy:reads=stack64 +// adc x16, x16, xzr +// str x16, [sp, #64] // @slothy:writes=stack64 +// ldp x16, x5, [x1] +// subs x7, x7, x16 +// sbcs x3, x3, x5 +// ldp x16, x5, [x1, #16] +// sbcs x25, x25, x16 +// sbcs x10, x10, x5 +// csetm x16, cc +// ldp x5, x21, [x2] +// subs x24, x5, x24 +// sbcs x12, x21, x12 +// ldp x5, x19, [x2, #16] +// sbcs x6, x5, 
x6 +// sbcs x23, x19, x23 +// csetm x5, cc +// eor x7, x7, x16 +// subs x7, x7, x16 +// eor x3, x3, x16 +// sbcs x3, x3, x16 +// eor x25, x25, x16 +// sbcs x25, x25, x16 +// eor x10, x10, x16 +// sbc x10, x10, x16 +// eor x24, x24, x5 +// subs x24, x24, x5 +// eor x12, x12, x5 +// sbcs x12, x12, x5 +// eor x6, x6, x5 +// sbcs x6, x6, x5 +// eor x23, x23, x5 +// sbc x23, x23, x5 +// eor x16, x5, x16 +// mul x21, x7, x24 +// mul x5, x3, x12 +// mul x19, x25, x6 +// mul x8, x10, x23 +// umulh x11, x7, x24 +// adds x5, x5, x11 +// umulh x11, x3, x12 +// adcs x19, x19, x11 +// umulh x11, x25, x6 +// adcs x8, x8, x11 +// umulh x11, x10, x23 +// adc x11, x11, xzr +// adds x15, x5, x21 +// adcs x5, x19, x5 +// adcs x19, x8, x19 +// adcs x8, x11, x8 +// adc x11, xzr, x11 +// adds x20, x5, x21 +// adcs x9, x19, x15 +// adcs x5, x8, x5 +// adcs x19, x11, x19 +// adcs x8, xzr, x8 +// adc x11, xzr, x11 +// subs x13, x25, x10 +// cneg x13, x13, cc +// csetm x26, cc +// subs x22, x23, x6 +// cneg x22, x22, cc +// mul x4, x13, x22 +// umulh x13, x13, x22 +// cinv x26, x26, cc +// cmn x26, #0x1 +// eor x22, x4, x26 +// adcs x19, x19, x22 +// eor x13, x13, x26 +// adcs x8, x8, x13 +// adc x11, x11, x26 +// subs x13, x7, x3 +// cneg x13, x13, cc +// csetm x26, cc +// subs x22, x12, x24 +// cneg x22, x22, cc +// mul x4, x13, x22 +// umulh x13, x13, x22 +// cinv x26, x26, cc +// cmn x26, #0x1 +// eor x22, x4, x26 +// adcs x15, x15, x22 +// eor x13, x13, x26 +// adcs x20, x20, x13 +// adcs x9, x9, x26 +// adcs x5, x5, x26 +// adcs x19, x19, x26 +// adcs x8, x8, x26 +// adc x11, x11, x26 +// subs x13, x3, x10 +// cneg x13, x13, cc +// csetm x26, cc +// subs x22, x23, x12 +// cneg x22, x22, cc +// mul x4, x13, x22 +// umulh x13, x13, x22 +// cinv x26, x26, cc +// cmn x26, #0x1 +// eor x22, x4, x26 +// adcs x5, x5, x22 +// eor x13, x13, x26 +// adcs x19, x19, x13 +// adcs x8, x8, x26 +// adc x11, x11, x26 +// subs x13, x7, x25 +// cneg x13, x13, cc +// csetm x26, cc +// subs x22, x6, x24 
+// cneg x22, x22, cc +// mul x4, x13, x22 +// umulh x13, x13, x22 +// cinv x26, x26, cc +// cmn x26, #0x1 +// eor x22, x4, x26 +// adcs x20, x20, x22 +// eor x13, x13, x26 +// adcs x9, x9, x13 +// adcs x5, x5, x26 +// adcs x19, x19, x26 +// adcs x8, x8, x26 +// adc x11, x11, x26 +// subs x7, x7, x10 +// cneg x7, x7, cc +// csetm x10, cc +// subs x24, x23, x24 +// cneg x24, x24, cc +// mul x23, x7, x24 +// umulh x7, x7, x24 +// cinv x10, x10, cc +// cmn x10, #0x1 +// eor x24, x23, x10 +// adcs x24, x9, x24 +// eor x7, x7, x10 +// adcs x7, x5, x7 +// adcs x23, x19, x10 +// adcs x5, x8, x10 +// adc x10, x11, x10 +// subs x3, x3, x25 +// cneg x3, x3, cc +// csetm x25, cc +// subs x12, x6, x12 +// cneg x12, x12, cc +// mul x6, x3, x12 +// umulh x3, x3, x12 +// cinv x25, x25, cc +// cmn x25, #0x1 +// eor x12, x6, x25 +// adcs x24, x24, x12 +// eor x3, x3, x25 +// adcs x7, x7, x3 +// adcs x3, x23, x25 +// adcs x12, x5, x25 +// adc x25, x10, x25 +// ldp x10, x6, [sp] // @slothy:reads=stack0 +// ldp x23, x5, [sp, #16] // @slothy:reads=stack16 +// eor x21, x21, x16 +// adds x21, x21, x10 +// eor x19, x15, x16 +// adcs x19, x19, x6 +// eor x8, x20, x16 +// adcs x8, x8, x23 +// eor x24, x24, x16 +// adcs x24, x24, x5 +// eor x7, x7, x16 +// ldp x11, x15, [sp, #32] // @slothy:reads=stack32 +// ldp x20, x9, [sp, #48] // @slothy:reads=stack48 +// ldr x13, [sp, #64] // @slothy:reads=stack64 +// adcs x7, x7, x11 +// eor x3, x3, x16 +// adcs x3, x3, x15 +// eor x12, x12, x16 +// adcs x12, x12, x20 +// eor x25, x25, x16 +// adcs x25, x25, x9 +// adc x26, x13, xzr +// adds x7, x7, x10 +// adcs x3, x3, x6 +// adcs x10, x12, x23 +// adcs x25, x25, x5 +// and x12, x16, #0x1ff +// lsl x6, x21, #9 +// orr x12, x6, x12 +// adcs x12, x11, x12 +// extr x6, x19, x21, #55 +// adcs x6, x15, x6 +// extr x23, x8, x19, #55 +// adcs x23, x20, x23 +// extr x16, x24, x8, #55 +// adcs x16, x9, x16 +// lsr x24, x24, #55 +// adc x24, x24, x13 +// ldr x5, [x2, #64] +// ldp x21, x19, [x1] +// and x8, x21, 
#0xfffffffffffff +// mul x8, x5, x8 +// ldr x11, [x1, #64] +// ldp x15, x20, [x2] +// and x9, x15, #0xfffffffffffff +// mul x9, x11, x9 +// add x8, x8, x9 +// extr x21, x19, x21, #52 +// and x21, x21, #0xfffffffffffff +// mul x21, x5, x21 +// extr x15, x20, x15, #52 +// and x15, x15, #0xfffffffffffff +// mul x15, x11, x15 +// add x21, x21, x15 +// lsr x15, x8, #52 +// add x21, x21, x15 +// lsl x8, x8, #12 +// extr x8, x21, x8, #12 +// adds x7, x7, x8 +// ldp x8, x15, [x1, #16] +// ldp x9, x13, [x2, #16] +// extr x19, x8, x19, #40 +// and x19, x19, #0xfffffffffffff +// mul x19, x5, x19 +// extr x20, x9, x20, #40 +// and x20, x20, #0xfffffffffffff +// mul x20, x11, x20 +// add x19, x19, x20 +// lsr x20, x21, #52 +// add x19, x19, x20 +// lsl x21, x21, #12 +// extr x21, x19, x21, #24 +// adcs x3, x3, x21 +// extr x21, x15, x8, #28 +// and x21, x21, #0xfffffffffffff +// mul x21, x5, x21 +// extr x8, x13, x9, #28 +// and x8, x8, #0xfffffffffffff +// mul x8, x11, x8 +// add x21, x21, x8 +// lsr x8, x19, #52 +// add x21, x21, x8 +// lsl x19, x19, #12 +// extr x19, x21, x19, #36 +// adcs x10, x10, x19 +// and x19, x3, x10 +// ldp x8, x20, [x1, #32] +// ldp x9, x22, [x2, #32] +// extr x15, x8, x15, #16 +// and x15, x15, #0xfffffffffffff +// mul x4, x5, x15 +// extr x15, x9, x13, #16 +// and x15, x15, #0xfffffffffffff +// mul x15, x11, x15 +// add x15, x4, x15 +// lsl x13, x26, #48 +// add x15, x15, x13 +// lsr x13, x21, #52 +// add x15, x15, x13 +// lsl x21, x21, #12 +// extr x21, x15, x21, #48 +// adcs x25, x25, x21 +// and x21, x19, x25 +// lsr x19, x8, #4 +// and x19, x19, #0xfffffffffffff +// mul x19, x5, x19 +// lsr x26, x9, #4 +// and x13, x26, #0xfffffffffffff +// mul x26, x11, x13 +// add x19, x19, x26 +// lsr x13, x15, #52 +// add x19, x19, x13 +// lsl x15, x15, #12 +// extr x15, x19, x15, #60 +// extr x8, x20, x8, #56 +// and x8, x8, #0xfffffffffffff +// mul x8, x5, x8 +// extr x9, x22, x9, #56 +// and x9, x9, #0xfffffffffffff +// mul x9, x11, x9 +// add x8, x8, 
x9 +// lsr x19, x19, #52 +// add x19, x8, x19 +// lsl x8, x15, #8 +// extr x8, x19, x8, #8 +// adcs x12, x12, x8 +// and x21, x21, x12 +// ldp x1, x8, [x1, #48] +// ldp x2, x15, [x2, #48] +// extr x20, x1, x20, #44 +// and x20, x20, #0xfffffffffffff +// mul x20, x5, x20 +// extr x9, x2, x22, #44 +// and x9, x9, #0xfffffffffffff +// mul x9, x11, x9 +// add x20, x20, x9 +// lsr x9, x19, #52 +// add x22, x20, x9 +// lsl x19, x19, #12 +// extr x19, x22, x19, #20 +// adcs x6, x6, x19 +// and x21, x21, x6 +// extr x1, x8, x1, #32 +// and x1, x1, #0xfffffffffffff +// mul x1, x5, x1 +// extr x2, x15, x2, #32 +// and x2, x2, #0xfffffffffffff +// mul x2, x11, x2 +// add x2, x1, x2 +// lsr x1, x22, #52 +// add x2, x2, x1 +// lsl x1, x22, #12 +// extr x1, x2, x1, #32 +// adcs x23, x23, x1 +// and x21, x21, x23 +// lsr x1, x8, #20 +// mul x1, x5, x1 +// lsr x19, x15, #20 +// mul x19, x11, x19 +// add x1, x1, x19 +// lsr x19, x2, #52 +// add x19, x1, x19 +// lsl x2, x2, #12 +// extr x2, x19, x2, #44 +// adcs x16, x16, x2 +// and x2, x21, x16 +// mul x5, x5, x11 +// lsr x1, x19, #44 +// add x5, x5, x1 +// adc x24, x24, x5 +// lsr x5, x24, #9 +// orr x24, x24, #0xfffffffffffffe00 +// cmp xzr, xzr +// adcs xzr, x7, x5 +// adcs xzr, x2, xzr +// adcs xzr, x24, xzr +// adcs x7, x7, x5 +// adcs x2, x3, xzr +// adcs x10, x10, xzr +// adcs x25, x25, xzr +// adcs x12, x12, xzr +// adcs x6, x6, xzr +// adcs x23, x23, xzr +// adcs x16, x16, xzr +// adc x3, x24, xzr +// stp x2, x10, [x0] // @slothy:writes=buffer0 +// stp x25, x12, [x0, #16] // @slothy:writes=buffer16 +// stp x6, x23, [x0, #32] // @slothy:writes=buffer32 +// lsl x25, x7, #9 +// and x3, x3, #0x1ff +// orr x3, x3, x25 +// stp x16, x3, [x0, #48] // @slothy:writes=buffer48 +// lsr x14, x7, #55 +// str x14, [x0, #64] // @slothy:writes=buffer64 +// add sp, sp, #80 +// ldp x25, x26, [sp], #16 +// ldp x23, x24, [sp], #16 +// ldp x21, x22, [sp], #16 +// ldp x19, x20, [sp], #16 +// ret +// +// The bash script used for step 2 is as 
follows: +// +// # Store the assembly instructions except the last 'ret', +// # callee-register store/loads and add/sub sp #80 as, say, 'input.S'. +// export OUTPUTS="[hint_buffer0,hint_buffer16,hint_buffer32,hint_buffer48,hint_buffer64]" +// export RESERVED_REGS="[x18,x27,x28,x29,x30,sp,q8,q9,q10,q11,q12,q13,q14,q15,v8,v9,v10,v11,v12,v13,v14,v15]" +// /tools/external/slothy.sh input.S my_out_dir +// # my_out_dir/3.opt.s is the optimized assembly. Its output may differ +// # from this file since the sequence is non-deterministically chosen. +// # Please add 'ret' at the end of the output assembly. + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p521_neon) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p521_neon) + .text + .balign 4 + +S2N_BN_SYMBOL(bignum_montmul_p521_neon): + +// Save registers and make space for the temporary buffer + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + sub sp, sp, #80 + + ldr q24, [x2] + ldr q21, [x1] + ldr q1, [x2, #48] + ldp x23, x20, [x1, #16] + movi v18.2D, #0x00000000ffffffff + ldp x19, x17, [x2, #16] + uzp2 v3.4S, v24.4S, v24.4S + xtn v6.2S, v21.2D + ldp x11, x22, [x1] + rev64 v5.4S, v24.4S + xtn v24.2S, v24.2D + subs x16, x23, x20 + umull v29.2D, v6.2S, v3.2S + rev64 v31.4S, v1.4S + cneg x26, x16, cc + umull v27.2D, v6.2S, v24.2S + ldr q19, [x1, #48] + csetm x12, cc + mul x15, x20, x17 + mul v26.4S, v5.4S, v21.4S + uzp2 v28.4S, v21.4S, v21.4S + subs x6, x17, x19 + xtn v7.2S, v1.2D + cinv x10, x12, cc + cneg x3, x6, cc + uzp2 v21.4S, v1.4S, v1.4S + umull v1.2D, v28.2S, v3.2S + mul x12, x26, x3 + usra v29.2D, v27.2D, #32 + mul v25.4S, v31.4S, v19.4S + usra v1.2D, v29.2D, #32 + uaddlp v31.2D, v26.4S + umulh x14, x26, x3 + eor x12, x12, x10 + and v26.16B, v29.16B, v18.16B + uaddlp v2.2D, v25.4S + subs x16, x11, x22 + shl v0.2D, v31.2D, #32 + xtn v31.2S, v19.2D + cneg x6, x16, cc + shl v16.2D, v2.2D, #32 + umlal v26.2D, v28.2S, v24.2S + umlal v0.2D, v6.2S, v24.2S + 
uzp2 v30.4S, v19.4S, v19.4S + umulh x26, x20, x17 + umull v22.2D, v31.2S, v21.2S + umull v29.2D, v30.2S, v21.2S + usra v1.2D, v26.2D, #32 + mul x13, x23, x19 + eor x9, x14, x10 + ldr q5, [x2, #32] + umull v26.2D, v31.2S, v7.2S + ldp x21, x4, [x2] + csetm x8, cc + mov x16, v0.d[1] + ldr q6, [x1, #32] + umlal v16.2D, v31.2S, v7.2S + mov x3, v0.d[0] + umulh x14, x23, x19 + mov x25, v1.d[1] + mov x5, v1.d[0] + usra v22.2D, v26.2D, #32 + rev64 v3.4S, v5.4S + adds x16, x16, x5 + uzp1 v24.4S, v5.4S, v6.4S + movi v26.2D, #0x00000000ffffffff + adcs x7, x13, x25 + uzp1 v0.4S, v6.4S, v6.4S + mul v5.4S, v3.4S, v6.4S + adcs x25, x15, x14 + adc x13, x26, xzr + adds x26, x16, x3 + and v6.16B, v22.16B, v26.16B + usra v29.2D, v22.2D, #32 + adcs x16, x7, x16 + adcs x14, x25, x7 + umlal v6.2D, v30.2S, v7.2S + adcs x7, x13, x25 + uaddlp v7.2D, v5.4S + adc x13, xzr, x13 + adds x25, x16, x3 + adcs x24, x14, x26 + shl v1.2D, v7.2D, #32 + adcs x5, x7, x16 + usra v29.2D, v6.2D, #32 + adcs x16, x13, x14 + umlal v1.2D, v0.2S, v24.2S + adcs x14, xzr, x7 + adc x13, xzr, x13 + subs x7, x4, x21 + cneg x7, x7, cc + mul x15, x6, x7 + umulh x7, x6, x7 + cinv x6, x8, cc + cmn x10, #0x1 + adcs x16, x16, x12 + eor x8, x15, x6 + adcs x14, x14, x9 + adc x9, x13, x10 + subs x13, x22, x20 + cneg x13, x13, cc + csetm x10, cc + subs x12, x17, x4 + cinv x15, x10, cc + cneg x10, x12, cc + cmn x6, #0x1 + umulh x12, x13, x10 + eor x7, x7, x6 + adcs x26, x26, x8 + adcs x7, x25, x7 + adcs x8, x24, x6 + adcs x24, x5, x6 + adcs x25, x16, x6 + mul x5, x13, x10 + adcs x13, x14, x6 + adc x14, x9, x6 + subs x10, x11, x23 + csetm x16, cc + cneg x9, x10, cc + subs x6, x19, x21 + cinv x10, x16, cc + cneg x16, x6, cc + eor x5, x5, x15 + subs x20, x11, x20 + mul x6, x9, x16 + csetm x11, cc + cneg x20, x20, cc + subs x17, x17, x21 + cneg x17, x17, cc + cinv x11, x11, cc + umulh x9, x9, x16 + eor x16, x12, x15 + subs x21, x22, x23 + cneg x22, x21, cc + eor x12, x6, x10 + csetm x6, cc + cmn x15, #0x1 + eor x9, x9, x10 + adcs 
x5, x24, x5 + umulh x23, x20, x17 + lsl x24, x3, #9 + adcs x25, x25, x16 + adcs x21, x13, x15 + adc x16, x14, x15 + subs x13, x19, x4 + cneg x14, x13, cc + cinv x15, x6, cc + cmn x10, #0x1 + mul x13, x20, x17 + extr x17, x26, x3, #55 + adcs x12, x7, x12 + adcs x8, x8, x9 + eor x19, x23, x11 + adcs x6, x5, x10 + eor x13, x13, x11 + mov x5, v29.d[0] + adcs x25, x25, x10 + extr x26, x12, x26, #55 + mul x4, x22, x14 + adcs x7, x21, x10 + stp x24, x17, [sp, #32] + ldp x20, x21, [x1, #48] + adc x24, x16, x10 + cmn x11, #0x1 + mov x16, v16.d[0] + umulh x17, x22, x14 + adcs x13, x8, x13 + eor x9, x4, x15 + adcs x10, x6, x19 + ldp x22, x23, [x1, #32] + adcs x3, x25, x11 + ldp x4, x19, [x2, #32] + eor x17, x17, x15 + adcs x7, x7, x11 + adc x14, x24, x11 + subs x6, x20, x21 + csetm x11, cc + cneg x8, x6, cc + cmn x15, #0x1 + umulh x25, x22, x4 + adcs x24, x13, x9 + adcs x10, x10, x17 + extr x13, x24, x12, #55 + adcs x9, x3, x15 + ldp x17, x3, [x2, #48] + umulh x6, x23, x19 + adcs x7, x7, x15 + adc x14, x14, x15 + subs x12, x22, x23 + stp x10, x9, [sp] + mov x9, v1.d[1] + csetm x10, cc + stp x7, x14, [sp, #16] + cneg x12, x12, cc + subs x14, x3, x17 + mov x7, v16.d[1] + cneg x15, x14, cc + mov x14, v29.d[1] + cinv x11, x11, cc + adds x9, x9, x25 + mul x25, x8, x15 + stp x26, x13, [sp, #48] + lsr x24, x24, #55 + adcs x26, x16, x6 + mov x13, v1.d[0] + str x24, [sp, #64] + adcs x7, x7, x5 + adc x5, x14, xzr + umulh x6, x8, x15 + eor x15, x25, x11 + subs x25, x19, x4 + cinv x16, x10, cc + cneg x10, x25, cc + eor x6, x6, x11 + adds x8, x9, x13 + adcs x14, x26, x9 + mul x9, x12, x10 + adcs x24, x7, x26 + adcs x7, x5, x7 + umulh x25, x12, x10 + adc x12, xzr, x5 + adds x26, x14, x13 + eor x10, x9, x16 + adcs x9, x24, x8 + adcs x5, x7, x14 + adcs x14, x12, x24 + adcs x7, xzr, x7 + adc x12, xzr, x12 + eor x24, x25, x16 + cmn x11, #0x1 + adcs x25, x14, x15 + adcs x14, x7, x6 + adc x11, x12, x11 + subs x12, x23, x21 + csetm x15, cc + cneg x7, x12, cc + subs x12, x3, x19 + cneg x12, x12, 
cc + cinv x15, x15, cc + cmn x16, #0x1 + adcs x6, x8, x10 + mul x10, x7, x12 + adcs x26, x26, x24 + adcs x9, x9, x16 + umulh x24, x7, x12 + eor x8, x10, x15 + adcs x5, x5, x16 + adcs x25, x25, x16 + adcs x7, x14, x16 + adc x16, x11, x16 + subs x11, x22, x20 + cneg x11, x11, cc + csetm x14, cc + subs x10, x17, x4 + cinv x14, x14, cc + cneg x10, x10, cc + cmn x15, #0x1 + eor x12, x24, x15 + adcs x5, x5, x8 + mul x24, x11, x10 + adcs x8, x25, x12 + adcs x25, x7, x15 + adc x16, x16, x15 + subs x12, x22, x21 + umulh x10, x11, x10 + cneg x15, x12, cc + csetm x11, cc + subs x12, x3, x4 + cneg x12, x12, cc + cinv x7, x11, cc + mul x11, x15, x12 + eor x24, x24, x14 + cmn x14, #0x1 + eor x10, x10, x14 + adcs x24, x26, x24 + eor x26, x11, x7 + adcs x10, x9, x10 + ldp x11, x9, [x1, #16] + umulh x15, x15, x12 + adcs x5, x5, x14 + adcs x8, x8, x14 + adcs x25, x25, x14 + adc x12, x16, x14 + cmn x7, #0x1 + adcs x16, x10, x26 + eor x14, x15, x7 + adcs x26, x5, x14 + ldp x5, x10, [x1] + adcs x14, x8, x7 + adcs x15, x25, x7 + adc x7, x12, x7 + subs x25, x23, x20 + cneg x25, x25, cc + csetm x8, cc + subs x22, x22, x5 + sbcs x10, x23, x10 + ldp x23, x12, [x2] + sbcs x20, x20, x11 + sbcs x21, x21, x9 + csetm x9, cc + subs x11, x17, x19 + cneg x5, x11, cc + cinv x11, x8, cc + subs x23, x23, x4 + sbcs x19, x12, x19 + eor x20, x20, x9 + ldp x12, x4, [x2, #16] + eor x21, x21, x9 + umulh x8, x25, x5 + eor x22, x22, x9 + eor x10, x10, x9 + sbcs x17, x12, x17 + sbcs x3, x4, x3 + mul x25, x25, x5 + csetm x12, cc + subs x22, x22, x9 + eor x4, x23, x12 + sbcs x23, x10, x9 + eor x10, x3, x12 + sbcs x20, x20, x9 + eor x5, x8, x11 + eor x3, x19, x12 + sbc x21, x21, x9 + subs x4, x4, x12 + eor x25, x25, x11 + sbcs x19, x3, x12 + eor x3, x17, x12 + sbcs x17, x3, x12 + umulh x8, x23, x19 + sbc x3, x10, x12 + cmn x11, #0x1 + adcs x25, x16, x25 + adcs x26, x26, x5 + ldp x10, x5, [sp] + adcs x16, x14, x11 + mul x14, x22, x4 + adcs x15, x15, x11 + adc x7, x7, x11 + adds x11, x13, x10 + umulh x10, x21, x3 + 
adcs x13, x6, x5 + ldp x6, x5, [sp, #16] + stp x11, x13, [sp] + eor x13, x12, x9 + mul x9, x23, x19 + adcs x6, x24, x6 + ldp x11, x24, [sp, #32] + mul x12, x20, x17 + adcs x25, x25, x5 + stp x6, x25, [sp, #16] + ldp x6, x25, [sp, #48] + umulh x5, x20, x17 + adcs x11, x26, x11 + ldr x26, [sp, #64] + adcs x16, x16, x24 + stp x11, x16, [sp, #32] + adcs x11, x15, x6 + umulh x24, x22, x4 + adcs x25, x7, x25 + adc x7, x26, xzr + stp x11, x25, [sp, #48] + subs x26, x20, x21 + csetm x15, cc + cneg x25, x26, cc + str x7, [sp, #64] + mul x11, x21, x3 + subs x6, x22, x23 + cneg x6, x6, cc + csetm x16, cc + subs x26, x3, x17 + cneg x26, x26, cc + cinv x7, x15, cc + adds x24, x9, x24 + adcs x8, x12, x8 + umulh x12, x25, x26 + adcs x5, x11, x5 + adc x11, x10, xzr + subs x15, x19, x4 + cinv x9, x16, cc + mul x26, x25, x26 + eor x25, x12, x7 + cneg x12, x15, cc + adds x16, x24, x14 + eor x15, x26, x7 + umulh x26, x6, x12 + adcs x10, x8, x24 + adcs x8, x5, x8 + adcs x24, x11, x5 + adc x5, xzr, x11 + adds x11, x10, x14 + mul x12, x6, x12 + adcs x6, x8, x16 + eor x14, x14, x13 + adcs x10, x24, x10 + adcs x8, x5, x8 + adcs x24, xzr, x24 + adc x5, xzr, x5 + cmn x7, #0x1 + adcs x15, x8, x15 + adcs x24, x24, x25 + eor x25, x26, x9 + adc x8, x5, x7 + eor x5, x12, x9 + subs x26, x23, x21 + cneg x12, x26, cc + csetm x26, cc + subs x7, x3, x19 + cneg x7, x7, cc + cinv x26, x26, cc + cmn x9, #0x1 + adcs x5, x16, x5 + mul x16, x12, x7 + adcs x25, x11, x25 + umulh x7, x12, x7 + adcs x12, x6, x9 + eor x11, x16, x26 + adcs x6, x10, x9 + adcs x10, x15, x9 + adcs x24, x24, x9 + adc x8, x8, x9 + subs x15, x22, x20 + cneg x15, x15, cc + csetm x9, cc + subs x16, x17, x4 + cneg x16, x16, cc + cinv x9, x9, cc + subs x21, x22, x21 + mul x22, x15, x16 + eor x7, x7, x26 + cneg x21, x21, cc + umulh x16, x15, x16 + csetm x15, cc + subs x4, x3, x4 + cneg x3, x4, cc + eor x4, x22, x9 + cinv x15, x15, cc + cmn x26, #0x1 + eor x22, x5, x13 + adcs x5, x6, x11 + adcs x6, x10, x7 + adcs x10, x24, x26 + eor x11, 
x16, x9 + adc x8, x8, x26 + subs x16, x23, x20 + cneg x7, x16, cc + csetm x23, cc + cmn x9, #0x1 + adcs x16, x25, x4 + mul x4, x21, x3 + adcs x24, x12, x11 + eor x11, x16, x13 + adcs x26, x5, x9 + adcs x16, x6, x9 + umulh x20, x21, x3 + adcs x6, x10, x9 + ldp x3, x10, [x1] + adc x12, x8, x9 + subs x21, x17, x19 + cneg x8, x21, cc + eor x25, x20, x15 + eor x20, x4, x15 + mul x19, x7, x8 + cinv x17, x23, cc + cmn x15, #0x1 + adcs x4, x24, x20 + extr x21, x10, x3, #52 + umulh x9, x7, x8 + and x24, x21, #0xfffffffffffff + adcs x26, x26, x25 + eor x7, x19, x17 + adcs x5, x16, x15 + and x23, x3, #0xfffffffffffff + eor x9, x9, x17 + adcs x21, x6, x15 + adc x6, x12, x15 + cmn x17, #0x1 + adcs x25, x4, x7 + and x4, x13, #0x1ff + ldp x16, x8, [sp] + adcs x20, x26, x9 + adcs x12, x5, x17 + ldp x3, x5, [sp, #16] + eor x15, x12, x13 + adcs x12, x21, x17 + adc x9, x6, x17 + adds x21, x14, x16 + lsl x7, x21, #9 + eor x26, x12, x13 + ldp x19, x17, [sp, #32] + orr x4, x7, x4 + eor x14, x25, x13 + adcs x7, x22, x8 + adcs x12, x11, x3 + eor x11, x20, x13 + ldp x6, x25, [sp, #48] + eor x20, x9, x13 + adcs x22, x14, x5 + ldr x14, [x2, #64] + adcs x9, x11, x19 + ldr x11, [sp, #64] + adcs x13, x15, x17 + adcs x26, x26, x6 + adcs x20, x20, x25 + adc x15, x11, xzr + adds x16, x9, x16 + mul x9, x14, x23 + adcs x23, x13, x8 + extr x13, x7, x21, #55 + adcs x21, x26, x3 + ldp x3, x26, [x1, #16] + extr x8, x22, x12, #55 + adcs x20, x20, x5 + adcs x19, x19, x4 + mul x4, x14, x24 + ldp x5, x24, [x2] + adcs x17, x17, x13 + extr x13, x26, x3, #28 + extr x10, x3, x10, #40 + extr x7, x12, x7, #55 + and x12, x13, #0xfffffffffffff + adcs x3, x6, x7 + ldr x6, [x1, #64] + extr x7, x24, x5, #52 + and x5, x5, #0xfffffffffffff + mul x12, x14, x12 + adcs x13, x25, x8 + and x7, x7, #0xfffffffffffff + ldp x8, x25, [x2, #16] + mul x5, x6, x5 + extr x24, x8, x24, #40 + and x24, x24, #0xfffffffffffff + add x9, x9, x5 + lsr x5, x22, #55 + mul x7, x6, x7 + extr x22, x25, x8, #28 + and x10, x10, #0xfffffffffffff + 
mul x10, x14, x10 + lsr x8, x9, #52 + lsl x9, x9, #12 + add x7, x4, x7 + adc x4, x5, x11 + ldp x11, x5, [x2, #32] + add x8, x7, x8 + and x7, x22, #0xfffffffffffff + extr x22, x8, x9, #12 + lsl x9, x15, #48 + mul x15, x6, x24 + add x10, x10, x15 + lsr x15, x8, #52 + extr x25, x11, x25, #16 + and x25, x25, #0xfffffffffffff + mul x24, x6, x7 + add x7, x10, x15 + lsr x10, x7, #52 + lsl x8, x8, #12 + extr x8, x7, x8, #24 + adds x22, x16, x22 + ldp x16, x15, [x1, #32] + adcs x23, x23, x8 + extr x8, x5, x11, #56 + mul x25, x6, x25 + add x24, x12, x24 + add x12, x24, x10 + lsr x10, x16, #4 + lsl x7, x7, #12 + extr x24, x12, x7, #36 + and x10, x10, #0xfffffffffffff + extr x26, x16, x26, #16 + mul x10, x14, x10 + and x8, x8, #0xfffffffffffff + adcs x21, x21, x24 + and x7, x26, #0xfffffffffffff + mul x7, x14, x7 + lsr x24, x11, #4 + and x24, x24, #0xfffffffffffff + extr x11, x15, x16, #56 + lsl x26, x12, #12 + and x16, x11, #0xfffffffffffff + mul x11, x6, x24 + lsr x12, x12, #52 + ldp x2, x24, [x2, #48] + add x25, x7, x25 + add x25, x25, x9 + and x9, x23, x21 + mul x8, x6, x8 + add x12, x25, x12 + add x25, x10, x11 + extr x11, x12, x26, #48 + ldp x7, x26, [x1, #48] + extr x5, x2, x5, #44 + lsr x1, x12, #52 + mul x10, x14, x16 + lsr x16, x24, #20 + add x10, x10, x8 + extr x8, x26, x7, #32 + and x8, x8, #0xfffffffffffff + extr x24, x24, x2, #32 + mul x2, x6, x16 + add x1, x25, x1 + lsr x25, x26, #20 + and x26, x24, #0xfffffffffffff + and x24, x5, #0xfffffffffffff + extr x16, x7, x15, #44 + mul x7, x6, x24 + adcs x11, x20, x11 + and x20, x16, #0xfffffffffffff + lsl x5, x12, #12 + and x15, x9, x11 + mul x24, x14, x20 + lsr x9, x1, #52 + add x20, x10, x9 + extr x12, x1, x5, #60 + lsl x9, x20, #12 + lsl x5, x12, #8 + mul x10, x14, x8 + extr x12, x20, x5, #8 + lsr x1, x20, #52 + add x7, x24, x7 + adcs x8, x19, x12 + and x5, x15, x8 + add x7, x7, x1 + mul x20, x6, x26 + extr x24, x7, x9, #20 + lsr x19, x7, #52 + mul x25, x14, x25 + lsl x16, x7, #12 + add x20, x10, x20 + adcs x12, 
x17, x24 + add x19, x20, x19 + lsr x26, x19, #52 + mul x24, x14, x6 + and x5, x5, x12 + add x6, x25, x2 + lsl x17, x19, #12 + add x14, x6, x26 + extr x16, x19, x16, #32 + lsr x6, x14, #44 + extr x19, x14, x17, #44 + add x9, x24, x6 + adcs x17, x3, x16 + adcs x2, x13, x19 + and x7, x5, x17 + adc x15, x4, x9 + cmp xzr, xzr + orr x1, x15, #0xfffffffffffffe00 + lsr x3, x15, #9 + adcs xzr, x22, x3 + and x15, x7, x2 + adcs xzr, x15, xzr + adcs xzr, x1, xzr + adcs x7, x22, x3 + lsl x3, x7, #9 + lsr x15, x7, #55 + str x15, [x0, #64] + adcs x13, x23, xzr + adcs x16, x21, xzr + stp x13, x16, [x0] + adcs x13, x11, xzr + adcs x16, x8, xzr + stp x13, x16, [x0, #16] + adcs x19, x12, xzr + adcs x16, x17, xzr + adcs x13, x2, xzr + stp x19, x16, [x0, #32] + adc x16, x1, xzr + and x16, x16, #0x1ff + orr x16, x16, x3 + stp x13, x16, [x0, #48] + +// Restore regs and return + + add sp, sp, #80 + ldp x25, x26, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/arm/p521/bignum_montsqr_p521_neon.S b/third_party/s2n-bignum/arm/p521/bignum_montsqr_p521_neon.S new file mode 100644 index 0000000000..57cf911615 --- /dev/null +++ b/third_party/s2n-bignum/arm/p521/bignum_montsqr_p521_neon.S @@ -0,0 +1,1124 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^576) mod p_521 +// Input x[9]; output z[9] +// +// extern void bignum_montsqr_p521_neon +// (uint64_t z[static 9], uint64_t x[static 9]); +// +// Does z := (x^2 / 2^576) mod p_521, assuming x < p_521. 
This means the +// Montgomery base is the "native size" 2^{9*64} = 2^576; since p_521 is +// a Mersenne prime the basic modular squaring bignum_sqr_p521 can be +// considered a Montgomery operation to base 2^521. +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + +// bignum_montsqr_p521_neon is functionally equivalent to bignum_montsqr_p521. +// It is written in a way that +// 1. A subset of scalar multiplications in bignum_montmul_p384 are carefully +// chosen and vectorized +// 2. The vectorized assembly is rescheduled using the SLOTHY superoptimizer. +// https://github.com/slothy-optimizer/slothy +// +// The output program of step 1. is as follows: +// +// stp x19, x20, [sp, #-16]! +// stp x21, x22, [sp, #-16]! +// stp x23, x24, [sp, #-16]! +// ldp x16, x8, [x1] +// ldr q18, [x1] +// ldr q5, [x1] +// ldr q20, [x1] +// ldp x17, x13, [x1, #16] +// ldr q17, [x1, #16] +// ldr q1, [x1, #16] +// ldr q28, [x1, #16] +// ldp x9, x15, [x1, #32] +// ldr q27, [x1] +// ldr q29, [x1, #32] +// ldp x23, x2, [x1, #48] +// ldr q6, [x1, #48] +// ldr q4, [x1, #48] +// mul x24, x9, x23 +// mul x11, x15, x2 +// umulh x20, x9, x23 +// subs x4, x9, x15 +// cneg x22, x4, cc +// csetm x12, cc +// subs x4, x2, x23 +// cneg x4, x4, cc +// mul x19, x22, x4 +// umulh x4, x22, x4 +// cinv x7, x12, cc +// eor x14, x19, x7 +// eor x22, x4, x7 +// adds x12, x24, x20 +// adc x19, x20, xzr +// umulh x4, x15, x2 +// adds x12, x12, x11 +// adcs x19, x19, x4 +// adc x4, x4, xzr +// adds x19, x19, x11 +// adc x4, x4, xzr +// cmn x7, #0x1 +// adcs x12, x12, x14 +// adcs x19, x19, x22 +// adc x4, x4, x7 +// adds x11, x24, x24 +// adcs x20, x12, x12 +// adcs x10, x19, x19 +// adcs x3, x4, x4 +// adc x5, xzr, xzr +// ldr q30, [x1, #32] +// umull v0.2D, v30.2S, v30.2S +// umull2 v2.2D, v30.4S, v30.4S +// xtn v24.2S, v30.2D +// uzp2 v30.4S, v30.4S, v30.4S +// umull v30.2D, v30.2S, v24.2S +// 
mov x7, v0.d[0] +// mov x14, v0.d[1] +// mov x19, v2.d[0] +// mov x22, v2.d[1] +// mov x4, v30.d[0] +// mov x12, v30.d[1] +// adds x21, x7, x4, lsl #33 +// lsr x4, x4, #31 +// adc x14, x14, x4 +// adds x19, x19, x12, lsl #33 +// lsr x4, x12, #31 +// adc x22, x22, x4 +// mul x4, x9, x15 +// umulh x12, x9, x15 +// adds x24, x14, x4, lsl #1 +// extr x4, x12, x4, #63 +// adcs x19, x19, x4 +// lsr x4, x12, #63 +// adc x4, x22, x4 +// adds x11, x11, x19 +// adcs x20, x20, x4 +// adcs x10, x10, xzr +// adcs x3, x3, xzr +// adc x6, x5, xzr +// movi v3.2D, #0x00000000ffffffff +// uzp2 v16.4S, v4.4S, v4.4S +// xtn v25.2S, v6.2D +// xtn v23.2S, v4.2D +// rev64 v30.4S, v4.4S +// umull v24.2D, v25.2S, v23.2S +// umull v0.2D, v25.2S, v16.2S +// uzp2 v2.4S, v6.4S, v6.4S +// mul v30.4S, v30.4S, v6.4S +// usra v0.2D, v24.2D, #32 +// umull v19.2D, v2.2S, v16.2S +// uaddlp v30.2D, v30.4S +// and v24.16B, v0.16B, v3.16B +// umlal v24.2D, v2.2S, v23.2S +// shl v30.2D, v30.2D, #32 +// usra v19.2D, v0.2D, #32 +// umlal v30.2D, v25.2S, v23.2S +// usra v19.2D, v24.2D, #32 +// mov x5, v30.d[0] +// mov x7, v30.d[1] +// mul x14, x23, x2 +// mov x19, v19.d[0] +// mov x4, v19.d[1] +// umulh x22, x23, x2 +// adds x12, x19, x14 +// adcs x19, x7, x22 +// adc x4, x4, xzr +// adds x12, x12, x14 +// adcs x19, x19, x22 +// adc x4, x4, xzr +// adds x7, x5, x10 +// adcs x3, x12, x3 +// adcs x14, x19, x6 +// adc x10, x4, xzr +// ldr x4, [x1, #64] +// add x6, x4, x4 +// mul x5, x4, x4 +// and x4, x16, #0xfffffffffffff +// mul x22, x6, x4 +// extr x4, x8, x16, #52 +// and x4, x4, #0xfffffffffffff +// mul x19, x6, x4 +// lsr x4, x22, #52 +// add x12, x19, x4 +// lsl x4, x22, #12 +// extr x4, x12, x4, #12 +// adds x21, x21, x4 +// extr x4, x17, x8, #40 +// and x4, x4, #0xfffffffffffff +// mul x19, x6, x4 +// lsr x4, x12, #52 +// add x22, x19, x4 +// lsl x4, x12, #12 +// extr x4, x22, x4, #24 +// adcs x24, x24, x4 +// extr x4, x13, x17, #28 +// and x4, x4, #0xfffffffffffff +// mul x19, x6, x4 +// lsr x4, x22, 
#52 +// add x12, x19, x4 +// lsl x4, x22, #12 +// extr x4, x12, x4, #36 +// adcs x11, x11, x4 +// extr x4, x9, x13, #16 +// and x4, x4, #0xfffffffffffff +// mul x19, x6, x4 +// lsr x4, x12, #52 +// add x22, x19, x4 +// lsl x4, x12, #12 +// extr x4, x22, x4, #48 +// adcs x20, x20, x4 +// lsr x4, x9, #4 +// and x4, x4, #0xfffffffffffff +// mul x19, x6, x4 +// lsr x4, x22, #52 +// add x12, x19, x4 +// lsl x4, x22, #12 +// extr x22, x12, x4, #60 +// extr x4, x15, x9, #56 +// and x4, x4, #0xfffffffffffff +// mul x19, x6, x4 +// lsr x4, x12, #52 +// add x12, x19, x4 +// lsl x4, x22, #8 +// extr x4, x12, x4, #8 +// adcs x7, x7, x4 +// extr x4, x23, x15, #44 +// and x4, x4, #0xfffffffffffff +// mul x19, x6, x4 +// lsr x4, x12, #52 +// add x22, x19, x4 +// lsl x4, x12, #12 +// extr x4, x22, x4, #20 +// adcs x1, x3, x4 +// extr x4, x2, x23, #32 +// and x4, x4, #0xfffffffffffff +// mul x19, x6, x4 +// lsr x4, x22, #52 +// add x12, x19, x4 +// lsl x4, x22, #12 +// extr x4, x12, x4, #32 +// adcs x14, x14, x4 +// lsr x4, x2, #20 +// mul x19, x6, x4 +// lsr x4, x12, #52 +// add x19, x19, x4 +// lsl x4, x12, #12 +// extr x4, x19, x4, #44 +// adcs x22, x10, x4 +// lsr x4, x19, #44 +// adc x12, x5, x4 +// extr x19, x24, x21, #9 +// extr x4, x11, x24, #9 +// stp x19, x4, [x0] // @slothy:writes=buffer0 +// extr x19, x20, x11, #9 +// extr x4, x7, x20, #9 +// stp x19, x4, [x0, #16] // @slothy:writes=buffer16 +// extr x19, x1, x7, #9 +// extr x4, x14, x1, #9 +// stp x19, x4, [x0, #32] // @slothy:writes=buffer32 +// extr x19, x22, x14, #9 +// extr x4, x12, x22, #9 +// stp x19, x4, [x0, #48] // @slothy:writes=buffer48 +// and x19, x21, #0x1ff +// lsr x4, x12, #9 +// add x4, x19, x4 +// str x4, [x0, #64] +// uzp1 v2.4S, v28.4S, v18.4S +// rev64 v30.4S, v28.4S +// uzp1 v24.4S, v18.4S, v18.4S +// mul v30.4S, v30.4S, v18.4S +// uaddlp v30.2D, v30.4S +// shl v30.2D, v30.2D, #32 +// umlal v30.2D, v24.2S, v2.2S +// mov x11, v30.d[0] +// mov x20, v30.d[1] +// umulh x7, x16, x17 +// subs x4, x16, 
x8 +// cneg x22, x4, cc +// csetm x12, cc +// subs x4, x13, x17 +// cneg x4, x4, cc +// mul x19, x22, x4 +// umulh x4, x22, x4 +// cinv x1, x12, cc +// eor x14, x19, x1 +// eor x22, x4, x1 +// adds x12, x11, x7 +// adc x19, x7, xzr +// umulh x4, x8, x13 +// adds x12, x12, x20 +// adcs x19, x19, x4 +// adc x4, x4, xzr +// adds x19, x19, x20 +// adc x4, x4, xzr +// cmn x1, #0x1 +// adcs x12, x12, x14 +// adcs x19, x19, x22 +// adc x4, x4, x1 +// adds x21, x11, x11 +// adcs x24, x12, x12 +// adcs x11, x19, x19 +// adcs x20, x4, x4 +// adc x7, xzr, xzr +// movi v3.2D, #0x00000000ffffffff +// uzp2 v16.4S, v20.4S, v20.4S +// xtn v25.2S, v5.2D +// xtn v23.2S, v20.2D +// rev64 v30.4S, v20.4S +// umull v24.2D, v25.2S, v23.2S +// umull v0.2D, v25.2S, v16.2S +// uzp2 v2.4S, v5.4S, v5.4S +// mul v30.4S, v30.4S, v5.4S +// usra v0.2D, v24.2D, #32 +// umull v19.2D, v2.2S, v16.2S +// uaddlp v30.2D, v30.4S +// and v24.16B, v0.16B, v3.16B +// umlal v24.2D, v2.2S, v23.2S +// shl v30.2D, v30.2D, #32 +// usra v19.2D, v0.2D, #32 +// umlal v30.2D, v25.2S, v23.2S +// usra v19.2D, v24.2D, #32 +// mov x10, v30.d[0] +// mov x1, v30.d[1] +// mul x14, x16, x8 +// mov x19, v19.d[0] +// mov x4, v19.d[1] +// umulh x22, x16, x8 +// adds x12, x19, x14 +// adcs x19, x1, x22 +// adc x4, x4, xzr +// adds x3, x12, x14 +// adcs x19, x19, x22 +// adc x4, x4, xzr +// adds x5, x21, x19 +// adcs x21, x24, x4 +// adcs x24, x11, xzr +// adcs x11, x20, xzr +// adc x20, x7, xzr +// movi v3.2D, #0x00000000ffffffff +// uzp2 v16.4S, v1.4S, v1.4S +// xtn v25.2S, v17.2D +// xtn v23.2S, v1.2D +// rev64 v30.4S, v1.4S +// umull v24.2D, v25.2S, v23.2S +// umull v0.2D, v25.2S, v16.2S +// uzp2 v2.4S, v17.4S, v17.4S +// mul v30.4S, v30.4S, v17.4S +// usra v0.2D, v24.2D, #32 +// umull v19.2D, v2.2S, v16.2S +// uaddlp v30.2D, v30.4S +// and v24.16B, v0.16B, v3.16B +// umlal v24.2D, v2.2S, v23.2S +// shl v30.2D, v30.2D, #32 +// usra v19.2D, v0.2D, #32 +// umlal v30.2D, v25.2S, v23.2S +// usra v19.2D, v24.2D, #32 +// mov x7, 
v30.d[0] +// mov x1, v30.d[1] +// mul x14, x17, x13 +// mov x19, v19.d[0] +// mov x4, v19.d[1] +// umulh x22, x17, x13 +// adds x12, x19, x14 +// adcs x19, x1, x22 +// adc x4, x4, xzr +// adds x12, x12, x14 +// adcs x19, x19, x22 +// adc x4, x4, xzr +// adds x1, x7, x24 +// adcs x14, x12, x11 +// adcs x22, x19, x20 +// adc x12, x4, xzr +// ldp x19, x4, [x0] // @slothy:reads=buffer0 +// adds x19, x19, x10 +// adcs x4, x4, x3 +// stp x19, x4, [x0] // @slothy:writes=buffer0 +// ldp x19, x4, [x0, #16] // @slothy:reads=buffer16 +// adcs x19, x19, x5 +// adcs x4, x4, x21 +// stp x19, x4, [x0, #16] // @slothy:writes=buffer16 +// ldp x19, x4, [x0, #32] // @slothy:reads=buffer32 +// adcs x19, x19, x1 +// adcs x4, x4, x14 +// stp x19, x4, [x0, #32] // @slothy:writes=buffer32 +// ldp x19, x4, [x0, #48] // @slothy:reads=buffer48 +// adcs x19, x19, x22 +// adcs x4, x4, x12 +// stp x19, x4, [x0, #48] // @slothy:writes=buffer48 +// ldr x4, [x0, #64] +// adc x4, x4, xzr +// str x4, [x0, #64] +// movi v3.2D, #0x00000000ffffffff +// uzp2 v2.4S, v29.4S, v29.4S +// xtn v16.2S, v27.2D +// xtn v25.2S, v29.2D +// rev64 v30.4S, v29.4S +// umull v24.2D, v16.2S, v25.2S +// umull v23.2D, v16.2S, v2.2S +// uzp2 v0.4S, v27.4S, v27.4S +// mul v30.4S, v30.4S, v27.4S +// usra v23.2D, v24.2D, #32 +// umull v2.2D, v0.2S, v2.2S +// uaddlp v30.2D, v30.4S +// and v24.16B, v23.16B, v3.16B +// umlal v24.2D, v0.2S, v25.2S +// shl v30.2D, v30.2D, #32 +// usra v2.2D, v23.2D, #32 +// umlal v30.2D, v16.2S, v25.2S +// usra v2.2D, v24.2D, #32 +// mov x6, v30.d[0] +// mov x22, v30.d[1] +// mul x12, x17, x23 +// mul x19, x13, x2 +// mov x4, v2.d[0] +// adds x22, x22, x4 +// mov x4, v2.d[1] +// adcs x12, x12, x4 +// umulh x4, x17, x23 +// adcs x19, x19, x4 +// umulh x4, x13, x2 +// adc x4, x4, xzr +// adds x21, x22, x6 +// adcs x22, x12, x22 +// adcs x12, x19, x12 +// adcs x19, x4, x19 +// adc x4, xzr, x4 +// adds x24, x22, x6 +// adcs x11, x12, x21 +// adcs x20, x19, x22 +// adcs x1, x4, x12 +// adcs x14, xzr, 
x19 +// adc x7, xzr, x4 +// subs x4, x17, x13 +// cneg x12, x4, cc +// csetm x22, cc +// subs x4, x2, x23 +// cneg x19, x4, cc +// mul x4, x12, x19 +// umulh x12, x12, x19 +// cinv x19, x22, cc +// cmn x19, #0x1 +// eor x4, x4, x19 +// adcs x1, x1, x4 +// eor x4, x12, x19 +// adcs x14, x14, x4 +// adc x7, x7, x19 +// subs x4, x16, x8 +// cneg x12, x4, cc +// csetm x22, cc +// subs x4, x15, x9 +// cneg x19, x4, cc +// mul x4, x12, x19 +// umulh x12, x12, x19 +// cinv x19, x22, cc +// cmn x19, #0x1 +// eor x4, x4, x19 +// adcs x10, x21, x4 +// eor x4, x12, x19 +// adcs x24, x24, x4 +// adcs x11, x11, x19 +// adcs x20, x20, x19 +// adcs x1, x1, x19 +// adcs x14, x14, x19 +// adc x7, x7, x19 +// subs x4, x8, x13 +// cneg x12, x4, cc +// csetm x22, cc +// subs x4, x2, x15 +// cneg x19, x4, cc +// mul x4, x12, x19 +// umulh x12, x12, x19 +// cinv x19, x22, cc +// cmn x19, #0x1 +// eor x4, x4, x19 +// adcs x20, x20, x4 +// eor x4, x12, x19 +// adcs x1, x1, x4 +// adcs x14, x14, x19 +// adc x7, x7, x19 +// subs x4, x16, x17 +// cneg x12, x4, cc +// csetm x22, cc +// subs x4, x23, x9 +// cneg x19, x4, cc +// mul x4, x12, x19 +// umulh x12, x12, x19 +// cinv x19, x22, cc +// cmn x19, #0x1 +// eor x4, x4, x19 +// adcs x24, x24, x4 +// eor x4, x12, x19 +// adcs x11, x11, x4 +// adcs x20, x20, x19 +// adcs x1, x1, x19 +// adcs x14, x14, x19 +// adc x7, x7, x19 +// subs x4, x16, x13 +// cneg x12, x4, cc +// csetm x22, cc +// subs x4, x2, x9 +// cneg x19, x4, cc +// mul x4, x12, x19 +// umulh x12, x12, x19 +// cinv x19, x22, cc +// cmn x19, #0x1 +// eor x4, x4, x19 +// adcs x11, x11, x4 +// eor x4, x12, x19 +// adcs x20, x20, x4 +// adcs x1, x1, x19 +// adcs x14, x14, x19 +// adc x7, x7, x19 +// subs x4, x8, x17 +// cneg x12, x4, cc +// csetm x22, cc +// subs x4, x23, x15 +// cneg x19, x4, cc +// mul x4, x12, x19 +// umulh x12, x12, x19 +// cinv x19, x22, cc +// cmn x19, #0x1 +// eor x4, x4, x19 +// adcs x3, x11, x4 +// eor x4, x12, x19 +// adcs x5, x20, x4 +// adcs x1, x1, x19 
+// adcs x14, x14, x19 +// adc x22, x7, x19 +// ldp x12, x19, [x0] // @slothy:reads=buffer0 +// extr x4, x1, x5, #8 +// adds x11, x4, x12 +// extr x4, x14, x1, #8 +// adcs x20, x4, x19 +// ldp x19, x12, [x0, #16] // @slothy:reads=buffer16 +// extr x4, x22, x14, #8 +// adcs x7, x4, x19 +// and x19, x20, x7 +// lsr x4, x22, #8 +// adcs x1, x4, x12 +// and x22, x19, x1 +// ldp x19, x12, [x0, #32] // @slothy:reads=buffer32 +// lsl x4, x6, #1 +// adcs x14, x4, x19 +// and x19, x22, x14 +// extr x4, x10, x6, #63 +// adcs x21, x4, x12 +// and x22, x19, x21 +// ldp x19, x12, [x0, #48] // @slothy:reads=buffer48 +// extr x4, x24, x10, #63 +// adcs x2, x4, x19 +// and x19, x22, x2 +// extr x4, x3, x24, #63 +// adcs x24, x4, x12 +// and x12, x19, x24 +// ldr x19, [x0, #64] +// extr x4, x5, x3, #63 +// and x4, x4, #0x1ff +// adc x4, x19, x4 +// lsr x19, x4, #9 +// orr x4, x4, #0xfffffffffffffe00 +// cmp xzr, xzr +// adcs xzr, x11, x19 +// adcs xzr, x12, xzr +// adcs xzr, x4, xzr +// adcs x11, x11, x19 +// adcs x20, x20, xzr +// adcs x7, x7, xzr +// adcs x1, x1, xzr +// adcs x14, x14, xzr +// adcs x22, x21, xzr +// adcs x12, x2, xzr +// adcs x24, x24, xzr +// adc x4, x4, xzr +// and x19, x4, #0x1ff +// lsl x4, x11, #9 +// extr x11, x20, x11, #55 +// extr x20, x7, x20, #55 +// extr x7, x1, x7, #55 +// extr x1, x14, x1, #55 +// orr x4, x19, x4 +// extr x14, x22, x14, #55 +// extr x22, x12, x22, #55 +// extr x12, x24, x12, #55 +// extr x19, x4, x24, #55 +// lsr x4, x4, #55 +// stp x11, x20, [x0] // @slothy:writes=buffer0 +// stp x7, x1, [x0, #16] // @slothy:writes=buffer16 +// stp x14, x22, [x0, #32] // @slothy:writes=buffer32 +// stp x12, x19, [x0, #48] // @slothy:writes=buffer48 +// str x4, [x0, #64] +// ldp x23, x24, [sp], #16 +// ldp x21, x22, [sp], #16 +// ldp x19, x20, [sp], #16 +// ret +// +// The bash script used for step 2 is as follows: +// +// # Store the assembly instructions except the last 'ret', +// # callee-register store/loads as, say, 'input.S'. 
+// export OUTPUTS="[hint_buffer0,hint_buffer16,hint_buffer32,hint_buffer48,hint_buffer64]" +// export RESERVED_REGS="[x18,x25,x26,x27,x28,x29,x30,sp,q8,q9,q10,q11,q12,q13,q14,q15,v8,v9,v10,v11,v12,v13,v14,v15]" +// /tools/external/slothy.sh input.S my_out_dir +// # my_out_dir/3.opt.s is the optimized assembly. Its output may differ +// # from this file since the sequence is non-deterministically chosen. +// # Please add 'ret' at the end of the output assembly. + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p521_neon) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p521_neon) + .text + .balign 4 + +S2N_BN_SYMBOL(bignum_montsqr_p521_neon): + +// Save registers + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + +// The optimized body + + ldr q31, [x1, #48] + ldp x9, x15, [x1, #32] + ldp x23, x2, [x1, #48] + ldr q0, [x1, #48] + ldr q29, [x1, #32] + rev64 v21.4S, v31.4S + umulh x13, x9, x23 + mul v23.4S, v21.4S, v0.4S + xtn v21.2S, v0.2D + uzp2 v19.4S, v31.4S, v31.4S + xtn v2.2S, v29.2D + xtn v30.2S, v31.2D + uzp2 v3.4S, v29.4S, v29.4S + umull v6.2D, v21.2S, v19.2S + mul x10, x9, x23 + uaddlp v23.2D, v23.4S + umull v22.2D, v21.2S, v30.2S + adds x22, x10, x13 + mul x17, x9, x15 + movi v25.2D, #0x00000000ffffffff + uzp2 v1.4S, v0.4S, v0.4S + adc x8, x13, xzr + subs x19, x9, x15 + umull v28.2D, v3.2S, v2.2S + shl v31.2D, v23.2D, #32 + csetm x5, cc + cneg x3, x19, cc + umull v19.2D, v1.2S, v19.2S + ldr q4, [x1, #16] + subs x24, x2, x23 + mul x6, x15, x2 + usra v6.2D, v22.2D, #32 + ldr q23, [x1] + cneg x13, x24, cc + umulh x24, x15, x2 + umull v5.2D, v29.2S, v29.2S + rev64 v3.4S, v4.4S + cinv x19, x5, cc + adds x16, x22, x6 + mov x14, v28.d[1] + umlal v31.2D, v21.2S, v30.2S + umull2 v17.2D, v29.4S, v29.4S + mov x20, v28.d[0] + mul v29.4S, v3.4S, v23.4S + and v22.16B, v6.16B, v25.16B + mul x5, x3, x13 + mov x4, v5.d[1] + mov x7, v5.d[0] + adcs x11, x8, x24 + ldr q5, [x1] + ldr q0, [x1] + adc x22, x24, xzr + adds x8, x11, x6 + usra 
v19.2D, v6.2D, #32 + umlal v22.2D, v1.2S, v30.2S + adc x11, x22, xzr + adds x21, x7, x20, lsl #33 + mov x24, v17.d[1] + mov x22, v17.d[0] + lsr x12, x20, #31 + uzp1 v2.4S, v4.4S, v23.4S + uzp1 v20.4S, v23.4S, v23.4S + usra v19.2D, v22.2D, #32 + adc x4, x4, x12 + lsr x6, x14, #31 + adds x20, x22, x14, lsl #33 + ldr q17, [x1, #16] + uzp2 v22.4S, v0.4S, v0.4S + eor x12, x5, x19 + umulh x7, x3, x13 + xtn v23.2S, v0.2D + adc x5, x24, x6 + cmn x19, #0x1 + xtn v25.2S, v5.2D + ldr q27, [x1] + adcs x16, x16, x12 + uaddlp v1.2D, v29.4S + umulh x3, x9, x15 + eor x13, x7, x19 + adcs x24, x8, x13 + adc x11, x11, x19 + adds x12, x10, x10 + adcs x13, x16, x16 + mul x19, x23, x2 + umull v21.2D, v25.2S, v23.2S + adcs x7, x24, x24 + ldp x16, x8, [x1] + umull v3.2D, v25.2S, v22.2S + uzp2 v6.4S, v5.4S, v5.4S + adcs x10, x11, x11 + ldr q29, [x1, #32] + adc x14, xzr, xzr + adds x24, x4, x17, lsl #1 + mov x4, v31.d[1] + shl v30.2D, v1.2D, #32 + lsr x6, x3, #63 + extr x11, x3, x17, #63 + ldr q1, [x1, #16] + mov x22, v19.d[1] + adcs x20, x20, x11 + umulh x3, x23, x2 + movi v4.2D, #0x00000000ffffffff + usra v3.2D, v21.2D, #32 + adc x5, x5, x6 + adds x11, x12, x20 + mov x6, v19.d[0] + umull v19.2D, v6.2S, v22.2S + adcs x20, x13, x5 + rev64 v22.4S, v0.4S + ldr x5, [x1, #64] + ldp x17, x13, [x1, #16] + adcs x7, x7, xzr + umlal v30.2D, v20.2S, v2.2S + adcs x12, x10, xzr + and x1, x16, #0xfffffffffffff + mul v22.4S, v22.4S, v5.4S + adc x14, x14, xzr + adds x6, x6, x19 + xtn v5.2S, v1.2D + adcs x10, x4, x3 + mov x4, v31.d[0] + adc x22, x22, xzr + adds x19, x6, x19 + add x6, x5, x5 + and v21.16B, v3.16B, v4.16B + adcs x10, x10, x3 + extr x3, x8, x16, #52 + mul x1, x6, x1 + usra v19.2D, v3.2D, #32 + adc x22, x22, xzr + adds x7, x4, x7 + umlal v21.2D, v6.2S, v23.2S + and x4, x3, #0xfffffffffffff + adcs x3, x19, x12 + uzp2 v28.4S, v1.4S, v1.4S + extr x19, x17, x8, #40 + mul x12, x6, x4 + adcs x14, x10, x14 + rev64 v4.4S, v1.4S + mul x5, x5, x5 + lsr x4, x9, #4 + adc x10, x22, xzr + lsl x22, x1, #12 + 
lsr x1, x1, #52 + add x12, x12, x1 + and x1, x19, #0xfffffffffffff + extr x19, x12, x22, #12 + mul x1, x6, x1 + extr x22, x13, x17, #28 + adds x21, x21, x19 + mul v31.4S, v4.4S, v17.4S + and x19, x22, #0xfffffffffffff + lsr x22, x12, #52 + lsl x12, x12, #12 + mul x19, x6, x19 + add x22, x1, x22 + extr x1, x22, x12, #24 + and x4, x4, #0xfffffffffffff + adcs x12, x24, x1 + extr x1, x9, x13, #16 + mul x24, x6, x4 + and x1, x1, #0xfffffffffffff + lsr x4, x22, #52 + add x4, x19, x4 + lsl x22, x22, #12 + mul x1, x6, x1 + extr x22, x4, x22, #36 + adcs x11, x11, x22 + extr x22, x11, x12, #9 + extr x19, x12, x21, #9 + uaddlp v3.2D, v22.4S + lsl x12, x4, #12 + stp x19, x22, [x0] + umulh x19, x16, x17 + uaddlp v31.2D, v31.4S + lsr x22, x4, #52 + extr x4, x15, x9, #56 + usra v19.2D, v21.2D, #32 + add x22, x1, x22 + extr x1, x23, x15, #44 + shl v4.2D, v31.2D, #32 + extr x12, x22, x12, #48 + and x4, x4, #0xfffffffffffff + uzp2 v7.4S, v17.4S, v17.4S + adcs x20, x20, x12 + xtn v17.2S, v17.2D + lsl x12, x22, #12 + lsr x22, x22, #52 + mul x4, x6, x4 + add x22, x24, x22 + and x24, x1, #0xfffffffffffff + extr x1, x2, x23, #32 + extr x12, x22, x12, #60 + lsl x12, x12, #8 + lsr x22, x22, #52 + mul x24, x6, x24 + add x4, x4, x22 + and x22, x1, #0xfffffffffffff + extr x12, x4, x12, #8 + lsl x1, x4, #12 + lsr x4, x4, #52 + adcs x7, x7, x12 + mul x12, x6, x22 + add x24, x24, x4 + extr x1, x24, x1, #20 + extr x22, x20, x11, #9 + extr x20, x7, x20, #9 + lsr x11, x2, #20 + mul x6, x6, x11 + lsr x4, x24, #52 + add x4, x12, x4 + lsl x12, x24, #12 + adcs x3, x3, x1 + extr x24, x4, x12, #32 + lsr x11, x4, #52 + adcs x12, x14, x24 + umull v31.2D, v17.2S, v28.2S + add x24, x6, x11 + lsl x1, x4, #12 + extr x7, x3, x7, #9 + rev64 v6.4S, v29.4S + umull v22.2D, v17.2S, v5.2S + extr x11, x12, x3, #9 + extr x14, x24, x1, #44 + umlal v4.2D, v17.2S, v5.2S + adcs x3, x10, x14 + umulh x10, x8, x13 + lsr x14, x24, #44 + adc x24, x5, x14 + subs x5, x16, x8 + stp x22, x20, [x0, #16] + csetm x1, cc + shl v21.2D, 
v3.2D, #32 + movi v17.2D, #0x00000000ffffffff + cneg x20, x5, cc + subs x5, x13, x17 + usra v31.2D, v22.2D, #32 + cneg x14, x5, cc + lsr x6, x24, #9 + and x22, x21, #0x1ff + mov x4, v30.d[0] + add x6, x22, x6 + stp x7, x11, [x0, #32] + umulh x22, x20, x14 + mov x5, v30.d[1] + str x6, [x0, #64] + extr x12, x3, x12, #9 + umull v28.2D, v7.2S, v28.2S + mul x11, x20, x14 + mul v6.4S, v6.4S, v27.4S + and v1.16B, v31.16B, v17.16B + cinv x21, x1, cc + adds x6, x4, x19 + uzp2 v22.4S, v27.4S, v27.4S + adc x20, x19, xzr + adds x6, x6, x5 + umlal v1.2D, v7.2S, v5.2S + xtn v20.2S, v29.2D + eor x22, x22, x21 + adcs x7, x20, x10 + usra v28.2D, v31.2D, #32 + eor x20, x11, x21 + usra v28.2D, v1.2D, #32 + xtn v0.2S, v27.2D + adc x10, x10, xzr + adds x1, x7, x5 + umlal v21.2D, v25.2S, v23.2S + uzp2 v29.4S, v29.4S, v29.4S + adc x19, x10, xzr + cmn x21, #0x1 + umull v3.2D, v0.2S, v20.2S + adcs x5, x6, x20 + extr x10, x24, x3, #9 + umull v31.2D, v0.2S, v29.2S + adcs x1, x1, x22 + stp x12, x10, [x0, #48] + mul x24, x16, x8 + mov x3, v28.d[1] + usra v31.2D, v3.2D, #32 + adc x10, x19, x21 + adds x7, x4, x4 + umulh x14, x16, x8 + uaddlp v3.2D, v6.4S + mov x4, v28.d[0] + adcs x12, x5, x5 + mov x5, v19.d[0] + movi v23.2D, #0x00000000ffffffff + adcs x20, x1, x1 + mov x19, v21.d[1] + mov x1, v19.d[1] + adcs x22, x10, x10 + and v17.16B, v31.16B, v23.16B + adc x6, xzr, xzr + umlal v17.2D, v22.2S, v20.2S + adds x10, x5, x24 + mul x11, x17, x13 + mov x5, v21.d[0] + umull v28.2D, v22.2S, v29.2S + adcs x19, x19, x14 + shl v5.2D, v3.2D, #32 + adc x21, x1, xzr + adds x10, x10, x24 + adcs x1, x19, x14 + umulh x14, x17, x13 + adc x19, x21, xzr + adds x7, x7, x1 + adcs x1, x12, x19 + adcs x24, x20, xzr + mov x20, v4.d[1] + usra v28.2D, v31.2D, #32 + mov x21, v4.d[0] + adcs x19, x22, xzr + adc x6, x6, xzr + adds x4, x4, x11 + adcs x20, x20, x14 + adc x22, x3, xzr + adds x12, x4, x11 + umulh x11, x13, x2 + adcs x3, x20, x14 + adc x20, x22, xzr + adds x21, x21, x24 + ldp x22, x24, [x0] + adcs x4, x12, x19 + 
ldp x19, x14, [x0, #16] + usra v28.2D, v17.2D, #32 + adcs x3, x3, x6 + umlal v5.2D, v0.2S, v20.2S + adc x6, x20, xzr + umulh x20, x17, x23 + adds x12, x22, x5 + ldp x22, x5, [x0, #32] + adcs x10, x24, x10 + adcs x19, x19, x7 + stp x12, x10, [x0] + ldp x12, x7, [x0, #48] + adcs x10, x14, x1 + mul x14, x13, x2 + ldr x24, [x0, #64] + adcs x22, x22, x21 + adcs x5, x5, x4 + mov x21, v28.d[1] + stp x22, x5, [x0, #32] + mul x1, x17, x23 + adcs x3, x12, x3 + mov x4, v28.d[0] + mov x12, v5.d[1] + stp x19, x10, [x0, #16] + adcs x19, x7, x6 + mov x6, v5.d[0] + adc x10, x24, xzr + subs x7, x16, x8 + cneg x5, x7, cc + csetm x24, cc + subs x7, x15, x9 + cneg x22, x7, cc + cinv x7, x24, cc + adds x12, x12, x4 + umulh x4, x5, x22 + adcs x1, x1, x21 + stp x3, x19, [x0, #48] + str x10, [x0, #64] + adcs x20, x14, x20 + adc x21, x11, xzr + subs x14, x17, x13 + cneg x10, x14, cc + csetm x3, cc + subs x19, x2, x23 + cneg x19, x19, cc + cinv x11, x3, cc + adds x14, x12, x6 + mul x24, x5, x22 + adcs x22, x1, x12 + eor x3, x4, x7 + mul x4, x10, x19 + adcs x1, x20, x1 + adcs x12, x21, x20 + adc x5, xzr, x21 + umulh x19, x10, x19 + adds x20, x22, x6 + eor x10, x24, x7 + adcs x21, x1, x14 + eor x24, x4, x11 + adcs x4, x12, x22 + adcs x1, x5, x1 + adcs x12, xzr, x12 + adc x22, xzr, x5 + eor x5, x19, x11 + cmn x11, #0x1 + adcs x19, x1, x24 + adcs x5, x12, x5 + adc x24, x22, x11 + subs x1, x8, x13 + cneg x22, x1, cc + csetm x1, cc + subs x11, x2, x15 + cinv x1, x1, cc + cneg x12, x11, cc + cmn x7, #0x1 + adcs x10, x14, x10 + mul x14, x22, x12 + adcs x20, x20, x3 + eor x11, x14, x1 + adcs x3, x21, x7 + umulh x21, x22, x12 + adcs x22, x4, x7 + adcs x4, x19, x7 + adcs x12, x5, x7 + adc x7, x24, x7 + subs x14, x16, x17 + csetm x5, cc + cneg x19, x14, cc + subs x24, x23, x9 + cneg x14, x24, cc + cinv x5, x5, cc + cmn x1, #0x1 + mul x24, x19, x14 + adcs x22, x22, x11 + eor x11, x21, x1 + eor x24, x24, x5 + umulh x19, x19, x14 + adcs x4, x4, x11 + adcs x14, x12, x1 + adc x1, x7, x1 + subs x17, x8, x17 
+ cneg x12, x17, cc + csetm x17, cc + subs x16, x16, x13 + cneg x11, x16, cc + csetm x16, cc + subs x23, x23, x15 + cinv x7, x17, cc + cneg x13, x23, cc + mul x15, x12, x13 + subs x23, x2, x9 + cinv x8, x16, cc + cneg x17, x23, cc + eor x16, x19, x5 + mul x23, x11, x17 + cmn x5, #0x1 + adcs x20, x20, x24 + eor x15, x15, x7 + adcs x3, x3, x16 + adcs x2, x22, x5 + umulh x16, x11, x17 + adcs x19, x4, x5 + ldp x4, x22, [x0, #48] + extr x21, x10, x6, #63 + adcs x24, x14, x5 + eor x23, x23, x8 + adc x1, x1, x5 + cmn x8, #0x1 + umulh x9, x12, x13 + eor x14, x16, x8 + adcs x3, x3, x23 + ldp x11, x5, [x0, #16] + ldp x13, x16, [x0] + adcs x23, x2, x14 + adcs x14, x19, x8 + extr x19, x20, x10, #63 + lsl x12, x6, #1 + adcs x17, x24, x8 + adc x1, x1, x8 + cmn x7, #0x1 + adcs x24, x3, x15 + eor x9, x9, x7 + ldp x15, x3, [x0, #32] + adcs x9, x23, x9 + ldr x8, [x0, #64] + extr x20, x24, x20, #63 + adcs x23, x14, x7 + extr x2, x9, x24, #63 + adcs x14, x17, x7 + and x24, x2, #0x1ff + extr x9, x23, x9, #8 + extr x6, x14, x23, #8 + adc x23, x1, x7 + adds x10, x9, x13 + adcs x13, x6, x16 + extr x1, x23, x14, #8 + lsr x23, x23, #8 + adcs x7, x1, x11 + adcs x2, x23, x5 + and x23, x13, x7 + adcs x16, x12, x15 + and x23, x23, x2 + adcs x14, x21, x3 + and x23, x23, x16 + adcs x5, x19, x4 + and x23, x23, x14 + adcs x22, x20, x22 + and x23, x23, x5 + and x1, x23, x22 + adc x9, x8, x24 + lsr x23, x9, #9 + cmp xzr, xzr + orr x17, x9, #0xfffffffffffffe00 + adcs xzr, x10, x23 + adcs xzr, x1, xzr + adcs xzr, x17, xzr + adcs x23, x10, x23 + adcs x9, x13, xzr + lsl x4, x23, #9 + adcs x1, x7, xzr + extr x23, x9, x23, #55 + extr x9, x1, x9, #55 + adcs x10, x2, xzr + extr x1, x10, x1, #55 + stp x23, x9, [x0] + adcs x19, x16, xzr + adcs x9, x14, xzr + extr x23, x19, x10, #55 + adcs x10, x5, xzr + stp x1, x23, [x0, #16] + extr x5, x9, x19, #55 + adcs x1, x22, xzr + extr x23, x10, x9, #55 + adc x9, x17, xzr + stp x5, x23, [x0, #32] + extr x10, x1, x10, #55 + and x23, x9, #0x1ff + orr x23, x23, x4 + extr 
x9, x23, x1, #55 + lsr x23, x23, #55 + stp x10, x9, [x0, #48] + str x23, [x0, #64] + +// Restore regs and return + + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/arm/p521/bignum_mul_p521_neon.S b/third_party/s2n-bignum/arm/p521/bignum_mul_p521_neon.S new file mode 100644 index 0000000000..c9d34151d5 --- /dev/null +++ b/third_party/s2n-bignum/arm/p521/bignum_mul_p521_neon.S @@ -0,0 +1,1402 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply modulo p_521, z := (x * y) mod p_521, assuming x and y reduced +// Inputs x[9], y[9]; output z[9] +// +// extern void bignum_mul_p521_neon +// (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + +// bignum_mul_p521_neon is functionally equivalent to bignum_mul_p521. +// It is written in a way that +// 1. A subset of scalar multiplications in bignum_montmul_p384 are carefully +// chosen and vectorized +// 2. The vectorized assembly is rescheduled using the SLOTHY superoptimizer. +// https://github.com/slothy-optimizer/slothy +// +// The output program of step 1. is as follows: +// +// stp x19, x20, [sp, #-16]! +// stp x21, x22, [sp, #-16]! +// stp x23, x24, [sp, #-16]! +// stp x25, x26, [sp, #-16]! 
+// sub sp, sp, #80 +// ldp x15, x21, [x1] +// ldp x10, x17, [x1, #16] +// ldp x13, x16, [x2] +// ldr q18, [x1] +// ldr q28, [x2] +// ldp x5, x20, [x2, #16] +// movi v16.2D, #0x00000000ffffffff +// uzp2 v7.4S, v28.4S, v28.4S +// xtn v4.2S, v18.2D +// xtn v1.2S, v28.2D +// rev64 v27.4S, v28.4S +// umull v21.2D, v4.2S, v1.2S +// umull v28.2D, v4.2S, v7.2S +// uzp2 v5.4S, v18.4S, v18.4S +// mul v18.4S, v27.4S, v18.4S +// usra v28.2D, v21.2D, #32 +// umull v29.2D, v5.2S, v7.2S +// uaddlp v18.2D, v18.4S +// and v16.16B, v28.16B, v16.16B +// umlal v16.2D, v5.2S, v1.2S +// shl v18.2D, v18.2D, #32 +// usra v29.2D, v28.2D, #32 +// umlal v18.2D, v4.2S, v1.2S +// usra v29.2D, v16.2D, #32 +// mov x8, v18.d[0] +// mov x9, v18.d[1] +// mul x6, x10, x5 +// mul x19, x17, x20 +// mov x14, v29.d[0] +// adds x9, x9, x14 +// mov x14, v29.d[1] +// adcs x6, x6, x14 +// umulh x14, x10, x5 +// adcs x19, x19, x14 +// umulh x14, x17, x20 +// adc x14, x14, xzr +// adds x11, x9, x8 +// adcs x9, x6, x9 +// adcs x6, x19, x6 +// adcs x19, x14, x19 +// adc x14, xzr, x14 +// adds x3, x9, x8 +// adcs x24, x6, x11 +// adcs x9, x19, x9 +// adcs x6, x14, x6 +// adcs x19, xzr, x19 +// adc x14, xzr, x14 +// subs x4, x10, x17 +// cneg x4, x4, cc +// csetm x7, cc +// subs x23, x20, x5 +// cneg x23, x23, cc +// mul x22, x4, x23 +// umulh x4, x4, x23 +// cinv x7, x7, cc +// cmn x7, #0x1 +// eor x23, x22, x7 +// adcs x6, x6, x23 +// eor x4, x4, x7 +// adcs x19, x19, x4 +// adc x14, x14, x7 +// subs x4, x15, x21 +// cneg x4, x4, cc +// csetm x7, cc +// subs x23, x16, x13 +// cneg x23, x23, cc +// mul x22, x4, x23 +// umulh x4, x4, x23 +// cinv x7, x7, cc +// cmn x7, #0x1 +// eor x23, x22, x7 +// adcs x11, x11, x23 +// eor x4, x4, x7 +// adcs x3, x3, x4 +// adcs x24, x24, x7 +// adcs x9, x9, x7 +// adcs x6, x6, x7 +// adcs x19, x19, x7 +// adc x14, x14, x7 +// subs x4, x21, x17 +// cneg x4, x4, cc +// csetm x7, cc +// subs x23, x20, x16 +// cneg x23, x23, cc +// mul x22, x4, x23 +// umulh x4, x4, x23 +// cinv 
x7, x7, cc +// cmn x7, #0x1 +// eor x23, x22, x7 +// adcs x9, x9, x23 +// eor x4, x4, x7 +// adcs x6, x6, x4 +// adcs x19, x19, x7 +// adc x14, x14, x7 +// subs x4, x15, x10 +// cneg x4, x4, cc +// csetm x7, cc +// subs x23, x5, x13 +// cneg x23, x23, cc +// mul x22, x4, x23 +// umulh x4, x4, x23 +// cinv x7, x7, cc +// cmn x7, #0x1 +// eor x23, x22, x7 +// adcs x3, x3, x23 +// eor x4, x4, x7 +// adcs x24, x24, x4 +// adcs x9, x9, x7 +// adcs x6, x6, x7 +// adcs x19, x19, x7 +// adc x14, x14, x7 +// subs x17, x15, x17 +// cneg x17, x17, cc +// csetm x4, cc +// subs x13, x20, x13 +// cneg x13, x13, cc +// mul x20, x17, x13 +// umulh x17, x17, x13 +// cinv x13, x4, cc +// cmn x13, #0x1 +// eor x20, x20, x13 +// adcs x20, x24, x20 +// eor x17, x17, x13 +// adcs x17, x9, x17 +// adcs x9, x6, x13 +// adcs x6, x19, x13 +// adc x13, x14, x13 +// subs x21, x21, x10 +// cneg x21, x21, cc +// csetm x10, cc +// subs x16, x5, x16 +// cneg x16, x16, cc +// mul x5, x21, x16 +// umulh x21, x21, x16 +// cinv x10, x10, cc +// cmn x10, #0x1 +// eor x16, x5, x10 +// adcs x16, x20, x16 +// eor x21, x21, x10 +// adcs x21, x17, x21 +// adcs x17, x9, x10 +// adcs x5, x6, x10 +// adc x10, x13, x10 +// lsl x13, x8, #9 +// extr x20, x11, x8, #55 +// extr x8, x3, x11, #55 +// extr x9, x16, x3, #55 +// lsr x16, x16, #55 +// stp x21, x17, [sp] // @slothy:writes=stack0 +// stp x5, x10, [sp, #16] // @slothy:writes=stack16 +// stp x13, x20, [sp, #32] // @slothy:writes=stack32 +// stp x8, x9, [sp, #48] // @slothy:writes=stack48 +// str x16, [sp, #64] // @slothy:writes=stack64 +// ldp x21, x10, [x1, #32] +// ldp x17, x13, [x1, #48] +// ldp x16, x5, [x2, #32] +// ldr q18, [x1, #32] +// ldr q28, [x2, #32] +// ldp x20, x8, [x2, #48] +// movi v16.2D, #0x00000000ffffffff +// uzp2 v7.4S, v28.4S, v28.4S +// xtn v4.2S, v18.2D +// xtn v1.2S, v28.2D +// rev64 v28.4S, v28.4S +// umull v27.2D, v4.2S, v1.2S +// umull v29.2D, v4.2S, v7.2S +// uzp2 v21.4S, v18.4S, v18.4S +// mul v28.4S, v28.4S, v18.4S +// usra 
v29.2D, v27.2D, #32 +// umull v18.2D, v21.2S, v7.2S +// uaddlp v28.2D, v28.4S +// and v16.16B, v29.16B, v16.16B +// umlal v16.2D, v21.2S, v1.2S +// shl v28.2D, v28.2D, #32 +// usra v18.2D, v29.2D, #32 +// umlal v28.2D, v4.2S, v1.2S +// usra v18.2D, v16.2D, #32 +// mov x9, v28.d[0] +// mov x6, v28.d[1] +// mul x19, x17, x20 +// mul x14, x13, x8 +// mov x11, v18.d[0] +// adds x6, x6, x11 +// mov x11, v18.d[1] +// adcs x19, x19, x11 +// umulh x11, x17, x20 +// adcs x14, x14, x11 +// umulh x11, x13, x8 +// adc x11, x11, xzr +// adds x3, x6, x9 +// adcs x6, x19, x6 +// adcs x19, x14, x19 +// adcs x14, x11, x14 +// adc x11, xzr, x11 +// adds x24, x6, x9 +// adcs x4, x19, x3 +// adcs x6, x14, x6 +// adcs x19, x11, x19 +// adcs x14, xzr, x14 +// adc x11, xzr, x11 +// subs x7, x17, x13 +// cneg x7, x7, cc +// csetm x23, cc +// subs x22, x8, x20 +// cneg x22, x22, cc +// mul x12, x7, x22 +// umulh x7, x7, x22 +// cinv x23, x23, cc +// cmn x23, #0x1 +// eor x22, x12, x23 +// adcs x19, x19, x22 +// eor x7, x7, x23 +// adcs x14, x14, x7 +// adc x11, x11, x23 +// subs x7, x21, x10 +// cneg x7, x7, cc +// csetm x23, cc +// subs x22, x5, x16 +// cneg x22, x22, cc +// mul x12, x7, x22 +// umulh x7, x7, x22 +// cinv x23, x23, cc +// cmn x23, #0x1 +// eor x22, x12, x23 +// adcs x3, x3, x22 +// eor x7, x7, x23 +// adcs x24, x24, x7 +// adcs x4, x4, x23 +// adcs x6, x6, x23 +// adcs x19, x19, x23 +// adcs x14, x14, x23 +// adc x11, x11, x23 +// subs x7, x10, x13 +// cneg x7, x7, cc +// csetm x23, cc +// subs x22, x8, x5 +// cneg x22, x22, cc +// mul x12, x7, x22 +// umulh x7, x7, x22 +// cinv x23, x23, cc +// cmn x23, #0x1 +// eor x22, x12, x23 +// adcs x6, x6, x22 +// eor x7, x7, x23 +// adcs x19, x19, x7 +// adcs x14, x14, x23 +// adc x11, x11, x23 +// subs x7, x21, x17 +// cneg x7, x7, cc +// csetm x23, cc +// subs x22, x20, x16 +// cneg x22, x22, cc +// mul x12, x7, x22 +// umulh x7, x7, x22 +// cinv x23, x23, cc +// cmn x23, #0x1 +// eor x22, x12, x23 +// adcs x24, x24, x22 +// 
eor x7, x7, x23 +// adcs x4, x4, x7 +// adcs x6, x6, x23 +// adcs x19, x19, x23 +// adcs x14, x14, x23 +// adc x11, x11, x23 +// subs x7, x21, x13 +// cneg x7, x7, cc +// csetm x23, cc +// subs x22, x8, x16 +// cneg x22, x22, cc +// mul x12, x7, x22 +// umulh x7, x7, x22 +// cinv x23, x23, cc +// cmn x23, #0x1 +// eor x22, x12, x23 +// adcs x4, x4, x22 +// eor x7, x7, x23 +// adcs x6, x6, x7 +// adcs x19, x19, x23 +// adcs x14, x14, x23 +// adc x11, x11, x23 +// subs x7, x10, x17 +// cneg x7, x7, cc +// csetm x23, cc +// subs x22, x20, x5 +// cneg x22, x22, cc +// mul x12, x7, x22 +// umulh x7, x7, x22 +// cinv x23, x23, cc +// cmn x23, #0x1 +// eor x22, x12, x23 +// adcs x4, x4, x22 +// eor x7, x7, x23 +// adcs x6, x6, x7 +// adcs x19, x19, x23 +// adcs x14, x14, x23 +// adc x11, x11, x23 +// ldp x7, x23, [sp] // @slothy:reads=stack0 +// adds x9, x9, x7 +// adcs x3, x3, x23 +// stp x9, x3, [sp] // @slothy:writes=stack0 +// ldp x9, x3, [sp, #16] // @slothy:reads=stack16 +// adcs x9, x24, x9 +// adcs x3, x4, x3 +// stp x9, x3, [sp, #16] // @slothy:writes=stack16 +// ldp x9, x3, [sp, #32] // @slothy:reads=stack32 +// adcs x9, x6, x9 +// adcs x6, x19, x3 +// stp x9, x6, [sp, #32] // @slothy:writes=stack32 +// ldp x9, x6, [sp, #48] // @slothy:reads=stack48 +// adcs x9, x14, x9 +// adcs x6, x11, x6 +// stp x9, x6, [sp, #48] // @slothy:writes=stack48 +// ldr x9, [sp, #64] // @slothy:reads=stack64 +// adc x9, x9, xzr +// str x9, [sp, #64] // @slothy:writes=stack64 +// ldp x9, x6, [x1] +// subs x21, x21, x9 +// sbcs x10, x10, x6 +// ldp x9, x6, [x1, #16] +// sbcs x17, x17, x9 +// sbcs x13, x13, x6 +// csetm x9, cc +// ldp x6, x19, [x2] +// subs x16, x6, x16 +// sbcs x5, x19, x5 +// ldp x6, x19, [x2, #16] +// sbcs x20, x6, x20 +// sbcs x8, x19, x8 +// csetm x6, cc +// eor x21, x21, x9 +// subs x21, x21, x9 +// eor x10, x10, x9 +// sbcs x10, x10, x9 +// eor x17, x17, x9 +// sbcs x17, x17, x9 +// eor x13, x13, x9 +// sbc x13, x13, x9 +// eor x16, x16, x6 +// subs x16, x16, x6 
+// eor x5, x5, x6 +// sbcs x5, x5, x6 +// eor x20, x20, x6 +// sbcs x20, x20, x6 +// eor x8, x8, x6 +// sbc x8, x8, x6 +// eor x9, x6, x9 +// mul x6, x21, x16 +// mul x19, x10, x5 +// mul x14, x17, x20 +// mul x11, x13, x8 +// umulh x3, x21, x16 +// adds x19, x19, x3 +// umulh x3, x10, x5 +// adcs x14, x14, x3 +// umulh x3, x17, x20 +// adcs x11, x11, x3 +// umulh x3, x13, x8 +// adc x3, x3, xzr +// adds x24, x19, x6 +// adcs x19, x14, x19 +// adcs x14, x11, x14 +// adcs x11, x3, x11 +// adc x3, xzr, x3 +// adds x4, x19, x6 +// adcs x7, x14, x24 +// adcs x19, x11, x19 +// adcs x14, x3, x14 +// adcs x11, xzr, x11 +// adc x3, xzr, x3 +// subs x23, x17, x13 +// cneg x23, x23, cc +// csetm x22, cc +// subs x12, x8, x20 +// cneg x12, x12, cc +// mul x15, x23, x12 +// umulh x23, x23, x12 +// cinv x22, x22, cc +// cmn x22, #0x1 +// eor x12, x15, x22 +// adcs x14, x14, x12 +// eor x23, x23, x22 +// adcs x11, x11, x23 +// adc x3, x3, x22 +// subs x23, x21, x10 +// cneg x23, x23, cc +// csetm x22, cc +// subs x12, x5, x16 +// cneg x12, x12, cc +// mul x15, x23, x12 +// umulh x23, x23, x12 +// cinv x22, x22, cc +// cmn x22, #0x1 +// eor x12, x15, x22 +// adcs x24, x24, x12 +// eor x23, x23, x22 +// adcs x4, x4, x23 +// adcs x7, x7, x22 +// adcs x19, x19, x22 +// adcs x14, x14, x22 +// adcs x11, x11, x22 +// adc x3, x3, x22 +// subs x23, x10, x13 +// cneg x23, x23, cc +// csetm x22, cc +// subs x12, x8, x5 +// cneg x12, x12, cc +// mul x15, x23, x12 +// umulh x23, x23, x12 +// cinv x22, x22, cc +// cmn x22, #0x1 +// eor x12, x15, x22 +// adcs x19, x19, x12 +// eor x23, x23, x22 +// adcs x14, x14, x23 +// adcs x11, x11, x22 +// adc x3, x3, x22 +// subs x23, x21, x17 +// cneg x23, x23, cc +// csetm x22, cc +// subs x12, x20, x16 +// cneg x12, x12, cc +// mul x15, x23, x12 +// umulh x23, x23, x12 +// cinv x22, x22, cc +// cmn x22, #0x1 +// eor x12, x15, x22 +// adcs x4, x4, x12 +// eor x23, x23, x22 +// adcs x7, x7, x23 +// adcs x19, x19, x22 +// adcs x14, x14, x22 +// adcs x11, 
x11, x22 +// adc x3, x3, x22 +// subs x21, x21, x13 +// cneg x21, x21, cc +// csetm x13, cc +// subs x16, x8, x16 +// cneg x16, x16, cc +// mul x8, x21, x16 +// umulh x21, x21, x16 +// cinv x13, x13, cc +// cmn x13, #0x1 +// eor x16, x8, x13 +// adcs x16, x7, x16 +// eor x21, x21, x13 +// adcs x21, x19, x21 +// adcs x8, x14, x13 +// adcs x19, x11, x13 +// adc x13, x3, x13 +// subs x10, x10, x17 +// cneg x10, x10, cc +// csetm x17, cc +// subs x5, x20, x5 +// cneg x5, x5, cc +// mul x20, x10, x5 +// umulh x10, x10, x5 +// cinv x17, x17, cc +// cmn x17, #0x1 +// eor x5, x20, x17 +// adcs x16, x16, x5 +// eor x10, x10, x17 +// adcs x21, x21, x10 +// adcs x10, x8, x17 +// adcs x5, x19, x17 +// adc x17, x13, x17 +// ldp x13, x20, [sp] // @slothy:reads=stack0 +// ldp x8, x19, [sp, #16] // @slothy:reads=stack16 +// eor x6, x6, x9 +// adds x6, x6, x13 +// eor x14, x24, x9 +// adcs x14, x14, x20 +// eor x11, x4, x9 +// adcs x11, x11, x8 +// eor x16, x16, x9 +// adcs x16, x16, x19 +// eor x21, x21, x9 +// ldp x3, x24, [sp, #32] // @slothy:reads=stack32 +// ldp x4, x7, [sp, #48] // @slothy:reads=stack48 +// ldr x23, [sp, #64] // @slothy:reads=stack64 +// adcs x21, x21, x3 +// eor x10, x10, x9 +// adcs x10, x10, x24 +// eor x5, x5, x9 +// adcs x5, x5, x4 +// eor x17, x17, x9 +// adcs x17, x17, x7 +// adc x22, x23, xzr +// adds x21, x21, x13 +// adcs x10, x10, x20 +// adcs x13, x5, x8 +// adcs x17, x17, x19 +// and x5, x9, #0x1ff +// lsl x20, x6, #9 +// orr x5, x20, x5 +// adcs x5, x3, x5 +// extr x20, x14, x6, #55 +// adcs x20, x24, x20 +// extr x8, x11, x14, #55 +// adcs x8, x4, x8 +// extr x9, x16, x11, #55 +// adcs x9, x7, x9 +// lsr x16, x16, #55 +// adc x16, x16, x23 +// ldr x6, [x2, #64] +// ldp x19, x14, [x1] +// and x11, x19, #0xfffffffffffff +// mul x11, x6, x11 +// ldr x3, [x1, #64] +// ldp x24, x4, [x2] +// and x7, x24, #0xfffffffffffff +// mul x7, x3, x7 +// add x11, x11, x7 +// extr x19, x14, x19, #52 +// and x19, x19, #0xfffffffffffff +// mul x19, x6, x19 +// 
extr x24, x4, x24, #52 +// and x24, x24, #0xfffffffffffff +// mul x24, x3, x24 +// add x19, x19, x24 +// lsr x24, x11, #52 +// add x19, x19, x24 +// lsl x11, x11, #12 +// extr x11, x19, x11, #12 +// adds x21, x21, x11 +// ldp x11, x24, [x1, #16] +// ldp x7, x23, [x2, #16] +// extr x14, x11, x14, #40 +// and x14, x14, #0xfffffffffffff +// mul x14, x6, x14 +// extr x4, x7, x4, #40 +// and x4, x4, #0xfffffffffffff +// mul x4, x3, x4 +// add x14, x14, x4 +// lsr x4, x19, #52 +// add x14, x14, x4 +// lsl x19, x19, #12 +// extr x19, x14, x19, #24 +// adcs x10, x10, x19 +// extr x19, x24, x11, #28 +// and x19, x19, #0xfffffffffffff +// mul x19, x6, x19 +// extr x11, x23, x7, #28 +// and x11, x11, #0xfffffffffffff +// mul x11, x3, x11 +// add x19, x19, x11 +// lsr x11, x14, #52 +// add x19, x19, x11 +// lsl x14, x14, #12 +// extr x14, x19, x14, #36 +// adcs x13, x13, x14 +// and x14, x10, x13 +// ldp x11, x4, [x1, #32] +// ldp x7, x12, [x2, #32] +// extr x24, x11, x24, #16 +// and x24, x24, #0xfffffffffffff +// mul x24, x6, x24 +// extr x23, x7, x23, #16 +// and x23, x23, #0xfffffffffffff +// mul x23, x3, x23 +// add x24, x24, x23 +// lsl x23, x22, #48 +// add x24, x24, x23 +// lsr x23, x19, #52 +// add x24, x24, x23 +// lsl x19, x19, #12 +// extr x19, x24, x19, #48 +// adcs x17, x17, x19 +// and x19, x14, x17 +// lsr x14, x11, #4 +// and x14, x14, #0xfffffffffffff +// mul x14, x6, x14 +// lsr x23, x7, #4 +// and x23, x23, #0xfffffffffffff +// mul x23, x3, x23 +// add x14, x14, x23 +// lsr x23, x24, #52 +// add x14, x14, x23 +// lsl x24, x24, #12 +// extr x24, x14, x24, #60 +// extr x11, x4, x11, #56 +// and x11, x11, #0xfffffffffffff +// mul x11, x6, x11 +// extr x7, x12, x7, #56 +// and x7, x7, #0xfffffffffffff +// mul x7, x3, x7 +// add x11, x11, x7 +// lsr x14, x14, #52 +// add x14, x11, x14 +// lsl x11, x24, #8 +// extr x11, x14, x11, #8 +// adcs x5, x5, x11 +// and x19, x19, x5 +// ldp x11, x24, [x1, #48] +// ldp x2, x7, [x2, #48] +// extr x4, x11, x4, #44 +// and 
x4, x4, #0xfffffffffffff +// mul x4, x6, x4 +// extr x23, x2, x12, #44 +// and x23, x23, #0xfffffffffffff +// mul x23, x3, x23 +// add x4, x4, x23 +// lsr x23, x14, #52 +// add x4, x4, x23 +// lsl x14, x14, #12 +// extr x14, x4, x14, #20 +// adcs x20, x20, x14 +// and x19, x19, x20 +// extr x14, x24, x11, #32 +// and x14, x14, #0xfffffffffffff +// mul x14, x6, x14 +// extr x2, x7, x2, #32 +// and x2, x2, #0xfffffffffffff +// mul x2, x3, x2 +// add x2, x14, x2 +// lsr x14, x4, #52 +// add x2, x2, x14 +// lsl x14, x4, #12 +// extr x14, x2, x14, #32 +// adcs x8, x8, x14 +// and x19, x19, x8 +// lsr x14, x24, #20 +// mul x14, x6, x14 +// lsr x11, x7, #20 +// mul x11, x3, x11 +// add x14, x14, x11 +// lsr x11, x2, #52 +// add x14, x14, x11 +// lsl x2, x2, #12 +// extr x2, x14, x2, #44 +// adcs x9, x9, x2 +// and x2, x19, x9 +// mul x6, x6, x3 +// lsr x19, x14, #44 +// add x6, x6, x19 +// adc x16, x16, x6 +// lsr x6, x16, #9 +// orr x16, x16, #0xfffffffffffffe00 +// cmp xzr, xzr +// adcs xzr, x21, x6 +// adcs xzr, x2, xzr +// adcs xzr, x16, xzr +// adcs x21, x21, x6 +// adcs x10, x10, xzr +// adcs x13, x13, xzr +// adcs x17, x17, xzr +// adcs x5, x5, xzr +// adcs x20, x20, xzr +// adcs x8, x8, xzr +// adcs x9, x9, xzr +// adc x16, x16, xzr +// and x2, x21, #0x1ff +// extr x21, x10, x21, #9 +// extr x10, x13, x10, #9 +// stp x21, x10, [x0] // @slothy:writes=buffer0 +// extr x21, x17, x13, #9 +// extr x10, x5, x17, #9 +// stp x21, x10, [x0, #16] // @slothy:writes=buffer16 +// extr x21, x20, x5, #9 +// extr x10, x8, x20, #9 +// stp x21, x10, [x0, #32] // @slothy:writes=buffer32 +// extr x21, x9, x8, #9 +// extr x10, x16, x9, #9 +// stp x21, x10, [x0, #48] // @slothy:writes=buffer48 +// str x2, [x0, #64] // @slothy:writes=buffer64 +// add sp, sp, #80 +// ldp x25, x26, [sp], #16 +// ldp x23, x24, [sp], #16 +// ldp x21, x22, [sp], #16 +// ldp x19, x20, [sp], #16 +// ret +// +// The bash script used for step 2 is as follows: +// +// # Store the assembly instructions except the 
last 'ret', +// # callee-register store/loads and add/sub sp #80 as, say, 'input.S'. +// export OUTPUTS="[hint_buffer0,hint_buffer16,hint_buffer32,hint_buffer48,hint_buffer64]" +// export RESERVED_REGS="[x18,x27,x28,x29,x30,sp,q8,q9,q10,q11,q12,q13,q14,q15,v8,v9,v10,v11,v12,v13,v14,v15]" +// /tools/external/slothy.sh input.S my_out_dir +// # my_out_dir/3.opt.s is the optimized assembly. Its output may differ +// # from this file since the sequence is non-deterministically chosen. +// # Please add 'ret' at the end of the output assembly. + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_p521_neon) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_p521_neon) + .text + .balign 4 + +S2N_BN_SYMBOL(bignum_mul_p521_neon): + +// Save registers and make space for the temporary buffer + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + sub sp, sp, #80 + + ldr q6, [x2] + ldp x10, x17, [x1, #16] + ldr q4, [x1] + ldr q16, [x2, #32] + ldp x5, x20, [x2, #16] + ldr q2, [x1, #32] + movi v31.2D, #0x00000000ffffffff + uzp2 v17.4S, v6.4S, v6.4S + rev64 v7.4S, v6.4S + ldp x15, x21, [x1] + xtn v25.2S, v6.2D + xtn v22.2S, v4.2D + subs x14, x10, x17 + mul v7.4S, v7.4S, v4.4S + csetm x8, cc + rev64 v3.4S, v16.4S + xtn v1.2S, v16.2D + ldp x13, x16, [x2] + mul x26, x10, x5 + uzp2 v16.4S, v16.4S, v16.4S + uaddlp v26.2D, v7.4S + cneg x4, x14, cc + subs x24, x15, x21 + xtn v5.2S, v2.2D + mul v28.4S, v3.4S, v2.4S + shl v26.2D, v26.2D, #32 + mul x22, x17, x20 + umull v20.2D, v22.2S, v25.2S + uzp2 v6.4S, v4.4S, v4.4S + umull v18.2D, v22.2S, v17.2S + uzp2 v4.4S, v2.4S, v2.4S + cneg x14, x24, cc + csetm x7, cc + umulh x11, x17, x20 + usra v18.2D, v20.2D, #32 + uaddlp v7.2D, v28.4S + subs x19, x16, x13 + umlal v26.2D, v22.2S, v25.2S + cneg x19, x19, cc + shl v28.2D, v7.2D, #32 + umull v7.2D, v5.2S, v1.2S + umull v30.2D, v5.2S, v16.2S + cinv x6, x7, cc + mul x25, x14, x19 + umlal v28.2D, v5.2S, v1.2S + umull v21.2D, v6.2S, v17.2S + umulh x14, x14, 
x19 + usra v30.2D, v7.2D, #32 + subs x9, x20, x5 + and v29.16B, v18.16B, v31.16B + cinv x23, x8, cc + mov x8, v26.d[1] + cneg x12, x9, cc + usra v21.2D, v18.2D, #32 + umlal v29.2D, v6.2S, v25.2S + mul x24, x4, x12 + umull v18.2D, v4.2S, v16.2S + movi v25.2D, #0x00000000ffffffff + eor x9, x14, x6 + and v7.16B, v30.16B, v25.16B + usra v21.2D, v29.2D, #32 + umulh x7, x10, x5 + usra v18.2D, v30.2D, #32 + umlal v7.2D, v4.2S, v1.2S + mov x19, v21.d[0] + umulh x3, x4, x12 + mov x14, v21.d[1] + usra v18.2D, v7.2D, #32 + adds x4, x8, x19 + mov x8, v26.d[0] + adcs x19, x26, x14 + adcs x14, x22, x7 + adc x12, x11, xzr + adds x11, x4, x8 + adcs x26, x19, x4 + adcs x22, x14, x19 + eor x4, x24, x23 + adcs x14, x12, x14 + eor x7, x25, x6 + adc x25, xzr, x12 + eor x19, x3, x23 + adds x3, x26, x8 + adcs x24, x22, x11 + adcs x12, x14, x26 + adcs x22, x25, x22 + adcs x26, xzr, x14 + adc x14, xzr, x25 + cmn x23, #0x1 + adcs x22, x22, x4 + adcs x19, x26, x19 + adc x25, x14, x23 + subs x14, x21, x17 + cneg x23, x14, cc + csetm x26, cc + subs x4, x20, x16 + cneg x14, x4, cc + cinv x4, x26, cc + cmn x6, #0x1 + adcs x11, x11, x7 + mul x7, x23, x14 + adcs x9, x3, x9 + adcs x26, x24, x6 + umulh x3, x23, x14 + adcs x14, x12, x6 + adcs x22, x22, x6 + adcs x12, x19, x6 + extr x24, x11, x8, #55 + adc x6, x25, x6 + subs x19, x15, x17 + csetm x17, cc + cneg x23, x19, cc + subs x19, x20, x13 + lsl x25, x8, #9 + eor x8, x7, x4 + cneg x20, x19, cc + umulh x7, x23, x20 + cinv x19, x17, cc + subs x17, x15, x10 + csetm x15, cc + stp x25, x24, [sp, #32] + cneg x24, x17, cc + mul x20, x23, x20 + subs x25, x5, x13 + cneg x13, x25, cc + cinv x15, x15, cc + mul x25, x24, x13 + subs x21, x21, x10 + csetm x23, cc + cneg x17, x21, cc + subs x21, x5, x16 + umulh x13, x24, x13 + cinv x10, x23, cc + cneg x23, x21, cc + cmn x4, #0x1 + adcs x14, x14, x8 + eor x21, x3, x4 + adcs x21, x22, x21 + eor x5, x20, x19 + adcs x24, x12, x4 + mul x12, x17, x23 + eor x8, x25, x15 + adc x25, x6, x4 + cmn x15, #0x1 + adcs x6, x9, 
x8 + ldp x20, x8, [x2, #48] + eor x9, x13, x15 + adcs x4, x26, x9 + umulh x26, x17, x23 + ldp x17, x13, [x1, #48] + adcs x9, x14, x15 + adcs x16, x21, x15 + adcs x14, x24, x15 + eor x21, x7, x19 + mul x23, x17, x20 + adc x24, x25, x15 + cmn x19, #0x1 + adcs x7, x4, x5 + adcs x9, x9, x21 + umulh x3, x13, x8 + adcs x16, x16, x19 + adcs x22, x14, x19 + eor x5, x12, x10 + adc x12, x24, x19 + cmn x10, #0x1 + adcs x19, x7, x5 + eor x14, x26, x10 + mov x7, v28.d[1] + adcs x24, x9, x14 + extr x4, x19, x6, #55 + umulh x15, x17, x20 + mov x14, v18.d[1] + lsr x9, x19, #55 + adcs x5, x16, x10 + mov x16, v18.d[0] + adcs x19, x22, x10 + str x9, [sp, #64] + extr x25, x6, x11, #55 + adc x21, x12, x10 + subs x26, x17, x13 + stp x25, x4, [sp, #48] + stp x19, x21, [sp, #16] + csetm x6, cc + cneg x4, x26, cc + mul x19, x13, x8 + subs x11, x8, x20 + stp x24, x5, [sp] + ldp x21, x10, [x1, #32] + cinv x12, x6, cc + cneg x6, x11, cc + mov x9, v28.d[0] + umulh x25, x4, x6 + adds x22, x7, x16 + ldp x16, x5, [x2, #32] + adcs x14, x23, x14 + adcs x11, x19, x15 + adc x24, x3, xzr + adds x3, x22, x9 + adcs x15, x14, x22 + mul x22, x4, x6 + adcs x6, x11, x14 + adcs x4, x24, x11 + eor x14, x25, x12 + adc x26, xzr, x24 + subs x7, x21, x10 + csetm x23, cc + cneg x19, x7, cc + subs x24, x5, x16 + cneg x11, x24, cc + cinv x7, x23, cc + adds x25, x15, x9 + eor x23, x22, x12 + adcs x22, x6, x3 + mul x24, x19, x11 + adcs x15, x4, x15 + adcs x6, x26, x6 + umulh x19, x19, x11 + adcs x11, xzr, x4 + adc x26, xzr, x26 + cmn x12, #0x1 + adcs x4, x6, x23 + eor x6, x24, x7 + adcs x14, x11, x14 + adc x26, x26, x12 + subs x11, x10, x13 + cneg x12, x11, cc + csetm x11, cc + eor x19, x19, x7 + subs x24, x8, x5 + cinv x11, x11, cc + cneg x24, x24, cc + cmn x7, #0x1 + adcs x3, x3, x6 + mul x23, x12, x24 + adcs x25, x25, x19 + adcs x6, x22, x7 + umulh x19, x12, x24 + adcs x22, x15, x7 + adcs x12, x4, x7 + eor x24, x23, x11 + adcs x4, x14, x7 + adc x26, x26, x7 + eor x19, x19, x11 + subs x14, x21, x17 + cneg x7, x14, 
cc + csetm x14, cc + subs x23, x20, x16 + cinv x14, x14, cc + cneg x23, x23, cc + cmn x11, #0x1 + adcs x22, x22, x24 + mul x24, x7, x23 + adcs x15, x12, x19 + adcs x4, x4, x11 + adc x19, x26, x11 + umulh x26, x7, x23 + subs x7, x21, x13 + eor x11, x24, x14 + cneg x23, x7, cc + csetm x12, cc + subs x7, x8, x16 + cneg x7, x7, cc + cinv x12, x12, cc + cmn x14, #0x1 + eor x26, x26, x14 + adcs x11, x25, x11 + mul x25, x23, x7 + adcs x26, x6, x26 + adcs x6, x22, x14 + adcs x24, x15, x14 + umulh x23, x23, x7 + adcs x4, x4, x14 + adc x22, x19, x14 + eor x14, x25, x12 + eor x7, x23, x12 + cmn x12, #0x1 + adcs x14, x26, x14 + ldp x19, x25, [x2] + ldp x15, x23, [x2, #16] + adcs x26, x6, x7 + adcs x24, x24, x12 + adcs x7, x4, x12 + adc x4, x22, x12 + subs x19, x19, x16 + ldp x16, x22, [x1] + sbcs x6, x25, x5 + ldp x12, x25, [x1, #16] + sbcs x15, x15, x20 + sbcs x8, x23, x8 + csetm x23, cc + subs x21, x21, x16 + eor x16, x19, x23 + sbcs x19, x10, x22 + eor x22, x6, x23 + eor x8, x8, x23 + sbcs x6, x17, x12 + sbcs x13, x13, x25 + csetm x12, cc + subs x10, x10, x17 + cneg x17, x10, cc + csetm x25, cc + subs x5, x20, x5 + eor x10, x19, x12 + cneg x19, x5, cc + eor x20, x15, x23 + eor x21, x21, x12 + cinv x15, x25, cc + mul x25, x17, x19 + subs x16, x16, x23 + sbcs x5, x22, x23 + eor x6, x6, x12 + sbcs x20, x20, x23 + eor x22, x13, x12 + sbc x8, x8, x23 + subs x21, x21, x12 + umulh x19, x17, x19 + sbcs x10, x10, x12 + sbcs x17, x6, x12 + eor x6, x19, x15 + eor x19, x25, x15 + umulh x25, x17, x20 + sbc x13, x22, x12 + cmn x15, #0x1 + adcs x22, x14, x19 + adcs x19, x26, x6 + ldp x6, x26, [sp] + adcs x14, x24, x15 + umulh x24, x21, x16 + adcs x7, x7, x15 + adc x15, x4, x15 + adds x4, x9, x6 + eor x9, x23, x12 + adcs x12, x3, x26 + stp x4, x12, [sp] + ldp x4, x26, [sp, #16] + umulh x12, x10, x5 + ldp x6, x23, [sp, #32] + adcs x3, x11, x4 + mul x4, x13, x8 + adcs x26, x22, x26 + ldp x22, x11, [sp, #48] + adcs x6, x19, x6 + stp x3, x26, [sp, #16] + mul x26, x10, x5 + adcs x14, x14, x23 + 
stp x6, x14, [sp, #32] + ldr x6, [sp, #64] + adcs x22, x7, x22 + adcs x14, x15, x11 + mul x11, x17, x20 + adc x19, x6, xzr + stp x22, x14, [sp, #48] + adds x14, x26, x24 + str x19, [sp, #64] + umulh x19, x13, x8 + adcs x7, x11, x12 + adcs x22, x4, x25 + mul x6, x21, x16 + adc x19, x19, xzr + subs x11, x17, x13 + cneg x12, x11, cc + csetm x11, cc + subs x24, x8, x20 + cinv x11, x11, cc + cneg x24, x24, cc + adds x4, x14, x6 + adcs x14, x7, x14 + mul x3, x12, x24 + adcs x7, x22, x7 + adcs x22, x19, x22 + umulh x12, x12, x24 + adc x24, xzr, x19 + adds x19, x14, x6 + eor x3, x3, x11 + adcs x26, x7, x4 + adcs x14, x22, x14 + adcs x25, x24, x7 + adcs x23, xzr, x22 + eor x7, x12, x11 + adc x12, xzr, x24 + subs x22, x21, x10 + cneg x24, x22, cc + csetm x22, cc + subs x15, x5, x16 + cinv x22, x22, cc + cneg x15, x15, cc + cmn x11, #0x1 + adcs x3, x25, x3 + mul x25, x24, x15 + adcs x23, x23, x7 + adc x11, x12, x11 + subs x7, x10, x13 + umulh x15, x24, x15 + cneg x12, x7, cc + csetm x7, cc + eor x24, x25, x22 + eor x25, x15, x22 + cmn x22, #0x1 + adcs x24, x4, x24 + adcs x19, x19, x25 + adcs x15, x26, x22 + adcs x4, x14, x22 + adcs x26, x3, x22 + adcs x25, x23, x22 + adc x23, x11, x22 + subs x14, x21, x17 + cneg x3, x14, cc + csetm x11, cc + subs x14, x8, x5 + cneg x14, x14, cc + cinv x7, x7, cc + subs x13, x21, x13 + cneg x21, x13, cc + csetm x13, cc + mul x22, x12, x14 + subs x8, x8, x16 + cinv x13, x13, cc + umulh x14, x12, x14 + cneg x12, x8, cc + subs x8, x20, x16 + cneg x8, x8, cc + cinv x16, x11, cc + eor x22, x22, x7 + cmn x7, #0x1 + eor x14, x14, x7 + adcs x4, x4, x22 + mul x11, x3, x8 + adcs x22, x26, x14 + adcs x14, x25, x7 + eor x25, x24, x9 + adc x26, x23, x7 + umulh x7, x3, x8 + subs x17, x10, x17 + cneg x24, x17, cc + eor x3, x11, x16 + csetm x11, cc + subs x20, x20, x5 + cneg x5, x20, cc + cinv x11, x11, cc + cmn x16, #0x1 + mul x17, x21, x12 + eor x8, x7, x16 + adcs x10, x19, x3 + and x19, x9, #0x1ff + adcs x20, x15, x8 + umulh x15, x21, x12 + eor x12, x10, 
x9 + eor x8, x6, x9 + adcs x6, x4, x16 + adcs x4, x22, x16 + adcs x21, x14, x16 + adc x7, x26, x16 + mul x10, x24, x5 + cmn x13, #0x1 + ldp x3, x14, [x1] + eor x17, x17, x13 + umulh x5, x24, x5 + adcs x20, x20, x17 + eor x17, x15, x13 + adcs x16, x6, x17 + eor x22, x10, x11 + adcs x23, x4, x13 + extr x10, x14, x3, #52 + and x26, x3, #0xfffffffffffff + adcs x24, x21, x13 + and x15, x10, #0xfffffffffffff + adc x6, x7, x13 + cmn x11, #0x1 + adcs x17, x20, x22 + eor x4, x5, x11 + ldp x21, x10, [sp] + adcs x7, x16, x4 + eor x16, x17, x9 + eor x13, x7, x9 + ldp x3, x17, [sp, #16] + adcs x7, x23, x11 + eor x23, x7, x9 + ldp x5, x22, [sp, #32] + adcs x7, x24, x11 + adc x24, x6, x11 + ldr x6, [x2, #64] + adds x20, x8, x21 + lsl x11, x20, #9 + eor x4, x7, x9 + orr x7, x11, x19 + eor x8, x24, x9 + adcs x11, x25, x10 + mul x26, x6, x26 + ldp x19, x24, [sp, #48] + adcs x12, x12, x3 + adcs x16, x16, x17 + adcs x9, x13, x5 + ldr x25, [sp, #64] + extr x20, x11, x20, #55 + adcs x13, x23, x22 + adcs x4, x4, x19 + extr x23, x12, x11, #55 + adcs x8, x8, x24 + adc x11, x25, xzr + adds x21, x9, x21 + extr x9, x16, x12, #55 + lsr x12, x16, #55 + adcs x10, x13, x10 + mul x15, x6, x15 + adcs x13, x4, x3 + ldp x16, x4, [x2] + ldr x3, [x1, #64] + adcs x17, x8, x17 + adcs x5, x5, x7 + adcs x20, x22, x20 + adcs x8, x19, x23 + and x22, x16, #0xfffffffffffff + ldp x19, x7, [x1, #16] + adcs x9, x24, x9 + extr x24, x4, x16, #52 + adc x16, x12, x25 + mul x22, x3, x22 + and x25, x24, #0xfffffffffffff + extr x14, x19, x14, #40 + and x12, x14, #0xfffffffffffff + extr x23, x7, x19, #28 + ldp x19, x24, [x2, #16] + mul x14, x3, x25 + and x23, x23, #0xfffffffffffff + add x22, x26, x22 + lsl x11, x11, #48 + lsr x26, x22, #52 + lsl x25, x22, #12 + mul x22, x6, x12 + extr x12, x19, x4, #40 + add x4, x15, x14 + mul x15, x6, x23 + add x4, x4, x26 + extr x23, x24, x19, #28 + ldp x14, x19, [x1, #32] + and x26, x12, #0xfffffffffffff + extr x12, x4, x25, #12 + and x25, x23, #0xfffffffffffff + adds x21, x21, x12 + 
mul x12, x3, x26 + extr x23, x14, x7, #16 + and x23, x23, #0xfffffffffffff + mul x7, x3, x25 + ldp x25, x26, [x2, #32] + add x12, x22, x12 + extr x22, x19, x14, #56 + mul x23, x6, x23 + lsr x14, x14, #4 + extr x24, x25, x24, #16 + add x7, x15, x7 + and x15, x24, #0xfffffffffffff + and x22, x22, #0xfffffffffffff + lsr x24, x4, #52 + mul x15, x3, x15 + and x14, x14, #0xfffffffffffff + add x12, x12, x24 + lsl x24, x4, #12 + lsr x4, x12, #52 + extr x24, x12, x24, #24 + adcs x10, x10, x24 + lsl x24, x12, #12 + add x12, x7, x4 + mul x22, x6, x22 + add x4, x23, x15 + extr x7, x12, x24, #36 + adcs x13, x13, x7 + lsl x15, x12, #12 + add x7, x4, x11 + lsr x24, x12, #52 + ldp x23, x11, [x2, #48] + add x4, x7, x24 + mul x12, x6, x14 + extr x7, x26, x25, #56 + extr x14, x4, x15, #48 + and x2, x7, #0xfffffffffffff + extr x24, x11, x23, #32 + ldp x15, x7, [x1, #48] + and x1, x24, #0xfffffffffffff + lsr x24, x4, #52 + mul x2, x3, x2 + extr x26, x23, x26, #44 + lsr x23, x25, #4 + and x23, x23, #0xfffffffffffff + and x25, x26, #0xfffffffffffff + extr x26, x7, x15, #32 + extr x19, x15, x19, #44 + mul x23, x3, x23 + and x15, x26, #0xfffffffffffff + lsl x26, x4, #12 + and x4, x19, #0xfffffffffffff + lsr x11, x11, #20 + mul x19, x6, x4 + adcs x17, x17, x14 + add x14, x22, x2 + add x22, x12, x23 + lsr x7, x7, #20 + add x22, x22, x24 + extr x2, x22, x26, #60 + mul x24, x3, x25 + lsr x22, x22, #52 + add x14, x14, x22 + lsl x22, x2, #8 + extr x22, x14, x22, #8 + lsl x2, x14, #12 + mul x1, x3, x1 + adcs x12, x5, x22 + mul x5, x6, x15 + and x26, x10, x13 + and x4, x26, x17 + add x23, x19, x24 + lsr x14, x14, #52 + mul x22, x3, x11 + add x11, x23, x14 + extr x25, x11, x2, #20 + lsl x19, x11, #12 + adcs x25, x20, x25 + and x14, x4, x12 + add x1, x5, x1 + and x14, x14, x25 + mul x15, x6, x7 + add x26, x15, x22 + mul x6, x6, x3 + lsr x22, x11, #52 + add x4, x1, x22 + lsr x1, x4, #52 + extr x3, x4, x19, #32 + lsl x15, x4, #12 + add x7, x26, x1 + adcs x23, x8, x3 + extr x20, x7, x15, #44 + and x3, 
x14, x23 + lsr x19, x7, #44 + adcs x7, x9, x20 + add x11, x6, x19 + adc x4, x16, x11 + lsr x14, x4, #9 + cmp xzr, xzr + and x15, x3, x7 + orr x3, x4, #0xfffffffffffffe00 + adcs xzr, x21, x14 + adcs xzr, x15, xzr + adcs xzr, x3, xzr + adcs x11, x21, x14 + and x14, x11, #0x1ff + adcs x1, x10, xzr + extr x10, x1, x11, #9 + str x14, [x0, #64] + adcs x14, x13, xzr + extr x11, x14, x1, #9 + adcs x1, x17, xzr + extr x4, x1, x14, #9 + stp x10, x11, [x0] + adcs x11, x12, xzr + extr x14, x11, x1, #9 + adcs x10, x25, xzr + extr x11, x10, x11, #9 + stp x4, x14, [x0, #16] + adcs x14, x23, xzr + extr x10, x14, x10, #9 + adcs x1, x7, xzr + stp x11, x10, [x0, #32] + extr x14, x1, x14, #9 + adc x10, x3, xzr + extr x26, x10, x1, #9 + stp x14, x26, [x0, #48] + +// Restore regs and return + + add sp, sp, #80 + ldp x25, x26, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/arm/p521/bignum_sqr_p521_neon.S b/third_party/s2n-bignum/arm/p521/bignum_sqr_p521_neon.S new file mode 100644 index 0000000000..13cd1c2541 --- /dev/null +++ b/third_party/s2n-bignum/arm/p521/bignum_sqr_p521_neon.S @@ -0,0 +1,1121 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square modulo p_521, z := (x^2) mod p_521, assuming x reduced +// Input x[9]; output z[9] +// +// extern void bignum_sqr_p521_neon (uint64_t z[static 9], +// uint64_t x[static 9]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + +// bignum_sqr_p521_neon is functionally equivalent to bignum_sqr_p521. +// It is written in a way that +// 1.
A subset of scalar multiplications in bignum_sqr_p521 are carefully +// chosen and vectorized +// 2. The vectorized assembly is rescheduled using the SLOTHY superoptimizer. +// https://github.com/slothy-optimizer/slothy +// +// The output program of step 1. is as follows: +// +// stp x19, x20, [sp, #-16]! +// stp x21, x22, [sp, #-16]! +// stp x23, x24, [sp, #-16]! +// ldp x20, x19, [x1] +// ldr q23, [x1] +// ldr q1, [x1] +// ldr q16, [x1] +// ldp x14, x12, [x1, #16] +// ldr q28, [x1, #16] +// ldr q31, [x1, #16] +// ldp x9, x2, [x1, #32] +// ldr q29, [x1, #32] +// ldr q4, [x1, #32] +// ldr q5, [x1] +// ldr q2, [x1, #32] +// ldp x6, x13, [x1, #48] +// ldr q24, [x1, #48] +// ldr q27, [x1, #48] +// ldr q0, [x1, #16] +// ldr q30, [x1, #48] +// mul x17, x9, x6 +// mul x10, x2, x13 +// umulh x24, x9, x6 +// subs x4, x9, x2 +// cneg x4, x4, cc +// csetm x16, cc +// subs x3, x13, x6 +// cneg x23, x3, cc +// mul x3, x4, x23 +// umulh x4, x4, x23 +// cinv x22, x16, cc +// eor x23, x3, x22 +// eor x16, x4, x22 +// adds x3, x17, x24 +// adc x24, x24, xzr +// umulh x4, x2, x13 +// adds x3, x3, x10 +// adcs x24, x24, x4 +// adc x4, x4, xzr +// adds x24, x24, x10 +// adc x10, x4, xzr +// cmn x22, #0x1 +// adcs x4, x3, x23 +// adcs x24, x24, x16 +// adc x10, x10, x22 +// adds x8, x17, x17 +// adcs x22, x4, x4 +// adcs x5, x24, x24 +// adcs x11, x10, x10 +// adc x23, xzr, xzr +// movi v25.2D, #0xffffffff +// uzp2 v19.4S, v4.4S, v4.4S +// xtn v26.2S, v29.2D +// xtn v22.2S, v4.2D +// rev64 v4.4S, v4.4S +// umull v7.2D, v26.2S, v22.2S +// umull v21.2D, v26.2S, v19.2S +// uzp2 v17.4S, v29.4S, v29.4S +// mul v4.4S, v4.4S, v29.4S +// usra v21.2D, v7.2D, #32 +// umull v18.2D, v17.2S, v19.2S +// uaddlp v4.2D, v4.4S +// and v7.16B, v21.16B, v25.16B +// umlal v7.2D, v17.2S, v22.2S +// shl v4.2D, v4.2D, #32 +// usra v18.2D, v21.2D, #32 +// umlal v4.2D, v26.2S, v22.2S +// usra v18.2D, v7.2D, #32 +// mov x15, v4.d[0] +// mov x16, v4.d[1] +// mul x3, x9, x2 +// mov x10, v18.d[0] +// 
mov x17, v18.d[1] +// umulh x4, x9, x2 +// adds x24, x10, x3 +// adcs x10, x16, x4 +// adc x17, x17, xzr +// adds x7, x24, x3 +// adcs x10, x10, x4 +// adc x17, x17, xzr +// adds x8, x8, x10 +// adcs x22, x22, x17 +// adcs x21, x5, xzr +// adcs x5, x11, xzr +// adc x11, x23, xzr +// movi v25.2D, #0xffffffff +// uzp2 v19.4S, v27.4S, v27.4S +// xtn v26.2S, v24.2D +// xtn v22.2S, v27.2D +// rev64 v4.4S, v27.4S +// umull v7.2D, v26.2S, v22.2S +// umull v21.2D, v26.2S, v19.2S +// uzp2 v17.4S, v24.4S, v24.4S +// mul v4.4S, v4.4S, v24.4S +// usra v21.2D, v7.2D, #32 +// umull v18.2D, v17.2S, v19.2S +// uaddlp v4.2D, v4.4S +// and v7.16B, v21.16B, v25.16B +// umlal v7.2D, v17.2S, v22.2S +// shl v4.2D, v4.2D, #32 +// usra v18.2D, v21.2D, #32 +// umlal v4.2D, v26.2S, v22.2S +// usra v18.2D, v7.2D, #32 +// mov x23, v4.d[0] +// mov x16, v4.d[1] +// mul x3, x6, x13 +// mov x10, v18.d[0] +// mov x17, v18.d[1] +// umulh x4, x6, x13 +// adds x24, x10, x3 +// adcs x10, x16, x4 +// adc x17, x17, xzr +// adds x24, x24, x3 +// adcs x10, x10, x4 +// adc x17, x17, xzr +// adds x23, x23, x21 +// adcs x16, x24, x5 +// adcs x3, x10, x11 +// adc x21, x17, xzr +// ldr x17, [x1, #64] +// add x5, x17, x17 +// mul x11, x17, x17 +// and x17, x20, #0xfffffffffffff +// mul x4, x5, x17 +// extr x17, x19, x20, #52 +// and x17, x17, #0xfffffffffffff +// mul x10, x5, x17 +// lsr x17, x4, #52 +// add x24, x10, x17 +// lsl x17, x4, #12 +// extr x17, x24, x17, #12 +// adds x15, x15, x17 +// extr x17, x14, x19, #40 +// and x17, x17, #0xfffffffffffff +// mul x10, x5, x17 +// lsr x17, x24, #52 +// add x4, x10, x17 +// lsl x17, x24, #12 +// extr x17, x4, x17, #24 +// adcs x7, x7, x17 +// extr x17, x12, x14, #28 +// and x17, x17, #0xfffffffffffff +// mul x10, x5, x17 +// lsr x17, x4, #52 +// add x24, x10, x17 +// lsl x17, x4, #12 +// extr x17, x24, x17, #36 +// adcs x8, x8, x17 +// extr x17, x9, x12, #16 +// and x17, x17, #0xfffffffffffff +// mul x10, x5, x17 +// lsr x17, x24, #52 +// add x4, x10, x17 +// lsl 
x17, x24, #12 +// extr x17, x4, x17, #48 +// adcs x22, x22, x17 +// lsr x17, x9, #4 +// and x17, x17, #0xfffffffffffff +// mul x10, x5, x17 +// lsr x17, x4, #52 +// add x24, x10, x17 +// lsl x17, x4, #12 +// extr x4, x24, x17, #60 +// extr x17, x2, x9, #56 +// and x17, x17, #0xfffffffffffff +// mul x10, x5, x17 +// lsr x17, x24, #52 +// add x24, x10, x17 +// lsl x17, x4, #8 +// extr x17, x24, x17, #8 +// adcs x23, x23, x17 +// extr x17, x6, x2, #44 +// and x17, x17, #0xfffffffffffff +// mul x10, x5, x17 +// lsr x17, x24, #52 +// add x4, x10, x17 +// lsl x17, x24, #12 +// extr x17, x4, x17, #20 +// adcs x16, x16, x17 +// extr x17, x13, x6, #32 +// and x17, x17, #0xfffffffffffff +// mul x10, x5, x17 +// lsr x17, x4, #52 +// add x24, x10, x17 +// lsl x17, x4, #12 +// extr x17, x24, x17, #32 +// adcs x3, x3, x17 +// lsr x17, x13, #20 +// mul x10, x5, x17 +// lsr x17, x24, #52 +// add x10, x10, x17 +// lsl x17, x24, #12 +// extr x17, x10, x17, #44 +// adcs x4, x21, x17 +// lsr x17, x10, #44 +// adc x24, x11, x17 +// extr x10, x7, x15, #9 +// extr x17, x8, x7, #9 +// stp x10, x17, [x0] // @slothy:writes=buffer0 +// extr x10, x22, x8, #9 +// extr x17, x23, x22, #9 +// stp x10, x17, [x0, #16] // @slothy:writes=buffer16 +// extr x10, x16, x23, #9 +// extr x17, x3, x16, #9 +// stp x10, x17, [x0, #32] // @slothy:writes=buffer32 +// extr x10, x4, x3, #9 +// extr x17, x24, x4, #9 +// stp x10, x17, [x0, #48] // @slothy:writes=buffer48 +// and x10, x15, #0x1ff +// lsr x17, x24, #9 +// add x17, x10, x17 +// str x17, [x0, #64] // @slothy:writes=buffer64 +// uzp1 v17.4S, v28.4S, v23.4S +// rev64 v4.4S, v28.4S +// uzp1 v7.4S, v23.4S, v23.4S +// mul v4.4S, v4.4S, v23.4S +// uaddlp v4.2D, v4.4S +// shl v4.2D, v4.2D, #32 +// umlal v4.2D, v7.2S, v17.2S +// mov x8, v4.d[0] +// mov x22, v4.d[1] +// umulh x23, x20, x14 +// subs x17, x20, x19 +// cneg x4, x17, cc +// csetm x24, cc +// subs x17, x12, x14 +// cneg x17, x17, cc +// mul x10, x4, x17 +// umulh x17, x4, x17 +// cinv x16, x24, cc 
+// eor x3, x10, x16 +// eor x4, x17, x16 +// adds x24, x8, x23 +// adc x10, x23, xzr +// umulh x17, x19, x12 +// adds x24, x24, x22 +// adcs x10, x10, x17 +// adc x17, x17, xzr +// adds x10, x10, x22 +// adc x17, x17, xzr +// cmn x16, #0x1 +// adcs x24, x24, x3 +// adcs x10, x10, x4 +// adc x17, x17, x16 +// adds x15, x8, x8 +// adcs x7, x24, x24 +// adcs x8, x10, x10 +// adcs x22, x17, x17 +// adc x23, xzr, xzr +// movi v25.2D, #0xffffffff +// uzp2 v19.4S, v16.4S, v16.4S +// xtn v26.2S, v1.2D +// xtn v22.2S, v16.2D +// rev64 v4.4S, v16.4S +// umull v7.2D, v26.2S, v22.2S +// umull v21.2D, v26.2S, v19.2S +// uzp2 v17.4S, v1.4S, v1.4S +// mul v4.4S, v4.4S, v1.4S +// usra v21.2D, v7.2D, #32 +// umull v18.2D, v17.2S, v19.2S +// uaddlp v4.2D, v4.4S +// and v7.16B, v21.16B, v25.16B +// umlal v7.2D, v17.2S, v22.2S +// shl v4.2D, v4.2D, #32 +// usra v18.2D, v21.2D, #32 +// umlal v4.2D, v26.2S, v22.2S +// usra v18.2D, v7.2D, #32 +// mov x21, v4.d[0] +// mov x16, v4.d[1] +// mul x3, x20, x19 +// mov x10, v18.d[0] +// mov x17, v18.d[1] +// umulh x4, x20, x19 +// adds x24, x10, x3 +// adcs x10, x16, x4 +// adc x17, x17, xzr +// adds x5, x24, x3 +// adcs x10, x10, x4 +// adc x17, x17, xzr +// adds x11, x15, x10 +// adcs x15, x7, x17 +// adcs x7, x8, xzr +// adcs x8, x22, xzr +// adc x22, x23, xzr +// xtn v7.2S, v31.2D +// shrn v4.2S, v31.2D, #32 +// umull v4.2D, v7.2S, v4.2S +// shl v4.2D, v4.2D, #33 +// umlal v4.2D, v7.2S, v7.2S +// mov x23, v4.d[0] +// mov x16, v4.d[1] +// mul x3, x14, x12 +// umulh x10, x14, x14 +// umulh x17, x12, x12 +// umulh x4, x14, x12 +// adds x24, x10, x3 +// adcs x10, x16, x4 +// adc x17, x17, xzr +// adds x24, x24, x3 +// adcs x10, x10, x4 +// adc x17, x17, xzr +// adds x16, x23, x7 +// adcs x3, x24, x8 +// adcs x4, x10, x22 +// adc x24, x17, xzr +// ldp x10, x17, [x0] // @slothy:reads=buffer0 +// adds x10, x10, x21 +// adcs x17, x17, x5 +// stp x10, x17, [x0] // @slothy:writes=buffer0 +// ldp x10, x17, [x0, #16] // @slothy:reads=buffer16 +// adcs 
x10, x10, x11 +// adcs x17, x17, x15 +// stp x10, x17, [x0, #16] // @slothy:writes=buffer16 +// ldp x10, x17, [x0, #32] // @slothy:reads=buffer32 +// adcs x10, x10, x16 +// adcs x17, x17, x3 +// stp x10, x17, [x0, #32] // @slothy:writes=buffer32 +// ldp x10, x17, [x0, #48] // @slothy:reads=buffer48 +// adcs x10, x10, x4 +// adcs x17, x17, x24 +// stp x10, x17, [x0, #48] // @slothy:writes=buffer48 +// ldr x17, [x0, #64] // @slothy:reads=buffer64 +// adc x17, x17, xzr +// str x17, [x0, #64] // @slothy:writes=buffer64 +// movi v25.2D, #0xffffffff +// uzp2 v19.4S, v2.4S, v2.4S +// xtn v26.2S, v5.2D +// xtn v22.2S, v2.2D +// rev64 v4.4S, v2.4S +// umull v7.2D, v26.2S, v22.2S +// umull v21.2D, v26.2S, v19.2S +// uzp2 v17.4S, v5.4S, v5.4S +// mul v4.4S, v4.4S, v5.4S +// usra v21.2D, v7.2D, #32 +// umull v18.2D, v17.2S, v19.2S +// uaddlp v4.2D, v4.4S +// and v7.16B, v21.16B, v25.16B +// umlal v7.2D, v17.2S, v22.2S +// shl v4.2D, v4.2D, #32 +// usra v18.2D, v21.2D, #32 +// umlal v4.2D, v26.2S, v22.2S +// usra v18.2D, v7.2D, #32 +// mov x5, v4.d[0] +// mov x4, v4.d[1] +// movi v25.2D, #0xffffffff +// uzp2 v17.4S, v30.4S, v30.4S +// xtn v19.2S, v0.2D +// xtn v26.2S, v30.2D +// rev64 v4.4S, v30.4S +// umull v7.2D, v19.2S, v26.2S +// umull v22.2D, v19.2S, v17.2S +// uzp2 v21.4S, v0.4S, v0.4S +// mul v4.4S, v4.4S, v0.4S +// usra v22.2D, v7.2D, #32 +// umull v17.2D, v21.2S, v17.2S +// uaddlp v4.2D, v4.4S +// and v7.16B, v22.16B, v25.16B +// umlal v7.2D, v21.2S, v26.2S +// shl v4.2D, v4.2D, #32 +// usra v17.2D, v22.2D, #32 +// umlal v4.2D, v19.2S, v26.2S +// usra v17.2D, v7.2D, #32 +// mov x24, v4.d[0] +// mov x10, v4.d[1] +// mov x17, v18.d[0] +// adds x4, x4, x17 +// mov x17, v18.d[1] +// adcs x24, x24, x17 +// mov x17, v17.d[0] +// adcs x10, x10, x17 +// mov x17, v17.d[1] +// adc x17, x17, xzr +// adds x15, x4, x5 +// adcs x4, x24, x4 +// adcs x24, x10, x24 +// adcs x10, x17, x10 +// adc x17, xzr, x17 +// adds x7, x4, x5 +// adcs x8, x24, x15 +// adcs x22, x10, x4 +// adcs x23, 
x17, x24 +// adcs x16, xzr, x10 +// adc x3, xzr, x17 +// subs x17, x14, x12 +// cneg x24, x17, cc +// csetm x4, cc +// subs x17, x13, x6 +// cneg x10, x17, cc +// mul x17, x24, x10 +// umulh x24, x24, x10 +// cinv x10, x4, cc +// cmn x10, #0x1 +// eor x17, x17, x10 +// adcs x23, x23, x17 +// eor x17, x24, x10 +// adcs x16, x16, x17 +// adc x3, x3, x10 +// subs x17, x20, x19 +// cneg x24, x17, cc +// csetm x4, cc +// subs x17, x2, x9 +// cneg x10, x17, cc +// mul x17, x24, x10 +// umulh x24, x24, x10 +// cinv x10, x4, cc +// cmn x10, #0x1 +// eor x17, x17, x10 +// adcs x11, x15, x17 +// eor x17, x24, x10 +// adcs x15, x7, x17 +// adcs x7, x8, x10 +// adcs x22, x22, x10 +// adcs x23, x23, x10 +// adcs x16, x16, x10 +// adc x3, x3, x10 +// subs x17, x19, x12 +// cneg x24, x17, cc +// csetm x4, cc +// subs x17, x13, x2 +// cneg x10, x17, cc +// mul x17, x24, x10 +// umulh x24, x24, x10 +// cinv x10, x4, cc +// cmn x10, #0x1 +// eor x17, x17, x10 +// adcs x8, x22, x17 +// eor x17, x24, x10 +// adcs x23, x23, x17 +// adcs x16, x16, x10 +// adc x3, x3, x10 +// subs x17, x20, x14 +// cneg x24, x17, cc +// csetm x4, cc +// subs x17, x6, x9 +// cneg x10, x17, cc +// mul x17, x24, x10 +// umulh x24, x24, x10 +// cinv x10, x4, cc +// cmn x10, #0x1 +// eor x17, x17, x10 +// adcs x22, x15, x17 +// eor x17, x24, x10 +// adcs x4, x7, x17 +// adcs x24, x8, x10 +// adcs x23, x23, x10 +// adcs x16, x16, x10 +// adc x3, x3, x10 +// subs x12, x20, x12 +// cneg x10, x12, cc +// csetm x17, cc +// subs x12, x13, x9 +// cneg x9, x12, cc +// mul x12, x10, x9 +// umulh x13, x10, x9 +// cinv x9, x17, cc +// cmn x9, #0x1 +// eor x12, x12, x9 +// adcs x4, x4, x12 +// eor x12, x13, x9 +// adcs x24, x24, x12 +// adcs x10, x23, x9 +// adcs x17, x16, x9 +// adc x13, x3, x9 +// subs x19, x19, x14 +// cneg x12, x19, cc +// csetm x9, cc +// subs x6, x6, x2 +// cneg x14, x6, cc +// mul x19, x12, x14 +// umulh x12, x12, x14 +// cinv x14, x9, cc +// cmn x14, #0x1 +// eor x19, x19, x14 +// adcs x23, x4, 
x19 +// eor x19, x12, x14 +// adcs x16, x24, x19 +// adcs x6, x10, x14 +// adcs x2, x17, x14 +// adc x9, x13, x14 +// ldp x12, x14, [x0] // @slothy:reads=buffer0 +// extr x19, x6, x16, #8 +// adds x10, x19, x12 +// extr x19, x2, x6, #8 +// adcs x17, x19, x14 +// ldp x14, x12, [x0, #16] // @slothy:reads=buffer16 +// extr x19, x9, x2, #8 +// adcs x13, x19, x14 +// and x14, x17, x13 +// lsr x19, x9, #8 +// adcs x6, x19, x12 +// and x9, x14, x6 +// ldp x14, x12, [x0, #32] // @slothy:reads=buffer32 +// lsl x19, x5, #1 +// adcs x2, x19, x14 +// and x14, x9, x2 +// extr x19, x11, x5, #63 +// adcs x3, x19, x12 +// and x9, x14, x3 +// ldp x14, x12, [x0, #48] // @slothy:reads=buffer48 +// extr x19, x22, x11, #63 +// adcs x4, x19, x14 +// and x14, x9, x4 +// extr x19, x23, x22, #63 +// adcs x24, x19, x12 +// and x12, x14, x24 +// ldr x14, [x0, #64] // @slothy:reads=buffer64 +// extr x19, x16, x23, #63 +// and x19, x19, #0x1ff +// adc x19, x14, x19 +// lsr x14, x19, #9 +// orr x19, x19, #0xfffffffffffffe00 +// cmp xzr, xzr +// adcs xzr, x10, x14 +// adcs xzr, x12, xzr +// adcs xzr, x19, xzr +// adcs x10, x10, x14 +// adcs x17, x17, xzr +// adcs x13, x13, xzr +// adcs x6, x6, xzr +// adcs x2, x2, xzr +// adcs x9, x3, xzr +// adcs x12, x4, xzr +// adcs x14, x24, xzr +// adc x19, x19, xzr +// and x19, x19, #0x1ff +// stp x10, x17, [x0] // @slothy:writes=buffer0 +// stp x13, x6, [x0, #16] // @slothy:writes=buffer16 +// stp x2, x9, [x0, #32] // @slothy:writes=buffer32 +// stp x12, x14, [x0, #48] // @slothy:writes=buffer48 +// str x19, [x0, #64] // @slothy:writes=buffer64 +// ldp x23, x24, [sp], #16 +// ldp x21, x22, [sp], #16 +// ldp x19, x20, [sp], #16 +// ret +// +// The bash script used for step 2 is as follows: +// +// # Store the assembly instructions except the last 'ret', +// # callee-register store/loads as, say, 'input.S'. 
+// export OUTPUTS="[hint_buffer0,hint_buffer16,hint_buffer32,hint_buffer48,hint_buffer64]" +// export RESERVED_REGS="[x18,x25,x26,x27,x28,x29,x30,sp,q8,q9,q10,q11,q12,q13,q14,q15,v8,v9,v10,v11,v12,v13,v14,v15]" +// /tools/external/slothy.sh input.S my_out_dir +// # my_out_dir/3.opt.s is the optimized assembly. Its output may differ +// # from this file since the sequence is non-deterministically chosen. +// # Please add 'ret' at the end of the output assembly. + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_p521_neon) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_p521_neon) + .text + .balign 4 + +S2N_BN_SYMBOL(bignum_sqr_p521_neon): + +// Save registers + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + + ldr q23, [x1, #32] + ldp x9, x2, [x1, #32] + ldr q16, [x1, #32] + ldr q20, [x1, #48] + ldp x6, x13, [x1, #48] + rev64 v2.4S, v23.4S + mul x14, x9, x2 + ldr q31, [x1, #48] + subs x22, x9, x2 + uzp2 v26.4S, v23.4S, v23.4S + mul v30.4S, v2.4S, v16.4S + xtn v0.2S, v20.2D + csetm x12, cc + xtn v21.2S, v16.2D + xtn v23.2S, v23.2D + umulh x10, x9, x6 + rev64 v27.4S, v31.4S + umull v2.2D, v21.2S, v26.2S + cneg x23, x22, cc + uaddlp v25.2D, v30.4S + umull v18.2D, v21.2S, v23.2S + mul x22, x9, x6 + mul v6.4S, v27.4S, v20.4S + uzp2 v17.4S, v20.4S, v20.4S + shl v20.2D, v25.2D, #32 + uzp2 v27.4S, v31.4S, v31.4S + mul x16, x2, x13 + umlal v20.2D, v21.2S, v23.2S + usra v2.2D, v18.2D, #32 + adds x8, x22, x10 + umull v25.2D, v17.2S, v27.2S + xtn v31.2S, v31.2D + movi v1.2D, #0xffffffff + adc x3, x10, xzr + umulh x21, x2, x13 + uzp2 v21.4S, v16.4S, v16.4S + umull v18.2D, v0.2S, v27.2S + subs x19, x13, x6 + and v7.16B, v2.16B, v1.16B + umull v27.2D, v0.2S, v31.2S + cneg x20, x19, cc + movi v30.2D, #0xffffffff + umull v16.2D, v21.2S, v26.2S + umlal v7.2D, v21.2S, v23.2S + mul x19, x23, x20 + cinv x7, x12, cc + uaddlp v6.2D, v6.4S + eor x12, x19, x7 + adds x11, x8, x16 + umulh x10, x23, x20 + ldr q1, [x1] + usra v16.2D, v2.2D, #32 + adcs x19, x3, x21 + 
shl v2.2D, v6.2D, #32 + adc x20, x21, xzr + adds x17, x19, x16 + usra v18.2D, v27.2D, #32 + adc x19, x20, xzr + cmn x7, #0x1 + umlal v2.2D, v0.2S, v31.2S + umulh x16, x9, x2 + adcs x8, x11, x12 + usra v16.2D, v7.2D, #32 + ldr x12, [x1, #64] + eor x20, x10, x7 + umulh x10, x6, x13 + mov x23, v2.d[0] + mov x3, v2.d[1] + adcs x21, x17, x20 + usra v25.2D, v18.2D, #32 + and v23.16B, v18.16B, v30.16B + adc x7, x19, x7 + adds x22, x22, x22 + ldr q7, [x1, #16] + adcs x17, x8, x8 + umlal v23.2D, v17.2S, v31.2S + mov x19, v16.d[0] + mul x11, x12, x12 + ldr q4, [x1] + usra v25.2D, v23.2D, #32 + add x5, x12, x12 + adcs x15, x21, x21 + ldr q28, [x1] + mov x12, v20.d[1] + adcs x24, x7, x7 + mov x21, v16.d[1] + adc x4, xzr, xzr + adds x19, x19, x14 + ldr q18, [x1, #16] + xtn v26.2S, v1.2D + adcs x8, x12, x16 + adc x21, x21, xzr + adds x7, x19, x14 + xtn v23.2S, v7.2D + rev64 v21.4S, v28.4S + adcs x12, x8, x16 + ldp x20, x19, [x1] + mov x16, v25.d[1] + xtn v22.2S, v28.2D + adc x14, x21, xzr + adds x8, x22, x12 + uzp2 v24.4S, v28.4S, v28.4S + rev64 v28.4S, v18.4S + mul x12, x6, x13 + mul v16.4S, v21.4S, v1.4S + shrn v31.2S, v7.2D, #32 + adcs x22, x17, x14 + mov x14, v25.d[0] + and x21, x20, #0xfffffffffffff + umull v17.2D, v26.2S, v24.2S + ldr q2, [x1, #32] + adcs x17, x15, xzr + ldr q30, [x1, #48] + umull v7.2D, v26.2S, v22.2S + adcs x15, x24, xzr + ldr q0, [x1, #16] + movi v6.2D, #0xffffffff + adc x4, x4, xzr + adds x14, x14, x12 + uzp1 v27.4S, v18.4S, v4.4S + uzp2 v19.4S, v1.4S, v1.4S + adcs x24, x3, x10 + mul x3, x5, x21 + umull v29.2D, v23.2S, v31.2S + ldr q5, [x1] + adc x21, x16, xzr + adds x16, x14, x12 + extr x12, x19, x20, #52 + umull v18.2D, v19.2S, v24.2S + adcs x24, x24, x10 + and x10, x12, #0xfffffffffffff + ldp x14, x12, [x1, #16] + usra v17.2D, v7.2D, #32 + adc x21, x21, xzr + adds x23, x23, x17 + mul x17, x5, x10 + shl v21.2D, v29.2D, #33 + lsl x10, x3, #12 + lsr x1, x3, #52 + rev64 v29.4S, v2.4S + uaddlp v25.2D, v16.4S + add x17, x17, x1 + adcs x16, x16, x15 + extr 
x3, x14, x19, #40 + mov x15, v20.d[0] + extr x10, x17, x10, #12 + and x3, x3, #0xfffffffffffff + shl v3.2D, v25.2D, #32 + and v6.16B, v17.16B, v6.16B + mul x1, x5, x3 + usra v18.2D, v17.2D, #32 + adcs x3, x24, x4 + extr x4, x12, x14, #28 + umlal v6.2D, v19.2S, v22.2S + xtn v20.2S, v2.2D + umlal v3.2D, v26.2S, v22.2S + movi v26.2D, #0xffffffff + lsr x24, x17, #52 + and x4, x4, #0xfffffffffffff + uzp2 v19.4S, v2.4S, v2.4S + add x1, x1, x24 + mul x24, x5, x4 + lsl x4, x17, #12 + xtn v24.2S, v5.2D + extr x17, x1, x4, #24 + adc x21, x21, xzr + umlal v21.2D, v23.2S, v23.2S + adds x4, x15, x10 + lsl x10, x1, #12 + adcs x15, x7, x17 + mul v23.4S, v28.4S, v4.4S + and x7, x4, #0x1ff + lsr x17, x1, #52 + umulh x1, x19, x12 + uzp2 v17.4S, v5.4S, v5.4S + extr x4, x15, x4, #9 + add x24, x24, x17 + mul v29.4S, v29.4S, v5.4S + extr x17, x24, x10, #36 + extr x10, x9, x12, #16 + uzp1 v28.4S, v4.4S, v4.4S + adcs x17, x8, x17 + and x8, x10, #0xfffffffffffff + umull v16.2D, v24.2S, v20.2S + extr x10, x17, x15, #9 + mul x15, x5, x8 + stp x4, x10, [x0] + lsl x4, x24, #12 + lsr x8, x9, #4 + uaddlp v4.2D, v23.4S + and x8, x8, #0xfffffffffffff + umull v23.2D, v24.2S, v19.2S + mul x8, x5, x8 + extr x10, x2, x9, #56 + lsr x24, x24, #52 + and x10, x10, #0xfffffffffffff + add x15, x15, x24 + extr x4, x15, x4, #48 + mul x24, x5, x10 + lsr x10, x15, #52 + usra v23.2D, v16.2D, #32 + add x10, x8, x10 + shl v4.2D, v4.2D, #32 + adcs x22, x22, x4 + extr x4, x6, x2, #44 + lsl x15, x15, #12 + lsr x8, x10, #52 + extr x15, x10, x15, #60 + and x10, x4, #0xfffffffffffff + umlal v4.2D, v28.2S, v27.2S + add x8, x24, x8 + extr x4, x13, x6, #32 + mul x24, x5, x10 + uzp2 v16.4S, v30.4S, v30.4S + lsl x10, x15, #8 + rev64 v28.4S, v30.4S + and x15, x4, #0xfffffffffffff + extr x4, x8, x10, #8 + mul x10, x5, x15 + lsl x15, x8, #12 + adcs x23, x23, x4 + lsr x4, x8, #52 + lsr x8, x13, #20 + add x4, x24, x4 + mul x8, x5, x8 + lsr x24, x4, #52 + extr x15, x4, x15, #20 + lsl x4, x4, #12 + add x10, x10, x24 + adcs x15, 
x16, x15 + extr x4, x10, x4, #32 + umulh x5, x20, x14 + adcs x3, x3, x4 + usra v18.2D, v6.2D, #32 + lsl x16, x10, #12 + extr x24, x15, x23, #9 + lsr x10, x10, #52 + uzp2 v27.4S, v0.4S, v0.4S + add x8, x8, x10 + extr x10, x3, x15, #9 + extr x4, x22, x17, #9 + and v25.16B, v23.16B, v26.16B + lsr x17, x8, #44 + extr x15, x8, x16, #44 + extr x16, x23, x22, #9 + xtn v7.2S, v30.2D + mov x8, v4.d[0] + stp x24, x10, [x0, #32] + uaddlp v30.2D, v29.4S + stp x4, x16, [x0, #16] + umulh x24, x20, x19 + adcs x15, x21, x15 + adc x16, x11, x17 + subs x11, x20, x19 + xtn v5.2S, v0.2D + csetm x17, cc + extr x3, x15, x3, #9 + mov x22, v4.d[1] + cneg x21, x11, cc + subs x10, x12, x14 + mul v31.4S, v28.4S, v0.4S + cneg x10, x10, cc + cinv x11, x17, cc + shl v4.2D, v30.2D, #32 + umull v28.2D, v5.2S, v16.2S + extr x23, x16, x15, #9 + adds x4, x8, x5 + mul x17, x21, x10 + umull v22.2D, v5.2S, v7.2S + adc x15, x5, xzr + adds x4, x4, x22 + uaddlp v2.2D, v31.4S + lsr x5, x16, #9 + adcs x16, x15, x1 + mov x15, v18.d[0] + adc x1, x1, xzr + umulh x10, x21, x10 + adds x22, x16, x22 + umlal v4.2D, v24.2S, v20.2S + umull v30.2D, v27.2S, v16.2S + stp x3, x23, [x0, #48] + add x3, x7, x5 + adc x16, x1, xzr + usra v28.2D, v22.2D, #32 + mul x23, x20, x19 + eor x1, x17, x11 + cmn x11, #0x1 + mov x17, v18.d[1] + umull v18.2D, v17.2S, v19.2S + adcs x7, x4, x1 + eor x1, x10, x11 + umlal v25.2D, v17.2S, v20.2S + movi v16.2D, #0xffffffff + adcs x22, x22, x1 + usra v18.2D, v23.2D, #32 + umulh x4, x14, x14 + adc x1, x16, x11 + adds x10, x8, x8 + shl v23.2D, v2.2D, #32 + str x3, [x0, #64] + adcs x5, x7, x7 + and v16.16B, v28.16B, v16.16B + usra v30.2D, v28.2D, #32 + adcs x7, x22, x22 + mov x21, v3.d[1] + adcs x11, x1, x1 + umlal v16.2D, v27.2S, v7.2S + adc x22, xzr, xzr + adds x16, x15, x23 + mul x8, x14, x12 + umlal v23.2D, v5.2S, v7.2S + usra v18.2D, v25.2D, #32 + umulh x15, x14, x12 + adcs x21, x21, x24 + usra v30.2D, v16.2D, #32 + adc x1, x17, xzr + adds x3, x16, x23 + adcs x21, x21, x24 + adc x1, x1, xzr + 
adds x24, x10, x21 + umulh x21, x12, x12 + adcs x16, x5, x1 + adcs x10, x7, xzr + mov x17, v21.d[1] + adcs x23, x11, xzr + adc x5, x22, xzr + adds x1, x4, x8 + adcs x22, x17, x15 + ldp x17, x4, [x0] + mov x11, v21.d[0] + adc x21, x21, xzr + adds x1, x1, x8 + adcs x15, x22, x15 + adc x8, x21, xzr + adds x22, x11, x10 + mov x21, v3.d[0] + adcs x11, x1, x23 + ldp x1, x10, [x0, #16] + adcs x15, x15, x5 + adc x7, x8, xzr + adds x8, x17, x21 + mov x23, v4.d[1] + ldp x5, x21, [x0, #32] + adcs x17, x4, x3 + ldr x4, [x0, #64] + mov x3, v18.d[0] + adcs x24, x1, x24 + stp x8, x17, [x0] + adcs x17, x10, x16 + ldp x1, x16, [x0, #48] + adcs x5, x5, x22 + adcs x8, x21, x11 + stp x5, x8, [x0, #32] + adcs x1, x1, x15 + mov x15, v23.d[1] + adcs x21, x16, x7 + stp x1, x21, [x0, #48] + adc x10, x4, xzr + subs x7, x14, x12 + mov x16, v18.d[1] + cneg x5, x7, cc + csetm x4, cc + subs x11, x13, x6 + mov x8, v23.d[0] + cneg x7, x11, cc + cinv x21, x4, cc + mov x11, v30.d[0] + adds x4, x23, x3 + mul x22, x5, x7 + mov x23, v30.d[1] + adcs x8, x8, x16 + adcs x16, x15, x11 + adc x11, x23, xzr + umulh x3, x5, x7 + stp x24, x17, [x0, #16] + mov x5, v4.d[0] + subs x15, x20, x19 + cneg x7, x15, cc + str x10, [x0, #64] + csetm x1, cc + subs x24, x2, x9 + cneg x17, x24, cc + cinv x15, x1, cc + adds x23, x4, x5 + umulh x1, x7, x17 + adcs x24, x8, x4 + adcs x10, x16, x8 + eor x8, x22, x21 + adcs x16, x11, x16 + mul x22, x7, x17 + eor x17, x1, x15 + adc x1, xzr, x11 + adds x11, x24, x5 + eor x7, x3, x21 + adcs x3, x10, x23 + adcs x24, x16, x24 + adcs x4, x1, x10 + eor x10, x22, x15 + adcs x16, xzr, x16 + adc x1, xzr, x1 + cmn x21, #0x1 + adcs x8, x4, x8 + adcs x22, x16, x7 + adc x7, x1, x21 + subs x21, x19, x12 + csetm x4, cc + cneg x1, x21, cc + subs x21, x13, x2 + cinv x16, x4, cc + cneg x4, x21, cc + cmn x15, #0x1 + adcs x21, x23, x10 + mul x23, x1, x4 + adcs x11, x11, x17 + adcs x3, x3, x15 + umulh x1, x1, x4 + adcs x24, x24, x15 + adcs x8, x8, x15 + adcs x22, x22, x15 + eor x17, x23, x16 + adc 
x15, x7, x15 + subs x7, x20, x14 + cneg x7, x7, cc + csetm x4, cc + subs x10, x20, x12 + cneg x23, x10, cc + csetm x10, cc + subs x12, x6, x9 + cinv x20, x4, cc + cneg x12, x12, cc + cmn x16, #0x1 + eor x1, x1, x16 + adcs x17, x24, x17 + mul x4, x7, x12 + adcs x8, x8, x1 + umulh x1, x7, x12 + adcs x24, x22, x16 + adc x7, x15, x16 + subs x12, x13, x9 + cneg x12, x12, cc + cinv x13, x10, cc + subs x19, x19, x14 + mul x9, x23, x12 + cneg x19, x19, cc + csetm x10, cc + eor x16, x1, x20 + subs x22, x6, x2 + umulh x12, x23, x12 + eor x1, x4, x20 + cinv x4, x10, cc + cneg x22, x22, cc + cmn x20, #0x1 + adcs x15, x11, x1 + eor x6, x12, x13 + adcs x10, x3, x16 + adcs x17, x17, x20 + eor x23, x9, x13 + adcs x2, x8, x20 + mul x11, x19, x22 + adcs x24, x24, x20 + adc x7, x7, x20 + cmn x13, #0x1 + adcs x3, x10, x23 + umulh x22, x19, x22 + adcs x17, x17, x6 + eor x12, x22, x4 + extr x22, x15, x21, #63 + adcs x8, x2, x13 + extr x21, x21, x5, #63 + ldp x16, x23, [x0] + adcs x20, x24, x13 + eor x1, x11, x4 + adc x6, x7, x13 + cmn x4, #0x1 + ldp x2, x7, [x0, #16] + adcs x1, x3, x1 + extr x19, x1, x15, #63 + adcs x14, x17, x12 + extr x1, x14, x1, #63 + lsl x17, x5, #1 + adcs x8, x8, x4 + extr x12, x8, x14, #8 + ldp x15, x11, [x0, #32] + adcs x9, x20, x4 + adc x3, x6, x4 + adds x16, x12, x16 + extr x6, x9, x8, #8 + ldp x14, x12, [x0, #48] + extr x8, x3, x9, #8 + adcs x20, x6, x23 + ldr x24, [x0, #64] + lsr x6, x3, #8 + adcs x8, x8, x2 + and x2, x1, #0x1ff + and x1, x20, x8 + adcs x4, x6, x7 + adcs x3, x17, x15 + and x1, x1, x4 + adcs x9, x21, x11 + and x1, x1, x3 + adcs x6, x22, x14 + and x1, x1, x9 + and x21, x1, x6 + adcs x14, x19, x12 + adc x1, x24, x2 + cmp xzr, xzr + orr x12, x1, #0xfffffffffffffe00 + lsr x1, x1, #9 + adcs xzr, x16, x1 + and x21, x21, x14 + adcs xzr, x21, xzr + adcs xzr, x12, xzr + adcs x21, x16, x1 + adcs x1, x20, xzr + adcs x19, x8, xzr + stp x21, x1, [x0] + adcs x1, x4, xzr + adcs x21, x3, xzr + stp x19, x1, [x0, #16] + adcs x1, x9, xzr + stp x21, x1, [x0, 
#32] + adcs x21, x6, xzr + adcs x1, x14, xzr + stp x21, x1, [x0, #48] + adc x1, x12, xzr + and x1, x1, #0x1ff + str x1, [x0, #64] + +// Restore regs and return + + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/arm/p521/p521_jadd.S b/third_party/s2n-bignum/arm/p521/p521_jadd.S index 1d6b196c8c..340766e6a2 100644 --- a/third_party/s2n-bignum/arm/p521/p521_jadd.S +++ b/third_party/s2n-bignum/arm/p521/p521_jadd.S @@ -53,6 +53,7 @@ #define z1sq sp, #(NUMSIZE*0) #define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) #define yd sp, #(NUMSIZE*1) #define y2a sp, #(NUMSIZE*1) @@ -66,664 +67,38 @@ #define t2 sp, #(NUMSIZE*4) #define x1a sp, #(NUMSIZE*4) #define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) #define xd sp, #(NUMSIZE*5) #define z2sq sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) -#define y1a sp, #(NUMSIZE*6) +#define tmp sp, #(NUMSIZE*6) -// NUMSIZE*7 is not 16-aligned so we round it up +#define y1a sp, #(NUMSIZE*7) -#define NSPACE (NUMSIZE*7+8) +#define NSPACE (NUMSIZE*8) -// Corresponds exactly to bignum_mul_p521_alt +// For the three field operations, we use subroutines not inlining. 
+// Call local code very close to bignum_mul_p521 and bignum_sqr_p521 +// and bignum_sub_p521 #define mul_p521(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - mul x15, x3, x5; \ - umulh x16, x3, x5; \ - mul x14, x3, x6; \ - umulh x17, x3, x6; \ - adds x16, x16, x14; \ - ldp x7, x8, [P2+16]; \ - mul x14, x3, x7; \ - umulh x19, x3, x7; \ - adcs x17, x17, x14; \ - mul x14, x3, x8; \ - umulh x20, x3, x8; \ - adcs x19, x19, x14; \ - ldp x9, x10, [P2+32]; \ - mul x14, x3, x9; \ - umulh x21, x3, x9; \ - adcs x20, x20, x14; \ - mul x14, x3, x10; \ - umulh x22, x3, x10; \ - adcs x21, x21, x14; \ - ldp x11, x12, [P2+48]; \ - mul x14, x3, x11; \ - umulh x23, x3, x11; \ - adcs x22, x22, x14; \ - ldr x13, [P2+64]; \ - mul x14, x3, x12; \ - umulh x24, x3, x12; \ - adcs x23, x23, x14; \ - mul x14, x3, x13; \ - umulh x1, x3, x13; \ - adcs x24, x24, x14; \ - adc x1, x1, xzr; \ - mul x14, x4, x5; \ - adds x16, x16, x14; \ - mul x14, x4, x6; \ - adcs x17, x17, x14; \ - mul x14, x4, x7; \ - adcs x19, x19, x14; \ - mul x14, x4, x8; \ - adcs x20, x20, x14; \ - mul x14, x4, x9; \ - adcs x21, x21, x14; \ - mul x14, x4, x10; \ - adcs x22, x22, x14; \ - mul x14, x4, x11; \ - adcs x23, x23, x14; \ - mul x14, x4, x12; \ - adcs x24, x24, x14; \ - mul x14, x4, x13; \ - adcs x1, x1, x14; \ - cset x0, hs; \ - umulh x14, x4, x5; \ - adds x17, x17, x14; \ - umulh x14, x4, x6; \ - adcs x19, x19, x14; \ - umulh x14, x4, x7; \ - adcs x20, x20, x14; \ - umulh x14, x4, x8; \ - adcs x21, x21, x14; \ - umulh x14, x4, x9; \ - adcs x22, x22, x14; \ - umulh x14, x4, x10; \ - adcs x23, x23, x14; \ - umulh x14, x4, x11; \ - adcs x24, x24, x14; \ - umulh x14, x4, x12; \ - adcs x1, x1, x14; \ - umulh x14, x4, x13; \ - adc x0, x0, x14; \ - stp x15, x16, [P0]; \ - ldp x3, x4, [P1+16]; \ - mul x14, x3, x5; \ - adds x17, x17, x14; \ - mul x14, x3, x6; \ - adcs x19, x19, x14; \ - mul x14, x3, x7; \ - adcs x20, x20, x14; \ - mul x14, x3, x8; \ - adcs x21, x21, x14; \ - mul x14, x3, x9; \ - adcs x22, x22, 
x14; \ - mul x14, x3, x10; \ - adcs x23, x23, x14; \ - mul x14, x3, x11; \ - adcs x24, x24, x14; \ - mul x14, x3, x12; \ - adcs x1, x1, x14; \ - mul x14, x3, x13; \ - adcs x0, x0, x14; \ - cset x15, hs; \ - umulh x14, x3, x5; \ - adds x19, x19, x14; \ - umulh x14, x3, x6; \ - adcs x20, x20, x14; \ - umulh x14, x3, x7; \ - adcs x21, x21, x14; \ - umulh x14, x3, x8; \ - adcs x22, x22, x14; \ - umulh x14, x3, x9; \ - adcs x23, x23, x14; \ - umulh x14, x3, x10; \ - adcs x24, x24, x14; \ - umulh x14, x3, x11; \ - adcs x1, x1, x14; \ - umulh x14, x3, x12; \ - adcs x0, x0, x14; \ - umulh x14, x3, x13; \ - adc x15, x15, x14; \ - mul x14, x4, x5; \ - adds x19, x19, x14; \ - mul x14, x4, x6; \ - adcs x20, x20, x14; \ - mul x14, x4, x7; \ - adcs x21, x21, x14; \ - mul x14, x4, x8; \ - adcs x22, x22, x14; \ - mul x14, x4, x9; \ - adcs x23, x23, x14; \ - mul x14, x4, x10; \ - adcs x24, x24, x14; \ - mul x14, x4, x11; \ - adcs x1, x1, x14; \ - mul x14, x4, x12; \ - adcs x0, x0, x14; \ - mul x14, x4, x13; \ - adcs x15, x15, x14; \ - cset x16, hs; \ - umulh x14, x4, x5; \ - adds x20, x20, x14; \ - umulh x14, x4, x6; \ - adcs x21, x21, x14; \ - umulh x14, x4, x7; \ - adcs x22, x22, x14; \ - umulh x14, x4, x8; \ - adcs x23, x23, x14; \ - umulh x14, x4, x9; \ - adcs x24, x24, x14; \ - umulh x14, x4, x10; \ - adcs x1, x1, x14; \ - umulh x14, x4, x11; \ - adcs x0, x0, x14; \ - umulh x14, x4, x12; \ - adcs x15, x15, x14; \ - umulh x14, x4, x13; \ - adc x16, x16, x14; \ - stp x17, x19, [P0+16]; \ - ldp x3, x4, [P1+32]; \ - mul x14, x3, x5; \ - adds x20, x20, x14; \ - mul x14, x3, x6; \ - adcs x21, x21, x14; \ - mul x14, x3, x7; \ - adcs x22, x22, x14; \ - mul x14, x3, x8; \ - adcs x23, x23, x14; \ - mul x14, x3, x9; \ - adcs x24, x24, x14; \ - mul x14, x3, x10; \ - adcs x1, x1, x14; \ - mul x14, x3, x11; \ - adcs x0, x0, x14; \ - mul x14, x3, x12; \ - adcs x15, x15, x14; \ - mul x14, x3, x13; \ - adcs x16, x16, x14; \ - cset x17, hs; \ - umulh x14, x3, x5; \ - adds x21, x21, x14; \ - 
umulh x14, x3, x6; \ - adcs x22, x22, x14; \ - umulh x14, x3, x7; \ - adcs x23, x23, x14; \ - umulh x14, x3, x8; \ - adcs x24, x24, x14; \ - umulh x14, x3, x9; \ - adcs x1, x1, x14; \ - umulh x14, x3, x10; \ - adcs x0, x0, x14; \ - umulh x14, x3, x11; \ - adcs x15, x15, x14; \ - umulh x14, x3, x12; \ - adcs x16, x16, x14; \ - umulh x14, x3, x13; \ - adc x17, x17, x14; \ - mul x14, x4, x5; \ - adds x21, x21, x14; \ - mul x14, x4, x6; \ - adcs x22, x22, x14; \ - mul x14, x4, x7; \ - adcs x23, x23, x14; \ - mul x14, x4, x8; \ - adcs x24, x24, x14; \ - mul x14, x4, x9; \ - adcs x1, x1, x14; \ - mul x14, x4, x10; \ - adcs x0, x0, x14; \ - mul x14, x4, x11; \ - adcs x15, x15, x14; \ - mul x14, x4, x12; \ - adcs x16, x16, x14; \ - mul x14, x4, x13; \ - adcs x17, x17, x14; \ - cset x19, hs; \ - umulh x14, x4, x5; \ - adds x22, x22, x14; \ - umulh x14, x4, x6; \ - adcs x23, x23, x14; \ - umulh x14, x4, x7; \ - adcs x24, x24, x14; \ - umulh x14, x4, x8; \ - adcs x1, x1, x14; \ - umulh x14, x4, x9; \ - adcs x0, x0, x14; \ - umulh x14, x4, x10; \ - adcs x15, x15, x14; \ - umulh x14, x4, x11; \ - adcs x16, x16, x14; \ - umulh x14, x4, x12; \ - adcs x17, x17, x14; \ - umulh x14, x4, x13; \ - adc x19, x19, x14; \ - stp x20, x21, [P0+32]; \ - ldp x3, x4, [P1+48]; \ - mul x14, x3, x5; \ - adds x22, x22, x14; \ - mul x14, x3, x6; \ - adcs x23, x23, x14; \ - mul x14, x3, x7; \ - adcs x24, x24, x14; \ - mul x14, x3, x8; \ - adcs x1, x1, x14; \ - mul x14, x3, x9; \ - adcs x0, x0, x14; \ - mul x14, x3, x10; \ - adcs x15, x15, x14; \ - mul x14, x3, x11; \ - adcs x16, x16, x14; \ - mul x14, x3, x12; \ - adcs x17, x17, x14; \ - mul x14, x3, x13; \ - adcs x19, x19, x14; \ - cset x20, hs; \ - umulh x14, x3, x5; \ - adds x23, x23, x14; \ - umulh x14, x3, x6; \ - adcs x24, x24, x14; \ - umulh x14, x3, x7; \ - adcs x1, x1, x14; \ - umulh x14, x3, x8; \ - adcs x0, x0, x14; \ - umulh x14, x3, x9; \ - adcs x15, x15, x14; \ - umulh x14, x3, x10; \ - adcs x16, x16, x14; \ - umulh x14, x3, x11; \ - 
adcs x17, x17, x14; \ - umulh x14, x3, x12; \ - adcs x19, x19, x14; \ - umulh x14, x3, x13; \ - adc x20, x20, x14; \ - mul x14, x4, x5; \ - adds x23, x23, x14; \ - mul x14, x4, x6; \ - adcs x24, x24, x14; \ - mul x14, x4, x7; \ - adcs x1, x1, x14; \ - mul x14, x4, x8; \ - adcs x0, x0, x14; \ - mul x14, x4, x9; \ - adcs x15, x15, x14; \ - mul x14, x4, x10; \ - adcs x16, x16, x14; \ - mul x14, x4, x11; \ - adcs x17, x17, x14; \ - mul x14, x4, x12; \ - adcs x19, x19, x14; \ - mul x14, x4, x13; \ - adcs x20, x20, x14; \ - cset x21, hs; \ - umulh x14, x4, x5; \ - adds x24, x24, x14; \ - umulh x14, x4, x6; \ - adcs x1, x1, x14; \ - umulh x14, x4, x7; \ - adcs x0, x0, x14; \ - umulh x14, x4, x8; \ - adcs x15, x15, x14; \ - umulh x14, x4, x9; \ - adcs x16, x16, x14; \ - umulh x14, x4, x10; \ - adcs x17, x17, x14; \ - umulh x14, x4, x11; \ - adcs x19, x19, x14; \ - umulh x14, x4, x12; \ - adcs x20, x20, x14; \ - umulh x14, x4, x13; \ - adc x21, x21, x14; \ - stp x22, x23, [P0+48]; \ - ldr x3, [P1+64]; \ - mul x14, x3, x5; \ - adds x24, x24, x14; \ - mul x14, x3, x6; \ - adcs x1, x1, x14; \ - mul x14, x3, x7; \ - adcs x0, x0, x14; \ - mul x14, x3, x8; \ - adcs x15, x15, x14; \ - mul x14, x3, x9; \ - adcs x16, x16, x14; \ - mul x14, x3, x10; \ - adcs x17, x17, x14; \ - mul x14, x3, x11; \ - adcs x19, x19, x14; \ - mul x14, x3, x12; \ - adcs x20, x20, x14; \ - mul x14, x3, x13; \ - adc x21, x21, x14; \ - umulh x14, x3, x5; \ - adds x1, x1, x14; \ - umulh x14, x3, x6; \ - adcs x0, x0, x14; \ - umulh x14, x3, x7; \ - adcs x15, x15, x14; \ - umulh x14, x3, x8; \ - adcs x16, x16, x14; \ - umulh x14, x3, x9; \ - adcs x17, x17, x14; \ - umulh x14, x3, x10; \ - adcs x19, x19, x14; \ - umulh x14, x3, x11; \ - adcs x20, x20, x14; \ - umulh x14, x3, x12; \ - adc x21, x21, x14; \ - cmp xzr, xzr; \ - ldp x5, x6, [P0]; \ - extr x14, x1, x24, #9; \ - adcs x5, x5, x14; \ - extr x14, x0, x1, #9; \ - adcs x6, x6, x14; \ - ldp x7, x8, [P0+16]; \ - extr x14, x15, x0, #9; \ - adcs x7, x7, x14; \ 
- extr x14, x16, x15, #9; \ - adcs x8, x8, x14; \ - ldp x9, x10, [P0+32]; \ - extr x14, x17, x16, #9; \ - adcs x9, x9, x14; \ - extr x14, x19, x17, #9; \ - adcs x10, x10, x14; \ - ldp x11, x12, [P0+48]; \ - extr x14, x20, x19, #9; \ - adcs x11, x11, x14; \ - extr x14, x21, x20, #9; \ - adcs x12, x12, x14; \ - orr x13, x24, #0xfffffffffffffe00; \ - lsr x14, x21, #9; \ - adcs x13, x13, x14; \ - sbcs x5, x5, xzr; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - and x13, x13, #0x1ff; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ - str x13, [P0+64] - -// Corresponds exactly to bignum_sqr_p521_alt + add x0, P0; \ + add x1, P1; \ + add x2, P2; \ + bl local_mul_p521 #define sqr_p521(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x11, x2, x3; \ - umulh x12, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x10, x2, x4; \ - umulh x13, x2, x4; \ - adds x12, x12, x10; \ - ldp x6, x7, [P1+32]; \ - mul x10, x2, x5; \ - umulh x14, x2, x5; \ - adcs x13, x13, x10; \ - ldp x8, x9, [P1+48]; \ - mul x10, x2, x6; \ - umulh x15, x2, x6; \ - adcs x14, x14, x10; \ - mul x10, x2, x7; \ - umulh x16, x2, x7; \ - adcs x15, x15, x10; \ - mul x10, x2, x8; \ - umulh x17, x2, x8; \ - adcs x16, x16, x10; \ - mul x10, x2, x9; \ - umulh x19, x2, x9; \ - adcs x17, x17, x10; \ - adc x19, x19, xzr; \ - mul x10, x3, x4; \ - adds x13, x13, x10; \ - mul x10, x3, x5; \ - adcs x14, x14, x10; \ - mul x10, x3, x6; \ - adcs x15, x15, x10; \ - mul x10, x3, x7; \ - adcs x16, x16, x10; \ - mul x10, x3, x8; \ - adcs x17, x17, x10; \ - mul x10, x3, x9; \ - adcs x19, x19, x10; \ - cset x20, hs; \ - umulh x10, x3, x4; \ - adds x14, x14, x10; \ - umulh x10, x3, x5; \ - adcs x15, x15, x10; \ - umulh x10, x3, x6; \ - adcs x16, x16, x10; \ - umulh x10, x3, x7; \ - adcs x17, x17, x10; \ - umulh x10, x3, x8; \ - adcs x19, x19, x10; \ - umulh x10, x3, 
x9; \ - adc x20, x20, x10; \ - mul x10, x6, x7; \ - umulh x21, x6, x7; \ - adds x20, x20, x10; \ - adc x21, x21, xzr; \ - mul x10, x4, x5; \ - adds x15, x15, x10; \ - mul x10, x4, x6; \ - adcs x16, x16, x10; \ - mul x10, x4, x7; \ - adcs x17, x17, x10; \ - mul x10, x4, x8; \ - adcs x19, x19, x10; \ - mul x10, x4, x9; \ - adcs x20, x20, x10; \ - mul x10, x6, x8; \ - adcs x21, x21, x10; \ - cset x22, hs; \ - umulh x10, x4, x5; \ - adds x16, x16, x10; \ - umulh x10, x4, x6; \ - adcs x17, x17, x10; \ - umulh x10, x4, x7; \ - adcs x19, x19, x10; \ - umulh x10, x4, x8; \ - adcs x20, x20, x10; \ - umulh x10, x4, x9; \ - adcs x21, x21, x10; \ - umulh x10, x6, x8; \ - adc x22, x22, x10; \ - mul x10, x7, x8; \ - umulh x23, x7, x8; \ - adds x22, x22, x10; \ - adc x23, x23, xzr; \ - mul x10, x5, x6; \ - adds x17, x17, x10; \ - mul x10, x5, x7; \ - adcs x19, x19, x10; \ - mul x10, x5, x8; \ - adcs x20, x20, x10; \ - mul x10, x5, x9; \ - adcs x21, x21, x10; \ - mul x10, x6, x9; \ - adcs x22, x22, x10; \ - mul x10, x7, x9; \ - adcs x23, x23, x10; \ - cset x24, hs; \ - umulh x10, x5, x6; \ - adds x19, x19, x10; \ - umulh x10, x5, x7; \ - adcs x20, x20, x10; \ - umulh x10, x5, x8; \ - adcs x21, x21, x10; \ - umulh x10, x5, x9; \ - adcs x22, x22, x10; \ - umulh x10, x6, x9; \ - adcs x23, x23, x10; \ - umulh x10, x7, x9; \ - adc x24, x24, x10; \ - mul x10, x8, x9; \ - umulh x25, x8, x9; \ - adds x24, x24, x10; \ - adc x25, x25, xzr; \ - adds x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - adcs x15, x15, x15; \ - adcs x16, x16, x16; \ - adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - adcs x20, x20, x20; \ - adcs x21, x21, x21; \ - adcs x22, x22, x22; \ - adcs x23, x23, x23; \ - adcs x24, x24, x24; \ - adcs x25, x25, x25; \ - cset x0, hs; \ - umulh x10, x2, x2; \ - adds x11, x11, x10; \ - mul x10, x3, x3; \ - adcs x12, x12, x10; \ - umulh x10, x3, x3; \ - adcs x13, x13, x10; \ - mul x10, x4, x4; \ - adcs x14, x14, x10; \ - umulh x10, x4, x4; \ 
- adcs x15, x15, x10; \ - mul x10, x5, x5; \ - adcs x16, x16, x10; \ - umulh x10, x5, x5; \ - adcs x17, x17, x10; \ - mul x10, x6, x6; \ - adcs x19, x19, x10; \ - umulh x10, x6, x6; \ - adcs x20, x20, x10; \ - mul x10, x7, x7; \ - adcs x21, x21, x10; \ - umulh x10, x7, x7; \ - adcs x22, x22, x10; \ - mul x10, x8, x8; \ - adcs x23, x23, x10; \ - umulh x10, x8, x8; \ - adcs x24, x24, x10; \ - mul x10, x9, x9; \ - adcs x25, x25, x10; \ - umulh x10, x9, x9; \ - adc x0, x0, x10; \ - ldr x1, [P1+64]; \ - add x1, x1, x1; \ - mul x10, x1, x2; \ - adds x19, x19, x10; \ - umulh x10, x1, x2; \ - adcs x20, x20, x10; \ - mul x10, x1, x4; \ - adcs x21, x21, x10; \ - umulh x10, x1, x4; \ - adcs x22, x22, x10; \ - mul x10, x1, x6; \ - adcs x23, x23, x10; \ - umulh x10, x1, x6; \ - adcs x24, x24, x10; \ - mul x10, x1, x8; \ - adcs x25, x25, x10; \ - umulh x10, x1, x8; \ - adcs x0, x0, x10; \ - lsr x4, x1, #1; \ - mul x4, x4, x4; \ - adc x4, x4, xzr; \ - mul x10, x1, x3; \ - adds x20, x20, x10; \ - umulh x10, x1, x3; \ - adcs x21, x21, x10; \ - mul x10, x1, x5; \ - adcs x22, x22, x10; \ - umulh x10, x1, x5; \ - adcs x23, x23, x10; \ - mul x10, x1, x7; \ - adcs x24, x24, x10; \ - umulh x10, x1, x7; \ - adcs x25, x25, x10; \ - mul x10, x1, x9; \ - adcs x0, x0, x10; \ - umulh x10, x1, x9; \ - adc x4, x4, x10; \ - mul x2, x2, x2; \ - cmp xzr, xzr; \ - extr x10, x20, x19, #9; \ - adcs x2, x2, x10; \ - extr x10, x21, x20, #9; \ - adcs x11, x11, x10; \ - extr x10, x22, x21, #9; \ - adcs x12, x12, x10; \ - extr x10, x23, x22, #9; \ - adcs x13, x13, x10; \ - extr x10, x24, x23, #9; \ - adcs x14, x14, x10; \ - extr x10, x25, x24, #9; \ - adcs x15, x15, x10; \ - extr x10, x0, x25, #9; \ - adcs x16, x16, x10; \ - extr x10, x4, x0, #9; \ - adcs x17, x17, x10; \ - orr x19, x19, #0xfffffffffffffe00; \ - lsr x10, x4, #9; \ - adcs x19, x19, x10; \ - sbcs x2, x2, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbcs x15, x15, xzr; \ - sbcs x16, 
x16, xzr; \ - sbcs x17, x17, xzr; \ - sbc x19, x19, xzr; \ - and x19, x19, #0x1ff; \ - stp x2, x11, [P0]; \ - stp x12, x13, [P0+16]; \ - stp x14, x15, [P0+32]; \ - stp x16, x17, [P0+48]; \ - str x19, [P0+64] - -// Corresponds exactly to bignum_sub_p521 + add x0, P0; \ + add x1, P1; \ + bl local_sqr_p521 #define sub_p521(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - ldp x11, x12, [P1+48]; \ - ldp x4, x3, [P2+48]; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ - ldr x13, [P1+64]; \ - ldr x4, [P2+64]; \ - sbcs x13, x13, x4; \ - sbcs x5, x5, xzr; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - and x13, x13, #0x1ff; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ - str x13, [P0+64] + add x0, P0; \ + add x1, P1; \ + add x2, P2; \ + bl local_sub_p521 S2N_BN_SYMBOL(p521_jadd): @@ -734,6 +109,7 @@ S2N_BN_SYMBOL(p521_jadd): stp x23, x24, [sp, #-16]! stp x25, x26, [sp, #-16]! stp x27, x28, [sp, #-16]! + stp x29, x30, [sp, #-16]! 
sub sp, sp, NSPACE // Move the input arguments to stable places @@ -764,31 +140,1296 @@ S2N_BN_SYMBOL(p521_jadd): mul_p521(zzx1,zz,x1a) mul_p521(zzx2,zz,x2a) - sub_p521(x_3,ww,zzx1) + sub_p521(resx,ww,zzx1) sub_p521(t1,zzx2,zzx1) mul_p521(xd,xd,z_1) - sub_p521(x_3,x_3,zzx2) + sub_p521(resx,resx,zzx2) - sub_p521(t2,zzx1,x_3) + sub_p521(t2,zzx1,resx) mul_p521(t1,t1,y1a) - mul_p521(z_3,xd,z_2) + mul_p521(resz,xd,z_2) mul_p521(t2,yd,t2) - sub_p521(y_3,t2,t1) + sub_p521(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "HI" <=> CF /\ ~ZF <=> P1 = 0 /\ ~(P2 = 0) +// and "LO" <=> ~CF <=> ~(P1 = 0) /\ P2 = 0 +// Multiplex the z outputs accordingly and re-store in resz + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + ldp x4, x5, [z_1+32] + ldp x6, x7, [z_1+48] + ldr x8, [z_1+64] + + orr x20, x0, x1 + orr x21, x2, x3 + orr x22, x4, x5 + orr x23, x6, x7 + orr x20, x20, x21 + orr x22, x22, x23 + orr x20, x20, x8 + orr x20, x20, x22 + cmp x20, xzr + cset x20, ne + + ldp x10, x11, [z_2] + ldp x12, x13, [z_2+16] + ldp x14, x15, [z_2+32] + ldp x16, x17, [z_2+48] + ldr x19, [z_2+64] + + orr x21, x10, x11 + orr x22, x12, x13 + orr x23, x14, x15 + orr x24, x16, x17 + orr x21, x21, x22 + orr x23, x23, x24 + orr x21, x21, x19 + orr x21, x21, x23 + + csel x0, x0, x10, ne + csel x1, x1, x11, ne + csel x2, x2, x12, ne + csel x3, x3, x13, ne + csel x4, x4, x14, ne + csel x5, x5, x15, ne + csel x6, x6, x16, ne + csel x7, x7, x17, ne + csel x8, x8, x19, ne + + cmp x21, xzr + cset x21, ne + + cmp x21, x20 + + ldp x10, x11, [resz] + ldp x12, x13, [resz+16] + ldp x14, x15, [resz+32] + ldp x16, x17, [resz+48] + ldr x19, [resz+64] + + csel x0, x0, x10, ne + csel x1, x1, x11, ne + csel x2, x2, x12, ne + csel x3, x3, x13, ne + csel x4, x4, x14, ne + csel x5, x5, x15, ne + csel x6, x6, x16, ne + csel x7, x7, x17, ne + csel x8, x8, x19, ne + + stp x0, x1, [resz] + stp x2, x3, [resz+16] + 
stp x4, x5, [resz+32] + stp x6, x7, [resz+48] + str x8, [resz+64] + +// Multiplex the x and y outputs too, keeping the results in registers + + ldp x20, x21, [x_1] + ldp x0, x1, [resx] + csel x0, x20, x0, lo + csel x1, x21, x1, lo + ldp x20, x21, [x_2] + csel x0, x20, x0, hi + csel x1, x21, x1, hi + + ldp x20, x21, [x_1+16] + ldp x2, x3, [resx+16] + csel x2, x20, x2, lo + csel x3, x21, x3, lo + ldp x20, x21, [x_2+16] + csel x2, x20, x2, hi + csel x3, x21, x3, hi + + ldp x20, x21, [x_1+32] + ldp x4, x5, [resx+32] + csel x4, x20, x4, lo + csel x5, x21, x5, lo + ldp x20, x21, [x_2+32] + csel x4, x20, x4, hi + csel x5, x21, x5, hi + + ldp x20, x21, [x_1+48] + ldp x6, x7, [resx+48] + csel x6, x20, x6, lo + csel x7, x21, x7, lo + ldp x20, x21, [x_2+48] + csel x6, x20, x6, hi + csel x7, x21, x7, hi + + ldr x20, [x_1+64] + ldr x8, [resx+64] + csel x8, x20, x8, lo + ldr x21, [x_2+64] + csel x8, x21, x8, hi + + + ldp x20, x21, [y_1] + ldp x10, x11, [resy] + csel x10, x20, x10, lo + csel x11, x21, x11, lo + ldp x20, x21, [y_2] + csel x10, x20, x10, hi + csel x11, x21, x11, hi + + ldp x20, x21, [y_1+16] + ldp x12, x13, [resy+16] + csel x12, x20, x12, lo + csel x13, x21, x13, lo + ldp x20, x21, [y_2+16] + csel x12, x20, x12, hi + csel x13, x21, x13, hi + + ldp x20, x21, [y_1+32] + ldp x14, x15, [resy+32] + csel x14, x20, x14, lo + csel x15, x21, x15, lo + ldp x20, x21, [y_2+32] + csel x14, x20, x14, hi + csel x15, x21, x15, hi + + ldp x20, x21, [y_1+48] + ldp x16, x17, [resy+48] + csel x16, x20, x16, lo + csel x17, x21, x17, lo + ldp x20, x21, [y_2+48] + csel x16, x20, x16, hi + csel x17, x21, x17, hi + + ldr x20, [y_1+64] + ldr x19, [resy+64] + csel x19, x20, x19, lo + ldr x21, [y_2+64] + csel x19, x21, x19, hi + +// Finally store back the multiplexed values + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [x_3+32] + stp x6, x7, [x_3+48] + str x8, [x_3+64] + + ldp x0, x1, [resz] + ldp x2, x3, [resz+16] + ldp x4, x5, [resz+32] + ldp x6, x7, [resz+48] + ldr x8, 
[resz+64] + + stp x10, x11, [y_3] + stp x12, x13, [y_3+16] + stp x14, x15, [y_3+32] + stp x16, x17, [y_3+48] + str x19, [y_3+64] + + stp x0, x1, [z_3] + stp x2, x3, [z_3+16] + stp x4, x5, [z_3+32] + stp x6, x7, [z_3+48] + str x8, [z_3+64] // Restore stack and registers add sp, sp, NSPACE + ldp x29, x30, [sp], 16 ldp x27, x28, [sp], 16 ldp x25, x26, [sp], 16 ldp x23, x24, [sp], 16 ldp x21, x22, [sp], 16 ldp x19, x20, [sp], 16 + ret + +// Local versions of the three field operations, almost identical to +// bignum_mul_p521, bignum_sqr_p521 and bignum_sub_p521 except for +// avoiding all intial register save-restore, and in the case of +// local_mul_p521, using the tmp buffer as temporary storage and +// avoiding x26. + +local_mul_p521: + ldp x3, x4, [x1] + ldp x5, x6, [x1, #16] + ldp x7, x8, [x2] + ldp x9, x10, [x2, #16] + mul x11, x3, x7 + mul x15, x4, x8 + mul x16, x5, x9 + mul x17, x6, x10 + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + subs x24, x5, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x9 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x16, x16, x23 + eor x22, x22, x21 + adcs x17, x17, x22 + adc x19, x19, x21 + subs x24, x3, x4 + cneg x24, x24, lo + csetm x21, lo + subs x22, x8, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x12, x12, x23 + eor x22, x22, x21 + adcs x13, x13, x22 + adcs x14, x14, x21 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, 
x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x15, x15, x23 + eor x22, x22, x21 + adcs x16, x16, x22 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x13, x13, x23 + eor x22, x22, x21 + adcs x14, x14, x22 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + lsl x21, x11, #9 + extr x11, x12, x11, #55 + extr x12, x13, x12, #55 + extr x13, x14, x13, #55 + lsr x14, x14, #55 + ldp x3, x4, [x1, #32] + ldp x5, x6, [x1, #48] + ldp x7, x8, [x2, #32] + ldp x9, x10, [x2, #48] + stp x15, x16, [tmp] + stp x17, x19, [tmp+16] + stp x21, x11, [tmp+32] + stp x12, x13, [tmp+48] + str x14, [tmp+64] + mul x11, x3, x7 + mul x15, x4, x8 + mul x16, x5, x9 + mul x17, x6, x10 + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + subs x24, x5, 
x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x9 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x16, x16, x23 + eor x22, x22, x21 + adcs x17, x17, x22 + adc x19, x19, x21 + subs x24, x3, x4 + cneg x24, x24, lo + csetm x21, lo + subs x22, x8, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x12, x12, x23 + eor x22, x22, x21 + adcs x13, x13, x22 + adcs x14, x14, x21 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x15, x15, x23 + eor x22, x22, x21 + adcs x16, x16, x22 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x13, x13, x23 + eor x22, x22, x21 + adcs x14, x14, x22 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + ldp x23, x22, [tmp] + adds x11, x11, x23 + adcs x12, x12, x22 + stp x11, x12, [tmp] + ldp x23, x22, [tmp+16] + adcs x13, x13, 
x23 + adcs x14, x14, x22 + stp x13, x14, [tmp+16] + ldp x23, x22, [tmp+32] + adcs x15, x15, x23 + adcs x16, x16, x22 + stp x15, x16, [tmp+32] + ldp x23, x22, [tmp+48] + adcs x17, x17, x23 + adcs x19, x19, x22 + stp x17, x19, [tmp+48] + ldr x21, [tmp+64] + adc x21, x21, xzr + str x21, [tmp+64] + ldp x23, x22, [x1] + subs x3, x3, x23 + sbcs x4, x4, x22 + ldp x23, x22, [x1, #16] + sbcs x5, x5, x23 + sbcs x6, x6, x22 + csetm x24, lo + ldp x23, x22, [x2] + subs x7, x23, x7 + sbcs x8, x22, x8 + ldp x23, x22, [x2, #16] + sbcs x9, x23, x9 + sbcs x10, x22, x10 + csetm x25, lo + eor x3, x3, x24 + subs x3, x3, x24 + eor x4, x4, x24 + sbcs x4, x4, x24 + eor x5, x5, x24 + sbcs x5, x5, x24 + eor x6, x6, x24 + sbc x6, x6, x24 + eor x7, x7, x25 + subs x7, x7, x25 + eor x8, x8, x25 + sbcs x8, x8, x25 + eor x9, x9, x25 + sbcs x9, x9, x25 + eor x10, x10, x25 + sbc x10, x10, x25 + eor x25, x25, x24 + mul x11, x3, x7 + mul x15, x4, x8 + mul x16, x5, x9 + mul x17, x6, x10 + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + subs x24, x5, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x9 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x16, x16, x23 + eor x22, x22, x21 + adcs x17, x17, x22 + adc x19, x19, x21 + subs x24, x3, x4 + cneg x24, x24, lo + csetm x21, lo + subs x22, x8, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x12, x12, x23 + eor x22, x22, x21 + adcs x13, x13, x22 + adcs x14, x14, x21 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, 
x4, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x15, x15, x23 + eor x22, x22, x21 + adcs x16, x16, x22 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x13, x13, x23 + eor x22, x22, x21 + adcs x14, x14, x22 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + ldp x3, x4, [tmp] + ldp x5, x6, [tmp+16] + eor x11, x11, x25 + adds x11, x11, x3 + eor x12, x12, x25 + adcs x12, x12, x4 + eor x13, x13, x25 + adcs x13, x13, x5 + eor x14, x14, x25 + adcs x14, x14, x6 + eor x15, x15, x25 + ldp x7, x8, [tmp+32] + ldp x9, x10, [tmp+48] + ldr x20, [tmp+64] + adcs x15, x15, x7 + eor x16, x16, x25 + adcs x16, x16, x8 + eor x17, x17, x25 + adcs x17, x17, x9 + eor x19, x19, x25 + adcs x19, x19, x10 + adc x21, x20, xzr + adds x15, x15, x3 + adcs x16, x16, x4 + adcs x17, x17, x5 + adcs x19, x19, x6 + and x25, x25, #0x1ff + lsl x24, x11, #9 + orr x24, x24, x25 + adcs x7, x7, x24 + extr x24, x12, x11, #55 + adcs x8, x8, x24 + extr x24, x13, x12, #55 + adcs x9, x9, x24 + extr x24, x14, x13, #55 + adcs x10, x10, x24 + 
lsr x24, x14, #55 + adc x20, x24, x20 + ldr x6, [x2, #64] + ldp x3, x4, [x1] + and x23, x3, #0xfffffffffffff + mul x23, x6, x23 + ldr x14, [x1, #64] + ldp x11, x12, [x2] + and x24, x11, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 + extr x24, x4, x3, #52 + and x24, x24, #0xfffffffffffff + mul x22, x6, x24 + extr x24, x12, x11, #52 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x24, x22, x23, #12 + adds x15, x15, x24 + ldp x5, x3, [x1, #16] + ldp x13, x11, [x2, #16] + extr x24, x5, x4, #40 + and x24, x24, #0xfffffffffffff + mul x23, x6, x24 + extr x24, x13, x12, #40 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 + lsr x24, x22, #52 + add x23, x23, x24 + lsl x22, x22, #12 + extr x24, x23, x22, #24 + adcs x16, x16, x24 + extr x24, x3, x5, #28 + and x24, x24, #0xfffffffffffff + mul x22, x6, x24 + extr x24, x11, x13, #28 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x24, x22, x23, #36 + adcs x17, x17, x24 + and x25, x16, x17 + ldp x4, x5, [x1, #32] + ldp x12, x13, [x2, #32] + extr x24, x4, x3, #16 + and x24, x24, #0xfffffffffffff + mul x23, x6, x24 + extr x24, x12, x11, #16 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 + lsl x21, x21, #48 + add x23, x23, x21 + lsr x24, x22, #52 + add x23, x23, x24 + lsl x22, x22, #12 + extr x24, x23, x22, #48 + adcs x19, x19, x24 + and x25, x25, x19 + lsr x24, x4, #4 + and x24, x24, #0xfffffffffffff + mul x22, x6, x24 + lsr x24, x12, #4 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x21, x22, x23, #60 + extr x24, x5, x4, #56 + and x24, x24, #0xfffffffffffff + mul x23, x6, x24 + extr x24, x13, x12, #56 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 + lsr x24, x22, #52 + add 
x23, x23, x24 + lsl x21, x21, #8 + extr x24, x23, x21, #8 + adcs x7, x7, x24 + and x25, x25, x7 + ldp x3, x4, [x1, #48] + ldp x11, x12, [x2, #48] + extr x24, x3, x5, #44 + and x24, x24, #0xfffffffffffff + mul x22, x6, x24 + extr x24, x11, x13, #44 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x24, x22, x23, #20 + adcs x8, x8, x24 + and x25, x25, x8 + extr x24, x4, x3, #32 + and x24, x24, #0xfffffffffffff + mul x23, x6, x24 + extr x24, x12, x11, #32 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 + lsr x24, x22, #52 + add x23, x23, x24 + lsl x22, x22, #12 + extr x24, x23, x22, #32 + adcs x9, x9, x24 + and x25, x25, x9 + lsr x24, x4, #20 + mul x22, x6, x24 + lsr x24, x12, #20 + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x24, x22, x23, #44 + adcs x10, x10, x24 + and x25, x25, x10 + mul x24, x6, x14 + lsr x22, x22, #44 + add x24, x24, x22 + adc x20, x20, x24 + lsr x22, x20, #9 + orr x20, x20, #0xfffffffffffffe00 + cmp xzr, xzr + adcs xzr, x15, x22 + adcs xzr, x25, xzr + adcs xzr, x20, xzr + adcs x15, x15, x22 + adcs x16, x16, xzr + adcs x17, x17, xzr + adcs x19, x19, xzr + adcs x7, x7, xzr + adcs x8, x8, xzr + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x20, x20, xzr + and x22, x15, #0x1ff + extr x15, x16, x15, #9 + extr x16, x17, x16, #9 + stp x15, x16, [x0] + extr x17, x19, x17, #9 + extr x19, x7, x19, #9 + stp x17, x19, [x0, #16] + extr x7, x8, x7, #9 + extr x8, x9, x8, #9 + stp x7, x8, [x0, #32] + extr x9, x10, x9, #9 + extr x10, x20, x10, #9 + stp x9, x10, [x0, #48] + str x22, [x0, #64] + ret + +local_sqr_p521: + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + ldp x6, x7, [x1, #32] + ldp x8, x9, [x1, #48] + mul x12, x6, x8 + mul x17, x7, x9 + umulh x22, x6, x8 + subs x23, x6, x7 + cneg x23, x23, cc + csetm x11, cc + subs x10, x9, x8 + cneg x10, x10, cc + mul x16, x23, x10 + umulh x10, x23, 
x10 + cinv x11, x11, cc + eor x16, x16, x11 + eor x10, x10, x11 + adds x13, x12, x22 + adc x22, x22, xzr + umulh x23, x7, x9 + adds x13, x13, x17 + adcs x22, x22, x23 + adc x23, x23, xzr + adds x22, x22, x17 + adc x23, x23, xzr + cmn x11, #0x1 + adcs x13, x13, x16 + adcs x22, x22, x10 + adc x23, x23, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adcs x22, x22, x22 + adcs x23, x23, x23 + adc x19, xzr, xzr + mul x10, x6, x6 + mul x16, x7, x7 + mul x21, x6, x7 + umulh x11, x6, x6 + umulh x17, x7, x7 + umulh x20, x6, x7 + adds x11, x11, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x11, x11, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x12, x12, x16 + adcs x13, x13, x17 + adcs x22, x22, xzr + adcs x23, x23, xzr + adc x19, x19, xzr + mul x14, x8, x8 + mul x16, x9, x9 + mul x21, x8, x9 + umulh x15, x8, x8 + umulh x17, x9, x9 + umulh x20, x8, x9 + adds x15, x15, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x15, x15, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x14, x14, x22 + adcs x15, x15, x23 + adcs x16, x16, x19 + adc x17, x17, xzr + ldr x19, [x1, #64] + add x23, x19, x19 + mul x19, x19, x19 + and x21, x2, #0xfffffffffffff + mul x21, x23, x21 + extr x20, x3, x2, #52 + and x20, x20, #0xfffffffffffff + mul x20, x23, x20 + lsr x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x22, x20, x21, #12 + adds x10, x10, x22 + extr x21, x4, x3, #40 + and x21, x21, #0xfffffffffffff + mul x21, x23, x21 + lsr x22, x20, #52 + add x21, x21, x22 + lsl x20, x20, #12 + extr x22, x21, x20, #24 + adcs x11, x11, x22 + extr x20, x5, x4, #28 + and x20, x20, #0xfffffffffffff + mul x20, x23, x20 + lsr x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x22, x20, x21, #36 + adcs x12, x12, x22 + extr x21, x6, x5, #16 + and x21, x21, #0xfffffffffffff + mul x21, x23, x21 + lsr x22, x20, #52 + add x21, x21, x22 + lsl x20, x20, #12 + extr x22, x21, x20, #48 + adcs x13, x13, x22 + lsr x20, x6, #4 + and x20, x20, #0xfffffffffffff + mul x20, x23, x20 + lsr 
x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x24, x20, x21, #60 + extr x21, x7, x6, #56 + and x21, x21, #0xfffffffffffff + mul x21, x23, x21 + lsr x22, x20, #52 + add x21, x21, x22 + lsl x24, x24, #8 + extr x22, x21, x24, #8 + adcs x14, x14, x22 + extr x20, x8, x7, #44 + and x20, x20, #0xfffffffffffff + mul x20, x23, x20 + lsr x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x22, x20, x21, #20 + adcs x15, x15, x22 + extr x21, x9, x8, #32 + and x21, x21, #0xfffffffffffff + mul x21, x23, x21 + lsr x22, x20, #52 + add x21, x21, x22 + lsl x20, x20, #12 + extr x22, x21, x20, #32 + adcs x16, x16, x22 + lsr x20, x9, #20 + mul x20, x23, x20 + lsr x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x22, x20, x21, #44 + adcs x17, x17, x22 + lsr x20, x20, #44 + adc x19, x19, x20 + extr x21, x11, x10, #9 + extr x20, x12, x11, #9 + stp x21, x20, [x0] + extr x21, x13, x12, #9 + extr x20, x14, x13, #9 + stp x21, x20, [x0, #16] + extr x21, x15, x14, #9 + extr x20, x16, x15, #9 + stp x21, x20, [x0, #32] + extr x21, x17, x16, #9 + extr x20, x19, x17, #9 + stp x21, x20, [x0, #48] + and x22, x10, #0x1ff + lsr x19, x19, #9 + add x22, x22, x19 + str x22, [x0, #64] + mul x12, x2, x4 + mul x17, x3, x5 + umulh x22, x2, x4 + subs x23, x2, x3 + cneg x23, x23, cc + csetm x11, cc + subs x10, x5, x4 + cneg x10, x10, cc + mul x16, x23, x10 + umulh x10, x23, x10 + cinv x11, x11, cc + eor x16, x16, x11 + eor x10, x10, x11 + adds x13, x12, x22 + adc x22, x22, xzr + umulh x23, x3, x5 + adds x13, x13, x17 + adcs x22, x22, x23 + adc x23, x23, xzr + adds x22, x22, x17 + adc x23, x23, xzr + cmn x11, #0x1 + adcs x13, x13, x16 + adcs x22, x22, x10 + adc x23, x23, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adcs x22, x22, x22 + adcs x23, x23, x23 + adc x19, xzr, xzr + mul x10, x2, x2 + mul x16, x3, x3 + mul x21, x2, x3 + umulh x11, x2, x2 + umulh x17, x3, x3 + umulh x20, x2, x3 + adds x11, x11, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x11, x11, x21 + adcs 
x16, x16, x20 + adc x17, x17, xzr + adds x12, x12, x16 + adcs x13, x13, x17 + adcs x22, x22, xzr + adcs x23, x23, xzr + adc x19, x19, xzr + mul x14, x4, x4 + mul x16, x5, x5 + mul x21, x4, x5 + umulh x15, x4, x4 + umulh x17, x5, x5 + umulh x20, x4, x5 + adds x15, x15, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x15, x15, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x14, x14, x22 + adcs x15, x15, x23 + adcs x16, x16, x19 + adc x17, x17, xzr + ldp x21, x20, [x0] + adds x21, x21, x10 + adcs x20, x20, x11 + stp x21, x20, [x0] + ldp x21, x20, [x0, #16] + adcs x21, x21, x12 + adcs x20, x20, x13 + stp x21, x20, [x0, #16] + ldp x21, x20, [x0, #32] + adcs x21, x21, x14 + adcs x20, x20, x15 + stp x21, x20, [x0, #32] + ldp x21, x20, [x0, #48] + adcs x21, x21, x16 + adcs x20, x20, x17 + stp x21, x20, [x0, #48] + ldr x22, [x0, #64] + adc x22, x22, xzr + str x22, [x0, #64] + mul x10, x2, x6 + mul x14, x3, x7 + mul x15, x4, x8 + mul x16, x5, x9 + umulh x17, x2, x6 + adds x14, x14, x17 + umulh x17, x3, x7 + adcs x15, x15, x17 + umulh x17, x4, x8 + adcs x16, x16, x17 + umulh x17, x5, x9 + adc x17, x17, xzr + adds x11, x14, x10 + adcs x14, x15, x14 + adcs x15, x16, x15 + adcs x16, x17, x16 + adc x17, xzr, x17 + adds x12, x14, x10 + adcs x13, x15, x11 + adcs x14, x16, x14 + adcs x15, x17, x15 + adcs x16, xzr, x16 + adc x17, xzr, x17 + subs x22, x4, x5 + cneg x22, x22, cc + csetm x19, cc + subs x20, x9, x8 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x15, x15, x21 + eor x20, x20, x19 + adcs x16, x16, x20 + adc x17, x17, x19 + subs x22, x2, x3 + cneg x22, x22, cc + csetm x19, cc + subs x20, x7, x6 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x11, x11, x21 + eor x20, x20, x19 + adcs x12, x12, x20 + adcs x13, x13, x19 + adcs x14, x14, x19 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x3, 
x5 + cneg x22, x22, cc + csetm x19, cc + subs x20, x9, x7 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x14, x14, x21 + eor x20, x20, x19 + adcs x15, x15, x20 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x2, x4 + cneg x22, x22, cc + csetm x19, cc + subs x20, x8, x6 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x12, x12, x21 + eor x20, x20, x19 + adcs x13, x13, x20 + adcs x14, x14, x19 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x2, x5 + cneg x22, x22, cc + csetm x19, cc + subs x20, x9, x6 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x13, x13, x21 + eor x20, x20, x19 + adcs x14, x14, x20 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x3, x4 + cneg x22, x22, cc + csetm x19, cc + subs x20, x8, x7 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x13, x13, x21 + eor x20, x20, x19 + adcs x14, x14, x20 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + ldp x21, x20, [x0] + extr x2, x15, x14, #8 + adds x2, x2, x21 + extr x3, x16, x15, #8 + adcs x3, x3, x20 + ldp x21, x20, [x0, #16] + extr x4, x17, x16, #8 + adcs x4, x4, x21 + and x22, x3, x4 + lsr x5, x17, #8 + adcs x5, x5, x20 + and x22, x22, x5 + ldp x21, x20, [x0, #32] + lsl x6, x10, #1 + adcs x6, x6, x21 + and x22, x22, x6 + extr x7, x11, x10, #63 + adcs x7, x7, x20 + and x22, x22, x7 + ldp x21, x20, [x0, #48] + extr x8, x12, x11, #63 + adcs x8, x8, x21 + and x22, x22, x8 + extr x9, x13, x12, #63 + adcs x9, x9, x20 + and x22, x22, x9 + ldr x21, [x0, #64] + extr x10, x14, x13, #63 + and x10, x10, #0x1ff + adc x10, x21, x10 + lsr x20, x10, #9 + orr x10, x10, #0xfffffffffffffe00 + cmp xzr, xzr + adcs xzr, x2, x20 + adcs xzr, x22, xzr + 
adcs xzr, x10, xzr + adcs x2, x2, x20 + adcs x3, x3, xzr + adcs x4, x4, xzr + adcs x5, x5, xzr + adcs x6, x6, xzr + adcs x7, x7, xzr + adcs x8, x8, xzr + adcs x9, x9, xzr + adc x10, x10, xzr + and x10, x10, #0x1ff + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + stp x6, x7, [x0, #32] + stp x8, x9, [x0, #48] + str x10, [x0, #64] + ret +local_sub_p521: + ldp x5, x6, [x1] + ldp x4, x3, [x2] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [x1, #16] + ldp x4, x3, [x2, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + ldp x9, x10, [x1, #32] + ldp x4, x3, [x2, #32] + sbcs x9, x9, x4 + sbcs x10, x10, x3 + ldp x11, x12, [x1, #48] + ldp x4, x3, [x2, #48] + sbcs x11, x11, x4 + sbcs x12, x12, x3 + ldr x13, [x1, #64] + ldr x4, [x2, #64] + sbcs x13, x13, x4 + sbcs x5, x5, xzr + sbcs x6, x6, xzr + sbcs x7, x7, xzr + sbcs x8, x8, xzr + sbcs x9, x9, xzr + sbcs x10, x10, xzr + sbcs x11, x11, xzr + sbcs x12, x12, xzr + sbcs x13, x13, xzr + and x13, x13, #0x1ff + stp x5, x6, [x0] + stp x7, x8, [x0, #16] + stp x9, x10, [x0, #32] + stp x11, x12, [x0, #48] + str x13, [x0, #64] ret #if defined(__linux__) && defined(__ELF__) diff --git a/third_party/s2n-bignum/arm/p521/p521_jadd_alt.S b/third_party/s2n-bignum/arm/p521/p521_jadd_alt.S new file mode 100644 index 0000000000..72c9239be2 --- /dev/null +++ b/third_party/s2n-bignum/arm/p521/p521_jadd_alt.S @@ -0,0 +1,979 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on NIST curve P-521 in Jacobian coordinates +// +// extern void p521_jadd_alt +// (uint64_t p3[static 27],uint64_t p1[static 27],uint64_t p2[static 27]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). 
+// It is assumed that all coordinates of the input points p1 and p2 are +// fully reduced mod p_521, that both z coordinates are nonzero and +// that neither p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents +// the same affine point as". +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jadd_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 72 + +// Stable homes for input arguments during main code sequence + +#define input_z x26 +#define input_x x27 +#define input_y x28 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE +#define z_2 input_y, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define x1a sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define z2sq sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define y1a sp, #(NUMSIZE*6) + +// NUMSIZE*7 is not 16-aligned so we round it up + +#define NSPACE (NUMSIZE*7+8) + +// Corresponds exactly to bignum_mul_p521_alt + +#define mul_p521(P0,P1,P2) \ + ldp x3, x4, [P1]; \ + ldp x5, x6, [P2]; \ + mul x15, x3, x5; \ + umulh x16, x3, x5; \ + mul x14, x3, x6; \ + umulh x17, x3, x6; \ + 
adds x16, x16, x14; \ + ldp x7, x8, [P2+16]; \ + mul x14, x3, x7; \ + umulh x19, x3, x7; \ + adcs x17, x17, x14; \ + mul x14, x3, x8; \ + umulh x20, x3, x8; \ + adcs x19, x19, x14; \ + ldp x9, x10, [P2+32]; \ + mul x14, x3, x9; \ + umulh x21, x3, x9; \ + adcs x20, x20, x14; \ + mul x14, x3, x10; \ + umulh x22, x3, x10; \ + adcs x21, x21, x14; \ + ldp x11, x12, [P2+48]; \ + mul x14, x3, x11; \ + umulh x23, x3, x11; \ + adcs x22, x22, x14; \ + ldr x13, [P2+64]; \ + mul x14, x3, x12; \ + umulh x24, x3, x12; \ + adcs x23, x23, x14; \ + mul x14, x3, x13; \ + umulh x1, x3, x13; \ + adcs x24, x24, x14; \ + adc x1, x1, xzr; \ + mul x14, x4, x5; \ + adds x16, x16, x14; \ + mul x14, x4, x6; \ + adcs x17, x17, x14; \ + mul x14, x4, x7; \ + adcs x19, x19, x14; \ + mul x14, x4, x8; \ + adcs x20, x20, x14; \ + mul x14, x4, x9; \ + adcs x21, x21, x14; \ + mul x14, x4, x10; \ + adcs x22, x22, x14; \ + mul x14, x4, x11; \ + adcs x23, x23, x14; \ + mul x14, x4, x12; \ + adcs x24, x24, x14; \ + mul x14, x4, x13; \ + adcs x1, x1, x14; \ + cset x0, hs; \ + umulh x14, x4, x5; \ + adds x17, x17, x14; \ + umulh x14, x4, x6; \ + adcs x19, x19, x14; \ + umulh x14, x4, x7; \ + adcs x20, x20, x14; \ + umulh x14, x4, x8; \ + adcs x21, x21, x14; \ + umulh x14, x4, x9; \ + adcs x22, x22, x14; \ + umulh x14, x4, x10; \ + adcs x23, x23, x14; \ + umulh x14, x4, x11; \ + adcs x24, x24, x14; \ + umulh x14, x4, x12; \ + adcs x1, x1, x14; \ + umulh x14, x4, x13; \ + adc x0, x0, x14; \ + stp x15, x16, [P0]; \ + ldp x3, x4, [P1+16]; \ + mul x14, x3, x5; \ + adds x17, x17, x14; \ + mul x14, x3, x6; \ + adcs x19, x19, x14; \ + mul x14, x3, x7; \ + adcs x20, x20, x14; \ + mul x14, x3, x8; \ + adcs x21, x21, x14; \ + mul x14, x3, x9; \ + adcs x22, x22, x14; \ + mul x14, x3, x10; \ + adcs x23, x23, x14; \ + mul x14, x3, x11; \ + adcs x24, x24, x14; \ + mul x14, x3, x12; \ + adcs x1, x1, x14; \ + mul x14, x3, x13; \ + adcs x0, x0, x14; \ + cset x15, hs; \ + umulh x14, x3, x5; \ + adds x19, x19, x14; \ + umulh 
x14, x3, x6; \ + adcs x20, x20, x14; \ + umulh x14, x3, x7; \ + adcs x21, x21, x14; \ + umulh x14, x3, x8; \ + adcs x22, x22, x14; \ + umulh x14, x3, x9; \ + adcs x23, x23, x14; \ + umulh x14, x3, x10; \ + adcs x24, x24, x14; \ + umulh x14, x3, x11; \ + adcs x1, x1, x14; \ + umulh x14, x3, x12; \ + adcs x0, x0, x14; \ + umulh x14, x3, x13; \ + adc x15, x15, x14; \ + mul x14, x4, x5; \ + adds x19, x19, x14; \ + mul x14, x4, x6; \ + adcs x20, x20, x14; \ + mul x14, x4, x7; \ + adcs x21, x21, x14; \ + mul x14, x4, x8; \ + adcs x22, x22, x14; \ + mul x14, x4, x9; \ + adcs x23, x23, x14; \ + mul x14, x4, x10; \ + adcs x24, x24, x14; \ + mul x14, x4, x11; \ + adcs x1, x1, x14; \ + mul x14, x4, x12; \ + adcs x0, x0, x14; \ + mul x14, x4, x13; \ + adcs x15, x15, x14; \ + cset x16, hs; \ + umulh x14, x4, x5; \ + adds x20, x20, x14; \ + umulh x14, x4, x6; \ + adcs x21, x21, x14; \ + umulh x14, x4, x7; \ + adcs x22, x22, x14; \ + umulh x14, x4, x8; \ + adcs x23, x23, x14; \ + umulh x14, x4, x9; \ + adcs x24, x24, x14; \ + umulh x14, x4, x10; \ + adcs x1, x1, x14; \ + umulh x14, x4, x11; \ + adcs x0, x0, x14; \ + umulh x14, x4, x12; \ + adcs x15, x15, x14; \ + umulh x14, x4, x13; \ + adc x16, x16, x14; \ + stp x17, x19, [P0+16]; \ + ldp x3, x4, [P1+32]; \ + mul x14, x3, x5; \ + adds x20, x20, x14; \ + mul x14, x3, x6; \ + adcs x21, x21, x14; \ + mul x14, x3, x7; \ + adcs x22, x22, x14; \ + mul x14, x3, x8; \ + adcs x23, x23, x14; \ + mul x14, x3, x9; \ + adcs x24, x24, x14; \ + mul x14, x3, x10; \ + adcs x1, x1, x14; \ + mul x14, x3, x11; \ + adcs x0, x0, x14; \ + mul x14, x3, x12; \ + adcs x15, x15, x14; \ + mul x14, x3, x13; \ + adcs x16, x16, x14; \ + cset x17, hs; \ + umulh x14, x3, x5; \ + adds x21, x21, x14; \ + umulh x14, x3, x6; \ + adcs x22, x22, x14; \ + umulh x14, x3, x7; \ + adcs x23, x23, x14; \ + umulh x14, x3, x8; \ + adcs x24, x24, x14; \ + umulh x14, x3, x9; \ + adcs x1, x1, x14; \ + umulh x14, x3, x10; \ + adcs x0, x0, x14; \ + umulh x14, x3, x11; \ + adcs 
x15, x15, x14; \ + umulh x14, x3, x12; \ + adcs x16, x16, x14; \ + umulh x14, x3, x13; \ + adc x17, x17, x14; \ + mul x14, x4, x5; \ + adds x21, x21, x14; \ + mul x14, x4, x6; \ + adcs x22, x22, x14; \ + mul x14, x4, x7; \ + adcs x23, x23, x14; \ + mul x14, x4, x8; \ + adcs x24, x24, x14; \ + mul x14, x4, x9; \ + adcs x1, x1, x14; \ + mul x14, x4, x10; \ + adcs x0, x0, x14; \ + mul x14, x4, x11; \ + adcs x15, x15, x14; \ + mul x14, x4, x12; \ + adcs x16, x16, x14; \ + mul x14, x4, x13; \ + adcs x17, x17, x14; \ + cset x19, hs; \ + umulh x14, x4, x5; \ + adds x22, x22, x14; \ + umulh x14, x4, x6; \ + adcs x23, x23, x14; \ + umulh x14, x4, x7; \ + adcs x24, x24, x14; \ + umulh x14, x4, x8; \ + adcs x1, x1, x14; \ + umulh x14, x4, x9; \ + adcs x0, x0, x14; \ + umulh x14, x4, x10; \ + adcs x15, x15, x14; \ + umulh x14, x4, x11; \ + adcs x16, x16, x14; \ + umulh x14, x4, x12; \ + adcs x17, x17, x14; \ + umulh x14, x4, x13; \ + adc x19, x19, x14; \ + stp x20, x21, [P0+32]; \ + ldp x3, x4, [P1+48]; \ + mul x14, x3, x5; \ + adds x22, x22, x14; \ + mul x14, x3, x6; \ + adcs x23, x23, x14; \ + mul x14, x3, x7; \ + adcs x24, x24, x14; \ + mul x14, x3, x8; \ + adcs x1, x1, x14; \ + mul x14, x3, x9; \ + adcs x0, x0, x14; \ + mul x14, x3, x10; \ + adcs x15, x15, x14; \ + mul x14, x3, x11; \ + adcs x16, x16, x14; \ + mul x14, x3, x12; \ + adcs x17, x17, x14; \ + mul x14, x3, x13; \ + adcs x19, x19, x14; \ + cset x20, hs; \ + umulh x14, x3, x5; \ + adds x23, x23, x14; \ + umulh x14, x3, x6; \ + adcs x24, x24, x14; \ + umulh x14, x3, x7; \ + adcs x1, x1, x14; \ + umulh x14, x3, x8; \ + adcs x0, x0, x14; \ + umulh x14, x3, x9; \ + adcs x15, x15, x14; \ + umulh x14, x3, x10; \ + adcs x16, x16, x14; \ + umulh x14, x3, x11; \ + adcs x17, x17, x14; \ + umulh x14, x3, x12; \ + adcs x19, x19, x14; \ + umulh x14, x3, x13; \ + adc x20, x20, x14; \ + mul x14, x4, x5; \ + adds x23, x23, x14; \ + mul x14, x4, x6; \ + adcs x24, x24, x14; \ + mul x14, x4, x7; \ + adcs x1, x1, x14; \ + mul x14, 
x4, x8; \ + adcs x0, x0, x14; \ + mul x14, x4, x9; \ + adcs x15, x15, x14; \ + mul x14, x4, x10; \ + adcs x16, x16, x14; \ + mul x14, x4, x11; \ + adcs x17, x17, x14; \ + mul x14, x4, x12; \ + adcs x19, x19, x14; \ + mul x14, x4, x13; \ + adcs x20, x20, x14; \ + cset x21, hs; \ + umulh x14, x4, x5; \ + adds x24, x24, x14; \ + umulh x14, x4, x6; \ + adcs x1, x1, x14; \ + umulh x14, x4, x7; \ + adcs x0, x0, x14; \ + umulh x14, x4, x8; \ + adcs x15, x15, x14; \ + umulh x14, x4, x9; \ + adcs x16, x16, x14; \ + umulh x14, x4, x10; \ + adcs x17, x17, x14; \ + umulh x14, x4, x11; \ + adcs x19, x19, x14; \ + umulh x14, x4, x12; \ + adcs x20, x20, x14; \ + umulh x14, x4, x13; \ + adc x21, x21, x14; \ + stp x22, x23, [P0+48]; \ + ldr x3, [P1+64]; \ + mul x14, x3, x5; \ + adds x24, x24, x14; \ + mul x14, x3, x6; \ + adcs x1, x1, x14; \ + mul x14, x3, x7; \ + adcs x0, x0, x14; \ + mul x14, x3, x8; \ + adcs x15, x15, x14; \ + mul x14, x3, x9; \ + adcs x16, x16, x14; \ + mul x14, x3, x10; \ + adcs x17, x17, x14; \ + mul x14, x3, x11; \ + adcs x19, x19, x14; \ + mul x14, x3, x12; \ + adcs x20, x20, x14; \ + mul x14, x3, x13; \ + adc x21, x21, x14; \ + umulh x14, x3, x5; \ + adds x1, x1, x14; \ + umulh x14, x3, x6; \ + adcs x0, x0, x14; \ + umulh x14, x3, x7; \ + adcs x15, x15, x14; \ + umulh x14, x3, x8; \ + adcs x16, x16, x14; \ + umulh x14, x3, x9; \ + adcs x17, x17, x14; \ + umulh x14, x3, x10; \ + adcs x19, x19, x14; \ + umulh x14, x3, x11; \ + adcs x20, x20, x14; \ + umulh x14, x3, x12; \ + adc x21, x21, x14; \ + cmp xzr, xzr; \ + ldp x5, x6, [P0]; \ + extr x14, x1, x24, #9; \ + adcs x5, x5, x14; \ + extr x14, x0, x1, #9; \ + adcs x6, x6, x14; \ + ldp x7, x8, [P0+16]; \ + extr x14, x15, x0, #9; \ + adcs x7, x7, x14; \ + extr x14, x16, x15, #9; \ + adcs x8, x8, x14; \ + ldp x9, x10, [P0+32]; \ + extr x14, x17, x16, #9; \ + adcs x9, x9, x14; \ + extr x14, x19, x17, #9; \ + adcs x10, x10, x14; \ + ldp x11, x12, [P0+48]; \ + extr x14, x20, x19, #9; \ + adcs x11, x11, x14; \ + 
extr x14, x21, x20, #9; \ + adcs x12, x12, x14; \ + orr x13, x24, #0xfffffffffffffe00; \ + lsr x14, x21, #9; \ + adcs x13, x13, x14; \ + sbcs x5, x5, xzr; \ + sbcs x6, x6, xzr; \ + sbcs x7, x7, xzr; \ + sbcs x8, x8, xzr; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + and x13, x13, #0x1ff; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32]; \ + stp x11, x12, [P0+48]; \ + str x13, [P0+64] + +// Corresponds exactly to bignum_sqr_p521_alt + +#define sqr_p521(P0,P1) \ + ldp x2, x3, [P1]; \ + mul x11, x2, x3; \ + umulh x12, x2, x3; \ + ldp x4, x5, [P1+16]; \ + mul x10, x2, x4; \ + umulh x13, x2, x4; \ + adds x12, x12, x10; \ + ldp x6, x7, [P1+32]; \ + mul x10, x2, x5; \ + umulh x14, x2, x5; \ + adcs x13, x13, x10; \ + ldp x8, x9, [P1+48]; \ + mul x10, x2, x6; \ + umulh x15, x2, x6; \ + adcs x14, x14, x10; \ + mul x10, x2, x7; \ + umulh x16, x2, x7; \ + adcs x15, x15, x10; \ + mul x10, x2, x8; \ + umulh x17, x2, x8; \ + adcs x16, x16, x10; \ + mul x10, x2, x9; \ + umulh x19, x2, x9; \ + adcs x17, x17, x10; \ + adc x19, x19, xzr; \ + mul x10, x3, x4; \ + adds x13, x13, x10; \ + mul x10, x3, x5; \ + adcs x14, x14, x10; \ + mul x10, x3, x6; \ + adcs x15, x15, x10; \ + mul x10, x3, x7; \ + adcs x16, x16, x10; \ + mul x10, x3, x8; \ + adcs x17, x17, x10; \ + mul x10, x3, x9; \ + adcs x19, x19, x10; \ + cset x20, hs; \ + umulh x10, x3, x4; \ + adds x14, x14, x10; \ + umulh x10, x3, x5; \ + adcs x15, x15, x10; \ + umulh x10, x3, x6; \ + adcs x16, x16, x10; \ + umulh x10, x3, x7; \ + adcs x17, x17, x10; \ + umulh x10, x3, x8; \ + adcs x19, x19, x10; \ + umulh x10, x3, x9; \ + adc x20, x20, x10; \ + mul x10, x6, x7; \ + umulh x21, x6, x7; \ + adds x20, x20, x10; \ + adc x21, x21, xzr; \ + mul x10, x4, x5; \ + adds x15, x15, x10; \ + mul x10, x4, x6; \ + adcs x16, x16, x10; \ + mul x10, x4, x7; \ + adcs x17, x17, x10; \ + mul x10, x4, x8; \ + adcs x19, x19, x10; \ + mul x10, x4, x9; \ + 
adcs x20, x20, x10; \ + mul x10, x6, x8; \ + adcs x21, x21, x10; \ + cset x22, hs; \ + umulh x10, x4, x5; \ + adds x16, x16, x10; \ + umulh x10, x4, x6; \ + adcs x17, x17, x10; \ + umulh x10, x4, x7; \ + adcs x19, x19, x10; \ + umulh x10, x4, x8; \ + adcs x20, x20, x10; \ + umulh x10, x4, x9; \ + adcs x21, x21, x10; \ + umulh x10, x6, x8; \ + adc x22, x22, x10; \ + mul x10, x7, x8; \ + umulh x23, x7, x8; \ + adds x22, x22, x10; \ + adc x23, x23, xzr; \ + mul x10, x5, x6; \ + adds x17, x17, x10; \ + mul x10, x5, x7; \ + adcs x19, x19, x10; \ + mul x10, x5, x8; \ + adcs x20, x20, x10; \ + mul x10, x5, x9; \ + adcs x21, x21, x10; \ + mul x10, x6, x9; \ + adcs x22, x22, x10; \ + mul x10, x7, x9; \ + adcs x23, x23, x10; \ + cset x24, hs; \ + umulh x10, x5, x6; \ + adds x19, x19, x10; \ + umulh x10, x5, x7; \ + adcs x20, x20, x10; \ + umulh x10, x5, x8; \ + adcs x21, x21, x10; \ + umulh x10, x5, x9; \ + adcs x22, x22, x10; \ + umulh x10, x6, x9; \ + adcs x23, x23, x10; \ + umulh x10, x7, x9; \ + adc x24, x24, x10; \ + mul x10, x8, x9; \ + umulh x25, x8, x9; \ + adds x24, x24, x10; \ + adc x25, x25, xzr; \ + adds x11, x11, x11; \ + adcs x12, x12, x12; \ + adcs x13, x13, x13; \ + adcs x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, x16, x16; \ + adcs x17, x17, x17; \ + adcs x19, x19, x19; \ + adcs x20, x20, x20; \ + adcs x21, x21, x21; \ + adcs x22, x22, x22; \ + adcs x23, x23, x23; \ + adcs x24, x24, x24; \ + adcs x25, x25, x25; \ + cset x0, hs; \ + umulh x10, x2, x2; \ + adds x11, x11, x10; \ + mul x10, x3, x3; \ + adcs x12, x12, x10; \ + umulh x10, x3, x3; \ + adcs x13, x13, x10; \ + mul x10, x4, x4; \ + adcs x14, x14, x10; \ + umulh x10, x4, x4; \ + adcs x15, x15, x10; \ + mul x10, x5, x5; \ + adcs x16, x16, x10; \ + umulh x10, x5, x5; \ + adcs x17, x17, x10; \ + mul x10, x6, x6; \ + adcs x19, x19, x10; \ + umulh x10, x6, x6; \ + adcs x20, x20, x10; \ + mul x10, x7, x7; \ + adcs x21, x21, x10; \ + umulh x10, x7, x7; \ + adcs x22, x22, x10; \ + mul x10, x8, x8; \ + 
adcs x23, x23, x10; \ + umulh x10, x8, x8; \ + adcs x24, x24, x10; \ + mul x10, x9, x9; \ + adcs x25, x25, x10; \ + umulh x10, x9, x9; \ + adc x0, x0, x10; \ + ldr x1, [P1+64]; \ + add x1, x1, x1; \ + mul x10, x1, x2; \ + adds x19, x19, x10; \ + umulh x10, x1, x2; \ + adcs x20, x20, x10; \ + mul x10, x1, x4; \ + adcs x21, x21, x10; \ + umulh x10, x1, x4; \ + adcs x22, x22, x10; \ + mul x10, x1, x6; \ + adcs x23, x23, x10; \ + umulh x10, x1, x6; \ + adcs x24, x24, x10; \ + mul x10, x1, x8; \ + adcs x25, x25, x10; \ + umulh x10, x1, x8; \ + adcs x0, x0, x10; \ + lsr x4, x1, #1; \ + mul x4, x4, x4; \ + adc x4, x4, xzr; \ + mul x10, x1, x3; \ + adds x20, x20, x10; \ + umulh x10, x1, x3; \ + adcs x21, x21, x10; \ + mul x10, x1, x5; \ + adcs x22, x22, x10; \ + umulh x10, x1, x5; \ + adcs x23, x23, x10; \ + mul x10, x1, x7; \ + adcs x24, x24, x10; \ + umulh x10, x1, x7; \ + adcs x25, x25, x10; \ + mul x10, x1, x9; \ + adcs x0, x0, x10; \ + umulh x10, x1, x9; \ + adc x4, x4, x10; \ + mul x2, x2, x2; \ + cmp xzr, xzr; \ + extr x10, x20, x19, #9; \ + adcs x2, x2, x10; \ + extr x10, x21, x20, #9; \ + adcs x11, x11, x10; \ + extr x10, x22, x21, #9; \ + adcs x12, x12, x10; \ + extr x10, x23, x22, #9; \ + adcs x13, x13, x10; \ + extr x10, x24, x23, #9; \ + adcs x14, x14, x10; \ + extr x10, x25, x24, #9; \ + adcs x15, x15, x10; \ + extr x10, x0, x25, #9; \ + adcs x16, x16, x10; \ + extr x10, x4, x0, #9; \ + adcs x17, x17, x10; \ + orr x19, x19, #0xfffffffffffffe00; \ + lsr x10, x4, #9; \ + adcs x19, x19, x10; \ + sbcs x2, x2, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbcs x14, x14, xzr; \ + sbcs x15, x15, xzr; \ + sbcs x16, x16, xzr; \ + sbcs x17, x17, xzr; \ + sbc x19, x19, xzr; \ + and x19, x19, #0x1ff; \ + stp x2, x11, [P0]; \ + stp x12, x13, [P0+16]; \ + stp x14, x15, [P0+32]; \ + stp x16, x17, [P0+48]; \ + str x19, [P0+64] + +// Corresponds exactly to bignum_sub_p521 + +#define sub_p521(P0,P1,P2) \ + ldp x5, x6, [P1]; \ + ldp x4, x3, 
[P2]; \ + subs x5, x5, x4; \ + sbcs x6, x6, x3; \ + ldp x7, x8, [P1+16]; \ + ldp x4, x3, [P2+16]; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x3; \ + ldp x9, x10, [P1+32]; \ + ldp x4, x3, [P2+32]; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + ldp x11, x12, [P1+48]; \ + ldp x4, x3, [P2+48]; \ + sbcs x11, x11, x4; \ + sbcs x12, x12, x3; \ + ldr x13, [P1+64]; \ + ldr x4, [P2+64]; \ + sbcs x13, x13, x4; \ + sbcs x5, x5, xzr; \ + sbcs x6, x6, xzr; \ + sbcs x7, x7, xzr; \ + sbcs x8, x8, xzr; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + and x13, x13, #0x1ff; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32]; \ + stp x11, x12, [P0+48]; \ + str x13, [P0+64] + +S2N_BN_SYMBOL(p521_jadd_alt): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + stp x27, x28, [sp, #-16]! + sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations + + sqr_p521(z1sq,z_1) + sqr_p521(z2sq,z_2) + + mul_p521(y1a,z_2,y_1) + mul_p521(y2a,z_1,y_2) + + mul_p521(x2a,z1sq,x_2) + mul_p521(x1a,z2sq,x_1) + mul_p521(y2a,z1sq,y2a) + mul_p521(y1a,z2sq,y1a) + + sub_p521(xd,x2a,x1a) + sub_p521(yd,y2a,y1a) + + sqr_p521(zz,xd) + sqr_p521(ww,yd) + + mul_p521(zzx1,zz,x1a) + mul_p521(zzx2,zz,x2a) + + sub_p521(resx,ww,zzx1) + sub_p521(t1,zzx2,zzx1) + + mul_p521(xd,xd,z_1) + + sub_p521(resx,resx,zzx2) + + sub_p521(t2,zzx1,resx) + + mul_p521(t1,t1,y1a) + mul_p521(resz,xd,z_2) + mul_p521(t2,yd,t2) + + sub_p521(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "HI" <=> CF /\ ~ZF <=> P1 = 0 /\ ~(P2 = 0) +// and "LO" <=> ~CF <=> ~(P1 = 0) /\ P2 = 0 +// Multiplex the z 
outputs accordingly and re-store in resz + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + ldp x4, x5, [z_1+32] + ldp x6, x7, [z_1+48] + ldr x8, [z_1+64] + + orr x20, x0, x1 + orr x21, x2, x3 + orr x22, x4, x5 + orr x23, x6, x7 + orr x20, x20, x21 + orr x22, x22, x23 + orr x20, x20, x8 + orr x20, x20, x22 + cmp x20, xzr + cset x20, ne + + ldp x10, x11, [z_2] + ldp x12, x13, [z_2+16] + ldp x14, x15, [z_2+32] + ldp x16, x17, [z_2+48] + ldr x19, [z_2+64] + + orr x21, x10, x11 + orr x22, x12, x13 + orr x23, x14, x15 + orr x24, x16, x17 + orr x21, x21, x22 + orr x23, x23, x24 + orr x21, x21, x19 + orr x21, x21, x23 + + csel x0, x0, x10, ne + csel x1, x1, x11, ne + csel x2, x2, x12, ne + csel x3, x3, x13, ne + csel x4, x4, x14, ne + csel x5, x5, x15, ne + csel x6, x6, x16, ne + csel x7, x7, x17, ne + csel x8, x8, x19, ne + + cmp x21, xzr + cset x21, ne + + cmp x21, x20 + + ldp x10, x11, [resz] + ldp x12, x13, [resz+16] + ldp x14, x15, [resz+32] + ldp x16, x17, [resz+48] + ldr x19, [resz+64] + + csel x0, x0, x10, ne + csel x1, x1, x11, ne + csel x2, x2, x12, ne + csel x3, x3, x13, ne + csel x4, x4, x14, ne + csel x5, x5, x15, ne + csel x6, x6, x16, ne + csel x7, x7, x17, ne + csel x8, x8, x19, ne + + stp x0, x1, [resz] + stp x2, x3, [resz+16] + stp x4, x5, [resz+32] + stp x6, x7, [resz+48] + str x8, [resz+64] + +// Multiplex the x and y outputs too, keeping the results in registers + + ldp x20, x21, [x_1] + ldp x0, x1, [resx] + csel x0, x20, x0, lo + csel x1, x21, x1, lo + ldp x20, x21, [x_2] + csel x0, x20, x0, hi + csel x1, x21, x1, hi + + ldp x20, x21, [x_1+16] + ldp x2, x3, [resx+16] + csel x2, x20, x2, lo + csel x3, x21, x3, lo + ldp x20, x21, [x_2+16] + csel x2, x20, x2, hi + csel x3, x21, x3, hi + + ldp x20, x21, [x_1+32] + ldp x4, x5, [resx+32] + csel x4, x20, x4, lo + csel x5, x21, x5, lo + ldp x20, x21, [x_2+32] + csel x4, x20, x4, hi + csel x5, x21, x5, hi + + ldp x20, x21, [x_1+48] + ldp x6, x7, [resx+48] + csel x6, x20, x6, lo + csel x7, x21, x7, lo + ldp x20, 
x21, [x_2+48] + csel x6, x20, x6, hi + csel x7, x21, x7, hi + + ldr x20, [x_1+64] + ldr x8, [resx+64] + csel x8, x20, x8, lo + ldr x21, [x_2+64] + csel x8, x21, x8, hi + + + ldp x20, x21, [y_1] + ldp x10, x11, [resy] + csel x10, x20, x10, lo + csel x11, x21, x11, lo + ldp x20, x21, [y_2] + csel x10, x20, x10, hi + csel x11, x21, x11, hi + + ldp x20, x21, [y_1+16] + ldp x12, x13, [resy+16] + csel x12, x20, x12, lo + csel x13, x21, x13, lo + ldp x20, x21, [y_2+16] + csel x12, x20, x12, hi + csel x13, x21, x13, hi + + ldp x20, x21, [y_1+32] + ldp x14, x15, [resy+32] + csel x14, x20, x14, lo + csel x15, x21, x15, lo + ldp x20, x21, [y_2+32] + csel x14, x20, x14, hi + csel x15, x21, x15, hi + + ldp x20, x21, [y_1+48] + ldp x16, x17, [resy+48] + csel x16, x20, x16, lo + csel x17, x21, x17, lo + ldp x20, x21, [y_2+48] + csel x16, x20, x16, hi + csel x17, x21, x17, hi + + ldr x20, [y_1+64] + ldr x19, [resy+64] + csel x19, x20, x19, lo + ldr x21, [y_2+64] + csel x19, x21, x19, hi + +// Finally store back the multiplexed values + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [x_3+32] + stp x6, x7, [x_3+48] + str x8, [x_3+64] + + ldp x0, x1, [resz] + ldp x2, x3, [resz+16] + ldp x4, x5, [resz+32] + ldp x6, x7, [resz+48] + ldr x8, [resz+64] + + stp x10, x11, [y_3] + stp x12, x13, [y_3+16] + stp x14, x15, [y_3+32] + stp x16, x17, [y_3+48] + str x19, [y_3+64] + + stp x0, x1, [z_3] + stp x2, x3, [z_3+16] + stp x4, x5, [z_3+32] + stp x6, x7, [z_3+48] + str x8, [z_3+64] + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x27, x28, [sp], 16 + ldp x25, x26, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/p521/p521_jdouble.S b/third_party/s2n-bignum/arm/p521/p521_jdouble.S index 100f6d3e87..3eb0250b33 100644 --- a/third_party/s2n-bignum/arm/p521/p521_jdouble.S +++ 
b/third_party/s2n-bignum/arm/p521/p521_jdouble.S @@ -28,8 +28,8 @@ // Stable homes for input arguments during main code sequence -#define input_z x26 -#define input_x x27 +#define input_z x27 +#define input_x x28 // Pointer-offset pairs for inputs and outputs @@ -61,616 +61,21 @@ #define NSPACE (NUMSIZE*7+8) -// Corresponds exactly to bignum_mul_p521_alt +// For the two "big" field operations, we use subroutines not inlining. +// Call local code very close to bignum_mul_p521 and bignum_sqr_p521. #define mul_p521(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - mul x15, x3, x5; \ - umulh x16, x3, x5; \ - mul x14, x3, x6; \ - umulh x17, x3, x6; \ - adds x16, x16, x14; \ - ldp x7, x8, [P2+16]; \ - mul x14, x3, x7; \ - umulh x19, x3, x7; \ - adcs x17, x17, x14; \ - mul x14, x3, x8; \ - umulh x20, x3, x8; \ - adcs x19, x19, x14; \ - ldp x9, x10, [P2+32]; \ - mul x14, x3, x9; \ - umulh x21, x3, x9; \ - adcs x20, x20, x14; \ - mul x14, x3, x10; \ - umulh x22, x3, x10; \ - adcs x21, x21, x14; \ - ldp x11, x12, [P2+48]; \ - mul x14, x3, x11; \ - umulh x23, x3, x11; \ - adcs x22, x22, x14; \ - ldr x13, [P2+64]; \ - mul x14, x3, x12; \ - umulh x24, x3, x12; \ - adcs x23, x23, x14; \ - mul x14, x3, x13; \ - umulh x1, x3, x13; \ - adcs x24, x24, x14; \ - adc x1, x1, xzr; \ - mul x14, x4, x5; \ - adds x16, x16, x14; \ - mul x14, x4, x6; \ - adcs x17, x17, x14; \ - mul x14, x4, x7; \ - adcs x19, x19, x14; \ - mul x14, x4, x8; \ - adcs x20, x20, x14; \ - mul x14, x4, x9; \ - adcs x21, x21, x14; \ - mul x14, x4, x10; \ - adcs x22, x22, x14; \ - mul x14, x4, x11; \ - adcs x23, x23, x14; \ - mul x14, x4, x12; \ - adcs x24, x24, x14; \ - mul x14, x4, x13; \ - adcs x1, x1, x14; \ - cset x0, hs; \ - umulh x14, x4, x5; \ - adds x17, x17, x14; \ - umulh x14, x4, x6; \ - adcs x19, x19, x14; \ - umulh x14, x4, x7; \ - adcs x20, x20, x14; \ - umulh x14, x4, x8; \ - adcs x21, x21, x14; \ - umulh x14, x4, x9; \ - adcs x22, x22, x14; \ - umulh x14, x4, x10; \ - adcs x23, x23, x14; \ - 
umulh x14, x4, x11; \ - adcs x24, x24, x14; \ - umulh x14, x4, x12; \ - adcs x1, x1, x14; \ - umulh x14, x4, x13; \ - adc x0, x0, x14; \ - stp x15, x16, [P0]; \ - ldp x3, x4, [P1+16]; \ - mul x14, x3, x5; \ - adds x17, x17, x14; \ - mul x14, x3, x6; \ - adcs x19, x19, x14; \ - mul x14, x3, x7; \ - adcs x20, x20, x14; \ - mul x14, x3, x8; \ - adcs x21, x21, x14; \ - mul x14, x3, x9; \ - adcs x22, x22, x14; \ - mul x14, x3, x10; \ - adcs x23, x23, x14; \ - mul x14, x3, x11; \ - adcs x24, x24, x14; \ - mul x14, x3, x12; \ - adcs x1, x1, x14; \ - mul x14, x3, x13; \ - adcs x0, x0, x14; \ - cset x15, hs; \ - umulh x14, x3, x5; \ - adds x19, x19, x14; \ - umulh x14, x3, x6; \ - adcs x20, x20, x14; \ - umulh x14, x3, x7; \ - adcs x21, x21, x14; \ - umulh x14, x3, x8; \ - adcs x22, x22, x14; \ - umulh x14, x3, x9; \ - adcs x23, x23, x14; \ - umulh x14, x3, x10; \ - adcs x24, x24, x14; \ - umulh x14, x3, x11; \ - adcs x1, x1, x14; \ - umulh x14, x3, x12; \ - adcs x0, x0, x14; \ - umulh x14, x3, x13; \ - adc x15, x15, x14; \ - mul x14, x4, x5; \ - adds x19, x19, x14; \ - mul x14, x4, x6; \ - adcs x20, x20, x14; \ - mul x14, x4, x7; \ - adcs x21, x21, x14; \ - mul x14, x4, x8; \ - adcs x22, x22, x14; \ - mul x14, x4, x9; \ - adcs x23, x23, x14; \ - mul x14, x4, x10; \ - adcs x24, x24, x14; \ - mul x14, x4, x11; \ - adcs x1, x1, x14; \ - mul x14, x4, x12; \ - adcs x0, x0, x14; \ - mul x14, x4, x13; \ - adcs x15, x15, x14; \ - cset x16, hs; \ - umulh x14, x4, x5; \ - adds x20, x20, x14; \ - umulh x14, x4, x6; \ - adcs x21, x21, x14; \ - umulh x14, x4, x7; \ - adcs x22, x22, x14; \ - umulh x14, x4, x8; \ - adcs x23, x23, x14; \ - umulh x14, x4, x9; \ - adcs x24, x24, x14; \ - umulh x14, x4, x10; \ - adcs x1, x1, x14; \ - umulh x14, x4, x11; \ - adcs x0, x0, x14; \ - umulh x14, x4, x12; \ - adcs x15, x15, x14; \ - umulh x14, x4, x13; \ - adc x16, x16, x14; \ - stp x17, x19, [P0+16]; \ - ldp x3, x4, [P1+32]; \ - mul x14, x3, x5; \ - adds x20, x20, x14; \ - mul x14, x3, x6; \ - 
adcs x21, x21, x14; \ - mul x14, x3, x7; \ - adcs x22, x22, x14; \ - mul x14, x3, x8; \ - adcs x23, x23, x14; \ - mul x14, x3, x9; \ - adcs x24, x24, x14; \ - mul x14, x3, x10; \ - adcs x1, x1, x14; \ - mul x14, x3, x11; \ - adcs x0, x0, x14; \ - mul x14, x3, x12; \ - adcs x15, x15, x14; \ - mul x14, x3, x13; \ - adcs x16, x16, x14; \ - cset x17, hs; \ - umulh x14, x3, x5; \ - adds x21, x21, x14; \ - umulh x14, x3, x6; \ - adcs x22, x22, x14; \ - umulh x14, x3, x7; \ - adcs x23, x23, x14; \ - umulh x14, x3, x8; \ - adcs x24, x24, x14; \ - umulh x14, x3, x9; \ - adcs x1, x1, x14; \ - umulh x14, x3, x10; \ - adcs x0, x0, x14; \ - umulh x14, x3, x11; \ - adcs x15, x15, x14; \ - umulh x14, x3, x12; \ - adcs x16, x16, x14; \ - umulh x14, x3, x13; \ - adc x17, x17, x14; \ - mul x14, x4, x5; \ - adds x21, x21, x14; \ - mul x14, x4, x6; \ - adcs x22, x22, x14; \ - mul x14, x4, x7; \ - adcs x23, x23, x14; \ - mul x14, x4, x8; \ - adcs x24, x24, x14; \ - mul x14, x4, x9; \ - adcs x1, x1, x14; \ - mul x14, x4, x10; \ - adcs x0, x0, x14; \ - mul x14, x4, x11; \ - adcs x15, x15, x14; \ - mul x14, x4, x12; \ - adcs x16, x16, x14; \ - mul x14, x4, x13; \ - adcs x17, x17, x14; \ - cset x19, hs; \ - umulh x14, x4, x5; \ - adds x22, x22, x14; \ - umulh x14, x4, x6; \ - adcs x23, x23, x14; \ - umulh x14, x4, x7; \ - adcs x24, x24, x14; \ - umulh x14, x4, x8; \ - adcs x1, x1, x14; \ - umulh x14, x4, x9; \ - adcs x0, x0, x14; \ - umulh x14, x4, x10; \ - adcs x15, x15, x14; \ - umulh x14, x4, x11; \ - adcs x16, x16, x14; \ - umulh x14, x4, x12; \ - adcs x17, x17, x14; \ - umulh x14, x4, x13; \ - adc x19, x19, x14; \ - stp x20, x21, [P0+32]; \ - ldp x3, x4, [P1+48]; \ - mul x14, x3, x5; \ - adds x22, x22, x14; \ - mul x14, x3, x6; \ - adcs x23, x23, x14; \ - mul x14, x3, x7; \ - adcs x24, x24, x14; \ - mul x14, x3, x8; \ - adcs x1, x1, x14; \ - mul x14, x3, x9; \ - adcs x0, x0, x14; \ - mul x14, x3, x10; \ - adcs x15, x15, x14; \ - mul x14, x3, x11; \ - adcs x16, x16, x14; \ - mul x14, 
x3, x12; \ - adcs x17, x17, x14; \ - mul x14, x3, x13; \ - adcs x19, x19, x14; \ - cset x20, hs; \ - umulh x14, x3, x5; \ - adds x23, x23, x14; \ - umulh x14, x3, x6; \ - adcs x24, x24, x14; \ - umulh x14, x3, x7; \ - adcs x1, x1, x14; \ - umulh x14, x3, x8; \ - adcs x0, x0, x14; \ - umulh x14, x3, x9; \ - adcs x15, x15, x14; \ - umulh x14, x3, x10; \ - adcs x16, x16, x14; \ - umulh x14, x3, x11; \ - adcs x17, x17, x14; \ - umulh x14, x3, x12; \ - adcs x19, x19, x14; \ - umulh x14, x3, x13; \ - adc x20, x20, x14; \ - mul x14, x4, x5; \ - adds x23, x23, x14; \ - mul x14, x4, x6; \ - adcs x24, x24, x14; \ - mul x14, x4, x7; \ - adcs x1, x1, x14; \ - mul x14, x4, x8; \ - adcs x0, x0, x14; \ - mul x14, x4, x9; \ - adcs x15, x15, x14; \ - mul x14, x4, x10; \ - adcs x16, x16, x14; \ - mul x14, x4, x11; \ - adcs x17, x17, x14; \ - mul x14, x4, x12; \ - adcs x19, x19, x14; \ - mul x14, x4, x13; \ - adcs x20, x20, x14; \ - cset x21, hs; \ - umulh x14, x4, x5; \ - adds x24, x24, x14; \ - umulh x14, x4, x6; \ - adcs x1, x1, x14; \ - umulh x14, x4, x7; \ - adcs x0, x0, x14; \ - umulh x14, x4, x8; \ - adcs x15, x15, x14; \ - umulh x14, x4, x9; \ - adcs x16, x16, x14; \ - umulh x14, x4, x10; \ - adcs x17, x17, x14; \ - umulh x14, x4, x11; \ - adcs x19, x19, x14; \ - umulh x14, x4, x12; \ - adcs x20, x20, x14; \ - umulh x14, x4, x13; \ - adc x21, x21, x14; \ - stp x22, x23, [P0+48]; \ - ldr x3, [P1+64]; \ - mul x14, x3, x5; \ - adds x24, x24, x14; \ - mul x14, x3, x6; \ - adcs x1, x1, x14; \ - mul x14, x3, x7; \ - adcs x0, x0, x14; \ - mul x14, x3, x8; \ - adcs x15, x15, x14; \ - mul x14, x3, x9; \ - adcs x16, x16, x14; \ - mul x14, x3, x10; \ - adcs x17, x17, x14; \ - mul x14, x3, x11; \ - adcs x19, x19, x14; \ - mul x14, x3, x12; \ - adcs x20, x20, x14; \ - mul x14, x3, x13; \ - adc x21, x21, x14; \ - umulh x14, x3, x5; \ - adds x1, x1, x14; \ - umulh x14, x3, x6; \ - adcs x0, x0, x14; \ - umulh x14, x3, x7; \ - adcs x15, x15, x14; \ - umulh x14, x3, x8; \ - adcs x16, x16, x14; 
\ - umulh x14, x3, x9; \ - adcs x17, x17, x14; \ - umulh x14, x3, x10; \ - adcs x19, x19, x14; \ - umulh x14, x3, x11; \ - adcs x20, x20, x14; \ - umulh x14, x3, x12; \ - adc x21, x21, x14; \ - cmp xzr, xzr; \ - ldp x5, x6, [P0]; \ - extr x14, x1, x24, #9; \ - adcs x5, x5, x14; \ - extr x14, x0, x1, #9; \ - adcs x6, x6, x14; \ - ldp x7, x8, [P0+16]; \ - extr x14, x15, x0, #9; \ - adcs x7, x7, x14; \ - extr x14, x16, x15, #9; \ - adcs x8, x8, x14; \ - ldp x9, x10, [P0+32]; \ - extr x14, x17, x16, #9; \ - adcs x9, x9, x14; \ - extr x14, x19, x17, #9; \ - adcs x10, x10, x14; \ - ldp x11, x12, [P0+48]; \ - extr x14, x20, x19, #9; \ - adcs x11, x11, x14; \ - extr x14, x21, x20, #9; \ - adcs x12, x12, x14; \ - orr x13, x24, #0xfffffffffffffe00; \ - lsr x14, x21, #9; \ - adcs x13, x13, x14; \ - sbcs x5, x5, xzr; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - and x13, x13, #0x1ff; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ - str x13, [P0+64] + add x0, P0; \ + add x1, P1; \ + add x2, P2; \ + bl local_mul_p521 -// Corresponds exactly to bignum_sqr_p521_alt +// Call local code equivalent to bignum_sqr_p521 #define sqr_p521(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x11, x2, x3; \ - umulh x12, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x10, x2, x4; \ - umulh x13, x2, x4; \ - adds x12, x12, x10; \ - ldp x6, x7, [P1+32]; \ - mul x10, x2, x5; \ - umulh x14, x2, x5; \ - adcs x13, x13, x10; \ - ldp x8, x9, [P1+48]; \ - mul x10, x2, x6; \ - umulh x15, x2, x6; \ - adcs x14, x14, x10; \ - mul x10, x2, x7; \ - umulh x16, x2, x7; \ - adcs x15, x15, x10; \ - mul x10, x2, x8; \ - umulh x17, x2, x8; \ - adcs x16, x16, x10; \ - mul x10, x2, x9; \ - umulh x19, x2, x9; \ - adcs x17, x17, x10; \ - adc x19, x19, xzr; \ - mul x10, x3, x4; \ - adds x13, x13, x10; \ - mul x10, x3, x5; \ - adcs x14, x14, x10; \ 
- mul x10, x3, x6; \ - adcs x15, x15, x10; \ - mul x10, x3, x7; \ - adcs x16, x16, x10; \ - mul x10, x3, x8; \ - adcs x17, x17, x10; \ - mul x10, x3, x9; \ - adcs x19, x19, x10; \ - cset x20, hs; \ - umulh x10, x3, x4; \ - adds x14, x14, x10; \ - umulh x10, x3, x5; \ - adcs x15, x15, x10; \ - umulh x10, x3, x6; \ - adcs x16, x16, x10; \ - umulh x10, x3, x7; \ - adcs x17, x17, x10; \ - umulh x10, x3, x8; \ - adcs x19, x19, x10; \ - umulh x10, x3, x9; \ - adc x20, x20, x10; \ - mul x10, x6, x7; \ - umulh x21, x6, x7; \ - adds x20, x20, x10; \ - adc x21, x21, xzr; \ - mul x10, x4, x5; \ - adds x15, x15, x10; \ - mul x10, x4, x6; \ - adcs x16, x16, x10; \ - mul x10, x4, x7; \ - adcs x17, x17, x10; \ - mul x10, x4, x8; \ - adcs x19, x19, x10; \ - mul x10, x4, x9; \ - adcs x20, x20, x10; \ - mul x10, x6, x8; \ - adcs x21, x21, x10; \ - cset x22, hs; \ - umulh x10, x4, x5; \ - adds x16, x16, x10; \ - umulh x10, x4, x6; \ - adcs x17, x17, x10; \ - umulh x10, x4, x7; \ - adcs x19, x19, x10; \ - umulh x10, x4, x8; \ - adcs x20, x20, x10; \ - umulh x10, x4, x9; \ - adcs x21, x21, x10; \ - umulh x10, x6, x8; \ - adc x22, x22, x10; \ - mul x10, x7, x8; \ - umulh x23, x7, x8; \ - adds x22, x22, x10; \ - adc x23, x23, xzr; \ - mul x10, x5, x6; \ - adds x17, x17, x10; \ - mul x10, x5, x7; \ - adcs x19, x19, x10; \ - mul x10, x5, x8; \ - adcs x20, x20, x10; \ - mul x10, x5, x9; \ - adcs x21, x21, x10; \ - mul x10, x6, x9; \ - adcs x22, x22, x10; \ - mul x10, x7, x9; \ - adcs x23, x23, x10; \ - cset x24, hs; \ - umulh x10, x5, x6; \ - adds x19, x19, x10; \ - umulh x10, x5, x7; \ - adcs x20, x20, x10; \ - umulh x10, x5, x8; \ - adcs x21, x21, x10; \ - umulh x10, x5, x9; \ - adcs x22, x22, x10; \ - umulh x10, x6, x9; \ - adcs x23, x23, x10; \ - umulh x10, x7, x9; \ - adc x24, x24, x10; \ - mul x10, x8, x9; \ - umulh x25, x8, x9; \ - adds x24, x24, x10; \ - adc x25, x25, xzr; \ - adds x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - adcs x15, 
x15, x15; \ - adcs x16, x16, x16; \ - adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - adcs x20, x20, x20; \ - adcs x21, x21, x21; \ - adcs x22, x22, x22; \ - adcs x23, x23, x23; \ - adcs x24, x24, x24; \ - adcs x25, x25, x25; \ - cset x0, hs; \ - umulh x10, x2, x2; \ - adds x11, x11, x10; \ - mul x10, x3, x3; \ - adcs x12, x12, x10; \ - umulh x10, x3, x3; \ - adcs x13, x13, x10; \ - mul x10, x4, x4; \ - adcs x14, x14, x10; \ - umulh x10, x4, x4; \ - adcs x15, x15, x10; \ - mul x10, x5, x5; \ - adcs x16, x16, x10; \ - umulh x10, x5, x5; \ - adcs x17, x17, x10; \ - mul x10, x6, x6; \ - adcs x19, x19, x10; \ - umulh x10, x6, x6; \ - adcs x20, x20, x10; \ - mul x10, x7, x7; \ - adcs x21, x21, x10; \ - umulh x10, x7, x7; \ - adcs x22, x22, x10; \ - mul x10, x8, x8; \ - adcs x23, x23, x10; \ - umulh x10, x8, x8; \ - adcs x24, x24, x10; \ - mul x10, x9, x9; \ - adcs x25, x25, x10; \ - umulh x10, x9, x9; \ - adc x0, x0, x10; \ - ldr x1, [P1+64]; \ - add x1, x1, x1; \ - mul x10, x1, x2; \ - adds x19, x19, x10; \ - umulh x10, x1, x2; \ - adcs x20, x20, x10; \ - mul x10, x1, x4; \ - adcs x21, x21, x10; \ - umulh x10, x1, x4; \ - adcs x22, x22, x10; \ - mul x10, x1, x6; \ - adcs x23, x23, x10; \ - umulh x10, x1, x6; \ - adcs x24, x24, x10; \ - mul x10, x1, x8; \ - adcs x25, x25, x10; \ - umulh x10, x1, x8; \ - adcs x0, x0, x10; \ - lsr x4, x1, #1; \ - mul x4, x4, x4; \ - adc x4, x4, xzr; \ - mul x10, x1, x3; \ - adds x20, x20, x10; \ - umulh x10, x1, x3; \ - adcs x21, x21, x10; \ - mul x10, x1, x5; \ - adcs x22, x22, x10; \ - umulh x10, x1, x5; \ - adcs x23, x23, x10; \ - mul x10, x1, x7; \ - adcs x24, x24, x10; \ - umulh x10, x1, x7; \ - adcs x25, x25, x10; \ - mul x10, x1, x9; \ - adcs x0, x0, x10; \ - umulh x10, x1, x9; \ - adc x4, x4, x10; \ - mul x2, x2, x2; \ - cmp xzr, xzr; \ - extr x10, x20, x19, #9; \ - adcs x2, x2, x10; \ - extr x10, x21, x20, #9; \ - adcs x11, x11, x10; \ - extr x10, x22, x21, #9; \ - adcs x12, x12, x10; \ - extr x10, x23, x22, #9; \ - adcs x13, 
x13, x10; \ - extr x10, x24, x23, #9; \ - adcs x14, x14, x10; \ - extr x10, x25, x24, #9; \ - adcs x15, x15, x10; \ - extr x10, x0, x25, #9; \ - adcs x16, x16, x10; \ - extr x10, x4, x0, #9; \ - adcs x17, x17, x10; \ - orr x19, x19, #0xfffffffffffffe00; \ - lsr x10, x4, #9; \ - adcs x19, x19, x10; \ - sbcs x2, x2, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbcs x15, x15, xzr; \ - sbcs x16, x16, xzr; \ - sbcs x17, x17, xzr; \ - sbc x19, x19, xzr; \ - and x19, x19, #0x1ff; \ - stp x2, x11, [P0]; \ - stp x12, x13, [P0+16]; \ - stp x14, x15, [P0+32]; \ - stp x16, x17, [P0+48]; \ - str x19, [P0+64] + add x0, P0; \ + add x1, P1; \ + bl local_sqr_p521 // Corresponds exactly to bignum_add_p521 @@ -751,372 +156,6 @@ stp x11, x12, [P0+48]; \ str x13, [P0+64] -// Weak multiplication not fully reducing - -#define weakmul_p521(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - mul x15, x3, x5; \ - umulh x16, x3, x5; \ - mul x14, x3, x6; \ - umulh x17, x3, x6; \ - adds x16, x16, x14; \ - ldp x7, x8, [P2+16]; \ - mul x14, x3, x7; \ - umulh x19, x3, x7; \ - adcs x17, x17, x14; \ - mul x14, x3, x8; \ - umulh x20, x3, x8; \ - adcs x19, x19, x14; \ - ldp x9, x10, [P2+32]; \ - mul x14, x3, x9; \ - umulh x21, x3, x9; \ - adcs x20, x20, x14; \ - mul x14, x3, x10; \ - umulh x22, x3, x10; \ - adcs x21, x21, x14; \ - ldp x11, x12, [P2+48]; \ - mul x14, x3, x11; \ - umulh x23, x3, x11; \ - adcs x22, x22, x14; \ - ldr x13, [P2+64]; \ - mul x14, x3, x12; \ - umulh x24, x3, x12; \ - adcs x23, x23, x14; \ - mul x14, x3, x13; \ - umulh x1, x3, x13; \ - adcs x24, x24, x14; \ - adc x1, x1, xzr; \ - mul x14, x4, x5; \ - adds x16, x16, x14; \ - mul x14, x4, x6; \ - adcs x17, x17, x14; \ - mul x14, x4, x7; \ - adcs x19, x19, x14; \ - mul x14, x4, x8; \ - adcs x20, x20, x14; \ - mul x14, x4, x9; \ - adcs x21, x21, x14; \ - mul x14, x4, x10; \ - adcs x22, x22, x14; \ - mul x14, x4, x11; \ - adcs x23, x23, x14; \ - mul x14, x4, x12; \ 
- adcs x24, x24, x14; \ - mul x14, x4, x13; \ - adcs x1, x1, x14; \ - cset x0, hs; \ - umulh x14, x4, x5; \ - adds x17, x17, x14; \ - umulh x14, x4, x6; \ - adcs x19, x19, x14; \ - umulh x14, x4, x7; \ - adcs x20, x20, x14; \ - umulh x14, x4, x8; \ - adcs x21, x21, x14; \ - umulh x14, x4, x9; \ - adcs x22, x22, x14; \ - umulh x14, x4, x10; \ - adcs x23, x23, x14; \ - umulh x14, x4, x11; \ - adcs x24, x24, x14; \ - umulh x14, x4, x12; \ - adcs x1, x1, x14; \ - umulh x14, x4, x13; \ - adc x0, x0, x14; \ - stp x15, x16, [P0]; \ - ldp x3, x4, [P1+16]; \ - mul x14, x3, x5; \ - adds x17, x17, x14; \ - mul x14, x3, x6; \ - adcs x19, x19, x14; \ - mul x14, x3, x7; \ - adcs x20, x20, x14; \ - mul x14, x3, x8; \ - adcs x21, x21, x14; \ - mul x14, x3, x9; \ - adcs x22, x22, x14; \ - mul x14, x3, x10; \ - adcs x23, x23, x14; \ - mul x14, x3, x11; \ - adcs x24, x24, x14; \ - mul x14, x3, x12; \ - adcs x1, x1, x14; \ - mul x14, x3, x13; \ - adcs x0, x0, x14; \ - cset x15, hs; \ - umulh x14, x3, x5; \ - adds x19, x19, x14; \ - umulh x14, x3, x6; \ - adcs x20, x20, x14; \ - umulh x14, x3, x7; \ - adcs x21, x21, x14; \ - umulh x14, x3, x8; \ - adcs x22, x22, x14; \ - umulh x14, x3, x9; \ - adcs x23, x23, x14; \ - umulh x14, x3, x10; \ - adcs x24, x24, x14; \ - umulh x14, x3, x11; \ - adcs x1, x1, x14; \ - umulh x14, x3, x12; \ - adcs x0, x0, x14; \ - umulh x14, x3, x13; \ - adc x15, x15, x14; \ - mul x14, x4, x5; \ - adds x19, x19, x14; \ - mul x14, x4, x6; \ - adcs x20, x20, x14; \ - mul x14, x4, x7; \ - adcs x21, x21, x14; \ - mul x14, x4, x8; \ - adcs x22, x22, x14; \ - mul x14, x4, x9; \ - adcs x23, x23, x14; \ - mul x14, x4, x10; \ - adcs x24, x24, x14; \ - mul x14, x4, x11; \ - adcs x1, x1, x14; \ - mul x14, x4, x12; \ - adcs x0, x0, x14; \ - mul x14, x4, x13; \ - adcs x15, x15, x14; \ - cset x16, hs; \ - umulh x14, x4, x5; \ - adds x20, x20, x14; \ - umulh x14, x4, x6; \ - adcs x21, x21, x14; \ - umulh x14, x4, x7; \ - adcs x22, x22, x14; \ - umulh x14, x4, x8; \ - adcs x23, 
x23, x14; \ - umulh x14, x4, x9; \ - adcs x24, x24, x14; \ - umulh x14, x4, x10; \ - adcs x1, x1, x14; \ - umulh x14, x4, x11; \ - adcs x0, x0, x14; \ - umulh x14, x4, x12; \ - adcs x15, x15, x14; \ - umulh x14, x4, x13; \ - adc x16, x16, x14; \ - stp x17, x19, [P0+16]; \ - ldp x3, x4, [P1+32]; \ - mul x14, x3, x5; \ - adds x20, x20, x14; \ - mul x14, x3, x6; \ - adcs x21, x21, x14; \ - mul x14, x3, x7; \ - adcs x22, x22, x14; \ - mul x14, x3, x8; \ - adcs x23, x23, x14; \ - mul x14, x3, x9; \ - adcs x24, x24, x14; \ - mul x14, x3, x10; \ - adcs x1, x1, x14; \ - mul x14, x3, x11; \ - adcs x0, x0, x14; \ - mul x14, x3, x12; \ - adcs x15, x15, x14; \ - mul x14, x3, x13; \ - adcs x16, x16, x14; \ - cset x17, hs; \ - umulh x14, x3, x5; \ - adds x21, x21, x14; \ - umulh x14, x3, x6; \ - adcs x22, x22, x14; \ - umulh x14, x3, x7; \ - adcs x23, x23, x14; \ - umulh x14, x3, x8; \ - adcs x24, x24, x14; \ - umulh x14, x3, x9; \ - adcs x1, x1, x14; \ - umulh x14, x3, x10; \ - adcs x0, x0, x14; \ - umulh x14, x3, x11; \ - adcs x15, x15, x14; \ - umulh x14, x3, x12; \ - adcs x16, x16, x14; \ - umulh x14, x3, x13; \ - adc x17, x17, x14; \ - mul x14, x4, x5; \ - adds x21, x21, x14; \ - mul x14, x4, x6; \ - adcs x22, x22, x14; \ - mul x14, x4, x7; \ - adcs x23, x23, x14; \ - mul x14, x4, x8; \ - adcs x24, x24, x14; \ - mul x14, x4, x9; \ - adcs x1, x1, x14; \ - mul x14, x4, x10; \ - adcs x0, x0, x14; \ - mul x14, x4, x11; \ - adcs x15, x15, x14; \ - mul x14, x4, x12; \ - adcs x16, x16, x14; \ - mul x14, x4, x13; \ - adcs x17, x17, x14; \ - cset x19, hs; \ - umulh x14, x4, x5; \ - adds x22, x22, x14; \ - umulh x14, x4, x6; \ - adcs x23, x23, x14; \ - umulh x14, x4, x7; \ - adcs x24, x24, x14; \ - umulh x14, x4, x8; \ - adcs x1, x1, x14; \ - umulh x14, x4, x9; \ - adcs x0, x0, x14; \ - umulh x14, x4, x10; \ - adcs x15, x15, x14; \ - umulh x14, x4, x11; \ - adcs x16, x16, x14; \ - umulh x14, x4, x12; \ - adcs x17, x17, x14; \ - umulh x14, x4, x13; \ - adc x19, x19, x14; \ - stp x20, 
x21, [P0+32]; \ - ldp x3, x4, [P1+48]; \ - mul x14, x3, x5; \ - adds x22, x22, x14; \ - mul x14, x3, x6; \ - adcs x23, x23, x14; \ - mul x14, x3, x7; \ - adcs x24, x24, x14; \ - mul x14, x3, x8; \ - adcs x1, x1, x14; \ - mul x14, x3, x9; \ - adcs x0, x0, x14; \ - mul x14, x3, x10; \ - adcs x15, x15, x14; \ - mul x14, x3, x11; \ - adcs x16, x16, x14; \ - mul x14, x3, x12; \ - adcs x17, x17, x14; \ - mul x14, x3, x13; \ - adcs x19, x19, x14; \ - cset x20, hs; \ - umulh x14, x3, x5; \ - adds x23, x23, x14; \ - umulh x14, x3, x6; \ - adcs x24, x24, x14; \ - umulh x14, x3, x7; \ - adcs x1, x1, x14; \ - umulh x14, x3, x8; \ - adcs x0, x0, x14; \ - umulh x14, x3, x9; \ - adcs x15, x15, x14; \ - umulh x14, x3, x10; \ - adcs x16, x16, x14; \ - umulh x14, x3, x11; \ - adcs x17, x17, x14; \ - umulh x14, x3, x12; \ - adcs x19, x19, x14; \ - umulh x14, x3, x13; \ - adc x20, x20, x14; \ - mul x14, x4, x5; \ - adds x23, x23, x14; \ - mul x14, x4, x6; \ - adcs x24, x24, x14; \ - mul x14, x4, x7; \ - adcs x1, x1, x14; \ - mul x14, x4, x8; \ - adcs x0, x0, x14; \ - mul x14, x4, x9; \ - adcs x15, x15, x14; \ - mul x14, x4, x10; \ - adcs x16, x16, x14; \ - mul x14, x4, x11; \ - adcs x17, x17, x14; \ - mul x14, x4, x12; \ - adcs x19, x19, x14; \ - mul x14, x4, x13; \ - adcs x20, x20, x14; \ - cset x21, hs; \ - umulh x14, x4, x5; \ - adds x24, x24, x14; \ - umulh x14, x4, x6; \ - adcs x1, x1, x14; \ - umulh x14, x4, x7; \ - adcs x0, x0, x14; \ - umulh x14, x4, x8; \ - adcs x15, x15, x14; \ - umulh x14, x4, x9; \ - adcs x16, x16, x14; \ - umulh x14, x4, x10; \ - adcs x17, x17, x14; \ - umulh x14, x4, x11; \ - adcs x19, x19, x14; \ - umulh x14, x4, x12; \ - adcs x20, x20, x14; \ - umulh x14, x4, x13; \ - adc x21, x21, x14; \ - stp x22, x23, [P0+48]; \ - ldr x3, [P1+64]; \ - mul x14, x3, x5; \ - adds x24, x24, x14; \ - mul x14, x3, x6; \ - adcs x1, x1, x14; \ - mul x14, x3, x7; \ - adcs x0, x0, x14; \ - mul x14, x3, x8; \ - adcs x15, x15, x14; \ - mul x14, x3, x9; \ - adcs x16, x16, x14; \ 
- mul x14, x3, x10; \ - adcs x17, x17, x14; \ - mul x14, x3, x11; \ - adcs x19, x19, x14; \ - mul x14, x3, x12; \ - adcs x20, x20, x14; \ - mul x14, x3, x13; \ - adc x21, x21, x14; \ - umulh x14, x3, x5; \ - adds x1, x1, x14; \ - umulh x14, x3, x6; \ - adcs x0, x0, x14; \ - umulh x14, x3, x7; \ - adcs x15, x15, x14; \ - umulh x14, x3, x8; \ - adcs x16, x16, x14; \ - umulh x14, x3, x9; \ - adcs x17, x17, x14; \ - umulh x14, x3, x10; \ - adcs x19, x19, x14; \ - umulh x14, x3, x11; \ - adcs x20, x20, x14; \ - umulh x14, x3, x12; \ - adc x21, x21, x14; \ - ldp x5, x6, [P0]; \ - extr x14, x1, x24, #9; \ - adds x5, x5, x14; \ - extr x14, x0, x1, #9; \ - adcs x6, x6, x14; \ - ldp x7, x8, [P0+16]; \ - extr x14, x15, x0, #9; \ - adcs x7, x7, x14; \ - extr x14, x16, x15, #9; \ - adcs x8, x8, x14; \ - ldp x9, x10, [P0+32]; \ - extr x14, x17, x16, #9; \ - adcs x9, x9, x14; \ - extr x14, x19, x17, #9; \ - adcs x10, x10, x14; \ - ldp x11, x12, [P0+48]; \ - extr x14, x20, x19, #9; \ - adcs x11, x11, x14; \ - extr x14, x21, x20, #9; \ - adcs x12, x12, x14; \ - and x13, x24, #0x1ff; \ - lsr x14, x21, #9; \ - adc x13, x13, x14; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ - str x13, [P0+64] - // P0 = C * P1 - D * P2 == C * P1 + D * (p_521 - P2) #define cmsub_p521(P0,C,P1,D,P2) \ @@ -1384,6 +423,7 @@ S2N_BN_SYMBOL(p521_jdouble): stp x23, x24, [sp, #-16]! stp x25, x26, [sp, #-16]! stp x27, x28, [sp, #-16]! + stp x29, x30, [sp, #-16]! 
sub sp, sp, NSPACE // Move the input arguments to stable places @@ -1411,7 +451,7 @@ S2N_BN_SYMBOL(p521_jdouble): add_p521(t1,y_1,z_1) sqr_p521(x4p,x2p) - weakmul_p521(xy2,x_1,y2) + mul_p521(xy2,x_1,y2) // t2 = (y + z)^2 @@ -1431,7 +471,7 @@ S2N_BN_SYMBOL(p521_jdouble): // dx2 = d * x2p sub_p521(z_3,t1,y2) - weakmul_p521(dx2,d,x2p) + mul_p521(dx2,d,x2p) // x' = 4 * xy2 - d @@ -1445,12 +485,1060 @@ S2N_BN_SYMBOL(p521_jdouble): add sp, sp, NSPACE + ldp x29, x30, [sp], 16 ldp x27, x28, [sp], 16 ldp x25, x26, [sp], 16 ldp x23, x24, [sp], 16 ldp x21, x22, [sp], 16 ldp x19, x20, [sp], 16 + ret + +// Local versions of the two "big" field operations, almost identical to +// bignum_mul_p521 and bignum_sqr_p521 except for avoiding the intial +// register save-restore, and in the case of local_mul_p521, using the +// output buffer as temporary storage, slightly reordering a few loads +// and stores to make it aliasing-proof. + +local_mul_p521: + ldp x3, x4, [x1] + ldp x5, x6, [x1, #16] + ldp x7, x8, [x2] + ldp x9, x10, [x2, #16] + mul x11, x3, x7 + mul x15, x4, x8 + mul x16, x5, x9 + mul x17, x6, x10 + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + subs x24, x5, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x9 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x16, x16, x23 + eor x22, x22, x21 + adcs x17, x17, x22 + adc x19, x19, x21 + subs x24, x3, x4 + cneg x24, x24, lo + csetm x21, lo + subs x22, x8, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x12, x12, x23 + eor x22, x22, 
x21 + adcs x13, x13, x22 + adcs x14, x14, x21 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x15, x15, x23 + eor x22, x22, x21 + adcs x16, x16, x22 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x13, x13, x23 + eor x22, x22, x21 + adcs x14, x14, x22 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + lsl x21, x11, #9 + extr x11, x12, x11, #55 + extr x12, x13, x12, #55 + extr x13, x14, x13, #55 + lsr x14, x14, #55 + ldp x3, x4, [x1, #32] + ldp x5, x6, [x1, #48] + ldp x7, x8, [x2, #32] + ldp x9, x10, [x2, #48] + stp x15, x16, [x0] + stp x17, x19, [x0, #16] + stp x21, x11, [x0, #32] + stp x12, x13, [x0, #48] + str x14, [x0, #64] + mul x11, x3, x7 + mul x15, x4, x8 + mul x16, x5, x9 + mul x17, x6, x10 + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 
+ adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + subs x24, x5, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x9 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x16, x16, x23 + eor x22, x22, x21 + adcs x17, x17, x22 + adc x19, x19, x21 + subs x24, x3, x4 + cneg x24, x24, lo + csetm x21, lo + subs x22, x8, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x12, x12, x23 + eor x22, x22, x21 + adcs x13, x13, x22 + adcs x14, x14, x21 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x15, x15, x23 + eor x22, x22, x21 + adcs x16, x16, x22 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x13, x13, x23 + eor x22, x22, x21 + adcs x14, x14, x22 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs 
x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + ldp x23, x22, [x0] + adds x11, x11, x23 + adcs x12, x12, x22 + stp x11, x12, [x0] + ldp x23, x22, [x0, #16] + adcs x13, x13, x23 + adcs x14, x14, x22 + stp x13, x14, [x0, #16] + ldp x23, x22, [x0, #32] + adcs x15, x15, x23 + adcs x16, x16, x22 + stp x15, x16, [x0, #32] + ldp x23, x22, [x0, #48] + adcs x17, x17, x23 + adcs x19, x19, x22 + stp x17, x19, [x0, #48] + ldr x21, [x0, #64] + adc x21, x21, xzr + str x21, [x0, #64] + ldp x23, x22, [x1] + subs x3, x3, x23 + sbcs x4, x4, x22 + ldp x23, x22, [x1, #16] + sbcs x5, x5, x23 + sbcs x6, x6, x22 + csetm x24, lo + ldp x23, x22, [x2] + subs x7, x23, x7 + sbcs x8, x22, x8 + ldp x23, x22, [x2, #16] + sbcs x9, x23, x9 + sbcs x10, x22, x10 + csetm x25, lo + eor x3, x3, x24 + subs x3, x3, x24 + eor x4, x4, x24 + sbcs x4, x4, x24 + eor x5, x5, x24 + sbcs x5, x5, x24 + eor x6, x6, x24 + sbc x6, x6, x24 + eor x7, x7, x25 + subs x7, x7, x25 + eor x8, x8, x25 + sbcs x8, x8, x25 + eor x9, x9, x25 + sbcs x9, x9, x25 + eor x10, x10, x25 + sbc x10, x10, x25 + eor x25, x25, x24 + mul x11, x3, x7 + mul x15, x4, x8 + mul x16, x5, x9 + mul x17, x6, x10 + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + subs x24, x5, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x9 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x16, x16, x23 + eor x22, x22, x21 + adcs x17, x17, x22 + adc x19, x19, x21 + subs x24, x3, x4 + cneg x24, x24, lo + csetm x21, lo + subs x22, x8, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + 
cmn x21, #1 + eor x23, x23, x21 + adcs x12, x12, x23 + eor x22, x22, x21 + adcs x13, x13, x22 + adcs x14, x14, x21 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x15, x15, x23 + eor x22, x22, x21 + adcs x16, x16, x22 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x13, x13, x23 + eor x22, x22, x21 + adcs x14, x14, x22 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + ldp x3, x4, [x0] + ldp x5, x6, [x0, #16] + eor x11, x11, x25 + adds x11, x11, x3 + eor x12, x12, x25 + adcs x12, x12, x4 + eor x13, x13, x25 + adcs x13, x13, x5 + eor x14, x14, x25 + adcs x14, x14, x6 + eor x15, x15, x25 + ldp x7, x8, [x0, #32] + ldp x9, x10, [x0, #48] + ldr x20, [x0, #64] + adcs x15, x15, x7 + eor x16, x16, x25 + adcs x16, x16, x8 + eor x17, x17, x25 + adcs x17, x17, x9 + eor x19, x19, x25 + adcs x19, x19, x10 + adc x21, x20, xzr + adds x15, x15, x3 + adcs x16, x16, x4 + adcs x17, x17, x5 + adcs x19, x19, x6 + and 
x25, x25, #0x1ff + lsl x24, x11, #9 + orr x24, x24, x25 + adcs x7, x7, x24 + extr x24, x12, x11, #55 + adcs x8, x8, x24 + extr x24, x13, x12, #55 + adcs x9, x9, x24 + extr x24, x14, x13, #55 + adcs x10, x10, x24 + lsr x24, x14, #55 + adc x20, x24, x20 + ldr x6, [x2, #64] + ldp x3, x4, [x1] + and x23, x3, #0xfffffffffffff + mul x23, x6, x23 + ldr x14, [x1, #64] + ldp x11, x12, [x2] + and x24, x11, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 + extr x24, x4, x3, #52 + and x24, x24, #0xfffffffffffff + mul x22, x6, x24 + extr x24, x12, x11, #52 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x24, x22, x23, #12 + adds x15, x15, x24 + ldp x5, x3, [x1, #16] + ldp x13, x11, [x2, #16] + extr x24, x5, x4, #40 + and x24, x24, #0xfffffffffffff + mul x23, x6, x24 + extr x24, x13, x12, #40 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 + lsr x24, x22, #52 + add x23, x23, x24 + lsl x22, x22, #12 + extr x24, x23, x22, #24 + adcs x16, x16, x24 + extr x24, x3, x5, #28 + and x24, x24, #0xfffffffffffff + mul x22, x6, x24 + extr x24, x11, x13, #28 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x24, x22, x23, #36 + adcs x17, x17, x24 + and x26, x16, x17 + ldp x4, x5, [x1, #32] + ldp x12, x13, [x2, #32] + extr x24, x4, x3, #16 + and x24, x24, #0xfffffffffffff + mul x23, x6, x24 + extr x24, x12, x11, #16 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 + lsl x21, x21, #48 + add x23, x23, x21 + lsr x24, x22, #52 + add x23, x23, x24 + lsl x22, x22, #12 + extr x24, x23, x22, #48 + adcs x19, x19, x24 + and x26, x26, x19 + lsr x24, x4, #4 + and x24, x24, #0xfffffffffffff + mul x22, x6, x24 + lsr x24, x12, #4 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x25, 
x22, x23, #60 + extr x24, x5, x4, #56 + and x24, x24, #0xfffffffffffff + mul x23, x6, x24 + extr x24, x13, x12, #56 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 + lsr x24, x22, #52 + add x23, x23, x24 + lsl x25, x25, #8 + extr x24, x23, x25, #8 + adcs x7, x7, x24 + and x26, x26, x7 + ldp x3, x4, [x1, #48] + ldp x11, x12, [x2, #48] + extr x24, x3, x5, #44 + and x24, x24, #0xfffffffffffff + mul x22, x6, x24 + extr x24, x11, x13, #44 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x24, x22, x23, #20 + adcs x8, x8, x24 + and x26, x26, x8 + extr x24, x4, x3, #32 + and x24, x24, #0xfffffffffffff + mul x23, x6, x24 + extr x24, x12, x11, #32 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 + lsr x24, x22, #52 + add x23, x23, x24 + lsl x22, x22, #12 + extr x24, x23, x22, #32 + adcs x9, x9, x24 + and x26, x26, x9 + lsr x24, x4, #20 + mul x22, x6, x24 + lsr x24, x12, #20 + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x24, x22, x23, #44 + adcs x10, x10, x24 + and x26, x26, x10 + mul x24, x6, x14 + lsr x22, x22, #44 + add x24, x24, x22 + adc x20, x20, x24 + lsr x22, x20, #9 + orr x20, x20, #0xfffffffffffffe00 + cmp xzr, xzr + adcs xzr, x15, x22 + adcs xzr, x26, xzr + adcs xzr, x20, xzr + adcs x15, x15, x22 + adcs x16, x16, xzr + adcs x17, x17, xzr + adcs x19, x19, xzr + adcs x7, x7, xzr + adcs x8, x8, xzr + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x20, x20, xzr + and x22, x15, #0x1ff + extr x15, x16, x15, #9 + extr x16, x17, x16, #9 + stp x15, x16, [x0] + extr x17, x19, x17, #9 + extr x19, x7, x19, #9 + stp x17, x19, [x0, #16] + extr x7, x8, x7, #9 + extr x8, x9, x8, #9 + stp x7, x8, [x0, #32] + extr x9, x10, x9, #9 + extr x10, x20, x10, #9 + stp x9, x10, [x0, #48] + str x22, [x0, #64] + ret +local_sqr_p521: + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + ldp x6, x7, [x1, #32] 
+ ldp x8, x9, [x1, #48] + mul x12, x6, x8 + mul x17, x7, x9 + umulh x22, x6, x8 + subs x23, x6, x7 + cneg x23, x23, cc + csetm x11, cc + subs x10, x9, x8 + cneg x10, x10, cc + mul x16, x23, x10 + umulh x10, x23, x10 + cinv x11, x11, cc + eor x16, x16, x11 + eor x10, x10, x11 + adds x13, x12, x22 + adc x22, x22, xzr + umulh x23, x7, x9 + adds x13, x13, x17 + adcs x22, x22, x23 + adc x23, x23, xzr + adds x22, x22, x17 + adc x23, x23, xzr + cmn x11, #0x1 + adcs x13, x13, x16 + adcs x22, x22, x10 + adc x23, x23, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adcs x22, x22, x22 + adcs x23, x23, x23 + adc x19, xzr, xzr + mul x10, x6, x6 + mul x16, x7, x7 + mul x21, x6, x7 + umulh x11, x6, x6 + umulh x17, x7, x7 + umulh x20, x6, x7 + adds x11, x11, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x11, x11, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x12, x12, x16 + adcs x13, x13, x17 + adcs x22, x22, xzr + adcs x23, x23, xzr + adc x19, x19, xzr + mul x14, x8, x8 + mul x16, x9, x9 + mul x21, x8, x9 + umulh x15, x8, x8 + umulh x17, x9, x9 + umulh x20, x8, x9 + adds x15, x15, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x15, x15, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x14, x14, x22 + adcs x15, x15, x23 + adcs x16, x16, x19 + adc x17, x17, xzr + ldr x19, [x1, #64] + add x23, x19, x19 + mul x19, x19, x19 + and x21, x2, #0xfffffffffffff + mul x21, x23, x21 + extr x20, x3, x2, #52 + and x20, x20, #0xfffffffffffff + mul x20, x23, x20 + lsr x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x22, x20, x21, #12 + adds x10, x10, x22 + extr x21, x4, x3, #40 + and x21, x21, #0xfffffffffffff + mul x21, x23, x21 + lsr x22, x20, #52 + add x21, x21, x22 + lsl x20, x20, #12 + extr x22, x21, x20, #24 + adcs x11, x11, x22 + extr x20, x5, x4, #28 + and x20, x20, #0xfffffffffffff + mul x20, x23, x20 + lsr x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x22, x20, x21, #36 + adcs x12, x12, x22 + extr x21, x6, x5, #16 + and x21, x21, 
#0xfffffffffffff + mul x21, x23, x21 + lsr x22, x20, #52 + add x21, x21, x22 + lsl x20, x20, #12 + extr x22, x21, x20, #48 + adcs x13, x13, x22 + lsr x20, x6, #4 + and x20, x20, #0xfffffffffffff + mul x20, x23, x20 + lsr x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x24, x20, x21, #60 + extr x21, x7, x6, #56 + and x21, x21, #0xfffffffffffff + mul x21, x23, x21 + lsr x22, x20, #52 + add x21, x21, x22 + lsl x24, x24, #8 + extr x22, x21, x24, #8 + adcs x14, x14, x22 + extr x20, x8, x7, #44 + and x20, x20, #0xfffffffffffff + mul x20, x23, x20 + lsr x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x22, x20, x21, #20 + adcs x15, x15, x22 + extr x21, x9, x8, #32 + and x21, x21, #0xfffffffffffff + mul x21, x23, x21 + lsr x22, x20, #52 + add x21, x21, x22 + lsl x20, x20, #12 + extr x22, x21, x20, #32 + adcs x16, x16, x22 + lsr x20, x9, #20 + mul x20, x23, x20 + lsr x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x22, x20, x21, #44 + adcs x17, x17, x22 + lsr x20, x20, #44 + adc x19, x19, x20 + extr x21, x11, x10, #9 + extr x20, x12, x11, #9 + stp x21, x20, [x0] + extr x21, x13, x12, #9 + extr x20, x14, x13, #9 + stp x21, x20, [x0, #16] + extr x21, x15, x14, #9 + extr x20, x16, x15, #9 + stp x21, x20, [x0, #32] + extr x21, x17, x16, #9 + extr x20, x19, x17, #9 + stp x21, x20, [x0, #48] + and x22, x10, #0x1ff + lsr x19, x19, #9 + add x22, x22, x19 + str x22, [x0, #64] + mul x12, x2, x4 + mul x17, x3, x5 + umulh x22, x2, x4 + subs x23, x2, x3 + cneg x23, x23, cc + csetm x11, cc + subs x10, x5, x4 + cneg x10, x10, cc + mul x16, x23, x10 + umulh x10, x23, x10 + cinv x11, x11, cc + eor x16, x16, x11 + eor x10, x10, x11 + adds x13, x12, x22 + adc x22, x22, xzr + umulh x23, x3, x5 + adds x13, x13, x17 + adcs x22, x22, x23 + adc x23, x23, xzr + adds x22, x22, x17 + adc x23, x23, xzr + cmn x11, #0x1 + adcs x13, x13, x16 + adcs x22, x22, x10 + adc x23, x23, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adcs x22, x22, x22 + adcs x23, x23, x23 + 
adc x19, xzr, xzr + mul x10, x2, x2 + mul x16, x3, x3 + mul x21, x2, x3 + umulh x11, x2, x2 + umulh x17, x3, x3 + umulh x20, x2, x3 + adds x11, x11, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x11, x11, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x12, x12, x16 + adcs x13, x13, x17 + adcs x22, x22, xzr + adcs x23, x23, xzr + adc x19, x19, xzr + mul x14, x4, x4 + mul x16, x5, x5 + mul x21, x4, x5 + umulh x15, x4, x4 + umulh x17, x5, x5 + umulh x20, x4, x5 + adds x15, x15, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x15, x15, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x14, x14, x22 + adcs x15, x15, x23 + adcs x16, x16, x19 + adc x17, x17, xzr + ldp x21, x20, [x0] + adds x21, x21, x10 + adcs x20, x20, x11 + stp x21, x20, [x0] + ldp x21, x20, [x0, #16] + adcs x21, x21, x12 + adcs x20, x20, x13 + stp x21, x20, [x0, #16] + ldp x21, x20, [x0, #32] + adcs x21, x21, x14 + adcs x20, x20, x15 + stp x21, x20, [x0, #32] + ldp x21, x20, [x0, #48] + adcs x21, x21, x16 + adcs x20, x20, x17 + stp x21, x20, [x0, #48] + ldr x22, [x0, #64] + adc x22, x22, xzr + str x22, [x0, #64] + mul x10, x2, x6 + mul x14, x3, x7 + mul x15, x4, x8 + mul x16, x5, x9 + umulh x17, x2, x6 + adds x14, x14, x17 + umulh x17, x3, x7 + adcs x15, x15, x17 + umulh x17, x4, x8 + adcs x16, x16, x17 + umulh x17, x5, x9 + adc x17, x17, xzr + adds x11, x14, x10 + adcs x14, x15, x14 + adcs x15, x16, x15 + adcs x16, x17, x16 + adc x17, xzr, x17 + adds x12, x14, x10 + adcs x13, x15, x11 + adcs x14, x16, x14 + adcs x15, x17, x15 + adcs x16, xzr, x16 + adc x17, xzr, x17 + subs x22, x4, x5 + cneg x22, x22, cc + csetm x19, cc + subs x20, x9, x8 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x15, x15, x21 + eor x20, x20, x19 + adcs x16, x16, x20 + adc x17, x17, x19 + subs x22, x2, x3 + cneg x22, x22, cc + csetm x19, cc + subs x20, x7, x6 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, 
cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x11, x11, x21 + eor x20, x20, x19 + adcs x12, x12, x20 + adcs x13, x13, x19 + adcs x14, x14, x19 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x3, x5 + cneg x22, x22, cc + csetm x19, cc + subs x20, x9, x7 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x14, x14, x21 + eor x20, x20, x19 + adcs x15, x15, x20 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x2, x4 + cneg x22, x22, cc + csetm x19, cc + subs x20, x8, x6 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x12, x12, x21 + eor x20, x20, x19 + adcs x13, x13, x20 + adcs x14, x14, x19 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x2, x5 + cneg x22, x22, cc + csetm x19, cc + subs x20, x9, x6 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x13, x13, x21 + eor x20, x20, x19 + adcs x14, x14, x20 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x3, x4 + cneg x22, x22, cc + csetm x19, cc + subs x20, x8, x7 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x13, x13, x21 + eor x20, x20, x19 + adcs x14, x14, x20 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + ldp x21, x20, [x0] + extr x2, x15, x14, #8 + adds x2, x2, x21 + extr x3, x16, x15, #8 + adcs x3, x3, x20 + ldp x21, x20, [x0, #16] + extr x4, x17, x16, #8 + adcs x4, x4, x21 + and x22, x3, x4 + lsr x5, x17, #8 + adcs x5, x5, x20 + and x22, x22, x5 + ldp x21, x20, [x0, #32] + lsl x6, x10, #1 + adcs x6, x6, x21 + and x22, x22, x6 + extr x7, x11, x10, #63 + adcs x7, x7, x20 + and x22, x22, x7 + ldp x21, x20, [x0, #48] + extr x8, x12, x11, #63 + adcs x8, x8, x21 + and x22, x22, x8 + extr x9, x13, x12, #63 + adcs x9, x9, x20 + 
and x22, x22, x9 + ldr x21, [x0, #64] + extr x10, x14, x13, #63 + and x10, x10, #0x1ff + adc x10, x21, x10 + lsr x20, x10, #9 + orr x10, x10, #0xfffffffffffffe00 + cmp xzr, xzr + adcs xzr, x2, x20 + adcs xzr, x22, xzr + adcs xzr, x10, xzr + adcs x2, x2, x20 + adcs x3, x3, xzr + adcs x4, x4, xzr + adcs x5, x5, xzr + adcs x6, x6, xzr + adcs x7, x7, xzr + adcs x8, x8, xzr + adcs x9, x9, xzr + adc x10, x10, xzr + and x10, x10, #0x1ff + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + stp x6, x7, [x0, #32] + stp x8, x9, [x0, #48] + str x10, [x0, #64] ret #if defined(__linux__) && defined(__ELF__) diff --git a/third_party/s2n-bignum/arm/p521/p521_jdouble_alt.S b/third_party/s2n-bignum/arm/p521/p521_jdouble_alt.S new file mode 100644 index 0000000000..fa61dcf8d9 --- /dev/null +++ b/third_party/s2n-bignum/arm/p521/p521_jdouble_alt.S @@ -0,0 +1,1458 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on NIST curve P-521 in Jacobian coordinates +// +// extern void p521_jdouble_alt +// (uint64_t p3[static 27],uint64_t p1[static 27]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// It is assumed that all coordinates of the input point are fully +// reduced mod p_521 and that the z coordinate is not zero. 
+// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jdouble_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 72 + +// Stable homes for input arguments during main code sequence + +#define input_z x26 +#define input_x x27 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries + +#define z2 sp, #(NUMSIZE*0) +#define y2 sp, #(NUMSIZE*1) +#define x2p sp, #(NUMSIZE*2) +#define xy2 sp, #(NUMSIZE*3) + +#define y4 sp, #(NUMSIZE*4) +#define t2 sp, #(NUMSIZE*4) + +#define dx2 sp, #(NUMSIZE*5) +#define t1 sp, #(NUMSIZE*5) + +#define d sp, #(NUMSIZE*6) +#define x4p sp, #(NUMSIZE*6) + +// NUMSIZE*7 is not 16-aligned so we round it up + +#define NSPACE (NUMSIZE*7+8) + +// Corresponds exactly to bignum_mul_p521_alt + +#define mul_p521(P0,P1,P2) \ + ldp x3, x4, [P1]; \ + ldp x5, x6, [P2]; \ + mul x15, x3, x5; \ + umulh x16, x3, x5; \ + mul x14, x3, x6; \ + umulh x17, x3, x6; \ + adds x16, x16, x14; \ + ldp x7, x8, [P2+16]; \ + mul x14, x3, x7; \ + umulh x19, x3, x7; \ + adcs x17, x17, x14; \ + mul x14, x3, x8; \ + umulh x20, x3, x8; \ + adcs x19, x19, x14; \ + ldp x9, x10, [P2+32]; \ + mul x14, x3, x9; \ + umulh x21, x3, x9; \ + adcs x20, x20, x14; \ + mul x14, x3, x10; \ + umulh x22, x3, x10; \ + adcs x21, x21, x14; \ + ldp x11, x12, [P2+48]; \ + mul x14, x3, x11; \ + umulh x23, x3, x11; \ + adcs x22, x22, x14; \ + ldr x13, [P2+64]; \ + mul x14, x3, x12; \ + umulh x24, x3, x12; \ + adcs x23, x23, x14; \ + mul x14, x3, x13; \ + umulh x1, x3, x13; \ + adcs x24, x24, x14; \ + adc x1, x1, xzr; \ + mul x14, x4, x5; \ + adds x16, x16, 
x14; \ + mul x14, x4, x6; \ + adcs x17, x17, x14; \ + mul x14, x4, x7; \ + adcs x19, x19, x14; \ + mul x14, x4, x8; \ + adcs x20, x20, x14; \ + mul x14, x4, x9; \ + adcs x21, x21, x14; \ + mul x14, x4, x10; \ + adcs x22, x22, x14; \ + mul x14, x4, x11; \ + adcs x23, x23, x14; \ + mul x14, x4, x12; \ + adcs x24, x24, x14; \ + mul x14, x4, x13; \ + adcs x1, x1, x14; \ + cset x0, hs; \ + umulh x14, x4, x5; \ + adds x17, x17, x14; \ + umulh x14, x4, x6; \ + adcs x19, x19, x14; \ + umulh x14, x4, x7; \ + adcs x20, x20, x14; \ + umulh x14, x4, x8; \ + adcs x21, x21, x14; \ + umulh x14, x4, x9; \ + adcs x22, x22, x14; \ + umulh x14, x4, x10; \ + adcs x23, x23, x14; \ + umulh x14, x4, x11; \ + adcs x24, x24, x14; \ + umulh x14, x4, x12; \ + adcs x1, x1, x14; \ + umulh x14, x4, x13; \ + adc x0, x0, x14; \ + stp x15, x16, [P0]; \ + ldp x3, x4, [P1+16]; \ + mul x14, x3, x5; \ + adds x17, x17, x14; \ + mul x14, x3, x6; \ + adcs x19, x19, x14; \ + mul x14, x3, x7; \ + adcs x20, x20, x14; \ + mul x14, x3, x8; \ + adcs x21, x21, x14; \ + mul x14, x3, x9; \ + adcs x22, x22, x14; \ + mul x14, x3, x10; \ + adcs x23, x23, x14; \ + mul x14, x3, x11; \ + adcs x24, x24, x14; \ + mul x14, x3, x12; \ + adcs x1, x1, x14; \ + mul x14, x3, x13; \ + adcs x0, x0, x14; \ + cset x15, hs; \ + umulh x14, x3, x5; \ + adds x19, x19, x14; \ + umulh x14, x3, x6; \ + adcs x20, x20, x14; \ + umulh x14, x3, x7; \ + adcs x21, x21, x14; \ + umulh x14, x3, x8; \ + adcs x22, x22, x14; \ + umulh x14, x3, x9; \ + adcs x23, x23, x14; \ + umulh x14, x3, x10; \ + adcs x24, x24, x14; \ + umulh x14, x3, x11; \ + adcs x1, x1, x14; \ + umulh x14, x3, x12; \ + adcs x0, x0, x14; \ + umulh x14, x3, x13; \ + adc x15, x15, x14; \ + mul x14, x4, x5; \ + adds x19, x19, x14; \ + mul x14, x4, x6; \ + adcs x20, x20, x14; \ + mul x14, x4, x7; \ + adcs x21, x21, x14; \ + mul x14, x4, x8; \ + adcs x22, x22, x14; \ + mul x14, x4, x9; \ + adcs x23, x23, x14; \ + mul x14, x4, x10; \ + adcs x24, x24, x14; \ + mul x14, x4, x11; \ + 
adcs x1, x1, x14; \ + mul x14, x4, x12; \ + adcs x0, x0, x14; \ + mul x14, x4, x13; \ + adcs x15, x15, x14; \ + cset x16, hs; \ + umulh x14, x4, x5; \ + adds x20, x20, x14; \ + umulh x14, x4, x6; \ + adcs x21, x21, x14; \ + umulh x14, x4, x7; \ + adcs x22, x22, x14; \ + umulh x14, x4, x8; \ + adcs x23, x23, x14; \ + umulh x14, x4, x9; \ + adcs x24, x24, x14; \ + umulh x14, x4, x10; \ + adcs x1, x1, x14; \ + umulh x14, x4, x11; \ + adcs x0, x0, x14; \ + umulh x14, x4, x12; \ + adcs x15, x15, x14; \ + umulh x14, x4, x13; \ + adc x16, x16, x14; \ + stp x17, x19, [P0+16]; \ + ldp x3, x4, [P1+32]; \ + mul x14, x3, x5; \ + adds x20, x20, x14; \ + mul x14, x3, x6; \ + adcs x21, x21, x14; \ + mul x14, x3, x7; \ + adcs x22, x22, x14; \ + mul x14, x3, x8; \ + adcs x23, x23, x14; \ + mul x14, x3, x9; \ + adcs x24, x24, x14; \ + mul x14, x3, x10; \ + adcs x1, x1, x14; \ + mul x14, x3, x11; \ + adcs x0, x0, x14; \ + mul x14, x3, x12; \ + adcs x15, x15, x14; \ + mul x14, x3, x13; \ + adcs x16, x16, x14; \ + cset x17, hs; \ + umulh x14, x3, x5; \ + adds x21, x21, x14; \ + umulh x14, x3, x6; \ + adcs x22, x22, x14; \ + umulh x14, x3, x7; \ + adcs x23, x23, x14; \ + umulh x14, x3, x8; \ + adcs x24, x24, x14; \ + umulh x14, x3, x9; \ + adcs x1, x1, x14; \ + umulh x14, x3, x10; \ + adcs x0, x0, x14; \ + umulh x14, x3, x11; \ + adcs x15, x15, x14; \ + umulh x14, x3, x12; \ + adcs x16, x16, x14; \ + umulh x14, x3, x13; \ + adc x17, x17, x14; \ + mul x14, x4, x5; \ + adds x21, x21, x14; \ + mul x14, x4, x6; \ + adcs x22, x22, x14; \ + mul x14, x4, x7; \ + adcs x23, x23, x14; \ + mul x14, x4, x8; \ + adcs x24, x24, x14; \ + mul x14, x4, x9; \ + adcs x1, x1, x14; \ + mul x14, x4, x10; \ + adcs x0, x0, x14; \ + mul x14, x4, x11; \ + adcs x15, x15, x14; \ + mul x14, x4, x12; \ + adcs x16, x16, x14; \ + mul x14, x4, x13; \ + adcs x17, x17, x14; \ + cset x19, hs; \ + umulh x14, x4, x5; \ + adds x22, x22, x14; \ + umulh x14, x4, x6; \ + adcs x23, x23, x14; \ + umulh x14, x4, x7; \ + adcs x24, 
x24, x14; \ + umulh x14, x4, x8; \ + adcs x1, x1, x14; \ + umulh x14, x4, x9; \ + adcs x0, x0, x14; \ + umulh x14, x4, x10; \ + adcs x15, x15, x14; \ + umulh x14, x4, x11; \ + adcs x16, x16, x14; \ + umulh x14, x4, x12; \ + adcs x17, x17, x14; \ + umulh x14, x4, x13; \ + adc x19, x19, x14; \ + stp x20, x21, [P0+32]; \ + ldp x3, x4, [P1+48]; \ + mul x14, x3, x5; \ + adds x22, x22, x14; \ + mul x14, x3, x6; \ + adcs x23, x23, x14; \ + mul x14, x3, x7; \ + adcs x24, x24, x14; \ + mul x14, x3, x8; \ + adcs x1, x1, x14; \ + mul x14, x3, x9; \ + adcs x0, x0, x14; \ + mul x14, x3, x10; \ + adcs x15, x15, x14; \ + mul x14, x3, x11; \ + adcs x16, x16, x14; \ + mul x14, x3, x12; \ + adcs x17, x17, x14; \ + mul x14, x3, x13; \ + adcs x19, x19, x14; \ + cset x20, hs; \ + umulh x14, x3, x5; \ + adds x23, x23, x14; \ + umulh x14, x3, x6; \ + adcs x24, x24, x14; \ + umulh x14, x3, x7; \ + adcs x1, x1, x14; \ + umulh x14, x3, x8; \ + adcs x0, x0, x14; \ + umulh x14, x3, x9; \ + adcs x15, x15, x14; \ + umulh x14, x3, x10; \ + adcs x16, x16, x14; \ + umulh x14, x3, x11; \ + adcs x17, x17, x14; \ + umulh x14, x3, x12; \ + adcs x19, x19, x14; \ + umulh x14, x3, x13; \ + adc x20, x20, x14; \ + mul x14, x4, x5; \ + adds x23, x23, x14; \ + mul x14, x4, x6; \ + adcs x24, x24, x14; \ + mul x14, x4, x7; \ + adcs x1, x1, x14; \ + mul x14, x4, x8; \ + adcs x0, x0, x14; \ + mul x14, x4, x9; \ + adcs x15, x15, x14; \ + mul x14, x4, x10; \ + adcs x16, x16, x14; \ + mul x14, x4, x11; \ + adcs x17, x17, x14; \ + mul x14, x4, x12; \ + adcs x19, x19, x14; \ + mul x14, x4, x13; \ + adcs x20, x20, x14; \ + cset x21, hs; \ + umulh x14, x4, x5; \ + adds x24, x24, x14; \ + umulh x14, x4, x6; \ + adcs x1, x1, x14; \ + umulh x14, x4, x7; \ + adcs x0, x0, x14; \ + umulh x14, x4, x8; \ + adcs x15, x15, x14; \ + umulh x14, x4, x9; \ + adcs x16, x16, x14; \ + umulh x14, x4, x10; \ + adcs x17, x17, x14; \ + umulh x14, x4, x11; \ + adcs x19, x19, x14; \ + umulh x14, x4, x12; \ + adcs x20, x20, x14; \ + umulh 
x14, x4, x13; \ + adc x21, x21, x14; \ + stp x22, x23, [P0+48]; \ + ldr x3, [P1+64]; \ + mul x14, x3, x5; \ + adds x24, x24, x14; \ + mul x14, x3, x6; \ + adcs x1, x1, x14; \ + mul x14, x3, x7; \ + adcs x0, x0, x14; \ + mul x14, x3, x8; \ + adcs x15, x15, x14; \ + mul x14, x3, x9; \ + adcs x16, x16, x14; \ + mul x14, x3, x10; \ + adcs x17, x17, x14; \ + mul x14, x3, x11; \ + adcs x19, x19, x14; \ + mul x14, x3, x12; \ + adcs x20, x20, x14; \ + mul x14, x3, x13; \ + adc x21, x21, x14; \ + umulh x14, x3, x5; \ + adds x1, x1, x14; \ + umulh x14, x3, x6; \ + adcs x0, x0, x14; \ + umulh x14, x3, x7; \ + adcs x15, x15, x14; \ + umulh x14, x3, x8; \ + adcs x16, x16, x14; \ + umulh x14, x3, x9; \ + adcs x17, x17, x14; \ + umulh x14, x3, x10; \ + adcs x19, x19, x14; \ + umulh x14, x3, x11; \ + adcs x20, x20, x14; \ + umulh x14, x3, x12; \ + adc x21, x21, x14; \ + cmp xzr, xzr; \ + ldp x5, x6, [P0]; \ + extr x14, x1, x24, #9; \ + adcs x5, x5, x14; \ + extr x14, x0, x1, #9; \ + adcs x6, x6, x14; \ + ldp x7, x8, [P0+16]; \ + extr x14, x15, x0, #9; \ + adcs x7, x7, x14; \ + extr x14, x16, x15, #9; \ + adcs x8, x8, x14; \ + ldp x9, x10, [P0+32]; \ + extr x14, x17, x16, #9; \ + adcs x9, x9, x14; \ + extr x14, x19, x17, #9; \ + adcs x10, x10, x14; \ + ldp x11, x12, [P0+48]; \ + extr x14, x20, x19, #9; \ + adcs x11, x11, x14; \ + extr x14, x21, x20, #9; \ + adcs x12, x12, x14; \ + orr x13, x24, #0xfffffffffffffe00; \ + lsr x14, x21, #9; \ + adcs x13, x13, x14; \ + sbcs x5, x5, xzr; \ + sbcs x6, x6, xzr; \ + sbcs x7, x7, xzr; \ + sbcs x8, x8, xzr; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + and x13, x13, #0x1ff; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32]; \ + stp x11, x12, [P0+48]; \ + str x13, [P0+64] + +// Corresponds exactly to bignum_sqr_p521_alt + +#define sqr_p521(P0,P1) \ + ldp x2, x3, [P1]; \ + mul x11, x2, x3; \ + umulh x12, x2, x3; \ + ldp x4, x5, [P1+16]; \ + mul x10, 
x2, x4; \ + umulh x13, x2, x4; \ + adds x12, x12, x10; \ + ldp x6, x7, [P1+32]; \ + mul x10, x2, x5; \ + umulh x14, x2, x5; \ + adcs x13, x13, x10; \ + ldp x8, x9, [P1+48]; \ + mul x10, x2, x6; \ + umulh x15, x2, x6; \ + adcs x14, x14, x10; \ + mul x10, x2, x7; \ + umulh x16, x2, x7; \ + adcs x15, x15, x10; \ + mul x10, x2, x8; \ + umulh x17, x2, x8; \ + adcs x16, x16, x10; \ + mul x10, x2, x9; \ + umulh x19, x2, x9; \ + adcs x17, x17, x10; \ + adc x19, x19, xzr; \ + mul x10, x3, x4; \ + adds x13, x13, x10; \ + mul x10, x3, x5; \ + adcs x14, x14, x10; \ + mul x10, x3, x6; \ + adcs x15, x15, x10; \ + mul x10, x3, x7; \ + adcs x16, x16, x10; \ + mul x10, x3, x8; \ + adcs x17, x17, x10; \ + mul x10, x3, x9; \ + adcs x19, x19, x10; \ + cset x20, hs; \ + umulh x10, x3, x4; \ + adds x14, x14, x10; \ + umulh x10, x3, x5; \ + adcs x15, x15, x10; \ + umulh x10, x3, x6; \ + adcs x16, x16, x10; \ + umulh x10, x3, x7; \ + adcs x17, x17, x10; \ + umulh x10, x3, x8; \ + adcs x19, x19, x10; \ + umulh x10, x3, x9; \ + adc x20, x20, x10; \ + mul x10, x6, x7; \ + umulh x21, x6, x7; \ + adds x20, x20, x10; \ + adc x21, x21, xzr; \ + mul x10, x4, x5; \ + adds x15, x15, x10; \ + mul x10, x4, x6; \ + adcs x16, x16, x10; \ + mul x10, x4, x7; \ + adcs x17, x17, x10; \ + mul x10, x4, x8; \ + adcs x19, x19, x10; \ + mul x10, x4, x9; \ + adcs x20, x20, x10; \ + mul x10, x6, x8; \ + adcs x21, x21, x10; \ + cset x22, hs; \ + umulh x10, x4, x5; \ + adds x16, x16, x10; \ + umulh x10, x4, x6; \ + adcs x17, x17, x10; \ + umulh x10, x4, x7; \ + adcs x19, x19, x10; \ + umulh x10, x4, x8; \ + adcs x20, x20, x10; \ + umulh x10, x4, x9; \ + adcs x21, x21, x10; \ + umulh x10, x6, x8; \ + adc x22, x22, x10; \ + mul x10, x7, x8; \ + umulh x23, x7, x8; \ + adds x22, x22, x10; \ + adc x23, x23, xzr; \ + mul x10, x5, x6; \ + adds x17, x17, x10; \ + mul x10, x5, x7; \ + adcs x19, x19, x10; \ + mul x10, x5, x8; \ + adcs x20, x20, x10; \ + mul x10, x5, x9; \ + adcs x21, x21, x10; \ + mul x10, x6, x9; \ + adcs 
x22, x22, x10; \ + mul x10, x7, x9; \ + adcs x23, x23, x10; \ + cset x24, hs; \ + umulh x10, x5, x6; \ + adds x19, x19, x10; \ + umulh x10, x5, x7; \ + adcs x20, x20, x10; \ + umulh x10, x5, x8; \ + adcs x21, x21, x10; \ + umulh x10, x5, x9; \ + adcs x22, x22, x10; \ + umulh x10, x6, x9; \ + adcs x23, x23, x10; \ + umulh x10, x7, x9; \ + adc x24, x24, x10; \ + mul x10, x8, x9; \ + umulh x25, x8, x9; \ + adds x24, x24, x10; \ + adc x25, x25, xzr; \ + adds x11, x11, x11; \ + adcs x12, x12, x12; \ + adcs x13, x13, x13; \ + adcs x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, x16, x16; \ + adcs x17, x17, x17; \ + adcs x19, x19, x19; \ + adcs x20, x20, x20; \ + adcs x21, x21, x21; \ + adcs x22, x22, x22; \ + adcs x23, x23, x23; \ + adcs x24, x24, x24; \ + adcs x25, x25, x25; \ + cset x0, hs; \ + umulh x10, x2, x2; \ + adds x11, x11, x10; \ + mul x10, x3, x3; \ + adcs x12, x12, x10; \ + umulh x10, x3, x3; \ + adcs x13, x13, x10; \ + mul x10, x4, x4; \ + adcs x14, x14, x10; \ + umulh x10, x4, x4; \ + adcs x15, x15, x10; \ + mul x10, x5, x5; \ + adcs x16, x16, x10; \ + umulh x10, x5, x5; \ + adcs x17, x17, x10; \ + mul x10, x6, x6; \ + adcs x19, x19, x10; \ + umulh x10, x6, x6; \ + adcs x20, x20, x10; \ + mul x10, x7, x7; \ + adcs x21, x21, x10; \ + umulh x10, x7, x7; \ + adcs x22, x22, x10; \ + mul x10, x8, x8; \ + adcs x23, x23, x10; \ + umulh x10, x8, x8; \ + adcs x24, x24, x10; \ + mul x10, x9, x9; \ + adcs x25, x25, x10; \ + umulh x10, x9, x9; \ + adc x0, x0, x10; \ + ldr x1, [P1+64]; \ + add x1, x1, x1; \ + mul x10, x1, x2; \ + adds x19, x19, x10; \ + umulh x10, x1, x2; \ + adcs x20, x20, x10; \ + mul x10, x1, x4; \ + adcs x21, x21, x10; \ + umulh x10, x1, x4; \ + adcs x22, x22, x10; \ + mul x10, x1, x6; \ + adcs x23, x23, x10; \ + umulh x10, x1, x6; \ + adcs x24, x24, x10; \ + mul x10, x1, x8; \ + adcs x25, x25, x10; \ + umulh x10, x1, x8; \ + adcs x0, x0, x10; \ + lsr x4, x1, #1; \ + mul x4, x4, x4; \ + adc x4, x4, xzr; \ + mul x10, x1, x3; \ + adds x20, x20, 
x10; \ + umulh x10, x1, x3; \ + adcs x21, x21, x10; \ + mul x10, x1, x5; \ + adcs x22, x22, x10; \ + umulh x10, x1, x5; \ + adcs x23, x23, x10; \ + mul x10, x1, x7; \ + adcs x24, x24, x10; \ + umulh x10, x1, x7; \ + adcs x25, x25, x10; \ + mul x10, x1, x9; \ + adcs x0, x0, x10; \ + umulh x10, x1, x9; \ + adc x4, x4, x10; \ + mul x2, x2, x2; \ + cmp xzr, xzr; \ + extr x10, x20, x19, #9; \ + adcs x2, x2, x10; \ + extr x10, x21, x20, #9; \ + adcs x11, x11, x10; \ + extr x10, x22, x21, #9; \ + adcs x12, x12, x10; \ + extr x10, x23, x22, #9; \ + adcs x13, x13, x10; \ + extr x10, x24, x23, #9; \ + adcs x14, x14, x10; \ + extr x10, x25, x24, #9; \ + adcs x15, x15, x10; \ + extr x10, x0, x25, #9; \ + adcs x16, x16, x10; \ + extr x10, x4, x0, #9; \ + adcs x17, x17, x10; \ + orr x19, x19, #0xfffffffffffffe00; \ + lsr x10, x4, #9; \ + adcs x19, x19, x10; \ + sbcs x2, x2, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbcs x14, x14, xzr; \ + sbcs x15, x15, xzr; \ + sbcs x16, x16, xzr; \ + sbcs x17, x17, xzr; \ + sbc x19, x19, xzr; \ + and x19, x19, #0x1ff; \ + stp x2, x11, [P0]; \ + stp x12, x13, [P0+16]; \ + stp x14, x15, [P0+32]; \ + stp x16, x17, [P0+48]; \ + str x19, [P0+64] + +// Corresponds exactly to bignum_add_p521 + +#define add_p521(P0,P1,P2) \ + cmp xzr, xzr; \ + ldp x5, x6, [P1]; \ + ldp x4, x3, [P2]; \ + adcs x5, x5, x4; \ + adcs x6, x6, x3; \ + ldp x7, x8, [P1+16]; \ + ldp x4, x3, [P2+16]; \ + adcs x7, x7, x4; \ + adcs x8, x8, x3; \ + ldp x9, x10, [P1+32]; \ + ldp x4, x3, [P2+32]; \ + adcs x9, x9, x4; \ + adcs x10, x10, x3; \ + ldp x11, x12, [P1+48]; \ + ldp x4, x3, [P2+48]; \ + adcs x11, x11, x4; \ + adcs x12, x12, x3; \ + ldr x13, [P1+64]; \ + ldr x4, [P2+64]; \ + adc x13, x13, x4; \ + subs x4, x13, #512; \ + csetm x4, hs; \ + sbcs x5, x5, xzr; \ + and x4, x4, #0x200; \ + sbcs x6, x6, xzr; \ + sbcs x7, x7, xzr; \ + sbcs x8, x8, xzr; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ 
+ sbc x13, x13, x4; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32]; \ + stp x11, x12, [P0+48]; \ + str x13, [P0+64] + +// Corresponds exactly to bignum_sub_p521 + +#define sub_p521(P0,P1,P2) \ + ldp x5, x6, [P1]; \ + ldp x4, x3, [P2]; \ + subs x5, x5, x4; \ + sbcs x6, x6, x3; \ + ldp x7, x8, [P1+16]; \ + ldp x4, x3, [P2+16]; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x3; \ + ldp x9, x10, [P1+32]; \ + ldp x4, x3, [P2+32]; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + ldp x11, x12, [P1+48]; \ + ldp x4, x3, [P2+48]; \ + sbcs x11, x11, x4; \ + sbcs x12, x12, x3; \ + ldr x13, [P1+64]; \ + ldr x4, [P2+64]; \ + sbcs x13, x13, x4; \ + sbcs x5, x5, xzr; \ + sbcs x6, x6, xzr; \ + sbcs x7, x7, xzr; \ + sbcs x8, x8, xzr; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + and x13, x13, #0x1ff; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32]; \ + stp x11, x12, [P0+48]; \ + str x13, [P0+64] + +// Weak multiplication not fully reducing + +#define weakmul_p521(P0,P1,P2) \ + ldp x3, x4, [P1]; \ + ldp x5, x6, [P2]; \ + mul x15, x3, x5; \ + umulh x16, x3, x5; \ + mul x14, x3, x6; \ + umulh x17, x3, x6; \ + adds x16, x16, x14; \ + ldp x7, x8, [P2+16]; \ + mul x14, x3, x7; \ + umulh x19, x3, x7; \ + adcs x17, x17, x14; \ + mul x14, x3, x8; \ + umulh x20, x3, x8; \ + adcs x19, x19, x14; \ + ldp x9, x10, [P2+32]; \ + mul x14, x3, x9; \ + umulh x21, x3, x9; \ + adcs x20, x20, x14; \ + mul x14, x3, x10; \ + umulh x22, x3, x10; \ + adcs x21, x21, x14; \ + ldp x11, x12, [P2+48]; \ + mul x14, x3, x11; \ + umulh x23, x3, x11; \ + adcs x22, x22, x14; \ + ldr x13, [P2+64]; \ + mul x14, x3, x12; \ + umulh x24, x3, x12; \ + adcs x23, x23, x14; \ + mul x14, x3, x13; \ + umulh x1, x3, x13; \ + adcs x24, x24, x14; \ + adc x1, x1, xzr; \ + mul x14, x4, x5; \ + adds x16, x16, x14; \ + mul x14, x4, x6; \ + adcs x17, x17, x14; \ + mul x14, x4, x7; \ + adcs x19, x19, x14; \ + mul x14, x4, 
x8; \ + adcs x20, x20, x14; \ + mul x14, x4, x9; \ + adcs x21, x21, x14; \ + mul x14, x4, x10; \ + adcs x22, x22, x14; \ + mul x14, x4, x11; \ + adcs x23, x23, x14; \ + mul x14, x4, x12; \ + adcs x24, x24, x14; \ + mul x14, x4, x13; \ + adcs x1, x1, x14; \ + cset x0, hs; \ + umulh x14, x4, x5; \ + adds x17, x17, x14; \ + umulh x14, x4, x6; \ + adcs x19, x19, x14; \ + umulh x14, x4, x7; \ + adcs x20, x20, x14; \ + umulh x14, x4, x8; \ + adcs x21, x21, x14; \ + umulh x14, x4, x9; \ + adcs x22, x22, x14; \ + umulh x14, x4, x10; \ + adcs x23, x23, x14; \ + umulh x14, x4, x11; \ + adcs x24, x24, x14; \ + umulh x14, x4, x12; \ + adcs x1, x1, x14; \ + umulh x14, x4, x13; \ + adc x0, x0, x14; \ + stp x15, x16, [P0]; \ + ldp x3, x4, [P1+16]; \ + mul x14, x3, x5; \ + adds x17, x17, x14; \ + mul x14, x3, x6; \ + adcs x19, x19, x14; \ + mul x14, x3, x7; \ + adcs x20, x20, x14; \ + mul x14, x3, x8; \ + adcs x21, x21, x14; \ + mul x14, x3, x9; \ + adcs x22, x22, x14; \ + mul x14, x3, x10; \ + adcs x23, x23, x14; \ + mul x14, x3, x11; \ + adcs x24, x24, x14; \ + mul x14, x3, x12; \ + adcs x1, x1, x14; \ + mul x14, x3, x13; \ + adcs x0, x0, x14; \ + cset x15, hs; \ + umulh x14, x3, x5; \ + adds x19, x19, x14; \ + umulh x14, x3, x6; \ + adcs x20, x20, x14; \ + umulh x14, x3, x7; \ + adcs x21, x21, x14; \ + umulh x14, x3, x8; \ + adcs x22, x22, x14; \ + umulh x14, x3, x9; \ + adcs x23, x23, x14; \ + umulh x14, x3, x10; \ + adcs x24, x24, x14; \ + umulh x14, x3, x11; \ + adcs x1, x1, x14; \ + umulh x14, x3, x12; \ + adcs x0, x0, x14; \ + umulh x14, x3, x13; \ + adc x15, x15, x14; \ + mul x14, x4, x5; \ + adds x19, x19, x14; \ + mul x14, x4, x6; \ + adcs x20, x20, x14; \ + mul x14, x4, x7; \ + adcs x21, x21, x14; \ + mul x14, x4, x8; \ + adcs x22, x22, x14; \ + mul x14, x4, x9; \ + adcs x23, x23, x14; \ + mul x14, x4, x10; \ + adcs x24, x24, x14; \ + mul x14, x4, x11; \ + adcs x1, x1, x14; \ + mul x14, x4, x12; \ + adcs x0, x0, x14; \ + mul x14, x4, x13; \ + adcs x15, x15, x14; \ + 
cset x16, hs; \ + umulh x14, x4, x5; \ + adds x20, x20, x14; \ + umulh x14, x4, x6; \ + adcs x21, x21, x14; \ + umulh x14, x4, x7; \ + adcs x22, x22, x14; \ + umulh x14, x4, x8; \ + adcs x23, x23, x14; \ + umulh x14, x4, x9; \ + adcs x24, x24, x14; \ + umulh x14, x4, x10; \ + adcs x1, x1, x14; \ + umulh x14, x4, x11; \ + adcs x0, x0, x14; \ + umulh x14, x4, x12; \ + adcs x15, x15, x14; \ + umulh x14, x4, x13; \ + adc x16, x16, x14; \ + stp x17, x19, [P0+16]; \ + ldp x3, x4, [P1+32]; \ + mul x14, x3, x5; \ + adds x20, x20, x14; \ + mul x14, x3, x6; \ + adcs x21, x21, x14; \ + mul x14, x3, x7; \ + adcs x22, x22, x14; \ + mul x14, x3, x8; \ + adcs x23, x23, x14; \ + mul x14, x3, x9; \ + adcs x24, x24, x14; \ + mul x14, x3, x10; \ + adcs x1, x1, x14; \ + mul x14, x3, x11; \ + adcs x0, x0, x14; \ + mul x14, x3, x12; \ + adcs x15, x15, x14; \ + mul x14, x3, x13; \ + adcs x16, x16, x14; \ + cset x17, hs; \ + umulh x14, x3, x5; \ + adds x21, x21, x14; \ + umulh x14, x3, x6; \ + adcs x22, x22, x14; \ + umulh x14, x3, x7; \ + adcs x23, x23, x14; \ + umulh x14, x3, x8; \ + adcs x24, x24, x14; \ + umulh x14, x3, x9; \ + adcs x1, x1, x14; \ + umulh x14, x3, x10; \ + adcs x0, x0, x14; \ + umulh x14, x3, x11; \ + adcs x15, x15, x14; \ + umulh x14, x3, x12; \ + adcs x16, x16, x14; \ + umulh x14, x3, x13; \ + adc x17, x17, x14; \ + mul x14, x4, x5; \ + adds x21, x21, x14; \ + mul x14, x4, x6; \ + adcs x22, x22, x14; \ + mul x14, x4, x7; \ + adcs x23, x23, x14; \ + mul x14, x4, x8; \ + adcs x24, x24, x14; \ + mul x14, x4, x9; \ + adcs x1, x1, x14; \ + mul x14, x4, x10; \ + adcs x0, x0, x14; \ + mul x14, x4, x11; \ + adcs x15, x15, x14; \ + mul x14, x4, x12; \ + adcs x16, x16, x14; \ + mul x14, x4, x13; \ + adcs x17, x17, x14; \ + cset x19, hs; \ + umulh x14, x4, x5; \ + adds x22, x22, x14; \ + umulh x14, x4, x6; \ + adcs x23, x23, x14; \ + umulh x14, x4, x7; \ + adcs x24, x24, x14; \ + umulh x14, x4, x8; \ + adcs x1, x1, x14; \ + umulh x14, x4, x9; \ + adcs x0, x0, x14; \ + umulh 
x14, x4, x10; \ + adcs x15, x15, x14; \ + umulh x14, x4, x11; \ + adcs x16, x16, x14; \ + umulh x14, x4, x12; \ + adcs x17, x17, x14; \ + umulh x14, x4, x13; \ + adc x19, x19, x14; \ + stp x20, x21, [P0+32]; \ + ldp x3, x4, [P1+48]; \ + mul x14, x3, x5; \ + adds x22, x22, x14; \ + mul x14, x3, x6; \ + adcs x23, x23, x14; \ + mul x14, x3, x7; \ + adcs x24, x24, x14; \ + mul x14, x3, x8; \ + adcs x1, x1, x14; \ + mul x14, x3, x9; \ + adcs x0, x0, x14; \ + mul x14, x3, x10; \ + adcs x15, x15, x14; \ + mul x14, x3, x11; \ + adcs x16, x16, x14; \ + mul x14, x3, x12; \ + adcs x17, x17, x14; \ + mul x14, x3, x13; \ + adcs x19, x19, x14; \ + cset x20, hs; \ + umulh x14, x3, x5; \ + adds x23, x23, x14; \ + umulh x14, x3, x6; \ + adcs x24, x24, x14; \ + umulh x14, x3, x7; \ + adcs x1, x1, x14; \ + umulh x14, x3, x8; \ + adcs x0, x0, x14; \ + umulh x14, x3, x9; \ + adcs x15, x15, x14; \ + umulh x14, x3, x10; \ + adcs x16, x16, x14; \ + umulh x14, x3, x11; \ + adcs x17, x17, x14; \ + umulh x14, x3, x12; \ + adcs x19, x19, x14; \ + umulh x14, x3, x13; \ + adc x20, x20, x14; \ + mul x14, x4, x5; \ + adds x23, x23, x14; \ + mul x14, x4, x6; \ + adcs x24, x24, x14; \ + mul x14, x4, x7; \ + adcs x1, x1, x14; \ + mul x14, x4, x8; \ + adcs x0, x0, x14; \ + mul x14, x4, x9; \ + adcs x15, x15, x14; \ + mul x14, x4, x10; \ + adcs x16, x16, x14; \ + mul x14, x4, x11; \ + adcs x17, x17, x14; \ + mul x14, x4, x12; \ + adcs x19, x19, x14; \ + mul x14, x4, x13; \ + adcs x20, x20, x14; \ + cset x21, hs; \ + umulh x14, x4, x5; \ + adds x24, x24, x14; \ + umulh x14, x4, x6; \ + adcs x1, x1, x14; \ + umulh x14, x4, x7; \ + adcs x0, x0, x14; \ + umulh x14, x4, x8; \ + adcs x15, x15, x14; \ + umulh x14, x4, x9; \ + adcs x16, x16, x14; \ + umulh x14, x4, x10; \ + adcs x17, x17, x14; \ + umulh x14, x4, x11; \ + adcs x19, x19, x14; \ + umulh x14, x4, x12; \ + adcs x20, x20, x14; \ + umulh x14, x4, x13; \ + adc x21, x21, x14; \ + stp x22, x23, [P0+48]; \ + ldr x3, [P1+64]; \ + mul x14, x3, x5; \ + 
adds x24, x24, x14; \ + mul x14, x3, x6; \ + adcs x1, x1, x14; \ + mul x14, x3, x7; \ + adcs x0, x0, x14; \ + mul x14, x3, x8; \ + adcs x15, x15, x14; \ + mul x14, x3, x9; \ + adcs x16, x16, x14; \ + mul x14, x3, x10; \ + adcs x17, x17, x14; \ + mul x14, x3, x11; \ + adcs x19, x19, x14; \ + mul x14, x3, x12; \ + adcs x20, x20, x14; \ + mul x14, x3, x13; \ + adc x21, x21, x14; \ + umulh x14, x3, x5; \ + adds x1, x1, x14; \ + umulh x14, x3, x6; \ + adcs x0, x0, x14; \ + umulh x14, x3, x7; \ + adcs x15, x15, x14; \ + umulh x14, x3, x8; \ + adcs x16, x16, x14; \ + umulh x14, x3, x9; \ + adcs x17, x17, x14; \ + umulh x14, x3, x10; \ + adcs x19, x19, x14; \ + umulh x14, x3, x11; \ + adcs x20, x20, x14; \ + umulh x14, x3, x12; \ + adc x21, x21, x14; \ + ldp x5, x6, [P0]; \ + extr x14, x1, x24, #9; \ + adds x5, x5, x14; \ + extr x14, x0, x1, #9; \ + adcs x6, x6, x14; \ + ldp x7, x8, [P0+16]; \ + extr x14, x15, x0, #9; \ + adcs x7, x7, x14; \ + extr x14, x16, x15, #9; \ + adcs x8, x8, x14; \ + ldp x9, x10, [P0+32]; \ + extr x14, x17, x16, #9; \ + adcs x9, x9, x14; \ + extr x14, x19, x17, #9; \ + adcs x10, x10, x14; \ + ldp x11, x12, [P0+48]; \ + extr x14, x20, x19, #9; \ + adcs x11, x11, x14; \ + extr x14, x21, x20, #9; \ + adcs x12, x12, x14; \ + and x13, x24, #0x1ff; \ + lsr x14, x21, #9; \ + adc x13, x13, x14; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32]; \ + stp x11, x12, [P0+48]; \ + str x13, [P0+64] + +// P0 = C * P1 - D * P2 == C * P1 + D * (p_521 - P2) + +#define cmsub_p521(P0,C,P1,D,P2) \ + ldp x6, x7, [P1]; \ + mov x1, #(C); \ + mul x3, x1, x6; \ + mul x4, x1, x7; \ + umulh x6, x1, x6; \ + adds x4, x4, x6; \ + umulh x7, x1, x7; \ + ldp x8, x9, [P1+16]; \ + mul x5, x1, x8; \ + mul x6, x1, x9; \ + umulh x8, x1, x8; \ + adcs x5, x5, x7; \ + umulh x9, x1, x9; \ + adcs x6, x6, x8; \ + ldp x10, x11, [P1+32]; \ + mul x7, x1, x10; \ + mul x8, x1, x11; \ + umulh x10, x1, x10; \ + adcs x7, x7, x9; \ + umulh x11, x1, x11; \ + adcs x8, x8, x10; \ + 
ldp x12, x13, [P1+48]; \ + mul x9, x1, x12; \ + mul x10, x1, x13; \ + umulh x12, x1, x12; \ + adcs x9, x9, x11; \ + umulh x13, x1, x13; \ + adcs x10, x10, x12; \ + ldr x14, [P1+64]; \ + mul x11, x1, x14; \ + adc x11, x11, x13; \ + mov x1, #(D); \ + ldp x20, x21, [P2]; \ + mvn x20, x20; \ + mul x0, x1, x20; \ + umulh x20, x1, x20; \ + adds x3, x3, x0; \ + mvn x21, x21; \ + mul x0, x1, x21; \ + umulh x21, x1, x21; \ + adcs x4, x4, x0; \ + ldp x22, x23, [P2+16]; \ + mvn x22, x22; \ + mul x0, x1, x22; \ + umulh x22, x1, x22; \ + adcs x5, x5, x0; \ + mvn x23, x23; \ + mul x0, x1, x23; \ + umulh x23, x1, x23; \ + adcs x6, x6, x0; \ + ldp x17, x19, [P2+32]; \ + mvn x17, x17; \ + mul x0, x1, x17; \ + umulh x17, x1, x17; \ + adcs x7, x7, x0; \ + mvn x19, x19; \ + mul x0, x1, x19; \ + umulh x19, x1, x19; \ + adcs x8, x8, x0; \ + ldp x2, x16, [P2+48]; \ + mvn x2, x2; \ + mul x0, x1, x2; \ + umulh x2, x1, x2; \ + adcs x9, x9, x0; \ + mvn x16, x16; \ + mul x0, x1, x16; \ + umulh x16, x1, x16; \ + adcs x10, x10, x0; \ + ldr x0, [P2+64]; \ + eor x0, x0, #0x1ff; \ + mul x0, x1, x0; \ + adc x11, x11, x0; \ + adds x4, x4, x20; \ + adcs x5, x5, x21; \ + and x15, x4, x5; \ + adcs x6, x6, x22; \ + and x15, x15, x6; \ + adcs x7, x7, x23; \ + and x15, x15, x7; \ + adcs x8, x8, x17; \ + and x15, x15, x8; \ + adcs x9, x9, x19; \ + and x15, x15, x9; \ + adcs x10, x10, x2; \ + and x15, x15, x10; \ + adc x11, x11, x16; \ + lsr x12, x11, #9; \ + orr x11, x11, #0xfffffffffffffe00; \ + cmp xzr, xzr; \ + adcs xzr, x3, x12; \ + adcs xzr, x15, xzr; \ + adcs xzr, x11, xzr; \ + adcs x3, x3, x12; \ + adcs x4, x4, xzr; \ + adcs x5, x5, xzr; \ + adcs x6, x6, xzr; \ + adcs x7, x7, xzr; \ + adcs x8, x8, xzr; \ + adcs x9, x9, xzr; \ + adcs x10, x10, xzr; \ + adc x11, x11, xzr; \ + and x11, x11, #0x1ff; \ + stp x3, x4, [P0]; \ + stp x5, x6, [P0+16]; \ + stp x7, x8, [P0+32]; \ + stp x9, x10, [P0+48]; \ + str x11, [P0+64] + +// P0 = 3 * P1 - 8 * P2 == 3 * P1 + 8 * (p_521 - P2) + +#define 
cmsub38_p521(P0,P1,P2) \ + ldp x6, x7, [P1]; \ + lsl x3, x6, #1; \ + adds x3, x3, x6; \ + extr x4, x7, x6, #63; \ + adcs x4, x4, x7; \ + ldp x8, x9, [P1+16]; \ + extr x5, x8, x7, #63; \ + adcs x5, x5, x8; \ + extr x6, x9, x8, #63; \ + adcs x6, x6, x9; \ + ldp x10, x11, [P1+32]; \ + extr x7, x10, x9, #63; \ + adcs x7, x7, x10; \ + extr x8, x11, x10, #63; \ + adcs x8, x8, x11; \ + ldp x12, x13, [P1+48]; \ + extr x9, x12, x11, #63; \ + adcs x9, x9, x12; \ + extr x10, x13, x12, #63; \ + adcs x10, x10, x13; \ + ldr x14, [P1+64]; \ + extr x11, x14, x13, #63; \ + adc x11, x11, x14; \ + ldp x20, x21, [P2]; \ + mvn x20, x20; \ + lsl x0, x20, #3; \ + adds x3, x3, x0; \ + mvn x21, x21; \ + extr x0, x21, x20, #61; \ + adcs x4, x4, x0; \ + ldp x22, x23, [P2+16]; \ + mvn x22, x22; \ + extr x0, x22, x21, #61; \ + adcs x5, x5, x0; \ + and x15, x4, x5; \ + mvn x23, x23; \ + extr x0, x23, x22, #61; \ + adcs x6, x6, x0; \ + and x15, x15, x6; \ + ldp x20, x21, [P2+32]; \ + mvn x20, x20; \ + extr x0, x20, x23, #61; \ + adcs x7, x7, x0; \ + and x15, x15, x7; \ + mvn x21, x21; \ + extr x0, x21, x20, #61; \ + adcs x8, x8, x0; \ + and x15, x15, x8; \ + ldp x22, x23, [P2+48]; \ + mvn x22, x22; \ + extr x0, x22, x21, #61; \ + adcs x9, x9, x0; \ + and x15, x15, x9; \ + mvn x23, x23; \ + extr x0, x23, x22, #61; \ + adcs x10, x10, x0; \ + and x15, x15, x10; \ + ldr x0, [P2+64]; \ + eor x0, x0, #0x1ff; \ + extr x0, x0, x23, #61; \ + adc x11, x11, x0; \ + lsr x12, x11, #9; \ + orr x11, x11, #0xfffffffffffffe00; \ + cmp xzr, xzr; \ + adcs xzr, x3, x12; \ + adcs xzr, x15, xzr; \ + adcs xzr, x11, xzr; \ + adcs x3, x3, x12; \ + adcs x4, x4, xzr; \ + adcs x5, x5, xzr; \ + adcs x6, x6, xzr; \ + adcs x7, x7, xzr; \ + adcs x8, x8, xzr; \ + adcs x9, x9, xzr; \ + adcs x10, x10, xzr; \ + adc x11, x11, xzr; \ + and x11, x11, #0x1ff; \ + stp x3, x4, [P0]; \ + stp x5, x6, [P0+16]; \ + stp x7, x8, [P0+32]; \ + stp x9, x10, [P0+48]; \ + str x11, [P0+64] + +// P0 = 4 * P1 - P2 = 4 * P1 + (p_521 - P2) + +#define 
cmsub41_p521(P0,P1,P2) \ + ldp x6, x7, [P1]; \ + lsl x3, x6, #2; \ + extr x4, x7, x6, #62; \ + ldp x8, x9, [P1+16]; \ + extr x5, x8, x7, #62; \ + extr x6, x9, x8, #62; \ + ldp x10, x11, [P1+32]; \ + extr x7, x10, x9, #62; \ + extr x8, x11, x10, #62; \ + ldp x12, x13, [P1+48]; \ + extr x9, x12, x11, #62; \ + extr x10, x13, x12, #62; \ + ldr x14, [P1+64]; \ + extr x11, x14, x13, #62; \ + ldp x0, x1, [P2]; \ + mvn x0, x0; \ + adds x3, x3, x0; \ + sbcs x4, x4, x1; \ + ldp x0, x1, [P2+16]; \ + sbcs x5, x5, x0; \ + and x15, x4, x5; \ + sbcs x6, x6, x1; \ + and x15, x15, x6; \ + ldp x0, x1, [P2+32]; \ + sbcs x7, x7, x0; \ + and x15, x15, x7; \ + sbcs x8, x8, x1; \ + and x15, x15, x8; \ + ldp x0, x1, [P2+48]; \ + sbcs x9, x9, x0; \ + and x15, x15, x9; \ + sbcs x10, x10, x1; \ + and x15, x15, x10; \ + ldr x0, [P2+64]; \ + eor x0, x0, #0x1ff; \ + adc x11, x11, x0; \ + lsr x12, x11, #9; \ + orr x11, x11, #0xfffffffffffffe00; \ + cmp xzr, xzr; \ + adcs xzr, x3, x12; \ + adcs xzr, x15, xzr; \ + adcs xzr, x11, xzr; \ + adcs x3, x3, x12; \ + adcs x4, x4, xzr; \ + adcs x5, x5, xzr; \ + adcs x6, x6, xzr; \ + adcs x7, x7, xzr; \ + adcs x8, x8, xzr; \ + adcs x9, x9, xzr; \ + adcs x10, x10, xzr; \ + adc x11, x11, xzr; \ + and x11, x11, #0x1ff; \ + stp x3, x4, [P0]; \ + stp x5, x6, [P0+16]; \ + stp x7, x8, [P0+32]; \ + stp x9, x10, [P0+48]; \ + str x11, [P0+64] + +S2N_BN_SYMBOL(p521_jdouble_alt): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + stp x27, x28, [sp, #-16]! 
+ sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + sqr_p521(z2,z_1) + sqr_p521(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + add_p521(t1,x_1,z2) + sub_p521(t2,x_1,z2) + mul_p521(x2p,t1,t2) + +// t1 = y + z +// x4p = x2p^2 +// xy2 = x * y^2 + + add_p521(t1,y_1,z_1) + sqr_p521(x4p,x2p) + weakmul_p521(xy2,x_1,y2) + +// t2 = (y + z)^2 + + sqr_p521(t2,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_p521(d,12,xy2,9,x4p) + sub_p521(t1,t2,z2) + +// y4 = y^4 + + sqr_p521(y4,y2) + +// z_3' = 2 * y * z +// dx2 = d * x2p + + sub_p521(z_3,t1,y2) + weakmul_p521(dx2,d,x2p) + +// x' = 4 * xy2 - d + + cmsub41_p521(x_3,xy2,d) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_p521(y_3,dx2,y4) + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x27, x28, [sp], 16 + ldp x25, x26, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/p521/p521_jmixadd.S b/third_party/s2n-bignum/arm/p521/p521_jmixadd.S index c9b62a9aa1..b04e39327f 100644 --- a/third_party/s2n-bignum/arm/p521/p521_jmixadd.S +++ b/third_party/s2n-bignum/arm/p521/p521_jmixadd.S @@ -54,6 +54,7 @@ #define zp2 sp, #(NUMSIZE*0) #define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) #define yd sp, #(NUMSIZE*1) #define y2a sp, #(NUMSIZE*1) @@ -66,659 +67,35 @@ #define t2 sp, #(NUMSIZE*4) #define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) #define xd sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) -#define NSPACE (NUMSIZE*6) +#define tmp sp, #(NUMSIZE*6) -// Corresponds exactly to bignum_mul_p521_alt +#define NSPACE (NUMSIZE*7+8) + +// For the three field operations, we use subroutines not inlining. 
+// Call local code very close to bignum_mul_p521 and bignum_sqr_p521 +// and bignum_sub_p521 #define mul_p521(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - mul x15, x3, x5; \ - umulh x16, x3, x5; \ - mul x14, x3, x6; \ - umulh x17, x3, x6; \ - adds x16, x16, x14; \ - ldp x7, x8, [P2+16]; \ - mul x14, x3, x7; \ - umulh x19, x3, x7; \ - adcs x17, x17, x14; \ - mul x14, x3, x8; \ - umulh x20, x3, x8; \ - adcs x19, x19, x14; \ - ldp x9, x10, [P2+32]; \ - mul x14, x3, x9; \ - umulh x21, x3, x9; \ - adcs x20, x20, x14; \ - mul x14, x3, x10; \ - umulh x22, x3, x10; \ - adcs x21, x21, x14; \ - ldp x11, x12, [P2+48]; \ - mul x14, x3, x11; \ - umulh x23, x3, x11; \ - adcs x22, x22, x14; \ - ldr x13, [P2+64]; \ - mul x14, x3, x12; \ - umulh x24, x3, x12; \ - adcs x23, x23, x14; \ - mul x14, x3, x13; \ - umulh x1, x3, x13; \ - adcs x24, x24, x14; \ - adc x1, x1, xzr; \ - mul x14, x4, x5; \ - adds x16, x16, x14; \ - mul x14, x4, x6; \ - adcs x17, x17, x14; \ - mul x14, x4, x7; \ - adcs x19, x19, x14; \ - mul x14, x4, x8; \ - adcs x20, x20, x14; \ - mul x14, x4, x9; \ - adcs x21, x21, x14; \ - mul x14, x4, x10; \ - adcs x22, x22, x14; \ - mul x14, x4, x11; \ - adcs x23, x23, x14; \ - mul x14, x4, x12; \ - adcs x24, x24, x14; \ - mul x14, x4, x13; \ - adcs x1, x1, x14; \ - cset x0, hs; \ - umulh x14, x4, x5; \ - adds x17, x17, x14; \ - umulh x14, x4, x6; \ - adcs x19, x19, x14; \ - umulh x14, x4, x7; \ - adcs x20, x20, x14; \ - umulh x14, x4, x8; \ - adcs x21, x21, x14; \ - umulh x14, x4, x9; \ - adcs x22, x22, x14; \ - umulh x14, x4, x10; \ - adcs x23, x23, x14; \ - umulh x14, x4, x11; \ - adcs x24, x24, x14; \ - umulh x14, x4, x12; \ - adcs x1, x1, x14; \ - umulh x14, x4, x13; \ - adc x0, x0, x14; \ - stp x15, x16, [P0]; \ - ldp x3, x4, [P1+16]; \ - mul x14, x3, x5; \ - adds x17, x17, x14; \ - mul x14, x3, x6; \ - adcs x19, x19, x14; \ - mul x14, x3, x7; \ - adcs x20, x20, x14; \ - mul x14, x3, x8; \ - adcs x21, x21, x14; \ - mul x14, x3, x9; \ - adcs x22, x22, 
x14; \ - mul x14, x3, x10; \ - adcs x23, x23, x14; \ - mul x14, x3, x11; \ - adcs x24, x24, x14; \ - mul x14, x3, x12; \ - adcs x1, x1, x14; \ - mul x14, x3, x13; \ - adcs x0, x0, x14; \ - cset x15, hs; \ - umulh x14, x3, x5; \ - adds x19, x19, x14; \ - umulh x14, x3, x6; \ - adcs x20, x20, x14; \ - umulh x14, x3, x7; \ - adcs x21, x21, x14; \ - umulh x14, x3, x8; \ - adcs x22, x22, x14; \ - umulh x14, x3, x9; \ - adcs x23, x23, x14; \ - umulh x14, x3, x10; \ - adcs x24, x24, x14; \ - umulh x14, x3, x11; \ - adcs x1, x1, x14; \ - umulh x14, x3, x12; \ - adcs x0, x0, x14; \ - umulh x14, x3, x13; \ - adc x15, x15, x14; \ - mul x14, x4, x5; \ - adds x19, x19, x14; \ - mul x14, x4, x6; \ - adcs x20, x20, x14; \ - mul x14, x4, x7; \ - adcs x21, x21, x14; \ - mul x14, x4, x8; \ - adcs x22, x22, x14; \ - mul x14, x4, x9; \ - adcs x23, x23, x14; \ - mul x14, x4, x10; \ - adcs x24, x24, x14; \ - mul x14, x4, x11; \ - adcs x1, x1, x14; \ - mul x14, x4, x12; \ - adcs x0, x0, x14; \ - mul x14, x4, x13; \ - adcs x15, x15, x14; \ - cset x16, hs; \ - umulh x14, x4, x5; \ - adds x20, x20, x14; \ - umulh x14, x4, x6; \ - adcs x21, x21, x14; \ - umulh x14, x4, x7; \ - adcs x22, x22, x14; \ - umulh x14, x4, x8; \ - adcs x23, x23, x14; \ - umulh x14, x4, x9; \ - adcs x24, x24, x14; \ - umulh x14, x4, x10; \ - adcs x1, x1, x14; \ - umulh x14, x4, x11; \ - adcs x0, x0, x14; \ - umulh x14, x4, x12; \ - adcs x15, x15, x14; \ - umulh x14, x4, x13; \ - adc x16, x16, x14; \ - stp x17, x19, [P0+16]; \ - ldp x3, x4, [P1+32]; \ - mul x14, x3, x5; \ - adds x20, x20, x14; \ - mul x14, x3, x6; \ - adcs x21, x21, x14; \ - mul x14, x3, x7; \ - adcs x22, x22, x14; \ - mul x14, x3, x8; \ - adcs x23, x23, x14; \ - mul x14, x3, x9; \ - adcs x24, x24, x14; \ - mul x14, x3, x10; \ - adcs x1, x1, x14; \ - mul x14, x3, x11; \ - adcs x0, x0, x14; \ - mul x14, x3, x12; \ - adcs x15, x15, x14; \ - mul x14, x3, x13; \ - adcs x16, x16, x14; \ - cset x17, hs; \ - umulh x14, x3, x5; \ - adds x21, x21, x14; \ - 
umulh x14, x3, x6; \ - adcs x22, x22, x14; \ - umulh x14, x3, x7; \ - adcs x23, x23, x14; \ - umulh x14, x3, x8; \ - adcs x24, x24, x14; \ - umulh x14, x3, x9; \ - adcs x1, x1, x14; \ - umulh x14, x3, x10; \ - adcs x0, x0, x14; \ - umulh x14, x3, x11; \ - adcs x15, x15, x14; \ - umulh x14, x3, x12; \ - adcs x16, x16, x14; \ - umulh x14, x3, x13; \ - adc x17, x17, x14; \ - mul x14, x4, x5; \ - adds x21, x21, x14; \ - mul x14, x4, x6; \ - adcs x22, x22, x14; \ - mul x14, x4, x7; \ - adcs x23, x23, x14; \ - mul x14, x4, x8; \ - adcs x24, x24, x14; \ - mul x14, x4, x9; \ - adcs x1, x1, x14; \ - mul x14, x4, x10; \ - adcs x0, x0, x14; \ - mul x14, x4, x11; \ - adcs x15, x15, x14; \ - mul x14, x4, x12; \ - adcs x16, x16, x14; \ - mul x14, x4, x13; \ - adcs x17, x17, x14; \ - cset x19, hs; \ - umulh x14, x4, x5; \ - adds x22, x22, x14; \ - umulh x14, x4, x6; \ - adcs x23, x23, x14; \ - umulh x14, x4, x7; \ - adcs x24, x24, x14; \ - umulh x14, x4, x8; \ - adcs x1, x1, x14; \ - umulh x14, x4, x9; \ - adcs x0, x0, x14; \ - umulh x14, x4, x10; \ - adcs x15, x15, x14; \ - umulh x14, x4, x11; \ - adcs x16, x16, x14; \ - umulh x14, x4, x12; \ - adcs x17, x17, x14; \ - umulh x14, x4, x13; \ - adc x19, x19, x14; \ - stp x20, x21, [P0+32]; \ - ldp x3, x4, [P1+48]; \ - mul x14, x3, x5; \ - adds x22, x22, x14; \ - mul x14, x3, x6; \ - adcs x23, x23, x14; \ - mul x14, x3, x7; \ - adcs x24, x24, x14; \ - mul x14, x3, x8; \ - adcs x1, x1, x14; \ - mul x14, x3, x9; \ - adcs x0, x0, x14; \ - mul x14, x3, x10; \ - adcs x15, x15, x14; \ - mul x14, x3, x11; \ - adcs x16, x16, x14; \ - mul x14, x3, x12; \ - adcs x17, x17, x14; \ - mul x14, x3, x13; \ - adcs x19, x19, x14; \ - cset x20, hs; \ - umulh x14, x3, x5; \ - adds x23, x23, x14; \ - umulh x14, x3, x6; \ - adcs x24, x24, x14; \ - umulh x14, x3, x7; \ - adcs x1, x1, x14; \ - umulh x14, x3, x8; \ - adcs x0, x0, x14; \ - umulh x14, x3, x9; \ - adcs x15, x15, x14; \ - umulh x14, x3, x10; \ - adcs x16, x16, x14; \ - umulh x14, x3, x11; \ - 
adcs x17, x17, x14; \ - umulh x14, x3, x12; \ - adcs x19, x19, x14; \ - umulh x14, x3, x13; \ - adc x20, x20, x14; \ - mul x14, x4, x5; \ - adds x23, x23, x14; \ - mul x14, x4, x6; \ - adcs x24, x24, x14; \ - mul x14, x4, x7; \ - adcs x1, x1, x14; \ - mul x14, x4, x8; \ - adcs x0, x0, x14; \ - mul x14, x4, x9; \ - adcs x15, x15, x14; \ - mul x14, x4, x10; \ - adcs x16, x16, x14; \ - mul x14, x4, x11; \ - adcs x17, x17, x14; \ - mul x14, x4, x12; \ - adcs x19, x19, x14; \ - mul x14, x4, x13; \ - adcs x20, x20, x14; \ - cset x21, hs; \ - umulh x14, x4, x5; \ - adds x24, x24, x14; \ - umulh x14, x4, x6; \ - adcs x1, x1, x14; \ - umulh x14, x4, x7; \ - adcs x0, x0, x14; \ - umulh x14, x4, x8; \ - adcs x15, x15, x14; \ - umulh x14, x4, x9; \ - adcs x16, x16, x14; \ - umulh x14, x4, x10; \ - adcs x17, x17, x14; \ - umulh x14, x4, x11; \ - adcs x19, x19, x14; \ - umulh x14, x4, x12; \ - adcs x20, x20, x14; \ - umulh x14, x4, x13; \ - adc x21, x21, x14; \ - stp x22, x23, [P0+48]; \ - ldr x3, [P1+64]; \ - mul x14, x3, x5; \ - adds x24, x24, x14; \ - mul x14, x3, x6; \ - adcs x1, x1, x14; \ - mul x14, x3, x7; \ - adcs x0, x0, x14; \ - mul x14, x3, x8; \ - adcs x15, x15, x14; \ - mul x14, x3, x9; \ - adcs x16, x16, x14; \ - mul x14, x3, x10; \ - adcs x17, x17, x14; \ - mul x14, x3, x11; \ - adcs x19, x19, x14; \ - mul x14, x3, x12; \ - adcs x20, x20, x14; \ - mul x14, x3, x13; \ - adc x21, x21, x14; \ - umulh x14, x3, x5; \ - adds x1, x1, x14; \ - umulh x14, x3, x6; \ - adcs x0, x0, x14; \ - umulh x14, x3, x7; \ - adcs x15, x15, x14; \ - umulh x14, x3, x8; \ - adcs x16, x16, x14; \ - umulh x14, x3, x9; \ - adcs x17, x17, x14; \ - umulh x14, x3, x10; \ - adcs x19, x19, x14; \ - umulh x14, x3, x11; \ - adcs x20, x20, x14; \ - umulh x14, x3, x12; \ - adc x21, x21, x14; \ - cmp xzr, xzr; \ - ldp x5, x6, [P0]; \ - extr x14, x1, x24, #9; \ - adcs x5, x5, x14; \ - extr x14, x0, x1, #9; \ - adcs x6, x6, x14; \ - ldp x7, x8, [P0+16]; \ - extr x14, x15, x0, #9; \ - adcs x7, x7, x14; \ 
- extr x14, x16, x15, #9; \ - adcs x8, x8, x14; \ - ldp x9, x10, [P0+32]; \ - extr x14, x17, x16, #9; \ - adcs x9, x9, x14; \ - extr x14, x19, x17, #9; \ - adcs x10, x10, x14; \ - ldp x11, x12, [P0+48]; \ - extr x14, x20, x19, #9; \ - adcs x11, x11, x14; \ - extr x14, x21, x20, #9; \ - adcs x12, x12, x14; \ - orr x13, x24, #0xfffffffffffffe00; \ - lsr x14, x21, #9; \ - adcs x13, x13, x14; \ - sbcs x5, x5, xzr; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - and x13, x13, #0x1ff; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ - str x13, [P0+64] - -// Corresponds exactly to bignum_sqr_p521_alt + add x0, P0; \ + add x1, P1; \ + add x2, P2; \ + bl local_mul_p521 #define sqr_p521(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x11, x2, x3; \ - umulh x12, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x10, x2, x4; \ - umulh x13, x2, x4; \ - adds x12, x12, x10; \ - ldp x6, x7, [P1+32]; \ - mul x10, x2, x5; \ - umulh x14, x2, x5; \ - adcs x13, x13, x10; \ - ldp x8, x9, [P1+48]; \ - mul x10, x2, x6; \ - umulh x15, x2, x6; \ - adcs x14, x14, x10; \ - mul x10, x2, x7; \ - umulh x16, x2, x7; \ - adcs x15, x15, x10; \ - mul x10, x2, x8; \ - umulh x17, x2, x8; \ - adcs x16, x16, x10; \ - mul x10, x2, x9; \ - umulh x19, x2, x9; \ - adcs x17, x17, x10; \ - adc x19, x19, xzr; \ - mul x10, x3, x4; \ - adds x13, x13, x10; \ - mul x10, x3, x5; \ - adcs x14, x14, x10; \ - mul x10, x3, x6; \ - adcs x15, x15, x10; \ - mul x10, x3, x7; \ - adcs x16, x16, x10; \ - mul x10, x3, x8; \ - adcs x17, x17, x10; \ - mul x10, x3, x9; \ - adcs x19, x19, x10; \ - cset x20, hs; \ - umulh x10, x3, x4; \ - adds x14, x14, x10; \ - umulh x10, x3, x5; \ - adcs x15, x15, x10; \ - umulh x10, x3, x6; \ - adcs x16, x16, x10; \ - umulh x10, x3, x7; \ - adcs x17, x17, x10; \ - umulh x10, x3, x8; \ - adcs x19, x19, x10; \ - umulh x10, x3, 
x9; \ - adc x20, x20, x10; \ - mul x10, x6, x7; \ - umulh x21, x6, x7; \ - adds x20, x20, x10; \ - adc x21, x21, xzr; \ - mul x10, x4, x5; \ - adds x15, x15, x10; \ - mul x10, x4, x6; \ - adcs x16, x16, x10; \ - mul x10, x4, x7; \ - adcs x17, x17, x10; \ - mul x10, x4, x8; \ - adcs x19, x19, x10; \ - mul x10, x4, x9; \ - adcs x20, x20, x10; \ - mul x10, x6, x8; \ - adcs x21, x21, x10; \ - cset x22, hs; \ - umulh x10, x4, x5; \ - adds x16, x16, x10; \ - umulh x10, x4, x6; \ - adcs x17, x17, x10; \ - umulh x10, x4, x7; \ - adcs x19, x19, x10; \ - umulh x10, x4, x8; \ - adcs x20, x20, x10; \ - umulh x10, x4, x9; \ - adcs x21, x21, x10; \ - umulh x10, x6, x8; \ - adc x22, x22, x10; \ - mul x10, x7, x8; \ - umulh x23, x7, x8; \ - adds x22, x22, x10; \ - adc x23, x23, xzr; \ - mul x10, x5, x6; \ - adds x17, x17, x10; \ - mul x10, x5, x7; \ - adcs x19, x19, x10; \ - mul x10, x5, x8; \ - adcs x20, x20, x10; \ - mul x10, x5, x9; \ - adcs x21, x21, x10; \ - mul x10, x6, x9; \ - adcs x22, x22, x10; \ - mul x10, x7, x9; \ - adcs x23, x23, x10; \ - cset x24, hs; \ - umulh x10, x5, x6; \ - adds x19, x19, x10; \ - umulh x10, x5, x7; \ - adcs x20, x20, x10; \ - umulh x10, x5, x8; \ - adcs x21, x21, x10; \ - umulh x10, x5, x9; \ - adcs x22, x22, x10; \ - umulh x10, x6, x9; \ - adcs x23, x23, x10; \ - umulh x10, x7, x9; \ - adc x24, x24, x10; \ - mul x10, x8, x9; \ - umulh x25, x8, x9; \ - adds x24, x24, x10; \ - adc x25, x25, xzr; \ - adds x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - adcs x15, x15, x15; \ - adcs x16, x16, x16; \ - adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - adcs x20, x20, x20; \ - adcs x21, x21, x21; \ - adcs x22, x22, x22; \ - adcs x23, x23, x23; \ - adcs x24, x24, x24; \ - adcs x25, x25, x25; \ - cset x0, hs; \ - umulh x10, x2, x2; \ - adds x11, x11, x10; \ - mul x10, x3, x3; \ - adcs x12, x12, x10; \ - umulh x10, x3, x3; \ - adcs x13, x13, x10; \ - mul x10, x4, x4; \ - adcs x14, x14, x10; \ - umulh x10, x4, x4; \ 
- adcs x15, x15, x10; \ - mul x10, x5, x5; \ - adcs x16, x16, x10; \ - umulh x10, x5, x5; \ - adcs x17, x17, x10; \ - mul x10, x6, x6; \ - adcs x19, x19, x10; \ - umulh x10, x6, x6; \ - adcs x20, x20, x10; \ - mul x10, x7, x7; \ - adcs x21, x21, x10; \ - umulh x10, x7, x7; \ - adcs x22, x22, x10; \ - mul x10, x8, x8; \ - adcs x23, x23, x10; \ - umulh x10, x8, x8; \ - adcs x24, x24, x10; \ - mul x10, x9, x9; \ - adcs x25, x25, x10; \ - umulh x10, x9, x9; \ - adc x0, x0, x10; \ - ldr x1, [P1+64]; \ - add x1, x1, x1; \ - mul x10, x1, x2; \ - adds x19, x19, x10; \ - umulh x10, x1, x2; \ - adcs x20, x20, x10; \ - mul x10, x1, x4; \ - adcs x21, x21, x10; \ - umulh x10, x1, x4; \ - adcs x22, x22, x10; \ - mul x10, x1, x6; \ - adcs x23, x23, x10; \ - umulh x10, x1, x6; \ - adcs x24, x24, x10; \ - mul x10, x1, x8; \ - adcs x25, x25, x10; \ - umulh x10, x1, x8; \ - adcs x0, x0, x10; \ - lsr x4, x1, #1; \ - mul x4, x4, x4; \ - adc x4, x4, xzr; \ - mul x10, x1, x3; \ - adds x20, x20, x10; \ - umulh x10, x1, x3; \ - adcs x21, x21, x10; \ - mul x10, x1, x5; \ - adcs x22, x22, x10; \ - umulh x10, x1, x5; \ - adcs x23, x23, x10; \ - mul x10, x1, x7; \ - adcs x24, x24, x10; \ - umulh x10, x1, x7; \ - adcs x25, x25, x10; \ - mul x10, x1, x9; \ - adcs x0, x0, x10; \ - umulh x10, x1, x9; \ - adc x4, x4, x10; \ - mul x2, x2, x2; \ - cmp xzr, xzr; \ - extr x10, x20, x19, #9; \ - adcs x2, x2, x10; \ - extr x10, x21, x20, #9; \ - adcs x11, x11, x10; \ - extr x10, x22, x21, #9; \ - adcs x12, x12, x10; \ - extr x10, x23, x22, #9; \ - adcs x13, x13, x10; \ - extr x10, x24, x23, #9; \ - adcs x14, x14, x10; \ - extr x10, x25, x24, #9; \ - adcs x15, x15, x10; \ - extr x10, x0, x25, #9; \ - adcs x16, x16, x10; \ - extr x10, x4, x0, #9; \ - adcs x17, x17, x10; \ - orr x19, x19, #0xfffffffffffffe00; \ - lsr x10, x4, #9; \ - adcs x19, x19, x10; \ - sbcs x2, x2, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbcs x15, x15, xzr; \ - sbcs x16, 
x16, xzr; \ - sbcs x17, x17, xzr; \ - sbc x19, x19, xzr; \ - and x19, x19, #0x1ff; \ - stp x2, x11, [P0]; \ - stp x12, x13, [P0+16]; \ - stp x14, x15, [P0+32]; \ - stp x16, x17, [P0+48]; \ - str x19, [P0+64] - -// Corresponds exactly to bignum_sub_p521 + add x0, P0; \ + add x1, P1; \ + bl local_sqr_p521 #define sub_p521(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - ldp x11, x12, [P1+48]; \ - ldp x4, x3, [P2+48]; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ - ldr x13, [P1+64]; \ - ldr x4, [P2+64]; \ - sbcs x13, x13, x4; \ - sbcs x5, x5, xzr; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - and x13, x13, #0x1ff; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ - str x13, [P0+64] + add x0, P0; \ + add x1, P1; \ + add x2, P2; \ + bl local_sub_p521 S2N_BN_SYMBOL(p521_jmixadd): @@ -729,6 +106,7 @@ S2N_BN_SYMBOL(p521_jmixadd): stp x23, x24, [sp, #-16]! stp x25, x26, [sp, #-16]! stp x27, x28, [sp, #-16]! + stp x29, x30, [sp, #-16]! 
sub sp, sp, NSPACE // Move the input arguments to stable places @@ -754,30 +132,1209 @@ S2N_BN_SYMBOL(p521_jmixadd): mul_p521(zzx1,zz,x_1) mul_p521(zzx2,zz,x2a) - sub_p521(x_3,ww,zzx1) + sub_p521(resx,ww,zzx1) sub_p521(t1,zzx2,zzx1) - mul_p521(z_3,xd,z_1) + mul_p521(resz,xd,z_1) - sub_p521(x_3,x_3,zzx2) + sub_p521(resx,resx,zzx2) - sub_p521(t2,zzx1,x_3) + sub_p521(t2,zzx1,resx) mul_p521(t1,t1,y_1) mul_p521(t2,yd,t2) - sub_p521(y_3,t2,t1) + sub_p521(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + ldp x0, x1, [z_1] + orr x0, x0, x1 + ldp x2, x3, [z_1+16] + orr x2, x2, x3 + ldp x4, x5, [z_1+32] + orr x4, x4, x5 + ldp x6, x7, [z_1+48] + orr x6, x6, x7 + ldr x8, [z_1+64] + orr x0, x0, x2 + orr x4, x4, x6 + orr x0, x0, x4 + orr x0, x0, x8 + cmp x0, xzr + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with an extra z = 1 +// coordinate, hence giving 0 + p2 = p2 for the final result. 
+ + ldp x0, x1, [resx] + ldp x20, x21, [x_2] + csel x0, x0, x20, ne + csel x1, x1, x21, ne + ldp x2, x3, [resx+16] + ldp x20, x21, [x_2+16] + csel x2, x2, x20, ne + csel x3, x3, x21, ne + ldp x4, x5, [resx+32] + ldp x20, x21, [x_2+32] + csel x4, x4, x20, ne + csel x5, x5, x21, ne + ldp x6, x7, [resx+48] + ldp x20, x21, [x_2+48] + csel x6, x6, x20, ne + csel x7, x7, x21, ne + ldr x8, [resx+64] + ldr x20, [x_2+64] + csel x8, x8, x20, ne + + ldp x10, x11, [resy] + ldp x20, x21, [y_2] + csel x10, x10, x20, ne + csel x11, x11, x21, ne + ldp x12, x13, [resy+16] + ldp x20, x21, [y_2+16] + csel x12, x12, x20, ne + csel x13, x13, x21, ne + ldp x14, x15, [resy+32] + ldp x20, x21, [y_2+32] + csel x14, x14, x20, ne + csel x15, x15, x21, ne + ldp x16, x17, [resy+48] + ldp x20, x21, [y_2+48] + csel x16, x16, x20, ne + csel x17, x17, x21, ne + ldr x19, [resy+64] + ldr x20, [y_2+64] + csel x19, x19, x20, ne + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [x_3+32] + stp x6, x7, [x_3+48] + str x8, [x_3+64] + stp x10, x11, [y_3] + stp x12, x13, [y_3+16] + stp x14, x15, [y_3+32] + stp x16, x17, [y_3+48] + str x19, [y_3+64] + + ldp x0, x1, [resz] + mov x20, #1 + csel x0, x0, x20, ne + csel x1, x1, xzr, ne + ldp x2, x3, [resz+16] + csel x2, x2, xzr, ne + csel x3, x3, xzr, ne + ldp x4, x5, [resz+32] + csel x4, x4, xzr, ne + csel x5, x5, xzr, ne + ldp x6, x7, [resz+48] + csel x6, x6, xzr, ne + csel x7, x7, xzr, ne + ldr x8, [resz+64] + csel x8, x8, xzr, ne + + stp x0, x1, [z_3] + stp x2, x3, [z_3+16] + stp x4, x5, [z_3+32] + stp x6, x7, [z_3+48] + str x8, [z_3+64] // Restore stack and registers add sp, sp, NSPACE + ldp x29, x30, [sp], 16 ldp x27, x28, [sp], 16 ldp x25, x26, [sp], 16 ldp x23, x24, [sp], 16 ldp x21, x22, [sp], 16 ldp x19, x20, [sp], 16 + ret + +// Local versions of the three field operations, almost identical to +// bignum_mul_p521, bignum_sqr_p521 and bignum_sub_p521 except for +// avoiding all initial register save-restore, and in the case of +// local_mul_p521, 
using the tmp buffer as temporary storage and +// avoiding x26. + +local_mul_p521: + ldp x3, x4, [x1] + ldp x5, x6, [x1, #16] + ldp x7, x8, [x2] + ldp x9, x10, [x2, #16] + mul x11, x3, x7 + mul x15, x4, x8 + mul x16, x5, x9 + mul x17, x6, x10 + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + subs x24, x5, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x9 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x16, x16, x23 + eor x22, x22, x21 + adcs x17, x17, x22 + adc x19, x19, x21 + subs x24, x3, x4 + cneg x24, x24, lo + csetm x21, lo + subs x22, x8, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x12, x12, x23 + eor x22, x22, x21 + adcs x13, x13, x22 + adcs x14, x14, x21 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x15, x15, x23 + eor x22, x22, x21 + adcs x16, x16, x22 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x13, x13, x23 + eor x22, x22, x21 + adcs x14, x14, x22 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x7 + cneg x22, x22, lo + mul x23, 
x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + lsl x21, x11, #9 + extr x11, x12, x11, #55 + extr x12, x13, x12, #55 + extr x13, x14, x13, #55 + lsr x14, x14, #55 + ldp x3, x4, [x1, #32] + ldp x5, x6, [x1, #48] + ldp x7, x8, [x2, #32] + ldp x9, x10, [x2, #48] + stp x15, x16, [tmp] + stp x17, x19, [tmp+16] + stp x21, x11, [tmp+32] + stp x12, x13, [tmp+48] + str x14, [tmp+64] + mul x11, x3, x7 + mul x15, x4, x8 + mul x16, x5, x9 + mul x17, x6, x10 + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + subs x24, x5, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x9 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x16, x16, x23 + eor x22, x22, x21 + adcs x17, x17, x22 + adc x19, x19, x21 + subs x24, x3, x4 + cneg x24, x24, lo + csetm x21, lo + subs x22, x8, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x12, x12, x23 + eor x22, x22, x21 + adcs x13, x13, x22 + adcs x14, x14, x21 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x6 + cneg x24, x24, lo + csetm 
x21, lo + subs x22, x10, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x15, x15, x23 + eor x22, x22, x21 + adcs x16, x16, x22 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x13, x13, x23 + eor x22, x22, x21 + adcs x14, x14, x22 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + ldp x23, x22, [tmp] + adds x11, x11, x23 + adcs x12, x12, x22 + stp x11, x12, [tmp] + ldp x23, x22, [tmp+16] + adcs x13, x13, x23 + adcs x14, x14, x22 + stp x13, x14, [tmp+16] + ldp x23, x22, [tmp+32] + adcs x15, x15, x23 + adcs x16, x16, x22 + stp x15, x16, [tmp+32] + ldp x23, x22, [tmp+48] + adcs x17, x17, x23 + adcs x19, x19, x22 + stp x17, x19, [tmp+48] + ldr x21, [tmp+64] + adc x21, x21, xzr + str x21, [tmp+64] + ldp x23, x22, [x1] + subs x3, x3, x23 + sbcs x4, x4, x22 + ldp x23, x22, [x1, #16] + sbcs x5, x5, x23 + sbcs x6, x6, x22 + csetm x24, lo + ldp x23, x22, [x2] + subs x7, x23, x7 + sbcs x8, x22, x8 + ldp x23, x22, [x2, #16] + sbcs x9, x23, x9 + sbcs x10, x22, x10 + csetm x25, lo + eor x3, x3, x24 + subs x3, x3, x24 + eor x4, x4, x24 + sbcs x4, x4, x24 + eor x5, 
x5, x24 + sbcs x5, x5, x24 + eor x6, x6, x24 + sbc x6, x6, x24 + eor x7, x7, x25 + subs x7, x7, x25 + eor x8, x8, x25 + sbcs x8, x8, x25 + eor x9, x9, x25 + sbcs x9, x9, x25 + eor x10, x10, x25 + sbc x10, x10, x25 + eor x25, x25, x24 + mul x11, x3, x7 + mul x15, x4, x8 + mul x16, x5, x9 + mul x17, x6, x10 + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + subs x24, x5, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x9 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x16, x16, x23 + eor x22, x22, x21 + adcs x17, x17, x22 + adc x19, x19, x21 + subs x24, x3, x4 + cneg x24, x24, lo + csetm x21, lo + subs x22, x8, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x12, x12, x23 + eor x22, x22, x21 + adcs x13, x13, x22 + adcs x14, x14, x21 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x6 + cneg x24, x24, lo + csetm x21, lo + subs x22, x10, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x15, x15, x23 + eor x22, x22, x21 + adcs x16, x16, x22 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x13, x13, x23 + eor x22, x22, x21 + adcs x14, x14, x22 + adcs x15, x15, x21 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x3, x6 + cneg x24, x24, lo + 
csetm x21, lo + subs x22, x10, x7 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + subs x24, x4, x5 + cneg x24, x24, lo + csetm x21, lo + subs x22, x9, x8 + cneg x22, x22, lo + mul x23, x24, x22 + umulh x22, x24, x22 + cinv x21, x21, lo + cmn x21, #1 + eor x23, x23, x21 + adcs x14, x14, x23 + eor x22, x22, x21 + adcs x15, x15, x22 + adcs x16, x16, x21 + adcs x17, x17, x21 + adc x19, x19, x21 + ldp x3, x4, [tmp] + ldp x5, x6, [tmp+16] + eor x11, x11, x25 + adds x11, x11, x3 + eor x12, x12, x25 + adcs x12, x12, x4 + eor x13, x13, x25 + adcs x13, x13, x5 + eor x14, x14, x25 + adcs x14, x14, x6 + eor x15, x15, x25 + ldp x7, x8, [tmp+32] + ldp x9, x10, [tmp+48] + ldr x20, [tmp+64] + adcs x15, x15, x7 + eor x16, x16, x25 + adcs x16, x16, x8 + eor x17, x17, x25 + adcs x17, x17, x9 + eor x19, x19, x25 + adcs x19, x19, x10 + adc x21, x20, xzr + adds x15, x15, x3 + adcs x16, x16, x4 + adcs x17, x17, x5 + adcs x19, x19, x6 + and x25, x25, #0x1ff + lsl x24, x11, #9 + orr x24, x24, x25 + adcs x7, x7, x24 + extr x24, x12, x11, #55 + adcs x8, x8, x24 + extr x24, x13, x12, #55 + adcs x9, x9, x24 + extr x24, x14, x13, #55 + adcs x10, x10, x24 + lsr x24, x14, #55 + adc x20, x24, x20 + ldr x6, [x2, #64] + ldp x3, x4, [x1] + and x23, x3, #0xfffffffffffff + mul x23, x6, x23 + ldr x14, [x1, #64] + ldp x11, x12, [x2] + and x24, x11, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 + extr x24, x4, x3, #52 + and x24, x24, #0xfffffffffffff + mul x22, x6, x24 + extr x24, x12, x11, #52 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x24, x22, x23, #12 + adds x15, x15, x24 + ldp x5, x3, [x1, #16] + ldp x13, x11, [x2, #16] + extr x24, x5, x4, #40 + and x24, x24, #0xfffffffffffff + mul x23, x6, x24 + extr 
x24, x13, x12, #40 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 + lsr x24, x22, #52 + add x23, x23, x24 + lsl x22, x22, #12 + extr x24, x23, x22, #24 + adcs x16, x16, x24 + extr x24, x3, x5, #28 + and x24, x24, #0xfffffffffffff + mul x22, x6, x24 + extr x24, x11, x13, #28 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x24, x22, x23, #36 + adcs x17, x17, x24 + and x25, x16, x17 + ldp x4, x5, [x1, #32] + ldp x12, x13, [x2, #32] + extr x24, x4, x3, #16 + and x24, x24, #0xfffffffffffff + mul x23, x6, x24 + extr x24, x12, x11, #16 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 + lsl x21, x21, #48 + add x23, x23, x21 + lsr x24, x22, #52 + add x23, x23, x24 + lsl x22, x22, #12 + extr x24, x23, x22, #48 + adcs x19, x19, x24 + and x25, x25, x19 + lsr x24, x4, #4 + and x24, x24, #0xfffffffffffff + mul x22, x6, x24 + lsr x24, x12, #4 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x21, x22, x23, #60 + extr x24, x5, x4, #56 + and x24, x24, #0xfffffffffffff + mul x23, x6, x24 + extr x24, x13, x12, #56 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 + lsr x24, x22, #52 + add x23, x23, x24 + lsl x21, x21, #8 + extr x24, x23, x21, #8 + adcs x7, x7, x24 + and x25, x25, x7 + ldp x3, x4, [x1, #48] + ldp x11, x12, [x2, #48] + extr x24, x3, x5, #44 + and x24, x24, #0xfffffffffffff + mul x22, x6, x24 + extr x24, x11, x13, #44 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x24, x22, x23, #20 + adcs x8, x8, x24 + and x25, x25, x8 + extr x24, x4, x3, #32 + and x24, x24, #0xfffffffffffff + mul x23, x6, x24 + extr x24, x12, x11, #32 + and x24, x24, #0xfffffffffffff + mul x24, x14, x24 + add x23, x23, x24 + lsr x24, x22, #52 + add x23, x23, 
x24 + lsl x22, x22, #12 + extr x24, x23, x22, #32 + adcs x9, x9, x24 + and x25, x25, x9 + lsr x24, x4, #20 + mul x22, x6, x24 + lsr x24, x12, #20 + mul x24, x14, x24 + add x22, x22, x24 + lsr x24, x23, #52 + add x22, x22, x24 + lsl x23, x23, #12 + extr x24, x22, x23, #44 + adcs x10, x10, x24 + and x25, x25, x10 + mul x24, x6, x14 + lsr x22, x22, #44 + add x24, x24, x22 + adc x20, x20, x24 + lsr x22, x20, #9 + orr x20, x20, #0xfffffffffffffe00 + cmp xzr, xzr + adcs xzr, x15, x22 + adcs xzr, x25, xzr + adcs xzr, x20, xzr + adcs x15, x15, x22 + adcs x16, x16, xzr + adcs x17, x17, xzr + adcs x19, x19, xzr + adcs x7, x7, xzr + adcs x8, x8, xzr + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x20, x20, xzr + and x22, x15, #0x1ff + extr x15, x16, x15, #9 + extr x16, x17, x16, #9 + stp x15, x16, [x0] + extr x17, x19, x17, #9 + extr x19, x7, x19, #9 + stp x17, x19, [x0, #16] + extr x7, x8, x7, #9 + extr x8, x9, x8, #9 + stp x7, x8, [x0, #32] + extr x9, x10, x9, #9 + extr x10, x20, x10, #9 + stp x9, x10, [x0, #48] + str x22, [x0, #64] + ret + +local_sqr_p521: + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + ldp x6, x7, [x1, #32] + ldp x8, x9, [x1, #48] + mul x12, x6, x8 + mul x17, x7, x9 + umulh x22, x6, x8 + subs x23, x6, x7 + cneg x23, x23, cc + csetm x11, cc + subs x10, x9, x8 + cneg x10, x10, cc + mul x16, x23, x10 + umulh x10, x23, x10 + cinv x11, x11, cc + eor x16, x16, x11 + eor x10, x10, x11 + adds x13, x12, x22 + adc x22, x22, xzr + umulh x23, x7, x9 + adds x13, x13, x17 + adcs x22, x22, x23 + adc x23, x23, xzr + adds x22, x22, x17 + adc x23, x23, xzr + cmn x11, #0x1 + adcs x13, x13, x16 + adcs x22, x22, x10 + adc x23, x23, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adcs x22, x22, x22 + adcs x23, x23, x23 + adc x19, xzr, xzr + mul x10, x6, x6 + mul x16, x7, x7 + mul x21, x6, x7 + umulh x11, x6, x6 + umulh x17, x7, x7 + umulh x20, x6, x7 + adds x11, x11, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x11, x11, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds 
x12, x12, x16 + adcs x13, x13, x17 + adcs x22, x22, xzr + adcs x23, x23, xzr + adc x19, x19, xzr + mul x14, x8, x8 + mul x16, x9, x9 + mul x21, x8, x9 + umulh x15, x8, x8 + umulh x17, x9, x9 + umulh x20, x8, x9 + adds x15, x15, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x15, x15, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x14, x14, x22 + adcs x15, x15, x23 + adcs x16, x16, x19 + adc x17, x17, xzr + ldr x19, [x1, #64] + add x23, x19, x19 + mul x19, x19, x19 + and x21, x2, #0xfffffffffffff + mul x21, x23, x21 + extr x20, x3, x2, #52 + and x20, x20, #0xfffffffffffff + mul x20, x23, x20 + lsr x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x22, x20, x21, #12 + adds x10, x10, x22 + extr x21, x4, x3, #40 + and x21, x21, #0xfffffffffffff + mul x21, x23, x21 + lsr x22, x20, #52 + add x21, x21, x22 + lsl x20, x20, #12 + extr x22, x21, x20, #24 + adcs x11, x11, x22 + extr x20, x5, x4, #28 + and x20, x20, #0xfffffffffffff + mul x20, x23, x20 + lsr x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x22, x20, x21, #36 + adcs x12, x12, x22 + extr x21, x6, x5, #16 + and x21, x21, #0xfffffffffffff + mul x21, x23, x21 + lsr x22, x20, #52 + add x21, x21, x22 + lsl x20, x20, #12 + extr x22, x21, x20, #48 + adcs x13, x13, x22 + lsr x20, x6, #4 + and x20, x20, #0xfffffffffffff + mul x20, x23, x20 + lsr x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x24, x20, x21, #60 + extr x21, x7, x6, #56 + and x21, x21, #0xfffffffffffff + mul x21, x23, x21 + lsr x22, x20, #52 + add x21, x21, x22 + lsl x24, x24, #8 + extr x22, x21, x24, #8 + adcs x14, x14, x22 + extr x20, x8, x7, #44 + and x20, x20, #0xfffffffffffff + mul x20, x23, x20 + lsr x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x22, x20, x21, #20 + adcs x15, x15, x22 + extr x21, x9, x8, #32 + and x21, x21, #0xfffffffffffff + mul x21, x23, x21 + lsr x22, x20, #52 + add x21, x21, x22 + lsl x20, x20, #12 + extr x22, x21, x20, #32 + adcs x16, x16, x22 + lsr x20, x9, #20 + mul 
x20, x23, x20 + lsr x22, x21, #52 + add x20, x20, x22 + lsl x21, x21, #12 + extr x22, x20, x21, #44 + adcs x17, x17, x22 + lsr x20, x20, #44 + adc x19, x19, x20 + extr x21, x11, x10, #9 + extr x20, x12, x11, #9 + stp x21, x20, [x0] + extr x21, x13, x12, #9 + extr x20, x14, x13, #9 + stp x21, x20, [x0, #16] + extr x21, x15, x14, #9 + extr x20, x16, x15, #9 + stp x21, x20, [x0, #32] + extr x21, x17, x16, #9 + extr x20, x19, x17, #9 + stp x21, x20, [x0, #48] + and x22, x10, #0x1ff + lsr x19, x19, #9 + add x22, x22, x19 + str x22, [x0, #64] + mul x12, x2, x4 + mul x17, x3, x5 + umulh x22, x2, x4 + subs x23, x2, x3 + cneg x23, x23, cc + csetm x11, cc + subs x10, x5, x4 + cneg x10, x10, cc + mul x16, x23, x10 + umulh x10, x23, x10 + cinv x11, x11, cc + eor x16, x16, x11 + eor x10, x10, x11 + adds x13, x12, x22 + adc x22, x22, xzr + umulh x23, x3, x5 + adds x13, x13, x17 + adcs x22, x22, x23 + adc x23, x23, xzr + adds x22, x22, x17 + adc x23, x23, xzr + cmn x11, #0x1 + adcs x13, x13, x16 + adcs x22, x22, x10 + adc x23, x23, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adcs x22, x22, x22 + adcs x23, x23, x23 + adc x19, xzr, xzr + mul x10, x2, x2 + mul x16, x3, x3 + mul x21, x2, x3 + umulh x11, x2, x2 + umulh x17, x3, x3 + umulh x20, x2, x3 + adds x11, x11, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x11, x11, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x12, x12, x16 + adcs x13, x13, x17 + adcs x22, x22, xzr + adcs x23, x23, xzr + adc x19, x19, xzr + mul x14, x4, x4 + mul x16, x5, x5 + mul x21, x4, x5 + umulh x15, x4, x4 + umulh x17, x5, x5 + umulh x20, x4, x5 + adds x15, x15, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x15, x15, x21 + adcs x16, x16, x20 + adc x17, x17, xzr + adds x14, x14, x22 + adcs x15, x15, x23 + adcs x16, x16, x19 + adc x17, x17, xzr + ldp x21, x20, [x0] + adds x21, x21, x10 + adcs x20, x20, x11 + stp x21, x20, [x0] + ldp x21, x20, [x0, #16] + adcs x21, x21, x12 + adcs x20, x20, x13 + stp x21, x20, [x0, #16] + ldp x21, x20, 
[x0, #32] + adcs x21, x21, x14 + adcs x20, x20, x15 + stp x21, x20, [x0, #32] + ldp x21, x20, [x0, #48] + adcs x21, x21, x16 + adcs x20, x20, x17 + stp x21, x20, [x0, #48] + ldr x22, [x0, #64] + adc x22, x22, xzr + str x22, [x0, #64] + mul x10, x2, x6 + mul x14, x3, x7 + mul x15, x4, x8 + mul x16, x5, x9 + umulh x17, x2, x6 + adds x14, x14, x17 + umulh x17, x3, x7 + adcs x15, x15, x17 + umulh x17, x4, x8 + adcs x16, x16, x17 + umulh x17, x5, x9 + adc x17, x17, xzr + adds x11, x14, x10 + adcs x14, x15, x14 + adcs x15, x16, x15 + adcs x16, x17, x16 + adc x17, xzr, x17 + adds x12, x14, x10 + adcs x13, x15, x11 + adcs x14, x16, x14 + adcs x15, x17, x15 + adcs x16, xzr, x16 + adc x17, xzr, x17 + subs x22, x4, x5 + cneg x22, x22, cc + csetm x19, cc + subs x20, x9, x8 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x15, x15, x21 + eor x20, x20, x19 + adcs x16, x16, x20 + adc x17, x17, x19 + subs x22, x2, x3 + cneg x22, x22, cc + csetm x19, cc + subs x20, x7, x6 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x11, x11, x21 + eor x20, x20, x19 + adcs x12, x12, x20 + adcs x13, x13, x19 + adcs x14, x14, x19 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x3, x5 + cneg x22, x22, cc + csetm x19, cc + subs x20, x9, x7 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x14, x14, x21 + eor x20, x20, x19 + adcs x15, x15, x20 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x2, x4 + cneg x22, x22, cc + csetm x19, cc + subs x20, x8, x6 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x12, x12, x21 + eor x20, x20, x19 + adcs x13, x13, x20 + adcs x14, x14, x19 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x2, x5 + cneg x22, x22, 
cc + csetm x19, cc + subs x20, x9, x6 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x13, x13, x21 + eor x20, x20, x19 + adcs x14, x14, x20 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x3, x4 + cneg x22, x22, cc + csetm x19, cc + subs x20, x8, x7 + cneg x20, x20, cc + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc + cmn x19, #0x1 + eor x21, x21, x19 + adcs x13, x13, x21 + eor x20, x20, x19 + adcs x14, x14, x20 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + ldp x21, x20, [x0] + extr x2, x15, x14, #8 + adds x2, x2, x21 + extr x3, x16, x15, #8 + adcs x3, x3, x20 + ldp x21, x20, [x0, #16] + extr x4, x17, x16, #8 + adcs x4, x4, x21 + and x22, x3, x4 + lsr x5, x17, #8 + adcs x5, x5, x20 + and x22, x22, x5 + ldp x21, x20, [x0, #32] + lsl x6, x10, #1 + adcs x6, x6, x21 + and x22, x22, x6 + extr x7, x11, x10, #63 + adcs x7, x7, x20 + and x22, x22, x7 + ldp x21, x20, [x0, #48] + extr x8, x12, x11, #63 + adcs x8, x8, x21 + and x22, x22, x8 + extr x9, x13, x12, #63 + adcs x9, x9, x20 + and x22, x22, x9 + ldr x21, [x0, #64] + extr x10, x14, x13, #63 + and x10, x10, #0x1ff + adc x10, x21, x10 + lsr x20, x10, #9 + orr x10, x10, #0xfffffffffffffe00 + cmp xzr, xzr + adcs xzr, x2, x20 + adcs xzr, x22, xzr + adcs xzr, x10, xzr + adcs x2, x2, x20 + adcs x3, x3, xzr + adcs x4, x4, xzr + adcs x5, x5, xzr + adcs x6, x6, xzr + adcs x7, x7, xzr + adcs x8, x8, xzr + adcs x9, x9, xzr + adc x10, x10, xzr + and x10, x10, #0x1ff + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + stp x6, x7, [x0, #32] + stp x8, x9, [x0, #48] + str x10, [x0, #64] + ret +local_sub_p521: + ldp x5, x6, [x1] + ldp x4, x3, [x2] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [x1, #16] + ldp x4, x3, [x2, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + ldp x9, x10, [x1, #32] + ldp x4, x3, [x2, #32] + sbcs x9, x9, x4 + sbcs x10, x10, x3 + ldp x11, x12, [x1, #48] + ldp x4, x3, [x2, #48] + sbcs 
x11, x11, x4 + sbcs x12, x12, x3 + ldr x13, [x1, #64] + ldr x4, [x2, #64] + sbcs x13, x13, x4 + sbcs x5, x5, xzr + sbcs x6, x6, xzr + sbcs x7, x7, xzr + sbcs x8, x8, xzr + sbcs x9, x9, xzr + sbcs x10, x10, xzr + sbcs x11, x11, xzr + sbcs x12, x12, xzr + sbcs x13, x13, xzr + and x13, x13, #0x1ff + stp x5, x6, [x0] + stp x7, x8, [x0, #16] + stp x9, x10, [x0, #32] + stp x11, x12, [x0, #48] + str x13, [x0, #64] ret #if defined(__linux__) && defined(__ELF__) diff --git a/third_party/s2n-bignum/arm/p521/p521_jmixadd_alt.S b/third_party/s2n-bignum/arm/p521/p521_jmixadd_alt.S new file mode 100644 index 0000000000..783ca28cf8 --- /dev/null +++ b/third_party/s2n-bignum/arm/p521/p521_jmixadd_alt.S @@ -0,0 +1,882 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on NIST curve P-521 in Jacobian coordinates +// +// extern void p521_jmixadd_alt +// (uint64_t p3[static 27],uint64_t p1[static 27],uint64_t p2[static 18]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. It is assumed that +// all the coordinates of the input points p1 and p2 are fully reduced +// mod p_521, that the z coordinate of p1 is nonzero and that neither +// p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents the same affine +// point as". 
+// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jmixadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jmixadd_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 72 + +// Stable homes for input arguments during main code sequence + +#define input_z x26 +#define input_x x27 +#define input_y x28 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define NSPACE (NUMSIZE*6) + +// Corresponds exactly to bignum_mul_p521_alt + +#define mul_p521(P0,P1,P2) \ + ldp x3, x4, [P1]; \ + ldp x5, x6, [P2]; \ + mul x15, x3, x5; \ + umulh x16, x3, x5; \ + mul x14, x3, x6; \ + umulh x17, x3, x6; \ + adds x16, x16, x14; \ + ldp x7, x8, [P2+16]; \ + mul x14, x3, x7; \ + umulh x19, x3, x7; \ + adcs x17, x17, x14; \ + mul x14, x3, x8; \ + umulh x20, x3, x8; \ + adcs x19, x19, x14; \ + ldp x9, x10, [P2+32]; \ + mul x14, x3, x9; \ + umulh x21, x3, x9; \ + adcs x20, x20, x14; \ + mul x14, x3, x10; \ + umulh x22, x3, x10; \ + adcs x21, x21, x14; \ + ldp x11, x12, [P2+48]; \ + mul x14, x3, x11; \ + umulh x23, x3, x11; \ + 
adcs x22, x22, x14; \ + ldr x13, [P2+64]; \ + mul x14, x3, x12; \ + umulh x24, x3, x12; \ + adcs x23, x23, x14; \ + mul x14, x3, x13; \ + umulh x1, x3, x13; \ + adcs x24, x24, x14; \ + adc x1, x1, xzr; \ + mul x14, x4, x5; \ + adds x16, x16, x14; \ + mul x14, x4, x6; \ + adcs x17, x17, x14; \ + mul x14, x4, x7; \ + adcs x19, x19, x14; \ + mul x14, x4, x8; \ + adcs x20, x20, x14; \ + mul x14, x4, x9; \ + adcs x21, x21, x14; \ + mul x14, x4, x10; \ + adcs x22, x22, x14; \ + mul x14, x4, x11; \ + adcs x23, x23, x14; \ + mul x14, x4, x12; \ + adcs x24, x24, x14; \ + mul x14, x4, x13; \ + adcs x1, x1, x14; \ + cset x0, hs; \ + umulh x14, x4, x5; \ + adds x17, x17, x14; \ + umulh x14, x4, x6; \ + adcs x19, x19, x14; \ + umulh x14, x4, x7; \ + adcs x20, x20, x14; \ + umulh x14, x4, x8; \ + adcs x21, x21, x14; \ + umulh x14, x4, x9; \ + adcs x22, x22, x14; \ + umulh x14, x4, x10; \ + adcs x23, x23, x14; \ + umulh x14, x4, x11; \ + adcs x24, x24, x14; \ + umulh x14, x4, x12; \ + adcs x1, x1, x14; \ + umulh x14, x4, x13; \ + adc x0, x0, x14; \ + stp x15, x16, [P0]; \ + ldp x3, x4, [P1+16]; \ + mul x14, x3, x5; \ + adds x17, x17, x14; \ + mul x14, x3, x6; \ + adcs x19, x19, x14; \ + mul x14, x3, x7; \ + adcs x20, x20, x14; \ + mul x14, x3, x8; \ + adcs x21, x21, x14; \ + mul x14, x3, x9; \ + adcs x22, x22, x14; \ + mul x14, x3, x10; \ + adcs x23, x23, x14; \ + mul x14, x3, x11; \ + adcs x24, x24, x14; \ + mul x14, x3, x12; \ + adcs x1, x1, x14; \ + mul x14, x3, x13; \ + adcs x0, x0, x14; \ + cset x15, hs; \ + umulh x14, x3, x5; \ + adds x19, x19, x14; \ + umulh x14, x3, x6; \ + adcs x20, x20, x14; \ + umulh x14, x3, x7; \ + adcs x21, x21, x14; \ + umulh x14, x3, x8; \ + adcs x22, x22, x14; \ + umulh x14, x3, x9; \ + adcs x23, x23, x14; \ + umulh x14, x3, x10; \ + adcs x24, x24, x14; \ + umulh x14, x3, x11; \ + adcs x1, x1, x14; \ + umulh x14, x3, x12; \ + adcs x0, x0, x14; \ + umulh x14, x3, x13; \ + adc x15, x15, x14; \ + mul x14, x4, x5; \ + adds x19, x19, x14; \ + mul x14, 
x4, x6; \ + adcs x20, x20, x14; \ + mul x14, x4, x7; \ + adcs x21, x21, x14; \ + mul x14, x4, x8; \ + adcs x22, x22, x14; \ + mul x14, x4, x9; \ + adcs x23, x23, x14; \ + mul x14, x4, x10; \ + adcs x24, x24, x14; \ + mul x14, x4, x11; \ + adcs x1, x1, x14; \ + mul x14, x4, x12; \ + adcs x0, x0, x14; \ + mul x14, x4, x13; \ + adcs x15, x15, x14; \ + cset x16, hs; \ + umulh x14, x4, x5; \ + adds x20, x20, x14; \ + umulh x14, x4, x6; \ + adcs x21, x21, x14; \ + umulh x14, x4, x7; \ + adcs x22, x22, x14; \ + umulh x14, x4, x8; \ + adcs x23, x23, x14; \ + umulh x14, x4, x9; \ + adcs x24, x24, x14; \ + umulh x14, x4, x10; \ + adcs x1, x1, x14; \ + umulh x14, x4, x11; \ + adcs x0, x0, x14; \ + umulh x14, x4, x12; \ + adcs x15, x15, x14; \ + umulh x14, x4, x13; \ + adc x16, x16, x14; \ + stp x17, x19, [P0+16]; \ + ldp x3, x4, [P1+32]; \ + mul x14, x3, x5; \ + adds x20, x20, x14; \ + mul x14, x3, x6; \ + adcs x21, x21, x14; \ + mul x14, x3, x7; \ + adcs x22, x22, x14; \ + mul x14, x3, x8; \ + adcs x23, x23, x14; \ + mul x14, x3, x9; \ + adcs x24, x24, x14; \ + mul x14, x3, x10; \ + adcs x1, x1, x14; \ + mul x14, x3, x11; \ + adcs x0, x0, x14; \ + mul x14, x3, x12; \ + adcs x15, x15, x14; \ + mul x14, x3, x13; \ + adcs x16, x16, x14; \ + cset x17, hs; \ + umulh x14, x3, x5; \ + adds x21, x21, x14; \ + umulh x14, x3, x6; \ + adcs x22, x22, x14; \ + umulh x14, x3, x7; \ + adcs x23, x23, x14; \ + umulh x14, x3, x8; \ + adcs x24, x24, x14; \ + umulh x14, x3, x9; \ + adcs x1, x1, x14; \ + umulh x14, x3, x10; \ + adcs x0, x0, x14; \ + umulh x14, x3, x11; \ + adcs x15, x15, x14; \ + umulh x14, x3, x12; \ + adcs x16, x16, x14; \ + umulh x14, x3, x13; \ + adc x17, x17, x14; \ + mul x14, x4, x5; \ + adds x21, x21, x14; \ + mul x14, x4, x6; \ + adcs x22, x22, x14; \ + mul x14, x4, x7; \ + adcs x23, x23, x14; \ + mul x14, x4, x8; \ + adcs x24, x24, x14; \ + mul x14, x4, x9; \ + adcs x1, x1, x14; \ + mul x14, x4, x10; \ + adcs x0, x0, x14; \ + mul x14, x4, x11; \ + adcs x15, x15, x14; \ 
+ mul x14, x4, x12; \ + adcs x16, x16, x14; \ + mul x14, x4, x13; \ + adcs x17, x17, x14; \ + cset x19, hs; \ + umulh x14, x4, x5; \ + adds x22, x22, x14; \ + umulh x14, x4, x6; \ + adcs x23, x23, x14; \ + umulh x14, x4, x7; \ + adcs x24, x24, x14; \ + umulh x14, x4, x8; \ + adcs x1, x1, x14; \ + umulh x14, x4, x9; \ + adcs x0, x0, x14; \ + umulh x14, x4, x10; \ + adcs x15, x15, x14; \ + umulh x14, x4, x11; \ + adcs x16, x16, x14; \ + umulh x14, x4, x12; \ + adcs x17, x17, x14; \ + umulh x14, x4, x13; \ + adc x19, x19, x14; \ + stp x20, x21, [P0+32]; \ + ldp x3, x4, [P1+48]; \ + mul x14, x3, x5; \ + adds x22, x22, x14; \ + mul x14, x3, x6; \ + adcs x23, x23, x14; \ + mul x14, x3, x7; \ + adcs x24, x24, x14; \ + mul x14, x3, x8; \ + adcs x1, x1, x14; \ + mul x14, x3, x9; \ + adcs x0, x0, x14; \ + mul x14, x3, x10; \ + adcs x15, x15, x14; \ + mul x14, x3, x11; \ + adcs x16, x16, x14; \ + mul x14, x3, x12; \ + adcs x17, x17, x14; \ + mul x14, x3, x13; \ + adcs x19, x19, x14; \ + cset x20, hs; \ + umulh x14, x3, x5; \ + adds x23, x23, x14; \ + umulh x14, x3, x6; \ + adcs x24, x24, x14; \ + umulh x14, x3, x7; \ + adcs x1, x1, x14; \ + umulh x14, x3, x8; \ + adcs x0, x0, x14; \ + umulh x14, x3, x9; \ + adcs x15, x15, x14; \ + umulh x14, x3, x10; \ + adcs x16, x16, x14; \ + umulh x14, x3, x11; \ + adcs x17, x17, x14; \ + umulh x14, x3, x12; \ + adcs x19, x19, x14; \ + umulh x14, x3, x13; \ + adc x20, x20, x14; \ + mul x14, x4, x5; \ + adds x23, x23, x14; \ + mul x14, x4, x6; \ + adcs x24, x24, x14; \ + mul x14, x4, x7; \ + adcs x1, x1, x14; \ + mul x14, x4, x8; \ + adcs x0, x0, x14; \ + mul x14, x4, x9; \ + adcs x15, x15, x14; \ + mul x14, x4, x10; \ + adcs x16, x16, x14; \ + mul x14, x4, x11; \ + adcs x17, x17, x14; \ + mul x14, x4, x12; \ + adcs x19, x19, x14; \ + mul x14, x4, x13; \ + adcs x20, x20, x14; \ + cset x21, hs; \ + umulh x14, x4, x5; \ + adds x24, x24, x14; \ + umulh x14, x4, x6; \ + adcs x1, x1, x14; \ + umulh x14, x4, x7; \ + adcs x0, x0, x14; \ + umulh 
x14, x4, x8; \ + adcs x15, x15, x14; \ + umulh x14, x4, x9; \ + adcs x16, x16, x14; \ + umulh x14, x4, x10; \ + adcs x17, x17, x14; \ + umulh x14, x4, x11; \ + adcs x19, x19, x14; \ + umulh x14, x4, x12; \ + adcs x20, x20, x14; \ + umulh x14, x4, x13; \ + adc x21, x21, x14; \ + stp x22, x23, [P0+48]; \ + ldr x3, [P1+64]; \ + mul x14, x3, x5; \ + adds x24, x24, x14; \ + mul x14, x3, x6; \ + adcs x1, x1, x14; \ + mul x14, x3, x7; \ + adcs x0, x0, x14; \ + mul x14, x3, x8; \ + adcs x15, x15, x14; \ + mul x14, x3, x9; \ + adcs x16, x16, x14; \ + mul x14, x3, x10; \ + adcs x17, x17, x14; \ + mul x14, x3, x11; \ + adcs x19, x19, x14; \ + mul x14, x3, x12; \ + adcs x20, x20, x14; \ + mul x14, x3, x13; \ + adc x21, x21, x14; \ + umulh x14, x3, x5; \ + adds x1, x1, x14; \ + umulh x14, x3, x6; \ + adcs x0, x0, x14; \ + umulh x14, x3, x7; \ + adcs x15, x15, x14; \ + umulh x14, x3, x8; \ + adcs x16, x16, x14; \ + umulh x14, x3, x9; \ + adcs x17, x17, x14; \ + umulh x14, x3, x10; \ + adcs x19, x19, x14; \ + umulh x14, x3, x11; \ + adcs x20, x20, x14; \ + umulh x14, x3, x12; \ + adc x21, x21, x14; \ + cmp xzr, xzr; \ + ldp x5, x6, [P0]; \ + extr x14, x1, x24, #9; \ + adcs x5, x5, x14; \ + extr x14, x0, x1, #9; \ + adcs x6, x6, x14; \ + ldp x7, x8, [P0+16]; \ + extr x14, x15, x0, #9; \ + adcs x7, x7, x14; \ + extr x14, x16, x15, #9; \ + adcs x8, x8, x14; \ + ldp x9, x10, [P0+32]; \ + extr x14, x17, x16, #9; \ + adcs x9, x9, x14; \ + extr x14, x19, x17, #9; \ + adcs x10, x10, x14; \ + ldp x11, x12, [P0+48]; \ + extr x14, x20, x19, #9; \ + adcs x11, x11, x14; \ + extr x14, x21, x20, #9; \ + adcs x12, x12, x14; \ + orr x13, x24, #0xfffffffffffffe00; \ + lsr x14, x21, #9; \ + adcs x13, x13, x14; \ + sbcs x5, x5, xzr; \ + sbcs x6, x6, xzr; \ + sbcs x7, x7, xzr; \ + sbcs x8, x8, xzr; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbc x13, x13, xzr; \ + and x13, x13, #0x1ff; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, 
x10, [P0+32]; \ + stp x11, x12, [P0+48]; \ + str x13, [P0+64] + +// Corresponds exactly to bignum_sqr_p521_alt + +#define sqr_p521(P0,P1) \ + ldp x2, x3, [P1]; \ + mul x11, x2, x3; \ + umulh x12, x2, x3; \ + ldp x4, x5, [P1+16]; \ + mul x10, x2, x4; \ + umulh x13, x2, x4; \ + adds x12, x12, x10; \ + ldp x6, x7, [P1+32]; \ + mul x10, x2, x5; \ + umulh x14, x2, x5; \ + adcs x13, x13, x10; \ + ldp x8, x9, [P1+48]; \ + mul x10, x2, x6; \ + umulh x15, x2, x6; \ + adcs x14, x14, x10; \ + mul x10, x2, x7; \ + umulh x16, x2, x7; \ + adcs x15, x15, x10; \ + mul x10, x2, x8; \ + umulh x17, x2, x8; \ + adcs x16, x16, x10; \ + mul x10, x2, x9; \ + umulh x19, x2, x9; \ + adcs x17, x17, x10; \ + adc x19, x19, xzr; \ + mul x10, x3, x4; \ + adds x13, x13, x10; \ + mul x10, x3, x5; \ + adcs x14, x14, x10; \ + mul x10, x3, x6; \ + adcs x15, x15, x10; \ + mul x10, x3, x7; \ + adcs x16, x16, x10; \ + mul x10, x3, x8; \ + adcs x17, x17, x10; \ + mul x10, x3, x9; \ + adcs x19, x19, x10; \ + cset x20, hs; \ + umulh x10, x3, x4; \ + adds x14, x14, x10; \ + umulh x10, x3, x5; \ + adcs x15, x15, x10; \ + umulh x10, x3, x6; \ + adcs x16, x16, x10; \ + umulh x10, x3, x7; \ + adcs x17, x17, x10; \ + umulh x10, x3, x8; \ + adcs x19, x19, x10; \ + umulh x10, x3, x9; \ + adc x20, x20, x10; \ + mul x10, x6, x7; \ + umulh x21, x6, x7; \ + adds x20, x20, x10; \ + adc x21, x21, xzr; \ + mul x10, x4, x5; \ + adds x15, x15, x10; \ + mul x10, x4, x6; \ + adcs x16, x16, x10; \ + mul x10, x4, x7; \ + adcs x17, x17, x10; \ + mul x10, x4, x8; \ + adcs x19, x19, x10; \ + mul x10, x4, x9; \ + adcs x20, x20, x10; \ + mul x10, x6, x8; \ + adcs x21, x21, x10; \ + cset x22, hs; \ + umulh x10, x4, x5; \ + adds x16, x16, x10; \ + umulh x10, x4, x6; \ + adcs x17, x17, x10; \ + umulh x10, x4, x7; \ + adcs x19, x19, x10; \ + umulh x10, x4, x8; \ + adcs x20, x20, x10; \ + umulh x10, x4, x9; \ + adcs x21, x21, x10; \ + umulh x10, x6, x8; \ + adc x22, x22, x10; \ + mul x10, x7, x8; \ + umulh x23, x7, x8; \ + adds x22, 
x22, x10; \ + adc x23, x23, xzr; \ + mul x10, x5, x6; \ + adds x17, x17, x10; \ + mul x10, x5, x7; \ + adcs x19, x19, x10; \ + mul x10, x5, x8; \ + adcs x20, x20, x10; \ + mul x10, x5, x9; \ + adcs x21, x21, x10; \ + mul x10, x6, x9; \ + adcs x22, x22, x10; \ + mul x10, x7, x9; \ + adcs x23, x23, x10; \ + cset x24, hs; \ + umulh x10, x5, x6; \ + adds x19, x19, x10; \ + umulh x10, x5, x7; \ + adcs x20, x20, x10; \ + umulh x10, x5, x8; \ + adcs x21, x21, x10; \ + umulh x10, x5, x9; \ + adcs x22, x22, x10; \ + umulh x10, x6, x9; \ + adcs x23, x23, x10; \ + umulh x10, x7, x9; \ + adc x24, x24, x10; \ + mul x10, x8, x9; \ + umulh x25, x8, x9; \ + adds x24, x24, x10; \ + adc x25, x25, xzr; \ + adds x11, x11, x11; \ + adcs x12, x12, x12; \ + adcs x13, x13, x13; \ + adcs x14, x14, x14; \ + adcs x15, x15, x15; \ + adcs x16, x16, x16; \ + adcs x17, x17, x17; \ + adcs x19, x19, x19; \ + adcs x20, x20, x20; \ + adcs x21, x21, x21; \ + adcs x22, x22, x22; \ + adcs x23, x23, x23; \ + adcs x24, x24, x24; \ + adcs x25, x25, x25; \ + cset x0, hs; \ + umulh x10, x2, x2; \ + adds x11, x11, x10; \ + mul x10, x3, x3; \ + adcs x12, x12, x10; \ + umulh x10, x3, x3; \ + adcs x13, x13, x10; \ + mul x10, x4, x4; \ + adcs x14, x14, x10; \ + umulh x10, x4, x4; \ + adcs x15, x15, x10; \ + mul x10, x5, x5; \ + adcs x16, x16, x10; \ + umulh x10, x5, x5; \ + adcs x17, x17, x10; \ + mul x10, x6, x6; \ + adcs x19, x19, x10; \ + umulh x10, x6, x6; \ + adcs x20, x20, x10; \ + mul x10, x7, x7; \ + adcs x21, x21, x10; \ + umulh x10, x7, x7; \ + adcs x22, x22, x10; \ + mul x10, x8, x8; \ + adcs x23, x23, x10; \ + umulh x10, x8, x8; \ + adcs x24, x24, x10; \ + mul x10, x9, x9; \ + adcs x25, x25, x10; \ + umulh x10, x9, x9; \ + adc x0, x0, x10; \ + ldr x1, [P1+64]; \ + add x1, x1, x1; \ + mul x10, x1, x2; \ + adds x19, x19, x10; \ + umulh x10, x1, x2; \ + adcs x20, x20, x10; \ + mul x10, x1, x4; \ + adcs x21, x21, x10; \ + umulh x10, x1, x4; \ + adcs x22, x22, x10; \ + mul x10, x1, x6; \ + adcs x23, x23, 
x10; \ + umulh x10, x1, x6; \ + adcs x24, x24, x10; \ + mul x10, x1, x8; \ + adcs x25, x25, x10; \ + umulh x10, x1, x8; \ + adcs x0, x0, x10; \ + lsr x4, x1, #1; \ + mul x4, x4, x4; \ + adc x4, x4, xzr; \ + mul x10, x1, x3; \ + adds x20, x20, x10; \ + umulh x10, x1, x3; \ + adcs x21, x21, x10; \ + mul x10, x1, x5; \ + adcs x22, x22, x10; \ + umulh x10, x1, x5; \ + adcs x23, x23, x10; \ + mul x10, x1, x7; \ + adcs x24, x24, x10; \ + umulh x10, x1, x7; \ + adcs x25, x25, x10; \ + mul x10, x1, x9; \ + adcs x0, x0, x10; \ + umulh x10, x1, x9; \ + adc x4, x4, x10; \ + mul x2, x2, x2; \ + cmp xzr, xzr; \ + extr x10, x20, x19, #9; \ + adcs x2, x2, x10; \ + extr x10, x21, x20, #9; \ + adcs x11, x11, x10; \ + extr x10, x22, x21, #9; \ + adcs x12, x12, x10; \ + extr x10, x23, x22, #9; \ + adcs x13, x13, x10; \ + extr x10, x24, x23, #9; \ + adcs x14, x14, x10; \ + extr x10, x25, x24, #9; \ + adcs x15, x15, x10; \ + extr x10, x0, x25, #9; \ + adcs x16, x16, x10; \ + extr x10, x4, x0, #9; \ + adcs x17, x17, x10; \ + orr x19, x19, #0xfffffffffffffe00; \ + lsr x10, x4, #9; \ + adcs x19, x19, x10; \ + sbcs x2, x2, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + sbcs x14, x14, xzr; \ + sbcs x15, x15, xzr; \ + sbcs x16, x16, xzr; \ + sbcs x17, x17, xzr; \ + sbc x19, x19, xzr; \ + and x19, x19, #0x1ff; \ + stp x2, x11, [P0]; \ + stp x12, x13, [P0+16]; \ + stp x14, x15, [P0+32]; \ + stp x16, x17, [P0+48]; \ + str x19, [P0+64] + +// Corresponds exactly to bignum_sub_p521 + +#define sub_p521(P0,P1,P2) \ + ldp x5, x6, [P1]; \ + ldp x4, x3, [P2]; \ + subs x5, x5, x4; \ + sbcs x6, x6, x3; \ + ldp x7, x8, [P1+16]; \ + ldp x4, x3, [P2+16]; \ + sbcs x7, x7, x4; \ + sbcs x8, x8, x3; \ + ldp x9, x10, [P1+32]; \ + ldp x4, x3, [P2+32]; \ + sbcs x9, x9, x4; \ + sbcs x10, x10, x3; \ + ldp x11, x12, [P1+48]; \ + ldp x4, x3, [P2+48]; \ + sbcs x11, x11, x4; \ + sbcs x12, x12, x3; \ + ldr x13, [P1+64]; \ + ldr x4, [P2+64]; \ + sbcs x13, x13, x4; \ + sbcs x5, x5, xzr; \ 
+ sbcs x6, x6, xzr; \ + sbcs x7, x7, xzr; \ + sbcs x8, x8, xzr; \ + sbcs x9, x9, xzr; \ + sbcs x10, x10, xzr; \ + sbcs x11, x11, xzr; \ + sbcs x12, x12, xzr; \ + sbcs x13, x13, xzr; \ + and x13, x13, #0x1ff; \ + stp x5, x6, [P0]; \ + stp x7, x8, [P0+16]; \ + stp x9, x10, [P0+32]; \ + stp x11, x12, [P0+48]; \ + str x13, [P0+64] + +S2N_BN_SYMBOL(p521_jmixadd_alt): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + stp x27, x28, [sp, #-16]! + sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations + + sqr_p521(zp2,z_1) + mul_p521(y2a,z_1,y_2) + + mul_p521(x2a,zp2,x_2) + mul_p521(y2a,zp2,y2a) + + sub_p521(xd,x2a,x_1) + sub_p521(yd,y2a,y_1) + + sqr_p521(zz,xd) + sqr_p521(ww,yd) + + mul_p521(zzx1,zz,x_1) + mul_p521(zzx2,zz,x2a) + + sub_p521(resx,ww,zzx1) + sub_p521(t1,zzx2,zzx1) + + mul_p521(resz,xd,z_1) + + sub_p521(resx,resx,zzx2) + + sub_p521(t2,zzx1,resx) + + mul_p521(t1,t1,y_1) + mul_p521(t2,yd,t2) + + sub_p521(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + ldp x0, x1, [z_1] + orr x0, x0, x1 + ldp x2, x3, [z_1+16] + orr x2, x2, x3 + ldp x4, x5, [z_1+32] + orr x4, x4, x5 + ldp x6, x7, [z_1+48] + orr x6, x6, x7 + ldr x8, [z_1+64] + orr x0, x0, x2 + orr x4, x4, x6 + orr x0, x0, x4 + orr x0, x0, x8 + cmp x0, xzr + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with an extra z = 1 +// coordinate, hence giving 0 + p2 = p2 for the final result. 
+ + ldp x0, x1, [resx] + ldp x20, x21, [x_2] + csel x0, x0, x20, ne + csel x1, x1, x21, ne + ldp x2, x3, [resx+16] + ldp x20, x21, [x_2+16] + csel x2, x2, x20, ne + csel x3, x3, x21, ne + ldp x4, x5, [resx+32] + ldp x20, x21, [x_2+32] + csel x4, x4, x20, ne + csel x5, x5, x21, ne + ldp x6, x7, [resx+48] + ldp x20, x21, [x_2+48] + csel x6, x6, x20, ne + csel x7, x7, x21, ne + ldr x8, [resx+64] + ldr x20, [x_2+64] + csel x8, x8, x20, ne + + ldp x10, x11, [resy] + ldp x20, x21, [y_2] + csel x10, x10, x20, ne + csel x11, x11, x21, ne + ldp x12, x13, [resy+16] + ldp x20, x21, [y_2+16] + csel x12, x12, x20, ne + csel x13, x13, x21, ne + ldp x14, x15, [resy+32] + ldp x20, x21, [y_2+32] + csel x14, x14, x20, ne + csel x15, x15, x21, ne + ldp x16, x17, [resy+48] + ldp x20, x21, [y_2+48] + csel x16, x16, x20, ne + csel x17, x17, x21, ne + ldr x19, [resy+64] + ldr x20, [y_2+64] + csel x19, x19, x20, ne + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [x_3+32] + stp x6, x7, [x_3+48] + str x8, [x_3+64] + stp x10, x11, [y_3] + stp x12, x13, [y_3+16] + stp x14, x15, [y_3+32] + stp x16, x17, [y_3+48] + str x19, [y_3+64] + + ldp x0, x1, [resz] + mov x20, #1 + csel x0, x0, x20, ne + csel x1, x1, xzr, ne + ldp x2, x3, [resz+16] + csel x2, x2, xzr, ne + csel x3, x3, xzr, ne + ldp x4, x5, [resz+32] + csel x4, x4, xzr, ne + csel x5, x5, xzr, ne + ldp x6, x7, [resz+48] + csel x6, x6, xzr, ne + csel x7, x7, xzr, ne + ldr x8, [resz+64] + csel x8, x8, xzr, ne + + stp x0, x1, [z_3] + stp x2, x3, [z_3+16] + stp x4, x5, [z_3+32] + stp x6, x7, [z_3+48] + str x8, [z_3+64] + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x27, x28, [sp], 16 + ldp x25, x26, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519.S 
b/third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519.S index 87e5e9cf62..b9f7cdaa16 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519.S +++ b/third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519.S @@ -755,8 +755,7 @@ curve25519_x25519_scalarloop: // Multiplex directly into (xn,zn) then do three pure doubling steps; // this accounts for the implicit zeroing of the three lowest bits -// of the scalar. On the very last doubling we *fully* reduce zn mod -// p_25519 to ease checking for degeneracy below. +// of the scalar. movq swap, %rdx testq %rdx, %rdx @@ -788,12 +787,12 @@ curve25519_x25519_scalarloop: sub_twice4(p,s,d) cmadd_4(e,0x1db42,p,d) mul_4(xn,s,d) - mul_p25519(zn,p,e) + mul_4(zn,p,e) // The projective result of the scalar multiplication is now (xn,zn). -// Prepare to call the modular inverse function to get xm = 1/zn +// Prepare to call the modular inverse function to get zn' = 1/zn - leaq 256(%rsp), %rdi + leaq 224(%rsp), %rdi leaq 224(%rsp), %rsi // Inline copy of bignum_inv_p25519, identical except for stripping out @@ -801,7 +800,7 @@ curve25519_x25519_scalarloop: // and reclaiming room on the stack. For more details and explanations see // "x86/curve25519/bignum_inv_p25519.S". Note that the stack it uses for // its own temporaries is 208 bytes, so it has no effect on variables -// that are needed in the rest of our computation here: res, xm and zn. +// that are needed in the rest of our computation here: res, xn and zn. movq %rdi, 0xc0(%rsp) xorl %eax, %eax @@ -2149,31 +2148,14 @@ curve25519_x25519_midloop: movq %r14, 0x10(%rdi) movq %r15, 0x18(%rdi) -// Since we eventually want to return 0 when the result is the point at -// infinity, we force xn = 0 whenever zn = 0. This avoids building in a -// dependency on the behavior of modular inverse in out-of-scope cases. 
- - movq 224(%rsp), %rax - orq 232(%rsp), %rax - orq 240(%rsp), %rax - orq 248(%rsp), %rax - movq 320(%rsp), %rcx - cmovzq %rax, %rcx - movq %rcx, 320(%rsp) - movq 328(%rsp), %rcx - cmovzq %rax, %rcx - movq %rcx, 328(%rsp) - movq 336(%rsp), %rcx - cmovzq %rax, %rcx - movq %rcx, 336(%rsp) - movq 344(%rsp), %rcx - cmovzq %rax, %rcx - movq %rcx, 344(%rsp) - // Now the result is xn * (1/zn), fully reduced modulo p. +// Note that in the degenerate case zn = 0 (mod p_25519), the +// modular inverse code above will produce 1/zn = 0, giving +// the correct overall X25519 result of zero for the point at +// infinity. movq res, %rbp - mul_p25519(resx,xn,xm) + mul_p25519(resx,xn,zn) // Restore stack and registers diff --git a/third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519_alt.S b/third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519_alt.S index 4a63a55f11..f7c6c3d7b0 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519_alt.S +++ b/third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519_alt.S @@ -916,8 +916,7 @@ curve25519_x25519_alt_scalarloop: // Multiplex directly into (xn,zn) then do three pure doubling steps; // this accounts for the implicit zeroing of the three lowest bits -// of the scalar. On the very last doubling we *fully* reduce zn mod -// p_25519 to ease checking for degeneracy below. +// of the scalar. movq swap, %rdx testq %rdx, %rdx @@ -949,12 +948,12 @@ curve25519_x25519_alt_scalarloop: sub_twice4(p,s,d) cmadd_4(e,0x1db42,p,d) mul_4(xn,s,d) - mul_p25519(zn,p,e) + mul_4(zn,p,e) // The projective result of the scalar multiplication is now (xn,zn). -// Prepare to call the modular inverse function to get xm = 1/zn +// Prepare to call the modular inverse function to get zn' = 1/zn - leaq 256(%rsp), %rdi + leaq 224(%rsp), %rdi leaq 224(%rsp), %rsi // Inline copy of bignum_inv_p25519, identical except for stripping out @@ -962,7 +961,7 @@ curve25519_x25519_alt_scalarloop: // and reclaiming room on the stack. 
For more details and explanations see // "x86/curve25519/bignum_inv_p25519.S". Note that the stack it uses for // its own temporaries is 208 bytes, so it has no effect on variables -// that are needed in the rest of our computation here: res, xm and zn. +// that are needed in the rest of our computation here: res, xn and zn. movq %rdi, 0xc0(%rsp) xorl %eax, %eax @@ -2310,31 +2309,14 @@ curve25519_x25519_alt_midloop: movq %r14, 0x10(%rdi) movq %r15, 0x18(%rdi) -// Since we eventually want to return 0 when the result is the point at -// infinity, we force xn = 0 whenever zn = 0. This avoids building in a -// dependency on the behavior of modular inverse in out-of-scope cases. - - movq 224(%rsp), %rax - orq 232(%rsp), %rax - orq 240(%rsp), %rax - orq 248(%rsp), %rax - movq 320(%rsp), %rcx - cmovzq %rax, %rcx - movq %rcx, 320(%rsp) - movq 328(%rsp), %rcx - cmovzq %rax, %rcx - movq %rcx, 328(%rsp) - movq 336(%rsp), %rcx - cmovzq %rax, %rcx - movq %rcx, 336(%rsp) - movq 344(%rsp), %rcx - cmovzq %rax, %rcx - movq %rcx, 344(%rsp) - // Now the result is xn * (1/zn), fully reduced modulo p. +// Note that in the degenerate case zn = 0 (mod p_25519), the +// modular inverse code above will produce 1/zn = 0, giving +// the correct overall X25519 result of zero for the point at +// infinity. movq res, %rbp - mul_p25519(resx,xn,xm) + mul_p25519(resx,xn,zn) // Restore stack and registers diff --git a/third_party/s2n-bignum/x86_att/p384/p384_montjadd.S b/third_party/s2n-bignum/x86_att/p384/p384_montjadd.S index 27b58bfc14..6078082204 100644 --- a/third_party/s2n-bignum/x86_att/p384/p384_montjadd.S +++ b/third_party/s2n-bignum/x86_att/p384/p384_montjadd.S @@ -28,7 +28,8 @@ // Pointer-offset pairs for inputs and outputs // These assume %rdi = p3, %rsi = p1 and %rcx = p2, -// which needs to be set up explicitly before use +// which needs to be set up explicitly before use. +// The %rdi value never changes, however. 
#define x_1 0(%rsi) #define y_1 NUMSIZE(%rsi) @@ -52,6 +53,7 @@ #define z1sq (NUMSIZE*0)(%rsp) #define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) #define yd (NUMSIZE*1)(%rsp) #define y2a (NUMSIZE*1)(%rsp) @@ -65,9 +67,11 @@ #define t2 (NUMSIZE*4)(%rsp) #define x1a (NUMSIZE*4)(%rsp) #define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) #define xd (NUMSIZE*5)(%rsp) #define z2sq (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) #define y1a (NUMSIZE*6)(%rsp) @@ -75,9 +79,8 @@ #define input_x (NUMSIZE*7)(%rsp) #define input_y (NUMSIZE*7+8)(%rsp) -#define input_z (NUMSIZE*7+16)(%rsp) -#define NSPACE (NUMSIZE*7+24) +#define NSPACE (NUMSIZE*7+16) // Corresponds exactly to bignum_montmul_p384 @@ -843,6 +846,52 @@ sbbq $0x0, %r11 ; \ movq %r11, 0x28+P0 +// Additional macros to help with final multiplexing + +#define load6(r0,r1,r2,r3,r4,r5,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 ; \ + movq 32+P, r4 ; \ + movq 40+P, r5 + +#define store6(P,r0,r1,r2,r3,r4,r5) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P ; \ + movq r4, 32+P ; \ + movq r5, 40+P ; \ + +#define czload6(r0,r1,r2,r3,r4,r5,P) \ + cmovzq P, r0 ; \ + cmovzq 8+P, r1 ; \ + cmovzq 16+P, r2 ; \ + cmovzq 24+P, r3 ; \ + cmovzq 32+P, r4 ; \ + cmovzq 40+P, r5 + +#define muxload6(r0,r1,r2,r3,r4,r5,P0,P1,P2) \ + movq P0, r0 ; \ + cmovbq P1, r0 ; \ + cmovnbe P2, r0 ; \ + movq 8+P0, r1 ; \ + cmovbq 8+P1, r1 ; \ + cmovnbe 8+P2, r1 ; \ + movq 16+P0, r2 ; \ + cmovbq 16+P1, r2 ; \ + cmovnbe 16+P2, r2 ; \ + movq 24+P0, r3 ; \ + cmovbq 24+P1, r3 ; \ + cmovnbe 24+P2, r3 ; \ + movq 32+P0, r4 ; \ + cmovbq 32+P1, r4 ; \ + cmovnbe 32+P2, r4 ; \ + movq 40+P0, r5 ; \ + cmovbq 40+P1, r5 ; \ + cmovnbe 40+P2, r5 + S2N_BN_SYMBOL(p384_montjadd): #if WINDOWS_ABI @@ -865,7 +914,6 @@ S2N_BN_SYMBOL(p384_montjadd): subq $NSPACE, %rsp - movq %rdi, input_z movq %rsi, input_x movq %rdx, input_y @@ -899,28 +947,77 @@ S2N_BN_SYMBOL(p384_montjadd): montmul_p384(zzx1,zz,x1a) 
montmul_p384(zzx2,zz,x2a) - movq input_z, %rdi - sub_p384(x_3,ww,zzx1) + sub_p384(resx,ww,zzx1) sub_p384(t1,zzx2,zzx1) movq input_x, %rsi montmul_p384(xd,xd,z_1) - movq input_z, %rdi - sub_p384(x_3,x_3,zzx2) + sub_p384(resx,resx,zzx2) - movq input_z, %rdi - sub_p384(t2,zzx1,x_3) + sub_p384(t2,zzx1,resx) montmul_p384(t1,t1,y1a) - movq input_z, %rdi movq input_y, %rcx - montmul_p384(z_3,xd,z_2) + montmul_p384(resz,xd,z_2) montmul_p384(t2,yd,t2) - movq input_z, %rdi - sub_p384(y_3,t2,t1) + sub_p384(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "NBE" <=> ~(CF \/ ZF) <=> P1 = 0 /\ ~(P2 = 0) +// and "B" <=> CF <=> ~(P1 = 0) /\ P2 = 0 +// and "Z" <=> ZF <=> (P1 = 0 <=> P2 = 0) +// Multiplex the z outputs accordingly and re-store in resz + + movq input_y, %rcx + load6(%r8,%r9,%r10,%r11,%rbx,%rbp,z_2) + movq %r8, %rax + movq %r9, %rdx + orq %r10, %rax + orq %r11, %rdx + orq %rbx, %rax + orq %rbp, %rdx + orq %rdx, %rax + negq %rax + sbbq %rax, %rax + + movq input_x, %rsi + load6(%r12,%r13,%r14,%r15,%rdx,%rcx,z_1) + cmovzq %r12, %r8 + cmovzq %r13, %r9 + cmovzq %r14, %r10 + cmovzq %r15, %r11 + cmovzq %rdx, %rbx + cmovzq %rcx, %rbp + orq %r13, %r12 + orq %r15, %r14 + orq %rcx, %rdx + orq %r14, %r12 + orq %r12, %rdx + negq %rdx + sbbq %rdx, %rdx + + cmpq %rdx, %rax + + czload6(%r8,%r9,%r10,%r11,%rbx,%rbp,resz) + store6(resz,%r8,%r9,%r10,%r11,%rbx,%rbp) + +// Multiplex the x and y outputs too, keeping the results in registers + + movq input_y, %rcx + movq input_x, %rsi + muxload6(%r8,%r9,%r10,%r11,%rbx,%rbp,resx,x_1,x_2) + muxload6(%r12,%r13,%r14,%r15,%rdx,%rax,resy,y_1,y_2) + +// Finally store back the multiplexed values + + store6(x_3,%r8,%r9,%r10,%r11,%rbx,%rbp) + load6(%r8,%r9,%r10,%r11,%rbx,%rbp,resz) + store6(y_3,%r12,%r13,%r14,%r15,%rdx,%rax) + store6(z_3,%r8,%r9,%r10,%r11,%rbx,%rbp) // Restore stack and registers diff --git 
a/third_party/s2n-bignum/x86_att/p384/p384_montjadd_alt.S b/third_party/s2n-bignum/x86_att/p384/p384_montjadd_alt.S new file mode 100644 index 0000000000..e36a60f331 --- /dev/null +++ b/third_party/s2n-bignum/x86_att/p384/p384_montjadd_alt.S @@ -0,0 +1,965 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on NIST curve P-384 in Montgomery-Jacobian coordinates +// +// extern void p384_montjadd_alt +// (uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 18]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjadd_alt) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 48 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1 and %rcx = p2, +// which needs to be set up explicitly before use. +// The %rdi value never changes, however. 
+ +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rcx) +#define y_2 NUMSIZE(%rcx) +#define z_2 (2*NUMSIZE)(%rcx) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// In one place it's convenient to use another register +// since the squaring function overwrites %rcx + +#define z_2_alt (2*NUMSIZE)(%rsi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define x1a (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define z2sq (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define y1a (NUMSIZE*6)(%rsp) + +// Temporaries for the actual input pointers + +#define input_x (NUMSIZE*7)(%rsp) +#define input_y (NUMSIZE*7+8)(%rsp) + +#define NSPACE (NUMSIZE*7+16) + +// Corresponds exactly to bignum_montmul_p384_alt + +#define montmul_p384(P0,P1,P2) \ + movq P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + xorl %r10d, %r10d ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + xorl %r11d, %r11d ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + xorl %r12d, %r12d ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + xorl %r13d, %r13d ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + xorl %r14d, %r14d ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + xorl %r15d, %r15d ; \ + movq %r8, %rbx ; \ + shlq $0x20, %rbx ; \ + 
addq %r8, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r8 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r8, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbp, %r11 ; \ + sbbq $0x0, %r12 ; \ + sbbq $0x0, %r13 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x8+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %r8, %r8 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r8, %r8 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r8, %r8 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r8, %r8 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r8, %r8 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r8, %r8 ; \ + negq %r8; \ + movq %r9, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r9, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r9 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r9, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbp, %r12 ; \ + sbbq $0x0, %r13 ; \ + sbbq $0x0, %r14 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x10+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r9, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r9, %r9 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r9, %r9 ; \ + movq 0x18+P1, %rax ; \ 
+ mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r9, %r9 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r9, %r9 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + sbbq %r9, %r9 ; \ + negq %r9; \ + movq %r10, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r10, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r10 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r10, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r11 ; \ + sbbq %rdx, %r12 ; \ + sbbq %rbp, %r13 ; \ + sbbq $0x0, %r14 ; \ + sbbq $0x0, %r15 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x18+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r10, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r10, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r10, %r10 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r10, %r10 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + sbbq %r10, %r10 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %r10, %r10 ; \ + negq %r10; \ + movq %r11, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r11, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r11 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r11, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r12 ; \ + sbbq %rdx, %r13 ; \ + sbbq %rbp, %r14 ; \ + sbbq $0x0, %r15 ; \ + sbbq $0x0, %r8 ; \ + sbbq $0x0, %rbx ; \ + addq 
%rbx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x20+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r11, %r11 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r11, %r11 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r11, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + sbbq %r11, %r11 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %r11, %r11 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %r11, %r11 ; \ + negq %r11; \ + movq %r12, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r12, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r12 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r12, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r13 ; \ + sbbq %rdx, %r14 ; \ + sbbq %rbp, %r15 ; \ + sbbq $0x0, %r8 ; \ + sbbq $0x0, %r9 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x28+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r12, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + sbbq %r12, %r12 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %r12, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %r12, %r12 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + 
sbbq %r12, %r12 ; \ + negq %r12; \ + movq %r13, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r13, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r13 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r13, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r14 ; \ + sbbq %rdx, %r15 ; \ + sbbq %rbp, %r8 ; \ + sbbq $0x0, %r9 ; \ + sbbq $0x0, %r10 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorl %edx, %edx ; \ + xorl %ebp, %ebp ; \ + xorl %r13d, %r13d ; \ + movq $0xffffffff00000001, %rax ; \ + addq %r14, %rax ; \ + movl $0xffffffff, %ebx ; \ + adcq %r15, %rbx ; \ + movl $0x1, %ecx ; \ + adcq %r8, %rcx ; \ + adcq %r9, %rdx ; \ + adcq %r10, %rbp ; \ + adcq %r11, %r13 ; \ + adcq $0x0, %r12 ; \ + cmovneq %rax, %r14 ; \ + cmovneq %rbx, %r15 ; \ + cmovneq %rcx, %r8 ; \ + cmovneq %rdx, %r9 ; \ + cmovneq %rbp, %r10 ; \ + cmovneq %r13, %r11 ; \ + movq %r14, P0 ; \ + movq %r15, 0x8+P0 ; \ + movq %r8, 0x10+P0 ; \ + movq %r9, 0x18+P0 ; \ + movq %r10, 0x20+P0 ; \ + movq %r11, 0x28+P0 + +// Corresponds exactly to bignum_montsqr_p384_alt + +#define montsqr_p384(P0,P1) \ + movq P1, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r9 ; \ + movq %rdx, %r10 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r13 ; \ + movq %rdx, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x20+P1; \ + movq %rax, %r15 ; \ + movq %rdx, %rcx ; \ + movq 0x10+P1, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbp, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rbp, %rbp ; \ + movq 0x8+P1, %rbx ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rbp, %rbp ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, 
%r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %rbp, %rbp ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %rcx ; \ + movq 0x20+P1, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rbp, %rbp ; \ + movq 0x10+P1, %rbx ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %rbp, %rbp ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %rbp, %rbp ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %rcx ; \ + sbbq %rbp, %rbp ; \ + xorl %ebx, %ebx ; \ + movq 0x18+P1, %rax ; \ + mulq 0x28+P1; \ + subq %rbp, %rdx ; \ + xorl %ebp, %ebp ; \ + addq %rax, %rcx ; \ + adcq %rdx, %rbx ; \ + adcl %ebp, %ebp ; \ + movq 0x20+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + xorl %r8d, %r8d ; \ + addq %r9, %r9 ; \ + adcq %r10, %r10 ; \ + adcq %r11, %r11 ; \ + adcq %r12, %r12 ; \ + adcq %r13, %r13 ; \ + adcq %r14, %r14 ; \ + adcq %r15, %r15 ; \ + adcq %rcx, %rcx ; \ + adcq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcl %r8d, %r8d ; \ + movq P1, %rax ; \ + mulq %rax; \ + movq %r8, P0 ; \ + movq %rax, %r8 ; \ + movq 0x8+P1, %rax ; \ + movq %rbp, 0x8+P0 ; \ + addq %rdx, %r9 ; \ + sbbq %rbp, %rbp ; \ + mulq %rax; \ + negq %rbp; \ + adcq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbp, %rbp ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + negq %rbp; \ + adcq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rbp, %rbp ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + negq %rbp; \ + adcq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %rbp, %rbp ; \ + movq 0x20+P1, %rax ; \ + mulq %rax; \ + negq %rbp; \ + adcq %rax, %rcx ; \ + adcq %rdx, %rbx ; \ + sbbq %rbp, %rbp ; \ + movq 0x28+P1, %rax ; \ + mulq %rax; \ + negq %rbp; \ + adcq 0x8+P0, %rax ; \ + adcq P0, %rdx ; \ + movq %rax, %rbp ; \ + movq %rdx, 
%rsi ; \ + movq %rbx, P0 ; \ + movq %r8, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r8, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r8 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r8, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rax, %r11 ; \ + sbbq $0x0, %r12 ; \ + sbbq $0x0, %r13 ; \ + movq %rbx, %r8 ; \ + sbbq $0x0, %r8 ; \ + movq %r9, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r9, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r9 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r9, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rax, %r12 ; \ + sbbq $0x0, %r13 ; \ + sbbq $0x0, %r8 ; \ + movq %rbx, %r9 ; \ + sbbq $0x0, %r9 ; \ + movq %r10, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r10, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r10 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r10, %r11 ; \ + sbbq %rdx, %r12 ; \ + sbbq %rax, %r13 ; \ + sbbq $0x0, %r8 ; \ + sbbq $0x0, %r9 ; \ + movq %rbx, %r10 ; \ + sbbq $0x0, %r10 ; \ + movq %r11, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r11, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r11 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r11, %r12 ; \ + sbbq %rdx, %r13 ; \ + sbbq %rax, %r8 ; \ + sbbq $0x0, %r9 ; \ + sbbq $0x0, %r10 ; \ + movq %rbx, %r11 ; \ + sbbq $0x0, %r11 ; \ + movq %r12, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r12, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r12 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r12 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq 
%r12, %r13 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rax, %r9 ; \ + sbbq $0x0, %r10 ; \ + sbbq $0x0, %r11 ; \ + movq %rbx, %r12 ; \ + sbbq $0x0, %r12 ; \ + movq %r13, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r13, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r13 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r13 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r13, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rax, %r10 ; \ + sbbq $0x0, %r11 ; \ + sbbq $0x0, %r12 ; \ + movq %rbx, %r13 ; \ + sbbq $0x0, %r13 ; \ + movq P0, %rbx ; \ + addq %r8, %r14 ; \ + adcq %r9, %r15 ; \ + adcq %r10, %rcx ; \ + adcq %r11, %rbx ; \ + adcq %r12, %rbp ; \ + adcq %r13, %rsi ; \ + movl $0x0, %r8d ; \ + adcq %r8, %r8 ; \ + xorq %r11, %r11 ; \ + xorq %r12, %r12 ; \ + xorq %r13, %r13 ; \ + movq $0xffffffff00000001, %rax ; \ + addq %r14, %rax ; \ + movl $0xffffffff, %r9d ; \ + adcq %r15, %r9 ; \ + movl $0x1, %r10d ; \ + adcq %rcx, %r10 ; \ + adcq %rbx, %r11 ; \ + adcq %rbp, %r12 ; \ + adcq %rsi, %r13 ; \ + adcq $0x0, %r8 ; \ + cmovneq %rax, %r14 ; \ + cmovneq %r9, %r15 ; \ + cmovneq %r10, %rcx ; \ + cmovneq %r11, %rbx ; \ + cmovneq %r12, %rbp ; \ + cmovneq %r13, %rsi ; \ + movq %r14, P0 ; \ + movq %r15, 0x8+P0 ; \ + movq %rcx, 0x10+P0 ; \ + movq %rbx, 0x18+P0 ; \ + movq %rbp, 0x20+P0 ; \ + movq %rsi, 0x28+P0 + +// Corresponds exactly to bignum_sub_p384 + +#define sub_p384(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rdx ; \ + sbbq 0x8+P2, %rdx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq 0x20+P1, %r10 ; \ + sbbq 0x20+P2, %r10 ; \ + movq 0x28+P1, %r11 ; \ + sbbq 0x28+P2, %r11 ; \ + sbbq %rcx, %rcx ; \ + movl $0xffffffff, %esi ; \ + andq %rsi, %rcx ; \ + xorq %rsi, %rsi ; \ + subq %rcx, %rsi ; \ + subq %rsi, %rax ; \ + movq %rax, P0 ; \ + sbbq %rcx, %rdx ; \ + movq %rdx, 0x8+P0 ; \ + sbbq %rax, %rax ; \ + andq %rsi, %rcx ; \ + negq %rax; \ + sbbq %rcx, %r8 
; \ + movq %r8, 0x10+P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x18+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x20+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x28+P0 + +// Additional macros to help with final multiplexing + +#define load6(r0,r1,r2,r3,r4,r5,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 ; \ + movq 32+P, r4 ; \ + movq 40+P, r5 + +#define store6(P,r0,r1,r2,r3,r4,r5) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P ; \ + movq r4, 32+P ; \ + movq r5, 40+P ; \ + +#define czload6(r0,r1,r2,r3,r4,r5,P) \ + cmovzq P, r0 ; \ + cmovzq 8+P, r1 ; \ + cmovzq 16+P, r2 ; \ + cmovzq 24+P, r3 ; \ + cmovzq 32+P, r4 ; \ + cmovzq 40+P, r5 + +#define muxload6(r0,r1,r2,r3,r4,r5,P0,P1,P2) \ + movq P0, r0 ; \ + cmovbq P1, r0 ; \ + cmovnbe P2, r0 ; \ + movq 8+P0, r1 ; \ + cmovbq 8+P1, r1 ; \ + cmovnbe 8+P2, r1 ; \ + movq 16+P0, r2 ; \ + cmovbq 16+P1, r2 ; \ + cmovnbe 16+P2, r2 ; \ + movq 24+P0, r3 ; \ + cmovbq 24+P1, r3 ; \ + cmovnbe 24+P2, r3 ; \ + movq 32+P0, r4 ; \ + cmovbq 32+P1, r4 ; \ + cmovnbe 32+P2, r4 ; \ + movq 40+P0, r5 ; \ + cmovbq 40+P1, r5 ; \ + cmovnbe 40+P2, r5 + +S2N_BN_SYMBOL(p384_montjadd_alt): + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables +// Put the input arguments in non-volatile places on the stack + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rsi, input_x + movq %rdx, input_y + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + montsqr_p384(z1sq,z_1) + movq input_y, %rsi + montsqr_p384(z2sq,z_2_alt) + + movq input_x, %rsi + movq input_y, %rcx + montmul_p384(y1a,z_2,y_1) + movq input_x, %rsi + movq input_y, %rcx + montmul_p384(y2a,z_1,y_2) + + movq input_y, %rcx + montmul_p384(x2a,z1sq,x_2) + movq input_x, %rsi + montmul_p384(x1a,z2sq,x_1) + 
montmul_p384(y2a,z1sq,y2a) + montmul_p384(y1a,z2sq,y1a) + + sub_p384(xd,x2a,x1a) + sub_p384(yd,y2a,y1a) + + montsqr_p384(zz,xd) + montsqr_p384(ww,yd) + + montmul_p384(zzx1,zz,x1a) + montmul_p384(zzx2,zz,x2a) + + sub_p384(resx,ww,zzx1) + sub_p384(t1,zzx2,zzx1) + + movq input_x, %rsi + montmul_p384(xd,xd,z_1) + + sub_p384(resx,resx,zzx2) + + sub_p384(t2,zzx1,resx) + + montmul_p384(t1,t1,y1a) + + movq input_y, %rcx + montmul_p384(resz,xd,z_2) + montmul_p384(t2,yd,t2) + + sub_p384(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "NBE" <=> ~(CF \/ ZF) <=> P1 = 0 /\ ~(P2 = 0) +// and "B" <=> CF <=> ~(P1 = 0) /\ P2 = 0 +// and "Z" <=> ZF <=> (P1 = 0 <=> P2 = 0) +// Multiplex the z outputs accordingly and re-store in resz + + movq input_y, %rcx + load6(%r8,%r9,%r10,%r11,%rbx,%rbp,z_2) + movq %r8, %rax + movq %r9, %rdx + orq %r10, %rax + orq %r11, %rdx + orq %rbx, %rax + orq %rbp, %rdx + orq %rdx, %rax + negq %rax + sbbq %rax, %rax + + movq input_x, %rsi + load6(%r12,%r13,%r14,%r15,%rdx,%rcx,z_1) + cmovzq %r12, %r8 + cmovzq %r13, %r9 + cmovzq %r14, %r10 + cmovzq %r15, %r11 + cmovzq %rdx, %rbx + cmovzq %rcx, %rbp + orq %r13, %r12 + orq %r15, %r14 + orq %rcx, %rdx + orq %r14, %r12 + orq %r12, %rdx + negq %rdx + sbbq %rdx, %rdx + + cmpq %rdx, %rax + + czload6(%r8,%r9,%r10,%r11,%rbx,%rbp,resz) + store6(resz,%r8,%r9,%r10,%r11,%rbx,%rbp) + +// Multiplex the x and y outputs too, keeping the results in registers + + movq input_y, %rcx + movq input_x, %rsi + muxload6(%r8,%r9,%r10,%r11,%rbx,%rbp,resx,x_1,x_2) + muxload6(%r12,%r13,%r14,%r15,%rdx,%rax,resy,y_1,y_2) + +// Finally store back the multiplexed values + + store6(x_3,%r8,%r9,%r10,%r11,%rbx,%rbp) + load6(%r8,%r9,%r10,%r11,%rbx,%rbp,resz) + store6(y_3,%r12,%r13,%r14,%r15,%rdx,%rax) + store6(z_3,%r8,%r9,%r10,%r11,%rbx,%rbp) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq 
%r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/x86_att/p384/p384_montjdouble_alt.S b/third_party/s2n-bignum/x86_att/p384/p384_montjdouble_alt.S new file mode 100644 index 0000000000..8258e35267 --- /dev/null +++ b/third_party/s2n-bignum/x86_att/p384/p384_montjdouble_alt.S @@ -0,0 +1,1196 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on NIST curve P-384 in Montgomery-Jacobian coordinates +// +// extern void p384_montjdouble_alt +// (uint64_t p3[static 18],uint64_t p1[static 18]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard x86-64 ABI: RDI = p3, RSI = p1 +// Microsoft x64 ABI: RCX = p3, RDX = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjdouble_alt) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 48 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1. The latter stays true +// but montsqr below modifies %rdi as well. Thus, we need +// to save %rdi and restore it before the writes to outputs. 
+ +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z2 (NUMSIZE*0)(%rsp) +#define y2 (NUMSIZE*1)(%rsp) +#define x2p (NUMSIZE*2)(%rsp) +#define xy2 (NUMSIZE*3)(%rsp) + +#define y4 (NUMSIZE*4)(%rsp) +#define t2 (NUMSIZE*4)(%rsp) + +#define dx2 (NUMSIZE*5)(%rsp) +#define t1 (NUMSIZE*5)(%rsp) + +#define d (NUMSIZE*6)(%rsp) +#define x4p (NUMSIZE*6)(%rsp) + +// Safe place for pointer to the output + +#define input_z (NUMSIZE*7)(%rsp) + +#define NSPACE (NUMSIZE*7+8) + +// Corresponds exactly to bignum_montmul_p384_alt + +#define montmul_p384(P0,P1,P2) \ + movq P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + xorl %r10d, %r10d ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + xorl %r11d, %r11d ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + xorl %r12d, %r12d ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + xorl %r13d, %r13d ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + xorl %r14d, %r14d ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + xorl %r15d, %r15d ; \ + movq %r8, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r8, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r8 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r8, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbp, %r11 ; \ + sbbq $0x0, %r12 ; \ + sbbq $0x0, %r13 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x8+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; 
\ + sbbq %r8, %r8 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r8, %r8 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r8, %r8 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r8, %r8 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r8, %r8 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r8, %r8 ; \ + negq %r8; \ + movq %r9, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r9, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r9 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r9, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbp, %r12 ; \ + sbbq $0x0, %r13 ; \ + sbbq $0x0, %r14 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x10+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r9, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r9, %r9 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r9, %r9 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r9, %r9 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r9, %r9 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + sbbq %r9, %r9 ; \ + negq %r9; \ + movq %r10, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r10, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq 
%rbx; \ + movq %rdx, %r10 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r10, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r11 ; \ + sbbq %rdx, %r12 ; \ + sbbq %rbp, %r13 ; \ + sbbq $0x0, %r14 ; \ + sbbq $0x0, %r15 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x18+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r10, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r10, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r10, %r10 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r10, %r10 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + sbbq %r10, %r10 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %r10, %r10 ; \ + negq %r10; \ + movq %r11, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r11, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r11 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r11, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r12 ; \ + sbbq %rdx, %r13 ; \ + sbbq %rbp, %r14 ; \ + sbbq $0x0, %r15 ; \ + sbbq $0x0, %r8 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x20+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r11, %r11 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r11, %r11 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r11, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r15 
; \ + adcq %rdx, %r8 ; \ + sbbq %r11, %r11 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %r11, %r11 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %r11, %r11 ; \ + negq %r11; \ + movq %r12, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r12, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r12 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r12, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r13 ; \ + sbbq %rdx, %r14 ; \ + sbbq %rbp, %r15 ; \ + sbbq $0x0, %r8 ; \ + sbbq $0x0, %r9 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x28+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r12, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + sbbq %r12, %r12 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %r12, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %r12, %r12 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r12, %r12 ; \ + negq %r12; \ + movq %r13, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r13, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r13 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r13, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r14 ; \ + sbbq %rdx, %r15 ; \ + sbbq %rbp, %r8 ; \ + sbbq $0x0, %r9 ; \ + sbbq $0x0, %r10 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorl %edx, 
%edx ; \ + xorl %ebp, %ebp ; \ + xorl %r13d, %r13d ; \ + movq $0xffffffff00000001, %rax ; \ + addq %r14, %rax ; \ + movl $0xffffffff, %ebx ; \ + adcq %r15, %rbx ; \ + movl $0x1, %ecx ; \ + adcq %r8, %rcx ; \ + adcq %r9, %rdx ; \ + adcq %r10, %rbp ; \ + adcq %r11, %r13 ; \ + adcq $0x0, %r12 ; \ + cmovneq %rax, %r14 ; \ + cmovneq %rbx, %r15 ; \ + cmovneq %rcx, %r8 ; \ + cmovneq %rdx, %r9 ; \ + cmovneq %rbp, %r10 ; \ + cmovneq %r13, %r11 ; \ + movq %r14, P0 ; \ + movq %r15, 0x8+P0 ; \ + movq %r8, 0x10+P0 ; \ + movq %r9, 0x18+P0 ; \ + movq %r10, 0x20+P0 ; \ + movq %r11, 0x28+P0 + +// Corresponds exactly to bignum_montsqr_p384_alt except %rsi -> %rdi + +#define montsqr_p384(P0,P1) \ + movq P1, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r9 ; \ + movq %rdx, %r10 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r13 ; \ + movq %rdx, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x20+P1; \ + movq %rax, %r15 ; \ + movq %rdx, %rcx ; \ + movq 0x10+P1, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbp, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rbp, %rbp ; \ + movq 0x8+P1, %rbx ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rbp, %rbp ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %rbp, %rbp ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %rcx ; \ + movq 0x20+P1, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rbp, %rbp ; \ + movq 0x10+P1, %rbx ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %rbp, %rbp ; \ + movq 0x20+P1, 
%rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %rbp, %rbp ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %rcx ; \ + sbbq %rbp, %rbp ; \ + xorl %ebx, %ebx ; \ + movq 0x18+P1, %rax ; \ + mulq 0x28+P1; \ + subq %rbp, %rdx ; \ + xorl %ebp, %ebp ; \ + addq %rax, %rcx ; \ + adcq %rdx, %rbx ; \ + adcl %ebp, %ebp ; \ + movq 0x20+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + xorl %r8d, %r8d ; \ + addq %r9, %r9 ; \ + adcq %r10, %r10 ; \ + adcq %r11, %r11 ; \ + adcq %r12, %r12 ; \ + adcq %r13, %r13 ; \ + adcq %r14, %r14 ; \ + adcq %r15, %r15 ; \ + adcq %rcx, %rcx ; \ + adcq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcl %r8d, %r8d ; \ + movq P1, %rax ; \ + mulq %rax; \ + movq %r8, P0 ; \ + movq %rax, %r8 ; \ + movq 0x8+P1, %rax ; \ + movq %rbp, 0x8+P0 ; \ + addq %rdx, %r9 ; \ + sbbq %rbp, %rbp ; \ + mulq %rax; \ + negq %rbp; \ + adcq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbp, %rbp ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + negq %rbp; \ + adcq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rbp, %rbp ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + negq %rbp; \ + adcq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %rbp, %rbp ; \ + movq 0x20+P1, %rax ; \ + mulq %rax; \ + negq %rbp; \ + adcq %rax, %rcx ; \ + adcq %rdx, %rbx ; \ + sbbq %rbp, %rbp ; \ + movq 0x28+P1, %rax ; \ + mulq %rax; \ + negq %rbp; \ + adcq 0x8+P0, %rax ; \ + adcq P0, %rdx ; \ + movq %rax, %rbp ; \ + movq %rdx, %rdi ; \ + movq %rbx, P0 ; \ + movq %r8, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r8, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r8 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r8, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rax, %r11 ; \ + sbbq $0x0, %r12 ; \ + sbbq $0x0, %r13 ; \ + movq %rbx, %r8 ; \ + sbbq $0x0, %r8 ; \ + movq %r9, %rbx ; \ + shlq 
$0x20, %rbx ; \ + addq %r9, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r9 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r9, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rax, %r12 ; \ + sbbq $0x0, %r13 ; \ + sbbq $0x0, %r8 ; \ + movq %rbx, %r9 ; \ + sbbq $0x0, %r9 ; \ + movq %r10, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r10, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r10 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r10, %r11 ; \ + sbbq %rdx, %r12 ; \ + sbbq %rax, %r13 ; \ + sbbq $0x0, %r8 ; \ + sbbq $0x0, %r9 ; \ + movq %rbx, %r10 ; \ + sbbq $0x0, %r10 ; \ + movq %r11, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r11, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r11 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r11, %r12 ; \ + sbbq %rdx, %r13 ; \ + sbbq %rax, %r8 ; \ + sbbq $0x0, %r9 ; \ + sbbq $0x0, %r10 ; \ + movq %rbx, %r11 ; \ + sbbq $0x0, %r11 ; \ + movq %r12, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r12, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r12 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r12 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r12, %r13 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rax, %r9 ; \ + sbbq $0x0, %r10 ; \ + sbbq $0x0, %r11 ; \ + movq %rbx, %r12 ; \ + sbbq $0x0, %r12 ; \ + movq %r13, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r13, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r13 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r13 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r13, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rax, %r10 
; \ + sbbq $0x0, %r11 ; \ + sbbq $0x0, %r12 ; \ + movq %rbx, %r13 ; \ + sbbq $0x0, %r13 ; \ + movq P0, %rbx ; \ + addq %r8, %r14 ; \ + adcq %r9, %r15 ; \ + adcq %r10, %rcx ; \ + adcq %r11, %rbx ; \ + adcq %r12, %rbp ; \ + adcq %r13, %rdi ; \ + movl $0x0, %r8d ; \ + adcq %r8, %r8 ; \ + xorq %r11, %r11 ; \ + xorq %r12, %r12 ; \ + xorq %r13, %r13 ; \ + movq $0xffffffff00000001, %rax ; \ + addq %r14, %rax ; \ + movl $0xffffffff, %r9d ; \ + adcq %r15, %r9 ; \ + movl $0x1, %r10d ; \ + adcq %rcx, %r10 ; \ + adcq %rbx, %r11 ; \ + adcq %rbp, %r12 ; \ + adcq %rdi, %r13 ; \ + adcq $0x0, %r8 ; \ + cmovneq %rax, %r14 ; \ + cmovneq %r9, %r15 ; \ + cmovneq %r10, %rcx ; \ + cmovneq %r11, %rbx ; \ + cmovneq %r12, %rbp ; \ + cmovneq %r13, %rdi ; \ + movq %r14, P0 ; \ + movq %r15, 0x8+P0 ; \ + movq %rcx, 0x10+P0 ; \ + movq %rbx, 0x18+P0 ; \ + movq %rbp, 0x20+P0 ; \ + movq %rdi, 0x28+P0 + +#define sub_p384(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rdx ; \ + sbbq 0x8+P2, %rdx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq 0x20+P1, %r10 ; \ + sbbq 0x20+P2, %r10 ; \ + movq 0x28+P1, %r11 ; \ + sbbq 0x28+P2, %r11 ; \ + sbbq %rcx, %rcx ; \ + movl $0xffffffff, %ebx ; \ + andq %rbx, %rcx ; \ + xorq %rbx, %rbx ; \ + subq %rcx, %rbx ; \ + subq %rbx, %rax ; \ + movq %rax, P0 ; \ + sbbq %rcx, %rdx ; \ + movq %rdx, 0x8+P0 ; \ + sbbq %rax, %rax ; \ + andq %rbx, %rcx ; \ + negq %rax; \ + sbbq %rcx, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x18+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x20+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x28+P0 + +// Simplified bignum_add_p384, without carry chain suspension + +#define add_p384(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + adcq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + adcq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + adcq 0x18+P2, %r9 ; \ + movq 0x20+P1, %r10 ; \ + adcq 0x20+P2, %r10 ; \ + movq 0x28+P1, %r11 ; \ + 
adcq 0x28+P2, %r11 ; \ + movl $0x0, %edx ; \ + adcq %rdx, %rdx ; \ + movq $0xffffffff00000001, %rbp ; \ + addq %rbp, %rax ; \ + movl $0xffffffff, %ebp ; \ + adcq %rbp, %rcx ; \ + adcq $0x1, %r8 ; \ + adcq $0x0, %r9 ; \ + adcq $0x0, %r10 ; \ + adcq $0x0, %r11 ; \ + adcq $0xffffffffffffffff, %rdx ; \ + movl $1, %ebx ; \ + andq %rdx, %rbx ; \ + andq %rbp, %rdx ; \ + xorq %rbp, %rbp ; \ + subq %rdx, %rbp ; \ + subq %rbp, %rax ; \ + movq %rax, P0 ; \ + sbbq %rdx, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + sbbq %rbx, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x18+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x20+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x28+P0 + +// P0 = 4 * P1 - P2 + +#define cmsub41_p384(P0,P1,P2) \ + movq 40+P1, %rcx ; \ + movq %rcx, %r13 ; \ + shrq $62, %rcx ; \ + movq 32+P1, %r12 ; \ + shldq $2, %r12, %r13 ; \ + movq 24+P1, %r11 ; \ + shldq $2, %r11, %r12 ; \ + movq 16+P1, %r10 ; \ + shldq $2, %r10, %r11 ; \ + movq 8+P1, %r9 ; \ + shldq $2, %r9, %r10 ; \ + movq P1, %r8 ; \ + shldq $2, %r8, %r9 ; \ + shlq $2, %r8 ; \ + addq $1, %rcx ; \ + subq P2, %r8 ; \ + sbbq 0x8+P2, %r9 ; \ + sbbq 0x10+P2, %r10 ; \ + sbbq 0x18+P2, %r11 ; \ + sbbq 0x20+P2, %r12 ; \ + sbbq 0x28+P2, %r13 ; \ + sbbq $0, %rcx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rcx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %rcx, %r10 ; \ + movq %rcx, %rax ; \ + sbbq %rcx, %rcx ; \ + movl $0xffffffff, %edx ; \ + negq %rcx; \ + mulq %rdx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %rcx, %r11 ; \ + adcq $0x0, %r12 ; \ + adcq $0x0, %r13 ; \ + sbbq %rcx, %rcx ; \ + notq %rcx; \ + movl $0xffffffff, %edx ; \ + xorq %rax, %rax ; \ + andq %rcx, %rdx ; \ + subq %rdx, %rax ; \ + andq $0x1, %rcx ; \ + subq %rax, %r8 ; \ + movq %r8, P0 ; \ + sbbq %rdx, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq %rcx, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x18+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x20+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x28+P0 + +// 
P0 = C * P1 - D * P2 + +#define cmsub_p384(P0,C,P1,D,P2) \ + movq $0x00000000ffffffff, %r9 ; \ + subq P2, %r9 ; \ + movq $0xffffffff00000000, %r10 ; \ + sbbq 8+P2, %r10 ; \ + movq $0xfffffffffffffffe, %r11 ; \ + sbbq 16+P2, %r11 ; \ + movq $0xffffffffffffffff, %r12 ; \ + sbbq 24+P2, %r12 ; \ + movq $0xffffffffffffffff, %r13 ; \ + sbbq 32+P2, %r13 ; \ + movq $0xffffffffffffffff, %r14 ; \ + sbbq 40+P2, %r14 ; \ + movq $D, %rcx ; \ + movq %r9, %rax ; \ + mulq %rcx; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + movq %r10, %rax ; \ + xorl %r10d, %r10d ; \ + mulq %rcx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq %r11, %rax ; \ + xorl %r11d, %r11d ; \ + mulq %rcx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + movq %r12, %rax ; \ + xorl %r12d, %r12d ; \ + mulq %rcx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + movq %r13, %rax ; \ + xorl %r13d, %r13d ; \ + mulq %rcx; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + movq %r14, %rax ; \ + movl $1, %r14d ; \ + mulq %rcx; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + movl $C, %ecx ; \ + movq P1, %rax ; \ + mulq %rcx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rbx, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rbx, %rbx ; \ + movq 0x10+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbx, %rbx ; \ + movq 0x18+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rbx, %rbx ; \ + movq 0x20+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rbx, %rbx ; \ + movq 0x28+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %r14; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %r14, %r10 ; \ + movq %r14, %rax ; \ + sbbq %rcx, %rcx ; \ + movl $0xffffffff, %edx ; \ + negq %rcx; \ + mulq %rdx; \ + addq %rax, 
%r9 ; \ + adcq %rdx, %r10 ; \ + adcq %rcx, %r11 ; \ + adcq $0x0, %r12 ; \ + adcq $0x0, %r13 ; \ + sbbq %rcx, %rcx ; \ + notq %rcx; \ + movl $0xffffffff, %edx ; \ + xorq %rax, %rax ; \ + andq %rcx, %rdx ; \ + subq %rdx, %rax ; \ + andq $0x1, %rcx ; \ + subq %rax, %r8 ; \ + movq %r8, P0 ; \ + sbbq %rdx, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq %rcx, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x18+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x20+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x28+P0 + +// A weak version of add that only guarantees sum in 6 digits + +#define weakadd_p384(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + adcq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + adcq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + adcq 0x18+P2, %r9 ; \ + movq 0x20+P1, %r10 ; \ + adcq 0x20+P2, %r10 ; \ + movq 0x28+P1, %r11 ; \ + adcq 0x28+P2, %r11 ; \ + sbbq %rdx, %rdx ; \ + movl $1, %ebx ; \ + andq %rdx, %rbx ; \ + movl $0xffffffff, %ebp ; \ + andq %rbp, %rdx ; \ + xorq %rbp, %rbp ; \ + subq %rdx, %rbp ; \ + addq %rbp, %rax ; \ + movq %rax, P0 ; \ + adcq %rdx, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq %rbx, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq $0x0, %r9 ; \ + movq %r9, 0x18+P0 ; \ + adcq $0x0, %r10 ; \ + movq %r10, 0x20+P0 ; \ + adcq $0x0, %r11 ; \ + movq %r11, 0x28+P0 + +// P0 = 3 * P1 - 8 * P2 + +#define cmsub38_p384(P0,P1,P2) \ + movq $0x00000000ffffffff, %r8 ; \ + subq P2, %r8 ; \ + movq $0xffffffff00000000, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq $0xfffffffffffffffe, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq $0xffffffffffffffff, %r11 ; \ + sbbq 24+P2, %r11 ; \ + movq $0xffffffffffffffff, %r12 ; \ + sbbq 32+P2, %r12 ; \ + movq $0xffffffffffffffff, %r13 ; \ + sbbq 40+P2, %r13 ; \ + movq %r13, %r14 ; \ + shrq $61, %r14 ; \ + shldq $3, %r12, %r13 ; \ + shldq $3, %r11, %r12 ; \ + shldq $3, %r10, %r11 ; \ + shldq $3, %r9, %r10 ; \ + shldq $3, %r8, %r9 ; \ + shlq $3, %r8 ; \ + addq $1, %r14 ; \ + movl $3, %ecx ; \ + movq P1, %rax ; \ 
+ mulq %rcx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rbx, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rbx, %rbx ; \ + movq 0x10+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbx, %rbx ; \ + movq 0x18+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rbx, %rbx ; \ + movq 0x20+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rbx, %rbx ; \ + movq 0x28+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %r14; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %r14, %r10 ; \ + movq %r14, %rax ; \ + sbbq %rcx, %rcx ; \ + movl $0xffffffff, %edx ; \ + negq %rcx; \ + mulq %rdx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %rcx, %r11 ; \ + adcq $0x0, %r12 ; \ + adcq $0x0, %r13 ; \ + sbbq %rcx, %rcx ; \ + notq %rcx; \ + movl $0xffffffff, %edx ; \ + xorq %rax, %rax ; \ + andq %rcx, %rdx ; \ + subq %rdx, %rax ; \ + andq $0x1, %rcx ; \ + subq %rax, %r8 ; \ + movq %r8, P0 ; \ + sbbq %rdx, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq %rcx, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x18+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x20+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x28+P0 + +S2N_BN_SYMBOL(p384_montjdouble_alt): + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room on stack for temporary variables +// Save the output pointer %rdi which gets overwritten in earlier +// operations before it is used. 
+ + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdi, input_z + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + montsqr_p384(z2,z_1) + montsqr_p384(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + weakadd_p384(t1,x_1,z2) + sub_p384(t2,x_1,z2) + montmul_p384(x2p,t1,t2) + +// t1 = y + z +// x4p = x2p^2 +// xy2 = x * y^2 + + add_p384(t1,y_1,z_1) + montsqr_p384(x4p,x2p) + montmul_p384(xy2,x_1,y2) + +// t2 = (y + z)^2 + + montsqr_p384(t2,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_p384(d,12,xy2,9,x4p) + sub_p384(t1,t2,z2) + +// y4 = y^4 + + montsqr_p384(y4,y2) + +// Restore the output pointer to write to x_3, y_3 and z_3. + + movq input_z, %rdi + +// z_3' = 2 * y * z +// dx2 = d * x2p + + sub_p384(z_3,t1,y2) + montmul_p384(dx2,d,x2p) + +// x' = 4 * xy2 - d + + cmsub41_p384(x_3,xy2,d) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_p384(y_3,dx2,y4) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/x86_att/p384/p384_montjmixadd.S b/third_party/s2n-bignum/x86_att/p384/p384_montjmixadd.S index 0d456464b9..539a28117a 100644 --- a/third_party/s2n-bignum/x86_att/p384/p384_montjmixadd.S +++ b/third_party/s2n-bignum/x86_att/p384/p384_montjmixadd.S @@ -30,7 +30,8 @@ // Pointer-offset pairs for inputs and outputs // These assume %rdi = p3, %rsi = p1 and %rcx = p2, -// which needs to be set up explicitly before use +// which needs to be set up explicitly before use. +// However the %rdi value never changes. 
#define x_1 0(%rsi) #define y_1 NUMSIZE(%rsi) @@ -48,6 +49,7 @@ #define zp2 (NUMSIZE*0)(%rsp) #define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) #define yd (NUMSIZE*1)(%rsp) #define y2a (NUMSIZE*1)(%rsp) @@ -60,16 +62,17 @@ #define t2 (NUMSIZE*4)(%rsp) #define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) #define xd (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) // Temporaries for the actual input pointers #define input_x (NUMSIZE*6)(%rsp) #define input_y (NUMSIZE*6+8)(%rsp) -#define input_z (NUMSIZE*6+16)(%rsp) -#define NSPACE (NUMSIZE*6+24) +#define NSPACE (NUMSIZE*6+16) // Corresponds exactly to bignum_montmul_p384 @@ -835,6 +838,53 @@ sbbq $0x0, %r11 ; \ movq %r11, 0x28+P0 +// Additional macros to help with final multiplexing + +#define testzero6(P) \ + movq P, %rax ; \ + movq 8+P, %rdx ; \ + orq 16+P, %rax ; \ + orq 24+P, %rdx ; \ + orq 32+P, %rax ; \ + orq 40+P, %rdx ; \ + orq %rdx, %rax + +#define mux6(r0,r1,r2,r3,r4,r5,PNE,PEQ) \ + movq PEQ, %rax ; \ + movq PNE, r0 ; \ + cmovzq %rax, r0 ; \ + movq 8+PEQ, %rax ; \ + movq 8+PNE, r1 ; \ + cmovzq %rax, r1 ; \ + movq 16+PEQ, %rax ; \ + movq 16+PNE, r2 ; \ + cmovzq %rax, r2 ; \ + movq 24+PEQ, %rax ; \ + movq 24+PNE, r3 ; \ + cmovzq %rax, r3 ; \ + movq 32+PEQ, %rax ; \ + movq 32+PNE, r4 ; \ + cmovzq %rax, r4 ; \ + movq 40+PEQ, %rax ; \ + movq 40+PNE, r5 ; \ + cmovzq %rax, r5 + +#define load6(r0,r1,r2,r3,r4,r5,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 ; \ + movq 32+P, r4 ; \ + movq 40+P, r5 + +#define store6(P,r0,r1,r2,r3,r4,r5) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P ; \ + movq r4, 32+P ; \ + movq r5, 40+P + S2N_BN_SYMBOL(p384_montjmixadd): #if WINDOWS_ABI @@ -857,7 +907,6 @@ S2N_BN_SYMBOL(p384_montjmixadd): subq $NSPACE, %rsp - movq %rdi, input_z movq %rsi, input_x movq %rdx, input_y @@ -887,26 +936,52 @@ S2N_BN_SYMBOL(p384_montjmixadd): montmul_p384(zzx1,zz,x_1) montmul_p384(zzx2,zz,x2a) - movq input_z, %rdi - 
sub_p384(x_3,ww,zzx1) + sub_p384(resx,ww,zzx1) sub_p384(t1,zzx2,zzx1) - movq input_z, %rdi movq input_x, %rsi - montmul_p384(z_3,xd,z_1) + montmul_p384(resz,xd,z_1) - movq input_z, %rdi - sub_p384(x_3,x_3,zzx2) + sub_p384(resx,resx,zzx2) - movq input_z, %rdi - sub_p384(t2,zzx1,x_3) + sub_p384(t2,zzx1,resx) movq input_x, %rsi montmul_p384(t1,t1,y_1) montmul_p384(t2,yd,t2) - movq input_z, %rdi - sub_p384(y_3,t2,t1) + sub_p384(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + movq input_x, %rsi + testzero6(z_1) + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in +// Montgomery form so not the simple constant 1 but rather 2^384 - p_384), +// hence giving 0 + p2 = p2 for the final result. + + movq input_y, %rcx + mux6(%r8,%r9,%r10,%r11,%rbx,%rbp,resx,x_2) + mux6(%r12,%r13,%r14,%r15,%rdx,%rcx,resy,y_2) + + store6(x_3,%r8,%r9,%r10,%r11,%rbx,%rbp) + store6(y_3,%r12,%r13,%r14,%r15,%rdx,%rcx) + + load6(%r8,%r9,%r10,%r11,%rbx,%rbp,resz) + movq $0xffffffff00000001, %rax + cmovzq %rax, %r8 + movl $0x00000000ffffffff, %eax + cmovzq %rax, %r9 + movq $1, %rax + cmovzq %rax, %r10 + movl $0, %eax + cmovzq %rax, %r11 + cmovzq %rax, %rbx + cmovzq %rax, %rbp + + store6(z_3,%r8,%r9,%r10,%r11,%rbx,%rbp) // Restore stack and registers diff --git a/third_party/s2n-bignum/x86_att/p384/p384_montjmixadd_alt.S b/third_party/s2n-bignum/x86_att/p384/p384_montjmixadd_alt.S new file mode 100644 index 0000000000..da610ee88e --- /dev/null +++ b/third_party/s2n-bignum/x86_att/p384/p384_montjmixadd_alt.S @@ -0,0 +1,929 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on NIST curve P-384 in Montgomery-Jacobian coordinates +// +// extern void p384_montjmixadd_alt +// (uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjmixadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjmixadd_alt) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 48 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1 and %rcx = p2, +// which needs to be set up explicitly before use. +// However the %rdi value never changes. 
+ +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rcx) +#define y_2 NUMSIZE(%rcx) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +// Temporaries for the actual input pointers + +#define input_x (NUMSIZE*6)(%rsp) +#define input_y (NUMSIZE*6+8)(%rsp) + +#define NSPACE (NUMSIZE*6+16) + +// Corresponds exactly to bignum_montmul_p384_alt + +#define montmul_p384(P0,P1,P2) \ + movq P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + xorl %r10d, %r10d ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + xorl %r11d, %r11d ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + xorl %r12d, %r12d ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + xorl %r13d, %r13d ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + xorl %r14d, %r14d ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + xorl %r15d, %r15d ; \ + movq %r8, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r8, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r8 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r8, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r9 ; \ + sbbq %rdx, %r10 ; \ + 
sbbq %rbp, %r11 ; \ + sbbq $0x0, %r12 ; \ + sbbq $0x0, %r13 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x8+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %r8, %r8 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r8, %r8 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r8, %r8 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r8, %r8 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r8, %r8 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r8, %r8 ; \ + negq %r8; \ + movq %r9, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r9, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r9 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r9, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbp, %r12 ; \ + sbbq $0x0, %r13 ; \ + sbbq $0x0, %r14 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x10+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r9, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r9, %r9 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r9, %r9 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r9, %r9 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r9, %r9 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + 
subq %r9, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + sbbq %r9, %r9 ; \ + negq %r9; \ + movq %r10, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r10, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r10 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r10, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r11 ; \ + sbbq %rdx, %r12 ; \ + sbbq %rbp, %r13 ; \ + sbbq $0x0, %r14 ; \ + sbbq $0x0, %r15 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x18+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r10, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r10, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r10, %r10 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r10, %r10 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + sbbq %r10, %r10 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %r10, %rdx ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %r10, %r10 ; \ + negq %r10; \ + movq %r11, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r11, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r11 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r11, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r12 ; \ + sbbq %rdx, %r13 ; \ + sbbq %rbp, %r14 ; \ + sbbq $0x0, %r15 ; \ + sbbq $0x0, %r8 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x20+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r11, %r11 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + 
sbbq %r11, %r11 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r11, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + sbbq %r11, %r11 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %r11, %r11 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %r11, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %r11, %r11 ; \ + negq %r11; \ + movq %r12, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r12, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r12 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r12, %rax ; \ + adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r13 ; \ + sbbq %rdx, %r14 ; \ + sbbq %rbp, %r15 ; \ + sbbq $0x0, %r8 ; \ + sbbq $0x0, %r9 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x28+P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %r12, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + sbbq %r12, %r12 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %r12, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %r12, %r12 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %r12, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r12, %r12 ; \ + negq %r12; \ + movq %r13, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r13, %rbx ; \ + xorl %ebp, %ebp ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r13 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %r13, %rax ; \ + 
adcq %rbx, %rdx ; \ + adcl %ebp, %ebp ; \ + subq %rax, %r14 ; \ + sbbq %rdx, %r15 ; \ + sbbq %rbp, %r8 ; \ + sbbq $0x0, %r9 ; \ + sbbq $0x0, %r10 ; \ + sbbq $0x0, %rbx ; \ + addq %rbx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorl %edx, %edx ; \ + xorl %ebp, %ebp ; \ + xorl %r13d, %r13d ; \ + movq $0xffffffff00000001, %rax ; \ + addq %r14, %rax ; \ + movl $0xffffffff, %ebx ; \ + adcq %r15, %rbx ; \ + movl $0x1, %ecx ; \ + adcq %r8, %rcx ; \ + adcq %r9, %rdx ; \ + adcq %r10, %rbp ; \ + adcq %r11, %r13 ; \ + adcq $0x0, %r12 ; \ + cmovneq %rax, %r14 ; \ + cmovneq %rbx, %r15 ; \ + cmovneq %rcx, %r8 ; \ + cmovneq %rdx, %r9 ; \ + cmovneq %rbp, %r10 ; \ + cmovneq %r13, %r11 ; \ + movq %r14, P0 ; \ + movq %r15, 0x8+P0 ; \ + movq %r8, 0x10+P0 ; \ + movq %r9, 0x18+P0 ; \ + movq %r10, 0x20+P0 ; \ + movq %r11, 0x28+P0 + +// Corresponds exactly to bignum_montsqr_p384_alt + +#define montsqr_p384(P0,P1) \ + movq P1, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r9 ; \ + movq %rdx, %r10 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r13 ; \ + movq %rdx, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x20+P1; \ + movq %rax, %r15 ; \ + movq %rdx, %rcx ; \ + movq 0x10+P1, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbp, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rbp, %rbp ; \ + movq 0x8+P1, %rbx ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rbp, %rbp ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %rbp, %rbp ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %rcx ; \ + movq 0x20+P1, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r12 ; \ + 
adcq %rdx, %r13 ; \ + sbbq %rbp, %rbp ; \ + movq 0x10+P1, %rbx ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %rbp, %rbp ; \ + movq 0x20+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %rbp, %rbp ; \ + movq 0x28+P1, %rax ; \ + mulq %rbx; \ + subq %rbp, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %rcx ; \ + sbbq %rbp, %rbp ; \ + xorl %ebx, %ebx ; \ + movq 0x18+P1, %rax ; \ + mulq 0x28+P1; \ + subq %rbp, %rdx ; \ + xorl %ebp, %ebp ; \ + addq %rax, %rcx ; \ + adcq %rdx, %rbx ; \ + adcl %ebp, %ebp ; \ + movq 0x20+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + xorl %r8d, %r8d ; \ + addq %r9, %r9 ; \ + adcq %r10, %r10 ; \ + adcq %r11, %r11 ; \ + adcq %r12, %r12 ; \ + adcq %r13, %r13 ; \ + adcq %r14, %r14 ; \ + adcq %r15, %r15 ; \ + adcq %rcx, %rcx ; \ + adcq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcl %r8d, %r8d ; \ + movq P1, %rax ; \ + mulq %rax; \ + movq %r8, P0 ; \ + movq %rax, %r8 ; \ + movq 0x8+P1, %rax ; \ + movq %rbp, 0x8+P0 ; \ + addq %rdx, %r9 ; \ + sbbq %rbp, %rbp ; \ + mulq %rax; \ + negq %rbp; \ + adcq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbp, %rbp ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + negq %rbp; \ + adcq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rbp, %rbp ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + negq %rbp; \ + adcq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %rbp, %rbp ; \ + movq 0x20+P1, %rax ; \ + mulq %rax; \ + negq %rbp; \ + adcq %rax, %rcx ; \ + adcq %rdx, %rbx ; \ + sbbq %rbp, %rbp ; \ + movq 0x28+P1, %rax ; \ + mulq %rax; \ + negq %rbp; \ + adcq 0x8+P0, %rax ; \ + adcq P0, %rdx ; \ + movq %rax, %rbp ; \ + movq %rdx, %rsi ; \ + movq %rbx, P0 ; \ + movq %r8, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r8, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r8 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + movl $0x0, %eax ; \ + adcq 
%rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r8, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rax, %r11 ; \ + sbbq $0x0, %r12 ; \ + sbbq $0x0, %r13 ; \ + movq %rbx, %r8 ; \ + sbbq $0x0, %r8 ; \ + movq %r9, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r9, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r9 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r9, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rax, %r12 ; \ + sbbq $0x0, %r13 ; \ + sbbq $0x0, %r8 ; \ + movq %rbx, %r9 ; \ + sbbq $0x0, %r9 ; \ + movq %r10, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r10, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r10 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r10, %r11 ; \ + sbbq %rdx, %r12 ; \ + sbbq %rax, %r13 ; \ + sbbq $0x0, %r8 ; \ + sbbq $0x0, %r9 ; \ + movq %rbx, %r10 ; \ + sbbq $0x0, %r10 ; \ + movq %r11, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r11, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r11 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r11, %r12 ; \ + sbbq %rdx, %r13 ; \ + sbbq %rax, %r8 ; \ + sbbq $0x0, %r9 ; \ + sbbq $0x0, %r10 ; \ + movq %rbx, %r11 ; \ + sbbq $0x0, %r11 ; \ + movq %r12, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r12, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ + movq %rdx, %r12 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r12 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r12, %r13 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rax, %r9 ; \ + sbbq $0x0, %r10 ; \ + sbbq $0x0, %r11 ; \ + movq %rbx, %r12 ; \ + sbbq $0x0, %r12 ; \ + movq %r13, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %r13, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %rbx; \ 
+ movq %rdx, %r13 ; \ + movq $0xffffffff, %rax ; \ + mulq %rbx; \ + addq %rax, %r13 ; \ + movl $0x0, %eax ; \ + adcq %rbx, %rdx ; \ + adcl %eax, %eax ; \ + subq %r13, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rax, %r10 ; \ + sbbq $0x0, %r11 ; \ + sbbq $0x0, %r12 ; \ + movq %rbx, %r13 ; \ + sbbq $0x0, %r13 ; \ + movq P0, %rbx ; \ + addq %r8, %r14 ; \ + adcq %r9, %r15 ; \ + adcq %r10, %rcx ; \ + adcq %r11, %rbx ; \ + adcq %r12, %rbp ; \ + adcq %r13, %rsi ; \ + movl $0x0, %r8d ; \ + adcq %r8, %r8 ; \ + xorq %r11, %r11 ; \ + xorq %r12, %r12 ; \ + xorq %r13, %r13 ; \ + movq $0xffffffff00000001, %rax ; \ + addq %r14, %rax ; \ + movl $0xffffffff, %r9d ; \ + adcq %r15, %r9 ; \ + movl $0x1, %r10d ; \ + adcq %rcx, %r10 ; \ + adcq %rbx, %r11 ; \ + adcq %rbp, %r12 ; \ + adcq %rsi, %r13 ; \ + adcq $0x0, %r8 ; \ + cmovneq %rax, %r14 ; \ + cmovneq %r9, %r15 ; \ + cmovneq %r10, %rcx ; \ + cmovneq %r11, %rbx ; \ + cmovneq %r12, %rbp ; \ + cmovneq %r13, %rsi ; \ + movq %r14, P0 ; \ + movq %r15, 0x8+P0 ; \ + movq %rcx, 0x10+P0 ; \ + movq %rbx, 0x18+P0 ; \ + movq %rbp, 0x20+P0 ; \ + movq %rsi, 0x28+P0 + +// Corresponds exactly to bignum_sub_p384 + +#define sub_p384(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rdx ; \ + sbbq 0x8+P2, %rdx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq 0x20+P1, %r10 ; \ + sbbq 0x20+P2, %r10 ; \ + movq 0x28+P1, %r11 ; \ + sbbq 0x28+P2, %r11 ; \ + sbbq %rcx, %rcx ; \ + movl $0xffffffff, %esi ; \ + andq %rsi, %rcx ; \ + xorq %rsi, %rsi ; \ + subq %rcx, %rsi ; \ + subq %rsi, %rax ; \ + movq %rax, P0 ; \ + sbbq %rcx, %rdx ; \ + movq %rdx, 0x8+P0 ; \ + sbbq %rax, %rax ; \ + andq %rsi, %rcx ; \ + negq %rax; \ + sbbq %rcx, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x18+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x20+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x28+P0 + +// Additional macros to help with final multiplexing + +#define testzero6(P) \ + movq P, %rax ; \ 
+ movq 8+P, %rdx ; \ + orq 16+P, %rax ; \ + orq 24+P, %rdx ; \ + orq 32+P, %rax ; \ + orq 40+P, %rdx ; \ + orq %rdx, %rax + +#define mux6(r0,r1,r2,r3,r4,r5,PNE,PEQ) \ + movq PEQ, %rax ; \ + movq PNE, r0 ; \ + cmovzq %rax, r0 ; \ + movq 8+PEQ, %rax ; \ + movq 8+PNE, r1 ; \ + cmovzq %rax, r1 ; \ + movq 16+PEQ, %rax ; \ + movq 16+PNE, r2 ; \ + cmovzq %rax, r2 ; \ + movq 24+PEQ, %rax ; \ + movq 24+PNE, r3 ; \ + cmovzq %rax, r3 ; \ + movq 32+PEQ, %rax ; \ + movq 32+PNE, r4 ; \ + cmovzq %rax, r4 ; \ + movq 40+PEQ, %rax ; \ + movq 40+PNE, r5 ; \ + cmovzq %rax, r5 + +#define load6(r0,r1,r2,r3,r4,r5,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 ; \ + movq 32+P, r4 ; \ + movq 40+P, r5 + +#define store6(P,r0,r1,r2,r3,r4,r5) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P ; \ + movq r4, 32+P ; \ + movq r5, 40+P + +S2N_BN_SYMBOL(p384_montjmixadd_alt): + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables +// Put the input arguments in non-volatile places on the stack + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rsi, input_x + movq %rdx, input_y + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + montsqr_p384(zp2,z_1) + + movq input_x, %rsi + movq input_y, %rcx + montmul_p384(y2a,z_1,y_2) + + movq input_y, %rcx + montmul_p384(x2a,zp2,x_2) + + montmul_p384(y2a,zp2,y2a) + + movq input_x, %rsi + sub_p384(xd,x2a,x_1) + movq input_x, %rsi + sub_p384(yd,y2a,y_1) + + montsqr_p384(zz,xd) + montsqr_p384(ww,yd) + + movq input_x, %rsi + montmul_p384(zzx1,zz,x_1) + montmul_p384(zzx2,zz,x2a) + + sub_p384(resx,ww,zzx1) + sub_p384(t1,zzx2,zzx1) + + movq input_x, %rsi + montmul_p384(resz,xd,z_1) + + sub_p384(resx,resx,zzx2) + + sub_p384(t2,zzx1,resx) + + movq input_x, %rsi + montmul_p384(t1,t1,y_1) 
+ montmul_p384(t2,yd,t2) + + sub_p384(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + movq input_x, %rsi + testzero6(z_1) + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in +// Montgomery form so not the simple constant 1 but rather 2^384 - p_384), +// hence giving 0 + p2 = p2 for the final result. + + movq input_y, %rcx + mux6(%r8,%r9,%r10,%r11,%rbx,%rbp,resx,x_2) + mux6(%r12,%r13,%r14,%r15,%rdx,%rcx,resy,y_2) + + store6(x_3,%r8,%r9,%r10,%r11,%rbx,%rbp) + store6(y_3,%r12,%r13,%r14,%r15,%rdx,%rcx) + + load6(%r8,%r9,%r10,%r11,%rbx,%rbp,resz) + movq $0xffffffff00000001, %rax + cmovzq %rax, %r8 + movl $0x00000000ffffffff, %eax + cmovzq %rax, %r9 + movq $1, %rax + cmovzq %rax, %r10 + movl $0, %eax + cmovzq %rax, %r11 + cmovzq %rax, %rbx + cmovzq %rax, %rbp + + store6(z_3,%r8,%r9,%r10,%r11,%rbx,%rbp) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_mul_p521_alt.S b/third_party/s2n-bignum/x86_att/p521/bignum_mul_p521_alt.S index f87546928a..a769fa0b3a 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_mul_p521_alt.S +++ b/third_party/s2n-bignum/x86_att/p521/bignum_mul_p521_alt.S @@ -74,10 +74,6 @@ S2N_BN_SYMBOL(bignum_mul_p521_alt): movq %rdx, y -// Copy y into a safe register to start with - - mov %rdx, y - // Start doing a conventional columnwise multiplication, // temporarily storing the lower 9 digits to the stack. 
// Start with result term 0 diff --git a/third_party/s2n-bignum/x86_att/p521/p521_jadd.S b/third_party/s2n-bignum/x86_att/p521/p521_jadd.S index 807a7c5472..9f1b03c47b 100644 --- a/third_party/s2n-bignum/x86_att/p521/p521_jadd.S +++ b/third_party/s2n-bignum/x86_att/p521/p521_jadd.S @@ -55,6 +55,7 @@ #define z1sq (NUMSIZE*0)(%rsp) #define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) #define yd (NUMSIZE*1)(%rsp) #define y2a (NUMSIZE*1)(%rsp) @@ -68,9 +69,11 @@ #define t2 (NUMSIZE*4)(%rsp) #define x1a (NUMSIZE*4)(%rsp) #define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) #define xd (NUMSIZE*5)(%rsp) #define z2sq (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) #define y1a (NUMSIZE*6)(%rsp) @@ -670,6 +673,79 @@ andq $0x1ff, %r14 ; \ movq %r14, 0x40+P0 +// Additional macros to help with final multiplexing + +#define load9(r0,r1,r2,r3,r4,r5,r6,r7,ra,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 ; \ + movq 32+P, r4 ; \ + movq 40+P, r5 ; \ + movq 48+P, r6 ; \ + movq 56+P, r7 ; \ + movq 64+P, ra + +#define store9(P,r0,r1,r2,r3,r4,r5,r6,r7,ra) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P ; \ + movq r4, 32+P ; \ + movq r5, 40+P ; \ + movq r6, 48+P ; \ + movq r7, 56+P ; \ + movq ra, 64+P + +#define muxload9(r0,r1,r2,r3,r4,r5,r6,r7,ra,P0,P1,P2) \ + movq P0, r0 ; \ + cmovbq P1, r0 ; \ + cmovnbe P2, r0 ; \ + movq 8+P0, r1 ; \ + cmovbq 8+P1, r1 ; \ + cmovnbe 8+P2, r1 ; \ + movq 16+P0, r2 ; \ + cmovbq 16+P1, r2 ; \ + cmovnbe 16+P2, r2 ; \ + movq 24+P0, r3 ; \ + cmovbq 24+P1, r3 ; \ + cmovnbe 24+P2, r3 ; \ + movq 32+P0, r4 ; \ + cmovbq 32+P1, r4 ; \ + cmovnbe 32+P2, r4 ; \ + movq 40+P0, r5 ; \ + cmovbq 40+P1, r5 ; \ + cmovnbe 40+P2, r5 ; \ + movq 48+P0, r6 ; \ + cmovbq 48+P1, r6 ; \ + cmovnbe 48+P2, r6 ; \ + movq 56+P0, r7 ; \ + cmovbq 56+P1, r7 ; \ + cmovnbe 56+P2, r7 ; \ + movq 64+P0, ra ; \ + cmovbq 64+P1, ra ; \ + cmovnbe 64+P2, ra + +#define copy9(P0,P1) \ + movq P1, %rax ; \ + movq %rax, P0 ; \ + 
movq 8+P1, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + movq %rax, 24+P0 ; \ + movq 32+P1, %rax ; \ + movq %rax, 32+P0 ; \ + movq 40+P1, %rax ; \ + movq %rax, 40+P0 ; \ + movq 48+P1, %rax ; \ + movq %rax, 48+P0 ; \ + movq 56+P1, %rax ; \ + movq %rax, 56+P0 ; \ + movq 64+P1, %rax ; \ + movq %rax, 64+P0 + S2N_BN_SYMBOL(p521_jadd): #if WINDOWS_ABI @@ -717,20 +793,64 @@ S2N_BN_SYMBOL(p521_jadd): mul_p521(zzx1,zz,x1a) mul_p521(zzx2,zz,x2a) - sub_p521(x_3,ww,zzx1) + sub_p521(resx,ww,zzx1) sub_p521(t1,zzx2,zzx1) mul_p521(xd,xd,z_1) - sub_p521(x_3,x_3,zzx2) + sub_p521(resx,resx,zzx2) - sub_p521(t2,zzx1,x_3) + sub_p521(t2,zzx1,resx) mul_p521(t1,t1,y1a) - mul_p521(z_3,xd,z_2) + mul_p521(resz,xd,z_2) mul_p521(t2,yd,t2) - sub_p521(y_3,t2,t1) + sub_p521(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "NBE" <=> ~(CF \/ ZF) <=> P1 = 0 /\ ~(P2 = 0) +// and "B" <=> CF <=> ~(P1 = 0) /\ P2 = 0 +// and "Z" <=> ZF <=> (P1 = 0 <=> P2 = 0) + + load9(%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp,z_1) + orq %r9, %r8 + orq %r11, %r10 + orq %r13, %r12 + orq %r15, %r14 + orq %r10, %r8 + orq %r14, %r12 + orq %rbp, %r8 + orq %r12, %r8 + negq %r8 + sbbq %rax, %rax + + load9(%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp,z_2) + orq %r9, %r8 + orq %r11, %r10 + orq %r13, %r12 + orq %r15, %r14 + orq %r10, %r8 + orq %r14, %r12 + orq %rbp, %r8 + orq %r12, %r8 + negq %r8 + sbbq %rdx, %rdx + + cmpq %rax, %rdx + +// Multiplex the outputs accordingly. 
Re-store them in resz until there +// are no more loads, so there are no assumptions on input-output aliasing + + muxload9(%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp,resy,y_1,y_2) + store9(resy,%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp) + muxload9(%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp,resz,z_1,z_2) + store9(resz,%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp) + muxload9(%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp,resx,x_1,x_2) + store9(x_3,%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp) + copy9(y_3,resy) + copy9(z_3,resz) // Restore stack and registers diff --git a/third_party/s2n-bignum/x86_att/p521/p521_jadd_alt.S b/third_party/s2n-bignum/x86_att/p521/p521_jadd_alt.S new file mode 100644 index 0000000000..5b51a4f6a6 --- /dev/null +++ b/third_party/s2n-bignum/x86_att/p521/p521_jadd_alt.S @@ -0,0 +1,1149 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on NIST curve P-521 in Jacobian coordinates +// +// extern void p521_jadd_alt +// (uint64_t p3[static 27],uint64_t p1[static 27],uint64_t p2[static 27]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// It is assumed that all coordinates of the input points p1 and p2 are +// fully reduced mod p_521, that both z coordinates are nonzero and +// that neither p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents +// the same affine point as". 
+// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jadd_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 72 + +// Stable homes for input arguments during main code sequence +// These are where they arrive except for input_y, initially in %rdx + +#define input_z %rdi +#define input_x %rsi +#define input_y %rcx + +// Pointer-offset pairs for inputs and outputs + +#define x_1 0(input_x) +#define y_1 NUMSIZE(input_x) +#define z_1 (2*NUMSIZE)(input_x) + +#define x_2 0(input_y) +#define y_2 NUMSIZE(input_y) +#define z_2 (2*NUMSIZE)(input_y) + +#define x_3 0(input_z) +#define y_3 NUMSIZE(input_z) +#define z_3 (2*NUMSIZE)(input_z) + +// Pointer-offset pairs for temporaries, with some aliasing +// The tmp field is internal storage for field mul and sqr. 
+// NSPACE is the total stack needed for these temporaries + +#define z1sq (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define x1a (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define z2sq (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define y1a (NUMSIZE*6)(%rsp) + +#define tmp (NUMSIZE*7)(%rsp) + +#define NSPACE (NUMSIZE*8) + +// Corresponds exactly to bignum_mul_p521_alt except temp storage + +#define mul_p521(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, 504(%rsp) ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %r11, %r11 ; \ + movq %r9, 512(%rsp) ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq %r10, 520(%rsp) ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq %r11, 528(%rsp) ; \ + xorq %r14, %r14 ; \ + movq P1, %rax ; \ + mulq 0x20+P2; \ + 
addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x20+P1, %rax ; \ + mulq P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq %r12, 536(%rsp) ; \ + xorq %r15, %r15 ; \ + movq P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x28+P1, %rax ; \ + mulq P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq %r13, 544(%rsp) ; \ + xorq %r8, %r8 ; \ + movq P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r8, %r8 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x30+P1, %rax ; \ + mulq P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq %r14, 552(%rsp) ; 
\ + xorq %r9, %r9 ; \ + movq P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq %r9, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x38+P1, %rax ; \ + mulq P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq %r15, 560(%rsp) ; \ + xorq %r10, %r10 ; \ + movq P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %r10, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x40+P1, %rax ; \ + mulq P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq %r8, 568(%rsp) ; \ + xorq %r11, %r11 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + 
adcq %r11, %r11 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + 
adcq $0x0, %r13 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + xorq %r8, %r8 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r8, %r8 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + movq 0x40+P1, %rax ; \ + imulq 0x40+P2, %rax ; \ + addq %r8, %rax ; \ + movq 568(%rsp), %r8 ; \ + movq %r8, %rdx ; \ + andq $0x1ff, %rdx ; \ + shrdq $0x9, %r9, %r8 ; \ + shrdq $0x9, %r10, %r9 ; \ + shrdq $0x9, %r11, 
%r10 ; \ + shrdq $0x9, %r12, %r11 ; \ + shrdq $0x9, %r13, %r12 ; \ + shrdq $0x9, %r14, %r13 ; \ + shrdq $0x9, %r15, %r14 ; \ + shrdq $0x9, %rax, %r15 ; \ + shrq $0x9, %rax ; \ + addq %rax, %rdx ; \ + stc; \ + adcq 504(%rsp), %r8 ; \ + adcq 512(%rsp), %r9 ; \ + adcq 520(%rsp), %r10 ; \ + adcq 528(%rsp), %r11 ; \ + adcq 536(%rsp), %r12 ; \ + adcq 544(%rsp), %r13 ; \ + adcq 552(%rsp), %r14 ; \ + adcq 560(%rsp), %r15 ; \ + adcq $0xfffffffffffffe00, %rdx ; \ + cmc; \ + sbbq $0x0, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x18+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x20+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x28+P0 ; \ + sbbq $0x0, %r14 ; \ + movq %r14, 0x30+P0 ; \ + sbbq $0x0, %r15 ; \ + movq %r15, 0x38+P0 ; \ + sbbq $0x0, %rdx ; \ + andq $0x1ff, %rdx ; \ + movq %rdx, 0x40+P0 + +// Corresponds to bignum_sqr_p521_alt except %rbp is used +// in place of %rcx and tmp is the temp storage location + +#define sqr_p521(P0,P1) \ + movq P1, %rax ; \ + mulq %rax; \ + movq %rax, 504(%rsp) ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r11 ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq %r9, 512(%rsp) ; \ + xorq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r12 ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq %r10, 520(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x18+P1; \ + xorq %r13, %r13 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r13 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r13, %r13 ; \ + addq %rbx, %r11 ; \ + adcq 
%rbp, %r12 ; \ + adcq $0x0, %r13 ; \ + movq %r11, 528(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x20+P1; \ + xorq %r14, %r14 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r14 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r14, %r14 ; \ + addq %rbx, %r12 ; \ + adcq %rbp, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq %r12, 536(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x28+P1; \ + xorq %r15, %r15 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq 0x20+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r15 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r15, %r15 ; \ + addq %rbx, %r13 ; \ + adcq %rbp, %r14 ; \ + adcq $0x0, %r15 ; \ + movq %r13, 544(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x30+P1; \ + xorq %r8, %r8 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r8 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x20+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r8 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r8, %r8 ; \ + addq %rbx, %r14 ; \ + adcq %rbp, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq %r14, 552(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x38+P1; \ + xorq %r9, %r9 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r9 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r9 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x20+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp 
; \ + adcq $0x0, %r9 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r9, %r9 ; \ + addq %rbx, %r15 ; \ + adcq %rbp, %r8 ; \ + adcq $0x0, %r9 ; \ + movq %r15, 560(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r10, %r10 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r10 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r10 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r10, %r10 ; \ + addq %rbx, %r8 ; \ + adcq %rbp, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x20+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq %r8, 568(%rsp) ; \ + movq 0x8+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r11, %r11 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x10+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r11 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r11 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r11, %r11 ; \ + addq %rbx, %r9 ; \ + adcq %rbp, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r12, %r12 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x18+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r12 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r12, %r12 ; \ + addq %rbx, %r10 ; \ + adcq %rbp, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x28+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; 
\ + movq 0x18+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r13, %r13 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x20+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r13 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r13 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r13, %r13 ; \ + addq %rbx, %r11 ; \ + adcq %rbp, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r14, %r14 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x28+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r14 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r14, %r14 ; \ + addq %rbx, %r12 ; \ + adcq %rbp, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x30+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r15, %r15 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x30+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r15 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r15, %r15 ; \ + addq %rbx, %r13 ; \ + adcq %rbp, %r14 ; \ + adcq $0x0, %r15 ; \ + xorq %r8, %r8 ; \ + movq 0x38+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x40+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r8 ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x40+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + movq 0x40+P1, %rax ; \ + imulq %rax, %rax ; \ + addq %r8, %rax ; \ + movq 568(%rsp), %r8 ; \ + movq %r8, %rdx ; \ + andq $0x1ff, %rdx ; \ + shrdq $0x9, %r9, %r8 ; \ + shrdq $0x9, %r10, %r9 ; \ + shrdq $0x9, %r11, %r10 ; \ + shrdq $0x9, %r12, %r11 ; \ + shrdq $0x9, %r13, %r12 ; \ + shrdq $0x9, %r14, 
%r13 ; \ + shrdq $0x9, %r15, %r14 ; \ + shrdq $0x9, %rax, %r15 ; \ + shrq $0x9, %rax ; \ + addq %rax, %rdx ; \ + stc; \ + adcq 504(%rsp), %r8 ; \ + adcq 512(%rsp), %r9 ; \ + adcq 520(%rsp), %r10 ; \ + adcq 528(%rsp), %r11 ; \ + adcq 536(%rsp), %r12 ; \ + adcq 544(%rsp), %r13 ; \ + adcq 552(%rsp), %r14 ; \ + adcq 560(%rsp), %r15 ; \ + adcq $0xfffffffffffffe00, %rdx ; \ + cmc; \ + sbbq $0x0, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x18+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x20+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x28+P0 ; \ + sbbq $0x0, %r14 ; \ + movq %r14, 0x30+P0 ; \ + sbbq $0x0, %r15 ; \ + movq %r15, 0x38+P0 ; \ + sbbq $0x0, %rdx ; \ + andq $0x1ff, %rdx ; \ + movq %rdx, 0x40+P0 + +// Corresponds exactly to bignum_sub_p521 + +#define sub_p521(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rdx ; \ + sbbq 0x8+P2, %rdx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq 0x20+P1, %r10 ; \ + sbbq 0x20+P2, %r10 ; \ + movq 0x28+P1, %r11 ; \ + sbbq 0x28+P2, %r11 ; \ + movq 0x30+P1, %r12 ; \ + sbbq 0x30+P2, %r12 ; \ + movq 0x38+P1, %r13 ; \ + sbbq 0x38+P2, %r13 ; \ + movq 0x40+P1, %r14 ; \ + sbbq 0x40+P2, %r14 ; \ + sbbq $0x0, %rax ; \ + movq %rax, P0 ; \ + sbbq $0x0, %rdx ; \ + movq %rdx, 0x8+P0 ; \ + sbbq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x18+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x20+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x28+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x30+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x38+P0 ; \ + sbbq $0x0, %r14 ; \ + andq $0x1ff, %r14 ; \ + movq %r14, 0x40+P0 + +// Additional macros to help with final multiplexing + +#define load9(r0,r1,r2,r3,r4,r5,r6,r7,ra,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 ; \ + movq 32+P, r4 ; \ + movq 40+P, r5 ; \ + movq 48+P, r6 ; \ + 
movq 56+P, r7 ; \ + movq 64+P, ra + +#define store9(P,r0,r1,r2,r3,r4,r5,r6,r7,ra) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P ; \ + movq r4, 32+P ; \ + movq r5, 40+P ; \ + movq r6, 48+P ; \ + movq r7, 56+P ; \ + movq ra, 64+P + +#define muxload9(r0,r1,r2,r3,r4,r5,r6,r7,ra,P0,P1,P2) \ + movq P0, r0 ; \ + cmovbq P1, r0 ; \ + cmovnbe P2, r0 ; \ + movq 8+P0, r1 ; \ + cmovbq 8+P1, r1 ; \ + cmovnbe 8+P2, r1 ; \ + movq 16+P0, r2 ; \ + cmovbq 16+P1, r2 ; \ + cmovnbe 16+P2, r2 ; \ + movq 24+P0, r3 ; \ + cmovbq 24+P1, r3 ; \ + cmovnbe 24+P2, r3 ; \ + movq 32+P0, r4 ; \ + cmovbq 32+P1, r4 ; \ + cmovnbe 32+P2, r4 ; \ + movq 40+P0, r5 ; \ + cmovbq 40+P1, r5 ; \ + cmovnbe 40+P2, r5 ; \ + movq 48+P0, r6 ; \ + cmovbq 48+P1, r6 ; \ + cmovnbe 48+P2, r6 ; \ + movq 56+P0, r7 ; \ + cmovbq 56+P1, r7 ; \ + cmovnbe 56+P2, r7 ; \ + movq 64+P0, ra ; \ + cmovbq 64+P1, ra ; \ + cmovnbe 64+P2, ra + +#define copy9(P0,P1) \ + movq P1, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + movq %rax, 24+P0 ; \ + movq 32+P1, %rax ; \ + movq %rax, 32+P0 ; \ + movq 40+P1, %rax ; \ + movq %rax, 40+P0 ; \ + movq 48+P1, %rax ; \ + movq %rax, 48+P0 ; \ + movq 56+P1, %rax ; \ + movq %rax, 56+P0 ; \ + movq 64+P1, %rax ; \ + movq %rax, 64+P0 + +S2N_BN_SYMBOL(p521_jadd_alt): + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Move the input arguments to stable places (two are already there) + + movq %rdx, input_y + +// Main code, just a sequence of basic field operations + + sqr_p521(z1sq,z_1) + sqr_p521(z2sq,z_2) + + mul_p521(y1a,z_2,y_1) + mul_p521(y2a,z_1,y_2) + + mul_p521(x2a,z1sq,x_2) + mul_p521(x1a,z2sq,x_1) + mul_p521(y2a,z1sq,y2a) + mul_p521(y1a,z2sq,y1a) 
+ + sub_p521(xd,x2a,x1a) + sub_p521(yd,y2a,y1a) + + sqr_p521(zz,xd) + sqr_p521(ww,yd) + + mul_p521(zzx1,zz,x1a) + mul_p521(zzx2,zz,x2a) + + sub_p521(resx,ww,zzx1) + sub_p521(t1,zzx2,zzx1) + + mul_p521(xd,xd,z_1) + + sub_p521(resx,resx,zzx2) + + sub_p521(t2,zzx1,resx) + + mul_p521(t1,t1,y1a) + mul_p521(resz,xd,z_2) + mul_p521(t2,yd,t2) + + sub_p521(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "NBE" <=> ~(CF \/ ZF) <=> P1 = 0 /\ ~(P2 = 0) +// and "B" <=> CF <=> ~(P1 = 0) /\ P2 = 0 +// and "Z" <=> ZF <=> (P1 = 0 <=> P2 = 0) + + load9(%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp,z_1) + orq %r9, %r8 + orq %r11, %r10 + orq %r13, %r12 + orq %r15, %r14 + orq %r10, %r8 + orq %r14, %r12 + orq %rbp, %r8 + orq %r12, %r8 + negq %r8 + sbbq %rax, %rax + + load9(%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp,z_2) + orq %r9, %r8 + orq %r11, %r10 + orq %r13, %r12 + orq %r15, %r14 + orq %r10, %r8 + orq %r14, %r12 + orq %rbp, %r8 + orq %r12, %r8 + negq %r8 + sbbq %rdx, %rdx + + cmpq %rax, %rdx + +// Multiplex the outputs accordingly. 
Re-store them in resz until there +// are no more loads, so there are no assumptions on input-output aliasing + + muxload9(%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp,resy,y_1,y_2) + store9(resy,%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp) + muxload9(%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp,resz,z_1,z_2) + store9(resz,%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp) + muxload9(%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp,resx,x_1,x_2) + store9(x_3,%r8,%r9,%r10,%r11,%r12,%r13,%r14,%r15,%rbp) + copy9(y_3,resy) + copy9(z_3,resz) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/x86_att/p521/p521_jdouble_alt.S b/third_party/s2n-bignum/x86_att/p521/p521_jdouble_alt.S new file mode 100644 index 0000000000..2dc6c32120 --- /dev/null +++ b/third_party/s2n-bignum/x86_att/p521/p521_jdouble_alt.S @@ -0,0 +1,1865 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on NIST curve P-521 in Jacobian coordinates +// +// extern void p521_jdouble_alt +// (uint64_t p3[static 27],uint64_t p1[static 27]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// It is assumed that all coordinates of the input point are fully +// reduced mod p_521 and that the z coordinate is not zero. 
+// +// Standard x86-64 ABI: RDI = p3, RSI = p1 +// Microsoft x64 ABI: RCX = p3, RDX = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jdouble_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 72 + +// Stable homes for input arguments during main code sequence +// This is actually where they come in anyway and they stay there. + +#define input_z %rdi +#define input_x %rsi + +// Pointer-offset pairs for inputs and outputs + +#define x_1 0(input_x) +#define y_1 NUMSIZE(input_x) +#define z_1 (2*NUMSIZE)(input_x) + +#define x_3 0(input_z) +#define y_3 NUMSIZE(input_z) +#define z_3 (2*NUMSIZE)(input_z) + +// Pointer-offset pairs for temporaries, with some aliasing +// The tmp field is internal storage for field mul and sqr. +// NSPACE is the total stack needed for these temporaries + +#define z2 (NUMSIZE*0)(%rsp) +#define y2 (NUMSIZE*1)(%rsp) +#define x2p (NUMSIZE*2)(%rsp) +#define xy2 (NUMSIZE*3)(%rsp) + +#define y4 (NUMSIZE*4)(%rsp) +#define t2 (NUMSIZE*4)(%rsp) + +#define dx2 (NUMSIZE*5)(%rsp) +#define t1 (NUMSIZE*5)(%rsp) + +#define d (NUMSIZE*6)(%rsp) +#define x4p (NUMSIZE*6)(%rsp) + +#define tmp (NUMSIZE*7)(%rsp) + +#define NSPACE (NUMSIZE*7+72) + +// Corresponds to bignum_mul_p521_alt except temp storage location + +#define mul_p521(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, 504(%rsp) ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %r11, %r11 ; \ + movq %r9, 512(%rsp) ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + 
adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq %r10, 520(%rsp) ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq %r11, 528(%rsp) ; \ + xorq %r14, %r14 ; \ + movq P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x20+P1, %rax ; \ + mulq P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq %r12, 536(%rsp) ; \ + xorq %r15, %r15 ; \ + movq P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x28+P1, %rax ; \ + mulq P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq %r13, 544(%rsp) ; \ + xorq %r8, %r8 ; \ + movq P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ 
+ adcq %r8, %r8 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x30+P1, %rax ; \ + mulq P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq %r14, 552(%rsp) ; \ + xorq %r9, %r9 ; \ + movq P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq %r9, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x38+P1, %rax ; \ + mulq P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq %r15, 560(%rsp) ; \ + xorq %r10, %r10 ; \ + movq P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %r10, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ 
+ adcq $0x0, %r10 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x40+P1, %rax ; \ + mulq P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq %r8, 568(%rsp) ; \ + xorq %r11, %r11 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %r11, %r11 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, 
%r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + xorq %r8, %r8 ; \ + movq 0x30+P1, %rax ; \ + 
mulq 0x40+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r8, %r8 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + movq 0x40+P1, %rax ; \ + imulq 0x40+P2, %rax ; \ + addq %r8, %rax ; \ + movq 568(%rsp), %r8 ; \ + movq %r8, %rdx ; \ + andq $0x1ff, %rdx ; \ + shrdq $0x9, %r9, %r8 ; \ + shrdq $0x9, %r10, %r9 ; \ + shrdq $0x9, %r11, %r10 ; \ + shrdq $0x9, %r12, %r11 ; \ + shrdq $0x9, %r13, %r12 ; \ + shrdq $0x9, %r14, %r13 ; \ + shrdq $0x9, %r15, %r14 ; \ + shrdq $0x9, %rax, %r15 ; \ + shrq $0x9, %rax ; \ + addq %rax, %rdx ; \ + stc; \ + adcq 504(%rsp), %r8 ; \ + adcq 512(%rsp), %r9 ; \ + adcq 520(%rsp), %r10 ; \ + adcq 528(%rsp), %r11 ; \ + adcq 536(%rsp), %r12 ; \ + adcq 544(%rsp), %r13 ; \ + adcq 552(%rsp), %r14 ; \ + adcq 560(%rsp), %r15 ; \ + adcq $0xfffffffffffffe00, %rdx ; \ + cmc; \ + sbbq $0x0, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x18+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x20+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x28+P0 ; \ + sbbq $0x0, %r14 ; \ + movq %r14, 0x30+P0 ; \ + sbbq $0x0, %r15 ; \ + movq %r15, 0x38+P0 ; \ + sbbq $0x0, %rdx ; \ + andq $0x1ff, %rdx ; \ + movq %rdx, 0x40+P0 + +// Corresponds to bignum_sqr_p521_alt except temp storage location + +#define sqr_p521(P0,P1) \ + movq P1, %rax ; \ + mulq %rax; \ + movq %rax, 504(%rsp) ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r11 ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq %r9, 
512(%rsp) ; \ + xorq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r12 ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq %r10, 520(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x18+P1; \ + xorq %r13, %r13 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r13 ; \ + addq %rbx, %rbx ; \ + adcq %rcx, %rcx ; \ + adcq %r13, %r13 ; \ + addq %rbx, %r11 ; \ + adcq %rcx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq %r11, 528(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x20+P1; \ + xorq %r14, %r14 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r14 ; \ + addq %rbx, %rbx ; \ + adcq %rcx, %rcx ; \ + adcq %r14, %r14 ; \ + addq %rbx, %r12 ; \ + adcq %rcx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq %r12, 536(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x28+P1; \ + xorq %r15, %r15 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq 0x20+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r15 ; \ + addq %rbx, %rbx ; \ + adcq %rcx, %rcx ; \ + adcq %r15, %r15 ; \ + addq %rbx, %r13 ; \ + adcq %rcx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq %r13, 544(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x30+P1; \ + xorq %r8, %r8 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r8 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x20+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r8 ; \ + addq %rbx, 
%rbx ; \ + adcq %rcx, %rcx ; \ + adcq %r8, %r8 ; \ + addq %rbx, %r14 ; \ + adcq %rcx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq %r14, 552(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x38+P1; \ + xorq %r9, %r9 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r9 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r9 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x20+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r9 ; \ + addq %rbx, %rbx ; \ + adcq %rcx, %rcx ; \ + adcq %r9, %r9 ; \ + addq %rbx, %r15 ; \ + adcq %rcx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq %r15, 560(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r10, %r10 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r10 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r10 ; \ + addq %rbx, %rbx ; \ + adcq %rcx, %rcx ; \ + adcq %r10, %r10 ; \ + addq %rbx, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x20+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq %r8, 568(%rsp) ; \ + movq 0x8+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r11, %r11 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rcx ; \ + movq 0x10+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r11 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r11 ; \ + addq %rbx, %rbx ; \ + adcq %rcx, %rcx ; \ + 
adcq %r11, %r11 ; \ + addq %rbx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r12, %r12 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rcx ; \ + movq 0x18+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r12 ; \ + addq %rbx, %rbx ; \ + adcq %rcx, %rcx ; \ + adcq %r12, %r12 ; \ + addq %rbx, %r10 ; \ + adcq %rcx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x28+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r13, %r13 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rcx ; \ + movq 0x20+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r13 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r13 ; \ + addq %rbx, %rbx ; \ + adcq %rcx, %rcx ; \ + adcq %r13, %r13 ; \ + addq %rbx, %r11 ; \ + adcq %rcx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r14, %r14 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rcx ; \ + movq 0x28+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r14 ; \ + addq %rbx, %rbx ; \ + adcq %rcx, %rcx ; \ + adcq %r14, %r14 ; \ + addq %rbx, %r12 ; \ + adcq %rcx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x30+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r15, %r15 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rcx ; \ + movq 0x30+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rcx ; \ + adcq $0x0, %r15 ; \ + addq %rbx, %rbx ; \ + adcq %rcx, %rcx ; \ + adcq %r15, %r15 ; \ + addq %rbx, %r13 ; \ + adcq %rcx, %r14 ; \ + adcq $0x0, %r15 ; \ + xorq %r8, %r8 ; \ + movq 0x38+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + 
adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x40+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r8 ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x40+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + movq 0x40+P1, %rax ; \ + imulq %rax, %rax ; \ + addq %r8, %rax ; \ + movq 568(%rsp), %r8 ; \ + movq %r8, %rdx ; \ + andq $0x1ff, %rdx ; \ + shrdq $0x9, %r9, %r8 ; \ + shrdq $0x9, %r10, %r9 ; \ + shrdq $0x9, %r11, %r10 ; \ + shrdq $0x9, %r12, %r11 ; \ + shrdq $0x9, %r13, %r12 ; \ + shrdq $0x9, %r14, %r13 ; \ + shrdq $0x9, %r15, %r14 ; \ + shrdq $0x9, %rax, %r15 ; \ + shrq $0x9, %rax ; \ + addq %rax, %rdx ; \ + stc; \ + adcq 504(%rsp), %r8 ; \ + adcq 512(%rsp), %r9 ; \ + adcq 520(%rsp), %r10 ; \ + adcq 528(%rsp), %r11 ; \ + adcq 536(%rsp), %r12 ; \ + adcq 544(%rsp), %r13 ; \ + adcq 552(%rsp), %r14 ; \ + adcq 560(%rsp), %r15 ; \ + adcq $0xfffffffffffffe00, %rdx ; \ + cmc; \ + sbbq $0x0, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x18+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x20+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x28+P0 ; \ + sbbq $0x0, %r14 ; \ + movq %r14, 0x30+P0 ; \ + sbbq $0x0, %r15 ; \ + movq %r15, 0x38+P0 ; \ + sbbq $0x0, %rdx ; \ + andq $0x1ff, %rdx ; \ + movq %rdx, 0x40+P0 ; \ + +// Corresponds exactly to bignum_add_p521 + +#define add_p521(P0,P1,P2) \ + stc; \ + movq P1, %rax ; \ + adcq P2, %rax ; \ + movq 0x8+P1, %rbx ; \ + adcq 0x8+P2, %rbx ; \ + movq 0x10+P1, %r8 ; \ + adcq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + adcq 0x18+P2, %r9 ; \ + movq 0x20+P1, %r10 ; \ + adcq 0x20+P2, %r10 ; \ + movq 0x28+P1, %r11 ; \ + adcq 0x28+P2, %r11 ; \ + movq 0x30+P1, %r12 ; \ + adcq 0x30+P2, %r12 ; \ + movq 0x38+P1, %r13 ; \ + adcq 0x38+P2, %r13 ; \ + movq 0x40+P1, %r14 ; \ + adcq 0x40+P2, %r14 ; \ + movq $0x200, 
%rdx ; \ + andq %r14, %rdx ; \ + cmpq $0x200, %rdx ; \ + sbbq $0x0, %rax ; \ + movq %rax, P0 ; \ + sbbq $0x0, %rbx ; \ + movq %rbx, 0x8+P0 ; \ + sbbq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x18+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x20+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x28+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x30+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x38+P0 ; \ + sbbq %rdx, %r14 ; \ + movq %r14, 0x40+P0 + +// Corresponds exactly to bignum_sub_p521 + +#define sub_p521(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rdx ; \ + sbbq 0x8+P2, %rdx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq 0x20+P1, %r10 ; \ + sbbq 0x20+P2, %r10 ; \ + movq 0x28+P1, %r11 ; \ + sbbq 0x28+P2, %r11 ; \ + movq 0x30+P1, %r12 ; \ + sbbq 0x30+P2, %r12 ; \ + movq 0x38+P1, %r13 ; \ + sbbq 0x38+P2, %r13 ; \ + movq 0x40+P1, %r14 ; \ + sbbq 0x40+P2, %r14 ; \ + sbbq $0x0, %rax ; \ + movq %rax, P0 ; \ + sbbq $0x0, %rdx ; \ + movq %rdx, 0x8+P0 ; \ + sbbq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x18+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x20+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x28+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x30+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x38+P0 ; \ + sbbq $0x0, %r14 ; \ + andq $0x1ff, %r14 ; \ + movq %r14, 0x40+P0 + +// Weak multiplication not fully reducing + +#define weakmul_p521(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, 504(%rsp) ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %r11, %r11 ; \ + movq %r9, 512(%rsp) ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq 
%rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq %r10, 520(%rsp) ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq %r11, 528(%rsp) ; \ + xorq %r14, %r14 ; \ + movq P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x20+P1, %rax ; \ + mulq P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq %r12, 536(%rsp) ; \ + xorq %r15, %r15 ; \ + movq P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x28+P1, %rax ; \ + mulq P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq %r13, 544(%rsp) ; \ + xorq %r8, %r8 ; \ + movq P1, %rax ; \ + mulq 0x30+P2; \ + 
addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r8, %r8 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x30+P1, %rax ; \ + mulq P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq %r14, 552(%rsp) ; \ + xorq %r9, %r9 ; \ + movq P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq %r9, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x38+P1, %rax ; \ + mulq P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq %r15, 560(%rsp) ; \ + xorq %r10, %r10 ; \ + movq P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %r10, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x28+P2; \ 
+ addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x40+P1, %rax ; \ + mulq P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq %r8, 568(%rsp) ; \ + xorq %r11, %r11 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %r11, %r11 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x30+P1, %rax ; \ + mulq 
0x20+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + 
xorq %r8, %r8 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r8, %r8 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + movq 0x40+P1, %rax ; \ + imulq 0x40+P2, %rax ; \ + addq %r8, %rax ; \ + movq 568(%rsp), %r8 ; \ + movq %r8, %rdx ; \ + andq $0x1ff, %rdx ; \ + shrdq $0x9, %r9, %r8 ; \ + shrdq $0x9, %r10, %r9 ; \ + shrdq $0x9, %r11, %r10 ; \ + shrdq $0x9, %r12, %r11 ; \ + shrdq $0x9, %r13, %r12 ; \ + shrdq $0x9, %r14, %r13 ; \ + shrdq $0x9, %r15, %r14 ; \ + shrdq $0x9, %rax, %r15 ; \ + shrq $0x9, %rax ; \ + addq %rax, %rdx ; \ + addq 504(%rsp), %r8 ; \ + adcq 512(%rsp), %r9 ; \ + adcq 520(%rsp), %r10 ; \ + adcq 528(%rsp), %r11 ; \ + adcq 536(%rsp), %r12 ; \ + adcq 544(%rsp), %r13 ; \ + adcq 552(%rsp), %r14 ; \ + adcq 560(%rsp), %r15 ; \ + adcq $0, %rdx ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 ; \ + movq %r12, 0x20+P0 ; \ + movq %r13, 0x28+P0 ; \ + movq %r14, 0x30+P0 ; \ + movq %r15, 0x38+P0 ; \ + movq %rdx, 0x40+P0 + +// P0 = C * P1 - D * P2 == C * P1 + D * (p_521 - P2) + +#define cmsub_p521(P0,C,P1,D,P2) \ + movq $D, %rcx ; \ + movq P2, %rax ; \ + notq %rax; \ + mulq %rcx; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + movq 8+P2, %rax ; \ + notq %rax; \ + mulq %rcx; \ + xorl %r10d, %r10d ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 16+P2, %rax ; \ + notq %rax; \ + mulq %rcx; \ + xorl %r11d, %r11d ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + movq 24+P2, %rax ; \ + notq %rax; \ + mulq %rcx; \ + xorl %r12d, %r12d ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + movq 32+P2, %rax ; \ + notq %rax; \ + mulq %rcx; \ + 
xorl %r13d, %r13d ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + movq 40+P2, %rax ; \ + notq %rax; \ + mulq %rcx; \ + xorl %r14d, %r14d ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + movq 48+P2, %rax ; \ + notq %rax; \ + mulq %rcx; \ + xorl %r15d, %r15d ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movq 56+P2, %rax ; \ + notq %rax; \ + mulq %rcx; \ + xorl %ebx, %ebx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %rbx ; \ + movq 64+P2, %rax ; \ + xorq $0x1FF, %rax ; \ + imulq %rcx, %rax ; \ + addq %rax, %rbx ; \ + xorl %eax, %eax ; \ + movl $C, %ecx ; \ + movq P1, %rax ; \ + mulq %rcx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rbp, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq %rcx; \ + subq %rbp, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rbp, %rbp ; \ + movq 0x10+P1, %rax ; \ + mulq %rcx; \ + subq %rbp, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbp, %rbp ; \ + movq 0x18+P1, %rax ; \ + mulq %rcx; \ + subq %rbp, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rbp, %rbp ; \ + movq 0x20+P1, %rax ; \ + mulq %rcx; \ + subq %rbp, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rbp, %rbp ; \ + movq 0x28+P1, %rax ; \ + mulq %rcx; \ + subq %rbp, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %rbp, %rbp ; \ + movq 0x30+P1, %rax ; \ + mulq %rcx; \ + subq %rbp, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %rbp, %rbp ; \ + movq 0x38+P1, %rax ; \ + mulq %rcx; \ + subq %rbp, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %rbx ; \ + movq 0x40+P1, %rax ; \ + imulq %rcx, %rax ; \ + addq %rax, %rbx ; \ + movq %r9, %rax ; \ + andq %r10, %rax ; \ + andq %r11, %rax ; \ + andq %r12, %rax ; \ + andq %r13, %rax ; \ + andq %r14, %rax ; \ + andq %r15, %rax ; \ + movq %rbx, %rdx ; \ + shrq $9, %rdx ; \ + orq $~0x1FF, %rbx ; \ + leaq 1(%rdx), %rcx ; \ + addq %r8, %rcx ; \ + movl $0, %ecx ; \ + adcq %rcx, %rax ; \ + movq %rbx, %rax ; \ + adcq %rcx, %rax ; \ + adcq %rdx, %r8 ; \ + movq %r8, P0 ; \ + adcq 
%rcx, %r9 ; \ + movq %r9, 8+P0 ; \ + adcq %rcx, %r10 ; \ + movq %r10, 16+P0 ; \ + adcq %rcx, %r11 ; \ + movq %r11, 24+P0 ; \ + adcq %rcx, %r12 ; \ + movq %r12, 32+P0 ; \ + adcq %rcx, %r13 ; \ + movq %r13, 40+P0 ; \ + adcq %rcx, %r14 ; \ + movq %r14, 48+P0 ; \ + adcq %rcx, %r15 ; \ + movq %r15, 56+P0 ; \ + adcq %rcx, %rbx ; \ + andq $0x1FF, %rbx ; \ + movq %rbx, 64+P0 + +// P0 = 3 * P1 - 8 * P2 == 3 * P1 + 8 * (p_521 - P2) + +#define cmsub38_p521(P0,P1,P2) \ + movq 64+P2, %rbx ; \ + xorq $0x1FF, %rbx ; \ + movq 56+P2, %r15 ; \ + notq %r15; \ + shldq $3, %r15, %rbx ; \ + movq 48+P2, %r14 ; \ + notq %r14; \ + shldq $3, %r14, %r15 ; \ + movq 40+P2, %r13 ; \ + notq %r13; \ + shldq $3, %r13, %r14 ; \ + movq 32+P2, %r12 ; \ + notq %r12; \ + shldq $3, %r12, %r13 ; \ + movq 24+P2, %r11 ; \ + notq %r11; \ + shldq $3, %r11, %r12 ; \ + movq 16+P2, %r10 ; \ + notq %r10; \ + shldq $3, %r10, %r11 ; \ + movq 8+P2, %r9 ; \ + notq %r9; \ + shldq $3, %r9, %r10 ; \ + movq P2, %r8 ; \ + notq %r8; \ + shldq $3, %r8, %r9 ; \ + shlq $3, %r8 ; \ + movl $3, %ecx ; \ + movq P1, %rax ; \ + mulq %rcx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rbp, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq %rcx; \ + subq %rbp, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rbp, %rbp ; \ + movq 0x10+P1, %rax ; \ + mulq %rcx; \ + subq %rbp, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbp, %rbp ; \ + movq 0x18+P1, %rax ; \ + mulq %rcx; \ + subq %rbp, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rbp, %rbp ; \ + movq 0x20+P1, %rax ; \ + mulq %rcx; \ + subq %rbp, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rbp, %rbp ; \ + movq 0x28+P1, %rax ; \ + mulq %rcx; \ + subq %rbp, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %rbp, %rbp ; \ + movq 0x30+P1, %rax ; \ + mulq %rcx; \ + subq %rbp, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + sbbq %rbp, %rbp ; \ + movq 0x38+P1, %rax ; \ + mulq %rcx; \ + subq %rbp, %rdx ; \ + addq %rax, 
%r15 ; \ + adcq %rdx, %rbx ; \ + movq 0x40+P1, %rax ; \ + imulq %rcx, %rax ; \ + addq %rax, %rbx ; \ + movq %r9, %rax ; \ + andq %r10, %rax ; \ + andq %r11, %rax ; \ + andq %r12, %rax ; \ + andq %r13, %rax ; \ + andq %r14, %rax ; \ + andq %r15, %rax ; \ + movq %rbx, %rdx ; \ + shrq $9, %rdx ; \ + orq $~0x1FF, %rbx ; \ + leaq 1(%rdx), %rcx ; \ + addq %r8, %rcx ; \ + movl $0, %ecx ; \ + adcq %rcx, %rax ; \ + movq %rbx, %rax ; \ + adcq %rcx, %rax ; \ + adcq %rdx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rcx, %r9 ; \ + movq %r9, 8+P0 ; \ + adcq %rcx, %r10 ; \ + movq %r10, 16+P0 ; \ + adcq %rcx, %r11 ; \ + movq %r11, 24+P0 ; \ + adcq %rcx, %r12 ; \ + movq %r12, 32+P0 ; \ + adcq %rcx, %r13 ; \ + movq %r13, 40+P0 ; \ + adcq %rcx, %r14 ; \ + movq %r14, 48+P0 ; \ + adcq %rcx, %r15 ; \ + movq %r15, 56+P0 ; \ + adcq %rcx, %rbx ; \ + andq $0x1FF, %rbx ; \ + movq %rbx, 64+P0 + +// P0 = 4 * P1 - P2 = 4 * P1 + (p_521 - P2) + +#define cmsub41_p521(P0,P1,P2) \ + movq 64+P1, %rbx ; \ + movq 56+P1, %r15 ; \ + shldq $2, %r15, %rbx ; \ + movq 48+P1, %r14 ; \ + shldq $2, %r14, %r15 ; \ + movq 40+P1, %r13 ; \ + shldq $2, %r13, %r14 ; \ + movq 32+P1, %r12 ; \ + shldq $2, %r12, %r13 ; \ + movq 24+P1, %r11 ; \ + shldq $2, %r11, %r12 ; \ + movq 16+P1, %r10 ; \ + shldq $2, %r10, %r11 ; \ + movq 8+P1, %r9 ; \ + shldq $2, %r9, %r10 ; \ + movq P1, %r8 ; \ + shldq $2, %r8, %r9 ; \ + shlq $2, %r8 ; \ + movq 64+P2, %rcx ; \ + xorq $0x1FF, %rcx ; \ + movq P2, %rax ; \ + notq %rax; \ + addq %rax, %r8 ; \ + movq 8+P2, %rax ; \ + notq %rax; \ + adcq %rax, %r9 ; \ + movq 16+P2, %rax ; \ + notq %rax; \ + adcq %rax, %r10 ; \ + movq 24+P2, %rax ; \ + notq %rax; \ + adcq %rax, %r11 ; \ + movq 32+P2, %rax ; \ + notq %rax; \ + adcq %rax, %r12 ; \ + movq 40+P2, %rax ; \ + notq %rax; \ + adcq %rax, %r13 ; \ + movq 48+P2, %rax ; \ + notq %rax; \ + adcq %rax, %r14 ; \ + movq 56+P2, %rax ; \ + notq %rax; \ + adcq %rax, %r15 ; \ + adcq %rcx, %rbx ; \ + movq %r9, %rax ; \ + andq %r10, %rax ; \ + andq %r11, %rax ; \ + andq 
%r12, %rax ; \ + andq %r13, %rax ; \ + andq %r14, %rax ; \ + andq %r15, %rax ; \ + movq %rbx, %rdx ; \ + shrq $9, %rdx ; \ + orq $~0x1FF, %rbx ; \ + leaq 1(%rdx), %rcx ; \ + addq %r8, %rcx ; \ + movl $0, %ecx ; \ + adcq %rcx, %rax ; \ + movq %rbx, %rax ; \ + adcq %rcx, %rax ; \ + adcq %rdx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rcx, %r9 ; \ + movq %r9, 8+P0 ; \ + adcq %rcx, %r10 ; \ + movq %r10, 16+P0 ; \ + adcq %rcx, %r11 ; \ + movq %r11, 24+P0 ; \ + adcq %rcx, %r12 ; \ + movq %r12, 32+P0 ; \ + adcq %rcx, %r13 ; \ + movq %r13, 40+P0 ; \ + adcq %rcx, %r14 ; \ + movq %r14, 48+P0 ; \ + adcq %rcx, %r15 ; \ + movq %r15, 56+P0 ; \ + adcq %rcx, %rbx ; \ + andq $0x1FF, %rbx ; \ + movq %rbx, 64+P0 + +S2N_BN_SYMBOL(p521_jdouble_alt): + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room on stack for temporary variables + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + sqr_p521(z2,z_1) + sqr_p521(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + add_p521(t1,x_1,z2) + sub_p521(t2,x_1,z2) + mul_p521(x2p,t1,t2) + +// t1 = y + z +// x4p = x2p^2 +// xy2 = x * y^2 + + add_p521(t1,y_1,z_1) + sqr_p521(x4p,x2p) + weakmul_p521(xy2,x_1,y2) + +// t2 = (y + z)^2 + + sqr_p521(t2,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_p521(d,12,xy2,9,x4p) + sub_p521(t1,t2,z2) + +// y4 = y^4 + + sqr_p521(y4,y2) + +// z_3' = 2 * y * z +// dx2 = d * x2p + + sub_p521(z_3,t1,y2) + weakmul_p521(dx2,d,x2p) + +// x' = 4 * xy2 - d + + cmsub41_p521(x_3,xy2,d) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_p521(y_3,dx2,y4) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", 
%progbits +#endif diff --git a/third_party/s2n-bignum/x86_att/p521/p521_jmixadd.S b/third_party/s2n-bignum/x86_att/p521/p521_jmixadd.S index 702b63f560..879fce6954 100644 --- a/third_party/s2n-bignum/x86_att/p521/p521_jmixadd.S +++ b/third_party/s2n-bignum/x86_att/p521/p521_jmixadd.S @@ -56,6 +56,7 @@ #define zp2 (NUMSIZE*0)(%rsp) #define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) #define yd (NUMSIZE*1)(%rsp) #define y2a (NUMSIZE*1)(%rsp) @@ -68,8 +69,10 @@ #define t2 (NUMSIZE*4)(%rsp) #define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) #define xd (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) #define tmp (NUMSIZE*6)(%rsp) @@ -667,6 +670,111 @@ andq $0x1ff, %r14 ; \ movq %r14, 0x40+P0 +// Additional macros to help with final multiplexing + +#define testzero9(P) \ + movq P, %rax ; \ + movq 8+P, %rbx ; \ + movq 16+P, %rdx ; \ + movq 24+P, %rbp ; \ + orq 32+P, %rax ; \ + orq 40+P, %rbx ; \ + orq 48+P, %rdx ; \ + orq 56+P, %rbp ; \ + orq %rbx, %rax ; \ + orq %rbp, %rdx ; \ + orq 64+P, %rax ; \ + orq %rdx, %rax + +#define mux9(P0,PNE,PEQ) \ + movq PNE, %rax ; \ + movq PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, P0 ; \ + movq 8+PNE, %rax ; \ + movq 8+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+PNE, %rax ; \ + movq 16+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+PNE, %rax ; \ + movq 24+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 24+P0 ; \ + movq 32+PNE, %rax ; \ + movq 32+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 32+P0 ; \ + movq 40+PNE, %rax ; \ + movq 40+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 40+P0 ; \ + movq 48+PNE, %rax ; \ + movq 48+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 48+P0 ; \ + movq 56+PNE, %rax ; \ + movq 56+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 56+P0 ; \ + movq 64+PNE, %rax ; \ + movq 64+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 64+P0 + +#define mux9c(P0,PNE) \ + movq PNE, %rax ; \ + movl $1, %ebx ; \ + cmovzq %rbx, 
%rax ; \ + movq %rax, P0 ; \ + movq 8+PNE, %rax ; \ + movl $0, %ebx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+PNE, %rax ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+PNE, %rax ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 24+P0 ; \ + movq 32+PNE, %rax ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 32+P0 ; \ + movq 40+PNE, %rax ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 40+P0 ; \ + movq 48+PNE, %rax ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 48+P0 ; \ + movq 56+PNE, %rax ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 56+P0 ; \ + movq 64+PNE, %rax ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 64+P0 + +#define copy9(P0,P1) \ + movq P1, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + movq %rax, 24+P0 ; \ + movq 32+P1, %rax ; \ + movq %rax, 32+P0 ; \ + movq 40+P1, %rax ; \ + movq %rax, 40+P0 ; \ + movq 48+P1, %rax ; \ + movq %rax, 48+P0 ; \ + movq 56+P1, %rax ; \ + movq %rax, 56+P0 ; \ + movq 64+P1, %rax ; \ + movq %rax, 64+P0 + S2N_BN_SYMBOL(p521_jmixadd): #if WINDOWS_ABI @@ -709,19 +817,35 @@ S2N_BN_SYMBOL(p521_jmixadd): mul_p521(zzx1,zz,x_1) mul_p521(zzx2,zz,x2a) - sub_p521(x_3,ww,zzx1) + sub_p521(resx,ww,zzx1) sub_p521(t1,zzx2,zzx1) - mul_p521(z_3,xd,z_1) + mul_p521(resz,xd,z_1) - sub_p521(x_3,x_3,zzx2) + sub_p521(resx,resx,zzx2) - sub_p521(t2,zzx1,x_3) + sub_p521(t2,zzx1,resx) mul_p521(t1,t1,y_1) mul_p521(t2,yd,t2) - sub_p521(y_3,t2,t1) + sub_p521(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + testzero9(z_1) + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with an extra z = 1 +// coordinate, hence giving 0 + p2 = p2 for the final result. 
+ + mux9 (resx,resx,x_2) + mux9 (resy,resy,y_2) + + copy9(x_3,resx) + copy9(y_3,resy) + + mux9c(z_3,resz) // Restore stack and registers diff --git a/third_party/s2n-bignum/x86_att/p521/p521_jmixadd_alt.S b/third_party/s2n-bignum/x86_att/p521/p521_jmixadd_alt.S new file mode 100644 index 0000000000..d9279fe305 --- /dev/null +++ b/third_party/s2n-bignum/x86_att/p521/p521_jmixadd_alt.S @@ -0,0 +1,1144 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on NIST curve P-521 in Jacobian coordinates +// +// extern void p521_jmixadd_alt +// (uint64_t p3[static 27],uint64_t p1[static 27],uint64_t p2[static 18]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. It is assumed that +// all the coordinates of the input points p1 and p2 are fully reduced +// mod p_521, that the z coordinate of p1 is nonzero and that neither +// p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents the same affine +// point as". 
+// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jmixadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jmixadd_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 72 + +// Stable homes for input arguments during main code sequence +// These are where they arrive except for input_y, initially in %rdx + +#define input_z %rdi +#define input_x %rsi +#define input_y %rcx + +// Pointer-offset pairs for inputs and outputs + +#define x_1 0(input_x) +#define y_1 NUMSIZE(input_x) +#define z_1 (2*NUMSIZE)(input_x) + +#define x_2 0(input_y) +#define y_2 NUMSIZE(input_y) + +#define x_3 0(input_z) +#define y_3 NUMSIZE(input_z) +#define z_3 (2*NUMSIZE)(input_z) + +// Pointer-offset pairs for temporaries, with some aliasing +// The tmp field is internal storage for field mul and sqr. 
+// NSPACE is the total stack needed for these temporaries + +#define zp2 (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define tmp (NUMSIZE*6)(%rsp) + +#define NSPACE (NUMSIZE*7) + +// Corresponds exactly to bignum_mul_p521_alt except temp storage + +#define mul_p521(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, 432(%rsp) ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %r11, %r11 ; \ + movq %r9, 440(%rsp) ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq %r10, 448(%rsp) ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq %r11, 456(%rsp) ; \ + xorq %r14, %r14 ; \ + movq P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 
0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x20+P1, %rax ; \ + mulq P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq %r12, 464(%rsp) ; \ + xorq %r15, %r15 ; \ + movq P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x28+P1, %rax ; \ + mulq P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq %r13, 472(%rsp) ; \ + xorq %r8, %r8 ; \ + movq P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r8, %r8 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x30+P1, %rax ; \ + mulq P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq %r14, 480(%rsp) ; \ + xorq %r9, %r9 ; \ + movq P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, 
%r8 ; \ + adcq %r9, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq 0x38+P1, %rax ; \ + mulq P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + adcq $0x0, %r9 ; \ + movq %r15, 488(%rsp) ; \ + xorq %r10, %r10 ; \ + movq P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %r10, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x40+P1, %rax ; \ + mulq P2; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq %r8, 496(%rsp) ; \ + xorq %r11, %r11 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %r11, %r11 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r9 ; \ + adcq 
%rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r11 ; \ + adcq 
%rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x20+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x28+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + xorq %r8, %r8 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r8, %r8 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x30+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x40+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + movq 0x40+P1, %rax ; \ + mulq 0x38+P2; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + movq 0x40+P1, %rax ; \ + imulq 0x40+P2, %rax ; \ + addq %r8, %rax ; \ + movq 496(%rsp), %r8 ; \ + movq %r8, %rdx ; \ + andq $0x1ff, %rdx ; \ + shrdq $0x9, %r9, %r8 ; \ + shrdq $0x9, %r10, %r9 ; \ + shrdq $0x9, %r11, %r10 ; \ + shrdq $0x9, %r12, %r11 ; \ + shrdq $0x9, %r13, %r12 ; \ + shrdq $0x9, %r14, %r13 ; \ 
+ shrdq $0x9, %r15, %r14 ; \ + shrdq $0x9, %rax, %r15 ; \ + shrq $0x9, %rax ; \ + addq %rax, %rdx ; \ + stc; \ + adcq 432(%rsp), %r8 ; \ + adcq 440(%rsp), %r9 ; \ + adcq 448(%rsp), %r10 ; \ + adcq 456(%rsp), %r11 ; \ + adcq 464(%rsp), %r12 ; \ + adcq 472(%rsp), %r13 ; \ + adcq 480(%rsp), %r14 ; \ + adcq 488(%rsp), %r15 ; \ + adcq $0xfffffffffffffe00, %rdx ; \ + cmc; \ + sbbq $0x0, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x18+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x20+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x28+P0 ; \ + sbbq $0x0, %r14 ; \ + movq %r14, 0x30+P0 ; \ + sbbq $0x0, %r15 ; \ + movq %r15, 0x38+P0 ; \ + sbbq $0x0, %rdx ; \ + andq $0x1ff, %rdx ; \ + movq %rdx, 0x40+P0 + +// Corresponds to bignum_sqr_p521_alt except %rbp is used +// in place of %rcx and the output as temp storage location + +#define sqr_p521(P0,P1) \ + movq P1, %rax ; \ + mulq %rax; \ + movq %rax, 432(%rsp) ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r11 ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + movq %r9, 440(%rsp) ; \ + xorq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r12 ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq %r10, 448(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x18+P1; \ + xorq %r13, %r13 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r13 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r13, %r13 ; \ + addq %rbx, %r11 ; \ + adcq %rbp, %r12 ; \ + adcq $0x0, %r13 ; \ + movq %r11, 456(%rsp) ; \ + movq P1, %rax ; \ + mulq 
0x20+P1; \ + xorq %r14, %r14 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r14 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r14, %r14 ; \ + addq %rbx, %r12 ; \ + adcq %rbp, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq %r12, 464(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x28+P1; \ + xorq %r15, %r15 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq 0x20+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r15 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r15, %r15 ; \ + addq %rbx, %r13 ; \ + adcq %rbp, %r14 ; \ + adcq $0x0, %r15 ; \ + movq %r13, 472(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x30+P1; \ + xorq %r8, %r8 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r8 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x20+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r8 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r8, %r8 ; \ + addq %rbx, %r14 ; \ + adcq %rbp, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq %r14, 480(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x38+P1; \ + xorq %r9, %r9 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r9 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r9 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x20+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r9 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r9, %r9 ; \ + 
addq %rbx, %r15 ; \ + adcq %rbp, %r8 ; \ + adcq $0x0, %r9 ; \ + movq %r15, 488(%rsp) ; \ + movq P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r10, %r10 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x8+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r10 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r10 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r10, %r10 ; \ + addq %rbx, %r8 ; \ + adcq %rbp, %r9 ; \ + adcq $0x0, %r10 ; \ + movq 0x20+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + movq %r8, 496(%rsp) ; \ + movq 0x8+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r11, %r11 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x10+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r11 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x28+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r11 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r11, %r11 ; \ + addq %rbx, %r9 ; \ + adcq %rbp, %r10 ; \ + adcq $0x0, %r11 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r12, %r12 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x18+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r12 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r12, %r12 ; \ + addq %rbx, %r10 ; \ + adcq %rbp, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x28+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r13, %r13 ; \ + movq %rax, %rbx ; \ + 
movq %rdx, %rbp ; \ + movq 0x20+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r13 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x30+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r13 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r13, %r13 ; \ + addq %rbx, %r11 ; \ + adcq %rbp, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x20+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r14, %r14 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x28+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r14 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r14, %r14 ; \ + addq %rbx, %r12 ; \ + adcq %rbp, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x30+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x28+P1, %rax ; \ + mulq 0x40+P1; \ + xorq %r15, %r15 ; \ + movq %rax, %rbx ; \ + movq %rdx, %rbp ; \ + movq 0x30+P1, %rax ; \ + mulq 0x38+P1; \ + addq %rax, %rbx ; \ + adcq %rdx, %rbp ; \ + adcq $0x0, %r15 ; \ + addq %rbx, %rbx ; \ + adcq %rbp, %rbp ; \ + adcq %r15, %r15 ; \ + addq %rbx, %r13 ; \ + adcq %rbp, %r14 ; \ + adcq $0x0, %r15 ; \ + xorq %r8, %r8 ; \ + movq 0x38+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x30+P1, %rax ; \ + mulq 0x40+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r8 ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq $0x0, %r8 ; \ + movq 0x38+P1, %rax ; \ + mulq 0x40+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + addq %rax, %r15 ; \ + adcq %rdx, %r8 ; \ + movq 0x40+P1, %rax ; \ + imulq %rax, %rax ; \ + addq %r8, %rax ; \ + movq 496(%rsp), %r8 ; \ + movq %r8, %rdx ; \ + andq $0x1ff, %rdx ; \ + shrdq $0x9, %r9, %r8 ; \ + shrdq $0x9, %r10, %r9 ; \ + shrdq $0x9, %r11, %r10 ; \ + shrdq $0x9, %r12, %r11 ; \ + shrdq $0x9, %r13, %r12 ; \ + shrdq $0x9, %r14, %r13 ; \ + shrdq $0x9, %r15, %r14 ; \ + shrdq $0x9, %rax, %r15 ; \ + shrq $0x9, %rax ; \ + 
addq %rax, %rdx ; \ + stc; \ + adcq 432(%rsp), %r8 ; \ + adcq 440(%rsp), %r9 ; \ + adcq 448(%rsp), %r10 ; \ + adcq 456(%rsp), %r11 ; \ + adcq 464(%rsp), %r12 ; \ + adcq 472(%rsp), %r13 ; \ + adcq 480(%rsp), %r14 ; \ + adcq 488(%rsp), %r15 ; \ + adcq $0xfffffffffffffe00, %rdx ; \ + cmc; \ + sbbq $0x0, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x18+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x20+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x28+P0 ; \ + sbbq $0x0, %r14 ; \ + movq %r14, 0x30+P0 ; \ + sbbq $0x0, %r15 ; \ + movq %r15, 0x38+P0 ; \ + sbbq $0x0, %rdx ; \ + andq $0x1ff, %rdx ; \ + movq %rdx, 0x40+P0 ; \ + +// Corresponds exactly to bignum_sub_p521 + +#define sub_p521(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rdx ; \ + sbbq 0x8+P2, %rdx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq 0x20+P1, %r10 ; \ + sbbq 0x20+P2, %r10 ; \ + movq 0x28+P1, %r11 ; \ + sbbq 0x28+P2, %r11 ; \ + movq 0x30+P1, %r12 ; \ + sbbq 0x30+P2, %r12 ; \ + movq 0x38+P1, %r13 ; \ + sbbq 0x38+P2, %r13 ; \ + movq 0x40+P1, %r14 ; \ + sbbq 0x40+P2, %r14 ; \ + sbbq $0x0, %rax ; \ + movq %rax, P0 ; \ + sbbq $0x0, %rdx ; \ + movq %rdx, 0x8+P0 ; \ + sbbq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq $0x0, %r9 ; \ + movq %r9, 0x18+P0 ; \ + sbbq $0x0, %r10 ; \ + movq %r10, 0x20+P0 ; \ + sbbq $0x0, %r11 ; \ + movq %r11, 0x28+P0 ; \ + sbbq $0x0, %r12 ; \ + movq %r12, 0x30+P0 ; \ + sbbq $0x0, %r13 ; \ + movq %r13, 0x38+P0 ; \ + sbbq $0x0, %r14 ; \ + andq $0x1ff, %r14 ; \ + movq %r14, 0x40+P0 + +// Additional macros to help with final multiplexing + +#define testzero9(P) \ + movq P, %rax ; \ + movq 8+P, %rbx ; \ + movq 16+P, %rdx ; \ + movq 24+P, %rbp ; \ + orq 32+P, %rax ; \ + orq 40+P, %rbx ; \ + orq 48+P, %rdx ; \ + orq 56+P, %rbp ; \ + orq %rbx, %rax ; \ + orq %rbp, %rdx ; \ + orq 64+P, %rax ; \ + orq %rdx, %rax 
+ +#define mux9(P0,PNE,PEQ) \ + movq PNE, %rax ; \ + movq PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, P0 ; \ + movq 8+PNE, %rax ; \ + movq 8+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+PNE, %rax ; \ + movq 16+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+PNE, %rax ; \ + movq 24+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 24+P0 ; \ + movq 32+PNE, %rax ; \ + movq 32+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 32+P0 ; \ + movq 40+PNE, %rax ; \ + movq 40+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 40+P0 ; \ + movq 48+PNE, %rax ; \ + movq 48+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 48+P0 ; \ + movq 56+PNE, %rax ; \ + movq 56+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 56+P0 ; \ + movq 64+PNE, %rax ; \ + movq 64+PEQ, %rbx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 64+P0 + +#define mux9c(P0,PNE) \ + movq PNE, %rax ; \ + movl $1, %ebx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, P0 ; \ + movq 8+PNE, %rax ; \ + movl $0, %ebx ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+PNE, %rax ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+PNE, %rax ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 24+P0 ; \ + movq 32+PNE, %rax ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 32+P0 ; \ + movq 40+PNE, %rax ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 40+P0 ; \ + movq 48+PNE, %rax ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 48+P0 ; \ + movq 56+PNE, %rax ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 56+P0 ; \ + movq 64+PNE, %rax ; \ + cmovzq %rbx, %rax ; \ + movq %rax, 64+P0 + +#define copy9(P0,P1) \ + movq P1, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + movq %rax, 24+P0 ; \ + movq 32+P1, %rax ; \ + movq %rax, 32+P0 ; \ + movq 40+P1, %rax ; \ + movq %rax, 40+P0 ; \ + movq 48+P1, %rax ; \ + movq %rax, 48+P0 ; \ + movq 56+P1, %rax ; \ + movq %rax, 56+P0 ; \ + movq 64+P1, %rax ; \ + movq %rax, 64+P0 + 
+S2N_BN_SYMBOL(p521_jmixadd_alt): + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Move the input arguments to stable places (two are already there) + + movq %rdx, input_y + +// Main code, just a sequence of basic field operations + + sqr_p521(zp2,z_1) + mul_p521(y2a,z_1,y_2) + + mul_p521(x2a,zp2,x_2) + mul_p521(y2a,zp2,y2a) + + sub_p521(xd,x2a,x_1) + sub_p521(yd,y2a,y_1) + + sqr_p521(zz,xd) + sqr_p521(ww,yd) + + mul_p521(zzx1,zz,x_1) + mul_p521(zzx2,zz,x2a) + + sub_p521(resx,ww,zzx1) + sub_p521(t1,zzx2,zzx1) + + mul_p521(resz,xd,z_1) + + sub_p521(resx,resx,zzx2) + + sub_p521(t2,zzx1,resx) + + mul_p521(t1,t1,y_1) + mul_p521(t2,yd,t2) + + sub_p521(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + testzero9(z_1) + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with an extra z = 1 +// coordinate, hence giving 0 + p2 = p2 for the final result. + + mux9 (resx,resx,x_2) + mux9 (resy,resy,y_2) + + copy9(x_3,resx) + copy9(y_3,resy) + + mux9c(z_3,resz) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif