From 3e9428996698257aa2a3b4e974f574b8479f1261 Mon Sep 17 00:00:00 2001
From: Tim Ruffing
Date: Wed, 10 May 2023 15:19:38 +0200
Subject: [PATCH 1/4] ct: Use volatile trick in scalar_cond_negate

---
 src/scalar_4x64_impl.h | 3 ++-
 src/scalar_8x32_impl.h | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h
index 60aca8c10..a48d58c21 100644
--- a/src/scalar_4x64_impl.h
+++ b/src/scalar_4x64_impl.h
@@ -180,7 +180,8 @@ static int secp256k1_scalar_is_high(const secp256k1_scalar *a) {
 static int secp256k1_scalar_cond_negate(secp256k1_scalar *r, int flag) {
     /* If we are flag = 0, mask = 00...00 and this is a no-op;
      * if we are flag = 1, mask = 11...11 and this is identical to secp256k1_scalar_negate */
-    uint64_t mask = !flag - 1;
+    volatile int vflag = flag;
+    uint64_t mask = -vflag;
     uint64_t nonzero = (secp256k1_scalar_is_zero(r) != 0) - 1;
     uint128_t t = (uint128_t)(r->d[0] ^ mask) + ((SECP256K1_N_0 + 1) & mask);
     r->d[0] = t & nonzero; t >>= 64;
diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h
index ad025cff0..d960a9bda 100644
--- a/src/scalar_8x32_impl.h
+++ b/src/scalar_8x32_impl.h
@@ -253,7 +253,8 @@ static int secp256k1_scalar_is_high(const secp256k1_scalar *a) {
 static int secp256k1_scalar_cond_negate(secp256k1_scalar *r, int flag) {
     /* If we are flag = 0, mask = 00...00 and this is a no-op;
      * if we are flag = 1, mask = 11...11 and this is identical to secp256k1_scalar_negate */
-    uint32_t mask = !flag - 1;
+    volatile int vflag = flag;
+    uint32_t mask = -vflag;
     uint32_t nonzero = 0xFFFFFFFFUL * (secp256k1_scalar_is_zero(r) == 0);
     uint64_t t = (uint64_t)(r->d[0] ^ mask) + ((SECP256K1_N_0 + 1) & mask);
     r->d[0] = t & nonzero; t >>= 32;
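
Why the volatile trick works: with a plain int, the optimizer can prove that
flag is 0 or 1 and is then free to lower the mask computation to a conditional
branch, which would leak the secret flag through timing. The following minimal
standalone sketch illustrates the before/after pattern (the sketch and the
function names mask_plain/mask_volatile are illustrative, not part of the
patch):

    #include <stdint.h>

    /* Illustrative sketch, not part of the patch. With a plain int, the
     * compiler can prove that flag is 0 or 1 and may lower the mask
     * computation to a branch, leaking flag through timing. */
    static uint64_t mask_plain(int flag) {
        uint64_t mask = !flag - 1;  /* flag = 0 -> 0, flag = 1 -> all ones */
        return mask;
    }

    /* Reading the flag through a volatile lvalue makes its value opaque
     * to the optimizer, so the computation stays branch-free data flow. */
    static uint64_t mask_volatile(int flag) {
        volatile int vflag = flag;  /* this load cannot be optimized away */
        uint64_t mask = -vflag;     /* same mapping, but the compiler no
                                     * longer knows vflag is 0 or 1 */
        return mask;
    }
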
From c8c0f55a1132c0fc9a726f0a4a1417288163b904 Mon Sep 17 00:00:00 2001
From: Tim Ruffing
Date: Wed, 10 May 2023 16:25:37 +0200
Subject: [PATCH 2/4] ct: Be cautious and use volatile trick in more "conditional" paths

- secp256k1_scalar_cadd_bit
- secp256k1_modinvXX_normalize_YY
- secp256k1_modinvXX_divsteps_ZZ
- ECMULT_CONST_TABLE_GET_GE

Even though those code locations are not problematic right now (with
current compilers), we use the volatile trick there as a precaution.
---
 src/ecmult_const_impl.h |  2 +-
 src/modinv32_impl.h     | 33 ++++++++++++++++++---------------
 src/modinv64_impl.h     | 31 +++++++++++++++++--------------
 src/scalar_4x64_impl.h  |  3 ++-
 src/scalar_8x32_impl.h  |  3 ++-
 5 files changed, 40 insertions(+), 32 deletions(-)

diff --git a/src/ecmult_const_impl.h b/src/ecmult_const_impl.h
index 12dbcc6c5..c92b2a048 100644
--- a/src/ecmult_const_impl.h
+++ b/src/ecmult_const_impl.h
@@ -29,7 +29,7 @@ static void secp256k1_ecmult_odd_multiples_table_globalz_windowa(secp256k1_ge *p
 #define ECMULT_CONST_TABLE_GET_GE(r,pre,n,w) do { \
     int m = 0; \
     /* Extract the sign-bit for a constant time absolute-value. */ \
-    int mask = (n) >> (sizeof(n) * CHAR_BIT - 1); \
+    int volatile mask = (n) >> (sizeof(n) * CHAR_BIT - 1); \
     int abs_n = ((n) + mask) ^ mask; \
     int idx_n = abs_n >> 1; \
     secp256k1_fe neg_y; \
diff --git a/src/modinv32_impl.h b/src/modinv32_impl.h
index 661c5fc04..fc16eaaad 100644
--- a/src/modinv32_impl.h
+++ b/src/modinv32_impl.h
@@ -64,7 +64,7 @@ static void secp256k1_modinv32_normalize_30(secp256k1_modinv32_signed30 *r, int3
     const int32_t M30 = (int32_t)(UINT32_MAX >> 2);
     int32_t r0 = r->v[0], r1 = r->v[1], r2 = r->v[2], r3 = r->v[3], r4 = r->v[4],
             r5 = r->v[5], r6 = r->v[6], r7 = r->v[7], r8 = r->v[8];
-    int32_t cond_add, cond_negate;
+    volatile int32_t cond_add, cond_negate;
 
 #ifdef VERIFY
     /* Verify that all limbs are in range (-2^30,2^30). */
@@ -186,7 +186,8 @@ static int32_t secp256k1_modinv32_divsteps_30(int32_t zeta, uint32_t f0, uint32_
      * being inside [-2^31,2^31) means that casting to signed works correctly. */
     uint32_t u = 1, v = 0, q = 0, r = 1;
-    uint32_t c1, c2, f = f0, g = g0, x, y, z;
+    volatile uint32_t c1, c2;
+    uint32_t mask1, mask2, f = f0, g = g0, x, y, z;
     int i;
 
     for (i = 0; i < 30; ++i) {
@@ -195,23 +196,25 @@ static int32_t secp256k1_modinv32_divsteps_30(int32_t zeta, uint32_t f0, uint32_
         VERIFY_CHECK((q * f0 + r * g0) == g << i);
         /* Compute conditional masks for (zeta < 0) and for (g & 1). */
         c1 = zeta >> 31;
-        c2 = -(g & 1);
+        mask1 = c1;
+        c2 = g & 1;
+        mask2 = -c2;
         /* Compute x,y,z, conditionally negated versions of f,u,v. */
-        x = (f ^ c1) - c1;
-        y = (u ^ c1) - c1;
-        z = (v ^ c1) - c1;
+        x = (f ^ mask1) - mask1;
+        y = (u ^ mask1) - mask1;
+        z = (v ^ mask1) - mask1;
         /* Conditionally add x,y,z to g,q,r. */
-        g += x & c2;
-        q += y & c2;
-        r += z & c2;
-        /* In what follows, c1 is a condition mask for (zeta < 0) and (g & 1). */
-        c1 &= c2;
+        g += x & mask2;
+        q += y & mask2;
+        r += z & mask2;
+        /* In what follows, mask1 is a condition mask for (zeta < 0) and (g & 1). */
+        mask1 &= mask2;
         /* Conditionally change zeta into -zeta-2 or zeta-1. */
-        zeta = (zeta ^ c1) - 1;
+        zeta = (zeta ^ mask1) - 1;
         /* Conditionally add g,q,r to f,u,v. */
-        f += g & c1;
-        u += q & c1;
-        v += r & c1;
+        f += g & mask1;
+        u += q & mask1;
+        v += r & mask1;
         /* Shifts */
         g >>= 1;
         u <<= 1;
diff --git a/src/modinv64_impl.h b/src/modinv64_impl.h
index 0743a9c82..905ef47be 100644
--- a/src/modinv64_impl.h
+++ b/src/modinv64_impl.h
@@ -69,7 +69,7 @@ static int secp256k1_modinv64_mul_cmp_62(const secp256k1_modinv64_signed62 *a, i
 static void secp256k1_modinv64_normalize_62(secp256k1_modinv64_signed62 *r, int64_t sign, const secp256k1_modinv64_modinfo *modinfo) {
     const int64_t M62 = (int64_t)(UINT64_MAX >> 2);
     int64_t r0 = r->v[0], r1 = r->v[1], r2 = r->v[2], r3 = r->v[3], r4 = r->v[4];
-    int64_t cond_add, cond_negate;
+    volatile int64_t cond_add, cond_negate;
 
 #ifdef VERIFY
     /* Verify that all limbs are in range (-2^62,2^62). */
@@ -165,7 +165,8 @@ static int64_t secp256k1_modinv64_divsteps_59(int64_t zeta, uint64_t f0, uint64_
      * being inside [-2^63,2^63) means that casting to signed works correctly. */
     uint64_t u = 8, v = 0, q = 0, r = 8;
-    uint64_t c1, c2, f = f0, g = g0, x, y, z;
+    volatile uint64_t c1, c2;
+    uint64_t mask1, mask2, f = f0, g = g0, x, y, z;
     int i;
 
     for (i = 3; i < 62; ++i) {
@@ -174,23 +175,25 @@ static int64_t secp256k1_modinv64_divsteps_59(int64_t zeta, uint64_t f0, uint64_
         VERIFY_CHECK((q * f0 + r * g0) == g << i);
         /* Compute conditional masks for (zeta < 0) and for (g & 1). */
         c1 = zeta >> 63;
-        c2 = -(g & 1);
+        mask1 = c1;
+        c2 = g & 1;
+        mask2 = -c2;
         /* Compute x,y,z, conditionally negated versions of f,u,v. */
-        x = (f ^ c1) - c1;
-        y = (u ^ c1) - c1;
-        z = (v ^ c1) - c1;
+        x = (f ^ mask1) - mask1;
+        y = (u ^ mask1) - mask1;
+        z = (v ^ mask1) - mask1;
         /* Conditionally add x,y,z to g,q,r. */
-        g += x & c2;
-        q += y & c2;
-        r += z & c2;
-        /* In what follows, c1 is a condition mask for (zeta < 0) and (g & 1). */
-        c1 &= c2;
+        g += x & mask2;
+        q += y & mask2;
+        r += z & mask2;
+        /* In what follows, mask1 is a condition mask for (zeta < 0) and (g & 1). */
+        mask1 &= mask2;
         /* Conditionally change zeta into -zeta-2 or zeta-1. */
-        zeta = (zeta ^ c1) - 1;
+        zeta = (zeta ^ mask1) - 1;
         /* Conditionally add g,q,r to f,u,v. */
-        f += g & c1;
-        u += q & c1;
-        v += r & c1;
+        f += g & mask1;
+        u += q & mask1;
+        v += r & mask1;
         /* Shifts */
         g >>= 1;
         u <<= 1;
diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h
index a48d58c21..4403e8a8d 100644
--- a/src/scalar_4x64_impl.h
+++ b/src/scalar_4x64_impl.h
@@ -110,8 +110,9 @@ static int secp256k1_scalar_add(secp256k1_scalar *r, const secp256k1_scalar *a,
 static void secp256k1_scalar_cadd_bit(secp256k1_scalar *r, unsigned int bit, int flag) {
     uint128_t t;
+    volatile int vflag = flag;
     VERIFY_CHECK(bit < 256);
-    bit += ((uint32_t) flag - 1) & 0x100; /* forcing (bit >> 6) > 3 makes this a noop */
+    bit += ((uint32_t) vflag - 1) & 0x100; /* forcing (bit >> 6) > 3 makes this a noop */
     t = (uint128_t)r->d[0] + (((uint64_t)((bit >> 6) == 0)) << (bit & 0x3F));
     r->d[0] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
     t += (uint128_t)r->d[1] + (((uint64_t)((bit >> 6) == 1)) << (bit & 0x3F));
diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h
index d960a9bda..b96e0335a 100644
--- a/src/scalar_8x32_impl.h
+++ b/src/scalar_8x32_impl.h
@@ -153,8 +153,9 @@ static int secp256k1_scalar_add(secp256k1_scalar *r, const secp256k1_scalar *a,
 static void secp256k1_scalar_cadd_bit(secp256k1_scalar *r, unsigned int bit, int flag) {
     uint64_t t;
+    volatile int vflag = flag;
     VERIFY_CHECK(bit < 256);
-    bit += ((uint32_t) flag - 1) & 0x100; /* forcing (bit >> 5) > 7 makes this a noop */
+    bit += ((uint32_t) vflag - 1) & 0x100; /* forcing (bit >> 5) > 7 makes this a noop */
     t = (uint64_t)r->d[0] + (((uint32_t)((bit >> 5) == 0)) << (bit & 0x1F));
     r->d[0] = t & 0xFFFFFFFFULL; t >>= 32;
     t += (uint64_t)r->d[1] + (((uint32_t)((bit >> 5) == 1)) << (bit & 0x1F));
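
The divsteps changes keep the same branch-free idioms and only split each
secret condition bit from its expanded mask via a volatile variable. A small
standalone sketch of those idioms (illustrative only; parity_mask, cond_negate
and cond_add are hypothetical helper names, not part of the patch):

    #include <stdint.h>

    /* Expand the secret parity bit of g into an all-zero or all-one mask.
     * The volatile variable separates the secret bit (c2) from the
     * expanded mask (mask2), mirroring the patched divsteps code. */
    static uint32_t parity_mask(uint32_t g) {
        volatile uint32_t c2 = g & 1;  /* opaque to the optimizer */
        uint32_t mask2 = -c2;          /* 0 -> 0x00000000, 1 -> 0xFFFFFFFF */
        return mask2;
    }

    /* Branch-free conditional negation: mask = 0 returns x unchanged;
     * mask = 0xFFFFFFFF returns -x, since (x ^ ~0) - ~0 = ~x + 1 = -x. */
    static uint32_t cond_negate(uint32_t x, uint32_t mask) {
        return (x ^ mask) - mask;
    }

    /* Branch-free conditional addition: x is added only if mask is all-one. */
    static uint32_t cond_add(uint32_t acc, uint32_t x, uint32_t mask) {
        return acc + (x & mask);
    }
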
From 56a5d41429a4daed2b02b59c45022044c3575955 Mon Sep 17 00:00:00 2001
From: Pieter Wuille
Date: Fri, 12 May 2023 05:15:05 -0400
Subject: [PATCH 3/4] Bugfix: mark outputs as early clobber in scalar x86_64 asm

In the existing code, the compiler is allowed to allocate the RSI
register for outputs m0, m1, or m2, which are written to before the
input in RSI is read from. Fix this by marking them as early clobber.

Reported by ehoffman2 in
https://github.com/bitcoin-core/secp256k1/issues/766
---
 src/scalar_4x64_impl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h
index 4403e8a8d..426c41f1a 100644
--- a/src/scalar_4x64_impl.h
+++ b/src/scalar_4x64_impl.h
@@ -389,7 +389,7 @@ static void secp256k1_scalar_reduce_512(secp256k1_scalar *r, const uint64_t *l)
     "movq %%r10, %q5\n"
     /* extract m6 */
     "movq %%r8, %q6\n"
-    : "=g"(m0), "=g"(m1), "=g"(m2), "=g"(m3), "=g"(m4), "=g"(m5), "=g"(m6)
+    : "=&g"(m0), "=&g"(m1), "=&g"(m2), "=g"(m3), "=g"(m4), "=g"(m5), "=g"(m6)
     : "S"(l), "i"(SECP256K1_N_C_0), "i"(SECP256K1_N_C_1)
     : "rax", "rdx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "cc");
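
The register-allocation hazard fixed here can be reproduced in miniature. In
the hypothetical example below (not the patched code), the asm writes its
output before the last read of its input; without the early-clobber marker "&"
the compiler may assign both operands the same register, and the first movq
would destroy the input:

    #include <stdint.h>

    /* Hypothetical miniature of the bug class, not the patched code.
     * "=&r" forces `out` into a register distinct from `in`, so the
     * early write to %0 cannot clobber %1. */
    static uint64_t add_41(uint64_t in) {
        uint64_t out;
        __asm__ ("movq $41, %0\n"  /* output written first... */
                 "addq %1, %0\n"   /* ...input read afterwards */
                 : "=&r"(out)
                 : "r"(in)
                 : "cc");
        return out;
    }
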
From 39407c3f5999aa10e1470bc9eae8f63800a63e51 Mon Sep 17 00:00:00 2001
From: Pieter Wuille
Date: Fri, 12 May 2023 05:17:11 -0400
Subject: [PATCH 4/4] Mark stack variables as early clobber for technical correctness

In the field 5x52 asm for x86_64, stack variables are provided as
outputs. The existing inputs are all forcibly allocated to registers,
so they cannot coincide, but mark them as early clobber anyway to make
this clearer.
---
 src/field_5x52_asm_impl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/field_5x52_asm_impl.h b/src/field_5x52_asm_impl.h
index a2118044a..51e35c96b 100644
--- a/src/field_5x52_asm_impl.h
+++ b/src/field_5x52_asm_impl.h
@@ -278,7 +278,7 @@ __asm__ __volatile__(
     "addq %%rsi,%%r8\n"
     /* r[4] = c */
     "movq %%r8,32(%%rdi)\n"
-: "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3)
+: "+S"(a), "=&m"(tmp1), "=&m"(tmp2), "=&m"(tmp3)
 : "b"(b), "D"(r)
 : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
 );
@@ -493,7 +493,7 @@ __asm__ __volatile__(
     "addq %%rsi,%%r8\n"
     /* r[4] = c */
     "movq %%r8,32(%%rdi)\n"
-: "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3)
+: "+S"(a), "=&m"(tmp1), "=&m"(tmp2), "=&m"(tmp3)
 : "D"(r)
 : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
 );
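
For the field asm, the early clobber is purely for technical correctness: "S"
and "D" pin the pointer operands to RSI and RDI, so a stack-slot output can
never coincide with them. A hypothetical reduced example (not the patched
code) showing the shape of such an operand list:

    #include <stdint.h>

    /* Hypothetical reduced example, not the patched code. The scratch
     * slot tmp is written before the inputs are dead; "=&m" records
     * this, although the register-pinned operands cannot alias it. */
    static void copy_limb_via_scratch(const uint64_t *a, uint64_t *r) {
        uint64_t tmp;
        __asm__ __volatile__(
            "movq 0(%%rsi), %%rax\n"  /* load a[0] */
            "movq %%rax, %1\n"        /* early write to the scratch slot */
            "movq %1, %%rax\n"        /* reload from the scratch slot */
            "movq %%rax, 0(%%rdi)\n"  /* store to r[0] */
            : "+S"(a), "=&m"(tmp)
            : "D"(r)
            : "%rax", "memory");
    }
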