From 47a765238473d9539779c72abffcf61e1d7dd7ec Mon Sep 17 00:00:00 2001
From: Peter Dettman <peter.dettman@gmail.com>
Date: Tue, 4 Nov 2014 19:16:55 +0700
Subject: [PATCH] Use Co-Z arithmetic for precomputations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Selected Co-Z formulas from "Scalar Multiplication on Weierstraß Elliptic Curves from Co-Z Arithmetic" (Goundar, Joye, et al.) added as group methods with new type secp256k1_coz_t.
- Co-Z methods used for A and G point precomputations.
- WINDOW_A size increased to 6 since the precomputation is much faster per-point.
- DBLU cost: 3M+4S, ZADDU cost: 5M+2S.
- Take advantage of z-ratios from Co-Z to speed up table inversion.
---
 src/ecmult_impl.h |  33 ++++++++-------
 src/group.h       |  23 ++++++++++
 src/group_impl.h  | 104 +++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 139 insertions(+), 21 deletions(-)

diff --git a/src/ecmult_impl.h b/src/ecmult_impl.h
index 345cfae733..dfe99741a5 100644
--- a/src/ecmult_impl.h
+++ b/src/ecmult_impl.h
@@ -12,7 +12,7 @@
 #include "ecmult.h"
 
 /* optimal for 128-bit and 256-bit exponents. */
-#define WINDOW_A 5
+#define WINDOW_A 6
 
 /** larger numbers may result in slightly better performance, at the cost of
     exponentially larger precomputed tables. */
@@ -24,6 +24,9 @@
 #define WINDOW_G 16
 #endif
 
+/** The number of entries a table with precomputed multiples needs to have. */
+#define ECMULT_TABLE_SIZE(w) (1 << ((w)-2))
+
 /** Fill a table 'pre' with precomputed odd multiples of a. W determines the size of the table.
  *  pre will contains the values [1*a,3*a,5*a,...,(2^(w-1)-1)*a], so it needs place for
  *  2^(w-2) entries.
@@ -36,28 +39,26 @@
  *  To compute a*P + b*G, we use the jacobian version for P, and the affine version for G, as
  *  G is constant, so it only needs to be done once in advance.
  */
-static void secp256k1_ecmult_table_precomp_gej_var(secp256k1_gej_t *pre, const secp256k1_gej_t *a, int w) {
-    pre[0] = *a;
-    secp256k1_gej_t d; secp256k1_gej_double_var(&d, &pre[0]);
-    for (int i=1; i<(1 << (w-2)); i++)
-        secp256k1_gej_add_var(&pre[i], &d, &pre[i-1]);
+static void secp256k1_ecmult_table_precomp_gej_var(secp256k1_gej_t *prej, const secp256k1_gej_t *a, int w) {
+    secp256k1_coz_t d; secp256k1_coz_dblu_var(&d, &prej[0], a); /* d = 2*a, prej[0] = a rescaled so that it is co-z with d */
+    secp256k1_fe_t zr; /* receives the z-ratio of every addition; the value is not used in this function */
+    for (int i=1; i<ECMULT_TABLE_SIZE(w); i++)
+        secp256k1_coz_zaddu_var(&prej[i], &d, &zr, &prej[i-1]); /* prej[i] = prej[i-1] + d; d is updated to stay co-z with prej[i] */
 }
 
 static void secp256k1_ecmult_table_precomp_ge_var(secp256k1_ge_t *pre, const secp256k1_gej_t *a, int w) {
-    const int table_size = 1 << (w-2);
+    const int table_size = ECMULT_TABLE_SIZE(w);
     secp256k1_gej_t *prej = checked_malloc(sizeof(secp256k1_gej_t) * table_size);
-    prej[0] = *a;
-    secp256k1_gej_t d; secp256k1_gej_double_var(&d, a);
-    for (int i=1; i<table_size; i++) {
-        secp256k1_gej_add_var(&prej[i], &d, &prej[i-1]);
-    }
-    secp256k1_ge_set_all_gej_var(table_size, pre, prej);
+    secp256k1_fe_t *zr = checked_malloc(sizeof(secp256k1_fe_t) * table_size); /* zr[i] = prej[i+1].z / prej[i].z; last slot = 1 / prej[table_size-1].z */
+    secp256k1_coz_t d; secp256k1_coz_dblu_var(&d, &prej[0], a); /* d = 2*a, prej[0] = a rescaled so that it is co-z with d */
+    for (int i=1; i<table_size; i++)
+        secp256k1_coz_zaddu_var(&prej[i], &d, &zr[i-1], &prej[i-1]); /* prej[i] = prej[i-1] + d, recording the z-ratio of this step */
+    secp256k1_fe_inv_var(&zr[table_size-1], &prej[table_size-1].z); /* the only field inversion performed for the whole table */
+    secp256k1_ge_set_table_gej(table_size, pre, prej, zr); /* affine conversion of all entries using the recorded z-ratios */
+    free(zr);
     free(prej);
 }
 
-/** The number of entries a table with precomputed multiples needs to have. */
-#define ECMULT_TABLE_SIZE(w) (1 << ((w)-2))
-
 /** The following two macro retrieves a particular odd multiple from a table
  *  of precomputed multiples. */
 #define ECMULT_TABLE_GET(r,pre,n,w,neg) do { \
diff --git a/src/group.h b/src/group.h
index 6dea6bb5ac..909e0dd521 100644
--- a/src/group.h
+++ b/src/group.h
@@ -25,6 +25,14 @@ typedef struct {
     int infinity; /* whether this represents the point at infinity */
 } secp256k1_gej_t;
 
+/** A group element of the secp256k1 curve with an implicit z coordinate and infinity flag: an instance
+ *  is always "co-z" with some instance of secp256k1_gej_t, from which it inherits both. NOTE(review):
+ *  group_impl.h casts secp256k1_gej_t* to secp256k1_coz_t*, presuming gej also starts with x, y - confirm. */
+typedef struct {
+    secp256k1_fe_t x; /* actual X: x/z^2 (z implied) */
+    secp256k1_fe_t y; /* actual Y: y/z^3 (z implied) */
+} secp256k1_coz_t;
+
 /** Global constants related to the group */
 typedef struct {
     secp256k1_ge_t g; /* the generator point */
@@ -70,6 +78,13 @@ static void secp256k1_ge_set_gej(secp256k1_ge_t *r, secp256k1_gej_t *a);
 /** Set a batch of group elements equal to the inputs given in jacobian coordinates */
 static void secp256k1_ge_set_all_gej_var(size_t len, secp256k1_ge_t r[len], const secp256k1_gej_t a[len]);
 
+/** Set a batch of group elements equal to the inputs given in jacobian coordinates (with known
+ *  z-ratios). zr must satisfy mul(a[i].z, zr[i]) == a[i+1].z for 0 <= i < len-1, and
+ *  mul(a[len-1].z, zr[len-1]) == 1 (i.e. the last zr element would normally be calculated by
+ *  a field inversion of the last z element). Does nothing when len is 0. */
+static void secp256k1_ge_set_table_gej(size_t len, secp256k1_ge_t r[len], const secp256k1_gej_t a[len],
+    const secp256k1_fe_t zr[len]);
+
 
 /** Set a group element (jacobian) equal to the point at infinity. */
 static void secp256k1_gej_set_infinity(secp256k1_gej_t *r);
@@ -117,4 +132,12 @@ static void secp256k1_gej_clear(secp256k1_gej_t *r);
 /** Clear a secp256k1_ge_t to prevent leaking sensitive information. */
 static void secp256k1_ge_clear(secp256k1_ge_t *r);
 
+/** Set r equal to the double of a, and ra equal to a, such that r is co-z with ra. If a is infinity, only ra->infinity is set. */
+static void secp256k1_coz_dblu_var(secp256k1_coz_t *r, secp256k1_gej_t *ra, const secp256k1_gej_t *a);
+
+/** Set r equal to the sum of ra and b. ra is initially co-z with b and finally co-z with r. rzr
+    returns the ratio r->z:b->z, or 0 when r is infinity. rzr must not alias r->z. */
+static void secp256k1_coz_zaddu_var(secp256k1_gej_t *r, secp256k1_coz_t *ra, secp256k1_fe_t *rzr,
+    const secp256k1_gej_t *b);
+
 #endif
diff --git a/src/group_impl.h b/src/group_impl.h
index fef06df289..8aadade0e0 100644
--- a/src/group_impl.h
+++ b/src/group_impl.h
@@ -13,6 +13,16 @@
 #include "field.h"
 #include "group.h"
 
+/* Set r to the affine form of a, given zi (expected to be the inverse of a->z). TODO Consider whether this should be in the API. */
+static void secp256k1_ge_set_gej_zinv(secp256k1_ge_t *r, const secp256k1_gej_t *a,
+    const secp256k1_fe_t *zi) {
+    secp256k1_fe_t zi2; secp256k1_fe_sqr(&zi2, zi); /* zi2 = zi^2 */
+    secp256k1_fe_t zi3; secp256k1_fe_mul(&zi3, &zi2, zi); /* zi3 = zi^3 */
+    secp256k1_fe_mul(&r->x, &a->x, &zi2); /* affine x = a->x * zi^2 */
+    secp256k1_fe_mul(&r->y, &a->y, &zi3); /* affine y = a->y * zi^3 */
+    r->infinity = a->infinity; /* flag is copied; x and y are computed even when a is infinity */
+}
+
 static void secp256k1_ge_set_infinity(secp256k1_ge_t *r) {
     r->infinity = 1;
 }
@@ -98,16 +108,26 @@ static void secp256k1_ge_set_all_gej_var(size_t len, secp256k1_ge_t r[len], cons
     for (size_t i=0; i<len; i++) {
         r[i].infinity = a[i].infinity;
         if (!a[i].infinity) {
-            secp256k1_fe_t *zi = &azi[count++];
-            secp256k1_fe_t zi2; secp256k1_fe_sqr(&zi2, zi);
-            secp256k1_fe_t zi3; secp256k1_fe_mul(&zi3, &zi2, zi);
-            secp256k1_fe_mul(&r[i].x, &a[i].x, &zi2);
-            secp256k1_fe_mul(&r[i].y, &a[i].y, &zi3);
+            secp256k1_ge_set_gej_zinv(&r[i], &a[i], &azi[count++]);
         }
     }
     free(azi);
 }
 
+static void secp256k1_ge_set_table_gej(size_t len, secp256k1_ge_t r[len], const secp256k1_gej_t a[len],
+    const secp256k1_fe_t zr[len])
+{ /* Converts a[0..len-1] to affine in r, walking backwards so that each 1/a[i].z costs one multiplication. */
+    if (len < 1)
+        return;
+    int i = len; /* NOTE(review): size_t is narrowed to int here; assumes len fits in an int */
+    secp256k1_fe_t zi = zr[--i]; /* zr[len-1] is expected to be 1/a[len-1].z (see the declaration in group.h) */
+    secp256k1_ge_set_gej_zinv(&r[i], &a[i], &zi);
+    while (--i >= 0) {
+        secp256k1_fe_mul(&zi, &zi, &zr[i]); /* (1/a[i+1].z) * zr[i] == 1/a[i].z */
+        secp256k1_ge_set_gej_zinv(&r[i], &a[i], &zi);
+    }
+}
+
 static void secp256k1_gej_set_infinity(secp256k1_gej_t *r) {
     r->infinity = 1;
     secp256k1_fe_set_int(&r->x, 0);
@@ -402,6 +422,80 @@ static void secp256k1_gej_mul_lambda(secp256k1_gej_t *r, const secp256k1_gej_t *
 }
 #endif
 
+static void secp256k1_coz_dblu_impl_var(secp256k1_coz_t *r, secp256k1_coz_t *ra, secp256k1_fe_t *rzr,
+    const secp256k1_gej_t *a) { /* r = 2*a and ra = a, both expressed for z = a->z * (*rzr); z itself is left to the caller */
+    secp256k1_fe_t E; secp256k1_fe_sqr(&E, &a->y); /* E = Y^2 */
+    secp256k1_fe_t L; secp256k1_fe_sqr(&L, &E); /* L = E^2 = Y^4 */
+    secp256k1_fe_t M; secp256k1_fe_sqr(&M, &a->x); secp256k1_fe_mul_int(&M, 3); /* M = 3*X^2 */
+    secp256k1_fe_t *S = &ra->x; secp256k1_fe_mul(S, &a->x, &E); secp256k1_fe_mul_int(S, 4); /* S = 4*X*Y^2 = X*(2Y)^2, stored directly in ra->x */
+    secp256k1_fe_normalize_weak(S);
+    *rzr = a->y; secp256k1_fe_mul_int(rzr, 2); /* rzr = 2*Y */
+    secp256k1_fe_t t; secp256k1_fe_negate(&t, S, 1); secp256k1_fe_mul_int(&t, 2); /* t = -2*S */
+    secp256k1_fe_sqr(&r->x, &M); secp256k1_fe_add(&r->x, &t); /* r->x = M^2 - 2*S */
+    secp256k1_fe_negate(&t, &r->x, 5); secp256k1_fe_add(&t, S); /* t = S - r->x */
+    secp256k1_fe_mul(&r->y, &M, &t); /* r->y = M*(S - r->x), 8*L is subtracted below */
+    ra->y = L; secp256k1_fe_mul_int(&ra->y, 8); secp256k1_fe_normalize_weak(&ra->y); /* ra->y = 8*L = Y*(2Y)^3 */
+    secp256k1_fe_negate(&t, &ra->y, 1); secp256k1_fe_add(&r->y, &t); /* r->y = M*(S - r->x) - 8*L */
+}
+
+static void secp256k1_coz_dblu_var(secp256k1_coz_t *r, secp256k1_gej_t *ra, const secp256k1_gej_t *a) {
+    ra->infinity = a->infinity; /* r has no flag of its own; it shares the one stored in ra */
+    if (a->infinity) {
+        return; /* r and the coordinates of ra are left untouched in this case */
+    }
+    secp256k1_fe_t zr; /* ratio between the new shared z and a->z */
+    secp256k1_coz_dblu_impl_var(r, (secp256k1_coz_t*)ra, &zr, a); /* fills ra->x, ra->y through the cast; NOTE(review): presumes gej starts with x, y - confirm */
+    secp256k1_fe_mul(&ra->z, &a->z, &zr); /* shared z of r and ra = a->z * zr */
+}
+
+static void secp256k1_coz_zaddu_var(secp256k1_gej_t *r, secp256k1_coz_t *ra, secp256k1_fe_t *rzr,
+    const secp256k1_gej_t *b) {
+    VERIFY_CHECK(rzr != &r->z); /* the z-ratio output must not alias r->z */
+    /* Note that when b is infinity, ra is also infinity per the co-z definition */
+    r->infinity = b->infinity;
+    if (b->infinity) {
+        secp256k1_fe_set_int(rzr, 0); /* no meaningful ratio exists; report 0 */
+        return;
+    }
+
+    secp256k1_fe_t X1 = ra->x; secp256k1_fe_normalize_weak(&X1); /* local copies, so ra and r can be overwritten below */
+    secp256k1_fe_t Y1 = ra->y; secp256k1_fe_normalize_weak(&Y1);
+    secp256k1_fe_t X2 = b->x; secp256k1_fe_normalize_weak(&X2);
+    secp256k1_fe_t Y2 = b->y; secp256k1_fe_normalize_weak(&Y2);
+
+    secp256k1_fe_t dX; secp256k1_fe_negate(&dX, &X2, 1); secp256k1_fe_add(&dX, &X1); /* dX = X1 - X2 */
+    secp256k1_fe_t dY; secp256k1_fe_negate(&dY, &Y1, 1); secp256k1_fe_add(&dY, &Y2); /* dY = Y2 - Y1 */
+
+    if (secp256k1_fe_normalizes_to_zero_var(&dX)) { /* same x: ra is either equal to b or its negation */
+        if (secp256k1_fe_normalizes_to_zero_var(&dY)) {
+            secp256k1_coz_dblu_impl_var((secp256k1_coz_t*)r, ra, rzr, b); /* equal points: r = 2*b, ra = b rescaled, rzr = 2*b->y */
+            secp256k1_fe_mul(&r->z, &b->z, rzr);
+        } else {
+            r->infinity = 1; /* opposite points: the sum is infinity */
+            secp256k1_fe_set_int(rzr, 0);
+        }
+        return;
+    }
+
+    secp256k1_fe_t C; secp256k1_fe_sqr(&C, &dX); /* C = dX^2 */
+    secp256k1_fe_t D; secp256k1_fe_sqr(&D, &dY); /* D = dY^2 */
+
+    secp256k1_fe_t W1; secp256k1_fe_mul(&W1, &X1, &C); ra->x = W1; /* W1 = X1*C, which is also the updated ra->x */
+    secp256k1_fe_t W2; secp256k1_fe_mul(&W2, &X2, &C); /* W2 = X2*C */
+
+    secp256k1_fe_negate(&W1, &W1, 1); /* from here on W1 holds -(X1*C) */
+    secp256k1_fe_negate(&r->x, &W2, 1); secp256k1_fe_add(&r->x, &W1); secp256k1_fe_add(&r->x, &D); /* r->x = D - X1*C - X2*C */
+
+    secp256k1_fe_add(&W2, &W1); /* W2 = X2*C - X1*C */
+    secp256k1_fe_t A1; secp256k1_fe_mul(&A1, &W2, &Y1); secp256k1_fe_negate(&ra->y, &A1, 1); /* A1 = Y1*(X2*C - X1*C); updated ra->y = -A1 */
+
+    r->y = r->x; secp256k1_fe_add(&r->y, &W1); secp256k1_fe_mul(&r->y, &r->y, &dY); /* r->y = (r->x - X1*C) * dY */
+    secp256k1_fe_add(&r->y, &A1); /* r->y = (r->x - X1*C)*dY + A1 */
+
+    secp256k1_fe_mul(&r->z, &b->z, &dX); /* r->z = b->z * dX */
+    *rzr = dX; /* hence the ratio r->z : b->z is dX */
+}
+
 static void secp256k1_ge_start(void) {
     static const unsigned char secp256k1_ge_consts_g_x[] = {
         0x79,0xBE,0x66,0x7E,0xF9,0xDC,0xBB,0xAC,