diff --git a/src/ecmult_gen_impl.h b/src/ecmult_gen_impl.h
index 5a5b16ce14..ac42961ec7 100644
--- a/src/ecmult_gen_impl.h
+++ b/src/ecmult_gen_impl.h
@@ -50,7 +50,7 @@ static void secp256k1_ecmult_gen_start(void) {
     VERIFY_CHECK(secp256k1_ge_set_xo_var(&nums_ge, &nums_x, 0));
     secp256k1_gej_set_ge(&nums_gej, &nums_ge);
     /* Add G to make the bits in x uniformly distributed. */
-    secp256k1_gej_add_ge_var(&nums_gej, &nums_gej, g);
+    secp256k1_gej_add_ge_var(&nums_gej, &nums_gej, NULL, g);
 }
 
 /* compute prec. */
diff --git a/src/ecmult_impl.h b/src/ecmult_impl.h
index 1f17a0d8a6..6498ecca97 100644
--- a/src/ecmult_impl.h
+++ b/src/ecmult_impl.h
@@ -12,7 +12,7 @@
 #include "ecmult.h"
 
 /* optimal for 128-bit and 256-bit exponents. */
-#define WINDOW_A 6
+#define WINDOW_A 5
 
 /** larger numbers may result in slightly better performance, at the cost of
     exponentially larger precomputed tables. */
@@ -46,6 +46,46 @@ static void secp256k1_ecmult_table_precomp_gej_var(secp256k1_gej_t *prej, const
         secp256k1_coz_zaddu_var(&prej[i], &d, &zr, &prej[i-1]);
 }
 
+static void secp256k1_ecmult_table_precomp_coz_var(secp256k1_ge_t *pre, secp256k1_fe_t *rz, const secp256k1_gej_t *a, int w) {
+    CHECK(!a->infinity);
+
+    const int table_size = ECMULT_TABLE_SIZE(w);
+
+    /* Run the basic co-Z ladder and collect the z-ratios. */
+    secp256k1_gej_t prej[table_size];
+    secp256k1_fe_t zr[table_size-1];
+    secp256k1_coz_t d; secp256k1_coz_dblu_var(&d, &prej[0], a);
+    for (int i=1; i<table_size; i++)
+        secp256k1_coz_zaddu_var(&prej[i], &d, &zr[i-1], &prej[i-1]);
+
+    /* The z-coordinate of the last point serves as the global z for the
+       whole table; the last entry itself therefore needs no rescaling. */
+    *rz = prej[table_size-1].z;
+    pre[table_size-1].x = prej[table_size-1].x;
+    pre[table_size-1].y = prej[table_size-1].y;
+    pre[table_size-1].infinity = 0;
+
+    /* Work backwards, accumulating the z-ratios into zs so that at index
+       j we have zs = rz / prej[j].z, and rescale x and y accordingly. */
+    secp256k1_fe_t zs; secp256k1_fe_set_int(&zs, 1);
+    for (int j = table_size - 2; j >= 0; j--) {
+        secp256k1_fe_mul(&zs, &zs, &zr[j]);
+        secp256k1_fe_t zs2; secp256k1_fe_sqr(&zs2, &zs);
+        secp256k1_fe_t zs3; secp256k1_fe_mul(&zs3, &zs2, &zs);
+        secp256k1_fe_mul(&pre[j].x, &prej[j].x, &zs2);
+        secp256k1_fe_mul(&pre[j].y, &prej[j].y, &zs3);
+        pre[j].infinity = 0;
+
+#ifdef VERIFY
+        secp256k1_fe_t z; secp256k1_fe_mul(&z, &zs, &prej[j].z); secp256k1_fe_normalize(&z);
+        VERIFY_CHECK(secp256k1_fe_equal(&z, rz));
+#endif
+    }
+}
+
 static void secp256k1_ecmult_table_precomp_ge_var(secp256k1_ge_t *pre, const secp256k1_gej_t *a, int w) {
     const int table_size = ECMULT_TABLE_SIZE(w);
     secp256k1_gej_t *prej = checked_malloc(sizeof(secp256k1_gej_t) * table_size);
@@ -185,13 +225,14 @@ static void secp256k1_ecmult(secp256k1_gej_t *r, const secp256k1_gej_t *a, const
 #endif
 
     /* calculate odd multiples of a */
-    secp256k1_gej_t pre_a[ECMULT_TABLE_SIZE(WINDOW_A)];
-    secp256k1_ecmult_table_precomp_gej_var(pre_a, a, WINDOW_A);
+    secp256k1_fe_t Z;
+    secp256k1_ge_t pre_a[ECMULT_TABLE_SIZE(WINDOW_A)];
+    secp256k1_ecmult_table_precomp_coz_var(pre_a, &Z, a, WINDOW_A);
 
 #ifdef USE_ENDOMORPHISM
-    secp256k1_gej_t pre_a_lam[ECMULT_TABLE_SIZE(WINDOW_A)];
+    secp256k1_ge_t pre_a_lam[ECMULT_TABLE_SIZE(WINDOW_A)];
     for (int i=0; i<ECMULT_TABLE_SIZE(WINDOW_A); i++)
-        secp256k1_gej_mul_lambda(&pre_a_lam[i], &pre_a[i]);
+        secp256k1_ge_mul_lambda(&pre_a_lam[i], &pre_a[i]);
 #endif
 
     for (int i=bits-1; i>=0; i--) {
@@ -218,32 +258,36 @@ static void secp256k1_ecmult(secp256k1_gej_t *r, const secp256k1_gej_t *a, const
         int n;
 #ifdef USE_ENDOMORPHISM
         if (i < bits_na_1 && (n = wnaf_na_1[i])) {
-            ECMULT_TABLE_GET_GEJ(&tmpj, pre_a, n, WINDOW_A);
-            secp256k1_gej_add_var(r, r, &tmpj);
+            ECMULT_TABLE_GET_GE(&tmpa, pre_a, n, WINDOW_A);
+            secp256k1_gej_add_ge_var(r, r, NULL, &tmpa);
         }
         if (i < bits_na_lam && (n = wnaf_na_lam[i])) {
-            ECMULT_TABLE_GET_GEJ(&tmpj, pre_a_lam, n, WINDOW_A);
-            secp256k1_gej_add_var(r, r, &tmpj);
+            ECMULT_TABLE_GET_GE(&tmpa, pre_a_lam, n, WINDOW_A);
+            secp256k1_gej_add_ge_var(r, r, NULL, &tmpa);
         }
         if (i < bits_ng_1 && (n = wnaf_ng_1[i])) {
             ECMULT_TABLE_GET_GE(&tmpa, c->pre_g, n, WINDOW_G);
-            secp256k1_gej_add_ge_var(r, r, &tmpa);
+            secp256k1_gej_add_ge_var(r, r, &Z, &tmpa);
         }
         if (i < bits_ng_128 && (n = wnaf_ng_128[i])) {
            ECMULT_TABLE_GET_GE(&tmpa, c->pre_g_128, n, WINDOW_G);
-            secp256k1_gej_add_ge_var(r, r, &tmpa);
+            secp256k1_gej_add_ge_var(r, r, &Z, &tmpa);
         }
 #else
         if (i < bits_na && (n = wnaf_na[i])) {
-            ECMULT_TABLE_GET_GEJ(&tmpj, pre_a, n, WINDOW_A);
-            secp256k1_gej_add_var(r, r, &tmpj);
+            ECMULT_TABLE_GET_GE(&tmpa, pre_a, n, WINDOW_A);
+            secp256k1_gej_add_ge_var(r, r, NULL, &tmpa);
         }
         if (i < bits_ng && (n = wnaf_ng[i])) {
             ECMULT_TABLE_GET_GE(&tmpa, c->pre_g, n, WINDOW_G);
-            secp256k1_gej_add_ge_var(r, r, &tmpa);
+            secp256k1_gej_add_ge_var(r, r, &Z, &tmpa);
         }
 #endif
     }
+
+    if (!r->infinity) {
+        secp256k1_fe_mul(&r->z, &r->z, &Z);
+    }
 }
 
 #endif
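
Why the backwards pass in secp256k1_ecmult_table_precomp_coz_var needs no field inversion: the z-ratios from the co-Z ladder telescope, so the running product zs equals rz/prej[j].z at every index. A self-contained toy-field sketch of just that bookkeeping (not part of the patch; the prime P, the sample values, and the mulmod helper are invented for illustration):

    #include <stdio.h>
    #include <stdint.h>

    #define P 1000003ULL /* toy prime, standing in for the secp256k1 field prime */

    static uint64_t mulmod(uint64_t a, uint64_t b) { return a * b % P; }

    int main(void) {
        enum { N = 8 };
        /* z-ratios as the co-Z ladder emits them: z[i] = z[i-1] * zr[i-1]. */
        uint64_t zr[N-1] = {3, 7, 11, 19, 23, 5, 13};
        uint64_t z[N]; z[0] = 42;
        for (int i = 1; i < N; i++) z[i] = mulmod(z[i-1], zr[i-1]);
        uint64_t Z = z[N-1]; /* the global z reported through *rz */

        /* Backwards pass: zs = zr[j]*...*zr[N-2] = Z/z[j], hence
           zs*z[j] == Z for every j, without any inversion. */
        uint64_t zs = 1;
        for (int j = N-2; j >= 0; j--) {
            zs = mulmod(zs, zr[j]);
            printf("j=%d: zs*z[j] mod P = %llu (Z = %llu)\n", j,
                   (unsigned long long)mulmod(zs, z[j]), (unsigned long long)Z);
        }
        return 0;
    }

Every line of output repeats Z, which is exactly the invariant the VERIFY block in the new function checks.
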
diff --git a/src/group.h b/src/group.h
index 03dcbdcae0..e9d08ed39b 100644
--- a/src/group.h
+++ b/src/group.h
@@ -117,12 +117,15 @@ static void secp256k1_gej_add_ge(secp256k1_gej_t *r, const secp256k1_gej_t *a, c
 /** Set r equal to the sum of a and b (with b given in affine coordinates).
     This is more efficient than secp256k1_gej_add_var. It is identical to
     secp256k1_gej_add_ge but without constant-time guarantee, and b is allowed to be infinity. */
-static void secp256k1_gej_add_ge_var(secp256k1_gej_t *r, const secp256k1_gej_t *a, const secp256k1_ge_t *b);
+static void secp256k1_gej_add_ge_var(secp256k1_gej_t *r, const secp256k1_gej_t *a, const secp256k1_fe_t *azr, const secp256k1_ge_t *b);
 
 /** Get a hex representation of a point. *rlen will be overwritten with the real length. */
 static void secp256k1_gej_get_hex(char *r, int *rlen, const secp256k1_gej_t *a);
 
 #ifdef USE_ENDOMORPHISM
+/** Set r to be equal to lambda times a, where lambda is chosen in a way such that this is very fast. */
+static void secp256k1_ge_mul_lambda(secp256k1_ge_t *r, const secp256k1_ge_t *a);
+
 /** Set r to be equal to lambda times a, where lambda is chosen in a way such that this is very fast. */
 static void secp256k1_gej_mul_lambda(secp256k1_gej_t *r, const secp256k1_gej_t *a);
 #endif
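
The azr parameter added to secp256k1_gej_add_ge_var applies the curve isomorphism (x, y) -> (x*azr^2, y*azr^3) to b on the fly: folding azr into a->z once is algebraically identical to first mapping b and then running the ordinary mixed-addition formulas, since b.x * azr^2 * z^2 = b.x * (z*azr)^2, and likewise for y with cubes. A toy-field check of that identity (again not library code; P, mulmod, and the sample values are invented for the sketch):

    #include <stdio.h>
    #include <stdint.h>

    #define P 1000003ULL
    static uint64_t mulmod(uint64_t a, uint64_t b) { return a * b % P; }

    int main(void) {
        uint64_t bx = 1234, by = 5678;   /* b in true affine coordinates */
        uint64_t az = 424242, azr = 777; /* a's stored z, and the global Z ratio */

        /* Route 1: map b onto the isomorphic curve first, as the a->infinity
           branch does: (bx*azr^2, by*azr^3, 1), then plain mixed addition. */
        uint64_t azr2 = mulmod(azr, azr), azr3 = mulmod(azr2, azr);
        uint64_t z2 = mulmod(az, az), z3 = mulmod(z2, az);
        uint64_t u2_mapped = mulmod(mulmod(bx, azr2), z2);
        uint64_t s2_mapped = mulmod(mulmod(by, azr3), z3);

        /* Route 2: what the patched function does in the general case: fold
           azr into a->z once and use the ordinary formulas. */
        uint64_t azp = mulmod(az, azr);
        uint64_t z12 = mulmod(azp, azp);
        uint64_t u2_folded = mulmod(bx, z12);
        uint64_t s2_folded = mulmod(mulmod(by, z12), azp);

        printf("u2: %llu == %llu\ns2: %llu == %llu\n",
               (unsigned long long)u2_mapped, (unsigned long long)u2_folded,
               (unsigned long long)s2_mapped, (unsigned long long)s2_folded);
        return 0;
    }

This is also why secp256k1_ecmult can keep its accumulator with a z-coordinate that is missing a factor of Z until the single fix-up multiplication at the end.
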
diff --git a/src/group_impl.h b/src/group_impl.h
index 798d99cada..68b73b35b0 100644
--- a/src/group_impl.h
+++ b/src/group_impl.h
@@ -310,11 +310,18 @@ static void secp256k1_gej_add_var(secp256k1_gej_t *r, const secp256k1_gej_t *a,
     secp256k1_fe_add(&r->y, &h3);
 }
 
-static void secp256k1_gej_add_ge_var(secp256k1_gej_t *r, const secp256k1_gej_t *a, const secp256k1_ge_t *b) {
+static void secp256k1_gej_add_ge_var(secp256k1_gej_t *r, const secp256k1_gej_t *a, const secp256k1_fe_t *azr, const secp256k1_ge_t *b) {
     if (a->infinity) {
         r->infinity = b->infinity;
-        r->x = b->x;
-        r->y = b->y;
+        if (azr == NULL) {
+            r->x = b->x;
+            r->y = b->y;
+        } else {
+            secp256k1_fe_t azr2; secp256k1_fe_sqr(&azr2, azr);
+            secp256k1_fe_t azr3; secp256k1_fe_mul(&azr3, &azr2, azr);
+            secp256k1_fe_mul(&r->x, &b->x, &azr2);
+            secp256k1_fe_mul(&r->y, &b->y, &azr3);
+        }
         secp256k1_fe_set_int(&r->z, 1);
         return;
     }
@@ -323,11 +330,19 @@ static void secp256k1_gej_add_ge_var(secp256k1_gej_t *r, const secp256k1_gej_t *
         return;
     }
     r->infinity = 0;
-    secp256k1_fe_t z12; secp256k1_fe_sqr(&z12, &a->z);
+
+    secp256k1_fe_t az;
+    if (azr == NULL) {
+        az = a->z;
+    } else {
+        secp256k1_fe_mul(&az, &a->z, azr);
+    }
+
+    secp256k1_fe_t z12; secp256k1_fe_sqr(&z12, &az);
     secp256k1_fe_t u1 = a->x; secp256k1_fe_t u2; secp256k1_fe_mul(&u2, &b->x, &z12);
     secp256k1_fe_t s1 = a->y; secp256k1_fe_normalize_var(&s1);
-    secp256k1_fe_t s2; secp256k1_fe_mul(&s2, &b->y, &z12); secp256k1_fe_mul(&s2, &s2, &a->z);
+    secp256k1_fe_t s2; secp256k1_fe_mul(&s2, &b->y, &z12); secp256k1_fe_mul(&s2, &s2, &az);
     secp256k1_fe_normalize_var(&u1); secp256k1_fe_normalize_var(&u2);
     if (secp256k1_fe_equal(&u1, &u2)) {
@@ -433,6 +448,12 @@ static void secp256k1_gej_get_hex(char *r, int *rlen, const secp256k1_gej_t *a)
 }
 
 #ifdef USE_ENDOMORPHISM
+static void secp256k1_ge_mul_lambda(secp256k1_ge_t *r, const secp256k1_ge_t *a) {
+    const secp256k1_fe_t *beta = &secp256k1_ge_consts->beta;
+    *r = *a;
+    secp256k1_fe_mul(&r->x, &r->x, beta);
+}
+
 static void secp256k1_gej_mul_lambda(secp256k1_gej_t *r, const secp256k1_gej_t *a) {
     const secp256k1_fe_t *beta = &secp256k1_ge_consts->beta;
     *r = *a;
diff --git a/src/tests.c b/src/tests.c
index 8ba1f28682..0e64b395f0 100644
--- a/src/tests.c
+++ b/src/tests.c
@@ -724,12 +724,12 @@ void test_ge(void) {
     secp256k1_gej_t iij; secp256k1_gej_add_var(&iij, &ij, &ij);
 
     /* gej + ge adds */
-    secp256k1_gej_t aa; secp256k1_gej_add_ge_var(&aa, &aj, &a);
-    secp256k1_gej_t ab; secp256k1_gej_add_ge_var(&ab, &aj, &b);
-    secp256k1_gej_t ai; secp256k1_gej_add_ge_var(&ai, &aj, &i);
-    secp256k1_gej_t an; secp256k1_gej_add_ge_var(&an, &aj, &n);
-    secp256k1_gej_t ia; secp256k1_gej_add_ge_var(&ia, &ij, &a);
-    secp256k1_gej_t ii; secp256k1_gej_add_ge_var(&ii, &ij, &i);
+    secp256k1_gej_t aa; secp256k1_gej_add_ge_var(&aa, &aj, NULL, &a);
+    secp256k1_gej_t ab; secp256k1_gej_add_ge_var(&ab, &aj, NULL, &b);
+    secp256k1_gej_t ai; secp256k1_gej_add_ge_var(&ai, &aj, NULL, &i);
+    secp256k1_gej_t an; secp256k1_gej_add_ge_var(&an, &aj, NULL, &n);
+    secp256k1_gej_t ia; secp256k1_gej_add_ge_var(&ia, &ij, NULL, &a);
+    secp256k1_gej_t ii; secp256k1_gej_add_ge_var(&ii, &ij, NULL, &i);
 
     /* const gej + ge adds */
     secp256k1_gej_t aac; secp256k1_gej_add_ge(&aac, &aj, &a);
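
The test_ge changes above only thread NULL through the existing calls; nothing in this patch exercises the new secp256k1_ge_mul_lambda directly. A sketch of such a test in the style of tests.c, hypothetical and assuming the random_group_element_test and ge_equals_gej helpers that test_ge already uses, could compare it against the Jacobian variant:

    #ifdef USE_ENDOMORPHISM
    void test_ge_mul_lambda(void) {
        /* lambda*A computed via the new affine helper must match the
           existing jacobian helper on a random group element. */
        secp256k1_ge_t a; random_group_element_test(&a);
        secp256k1_gej_t aj; secp256k1_gej_set_ge(&aj, &a);
        secp256k1_ge_t lam_a; secp256k1_ge_mul_lambda(&lam_a, &a);
        secp256k1_gej_t lam_aj; secp256k1_gej_mul_lambda(&lam_aj, &aj);
        ge_equals_gej(&lam_a, &lam_aj);
    }
    #endif
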