@@ -10,6 +10,7 @@
 #include "modinv64.h"
 
 #include "util.h"
+#include "int128.h"
 
 /* This file implements modular inversion based on the paper "Fast constant-time gcd computation and
  * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang.
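
For orientation, the int128.h calls used throughout this diff behave as thin wrappers over a native 128-bit type where one exists. The following is a simplified sketch inferred from the call sites below, assuming __int128 support; the real header also selects a struct-based fallback for compilers without a native 128-bit type:

    #include <stdint.h>

    typedef __int128 secp256k1_int128;

    static void secp256k1_i128_from_i64(secp256k1_int128 *r, int64_t a) { *r = a; }

    /* r = a*b, computed in 128 bits. */
    static void secp256k1_i128_mul(secp256k1_int128 *r, int64_t a, int64_t b) {
        *r = (secp256k1_int128)a * b;
    }

    /* r += a*b. */
    static void secp256k1_i128_accum_mul(secp256k1_int128 *r, int64_t a, int64_t b) {
        *r += (secp256k1_int128)a * b;
    }

    /* Arithmetic (sign-preserving) right shift by n bits. */
    static void secp256k1_i128_rshift(secp256k1_int128 *r, unsigned int n) { *r >>= n; }

    /* The low 64 bits of *a, as a signed value. */
    static int64_t secp256k1_i128_to_i64(const secp256k1_int128 *a) { return (int64_t)*a; }

    static int secp256k1_i128_eq(const secp256k1_int128 *a, const secp256k1_int128 *b) {
        return *a == *b;
    }
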
@@ -32,15 +33,17 @@ static const secp256k1_modinv64_signed62 SECP256K1_SIGNED62_ONE = {{1}};
 /* Compute a*factor and put it in r. All but the top limb in r will be in range [0,2^62). */
 static void secp256k1_modinv64_mul_62(secp256k1_modinv64_signed62 *r, const secp256k1_modinv64_signed62 *a, int alen, int64_t factor) {
     const int64_t M62 = (int64_t)(UINT64_MAX >> 2);
-    int128_t c = 0;
+    secp256k1_int128 c, d;
     int i;
+    secp256k1_i128_from_i64(&c, 0);
     for (i = 0; i < 4; ++i) {
-        if (i < alen) c += (int128_t)a->v[i] * factor;
-        r->v[i] = (int64_t)c & M62; c >>= 62;
+        if (i < alen) secp256k1_i128_accum_mul(&c, a->v[i], factor);
+        r->v[i] = secp256k1_i128_to_i64(&c) & M62; secp256k1_i128_rshift(&c, 62);
     }
-    if (4 < alen) c += (int128_t)a->v[4] * factor;
-    VERIFY_CHECK(c == (int64_t)c);
-    r->v[4] = (int64_t)c;
+    if (4 < alen) secp256k1_i128_accum_mul(&c, a->v[4], factor);
+    secp256k1_i128_from_i64(&d, secp256k1_i128_to_i64(&c));
+    VERIFY_CHECK(secp256k1_i128_eq(&c, &d));
+    r->v[4] = secp256k1_i128_to_i64(&c);
 }
 
 /* Return -1 for a<b*factor, 0 for a==b*factor, 1 for a>b*factor. A has alen limbs; b has 5. */
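
To see the limb/carry pattern in isolation: each loop iteration keeps the low 62 bits of the accumulator as an output limb and shifts the carry down, and the final overflow check (previously c == (int64_t)c) is now phrased as a round-trip through from_i64/eq. A standalone toy check, assuming __int128 support:

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        const int64_t M62 = (int64_t)(UINT64_MAX >> 2);
        __int128 c = ((__int128)7 << 62) + 9;  /* accumulator holding 7*2^62 + 9 */
        int64_t limb = (int64_t)c & M62;       /* low 62 bits: 9 */
        c >>= 62;                              /* carry shifted down: 7 */
        assert(limb == 9 && (int64_t)c == 7);
        assert(c == (__int128)(int64_t)c);     /* top limb fits in 64 bits */
        return 0;
    }
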
@@ -307,7 +310,7 @@ static void secp256k1_modinv64_update_de_62(secp256k1_modinv64_signed62 *d, secp
     const int64_t e0 = e->v[0], e1 = e->v[1], e2 = e->v[2], e3 = e->v[3], e4 = e->v[4];
     const int64_t u = t->u, v = t->v, q = t->q, r = t->r;
     int64_t md, me, sd, se;
-    int128_t cd, ce;
+    secp256k1_int128 cd, ce;
 #ifdef VERIFY
     VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, 5, &modinfo->modulus, -2) > 0); /* d > -2*modulus */
     VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, 5, &modinfo->modulus, 1) < 0);  /* d < modulus */
@@ -324,54 +327,64 @@ static void secp256k1_modinv64_update_de_62(secp256k1_modinv64_signed62 *d, secp
     md = (u & sd) + (v & se);
     me = (q & sd) + (r & se);
     /* Begin computing t*[d,e]. */
-    cd = (int128_t)u * d0 + (int128_t)v * e0;
-    ce = (int128_t)q * d0 + (int128_t)r * e0;
+    secp256k1_i128_mul(&cd, u, d0);
+    secp256k1_i128_accum_mul(&cd, v, e0);
+    secp256k1_i128_mul(&ce, q, d0);
+    secp256k1_i128_accum_mul(&ce, r, e0);
     /* Correct md,me so that t*[d,e]+modulus*[md,me] has 62 zero bottom bits. */
-    md -= (modinfo->modulus_inv62 * (uint64_t)cd + md) & M62;
-    me -= (modinfo->modulus_inv62 * (uint64_t)ce + me) & M62;
+    md -= (modinfo->modulus_inv62 * (uint64_t)secp256k1_i128_to_i64(&cd) + md) & M62;
+    me -= (modinfo->modulus_inv62 * (uint64_t)secp256k1_i128_to_i64(&ce) + me) & M62;
     /* Update the beginning of computation for t*[d,e]+modulus*[md,me] now md,me are known. */
-    cd += (int128_t)modinfo->modulus.v[0] * md;
-    ce += (int128_t)modinfo->modulus.v[0] * me;
+    secp256k1_i128_accum_mul(&cd, modinfo->modulus.v[0], md);
+    secp256k1_i128_accum_mul(&ce, modinfo->modulus.v[0], me);
     /* Verify that the low 62 bits of the computation are indeed zero, and then throw them away. */
-    VERIFY_CHECK(((int64_t)cd & M62) == 0); cd >>= 62;
-    VERIFY_CHECK(((int64_t)ce & M62) == 0); ce >>= 62;
+    VERIFY_CHECK((secp256k1_i128_to_i64(&cd) & M62) == 0); secp256k1_i128_rshift(&cd, 62);
+    VERIFY_CHECK((secp256k1_i128_to_i64(&ce) & M62) == 0); secp256k1_i128_rshift(&ce, 62);
     /* Compute limb 1 of t*[d,e]+modulus*[md,me], and store it as output limb 0 (= down shift). */
-    cd += (int128_t)u * d1 + (int128_t)v * e1;
-    ce += (int128_t)q * d1 + (int128_t)r * e1;
+    secp256k1_i128_accum_mul(&cd, u, d1);
+    secp256k1_i128_accum_mul(&cd, v, e1);
+    secp256k1_i128_accum_mul(&ce, q, d1);
+    secp256k1_i128_accum_mul(&ce, r, e1);
     if (modinfo->modulus.v[1]) { /* Optimize for the case where limb of modulus is zero. */
-        cd += (int128_t)modinfo->modulus.v[1] * md;
-        ce += (int128_t)modinfo->modulus.v[1] * me;
+        secp256k1_i128_accum_mul(&cd, modinfo->modulus.v[1], md);
+        secp256k1_i128_accum_mul(&ce, modinfo->modulus.v[1], me);
     }
-    d->v[0] = (int64_t)cd & M62; cd >>= 62;
-    e->v[0] = (int64_t)ce & M62; ce >>= 62;
+    d->v[0] = secp256k1_i128_to_i64(&cd) & M62; secp256k1_i128_rshift(&cd, 62);
+    e->v[0] = secp256k1_i128_to_i64(&ce) & M62; secp256k1_i128_rshift(&ce, 62);
     /* Compute limb 2 of t*[d,e]+modulus*[md,me], and store it as output limb 1. */
-    cd += (int128_t)u * d2 + (int128_t)v * e2;
-    ce += (int128_t)q * d2 + (int128_t)r * e2;
+    secp256k1_i128_accum_mul(&cd, u, d2);
+    secp256k1_i128_accum_mul(&cd, v, e2);
+    secp256k1_i128_accum_mul(&ce, q, d2);
+    secp256k1_i128_accum_mul(&ce, r, e2);
     if (modinfo->modulus.v[2]) { /* Optimize for the case where limb of modulus is zero. */
-        cd += (int128_t)modinfo->modulus.v[2] * md;
-        ce += (int128_t)modinfo->modulus.v[2] * me;
+        secp256k1_i128_accum_mul(&cd, modinfo->modulus.v[2], md);
+        secp256k1_i128_accum_mul(&ce, modinfo->modulus.v[2], me);
     }
-    d->v[1] = (int64_t)cd & M62; cd >>= 62;
-    e->v[1] = (int64_t)ce & M62; ce >>= 62;
+    d->v[1] = secp256k1_i128_to_i64(&cd) & M62; secp256k1_i128_rshift(&cd, 62);
+    e->v[1] = secp256k1_i128_to_i64(&ce) & M62; secp256k1_i128_rshift(&ce, 62);
     /* Compute limb 3 of t*[d,e]+modulus*[md,me], and store it as output limb 2. */
-    cd += (int128_t)u * d3 + (int128_t)v * e3;
-    ce += (int128_t)q * d3 + (int128_t)r * e3;
+    secp256k1_i128_accum_mul(&cd, u, d3);
+    secp256k1_i128_accum_mul(&cd, v, e3);
+    secp256k1_i128_accum_mul(&ce, q, d3);
+    secp256k1_i128_accum_mul(&ce, r, e3);
     if (modinfo->modulus.v[3]) { /* Optimize for the case where limb of modulus is zero. */
-        cd += (int128_t)modinfo->modulus.v[3] * md;
-        ce += (int128_t)modinfo->modulus.v[3] * me;
+        secp256k1_i128_accum_mul(&cd, modinfo->modulus.v[3], md);
+        secp256k1_i128_accum_mul(&ce, modinfo->modulus.v[3], me);
     }
-    d->v[2] = (int64_t)cd & M62; cd >>= 62;
-    e->v[2] = (int64_t)ce & M62; ce >>= 62;
+    d->v[2] = secp256k1_i128_to_i64(&cd) & M62; secp256k1_i128_rshift(&cd, 62);
+    e->v[2] = secp256k1_i128_to_i64(&ce) & M62; secp256k1_i128_rshift(&ce, 62);
     /* Compute limb 4 of t*[d,e]+modulus*[md,me], and store it as output limb 3. */
-    cd += (int128_t)u * d4 + (int128_t)v * e4;
-    ce += (int128_t)q * d4 + (int128_t)r * e4;
-    cd += (int128_t)modinfo->modulus.v[4] * md;
-    ce += (int128_t)modinfo->modulus.v[4] * me;
-    d->v[3] = (int64_t)cd & M62; cd >>= 62;
-    e->v[3] = (int64_t)ce & M62; ce >>= 62;
+    secp256k1_i128_accum_mul(&cd, u, d4);
+    secp256k1_i128_accum_mul(&cd, v, e4);
+    secp256k1_i128_accum_mul(&ce, q, d4);
+    secp256k1_i128_accum_mul(&ce, r, e4);
+    secp256k1_i128_accum_mul(&cd, modinfo->modulus.v[4], md);
+    secp256k1_i128_accum_mul(&ce, modinfo->modulus.v[4], me);
+    d->v[3] = secp256k1_i128_to_i64(&cd) & M62; secp256k1_i128_rshift(&cd, 62);
+    e->v[3] = secp256k1_i128_to_i64(&ce) & M62; secp256k1_i128_rshift(&ce, 62);
     /* What remains is limb 5 of t*[d,e]+modulus*[md,me]; store it as output limb 4. */
-    d->v[4] = (int64_t)cd;
-    e->v[4] = (int64_t)ce;
+    d->v[4] = secp256k1_i128_to_i64(&cd);
+    e->v[4] = secp256k1_i128_to_i64(&ce);
 #ifdef VERIFY
     VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, 5, &modinfo->modulus, -2) > 0); /* d > -2*modulus */
     VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(d, 5, &modinfo->modulus, 1) < 0);  /* d < modulus */
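
The md/me correction above works because modulus_inv62 is the multiplicative inverse of the modulus modulo 2^62, so subtracting (inv*cd + md) mod 2^62 from md forces cd + modulus*md to be divisible by 2^62. A toy verification of that identity; the constants here are illustrative assumptions, not library values:

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        const uint64_t M62 = UINT64_MAX >> 2;
        uint64_t modulus = 0x2F0A1B2C3D4E5F7ULL; /* any odd value below 2^62 */
        uint64_t inv = modulus, cd = 0x123456789ABCDEFULL, md = 7;
        int i;
        /* Newton iteration: each step doubles the number of correct low bits. */
        for (i = 0; i < 5; ++i) inv *= 2 - modulus * inv;
        assert(((modulus * inv) & M62) == 1);     /* inv == modulus^-1 mod 2^62 */
        md -= (inv * cd + md) & M62;              /* the correction from update_de_62 */
        assert(((cd + modulus * md) & M62) == 0); /* bottom 62 bits are now zero */
        return 0;
    }
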
@@ -389,36 +402,46 @@ static void secp256k1_modinv64_update_fg_62(secp256k1_modinv64_signed62 *f, secp
     const int64_t f0 = f->v[0], f1 = f->v[1], f2 = f->v[2], f3 = f->v[3], f4 = f->v[4];
     const int64_t g0 = g->v[0], g1 = g->v[1], g2 = g->v[2], g3 = g->v[3], g4 = g->v[4];
     const int64_t u = t->u, v = t->v, q = t->q, r = t->r;
-    int128_t cf, cg;
+    secp256k1_int128 cf, cg;
     /* Start computing t*[f,g]. */
-    cf = (int128_t)u * f0 + (int128_t)v * g0;
-    cg = (int128_t)q * f0 + (int128_t)r * g0;
+    secp256k1_i128_mul(&cf, u, f0);
+    secp256k1_i128_accum_mul(&cf, v, g0);
+    secp256k1_i128_mul(&cg, q, f0);
+    secp256k1_i128_accum_mul(&cg, r, g0);
     /* Verify that the bottom 62 bits of the result are zero, and then throw them away. */
-    VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62;
-    VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62;
+    VERIFY_CHECK((secp256k1_i128_to_i64(&cf) & M62) == 0); secp256k1_i128_rshift(&cf, 62);
+    VERIFY_CHECK((secp256k1_i128_to_i64(&cg) & M62) == 0); secp256k1_i128_rshift(&cg, 62);
     /* Compute limb 1 of t*[f,g], and store it as output limb 0 (= down shift). */
-    cf += (int128_t)u * f1 + (int128_t)v * g1;
-    cg += (int128_t)q * f1 + (int128_t)r * g1;
-    f->v[0] = (int64_t)cf & M62; cf >>= 62;
-    g->v[0] = (int64_t)cg & M62; cg >>= 62;
+    secp256k1_i128_accum_mul(&cf, u, f1);
+    secp256k1_i128_accum_mul(&cf, v, g1);
+    secp256k1_i128_accum_mul(&cg, q, f1);
+    secp256k1_i128_accum_mul(&cg, r, g1);
+    f->v[0] = secp256k1_i128_to_i64(&cf) & M62; secp256k1_i128_rshift(&cf, 62);
+    g->v[0] = secp256k1_i128_to_i64(&cg) & M62; secp256k1_i128_rshift(&cg, 62);
     /* Compute limb 2 of t*[f,g], and store it as output limb 1. */
-    cf += (int128_t)u * f2 + (int128_t)v * g2;
-    cg += (int128_t)q * f2 + (int128_t)r * g2;
-    f->v[1] = (int64_t)cf & M62; cf >>= 62;
-    g->v[1] = (int64_t)cg & M62; cg >>= 62;
+    secp256k1_i128_accum_mul(&cf, u, f2);
+    secp256k1_i128_accum_mul(&cf, v, g2);
+    secp256k1_i128_accum_mul(&cg, q, f2);
+    secp256k1_i128_accum_mul(&cg, r, g2);
+    f->v[1] = secp256k1_i128_to_i64(&cf) & M62; secp256k1_i128_rshift(&cf, 62);
+    g->v[1] = secp256k1_i128_to_i64(&cg) & M62; secp256k1_i128_rshift(&cg, 62);
     /* Compute limb 3 of t*[f,g], and store it as output limb 2. */
-    cf += (int128_t)u * f3 + (int128_t)v * g3;
-    cg += (int128_t)q * f3 + (int128_t)r * g3;
-    f->v[2] = (int64_t)cf & M62; cf >>= 62;
-    g->v[2] = (int64_t)cg & M62; cg >>= 62;
+    secp256k1_i128_accum_mul(&cf, u, f3);
+    secp256k1_i128_accum_mul(&cf, v, g3);
+    secp256k1_i128_accum_mul(&cg, q, f3);
+    secp256k1_i128_accum_mul(&cg, r, g3);
+    f->v[2] = secp256k1_i128_to_i64(&cf) & M62; secp256k1_i128_rshift(&cf, 62);
+    g->v[2] = secp256k1_i128_to_i64(&cg) & M62; secp256k1_i128_rshift(&cg, 62);
     /* Compute limb 4 of t*[f,g], and store it as output limb 3. */
-    cf += (int128_t)u * f4 + (int128_t)v * g4;
-    cg += (int128_t)q * f4 + (int128_t)r * g4;
-    f->v[3] = (int64_t)cf & M62; cf >>= 62;
-    g->v[3] = (int64_t)cg & M62; cg >>= 62;
+    secp256k1_i128_accum_mul(&cf, u, f4);
+    secp256k1_i128_accum_mul(&cf, v, g4);
+    secp256k1_i128_accum_mul(&cg, q, f4);
+    secp256k1_i128_accum_mul(&cg, r, g4);
+    f->v[3] = secp256k1_i128_to_i64(&cf) & M62; secp256k1_i128_rshift(&cf, 62);
+    g->v[3] = secp256k1_i128_to_i64(&cg) & M62; secp256k1_i128_rshift(&cg, 62);
     /* What remains is limb 5 of t*[f,g]; store it as output limb 4. */
-    f->v[4] = (int64_t)cf;
-    g->v[4] = (int64_t)cg;
+    f->v[4] = secp256k1_i128_to_i64(&cf);
+    g->v[4] = secp256k1_i128_to_i64(&cg);
 }
 
 /* Compute (t/2^62) * [f, g], where t is a transition matrix for 62 divsteps.
@@ -431,30 +454,34 @@ static void secp256k1_modinv64_update_fg_62_var(int len, secp256k1_modinv64_sign
     const int64_t M62 = (int64_t)(UINT64_MAX >> 2);
     const int64_t u = t->u, v = t->v, q = t->q, r = t->r;
     int64_t fi, gi;
-    int128_t cf, cg;
+    secp256k1_int128 cf, cg;
     int i;
     VERIFY_CHECK(len > 0);
     /* Start computing t*[f,g]. */
     fi = f->v[0];
     gi = g->v[0];
-    cf = (int128_t)u * fi + (int128_t)v * gi;
-    cg = (int128_t)q * fi + (int128_t)r * gi;
+    secp256k1_i128_mul(&cf, u, fi);
+    secp256k1_i128_accum_mul(&cf, v, gi);
+    secp256k1_i128_mul(&cg, q, fi);
+    secp256k1_i128_accum_mul(&cg, r, gi);
     /* Verify that the bottom 62 bits of the result are zero, and then throw them away. */
-    VERIFY_CHECK(((int64_t)cf & M62) == 0); cf >>= 62;
-    VERIFY_CHECK(((int64_t)cg & M62) == 0); cg >>= 62;
+    VERIFY_CHECK((secp256k1_i128_to_i64(&cf) & M62) == 0); secp256k1_i128_rshift(&cf, 62);
+    VERIFY_CHECK((secp256k1_i128_to_i64(&cg) & M62) == 0); secp256k1_i128_rshift(&cg, 62);
     /* Now iteratively compute limb i=1..len of t*[f,g], and store them in output limb i-1 (shifting
      * down by 62 bits). */
     for (i = 1; i < len; ++i) {
         fi = f->v[i];
         gi = g->v[i];
-        cf += (int128_t)u * fi + (int128_t)v * gi;
-        cg += (int128_t)q * fi + (int128_t)r * gi;
-        f->v[i - 1] = (int64_t)cf & M62; cf >>= 62;
-        g->v[i - 1] = (int64_t)cg & M62; cg >>= 62;
+        secp256k1_i128_accum_mul(&cf, u, fi);
+        secp256k1_i128_accum_mul(&cf, v, gi);
+        secp256k1_i128_accum_mul(&cg, q, fi);
+        secp256k1_i128_accum_mul(&cg, r, gi);
+        f->v[i - 1] = secp256k1_i128_to_i64(&cf) & M62; secp256k1_i128_rshift(&cf, 62);
+        g->v[i - 1] = secp256k1_i128_to_i64(&cg) & M62; secp256k1_i128_rshift(&cg, 62);
     }
     /* What remains is limb (len) of t*[f,g]; store it as output limb (len-1). */
-    f->v[len - 1] = (int64_t)cf;
-    g->v[len - 1] = (int64_t)cg;
+    f->v[len - 1] = secp256k1_i128_to_i64(&cf);
+    g->v[len - 1] = secp256k1_i128_to_i64(&cg);
 }
 
 /* Compute the inverse of x modulo modinfo->modulus, and replace x with it (constant time in x). */
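
One property the whole carry chain relies on, both before and after this change: the 62-bit right shift must be arithmetic (rounding toward minus infinity), because the transition-matrix entries and the f/g limbs can be negative. A toy check, again assuming __int128 support:

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        const int64_t M62 = (int64_t)(UINT64_MAX >> 2);
        __int128 c = -((__int128)5 << 62) + 3; /* negative accumulator: -5*2^62 + 3 */
        int64_t low = (int64_t)c & M62;        /* low 62 bits are still 3 */
        c >>= 62;                              /* arithmetic shift: carry is -5 */
        assert(low == 3 && (int64_t)c == -5);
        return 0;
    }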