diff --git a/doc/safegcd_implementation.md b/doc/safegcd_implementation.md index 063aa8efae..6e832ed4a2 100644 --- a/doc/safegcd_implementation.md +++ b/doc/safegcd_implementation.md @@ -155,14 +155,14 @@ do one division by *2N* as a final step: ```python def divsteps_n_matrix(delta, f, g): """Compute delta and transition matrix t after N divsteps (multiplied by 2^N).""" - u, v, q, r = 1, 0, 0, 1 # start with identity matrix + u, v, q, r = 1< 0 and g & 1: - delta, f, g, u, v, q, r = 1 - delta, g, (g - f) // 2, 2*q, 2*r, q-u, r-v + delta, f, g, u, v, q, r = 1 - delta, g, (g-f)//2, q, r, (q-u)//2, (r-v)//2 elif g & 1: - delta, f, g, u, v, q, r = 1 + delta, f, (g + f) // 2, 2*u, 2*v, q+u, r+v + delta, f, g, u, v, q, r = 1 + delta, f, (g+f)//2, u, v, (q+u)//2, (r+v)//2 else: - delta, f, g, u, v, q, r = 1 + delta, f, (g ) // 2, 2*u, 2*v, q , r + delta, f, g, u, v, q, r = 1 + delta, f, (g )//2, u, v, (q )//2, (r )//2 return delta, (u, v, q, r) ``` @@ -414,9 +414,9 @@ operations (and hope the C compiler isn't smart enough to turn them back into br divstep can be written instead as (compare to the inner loop of `gcd` in section 1). ```python - x = -f if delta > 0 else f # set x equal to (input) -f or f + x = f if delta > 0 else -f # set x equal to (input) f or -f if g & 1: - g += x # set g to (input) g-f or g+f + g -= x # set g to (input) g-f or g+f if delta > 0: delta = -delta f += g # set f to (input) g (note that g was set to g-f before) @@ -433,19 +433,21 @@ that *-v == (v ^ -1) - (-1)*. Thus, if we have a variable *c* that takes on valu Using this we can write: ```python - x = -f if delta > 0 else f + x = f if delta > 0 else -f ``` in constant-time form as: ```python - c1 = (-delta) >> 63 + # Compute c1=0 if delta>0 and c1=-1 if delta<=0. + c1 = (delta - 1) >> 63 # Conditionally negate f based on c1: x = (f ^ c1) - c1 ``` -To use that trick, we need a helper mask variable *c1* that resolves the condition *δ>0* to *-1* -(if true) or *0* (if false). We compute *c1* using right shifting, which is equivalent to dividing by +To use that trick, we need a helper mask variable *c1* that resolves the condition *δ≤0* to *-1* +(if true) or *0* (if false). We compute *c1* by first subtracting *1*, which results in a negative value +if and only if *δ≤0*. That is then right shifted, which is equivalent to dividing by the specified power of *2* and rounding down (in Python, and also in C under the assumption of a typical two's complement system; see `assumptions.h` for tests that this is the case). Right shifting by *63* thus maps all numbers in range *[-263,0)* to *-1*, and numbers in range *[0,263)* to *0*. @@ -454,7 +456,7 @@ Using the facts that *x&0=0* and *x&(-1)=x* (on two's complement systems again), ```python if g & 1: - g += x + g -= x ``` as: @@ -463,7 +465,7 @@ as: # Compute c2=0 if g is even and c2=-1 if g is odd. c2 = -(g & 1) # This masks out x if g is even, and leaves x be if g is odd. - g += x & c2 + g -= x & c2 ``` Using the conditional negation trick again we can write: @@ -478,7 +480,7 @@ as: ```python # Compute c3=-1 if g is odd and delta>0, and 0 otherwise. - c3 = c1 & c2 + c3 = ~c1 & c2 # Conditionally negate delta based on c3: delta = (delta ^ c3) - c3 ``` @@ -497,45 +499,61 @@ becomes: f += g & c3 ``` -It turns out that this can be implemented more efficiently by applying the substitution -*η=-δ*. In this representation, negating *δ* corresponds to negating *η*, and incrementing -*δ* corresponds to decrementing *η*. This allows us to remove the negation in the *c1* -computation: +Putting everything together, extending all operations on f,g (with helper x) to also be applied +to u,q (with helper y) and v,r (with helper z), gives: ```python - # Compute a mask c1 for eta < 0, and compute the conditional negation x of f: - c1 = eta >> 63 - x = (f ^ c1) - c1 - # Compute a mask c2 for odd g, and conditionally add x to g: - c2 = -(g & 1) - g += x & c2 - # Compute a mask c for (eta < 0) and odd (input) g, and use it to conditionally negate eta, - # and add g to f: - c3 = c1 & c2 - eta = (eta ^ c3) - c3 - f += g & c3 - # Incrementing delta corresponds to decrementing eta. - eta -= 1 - g >>= 1 +def divsteps_n_matrix(delta, f, g): + """Compute delta and transition matrix t after N divsteps (multiplied by 2^N).""" + u, v, q, r = 1<> 63 + # Compute x, y, z as conditionally-negated versions of f, u, v. + x, y, z = (f ^ c1) - c1, (u ^ c1) - c1, (v ^ c1) - c1 + c2 = -(g & 1) + # Conditionally subtract x, y, z from g, q, r. + g, q, r = g - (x & c2), q - (y & c2), r - (z & c2) + c3 = ~c1 & c2 + # Conditionally negate delta, and then increment it by 1. + delta = (delta ^ c3) - c3 + 1 + # Conditionally add g, q, r to f, u, v. + f, u, v = f + (g & c3), u + (q & c3), v + (r & c3) + # Shift down g, q, r. + g, q, r = g >> 1, u >> 1, v >> 1 + return delta, (u, v, q, r) ``` -A variant of divsteps with better worst-case performance can be used instead: starting *δ* at +An interesting optimization is possible here. If we were to drop the *-c1* in the computation +of *x*, *y*, and *z*, we are making them at worst *1* less than the correct value. That +translates to *g*, *q*, and *r* further being at worst *1* more than the correct value. +Now observe that at the start of every iteration of the loop, *u*, *v*, *q*, and *r* are +all multiples of *2N-i*, with *i* the iteration number, and thus all even. +In other words, this potential off by one in *g*, *q*, and *r* only affects their bottommost +bit, which is shifted away at the end of the loop. Thus we can instead write: + +```python + # Compute x, y, z as conditionally complemented versions of f, u, v. + x, y, z = f ^ c1, u ^ c1, v ^ c1 +``` + +Finally, a variant of divsteps with better worst-case performance can be used instead: starting *δ* at *1/2* instead of *1*. This reduces the worst case number of iterations to *590* for *256*-bit inputs -(which can be shown using convex hull analysis). In this case, the substitution *ζ=-(δ+1/2)* -is used instead to keep the variable integral. Incrementing *δ* by *1* still translates to -decrementing *ζ* by *1*, but negating *δ* now corresponds to going from *ζ* to *-(ζ+1)*, or -*~ζ*. Doing that conditionally based on *c3* is simply: +(which can be shown using [convex hull analysis](https://github.com/sipa/safegcd-bounds)). +In this case, the substitution *θ=δ-1/2* is used to keep the variable integral. +*δ≤0* then translates to *θ≤-1/2*, or because *θ* is integral, *θ<0*. +Thus instead of `c1 = (delta - 1) >> 63` we get `c1 = theta >> 63`. +Negating *δ* now corresponds to going from *θ* to +*-θ-1*. Doing that conditionally based on *c3* (and then incrementing by one) gives us: ```python ... - c3 = c1 & c2 - zeta ^= c3 + theta = (theta ^ c3) + 1 ... ``` By replacing the loop in `divsteps_n_matrix` with a variant of the divstep code above (extended to also apply all *f* operations to *u*, *v* and all *g* operations to *q*, *r*), a constant-time version of -`divsteps_n_matrix` is obtained. The full code will be in section 7. +`divsteps_n_matrix` is obtained. The resulting code will be in section 7. These bit fiddling tricks can also be used to make the conditional negations and additions in `update_de` and `normalize` constant-time. @@ -550,7 +568,7 @@ faster non-constant time `divsteps_n_matrix` function. To do so, first consider yet another way of writing the inner loop of divstep operations in `gcd` from section 1. This decomposition is also explained in the paper in section 8.2. We use -the original version with initial *δ=1* and *η=-δ* here. +the original version with initial *δ=1*, but make the substitution *η=-δ*. ```python for _ in range(N): @@ -651,8 +669,13 @@ Here we need the negated modular inverse, which is a simple transformation of th have this 6-bit function (based on the 3-bit function above): - *f(f2 - 2)* -This loop, again extended to also handle *u*, *v*, *q*, and *r* alongside *f* and *g*, placed in -`divsteps_n_matrix`, gives a significantly faster, but non-constant time version. +This loop, extended to also handle *u*, *v*, *q*, and *r* alongside *f* and *g*, placed in +`divsteps_n_matrix`, gives a significantly faster, but non-constant time version. In order to +avoid intermediary values that need more than N+1 bits, it is possible to instead start +*u* and *v* at *1* instead of at *2N*, and then shift up *u* and *v* whenever +*g* is shifted down (instead of shifting down *q* and *r*). This is effectively making the +algorithm operate on *i*-bits downshifted versions of all these variables. The resulting +code is shown in the next section. ## 7. Final Python version @@ -660,28 +683,27 @@ This loop, again extended to also handle *u*, *v*, *q*, and *r* alongside *f* an All together we need the following functions: - A way to compute the transition matrix in constant time, using the `divsteps_n_matrix` function - from section 2, but with its loop replaced by a variant of the constant-time divstep from - section 5, extended to handle *u*, *v*, *q*, *r*: + from section 5, modified to operate on *θ* instead of *δ*: ```python -def divsteps_n_matrix(zeta, f, g): - """Compute zeta and transition matrix t after N divsteps (multiplied by 2^N).""" - u, v, q, r = 1, 0, 0, 1 # start with identity matrix +def divsteps_n_matrix(theta, f, g): + """Compute theta and transition matrix t after N divsteps (multiplied by 2^N).""" + u, v, q, r = 1<> 63 - # Compute x, y, z as conditionally-negated versions of f, u, v. - x, y, z = (f ^ c1) - c1, (u ^ c1) - c1, (v ^ c1) - c1 + c1 = theta >> 63 + # Compute x, y, z as conditionally complemented versions of f, u, v. + x, y, z = f ^ c1, u ^ c1, v ^ c1 c2 = -(g & 1) - # Conditionally add x, y, z to g, q, r. - g, q, r = g + (x & c2), q + (y & c2), r + (z & c2) - c1 &= c2 # reusing c1 here for the earlier c3 variable - zeta = (zeta ^ c1) - 1 # inlining the unconditional zeta decrement here + # Conditionally subtract x, y, z from g, q, r. + g, q, r = g - (x & c2), q - (y & c2), r - (z & c2) + c3 = ~c1 & c2 + # Conditionally complement theta, and then increment it by 1. + theta = (theta ^ c3) + 1 # Conditionally add g, q, r to f, u, v. - f, u, v = f + (g & c1), u + (q & c1), v + (r & c1) - # When shifting g down, don't shift q, r, as we construct a transition matrix multiplied - # by 2^N. Instead, shift f's coefficients u and v up. - g, u, v = g >> 1, u << 1, v << 1 - return zeta, (u, v, q, r) + f, u, v = f + (g & c3), u + (q & c3), v + (r & c3) + # Shift down f, q, r. + g, q, r = g >> 1, u >> 1, v >> 1 + return theta, (u, v, q, r) ``` - The functions to update *f* and *g*, and *d* and *e*, from section 2 and section 4, with the constant-time @@ -723,15 +745,15 @@ def normalize(sign, v, M): return v ``` -- And finally the `modinv` function too, adapted to use *ζ* instead of *δ*, and using the fixed +- And finally the `modinv` function too, adapted to use *θ* instead of *δ*, and using the fixed iteration count from section 5: ```python def modinv(M, Mi, x): """Compute the modular inverse of x mod M, given Mi=1/M mod 2^N.""" - zeta, f, g, d, e = -1, M, x, 0, 1 + theta, f, g, d, e = 0, M, x, 0, 1 for _ in range((590 + N - 1) // N): - zeta, t = divsteps_n_matrix(zeta, f % 2**N, g % 2**N) + theta, t = divsteps_n_matrix(theta, f % 2**N, g % 2**N) f, g = update_fg(f, g, t) d, e = update_de(d, e, t, M, Mi) return normalize(f, d, M) @@ -745,7 +767,7 @@ def modinv(M, Mi, x): NEGINV16 = [15, 5, 3, 9, 7, 13, 11, 1] # NEGINV16[n//2] = (-n)^-1 mod 16, for odd n def divsteps_n_matrix_var(eta, f, g): """Compute eta and transition matrix t after N divsteps (multiplied by 2^N).""" - u, v, q, r = 1, 0, 0, 1 + u, v, q, r = 1, 0, 0, 1 # Start with identity matrix (not scaled; shift during run instead). i = N while True: zeros = min(i, count_trailing_zeros(g)) diff --git a/src/modinv32_impl.h b/src/modinv32_impl.h index 661c5fc04c..13317ecbfb 100644 --- a/src/modinv32_impl.h +++ b/src/modinv32_impl.h @@ -168,68 +168,71 @@ typedef struct { int32_t u, v, q, r; } secp256k1_modinv32_trans2x2; -/* Compute the transition matrix and zeta for 30 divsteps. +/* Compute the transition matrix and theta for 30 divsteps (where theta = delta-1/2). * - * Input: zeta: initial zeta - * f0: bottom limb of initial f - * g0: bottom limb of initial g + * Input: theta: initial theta + * f0: bottom limb of initial f + * g0: bottom limb of initial g * Output: t: transition matrix - * Return: final zeta + * Return: final theta * * Implements the divsteps_n_matrix function from the explanation. */ -static int32_t secp256k1_modinv32_divsteps_30(int32_t zeta, uint32_t f0, uint32_t g0, secp256k1_modinv32_trans2x2 *t) { - /* u,v,q,r are the elements of the transformation matrix being built up, - * starting with the identity matrix. Semantically they are signed integers - * in range [-2^30,2^30], but here represented as unsigned mod 2^32. This - * permits left shifting (which is UB for negative numbers). The range - * being inside [-2^31,2^31) means that casting to signed works correctly. - */ - uint32_t u = 1, v = 0, q = 0, r = 1; - uint32_t c1, c2, f = f0, g = g0, x, y, z; +static int32_t secp256k1_modinv32_divsteps_30(int32_t theta, uint32_t f0, uint32_t g0, secp256k1_modinv32_trans2x2 *t) { + /* The transformation matrix being built up, scaled by 2^30. */ + int32_t u = (int32_t)1 << 30, v = 0, q = 0, r = (int32_t)1 << 30, y, z; + uint32_t c1, c2, f = f0, g = g0, x; int i; for (i = 0; i < 30; ++i) { - VERIFY_CHECK((f & 1) == 1); /* f must always be odd */ - VERIFY_CHECK((u * f0 + v * g0) == f << i); - VERIFY_CHECK((q * f0 + r * g0) == g << i); - /* Compute conditional masks for (zeta < 0) and for (g & 1). */ - c1 = zeta >> 31; + /* f must always be odd */ + VERIFY_CHECK((f & 1) == 1); + /* Minimum trailing zeros count for matrix elements decreases in each iteration */ + VERIFY_CHECK(!((u | v | q | r) & (0xFFFFFFFFULL >> (i + 2)))); + /* Applying the matrix so far to the initial f,g gives current f,g. */ + VERIFY_CHECK((u >> (30 - i)) * f0 + (v >> (30 - i)) * g0 == f << i); + VERIFY_CHECK((q >> (30 - i)) * f0 + (r >> (30 - i)) * g0 == g << i); + /* Compute conditional masks for (theta < 0) and for (g & 1). */ + c1 = theta >> 31; c2 = -(g & 1); - /* Compute x,y,z, conditionally negated versions of f,u,v. */ - x = (f ^ c1) - c1; - y = (u ^ c1) - c1; - z = (v ^ c1) - c1; - /* Conditionally add x,y,z to g,q,r. */ - g += x & c2; - q += y & c2; - r += z & c2; - /* In what follows, c1 is a condition mask for (zeta < 0) and (g & 1). */ - c1 &= c2; - /* Conditionally change zeta into -zeta-2 or zeta-1. */ - zeta = (zeta ^ c1) - 1; + /* Compute x,y,z, conditionally complemented versions of f,u,v. */ + x = f ^ c1; + y = u ^ c1; + z = v ^ c1; + /* Conditionally subtract x,y,z from g,q,r. */ + g -= x & c2; + q -= y & c2; + r -= z & c2; + /* In what follows, c2 is a condition mask for (theta >= 0) and (g & 1). */ + c2 &= ~c1; + /* Conditionally change theta into -theta or theta+1. */ + theta = (theta ^ c2) + 1; /* Conditionally add g,q,r to f,u,v. */ - f += g & c1; - u += q & c1; - v += r & c1; + f += g & c2; + u += q & c2; + v += r & c2; /* Shifts */ g >>= 1; - u <<= 1; - v <<= 1; - /* Bounds on zeta that follow from the bounds on iteration count (max 20*30 divsteps). */ - VERIFY_CHECK(zeta >= -601 && zeta <= 601); + q >>= 1; + r >>= 1; + /* Bounds on theta that follow from the bounds on iteration count (max 20*30 divsteps). */ + VERIFY_CHECK(theta >= -600 && theta <= 600); } /* Return data in t and return value. */ - t->u = (int32_t)u; - t->v = (int32_t)v; - t->q = (int32_t)q; - t->r = (int32_t)r; + t->u = u; + t->v = v; + t->q = q; + t->r = r; + /* Applying the final matrix to the initial f,g gives final f,g. */ + VERIFY_CHECK(u * f0 + v * g0 == f << 30); + VERIFY_CHECK(q * f0 + r * g0 == g << 30); /* The determinant of t must be a power of two. This guarantees that multiplication with t * does not change the gcd of f and g, apart from adding a power-of-2 factor to it (which - * will be divided out again). As each divstep's individual matrix has determinant 2, the - * aggregate of 30 of them will have determinant 2^30. */ + * will be divided out again). As each divstep's individual matrix has determinant 2^-1, + * the aggregate of 30 of them will have determinant 2^-30. Multiplying with the initial + * 2^30*identity (which has determinant 2^60) means the result has determinant 2^30. */ VERIFY_CHECK((int64_t)t->u * t->r - (int64_t)t->v * t->q == ((int64_t)1) << 30); - return zeta; + return theta; } /* Compute the transition matrix and eta for 30 divsteps (variable time). @@ -258,7 +261,12 @@ static int32_t secp256k1_modinv32_divsteps_30_var(int32_t eta, uint32_t f0, uint 0xEF, 0xC5, 0xA3, 0x39, 0xB7, 0xCD, 0xAB, 0x01 }; - /* Transformation matrix; see comments in secp256k1_modinv32_divsteps_30. */ + /* u,v,q,r are the elements of the transformation matrix being built up, + * starting with the identity matrix. Semantically they are signed integers + * in range [-2^30,2^30], but here represented as unsigned mod 2^32. This + * permits left shifting (which is UB for negative numbers). The range + * being inside [-2^31,2^31) means that casting to signed works correctly. + */ uint32_t u = 1, v = 0, q = 0, r = 1; uint32_t f = f0, g = g0, m; uint16_t w; @@ -453,19 +461,19 @@ static void secp256k1_modinv32_update_fg_30_var(int len, secp256k1_modinv32_sign /* Compute the inverse of x modulo modinfo->modulus, and replace x with it (constant time in x). */ static void secp256k1_modinv32(secp256k1_modinv32_signed30 *x, const secp256k1_modinv32_modinfo *modinfo) { - /* Start with d=0, e=1, f=modulus, g=x, zeta=-1. */ + /* Start with d=0, e=1, f=modulus, g=x, theta=0. */ secp256k1_modinv32_signed30 d = {{0}}; secp256k1_modinv32_signed30 e = {{1}}; secp256k1_modinv32_signed30 f = modinfo->modulus; secp256k1_modinv32_signed30 g = *x; int i; - int32_t zeta = -1; /* zeta = -(delta+1/2); delta is initially 1/2. */ + int32_t theta = 0; /* theta = delta-1/2; delta is initially 1/2. */ /* Do 20 iterations of 30 divsteps each = 600 divsteps. 590 suffices for 256-bit inputs. */ for (i = 0; i < 20; ++i) { /* Compute transition matrix and new zeta after 30 divsteps. */ secp256k1_modinv32_trans2x2 t; - zeta = secp256k1_modinv32_divsteps_30(zeta, f.v[0], g.v[0], &t); + theta = secp256k1_modinv32_divsteps_30(theta, f.v[0], g.v[0], &t); /* Update d,e using that transition matrix. */ secp256k1_modinv32_update_de_30(&d, &e, &t, modinfo); /* Update f,g using that transition matrix. */ diff --git a/src/modinv64_impl.h b/src/modinv64_impl.h index 0743a9c821..8e2de33cb0 100644 --- a/src/modinv64_impl.h +++ b/src/modinv64_impl.h @@ -145,72 +145,74 @@ typedef struct { int64_t u, v, q, r; } secp256k1_modinv64_trans2x2; -/* Compute the transition matrix and eta for 59 divsteps (where zeta=-(delta+1/2)). - * Note that the transformation matrix is scaled by 2^62 and not 2^59. +/* Compute the transition matrix and theta for 59 divsteps (where theta=delta-1/2) + * Although only 59 divsteps are performed, the resulting transformation matrix + * is scaled by 2^62 to allow reuse of _update_de_62 and _update_fg_62 between + * _modinv64 and _modinv64_var. * - * Input: zeta: initial zeta - * f0: bottom limb of initial f - * g0: bottom limb of initial g + * Input: theta: initial theta + * f0: bottom limb of initial f + * g0: bottom limb of initial g * Output: t: transition matrix - * Return: final zeta + * Return: final theta * * Implements the divsteps_n_matrix function from the explanation. */ -static int64_t secp256k1_modinv64_divsteps_59(int64_t zeta, uint64_t f0, uint64_t g0, secp256k1_modinv64_trans2x2 *t) { - /* u,v,q,r are the elements of the transformation matrix being built up, - * starting with the identity matrix times 8 (because the caller expects - * a result scaled by 2^62). Semantically they are signed integers - * in range [-2^62,2^62], but here represented as unsigned mod 2^64. This - * permits left shifting (which is UB for negative numbers). The range - * being inside [-2^63,2^63) means that casting to signed works correctly. - */ - uint64_t u = 8, v = 0, q = 0, r = 8; - uint64_t c1, c2, f = f0, g = g0, x, y, z; +static int64_t secp256k1_modinv64_divsteps_59(int64_t theta, uint64_t f0, uint64_t g0, secp256k1_modinv64_trans2x2 *t) { + /* The transformation matrix being built up, scaled by 2^62. */ + int64_t u = (int64_t)1 << 62, v = 0, q = 0, r = (int64_t)1 << 62, y, z; + uint64_t c1, c2, f = f0, g = g0, x; int i; for (i = 3; i < 62; ++i) { - VERIFY_CHECK((f & 1) == 1); /* f must always be odd */ - VERIFY_CHECK((u * f0 + v * g0) == f << i); - VERIFY_CHECK((q * f0 + r * g0) == g << i); - /* Compute conditional masks for (zeta < 0) and for (g & 1). */ - c1 = zeta >> 63; + /* f must always be odd */ + VERIFY_CHECK((f & 1) == 1); + /* Minimum trailing zeros count for matrix elements decreases in each iteration */ + VERIFY_CHECK(!((u | v | q | r) & (0xFFFFFFFFFFFFFFFFULL >> (i - 1)))); + /* Applying the matrix so far to the initial f,g gives current f,g. */ + VERIFY_CHECK((u >> (62 - i)) * f0 + (v >> (62 - i)) * g0 == f << i); + VERIFY_CHECK((q >> (62 - i)) * f0 + (r >> (62 - i)) * g0 == g << i); + /* Compute conditional masks for (theta < 0) and for (g & 1). */ + c1 = theta >> 63; c2 = -(g & 1); - /* Compute x,y,z, conditionally negated versions of f,u,v. */ - x = (f ^ c1) - c1; - y = (u ^ c1) - c1; - z = (v ^ c1) - c1; - /* Conditionally add x,y,z to g,q,r. */ - g += x & c2; - q += y & c2; - r += z & c2; - /* In what follows, c1 is a condition mask for (zeta < 0) and (g & 1). */ - c1 &= c2; - /* Conditionally change zeta into -zeta-2 or zeta-1. */ - zeta = (zeta ^ c1) - 1; + /* Compute x,y,z, conditionally complemented versions of f,u,v. */ + x = f ^ c1; + y = u ^ c1; + z = v ^ c1; + /* Conditionally subtract x,y,z from g,q,r. */ + g -= x & c2; + q -= y & c2; + r -= z & c2; + /* In what follows, c2 is a condition mask for (theta >= 0) and (g & 1). */ + c2 &= ~c1; + /* Conditionally change theta into -theta or theta+1. */ + theta = (theta ^ c2) + 1; /* Conditionally add g,q,r to f,u,v. */ - f += g & c1; - u += q & c1; - v += r & c1; + f += g & c2; + u += q & c2; + v += r & c2; /* Shifts */ g >>= 1; - u <<= 1; - v <<= 1; - /* Bounds on zeta that follow from the bounds on iteration count (max 10*59 divsteps). */ - VERIFY_CHECK(zeta >= -591 && zeta <= 591); + q >>= 1; + r >>= 1; + /* Bounds on theta that follow from the bounds on iteration count (max 10*59 divsteps). */ + VERIFY_CHECK(theta >= -590 && theta <= 590); } /* Return data in t and return value. */ - t->u = (int64_t)u; - t->v = (int64_t)v; - t->q = (int64_t)q; - t->r = (int64_t)r; + t->u = u; + t->v = v; + t->q = q; + t->r = r; + /* Applying the final matrix to the initial f,g gives final f,g. */ + VERIFY_CHECK(u * f0 + v * g0 == f << 62); + VERIFY_CHECK(q * f0 + r * g0 == g << 62); /* The determinant of t must be a power of two. This guarantees that multiplication with t * does not change the gcd of f and g, apart from adding a power-of-2 factor to it (which - * will be divided out again). As each divstep's individual matrix has determinant 2, the - * aggregate of 59 of them will have determinant 2^59. Multiplying with the initial - * 8*identity (which has determinant 2^6) means the overall outputs has determinant - * 2^65. */ + * will be divided out again). As each divstep's individual matrix has determinant 2^-1, + * the aggregate of 59 of them will have determinant 2^-59. Multiplying with the initial + * 2^62*identity (which has determinant 2^124) means the result has determinant 2^65. */ VERIFY_CHECK((int128_t)t->u * t->r - (int128_t)t->v * t->q == ((int128_t)1) << 65); - return zeta; + return theta; } /* Compute the transition matrix and eta for 62 divsteps (variable time, eta=-delta). @@ -224,7 +226,12 @@ static int64_t secp256k1_modinv64_divsteps_59(int64_t zeta, uint64_t f0, uint64_ * Implements the divsteps_n_matrix_var function from the explanation. */ static int64_t secp256k1_modinv64_divsteps_62_var(int64_t eta, uint64_t f0, uint64_t g0, secp256k1_modinv64_trans2x2 *t) { - /* Transformation matrix; see comments in secp256k1_modinv64_divsteps_62. */ + /* u,v,q,r are the elements of the transformation matrix being built up, + * starting with the identity matrix. Semantically they are signed integers + * in range [-2^62,2^62], but here represented as unsigned mod 2^64. This + * permits left shifting (which is UB for negative numbers). The range + * being inside [-2^63,2^63) means that casting to signed works correctly. + */ uint64_t u = 1, v = 0, q = 0, r = 1; uint64_t f = f0, g = g0, m; uint32_t w; @@ -459,19 +466,19 @@ static void secp256k1_modinv64_update_fg_62_var(int len, secp256k1_modinv64_sign /* Compute the inverse of x modulo modinfo->modulus, and replace x with it (constant time in x). */ static void secp256k1_modinv64(secp256k1_modinv64_signed62 *x, const secp256k1_modinv64_modinfo *modinfo) { - /* Start with d=0, e=1, f=modulus, g=x, zeta=-1. */ + /* Start with d=0, e=1, f=modulus, g=x, theta=0. */ secp256k1_modinv64_signed62 d = {{0, 0, 0, 0, 0}}; secp256k1_modinv64_signed62 e = {{1, 0, 0, 0, 0}}; secp256k1_modinv64_signed62 f = modinfo->modulus; secp256k1_modinv64_signed62 g = *x; int i; - int64_t zeta = -1; /* zeta = -(delta+1/2); delta starts at 1/2. */ + int64_t theta = 0; /* theta = delta-1/2; delta starts at 1/2. */ /* Do 10 iterations of 59 divsteps each = 590 divsteps. This suffices for 256-bit inputs. */ for (i = 0; i < 10; ++i) { /* Compute transition matrix and new zeta after 59 divsteps. */ secp256k1_modinv64_trans2x2 t; - zeta = secp256k1_modinv64_divsteps_59(zeta, f.v[0], g.v[0], &t); + theta = secp256k1_modinv64_divsteps_59(theta, f.v[0], g.v[0], &t); /* Update d,e using that transition matrix. */ secp256k1_modinv64_update_de_62(&d, &e, &t, modinfo); /* Update f,g using that transition matrix. */