
Commit 40f50d0

Merge #1184: Signed-digit based ecmult_const algorithm
355bbdf Add changelog entry for signed-digit ecmult_const algorithm (Pieter Wuille)
21f49d9 Remove unused secp256k1_scalar_shr_int (Pieter Wuille)
115fdc7 Remove unused secp256k1_wnaf_const (Pieter Wuille)
aa9f3a3 ecmult_const: add/improve tests (Jonas Nick)
4d16e90 Signed-digit based ecmult_const algorithm (Pieter Wuille)
ba523be make SECP256K1_SCALAR_CONST reduce modulo exhaustive group order (Pieter Wuille)
2140da9 Add secp256k1_scalar_half for halving scalars (+ tests/benchmarks). (Pieter Wuille)

Pull request description:

Using some insights learned from #1058, this replaces the fixed-wnaf ecmult_const algorithm with a signed-digit based one. Conceptually both algorithms are very similar, in that they boil down to summing precomputed odd multiples of the input points. Practically, however, the new algorithm is simpler because it uses only scalar operations, rather than relying on wnaf machinery with skew terms to guarantee odd multipliers.

The idea is that we can compute $q \cdot A$ as follows:

* Let $s = f(q)$, for some function $f()$.
* Compute $(s_1, s_2)$ such that $s = s_1 + \lambda s_2$, using `secp256k1_scalar_split_lambda`.
* Let $v_1 = s_1 + 2^{128}$ and $v_2 = s_2 + 2^{128}$ (such that the $v_i$ are positive and $n$ bits long).
* Compute the result as $$\sum_{i=0}^{n-1} (2v_1[i]-1) 2^i A + \sum_{i=0}^{n-1} (2v_2[i]-1) 2^i \lambda A,$$ where $x[i]$ stands for the *i*'th bit of $x$, i.e. a sum of positive and negative powers of two times $A$, selected by the bits of $v_1$ and $v_2$.

The comments in `ecmult_const_impl.h` show that if $f(q) = (q + (1+\lambda)(2^n - 2^{129} - 1))/2 \bmod n$, the result equals $q \cdot A$.

This last step can be performed in groups of multiple bits at once, by looking up entries in a precomputed table of odd multiples of $A$ and $\lambda A$, and then multiplying by a power of two before proceeding to the next group.

The result is slightly faster (I measure ~2% speedup), but significantly simpler as it only uses scalar arithmetic to determine the table lookup values. The speedup comes from the fact that no skew corrections are needed at the end, and from lower overhead in determining the table indices.

The precomputed table sizes are also made independent from the `ecmult` ones, after observing that the optimal table size is bigger here (which also gives a small speedup).

ACKs for top commit:
  jonasnick:
    ACK 355bbdf
  siv2r:
    ACK 355bbdf
  real-or-random:
    ACK 355bbdf

Tree-SHA512: 13db572cb7f9be00bf0931c65fcd8bc8b5545be86a8c8700bd6a79ad9e4d9e5e79e7f763f92ca6a91d9717a355f8162204b0ea821b6ae99d58cb400497ddc656
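To see why this choice of $f$ works (a short derivation, expanding on the description above): for any $n$-bit value $v$, the signed digits sum to

$$\sum_{i=0}^{n-1} (2v[i]-1)\,2^i = 2v - (2^n - 1).$$

Substituting $v_1 = s_1 + 2^{128}$ and $v_2 = s_2 + 2^{128}$, the computed point is

$$\big(2s_1 + 2^{129} - 2^n + 1\big)A + \big(2s_2 + 2^{129} - 2^n + 1\big)\lambda A = \big(2s + (1+\lambda)(2^{129} - 2^n + 1)\big)A,$$

using $s = s_1 + \lambda s_2$. This equals $q \cdot A$ precisely when $2s \equiv q + (1+\lambda)(2^n - 2^{129} - 1) \pmod n$, i.e. when $s = f(q)$ as defined above; performing that division by two exactly (mod $n$) is what the new `secp256k1_scalar_half` is for.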
2 parents 1f1bb78 + 355bbdf commit 40f50d0

9 files changed, +438 −336 lines changed

CHANGELOG.md

+3
@@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+#### Changed
+ - The point multiplication algorithm used for ECDH operations (module `ecdh`) was replaced with a slightly faster one.
+
 ## [0.4.0] - 2023-09-04
 
 #### Added

src/bench_internal.c

+13 −14

@@ -14,7 +14,6 @@
 #include "field_impl.h"
 #include "group_impl.h"
 #include "scalar_impl.h"
-#include "ecmult_const_impl.h"
 #include "ecmult_impl.h"
 #include "bench.h"
 
@@ -98,6 +97,18 @@ static void bench_scalar_negate(void* arg, int iters) {
     }
 }
 
+static void bench_scalar_half(void* arg, int iters) {
+    int i;
+    bench_inv *data = (bench_inv*)arg;
+    secp256k1_scalar s = data->scalar[0];
+
+    for (i = 0; i < iters; i++) {
+        secp256k1_scalar_half(&s, &s);
+    }
+
+    data->scalar[0] = s;
+}
+
 static void bench_scalar_mul(void* arg, int iters) {
     int i;
     bench_inv *data = (bench_inv*)arg;
@@ -309,18 +320,6 @@ static void bench_ecmult_wnaf(void* arg, int iters) {
     CHECK(bits <= 256*iters);
 }
 
-static void bench_wnaf_const(void* arg, int iters) {
-    int i, bits = 0, overflow = 0;
-    bench_inv *data = (bench_inv*)arg;
-
-    for (i = 0; i < iters; i++) {
-        bits += secp256k1_wnaf_const(data->wnaf, &data->scalar[0], WINDOW_A, 256);
-        overflow += secp256k1_scalar_add(&data->scalar[0], &data->scalar[0], &data->scalar[1]);
-    }
-    CHECK(overflow >= 0);
-    CHECK(bits <= 256*iters);
-}
-
 static void bench_sha256(void* arg, int iters) {
     int i;
     bench_inv *data = (bench_inv*)arg;
@@ -370,6 +369,7 @@ int main(int argc, char **argv) {
     int d = argc == 1; /* default */
     print_output_table_header_row();
 
+    if (d || have_flag(argc, argv, "scalar") || have_flag(argc, argv, "half")) run_benchmark("scalar_half", bench_scalar_half, bench_setup, NULL, &data, 10, iters*100);
     if (d || have_flag(argc, argv, "scalar") || have_flag(argc, argv, "add")) run_benchmark("scalar_add", bench_scalar_add, bench_setup, NULL, &data, 10, iters*100);
     if (d || have_flag(argc, argv, "scalar") || have_flag(argc, argv, "negate")) run_benchmark("scalar_negate", bench_scalar_negate, bench_setup, NULL, &data, 10, iters*100);
     if (d || have_flag(argc, argv, "scalar") || have_flag(argc, argv, "mul")) run_benchmark("scalar_mul", bench_scalar_mul, bench_setup, NULL, &data, 10, iters*10);
@@ -394,7 +394,6 @@ int main(int argc, char **argv) {
     if (d || have_flag(argc, argv, "group") || have_flag(argc, argv, "add")) run_benchmark("group_add_zinv_var", bench_group_add_zinv_var, bench_setup, NULL, &data, 10, iters*10);
     if (d || have_flag(argc, argv, "group") || have_flag(argc, argv, "to_affine")) run_benchmark("group_to_affine_var", bench_group_to_affine_var, bench_setup, NULL, &data, 10, iters);
 
-    if (d || have_flag(argc, argv, "ecmult") || have_flag(argc, argv, "wnaf")) run_benchmark("wnaf_const", bench_wnaf_const, bench_setup, NULL, &data, 10, iters);
     if (d || have_flag(argc, argv, "ecmult") || have_flag(argc, argv, "wnaf")) run_benchmark("ecmult_wnaf", bench_ecmult_wnaf, bench_setup, NULL, &data, 10, iters);
 
     if (d || have_flag(argc, argv, "hash") || have_flag(argc, argv, "sha256")) run_benchmark("hash_sha256", bench_sha256, bench_setup, NULL, &data, 10, iters);

src/ecmult_const_impl.h

+215 −162
Large diffs are not rendered by default.

src/scalar.h

+4 −5

@@ -25,7 +25,7 @@ static void secp256k1_scalar_clear(secp256k1_scalar *r);
 /** Access bits from a scalar. All requested bits must belong to the same 32-bit limb. */
 static unsigned int secp256k1_scalar_get_bits(const secp256k1_scalar *a, unsigned int offset, unsigned int count);
 
-/** Access bits from a scalar. Not constant time. */
+/** Access bits from a scalar. Not constant time in offset and count. */
 static unsigned int secp256k1_scalar_get_bits_var(const secp256k1_scalar *a, unsigned int offset, unsigned int count);
 
 /** Set a scalar from a big endian byte array. The scalar will be reduced modulo group order `n`.
@@ -54,10 +54,6 @@ static void secp256k1_scalar_cadd_bit(secp256k1_scalar *r, unsigned int bit, int
 /** Multiply two scalars (modulo the group order). */
 static void secp256k1_scalar_mul(secp256k1_scalar *r, const secp256k1_scalar *a, const secp256k1_scalar *b);
 
-/** Shift a scalar right by some amount strictly between 0 and 16, returning
- *  the low bits that were shifted off */
-static int secp256k1_scalar_shr_int(secp256k1_scalar *r, int n);
-
 /** Compute the inverse of a scalar (modulo the group order). */
 static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *a);
 
@@ -67,6 +63,9 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc
 /** Compute the complement of a scalar (modulo the group order). */
 static void secp256k1_scalar_negate(secp256k1_scalar *r, const secp256k1_scalar *a);
 
+/** Multiply a scalar with the multiplicative inverse of 2. */
+static void secp256k1_scalar_half(secp256k1_scalar *r, const secp256k1_scalar *a);
+
 /** Check whether a scalar equals zero. */
 static int secp256k1_scalar_is_zero(const secp256k1_scalar *a);
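To illustrate where the new `secp256k1_scalar_half` declaration fits into the algorithm, the multiplier transformation $s = f(q)$ from the PR description reduces to one addition and one halving. The following is a hypothetical sketch only: the constant name `S_OFFSET` and the helper function are illustrative and do not reflect the actual contents of `ecmult_const_impl.h` (whose diff is not rendered above).

```c
/* Hypothetical sketch: derive the signed-digit multiplier s = f(q).
 * S_OFFSET is assumed to hold (1+lambda)*(2^n - 2^129 - 1) mod n,
 * precomputed once; it is not an actual constant in the library. */
static void sketch_compute_s(secp256k1_scalar *s, const secp256k1_scalar *q,
                             const secp256k1_scalar *S_OFFSET) {
    secp256k1_scalar_add(s, q, S_OFFSET); /* s = q + (1+lambda)(2^n - 2^129 - 1) (mod n) */
    secp256k1_scalar_half(s, s);          /* s = f(q); exact halving modulo the group order */
}
```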

src/scalar_4x64_impl.h

+41 −16

@@ -199,6 +199,47 @@ static void secp256k1_scalar_negate(secp256k1_scalar *r, const secp256k1_scalar
     secp256k1_scalar_verify(r);
 }
 
+static void secp256k1_scalar_half(secp256k1_scalar *r, const secp256k1_scalar *a) {
+    /* Writing `/` for field division and `//` for integer division, we compute
+     *
+     *    a/2 = (a - (a&1))/2 + (a&1)/2
+     *        = (a >> 1) + (a&1 ?    1/2 : 0)
+     *        = (a >> 1) + (a&1 ? n//2+1 : 0),
+     *
+     * where n is the group order and in the last equality we have used 1/2 = n//2+1 (mod n).
+     * For n//2, we have the constants SECP256K1_N_H_0, ...
+     *
+     * This sum does not overflow. The most extreme case is a = -2, the largest odd scalar. Here:
+     * - the left summand is:  a >> 1 = (a - a&1)/2 = (n-2-1)//2           = (n-3)//2
+     * - the right summand is: a&1 ? n//2+1 : 0 = n//2+1 = (n-1)//2 + 2//2 = (n+1)//2
+     * Together they sum to (n-3)//2 + (n+1)//2 = (2n-2)//2 = n - 1, which is less than n.
+     */
+    uint64_t mask = -(uint64_t)(a->d[0] & 1U);
+    secp256k1_uint128 t;
+    secp256k1_scalar_verify(a);
+
+    secp256k1_u128_from_u64(&t, (a->d[0] >> 1) | (a->d[1] << 63));
+    secp256k1_u128_accum_u64(&t, (SECP256K1_N_H_0 + 1U) & mask);
+    r->d[0] = secp256k1_u128_to_u64(&t); secp256k1_u128_rshift(&t, 64);
+    secp256k1_u128_accum_u64(&t, (a->d[1] >> 1) | (a->d[2] << 63));
+    secp256k1_u128_accum_u64(&t, SECP256K1_N_H_1 & mask);
+    r->d[1] = secp256k1_u128_to_u64(&t); secp256k1_u128_rshift(&t, 64);
+    secp256k1_u128_accum_u64(&t, (a->d[2] >> 1) | (a->d[3] << 63));
+    secp256k1_u128_accum_u64(&t, SECP256K1_N_H_2 & mask);
+    r->d[2] = secp256k1_u128_to_u64(&t); secp256k1_u128_rshift(&t, 64);
+    r->d[3] = secp256k1_u128_to_u64(&t) + (a->d[3] >> 1) + (SECP256K1_N_H_3 & mask);
+#ifdef VERIFY
+    /* The line above only computed the bottom 64 bits of r->d[3]; redo the computation
+     * in full 128 bits to make sure the top 64 bits are indeed zero. */
+    secp256k1_u128_accum_u64(&t, a->d[3] >> 1);
+    secp256k1_u128_accum_u64(&t, SECP256K1_N_H_3 & mask);
+    secp256k1_u128_rshift(&t, 64);
+    VERIFY_CHECK(secp256k1_u128_to_u64(&t) == 0);
+
+    secp256k1_scalar_verify(r);
+#endif
+}
+
 SECP256K1_INLINE static int secp256k1_scalar_is_one(const secp256k1_scalar *a) {
     secp256k1_scalar_verify(a);
 
@@ -809,22 +850,6 @@ static void secp256k1_scalar_mul(secp256k1_scalar *r, const secp256k1_scalar *a,
     secp256k1_scalar_verify(r);
 }
 
-static int secp256k1_scalar_shr_int(secp256k1_scalar *r, int n) {
-    int ret;
-    secp256k1_scalar_verify(r);
-    VERIFY_CHECK(n > 0);
-    VERIFY_CHECK(n < 16);
-
-    ret = r->d[0] & ((1 << n) - 1);
-    r->d[0] = (r->d[0] >> n) + (r->d[1] << (64 - n));
-    r->d[1] = (r->d[1] >> n) + (r->d[2] << (64 - n));
-    r->d[2] = (r->d[2] >> n) + (r->d[3] << (64 - n));
-    r->d[3] = (r->d[3] >> n);
-
-    secp256k1_scalar_verify(r);
-    return ret;
-}
-
 static void secp256k1_scalar_split_128(secp256k1_scalar *r1, secp256k1_scalar *r2, const secp256k1_scalar *k) {
     secp256k1_scalar_verify(k);
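The branch-free pattern used above (conditionally add $n//2 + 1$ under an all-ones or all-zero mask derived from the low bit, then shift right) can be seen in isolation in the following standalone toy model over a small odd modulus. This is an illustration only, not library code; the helper name `half_mod` and the modulus are arbitrary.

```c
#include <stdint.h>
#include <assert.h>

/* Toy model of the halving trick: return a/2 mod n for an odd modulus n,
 * using only a mask-selected add of n/2+1 and a right shift, mirroring the
 * structure of secp256k1_scalar_half (without the limb arithmetic). */
static uint64_t half_mod(uint64_t a, uint64_t n) {
    uint64_t mask = -(uint64_t)(a & 1U);     /* all-ones iff a is odd */
    return (a >> 1) + ((n / 2 + 1U) & mask); /* (a>>1) + (a odd ? n//2+1 : 0) */
}

int main(void) {
    uint64_t n = 1000003; /* any odd modulus; stands in for the group order */
    uint64_t a;
    for (a = 0; a < n; a++) {
        assert(2 * half_mod(a, n) % n == a); /* doubling the half gives back a */
    }
    return 0;
}
```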

src/scalar_8x32_impl.h

+49 −20

@@ -245,6 +245,55 @@ static void secp256k1_scalar_negate(secp256k1_scalar *r, const secp256k1_scalar
     secp256k1_scalar_verify(r);
 }
 
+static void secp256k1_scalar_half(secp256k1_scalar *r, const secp256k1_scalar *a) {
+    /* Writing `/` for field division and `//` for integer division, we compute
+     *
+     *    a/2 = (a - (a&1))/2 + (a&1)/2
+     *        = (a >> 1) + (a&1 ?    1/2 : 0)
+     *        = (a >> 1) + (a&1 ? n//2+1 : 0),
+     *
+     * where n is the group order and in the last equality we have used 1/2 = n//2+1 (mod n).
+     * For n//2, we have the constants SECP256K1_N_H_0, ...
+     *
+     * This sum does not overflow. The most extreme case is a = -2, the largest odd scalar. Here:
+     * - the left summand is:  a >> 1 = (a - a&1)/2 = (n-2-1)//2           = (n-3)//2
+     * - the right summand is: a&1 ? n//2+1 : 0 = n//2+1 = (n-1)//2 + 2//2 = (n+1)//2
+     * Together they sum to (n-3)//2 + (n+1)//2 = (2n-2)//2 = n - 1, which is less than n.
+     */
+    uint32_t mask = -(uint32_t)(a->d[0] & 1U);
+    uint64_t t = (uint32_t)((a->d[0] >> 1) | (a->d[1] << 31));
+    secp256k1_scalar_verify(a);
+
+    t += (SECP256K1_N_H_0 + 1U) & mask;
+    r->d[0] = t; t >>= 32;
+    t += (uint32_t)((a->d[1] >> 1) | (a->d[2] << 31));
+    t += SECP256K1_N_H_1 & mask;
+    r->d[1] = t; t >>= 32;
+    t += (uint32_t)((a->d[2] >> 1) | (a->d[3] << 31));
+    t += SECP256K1_N_H_2 & mask;
+    r->d[2] = t; t >>= 32;
+    t += (uint32_t)((a->d[3] >> 1) | (a->d[4] << 31));
+    t += SECP256K1_N_H_3 & mask;
+    r->d[3] = t; t >>= 32;
+    t += (uint32_t)((a->d[4] >> 1) | (a->d[5] << 31));
+    t += SECP256K1_N_H_4 & mask;
+    r->d[4] = t; t >>= 32;
+    t += (uint32_t)((a->d[5] >> 1) | (a->d[6] << 31));
+    t += SECP256K1_N_H_5 & mask;
+    r->d[5] = t; t >>= 32;
+    t += (uint32_t)((a->d[6] >> 1) | (a->d[7] << 31));
+    t += SECP256K1_N_H_6 & mask;
+    r->d[6] = t; t >>= 32;
+    r->d[7] = (uint32_t)t + (uint32_t)(a->d[7] >> 1) + (SECP256K1_N_H_7 & mask);
+#ifdef VERIFY
+    /* The line above only computed the bottom 32 bits of r->d[7]. Redo the computation
+     * in full 64 bits to make sure the top 32 bits are indeed zero. */
+    VERIFY_CHECK((t + (a->d[7] >> 1) + (SECP256K1_N_H_7 & mask)) >> 32 == 0);
+
+    secp256k1_scalar_verify(r);
+#endif
+}
+
 SECP256K1_INLINE static int secp256k1_scalar_is_one(const secp256k1_scalar *a) {
     secp256k1_scalar_verify(a);
 
@@ -613,26 +662,6 @@ static void secp256k1_scalar_mul(secp256k1_scalar *r, const secp256k1_scalar *a,
     secp256k1_scalar_verify(r);
 }
 
-static int secp256k1_scalar_shr_int(secp256k1_scalar *r, int n) {
-    int ret;
-    secp256k1_scalar_verify(r);
-    VERIFY_CHECK(n > 0);
-    VERIFY_CHECK(n < 16);
-
-    ret = r->d[0] & ((1 << n) - 1);
-    r->d[0] = (r->d[0] >> n) + (r->d[1] << (32 - n));
-    r->d[1] = (r->d[1] >> n) + (r->d[2] << (32 - n));
-    r->d[2] = (r->d[2] >> n) + (r->d[3] << (32 - n));
-    r->d[3] = (r->d[3] >> n) + (r->d[4] << (32 - n));
-    r->d[4] = (r->d[4] >> n) + (r->d[5] << (32 - n));
-    r->d[5] = (r->d[5] >> n) + (r->d[6] << (32 - n));
-    r->d[6] = (r->d[6] >> n) + (r->d[7] << (32 - n));
-    r->d[7] = (r->d[7] >> n);
-
-    secp256k1_scalar_verify(r);
-    return ret;
-}
-
 static void secp256k1_scalar_split_128(secp256k1_scalar *r1, secp256k1_scalar *r2, const secp256k1_scalar *k) {
     secp256k1_scalar_verify(k);

src/scalar_low.h

+9 −2

@@ -1,5 +1,5 @@
 /***********************************************************************
- * Copyright (c) 2015 Andrew Poelstra *
+ * Copyright (c) 2015, 2022 Andrew Poelstra, Pieter Wuille *
  * Distributed under the MIT software license, see the accompanying *
  * file COPYING or https://www.opensource.org/licenses/mit-license.php.*
  ***********************************************************************/
@@ -12,6 +12,13 @@
 /** A scalar modulo the group order of the secp256k1 curve. */
 typedef uint32_t secp256k1_scalar;
 
-#define SECP256K1_SCALAR_CONST(d7, d6, d5, d4, d3, d2, d1, d0) (d0)
+/* A compile-time constant equal to 2^32 (modulo order). */
+#define SCALAR_2P32 ((0xffffffffUL % EXHAUSTIVE_TEST_ORDER) + 1U)
+
+/* Compute a*2^32 + b (modulo order). */
+#define SCALAR_HORNER(a, b) (((uint64_t)(a) * SCALAR_2P32 + (b)) % EXHAUSTIVE_TEST_ORDER)
+
+/* Evaluates to the provided 256-bit constant reduced modulo order. */
+#define SECP256K1_SCALAR_CONST(d7, d6, d5, d4, d3, d2, d1, d0) SCALAR_HORNER(SCALAR_HORNER(SCALAR_HORNER(SCALAR_HORNER(SCALAR_HORNER(SCALAR_HORNER(SCALAR_HORNER((d7), (d6)), (d5)), (d4)), (d3)), (d2)), (d1)), (d0))
 
 #endif /* SECP256K1_SCALAR_REPR_H */
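To make the effect of the new macro concrete, here is a standalone check with an exhaustive-test order of 13 (one of the orders exercised by the exhaustive tests). The macros are copied from the diff above for the demonstration; the expected value is just $2^{32} \bmod 13 = 9$, whereas the old definition would simply have evaluated to `d0 = 0`.

```c
#include <stdint.h>
#include <stdio.h>

#define EXHAUSTIVE_TEST_ORDER 13

/* Copies of the macros introduced above, for a standalone demonstration. */
#define SCALAR_2P32 ((0xffffffffUL % EXHAUSTIVE_TEST_ORDER) + 1U)
#define SCALAR_HORNER(a, b) (((uint64_t)(a) * SCALAR_2P32 + (b)) % EXHAUSTIVE_TEST_ORDER)
#define SECP256K1_SCALAR_CONST(d7, d6, d5, d4, d3, d2, d1, d0) \
    SCALAR_HORNER(SCALAR_HORNER(SCALAR_HORNER(SCALAR_HORNER(SCALAR_HORNER(SCALAR_HORNER(SCALAR_HORNER((d7), (d6)), (d5)), (d4)), (d3)), (d2)), (d1)), (d0))

int main(void) {
    /* The 256-bit constant 2^32, i.e. limbs (0,...,0,1,0), reduced modulo 13. */
    printf("%llu\n", (unsigned long long)SECP256K1_SCALAR_CONST(0, 0, 0, 0, 0, 0, 1, 0));
    /* Prints 9, since 2^32 = 13*330382099 + 9. */
    return 0;
}
```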

src/scalar_low_impl.h

+8 −13

@@ -139,19 +139,6 @@ static void secp256k1_scalar_mul(secp256k1_scalar *r, const secp256k1_scalar *a,
     secp256k1_scalar_verify(r);
 }
 
-static int secp256k1_scalar_shr_int(secp256k1_scalar *r, int n) {
-    int ret;
-    secp256k1_scalar_verify(r);
-    VERIFY_CHECK(n > 0);
-    VERIFY_CHECK(n < 16);
-
-    ret = *r & ((1 << n) - 1);
-    *r >>= n;
-
-    secp256k1_scalar_verify(r);
-    return ret;
-}
-
 static void secp256k1_scalar_split_128(secp256k1_scalar *r1, secp256k1_scalar *r2, const secp256k1_scalar *a) {
     secp256k1_scalar_verify(a);
 
@@ -205,4 +192,12 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_sc
     secp256k1_scalar_verify(r);
 }
 
+static void secp256k1_scalar_half(secp256k1_scalar *r, const secp256k1_scalar *a) {
+    secp256k1_scalar_verify(a);
+
+    *r = (*a + ((-(uint32_t)(*a & 1)) & EXHAUSTIVE_TEST_ORDER)) >> 1;
+
+    secp256k1_scalar_verify(r);
+}
+
 #endif /* SECP256K1_SCALAR_REPR_IMPL_H */
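A quick worked example of the one-line halving above, with `EXHAUSTIVE_TEST_ORDER` equal to 13: for the odd scalar 5 the mask is all-ones, so the order is added before shifting, giving (5 + 13) >> 1 = 9, and indeed 2·9 = 18 ≡ 5 (mod 13); for the even scalar 6 the mask is zero and the result is simply 3.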
