bitcoin-core · sipa · Aug 2, 2018 · Aug 5, 2018 · Sep 23, 2018 · Sep 24, 2018
diff --git a/.cirrus.yml b/.cirrus.yml
@@ -1,7 +1,8 @@
 env:
   WIDEMUL: auto
   STATICPRECOMPUTATION: yes
-  ECMULTGENPRECISION: auto
+  ECMULTGENBLOCKS: auto
+  ECMULTGENTEETH: auto
   ASM: no
   BUILD: check
   WITH_VALGRIND: yes
@@ -73,8 +74,9 @@ task:
         EXPERIMENTAL: yes
         SCHNORRSIG: yes
         CTIMETEST: no
-    - env: { ECMULTGENPRECISION: 2 }
-    - env: { ECMULTGENPRECISION: 8 }
+    - env: { ECMULTGENBLOCKS: 256, ECMULTGENTEETH: 1 }
+    - env: { ECMULTGENBLOCKS: 43, ECMULTGENTEETH: 6, STATICPRECOMPUTATION: no }
+    - env: { ECMULTGENBLOCKS: 1, ECMULTGENTEETH: 1, STATICPRECOMPUTATION: no }
     - env:
         RUN_VALGRIND: yes
         ASM: x86_64

diff --git a/README.md b/README.md
@@ -49,13 +49,11 @@ Implementation details
   * Use a much larger window for multiples of G, using precomputed multiples.
   * Use Shamir's trick to do the multiplication with the public key and the generator simultaneously.
   * Use secp256k1's efficiently-computable endomorphism to split the P multiplicand into 2 half-sized ones.
-* Point multiplication for signing
-  * Use a precomputed table of multiples of powers of 16 multiplied with the generator, so general multiplication becomes a series of additions.
+* Point multiplication for signing using Mike Hamburg's signed-digit multi-comb method (see https://eprint.iacr.org/2012/309, section 3.3)
   * Intended to be completely free of timing sidechannels for secret-key operations (on reasonable hardware/toolchains)
     * Access the table with branch-free conditional moves so memory access is uniform.
     * No data-dependent branches
   * Optional runtime blinding which attempts to frustrate differential power analysis.
-  * The precomputed tables add and eventually subtract points for which no known scalar (secret key) is known, preventing even an attacker with control over the secret key used to control the data internally.
 
 Build steps
 -----------

diff --git a/ci/cirrus.sh b/ci/cirrus.sh
@@ -15,7 +15,7 @@ valgrind --version || true
 ./configure \
     --enable-experimental="$EXPERIMENTAL" \
     --with-test-override-wide-multiply="$WIDEMUL" --with-asm="$ASM" \
-    --enable-ecmult-static-precomputation="$STATICPRECOMPUTATION" --with-ecmult-gen-precision="$ECMULTGENPRECISION" \
+    --enable-ecmult-static-precomputation="$STATICPRECOMPUTATION" --with-ecmult-gen-blocks="$ECMULTGENBLOCKS" --with-ecmult-gen-teeth="$ECMULTGENTEETH" \
     --enable-module-ecdh="$ECDH" --enable-module-recovery="$RECOVERY" \
     --enable-module-schnorrsig="$SCHNORRSIG" \
     --with-valgrind="$WITH_VALGRIND" \

diff --git a/configure.ac b/configure.ac
@@ -170,13 +170,21 @@ AC_ARG_WITH([ecmult-window], [AS_HELP_STRING([--with-ecmult-window=SIZE|auto],
 )],
 [req_ecmult_window=$withval], [req_ecmult_window=auto])
 
-AC_ARG_WITH([ecmult-gen-precision], [AS_HELP_STRING([--with-ecmult-gen-precision=2|4|8|auto],
-[Precision bits to tune the precomputed table size for signing.]
-[The size of the table is 32kB for 2 bits, 64kB for 4 bits, 512kB for 8 bits of precision.]
-[A larger table size usually results in possible faster signing.]
+AC_ARG_WITH([ecmult-gen-blocks], [AS_HELP_STRING([--with-ecmult-gen-blocks=BLOCKS|auto],
+[The number of blocks to use in the multi-comb multiplication algorithm, in the range [1..256].]
+[Larger values result in possibly better performance at the cost of a linearly larger precomputed table.]
+[There must exist a multiple of BLOCKS*TEETH that is between 256 and 288, inclusive.]
 ["auto" is a reasonable setting for desktop machines (currently 4). [default=auto]]
 )],
-[req_ecmult_gen_precision=$withval], [req_ecmult_gen_precision=auto])
+[req_ecmult_gen_blocks=$withval], [req_ecmult_gen_blocks=auto])
+
+AC_ARG_WITH([ecmult-gen-teeth], [AS_HELP_STRING([--with-ecmult-gen-teeth=TEETH|auto],
+[The number of teeth to use in the multi-comb multiplication algorithm, in the range [1..8].]
+[Larger values result in possibly better performance at the cost of an exponentially larger precomputed table.]
+[There must exist a multiple of BLOCKS*TEETH that is between 256 and 288, inclusive.]
+["auto" is a reasonable setting for desktop machines (currently 5). [default=auto]]
+)],
+[req_ecmult_gen_teeth=$withval], [req_ecmult_gen_teeth=auto])
 
 AC_ARG_WITH([valgrind], [AS_HELP_STRING([--with-valgrind=yes|no|auto],
 [Build with extra checks for running inside Valgrind [default=auto]]
@@ -296,19 +304,45 @@ case $set_ecmult_window in
   ;;
 esac
 
-# Set ecmult gen precision
-if test x"$req_ecmult_gen_precision" = x"auto"; then
-  set_ecmult_gen_precision=4
+# Set ecmult gen blocks
+if test x"$req_ecmult_gen_blocks" = x"auto"; then
+  set_ecmult_gen_blocks=4
 else
-  set_ecmult_gen_precision=$req_ecmult_gen_precision
+  set_ecmult_gen_blocks=$req_ecmult_gen_blocks
 fi
+error_gen_blocks=['option to --with-ecmult-gen-blocks not an integer in range [1..256] or "auto"']
+case $set_ecmult_gen_blocks in
+''|0*|*[[!0-9]]*)
+  # empty, non-numeric, or leading zero (C would read e.g. 010 as octal 8)
+  AC_MSG_ERROR($error_gen_blocks)
+  ;;
+*)
+  if test "$set_ecmult_gen_blocks" -lt 1 || test "$set_ecmult_gen_blocks" -gt 256 ; then
+    # outside 1..256; two test calls joined by || because test -o is not portable
+    AC_MSG_ERROR($error_gen_blocks)
+  fi
+  AC_DEFINE_UNQUOTED(COMB_BLOCKS, $set_ecmult_gen_blocks, [Set number of blocks in ecmult_gen precomputation])
+  ;;
+esac
 
-case $set_ecmult_gen_precision in
-2|4|8)
-  AC_DEFINE_UNQUOTED(ECMULT_GEN_PREC_BITS, $set_ecmult_gen_precision, [Set ecmult gen precision bits])
+# Set ecmult gen teeth
+if test x"$req_ecmult_gen_teeth" = x"auto"; then
+  set_ecmult_gen_teeth=5
+else
+  set_ecmult_gen_teeth=$req_ecmult_gen_teeth
+fi
+error_gen_teeth=['option to --with-ecmult-gen-teeth not an integer in range [1..8] or "auto"']
+case $set_ecmult_gen_teeth in
+''|0*|*[[!0-9]]*)
+  # empty, non-numeric, or leading zero (C would read e.g. 010 as octal 8)
+  AC_MSG_ERROR($error_gen_teeth)
   ;;
 *)
-  AC_MSG_ERROR(['ecmult gen precision not 2, 4, 8 or "auto"'])
+  if test "$set_ecmult_gen_teeth" -lt 1 || test "$set_ecmult_gen_teeth" -gt 8 ; then
+    # outside 1..8; two test calls joined by || because test -o is not portable
+    AC_MSG_ERROR($error_gen_teeth)
+  fi
+  AC_DEFINE_UNQUOTED(COMB_TEETH, $set_ecmult_gen_teeth, [Set number of teeth in ecmult_gen precomputation])
   ;;
 esac
 
@@ -515,7 +549,8 @@ echo "  module schnorrsig       = $enable_module_schnorrsig"
 echo
 echo "  asm                     = $set_asm"
 echo "  ecmult window size      = $set_ecmult_window"
-echo "  ecmult gen prec. bits   = $set_ecmult_gen_precision"
+echo "  ecmult gen blocks       = $set_ecmult_gen_blocks"
+echo "  ecmult gen teeth        = $set_ecmult_gen_teeth"
 # Hide test-only options unless they're used.
 if test x"$set_widemul" != xauto; then
 echo "  wide multiplication     = $set_widemul"

diff --git a/src/ecmult_gen.h b/src/ecmult_gen.h
@@ -10,27 +10,91 @@
 #include "scalar.h"
 #include "group.h"
 
-#if ECMULT_GEN_PREC_BITS != 2 && ECMULT_GEN_PREC_BITS != 4 && ECMULT_GEN_PREC_BITS != 8
-#  error "Set ECMULT_GEN_PREC_BITS to 2, 4 or 8."
+#if defined HAVE_CONFIG_H
+#include "libsecp256k1-config.h"
+#endif
+
+#if defined(EXHAUSTIVE_TEST_ORDER)
+
+  /* We need to control these values for exhaustive tests because
+   * the tables cannot have infinities in them (secp256k1_ge_storage
+   * doesn't support infinities) */
+#undef COMB_BLOCKS
+#undef COMB_TEETH
+#undef COMB_SPACING
+
+#  if EXHAUSTIVE_TEST_ORDER > 32
+#    define COMB_BLOCKS 52
+#    define COMB_TEETH 5
+#  elif EXHAUSTIVE_TEST_ORDER > 16
+#    define COMB_BLOCKS 64
+#    define COMB_TEETH 4
+#  elif EXHAUSTIVE_TEST_ORDER > 8
+#    define COMB_BLOCKS 86
+#    define COMB_TEETH 3
+#  elif EXHAUSTIVE_TEST_ORDER > 4
+#    define COMB_BLOCKS 128
+#    define COMB_TEETH 2
+#  else
+#    define COMB_BLOCKS 256
+#    define COMB_TEETH 1
+#  endif
+
+#  define COMB_SPACING 1
+
+#else
+
+  /* COMB_BLOCKS, COMB_TEETH, COMB_SPACING must all be positive and the product of the three (COMB_BITS)
+   * must evaluate to a value in the range [256, 288]. The resulting memory usage for precomputation
+   * will be COMB_POINTS_TOTAL * sizeof(secp256k1_ge_storage). */
+#  ifndef COMB_BLOCKS
+#    define COMB_BLOCKS 4
+#  endif
+#  ifndef COMB_TEETH
+#    define COMB_TEETH 5
+#  endif
+#  ifndef COMB_SPACING
+#    define COMB_SPACING ((COMB_BLOCKS * COMB_TEETH + 255) / (COMB_BLOCKS * COMB_TEETH))
+#  endif
+
+#endif
+
+#if !(1 <= COMB_BLOCKS && COMB_BLOCKS <= 256)
+#  error "COMB_BLOCKS must be in the range [1, 256]"
+#endif
+#if !(1 <= COMB_TEETH && COMB_TEETH <= 8)
+#  error "COMB_TEETH must be in the range [1, 8]"
+#endif
+#if !(1 <= COMB_SPACING && COMB_SPACING <= 256)
+#  error "COMB_SPACING must be in the range [1, 256]"
+#endif
+
+/* The remaining COMB_* parameters are derived values; do not modify them. */
+#define COMB_BITS (COMB_BLOCKS * COMB_TEETH * COMB_SPACING)
+#define COMB_GROUPED ((COMB_SPACING == 1) && ((32 % COMB_TEETH) == 0))
+#define COMB_OFFSET (COMB_BITS == 256)
+#define COMB_POINTS (1 << (COMB_TEETH - 1))
+#define COMB_POINTS_TOTAL (COMB_BLOCKS * COMB_POINTS)
+#define COMB_MASK (COMB_POINTS - 1)
+
+#if !(256 <= COMB_BITS && COMB_BITS <= 288)
+#  error "COMB_BITS must be in the range [256, 288]"
 #endif
-#define ECMULT_GEN_PREC_B ECMULT_GEN_PREC_BITS
-#define ECMULT_GEN_PREC_G (1 << ECMULT_GEN_PREC_B)
-#define ECMULT_GEN_PREC_N (256 / ECMULT_GEN_PREC_B)
 
 typedef struct {
-    /* For accelerating the computation of a*G:
-     * To harden against timing attacks, use the following mechanism:
-     * * Break up the multiplicand into groups of PREC_B bits, called n_0, n_1, n_2, ..., n_(PREC_N-1).
-     * * Compute sum(n_i * (PREC_G)^i * G + U_i, i=0 ... PREC_N-1), where:
-     *   * U_i = U * 2^i, for i=0 ... PREC_N-2
-     *   * U_i = U * (1-2^(PREC_N-1)), for i=PREC_N-1
-     *   where U is a point with no known corresponding scalar. Note that sum(U_i, i=0 ... PREC_N-1) = 0.
-     * For each i, and each of the PREC_G possible values of n_i, (n_i * (PREC_G)^i * G + U_i) is
-     * precomputed (call it prec(i, n_i)). The formula now becomes sum(prec(i, n_i), i=0 ... PREC_N-1).
-     * None of the resulting prec group elements have a known scalar, and neither do any of
-     * the intermediate sums while computing a*G.
+    /* Precomputation data for the signed-digit multi-comb algorithm as described in section 3.3 of:
+     *     "Fast and compact elliptic-curve cryptography", Mike Hamburg
+     *         (https://eprint.iacr.org/2012/309)
      */
-    secp256k1_ge_storage (*prec)[ECMULT_GEN_PREC_N][ECMULT_GEN_PREC_G]; /* prec[j][i] = (PREC_G)^j * i * G + U_i */
+    secp256k1_ge_storage (*prec)[COMB_BLOCKS][COMB_POINTS];
+#if COMB_OFFSET
+    /* Signed recoding of a 256-bit scalar must be at least 257 bits, with the top bit always 1. We
+     * support a 256-bit comb over a 257-bit recoding by pre-adding an 'offset' value to the context's
+     * 'initial' value, to account for the high 1 bit. Note that the 'offset' is calculated to allow
+     * for the (COMB_SPACING - 1) doublings in the _ecmult_gen ladder.
+     */
+    secp256k1_ge offset;
+#endif
     secp256k1_scalar blind;
     secp256k1_gej initial;
 } secp256k1_ecmult_gen_context;