Skip to content

Commit 1253a27

Browse files
Merge #1033: Add _fe_half and use in _gej_add_ge and _gej_double
e848c37 Update sage files for new formulae (Peter Dettman) d64bb5d Add fe_half tests for worst-case inputs (Peter Dettman) 4eb8b93 Further improve doubling formula using fe_half (Peter Dettman) 557b31f Doubling formula using fe_half (Pieter Wuille) 2cbb4b1 Run more iterations of run_field_misc (Pieter Wuille) 9cc5c25 Add test for secp256k1_fe_half (Pieter Wuille) 925f78d Add _fe_half and use in _gej_add_ge (Peter Dettman) Pull request description: - Trades 1 _half for 3 _mul_int and 2 _normalize_weak Gives around 2-3% faster signing and ECDH, depending on compiler/platform. ACKs for top commit: sipa: utACK e848c37 jonasnick: ACK e848c37 real-or-random: ACK e848c37 Tree-SHA512: 81a6c93b3d983f1b48ec8e8b6f262ba914215045a95415147f41ee6e85296aa4d0cbbad9f370cdf475571447baad861d2cc8e0b04a71202d48959cb8a098f584
2 parents 3ef94aa + e848c37 commit 1253a27

7 files changed

+318
-75
lines changed

sage/prove_group_implementations.sage

+17-24
Original file line numberDiff line numberDiff line change
@@ -8,25 +8,20 @@ load("weierstrass_prover.sage")
88
def formula_secp256k1_gej_double_var(a):
99
"""libsecp256k1's secp256k1_gej_double_var, used by various addition functions"""
1010
rz = a.Z * a.Y
11-
rz = rz * 2
12-
t1 = a.X^2
13-
t1 = t1 * 3
14-
t2 = t1^2
15-
t3 = a.Y^2
16-
t3 = t3 * 2
17-
t4 = t3^2
18-
t4 = t4 * 2
19-
t3 = t3 * a.X
20-
rx = t3
21-
rx = rx * 4
22-
rx = -rx
23-
rx = rx + t2
24-
t2 = -t2
25-
t3 = t3 * 6
26-
t3 = t3 + t2
27-
ry = t1 * t3
28-
t2 = -t4
29-
ry = ry + t2
11+
s = a.Y^2
12+
l = a.X^2
13+
l = l * 3
14+
l = l / 2
15+
t = -s
16+
t = t * a.X
17+
rx = l^2
18+
rx = rx + t
19+
rx = rx + t
20+
s = s^2
21+
t = t + rx
22+
ry = t * l
23+
ry = ry + s
24+
ry = -ry
3025
return jacobianpoint(rx, ry, rz)
3126

3227
def formula_secp256k1_gej_add_var(branch, a, b):
@@ -197,7 +192,8 @@ def formula_secp256k1_gej_add_ge(branch, a, b):
197192
rr_alt = rr
198193
m_alt = m
199194
n = m_alt^2
200-
q = n * t
195+
q = -t
196+
q = q * n
201197
n = n^2
202198
if degenerate:
203199
n = m
@@ -210,17 +206,14 @@ def formula_secp256k1_gej_add_ge(branch, a, b):
210206
zeroes.update({rz : 'r.z=0'})
211207
else:
212208
nonzeroes.update({rz : 'r.z!=0'})
213-
rz = rz * 2
214-
q = -q
215209
t = t + q
216210
rx = t
217211
t = t * 2
218212
t = t + q
219213
t = t * rr_alt
220214
t = t + n
221215
ry = -t
222-
rx = rx * 4
223-
ry = ry * 4
216+
ry = ry / 2
224217
if a_infinity:
225218
rx = b.X
226219
ry = b.Y

src/bench_internal.c

+10
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,15 @@ void bench_scalar_inverse_var(void* arg, int iters) {
140140
CHECK(j <= iters);
141141
}
142142

143+
void bench_field_half(void* arg, int iters) {
144+
int i;
145+
bench_inv *data = (bench_inv*)arg;
146+
147+
for (i = 0; i < iters; i++) {
148+
secp256k1_fe_half(&data->fe[0]);
149+
}
150+
}
151+
143152
void bench_field_normalize(void* arg, int iters) {
144153
int i;
145154
bench_inv *data = (bench_inv*)arg;
@@ -354,6 +363,7 @@ int main(int argc, char **argv) {
354363
if (d || have_flag(argc, argv, "scalar") || have_flag(argc, argv, "inverse")) run_benchmark("scalar_inverse", bench_scalar_inverse, bench_setup, NULL, &data, 10, iters);
355364
if (d || have_flag(argc, argv, "scalar") || have_flag(argc, argv, "inverse")) run_benchmark("scalar_inverse_var", bench_scalar_inverse_var, bench_setup, NULL, &data, 10, iters);
356365

366+
if (d || have_flag(argc, argv, "field") || have_flag(argc, argv, "half")) run_benchmark("field_half", bench_field_half, bench_setup, NULL, &data, 10, iters*100);
357367
if (d || have_flag(argc, argv, "field") || have_flag(argc, argv, "normalize")) run_benchmark("field_normalize", bench_field_normalize, bench_setup, NULL, &data, 10, iters*100);
358368
if (d || have_flag(argc, argv, "field") || have_flag(argc, argv, "normalize")) run_benchmark("field_normalize_weak", bench_field_normalize_weak, bench_setup, NULL, &data, 10, iters*100);
359369
if (d || have_flag(argc, argv, "field") || have_flag(argc, argv, "sqr")) run_benchmark("field_sqr", bench_field_sqr, bench_setup, NULL, &data, 10, iters*10);

src/field.h

+9
Original file line numberDiff line numberDiff line change
@@ -130,4 +130,13 @@ static void secp256k1_fe_storage_cmov(secp256k1_fe_storage *r, const secp256k1_f
130130
/** If flag is true, set *r equal to *a; otherwise leave it. Constant-time. Both *r and *a must be initialized.*/
131131
static void secp256k1_fe_cmov(secp256k1_fe *r, const secp256k1_fe *a, int flag);
132132

133+
/** Halves the value of a field element modulo the field prime. Constant-time.
134+
* For an input magnitude 'm', the output magnitude is set to 'floor(m/2) + 1'.
135+
* The output is not guaranteed to be normalized, regardless of the input. */
136+
static void secp256k1_fe_half(secp256k1_fe *r);
137+
138+
/** Sets each limb of 'r' to its upper bound at magnitude 'm'. The output will also have its
139+
* magnitude set to 'm' and is normalized if (and only if) 'm' is zero. */
140+
static void secp256k1_fe_get_bounds(secp256k1_fe *r, int m);
141+
133142
#endif /* SECP256K1_FIELD_H */

src/field_10x26_impl.h

+96
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,26 @@ static void secp256k1_fe_verify(const secp256k1_fe *a) {
4949
}
5050
#endif
5151

52+
static void secp256k1_fe_get_bounds(secp256k1_fe *r, int m) {
53+
VERIFY_CHECK(m >= 0);
54+
VERIFY_CHECK(m <= 2048);
55+
r->n[0] = 0x3FFFFFFUL * 2 * m;
56+
r->n[1] = 0x3FFFFFFUL * 2 * m;
57+
r->n[2] = 0x3FFFFFFUL * 2 * m;
58+
r->n[3] = 0x3FFFFFFUL * 2 * m;
59+
r->n[4] = 0x3FFFFFFUL * 2 * m;
60+
r->n[5] = 0x3FFFFFFUL * 2 * m;
61+
r->n[6] = 0x3FFFFFFUL * 2 * m;
62+
r->n[7] = 0x3FFFFFFUL * 2 * m;
63+
r->n[8] = 0x3FFFFFFUL * 2 * m;
64+
r->n[9] = 0x03FFFFFUL * 2 * m;
65+
#ifdef VERIFY
66+
r->magnitude = m;
67+
r->normalized = (m == 0);
68+
secp256k1_fe_verify(r);
69+
#endif
70+
}
71+
5272
static void secp256k1_fe_normalize(secp256k1_fe *r) {
5373
uint32_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4],
5474
t5 = r->n[5], t6 = r->n[6], t7 = r->n[7], t8 = r->n[8], t9 = r->n[9];
@@ -1133,6 +1153,82 @@ static SECP256K1_INLINE void secp256k1_fe_cmov(secp256k1_fe *r, const secp256k1_
11331153
#endif
11341154
}
11351155

1156+
static SECP256K1_INLINE void secp256k1_fe_half(secp256k1_fe *r) {
1157+
uint32_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4],
1158+
t5 = r->n[5], t6 = r->n[6], t7 = r->n[7], t8 = r->n[8], t9 = r->n[9];
1159+
uint32_t one = (uint32_t)1;
1160+
uint32_t mask = -(t0 & one) >> 6;
1161+
1162+
#ifdef VERIFY
1163+
secp256k1_fe_verify(r);
1164+
VERIFY_CHECK(r->magnitude < 32);
1165+
#endif
1166+
1167+
/* Bounds analysis (over the rationals).
1168+
*
1169+
* Let m = r->magnitude
1170+
* C = 0x3FFFFFFUL * 2
1171+
* D = 0x03FFFFFUL * 2
1172+
*
1173+
* Initial bounds: t0..t8 <= C * m
1174+
* t9 <= D * m
1175+
*/
1176+
1177+
t0 += 0x3FFFC2FUL & mask;
1178+
t1 += 0x3FFFFBFUL & mask;
1179+
t2 += mask;
1180+
t3 += mask;
1181+
t4 += mask;
1182+
t5 += mask;
1183+
t6 += mask;
1184+
t7 += mask;
1185+
t8 += mask;
1186+
t9 += mask >> 4;
1187+
1188+
VERIFY_CHECK((t0 & one) == 0);
1189+
1190+
/* t0..t8: added <= C/2
1191+
* t9: added <= D/2
1192+
*
1193+
* Current bounds: t0..t8 <= C * (m + 1/2)
1194+
* t9 <= D * (m + 1/2)
1195+
*/
1196+
1197+
r->n[0] = (t0 >> 1) + ((t1 & one) << 25);
1198+
r->n[1] = (t1 >> 1) + ((t2 & one) << 25);
1199+
r->n[2] = (t2 >> 1) + ((t3 & one) << 25);
1200+
r->n[3] = (t3 >> 1) + ((t4 & one) << 25);
1201+
r->n[4] = (t4 >> 1) + ((t5 & one) << 25);
1202+
r->n[5] = (t5 >> 1) + ((t6 & one) << 25);
1203+
r->n[6] = (t6 >> 1) + ((t7 & one) << 25);
1204+
r->n[7] = (t7 >> 1) + ((t8 & one) << 25);
1205+
r->n[8] = (t8 >> 1) + ((t9 & one) << 25);
1206+
r->n[9] = (t9 >> 1);
1207+
1208+
/* t0..t8: shifted right and added <= C/4 + 1/2
1209+
* t9: shifted right
1210+
*
1211+
* Current bounds: t0..t8 <= C * (m/2 + 1/2)
1212+
* t9 <= D * (m/2 + 1/4)
1213+
*/
1214+
1215+
#ifdef VERIFY
1216+
/* Therefore the output magnitude (M) has to be set such that:
1217+
* t0..t8: C * M >= C * (m/2 + 1/2)
1218+
* t9: D * M >= D * (m/2 + 1/4)
1219+
*
1220+
* It suffices for all limbs that, for any input magnitude m:
1221+
* M >= m/2 + 1/2
1222+
*
1223+
* and since we want the smallest such integer value for M:
1224+
* M == floor(m/2) + 1
1225+
*/
1226+
r->magnitude = (r->magnitude >> 1) + 1;
1227+
r->normalized = 0;
1228+
secp256k1_fe_verify(r);
1229+
#endif
1230+
}
1231+
11361232
static SECP256K1_INLINE void secp256k1_fe_storage_cmov(secp256k1_fe_storage *r, const secp256k1_fe_storage *a, int flag) {
11371233
uint32_t mask0, mask1;
11381234
VG_CHECK_VERIFY(r->n, sizeof(r->n));

src/field_5x52_impl.h

+80
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,21 @@ static void secp256k1_fe_verify(const secp256k1_fe *a) {
5858
}
5959
#endif
6060

61+
static void secp256k1_fe_get_bounds(secp256k1_fe *r, int m) {
62+
VERIFY_CHECK(m >= 0);
63+
VERIFY_CHECK(m <= 2048);
64+
r->n[0] = 0xFFFFFFFFFFFFFULL * 2 * m;
65+
r->n[1] = 0xFFFFFFFFFFFFFULL * 2 * m;
66+
r->n[2] = 0xFFFFFFFFFFFFFULL * 2 * m;
67+
r->n[3] = 0xFFFFFFFFFFFFFULL * 2 * m;
68+
r->n[4] = 0x0FFFFFFFFFFFFULL * 2 * m;
69+
#ifdef VERIFY
70+
r->magnitude = m;
71+
r->normalized = (m == 0);
72+
secp256k1_fe_verify(r);
73+
#endif
74+
}
75+
6176
static void secp256k1_fe_normalize(secp256k1_fe *r) {
6277
uint64_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4];
6378

@@ -477,6 +492,71 @@ static SECP256K1_INLINE void secp256k1_fe_cmov(secp256k1_fe *r, const secp256k1_
477492
#endif
478493
}
479494

495+
static SECP256K1_INLINE void secp256k1_fe_half(secp256k1_fe *r) {
496+
uint64_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4];
497+
uint64_t one = (uint64_t)1;
498+
uint64_t mask = -(t0 & one) >> 12;
499+
500+
#ifdef VERIFY
501+
secp256k1_fe_verify(r);
502+
VERIFY_CHECK(r->magnitude < 32);
503+
#endif
504+
505+
/* Bounds analysis (over the rationals).
506+
*
507+
* Let m = r->magnitude
508+
* C = 0xFFFFFFFFFFFFFULL * 2
509+
* D = 0x0FFFFFFFFFFFFULL * 2
510+
*
511+
* Initial bounds: t0..t3 <= C * m
512+
* t4 <= D * m
513+
*/
514+
515+
t0 += 0xFFFFEFFFFFC2FULL & mask;
516+
t1 += mask;
517+
t2 += mask;
518+
t3 += mask;
519+
t4 += mask >> 4;
520+
521+
VERIFY_CHECK((t0 & one) == 0);
522+
523+
/* t0..t3: added <= C/2
524+
* t4: added <= D/2
525+
*
526+
* Current bounds: t0..t3 <= C * (m + 1/2)
527+
* t4 <= D * (m + 1/2)
528+
*/
529+
530+
r->n[0] = (t0 >> 1) + ((t1 & one) << 51);
531+
r->n[1] = (t1 >> 1) + ((t2 & one) << 51);
532+
r->n[2] = (t2 >> 1) + ((t3 & one) << 51);
533+
r->n[3] = (t3 >> 1) + ((t4 & one) << 51);
534+
r->n[4] = (t4 >> 1);
535+
536+
/* t0..t3: shifted right and added <= C/4 + 1/2
537+
* t4: shifted right
538+
*
539+
* Current bounds: t0..t3 <= C * (m/2 + 1/2)
540+
* t4 <= D * (m/2 + 1/4)
541+
*/
542+
543+
#ifdef VERIFY
544+
/* Therefore the output magnitude (M) has to be set such that:
545+
* t0..t3: C * M >= C * (m/2 + 1/2)
546+
* t4: D * M >= D * (m/2 + 1/4)
547+
*
548+
* It suffices for all limbs that, for any input magnitude m:
549+
* M >= m/2 + 1/2
550+
*
551+
* and since we want the smallest such integer value for M:
552+
* M == floor(m/2) + 1
553+
*/
554+
r->magnitude = (r->magnitude >> 1) + 1;
555+
r->normalized = 0;
556+
secp256k1_fe_verify(r);
557+
#endif
558+
}
559+
480560
static SECP256K1_INLINE void secp256k1_fe_storage_cmov(secp256k1_fe_storage *r, const secp256k1_fe_storage *a, int flag) {
481561
uint64_t mask0, mask1;
482562
VG_CHECK_VERIFY(r->n, sizeof(r->n));

0 commit comments

Comments
 (0)