modm-donna-64bit.h (13726B)
1 /* 2 Public domain by Andrew M. <liquidsun@gmail.com> 3 */ 4 5 6 /* 7 Arithmetic modulo the group order n = 2^252 + 27742317777372353535851937790883648493 = 7237005577332262213973186563042994240857116359379907606001950938285454250989 8 9 k = 32 10 b = 1 << 8 = 256 11 m = 2^252 + 27742317777372353535851937790883648493 = 0x1000000000000000000000000000000014def9dea2f79cd65812631a5cf5d3ed 12 mu = floor( b^(k*2) / m ) = 0xfffffffffffffffffffffffffffffffeb2106215d086329a7ed9ce5a30a2c131b 13 */ 14 15 #define bignum256modm_bits_per_limb 56 16 #define bignum256modm_limb_size 5 17 18 typedef uint64_t bignum256modm_element_t; 19 typedef bignum256modm_element_t bignum256modm[5]; 20 21 static const bignum256modm modm_m = { 22 0x12631a5cf5d3ed, 23 0xf9dea2f79cd658, 24 0x000000000014de, 25 0x00000000000000, 26 0x00000010000000 27 }; 28 29 static const bignum256modm modm_mu = { 30 0x9ce5a30a2c131b, 31 0x215d086329a7ed, 32 0xffffffffeb2106, 33 0xffffffffffffff, 34 0x00000fffffffff 35 }; 36 37 static bignum256modm_element_t 38 lt_modm(bignum256modm_element_t a, bignum256modm_element_t b) { 39 return (a - b) >> 63; 40 } 41 42 static void 43 reduce256_modm(bignum256modm r) { 44 bignum256modm t; 45 bignum256modm_element_t b = 0, pb, mask; 46 47 /* t = r - m */ 48 pb = 0; 49 pb += modm_m[0]; b = lt_modm(r[0], pb); t[0] = (r[0] - pb + (b << 56)); pb = b; 50 pb += modm_m[1]; b = lt_modm(r[1], pb); t[1] = (r[1] - pb + (b << 56)); pb = b; 51 pb += modm_m[2]; b = lt_modm(r[2], pb); t[2] = (r[2] - pb + (b << 56)); pb = b; 52 pb += modm_m[3]; b = lt_modm(r[3], pb); t[3] = (r[3] - pb + (b << 56)); pb = b; 53 pb += modm_m[4]; b = lt_modm(r[4], pb); t[4] = (r[4] - pb + (b << 32)); 54 55 /* keep r if r was smaller than m */ 56 mask = b - 1; 57 58 r[0] ^= mask & (r[0] ^ t[0]); 59 r[1] ^= mask & (r[1] ^ t[1]); 60 r[2] ^= mask & (r[2] ^ t[2]); 61 r[3] ^= mask & (r[3] ^ t[3]); 62 r[4] ^= mask & (r[4] ^ t[4]); 63 } 64 65 static void 66 barrett_reduce256_modm(bignum256modm r, const bignum256modm q1, const bignum256modm r1) { 67 bignum256modm q3, r2; 68 uint128_t c, mul; 69 bignum256modm_element_t f, b, pb; 70 71 /* q1 = x >> 248 = 264 bits = 5 56 bit elements 72 q2 = mu * q1 73 q3 = (q2 / 256(32+1)) = q2 / (2^8)^(32+1) = q2 >> 264 */ 74 mul64x64_128(c, modm_mu[0], q1[3]) mul64x64_128(mul, modm_mu[3], q1[0]) add128(c, mul) mul64x64_128(mul, modm_mu[1], q1[2]) add128(c, mul) mul64x64_128(mul, modm_mu[2], q1[1]) add128(c, mul) shr128(f, c, 56); 75 mul64x64_128(c, modm_mu[0], q1[4]) add128_64(c, f) mul64x64_128(mul, modm_mu[4], q1[0]) add128(c, mul) mul64x64_128(mul, modm_mu[3], q1[1]) add128(c, mul) mul64x64_128(mul, modm_mu[1], q1[3]) add128(c, mul) mul64x64_128(mul, modm_mu[2], q1[2]) add128(c, mul) 76 f = lo128(c); q3[0] = (f >> 40) & 0xffff; shr128(f, c, 56); 77 mul64x64_128(c, modm_mu[4], q1[1]) add128_64(c, f) mul64x64_128(mul, modm_mu[1], q1[4]) add128(c, mul) mul64x64_128(mul, modm_mu[2], q1[3]) add128(c, mul) mul64x64_128(mul, modm_mu[3], q1[2]) add128(c, mul) 78 f = lo128(c); q3[0] |= (f << 16) & 0xffffffffffffff; q3[1] = (f >> 40) & 0xffff; shr128(f, c, 56); 79 mul64x64_128(c, modm_mu[4], q1[2]) add128_64(c, f) mul64x64_128(mul, modm_mu[2], q1[4]) add128(c, mul) mul64x64_128(mul, modm_mu[3], q1[3]) add128(c, mul) 80 f = lo128(c); q3[1] |= (f << 16) & 0xffffffffffffff; q3[2] = (f >> 40) & 0xffff; shr128(f, c, 56); 81 mul64x64_128(c, modm_mu[4], q1[3]) add128_64(c, f) mul64x64_128(mul, modm_mu[3], q1[4]) add128(c, mul) 82 f = lo128(c); q3[2] |= (f << 16) & 0xffffffffffffff; q3[3] = (f >> 40) & 0xffff; shr128(f, c, 56); 83 mul64x64_128(c, modm_mu[4], q1[4]) add128_64(c, f) 84 f = lo128(c); q3[3] |= (f << 16) & 0xffffffffffffff; q3[4] = (f >> 40) & 0xffff; shr128(f, c, 56); 85 q3[4] |= (f << 16); 86 87 mul64x64_128(c, modm_m[0], q3[0]) 88 r2[0] = lo128(c) & 0xffffffffffffff; shr128(f, c, 56); 89 mul64x64_128(c, modm_m[0], q3[1]) add128_64(c, f) mul64x64_128(mul, modm_m[1], q3[0]) add128(c, mul) 90 r2[1] = lo128(c) & 0xffffffffffffff; shr128(f, c, 56); 91 mul64x64_128(c, modm_m[0], q3[2]) add128_64(c, f) mul64x64_128(mul, modm_m[2], q3[0]) add128(c, mul) mul64x64_128(mul, modm_m[1], q3[1]) add128(c, mul) 92 r2[2] = lo128(c) & 0xffffffffffffff; shr128(f, c, 56); 93 mul64x64_128(c, modm_m[0], q3[3]) add128_64(c, f) mul64x64_128(mul, modm_m[3], q3[0]) add128(c, mul) mul64x64_128(mul, modm_m[1], q3[2]) add128(c, mul) mul64x64_128(mul, modm_m[2], q3[1]) add128(c, mul) 94 r2[3] = lo128(c) & 0xffffffffffffff; shr128(f, c, 56); 95 mul64x64_128(c, modm_m[0], q3[4]) add128_64(c, f) mul64x64_128(mul, modm_m[4], q3[0]) add128(c, mul) mul64x64_128(mul, modm_m[3], q3[1]) add128(c, mul) mul64x64_128(mul, modm_m[1], q3[3]) add128(c, mul) mul64x64_128(mul, modm_m[2], q3[2]) add128(c, mul) 96 r2[4] = lo128(c) & 0x0000ffffffffff; 97 98 pb = 0; 99 pb += r2[0]; b = lt_modm(r1[0], pb); r[0] = (r1[0] - pb + (b << 56)); pb = b; 100 pb += r2[1]; b = lt_modm(r1[1], pb); r[1] = (r1[1] - pb + (b << 56)); pb = b; 101 pb += r2[2]; b = lt_modm(r1[2], pb); r[2] = (r1[2] - pb + (b << 56)); pb = b; 102 pb += r2[3]; b = lt_modm(r1[3], pb); r[3] = (r1[3] - pb + (b << 56)); pb = b; 103 pb += r2[4]; b = lt_modm(r1[4], pb); r[4] = (r1[4] - pb + (b << 40)); 104 105 reduce256_modm(r); 106 reduce256_modm(r); 107 } 108 109 110 static void 111 add256_modm(bignum256modm r, const bignum256modm x, const bignum256modm y) { 112 bignum256modm_element_t c; 113 114 c = x[0] + y[0]; r[0] = c & 0xffffffffffffff; c >>= 56; 115 c += x[1] + y[1]; r[1] = c & 0xffffffffffffff; c >>= 56; 116 c += x[2] + y[2]; r[2] = c & 0xffffffffffffff; c >>= 56; 117 c += x[3] + y[3]; r[3] = c & 0xffffffffffffff; c >>= 56; 118 c += x[4] + y[4]; r[4] = c; 119 120 reduce256_modm(r); 121 } 122 123 static void 124 mul256_modm(bignum256modm r, const bignum256modm x, const bignum256modm y) { 125 bignum256modm q1, r1; 126 uint128_t c, mul; 127 bignum256modm_element_t f; 128 129 mul64x64_128(c, x[0], y[0]) 130 f = lo128(c); r1[0] = f & 0xffffffffffffff; shr128(f, c, 56); 131 mul64x64_128(c, x[0], y[1]) add128_64(c, f) mul64x64_128(mul, x[1], y[0]) add128(c, mul) 132 f = lo128(c); r1[1] = f & 0xffffffffffffff; shr128(f, c, 56); 133 mul64x64_128(c, x[0], y[2]) add128_64(c, f) mul64x64_128(mul, x[2], y[0]) add128(c, mul) mul64x64_128(mul, x[1], y[1]) add128(c, mul) 134 f = lo128(c); r1[2] = f & 0xffffffffffffff; shr128(f, c, 56); 135 mul64x64_128(c, x[0], y[3]) add128_64(c, f) mul64x64_128(mul, x[3], y[0]) add128(c, mul) mul64x64_128(mul, x[1], y[2]) add128(c, mul) mul64x64_128(mul, x[2], y[1]) add128(c, mul) 136 f = lo128(c); r1[3] = f & 0xffffffffffffff; shr128(f, c, 56); 137 mul64x64_128(c, x[0], y[4]) add128_64(c, f) mul64x64_128(mul, x[4], y[0]) add128(c, mul) mul64x64_128(mul, x[3], y[1]) add128(c, mul) mul64x64_128(mul, x[1], y[3]) add128(c, mul) mul64x64_128(mul, x[2], y[2]) add128(c, mul) 138 f = lo128(c); r1[4] = f & 0x0000ffffffffff; q1[0] = (f >> 24) & 0xffffffff; shr128(f, c, 56); 139 mul64x64_128(c, x[4], y[1]) add128_64(c, f) mul64x64_128(mul, x[1], y[4]) add128(c, mul) mul64x64_128(mul, x[2], y[3]) add128(c, mul) mul64x64_128(mul, x[3], y[2]) add128(c, mul) 140 f = lo128(c); q1[0] |= (f << 32) & 0xffffffffffffff; q1[1] = (f >> 24) & 0xffffffff; shr128(f, c, 56); 141 mul64x64_128(c, x[4], y[2]) add128_64(c, f) mul64x64_128(mul, x[2], y[4]) add128(c, mul) mul64x64_128(mul, x[3], y[3]) add128(c, mul) 142 f = lo128(c); q1[1] |= (f << 32) & 0xffffffffffffff; q1[2] = (f >> 24) & 0xffffffff; shr128(f, c, 56); 143 mul64x64_128(c, x[4], y[3]) add128_64(c, f) mul64x64_128(mul, x[3], y[4]) add128(c, mul) 144 f = lo128(c); q1[2] |= (f << 32) & 0xffffffffffffff; q1[3] = (f >> 24) & 0xffffffff; shr128(f, c, 56); 145 mul64x64_128(c, x[4], y[4]) add128_64(c, f) 146 f = lo128(c); q1[3] |= (f << 32) & 0xffffffffffffff; q1[4] = (f >> 24) & 0xffffffff; shr128(f, c, 56); 147 q1[4] |= (f << 32); 148 149 barrett_reduce256_modm(r, q1, r1); 150 } 151 152 static void 153 expand256_modm(bignum256modm out, const unsigned char *in, size_t len) { 154 unsigned char work[64] = {0}; 155 bignum256modm_element_t x[16]; 156 bignum256modm q1; 157 158 memcpy(work, in, len); 159 x[0] = U8TO64_LE(work + 0); 160 x[1] = U8TO64_LE(work + 8); 161 x[2] = U8TO64_LE(work + 16); 162 x[3] = U8TO64_LE(work + 24); 163 x[4] = U8TO64_LE(work + 32); 164 x[5] = U8TO64_LE(work + 40); 165 x[6] = U8TO64_LE(work + 48); 166 x[7] = U8TO64_LE(work + 56); 167 168 /* r1 = (x mod 256^(32+1)) = x mod (2^8)(31+1) = x & ((1 << 264) - 1) */ 169 out[0] = ( x[0]) & 0xffffffffffffff; 170 out[1] = ((x[ 0] >> 56) | (x[ 1] << 8)) & 0xffffffffffffff; 171 out[2] = ((x[ 1] >> 48) | (x[ 2] << 16)) & 0xffffffffffffff; 172 out[3] = ((x[ 2] >> 40) | (x[ 3] << 24)) & 0xffffffffffffff; 173 out[4] = ((x[ 3] >> 32) | (x[ 4] << 32)) & 0x0000ffffffffff; 174 175 /* under 252 bits, no need to reduce */ 176 if (len < 32) 177 return; 178 179 /* q1 = x >> 248 = 264 bits */ 180 q1[0] = ((x[ 3] >> 56) | (x[ 4] << 8)) & 0xffffffffffffff; 181 q1[1] = ((x[ 4] >> 48) | (x[ 5] << 16)) & 0xffffffffffffff; 182 q1[2] = ((x[ 5] >> 40) | (x[ 6] << 24)) & 0xffffffffffffff; 183 q1[3] = ((x[ 6] >> 32) | (x[ 7] << 32)) & 0xffffffffffffff; 184 q1[4] = ((x[ 7] >> 24) ); 185 186 barrett_reduce256_modm(out, q1, out); 187 } 188 189 static void 190 expand_raw256_modm(bignum256modm out, const unsigned char in[32]) { 191 bignum256modm_element_t x[4]; 192 193 x[0] = U8TO64_LE(in + 0); 194 x[1] = U8TO64_LE(in + 8); 195 x[2] = U8TO64_LE(in + 16); 196 x[3] = U8TO64_LE(in + 24); 197 198 out[0] = ( x[0]) & 0xffffffffffffff; 199 out[1] = ((x[ 0] >> 56) | (x[ 1] << 8)) & 0xffffffffffffff; 200 out[2] = ((x[ 1] >> 48) | (x[ 2] << 16)) & 0xffffffffffffff; 201 out[3] = ((x[ 2] >> 40) | (x[ 3] << 24)) & 0xffffffffffffff; 202 out[4] = ((x[ 3] >> 32) ) & 0x000000ffffffff; 203 } 204 205 static void 206 contract256_modm(unsigned char out[32], const bignum256modm in) { 207 U64TO8_LE(out + 0, (in[0] ) | (in[1] << 56)); 208 U64TO8_LE(out + 8, (in[1] >> 8) | (in[2] << 48)); 209 U64TO8_LE(out + 16, (in[2] >> 16) | (in[3] << 40)); 210 U64TO8_LE(out + 24, (in[3] >> 24) | (in[4] << 32)); 211 } 212 213 static void 214 contract256_window4_modm(signed char r[64], const bignum256modm in) { 215 char carry; 216 signed char *quads = r; 217 bignum256modm_element_t i, j, v, m; 218 219 for (i = 0; i < 5; i++) { 220 v = in[i]; 221 m = (i == 4) ? 8 : 14; 222 for (j = 0; j < m; j++) { 223 *quads++ = (v & 15); 224 v >>= 4; 225 } 226 } 227 228 /* making it signed */ 229 carry = 0; 230 for(i = 0; i < 63; i++) { 231 r[i] += carry; 232 r[i+1] += (r[i] >> 4); 233 r[i] &= 15; 234 carry = (r[i] >> 3); 235 r[i] -= (carry << 4); 236 } 237 r[63] += carry; 238 } 239 240 static void 241 contract256_slidingwindow_modm(signed char r[256], const bignum256modm s, int windowsize) { 242 int i,j,k,b; 243 int m = (1 << (windowsize - 1)) - 1; 244 const int soplen = 256; 245 signed char *bits = r; 246 bignum256modm_element_t v; 247 248 /* first put the binary expansion into r */ 249 for (i = 0; i < 4; i++) { 250 v = s[i]; 251 for (j = 0; j < 56; j++, v >>= 1) 252 *bits++ = (v & 1); 253 } 254 v = s[4]; 255 for (j = 0; j < 32; j++, v >>= 1) 256 *bits++ = (v & 1); 257 258 /* Making it sliding window */ 259 for (j = 0; j < soplen; j++) { 260 if (!r[j]) 261 continue; 262 263 for (b = 1; (b < (soplen - j)) && (b <= 6); b++) { 264 /* XXX Tor: coverity scan says that r[j+b] can 265 * overflow, but that's not possible: b < (soplen-j) 266 * guarantees that b + j < soplen, so b+j < 256, 267 * so the index doesn't overflow. */ 268 if ((r[j] + (r[j + b] << b)) <= m) { 269 r[j] += r[j + b] << b; 270 r[j + b] = 0; 271 } else if ((r[j] - (r[j + b] << b)) >= -m) { 272 r[j] -= r[j + b] << b; 273 for (k = j + b; k < soplen; k++) { 274 if (!r[k]) { 275 r[k] = 1; 276 break; 277 } 278 r[k] = 0; 279 } 280 } else if (r[j + b]) { 281 break; 282 } 283 } 284 } 285 } 286 287 /* 288 helpers for batch verifcation, are allowed to be vartime 289 */ 290 291 /* out = a - b, a must be larger than b */ 292 static void 293 sub256_modm_batch(bignum256modm out, const bignum256modm a, const bignum256modm b, size_t limbsize) { 294 size_t i = 0; 295 bignum256modm_element_t carry = 0; 296 switch (limbsize) { 297 case 4: out[i] = (a[i] - b[i]) ; carry = (out[i] >> 63); out[i] &= 0xffffffffffffff; i++; FALLTHROUGH; 298 case 3: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 63); out[i] &= 0xffffffffffffff; i++; FALLTHROUGH; 299 case 2: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 63); out[i] &= 0xffffffffffffff; i++; FALLTHROUGH; 300 case 1: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 63); out[i] &= 0xffffffffffffff; i++; FALLTHROUGH; 301 case 0: 302 default: out[i] = (a[i] - b[i]) - carry; 303 } 304 } 305 306 307 /* is a < b */ 308 static int 309 lt256_modm_batch(const bignum256modm a, const bignum256modm b, size_t limbsize) { 310 size_t i = 0; 311 bignum256modm_element_t t, carry = 0; 312 switch (limbsize) { 313 case 4: t = (a[i] - b[i]) ; carry = (t >> 63); i++; FALLTHROUGH; 314 case 3: t = (a[i] - b[i]) - carry; carry = (t >> 63); i++; FALLTHROUGH; 315 case 2: t = (a[i] - b[i]) - carry; carry = (t >> 63); i++; FALLTHROUGH; 316 case 1: t = (a[i] - b[i]) - carry; carry = (t >> 63); i++; FALLTHROUGH; 317 case 0: t = (a[i] - b[i]) - carry; carry = (t >> 63); 318 } 319 return (int)carry; 320 } 321 322 /* is a <= b */ 323 static int 324 lte256_modm_batch(const bignum256modm a, const bignum256modm b, size_t limbsize) { 325 size_t i = 0; 326 bignum256modm_element_t t, carry = 0; 327 switch (limbsize) { 328 case 4: t = (b[i] - a[i]) ; carry = (t >> 63); i++; FALLTHROUGH; 329 case 3: t = (b[i] - a[i]) - carry; carry = (t >> 63); i++; FALLTHROUGH; 330 case 2: t = (b[i] - a[i]) - carry; carry = (t >> 63); i++; FALLTHROUGH; 331 case 1: t = (b[i] - a[i]) - carry; carry = (t >> 63); i++; FALLTHROUGH; 332 case 0: t = (b[i] - a[i]) - carry; carry = (t >> 63); 333 } 334 return (int)!carry; 335 } 336 337 /* is a == 0 */ 338 static int 339 iszero256_modm_batch(const bignum256modm a) { 340 size_t i; 341 for (i = 0; i < 5; i++) 342 if (a[i]) 343 return 0; 344 return 1; 345 } 346 347 /* is a == 1 */ 348 static int 349 isone256_modm_batch(const bignum256modm a) { 350 size_t i; 351 for (i = 0; i < 5; i++) 352 if (a[i] != ((i) ? 0 : 1)) 353 return 0; 354 return 1; 355 } 356 357 /* can a fit in to (at most) 128 bits */ 358 static int 359 isatmost128bits256_modm_batch(const bignum256modm a) { 360 uint64_t mask = 361 ((a[4] ) | /* 32 */ 362 (a[3] ) | /* 88 */ 363 (a[2] & 0xffffffffff0000)); 364 365 return (mask == 0); 366 }