tor

The Tor anonymity network
git clone https://git.dasho.dev/tor.git
Log | Files | Refs | README | LICENSE

modm-donna-32bit.h (21464B)


      1 /*
      2 Public domain by Andrew M. <liquidsun@gmail.com>
      3 */
      4 
      5 
      6 /*
      7 Arithmetic modulo the group order n = 2^252 +  27742317777372353535851937790883648493 = 7237005577332262213973186563042994240857116359379907606001950938285454250989
      8 
      9 k = 32
     10 b = 1 << 8 = 256
     11 m = 2^252 + 27742317777372353535851937790883648493 = 0x1000000000000000000000000000000014def9dea2f79cd65812631a5cf5d3ed
     12 mu = floor( b^(k*2) / m ) = 0xfffffffffffffffffffffffffffffffeb2106215d086329a7ed9ce5a30a2c131b
     13 */
     14 
     15 #define bignum256modm_bits_per_limb 30
     16 #define bignum256modm_limb_size 9
     17 
     18 typedef uint32_t bignum256modm_element_t;
     19 typedef bignum256modm_element_t bignum256modm[9];
     20 
     21 static const bignum256modm modm_m = {
     22 0x1cf5d3ed, 0x20498c69, 0x2f79cd65, 0x37be77a8,
     23 0x00000014,	0x00000000, 0x00000000,	0x00000000,
     24 0x00001000
     25 };
     26 
     27 static const bignum256modm modm_mu = {
     28 0x0a2c131b, 0x3673968c, 0x06329a7e, 0x01885742,
     29 0x3fffeb21, 0x3fffffff, 0x3fffffff, 0x3fffffff,
     30 0x000fffff
     31 };
     32 
     33 static bignum256modm_element_t
     34 lt_modm(bignum256modm_element_t a, bignum256modm_element_t b) {
     35 return (a - b) >> 31;
     36 }
     37 
     38 /* see HAC, Alg. 14.42 Step 4 */
     39 static void
     40 reduce256_modm(bignum256modm r) {
     41 bignum256modm t;
     42 bignum256modm_element_t b = 0, pb, mask;
     43 
     44 /* t = r - m */
     45 pb = 0;
     46 pb += modm_m[0]; b = lt_modm(r[0], pb); t[0] = (r[0] - pb + (b << 30)); pb = b;
     47 pb += modm_m[1]; b = lt_modm(r[1], pb); t[1] = (r[1] - pb + (b << 30)); pb = b;
     48 pb += modm_m[2]; b = lt_modm(r[2], pb); t[2] = (r[2] - pb + (b << 30)); pb = b;
     49 pb += modm_m[3]; b = lt_modm(r[3], pb); t[3] = (r[3] - pb + (b << 30)); pb = b;
     50 pb += modm_m[4]; b = lt_modm(r[4], pb); t[4] = (r[4] - pb + (b << 30)); pb = b;
     51 pb += modm_m[5]; b = lt_modm(r[5], pb); t[5] = (r[5] - pb + (b << 30)); pb = b;
     52 pb += modm_m[6]; b = lt_modm(r[6], pb); t[6] = (r[6] - pb + (b << 30)); pb = b;
     53 pb += modm_m[7]; b = lt_modm(r[7], pb); t[7] = (r[7] - pb + (b << 30)); pb = b;
     54 pb += modm_m[8]; b = lt_modm(r[8], pb); t[8] = (r[8] - pb + (b << 16));
     55 
     56 /* keep r if r was smaller than m */
     57 mask = b - 1;
     58 r[0] ^= mask & (r[0] ^ t[0]);
     59 r[1] ^= mask & (r[1] ^ t[1]);
     60 r[2] ^= mask & (r[2] ^ t[2]);
     61 r[3] ^= mask & (r[3] ^ t[3]);
     62 r[4] ^= mask & (r[4] ^ t[4]);
     63 r[5] ^= mask & (r[5] ^ t[5]);
     64 r[6] ^= mask & (r[6] ^ t[6]);
     65 r[7] ^= mask & (r[7] ^ t[7]);
     66 r[8] ^= mask & (r[8] ^ t[8]);
     67 }
     68 
     69 /*
     70 Barrett reduction,  see HAC, Alg. 14.42
     71 
     72 Instead of passing in x, pre-process in to q1 and r1 for efficiency
     73 */
     74 static void
     75 barrett_reduce256_modm(bignum256modm r, const bignum256modm q1, const bignum256modm r1) {
     76 bignum256modm q3, r2;
     77 uint64_t c;
     78 bignum256modm_element_t f, b, pb;
     79 
     80 /* q1 = x >> 248 = 264 bits = 9 30 bit elements
     81    q2 = mu * q1
     82    q3 = (q2 / 256(32+1)) = q2 / (2^8)^(32+1) = q2 >> 264 */
     83 c  = mul32x32_64(modm_mu[0], q1[7]) + mul32x32_64(modm_mu[1], q1[6]) + mul32x32_64(modm_mu[2], q1[5]) + mul32x32_64(modm_mu[3], q1[4]) + mul32x32_64(modm_mu[4], q1[3]) + mul32x32_64(modm_mu[5], q1[2]) + mul32x32_64(modm_mu[6], q1[1]) + mul32x32_64(modm_mu[7], q1[0]); 
     84 c >>= 30;
     85 c += mul32x32_64(modm_mu[0], q1[8]) + mul32x32_64(modm_mu[1], q1[7]) + mul32x32_64(modm_mu[2], q1[6]) + mul32x32_64(modm_mu[3], q1[5]) + mul32x32_64(modm_mu[4], q1[4]) + mul32x32_64(modm_mu[5], q1[3]) + mul32x32_64(modm_mu[6], q1[2]) + mul32x32_64(modm_mu[7], q1[1]) + mul32x32_64(modm_mu[8], q1[0]);
     86 f = (bignum256modm_element_t)c; q3[0] = (f >> 24) & 0x3f; c >>= 30;
     87 c += mul32x32_64(modm_mu[1], q1[8]) + mul32x32_64(modm_mu[2], q1[7]) + mul32x32_64(modm_mu[3], q1[6]) + mul32x32_64(modm_mu[4], q1[5]) + mul32x32_64(modm_mu[5], q1[4]) + mul32x32_64(modm_mu[6], q1[3]) + mul32x32_64(modm_mu[7], q1[2]) + mul32x32_64(modm_mu[8], q1[1]);
     88 f = (bignum256modm_element_t)c; q3[0] |= (f << 6) & 0x3fffffff; q3[1] = (f >> 24) & 0x3f; c >>= 30;
     89 c += mul32x32_64(modm_mu[2], q1[8]) + mul32x32_64(modm_mu[3], q1[7]) + mul32x32_64(modm_mu[4], q1[6]) + mul32x32_64(modm_mu[5], q1[5]) + mul32x32_64(modm_mu[6], q1[4]) + mul32x32_64(modm_mu[7], q1[3]) + mul32x32_64(modm_mu[8], q1[2]);
     90 f = (bignum256modm_element_t)c; q3[1] |= (f << 6) & 0x3fffffff; q3[2] = (f >> 24) & 0x3f; c >>= 30;
     91 c += mul32x32_64(modm_mu[3], q1[8]) + mul32x32_64(modm_mu[4], q1[7]) + mul32x32_64(modm_mu[5], q1[6]) + mul32x32_64(modm_mu[6], q1[5]) + mul32x32_64(modm_mu[7], q1[4]) + mul32x32_64(modm_mu[8], q1[3]);
     92 f = (bignum256modm_element_t)c; q3[2] |= (f << 6) & 0x3fffffff; q3[3] = (f >> 24) & 0x3f; c >>= 30;
     93 c += mul32x32_64(modm_mu[4], q1[8]) + mul32x32_64(modm_mu[5], q1[7]) + mul32x32_64(modm_mu[6], q1[6]) + mul32x32_64(modm_mu[7], q1[5]) + mul32x32_64(modm_mu[8], q1[4]);
     94 f = (bignum256modm_element_t)c; q3[3] |= (f << 6) & 0x3fffffff; q3[4] = (f >> 24) & 0x3f; c >>= 30;
     95 c += mul32x32_64(modm_mu[5], q1[8]) + mul32x32_64(modm_mu[6], q1[7]) + mul32x32_64(modm_mu[7], q1[6]) + mul32x32_64(modm_mu[8], q1[5]);
     96 f = (bignum256modm_element_t)c; q3[4] |= (f << 6) & 0x3fffffff; q3[5] = (f >> 24) & 0x3f; c >>= 30;
     97 c += mul32x32_64(modm_mu[6], q1[8]) + mul32x32_64(modm_mu[7], q1[7]) + mul32x32_64(modm_mu[8], q1[6]);
     98 f = (bignum256modm_element_t)c; q3[5] |= (f << 6) & 0x3fffffff; q3[6] = (f >> 24) & 0x3f; c >>= 30;
     99 c += mul32x32_64(modm_mu[7], q1[8]) + mul32x32_64(modm_mu[8], q1[7]);
    100 f = (bignum256modm_element_t)c; q3[6] |= (f << 6) & 0x3fffffff; q3[7] = (f >> 24) & 0x3f; c >>= 30;
    101 c += mul32x32_64(modm_mu[8], q1[8]);
    102 f = (bignum256modm_element_t)c; q3[7] |= (f << 6) & 0x3fffffff; q3[8] = (bignum256modm_element_t)(c >> 24);
    103 
    104 /* r1 = (x mod 256^(32+1)) = x mod (2^8)(31+1) = x & ((1 << 264) - 1)
    105    r2 = (q3 * m) mod (256^(32+1)) = (q3 * m) & ((1 << 264) - 1) */
    106 c = mul32x32_64(modm_m[0], q3[0]);
    107 r2[0] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
    108 c += mul32x32_64(modm_m[0], q3[1]) + mul32x32_64(modm_m[1], q3[0]);
    109 r2[1] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
    110 c += mul32x32_64(modm_m[0], q3[2]) + mul32x32_64(modm_m[1], q3[1]) + mul32x32_64(modm_m[2], q3[0]);
    111 r2[2] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
    112 c += mul32x32_64(modm_m[0], q3[3]) + mul32x32_64(modm_m[1], q3[2]) + mul32x32_64(modm_m[2], q3[1]) + mul32x32_64(modm_m[3], q3[0]);
    113 r2[3] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
    114 c += mul32x32_64(modm_m[0], q3[4]) + mul32x32_64(modm_m[1], q3[3]) + mul32x32_64(modm_m[2], q3[2]) + mul32x32_64(modm_m[3], q3[1]) + mul32x32_64(modm_m[4], q3[0]);
    115 r2[4] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
    116 c += mul32x32_64(modm_m[0], q3[5]) + mul32x32_64(modm_m[1], q3[4]) + mul32x32_64(modm_m[2], q3[3]) + mul32x32_64(modm_m[3], q3[2]) + mul32x32_64(modm_m[4], q3[1]) + mul32x32_64(modm_m[5], q3[0]);
    117 r2[5] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
    118 c += mul32x32_64(modm_m[0], q3[6]) + mul32x32_64(modm_m[1], q3[5]) + mul32x32_64(modm_m[2], q3[4]) + mul32x32_64(modm_m[3], q3[3]) + mul32x32_64(modm_m[4], q3[2]) + mul32x32_64(modm_m[5], q3[1]) + mul32x32_64(modm_m[6], q3[0]);
    119 r2[6] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
    120 c += mul32x32_64(modm_m[0], q3[7]) + mul32x32_64(modm_m[1], q3[6]) + mul32x32_64(modm_m[2], q3[5]) + mul32x32_64(modm_m[3], q3[4]) + mul32x32_64(modm_m[4], q3[3]) + mul32x32_64(modm_m[5], q3[2]) + mul32x32_64(modm_m[6], q3[1]) + mul32x32_64(modm_m[7], q3[0]);
    121 r2[7] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30;
    122 c += mul32x32_64(modm_m[0], q3[8]) + mul32x32_64(modm_m[1], q3[7]) + mul32x32_64(modm_m[2], q3[6]) + mul32x32_64(modm_m[3], q3[5]) + mul32x32_64(modm_m[4], q3[4]) + mul32x32_64(modm_m[5], q3[3]) + mul32x32_64(modm_m[6], q3[2]) + mul32x32_64(modm_m[7], q3[1]) + mul32x32_64(modm_m[8], q3[0]);
    123 r2[8] = (bignum256modm_element_t)(c & 0xffffff);
    124 
    125 /* r = r1 - r2
    126    if (r < 0) r += (1 << 264) */
    127 pb = 0;
    128 pb += r2[0]; b = lt_modm(r1[0], pb); r[0] = (r1[0] - pb + (b << 30)); pb = b;
    129 pb += r2[1]; b = lt_modm(r1[1], pb); r[1] = (r1[1] - pb + (b << 30)); pb = b;
    130 pb += r2[2]; b = lt_modm(r1[2], pb); r[2] = (r1[2] - pb + (b << 30)); pb = b;
    131 pb += r2[3]; b = lt_modm(r1[3], pb); r[3] = (r1[3] - pb + (b << 30)); pb = b;
    132 pb += r2[4]; b = lt_modm(r1[4], pb); r[4] = (r1[4] - pb + (b << 30)); pb = b;
    133 pb += r2[5]; b = lt_modm(r1[5], pb); r[5] = (r1[5] - pb + (b << 30)); pb = b;
    134 pb += r2[6]; b = lt_modm(r1[6], pb); r[6] = (r1[6] - pb + (b << 30)); pb = b;
    135 pb += r2[7]; b = lt_modm(r1[7], pb); r[7] = (r1[7] - pb + (b << 30)); pb = b;
    136 pb += r2[8]; b = lt_modm(r1[8], pb); r[8] = (r1[8] - pb + (b << 24));
    137 
    138 reduce256_modm(r);
    139 reduce256_modm(r);
    140 }
    141 
    142 /* addition modulo m */
    143 static void
    144 add256_modm(bignum256modm r, const bignum256modm x, const bignum256modm y) {
    145 bignum256modm_element_t c;
    146 
    147 c  = x[0] + y[0]; r[0] = c & 0x3fffffff; c >>= 30;
    148 c += x[1] + y[1]; r[1] = c & 0x3fffffff; c >>= 30;
    149 c += x[2] + y[2]; r[2] = c & 0x3fffffff; c >>= 30;
    150 c += x[3] + y[3]; r[3] = c & 0x3fffffff; c >>= 30;
    151 c += x[4] + y[4]; r[4] = c & 0x3fffffff; c >>= 30;
    152 c += x[5] + y[5]; r[5] = c & 0x3fffffff; c >>= 30;
    153 c += x[6] + y[6]; r[6] = c & 0x3fffffff; c >>= 30;
    154 c += x[7] + y[7]; r[7] = c & 0x3fffffff; c >>= 30;
    155 c += x[8] + y[8]; r[8] = c;
    156 
    157 reduce256_modm(r);
    158 }
    159 
    160 /* multiplication modulo m */
    161 static void 
    162 mul256_modm(bignum256modm r, const bignum256modm x, const bignum256modm y) {
    163 bignum256modm r1, q1;
    164 uint64_t c;
    165 bignum256modm_element_t f;
    166 
    167 /* r1 = (x mod 256^(32+1)) = x mod (2^8)(31+1) = x & ((1 << 264) - 1)
    168    q1 = x >> 248 = 264 bits = 9 30 bit elements */
    169 c = mul32x32_64(x[0], y[0]);
    170 f = (bignum256modm_element_t)c; r1[0] = (f & 0x3fffffff); c >>= 30;
    171 c += mul32x32_64(x[0], y[1]) + mul32x32_64(x[1], y[0]);
    172 f = (bignum256modm_element_t)c; r1[1] = (f & 0x3fffffff); c >>= 30;
    173 c += mul32x32_64(x[0], y[2]) + mul32x32_64(x[1], y[1]) + mul32x32_64(x[2], y[0]);
    174 f = (bignum256modm_element_t)c; r1[2] = (f & 0x3fffffff); c >>= 30;
    175 c += mul32x32_64(x[0], y[3]) + mul32x32_64(x[1], y[2]) + mul32x32_64(x[2], y[1]) + mul32x32_64(x[3], y[0]);
    176 f = (bignum256modm_element_t)c; r1[3] = (f & 0x3fffffff); c >>= 30;
    177 c += mul32x32_64(x[0], y[4]) + mul32x32_64(x[1], y[3]) + mul32x32_64(x[2], y[2]) + mul32x32_64(x[3], y[1]) + mul32x32_64(x[4], y[0]);
    178 f = (bignum256modm_element_t)c; r1[4] = (f & 0x3fffffff); c >>= 30;
    179 c += mul32x32_64(x[0], y[5]) + mul32x32_64(x[1], y[4]) + mul32x32_64(x[2], y[3]) + mul32x32_64(x[3], y[2]) + mul32x32_64(x[4], y[1]) + mul32x32_64(x[5], y[0]);
    180 f = (bignum256modm_element_t)c; r1[5] = (f & 0x3fffffff); c >>= 30;
    181 c += mul32x32_64(x[0], y[6]) + mul32x32_64(x[1], y[5]) + mul32x32_64(x[2], y[4]) + mul32x32_64(x[3], y[3]) + mul32x32_64(x[4], y[2]) + mul32x32_64(x[5], y[1]) + mul32x32_64(x[6], y[0]);
    182 f = (bignum256modm_element_t)c; r1[6] = (f & 0x3fffffff); c >>= 30;
    183 c += mul32x32_64(x[0], y[7]) + mul32x32_64(x[1], y[6]) + mul32x32_64(x[2], y[5]) + mul32x32_64(x[3], y[4]) + mul32x32_64(x[4], y[3]) + mul32x32_64(x[5], y[2]) + mul32x32_64(x[6], y[1]) + mul32x32_64(x[7], y[0]);
    184 f = (bignum256modm_element_t)c; r1[7] = (f & 0x3fffffff); c >>= 30;
    185 c += mul32x32_64(x[0], y[8]) + mul32x32_64(x[1], y[7]) + mul32x32_64(x[2], y[6]) + mul32x32_64(x[3], y[5]) + mul32x32_64(x[4], y[4]) + mul32x32_64(x[5], y[3]) + mul32x32_64(x[6], y[2]) + mul32x32_64(x[7], y[1]) + mul32x32_64(x[8], y[0]);
    186 f = (bignum256modm_element_t)c; r1[8] = (f & 0x00ffffff); q1[0] = (f >> 8) & 0x3fffff; c >>= 30;
    187 c += mul32x32_64(x[1], y[8]) + mul32x32_64(x[2], y[7]) + mul32x32_64(x[3], y[6]) + mul32x32_64(x[4], y[5]) + mul32x32_64(x[5], y[4]) + mul32x32_64(x[6], y[3]) + mul32x32_64(x[7], y[2]) + mul32x32_64(x[8], y[1]);
    188 f = (bignum256modm_element_t)c; q1[0] = (q1[0] | (f << 22)) & 0x3fffffff; q1[1] = (f >> 8) & 0x3fffff; c >>= 30;	
    189 c += mul32x32_64(x[2], y[8]) + mul32x32_64(x[3], y[7]) + mul32x32_64(x[4], y[6]) + mul32x32_64(x[5], y[5]) + mul32x32_64(x[6], y[4]) + mul32x32_64(x[7], y[3]) + mul32x32_64(x[8], y[2]);
    190 f = (bignum256modm_element_t)c; q1[1] = (q1[1] | (f << 22)) & 0x3fffffff; q1[2] = (f >> 8) & 0x3fffff; c >>= 30;	
    191 c += mul32x32_64(x[3], y[8]) + mul32x32_64(x[4], y[7]) + mul32x32_64(x[5], y[6]) + mul32x32_64(x[6], y[5]) + mul32x32_64(x[7], y[4]) + mul32x32_64(x[8], y[3]);
    192 f = (bignum256modm_element_t)c; q1[2] = (q1[2] | (f << 22)) & 0x3fffffff; q1[3] = (f >> 8) & 0x3fffff; c >>= 30;	
    193 c += mul32x32_64(x[4], y[8]) + mul32x32_64(x[5], y[7]) + mul32x32_64(x[6], y[6]) + mul32x32_64(x[7], y[5]) + mul32x32_64(x[8], y[4]);
    194 f = (bignum256modm_element_t)c; q1[3] = (q1[3] | (f << 22)) & 0x3fffffff; q1[4] = (f >> 8) & 0x3fffff; c >>= 30;	
    195 c += mul32x32_64(x[5], y[8]) + mul32x32_64(x[6], y[7]) + mul32x32_64(x[7], y[6]) + mul32x32_64(x[8], y[5]);
    196 f = (bignum256modm_element_t)c; q1[4] = (q1[4] | (f << 22)) & 0x3fffffff; q1[5] = (f >> 8) & 0x3fffff; c >>= 30;
    197 c += mul32x32_64(x[6], y[8]) + mul32x32_64(x[7], y[7]) + mul32x32_64(x[8], y[6]);
    198 f = (bignum256modm_element_t)c; q1[5] = (q1[5] | (f << 22)) & 0x3fffffff; q1[6] = (f >> 8) & 0x3fffff; c >>= 30;
    199 c += mul32x32_64(x[7], y[8]) + mul32x32_64(x[8], y[7]);
    200 f = (bignum256modm_element_t)c; q1[6] = (q1[6] | (f << 22)) & 0x3fffffff; q1[7] = (f >> 8) & 0x3fffff; c >>= 30;
    201 c += mul32x32_64(x[8], y[8]);
    202 f = (bignum256modm_element_t)c; q1[7] = (q1[7] | (f << 22)) & 0x3fffffff; q1[8] = (f >> 8) & 0x3fffff;
    203 
    204 barrett_reduce256_modm(r, q1, r1);
    205 }
    206 
    207 static void
    208 expand256_modm(bignum256modm out, const unsigned char *in, size_t len) {
    209 unsigned char work[64] = {0};
    210 bignum256modm_element_t x[16];
    211 bignum256modm q1;
    212 
    213 memcpy(work, in, len);
    214 x[0] = U8TO32_LE(work +  0);
    215 x[1] = U8TO32_LE(work +  4);
    216 x[2] = U8TO32_LE(work +  8);
    217 x[3] = U8TO32_LE(work + 12);
    218 x[4] = U8TO32_LE(work + 16);
    219 x[5] = U8TO32_LE(work + 20);
    220 x[6] = U8TO32_LE(work + 24);
    221 x[7] = U8TO32_LE(work + 28);
    222 x[8] = U8TO32_LE(work + 32);
    223 x[9] = U8TO32_LE(work + 36);
    224 x[10] = U8TO32_LE(work + 40);
    225 x[11] = U8TO32_LE(work + 44);
    226 x[12] = U8TO32_LE(work + 48);
    227 x[13] = U8TO32_LE(work + 52);
    228 x[14] = U8TO32_LE(work + 56);
    229 x[15] = U8TO32_LE(work + 60);
    230 
    231 /* r1 = (x mod 256^(32+1)) = x mod (2^8)(31+1) = x & ((1 << 264) - 1) */
    232 out[0] = (                         x[0]) & 0x3fffffff;
    233 out[1] = ((x[ 0] >> 30) | (x[ 1] <<  2)) & 0x3fffffff;
    234 out[2] = ((x[ 1] >> 28) | (x[ 2] <<  4)) & 0x3fffffff;
    235 out[3] = ((x[ 2] >> 26) | (x[ 3] <<  6)) & 0x3fffffff;
    236 out[4] = ((x[ 3] >> 24) | (x[ 4] <<  8)) & 0x3fffffff;
    237 out[5] = ((x[ 4] >> 22) | (x[ 5] << 10)) & 0x3fffffff;
    238 out[6] = ((x[ 5] >> 20) | (x[ 6] << 12)) & 0x3fffffff;
    239 out[7] = ((x[ 6] >> 18) | (x[ 7] << 14)) & 0x3fffffff;
    240 out[8] = ((x[ 7] >> 16) | (x[ 8] << 16)) & 0x00ffffff;
    241 
    242 /* 8*31 = 248 bits, no need to reduce */
    243 if (len < 32)
    244 	return;
    245 
    246 /* q1 = x >> 248 = 264 bits = 9 30 bit elements */
    247 q1[0] = ((x[ 7] >> 24) | (x[ 8] <<  8)) & 0x3fffffff;
    248 q1[1] = ((x[ 8] >> 22) | (x[ 9] << 10)) & 0x3fffffff;
    249 q1[2] = ((x[ 9] >> 20) | (x[10] << 12)) & 0x3fffffff;
    250 q1[3] = ((x[10] >> 18) | (x[11] << 14)) & 0x3fffffff;
    251 q1[4] = ((x[11] >> 16) | (x[12] << 16)) & 0x3fffffff;
    252 q1[5] = ((x[12] >> 14) | (x[13] << 18)) & 0x3fffffff;
    253 q1[6] = ((x[13] >> 12) | (x[14] << 20)) & 0x3fffffff;
    254 q1[7] = ((x[14] >> 10) | (x[15] << 22)) & 0x3fffffff;
    255 q1[8] = ((x[15] >>  8)                );	
    256 
    257 barrett_reduce256_modm(out, q1, out);
    258 }
    259 
    260 static void
    261 expand_raw256_modm(bignum256modm out, const unsigned char in[32]) {
    262 bignum256modm_element_t x[8];
    263 
    264 x[0] = U8TO32_LE(in +  0);
    265 x[1] = U8TO32_LE(in +  4);
    266 x[2] = U8TO32_LE(in +  8);
    267 x[3] = U8TO32_LE(in + 12);
    268 x[4] = U8TO32_LE(in + 16);
    269 x[5] = U8TO32_LE(in + 20);
    270 x[6] = U8TO32_LE(in + 24);
    271 x[7] = U8TO32_LE(in + 28);
    272 
    273 out[0] = (                         x[0]) & 0x3fffffff;
    274 out[1] = ((x[ 0] >> 30) | (x[ 1] <<  2)) & 0x3fffffff;
    275 out[2] = ((x[ 1] >> 28) | (x[ 2] <<  4)) & 0x3fffffff;
    276 out[3] = ((x[ 2] >> 26) | (x[ 3] <<  6)) & 0x3fffffff;
    277 out[4] = ((x[ 3] >> 24) | (x[ 4] <<  8)) & 0x3fffffff;
    278 out[5] = ((x[ 4] >> 22) | (x[ 5] << 10)) & 0x3fffffff;
    279 out[6] = ((x[ 5] >> 20) | (x[ 6] << 12)) & 0x3fffffff;
    280 out[7] = ((x[ 6] >> 18) | (x[ 7] << 14)) & 0x3fffffff;
    281 out[8] = ((x[ 7] >> 16)                ) & 0x0000ffff;
    282 }
    283 
    284 static void
    285 contract256_modm(unsigned char out[32], const bignum256modm in) {
    286 U32TO8_LE(out +  0, (in[0]      ) | (in[1] << 30));
    287 U32TO8_LE(out +  4, (in[1] >>  2) | (in[2] << 28));
    288 U32TO8_LE(out +  8, (in[2] >>  4) | (in[3] << 26));
    289 U32TO8_LE(out + 12, (in[3] >>  6) | (in[4] << 24));
    290 U32TO8_LE(out + 16, (in[4] >>  8) | (in[5] << 22));
    291 U32TO8_LE(out + 20, (in[5] >> 10) | (in[6] << 20));
    292 U32TO8_LE(out + 24, (in[6] >> 12) | (in[7] << 18));
    293 U32TO8_LE(out + 28, (in[7] >> 14) | (in[8] << 16));
    294 }
    295 
    296 
    297 
    298 static void
    299 contract256_window4_modm(signed char r[64], const bignum256modm in) {
    300 char carry;
    301 signed char *quads = r;
    302 bignum256modm_element_t i, j, v;
    303 
    304 for (i = 0; i < 8; i += 2) {
    305 	v = in[i];
    306 	for (j = 0; j < 7; j++) {
    307 		*quads++ = (v & 15);
    308 		v >>= 4;
    309 	}
    310 	v |= (in[i+1] << 2);
    311 	for (j = 0; j < 8; j++) {
    312 		*quads++ = (v & 15);
    313 		v >>= 4;
    314 	}
    315 }
    316 v = in[8];
    317 *quads++ = (v & 15); v >>= 4;
    318 *quads++ = (v & 15); v >>= 4;
    319 *quads++ = (v & 15); v >>= 4;
    320 *quads++ = (v & 15); v >>= 4;
    321 
    322 /* making it signed */
    323 carry = 0;
    324 for(i = 0; i < 63; i++) {
    325 	r[i] += carry;
    326 	r[i+1] += (r[i] >> 4);
    327 	r[i] &= 15;
    328 	carry = (r[i] >> 3);
    329 	r[i] -= (carry << 4);
    330 }
    331 r[63] += carry;
    332 }
    333 
    334 static void
    335 contract256_slidingwindow_modm(signed char r[256], const bignum256modm s, int windowsize) {
    336 int i,j,k,b;
    337 int m = (1 << (windowsize - 1)) - 1;
    338        const int soplen = 256;
    339 signed char *bits = r;
    340 bignum256modm_element_t v;
    341 
    342 /* first put the binary expansion into r  */
    343 for (i = 0; i < 8; i++) {
    344 	v = s[i];
    345 	for (j = 0; j < 30; j++, v >>= 1)
    346 		*bits++ = (v & 1);
    347 }
    348 v = s[8];
    349 for (j = 0; j < 16; j++, v >>= 1)
    350 	*bits++ = (v & 1);
    351 
    352 /* Making it sliding window */
    353 for (j = 0; j < soplen; j++) {
    354 	if (!r[j])
    355 		continue;
    356 
    357 	for (b = 1; (b < (soplen - j)) && (b <= 6); b++) {
    358 		if ((r[j] + (r[j + b] << b)) <= m) {
    359 			r[j] += r[j + b] << b;
    360 			r[j + b] = 0;
    361 		} else if ((r[j] - (r[j + b] << b)) >= -m) {
    362 			r[j] -= r[j + b] << b;
    363 			for (k = j + b; k < soplen; k++) {
    364 				if (!r[k]) {
    365 					r[k] = 1;
    366 					break;
    367 				}
    368 				r[k] = 0;
    369 			}
    370 		} else if (r[j + b]) {
    371 			break;
    372 		}
    373 	}
    374 }
    375 }
    376 
    377 
    378 /*
    379 helpers for batch verifcation, are allowed to be vartime
    380 */
    381 
    382 /* out = a - b, a must be larger than b */
    383 static void
    384 sub256_modm_batch(bignum256modm out, const bignum256modm a, const bignum256modm b, size_t limbsize) {
    385 size_t i = 0;
    386 bignum256modm_element_t carry = 0;
    387 switch (limbsize) {
    388 	case 8: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++; FALLTHROUGH;
    389 	case 7: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++; FALLTHROUGH;
    390 	case 6: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++; FALLTHROUGH;
    391 	case 5: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++; FALLTHROUGH;
    392 	case 4: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++; FALLTHROUGH;
    393 	case 3: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++; FALLTHROUGH;
    394 	case 2: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++; FALLTHROUGH;
    395 	case 1: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++; FALLTHROUGH;
    396 	case 0: 
    397 	default: out[i] = (a[i] - b[i]) - carry;
    398 }
    399 }
    400 
    401 
    402 /* is a < b */
    403 static int
    404 lt256_modm_batch(const bignum256modm a, const bignum256modm b, size_t limbsize) {
    405 switch (limbsize) {
    406 	case 8: if (a[8] > b[8]) return 0; if (a[8] < b[8]) return 1; FALLTHROUGH;
    407 	case 7: if (a[7] > b[7]) return 0; if (a[7] < b[7]) return 1; FALLTHROUGH;
    408 	case 6: if (a[6] > b[6]) return 0; if (a[6] < b[6]) return 1; FALLTHROUGH;
    409 	case 5: if (a[5] > b[5]) return 0; if (a[5] < b[5]) return 1; FALLTHROUGH;
    410 	case 4: if (a[4] > b[4]) return 0; if (a[4] < b[4]) return 1; FALLTHROUGH;
    411 	case 3: if (a[3] > b[3]) return 0; if (a[3] < b[3]) return 1; FALLTHROUGH;
    412 	case 2: if (a[2] > b[2]) return 0; if (a[2] < b[2]) return 1; FALLTHROUGH;
    413 	case 1: if (a[1] > b[1]) return 0; if (a[1] < b[1]) return 1; FALLTHROUGH;
    414 	case 0: if (a[0] > b[0]) return 0; if (a[0] < b[0]) return 1;
    415 }
    416 return 0;
    417 }
    418 
    419 /* is a <= b */
    420 static int
    421 lte256_modm_batch(const bignum256modm a, const bignum256modm b, size_t limbsize) {
    422 switch (limbsize) {
    423 	case 8: if (a[8] > b[8]) return 0; if (a[8] < b[8]) return 1; FALLTHROUGH;
    424 	case 7: if (a[7] > b[7]) return 0; if (a[7] < b[7]) return 1; FALLTHROUGH;
    425 	case 6: if (a[6] > b[6]) return 0; if (a[6] < b[6]) return 1; FALLTHROUGH;
    426 	case 5: if (a[5] > b[5]) return 0; if (a[5] < b[5]) return 1; FALLTHROUGH;
    427 	case 4: if (a[4] > b[4]) return 0; if (a[4] < b[4]) return 1; FALLTHROUGH;
    428 	case 3: if (a[3] > b[3]) return 0; if (a[3] < b[3]) return 1; FALLTHROUGH;
    429 	case 2: if (a[2] > b[2]) return 0; if (a[2] < b[2]) return 1; FALLTHROUGH;
    430 	case 1: if (a[1] > b[1]) return 0; if (a[1] < b[1]) return 1; FALLTHROUGH;
    431 	case 0: if (a[0] > b[0]) return 0; if (a[0] < b[0]) return 1;
    432 }
    433 return 1;
    434 }
    435 
    436 
    437 /* is a == 0 */
    438 static int
    439 iszero256_modm_batch(const bignum256modm a) {
    440 size_t i;
    441 for (i = 0; i < 9; i++)
    442 	if (a[i])
    443 		return 0;
    444 return 1;
    445 }
    446 
    447 /* is a == 1 */
    448 static int
    449 isone256_modm_batch(const bignum256modm a) {
    450 size_t i;
    451 if (a[0] != 1)
    452 	return 0;
    453 for (i = 1; i < 9; i++)
    454 	if (a[i])
    455 		return 0;
    456 return 1;
    457 }
    458 
    459 /* can a fit in to (at most) 128 bits */
    460 static int
    461 isatmost128bits256_modm_batch(const bignum256modm a) {
    462 uint32_t mask =
    463 	((a[8]             )  | /*  16 */
    464 	 (a[7]             )  | /*  46 */
    465 	 (a[6]             )  | /*  76 */
    466 	 (a[5]             )  | /* 106 */
    467 	 (a[4] & 0x3fffff00));  /* 128 */
    468 
    469 return (mask == 0);
    470 }