modm-donna-32bit.h (21464B)
1 /* 2 Public domain by Andrew M. <liquidsun@gmail.com> 3 */ 4 5 6 /* 7 Arithmetic modulo the group order n = 2^252 + 27742317777372353535851937790883648493 = 7237005577332262213973186563042994240857116359379907606001950938285454250989 8 9 k = 32 10 b = 1 << 8 = 256 11 m = 2^252 + 27742317777372353535851937790883648493 = 0x1000000000000000000000000000000014def9dea2f79cd65812631a5cf5d3ed 12 mu = floor( b^(k*2) / m ) = 0xfffffffffffffffffffffffffffffffeb2106215d086329a7ed9ce5a30a2c131b 13 */ 14 15 #define bignum256modm_bits_per_limb 30 16 #define bignum256modm_limb_size 9 17 18 typedef uint32_t bignum256modm_element_t; 19 typedef bignum256modm_element_t bignum256modm[9]; 20 21 static const bignum256modm modm_m = { 22 0x1cf5d3ed, 0x20498c69, 0x2f79cd65, 0x37be77a8, 23 0x00000014, 0x00000000, 0x00000000, 0x00000000, 24 0x00001000 25 }; 26 27 static const bignum256modm modm_mu = { 28 0x0a2c131b, 0x3673968c, 0x06329a7e, 0x01885742, 29 0x3fffeb21, 0x3fffffff, 0x3fffffff, 0x3fffffff, 30 0x000fffff 31 }; 32 33 static bignum256modm_element_t 34 lt_modm(bignum256modm_element_t a, bignum256modm_element_t b) { 35 return (a - b) >> 31; 36 } 37 38 /* see HAC, Alg. 14.42 Step 4 */ 39 static void 40 reduce256_modm(bignum256modm r) { 41 bignum256modm t; 42 bignum256modm_element_t b = 0, pb, mask; 43 44 /* t = r - m */ 45 pb = 0; 46 pb += modm_m[0]; b = lt_modm(r[0], pb); t[0] = (r[0] - pb + (b << 30)); pb = b; 47 pb += modm_m[1]; b = lt_modm(r[1], pb); t[1] = (r[1] - pb + (b << 30)); pb = b; 48 pb += modm_m[2]; b = lt_modm(r[2], pb); t[2] = (r[2] - pb + (b << 30)); pb = b; 49 pb += modm_m[3]; b = lt_modm(r[3], pb); t[3] = (r[3] - pb + (b << 30)); pb = b; 50 pb += modm_m[4]; b = lt_modm(r[4], pb); t[4] = (r[4] - pb + (b << 30)); pb = b; 51 pb += modm_m[5]; b = lt_modm(r[5], pb); t[5] = (r[5] - pb + (b << 30)); pb = b; 52 pb += modm_m[6]; b = lt_modm(r[6], pb); t[6] = (r[6] - pb + (b << 30)); pb = b; 53 pb += modm_m[7]; b = lt_modm(r[7], pb); t[7] = (r[7] - pb + (b << 30)); pb = b; 54 pb += modm_m[8]; b = lt_modm(r[8], pb); t[8] = (r[8] - pb + (b << 16)); 55 56 /* keep r if r was smaller than m */ 57 mask = b - 1; 58 r[0] ^= mask & (r[0] ^ t[0]); 59 r[1] ^= mask & (r[1] ^ t[1]); 60 r[2] ^= mask & (r[2] ^ t[2]); 61 r[3] ^= mask & (r[3] ^ t[3]); 62 r[4] ^= mask & (r[4] ^ t[4]); 63 r[5] ^= mask & (r[5] ^ t[5]); 64 r[6] ^= mask & (r[6] ^ t[6]); 65 r[7] ^= mask & (r[7] ^ t[7]); 66 r[8] ^= mask & (r[8] ^ t[8]); 67 } 68 69 /* 70 Barrett reduction, see HAC, Alg. 14.42 71 72 Instead of passing in x, pre-process in to q1 and r1 for efficiency 73 */ 74 static void 75 barrett_reduce256_modm(bignum256modm r, const bignum256modm q1, const bignum256modm r1) { 76 bignum256modm q3, r2; 77 uint64_t c; 78 bignum256modm_element_t f, b, pb; 79 80 /* q1 = x >> 248 = 264 bits = 9 30 bit elements 81 q2 = mu * q1 82 q3 = (q2 / 256(32+1)) = q2 / (2^8)^(32+1) = q2 >> 264 */ 83 c = mul32x32_64(modm_mu[0], q1[7]) + mul32x32_64(modm_mu[1], q1[6]) + mul32x32_64(modm_mu[2], q1[5]) + mul32x32_64(modm_mu[3], q1[4]) + mul32x32_64(modm_mu[4], q1[3]) + mul32x32_64(modm_mu[5], q1[2]) + mul32x32_64(modm_mu[6], q1[1]) + mul32x32_64(modm_mu[7], q1[0]); 84 c >>= 30; 85 c += mul32x32_64(modm_mu[0], q1[8]) + mul32x32_64(modm_mu[1], q1[7]) + mul32x32_64(modm_mu[2], q1[6]) + mul32x32_64(modm_mu[3], q1[5]) + mul32x32_64(modm_mu[4], q1[4]) + mul32x32_64(modm_mu[5], q1[3]) + mul32x32_64(modm_mu[6], q1[2]) + mul32x32_64(modm_mu[7], q1[1]) + mul32x32_64(modm_mu[8], q1[0]); 86 f = (bignum256modm_element_t)c; q3[0] = (f >> 24) & 0x3f; c >>= 30; 87 c += mul32x32_64(modm_mu[1], q1[8]) + mul32x32_64(modm_mu[2], q1[7]) + mul32x32_64(modm_mu[3], q1[6]) + mul32x32_64(modm_mu[4], q1[5]) + mul32x32_64(modm_mu[5], q1[4]) + mul32x32_64(modm_mu[6], q1[3]) + mul32x32_64(modm_mu[7], q1[2]) + mul32x32_64(modm_mu[8], q1[1]); 88 f = (bignum256modm_element_t)c; q3[0] |= (f << 6) & 0x3fffffff; q3[1] = (f >> 24) & 0x3f; c >>= 30; 89 c += mul32x32_64(modm_mu[2], q1[8]) + mul32x32_64(modm_mu[3], q1[7]) + mul32x32_64(modm_mu[4], q1[6]) + mul32x32_64(modm_mu[5], q1[5]) + mul32x32_64(modm_mu[6], q1[4]) + mul32x32_64(modm_mu[7], q1[3]) + mul32x32_64(modm_mu[8], q1[2]); 90 f = (bignum256modm_element_t)c; q3[1] |= (f << 6) & 0x3fffffff; q3[2] = (f >> 24) & 0x3f; c >>= 30; 91 c += mul32x32_64(modm_mu[3], q1[8]) + mul32x32_64(modm_mu[4], q1[7]) + mul32x32_64(modm_mu[5], q1[6]) + mul32x32_64(modm_mu[6], q1[5]) + mul32x32_64(modm_mu[7], q1[4]) + mul32x32_64(modm_mu[8], q1[3]); 92 f = (bignum256modm_element_t)c; q3[2] |= (f << 6) & 0x3fffffff; q3[3] = (f >> 24) & 0x3f; c >>= 30; 93 c += mul32x32_64(modm_mu[4], q1[8]) + mul32x32_64(modm_mu[5], q1[7]) + mul32x32_64(modm_mu[6], q1[6]) + mul32x32_64(modm_mu[7], q1[5]) + mul32x32_64(modm_mu[8], q1[4]); 94 f = (bignum256modm_element_t)c; q3[3] |= (f << 6) & 0x3fffffff; q3[4] = (f >> 24) & 0x3f; c >>= 30; 95 c += mul32x32_64(modm_mu[5], q1[8]) + mul32x32_64(modm_mu[6], q1[7]) + mul32x32_64(modm_mu[7], q1[6]) + mul32x32_64(modm_mu[8], q1[5]); 96 f = (bignum256modm_element_t)c; q3[4] |= (f << 6) & 0x3fffffff; q3[5] = (f >> 24) & 0x3f; c >>= 30; 97 c += mul32x32_64(modm_mu[6], q1[8]) + mul32x32_64(modm_mu[7], q1[7]) + mul32x32_64(modm_mu[8], q1[6]); 98 f = (bignum256modm_element_t)c; q3[5] |= (f << 6) & 0x3fffffff; q3[6] = (f >> 24) & 0x3f; c >>= 30; 99 c += mul32x32_64(modm_mu[7], q1[8]) + mul32x32_64(modm_mu[8], q1[7]); 100 f = (bignum256modm_element_t)c; q3[6] |= (f << 6) & 0x3fffffff; q3[7] = (f >> 24) & 0x3f; c >>= 30; 101 c += mul32x32_64(modm_mu[8], q1[8]); 102 f = (bignum256modm_element_t)c; q3[7] |= (f << 6) & 0x3fffffff; q3[8] = (bignum256modm_element_t)(c >> 24); 103 104 /* r1 = (x mod 256^(32+1)) = x mod (2^8)(31+1) = x & ((1 << 264) - 1) 105 r2 = (q3 * m) mod (256^(32+1)) = (q3 * m) & ((1 << 264) - 1) */ 106 c = mul32x32_64(modm_m[0], q3[0]); 107 r2[0] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30; 108 c += mul32x32_64(modm_m[0], q3[1]) + mul32x32_64(modm_m[1], q3[0]); 109 r2[1] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30; 110 c += mul32x32_64(modm_m[0], q3[2]) + mul32x32_64(modm_m[1], q3[1]) + mul32x32_64(modm_m[2], q3[0]); 111 r2[2] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30; 112 c += mul32x32_64(modm_m[0], q3[3]) + mul32x32_64(modm_m[1], q3[2]) + mul32x32_64(modm_m[2], q3[1]) + mul32x32_64(modm_m[3], q3[0]); 113 r2[3] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30; 114 c += mul32x32_64(modm_m[0], q3[4]) + mul32x32_64(modm_m[1], q3[3]) + mul32x32_64(modm_m[2], q3[2]) + mul32x32_64(modm_m[3], q3[1]) + mul32x32_64(modm_m[4], q3[0]); 115 r2[4] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30; 116 c += mul32x32_64(modm_m[0], q3[5]) + mul32x32_64(modm_m[1], q3[4]) + mul32x32_64(modm_m[2], q3[3]) + mul32x32_64(modm_m[3], q3[2]) + mul32x32_64(modm_m[4], q3[1]) + mul32x32_64(modm_m[5], q3[0]); 117 r2[5] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30; 118 c += mul32x32_64(modm_m[0], q3[6]) + mul32x32_64(modm_m[1], q3[5]) + mul32x32_64(modm_m[2], q3[4]) + mul32x32_64(modm_m[3], q3[3]) + mul32x32_64(modm_m[4], q3[2]) + mul32x32_64(modm_m[5], q3[1]) + mul32x32_64(modm_m[6], q3[0]); 119 r2[6] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30; 120 c += mul32x32_64(modm_m[0], q3[7]) + mul32x32_64(modm_m[1], q3[6]) + mul32x32_64(modm_m[2], q3[5]) + mul32x32_64(modm_m[3], q3[4]) + mul32x32_64(modm_m[4], q3[3]) + mul32x32_64(modm_m[5], q3[2]) + mul32x32_64(modm_m[6], q3[1]) + mul32x32_64(modm_m[7], q3[0]); 121 r2[7] = (bignum256modm_element_t)(c & 0x3fffffff); c >>= 30; 122 c += mul32x32_64(modm_m[0], q3[8]) + mul32x32_64(modm_m[1], q3[7]) + mul32x32_64(modm_m[2], q3[6]) + mul32x32_64(modm_m[3], q3[5]) + mul32x32_64(modm_m[4], q3[4]) + mul32x32_64(modm_m[5], q3[3]) + mul32x32_64(modm_m[6], q3[2]) + mul32x32_64(modm_m[7], q3[1]) + mul32x32_64(modm_m[8], q3[0]); 123 r2[8] = (bignum256modm_element_t)(c & 0xffffff); 124 125 /* r = r1 - r2 126 if (r < 0) r += (1 << 264) */ 127 pb = 0; 128 pb += r2[0]; b = lt_modm(r1[0], pb); r[0] = (r1[0] - pb + (b << 30)); pb = b; 129 pb += r2[1]; b = lt_modm(r1[1], pb); r[1] = (r1[1] - pb + (b << 30)); pb = b; 130 pb += r2[2]; b = lt_modm(r1[2], pb); r[2] = (r1[2] - pb + (b << 30)); pb = b; 131 pb += r2[3]; b = lt_modm(r1[3], pb); r[3] = (r1[3] - pb + (b << 30)); pb = b; 132 pb += r2[4]; b = lt_modm(r1[4], pb); r[4] = (r1[4] - pb + (b << 30)); pb = b; 133 pb += r2[5]; b = lt_modm(r1[5], pb); r[5] = (r1[5] - pb + (b << 30)); pb = b; 134 pb += r2[6]; b = lt_modm(r1[6], pb); r[6] = (r1[6] - pb + (b << 30)); pb = b; 135 pb += r2[7]; b = lt_modm(r1[7], pb); r[7] = (r1[7] - pb + (b << 30)); pb = b; 136 pb += r2[8]; b = lt_modm(r1[8], pb); r[8] = (r1[8] - pb + (b << 24)); 137 138 reduce256_modm(r); 139 reduce256_modm(r); 140 } 141 142 /* addition modulo m */ 143 static void 144 add256_modm(bignum256modm r, const bignum256modm x, const bignum256modm y) { 145 bignum256modm_element_t c; 146 147 c = x[0] + y[0]; r[0] = c & 0x3fffffff; c >>= 30; 148 c += x[1] + y[1]; r[1] = c & 0x3fffffff; c >>= 30; 149 c += x[2] + y[2]; r[2] = c & 0x3fffffff; c >>= 30; 150 c += x[3] + y[3]; r[3] = c & 0x3fffffff; c >>= 30; 151 c += x[4] + y[4]; r[4] = c & 0x3fffffff; c >>= 30; 152 c += x[5] + y[5]; r[5] = c & 0x3fffffff; c >>= 30; 153 c += x[6] + y[6]; r[6] = c & 0x3fffffff; c >>= 30; 154 c += x[7] + y[7]; r[7] = c & 0x3fffffff; c >>= 30; 155 c += x[8] + y[8]; r[8] = c; 156 157 reduce256_modm(r); 158 } 159 160 /* multiplication modulo m */ 161 static void 162 mul256_modm(bignum256modm r, const bignum256modm x, const bignum256modm y) { 163 bignum256modm r1, q1; 164 uint64_t c; 165 bignum256modm_element_t f; 166 167 /* r1 = (x mod 256^(32+1)) = x mod (2^8)(31+1) = x & ((1 << 264) - 1) 168 q1 = x >> 248 = 264 bits = 9 30 bit elements */ 169 c = mul32x32_64(x[0], y[0]); 170 f = (bignum256modm_element_t)c; r1[0] = (f & 0x3fffffff); c >>= 30; 171 c += mul32x32_64(x[0], y[1]) + mul32x32_64(x[1], y[0]); 172 f = (bignum256modm_element_t)c; r1[1] = (f & 0x3fffffff); c >>= 30; 173 c += mul32x32_64(x[0], y[2]) + mul32x32_64(x[1], y[1]) + mul32x32_64(x[2], y[0]); 174 f = (bignum256modm_element_t)c; r1[2] = (f & 0x3fffffff); c >>= 30; 175 c += mul32x32_64(x[0], y[3]) + mul32x32_64(x[1], y[2]) + mul32x32_64(x[2], y[1]) + mul32x32_64(x[3], y[0]); 176 f = (bignum256modm_element_t)c; r1[3] = (f & 0x3fffffff); c >>= 30; 177 c += mul32x32_64(x[0], y[4]) + mul32x32_64(x[1], y[3]) + mul32x32_64(x[2], y[2]) + mul32x32_64(x[3], y[1]) + mul32x32_64(x[4], y[0]); 178 f = (bignum256modm_element_t)c; r1[4] = (f & 0x3fffffff); c >>= 30; 179 c += mul32x32_64(x[0], y[5]) + mul32x32_64(x[1], y[4]) + mul32x32_64(x[2], y[3]) + mul32x32_64(x[3], y[2]) + mul32x32_64(x[4], y[1]) + mul32x32_64(x[5], y[0]); 180 f = (bignum256modm_element_t)c; r1[5] = (f & 0x3fffffff); c >>= 30; 181 c += mul32x32_64(x[0], y[6]) + mul32x32_64(x[1], y[5]) + mul32x32_64(x[2], y[4]) + mul32x32_64(x[3], y[3]) + mul32x32_64(x[4], y[2]) + mul32x32_64(x[5], y[1]) + mul32x32_64(x[6], y[0]); 182 f = (bignum256modm_element_t)c; r1[6] = (f & 0x3fffffff); c >>= 30; 183 c += mul32x32_64(x[0], y[7]) + mul32x32_64(x[1], y[6]) + mul32x32_64(x[2], y[5]) + mul32x32_64(x[3], y[4]) + mul32x32_64(x[4], y[3]) + mul32x32_64(x[5], y[2]) + mul32x32_64(x[6], y[1]) + mul32x32_64(x[7], y[0]); 184 f = (bignum256modm_element_t)c; r1[7] = (f & 0x3fffffff); c >>= 30; 185 c += mul32x32_64(x[0], y[8]) + mul32x32_64(x[1], y[7]) + mul32x32_64(x[2], y[6]) + mul32x32_64(x[3], y[5]) + mul32x32_64(x[4], y[4]) + mul32x32_64(x[5], y[3]) + mul32x32_64(x[6], y[2]) + mul32x32_64(x[7], y[1]) + mul32x32_64(x[8], y[0]); 186 f = (bignum256modm_element_t)c; r1[8] = (f & 0x00ffffff); q1[0] = (f >> 8) & 0x3fffff; c >>= 30; 187 c += mul32x32_64(x[1], y[8]) + mul32x32_64(x[2], y[7]) + mul32x32_64(x[3], y[6]) + mul32x32_64(x[4], y[5]) + mul32x32_64(x[5], y[4]) + mul32x32_64(x[6], y[3]) + mul32x32_64(x[7], y[2]) + mul32x32_64(x[8], y[1]); 188 f = (bignum256modm_element_t)c; q1[0] = (q1[0] | (f << 22)) & 0x3fffffff; q1[1] = (f >> 8) & 0x3fffff; c >>= 30; 189 c += mul32x32_64(x[2], y[8]) + mul32x32_64(x[3], y[7]) + mul32x32_64(x[4], y[6]) + mul32x32_64(x[5], y[5]) + mul32x32_64(x[6], y[4]) + mul32x32_64(x[7], y[3]) + mul32x32_64(x[8], y[2]); 190 f = (bignum256modm_element_t)c; q1[1] = (q1[1] | (f << 22)) & 0x3fffffff; q1[2] = (f >> 8) & 0x3fffff; c >>= 30; 191 c += mul32x32_64(x[3], y[8]) + mul32x32_64(x[4], y[7]) + mul32x32_64(x[5], y[6]) + mul32x32_64(x[6], y[5]) + mul32x32_64(x[7], y[4]) + mul32x32_64(x[8], y[3]); 192 f = (bignum256modm_element_t)c; q1[2] = (q1[2] | (f << 22)) & 0x3fffffff; q1[3] = (f >> 8) & 0x3fffff; c >>= 30; 193 c += mul32x32_64(x[4], y[8]) + mul32x32_64(x[5], y[7]) + mul32x32_64(x[6], y[6]) + mul32x32_64(x[7], y[5]) + mul32x32_64(x[8], y[4]); 194 f = (bignum256modm_element_t)c; q1[3] = (q1[3] | (f << 22)) & 0x3fffffff; q1[4] = (f >> 8) & 0x3fffff; c >>= 30; 195 c += mul32x32_64(x[5], y[8]) + mul32x32_64(x[6], y[7]) + mul32x32_64(x[7], y[6]) + mul32x32_64(x[8], y[5]); 196 f = (bignum256modm_element_t)c; q1[4] = (q1[4] | (f << 22)) & 0x3fffffff; q1[5] = (f >> 8) & 0x3fffff; c >>= 30; 197 c += mul32x32_64(x[6], y[8]) + mul32x32_64(x[7], y[7]) + mul32x32_64(x[8], y[6]); 198 f = (bignum256modm_element_t)c; q1[5] = (q1[5] | (f << 22)) & 0x3fffffff; q1[6] = (f >> 8) & 0x3fffff; c >>= 30; 199 c += mul32x32_64(x[7], y[8]) + mul32x32_64(x[8], y[7]); 200 f = (bignum256modm_element_t)c; q1[6] = (q1[6] | (f << 22)) & 0x3fffffff; q1[7] = (f >> 8) & 0x3fffff; c >>= 30; 201 c += mul32x32_64(x[8], y[8]); 202 f = (bignum256modm_element_t)c; q1[7] = (q1[7] | (f << 22)) & 0x3fffffff; q1[8] = (f >> 8) & 0x3fffff; 203 204 barrett_reduce256_modm(r, q1, r1); 205 } 206 207 static void 208 expand256_modm(bignum256modm out, const unsigned char *in, size_t len) { 209 unsigned char work[64] = {0}; 210 bignum256modm_element_t x[16]; 211 bignum256modm q1; 212 213 memcpy(work, in, len); 214 x[0] = U8TO32_LE(work + 0); 215 x[1] = U8TO32_LE(work + 4); 216 x[2] = U8TO32_LE(work + 8); 217 x[3] = U8TO32_LE(work + 12); 218 x[4] = U8TO32_LE(work + 16); 219 x[5] = U8TO32_LE(work + 20); 220 x[6] = U8TO32_LE(work + 24); 221 x[7] = U8TO32_LE(work + 28); 222 x[8] = U8TO32_LE(work + 32); 223 x[9] = U8TO32_LE(work + 36); 224 x[10] = U8TO32_LE(work + 40); 225 x[11] = U8TO32_LE(work + 44); 226 x[12] = U8TO32_LE(work + 48); 227 x[13] = U8TO32_LE(work + 52); 228 x[14] = U8TO32_LE(work + 56); 229 x[15] = U8TO32_LE(work + 60); 230 231 /* r1 = (x mod 256^(32+1)) = x mod (2^8)(31+1) = x & ((1 << 264) - 1) */ 232 out[0] = ( x[0]) & 0x3fffffff; 233 out[1] = ((x[ 0] >> 30) | (x[ 1] << 2)) & 0x3fffffff; 234 out[2] = ((x[ 1] >> 28) | (x[ 2] << 4)) & 0x3fffffff; 235 out[3] = ((x[ 2] >> 26) | (x[ 3] << 6)) & 0x3fffffff; 236 out[4] = ((x[ 3] >> 24) | (x[ 4] << 8)) & 0x3fffffff; 237 out[5] = ((x[ 4] >> 22) | (x[ 5] << 10)) & 0x3fffffff; 238 out[6] = ((x[ 5] >> 20) | (x[ 6] << 12)) & 0x3fffffff; 239 out[7] = ((x[ 6] >> 18) | (x[ 7] << 14)) & 0x3fffffff; 240 out[8] = ((x[ 7] >> 16) | (x[ 8] << 16)) & 0x00ffffff; 241 242 /* 8*31 = 248 bits, no need to reduce */ 243 if (len < 32) 244 return; 245 246 /* q1 = x >> 248 = 264 bits = 9 30 bit elements */ 247 q1[0] = ((x[ 7] >> 24) | (x[ 8] << 8)) & 0x3fffffff; 248 q1[1] = ((x[ 8] >> 22) | (x[ 9] << 10)) & 0x3fffffff; 249 q1[2] = ((x[ 9] >> 20) | (x[10] << 12)) & 0x3fffffff; 250 q1[3] = ((x[10] >> 18) | (x[11] << 14)) & 0x3fffffff; 251 q1[4] = ((x[11] >> 16) | (x[12] << 16)) & 0x3fffffff; 252 q1[5] = ((x[12] >> 14) | (x[13] << 18)) & 0x3fffffff; 253 q1[6] = ((x[13] >> 12) | (x[14] << 20)) & 0x3fffffff; 254 q1[7] = ((x[14] >> 10) | (x[15] << 22)) & 0x3fffffff; 255 q1[8] = ((x[15] >> 8) ); 256 257 barrett_reduce256_modm(out, q1, out); 258 } 259 260 static void 261 expand_raw256_modm(bignum256modm out, const unsigned char in[32]) { 262 bignum256modm_element_t x[8]; 263 264 x[0] = U8TO32_LE(in + 0); 265 x[1] = U8TO32_LE(in + 4); 266 x[2] = U8TO32_LE(in + 8); 267 x[3] = U8TO32_LE(in + 12); 268 x[4] = U8TO32_LE(in + 16); 269 x[5] = U8TO32_LE(in + 20); 270 x[6] = U8TO32_LE(in + 24); 271 x[7] = U8TO32_LE(in + 28); 272 273 out[0] = ( x[0]) & 0x3fffffff; 274 out[1] = ((x[ 0] >> 30) | (x[ 1] << 2)) & 0x3fffffff; 275 out[2] = ((x[ 1] >> 28) | (x[ 2] << 4)) & 0x3fffffff; 276 out[3] = ((x[ 2] >> 26) | (x[ 3] << 6)) & 0x3fffffff; 277 out[4] = ((x[ 3] >> 24) | (x[ 4] << 8)) & 0x3fffffff; 278 out[5] = ((x[ 4] >> 22) | (x[ 5] << 10)) & 0x3fffffff; 279 out[6] = ((x[ 5] >> 20) | (x[ 6] << 12)) & 0x3fffffff; 280 out[7] = ((x[ 6] >> 18) | (x[ 7] << 14)) & 0x3fffffff; 281 out[8] = ((x[ 7] >> 16) ) & 0x0000ffff; 282 } 283 284 static void 285 contract256_modm(unsigned char out[32], const bignum256modm in) { 286 U32TO8_LE(out + 0, (in[0] ) | (in[1] << 30)); 287 U32TO8_LE(out + 4, (in[1] >> 2) | (in[2] << 28)); 288 U32TO8_LE(out + 8, (in[2] >> 4) | (in[3] << 26)); 289 U32TO8_LE(out + 12, (in[3] >> 6) | (in[4] << 24)); 290 U32TO8_LE(out + 16, (in[4] >> 8) | (in[5] << 22)); 291 U32TO8_LE(out + 20, (in[5] >> 10) | (in[6] << 20)); 292 U32TO8_LE(out + 24, (in[6] >> 12) | (in[7] << 18)); 293 U32TO8_LE(out + 28, (in[7] >> 14) | (in[8] << 16)); 294 } 295 296 297 298 static void 299 contract256_window4_modm(signed char r[64], const bignum256modm in) { 300 char carry; 301 signed char *quads = r; 302 bignum256modm_element_t i, j, v; 303 304 for (i = 0; i < 8; i += 2) { 305 v = in[i]; 306 for (j = 0; j < 7; j++) { 307 *quads++ = (v & 15); 308 v >>= 4; 309 } 310 v |= (in[i+1] << 2); 311 for (j = 0; j < 8; j++) { 312 *quads++ = (v & 15); 313 v >>= 4; 314 } 315 } 316 v = in[8]; 317 *quads++ = (v & 15); v >>= 4; 318 *quads++ = (v & 15); v >>= 4; 319 *quads++ = (v & 15); v >>= 4; 320 *quads++ = (v & 15); v >>= 4; 321 322 /* making it signed */ 323 carry = 0; 324 for(i = 0; i < 63; i++) { 325 r[i] += carry; 326 r[i+1] += (r[i] >> 4); 327 r[i] &= 15; 328 carry = (r[i] >> 3); 329 r[i] -= (carry << 4); 330 } 331 r[63] += carry; 332 } 333 334 static void 335 contract256_slidingwindow_modm(signed char r[256], const bignum256modm s, int windowsize) { 336 int i,j,k,b; 337 int m = (1 << (windowsize - 1)) - 1; 338 const int soplen = 256; 339 signed char *bits = r; 340 bignum256modm_element_t v; 341 342 /* first put the binary expansion into r */ 343 for (i = 0; i < 8; i++) { 344 v = s[i]; 345 for (j = 0; j < 30; j++, v >>= 1) 346 *bits++ = (v & 1); 347 } 348 v = s[8]; 349 for (j = 0; j < 16; j++, v >>= 1) 350 *bits++ = (v & 1); 351 352 /* Making it sliding window */ 353 for (j = 0; j < soplen; j++) { 354 if (!r[j]) 355 continue; 356 357 for (b = 1; (b < (soplen - j)) && (b <= 6); b++) { 358 if ((r[j] + (r[j + b] << b)) <= m) { 359 r[j] += r[j + b] << b; 360 r[j + b] = 0; 361 } else if ((r[j] - (r[j + b] << b)) >= -m) { 362 r[j] -= r[j + b] << b; 363 for (k = j + b; k < soplen; k++) { 364 if (!r[k]) { 365 r[k] = 1; 366 break; 367 } 368 r[k] = 0; 369 } 370 } else if (r[j + b]) { 371 break; 372 } 373 } 374 } 375 } 376 377 378 /* 379 helpers for batch verifcation, are allowed to be vartime 380 */ 381 382 /* out = a - b, a must be larger than b */ 383 static void 384 sub256_modm_batch(bignum256modm out, const bignum256modm a, const bignum256modm b, size_t limbsize) { 385 size_t i = 0; 386 bignum256modm_element_t carry = 0; 387 switch (limbsize) { 388 case 8: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++; FALLTHROUGH; 389 case 7: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++; FALLTHROUGH; 390 case 6: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++; FALLTHROUGH; 391 case 5: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++; FALLTHROUGH; 392 case 4: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++; FALLTHROUGH; 393 case 3: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++; FALLTHROUGH; 394 case 2: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++; FALLTHROUGH; 395 case 1: out[i] = (a[i] - b[i]) - carry; carry = (out[i] >> 31); out[i] &= 0x3fffffff; i++; FALLTHROUGH; 396 case 0: 397 default: out[i] = (a[i] - b[i]) - carry; 398 } 399 } 400 401 402 /* is a < b */ 403 static int 404 lt256_modm_batch(const bignum256modm a, const bignum256modm b, size_t limbsize) { 405 switch (limbsize) { 406 case 8: if (a[8] > b[8]) return 0; if (a[8] < b[8]) return 1; FALLTHROUGH; 407 case 7: if (a[7] > b[7]) return 0; if (a[7] < b[7]) return 1; FALLTHROUGH; 408 case 6: if (a[6] > b[6]) return 0; if (a[6] < b[6]) return 1; FALLTHROUGH; 409 case 5: if (a[5] > b[5]) return 0; if (a[5] < b[5]) return 1; FALLTHROUGH; 410 case 4: if (a[4] > b[4]) return 0; if (a[4] < b[4]) return 1; FALLTHROUGH; 411 case 3: if (a[3] > b[3]) return 0; if (a[3] < b[3]) return 1; FALLTHROUGH; 412 case 2: if (a[2] > b[2]) return 0; if (a[2] < b[2]) return 1; FALLTHROUGH; 413 case 1: if (a[1] > b[1]) return 0; if (a[1] < b[1]) return 1; FALLTHROUGH; 414 case 0: if (a[0] > b[0]) return 0; if (a[0] < b[0]) return 1; 415 } 416 return 0; 417 } 418 419 /* is a <= b */ 420 static int 421 lte256_modm_batch(const bignum256modm a, const bignum256modm b, size_t limbsize) { 422 switch (limbsize) { 423 case 8: if (a[8] > b[8]) return 0; if (a[8] < b[8]) return 1; FALLTHROUGH; 424 case 7: if (a[7] > b[7]) return 0; if (a[7] < b[7]) return 1; FALLTHROUGH; 425 case 6: if (a[6] > b[6]) return 0; if (a[6] < b[6]) return 1; FALLTHROUGH; 426 case 5: if (a[5] > b[5]) return 0; if (a[5] < b[5]) return 1; FALLTHROUGH; 427 case 4: if (a[4] > b[4]) return 0; if (a[4] < b[4]) return 1; FALLTHROUGH; 428 case 3: if (a[3] > b[3]) return 0; if (a[3] < b[3]) return 1; FALLTHROUGH; 429 case 2: if (a[2] > b[2]) return 0; if (a[2] < b[2]) return 1; FALLTHROUGH; 430 case 1: if (a[1] > b[1]) return 0; if (a[1] < b[1]) return 1; FALLTHROUGH; 431 case 0: if (a[0] > b[0]) return 0; if (a[0] < b[0]) return 1; 432 } 433 return 1; 434 } 435 436 437 /* is a == 0 */ 438 static int 439 iszero256_modm_batch(const bignum256modm a) { 440 size_t i; 441 for (i = 0; i < 9; i++) 442 if (a[i]) 443 return 0; 444 return 1; 445 } 446 447 /* is a == 1 */ 448 static int 449 isone256_modm_batch(const bignum256modm a) { 450 size_t i; 451 if (a[0] != 1) 452 return 0; 453 for (i = 1; i < 9; i++) 454 if (a[i]) 455 return 0; 456 return 1; 457 } 458 459 /* can a fit in to (at most) 128 bits */ 460 static int 461 isatmost128bits256_modm_batch(const bignum256modm a) { 462 uint32_t mask = 463 ((a[8] ) | /* 16 */ 464 (a[7] ) | /* 46 */ 465 (a[6] ) | /* 76 */ 466 (a[5] ) | /* 106 */ 467 (a[4] & 0x3fffff00)); /* 128 */ 468 469 return (mask == 0); 470 }