tor

The Tor anonymity network
git clone https://git.dasho.dev/tor.git
Log | Files | Refs | README | LICENSE

curve25519-ref10.c (38169B)


      1 #include <stdint.h>
      2 
      3 typedef int32_t crypto_int32;
      4 typedef int64_t crypto_int64;
      5 typedef uint64_t crypto_uint64;
      6 
      7 typedef crypto_int32 fe[10];
      8 
      9 /*
     10 h = 0
     11 */
     12 
     13 void fe_0(fe h)
     14 {
     15  h[0] = 0;
     16  h[1] = 0;
     17  h[2] = 0;
     18  h[3] = 0;
     19  h[4] = 0;
     20  h[5] = 0;
     21  h[6] = 0;
     22  h[7] = 0;
     23  h[8] = 0;
     24  h[9] = 0;
     25 }
     26 
     27 /*
     28 h = 1
     29 */
     30 
     31 void fe_1(fe h)
     32 {
     33  h[0] = 1;
     34  h[1] = 0;
     35  h[2] = 0;
     36  h[3] = 0;
     37  h[4] = 0;
     38  h[5] = 0;
     39  h[6] = 0;
     40  h[7] = 0;
     41  h[8] = 0;
     42  h[9] = 0;
     43 }
     44 
     45 /*
     46 h = f + g
     47 Can overlap h with f or g.
     48 
     49 Preconditions:
     50   |f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
     51   |g| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
     52 
     53 Postconditions:
     54   |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
     55 */
     56 
     57 void fe_add(fe h,fe f,fe g)
     58 {
     59  crypto_int32 f0 = f[0];
     60  crypto_int32 f1 = f[1];
     61  crypto_int32 f2 = f[2];
     62  crypto_int32 f3 = f[3];
     63  crypto_int32 f4 = f[4];
     64  crypto_int32 f5 = f[5];
     65  crypto_int32 f6 = f[6];
     66  crypto_int32 f7 = f[7];
     67  crypto_int32 f8 = f[8];
     68  crypto_int32 f9 = f[9];
     69  crypto_int32 g0 = g[0];
     70  crypto_int32 g1 = g[1];
     71  crypto_int32 g2 = g[2];
     72  crypto_int32 g3 = g[3];
     73  crypto_int32 g4 = g[4];
     74  crypto_int32 g5 = g[5];
     75  crypto_int32 g6 = g[6];
     76  crypto_int32 g7 = g[7];
     77  crypto_int32 g8 = g[8];
     78  crypto_int32 g9 = g[9];
     79  crypto_int32 h0 = f0 + g0;
     80  crypto_int32 h1 = f1 + g1;
     81  crypto_int32 h2 = f2 + g2;
     82  crypto_int32 h3 = f3 + g3;
     83  crypto_int32 h4 = f4 + g4;
     84  crypto_int32 h5 = f5 + g5;
     85  crypto_int32 h6 = f6 + g6;
     86  crypto_int32 h7 = f7 + g7;
     87  crypto_int32 h8 = f8 + g8;
     88  crypto_int32 h9 = f9 + g9;
     89  h[0] = h0;
     90  h[1] = h1;
     91  h[2] = h2;
     92  h[3] = h3;
     93  h[4] = h4;
     94  h[5] = h5;
     95  h[6] = h6;
     96  h[7] = h7;
     97  h[8] = h8;
     98  h[9] = h9;
     99 }
    100 
    101 /*
    102 h = f
    103 */
    104 
    105 void fe_copy(fe h,fe f)
    106 {
    107  crypto_int32 f0 = f[0];
    108  crypto_int32 f1 = f[1];
    109  crypto_int32 f2 = f[2];
    110  crypto_int32 f3 = f[3];
    111  crypto_int32 f4 = f[4];
    112  crypto_int32 f5 = f[5];
    113  crypto_int32 f6 = f[6];
    114  crypto_int32 f7 = f[7];
    115  crypto_int32 f8 = f[8];
    116  crypto_int32 f9 = f[9];
    117  h[0] = f0;
    118  h[1] = f1;
    119  h[2] = f2;
    120  h[3] = f3;
    121  h[4] = f4;
    122  h[5] = f5;
    123  h[6] = f6;
    124  h[7] = f7;
    125  h[8] = f8;
    126  h[9] = f9;
    127 }
    128 
    129 
    130 /*
    131 Replace (f,g) with (g,f) if b == 1;
    132 replace (f,g) with (f,g) if b == 0.
    133 
    134 Preconditions: b in {0,1}.
    135 */
    136 
    137 void fe_cswap(fe f,fe g,unsigned int b)
    138 {
    139  crypto_int32 f0 = f[0];
    140  crypto_int32 f1 = f[1];
    141  crypto_int32 f2 = f[2];
    142  crypto_int32 f3 = f[3];
    143  crypto_int32 f4 = f[4];
    144  crypto_int32 f5 = f[5];
    145  crypto_int32 f6 = f[6];
    146  crypto_int32 f7 = f[7];
    147  crypto_int32 f8 = f[8];
    148  crypto_int32 f9 = f[9];
    149  crypto_int32 g0 = g[0];
    150  crypto_int32 g1 = g[1];
    151  crypto_int32 g2 = g[2];
    152  crypto_int32 g3 = g[3];
    153  crypto_int32 g4 = g[4];
    154  crypto_int32 g5 = g[5];
    155  crypto_int32 g6 = g[6];
    156  crypto_int32 g7 = g[7];
    157  crypto_int32 g8 = g[8];
    158  crypto_int32 g9 = g[9];
    159  crypto_int32 x0 = f0 ^ g0;
    160  crypto_int32 x1 = f1 ^ g1;
    161  crypto_int32 x2 = f2 ^ g2;
    162  crypto_int32 x3 = f3 ^ g3;
    163  crypto_int32 x4 = f4 ^ g4;
    164  crypto_int32 x5 = f5 ^ g5;
    165  crypto_int32 x6 = f6 ^ g6;
    166  crypto_int32 x7 = f7 ^ g7;
    167  crypto_int32 x8 = f8 ^ g8;
    168  crypto_int32 x9 = f9 ^ g9;
    169  b = -b;
    170  x0 &= b;
    171  x1 &= b;
    172  x2 &= b;
    173  x3 &= b;
    174  x4 &= b;
    175  x5 &= b;
    176  x6 &= b;
    177  x7 &= b;
    178  x8 &= b;
    179  x9 &= b;
    180  f[0] = f0 ^ x0;
    181  f[1] = f1 ^ x1;
    182  f[2] = f2 ^ x2;
    183  f[3] = f3 ^ x3;
    184  f[4] = f4 ^ x4;
    185  f[5] = f5 ^ x5;
    186  f[6] = f6 ^ x6;
    187  f[7] = f7 ^ x7;
    188  f[8] = f8 ^ x8;
    189  f[9] = f9 ^ x9;
    190  g[0] = g0 ^ x0;
    191  g[1] = g1 ^ x1;
    192  g[2] = g2 ^ x2;
    193  g[3] = g3 ^ x3;
    194  g[4] = g4 ^ x4;
    195  g[5] = g5 ^ x5;
    196  g[6] = g6 ^ x6;
    197  g[7] = g7 ^ x7;
    198  g[8] = g8 ^ x8;
    199  g[9] = g9 ^ x9;
    200 }
    201 
    202 static crypto_uint64 load_3(const unsigned char *in)
    203 {
    204  crypto_uint64 result;
    205  result = (crypto_uint64) in[0];
    206  result |= ((crypto_uint64) in[1]) << 8;
    207  result |= ((crypto_uint64) in[2]) << 16;
    208  return result;
    209 }
    210 
    211 static crypto_uint64 load_4(const unsigned char *in)
    212 {
    213  crypto_uint64 result;
    214  result = (crypto_uint64) in[0];
    215  result |= ((crypto_uint64) in[1]) << 8;
    216  result |= ((crypto_uint64) in[2]) << 16;
    217  result |= ((crypto_uint64) in[3]) << 24;
    218  return result;
    219 }
    220 
    221 void fe_frombytes(fe h,const unsigned char *s)
    222 {
    223  crypto_int64 h0 = load_4(s);
    224  crypto_int64 h1 = load_3(s + 4) << 6;
    225  crypto_int64 h2 = load_3(s + 7) << 5;
    226  crypto_int64 h3 = load_3(s + 10) << 3;
    227  crypto_int64 h4 = load_3(s + 13) << 2;
    228  crypto_int64 h5 = load_4(s + 16);
    229  crypto_int64 h6 = load_3(s + 20) << 7;
    230  crypto_int64 h7 = load_3(s + 23) << 5;
    231  crypto_int64 h8 = load_3(s + 26) << 4;
    232  crypto_int64 h9 = load_3(s + 29) << 2;
    233  crypto_int64 carry0;
    234  crypto_int64 carry1;
    235  crypto_int64 carry2;
    236  crypto_int64 carry3;
    237  crypto_int64 carry4;
    238  crypto_int64 carry5;
    239  crypto_int64 carry6;
    240  crypto_int64 carry7;
    241  crypto_int64 carry8;
    242  crypto_int64 carry9;
    243 
    244  carry9 = (h9 + (crypto_int64) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= carry9 << 25;
    245  carry1 = (h1 + (crypto_int64) (1<<24)) >> 25; h2 += carry1; h1 -= carry1 << 25;
    246  carry3 = (h3 + (crypto_int64) (1<<24)) >> 25; h4 += carry3; h3 -= carry3 << 25;
    247  carry5 = (h5 + (crypto_int64) (1<<24)) >> 25; h6 += carry5; h5 -= carry5 << 25;
    248  carry7 = (h7 + (crypto_int64) (1<<24)) >> 25; h8 += carry7; h7 -= carry7 << 25;
    249 
    250  carry0 = (h0 + (crypto_int64) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
    251  carry2 = (h2 + (crypto_int64) (1<<25)) >> 26; h3 += carry2; h2 -= carry2 << 26;
    252  carry4 = (h4 + (crypto_int64) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
    253  carry6 = (h6 + (crypto_int64) (1<<25)) >> 26; h7 += carry6; h6 -= carry6 << 26;
    254  carry8 = (h8 + (crypto_int64) (1<<25)) >> 26; h9 += carry8; h8 -= carry8 << 26;
    255 
    256  h[0] = h0;
    257  h[1] = h1;
    258  h[2] = h2;
    259  h[3] = h3;
    260  h[4] = h4;
    261  h[5] = h5;
    262  h[6] = h6;
    263  h[7] = h7;
    264  h[8] = h8;
    265  h[9] = h9;
    266 }
    267 
    268 
    269 /*
    270 h = f * g
    271 Can overlap h with f or g.
    272 
    273 Preconditions:
    274   |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
    275   |g| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
    276 
    277 Postconditions:
    278   |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
    279 */
    280 
    281 /*
    282 Notes on implementation strategy:
    283 
    284 Using schoolbook multiplication.
    285 Karatsuba would save a little in some cost models.
    286 
    287 Most multiplications by 2 and 19 are 32-bit precomputations;
    288 cheaper than 64-bit postcomputations.
    289 
    290 There is one remaining multiplication by 19 in the carry chain;
    291 one *19 precomputation can be merged into this,
    292 but the resulting data flow is considerably less clean.
    293 
    294 There are 12 carries below.
    295 10 of them are 2-way parallelizable and vectorizable.
    296 Can get away with 11 carries, but then data flow is much deeper.
    297 
    298 With tighter constraints on inputs can squeeze carries into int32.
    299 */
    300 
    301 void fe_mul(fe h,fe f,fe g)
    302 {
    303  crypto_int32 f0 = f[0];
    304  crypto_int32 f1 = f[1];
    305  crypto_int32 f2 = f[2];
    306  crypto_int32 f3 = f[3];
    307  crypto_int32 f4 = f[4];
    308  crypto_int32 f5 = f[5];
    309  crypto_int32 f6 = f[6];
    310  crypto_int32 f7 = f[7];
    311  crypto_int32 f8 = f[8];
    312  crypto_int32 f9 = f[9];
    313  crypto_int32 g0 = g[0];
    314  crypto_int32 g1 = g[1];
    315  crypto_int32 g2 = g[2];
    316  crypto_int32 g3 = g[3];
    317  crypto_int32 g4 = g[4];
    318  crypto_int32 g5 = g[5];
    319  crypto_int32 g6 = g[6];
    320  crypto_int32 g7 = g[7];
    321  crypto_int32 g8 = g[8];
    322  crypto_int32 g9 = g[9];
    323  crypto_int32 g1_19 = 19 * g1; /* 1.4*2^29 */
    324  crypto_int32 g2_19 = 19 * g2; /* 1.4*2^30; still ok */
    325  crypto_int32 g3_19 = 19 * g3;
    326  crypto_int32 g4_19 = 19 * g4;
    327  crypto_int32 g5_19 = 19 * g5;
    328  crypto_int32 g6_19 = 19 * g6;
    329  crypto_int32 g7_19 = 19 * g7;
    330  crypto_int32 g8_19 = 19 * g8;
    331  crypto_int32 g9_19 = 19 * g9;
    332  crypto_int32 f1_2 = 2 * f1;
    333  crypto_int32 f3_2 = 2 * f3;
    334  crypto_int32 f5_2 = 2 * f5;
    335  crypto_int32 f7_2 = 2 * f7;
    336  crypto_int32 f9_2 = 2 * f9;
    337  crypto_int64 f0g0    = f0   * (crypto_int64) g0;
    338  crypto_int64 f0g1    = f0   * (crypto_int64) g1;
    339  crypto_int64 f0g2    = f0   * (crypto_int64) g2;
    340  crypto_int64 f0g3    = f0   * (crypto_int64) g3;
    341  crypto_int64 f0g4    = f0   * (crypto_int64) g4;
    342  crypto_int64 f0g5    = f0   * (crypto_int64) g5;
    343  crypto_int64 f0g6    = f0   * (crypto_int64) g6;
    344  crypto_int64 f0g7    = f0   * (crypto_int64) g7;
    345  crypto_int64 f0g8    = f0   * (crypto_int64) g8;
    346  crypto_int64 f0g9    = f0   * (crypto_int64) g9;
    347  crypto_int64 f1g0    = f1   * (crypto_int64) g0;
    348  crypto_int64 f1g1_2  = f1_2 * (crypto_int64) g1;
    349  crypto_int64 f1g2    = f1   * (crypto_int64) g2;
    350  crypto_int64 f1g3_2  = f1_2 * (crypto_int64) g3;
    351  crypto_int64 f1g4    = f1   * (crypto_int64) g4;
    352  crypto_int64 f1g5_2  = f1_2 * (crypto_int64) g5;
    353  crypto_int64 f1g6    = f1   * (crypto_int64) g6;
    354  crypto_int64 f1g7_2  = f1_2 * (crypto_int64) g7;
    355  crypto_int64 f1g8    = f1   * (crypto_int64) g8;
    356  crypto_int64 f1g9_38 = f1_2 * (crypto_int64) g9_19;
    357  crypto_int64 f2g0    = f2   * (crypto_int64) g0;
    358  crypto_int64 f2g1    = f2   * (crypto_int64) g1;
    359  crypto_int64 f2g2    = f2   * (crypto_int64) g2;
    360  crypto_int64 f2g3    = f2   * (crypto_int64) g3;
    361  crypto_int64 f2g4    = f2   * (crypto_int64) g4;
    362  crypto_int64 f2g5    = f2   * (crypto_int64) g5;
    363  crypto_int64 f2g6    = f2   * (crypto_int64) g6;
    364  crypto_int64 f2g7    = f2   * (crypto_int64) g7;
    365  crypto_int64 f2g8_19 = f2   * (crypto_int64) g8_19;
    366  crypto_int64 f2g9_19 = f2   * (crypto_int64) g9_19;
    367  crypto_int64 f3g0    = f3   * (crypto_int64) g0;
    368  crypto_int64 f3g1_2  = f3_2 * (crypto_int64) g1;
    369  crypto_int64 f3g2    = f3   * (crypto_int64) g2;
    370  crypto_int64 f3g3_2  = f3_2 * (crypto_int64) g3;
    371  crypto_int64 f3g4    = f3   * (crypto_int64) g4;
    372  crypto_int64 f3g5_2  = f3_2 * (crypto_int64) g5;
    373  crypto_int64 f3g6    = f3   * (crypto_int64) g6;
    374  crypto_int64 f3g7_38 = f3_2 * (crypto_int64) g7_19;
    375  crypto_int64 f3g8_19 = f3   * (crypto_int64) g8_19;
    376  crypto_int64 f3g9_38 = f3_2 * (crypto_int64) g9_19;
    377  crypto_int64 f4g0    = f4   * (crypto_int64) g0;
    378  crypto_int64 f4g1    = f4   * (crypto_int64) g1;
    379  crypto_int64 f4g2    = f4   * (crypto_int64) g2;
    380  crypto_int64 f4g3    = f4   * (crypto_int64) g3;
    381  crypto_int64 f4g4    = f4   * (crypto_int64) g4;
    382  crypto_int64 f4g5    = f4   * (crypto_int64) g5;
    383  crypto_int64 f4g6_19 = f4   * (crypto_int64) g6_19;
    384  crypto_int64 f4g7_19 = f4   * (crypto_int64) g7_19;
    385  crypto_int64 f4g8_19 = f4   * (crypto_int64) g8_19;
    386  crypto_int64 f4g9_19 = f4   * (crypto_int64) g9_19;
    387  crypto_int64 f5g0    = f5   * (crypto_int64) g0;
    388  crypto_int64 f5g1_2  = f5_2 * (crypto_int64) g1;
    389  crypto_int64 f5g2    = f5   * (crypto_int64) g2;
    390  crypto_int64 f5g3_2  = f5_2 * (crypto_int64) g3;
    391  crypto_int64 f5g4    = f5   * (crypto_int64) g4;
    392  crypto_int64 f5g5_38 = f5_2 * (crypto_int64) g5_19;
    393  crypto_int64 f5g6_19 = f5   * (crypto_int64) g6_19;
    394  crypto_int64 f5g7_38 = f5_2 * (crypto_int64) g7_19;
    395  crypto_int64 f5g8_19 = f5   * (crypto_int64) g8_19;
    396  crypto_int64 f5g9_38 = f5_2 * (crypto_int64) g9_19;
    397  crypto_int64 f6g0    = f6   * (crypto_int64) g0;
    398  crypto_int64 f6g1    = f6   * (crypto_int64) g1;
    399  crypto_int64 f6g2    = f6   * (crypto_int64) g2;
    400  crypto_int64 f6g3    = f6   * (crypto_int64) g3;
    401  crypto_int64 f6g4_19 = f6   * (crypto_int64) g4_19;
    402  crypto_int64 f6g5_19 = f6   * (crypto_int64) g5_19;
    403  crypto_int64 f6g6_19 = f6   * (crypto_int64) g6_19;
    404  crypto_int64 f6g7_19 = f6   * (crypto_int64) g7_19;
    405  crypto_int64 f6g8_19 = f6   * (crypto_int64) g8_19;
    406  crypto_int64 f6g9_19 = f6   * (crypto_int64) g9_19;
    407  crypto_int64 f7g0    = f7   * (crypto_int64) g0;
    408  crypto_int64 f7g1_2  = f7_2 * (crypto_int64) g1;
    409  crypto_int64 f7g2    = f7   * (crypto_int64) g2;
    410  crypto_int64 f7g3_38 = f7_2 * (crypto_int64) g3_19;
    411  crypto_int64 f7g4_19 = f7   * (crypto_int64) g4_19;
    412  crypto_int64 f7g5_38 = f7_2 * (crypto_int64) g5_19;
    413  crypto_int64 f7g6_19 = f7   * (crypto_int64) g6_19;
    414  crypto_int64 f7g7_38 = f7_2 * (crypto_int64) g7_19;
    415  crypto_int64 f7g8_19 = f7   * (crypto_int64) g8_19;
    416  crypto_int64 f7g9_38 = f7_2 * (crypto_int64) g9_19;
    417  crypto_int64 f8g0    = f8   * (crypto_int64) g0;
    418  crypto_int64 f8g1    = f8   * (crypto_int64) g1;
    419  crypto_int64 f8g2_19 = f8   * (crypto_int64) g2_19;
    420  crypto_int64 f8g3_19 = f8   * (crypto_int64) g3_19;
    421  crypto_int64 f8g4_19 = f8   * (crypto_int64) g4_19;
    422  crypto_int64 f8g5_19 = f8   * (crypto_int64) g5_19;
    423  crypto_int64 f8g6_19 = f8   * (crypto_int64) g6_19;
    424  crypto_int64 f8g7_19 = f8   * (crypto_int64) g7_19;
    425  crypto_int64 f8g8_19 = f8   * (crypto_int64) g8_19;
    426  crypto_int64 f8g9_19 = f8   * (crypto_int64) g9_19;
    427  crypto_int64 f9g0    = f9   * (crypto_int64) g0;
    428  crypto_int64 f9g1_38 = f9_2 * (crypto_int64) g1_19;
    429  crypto_int64 f9g2_19 = f9   * (crypto_int64) g2_19;
    430  crypto_int64 f9g3_38 = f9_2 * (crypto_int64) g3_19;
    431  crypto_int64 f9g4_19 = f9   * (crypto_int64) g4_19;
    432  crypto_int64 f9g5_38 = f9_2 * (crypto_int64) g5_19;
    433  crypto_int64 f9g6_19 = f9   * (crypto_int64) g6_19;
    434  crypto_int64 f9g7_38 = f9_2 * (crypto_int64) g7_19;
    435  crypto_int64 f9g8_19 = f9   * (crypto_int64) g8_19;
    436  crypto_int64 f9g9_38 = f9_2 * (crypto_int64) g9_19;
    437  crypto_int64 h0 = f0g0+f1g9_38+f2g8_19+f3g7_38+f4g6_19+f5g5_38+f6g4_19+f7g3_38+f8g2_19+f9g1_38;
    438  crypto_int64 h1 = f0g1+f1g0   +f2g9_19+f3g8_19+f4g7_19+f5g6_19+f6g5_19+f7g4_19+f8g3_19+f9g2_19;
    439  crypto_int64 h2 = f0g2+f1g1_2 +f2g0   +f3g9_38+f4g8_19+f5g7_38+f6g6_19+f7g5_38+f8g4_19+f9g3_38;
    440  crypto_int64 h3 = f0g3+f1g2   +f2g1   +f3g0   +f4g9_19+f5g8_19+f6g7_19+f7g6_19+f8g5_19+f9g4_19;
    441  crypto_int64 h4 = f0g4+f1g3_2 +f2g2   +f3g1_2 +f4g0   +f5g9_38+f6g8_19+f7g7_38+f8g6_19+f9g5_38;
    442  crypto_int64 h5 = f0g5+f1g4   +f2g3   +f3g2   +f4g1   +f5g0   +f6g9_19+f7g8_19+f8g7_19+f9g6_19;
    443  crypto_int64 h6 = f0g6+f1g5_2 +f2g4   +f3g3_2 +f4g2   +f5g1_2 +f6g0   +f7g9_38+f8g8_19+f9g7_38;
    444  crypto_int64 h7 = f0g7+f1g6   +f2g5   +f3g4   +f4g3   +f5g2   +f6g1   +f7g0   +f8g9_19+f9g8_19;
    445  crypto_int64 h8 = f0g8+f1g7_2 +f2g6   +f3g5_2 +f4g4   +f5g3_2 +f6g2   +f7g1_2 +f8g0   +f9g9_38;
    446  crypto_int64 h9 = f0g9+f1g8   +f2g7   +f3g6   +f4g5   +f5g4   +f6g3   +f7g2   +f8g1   +f9g0   ;
    447  crypto_int64 carry0;
    448  crypto_int64 carry1;
    449  crypto_int64 carry2;
    450  crypto_int64 carry3;
    451  crypto_int64 carry4;
    452  crypto_int64 carry5;
    453  crypto_int64 carry6;
    454  crypto_int64 carry7;
    455  crypto_int64 carry8;
    456  crypto_int64 carry9;
    457 
    458  /*
    459  |h0| <= (1.1*1.1*2^52*(1+19+19+19+19)+1.1*1.1*2^50*(38+38+38+38+38))
    460    i.e. |h0| <= 1.2*2^59; narrower ranges for h2, h4, h6, h8
    461  |h1| <= (1.1*1.1*2^51*(1+1+19+19+19+19+19+19+19+19))
    462    i.e. |h1| <= 1.5*2^58; narrower ranges for h3, h5, h7, h9
    463  */
    464 
    465  carry0 = (h0 + (crypto_int64) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
    466  carry4 = (h4 + (crypto_int64) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
    467  /* |h0| <= 2^25 */
    468  /* |h4| <= 2^25 */
    469  /* |h1| <= 1.51*2^58 */
    470  /* |h5| <= 1.51*2^58 */
    471 
    472  carry1 = (h1 + (crypto_int64) (1<<24)) >> 25; h2 += carry1; h1 -= carry1 << 25;
    473  carry5 = (h5 + (crypto_int64) (1<<24)) >> 25; h6 += carry5; h5 -= carry5 << 25;
    474  /* |h1| <= 2^24; from now on fits into int32 */
    475  /* |h5| <= 2^24; from now on fits into int32 */
    476  /* |h2| <= 1.21*2^59 */
    477  /* |h6| <= 1.21*2^59 */
    478 
    479  carry2 = (h2 + (crypto_int64) (1<<25)) >> 26; h3 += carry2; h2 -= carry2 << 26;
    480  carry6 = (h6 + (crypto_int64) (1<<25)) >> 26; h7 += carry6; h6 -= carry6 << 26;
    481  /* |h2| <= 2^25; from now on fits into int32 unchanged */
    482  /* |h6| <= 2^25; from now on fits into int32 unchanged */
    483  /* |h3| <= 1.51*2^58 */
    484  /* |h7| <= 1.51*2^58 */
    485 
    486  carry3 = (h3 + (crypto_int64) (1<<24)) >> 25; h4 += carry3; h3 -= carry3 << 25;
    487  carry7 = (h7 + (crypto_int64) (1<<24)) >> 25; h8 += carry7; h7 -= carry7 << 25;
    488  /* |h3| <= 2^24; from now on fits into int32 unchanged */
    489  /* |h7| <= 2^24; from now on fits into int32 unchanged */
    490  /* |h4| <= 1.52*2^33 */
    491  /* |h8| <= 1.52*2^33 */
    492 
    493  carry4 = (h4 + (crypto_int64) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
    494  carry8 = (h8 + (crypto_int64) (1<<25)) >> 26; h9 += carry8; h8 -= carry8 << 26;
    495  /* |h4| <= 2^25; from now on fits into int32 unchanged */
    496  /* |h8| <= 2^25; from now on fits into int32 unchanged */
    497  /* |h5| <= 1.01*2^24 */
    498  /* |h9| <= 1.51*2^58 */
    499 
    500  carry9 = (h9 + (crypto_int64) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= carry9 << 25;
    501  /* |h9| <= 2^24; from now on fits into int32 unchanged */
    502  /* |h0| <= 1.8*2^37 */
    503 
    504  carry0 = (h0 + (crypto_int64) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
    505  /* |h0| <= 2^25; from now on fits into int32 unchanged */
    506  /* |h1| <= 1.01*2^24 */
    507 
    508  h[0] = h0;
    509  h[1] = h1;
    510  h[2] = h2;
    511  h[3] = h3;
    512  h[4] = h4;
    513  h[5] = h5;
    514  h[6] = h6;
    515  h[7] = h7;
    516  h[8] = h8;
    517  h[9] = h9;
    518 }
    519 
    520 /*
    521 h = f * 121666
    522 Can overlap h with f.
    523 
    524 Preconditions:
    525   |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
    526 
    527 Postconditions:
    528   |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
    529 */
    530 
    531 void fe_mul121666(fe h,fe f)
    532 {
    533  crypto_int32 f0 = f[0];
    534  crypto_int32 f1 = f[1];
    535  crypto_int32 f2 = f[2];
    536  crypto_int32 f3 = f[3];
    537  crypto_int32 f4 = f[4];
    538  crypto_int32 f5 = f[5];
    539  crypto_int32 f6 = f[6];
    540  crypto_int32 f7 = f[7];
    541  crypto_int32 f8 = f[8];
    542  crypto_int32 f9 = f[9];
    543  crypto_int64 h0 = f0 * (crypto_int64) 121666;
    544  crypto_int64 h1 = f1 * (crypto_int64) 121666;
    545  crypto_int64 h2 = f2 * (crypto_int64) 121666;
    546  crypto_int64 h3 = f3 * (crypto_int64) 121666;
    547  crypto_int64 h4 = f4 * (crypto_int64) 121666;
    548  crypto_int64 h5 = f5 * (crypto_int64) 121666;
    549  crypto_int64 h6 = f6 * (crypto_int64) 121666;
    550  crypto_int64 h7 = f7 * (crypto_int64) 121666;
    551  crypto_int64 h8 = f8 * (crypto_int64) 121666;
    552  crypto_int64 h9 = f9 * (crypto_int64) 121666;
    553  crypto_int64 carry0;
    554  crypto_int64 carry1;
    555  crypto_int64 carry2;
    556  crypto_int64 carry3;
    557  crypto_int64 carry4;
    558  crypto_int64 carry5;
    559  crypto_int64 carry6;
    560  crypto_int64 carry7;
    561  crypto_int64 carry8;
    562  crypto_int64 carry9;
    563 
    564  carry9 = (h9 + (crypto_int64) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= carry9 << 25;
    565  carry1 = (h1 + (crypto_int64) (1<<24)) >> 25; h2 += carry1; h1 -= carry1 << 25;
    566  carry3 = (h3 + (crypto_int64) (1<<24)) >> 25; h4 += carry3; h3 -= carry3 << 25;
    567  carry5 = (h5 + (crypto_int64) (1<<24)) >> 25; h6 += carry5; h5 -= carry5 << 25;
    568  carry7 = (h7 + (crypto_int64) (1<<24)) >> 25; h8 += carry7; h7 -= carry7 << 25;
    569 
    570  carry0 = (h0 + (crypto_int64) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
    571  carry2 = (h2 + (crypto_int64) (1<<25)) >> 26; h3 += carry2; h2 -= carry2 << 26;
    572  carry4 = (h4 + (crypto_int64) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
    573  carry6 = (h6 + (crypto_int64) (1<<25)) >> 26; h7 += carry6; h6 -= carry6 << 26;
    574  carry8 = (h8 + (crypto_int64) (1<<25)) >> 26; h9 += carry8; h8 -= carry8 << 26;
    575 
    576  h[0] = h0;
    577  h[1] = h1;
    578  h[2] = h2;
    579  h[3] = h3;
    580  h[4] = h4;
    581  h[5] = h5;
    582  h[6] = h6;
    583  h[7] = h7;
    584  h[8] = h8;
    585  h[9] = h9;
    586 }
    587 
    588 /*
    589 h = f * f
    590 Can overlap h with f.
    591 
    592 Preconditions:
    593   |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
    594 
    595 Postconditions:
    596   |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
    597 */
    598 
    599 /*
    600 See fe_mul.c for discussion of implementation strategy.
    601 */
    602 
    603 void fe_sq(fe h,fe f)
    604 {
    605  crypto_int32 f0 = f[0];
    606  crypto_int32 f1 = f[1];
    607  crypto_int32 f2 = f[2];
    608  crypto_int32 f3 = f[3];
    609  crypto_int32 f4 = f[4];
    610  crypto_int32 f5 = f[5];
    611  crypto_int32 f6 = f[6];
    612  crypto_int32 f7 = f[7];
    613  crypto_int32 f8 = f[8];
    614  crypto_int32 f9 = f[9];
    615  crypto_int32 f0_2 = 2 * f0;
    616  crypto_int32 f1_2 = 2 * f1;
    617  crypto_int32 f2_2 = 2 * f2;
    618  crypto_int32 f3_2 = 2 * f3;
    619  crypto_int32 f4_2 = 2 * f4;
    620  crypto_int32 f5_2 = 2 * f5;
    621  crypto_int32 f6_2 = 2 * f6;
    622  crypto_int32 f7_2 = 2 * f7;
    623  crypto_int32 f5_38 = 38 * f5; /* 1.31*2^30 */
    624  crypto_int32 f6_19 = 19 * f6; /* 1.31*2^30 */
    625  crypto_int32 f7_38 = 38 * f7; /* 1.31*2^30 */
    626  crypto_int32 f8_19 = 19 * f8; /* 1.31*2^30 */
    627  crypto_int32 f9_38 = 38 * f9; /* 1.31*2^30 */
    628  crypto_int64 f0f0    = f0   * (crypto_int64) f0;
    629  crypto_int64 f0f1_2  = f0_2 * (crypto_int64) f1;
    630  crypto_int64 f0f2_2  = f0_2 * (crypto_int64) f2;
    631  crypto_int64 f0f3_2  = f0_2 * (crypto_int64) f3;
    632  crypto_int64 f0f4_2  = f0_2 * (crypto_int64) f4;
    633  crypto_int64 f0f5_2  = f0_2 * (crypto_int64) f5;
    634  crypto_int64 f0f6_2  = f0_2 * (crypto_int64) f6;
    635  crypto_int64 f0f7_2  = f0_2 * (crypto_int64) f7;
    636  crypto_int64 f0f8_2  = f0_2 * (crypto_int64) f8;
    637  crypto_int64 f0f9_2  = f0_2 * (crypto_int64) f9;
    638  crypto_int64 f1f1_2  = f1_2 * (crypto_int64) f1;
    639  crypto_int64 f1f2_2  = f1_2 * (crypto_int64) f2;
    640  crypto_int64 f1f3_4  = f1_2 * (crypto_int64) f3_2;
    641  crypto_int64 f1f4_2  = f1_2 * (crypto_int64) f4;
    642  crypto_int64 f1f5_4  = f1_2 * (crypto_int64) f5_2;
    643  crypto_int64 f1f6_2  = f1_2 * (crypto_int64) f6;
    644  crypto_int64 f1f7_4  = f1_2 * (crypto_int64) f7_2;
    645  crypto_int64 f1f8_2  = f1_2 * (crypto_int64) f8;
    646  crypto_int64 f1f9_76 = f1_2 * (crypto_int64) f9_38;
    647  crypto_int64 f2f2    = f2   * (crypto_int64) f2;
    648  crypto_int64 f2f3_2  = f2_2 * (crypto_int64) f3;
    649  crypto_int64 f2f4_2  = f2_2 * (crypto_int64) f4;
    650  crypto_int64 f2f5_2  = f2_2 * (crypto_int64) f5;
    651  crypto_int64 f2f6_2  = f2_2 * (crypto_int64) f6;
    652  crypto_int64 f2f7_2  = f2_2 * (crypto_int64) f7;
    653  crypto_int64 f2f8_38 = f2_2 * (crypto_int64) f8_19;
    654  crypto_int64 f2f9_38 = f2   * (crypto_int64) f9_38;
    655  crypto_int64 f3f3_2  = f3_2 * (crypto_int64) f3;
    656  crypto_int64 f3f4_2  = f3_2 * (crypto_int64) f4;
    657  crypto_int64 f3f5_4  = f3_2 * (crypto_int64) f5_2;
    658  crypto_int64 f3f6_2  = f3_2 * (crypto_int64) f6;
    659  crypto_int64 f3f7_76 = f3_2 * (crypto_int64) f7_38;
    660  crypto_int64 f3f8_38 = f3_2 * (crypto_int64) f8_19;
    661  crypto_int64 f3f9_76 = f3_2 * (crypto_int64) f9_38;
    662  crypto_int64 f4f4    = f4   * (crypto_int64) f4;
    663  crypto_int64 f4f5_2  = f4_2 * (crypto_int64) f5;
    664  crypto_int64 f4f6_38 = f4_2 * (crypto_int64) f6_19;
    665  crypto_int64 f4f7_38 = f4   * (crypto_int64) f7_38;
    666  crypto_int64 f4f8_38 = f4_2 * (crypto_int64) f8_19;
    667  crypto_int64 f4f9_38 = f4   * (crypto_int64) f9_38;
    668  crypto_int64 f5f5_38 = f5   * (crypto_int64) f5_38;
    669  crypto_int64 f5f6_38 = f5_2 * (crypto_int64) f6_19;
    670  crypto_int64 f5f7_76 = f5_2 * (crypto_int64) f7_38;
    671  crypto_int64 f5f8_38 = f5_2 * (crypto_int64) f8_19;
    672  crypto_int64 f5f9_76 = f5_2 * (crypto_int64) f9_38;
    673  crypto_int64 f6f6_19 = f6   * (crypto_int64) f6_19;
    674  crypto_int64 f6f7_38 = f6   * (crypto_int64) f7_38;
    675  crypto_int64 f6f8_38 = f6_2 * (crypto_int64) f8_19;
    676  crypto_int64 f6f9_38 = f6   * (crypto_int64) f9_38;
    677  crypto_int64 f7f7_38 = f7   * (crypto_int64) f7_38;
    678  crypto_int64 f7f8_38 = f7_2 * (crypto_int64) f8_19;
    679  crypto_int64 f7f9_76 = f7_2 * (crypto_int64) f9_38;
    680  crypto_int64 f8f8_19 = f8   * (crypto_int64) f8_19;
    681  crypto_int64 f8f9_38 = f8   * (crypto_int64) f9_38;
    682  crypto_int64 f9f9_38 = f9   * (crypto_int64) f9_38;
    683  crypto_int64 h0 = f0f0  +f1f9_76+f2f8_38+f3f7_76+f4f6_38+f5f5_38;
    684  crypto_int64 h1 = f0f1_2+f2f9_38+f3f8_38+f4f7_38+f5f6_38;
    685  crypto_int64 h2 = f0f2_2+f1f1_2 +f3f9_76+f4f8_38+f5f7_76+f6f6_19;
    686  crypto_int64 h3 = f0f3_2+f1f2_2 +f4f9_38+f5f8_38+f6f7_38;
    687  crypto_int64 h4 = f0f4_2+f1f3_4 +f2f2   +f5f9_76+f6f8_38+f7f7_38;
    688  crypto_int64 h5 = f0f5_2+f1f4_2 +f2f3_2 +f6f9_38+f7f8_38;
    689  crypto_int64 h6 = f0f6_2+f1f5_4 +f2f4_2 +f3f3_2 +f7f9_76+f8f8_19;
    690  crypto_int64 h7 = f0f7_2+f1f6_2 +f2f5_2 +f3f4_2 +f8f9_38;
    691  crypto_int64 h8 = f0f8_2+f1f7_4 +f2f6_2 +f3f5_4 +f4f4   +f9f9_38;
    692  crypto_int64 h9 = f0f9_2+f1f8_2 +f2f7_2 +f3f6_2 +f4f5_2;
    693  crypto_int64 carry0;
    694  crypto_int64 carry1;
    695  crypto_int64 carry2;
    696  crypto_int64 carry3;
    697  crypto_int64 carry4;
    698  crypto_int64 carry5;
    699  crypto_int64 carry6;
    700  crypto_int64 carry7;
    701  crypto_int64 carry8;
    702  crypto_int64 carry9;
    703 
    704  carry0 = (h0 + (crypto_int64) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
    705  carry4 = (h4 + (crypto_int64) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
    706 
    707  carry1 = (h1 + (crypto_int64) (1<<24)) >> 25; h2 += carry1; h1 -= carry1 << 25;
    708  carry5 = (h5 + (crypto_int64) (1<<24)) >> 25; h6 += carry5; h5 -= carry5 << 25;
    709 
    710  carry2 = (h2 + (crypto_int64) (1<<25)) >> 26; h3 += carry2; h2 -= carry2 << 26;
    711  carry6 = (h6 + (crypto_int64) (1<<25)) >> 26; h7 += carry6; h6 -= carry6 << 26;
    712 
    713  carry3 = (h3 + (crypto_int64) (1<<24)) >> 25; h4 += carry3; h3 -= carry3 << 25;
    714  carry7 = (h7 + (crypto_int64) (1<<24)) >> 25; h8 += carry7; h7 -= carry7 << 25;
    715 
    716  carry4 = (h4 + (crypto_int64) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
    717  carry8 = (h8 + (crypto_int64) (1<<25)) >> 26; h9 += carry8; h8 -= carry8 << 26;
    718 
    719  carry9 = (h9 + (crypto_int64) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= carry9 << 25;
    720 
    721  carry0 = (h0 + (crypto_int64) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
    722 
    723  h[0] = h0;
    724  h[1] = h1;
    725  h[2] = h2;
    726  h[3] = h3;
    727  h[4] = h4;
    728  h[5] = h5;
    729  h[6] = h6;
    730  h[7] = h7;
    731  h[8] = h8;
    732  h[9] = h9;
    733 }
    734 
    735 /*
    736 h = f - g
    737 Can overlap h with f or g.
    738 
    739 Preconditions:
    740   |f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
    741   |g| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
    742 
    743 Postconditions:
    744   |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
    745 */
    746 
    747 void fe_sub(fe h,fe f,fe g)
    748 {
    749  crypto_int32 f0 = f[0];
    750  crypto_int32 f1 = f[1];
    751  crypto_int32 f2 = f[2];
    752  crypto_int32 f3 = f[3];
    753  crypto_int32 f4 = f[4];
    754  crypto_int32 f5 = f[5];
    755  crypto_int32 f6 = f[6];
    756  crypto_int32 f7 = f[7];
    757  crypto_int32 f8 = f[8];
    758  crypto_int32 f9 = f[9];
    759  crypto_int32 g0 = g[0];
    760  crypto_int32 g1 = g[1];
    761  crypto_int32 g2 = g[2];
    762  crypto_int32 g3 = g[3];
    763  crypto_int32 g4 = g[4];
    764  crypto_int32 g5 = g[5];
    765  crypto_int32 g6 = g[6];
    766  crypto_int32 g7 = g[7];
    767  crypto_int32 g8 = g[8];
    768  crypto_int32 g9 = g[9];
    769  crypto_int32 h0 = f0 - g0;
    770  crypto_int32 h1 = f1 - g1;
    771  crypto_int32 h2 = f2 - g2;
    772  crypto_int32 h3 = f3 - g3;
    773  crypto_int32 h4 = f4 - g4;
    774  crypto_int32 h5 = f5 - g5;
    775  crypto_int32 h6 = f6 - g6;
    776  crypto_int32 h7 = f7 - g7;
    777  crypto_int32 h8 = f8 - g8;
    778  crypto_int32 h9 = f9 - g9;
    779  h[0] = h0;
    780  h[1] = h1;
    781  h[2] = h2;
    782  h[3] = h3;
    783  h[4] = h4;
    784  h[5] = h5;
    785  h[6] = h6;
    786  h[7] = h7;
    787  h[8] = h8;
    788  h[9] = h9;
    789 }
    790 
    791 /*
    792 Preconditions:
    793  |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
    794 
    795 Write p=2^255-19; q=floor(h/p).
    796 Basic claim: q = floor(2^(-255)(h + 19 2^(-25)h9 + 2^(-1))).
    797 
    798 Proof:
    799  Have |h|<=p so |q|<=1 so |19^2 2^(-255) q|<1/4.
    800  Also have |h-2^230 h9|<2^230 so |19 2^(-255)(h-2^230 h9)|<1/4.
    801 
    802  Write y=2^(-1)-19^2 2^(-255)q-19 2^(-255)(h-2^230 h9).
    803  Then 0<y<1.
    804 
    805  Write r=h-pq.
    806  Have 0<=r<=p-1=2^255-20.
    807  Thus 0<=r+19(2^-255)r<r+19(2^-255)2^255<=2^255-1.
    808 
    809  Write x=r+19(2^-255)r+y.
    810  Then 0<x<2^255 so floor(2^(-255)x) = 0 so floor(q+2^(-255)x) = q.
    811 
    812  Have q+2^(-255)x = 2^(-255)(h + 19 2^(-25) h9 + 2^(-1))
    813  so floor(2^(-255)(h + 19 2^(-25) h9 + 2^(-1))) = q.
    814 */
    815 
    816 void fe_tobytes(unsigned char *s,fe h)
    817 {
    818  crypto_int32 h0 = h[0];
    819  crypto_int32 h1 = h[1];
    820  crypto_int32 h2 = h[2];
    821  crypto_int32 h3 = h[3];
    822  crypto_int32 h4 = h[4];
    823  crypto_int32 h5 = h[5];
    824  crypto_int32 h6 = h[6];
    825  crypto_int32 h7 = h[7];
    826  crypto_int32 h8 = h[8];
    827  crypto_int32 h9 = h[9];
    828  crypto_int32 q;
    829  crypto_int32 carry0;
    830  crypto_int32 carry1;
    831  crypto_int32 carry2;
    832  crypto_int32 carry3;
    833  crypto_int32 carry4;
    834  crypto_int32 carry5;
    835  crypto_int32 carry6;
    836  crypto_int32 carry7;
    837  crypto_int32 carry8;
    838  crypto_int32 carry9;
    839 
    840  q = (19 * h9 + (((crypto_int32) 1) << 24)) >> 25;
    841  q = (h0 + q) >> 26;
    842  q = (h1 + q) >> 25;
    843  q = (h2 + q) >> 26;
    844  q = (h3 + q) >> 25;
    845  q = (h4 + q) >> 26;
    846  q = (h5 + q) >> 25;
    847  q = (h6 + q) >> 26;
    848  q = (h7 + q) >> 25;
    849  q = (h8 + q) >> 26;
    850  q = (h9 + q) >> 25;
    851 
    852  /* Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20. */
    853  h0 += 19 * q;
    854  /* Goal: Output h-2^255 q, which is between 0 and 2^255-20. */
    855 
    856  carry0 = h0 >> 26; h1 += carry0; h0 -= carry0 << 26;
    857  carry1 = h1 >> 25; h2 += carry1; h1 -= carry1 << 25;
    858  carry2 = h2 >> 26; h3 += carry2; h2 -= carry2 << 26;
    859  carry3 = h3 >> 25; h4 += carry3; h3 -= carry3 << 25;
    860  carry4 = h4 >> 26; h5 += carry4; h4 -= carry4 << 26;
    861  carry5 = h5 >> 25; h6 += carry5; h5 -= carry5 << 25;
    862  carry6 = h6 >> 26; h7 += carry6; h6 -= carry6 << 26;
    863  carry7 = h7 >> 25; h8 += carry7; h7 -= carry7 << 25;
    864  carry8 = h8 >> 26; h9 += carry8; h8 -= carry8 << 26;
    865  carry9 = h9 >> 25;               h9 -= carry9 << 25;
    866                  /* h10 = carry9 */
    867 
    868  /*
    869  Goal: Output h0+...+2^255 h10-2^255 q, which is between 0 and 2^255-20.
    870  Have h0+...+2^230 h9 between 0 and 2^255-1;
    871  evidently 2^255 h10-2^255 q = 0.
    872  Goal: Output h0+...+2^230 h9.
    873  */
    874 
    875  s[0] = h0 >> 0;
    876  s[1] = h0 >> 8;
    877  s[2] = h0 >> 16;
    878  s[3] = (h0 >> 24) | (h1 << 2);
    879  s[4] = h1 >> 6;
    880  s[5] = h1 >> 14;
    881  s[6] = (h1 >> 22) | (h2 << 3);
    882  s[7] = h2 >> 5;
    883  s[8] = h2 >> 13;
    884  s[9] = (h2 >> 21) | (h3 << 5);
    885  s[10] = h3 >> 3;
    886  s[11] = h3 >> 11;
    887  s[12] = (h3 >> 19) | (h4 << 6);
    888  s[13] = h4 >> 2;
    889  s[14] = h4 >> 10;
    890  s[15] = h4 >> 18;
    891  s[16] = h5 >> 0;
    892  s[17] = h5 >> 8;
    893  s[18] = h5 >> 16;
    894  s[19] = (h5 >> 24) | (h6 << 1);
    895  s[20] = h6 >> 7;
    896  s[21] = h6 >> 15;
    897  s[22] = (h6 >> 23) | (h7 << 3);
    898  s[23] = h7 >> 5;
    899  s[24] = h7 >> 13;
    900  s[25] = (h7 >> 21) | (h8 << 4);
    901  s[26] = h8 >> 4;
    902  s[27] = h8 >> 12;
    903  s[28] = (h8 >> 20) | (h9 << 6);
    904  s[29] = h9 >> 2;
    905  s[30] = h9 >> 10;
    906  s[31] = h9 >> 18;
    907 }
    908 
    909 void fe_invert(fe out,fe z)
    910 {
    911  fe t0;
    912  fe t1;
    913  fe t2;
    914  fe t3;
    915  int i;
    916 
    917 
    918 /* qhasm: fe z1 */
    919 
    920 /* qhasm: fe z2 */
    921 
    922 /* qhasm: fe z8 */
    923 
    924 /* qhasm: fe z9 */
    925 
    926 /* qhasm: fe z11 */
    927 
    928 /* qhasm: fe z22 */
    929 
    930 /* qhasm: fe z_5_0 */
    931 
    932 /* qhasm: fe z_10_5 */
    933 
    934 /* qhasm: fe z_10_0 */
    935 
    936 /* qhasm: fe z_20_10 */
    937 
    938 /* qhasm: fe z_20_0 */
    939 
    940 /* qhasm: fe z_40_20 */
    941 
    942 /* qhasm: fe z_40_0 */
    943 
    944 /* qhasm: fe z_50_10 */
    945 
    946 /* qhasm: fe z_50_0 */
    947 
    948 /* qhasm: fe z_100_50 */
    949 
    950 /* qhasm: fe z_100_0 */
    951 
    952 /* qhasm: fe z_200_100 */
    953 
    954 /* qhasm: fe z_200_0 */
    955 
    956 /* qhasm: fe z_250_50 */
    957 
    958 /* qhasm: fe z_250_0 */
    959 
    960 /* qhasm: fe z_255_5 */
    961 
    962 /* qhasm: fe z_255_21 */
    963 
    964 /* qhasm: enter pow225521 */
    965 
    966 /* qhasm: z2 = z1^2^1 */
    967 /* asm 1: fe_sq(>z2=fe#1,<z1=fe#11); for (i = 1;i < 1;++i) fe_sq(>z2=fe#1,>z2=fe#1); */
    968 /* asm 2: fe_sq(>z2=t0,<z1=z); for (i = 1;i < 1;++i) fe_sq(>z2=t0,>z2=t0); */
    969 fe_sq(t0,z); for (i = 1;i < 1;++i) fe_sq(t0,t0);
    970 
    971 /* qhasm: z8 = z2^2^2 */
    972 /* asm 1: fe_sq(>z8=fe#2,<z2=fe#1); for (i = 1;i < 2;++i) fe_sq(>z8=fe#2,>z8=fe#2); */
    973 /* asm 2: fe_sq(>z8=t1,<z2=t0); for (i = 1;i < 2;++i) fe_sq(>z8=t1,>z8=t1); */
    974 fe_sq(t1,t0); for (i = 1;i < 2;++i) fe_sq(t1,t1);
    975 
    976 /* qhasm: z9 = z1*z8 */
    977 /* asm 1: fe_mul(>z9=fe#2,<z1=fe#11,<z8=fe#2); */
    978 /* asm 2: fe_mul(>z9=t1,<z1=z,<z8=t1); */
    979 fe_mul(t1,z,t1);
    980 
    981 /* qhasm: z11 = z2*z9 */
    982 /* asm 1: fe_mul(>z11=fe#1,<z2=fe#1,<z9=fe#2); */
    983 /* asm 2: fe_mul(>z11=t0,<z2=t0,<z9=t1); */
    984 fe_mul(t0,t0,t1);
    985 
    986 /* qhasm: z22 = z11^2^1 */
    987 /* asm 1: fe_sq(>z22=fe#3,<z11=fe#1); for (i = 1;i < 1;++i) fe_sq(>z22=fe#3,>z22=fe#3); */
    988 /* asm 2: fe_sq(>z22=t2,<z11=t0); for (i = 1;i < 1;++i) fe_sq(>z22=t2,>z22=t2); */
    989 fe_sq(t2,t0); for (i = 1;i < 1;++i) fe_sq(t2,t2);
    990 
    991 /* qhasm: z_5_0 = z9*z22 */
    992 /* asm 1: fe_mul(>z_5_0=fe#2,<z9=fe#2,<z22=fe#3); */
    993 /* asm 2: fe_mul(>z_5_0=t1,<z9=t1,<z22=t2); */
    994 fe_mul(t1,t1,t2);
    995 
    996 /* qhasm: z_10_5 = z_5_0^2^5 */
    997 /* asm 1: fe_sq(>z_10_5=fe#3,<z_5_0=fe#2); for (i = 1;i < 5;++i) fe_sq(>z_10_5=fe#3,>z_10_5=fe#3); */
    998 /* asm 2: fe_sq(>z_10_5=t2,<z_5_0=t1); for (i = 1;i < 5;++i) fe_sq(>z_10_5=t2,>z_10_5=t2); */
    999 fe_sq(t2,t1); for (i = 1;i < 5;++i) fe_sq(t2,t2);
   1000 
   1001 /* qhasm: z_10_0 = z_10_5*z_5_0 */
   1002 /* asm 1: fe_mul(>z_10_0=fe#2,<z_10_5=fe#3,<z_5_0=fe#2); */
   1003 /* asm 2: fe_mul(>z_10_0=t1,<z_10_5=t2,<z_5_0=t1); */
   1004 fe_mul(t1,t2,t1);
   1005 
   1006 /* qhasm: z_20_10 = z_10_0^2^10 */
   1007 /* asm 1: fe_sq(>z_20_10=fe#3,<z_10_0=fe#2); for (i = 1;i < 10;++i) fe_sq(>z_20_10=fe#3,>z_20_10=fe#3); */
   1008 /* asm 2: fe_sq(>z_20_10=t2,<z_10_0=t1); for (i = 1;i < 10;++i) fe_sq(>z_20_10=t2,>z_20_10=t2); */
   1009 fe_sq(t2,t1); for (i = 1;i < 10;++i) fe_sq(t2,t2);
   1010 
   1011 /* qhasm: z_20_0 = z_20_10*z_10_0 */
   1012 /* asm 1: fe_mul(>z_20_0=fe#3,<z_20_10=fe#3,<z_10_0=fe#2); */
   1013 /* asm 2: fe_mul(>z_20_0=t2,<z_20_10=t2,<z_10_0=t1); */
   1014 fe_mul(t2,t2,t1);
   1015 
   1016 /* qhasm: z_40_20 = z_20_0^2^20 */
   1017 /* asm 1: fe_sq(>z_40_20=fe#4,<z_20_0=fe#3); for (i = 1;i < 20;++i) fe_sq(>z_40_20=fe#4,>z_40_20=fe#4); */
   1018 /* asm 2: fe_sq(>z_40_20=t3,<z_20_0=t2); for (i = 1;i < 20;++i) fe_sq(>z_40_20=t3,>z_40_20=t3); */
   1019 fe_sq(t3,t2); for (i = 1;i < 20;++i) fe_sq(t3,t3);
   1020 
   1021 /* qhasm: z_40_0 = z_40_20*z_20_0 */
   1022 /* asm 1: fe_mul(>z_40_0=fe#3,<z_40_20=fe#4,<z_20_0=fe#3); */
   1023 /* asm 2: fe_mul(>z_40_0=t2,<z_40_20=t3,<z_20_0=t2); */
   1024 fe_mul(t2,t3,t2);
   1025 
   1026 /* qhasm: z_50_10 = z_40_0^2^10 */
   1027 /* asm 1: fe_sq(>z_50_10=fe#3,<z_40_0=fe#3); for (i = 1;i < 10;++i) fe_sq(>z_50_10=fe#3,>z_50_10=fe#3); */
   1028 /* asm 2: fe_sq(>z_50_10=t2,<z_40_0=t2); for (i = 1;i < 10;++i) fe_sq(>z_50_10=t2,>z_50_10=t2); */
   1029 fe_sq(t2,t2); for (i = 1;i < 10;++i) fe_sq(t2,t2);
   1030 
   1031 /* qhasm: z_50_0 = z_50_10*z_10_0 */
   1032 /* asm 1: fe_mul(>z_50_0=fe#2,<z_50_10=fe#3,<z_10_0=fe#2); */
   1033 /* asm 2: fe_mul(>z_50_0=t1,<z_50_10=t2,<z_10_0=t1); */
   1034 fe_mul(t1,t2,t1);
   1035 
   1036 /* qhasm: z_100_50 = z_50_0^2^50 */
   1037 /* asm 1: fe_sq(>z_100_50=fe#3,<z_50_0=fe#2); for (i = 1;i < 50;++i) fe_sq(>z_100_50=fe#3,>z_100_50=fe#3); */
   1038 /* asm 2: fe_sq(>z_100_50=t2,<z_50_0=t1); for (i = 1;i < 50;++i) fe_sq(>z_100_50=t2,>z_100_50=t2); */
   1039 fe_sq(t2,t1); for (i = 1;i < 50;++i) fe_sq(t2,t2);
   1040 
   1041 /* qhasm: z_100_0 = z_100_50*z_50_0 */
   1042 /* asm 1: fe_mul(>z_100_0=fe#3,<z_100_50=fe#3,<z_50_0=fe#2); */
   1043 /* asm 2: fe_mul(>z_100_0=t2,<z_100_50=t2,<z_50_0=t1); */
   1044 fe_mul(t2,t2,t1);
   1045 
   1046 /* qhasm: z_200_100 = z_100_0^2^100 */
   1047 /* asm 1: fe_sq(>z_200_100=fe#4,<z_100_0=fe#3); for (i = 1;i < 100;++i) fe_sq(>z_200_100=fe#4,>z_200_100=fe#4); */
   1048 /* asm 2: fe_sq(>z_200_100=t3,<z_100_0=t2); for (i = 1;i < 100;++i) fe_sq(>z_200_100=t3,>z_200_100=t3); */
   1049 fe_sq(t3,t2); for (i = 1;i < 100;++i) fe_sq(t3,t3);
   1050 
   1051 /* qhasm: z_200_0 = z_200_100*z_100_0 */
   1052 /* asm 1: fe_mul(>z_200_0=fe#3,<z_200_100=fe#4,<z_100_0=fe#3); */
   1053 /* asm 2: fe_mul(>z_200_0=t2,<z_200_100=t3,<z_100_0=t2); */
   1054 fe_mul(t2,t3,t2);
   1055 
   1056 /* qhasm: z_250_50 = z_200_0^2^50 */
   1057 /* asm 1: fe_sq(>z_250_50=fe#3,<z_200_0=fe#3); for (i = 1;i < 50;++i) fe_sq(>z_250_50=fe#3,>z_250_50=fe#3); */
   1058 /* asm 2: fe_sq(>z_250_50=t2,<z_200_0=t2); for (i = 1;i < 50;++i) fe_sq(>z_250_50=t2,>z_250_50=t2); */
   1059 fe_sq(t2,t2); for (i = 1;i < 50;++i) fe_sq(t2,t2);
   1060 
   1061 /* qhasm: z_250_0 = z_250_50*z_50_0 */
   1062 /* asm 1: fe_mul(>z_250_0=fe#2,<z_250_50=fe#3,<z_50_0=fe#2); */
   1063 /* asm 2: fe_mul(>z_250_0=t1,<z_250_50=t2,<z_50_0=t1); */
   1064 fe_mul(t1,t2,t1);
   1065 
   1066 /* qhasm: z_255_5 = z_250_0^2^5 */
   1067 /* asm 1: fe_sq(>z_255_5=fe#2,<z_250_0=fe#2); for (i = 1;i < 5;++i) fe_sq(>z_255_5=fe#2,>z_255_5=fe#2); */
   1068 /* asm 2: fe_sq(>z_255_5=t1,<z_250_0=t1); for (i = 1;i < 5;++i) fe_sq(>z_255_5=t1,>z_255_5=t1); */
   1069 fe_sq(t1,t1); for (i = 1;i < 5;++i) fe_sq(t1,t1);
   1070 
   1071 /* qhasm: z_255_21 = z_255_5*z11 */
   1072 /* asm 1: fe_mul(>z_255_21=fe#12,<z_255_5=fe#2,<z11=fe#1); */
   1073 /* asm 2: fe_mul(>z_255_21=out,<z_255_5=t1,<z11=t0); */
   1074 fe_mul(out,t1,t0);
   1075 
   1076 /* qhasm: return */
   1077 
   1078  return;
   1079 }
   1080 
   1081 
   1082 int crypto_scalarmult_ref10(unsigned char *q,
   1083  const unsigned char *n,
   1084  const unsigned char *p)
   1085 {
   1086  unsigned char e[32];
   1087  unsigned int i;
   1088  fe x1;
   1089  fe x2;
   1090  fe z2;
   1091  fe x3;
   1092  fe z3;
   1093  fe tmp0;
   1094  fe tmp1;
   1095  int pos;
   1096  unsigned int swap;
   1097  unsigned int b;
   1098 
   1099  for (i = 0;i < 32;++i) e[i] = n[i];
   1100  e[0] &= 248;
   1101  e[31] &= 127;
   1102  e[31] |= 64;
   1103  fe_frombytes(x1,p);
   1104  fe_1(x2);
   1105  fe_0(z2);
   1106  fe_copy(x3,x1);
   1107  fe_1(z3);
   1108 
   1109  swap = 0;
   1110  for (pos = 254;pos >= 0;--pos) {
   1111    b = e[pos / 8] >> (pos & 7);
   1112    b &= 1;
   1113    swap ^= b;
   1114    fe_cswap(x2,x3,swap);
   1115    fe_cswap(z2,z3,swap);
   1116    swap = b;
   1117 /* qhasm: fe X2 */
   1118 
   1119 /* qhasm: fe Z2 */
   1120 
   1121 /* qhasm: fe X3 */
   1122 
   1123 /* qhasm: fe Z3 */
   1124 
   1125 /* qhasm: fe X4 */
   1126 
   1127 /* qhasm: fe Z4 */
   1128 
   1129 /* qhasm: fe X5 */
   1130 
   1131 /* qhasm: fe Z5 */
   1132 
   1133 /* qhasm: fe A */
   1134 
   1135 /* qhasm: fe B */
   1136 
   1137 /* qhasm: fe C */
   1138 
   1139 /* qhasm: fe D */
   1140 
   1141 /* qhasm: fe E */
   1142 
   1143 /* qhasm: fe AA */
   1144 
   1145 /* qhasm: fe BB */
   1146 
   1147 /* qhasm: fe DA */
   1148 
   1149 /* qhasm: fe CB */
   1150 
   1151 /* qhasm: fe t0 */
   1152 
   1153 /* qhasm: fe t1 */
   1154 
   1155 /* qhasm: fe t2 */
   1156 
   1157 /* qhasm: fe t3 */
   1158 
   1159 /* qhasm: fe t4 */
   1160 
   1161 /* qhasm: enter ladder */
   1162 
   1163 /* qhasm: D = X3-Z3 */
   1164 /* asm 1: fe_sub(>D=fe#5,<X3=fe#3,<Z3=fe#4); */
   1165 /* asm 2: fe_sub(>D=tmp0,<X3=x3,<Z3=z3); */
   1166 fe_sub(tmp0,x3,z3);
   1167 
   1168 /* qhasm: B = X2-Z2 */
   1169 /* asm 1: fe_sub(>B=fe#6,<X2=fe#1,<Z2=fe#2); */
   1170 /* asm 2: fe_sub(>B=tmp1,<X2=x2,<Z2=z2); */
   1171 fe_sub(tmp1,x2,z2);
   1172 
   1173 /* qhasm: A = X2+Z2 */
   1174 /* asm 1: fe_add(>A=fe#1,<X2=fe#1,<Z2=fe#2); */
   1175 /* asm 2: fe_add(>A=x2,<X2=x2,<Z2=z2); */
   1176 fe_add(x2,x2,z2);
   1177 
   1178 /* qhasm: C = X3+Z3 */
   1179 /* asm 1: fe_add(>C=fe#2,<X3=fe#3,<Z3=fe#4); */
   1180 /* asm 2: fe_add(>C=z2,<X3=x3,<Z3=z3); */
   1181 fe_add(z2,x3,z3);
   1182 
   1183 /* qhasm: DA = D*A */
   1184 /* asm 1: fe_mul(>DA=fe#4,<D=fe#5,<A=fe#1); */
   1185 /* asm 2: fe_mul(>DA=z3,<D=tmp0,<A=x2); */
   1186 fe_mul(z3,tmp0,x2);
   1187 
   1188 /* qhasm: CB = C*B */
   1189 /* asm 1: fe_mul(>CB=fe#2,<C=fe#2,<B=fe#6); */
   1190 /* asm 2: fe_mul(>CB=z2,<C=z2,<B=tmp1); */
   1191 fe_mul(z2,z2,tmp1);
   1192 
   1193 /* qhasm: BB = B^2 */
   1194 /* asm 1: fe_sq(>BB=fe#5,<B=fe#6); */
   1195 /* asm 2: fe_sq(>BB=tmp0,<B=tmp1); */
   1196 fe_sq(tmp0,tmp1);
   1197 
   1198 /* qhasm: AA = A^2 */
   1199 /* asm 1: fe_sq(>AA=fe#6,<A=fe#1); */
   1200 /* asm 2: fe_sq(>AA=tmp1,<A=x2); */
   1201 fe_sq(tmp1,x2);
   1202 
   1203 /* qhasm: t0 = DA+CB */
   1204 /* asm 1: fe_add(>t0=fe#3,<DA=fe#4,<CB=fe#2); */
   1205 /* asm 2: fe_add(>t0=x3,<DA=z3,<CB=z2); */
   1206 fe_add(x3,z3,z2);
   1207 
   1208 /* qhasm: assign x3 to t0 */
   1209 
   1210 /* qhasm: t1 = DA-CB */
   1211 /* asm 1: fe_sub(>t1=fe#2,<DA=fe#4,<CB=fe#2); */
   1212 /* asm 2: fe_sub(>t1=z2,<DA=z3,<CB=z2); */
   1213 fe_sub(z2,z3,z2);
   1214 
   1215 /* qhasm: X4 = AA*BB */
   1216 /* asm 1: fe_mul(>X4=fe#1,<AA=fe#6,<BB=fe#5); */
   1217 /* asm 2: fe_mul(>X4=x2,<AA=tmp1,<BB=tmp0); */
   1218 fe_mul(x2,tmp1,tmp0);
   1219 
   1220 /* qhasm: E = AA-BB */
   1221 /* asm 1: fe_sub(>E=fe#6,<AA=fe#6,<BB=fe#5); */
   1222 /* asm 2: fe_sub(>E=tmp1,<AA=tmp1,<BB=tmp0); */
   1223 fe_sub(tmp1,tmp1,tmp0);
   1224 
   1225 /* qhasm: t2 = t1^2 */
   1226 /* asm 1: fe_sq(>t2=fe#2,<t1=fe#2); */
   1227 /* asm 2: fe_sq(>t2=z2,<t1=z2); */
   1228 fe_sq(z2,z2);
   1229 
   1230 /* qhasm: t3 = a24*E */
   1231 /* asm 1: fe_mul121666(>t3=fe#4,<E=fe#6); */
   1232 /* asm 2: fe_mul121666(>t3=z3,<E=tmp1); */
   1233 fe_mul121666(z3,tmp1);
   1234 
   1235 /* qhasm: X5 = t0^2 */
   1236 /* asm 1: fe_sq(>X5=fe#3,<t0=fe#3); */
   1237 /* asm 2: fe_sq(>X5=x3,<t0=x3); */
   1238 fe_sq(x3,x3);
   1239 
   1240 /* qhasm: t4 = BB+t3 */
   1241 /* asm 1: fe_add(>t4=fe#5,<BB=fe#5,<t3=fe#4); */
   1242 /* asm 2: fe_add(>t4=tmp0,<BB=tmp0,<t3=z3); */
   1243 fe_add(tmp0,tmp0,z3);
   1244 
   1245 /* qhasm: Z5 = X1*t2 */
   1246 /* asm 1: fe_mul(>Z5=fe#4,x1,<t2=fe#2); */
   1247 /* asm 2: fe_mul(>Z5=z3,x1,<t2=z2); */
   1248 fe_mul(z3,x1,z2);
   1249 
   1250 /* qhasm: Z4 = E*t4 */
   1251 /* asm 1: fe_mul(>Z4=fe#2,<E=fe#6,<t4=fe#5); */
   1252 /* asm 2: fe_mul(>Z4=z2,<E=tmp1,<t4=tmp0); */
   1253 fe_mul(z2,tmp1,tmp0);
   1254 
   1255 /* qhasm: return */
   1256  }
   1257  fe_cswap(x2,x3,swap);
   1258  fe_cswap(z2,z3,swap);
   1259 
   1260  fe_invert(z2,z2);
   1261  fe_mul(x2,x2,z2);
   1262  fe_tobytes(q,x2);
   1263  return 0;
   1264 }
   1265 
   1266 static const unsigned char basepoint[32] = {9};
   1267 
   1268 int crypto_scalarmult_base_ref10(unsigned char *q,const unsigned char *n)
   1269 {
   1270  return crypto_scalarmult_ref10(q,n,basepoint);
   1271 }