tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

curve25519-inline.h (29414B)


      1 #ifdef __GNUC__
      2 #if defined(__x86_64__) || defined(_M_X64)
      3 #pragma once
      4 #include <inttypes.h>
      5 
      6 // Computes out <- f1 + f2, where f1 is a four-limb (4x64-bit, little-endian) value
      7 // and f2 is a 64-bit scalar added into the low limb.  The carry out of the top limb
        // is accumulated into %rax but discarded — the function returns void.  NOTE(review):
        // the original comment claimed the carry is "returned"; confirm no caller relies on it.
        // Requires the ADX instruction-set extension (adcxq).
      8 static inline void
      9 add_scalar(uint64_t *out, uint64_t *f1, uint64_t f2)
     10 {
     11    __asm__ volatile(
     12        // Clear r8-r11/rax (the 32-bit xors also clear CF, so the adcx chain starts clean)
     13        "  xor %%r8d, %%r8d;"
     14        "  xor %%r9d, %%r9d;"
     15        "  xor %%r10d, %%r10d;"
     16        "  xor %%r11d, %%r11d;"
     17        "  xor %%eax, %%eax;"
     18 
     19        // Add f2 into f1[0], then ripple the carry through f1[1..3]
     20        "  addq 0(%2), %0;"
     21        "  movq %0, 0(%1);"
     22        "  adcxq 8(%2), %%r8;"
     23        "  movq %%r8, 8(%1);"
     24        "  adcxq 16(%2), %%r9;"
     25        "  movq %%r9, 16(%1);"
     26        "  adcxq 24(%2), %%r10;"
     27        "  movq %%r10, 24(%1);"
     28 
     29        // Fold the final carry into %rax (rax <- 0 + 0 + CF); the value is unused
     30        "  adcx %%r11, %%rax;"
     31        : "+&r"(f2)
     32        : "r"(out), "r"(f1)
     33        : "%rax", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
     34 }
     35 
     36 // Computes the field addition of two field elements: out <- (f1 + f2) mod 2^255-19
     37 static inline void
     38 fadd(uint64_t *out, uint64_t *f1, uint64_t *f2)
     39 {
     40    __asm__ volatile(
     41        // Compute the raw addition of f1 + f2
     42        "  movq 0(%0), %%r8;"
     43        "  addq 0(%2), %%r8;"
     44        "  movq 8(%0), %%r9;"
     45        "  adcxq 8(%2), %%r9;"
     46        "  movq 16(%0), %%r10;"
     47        "  adcxq 16(%2), %%r10;"
     48        "  movq 24(%0), %%r11;"
     49        "  adcxq 24(%2), %%r11;"
     50 
     51        /////// Wrap the result back into the field //////
     52 
     53        // Step 1: Compute carry*38 (2^256 == 38 mod 2^255-19, since 38 = 2*19)
     54        "  mov $0, %%rax;"
     55        "  mov $38, %0;"
     56        "  cmovc %0, %%rax;"
     57 
     58        // Step 2: Add carry*38 to the original sum
     59        "  xor %%ecx, %%ecx;"
     60        "  add %%rax, %%r8;"
     61        "  adcx %%rcx, %%r9;"
     62        "  movq %%r9, 8(%1);"
     63        "  adcx %%rcx, %%r10;"
     64        "  movq %%r10, 16(%1);"
     65        "  adcx %%rcx, %%r11;"
     66        "  movq %%r11, 24(%1);"
     67 
     68        // Step 3: Fold the carry bit back in; guaranteed not to carry at this point
     69        "  mov $0, %%rax;"
     70        "  cmovc %0, %%rax;"
     71        "  add %%rax, %%r8;"
     72        "  movq %%r8, 0(%1);"
     73        : "+&r"(f2)
     74        : "r"(out), "r"(f1)
     75        : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
     76 }
     77 
     78 // Computes the field subtraction of two field elements: out <- (f1 - f2) mod 2^255-19
     79 static inline void
     80 fsub(uint64_t *out, uint64_t *f1, uint64_t *f2)
     81 {
     82    __asm__ volatile(
     83        // Compute the raw subtraction of f1-f2
     84        "  movq 0(%1), %%r8;"
     85        "  subq 0(%2), %%r8;"
     86        "  movq 8(%1), %%r9;"
     87        "  sbbq 8(%2), %%r9;"
     88        "  movq 16(%1), %%r10;"
     89        "  sbbq 16(%2), %%r10;"
     90        "  movq 24(%1), %%r11;"
     91        "  sbbq 24(%2), %%r11;"
     92 
     93        /////// Wrap the result back into the field //////
     94 
     95        // Step 1: Compute borrow*38 (2^256 == 38 mod 2^255-19)
     96        "  mov $0, %%rax;"
     97        "  mov $38, %%rcx;"
     98        "  cmovc %%rcx, %%rax;"
     99 
    100        // Step 2: Subtract borrow*38 from the original difference
    101        "  sub %%rax, %%r8;"
    102        "  sbb $0, %%r9;"
    103        "  sbb $0, %%r10;"
    104        "  sbb $0, %%r11;"
    105 
    106        // Step 3: Fold the borrow bit back in; guaranteed not to borrow at this point
    107        "  mov $0, %%rax;"
    108        "  cmovc %%rcx, %%rax;"
    109        "  sub %%rax, %%r8;"
    110 
    111        // Store the result
    112        "  movq %%r8, 0(%0);"
    113        "  movq %%r9, 8(%0);"
    114        "  movq %%r10, 16(%0);"
    115        "  movq %%r11, 24(%0);"
    116        :
    117        : "r"(out), "r"(f1), "r"(f2)
    118        : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
    119 }
    120 
    121 // Computes a field multiplication: out <- (f1 * f2) mod 2^255-19
    122 // Uses the 8-element buffer tmp for the raw 512-bit product (needs BMI2 mulx and ADX adcx/adox)
    123 static inline void
    124 fmul(uint64_t *out, uint64_t *f1, uint64_t *f2, uint64_t *tmp)
    125 {
    126    __asm__ volatile(
    127 
    128        /////// Compute the raw multiplication: tmp <- src1 * src2 //////
    129 
    130        // Compute src1[0] * src2
    131        "  movq 0(%0), %%rdx;"
    132        "  mulxq 0(%1), %%r8, %%r9;"
    133        "  xor %%r10d, %%r10d;"
    134        "  movq %%r8, 0(%2);"
    135        "  mulxq 8(%1), %%r10, %%r11;"
    136        "  adox %%r9, %%r10;"
    137        "  movq %%r10, 8(%2);"
    138        "  mulxq 16(%1), %%rbx, %%r13;"
    139        "  adox %%r11, %%rbx;"
    140        "  mulxq 24(%1), %%r14, %%rdx;"
    141        "  adox %%r13, %%r14;"
    142        "  mov $0, %%rax;"
    143        "  adox %%rdx, %%rax;"
    144 
    145        // Compute src1[1] * src2
    146        "  movq 8(%0), %%rdx;"
    147        "  mulxq 0(%1), %%r8, %%r9;"
    148        "  xor %%r10d, %%r10d;"
    149        "  adcxq 8(%2), %%r8;"
    150        "  movq %%r8, 8(%2);"
    151        "  mulxq 8(%1), %%r10, %%r11;"
    152        "  adox %%r9, %%r10;"
    153        "  adcx %%rbx, %%r10;"
    154        "  movq %%r10, 16(%2);"
    155        "  mulxq 16(%1), %%rbx, %%r13;"
    156        "  adox %%r11, %%rbx;"
    157        "  adcx %%r14, %%rbx;"
    158        "  mov $0, %%r8;"
    159        "  mulxq 24(%1), %%r14, %%rdx;"
    160        "  adox %%r13, %%r14;"
    161        "  adcx %%rax, %%r14;"
    162        "  mov $0, %%rax;"
    163        "  adox %%rdx, %%rax;"
    164        "  adcx %%r8, %%rax;"
    165 
    166        // Compute src1[2] * src2
    167        "  movq 16(%0), %%rdx;"
    168        "  mulxq 0(%1), %%r8, %%r9;"
    169        "  xor %%r10d, %%r10d;"
    170        "  adcxq 16(%2), %%r8;"
    171        "  movq %%r8, 16(%2);"
    172        "  mulxq 8(%1), %%r10, %%r11;"
    173        "  adox %%r9, %%r10;"
    174        "  adcx %%rbx, %%r10;"
    175        "  movq %%r10, 24(%2);"
    176        "  mulxq 16(%1), %%rbx, %%r13;"
    177        "  adox %%r11, %%rbx;"
    178        "  adcx %%r14, %%rbx;"
    179        "  mov $0, %%r8;"
    180        "  mulxq 24(%1), %%r14, %%rdx;"
    181        "  adox %%r13, %%r14;"
    182        "  adcx %%rax, %%r14;"
    183        "  mov $0, %%rax;"
    184        "  adox %%rdx, %%rax;"
    185        "  adcx %%r8, %%rax;"
    186 
    187        // Compute src1[3] * src2
    188        "  movq 24(%0), %%rdx;"
    189        "  mulxq 0(%1), %%r8, %%r9;"
    190        "  xor %%r10d, %%r10d;"
    191        "  adcxq 24(%2), %%r8;"
    192        "  movq %%r8, 24(%2);"
    193        "  mulxq 8(%1), %%r10, %%r11;"
    194        "  adox %%r9, %%r10;"
    195        "  adcx %%rbx, %%r10;"
    196        "  movq %%r10, 32(%2);"
    197        "  mulxq 16(%1), %%rbx, %%r13;"
    198        "  adox %%r11, %%rbx;"
    199        "  adcx %%r14, %%rbx;"
    200        "  movq %%rbx, 40(%2);"
    201        "  mov $0, %%r8;"
    202        "  mulxq 24(%1), %%r14, %%rdx;"
    203        "  adox %%r13, %%r14;"
    204        "  adcx %%rax, %%r14;"
    205        "  movq %%r14, 48(%2);"
    206        "  mov $0, %%rax;"
    207        "  adox %%rdx, %%rax;"
    208        "  adcx %%r8, %%rax;"
    209        "  movq %%rax, 56(%2);"
    210 
    211        // Line up pointers: %0 <- tmp (the 512-bit product), %2 <- out
    212        "  mov %2, %0;"
    213        "  mov %3, %2;"
    214 
    215        /////// Wrap the result back into the field //////
    216 
    217        // Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo (2^256 == 38 mod 2^255-19)
    218        "  mov $38, %%rdx;"
    219        "  mulxq 32(%0), %%r8, %%r13;"
    220        "  xor %k1, %k1;"
    221        "  adoxq 0(%0), %%r8;"
    222        "  mulxq 40(%0), %%r9, %%rbx;"
    223        "  adcx %%r13, %%r9;"
    224        "  adoxq 8(%0), %%r9;"
    225        "  mulxq 48(%0), %%r10, %%r13;"
    226        "  adcx %%rbx, %%r10;"
    227        "  adoxq 16(%0), %%r10;"
    228        "  mulxq 56(%0), %%r11, %%rax;"
    229        "  adcx %%r13, %%r11;"
    230        "  adoxq 24(%0), %%r11;"
    231        "  adcx %1, %%rax;"
    232        "  adox %1, %%rax;"
    233        "  imul %%rdx, %%rax;"
    234 
    235        // Step 2: Fold the carry back into dst
    236        "  add %%rax, %%r8;"
    237        "  adcx %1, %%r9;"
    238        "  movq %%r9, 8(%2);"
    239        "  adcx %1, %%r10;"
    240        "  movq %%r10, 16(%2);"
    241        "  adcx %1, %%r11;"
    242        "  movq %%r11, 24(%2);"
    243 
    244        // Step 3: Fold the carry bit back in; guaranteed not to carry at this point
    245        "  mov $0, %%rax;"
    246        "  cmovc %%rdx, %%rax;"
    247        "  add %%rax, %%r8;"
    248        "  movq %%r8, 0(%2);"
    249        : "+&r"(f1), "+&r"(f2), "+&r"(tmp)
    250        : "r"(out)
    251        : "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13", "%r14", "memory", "cc");
    252 }
    253 
    254 // Computes two field multiplications:
    255 //   out[0] <- f1[0] * f2[0]
    256 //   out[1] <- f1[1] * f2[1]
    257 // Uses the 16-element buffer tmp for intermediate results (needs BMI2 mulx and ADX adcx/adox):
    258 static inline void
    259 fmul2(uint64_t *out, uint64_t *f1, uint64_t *f2, uint64_t *tmp)
    260 {
    261    __asm__ volatile(
    262 
    263        /////// Compute the raw multiplication tmp[0] <- f1[0] * f2[0] //////
    264 
    265        // Compute src1[0] * src2
    266        "  movq 0(%0), %%rdx;"
    267        "  mulxq 0(%1), %%r8, %%r9;"
    268        "  xor %%r10d, %%r10d;"
    269        "  movq %%r8, 0(%2);"
    270        "  mulxq 8(%1), %%r10, %%r11;"
    271        "  adox %%r9, %%r10;"
    272        "  movq %%r10, 8(%2);"
    273        "  mulxq 16(%1), %%rbx, %%r13;"
    274        "  adox %%r11, %%rbx;"
    275        "  mulxq 24(%1), %%r14, %%rdx;"
    276        "  adox %%r13, %%r14;"
    277        "  mov $0, %%rax;"
    278        "  adox %%rdx, %%rax;"
    279 
    280        // Compute src1[1] * src2
    281        "  movq 8(%0), %%rdx;"
    282        "  mulxq 0(%1), %%r8, %%r9;"
    283        "  xor %%r10d, %%r10d;"
    284        "  adcxq 8(%2), %%r8;"
    285        "  movq %%r8, 8(%2);"
    286        "  mulxq 8(%1), %%r10, %%r11;"
    287        "  adox %%r9, %%r10;"
    288        "  adcx %%rbx, %%r10;"
    289        "  movq %%r10, 16(%2);"
    290        "  mulxq 16(%1), %%rbx, %%r13;"
    291        "  adox %%r11, %%rbx;"
    292        "  adcx %%r14, %%rbx;"
    293        "  mov $0, %%r8;"
    294        "  mulxq 24(%1), %%r14, %%rdx;"
    295        "  adox %%r13, %%r14;"
    296        "  adcx %%rax, %%r14;"
    297        "  mov $0, %%rax;"
    298        "  adox %%rdx, %%rax;"
    299        "  adcx %%r8, %%rax;"
    300 
    301        // Compute src1[2] * src2
    302        "  movq 16(%0), %%rdx;"
    303        "  mulxq 0(%1), %%r8, %%r9;"
    304        "  xor %%r10d, %%r10d;"
    305        "  adcxq 16(%2), %%r8;"
    306        "  movq %%r8, 16(%2);"
    307        "  mulxq 8(%1), %%r10, %%r11;"
    308        "  adox %%r9, %%r10;"
    309        "  adcx %%rbx, %%r10;"
    310        "  movq %%r10, 24(%2);"
    311        "  mulxq 16(%1), %%rbx, %%r13;"
    312        "  adox %%r11, %%rbx;"
    313        "  adcx %%r14, %%rbx;"
    314        "  mov $0, %%r8;"
    315        "  mulxq 24(%1), %%r14, %%rdx;"
    316        "  adox %%r13, %%r14;"
    317        "  adcx %%rax, %%r14;"
    318        "  mov $0, %%rax;"
    319        "  adox %%rdx, %%rax;"
    320        "  adcx %%r8, %%rax;"
    321 
    322        // Compute src1[3] * src2
    323        "  movq 24(%0), %%rdx;"
    324        "  mulxq 0(%1), %%r8, %%r9;"
    325        "  xor %%r10d, %%r10d;"
    326        "  adcxq 24(%2), %%r8;"
    327        "  movq %%r8, 24(%2);"
    328        "  mulxq 8(%1), %%r10, %%r11;"
    329        "  adox %%r9, %%r10;"
    330        "  adcx %%rbx, %%r10;"
    331        "  movq %%r10, 32(%2);"
    332        "  mulxq 16(%1), %%rbx, %%r13;"
    333        "  adox %%r11, %%rbx;"
    334        "  adcx %%r14, %%rbx;"
    335        "  movq %%rbx, 40(%2);"
    336        "  mov $0, %%r8;"
    337        "  mulxq 24(%1), %%r14, %%rdx;"
    338        "  adox %%r13, %%r14;"
    339        "  adcx %%rax, %%r14;"
    340        "  movq %%r14, 48(%2);"
    341        "  mov $0, %%rax;"
    342        "  adox %%rdx, %%rax;"
    343        "  adcx %%r8, %%rax;"
    344        "  movq %%rax, 56(%2);"
    345 
    346        /////// Compute the raw multiplication tmp[1] <- f1[1] * f2[1] //////
    347 
    348        // Compute src1[4] * src2[4..7]
    349        "  movq 32(%0), %%rdx;"
    350        "  mulxq 32(%1), %%r8, %%r9;"
    351        "  xor %%r10d, %%r10d;"
    352        "  movq %%r8, 64(%2);"
    353        "  mulxq 40(%1), %%r10, %%r11;"
    354        "  adox %%r9, %%r10;"
    355        "  movq %%r10, 72(%2);"
    356        "  mulxq 48(%1), %%rbx, %%r13;"
    357        "  adox %%r11, %%rbx;"
    358        "  mulxq 56(%1), %%r14, %%rdx;"
    359        "  adox %%r13, %%r14;"
    360        "  mov $0, %%rax;"
    361        "  adox %%rdx, %%rax;"
    362 
    363        // Compute src1[5] * src2[4..7]
    364        "  movq 40(%0), %%rdx;"
    365        "  mulxq 32(%1), %%r8, %%r9;"
    366        "  xor %%r10d, %%r10d;"
    367        "  adcxq 72(%2), %%r8;"
    368        "  movq %%r8, 72(%2);"
    369        "  mulxq 40(%1), %%r10, %%r11;"
    370        "  adox %%r9, %%r10;"
    371        "  adcx %%rbx, %%r10;"
    372        "  movq %%r10, 80(%2);"
    373        "  mulxq 48(%1), %%rbx, %%r13;"
    374        "  adox %%r11, %%rbx;"
    375        "  adcx %%r14, %%rbx;"
    376        "  mov $0, %%r8;"
    377        "  mulxq 56(%1), %%r14, %%rdx;"
    378        "  adox %%r13, %%r14;"
    379        "  adcx %%rax, %%r14;"
    380        "  mov $0, %%rax;"
    381        "  adox %%rdx, %%rax;"
    382        "  adcx %%r8, %%rax;"
    383 
    384        // Compute src1[6] * src2[4..7]
    385        "  movq 48(%0), %%rdx;"
    386        "  mulxq 32(%1), %%r8, %%r9;"
    387        "  xor %%r10d, %%r10d;"
    388        "  adcxq 80(%2), %%r8;"
    389        "  movq %%r8, 80(%2);"
    390        "  mulxq 40(%1), %%r10, %%r11;"
    391        "  adox %%r9, %%r10;"
    392        "  adcx %%rbx, %%r10;"
    393        "  movq %%r10, 88(%2);"
    394        "  mulxq 48(%1), %%rbx, %%r13;"
    395        "  adox %%r11, %%rbx;"
    396        "  adcx %%r14, %%rbx;"
    397        "  mov $0, %%r8;"
    398        "  mulxq 56(%1), %%r14, %%rdx;"
    399        "  adox %%r13, %%r14;"
    400        "  adcx %%rax, %%r14;"
    401        "  mov $0, %%rax;"
    402        "  adox %%rdx, %%rax;"
    403        "  adcx %%r8, %%rax;"
    404 
    405        // Compute src1[7] * src2[4..7]
    406        "  movq 56(%0), %%rdx;"
    407        "  mulxq 32(%1), %%r8, %%r9;"
    408        "  xor %%r10d, %%r10d;"
    409        "  adcxq 88(%2), %%r8;"
    410        "  movq %%r8, 88(%2);"
    411        "  mulxq 40(%1), %%r10, %%r11;"
    412        "  adox %%r9, %%r10;"
    413        "  adcx %%rbx, %%r10;"
    414        "  movq %%r10, 96(%2);"
    415        "  mulxq 48(%1), %%rbx, %%r13;"
    416        "  adox %%r11, %%rbx;"
    417        "  adcx %%r14, %%rbx;"
    418        "  movq %%rbx, 104(%2);"
    419        "  mov $0, %%r8;"
    420        "  mulxq 56(%1), %%r14, %%rdx;"
    421        "  adox %%r13, %%r14;"
    422        "  adcx %%rax, %%r14;"
    423        "  movq %%r14, 112(%2);"
    424        "  mov $0, %%rax;"
    425        "  adox %%rdx, %%rax;"
    426        "  adcx %%r8, %%rax;"
    427        "  movq %%rax, 120(%2);"
    428 
    429        // Line up pointers: %0 <- tmp (the two 512-bit products), %2 <- out
    430        "  mov %2, %0;"
    431        "  mov %3, %2;"
    432 
    433        /////// Wrap the results back into the field //////
    434 
    435        // Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo (first result)
    436        "  mov $38, %%rdx;"
    437        "  mulxq 32(%0), %%r8, %%r13;"
    438        "  xor %k1, %k1;"
    439        "  adoxq 0(%0), %%r8;"
    440        "  mulxq 40(%0), %%r9, %%rbx;"
    441        "  adcx %%r13, %%r9;"
    442        "  adoxq 8(%0), %%r9;"
    443        "  mulxq 48(%0), %%r10, %%r13;"
    444        "  adcx %%rbx, %%r10;"
    445        "  adoxq 16(%0), %%r10;"
    446        "  mulxq 56(%0), %%r11, %%rax;"
    447        "  adcx %%r13, %%r11;"
    448        "  adoxq 24(%0), %%r11;"
    449        "  adcx %1, %%rax;"
    450        "  adox %1, %%rax;"
    451        "  imul %%rdx, %%rax;"
    452 
    453        // Step 2: Fold the carry back into dst
    454        "  add %%rax, %%r8;"
    455        "  adcx %1, %%r9;"
    456        "  movq %%r9, 8(%2);"
    457        "  adcx %1, %%r10;"
    458        "  movq %%r10, 16(%2);"
    459        "  adcx %1, %%r11;"
    460        "  movq %%r11, 24(%2);"
    461 
    462        // Step 3: Fold the carry bit back in; guaranteed not to carry at this point
    463        "  mov $0, %%rax;"
    464        "  cmovc %%rdx, %%rax;"
    465        "  add %%rax, %%r8;"
    466        "  movq %%r8, 0(%2);"
    467 
    468        // Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo (second result)
    469        "  mov $38, %%rdx;"
    470        "  mulxq 96(%0), %%r8, %%r13;"
    471        "  xor %k1, %k1;"
    472        "  adoxq 64(%0), %%r8;"
    473        "  mulxq 104(%0), %%r9, %%rbx;"
    474        "  adcx %%r13, %%r9;"
    475        "  adoxq 72(%0), %%r9;"
    476        "  mulxq 112(%0), %%r10, %%r13;"
    477        "  adcx %%rbx, %%r10;"
    478        "  adoxq 80(%0), %%r10;"
    479        "  mulxq 120(%0), %%r11, %%rax;"
    480        "  adcx %%r13, %%r11;"
    481        "  adoxq 88(%0), %%r11;"
    482        "  adcx %1, %%rax;"
    483        "  adox %1, %%rax;"
    484        "  imul %%rdx, %%rax;"
    485 
    486        // Step 2: Fold the carry back into dst
    487        "  add %%rax, %%r8;"
    488        "  adcx %1, %%r9;"
    489        "  movq %%r9, 40(%2);"
    490        "  adcx %1, %%r10;"
    491        "  movq %%r10, 48(%2);"
    492        "  adcx %1, %%r11;"
    493        "  movq %%r11, 56(%2);"
    494 
    495        // Step 3: Fold the carry bit back in; guaranteed not to carry at this point
    496        "  mov $0, %%rax;"
    497        "  cmovc %%rdx, %%rax;"
    498        "  add %%rax, %%r8;"
    499        "  movq %%r8, 32(%2);"
    500        : "+&r"(f1), "+&r"(f2), "+&r"(tmp)
    501        : "r"(out)
    502        : "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13", "%r14", "memory", "cc");
    503 }
    504 
    505 // Computes the field multiplication of four-limb f1 with the 64-bit value f2:
        // out <- (f1 * f2) mod 2^255-19
    506 // Requires f2 to be smaller than 2^17
    507 static inline void
    508 fmul_scalar(uint64_t *out, uint64_t *f1, uint64_t f2)
    509 {
    510    register uint64_t f2_r __asm__("rdx") = f2; // mulx reads its implicit multiplicand from rdx
    511 
    512    __asm__ volatile(
    513        // Compute the raw multiplication of f1*f2
    514        "  mulxq 0(%2), %%r8, %%rcx;" // f1[0]*f2
    515        "  mulxq 8(%2), %%r9, %%rbx;" // f1[1]*f2
    516        "  add %%rcx, %%r9;"
    517        "  mov $0, %%rcx;"
    518        "  mulxq 16(%2), %%r10, %%r13;" // f1[2]*f2
    519        "  adcx %%rbx, %%r10;"
    520        "  mulxq 24(%2), %%r11, %%rax;" // f1[3]*f2
    521        "  adcx %%r13, %%r11;"
    522        "  adcx %%rcx, %%rax;"
    523 
    524        /////// Wrap the result back into the field //////
    525 
    526        // Step 1: Compute carry*38 (2^256 == 38 mod 2^255-19)
    527        "  mov $38, %%rdx;"
    528        "  imul %%rdx, %%rax;"
    529 
    530        // Step 2: Fold the carry back into dst
    531        "  add %%rax, %%r8;"
    532        "  adcx %%rcx, %%r9;"
    533        "  movq %%r9, 8(%1);"
    534        "  adcx %%rcx, %%r10;"
    535        "  movq %%r10, 16(%1);"
    536        "  adcx %%rcx, %%r11;"
    537        "  movq %%r11, 24(%1);"
    538 
    539        // Step 3: Fold the carry bit back in; guaranteed not to carry at this point
    540        "  mov $0, %%rax;"
    541        "  cmovc %%rdx, %%rax;"
    542        "  add %%rax, %%r8;"
    543        "  movq %%r8, 0(%1);"
    544        : "+&r"(f2_r)
    545        : "r"(out), "r"(f1)
    546        : "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r13", "memory", "cc");
    547 }
    548 
    549 // Constant-time conditional swap: exchanges the 8-limb buffers p1 and p2 iff bit != 0
        // (both buffers are rewritten unconditionally, so memory access is data-independent)
    550 static inline void
    551 cswap2(uint64_t bit, uint64_t *p1, uint64_t *p2)
    552 {
    553    __asm__ volatile(
    554        // Transfer bit into CF: adding 2^64-1 sets CF exactly when bit != 0
    555        "  add $18446744073709551615, %0;"
    556 
    557        // cswap p1[0], p2[0]
    558        "  movq 0(%1), %%r8;"
    559        "  movq 0(%2), %%r9;"
    560        "  mov %%r8, %%r10;"
    561        "  cmovc %%r9, %%r8;"
    562        "  cmovc %%r10, %%r9;"
    563        "  movq %%r8, 0(%1);"
    564        "  movq %%r9, 0(%2);"
    565 
    566        // cswap p1[1], p2[1]
    567        "  movq 8(%1), %%r8;"
    568        "  movq 8(%2), %%r9;"
    569        "  mov %%r8, %%r10;"
    570        "  cmovc %%r9, %%r8;"
    571        "  cmovc %%r10, %%r9;"
    572        "  movq %%r8, 8(%1);"
    573        "  movq %%r9, 8(%2);"
    574 
    575        // cswap p1[2], p2[2]
    576        "  movq 16(%1), %%r8;"
    577        "  movq 16(%2), %%r9;"
    578        "  mov %%r8, %%r10;"
    579        "  cmovc %%r9, %%r8;"
    580        "  cmovc %%r10, %%r9;"
    581        "  movq %%r8, 16(%1);"
    582        "  movq %%r9, 16(%2);"
    583 
    584        // cswap p1[3], p2[3]
    585        "  movq 24(%1), %%r8;"
    586        "  movq 24(%2), %%r9;"
    587        "  mov %%r8, %%r10;"
    588        "  cmovc %%r9, %%r8;"
    589        "  cmovc %%r10, %%r9;"
    590        "  movq %%r8, 24(%1);"
    591        "  movq %%r9, 24(%2);"
    592 
    593        // cswap p1[4], p2[4]
    594        "  movq 32(%1), %%r8;"
    595        "  movq 32(%2), %%r9;"
    596        "  mov %%r8, %%r10;"
    597        "  cmovc %%r9, %%r8;"
    598        "  cmovc %%r10, %%r9;"
    599        "  movq %%r8, 32(%1);"
    600        "  movq %%r9, 32(%2);"
    601 
    602        // cswap p1[5], p2[5]
    603        "  movq 40(%1), %%r8;"
    604        "  movq 40(%2), %%r9;"
    605        "  mov %%r8, %%r10;"
    606        "  cmovc %%r9, %%r8;"
    607        "  cmovc %%r10, %%r9;"
    608        "  movq %%r8, 40(%1);"
    609        "  movq %%r9, 40(%2);"
    610 
    611        // cswap p1[6], p2[6]
    612        "  movq 48(%1), %%r8;"
    613        "  movq 48(%2), %%r9;"
    614        "  mov %%r8, %%r10;"
    615        "  cmovc %%r9, %%r8;"
    616        "  cmovc %%r10, %%r9;"
    617        "  movq %%r8, 48(%1);"
    618        "  movq %%r9, 48(%2);"
    619 
    620        // cswap p1[7], p2[7]
    621        "  movq 56(%1), %%r8;"
    622        "  movq 56(%2), %%r9;"
    623        "  mov %%r8, %%r10;"
    624        "  cmovc %%r9, %%r8;"
    625        "  cmovc %%r10, %%r9;"
    626        "  movq %%r8, 56(%1);"
    627        "  movq %%r9, 56(%2);"
    628        : "+&r"(bit)
    629        : "r"(p1), "r"(p2)
    630        : "%r8", "%r9", "%r10", "memory", "cc");
    631 }
    632 
    633 // Computes the square of a field element: out <- (f * f) mod 2^255-19
    634 // Uses the 8-element buffer tmp for the raw 512-bit square (needs BMI2 mulx and ADX adcx/adox)
    635 static inline void
    636 fsqr(uint64_t *out, uint64_t *f, uint64_t *tmp)
    637 {
    638    __asm__ volatile(
    639 
    640        /////// Compute the raw multiplication: tmp <- f * f //////
    641 
    642        // Step 1: Compute all partial products
    643        "  movq 0(%0), %%rdx;" // f[0]
    644        "  mulxq 8(%0), %%r8, %%r14;"
    645        "  xor %%r15d, %%r15d;" // f[1]*f[0]
    646        "  mulxq 16(%0), %%r9, %%r10;"
    647        "  adcx %%r14, %%r9;" // f[2]*f[0]
    648        "  mulxq 24(%0), %%rax, %%rcx;"
    649        "  adcx %%rax, %%r10;"  // f[3]*f[0]
    650        "  movq 24(%0), %%rdx;" // f[3]
    651        "  mulxq 8(%0), %%r11, %%rbx;"
    652        "  adcx %%rcx, %%r11;" // f[1]*f[3]
    653        "  mulxq 16(%0), %%rax, %%r13;"
    654        "  adcx %%rax, %%rbx;" // f[2]*f[3]
    655        "  movq 8(%0), %%rdx;"
    656        "  adcx %%r15, %%r13;" // fold last carry into r13 (r15 == 0)
    657        "  mulxq 16(%0), %%rax, %%rcx;"
    658        "  mov $0, %%r14;" // f[2]*f[1]
    659 
    660        // Step 2: Compute two parallel carry chains
    661        "  xor %%r15d, %%r15d;"
    662        "  adox %%rax, %%r10;"
    663        "  adcx %%r8, %%r8;"
    664        "  adox %%rcx, %%r11;"
    665        "  adcx %%r9, %%r9;"
    666        "  adox %%r15, %%rbx;"
    667        "  adcx %%r10, %%r10;"
    668        "  adox %%r15, %%r13;"
    669        "  adcx %%r11, %%r11;"
    670        "  adox %%r15, %%r14;"
    671        "  adcx %%rbx, %%rbx;"
    672        "  adcx %%r13, %%r13;"
    673        "  adcx %%r14, %%r14;"
    674 
    675        // Step 3: Compute intermediate squares
    676        "  movq 0(%0), %%rdx;"
    677        "  mulx %%rdx, %%rax, %%rcx;" // f[0]^2
    678        "  movq %%rax, 0(%1);"
    679        "  add %%rcx, %%r8;"
    680        "  movq %%r8, 8(%1);"
    681        "  movq 8(%0), %%rdx;"
    682        "  mulx %%rdx, %%rax, %%rcx;" // f[1]^2
    683        "  adcx %%rax, %%r9;"
    684        "  movq %%r9, 16(%1);"
    685        "  adcx %%rcx, %%r10;"
    686        "  movq %%r10, 24(%1);"
    687        "  movq 16(%0), %%rdx;"
    688        "  mulx %%rdx, %%rax, %%rcx;" // f[2]^2
    689        "  adcx %%rax, %%r11;"
    690        "  movq %%r11, 32(%1);"
    691        "  adcx %%rcx, %%rbx;"
    692        "  movq %%rbx, 40(%1);"
    693        "  movq 24(%0), %%rdx;"
    694        "  mulx %%rdx, %%rax, %%rcx;" // f[3]^2
    695        "  adcx %%rax, %%r13;"
    696        "  movq %%r13, 48(%1);"
    697        "  adcx %%rcx, %%r14;"
    698        "  movq %%r14, 56(%1);"
    699 
    700        // Line up pointers
    701        "  mov %1, %0;"
    702        "  mov %2, %1;"
    703 
    704        /////// Wrap the result back into the field //////
    705 
    706        // Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo (2^256 == 38 mod 2^255-19)
    707        "  mov $38, %%rdx;"
    708        "  mulxq 32(%0), %%r8, %%r13;"
    709        "  xor %%ecx, %%ecx;"
    710        "  adoxq 0(%0), %%r8;"
    711        "  mulxq 40(%0), %%r9, %%rbx;"
    712        "  adcx %%r13, %%r9;"
    713        "  adoxq 8(%0), %%r9;"
    714        "  mulxq 48(%0), %%r10, %%r13;"
    715        "  adcx %%rbx, %%r10;"
    716        "  adoxq 16(%0), %%r10;"
    717        "  mulxq 56(%0), %%r11, %%rax;"
    718        "  adcx %%r13, %%r11;"
    719        "  adoxq 24(%0), %%r11;"
    720        "  adcx %%rcx, %%rax;"
    721        "  adox %%rcx, %%rax;"
    722        "  imul %%rdx, %%rax;"
    723 
    724        // Step 2: Fold the carry back into dst
    725        "  add %%rax, %%r8;"
    726        "  adcx %%rcx, %%r9;"
    727        "  movq %%r9, 8(%1);"
    728        "  adcx %%rcx, %%r10;"
    729        "  movq %%r10, 16(%1);"
    730        "  adcx %%rcx, %%r11;"
    731        "  movq %%r11, 24(%1);"
    732 
    733        // Step 3: Fold the carry bit back in; guaranteed not to carry at this point
    734        "  mov $0, %%rax;"
    735        "  cmovc %%rdx, %%rax;"
    736        "  add %%rax, %%r8;"
    737        "  movq %%r8, 0(%1);"
    738        : "+&r"(f), "+&r"(tmp)
    739        : "r"(out)
    740        : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13", "%r14", "%r15", "memory", "cc");
    741 }
    742 
    743 // Computes two field squarings:
    744 //   out[0] <- f[0] * f[0]
    745 //   out[1] <- f[1] * f[1]
    746 // Uses the 16-element buffer tmp for intermediate results
    747 static inline void
    748 fsqr2(uint64_t *out, uint64_t *f, uint64_t *tmp)
    749 {
    750    __asm__ volatile(
    751        // Step 1: Compute all partial products
    752        "  movq 0(%0), %%rdx;" // f[0]
    753        "  mulxq 8(%0), %%r8, %%r14;"
    754        "  xor %%r15d, %%r15d;" // f[1]*f[0]
    755        "  mulxq 16(%0), %%r9, %%r10;"
    756        "  adcx %%r14, %%r9;" // f[2]*f[0]
    757        "  mulxq 24(%0), %%rax, %%rcx;"
    758        "  adcx %%rax, %%r10;"  // f[3]*f[0]
    759        "  movq 24(%0), %%rdx;" // f[3]
    760        "  mulxq 8(%0), %%r11, %%rbx;"
    761        "  adcx %%rcx, %%r11;" // f[1]*f[3]
    762        "  mulxq 16(%0), %%rax, %%r13;"
    763        "  adcx %%rax, %%rbx;" // f[2]*f[3]
    764        "  movq 8(%0), %%rdx;"
    765        "  adcx %%r15, %%r13;" // f1
    766        "  mulxq 16(%0), %%rax, %%rcx;"
    767        "  mov $0, %%r14;" // f[2]*f[1]
    768 
    769        // Step 2: Compute two parallel carry chains
    770        "  xor %%r15d, %%r15d;"
    771        "  adox %%rax, %%r10;"
    772        "  adcx %%r8, %%r8;"
    773        "  adox %%rcx, %%r11;"
    774        "  adcx %%r9, %%r9;"
    775        "  adox %%r15, %%rbx;"
    776        "  adcx %%r10, %%r10;"
    777        "  adox %%r15, %%r13;"
    778        "  adcx %%r11, %%r11;"
    779        "  adox %%r15, %%r14;"
    780        "  adcx %%rbx, %%rbx;"
    781        "  adcx %%r13, %%r13;"
    782        "  adcx %%r14, %%r14;"
    783 
    784        // Step 3: Compute intermediate squares
    785        "  movq 0(%0), %%rdx;"
    786        "  mulx %%rdx, %%rax, %%rcx;" // f[0]^2
    787        "  movq %%rax, 0(%1);"
    788        "  add %%rcx, %%r8;"
    789        "  movq %%r8, 8(%1);"
    790        "  movq 8(%0), %%rdx;"
    791        "  mulx %%rdx, %%rax, %%rcx;" // f[1]^2
    792        "  adcx %%rax, %%r9;"
    793        "  movq %%r9, 16(%1);"
    794        "  adcx %%rcx, %%r10;"
    795        "  movq %%r10, 24(%1);"
    796        "  movq 16(%0), %%rdx;"
    797        "  mulx %%rdx, %%rax, %%rcx;" // f[2]^2
    798        "  adcx %%rax, %%r11;"
    799        "  movq %%r11, 32(%1);"
    800        "  adcx %%rcx, %%rbx;"
    801        "  movq %%rbx, 40(%1);"
    802        "  movq 24(%0), %%rdx;"
    803        "  mulx %%rdx, %%rax, %%rcx;" // f[3]^2
    804        "  adcx %%rax, %%r13;"
    805        "  movq %%r13, 48(%1);"
    806        "  adcx %%rcx, %%r14;"
    807        "  movq %%r14, 56(%1);"
    808 
    809        // Step 1: Compute all partial products
    810        "  movq 32(%0), %%rdx;" // f[0]
    811        "  mulxq 40(%0), %%r8, %%r14;"
    812        "  xor %%r15d, %%r15d;" // f[1]*f[0]
    813        "  mulxq 48(%0), %%r9, %%r10;"
    814        "  adcx %%r14, %%r9;" // f[2]*f[0]
    815        "  mulxq 56(%0), %%rax, %%rcx;"
    816        "  adcx %%rax, %%r10;"  // f[3]*f[0]
    817        "  movq 56(%0), %%rdx;" // f[3]
    818        "  mulxq 40(%0), %%r11, %%rbx;"
    819        "  adcx %%rcx, %%r11;" // f[1]*f[3]
    820        "  mulxq 48(%0), %%rax, %%r13;"
    821        "  adcx %%rax, %%rbx;" // f[2]*f[3]
    822        "  movq 40(%0), %%rdx;"
    823        "  adcx %%r15, %%r13;" // f1
    824        "  mulxq 48(%0), %%rax, %%rcx;"
    825        "  mov $0, %%r14;" // f[2]*f[1]
    826 
    827        // Step 2: Compute two parallel carry chains
    828        "  xor %%r15d, %%r15d;"
    829        "  adox %%rax, %%r10;"
    830        "  adcx %%r8, %%r8;"
    831        "  adox %%rcx, %%r11;"
    832        "  adcx %%r9, %%r9;"
    833        "  adox %%r15, %%rbx;"
    834        "  adcx %%r10, %%r10;"
    835        "  adox %%r15, %%r13;"
    836        "  adcx %%r11, %%r11;"
    837        "  adox %%r15, %%r14;"
    838        "  adcx %%rbx, %%rbx;"
    839        "  adcx %%r13, %%r13;"
    840        "  adcx %%r14, %%r14;"
    841 
    842        // Step 3: Compute intermediate squares
    843        "  movq 32(%0), %%rdx;"
    844        "  mulx %%rdx, %%rax, %%rcx;" // f[0]^2
    845        "  movq %%rax, 64(%1);"
    846        "  add %%rcx, %%r8;"
    847        "  movq %%r8, 72(%1);"
    848        "  movq 40(%0), %%rdx;"
    849        "  mulx %%rdx, %%rax, %%rcx;" // f[1]^2
    850        "  adcx %%rax, %%r9;"
    851        "  movq %%r9, 80(%1);"
    852        "  adcx %%rcx, %%r10;"
    853        "  movq %%r10, 88(%1);"
    854        "  movq 48(%0), %%rdx;"
    855        "  mulx %%rdx, %%rax, %%rcx;" // f[2]^2
    856        "  adcx %%rax, %%r11;"
    857        "  movq %%r11, 96(%1);"
    858        "  adcx %%rcx, %%rbx;"
    859        "  movq %%rbx, 104(%1);"
    860        "  movq 56(%0), %%rdx;"
    861        "  mulx %%rdx, %%rax, %%rcx;" // f[3]^2
    862        "  adcx %%rax, %%r13;"
    863        "  movq %%r13, 112(%1);"
    864        "  adcx %%rcx, %%r14;"
    865        "  movq %%r14, 120(%1);"
    866 
    867        // Line up pointers
    868        "  mov %1, %0;"
    869        "  mov %2, %1;"
    870 
    871        // Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo
    872        "  mov $38, %%rdx;"
    873        "  mulxq 32(%0), %%r8, %%r13;"
    874        "  xor %%ecx, %%ecx;"
    875        "  adoxq 0(%0), %%r8;"
    876        "  mulxq 40(%0), %%r9, %%rbx;"
    877        "  adcx %%r13, %%r9;"
    878        "  adoxq 8(%0), %%r9;"
    879        "  mulxq 48(%0), %%r10, %%r13;"
    880        "  adcx %%rbx, %%r10;"
    881        "  adoxq 16(%0), %%r10;"
    882        "  mulxq 56(%0), %%r11, %%rax;"
    883        "  adcx %%r13, %%r11;"
    884        "  adoxq 24(%0), %%r11;"
    885        "  adcx %%rcx, %%rax;"
    886        "  adox %%rcx, %%rax;"
    887        "  imul %%rdx, %%rax;"
    888 
    889        // Step 2: Fold the carry back into dst
    890        "  add %%rax, %%r8;"
    891        "  adcx %%rcx, %%r9;"
    892        "  movq %%r9, 8(%1);"
    893        "  adcx %%rcx, %%r10;"
    894        "  movq %%r10, 16(%1);"
    895        "  adcx %%rcx, %%r11;"
    896        "  movq %%r11, 24(%1);"
    897 
    898        // Step 3: Fold the carry bit back in; guaranteed not to carry at this point
    899        "  mov $0, %%rax;"
    900        "  cmovc %%rdx, %%rax;"
    901        "  add %%rax, %%r8;"
    902        "  movq %%r8, 0(%1);"
    903 
    904        // Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo
    905        "  mov $38, %%rdx;"
    906        "  mulxq 96(%0), %%r8, %%r13;"
    907        "  xor %%ecx, %%ecx;"
    908        "  adoxq 64(%0), %%r8;"
    909        "  mulxq 104(%0), %%r9, %%rbx;"
    910        "  adcx %%r13, %%r9;"
    911        "  adoxq 72(%0), %%r9;"
    912        "  mulxq 112(%0), %%r10, %%r13;"
    913        "  adcx %%rbx, %%r10;"
    914        "  adoxq 80(%0), %%r10;"
    915        "  mulxq 120(%0), %%r11, %%rax;"
    916        "  adcx %%r13, %%r11;"
    917        "  adoxq 88(%0), %%r11;"
    918        "  adcx %%rcx, %%rax;"
    919        "  adox %%rcx, %%rax;"
    920        "  imul %%rdx, %%rax;"
    921 
    922        // Step 2: Fold the carry back into dst
    923        "  add %%rax, %%r8;"
    924        "  adcx %%rcx, %%r9;"
    925        "  movq %%r9, 40(%1);"
    926        "  adcx %%rcx, %%r10;"
    927        "  movq %%r10, 48(%1);"
    928        "  adcx %%rcx, %%r11;"
    929        "  movq %%r11, 56(%1);"
    930 
    931        // Step 3: Fold the carry bit back in; guaranteed not to carry at this point
    932        "  mov $0, %%rax;"
    933        "  cmovc %%rdx, %%rax;"
    934        "  add %%rax, %%r8;"
    935        "  movq %%r8, 32(%1);"
    936        : "+&r"(f), "+&r"(tmp)
    937        : "r"(out)
    938        : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13", "%r14", "%r15", "memory", "cc");
    939 }
    940 
    941 #endif /* defined(__x86_64__) || defined(_M_X64) */
    942 #endif /* __GNUC__ */