curve25519-inline.h (29414B)
/*
 * Inline-assembly field arithmetic for Curve25519 over GF(2^255 - 19),
 * with field elements represented as four 64-bit limbs (little-endian
 * limb order). The code relies on the BMI2 (mulx) and ADX (adcx/adox)
 * x86-64 instruction-set extensions; callers are expected to have
 * verified CPU support. Guarded for GCC-compatible compilers on
 * x86-64 only.
 *
 * Reduction note used throughout: since 2^256 = 38 (mod 2^255 - 19)
 * (38 = 2 * 19), a 512-bit product is folded back into the field by
 * multiplying the high 256 bits by 38 and adding them to the low 256
 * bits, then folding the final carry the same way.
 */
#ifdef __GNUC__
#if defined(__x86_64__) || defined(_M_X64)
#pragma once
#include <inttypes.h>

// Computes the addition of four-element f1 with value in f2
// and returns the carry (if any)
// NOTE(review): the carry ends up in %rax, but %rax is listed only as a
// clobber (not an output operand), so the carry is not actually visible
// to C callers — confirm against call sites.
static inline void
add_scalar(uint64_t *out, uint64_t *f1, uint64_t f2)
{
	__asm__ volatile(
		// Clear registers to propagate the carry bit.
		// (xor of the 32-bit aliases zero-extends to 64 bits and
		// clears CF/OF for the adcx chain below.)
		" xor %%r8d, %%r8d;"
		" xor %%r9d, %%r9d;"
		" xor %%r10d, %%r10d;"
		" xor %%r11d, %%r11d;"
		" xor %%eax, %%eax;"

		// Begin addition chain: out[0] = f1[0] + f2, then ripple the
		// carry through the zeroed registers plus f1[1..3].
		" addq 0(%2), %0;"
		" movq %0, 0(%1);"
		" adcxq 8(%2), %%r8;"
		" movq %%r8, 8(%1);"
		" adcxq 16(%2), %%r9;"
		" movq %%r9, 16(%1);"
		" adcxq 24(%2), %%r10;"
		" movq %%r10, 24(%1);"

		// Return the carry bit in a register
		" adcx %%r11, %%rax;"
		: "+&r"(f2)
		: "r"(out), "r"(f1)
		: "%rax", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
}

// Computes the field addition of two field elements
static inline void
fadd(uint64_t *out, uint64_t *f1, uint64_t *f2)
{
	__asm__ volatile(
		// Compute the raw addition of f1 + f2
		// (%0 = f2 pointer; its register is dead after these loads and
		// is reused below as a scratch register holding the constant 38)
		" movq 0(%0), %%r8;"
		" addq 0(%2), %%r8;"
		" movq 8(%0), %%r9;"
		" adcxq 8(%2), %%r9;"
		" movq 16(%0), %%r10;"
		" adcxq 16(%2), %%r10;"
		" movq 24(%0), %%r11;"
		" adcxq 24(%2), %%r11;"

		/////// Wrap the result back into the field //////

		// Step 1: Compute carry*38 (carry is 0 or 1, so rax = 0 or 38)
		" mov $0, %%rax;"
		" mov $38, %0;"
		" cmovc %0, %%rax;"

		// Step 2: Add carry*38 to the original sum
		" xor %%ecx, %%ecx;"
		" add %%rax, %%r8;"
		" adcx %%rcx, %%r9;"
		" movq %%r9, 8(%1);"
		" adcx %%rcx, %%r10;"
		" movq %%r10, 16(%1);"
		" adcx %%rcx, %%r11;"
		" movq %%r11, 24(%1);"

		// Step 3: Fold the carry bit back in; guaranteed not to carry at this point
		" mov $0, %%rax;"
		" cmovc %0, %%rax;"
		" add %%rax, %%r8;"
		" movq %%r8, 0(%1);"
		: "+&r"(f2)
		: "r"(out), "r"(f1)
		: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
}

// Computes the field subtraction of two field elements
static inline void
fsub(uint64_t *out, uint64_t *f1, uint64_t *f2)
{
	__asm__ volatile(
		// Compute the raw subtraction of f1-f2
		" movq 0(%1), %%r8;"
		" subq 0(%2), %%r8;"
		" movq 8(%1), %%r9;"
		" sbbq 8(%2), %%r9;"
		" movq 16(%1), %%r10;"
		" sbbq 16(%2), %%r10;"
		" movq 24(%1), %%r11;"
		" sbbq 24(%2), %%r11;"

		/////// Wrap the result back into the field //////

		// Step 1: Compute carry*38 (borrow is 0 or 1, so rax = 0 or 38)
		" mov $0, %%rax;"
		" mov $38, %%rcx;"
		" cmovc %%rcx, %%rax;"

		// Step 2: Subtract carry*38 from the original difference
		" sub %%rax, %%r8;"
		" sbb $0, %%r9;"
		" sbb $0, %%r10;"
		" sbb $0, %%r11;"

		// Step 3: Fold the carry bit back in; guaranteed not to carry at this point
		" mov $0, %%rax;"
		" cmovc %%rcx, %%rax;"
		" sub %%rax, %%r8;"

		// Store the result
		" movq %%r8, 0(%0);"
		" movq %%r9, 8(%0);"
		" movq %%r10, 16(%0);"
		" movq %%r11, 24(%0);"
		:
		: "r"(out), "r"(f1), "r"(f2)
		: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
}

// Computes a field multiplication: out <- f1 * f2
// Uses the 8-element buffer tmp for intermediate results
// (schoolbook 4x4 limb multiply using interleaved adcx/adox carry
// chains, followed by the mod-(2^255-19) wrap via multiply-by-38).
static inline void
fmul(uint64_t *out, uint64_t *f1, uint64_t *f2, uint64_t *tmp)
{
	__asm__ volatile(

		/////// Compute the raw multiplication: tmp <- src1 * src2 //////

		// Compute src1[0] * src2
		" movq 0(%0), %%rdx;"
		" mulxq 0(%1), %%r8, %%r9;"
		" xor %%r10d, %%r10d;"
		" movq %%r8, 0(%2);"
		" mulxq 8(%1), %%r10, %%r11;"
		" adox %%r9, %%r10;"
		" movq %%r10, 8(%2);"
		" mulxq 16(%1), %%rbx, %%r13;"
		" adox %%r11, %%rbx;"
		" mulxq 24(%1), %%r14, %%rdx;"
		" adox %%r13, %%r14;"
		" mov $0, %%rax;"
		" adox %%rdx, %%rax;"

		// Compute src1[1] * src2
		" movq 8(%0), %%rdx;"
		" mulxq 0(%1), %%r8, %%r9;"
		" xor %%r10d, %%r10d;"
		" adcxq 8(%2), %%r8;"
		" movq %%r8, 8(%2);"
		" mulxq 8(%1), %%r10, %%r11;"
		" adox %%r9, %%r10;"
		" adcx %%rbx, %%r10;"
		" movq %%r10, 16(%2);"
		" mulxq 16(%1), %%rbx, %%r13;"
		" adox %%r11, %%rbx;"
		" adcx %%r14, %%rbx;"
		" mov $0, %%r8;"
		" mulxq 24(%1), %%r14, %%rdx;"
		" adox %%r13, %%r14;"
		" adcx %%rax, %%r14;"
		" mov $0, %%rax;"
		" adox %%rdx, %%rax;"
		" adcx %%r8, %%rax;"

		// Compute src1[2] * src2
		" movq 16(%0), %%rdx;"
		" mulxq 0(%1), %%r8, %%r9;"
		" xor %%r10d, %%r10d;"
		" adcxq 16(%2), %%r8;"
		" movq %%r8, 16(%2);"
		" mulxq 8(%1), %%r10, %%r11;"
		" adox %%r9, %%r10;"
		" adcx %%rbx, %%r10;"
		" movq %%r10, 24(%2);"
		" mulxq 16(%1), %%rbx, %%r13;"
		" adox %%r11, %%rbx;"
		" adcx %%r14, %%rbx;"
		" mov $0, %%r8;"
		" mulxq 24(%1), %%r14, %%rdx;"
		" adox %%r13, %%r14;"
		" adcx %%rax, %%r14;"
		" mov $0, %%rax;"
		" adox %%rdx, %%rax;"
		" adcx %%r8, %%rax;"

		// Compute src1[3] * src2
		" movq 24(%0), %%rdx;"
		" mulxq 0(%1), %%r8, %%r9;"
		" xor %%r10d, %%r10d;"
		" adcxq 24(%2), %%r8;"
		" movq %%r8, 24(%2);"
		" mulxq 8(%1), %%r10, %%r11;"
		" adox %%r9, %%r10;"
		" adcx %%rbx, %%r10;"
		" movq %%r10, 32(%2);"
		" mulxq 16(%1), %%rbx, %%r13;"
		" adox %%r11, %%rbx;"
		" adcx %%r14, %%rbx;"
		" movq %%rbx, 40(%2);"
		" mov $0, %%r8;"
		" mulxq 24(%1), %%r14, %%rdx;"
		" adox %%r13, %%r14;"
		" adcx %%rax, %%r14;"
		" movq %%r14, 48(%2);"
		" mov $0, %%rax;"
		" adox %%rdx, %%rax;"
		" adcx %%r8, %%rax;"
		" movq %%rax, 56(%2);"

		// Line up pointers (%0 <- tmp, %2 <- out) for the reduction
		" mov %2, %0;"
		" mov %3, %2;"

		/////// Wrap the result back into the field //////

		// Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo
		// (%k1 is the 32-bit alias of the f2 register, now dead — the
		// xor both zeroes it for use as a zero source and clears CF/OF)
		" mov $38, %%rdx;"
		" mulxq 32(%0), %%r8, %%r13;"
		" xor %k1, %k1;"
		" adoxq 0(%0), %%r8;"
		" mulxq 40(%0), %%r9, %%rbx;"
		" adcx %%r13, %%r9;"
		" adoxq 8(%0), %%r9;"
		" mulxq 48(%0), %%r10, %%r13;"
		" adcx %%rbx, %%r10;"
		" adoxq 16(%0), %%r10;"
		" mulxq 56(%0), %%r11, %%rax;"
		" adcx %%r13, %%r11;"
		" adoxq 24(%0), %%r11;"
		" adcx %1, %%rax;"
		" adox %1, %%rax;"
		" imul %%rdx, %%rax;"

		// Step 2: Fold the carry back into dst
		" add %%rax, %%r8;"
		" adcx %1, %%r9;"
		" movq %%r9, 8(%2);"
		" adcx %1, %%r10;"
		" movq %%r10, 16(%2);"
		" adcx %1, %%r11;"
		" movq %%r11, 24(%2);"

		// Step 3: Fold the carry bit back in; guaranteed not to carry at this point
		" mov $0, %%rax;"
		" cmovc %%rdx, %%rax;"
		" add %%rax, %%r8;"
		" movq %%r8, 0(%2);"
		: "+&r"(f1), "+&r"(f2), "+&r"(tmp)
		: "r"(out)
		: "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13", "%r14", "memory", "cc");
}

// Computes two field multiplications:
// out[0] <- f1[0] * f2[0]
// out[1] <- f1[1] * f2[1]
// Uses the 16-element buffer tmp for intermediate results:
// (same algorithm as fmul, run back-to-back on two independent
// 4-limb operand pairs at offsets 0 and 32 bytes).
static inline void
fmul2(uint64_t *out, uint64_t *f1, uint64_t *f2, uint64_t *tmp)
{
	__asm__ volatile(

		/////// Compute the raw multiplication tmp[0] <- f1[0] * f2[0] //////

		// Compute src1[0] * src2
		" movq 0(%0), %%rdx;"
		" mulxq 0(%1), %%r8, %%r9;"
		" xor %%r10d, %%r10d;"
		" movq %%r8, 0(%2);"
		" mulxq 8(%1), %%r10, %%r11;"
		" adox %%r9, %%r10;"
		" movq %%r10, 8(%2);"
		" mulxq 16(%1), %%rbx, %%r13;"
		" adox %%r11, %%rbx;"
		" mulxq 24(%1), %%r14, %%rdx;"
		" adox %%r13, %%r14;"
		" mov $0, %%rax;"
		" adox %%rdx, %%rax;"

		// Compute src1[1] * src2
		" movq 8(%0), %%rdx;"
		" mulxq 0(%1), %%r8, %%r9;"
		" xor %%r10d, %%r10d;"
		" adcxq 8(%2), %%r8;"
		" movq %%r8, 8(%2);"
		" mulxq 8(%1), %%r10, %%r11;"
		" adox %%r9, %%r10;"
		" adcx %%rbx, %%r10;"
		" movq %%r10, 16(%2);"
		" mulxq 16(%1), %%rbx, %%r13;"
		" adox %%r11, %%rbx;"
		" adcx %%r14, %%rbx;"
		" mov $0, %%r8;"
		" mulxq 24(%1), %%r14, %%rdx;"
		" adox %%r13, %%r14;"
		" adcx %%rax, %%r14;"
		" mov $0, %%rax;"
		" adox %%rdx, %%rax;"
		" adcx %%r8, %%rax;"

		// Compute src1[2] * src2
		" movq 16(%0), %%rdx;"
		" mulxq 0(%1), %%r8, %%r9;"
		" xor %%r10d, %%r10d;"
		" adcxq 16(%2), %%r8;"
		" movq %%r8, 16(%2);"
		" mulxq 8(%1), %%r10, %%r11;"
		" adox %%r9, %%r10;"
		" adcx %%rbx, %%r10;"
		" movq %%r10, 24(%2);"
		" mulxq 16(%1), %%rbx, %%r13;"
		" adox %%r11, %%rbx;"
		" adcx %%r14, %%rbx;"
		" mov $0, %%r8;"
		" mulxq 24(%1), %%r14, %%rdx;"
		" adox %%r13, %%r14;"
		" adcx %%rax, %%r14;"
		" mov $0, %%rax;"
		" adox %%rdx, %%rax;"
		" adcx %%r8, %%rax;"

		// Compute src1[3] * src2
		" movq 24(%0), %%rdx;"
		" mulxq 0(%1), %%r8, %%r9;"
		" xor %%r10d, %%r10d;"
		" adcxq 24(%2), %%r8;"
		" movq %%r8, 24(%2);"
		" mulxq 8(%1), %%r10, %%r11;"
		" adox %%r9, %%r10;"
		" adcx %%rbx, %%r10;"
		" movq %%r10, 32(%2);"
		" mulxq 16(%1), %%rbx, %%r13;"
		" adox %%r11, %%rbx;"
		" adcx %%r14, %%rbx;"
		" movq %%rbx, 40(%2);"
		" mov $0, %%r8;"
		" mulxq 24(%1), %%r14, %%rdx;"
		" adox %%r13, %%r14;"
		" adcx %%rax, %%r14;"
		" movq %%r14, 48(%2);"
		" mov $0, %%rax;"
		" adox %%rdx, %%rax;"
		" adcx %%r8, %%rax;"
		" movq %%rax, 56(%2);"

		/////// Compute the raw multiplication tmp[1] <- f1[1] * f2[1] //////

		// Compute src1[0] * src2
		" movq 32(%0), %%rdx;"
		" mulxq 32(%1), %%r8, %%r9;"
		" xor %%r10d, %%r10d;"
		" movq %%r8, 64(%2);"
		" mulxq 40(%1), %%r10, %%r11;"
		" adox %%r9, %%r10;"
		" movq %%r10, 72(%2);"
		" mulxq 48(%1), %%rbx, %%r13;"
		" adox %%r11, %%rbx;"
		" mulxq 56(%1), %%r14, %%rdx;"
		" adox %%r13, %%r14;"
		" mov $0, %%rax;"
		" adox %%rdx, %%rax;"

		// Compute src1[1] * src2
		" movq 40(%0), %%rdx;"
		" mulxq 32(%1), %%r8, %%r9;"
		" xor %%r10d, %%r10d;"
		" adcxq 72(%2), %%r8;"
		" movq %%r8, 72(%2);"
		" mulxq 40(%1), %%r10, %%r11;"
		" adox %%r9, %%r10;"
		" adcx %%rbx, %%r10;"
		" movq %%r10, 80(%2);"
		" mulxq 48(%1), %%rbx, %%r13;"
		" adox %%r11, %%rbx;"
		" adcx %%r14, %%rbx;"
		" mov $0, %%r8;"
		" mulxq 56(%1), %%r14, %%rdx;"
		" adox %%r13, %%r14;"
		" adcx %%rax, %%r14;"
		" mov $0, %%rax;"
		" adox %%rdx, %%rax;"
		" adcx %%r8, %%rax;"

		// Compute src1[2] * src2
		" movq 48(%0), %%rdx;"
		" mulxq 32(%1), %%r8, %%r9;"
		" xor %%r10d, %%r10d;"
		" adcxq 80(%2), %%r8;"
		" movq %%r8, 80(%2);"
		" mulxq 40(%1), %%r10, %%r11;"
		" adox %%r9, %%r10;"
		" adcx %%rbx, %%r10;"
		" movq %%r10, 88(%2);"
		" mulxq 48(%1), %%rbx, %%r13;"
		" adox %%r11, %%rbx;"
		" adcx %%r14, %%rbx;"
		" mov $0, %%r8;"
		" mulxq 56(%1), %%r14, %%rdx;"
		" adox %%r13, %%r14;"
		" adcx %%rax, %%r14;"
		" mov $0, %%rax;"
		" adox %%rdx, %%rax;"
		" adcx %%r8, %%rax;"

		// Compute src1[3] * src2
		" movq 56(%0), %%rdx;"
		" mulxq 32(%1), %%r8, %%r9;"
		" xor %%r10d, %%r10d;"
		" adcxq 88(%2), %%r8;"
		" movq %%r8, 88(%2);"
		" mulxq 40(%1), %%r10, %%r11;"
		" adox %%r9, %%r10;"
		" adcx %%rbx, %%r10;"
		" movq %%r10, 96(%2);"
		" mulxq 48(%1), %%rbx, %%r13;"
		" adox %%r11, %%rbx;"
		" adcx %%r14, %%rbx;"
		" movq %%rbx, 104(%2);"
		" mov $0, %%r8;"
		" mulxq 56(%1), %%r14, %%rdx;"
		" adox %%r13, %%r14;"
		" adcx %%rax, %%r14;"
		" movq %%r14, 112(%2);"
		" mov $0, %%rax;"
		" adox %%rdx, %%rax;"
		" adcx %%r8, %%rax;"
		" movq %%rax, 120(%2);"

		// Line up pointers (%0 <- tmp, %2 <- out) for the reductions
		" mov %2, %0;"
		" mov %3, %2;"

		/////// Wrap the results back into the field //////

		// Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo
		" mov $38, %%rdx;"
		" mulxq 32(%0), %%r8, %%r13;"
		" xor %k1, %k1;"
		" adoxq 0(%0), %%r8;"
		" mulxq 40(%0), %%r9, %%rbx;"
		" adcx %%r13, %%r9;"
		" adoxq 8(%0), %%r9;"
		" mulxq 48(%0), %%r10, %%r13;"
		" adcx %%rbx, %%r10;"
		" adoxq 16(%0), %%r10;"
		" mulxq 56(%0), %%r11, %%rax;"
		" adcx %%r13, %%r11;"
		" adoxq 24(%0), %%r11;"
		" adcx %1, %%rax;"
		" adox %1, %%rax;"
		" imul %%rdx, %%rax;"

		// Step 2: Fold the carry back into dst
		" add %%rax, %%r8;"
		" adcx %1, %%r9;"
		" movq %%r9, 8(%2);"
		" adcx %1, %%r10;"
		" movq %%r10, 16(%2);"
		" adcx %1, %%r11;"
		" movq %%r11, 24(%2);"

		// Step 3: Fold the carry bit back in; guaranteed not to carry at this point
		" mov $0, %%rax;"
		" cmovc %%rdx, %%rax;"
		" add %%rax, %%r8;"
		" movq %%r8, 0(%2);"

		// Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo
		" mov $38, %%rdx;"
		" mulxq 96(%0), %%r8, %%r13;"
		" xor %k1, %k1;"
		" adoxq 64(%0), %%r8;"
		" mulxq 104(%0), %%r9, %%rbx;"
		" adcx %%r13, %%r9;"
		" adoxq 72(%0), %%r9;"
		" mulxq 112(%0), %%r10, %%r13;"
		" adcx %%rbx, %%r10;"
		" adoxq 80(%0), %%r10;"
		" mulxq 120(%0), %%r11, %%rax;"
		" adcx %%r13, %%r11;"
		" adoxq 88(%0), %%r11;"
		" adcx %1, %%rax;"
		" adox %1, %%rax;"
		" imul %%rdx, %%rax;"

		// Step 2: Fold the carry back into dst
		" add %%rax, %%r8;"
		" adcx %1, %%r9;"
		" movq %%r9, 40(%2);"
		" adcx %1, %%r10;"
		" movq %%r10, 48(%2);"
		" adcx %1, %%r11;"
		" movq %%r11, 56(%2);"

		// Step 3: Fold the carry bit back in; guaranteed not to carry at this point
		" mov $0, %%rax;"
		" cmovc %%rdx, %%rax;"
		" add %%rax, %%r8;"
		" movq %%r8, 32(%2);"
		: "+&r"(f1), "+&r"(f2), "+&r"(tmp)
		: "r"(out)
		: "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13", "%r14", "memory", "cc");
}

// Computes the field multiplication of four-element f1 with value in f2
// Requires f2 to be smaller than 2^17
static inline void
fmul_scalar(uint64_t *out, uint64_t *f1, uint64_t f2)
{
	// Pin f2 to %rdx: mulx takes one implicit operand from rdx.
	register uint64_t f2_r __asm__("rdx") = f2;

	__asm__ volatile(
		// Compute the raw multiplication of f1*f2
		" mulxq 0(%2), %%r8, %%rcx;" // f1[0]*f2
		" mulxq 8(%2), %%r9, %%rbx;" // f1[1]*f2
		" add %%rcx, %%r9;"
		" mov $0, %%rcx;"
		" mulxq 16(%2), %%r10, %%r13;" // f1[2]*f2
		" adcx %%rbx, %%r10;"
		" mulxq 24(%2), %%r11, %%rax;" // f1[3]*f2
		" adcx %%r13, %%r11;"
		" adcx %%rcx, %%rax;"

		/////// Wrap the result back into the field //////

		// Step 1: Compute carry*38
		" mov $38, %%rdx;"
		" imul %%rdx, %%rax;"

		// Step 2: Fold the carry back into dst
		" add %%rax, %%r8;"
		" adcx %%rcx, %%r9;"
		" movq %%r9, 8(%1);"
		" adcx %%rcx, %%r10;"
		" movq %%r10, 16(%1);"
		" adcx %%rcx, %%r11;"
		" movq %%r11, 24(%1);"

		// Step 3: Fold the carry bit back in; guaranteed not to carry at this point
		" mov $0, %%rax;"
		" cmovc %%rdx, %%rax;"
		" add %%rax, %%r8;"
		" movq %%r8, 0(%1);"
		: "+&r"(f2_r)
		: "r"(out), "r"(f1)
		: "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r13", "memory", "cc");
}

// Computes p1 <- bit ? p2 : p1 in constant time
// (branch-free: the swap is performed with cmovc on every limb of both
// 8-element buffers regardless of the bit value).
static inline void
cswap2(uint64_t bit, uint64_t *p1, uint64_t *p2)
{
	__asm__ volatile(
		// Transfer bit into CF flag (adding all-ones sets CF iff bit != 0)
		" add $18446744073709551615, %0;"

		// cswap p1[0], p2[0]
		" movq 0(%1), %%r8;"
		" movq 0(%2), %%r9;"
		" mov %%r8, %%r10;"
		" cmovc %%r9, %%r8;"
		" cmovc %%r10, %%r9;"
		" movq %%r8, 0(%1);"
		" movq %%r9, 0(%2);"

		// cswap p1[1], p2[1]
		" movq 8(%1), %%r8;"
		" movq 8(%2), %%r9;"
		" mov %%r8, %%r10;"
		" cmovc %%r9, %%r8;"
		" cmovc %%r10, %%r9;"
		" movq %%r8, 8(%1);"
		" movq %%r9, 8(%2);"

		// cswap p1[2], p2[2]
		" movq 16(%1), %%r8;"
		" movq 16(%2), %%r9;"
		" mov %%r8, %%r10;"
		" cmovc %%r9, %%r8;"
		" cmovc %%r10, %%r9;"
		" movq %%r8, 16(%1);"
		" movq %%r9, 16(%2);"

		// cswap p1[3], p2[3]
		" movq 24(%1), %%r8;"
		" movq 24(%2), %%r9;"
		" mov %%r8, %%r10;"
		" cmovc %%r9, %%r8;"
		" cmovc %%r10, %%r9;"
		" movq %%r8, 24(%1);"
		" movq %%r9, 24(%2);"

		// cswap p1[4], p2[4]
		" movq 32(%1), %%r8;"
		" movq 32(%2), %%r9;"
		" mov %%r8, %%r10;"
		" cmovc %%r9, %%r8;"
		" cmovc %%r10, %%r9;"
		" movq %%r8, 32(%1);"
		" movq %%r9, 32(%2);"

		// cswap p1[5], p2[5]
		" movq 40(%1), %%r8;"
		" movq 40(%2), %%r9;"
		" mov %%r8, %%r10;"
		" cmovc %%r9, %%r8;"
		" cmovc %%r10, %%r9;"
		" movq %%r8, 40(%1);"
		" movq %%r9, 40(%2);"

		// cswap p1[6], p2[6]
		" movq 48(%1), %%r8;"
		" movq 48(%2), %%r9;"
		" mov %%r8, %%r10;"
		" cmovc %%r9, %%r8;"
		" cmovc %%r10, %%r9;"
		" movq %%r8, 48(%1);"
		" movq %%r9, 48(%2);"

		// cswap p1[7], p2[7]
		" movq 56(%1), %%r8;"
		" movq 56(%2), %%r9;"
		" mov %%r8, %%r10;"
		" cmovc %%r9, %%r8;"
		" cmovc %%r10, %%r9;"
		" movq %%r8, 56(%1);"
		" movq %%r9, 56(%2);"
		: "+&r"(bit)
		: "r"(p1), "r"(p2)
		: "%r8", "%r9", "%r10", "memory", "cc");
}

// Computes the square of a field element: out <- f * f
// Uses the 8-element buffer tmp for intermediate results
// (computes the off-diagonal products once, doubles them via a second
// carry chain, then adds the diagonal squares f[i]^2, followed by the
// mod-(2^255-19) wrap).
static inline void
fsqr(uint64_t *out, uint64_t *f, uint64_t *tmp)
{
	__asm__ volatile(

		/////// Compute the raw multiplication: tmp <- f * f //////

		// Step 1: Compute all partial products
		" movq 0(%0), %%rdx;" // f[0]
		" mulxq 8(%0), %%r8, %%r14;"
		" xor %%r15d, %%r15d;" // f[1]*f[0]
		" mulxq 16(%0), %%r9, %%r10;"
		" adcx %%r14, %%r9;" // f[2]*f[0]
		" mulxq 24(%0), %%rax, %%rcx;"
		" adcx %%rax, %%r10;" // f[3]*f[0]
		" movq 24(%0), %%rdx;" // f[3]
		" mulxq 8(%0), %%r11, %%rbx;"
		" adcx %%rcx, %%r11;" // f[1]*f[3]
		" mulxq 16(%0), %%rax, %%r13;"
		" adcx %%rax, %%rbx;" // f[2]*f[3]
		" movq 8(%0), %%rdx;"
		" adcx %%r15, %%r13;" // f1
		" mulxq 16(%0), %%rax, %%rcx;"
		" mov $0, %%r14;" // f[2]*f[1]

		// Step 2: Compute two parallel carry chains
		// (adox folds in f[2]*f[1]; adcx doubles every partial product)
		" xor %%r15d, %%r15d;"
		" adox %%rax, %%r10;"
		" adcx %%r8, %%r8;"
		" adox %%rcx, %%r11;"
		" adcx %%r9, %%r9;"
		" adox %%r15, %%rbx;"
		" adcx %%r10, %%r10;"
		" adox %%r15, %%r13;"
		" adcx %%r11, %%r11;"
		" adox %%r15, %%r14;"
		" adcx %%rbx, %%rbx;"
		" adcx %%r13, %%r13;"
		" adcx %%r14, %%r14;"

		// Step 3: Compute intermediate squares
		" movq 0(%0), %%rdx;"
		" mulx %%rdx, %%rax, %%rcx;" // f[0]^2
		" movq %%rax, 0(%1);"
		" add %%rcx, %%r8;"
		" movq %%r8, 8(%1);"
		" movq 8(%0), %%rdx;"
		" mulx %%rdx, %%rax, %%rcx;" // f[1]^2
		" adcx %%rax, %%r9;"
		" movq %%r9, 16(%1);"
		" adcx %%rcx, %%r10;"
		" movq %%r10, 24(%1);"
		" movq 16(%0), %%rdx;"
		" mulx %%rdx, %%rax, %%rcx;" // f[2]^2
		" adcx %%rax, %%r11;"
		" movq %%r11, 32(%1);"
		" adcx %%rcx, %%rbx;"
		" movq %%rbx, 40(%1);"
		" movq 24(%0), %%rdx;"
		" mulx %%rdx, %%rax, %%rcx;" // f[3]^2
		" adcx %%rax, %%r13;"
		" movq %%r13, 48(%1);"
		" adcx %%rcx, %%r14;"
		" movq %%r14, 56(%1);"

		// Line up pointers (%0 <- tmp, %1 <- out) for the reduction
		" mov %1, %0;"
		" mov %2, %1;"

		/////// Wrap the result back into the field //////

		// Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo
		" mov $38, %%rdx;"
		" mulxq 32(%0), %%r8, %%r13;"
		" xor %%ecx, %%ecx;"
		" adoxq 0(%0), %%r8;"
		" mulxq 40(%0), %%r9, %%rbx;"
		" adcx %%r13, %%r9;"
		" adoxq 8(%0), %%r9;"
		" mulxq 48(%0), %%r10, %%r13;"
		" adcx %%rbx, %%r10;"
		" adoxq 16(%0), %%r10;"
		" mulxq 56(%0), %%r11, %%rax;"
		" adcx %%r13, %%r11;"
		" adoxq 24(%0), %%r11;"
		" adcx %%rcx, %%rax;"
		" adox %%rcx, %%rax;"
		" imul %%rdx, %%rax;"

		// Step 2: Fold the carry back into dst
		" add %%rax, %%r8;"
		" adcx %%rcx, %%r9;"
		" movq %%r9, 8(%1);"
		" adcx %%rcx, %%r10;"
		" movq %%r10, 16(%1);"
		" adcx %%rcx, %%r11;"
		" movq %%r11, 24(%1);"

		// Step 3: Fold the carry bit back in; guaranteed not to carry at this point
		" mov $0, %%rax;"
		" cmovc %%rdx, %%rax;"
		" add %%rax, %%r8;"
		" movq %%r8, 0(%1);"
		: "+&r"(f), "+&r"(tmp)
		: "r"(out)
		: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13", "%r14", "%r15", "memory", "cc");
}

// Computes two field squarings:
// out[0] <- f[0] * f[0]
// out[1] <- f[1] * f[1]
// Uses the 16-element buffer tmp for intermediate results
// (same algorithm as fsqr, run back-to-back on the two 4-limb inputs
// at offsets 0 and 32 bytes).
static inline void
fsqr2(uint64_t *out, uint64_t *f, uint64_t *tmp)
{
	__asm__ volatile(
		// Step 1: Compute all partial products
		" movq 0(%0), %%rdx;" // f[0]
		" mulxq 8(%0), %%r8, %%r14;"
		" xor %%r15d, %%r15d;" // f[1]*f[0]
		" mulxq 16(%0), %%r9, %%r10;"
		" adcx %%r14, %%r9;" // f[2]*f[0]
		" mulxq 24(%0), %%rax, %%rcx;"
		" adcx %%rax, %%r10;" // f[3]*f[0]
		" movq 24(%0), %%rdx;" // f[3]
		" mulxq 8(%0), %%r11, %%rbx;"
		" adcx %%rcx, %%r11;" // f[1]*f[3]
		" mulxq 16(%0), %%rax, %%r13;"
		" adcx %%rax, %%rbx;" // f[2]*f[3]
		" movq 8(%0), %%rdx;"
		" adcx %%r15, %%r13;" // f1
		" mulxq 16(%0), %%rax, %%rcx;"
		" mov $0, %%r14;" // f[2]*f[1]

		// Step 2: Compute two parallel carry chains
		" xor %%r15d, %%r15d;"
		" adox %%rax, %%r10;"
		" adcx %%r8, %%r8;"
		" adox %%rcx, %%r11;"
		" adcx %%r9, %%r9;"
		" adox %%r15, %%rbx;"
		" adcx %%r10, %%r10;"
		" adox %%r15, %%r13;"
		" adcx %%r11, %%r11;"
		" adox %%r15, %%r14;"
		" adcx %%rbx, %%rbx;"
		" adcx %%r13, %%r13;"
		" adcx %%r14, %%r14;"

		// Step 3: Compute intermediate squares
		" movq 0(%0), %%rdx;"
		" mulx %%rdx, %%rax, %%rcx;" // f[0]^2
		" movq %%rax, 0(%1);"
		" add %%rcx, %%r8;"
		" movq %%r8, 8(%1);"
		" movq 8(%0), %%rdx;"
		" mulx %%rdx, %%rax, %%rcx;" // f[1]^2
		" adcx %%rax, %%r9;"
		" movq %%r9, 16(%1);"
		" adcx %%rcx, %%r10;"
		" movq %%r10, 24(%1);"
		" movq 16(%0), %%rdx;"
		" mulx %%rdx, %%rax, %%rcx;" // f[2]^2
		" adcx %%rax, %%r11;"
		" movq %%r11, 32(%1);"
		" adcx %%rcx, %%rbx;"
		" movq %%rbx, 40(%1);"
		" movq 24(%0), %%rdx;"
		" mulx %%rdx, %%rax, %%rcx;" // f[3]^2
		" adcx %%rax, %%r13;"
		" movq %%r13, 48(%1);"
		" adcx %%rcx, %%r14;"
		" movq %%r14, 56(%1);"

		// Step 1: Compute all partial products
		" movq 32(%0), %%rdx;" // f[0]
		" mulxq 40(%0), %%r8, %%r14;"
		" xor %%r15d, %%r15d;" // f[1]*f[0]
		" mulxq 48(%0), %%r9, %%r10;"
		" adcx %%r14, %%r9;" // f[2]*f[0]
		" mulxq 56(%0), %%rax, %%rcx;"
		" adcx %%rax, %%r10;" // f[3]*f[0]
		" movq 56(%0), %%rdx;" // f[3]
		" mulxq 40(%0), %%r11, %%rbx;"
		" adcx %%rcx, %%r11;" // f[1]*f[3]
		" mulxq 48(%0), %%rax, %%r13;"
		" adcx %%rax, %%rbx;" // f[2]*f[3]
		" movq 40(%0), %%rdx;"
		" adcx %%r15, %%r13;" // f1
		" mulxq 48(%0), %%rax, %%rcx;"
		" mov $0, %%r14;" // f[2]*f[1]

		// Step 2: Compute two parallel carry chains
		" xor %%r15d, %%r15d;"
		" adox %%rax, %%r10;"
		" adcx %%r8, %%r8;"
		" adox %%rcx, %%r11;"
		" adcx %%r9, %%r9;"
		" adox %%r15, %%rbx;"
		" adcx %%r10, %%r10;"
		" adox %%r15, %%r13;"
		" adcx %%r11, %%r11;"
		" adox %%r15, %%r14;"
		" adcx %%rbx, %%rbx;"
		" adcx %%r13, %%r13;"
		" adcx %%r14, %%r14;"

		// Step 3: Compute intermediate squares
		" movq 32(%0), %%rdx;"
		" mulx %%rdx, %%rax, %%rcx;" // f[0]^2
		" movq %%rax, 64(%1);"
		" add %%rcx, %%r8;"
		" movq %%r8, 72(%1);"
		" movq 40(%0), %%rdx;"
		" mulx %%rdx, %%rax, %%rcx;" // f[1]^2
		" adcx %%rax, %%r9;"
		" movq %%r9, 80(%1);"
		" adcx %%rcx, %%r10;"
		" movq %%r10, 88(%1);"
		" movq 48(%0), %%rdx;"
		" mulx %%rdx, %%rax, %%rcx;" // f[2]^2
		" adcx %%rax, %%r11;"
		" movq %%r11, 96(%1);"
		" adcx %%rcx, %%rbx;"
		" movq %%rbx, 104(%1);"
		" movq 56(%0), %%rdx;"
		" mulx %%rdx, %%rax, %%rcx;" // f[3]^2
		" adcx %%rax, %%r13;"
		" movq %%r13, 112(%1);"
		" adcx %%rcx, %%r14;"
		" movq %%r14, 120(%1);"

		// Line up pointers (%0 <- tmp, %1 <- out) for the reductions
		" mov %1, %0;"
		" mov %2, %1;"

		// Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo
		" mov $38, %%rdx;"
		" mulxq 32(%0), %%r8, %%r13;"
		" xor %%ecx, %%ecx;"
		" adoxq 0(%0), %%r8;"
		" mulxq 40(%0), %%r9, %%rbx;"
		" adcx %%r13, %%r9;"
		" adoxq 8(%0), %%r9;"
		" mulxq 48(%0), %%r10, %%r13;"
		" adcx %%rbx, %%r10;"
		" adoxq 16(%0), %%r10;"
		" mulxq 56(%0), %%r11, %%rax;"
		" adcx %%r13, %%r11;"
		" adoxq 24(%0), %%r11;"
		" adcx %%rcx, %%rax;"
		" adox %%rcx, %%rax;"
		" imul %%rdx, %%rax;"

		// Step 2: Fold the carry back into dst
		" add %%rax, %%r8;"
		" adcx %%rcx, %%r9;"
		" movq %%r9, 8(%1);"
		" adcx %%rcx, %%r10;"
		" movq %%r10, 16(%1);"
		" adcx %%rcx, %%r11;"
		" movq %%r11, 24(%1);"

		// Step 3: Fold the carry bit back in; guaranteed not to carry at this point
		" mov $0, %%rax;"
		" cmovc %%rdx, %%rax;"
		" add %%rax, %%r8;"
		" movq %%r8, 0(%1);"

		// Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo
		" mov $38, %%rdx;"
		" mulxq 96(%0), %%r8, %%r13;"
		" xor %%ecx, %%ecx;"
		" adoxq 64(%0), %%r8;"
		" mulxq 104(%0), %%r9, %%rbx;"
		" adcx %%r13, %%r9;"
		" adoxq 72(%0), %%r9;"
		" mulxq 112(%0), %%r10, %%r13;"
		" adcx %%rbx, %%r10;"
		" adoxq 80(%0), %%r10;"
		" mulxq 120(%0), %%r11, %%rax;"
		" adcx %%r13, %%r11;"
		" adoxq 88(%0), %%r11;"
		" adcx %%rcx, %%rax;"
		" adox %%rcx, %%rax;"
		" imul %%rdx, %%rax;"

		// Step 2: Fold the carry back into dst
		" add %%rax, %%r8;"
		" adcx %%rcx, %%r9;"
		" movq %%r9, 40(%1);"
		" adcx %%rcx, %%r10;"
		" movq %%r10, 48(%1);"
		" adcx %%rcx, %%r11;"
		" movq %%r11, 56(%1);"

		// Step 3: Fold the carry bit back in; guaranteed not to carry at this point
		" mov $0, %%rax;"
		" cmovc %%rdx, %%rax;"
		" add %%rax, %%r8;"
		" movq %%r8, 32(%1);"
		: "+&r"(f), "+&r"(tmp)
		: "r"(out)
		: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13", "%r14", "%r15", "memory", "cc");
}

#endif /* defined(__x86_64__) || defined(_M_X64) */
#endif /* __GNUC__ */