intel-gcm-x64-masm.asm (34729B)
1 ; LICENSE: 2 ; This submission to NSS is to be made available under the terms of the 3 ; Mozilla Public License, v. 2.0. You can obtain one at http: 4 ; //mozilla.org/MPL/2.0/. 5 ;############################################################################### 6 ; Copyright(c) 2014, Intel Corp. 7 ; Developers and authors: 8 ; Shay Gueron and Vlad Krasnov 9 ; Intel Corporation, Israel Development Centre, Haifa, Israel 10 ; Please send feedback directly to crypto.feedback.alias@intel.com 11 12 13 .DATA 14 ALIGN 16 15 Lone dq 1,0 16 Ltwo dq 2,0 17 Lbswap_mask db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 18 Lshuff_mask dq 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh 19 Lpoly dq 01h, 0c200000000000000h 20 21 .CODE 22 23 24 GFMUL MACRO DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4 25 vpclmulqdq TMP1, SRC2, SRC1, 0h 26 vpclmulqdq TMP4, SRC2, SRC1, 011h 27 28 vpshufd TMP2, SRC2, 78 29 vpshufd TMP3, SRC1, 78 30 vpxor TMP2, TMP2, SRC2 31 vpxor TMP3, TMP3, SRC1 32 33 vpclmulqdq TMP2, TMP2, TMP3, 0h 34 vpxor TMP2, TMP2, TMP1 35 vpxor TMP2, TMP2, TMP4 36 37 vpslldq TMP3, TMP2, 8 38 vpsrldq TMP2, TMP2, 8 39 40 vpxor TMP1, TMP1, TMP3 41 vpxor TMP4, TMP4, TMP2 42 43 vpclmulqdq TMP2, TMP1, [Lpoly], 010h 44 vpshufd TMP3, TMP1, 78 45 vpxor TMP1, TMP2, TMP3 46 47 vpclmulqdq TMP2, TMP1, [Lpoly], 010h 48 vpshufd TMP3, TMP1, 78 49 vpxor TMP1, TMP2, TMP3 50 51 vpxor DST, TMP1, TMP4 52 53 ENDM 54 55 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 56 ; 57 ; Generates the final GCM tag 58 ; void intel_aes_gcmTAG(unsigned char Htbl[16*16], 59 ; unsigned char *Tp, 60 ; unsigned int Mlen, 61 ; unsigned int Alen, 62 ; unsigned char *X0, 63 ; unsigned char *TAG); 64 ; 65 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 66 67 ALIGN 16 68 intel_aes_gcmTAG PROC 69 70 Htbl textequ <rcx> 71 Tp textequ <rdx> 72 Mlen textequ <r8> 73 Alen textequ <r9> 74 X0 textequ <r10> 75 TAG textequ <r11> 76 77 T textequ <xmm0> 78 TMP0 textequ <xmm1> 79 80 mov X0, [rsp + 1*8 + 4*8] 81 mov TAG, [rsp + 1*8 + 5*8] 82 83 vzeroupper 84 vmovdqu T, XMMWORD PTR[Tp] 85 vpxor TMP0, TMP0, TMP0 86 87 shl Mlen, 3 88 shl Alen, 3 89 90 ;vpinsrq TMP0, TMP0, Mlen, 0 91 ;vpinsrq TMP0, TMP0, Alen, 1 92 ; workaround the ml64.exe vpinsrq issue 93 vpinsrd TMP0, TMP0, r8d, 0 94 vpinsrd TMP0, TMP0, r9d, 2 95 shr Mlen, 32 96 shr Alen, 32 97 vpinsrd TMP0, TMP0, r8d, 1 98 vpinsrd TMP0, TMP0, r9d, 3 99 100 vpxor T, T, TMP0 101 vmovdqu TMP0, XMMWORD PTR[Htbl] 102 GFMUL T, T, TMP0, xmm2, xmm3, xmm4, xmm5 103 104 vpshufb T, T, [Lbswap_mask] 105 vpxor T, T, [X0] 106 vmovdqu XMMWORD PTR[TAG], T 107 vzeroupper 108 109 ret 110 111 intel_aes_gcmTAG ENDP 112 113 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 114 ; 115 ; Generates the H table 116 ; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR); 117 ; 118 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 119 120 ALIGN 16 121 intel_aes_gcmINIT PROC 122 123 Htbl textequ <rcx> 124 KS textequ <rdx> 125 NR textequ <r8d> 126 127 T textequ <xmm0> 128 TMP0 textequ <xmm1> 129 130 vzeroupper 131 ; AES-ENC(0) 132 vmovdqu T, XMMWORD PTR[KS] 133 lea KS, [16 + KS] 134 dec NR 135 Lenc_loop: 136 vaesenc T, T, [KS] 137 lea KS, [16 + KS] 138 dec NR 139 jnz Lenc_loop 140 141 vaesenclast T, T, [KS] 142 vpshufb T, T, [Lbswap_mask] 143 144 ;Calculate H` = GFMUL(H, 2) 145 vpsrad xmm3, T, 31 146 vpshufd xmm3, xmm3, 0ffh 147 vpand xmm5, xmm3, [Lpoly] 148 vpsrld xmm3, T, 31 149 vpslld xmm4, T, 1 150 vpslldq xmm3, xmm3, 4 151 vpxor T, xmm4, xmm3 152 vpxor T, T, xmm5 153 154 vmovdqu TMP0, T 155 vmovdqu XMMWORD PTR[Htbl + 0*16], T 156 157 vpshufd xmm2, T, 78 158 vpxor xmm2, xmm2, T 159 vmovdqu XMMWORD PTR[Htbl + 8*16 + 0*16], xmm2 160 161 i = 1 162 WHILE i LT 8 163 GFMUL T, T, TMP0, xmm2, xmm3, xmm4, xmm5 164 vmovdqu XMMWORD PTR[Htbl + i*16], T 165 vpshufd xmm2, T, 78 166 vpxor xmm2, xmm2, T 167 vmovdqu XMMWORD PTR[Htbl + 8*16 + i*16], xmm2 168 i = i+1 169 ENDM 170 vzeroupper 171 ret 172 intel_aes_gcmINIT ENDP 173 174 175 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 176 ; 177 ; Authenticate only 178 ; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp); 179 ; 180 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 181 182 ALIGN 16 183 intel_aes_gcmAAD PROC 184 185 Htbl textequ <rcx> 186 inp textequ <rdx> 187 len textequ <r8> 188 Tp textequ <r9> 189 hlp0 textequ <r10> 190 191 DATA textequ <xmm0> 192 T textequ <xmm1> 193 TMP0 textequ <xmm2> 194 TMP1 textequ <xmm3> 195 TMP2 textequ <xmm4> 196 TMP3 textequ <xmm5> 197 TMP4 textequ <xmm6> 198 Xhi textequ <xmm7> 199 200 KARATSUBA_AAD MACRO i 201 vpclmulqdq TMP3, DATA, [Htbl + i*16], 0h 202 vpxor TMP0, TMP0, TMP3 203 vpclmulqdq TMP3, DATA, [Htbl + i*16], 011h 204 vpxor TMP1, TMP1, TMP3 205 vpshufd TMP3, DATA, 78 206 vpxor TMP3, TMP3, DATA 207 vpclmulqdq TMP3, TMP3, [Htbl + 8*16 + i*16], 0h 208 vpxor TMP2, TMP2, TMP3 209 ENDM 210 211 test len, len 212 jnz LbeginAAD 213 ret 214 215 LbeginAAD: 216 vzeroupper 217 218 sub rsp, 2*16 219 vmovdqu XMMWORD PTR[rsp + 0*16], xmm6 220 vmovdqu XMMWORD PTR[rsp + 1*16], xmm7 221 222 vpxor Xhi, Xhi, Xhi 223 224 vmovdqu T, XMMWORD PTR[Tp] 225 ;we hash 8 block each iteration, if the total amount of blocks is not a multiple of 8, we hash the first n%8 blocks first 226 mov hlp0, len 227 and hlp0, 128-1 228 jz Lmod_loop 229 230 and len, -128 231 sub hlp0, 16 232 233 ; Prefix block 234 vmovdqu DATA, XMMWORD PTR[inp] 235 vpshufb DATA, DATA, [Lbswap_mask] 236 vpxor DATA, DATA, T 237 238 vpclmulqdq TMP0, DATA, [Htbl + hlp0], 0h 239 vpclmulqdq TMP1, DATA, [Htbl + hlp0], 011h 240 vpshufd TMP3, DATA, 78 241 vpxor TMP3, TMP3, DATA 242 vpclmulqdq TMP2, TMP3, [Htbl + 8*16 + hlp0], 0h 243 244 lea inp, [inp+16] 245 test hlp0, hlp0 246 jnz Lpre_loop 247 jmp Lred1 248 249 ;hash remaining prefix bocks (up to 7 total prefix blocks) 250 Lpre_loop: 251 252 sub hlp0, 16 253 254 vmovdqu DATA, XMMWORD PTR[inp] 255 vpshufb DATA, DATA, [Lbswap_mask] 256 257 vpclmulqdq TMP3, DATA, [Htbl + hlp0], 0h 258 vpxor TMP0, TMP0, TMP3 259 vpclmulqdq TMP3, DATA, [Htbl + hlp0], 011h 260 vpxor TMP1, TMP1, TMP3 261 vpshufd TMP3, DATA, 78 262 vpxor TMP3, TMP3, DATA 263 vpclmulqdq TMP3, TMP3, [Htbl + 8*16 + hlp0], 0h 264 vpxor TMP2, TMP2, TMP3 265 266 test hlp0, hlp0 267 lea inp, [inp+16] 268 jnz Lpre_loop 269 270 Lred1: 271 272 vpxor TMP2, TMP2, TMP0 273 vpxor TMP2, TMP2, TMP1 274 vpsrldq TMP3, TMP2, 8 275 vpslldq TMP2, TMP2, 8 276 277 vpxor Xhi, TMP1, TMP3 278 vpxor T, TMP0, TMP2 279 280 281 Lmod_loop: 282 283 sub len, 16*8 284 jb Ldone 285 ; Block #0 286 vmovdqu DATA, XMMWORD PTR[inp + 16*7] 287 vpshufb DATA, DATA, [Lbswap_mask] 288 289 vpclmulqdq TMP0, DATA, [Htbl + 0*16], 0h 290 vpclmulqdq TMP1, DATA, [Htbl + 0*16], 011h 291 vpshufd TMP3, DATA, 78 292 vpxor TMP3, TMP3, DATA 293 vpclmulqdq TMP2, TMP3, [Htbl + 8*16 + 0*16], 0h 294 295 ; Block #1 296 vmovdqu DATA, XMMWORD PTR[inp + 16*6] 297 vpshufb DATA, DATA, [Lbswap_mask] 298 KARATSUBA_AAD 1 299 300 ; Block #2 301 vmovdqu DATA, XMMWORD PTR[inp + 16*5] 302 vpshufb DATA, DATA, [Lbswap_mask] 303 304 vpclmulqdq TMP4, T, [Lpoly], 010h ;reduction stage 1a 305 vpalignr T, T, T, 8 306 307 KARATSUBA_AAD 2 308 309 vpxor T, T, TMP4 ;reduction stage 1b 310 311 ; Block #3 312 vmovdqu DATA, XMMWORD PTR[inp + 16*4] 313 vpshufb DATA, DATA, [Lbswap_mask] 314 KARATSUBA_AAD 3 315 ; Block #4 316 vmovdqu DATA, XMMWORD PTR[inp + 16*3] 317 vpshufb DATA, DATA, [Lbswap_mask] 318 319 vpclmulqdq TMP4, T, [Lpoly], 010h ;reduction stage 2a 320 vpalignr T, T, T, 8 321 322 KARATSUBA_AAD 4 323 324 vpxor T, T, TMP4 ;reduction stage 2b 325 ; Block #5 326 vmovdqu DATA, XMMWORD PTR[inp + 16*2] 327 vpshufb DATA, DATA, [Lbswap_mask] 328 KARATSUBA_AAD 5 329 330 vpxor T, T, Xhi ;reduction finalize 331 ; Block #6 332 vmovdqu DATA, XMMWORD PTR[inp + 16*1] 333 vpshufb DATA, DATA, [Lbswap_mask] 334 KARATSUBA_AAD 6 335 ; Block #7 336 vmovdqu DATA, XMMWORD PTR[inp + 16*0] 337 vpshufb DATA, DATA, [Lbswap_mask] 338 vpxor DATA, DATA, T 339 KARATSUBA_AAD 7 340 ; Aggregated 8 blocks, now karatsuba fixup 341 vpxor TMP2, TMP2, TMP0 342 vpxor TMP2, TMP2, TMP1 343 vpsrldq TMP3, TMP2, 8 344 vpslldq TMP2, TMP2, 8 345 346 vpxor Xhi, TMP1, TMP3 347 vpxor T, TMP0, TMP2 348 349 lea inp, [inp + 16*8] 350 jmp Lmod_loop 351 352 Ldone: 353 vpclmulqdq TMP4, T, [Lpoly], 010h 354 vpalignr T, T, T, 8 355 vpxor T, T, TMP4 356 357 vpclmulqdq TMP4, T, [Lpoly], 010h 358 vpalignr T, T, T, 8 359 vpxor T, T, TMP4 360 361 vpxor T, T, Xhi 362 vmovdqu XMMWORD PTR[Tp], T 363 vzeroupper 364 365 vmovdqu xmm6, XMMWORD PTR[rsp + 0*16] 366 vmovdqu xmm7, XMMWORD PTR[rsp + 1*16] 367 add rsp, 16*2 368 369 ret 370 371 intel_aes_gcmAAD ENDP 372 373 374 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 375 ; 376 ; Encrypt and Authenticate 377 ; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsigned int len); 378 ; 379 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 380 381 ALIGN 16 382 intel_aes_gcmENC PROC 383 384 PT textequ <rcx> 385 CT textequ <rdx> 386 Htbl textequ <r8> 387 Gctx textequ <r8> 388 len textequ <r9> 389 KS textequ <r10> 390 NR textequ <eax> 391 392 aluCTR textequ <r11d> 393 aluKSl textequ <r12d> 394 aluTMP textequ <r13d> 395 396 T textequ <xmm0> 397 TMP0 textequ <xmm1> 398 TMP1 textequ <xmm2> 399 TMP2 textequ <xmm3> 400 TMP3 textequ <xmm4> 401 TMP4 textequ <xmm5> 402 TMP5 textequ <xmm6> 403 CTR0 textequ <xmm7> 404 CTR1 textequ <xmm8> 405 CTR2 textequ <xmm9> 406 CTR3 textequ <xmm10> 407 CTR4 textequ <xmm11> 408 CTR5 textequ <xmm12> 409 CTR6 textequ <xmm13> 410 CTR7 textequ <xmm14> 411 BSWAPMASK textequ <xmm15> 412 413 ROUND MACRO i 414 vmovdqu TMP3, XMMWORD PTR[i*16 + KS] 415 vaesenc CTR0, CTR0, TMP3 416 vaesenc CTR1, CTR1, TMP3 417 vaesenc CTR2, CTR2, TMP3 418 vaesenc CTR3, CTR3, TMP3 419 vaesenc CTR4, CTR4, TMP3 420 vaesenc CTR5, CTR5, TMP3 421 vaesenc CTR6, CTR6, TMP3 422 vaesenc CTR7, CTR7, TMP3 423 ENDM 424 ROUNDMUL MACRO i 425 vmovdqu TMP3, XMMWORD PTR[i*16 + KS] 426 427 vaesenc CTR0, CTR0, TMP3 428 vaesenc CTR1, CTR1, TMP3 429 vaesenc CTR2, CTR2, TMP3 430 vaesenc CTR3, CTR3, TMP3 431 432 vpshufd TMP4, TMP5, 78 433 vpxor TMP4, TMP4, TMP5 434 435 vaesenc CTR4, CTR4, TMP3 436 vaesenc CTR5, CTR5, TMP3 437 vaesenc CTR6, CTR6, TMP3 438 vaesenc CTR7, CTR7, TMP3 439 440 vpclmulqdq TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h 441 vpxor TMP0, TMP0, TMP3 442 vmovdqu TMP4, XMMWORD PTR[i*16 + Htbl] 443 vpclmulqdq TMP3, TMP5, TMP4, 011h 444 vpxor TMP1, TMP1, TMP3 445 vpclmulqdq TMP3, TMP5, TMP4, 000h 446 vpxor TMP2, TMP2, TMP3 447 ENDM 448 KARATSUBA MACRO i 449 vpshufd TMP4, TMP5, 78 450 vpxor TMP4, TMP4, TMP5 451 vpclmulqdq TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h 452 vpxor TMP0, TMP0, TMP3 453 vmovdqu TMP4, XMMWORD PTR[i*16 + Htbl] 454 vpclmulqdq TMP3, TMP5, TMP4, 011h 455 vpxor TMP1, TMP1, TMP3 456 vpclmulqdq TMP3, TMP5, TMP4, 000h 457 vpxor TMP2, TMP2, TMP3 458 ENDM 459 NEXTCTR MACRO i 460 add aluCTR, 1 461 mov aluTMP, aluCTR 462 xor aluTMP, aluKSl 463 bswap aluTMP 464 mov [3*4 + 8*16 + i*16 + rsp], aluTMP 465 ENDM 466 467 468 test len, len 469 jnz LbeginENC 470 ret 471 472 LbeginENC: 473 474 vzeroupper 475 push r11 476 push r12 477 push r13 478 push rbp 479 sub rsp, 10*16 480 vmovdqu XMMWORD PTR[rsp + 0*16], xmm6 481 vmovdqu XMMWORD PTR[rsp + 1*16], xmm7 482 vmovdqu XMMWORD PTR[rsp + 2*16], xmm8 483 vmovdqu XMMWORD PTR[rsp + 3*16], xmm9 484 vmovdqu XMMWORD PTR[rsp + 4*16], xmm10 485 vmovdqu XMMWORD PTR[rsp + 5*16], xmm11 486 vmovdqu XMMWORD PTR[rsp + 6*16], xmm12 487 vmovdqu XMMWORD PTR[rsp + 7*16], xmm13 488 vmovdqu XMMWORD PTR[rsp + 8*16], xmm14 489 vmovdqu XMMWORD PTR[rsp + 9*16], xmm15 490 491 mov rbp, rsp 492 sub rsp, 16*16 493 and rsp, -16 494 495 vmovdqu T, XMMWORD PTR[16*16 + 1*16 + Gctx] 496 vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx] 497 vmovdqu BSWAPMASK, XMMWORD PTR[Lbswap_mask] 498 mov KS, [16*16 + 3*16 + Gctx] 499 mov NR, [244 + KS] 500 lea KS, [KS] 501 502 vpshufb CTR0, CTR0, BSWAPMASK 503 504 mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx] 505 mov aluKSl, [3*4 + KS] 506 bswap aluCTR 507 bswap aluKSl 508 509 vmovdqu TMP0, XMMWORD PTR[0*16 + KS] 510 vpxor TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx] 511 vmovdqu XMMWORD PTR[8*16 + 0*16 + rsp], TMP0 512 513 cmp len, 128 514 jb LEncDataSingles 515 ; Prepare the "top" counters 516 vmovdqu XMMWORD PTR[8*16 + 1*16 + rsp], TMP0 517 vmovdqu XMMWORD PTR[8*16 + 2*16 + rsp], TMP0 518 vmovdqu XMMWORD PTR[8*16 + 3*16 + rsp], TMP0 519 vmovdqu XMMWORD PTR[8*16 + 4*16 + rsp], TMP0 520 vmovdqu XMMWORD PTR[8*16 + 5*16 + rsp], TMP0 521 vmovdqu XMMWORD PTR[8*16 + 6*16 + rsp], TMP0 522 vmovdqu XMMWORD PTR[8*16 + 7*16 + rsp], TMP0 523 524 ; Encrypt the initial 8 blocks 525 sub len, 128 526 vpaddd CTR1, CTR0, XMMWORD PTR[Lone] 527 vpaddd CTR2, CTR0, XMMWORD PTR[Ltwo] 528 vpaddd CTR3, CTR2, XMMWORD PTR[Lone] 529 vpaddd CTR4, CTR2, XMMWORD PTR[Ltwo] 530 vpaddd CTR5, CTR4, XMMWORD PTR[Lone] 531 vpaddd CTR6, CTR4, XMMWORD PTR[Ltwo] 532 vpaddd CTR7, CTR6, XMMWORD PTR[Lone] 533 534 vpshufb CTR0, CTR0, BSWAPMASK 535 vpshufb CTR1, CTR1, BSWAPMASK 536 vpshufb CTR2, CTR2, BSWAPMASK 537 vpshufb CTR3, CTR3, BSWAPMASK 538 vpshufb CTR4, CTR4, BSWAPMASK 539 vpshufb CTR5, CTR5, BSWAPMASK 540 vpshufb CTR6, CTR6, BSWAPMASK 541 vpshufb CTR7, CTR7, BSWAPMASK 542 543 vmovdqu TMP3, XMMWORD PTR[0*16 + KS] 544 vpxor CTR0, CTR0, TMP3 545 vpxor CTR1, CTR1, TMP3 546 vpxor CTR2, CTR2, TMP3 547 vpxor CTR3, CTR3, TMP3 548 vpxor CTR4, CTR4, TMP3 549 vpxor CTR5, CTR5, TMP3 550 vpxor CTR6, CTR6, TMP3 551 vpxor CTR7, CTR7, TMP3 552 553 ROUND 1 554 555 add aluCTR, 8 556 mov aluTMP, aluCTR 557 xor aluTMP, aluKSl 558 bswap aluTMP 559 mov [8*16 + 0*16 + 3*4 + rsp], aluTMP 560 561 ROUND 2 562 NEXTCTR 1 563 ROUND 3 564 NEXTCTR 2 565 ROUND 4 566 NEXTCTR 3 567 ROUND 5 568 NEXTCTR 4 569 ROUND 6 570 NEXTCTR 5 571 ROUND 7 572 NEXTCTR 6 573 ROUND 8 574 NEXTCTR 7 575 ROUND 9 576 vmovdqu TMP5, XMMWORD PTR[10*16 + KS] 577 cmp NR, 10 578 je @f 579 580 ROUND 10 581 ROUND 11 582 vmovdqu TMP5, XMMWORD PTR[12*16 + KS] 583 cmp NR, 12 584 je @f 585 586 ROUND 12 587 ROUND 13 588 vmovdqu TMP5, XMMWORD PTR[14*16 + KS] 589 @@: 590 vpxor TMP3, TMP5, XMMWORD PTR[0*16 + PT] 591 vaesenclast CTR0, CTR0, TMP3 592 vpxor TMP3, TMP5, XMMWORD PTR[1*16 + PT] 593 vaesenclast CTR1, CTR1, TMP3 594 vpxor TMP3, TMP5, XMMWORD PTR[2*16 + PT] 595 vaesenclast CTR2, CTR2, TMP3 596 vpxor TMP3, TMP5, XMMWORD PTR[3*16 + PT] 597 vaesenclast CTR3, CTR3, TMP3 598 vpxor TMP3, TMP5, XMMWORD PTR[4*16 + PT] 599 vaesenclast CTR4, CTR4, TMP3 600 vpxor TMP3, TMP5, XMMWORD PTR[5*16 + PT] 601 vaesenclast CTR5, CTR5, TMP3 602 vpxor TMP3, TMP5, XMMWORD PTR[6*16 + PT] 603 vaesenclast CTR6, CTR6, TMP3 604 vpxor TMP3, TMP5, XMMWORD PTR[7*16 + PT] 605 vaesenclast CTR7, CTR7, TMP3 606 607 vmovdqu XMMWORD PTR[0*16 + CT], CTR0 608 vpshufb CTR0, CTR0, BSWAPMASK 609 vmovdqu XMMWORD PTR[1*16 + CT], CTR1 610 vpshufb CTR1, CTR1, BSWAPMASK 611 vmovdqu XMMWORD PTR[2*16 + CT], CTR2 612 vpshufb CTR2, CTR2, BSWAPMASK 613 vmovdqu XMMWORD PTR[3*16 + CT], CTR3 614 vpshufb CTR3, CTR3, BSWAPMASK 615 vmovdqu XMMWORD PTR[4*16 + CT], CTR4 616 vpshufb CTR4, CTR4, BSWAPMASK 617 vmovdqu XMMWORD PTR[5*16 + CT], CTR5 618 vpshufb CTR5, CTR5, BSWAPMASK 619 vmovdqu XMMWORD PTR[6*16 + CT], CTR6 620 vpshufb CTR6, CTR6, BSWAPMASK 621 vmovdqu XMMWORD PTR[7*16 + CT], CTR7 622 vpshufb TMP5, CTR7, BSWAPMASK 623 624 vmovdqa XMMWORD PTR[1*16 + rsp], CTR6 625 vmovdqa XMMWORD PTR[2*16 + rsp], CTR5 626 vmovdqa XMMWORD PTR[3*16 + rsp], CTR4 627 vmovdqa XMMWORD PTR[4*16 + rsp], CTR3 628 vmovdqa XMMWORD PTR[5*16 + rsp], CTR2 629 vmovdqa XMMWORD PTR[6*16 + rsp], CTR1 630 vmovdqa XMMWORD PTR[7*16 + rsp], CTR0 631 632 lea CT, [8*16 + CT] 633 lea PT, [8*16 + PT] 634 jmp LEncDataOctets 635 636 LEncDataOctets: 637 cmp len, 128 638 jb LEndEncOctets 639 sub len, 128 640 641 vmovdqa CTR0, XMMWORD PTR[8*16 + 0*16 + rsp] 642 vmovdqa CTR1, XMMWORD PTR[8*16 + 1*16 + rsp] 643 vmovdqa CTR2, XMMWORD PTR[8*16 + 2*16 + rsp] 644 vmovdqa CTR3, XMMWORD PTR[8*16 + 3*16 + rsp] 645 vmovdqa CTR4, XMMWORD PTR[8*16 + 4*16 + rsp] 646 vmovdqa CTR5, XMMWORD PTR[8*16 + 5*16 + rsp] 647 vmovdqa CTR6, XMMWORD PTR[8*16 + 6*16 + rsp] 648 vmovdqa CTR7, XMMWORD PTR[8*16 + 7*16 + rsp] 649 650 vpshufd TMP4, TMP5, 78 651 vpxor TMP4, TMP4, TMP5 652 vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h 653 vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl] 654 vpclmulqdq TMP1, TMP5, TMP4, 011h 655 vpclmulqdq TMP2, TMP5, TMP4, 000h 656 657 vmovdqu TMP5, XMMWORD PTR[1*16 + rsp] 658 ROUNDMUL 1 659 NEXTCTR 0 660 vmovdqu TMP5, XMMWORD PTR[2*16 + rsp] 661 ROUNDMUL 2 662 NEXTCTR 1 663 vmovdqu TMP5, XMMWORD PTR[3*16 + rsp] 664 ROUNDMUL 3 665 NEXTCTR 2 666 vmovdqu TMP5, XMMWORD PTR[4*16 + rsp] 667 ROUNDMUL 4 668 NEXTCTR 3 669 vmovdqu TMP5, XMMWORD PTR[5*16 + rsp] 670 ROUNDMUL 5 671 NEXTCTR 4 672 vmovdqu TMP5, XMMWORD PTR[6*16 + rsp] 673 ROUNDMUL 6 674 NEXTCTR 5 675 vpxor TMP5, T, XMMWORD PTR[7*16 + rsp] 676 ROUNDMUL 7 677 NEXTCTR 6 678 679 ROUND 8 680 NEXTCTR 7 681 682 vpxor TMP0, TMP0, TMP1 683 vpxor TMP0, TMP0, TMP2 684 vpsrldq TMP3, TMP0, 8 685 vpxor TMP4, TMP1, TMP3 686 vpslldq TMP3, TMP0, 8 687 vpxor T, TMP2, TMP3 688 689 vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h 690 vpalignr T,T,T,8 691 vpxor T, T, TMP1 692 693 ROUND 9 694 695 vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h 696 vpalignr T,T,T,8 697 vpxor T, T, TMP1 698 699 vmovdqu TMP5, XMMWORD PTR[10*16 + KS] 700 cmp NR, 10 701 je @f 702 703 ROUND 10 704 ROUND 11 705 vmovdqu TMP5, XMMWORD PTR[12*16 + KS] 706 cmp NR, 12 707 je @f 708 709 ROUND 12 710 ROUND 13 711 vmovdqu TMP5, XMMWORD PTR[14*16 + KS] 712 @@: 713 vpxor TMP3, TMP5, XMMWORD PTR[0*16 + PT] 714 vaesenclast CTR0, CTR0, TMP3 715 vpxor TMP3, TMP5, XMMWORD PTR[1*16 + PT] 716 vaesenclast CTR1, CTR1, TMP3 717 vpxor TMP3, TMP5, XMMWORD PTR[2*16 + PT] 718 vaesenclast CTR2, CTR2, TMP3 719 vpxor TMP3, TMP5, XMMWORD PTR[3*16 + PT] 720 vaesenclast CTR3, CTR3, TMP3 721 vpxor TMP3, TMP5, XMMWORD PTR[4*16 + PT] 722 vaesenclast CTR4, CTR4, TMP3 723 vpxor TMP3, TMP5, XMMWORD PTR[5*16 + PT] 724 vaesenclast CTR5, CTR5, TMP3 725 vpxor TMP3, TMP5, XMMWORD PTR[6*16 + PT] 726 vaesenclast CTR6, CTR6, TMP3 727 vpxor TMP3, TMP5, XMMWORD PTR[7*16 + PT] 728 vaesenclast CTR7, CTR7, TMP3 729 730 vmovdqu XMMWORD PTR[0*16 + CT], CTR0 731 vpshufb CTR0, CTR0, BSWAPMASK 732 vmovdqu XMMWORD PTR[1*16 + CT], CTR1 733 vpshufb CTR1, CTR1, BSWAPMASK 734 vmovdqu XMMWORD PTR[2*16 + CT], CTR2 735 vpshufb CTR2, CTR2, BSWAPMASK 736 vmovdqu XMMWORD PTR[3*16 + CT], CTR3 737 vpshufb CTR3, CTR3, BSWAPMASK 738 vmovdqu XMMWORD PTR[4*16 + CT], CTR4 739 vpshufb CTR4, CTR4, BSWAPMASK 740 vmovdqu XMMWORD PTR[5*16 + CT], CTR5 741 vpshufb CTR5, CTR5, BSWAPMASK 742 vmovdqu XMMWORD PTR[6*16 + CT], CTR6 743 vpshufb CTR6, CTR6, BSWAPMASK 744 vmovdqu XMMWORD PTR[7*16 + CT], CTR7 745 vpshufb TMP5, CTR7, BSWAPMASK 746 747 vmovdqa XMMWORD PTR[1*16 + rsp], CTR6 748 vmovdqa XMMWORD PTR[2*16 + rsp], CTR5 749 vmovdqa XMMWORD PTR[3*16 + rsp], CTR4 750 vmovdqa XMMWORD PTR[4*16 + rsp], CTR3 751 vmovdqa XMMWORD PTR[5*16 + rsp], CTR2 752 vmovdqa XMMWORD PTR[6*16 + rsp], CTR1 753 vmovdqa XMMWORD PTR[7*16 + rsp], CTR0 754 755 vpxor T, T, TMP4 756 757 lea CT, [8*16 + CT] 758 lea PT, [8*16 + PT] 759 jmp LEncDataOctets 760 761 LEndEncOctets: 762 763 vpshufd TMP4, TMP5, 78 764 vpxor TMP4, TMP4, TMP5 765 vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h 766 vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl] 767 vpclmulqdq TMP1, TMP5, TMP4, 011h 768 vpclmulqdq TMP2, TMP5, TMP4, 000h 769 770 vmovdqu TMP5, XMMWORD PTR[1*16 + rsp] 771 KARATSUBA 1 772 vmovdqu TMP5, XMMWORD PTR[2*16 + rsp] 773 KARATSUBA 2 774 vmovdqu TMP5, XMMWORD PTR[3*16 + rsp] 775 KARATSUBA 3 776 vmovdqu TMP5, XMMWORD PTR[4*16 + rsp] 777 KARATSUBA 4 778 vmovdqu TMP5, XMMWORD PTR[5*16 + rsp] 779 KARATSUBA 5 780 vmovdqu TMP5, XMMWORD PTR[6*16 + rsp] 781 KARATSUBA 6 782 vpxor TMP5, T, XMMWORD PTR[7*16 + rsp] 783 KARATSUBA 7 784 785 vpxor TMP0, TMP0, TMP1 786 vpxor TMP0, TMP0, TMP2 787 vpsrldq TMP3, TMP0, 8 788 vpxor TMP4, TMP1, TMP3 789 vpslldq TMP3, TMP0, 8 790 vpxor T, TMP2, TMP3 791 792 vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h 793 vpalignr T,T,T,8 794 vpxor T, T, TMP1 795 796 vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h 797 vpalignr T,T,T,8 798 vpxor T, T, TMP1 799 800 vpxor T, T, TMP4 801 802 sub aluCTR, 7 803 804 LEncDataSingles: 805 806 cmp len, 16 807 jb LEncDataTail 808 sub len, 16 809 810 vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + rsp] 811 NEXTCTR 0 812 813 vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS] 814 vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS] 815 vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS] 816 vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS] 817 vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS] 818 vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS] 819 vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS] 820 vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS] 821 vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS] 822 vmovdqu TMP2, XMMWORD PTR[10*16 + KS] 823 cmp NR, 10 824 je @f 825 vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS] 826 vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS] 827 vmovdqu TMP2, XMMWORD PTR[12*16 + KS] 828 cmp NR, 12 829 je @f 830 vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS] 831 vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS] 832 vmovdqu TMP2, XMMWORD PTR[14*16 + KS] 833 @@: 834 vaesenclast TMP1, TMP1, TMP2 835 vpxor TMP1, TMP1, XMMWORD PTR[PT] 836 vmovdqu XMMWORD PTR[CT], TMP1 837 838 lea PT, [16+PT] 839 lea CT, [16+CT] 840 841 vpshufb TMP1, TMP1, BSWAPMASK 842 vpxor T, T, TMP1 843 vmovdqu TMP0, XMMWORD PTR[Htbl] 844 GFMUL T, T, TMP0, TMP1, TMP2, TMP3, TMP4 845 846 jmp LEncDataSingles 847 848 LEncDataTail: 849 850 test len, len 851 jz LEncDataEnd 852 853 vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + rsp] 854 855 vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS] 856 vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS] 857 vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS] 858 vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS] 859 vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS] 860 vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS] 861 vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS] 862 vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS] 863 vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS] 864 vmovdqu TMP2, XMMWORD PTR[10*16 + KS] 865 cmp NR, 10 866 je @f 867 vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS] 868 vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS] 869 vmovdqu TMP2, XMMWORD PTR[12*16 + KS] 870 cmp NR, 12 871 je @f 872 vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS] 873 vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS] 874 vmovdqu TMP2, XMMWORD PTR[14*16 + KS] 875 @@: 876 vaesenclast TMP1, TMP1, TMP2 877 ; zero a temp location 878 vpxor TMP2, TMP2, TMP2 879 vmovdqa XMMWORD PTR[rsp], TMP2 880 ; copy as many bytes as needed 881 xor KS, KS 882 883 @@: 884 cmp len, KS 885 je @f 886 mov al, [PT + KS] 887 mov [rsp + KS], al 888 inc KS 889 jmp @b 890 @@: 891 vpxor TMP1, TMP1, XMMWORD PTR[rsp] 892 vmovdqa XMMWORD PTR[rsp], TMP1 893 xor KS, KS 894 @@: 895 cmp len, KS 896 je @f 897 mov al, [rsp + KS] 898 mov [CT + KS], al 899 inc KS 900 jmp @b 901 @@: 902 cmp KS, 16 903 je @f 904 mov BYTE PTR[rsp + KS], 0 905 inc KS 906 jmp @b 907 @@: 908 BAIL: 909 vmovdqa TMP1, XMMWORD PTR[rsp] 910 vpshufb TMP1, TMP1, BSWAPMASK 911 vpxor T, T, TMP1 912 vmovdqu TMP0, XMMWORD PTR[Htbl] 913 GFMUL T, T, TMP0, TMP1, TMP2, TMP3, TMP4 914 915 LEncDataEnd: 916 917 vmovdqu XMMWORD PTR[16*16 + 1*16 + Gctx], T 918 bswap aluCTR 919 mov [16*16 + 2*16 + 3*4 + Gctx], aluCTR 920 921 mov rsp, rbp 922 923 vmovdqu xmm6, XMMWORD PTR[rsp + 0*16] 924 vmovdqu xmm7, XMMWORD PTR[rsp + 1*16] 925 vmovdqu xmm8, XMMWORD PTR[rsp + 2*16] 926 vmovdqu xmm9, XMMWORD PTR[rsp + 3*16] 927 vmovdqu xmm10, XMMWORD PTR[rsp + 4*16] 928 vmovdqu xmm11, XMMWORD PTR[rsp + 5*16] 929 vmovdqu xmm12, XMMWORD PTR[rsp + 6*16] 930 vmovdqu xmm13, XMMWORD PTR[rsp + 7*16] 931 vmovdqu xmm14, XMMWORD PTR[rsp + 8*16] 932 vmovdqu xmm15, XMMWORD PTR[rsp + 9*16] 933 934 add rsp, 10*16 935 pop rbp 936 pop r13 937 pop r12 938 pop r11 939 940 vzeroupper 941 942 ret 943 intel_aes_gcmENC ENDP 944 945 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 946 ; 947 ; Decrypt and Authenticate 948 ; void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx, unsigned int len); 949 ; 950 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 951 952 ALIGN 16 953 intel_aes_gcmDEC PROC 954 955 NEXTCTR MACRO i 956 add aluCTR, 1 957 mov aluTMP, aluCTR 958 xor aluTMP, aluKSl 959 bswap aluTMP 960 mov [3*4 + i*16 + rsp], aluTMP 961 ENDM 962 963 PT textequ <rdx> 964 CT textequ <rcx> 965 966 test len, len 967 jnz LbeginDEC 968 ret 969 970 LbeginDEC: 971 972 vzeroupper 973 push r11 974 push r12 975 push r13 976 push rbp 977 sub rsp, 10*16 978 vmovdqu XMMWORD PTR[rsp + 0*16], xmm6 979 vmovdqu XMMWORD PTR[rsp + 1*16], xmm7 980 vmovdqu XMMWORD PTR[rsp + 2*16], xmm8 981 vmovdqu XMMWORD PTR[rsp + 3*16], xmm9 982 vmovdqu XMMWORD PTR[rsp + 4*16], xmm10 983 vmovdqu XMMWORD PTR[rsp + 5*16], xmm11 984 vmovdqu XMMWORD PTR[rsp + 6*16], xmm12 985 vmovdqu XMMWORD PTR[rsp + 7*16], xmm13 986 vmovdqu XMMWORD PTR[rsp + 8*16], xmm14 987 vmovdqu XMMWORD PTR[rsp + 9*16], xmm15 988 989 mov rbp, rsp 990 sub rsp, 8*16 991 and rsp, -16 992 993 vmovdqu T, XMMWORD PTR[16*16 + 1*16 + Gctx] 994 vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx] 995 vmovdqu BSWAPMASK, XMMWORD PTR[Lbswap_mask] 996 mov KS, [16*16 + 3*16 + Gctx] 997 mov NR, [244 + KS] 998 999 vpshufb CTR0, CTR0, BSWAPMASK 1000 1001 mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx] 1002 mov aluKSl, [3*4 + KS] 1003 bswap aluCTR 1004 bswap aluKSl 1005 1006 vmovdqu TMP0, XMMWORD PTR[0*16 + KS] 1007 vpxor TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx] 1008 vmovdqu XMMWORD PTR[0*16 + rsp], TMP0 1009 1010 cmp len, 128 1011 jb LDecDataSingles 1012 ; Prepare the "top" counters 1013 vmovdqu XMMWORD PTR[1*16 + rsp], TMP0 1014 vmovdqu XMMWORD PTR[2*16 + rsp], TMP0 1015 vmovdqu XMMWORD PTR[3*16 + rsp], TMP0 1016 vmovdqu XMMWORD PTR[4*16 + rsp], TMP0 1017 vmovdqu XMMWORD PTR[5*16 + rsp], TMP0 1018 vmovdqu XMMWORD PTR[6*16 + rsp], TMP0 1019 vmovdqu XMMWORD PTR[7*16 + rsp], TMP0 1020 1021 NEXTCTR 1 1022 NEXTCTR 2 1023 NEXTCTR 3 1024 NEXTCTR 4 1025 NEXTCTR 5 1026 NEXTCTR 6 1027 NEXTCTR 7 1028 1029 LDecDataOctets: 1030 cmp len, 128 1031 jb LEndDecOctets 1032 sub len, 128 1033 1034 vmovdqa CTR0, XMMWORD PTR[0*16 + rsp] 1035 vmovdqa CTR1, XMMWORD PTR[1*16 + rsp] 1036 vmovdqa CTR2, XMMWORD PTR[2*16 + rsp] 1037 vmovdqa CTR3, XMMWORD PTR[3*16 + rsp] 1038 vmovdqa CTR4, XMMWORD PTR[4*16 + rsp] 1039 vmovdqa CTR5, XMMWORD PTR[5*16 + rsp] 1040 vmovdqa CTR6, XMMWORD PTR[6*16 + rsp] 1041 vmovdqa CTR7, XMMWORD PTR[7*16 + rsp] 1042 1043 vmovdqu TMP5, XMMWORD PTR[7*16 + CT] 1044 vpshufb TMP5, TMP5, BSWAPMASK 1045 vpshufd TMP4, TMP5, 78 1046 vpxor TMP4, TMP4, TMP5 1047 vpclmulqdq TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h 1048 vmovdqu TMP4, XMMWORD PTR[0*16 + Htbl] 1049 vpclmulqdq TMP1, TMP5, TMP4, 011h 1050 vpclmulqdq TMP2, TMP5, TMP4, 000h 1051 1052 vmovdqu TMP5, XMMWORD PTR[6*16 + CT] 1053 vpshufb TMP5, TMP5, BSWAPMASK 1054 ROUNDMUL 1 1055 NEXTCTR 0 1056 vmovdqu TMP5, XMMWORD PTR[5*16 + CT] 1057 vpshufb TMP5, TMP5, BSWAPMASK 1058 ROUNDMUL 2 1059 NEXTCTR 1 1060 vmovdqu TMP5, XMMWORD PTR[4*16 + CT] 1061 vpshufb TMP5, TMP5, BSWAPMASK 1062 ROUNDMUL 3 1063 NEXTCTR 2 1064 vmovdqu TMP5, XMMWORD PTR[3*16 + CT] 1065 vpshufb TMP5, TMP5, BSWAPMASK 1066 ROUNDMUL 4 1067 NEXTCTR 3 1068 vmovdqu TMP5, XMMWORD PTR[2*16 + CT] 1069 vpshufb TMP5, TMP5, BSWAPMASK 1070 ROUNDMUL 5 1071 NEXTCTR 4 1072 vmovdqu TMP5, XMMWORD PTR[1*16 + CT] 1073 vpshufb TMP5, TMP5, BSWAPMASK 1074 ROUNDMUL 6 1075 NEXTCTR 5 1076 vmovdqu TMP5, XMMWORD PTR[0*16 + CT] 1077 vpshufb TMP5, TMP5, BSWAPMASK 1078 vpxor TMP5, TMP5, T 1079 ROUNDMUL 7 1080 NEXTCTR 6 1081 1082 ROUND 8 1083 NEXTCTR 7 1084 1085 vpxor TMP0, TMP0, TMP1 1086 vpxor TMP0, TMP0, TMP2 1087 vpsrldq TMP3, TMP0, 8 1088 vpxor TMP4, TMP1, TMP3 1089 vpslldq TMP3, TMP0, 8 1090 vpxor T, TMP2, TMP3 1091 1092 vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h 1093 vpalignr T,T,T,8 1094 vpxor T, T, TMP1 1095 1096 ROUND 9 1097 1098 vpclmulqdq TMP1, T, XMMWORD PTR[Lpoly], 010h 1099 vpalignr T,T,T,8 1100 vpxor T, T, TMP1 1101 1102 vmovdqu TMP5, XMMWORD PTR[10*16 + KS] 1103 cmp NR, 10 1104 je @f 1105 1106 ROUND 10 1107 ROUND 11 1108 vmovdqu TMP5, XMMWORD PTR[12*16 + KS] 1109 cmp NR, 12 1110 je @f 1111 1112 ROUND 12 1113 ROUND 13 1114 vmovdqu TMP5, XMMWORD PTR[14*16 + KS] 1115 @@: 1116 vpxor TMP3, TMP5, XMMWORD PTR[0*16 + CT] 1117 vaesenclast CTR0, CTR0, TMP3 1118 vpxor TMP3, TMP5, XMMWORD PTR[1*16 + CT] 1119 vaesenclast CTR1, CTR1, TMP3 1120 vpxor TMP3, TMP5, XMMWORD PTR[2*16 + CT] 1121 vaesenclast CTR2, CTR2, TMP3 1122 vpxor TMP3, TMP5, XMMWORD PTR[3*16 + CT] 1123 vaesenclast CTR3, CTR3, TMP3 1124 vpxor TMP3, TMP5, XMMWORD PTR[4*16 + CT] 1125 vaesenclast CTR4, CTR4, TMP3 1126 vpxor TMP3, TMP5, XMMWORD PTR[5*16 + CT] 1127 vaesenclast CTR5, CTR5, TMP3 1128 vpxor TMP3, TMP5, XMMWORD PTR[6*16 + CT] 1129 vaesenclast CTR6, CTR6, TMP3 1130 vpxor TMP3, TMP5, XMMWORD PTR[7*16 + CT] 1131 vaesenclast CTR7, CTR7, TMP3 1132 1133 vmovdqu XMMWORD PTR[0*16 + PT], CTR0 1134 vmovdqu XMMWORD PTR[1*16 + PT], CTR1 1135 vmovdqu XMMWORD PTR[2*16 + PT], CTR2 1136 vmovdqu XMMWORD PTR[3*16 + PT], CTR3 1137 vmovdqu XMMWORD PTR[4*16 + PT], CTR4 1138 vmovdqu XMMWORD PTR[5*16 + PT], CTR5 1139 vmovdqu XMMWORD PTR[6*16 + PT], CTR6 1140 vmovdqu XMMWORD PTR[7*16 + PT], CTR7 1141 1142 vpxor T, T, TMP4 1143 1144 lea CT, [8*16 + CT] 1145 lea PT, [8*16 + PT] 1146 jmp LDecDataOctets 1147 1148 LEndDecOctets: 1149 1150 sub aluCTR, 7 1151 1152 LDecDataSingles: 1153 1154 cmp len, 16 1155 jb LDecDataTail 1156 sub len, 16 1157 1158 vmovdqa TMP1, XMMWORD PTR[0*16 + rsp] 1159 NEXTCTR 0 1160 1161 vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS] 1162 vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS] 1163 vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS] 1164 vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS] 1165 vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS] 1166 vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS] 1167 vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS] 1168 vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS] 1169 vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS] 1170 vmovdqu TMP2, XMMWORD PTR[10*16 + KS] 1171 cmp NR, 10 1172 je @f 1173 vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS] 1174 vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS] 1175 vmovdqu TMP2, XMMWORD PTR[12*16 + KS] 1176 cmp NR, 12 1177 je @f 1178 vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS] 1179 vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS] 1180 vmovdqu TMP2, XMMWORD PTR[14*16 + KS] 1181 @@: 1182 vaesenclast TMP1, TMP1, TMP2 1183 1184 vmovdqu TMP2, XMMWORD PTR[CT] 1185 vpxor TMP1, TMP1, TMP2 1186 vmovdqu XMMWORD PTR[PT], TMP1 1187 1188 lea PT, [16+PT] 1189 lea CT, [16+CT] 1190 1191 vpshufb TMP2, TMP2, BSWAPMASK 1192 vpxor T, T, TMP2 1193 vmovdqu TMP0, XMMWORD PTR[Htbl] 1194 GFMUL T, T, TMP0, TMP1, TMP2, TMP3, TMP4 1195 1196 jmp LDecDataSingles 1197 1198 LDecDataTail: 1199 1200 test len, len 1201 jz LDecDataEnd 1202 1203 vmovdqa TMP1, XMMWORD PTR[0*16 + rsp] 1204 inc aluCTR 1205 vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS] 1206 vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS] 1207 vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS] 1208 vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS] 1209 vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS] 1210 vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS] 1211 vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS] 1212 vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS] 1213 vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS] 1214 vmovdqu TMP2, XMMWORD PTR[10*16 + KS] 1215 cmp NR, 10 1216 je @f 1217 vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS] 1218 vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS] 1219 vmovdqu TMP2, XMMWORD PTR[12*16 + KS] 1220 cmp NR, 12 1221 je @f 1222 vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS] 1223 vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS] 1224 vmovdqu TMP2, XMMWORD PTR[14*16 + KS] 1225 @@: 1226 vaesenclast TMP1, TMP1, TMP2 1227 ; copy as many bytes as needed 1228 xor KS, KS 1229 @@: 1230 cmp len, KS 1231 je @f 1232 mov al, [CT + KS] 1233 mov [rsp + KS], al 1234 inc KS 1235 jmp @b 1236 @@: 1237 cmp KS, 16 1238 je @f 1239 mov BYTE PTR[rsp + KS], 0 1240 inc KS 1241 jmp @b 1242 @@: 1243 vmovdqa TMP2, XMMWORD PTR[rsp] 1244 vpshufb TMP2, TMP2, BSWAPMASK 1245 vpxor T, T, TMP2 1246 vmovdqu TMP0, XMMWORD PTR[Htbl] 1247 GFMUL T, T, TMP0, TMP5, TMP2, TMP3, TMP4 1248 1249 1250 vpxor TMP1, TMP1, XMMWORD PTR[rsp] 1251 vmovdqa XMMWORD PTR[rsp], TMP1 1252 xor KS, KS 1253 @@: 1254 cmp len, KS 1255 je @f 1256 mov al, [rsp + KS] 1257 mov [PT + KS], al 1258 inc KS 1259 jmp @b 1260 @@: 1261 1262 LDecDataEnd: 1263 1264 vmovdqu XMMWORD PTR[16*16 + 1*16 + Gctx], T 1265 bswap aluCTR 1266 mov [16*16 + 2*16 + 3*4 + Gctx], aluCTR 1267 1268 mov rsp, rbp 1269 1270 vmovdqu xmm6, XMMWORD PTR[rsp + 0*16] 1271 vmovdqu xmm7, XMMWORD PTR[rsp + 1*16] 1272 vmovdqu xmm8, XMMWORD PTR[rsp + 2*16] 1273 vmovdqu xmm9, XMMWORD PTR[rsp + 3*16] 1274 vmovdqu xmm10, XMMWORD PTR[rsp + 4*16] 1275 vmovdqu xmm11, XMMWORD PTR[rsp + 5*16] 1276 vmovdqu xmm12, XMMWORD PTR[rsp + 6*16] 1277 vmovdqu xmm13, XMMWORD PTR[rsp + 7*16] 1278 vmovdqu xmm14, XMMWORD PTR[rsp + 8*16] 1279 vmovdqu xmm15, XMMWORD PTR[rsp + 9*16] 1280 1281 add rsp, 10*16 1282 pop rbp 1283 pop r13 1284 pop r12 1285 pop r11 1286 1287 vzeroupper 1288 1289 ret 1290 ret 1291 intel_aes_gcmDEC ENDP 1292 1293 1294 END