intel-gcm-x86-masm.asm (31751B)
; LICENSE:
; This submission to NSS is to be made available under the terms of the
; Mozilla Public License, v. 2.0. You can obtain one at http:
; //mozilla.org/MPL/2.0/.
;###############################################################################
; Copyright(c) 2014, Intel Corp.
; Developers and authors:
; Shay Gueron and Vlad Krasnov
; Intel Corporation, Israel Development Centre, Haifa, Israel
; Please send feedback directly to crypto.feedback.alias@intel.com
;
; NOTE(review): 32-bit MASM, FLAT model, C (cdecl) calling convention:
; all arguments on the stack, caller cleans up; eax/ecx/edx are volatile,
; ebx/esi/edi/ebp are callee-saved (pushed/popped where used below).
; Requires AES-NI, PCLMULQDQ and AVX (all instructions use the v-prefixed
; forms; vzeroupper is issued before every return to avoid SSE/AVX
; transition penalties in the caller).

.MODEL FLAT, C
.XMM

.DATA
ALIGN 16
Lone            dq 1,0                  ; counter increment +1 (low qword)
Ltwo            dq 2,0                  ; counter increment +2 (low qword)
Lbswap_mask     db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 ; byte-reverse for vpshufb
Lshuff_mask     dq 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh
Lpoly           dq 01h, 0c200000000000000h ; GHASH reduction polynomial

.CODE

; GF(2^128) multiply DST = SRC1 * SRC2 using Karatsuba (3 pclmul) followed
; by two-step Montgomery-style reduction with Lpoly.
; Clobbers TMP1..TMP4; DST may alias SRC1/SRC2.
GFMUL MACRO DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
    vpclmulqdq  TMP1, SRC2, SRC1, 0h    ; low  = a0*b0
    vpclmulqdq  TMP4, SRC2, SRC1, 011h  ; high = a1*b1

    vpshufd     TMP2, SRC2, 78          ; swap qwords: (a1,a0)
    vpshufd     TMP3, SRC1, 78
    vpxor       TMP2, TMP2, SRC2        ; a0^a1
    vpxor       TMP3, TMP3, SRC1        ; b0^b1

    vpclmulqdq  TMP2, TMP2, TMP3, 0h    ; mid = (a0^a1)*(b0^b1)
    vpxor       TMP2, TMP2, TMP1        ; Karatsuba fixup:
    vpxor       TMP2, TMP2, TMP4        ; mid ^= low ^ high

    vpslldq     TMP3, TMP2, 8           ; fold mid into low/high halves
    vpsrldq     TMP2, TMP2, 8

    vpxor       TMP1, TMP1, TMP3
    vpxor       TMP4, TMP4, TMP2

    ; reduction, step 1
    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h
    vpshufd     TMP3, TMP1, 78
    vpxor       TMP1, TMP2, TMP3

    ; reduction, step 2
    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h
    vpshufd     TMP3, TMP1, 78
    vpxor       TMP1, TMP2, TMP3

    vpxor       DST, TMP1, TMP4
ENDM

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Generates the final GCM tag
; void intel_aes_gcmTAG(unsigned char Htbl[16*16],
;                       unsigned char *Tp,
;                       unsigned int Mlen,
;                       unsigned int Alen,
;                       unsigned char* X0,
;                       unsigned char* TAG);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmTAG PROC

Htbl textequ <eax>
Tp   textequ <ecx>
X0   textequ <edx>
TAG  textequ <ebx>

T    textequ <xmm0>
TMP0 textequ <xmm1>

    push    ebx

    ; stack: [esp] saved ebx, [esp+4] return address, args from [esp+8]
    mov     Htbl, [esp + 2*4 + 0*4]
    mov     Tp,   [esp + 2*4 + 1*4]
    mov     X0,   [esp + 2*4 + 4*4]
    mov     TAG,  [esp + 2*4 + 5*4]

    vzeroupper
    vmovdqu T, XMMWORD PTR[Tp]

    ; build the length block: (Mlen*8) | (Alen*8)<<64, loaded straight
    ; from the stack arguments (Mlen at arg2, Alen at arg3)
    vpxor   TMP0, TMP0, TMP0
    vpinsrd TMP0, TMP0, DWORD PTR[esp + 2*4 + 2*4], 0
    vpinsrd TMP0, TMP0, DWORD PTR[esp + 2*4 + 3*4], 2
    vpsllq  TMP0, TMP0, 3               ; bytes -> bits

    vpxor   T, T, TMP0
    vmovdqu TMP0, XMMWORD PTR[Htbl]     ; H (Htbl[0])
    GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5

    vpshufb T, T, [Lbswap_mask]         ; back to byte order
    vpxor   T, T, [X0]                  ; xor with E(K, Y0)
    vmovdqu XMMWORD PTR[TAG], T
    vzeroupper

    pop     ebx

    ret

intel_aes_gcmTAG ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Generates the H table
; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmINIT PROC

Htbl textequ <eax>
KS   textequ <ecx>
NR   textequ <edx>

T    textequ <xmm0>
TMP0 textequ <xmm1>

    ; no registers pushed: args start at [esp+4]
    mov     Htbl, [esp + 4*1 + 0*4]
    mov     KS,   [esp + 4*1 + 1*4]
    mov     NR,   [esp + 4*1 + 2*4]

    vzeroupper
    ; H = AES-ENC(0): run the zero block through the full key schedule
    vmovdqu T, XMMWORD PTR[KS]
    lea     KS, [16 + KS]
    dec     NR
Lenc_loop:
    vaesenc T, T, [KS]
    lea     KS, [16 + KS]
    dec     NR
    jnz     Lenc_loop

    vaesenclast T, T, [KS]
    vpshufb T, T, [Lbswap_mask]

    ; Calculate H` = GFMUL(H, 2): shift left by one bit and conditionally
    ; xor in Lpoly when the top bit was set (sign broadcast via vpsrad/vpshufd)
    vpsrad  xmm3, T, 31
    vpshufd xmm3, xmm3, 0ffh
    vpand   xmm5, xmm3, [Lpoly]
    vpsrld  xmm3, T, 31
    vpslld  xmm4, T, 1
    vpslldq xmm3, xmm3, 4               ; carry bits across dword lanes
    vpxor   T, xmm4, xmm3
    vpxor   T, T, xmm5

    vmovdqu TMP0, T                     ; keep H for repeated multiplies
    vmovdqu XMMWORD PTR[Htbl + 0*16], T

    ; second half of the table (Htbl+128..) caches (H^i lo)^(H^i hi),
    ; the Karatsuba "middle" operand, so the hot loops skip a shuffle+xor
    vpshufd xmm2, T, 78
    vpxor   xmm2, xmm2, T
    vmovdqu XMMWORD PTR[Htbl + 8*16 + 0*16], xmm2

    ; unrolled at assembly time: Htbl[i] = H^(i+1) for i = 1..7
i = 1
WHILE i LT 8
    GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5
    vmovdqu XMMWORD PTR[Htbl + i*16], T
    vpshufd xmm2, T, 78
    vpxor   xmm2, xmm2, T
    vmovdqu XMMWORD PTR[Htbl + 8*16 + i*16], xmm2
i = i+1
ENDM
    vzeroupper
    ret
intel_aes_gcmINIT ENDP


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Authenticate only
; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmAAD PROC

Htbl textequ <eax>
inp  textequ <ecx>
len  textequ <edx>
Tp   textequ <ebx>
hlp0 textequ <esi>

DATA textequ <xmm0>
T    textequ <xmm1>
TMP0 textequ <xmm2>
TMP1 textequ <xmm3>
TMP2 textequ <xmm4>
TMP3 textequ <xmm5>
TMP4 textequ <xmm6>
Xhi  textequ <xmm7>

; accumulate one block (in DATA) against H^(i+1):
; TMP0 += lo, TMP1 += hi, TMP2 += mid (Karatsuba halves kept separate
; until the aggregated reduction)
KARATSUBA_AAD MACRO i
    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 0h
    vpxor       TMP0, TMP0, TMP3
    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 011h
    vpxor       TMP1, TMP1, TMP3
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP3, TMP3, [Htbl + 8*16 + i*16], 0h
    vpxor       TMP2, TMP2, TMP3
ENDM

    ; early-out when Alen == 0; nothing pushed yet, so the third argument
    ; sits at esp + 4 (return addr) + 2*4.
    ; FIX(review): was [esp + 1*3 + 2*4] = esp+11, a misaligned read that
    ; tested the top byte of the AAD pointer fused with 3 bytes of Alen.
    cmp     DWORD PTR[esp + 1*4 + 2*4], 0
    jnz     LbeginAAD
    ret

LbeginAAD:
    push    ebx
    push    esi

    ; stack: 2 saved regs + return address, args from [esp+12]
    mov     Htbl, [esp + 4*3 + 0*4]
    mov     inp,  [esp + 4*3 + 1*4]
    mov     len,  [esp + 4*3 + 2*4]
    mov     Tp,   [esp + 4*3 + 3*4]

    vzeroupper

    vpxor   Xhi, Xhi, Xhi

    vmovdqu T, XMMWORD PTR[Tp]
    ; we hash 8 blocks each iteration; if the total amount of blocks is not
    ; a multiple of 8, we hash the first n%8 blocks first
    mov     hlp0, len
    and     hlp0, 128-1                 ; hlp0 = len mod 128
    jz      Lmod_loop

    and     len, -128                   ; len = multiple-of-128 remainder
    sub     hlp0, 16                    ; hlp0 doubles as Htbl offset: the
                                        ; k-th remaining prefix block pairs
                                        ; with H^k

    ; Prefix block (highest power of H for this partial group)
    vmovdqu DATA, XMMWORD PTR[inp]
    vpshufb DATA, DATA, [Lbswap_mask]
    vpxor   DATA, DATA, T               ; fold running tag into first block

    vpclmulqdq  TMP0, DATA, XMMWORD PTR[Htbl + hlp0], 0h
    vpclmulqdq  TMP1, DATA, XMMWORD PTR[Htbl + hlp0], 011h
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP2, TMP3, XMMWORD PTR[Htbl + 8*16 + hlp0], 0h

    lea     inp, [inp+16]
    test    hlp0, hlp0
    jnz     Lpre_loop
    jmp     Lred1

    ; hash remaining prefix blocks (up to 7 total prefix blocks)
Lpre_loop:

    sub     hlp0, 16

    vmovdqu DATA, XMMWORD PTR[inp]
    vpshufb DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP3, DATA, XMMWORD PTR[Htbl + hlp0], 0h
    vpxor       TMP0, TMP0, TMP3
    vpclmulqdq  TMP3, DATA, XMMWORD PTR[Htbl + hlp0], 011h
    vpxor       TMP1, TMP1, TMP3
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP3, TMP3, XMMWORD PTR[Htbl + 8*16 + hlp0], 0h
    vpxor       TMP2, TMP2, TMP3

    test    hlp0, hlp0
    lea     inp, [inp+16]
    jnz     Lpre_loop

Lred1:
    ; Karatsuba fixup for the prefix group: split mid into (T, Xhi)
    vpxor   TMP2, TMP2, TMP0
    vpxor   TMP2, TMP2, TMP1
    vpsrldq TMP3, TMP2, 8
    vpslldq TMP2, TMP2, 8

    vpxor   Xhi, TMP1, TMP3
    vpxor   T, TMP0, TMP2

Lmod_loop:
    ; main loop: aggregate 8 blocks per iteration, interleaving the
    ; deferred reduction of the previous T with the new multiplies
    sub     len, 16*8
    jb      Ldone
    ; Block #0 (earliest in memory pairs with highest power H^8... blocks
    ; are processed back-to-front: inp+16*7 first, against H^1)
    vmovdqu DATA, XMMWORD PTR[inp + 16*7]
    vpshufb DATA, DATA, XMMWORD PTR[Lbswap_mask]

    vpclmulqdq  TMP0, DATA, XMMWORD PTR[Htbl + 0*16], 0h
    vpclmulqdq  TMP1, DATA, XMMWORD PTR[Htbl + 0*16], 011h
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP2, TMP3, XMMWORD PTR[Htbl + 8*16 + 0*16], 0h

    ; Block #1
    vmovdqu DATA, XMMWORD PTR[inp + 16*6]
    vpshufb DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 1

    ; Block #2
    vmovdqu DATA, XMMWORD PTR[inp + 16*5]
    vpshufb DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP4, T, [Lpoly], 010h  ; reduction stage 1a
    vpalignr    T, T, T, 8

    KARATSUBA_AAD 2

    vpxor   T, T, TMP4                  ; reduction stage 1b

    ; Block #3
    vmovdqu DATA, XMMWORD PTR[inp + 16*4]
    vpshufb DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 3
    ; Block #4
    vmovdqu DATA, XMMWORD PTR[inp + 16*3]
    vpshufb DATA, DATA, [Lbswap_mask]

    vpclmulqdq  TMP4, T, [Lpoly], 010h  ; reduction stage 2a
    vpalignr    T, T, T, 8

    KARATSUBA_AAD 4

    vpxor   T, T, TMP4                  ; reduction stage 2b
    ; Block #5
    vmovdqu DATA, XMMWORD PTR[inp + 16*2]
    vpshufb DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 5

    vpxor   T, T, Xhi                   ; reduction finalize
    ; Block #6
    vmovdqu DATA, XMMWORD PTR[inp + 16*1]
    vpshufb DATA, DATA, [Lbswap_mask]
    KARATSUBA_AAD 6
    ; Block #7 (first block of the group: fold in the reduced tag, pair
    ; with the highest power H^8)
    vmovdqu DATA, XMMWORD PTR[inp + 16*0]
    vpshufb DATA, DATA, [Lbswap_mask]
    vpxor   DATA, DATA, T
    KARATSUBA_AAD 7
    ; Aggregated 8 blocks, now karatsuba fixup
    vpxor   TMP2, TMP2, TMP0
    vpxor   TMP2, TMP2, TMP1
    vpsrldq TMP3, TMP2, 8
    vpslldq TMP2, TMP2, 8

    vpxor   Xhi, TMP1, TMP3
    vpxor   T, TMP0, TMP2

    lea     inp, [inp + 16*8]
    jmp     Lmod_loop

Ldone:
    ; final two-step reduction of the outstanding (T, Xhi) pair
    vpclmulqdq  TMP4, T, [Lpoly], 010h
    vpalignr    T, T, T, 8
    vpxor       T, T, TMP4

    vpclmulqdq  TMP4, T, [Lpoly], 010h
    vpalignr    T, T, T, 8
    vpxor       T, T, TMP4

    vpxor   T, T, Xhi
    vmovdqu XMMWORD PTR[Tp], T
    vzeroupper

    pop     esi
    pop     ebx
    ret

intel_aes_gcmAAD ENDP


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Encrypt and Authenticate
; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsigned int len);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmENC PROC

PT   textequ <eax>
CT   textequ <ecx>
Htbl textequ <edx>                      ; Htbl and Gctx alias: Htbl is the
Gctx textequ <edx>                      ; first 16*16 bytes of the context
len  textequ <DWORD PTR[ebp + 5*4 + 3*4]>
KS   textequ <esi>
NR   textequ <DWORD PTR[244+KS]>        ; round count stored after the schedule

aluCTR textequ <ebx>                    ; host-endian 32-bit counter word
aluTMP textequ <edi>

T    textequ <XMMWORD PTR[16*16 + 1*16 + Gctx]> ; running GHASH state, in ctx

TMP0 textequ <xmm1>
TMP1 textequ <xmm2>
TMP2 textequ <xmm3>
TMP3 textequ <xmm4>
TMP4 textequ <xmm5>
TMP5 textequ <xmm6>

CTR0 textequ <xmm0>
CTR1 textequ <xmm1>
CTR2 textequ <xmm2>
CTR3 textequ <xmm3>
CTR4 textequ <xmm4>
CTR5 textequ <xmm5>
CTR6 textequ <xmm6>

; one AES round over all 7 counter blocks, key broadcast through xmm7
ROUND MACRO i
    vmovdqu xmm7, XMMWORD PTR[i*16 + KS]
    vaesenc CTR0, CTR0, xmm7
    vaesenc CTR1, CTR1, xmm7
    vaesenc CTR2, CTR2, xmm7
    vaesenc CTR3, CTR3, xmm7
    vaesenc CTR4, CTR4, xmm7
    vaesenc CTR5, CTR5, xmm7
    vaesenc CTR6, CTR6, xmm7
ENDM

; accumulate TMP5 * H^(i+1) into the (TMP0=mid, TMP1=hi, TMP2=lo) halves
KARATSUBA MACRO i
    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP4, TMP5
    vpclmulqdq  TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
    vpxor       TMP0, TMP0, TMP3
    vmovdqu     TMP4, XMMWORD PTR[i*16 + Htbl]
    vpclmulqdq  TMP3, TMP5, TMP4, 011h
    vpxor       TMP1, TMP1, TMP3
    vpclmulqdq  TMP3, TMP5, TMP4, 000h
    vpxor       TMP2, TMP2, TMP3
ENDM

; bump the counter in a GPR and patch only the last dword of the
; pre-whitened counter block at [esp + 8*16 + i*16] (the first 12 bytes
; never change); the xor with KS[12..15] re-applies round-key 0
NEXTCTR MACRO i
    add     aluCTR, 1
    mov     aluTMP, aluCTR
    bswap   aluTMP
    xor     aluTMP, [3*4 + KS]
    mov     [3*4 + 8*16 + i*16 + esp], aluTMP
ENDM

    ; early-out when len == 0 (4th arg; nothing pushed yet)
    cmp     DWORD PTR[1*4 + 3*4 + esp], 0
    jne     LbeginENC
    ret

LbeginENC:

    vzeroupper
    push    ebp
    push    ebx
    push    esi
    push    edi

    mov     ebp, esp
    sub     esp, 16*16                  ; 8 scratch + 7 counter slots
    and     esp, -16                    ; align for vmovdqa

    mov     PT,   [ebp + 5*4 + 0*4]
    mov     CT,   [ebp + 5*4 + 1*4]
    mov     Gctx, [ebp + 5*4 + 2*4]

    mov     KS, [16*16 + 3*16 + Gctx]

    mov     aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
    bswap   aluCTR

    ; pre-whiten the IV/counter block with round key 0 and park it
    vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
    vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vmovdqu XMMWORD PTR[8*16 + 0*16 + esp], TMP0

    cmp     len, 16*7
    jb      LEncDataSingles
    ; Prepare the "top" counters
    vmovdqu XMMWORD PTR[8*16 + 1*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 2*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 3*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 4*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 5*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 6*16 + esp], TMP0

    vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
    ; Encrypt the initial 7 blocks
    sub     len, 16*7
    vpaddd  CTR1, CTR0, XMMWORD PTR[Lone]
    vpaddd  CTR2, CTR0, XMMWORD PTR[Ltwo]
    vpaddd  CTR3, CTR2, XMMWORD PTR[Lone]
    vpaddd  CTR4, CTR2, XMMWORD PTR[Ltwo]
    vpaddd  CTR5, CTR4, XMMWORD PTR[Lone]
    vpaddd  CTR6, CTR4, XMMWORD PTR[Ltwo]

    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR6, CTR6, XMMWORD PTR[Lbswap_mask]

    vmovdqu xmm7, XMMWORD PTR[0*16 + KS]
    vpxor   CTR0, CTR0, xmm7
    vpxor   CTR1, CTR1, xmm7
    vpxor   CTR2, CTR2, xmm7
    vpxor   CTR3, CTR3, xmm7
    vpxor   CTR4, CTR4, xmm7
    vpxor   CTR5, CTR5, xmm7
    vpxor   CTR6, CTR6, xmm7

    ROUND   1

    ; advance the ALU copy of the counter past the 7 in-flight blocks and
    ; refresh the parked counter slots while AES rounds are executing
    add     aluCTR, 7
    mov     aluTMP, aluCTR
    bswap   aluTMP
    xor     aluTMP, [KS + 3*4]
    mov     [8*16 + 0*16 + 3*4 + esp], aluTMP

    ROUND   2
    NEXTCTR 1
    ROUND   3
    NEXTCTR 2
    ROUND   4
    NEXTCTR 3
    ROUND   5
    NEXTCTR 4
    ROUND   6
    NEXTCTR 5
    ROUND   7
    NEXTCTR 6
    ROUND   8
    ROUND   9
    vmovdqu xmm7, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10                      ; AES-128 / 192 / 256 dispatch
    je      @f

    ROUND   10
    ROUND   11
    vmovdqu xmm7, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f

    ROUND   12
    ROUND   13
    vmovdqu xmm7, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast CTR0, CTR0, xmm7
    vaesenclast CTR1, CTR1, xmm7
    vaesenclast CTR2, CTR2, xmm7
    vaesenclast CTR3, CTR3, xmm7
    vaesenclast CTR4, CTR4, xmm7
    vaesenclast CTR5, CTR5, xmm7
    vaesenclast CTR6, CTR6, xmm7

    vpxor   CTR0, CTR0, XMMWORD PTR[0*16 + PT]
    vpxor   CTR1, CTR1, XMMWORD PTR[1*16 + PT]
    vpxor   CTR2, CTR2, XMMWORD PTR[2*16 + PT]
    vpxor   CTR3, CTR3, XMMWORD PTR[3*16 + PT]
    vpxor   CTR4, CTR4, XMMWORD PTR[4*16 + PT]
    vpxor   CTR5, CTR5, XMMWORD PTR[5*16 + PT]
    vpxor   CTR6, CTR6, XMMWORD PTR[6*16 + PT]

    vmovdqu XMMWORD PTR[0*16 + CT], CTR0
    vmovdqu XMMWORD PTR[1*16 + CT], CTR1
    vmovdqu XMMWORD PTR[2*16 + CT], CTR2
    vmovdqu XMMWORD PTR[3*16 + CT], CTR3
    vmovdqu XMMWORD PTR[4*16 + CT], CTR4
    vmovdqu XMMWORD PTR[5*16 + CT], CTR5
    vmovdqu XMMWORD PTR[6*16 + CT], CTR6

    ; stash the byte-reversed ciphertext blocks for hashing next iteration;
    ; TMP5 (= xmm6 = CTR6) carries the newest block into the GHASH chain
    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
    vpshufb TMP5, CTR6, XMMWORD PTR[Lbswap_mask]

    vmovdqa XMMWORD PTR[1*16 + esp], CTR5
    vmovdqa XMMWORD PTR[2*16 + esp], CTR4
    vmovdqa XMMWORD PTR[3*16 + esp], CTR3
    vmovdqa XMMWORD PTR[4*16 + esp], CTR2
    vmovdqa XMMWORD PTR[5*16 + esp], CTR1
    vmovdqa XMMWORD PTR[6*16 + esp], CTR0

    lea     CT, [7*16 + CT]
    lea     PT, [7*16 + PT]
    jmp     LEncData7

LEncData7:
    ; steady state: hash the previous 7 ciphertext blocks while
    ; encrypting the next 7 counter blocks
    cmp     len, 16*7
    jb      LEndEnc7
    sub     len, 16*7

    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP4, TMP5
    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
    vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
    vpclmulqdq  TMP1, TMP5, TMP4, 011h
    vpclmulqdq  TMP2, TMP5, TMP4, 000h

    vmovdqu TMP5, XMMWORD PTR[1*16 + esp]
    KARATSUBA 1
    vmovdqu TMP5, XMMWORD PTR[2*16 + esp]
    KARATSUBA 2
    vmovdqu TMP5, XMMWORD PTR[3*16 + esp]
    KARATSUBA 3
    vmovdqu TMP5, XMMWORD PTR[4*16 + esp]
    KARATSUBA 4
    vmovdqu TMP5, XMMWORD PTR[5*16 + esp]
    KARATSUBA 5
    vmovdqu TMP5, XMMWORD PTR[6*16 + esp]
    vpxor   TMP5, TMP5, T               ; fold running tag into oldest block
    KARATSUBA 6

    ; Karatsuba fixup + two-step reduction -> new T
    vpxor   TMP0, TMP0, TMP1
    vpxor   TMP0, TMP0, TMP2
    vpsrldq TMP3, TMP0, 8
    vpxor   TMP4, TMP1, TMP3
    vpslldq TMP3, TMP0, 8
    vpxor   TMP5, TMP2, TMP3

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5,TMP5,TMP5,8
    vpxor       TMP5, TMP5, TMP1

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5,TMP5,TMP5,8
    vpxor       TMP5, TMP5, TMP1

    vpxor   TMP5, TMP5, TMP4
    vmovdqu T, TMP5

    ; load the pre-whitened counters prepared by NEXTCTR
    vmovdqa CTR0, XMMWORD PTR[8*16 + 0*16 + esp]
    vmovdqa CTR1, XMMWORD PTR[8*16 + 1*16 + esp]
    vmovdqa CTR2, XMMWORD PTR[8*16 + 2*16 + esp]
    vmovdqa CTR3, XMMWORD PTR[8*16 + 3*16 + esp]
    vmovdqa CTR4, XMMWORD PTR[8*16 + 4*16 + esp]
    vmovdqa CTR5, XMMWORD PTR[8*16 + 5*16 + esp]
    vmovdqa CTR6, XMMWORD PTR[8*16 + 6*16 + esp]

    ROUND   1
    NEXTCTR 0
    ROUND   2
    NEXTCTR 1
    ROUND   3
    NEXTCTR 2
    ROUND   4
    NEXTCTR 3
    ROUND   5
    NEXTCTR 4
    ROUND   6
    NEXTCTR 5
    ROUND   7
    NEXTCTR 6

    ROUND   8
    ROUND   9

    vmovdqu xmm7, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      @f

    ROUND   10
    ROUND   11
    vmovdqu xmm7, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f

    ROUND   12
    ROUND   13
    vmovdqu xmm7, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast CTR0, CTR0, xmm7
    vaesenclast CTR1, CTR1, xmm7
    vaesenclast CTR2, CTR2, xmm7
    vaesenclast CTR3, CTR3, xmm7
    vaesenclast CTR4, CTR4, xmm7
    vaesenclast CTR5, CTR5, xmm7
    vaesenclast CTR6, CTR6, xmm7

    vpxor   CTR0, CTR0, XMMWORD PTR[0*16 + PT]
    vpxor   CTR1, CTR1, XMMWORD PTR[1*16 + PT]
    vpxor   CTR2, CTR2, XMMWORD PTR[2*16 + PT]
    vpxor   CTR3, CTR3, XMMWORD PTR[3*16 + PT]
    vpxor   CTR4, CTR4, XMMWORD PTR[4*16 + PT]
    vpxor   CTR5, CTR5, XMMWORD PTR[5*16 + PT]
    vpxor   CTR6, CTR6, XMMWORD PTR[6*16 + PT]

    vmovdqu XMMWORD PTR[0*16 + CT], CTR0
    vmovdqu XMMWORD PTR[1*16 + CT], CTR1
    vmovdqu XMMWORD PTR[2*16 + CT], CTR2
    vmovdqu XMMWORD PTR[3*16 + CT], CTR3
    vmovdqu XMMWORD PTR[4*16 + CT], CTR4
    vmovdqu XMMWORD PTR[5*16 + CT], CTR5
    vmovdqu XMMWORD PTR[6*16 + CT], CTR6

    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
    vpshufb TMP5, CTR6, XMMWORD PTR[Lbswap_mask]

    vmovdqa XMMWORD PTR[1*16 + esp], CTR5
    vmovdqa XMMWORD PTR[2*16 + esp], CTR4
    vmovdqa XMMWORD PTR[3*16 + esp], CTR3
    vmovdqa XMMWORD PTR[4*16 + esp], CTR2
    vmovdqa XMMWORD PTR[5*16 + esp], CTR1
    vmovdqa XMMWORD PTR[6*16 + esp], CTR0

    lea     CT, [7*16 + CT]
    lea     PT, [7*16 + PT]
    jmp     LEncData7

LEndEnc7:
    ; hash the last 7 stashed ciphertext blocks (no more bulk encryption)
    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP4, TMP5
    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
    vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
    vpclmulqdq  TMP1, TMP5, TMP4, 011h
    vpclmulqdq  TMP2, TMP5, TMP4, 000h

    vmovdqu TMP5, XMMWORD PTR[1*16 + esp]
    KARATSUBA 1
    vmovdqu TMP5, XMMWORD PTR[2*16 + esp]
    KARATSUBA 2
    vmovdqu TMP5, XMMWORD PTR[3*16 + esp]
    KARATSUBA 3
    vmovdqu TMP5, XMMWORD PTR[4*16 + esp]
    KARATSUBA 4
    vmovdqu TMP5, XMMWORD PTR[5*16 + esp]
    KARATSUBA 5
    vmovdqu TMP5, XMMWORD PTR[6*16 + esp]
    vpxor   TMP5, TMP5, T
    KARATSUBA 6

    vpxor   TMP0, TMP0, TMP1
    vpxor   TMP0, TMP0, TMP2
    vpsrldq TMP3, TMP0, 8
    vpxor   TMP4, TMP1, TMP3
    vpslldq TMP3, TMP0, 8
    vpxor   TMP5, TMP2, TMP3

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5,TMP5,TMP5,8
    vpxor       TMP5, TMP5, TMP1

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5,TMP5,TMP5,8
    vpxor       TMP5, TMP5, TMP1

    vpxor   TMP5, TMP5, TMP4
    vmovdqu T, TMP5

    sub     aluCTR, 6                   ; counters 1..6 were prepared but
                                        ; not consumed; rewind the ALU copy

LEncDataSingles:
    ; one block at a time: encrypt counter, xor, then GHASH the result
    cmp     len, 16
    jb      LEncDataTail
    sub     len, 16

    vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + esp]
    NEXTCTR 0

    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast TMP1, TMP1, TMP2
    vpxor   TMP1, TMP1, XMMWORD PTR[PT]
    vmovdqu XMMWORD PTR[CT], TMP1

    lea     PT, [16+PT]
    lea     CT, [16+CT]

    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
    vpxor   TMP1, TMP1, T

    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
    vmovdqu T, TMP1

    jmp     LEncDataSingles

LEncDataTail:
    ; final partial block (< 16 bytes): encrypt counter, pad plaintext
    ; with zeroes in the scratch slot, xor, copy len bytes out, zero-pad
    ; the ciphertext copy, then GHASH it
    cmp     len, 0
    je      LEncDataEnd

    vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + esp]

    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast TMP1, TMP1, TMP2
    ; zero a temp location
    vpxor   TMP2, TMP2, TMP2
    vmovdqa XMMWORD PTR[esp], TMP2
    ; copy as many bytes as needed (KS/esi is free now; edx is saved in
    ; aluTMP because dl is used as the byte shuttle)
    xor     KS, KS
    mov     aluTMP, edx
@@:
    cmp     len, KS
    je      @f
    mov     dl, BYTE PTR[PT + KS]
    mov     BYTE PTR[esp + KS], dl
    inc     KS
    jmp     @b
@@:
    vpxor   TMP1, TMP1, XMMWORD PTR[esp]
    vmovdqa XMMWORD PTR[esp], TMP1
    xor     KS, KS
@@:
    cmp     len, KS
    je      @f
    mov     dl, BYTE PTR[esp + KS]
    mov     BYTE PTR[CT + KS], dl
    inc     KS
    jmp     @b
@@:
    cmp     KS, 16
    je      @f
    mov     BYTE PTR[esp + KS], 0       ; zero-pad keystream residue so only
    inc     KS                          ; the real ciphertext bytes are hashed
    jmp     @b
@@:
    mov     edx, aluTMP
    vmovdqa TMP1, XMMWORD PTR[esp]
    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
    vpxor   TMP1, TMP1, T

    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
    vmovdqu T, TMP1

LEncDataEnd:
    ; write back the next counter value (big-endian) into the context
    inc     aluCTR
    bswap   aluCTR
    mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR

    mov     esp, ebp
    pop     edi
    pop     esi
    pop     ebx
    pop     ebp

    vzeroupper

    ret
intel_aes_gcmENC ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Decrypt and Authenticate
; void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx, unsigned int len);
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Deliberate redefinition of NEXTCTR for the DEC path: the decrypt frame is
; only 8*16 bytes, so counter slots live at [esp + i*16] without the 8*16
; bias used by the ENC version. All other textequ symbols (PT/CT/Gctx/len/
; KS/NR/aluCTR/aluTMP/TMPx/CTRx and the ROUND/KARATSUBA macros) carry over
; from intel_aes_gcmENC above.
NEXTCTR MACRO i
    add     aluCTR, 1
    mov     aluTMP, aluCTR
    bswap   aluTMP
    xor     aluTMP, [3*4 + KS]
    mov     [3*4 + i*16 + esp], aluTMP
ENDM

intel_aes_gcmDEC PROC
    ; early-out when len == 0 (4th arg; nothing pushed yet)
    cmp     DWORD PTR[1*4 + 3*4 + esp], 0
    jne     LbeginDEC
    ret

LbeginDEC:

    vzeroupper
    push    ebp
    push    ebx
    push    esi
    push    edi

    mov     ebp, esp
    sub     esp, 8*16                   ; DEC needs only the 7 counter slots
    and     esp, -16                    ; plus scratch; aligned for vmovdqa

    ; note the swap vs ENC: arg0 is CT (input), arg1 is PT (output)
    mov     CT,   [ebp + 5*4 + 0*4]
    mov     PT,   [ebp + 5*4 + 1*4]
    mov     Gctx, [ebp + 5*4 + 2*4]

    mov     KS, [16*16 + 3*16 + Gctx]

    mov     aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
    bswap   aluCTR

    vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
    vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vmovdqu XMMWORD PTR[0*16 + esp], TMP0

    cmp     len, 16*7
    jb      LDecDataSingles
    vmovdqu XMMWORD PTR[1*16 + esp], TMP0
    vmovdqu XMMWORD PTR[2*16 + esp], TMP0
    vmovdqu XMMWORD PTR[3*16 + esp], TMP0
    vmovdqu XMMWORD PTR[4*16 + esp], TMP0
    vmovdqu XMMWORD PTR[5*16 + esp], TMP0
    vmovdqu XMMWORD PTR[6*16 + esp], TMP0
    dec     aluCTR                      ; NEXTCTR 0 below re-adds 1 before
                                        ; the first counter is consumed

LDecData7:
    ; decrypt: the ciphertext is already available, so each group's 7
    ; blocks are hashed up front, then the counters are encrypted
    cmp     len, 16*7
    jb      LDecData7End
    sub     len, 16*7

    vmovdqu TMP5, XMMWORD PTR[0*16 + CT]
    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    vpxor   TMP5, TMP5, T               ; fold running tag into first block
    vpshufd     TMP4, TMP5, 78
    vpxor       TMP4, TMP4, TMP5
    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[6*16 + 8*16 + Htbl], 000h
    vmovdqu     TMP4, XMMWORD PTR[6*16 + Htbl]
    vpclmulqdq  TMP1, TMP5, TMP4, 011h
    vpclmulqdq  TMP2, TMP5, TMP4, 000h

    NEXTCTR 0
    vmovdqu TMP5, XMMWORD PTR[1*16 + CT]
    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA 5
    NEXTCTR 1
    vmovdqu TMP5, XMMWORD PTR[2*16 + CT]
    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA 4
    NEXTCTR 2
    vmovdqu TMP5, XMMWORD PTR[3*16 + CT]
    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA 3
    NEXTCTR 3
    vmovdqu TMP5, XMMWORD PTR[4*16 + CT]
    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA 2
    NEXTCTR 4
    vmovdqu TMP5, XMMWORD PTR[5*16 + CT]
    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA 1
    NEXTCTR 5
    vmovdqu TMP5, XMMWORD PTR[6*16 + CT]
    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    KARATSUBA 0
    NEXTCTR 6

    ; Karatsuba fixup + two-step reduction -> new T
    vpxor   TMP0, TMP0, TMP1
    vpxor   TMP0, TMP0, TMP2
    vpsrldq TMP3, TMP0, 8
    vpxor   TMP4, TMP1, TMP3
    vpslldq TMP3, TMP0, 8
    vpxor   TMP5, TMP2, TMP3

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5,TMP5,TMP5,8
    vpxor       TMP5, TMP5, TMP1

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5,TMP5,TMP5,8
    vpxor       TMP5, TMP5, TMP1

    vpxor   TMP5, TMP5, TMP4
    vmovdqu T, TMP5

    vmovdqa CTR0, XMMWORD PTR[0*16 + esp]
    vmovdqa CTR1, XMMWORD PTR[1*16 + esp]
    vmovdqa CTR2, XMMWORD PTR[2*16 + esp]
    vmovdqa CTR3, XMMWORD PTR[3*16 + esp]
    vmovdqa CTR4, XMMWORD PTR[4*16 + esp]
    vmovdqa CTR5, XMMWORD PTR[5*16 + esp]
    vmovdqa CTR6, XMMWORD PTR[6*16 + esp]

    ROUND   1
    ROUND   2
    ROUND   3
    ROUND   4
    ROUND   5
    ROUND   6
    ROUND   7
    ROUND   8
    ROUND   9
    vmovdqu xmm7, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      @f

    ROUND   10
    ROUND   11
    vmovdqu xmm7, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f

    ROUND   12
    ROUND   13
    vmovdqu xmm7, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast CTR0, CTR0, xmm7
    vaesenclast CTR1, CTR1, xmm7
    vaesenclast CTR2, CTR2, xmm7
    vaesenclast CTR3, CTR3, xmm7
    vaesenclast CTR4, CTR4, xmm7
    vaesenclast CTR5, CTR5, xmm7
    vaesenclast CTR6, CTR6, xmm7

    vpxor   CTR0, CTR0, XMMWORD PTR[0*16 + CT]
    vpxor   CTR1, CTR1, XMMWORD PTR[1*16 + CT]
    vpxor   CTR2, CTR2, XMMWORD PTR[2*16 + CT]
    vpxor   CTR3, CTR3, XMMWORD PTR[3*16 + CT]
    vpxor   CTR4, CTR4, XMMWORD PTR[4*16 + CT]
    vpxor   CTR5, CTR5, XMMWORD PTR[5*16 + CT]
    vpxor   CTR6, CTR6, XMMWORD PTR[6*16 + CT]

    vmovdqu XMMWORD PTR[0*16 + PT], CTR0
    vmovdqu XMMWORD PTR[1*16 + PT], CTR1
    vmovdqu XMMWORD PTR[2*16 + PT], CTR2
    vmovdqu XMMWORD PTR[3*16 + PT], CTR3
    vmovdqu XMMWORD PTR[4*16 + PT], CTR4
    vmovdqu XMMWORD PTR[5*16 + PT], CTR5
    vmovdqu XMMWORD PTR[6*16 + PT], CTR6

    lea     CT, [7*16 + CT]
    lea     PT, [7*16 + PT]
    jmp     LDecData7

LDecData7End:

    NEXTCTR 0                           ; refresh counter slot 0 for singles

LDecDataSingles:
    ; one block at a time: GHASH the ciphertext first, then decrypt it
    cmp     len, 16
    jb      LDecDataTail
    sub     len, 16

    vmovdqu TMP1, XMMWORD PTR[CT]
    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
    vpxor   TMP1, TMP1, T

    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
    vmovdqu T, TMP1

    vmovdqa TMP1, XMMWORD PTR[0*16 + esp]
    NEXTCTR 0

    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast TMP1, TMP1, TMP2
    vpxor   TMP1, TMP1, XMMWORD PTR[CT]
    vmovdqu XMMWORD PTR[PT], TMP1

    lea     PT, [16+PT]
    lea     CT, [16+CT]
    jmp     LDecDataSingles

LDecDataTail:
    ; final partial block: copy len ciphertext bytes into scratch,
    ; zero-pad, GHASH the padded block, then xor with the keystream
    ; (kept in xmm7) and copy len plaintext bytes out
    cmp     len, 0
    je      LDecDataEnd

    vmovdqa TMP1, XMMWORD PTR[0*16 + esp]
    inc     aluCTR
    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f
    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast xmm7, TMP1, TMP2        ; keystream parked in xmm7

    ; copy as many bytes as needed (KS/esi is free now; edx saved in aluTMP)
    xor     KS, KS
    mov     aluTMP, edx
@@:
    cmp     len, KS
    je      @f
    mov     dl, BYTE PTR[CT + KS]
    mov     BYTE PTR[esp + KS], dl
    inc     KS
    jmp     @b
@@:
    cmp     KS, 16
    je      @f
    mov     BYTE PTR[esp + KS], 0
    inc     KS
    jmp     @b
@@:
    mov     edx, aluTMP
    vmovdqa TMP1, XMMWORD PTR[esp]
    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
    vpxor   TMP1, TMP1, T

    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
    vmovdqu T, TMP1

    vpxor   xmm7, xmm7, XMMWORD PTR[esp]
    vmovdqa XMMWORD PTR[esp], xmm7
    xor     KS, KS
    mov     aluTMP, edx
@@:
    cmp     len, KS
    je      @f
    mov     dl, BYTE PTR[esp + KS]
    mov     BYTE PTR[PT + KS], dl
    inc     KS
    jmp     @b
@@:
    mov     edx, aluTMP

LDecDataEnd:
    ; write back the next counter value (big-endian) into the context
    bswap   aluCTR
    mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR

    mov     esp, ebp
    pop     edi
    pop     esi
    pop     ebx
    pop     ebp

    vzeroupper

    ret
intel_aes_gcmDEC ENDP


END