# LICENSE:
# This submission to NSS is to be made available under the terms of the
# Mozilla Public License, v. 2.0. You can obtain one at
# http://mozilla.org/MPL/2.0/.
################################################################################
# Copyright(c) 2012, Intel Corp.

.section .rodata
.align 16
.Lone:
    .quad 1,0
.Ltwo:
    .quad 2,0
.Lbswap_mask:
    .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lshuff_mask:
    .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.Lpoly:
    .quad 0x1, 0xc200000000000000

.section .text

################################################################################
# Generates the final GCM tag
# void intel_aes_gcmTAG(uint8_t Htbl[16*16], uint8_t *Tp, uint64_t Mlen, uint64_t Alen, uint8_t *X0, uint8_t *TAG);
.type intel_aes_gcmTAG,@function
.globl intel_aes_gcmTAG
.align 16
intel_aes_gcmTAG:

.set Htbl, %rdi
.set Tp, %rsi
.set Mlen, %rdx
.set Alen, %rcx
.set X0, %r8
.set TAG, %r9

.set T, %xmm0
.set TMP0, %xmm1

    vmovdqu (Tp), T
    vpshufb .Lbswap_mask(%rip), T, T
    vpxor TMP0, TMP0, TMP0
    shl $3, Mlen
    shl $3, Alen
    vpinsrq $0, Mlen, TMP0, TMP0
    vpinsrq $1, Alen, TMP0, TMP0
    vpxor TMP0, T, T
    vmovdqu (Htbl), TMP0
    call GFMUL
    vpshufb .Lbswap_mask(%rip), T, T
    vpxor (X0), T, T
    vmovdqu T, (TAG)

    ret
.size intel_aes_gcmTAG, .-intel_aes_gcmTAG
################################################################################
# Generates the H table
# void intel_aes_gcmINIT(uint8_t Htbl[16*16], uint8_t *KS, int NR);
.type intel_aes_gcmINIT,@function
.globl intel_aes_gcmINIT
.align 16
intel_aes_gcmINIT:

.set Htbl, %rdi
.set KS, %rsi
.set NR, %edx

.set T, %xmm0
.set TMP0, %xmm1

CALCULATE_POWERS_OF_H:
    vmovdqu 16*0(KS), T
    vaesenc 16*1(KS), T, T
    vaesenc 16*2(KS), T, T
    vaesenc 16*3(KS), T, T
    vaesenc 16*4(KS), T, T
    vaesenc 16*5(KS), T, T
    vaesenc 16*6(KS), T, T
    vaesenc 16*7(KS), T, T
    vaesenc 16*8(KS), T, T
    vaesenc 16*9(KS), T, T
    vmovdqu 16*10(KS), TMP0
    cmp $10, NR
    je .LH0done
    vaesenc 16*10(KS), T, T
    vaesenc 16*11(KS), T, T
    vmovdqu 16*12(KS), TMP0
    cmp $12, NR
    je .LH0done
    vaesenc 16*12(KS), T, T
    vaesenc 16*13(KS), T, T
    vmovdqu 16*14(KS), TMP0

.LH0done:
    vaesenclast TMP0, T, T

    vpshufb .Lbswap_mask(%rip), T, T

    vmovdqu T, TMP0
    # Calculate H' = GFMUL(H, 2)
    vpsrld $7, T, %xmm3
    vmovdqu .Lshuff_mask(%rip), %xmm4
    vpshufb %xmm4, %xmm3, %xmm3
    movq $0xff00, %rax
    vmovq %rax, %xmm4
    vpshufb %xmm3, %xmm4, %xmm4
    vmovdqu .Lpoly(%rip), %xmm5
    vpand %xmm4, %xmm5, %xmm5
    vpsrld $31, T, %xmm3
    vpslld $1, T, %xmm4
    vpslldq $4, %xmm3, %xmm3
    vpxor %xmm3, %xmm4, T # T now holds p(x)<<1

    # adding p(x)<<1 to xmm5
    vpxor %xmm5, T, T
    vmovdqu T, TMP0
    vmovdqu T, (Htbl)     # H * 2
    call GFMUL
    vmovdqu T, 16(Htbl)   # H^2 * 2
    call GFMUL
    vmovdqu T, 32(Htbl)   # H^3 * 2
    call GFMUL
    vmovdqu T, 48(Htbl)   # H^4 * 2
    call GFMUL
    vmovdqu T, 64(Htbl)   # H^5 * 2
    call GFMUL
    vmovdqu T, 80(Htbl)   # H^6 * 2
    call GFMUL
    vmovdqu T, 96(Htbl)   # H^7 * 2
    call GFMUL
    vmovdqu T, 112(Htbl)  # H^8 * 2

    # Precalculations for the reduce 4 step
    vpshufd $78, (Htbl), %xmm8
    vpshufd $78, 16(Htbl), %xmm9
    vpshufd $78, 32(Htbl), %xmm10
    vpshufd $78, 48(Htbl), %xmm11
    vpshufd $78, 64(Htbl), %xmm12
    vpshufd $78, 80(Htbl), %xmm13
    vpshufd $78, 96(Htbl), %xmm14
    vpshufd $78, 112(Htbl), %xmm15

    vpxor (Htbl), %xmm8, %xmm8
    vpxor 16(Htbl), %xmm9, %xmm9
    vpxor 32(Htbl), %xmm10, %xmm10
    vpxor 48(Htbl), %xmm11, %xmm11
    vpxor 64(Htbl), %xmm12, %xmm12
    vpxor 80(Htbl), %xmm13, %xmm13
    vpxor 96(Htbl), %xmm14, %xmm14
    vpxor 112(Htbl), %xmm15, %xmm15

    vmovdqu %xmm8, 128(Htbl)
    vmovdqu %xmm9, 144(Htbl)
    vmovdqu %xmm10, 160(Htbl)
    vmovdqu %xmm11, 176(Htbl)
    vmovdqu %xmm12, 192(Htbl)
    vmovdqu %xmm13, 208(Htbl)
    vmovdqu %xmm14, 224(Htbl)
    vmovdqu %xmm15, 240(Htbl)

    ret
.size intel_aes_gcmINIT, .-intel_aes_gcmINIT
################################################################################
# Authenticate only
# void intel_aes_gcmAAD(uint8_t Htbl[16*16], uint8_t *AAD, uint64_t Alen, uint8_t *Tp);

.globl intel_aes_gcmAAD
.type intel_aes_gcmAAD,@function
.align 16
intel_aes_gcmAAD:

.set DATA, %xmm0
.set T, %xmm1
.set BSWAP_MASK, %xmm2
.set TMP0, %xmm3
.set TMP1, %xmm4
.set TMP2, %xmm5
.set TMP3, %xmm6
.set TMP4, %xmm7
.set Xhi, %xmm9

.set Htbl, %rdi
.set inp, %rsi
.set len, %rdx
.set Tp, %rcx

.set hlp0, %r11

.macro KARATSUBA_AAD i
    vpclmulqdq $0x00, 16*\i(Htbl), DATA, TMP3
    vpxor TMP3, TMP0, TMP0
    vpclmulqdq $0x11, 16*\i(Htbl), DATA, TMP3
    vpxor TMP3, TMP1, TMP1
    vpshufd $78, DATA, TMP3
    vpxor DATA, TMP3, TMP3
    vpclmulqdq $0x00, 16*(\i+8)(Htbl), TMP3, TMP3
    vpxor TMP3, TMP2, TMP2
.endm

    test len, len
    jnz .LbeginAAD
    ret

.LbeginAAD:

    push hlp0
    vzeroupper

    vmovdqa .Lbswap_mask(%rip), BSWAP_MASK

    vpxor Xhi, Xhi, Xhi

    vmovdqu (Tp), T
    vpshufb BSWAP_MASK, T, T

    # We hash 8 blocks each iteration; if the total number of blocks is not a
    # multiple of 8, we hash the first n%8 blocks first
    mov len, hlp0
    and $~-128, hlp0

    jz .Lmod_loop

    sub hlp0, len
    sub $16, hlp0

    # hash first prefix block
    vmovdqu (inp), DATA
    vpshufb BSWAP_MASK, DATA, DATA
    vpxor T, DATA, DATA

    vpclmulqdq $0x00, (Htbl, hlp0), DATA, TMP0
    vpclmulqdq $0x11, (Htbl, hlp0), DATA, TMP1
    vpshufd $78, DATA, TMP2
    vpxor DATA, TMP2, TMP2
    vpclmulqdq $0x00, 16*8(Htbl, hlp0), TMP2, TMP2

    lea 16(inp), inp
    test hlp0, hlp0
    jnz .Lpre_loop
    jmp .Lred1

    # hash remaining prefix blocks (up to 7 total prefix blocks)
.align 64
.Lpre_loop:

    sub $16, hlp0

    vmovdqu (inp), DATA # next data block
    vpshufb BSWAP_MASK, DATA, DATA

    vpclmulqdq $0x00, (Htbl, hlp0), DATA, TMP3
    vpxor TMP3, TMP0, TMP0
    vpclmulqdq $0x11, (Htbl, hlp0), DATA, TMP3
    vpxor TMP3, TMP1, TMP1
    vpshufd $78, DATA, TMP3
    vpxor DATA, TMP3, TMP3
    vpclmulqdq $0x00, 16*8(Htbl, hlp0), TMP3, TMP3
    vpxor TMP3, TMP2, TMP2

    test hlp0, hlp0

    lea 16(inp), inp

    jnz .Lpre_loop

.Lred1:
    vpxor TMP0, TMP2, TMP2
    vpxor TMP1, TMP2, TMP2
    vpsrldq $8, TMP2, TMP3
    vpslldq $8, TMP2, TMP2

    vpxor TMP3, TMP1, Xhi
    vpxor TMP2, TMP0, T

.align 64
.Lmod_loop:
    sub $0x80, len
    jb .Ldone

    vmovdqu 16*7(inp), DATA # Ii
    vpshufb BSWAP_MASK, DATA, DATA

    vpclmulqdq $0x00, (Htbl), DATA, TMP0
    vpclmulqdq $0x11, (Htbl), DATA, TMP1
    vpshufd $78, DATA, TMP2
    vpxor DATA, TMP2, TMP2
    vpclmulqdq $0x00, 16*8(Htbl), TMP2, TMP2
    #########################################################
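    # The loads below fold the remaining seven blocks of this batch into the
    # Karatsuba accumulators TMP0/TMP1/TMP2, each block multiplied by the next
    # power of H from Htbl; the two-stage reduction of the previous iteration's
    # T is interleaved between them (stages 1a/1b, 2a/2b, finalize).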
    vmovdqu 16*6(inp), DATA
    vpshufb BSWAP_MASK, DATA, DATA
    KARATSUBA_AAD 1
    #########################################################
    vmovdqu 16*5(inp), DATA
    vpshufb BSWAP_MASK, DATA, DATA

    vpclmulqdq $0x10, .Lpoly(%rip), T, TMP4 # reduction stage 1a
    vpalignr $8, T, T, T

    KARATSUBA_AAD 2

    vpxor TMP4, T, T # reduction stage 1b
    #########################################################
    vmovdqu 16*4(inp), DATA
    vpshufb BSWAP_MASK, DATA, DATA

    KARATSUBA_AAD 3
    #########################################################
    vmovdqu 16*3(inp), DATA
    vpshufb BSWAP_MASK, DATA, DATA

    vpclmulqdq $0x10, .Lpoly(%rip), T, TMP4 # reduction stage 2a
    vpalignr $8, T, T, T

    KARATSUBA_AAD 4

    vpxor TMP4, T, T # reduction stage 2b
    #########################################################
    vmovdqu 16*2(inp), DATA
    vpshufb BSWAP_MASK, DATA, DATA

    KARATSUBA_AAD 5

    vpxor Xhi, T, T # reduction finalize
    #########################################################
    vmovdqu 16*1(inp), DATA
    vpshufb BSWAP_MASK, DATA, DATA

    KARATSUBA_AAD 6
    #########################################################
    vmovdqu 16*0(inp), DATA
    vpshufb BSWAP_MASK, DATA, DATA
    vpxor T, DATA, DATA

    KARATSUBA_AAD 7
    #########################################################
    vpxor TMP0, TMP2, TMP2 # karatsuba fixup
    vpxor TMP1, TMP2, TMP2
    vpsrldq $8, TMP2, TMP3
    vpslldq $8, TMP2, TMP2

    vpxor TMP3, TMP1, Xhi
    vpxor TMP2, TMP0, T

    lea 16*8(inp), inp
    jmp .Lmod_loop
    #########################################################

.Ldone:
    vpclmulqdq $0x10, .Lpoly(%rip), T, TMP3
    vpalignr $8, T, T, T
    vpxor TMP3, T, T

    vpclmulqdq $0x10, .Lpoly(%rip), T, TMP3
    vpalignr $8, T, T, T
    vpxor TMP3, T, T

    vpxor Xhi, T, T

.Lsave:
    vpshufb BSWAP_MASK, T, T
    vmovdqu T, (Tp)
    vzeroupper

    pop hlp0
    ret
.size intel_aes_gcmAAD,.-intel_aes_gcmAAD

################################################################################
# Encrypt and Authenticate
# void intel_aes_gcmENC(uint8_t* PT, uint8_t* CT, void *Gctx, uint64_t len);
.type intel_aes_gcmENC,@function
.globl intel_aes_gcmENC
.align 16
intel_aes_gcmENC:

.set PT, %rdi
.set CT, %rsi
.set Htbl, %rdx
.set len, %rcx
.set KS, %r9
.set NR, %r10d

.set Gctx, %rdx

.set T, %xmm0
.set TMP0, %xmm1
.set TMP1, %xmm2
.set TMP2, %xmm3
.set TMP3, %xmm4
.set TMP4, %xmm5
.set TMP5, %xmm6
.set CTR0, %xmm7
.set CTR1, %xmm8
.set CTR2, %xmm9
.set CTR3, %xmm10
.set CTR4, %xmm11
.set CTR5, %xmm12
.set CTR6, %xmm13
.set CTR7, %xmm14
.set CTR, %xmm15

.macro ROUND i
    vmovdqu \i*16(KS), TMP3
    vaesenc TMP3, CTR0, CTR0
    vaesenc TMP3, CTR1, CTR1
    vaesenc TMP3, CTR2, CTR2
    vaesenc TMP3, CTR3, CTR3
    vaesenc TMP3, CTR4, CTR4
    vaesenc TMP3, CTR5, CTR5
    vaesenc TMP3, CTR6, CTR6
    vaesenc TMP3, CTR7, CTR7
.endm

.macro ROUNDMUL i

    vmovdqu \i*16(%rsp), TMP5
    vmovdqu \i*16(KS), TMP3

    vaesenc TMP3, CTR0, CTR0
    vaesenc TMP3, CTR1, CTR1
    vaesenc TMP3, CTR2, CTR2
    vaesenc TMP3, CTR3, CTR3

    vpshufd $78, TMP5, TMP4
    vpxor TMP5, TMP4, TMP4

    vaesenc TMP3, CTR4, CTR4
    vaesenc TMP3, CTR5, CTR5
    vaesenc TMP3, CTR6, CTR6
    vaesenc TMP3, CTR7, CTR7

    vpclmulqdq $0x00, 128+\i*16(Htbl), TMP4, TMP3
    vpxor TMP3, TMP0, TMP0
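    # The middle Karatsuba product for this block (hi^lo of the block times
    # hi^lo of the matching H power stored at Htbl+128+16*i) has just been
    # folded into TMP0; the plain high and low products follow into TMP1 and
    # TMP2.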
    vmovdqa \i*16(Htbl), TMP4
    vpclmulqdq $0x11, TMP4, TMP5, TMP3
    vpxor TMP3, TMP1, TMP1
    vpclmulqdq $0x00, TMP4, TMP5, TMP3
    vpxor TMP3, TMP2, TMP2

.endm

.macro KARATSUBA i
    vmovdqu \i*16(%rsp), TMP5

    vpclmulqdq $0x11, 16*\i(Htbl), TMP5, TMP3
    vpxor TMP3, TMP1, TMP1
    vpclmulqdq $0x00, 16*\i(Htbl), TMP5, TMP3
    vpxor TMP3, TMP2, TMP2
    vpshufd $78, TMP5, TMP3
    vpxor TMP5, TMP3, TMP5
    vpclmulqdq $0x00, 128+\i*16(Htbl), TMP5, TMP3
    vpxor TMP3, TMP0, TMP0
.endm

    test len, len
    jnz .Lbegin
    ret

.Lbegin:

    vzeroupper
    push %rbp
    movq %rsp, %rbp

    sub $128, %rsp
    andq $-16, %rsp

    vmovdqu 288(Gctx), CTR
    vmovdqu 272(Gctx), T
    mov 304(Gctx), KS
    # AESContext->Nr
    mov 244(KS), NR

    vpshufb .Lbswap_mask(%rip), CTR, CTR
    vpshufb .Lbswap_mask(%rip), T, T

    cmp $128, len
    jb .LDataSingles

    # Encrypt the first eight blocks
    sub $128, len
    vmovdqa CTR, CTR0
    vpaddd .Lone(%rip), CTR0, CTR1
    vpaddd .Ltwo(%rip), CTR0, CTR2
    vpaddd .Lone(%rip), CTR2, CTR3
    vpaddd .Ltwo(%rip), CTR2, CTR4
    vpaddd .Lone(%rip), CTR4, CTR5
    vpaddd .Ltwo(%rip), CTR4, CTR6
    vpaddd .Lone(%rip), CTR6, CTR7
    vpaddd .Ltwo(%rip), CTR6, CTR

    vpshufb .Lbswap_mask(%rip), CTR0, CTR0
    vpshufb .Lbswap_mask(%rip), CTR1, CTR1
    vpshufb .Lbswap_mask(%rip), CTR2, CTR2
    vpshufb .Lbswap_mask(%rip), CTR3, CTR3
    vpshufb .Lbswap_mask(%rip), CTR4, CTR4
    vpshufb .Lbswap_mask(%rip), CTR5, CTR5
    vpshufb .Lbswap_mask(%rip), CTR6, CTR6
    vpshufb .Lbswap_mask(%rip), CTR7, CTR7

    vpxor (KS), CTR0, CTR0
    vpxor (KS), CTR1, CTR1
    vpxor (KS), CTR2, CTR2
    vpxor (KS), CTR3, CTR3
    vpxor (KS), CTR4, CTR4
    vpxor (KS), CTR5, CTR5
    vpxor (KS), CTR6, CTR6
    vpxor (KS), CTR7, CTR7

    ROUND 1
    ROUND 2
    ROUND 3
    ROUND 4
    ROUND 5
    ROUND 6
    ROUND 7
    ROUND 8
    ROUND 9

    vmovdqu 160(KS), TMP5
    cmp $12, NR
    jb .LLast1

    ROUND 10
    ROUND 11

    vmovdqu 192(KS), TMP5
    cmp $14, NR
    jb .LLast1

    ROUND 12
    ROUND 13

    vmovdqu 224(KS), TMP5

.LLast1:

    vpxor (PT), TMP5, TMP3
    vaesenclast TMP3, CTR0, CTR0
    vpxor 16(PT), TMP5, TMP3
    vaesenclast TMP3, CTR1, CTR1
    vpxor 32(PT), TMP5, TMP3
    vaesenclast TMP3, CTR2, CTR2
    vpxor 48(PT), TMP5, TMP3
    vaesenclast TMP3, CTR3, CTR3
    vpxor 64(PT), TMP5, TMP3
    vaesenclast TMP3, CTR4, CTR4
    vpxor 80(PT), TMP5, TMP3
    vaesenclast TMP3, CTR5, CTR5
    vpxor 96(PT), TMP5, TMP3
    vaesenclast TMP3, CTR6, CTR6
    vpxor 112(PT), TMP5, TMP3
    vaesenclast TMP3, CTR7, CTR7

    vmovdqu .Lbswap_mask(%rip), TMP3

    vmovdqu CTR0, (CT)
    vpshufb TMP3, CTR0, CTR0
    vmovdqu CTR1, 16(CT)
    vpshufb TMP3, CTR1, CTR1
    vmovdqu CTR2, 32(CT)
    vpshufb TMP3, CTR2, CTR2
    vmovdqu CTR3, 48(CT)
    vpshufb TMP3, CTR3, CTR3
    vmovdqu CTR4, 64(CT)
    vpshufb TMP3, CTR4, CTR4
    vmovdqu CTR5, 80(CT)
    vpshufb TMP3, CTR5, CTR5
    vmovdqu CTR6, 96(CT)
    vpshufb TMP3, CTR6, CTR6
    vmovdqu CTR7, 112(CT)
    vpshufb TMP3, CTR7, CTR7

    lea 128(CT), CT
    lea 128(PT), PT
    jmp .LDataOctets

    # Encrypt 8 blocks each time while hashing previous 8 blocks
.align 64
.LDataOctets:
    cmp $128, len
    jb .LEndOctets
    sub $128, len

    vmovdqa CTR7, TMP5
    vmovdqa CTR6, 1*16(%rsp)
    vmovdqa CTR5, 2*16(%rsp)
    vmovdqa CTR4, 3*16(%rsp)
    vmovdqa CTR3, 4*16(%rsp)
    vmovdqa CTR2, 5*16(%rsp)
    vmovdqa CTR1, 6*16(%rsp)
    vmovdqa CTR0, 7*16(%rsp)

    vmovdqa CTR, CTR0
    vpaddd .Lone(%rip), CTR0, CTR1
    vpaddd .Ltwo(%rip), CTR0, CTR2
    vpaddd .Lone(%rip), CTR2, CTR3
    vpaddd .Ltwo(%rip), CTR2, CTR4
    vpaddd .Lone(%rip), CTR4, CTR5
    vpaddd .Ltwo(%rip), CTR4, CTR6
    vpaddd .Lone(%rip), CTR6, CTR7
    vpaddd .Ltwo(%rip), CTR6, CTR

    vmovdqu (KS), TMP4
    vpshufb TMP3, CTR0, CTR0
    vpxor TMP4, CTR0, CTR0
    vpshufb TMP3, CTR1, CTR1
    vpxor TMP4, CTR1, CTR1
    vpshufb TMP3, CTR2, CTR2
    vpxor TMP4, CTR2, CTR2
    vpshufb TMP3, CTR3, CTR3
    vpxor TMP4, CTR3, CTR3
    vpshufb TMP3, CTR4, CTR4
    vpxor TMP4, CTR4, CTR4
    vpshufb TMP3, CTR5, CTR5
    vpxor TMP4, CTR5, CTR5
    vpshufb TMP3, CTR6, CTR6
    vpxor TMP4, CTR6, CTR6
    vpshufb TMP3, CTR7, CTR7
    vpxor TMP4, CTR7, CTR7

    vmovdqu 16*0(Htbl), TMP3
    vpclmulqdq $0x11, TMP3, TMP5, TMP1
    vpclmulqdq $0x00, TMP3, TMP5, TMP2
    vpshufd $78, TMP5, TMP3
    vpxor TMP5, TMP3, TMP5
    vmovdqu 128+0*16(Htbl), TMP3
    vpclmulqdq $0x00, TMP3, TMP5, TMP0

    ROUNDMUL 1

    ROUNDMUL 2

    ROUNDMUL 3

    ROUNDMUL 4

    ROUNDMUL 5

    ROUNDMUL 6

    vpxor 7*16(%rsp), T, TMP5
    vmovdqu 7*16(KS), TMP3

    vaesenc TMP3, CTR0, CTR0
    vaesenc TMP3, CTR1, CTR1
    vaesenc TMP3, CTR2, CTR2
    vaesenc TMP3, CTR3, CTR3

    vpshufd $78, TMP5, TMP4
    vpxor TMP5, TMP4, TMP4

    vaesenc TMP3, CTR4, CTR4
    vaesenc TMP3, CTR5, CTR5
    vaesenc TMP3, CTR6, CTR6
    vaesenc TMP3, CTR7, CTR7

    vpclmulqdq $0x11, 7*16(Htbl), TMP5, TMP3
    vpxor TMP3, TMP1, TMP1
    vpclmulqdq $0x00, 7*16(Htbl), TMP5, TMP3
    vpxor TMP3, TMP2, TMP2
    vpclmulqdq $0x00, 128+7*16(Htbl), TMP4, TMP3
    vpxor TMP3, TMP0, TMP0

    ROUND 8
    vmovdqa .Lpoly(%rip), TMP5

    vpxor TMP1, TMP0, TMP0
    vpxor TMP2, TMP0, TMP0
    vpsrldq $8, TMP0, TMP3
    vpxor TMP3, TMP1, TMP4
    vpslldq $8, TMP0, TMP3
    vpxor TMP3, TMP2, T

    vpclmulqdq $0x10, TMP5, T, TMP1
    vpalignr $8, T, T, T
    vpxor T, TMP1, T

    ROUND 9

    vpclmulqdq $0x10, TMP5, T, TMP1
    vpalignr $8, T, T, T
    vpxor T, TMP1, T

    vmovdqu 160(KS), TMP5
    cmp $10, NR
    jbe .LLast2

    ROUND 10
    ROUND 11

    vmovdqu 192(KS), TMP5
    cmp $12, NR
    jbe .LLast2

    ROUND 12
    ROUND 13

    vmovdqu 224(KS), TMP5

.LLast2:

    vpxor (PT), TMP5, TMP3
    vaesenclast TMP3, CTR0, CTR0
    vpxor 16(PT), TMP5, TMP3
    vaesenclast TMP3, CTR1, CTR1
    vpxor 32(PT), TMP5, TMP3
    vaesenclast TMP3, CTR2, CTR2
    vpxor 48(PT), TMP5, TMP3
    vaesenclast TMP3, CTR3, CTR3
    vpxor 64(PT), TMP5, TMP3
    vaesenclast TMP3, CTR4, CTR4
    vpxor 80(PT), TMP5, TMP3
    vaesenclast TMP3, CTR5, CTR5
    vpxor 96(PT), TMP5, TMP3
    vaesenclast TMP3, CTR6, CTR6
    vpxor 112(PT), TMP5, TMP3
    vaesenclast TMP3, CTR7, CTR7

    vmovdqu .Lbswap_mask(%rip), TMP3

    vmovdqu CTR0, (CT)
    vpshufb TMP3, CTR0, CTR0
    vmovdqu CTR1, 16(CT)
    vpshufb TMP3, CTR1, CTR1
    vmovdqu CTR2, 32(CT)
    vpshufb TMP3, CTR2, CTR2
    vmovdqu CTR3, 48(CT)
    vpshufb TMP3, CTR3, CTR3
    vmovdqu CTR4, 64(CT)
    vpshufb TMP3, CTR4, CTR4
    vmovdqu CTR5, 80(CT)
    vpshufb TMP3, CTR5, CTR5
    vmovdqu CTR6, 96(CT)
    vpshufb TMP3, CTR6, CTR6
    vmovdqu CTR7, 112(CT)
    vpshufb TMP3, CTR7, CTR7

    vpxor TMP4, T, T

    lea 128(CT), CT
    lea 128(PT), PT
    jmp .LDataOctets

.LEndOctets:

    vmovdqa CTR7, TMP5
    vmovdqa CTR6, 1*16(%rsp)
    vmovdqa CTR5, 2*16(%rsp)
    vmovdqa CTR4, 3*16(%rsp)
    vmovdqa CTR3, 4*16(%rsp)
    vmovdqa CTR2, 5*16(%rsp)
    vmovdqa CTR1, 6*16(%rsp)
    vmovdqa CTR0, 7*16(%rsp)

    vmovdqu 16*0(Htbl), TMP3
    vpclmulqdq $0x11, TMP3, TMP5, TMP1
    vpclmulqdq $0x00, TMP3, TMP5, TMP2
    vpshufd $78, TMP5, TMP3
    vpxor TMP5, TMP3, TMP5
    vmovdqu 128+0*16(Htbl), TMP3
    vpclmulqdq $0x00, TMP3, TMP5, TMP0

    KARATSUBA 1
    KARATSUBA 2
    KARATSUBA 3
    KARATSUBA 4
    KARATSUBA 5
    KARATSUBA 6

    vmovdqu 7*16(%rsp), TMP5
    vpxor T, TMP5, TMP5
    vmovdqu 16*7(Htbl), TMP4
    vpclmulqdq $0x11, TMP4, TMP5, TMP3
    vpxor TMP3, TMP1, TMP1
    vpclmulqdq $0x00, TMP4, TMP5, TMP3
    vpxor TMP3, TMP2, TMP2
    vpshufd $78, TMP5, TMP3
    vpxor TMP5, TMP3, TMP5
    vmovdqu 128+7*16(Htbl), TMP4
    vpclmulqdq $0x00, TMP4, TMP5, TMP3
    vpxor TMP3, TMP0, TMP0

    vpxor TMP1, TMP0, TMP0
    vpxor TMP2, TMP0, TMP0

    vpsrldq $8, TMP0, TMP3
    vpxor TMP3, TMP1, TMP4
    vpslldq $8, TMP0, TMP3
    vpxor TMP3, TMP2, T

    vmovdqa .Lpoly(%rip), TMP2

    vpalignr $8, T, T, TMP1
    vpclmulqdq $0x10, TMP2, T, T
    vpxor T, TMP1, T

    vpalignr $8, T, T, TMP1
    vpclmulqdq $0x10, TMP2, T, T
    vpxor T, TMP1, T

    vpxor TMP4, T, T

# Here we encrypt any remaining whole blocks
.LDataSingles:

    cmp $16, len
    jb .LDataTail
    sub $16, len

    vpshufb .Lbswap_mask(%rip), CTR, TMP1
    vpaddd .Lone(%rip), CTR, CTR

    vpxor (KS), TMP1, TMP1
    vaesenc 16*1(KS), TMP1, TMP1
    vaesenc 16*2(KS), TMP1, TMP1
    vaesenc 16*3(KS), TMP1, TMP1
    vaesenc 16*4(KS), TMP1, TMP1
    vaesenc 16*5(KS), TMP1, TMP1
    vaesenc 16*6(KS), TMP1, TMP1
    vaesenc 16*7(KS), TMP1, TMP1
    vaesenc 16*8(KS), TMP1, TMP1
    vaesenc 16*9(KS), TMP1, TMP1
    vmovdqu 16*10(KS), TMP2
    cmp $10, NR
    je .LLast3
    vaesenc 16*10(KS), TMP1, TMP1
    vaesenc 16*11(KS), TMP1, TMP1
    vmovdqu 16*12(KS), TMP2
    cmp $12, NR
    je .LLast3
    vaesenc 16*12(KS), TMP1, TMP1
    vaesenc 16*13(KS), TMP1, TMP1
    vmovdqu 16*14(KS), TMP2

.LLast3:
    vaesenclast TMP2, TMP1, TMP1

    vpxor (PT), TMP1, TMP1
    vmovdqu TMP1, (CT)
    addq $16, CT
    addq $16, PT

    vpshufb .Lbswap_mask(%rip), TMP1, TMP1
    vpxor TMP1, T, T
    vmovdqu (Htbl), TMP0
    call GFMUL

    jmp .LDataSingles

# Here we encrypt the final partial block, if there is one
.LDataTail:

    test len, len
    jz DATA_END
    # First prepare the counter block
    vpshufb .Lbswap_mask(%rip), CTR, TMP1
    vpaddd .Lone(%rip), CTR, CTR

    vpxor (KS), TMP1, TMP1
    vaesenc 16*1(KS), TMP1, TMP1
    vaesenc 16*2(KS), TMP1, TMP1
    vaesenc 16*3(KS), TMP1, TMP1
    vaesenc 16*4(KS), TMP1, TMP1
    vaesenc 16*5(KS), TMP1, TMP1
    vaesenc 16*6(KS), TMP1, TMP1
    vaesenc 16*7(KS), TMP1, TMP1
    vaesenc 16*8(KS), TMP1, TMP1
    vaesenc 16*9(KS), TMP1, TMP1
    vmovdqu 16*10(KS), TMP2
    cmp $10, NR
    je .LLast4
    vaesenc 16*10(KS), TMP1, TMP1
    vaesenc 16*11(KS), TMP1, TMP1
    vmovdqu 16*12(KS), TMP2
    cmp $12, NR
    je .LLast4
    vaesenc 16*12(KS), TMP1, TMP1
    vaesenc 16*13(KS), TMP1, TMP1
    vmovdqu 16*14(KS), TMP2

.LLast4:
    vaesenclast TMP2, TMP1, TMP1
    # Zero a temp location
    vpxor TMP2, TMP2, TMP2
    vmovdqa TMP2, (%rsp)

    # Copy the required bytes only (could probably use rep movsb)
    xor KS, KS
.LEncCpy:
    cmp KS, len
    je .LEncCpyEnd
    movb (PT, KS, 1), %r8b
    movb %r8b, (%rsp, KS, 1)
    inc KS
    jmp .LEncCpy
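    # At this point the partial block has been copied byte by byte into the
    # zeroed 16-byte stack slot, so the XOR and hash below operate on a full
    # block without ever reading past the end of the caller's input buffer.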
.LEncCpyEnd:
    # Xor with the counter block
    vpxor (%rsp), TMP1, TMP0
    # Again, store at temp location
    vmovdqa TMP0, (%rsp)
    # Copy only the required bytes to CT, and zero the rest for the hash
    xor KS, KS
.LEncCpy2:
    cmp KS, len
    je .LEncCpy3
    movb (%rsp, KS, 1), %r8b
    movb %r8b, (CT, KS, 1)
    inc KS
    jmp .LEncCpy2
.LEncCpy3:
    cmp $16, KS
    je .LEndCpy3
    movb $0, (%rsp, KS, 1)
    inc KS
    jmp .LEncCpy3
.LEndCpy3:
    vmovdqa (%rsp), TMP0

    vpshufb .Lbswap_mask(%rip), TMP0, TMP0
    vpxor TMP0, T, T
    vmovdqu (Htbl), TMP0
    call GFMUL

DATA_END:

    vpshufb .Lbswap_mask(%rip), T, T
    vpshufb .Lbswap_mask(%rip), CTR, CTR
    vmovdqu T, 272(Gctx)
    vmovdqu CTR, 288(Gctx)

    movq %rbp, %rsp
    popq %rbp
    ret
.size intel_aes_gcmENC, .-intel_aes_gcmENC

#########################
# Decrypt and Authenticate
# void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx, uint64_t len);
.type intel_aes_gcmDEC,@function
.globl intel_aes_gcmDEC
.align 16
intel_aes_gcmDEC:
# parameter 1: CT   # input
# parameter 2: PT   # output
# parameter 3: %rdx # Gctx
# parameter 4: %rcx # len

.macro DEC_KARATSUBA i
    vmovdqu (7-\i)*16(CT), TMP5
    vpshufb .Lbswap_mask(%rip), TMP5, TMP5

    vpclmulqdq $0x11, 16*\i(Htbl), TMP5, TMP3
    vpxor TMP3, TMP1, TMP1
    vpclmulqdq $0x00, 16*\i(Htbl), TMP5, TMP3
    vpxor TMP3, TMP2, TMP2
    vpshufd $78, TMP5, TMP3
    vpxor TMP5, TMP3, TMP5
    vpclmulqdq $0x00, 128+\i*16(Htbl), TMP5, TMP3
    vpxor TMP3, TMP0, TMP0
.endm

.set PT, %rsi
.set CT, %rdi
.set Htbl, %rdx
.set len, %rcx
.set KS, %r9
.set NR, %r10d

.set Gctx, %rdx

.set T, %xmm0
.set TMP0, %xmm1
.set TMP1, %xmm2
.set TMP2, %xmm3
.set TMP3, %xmm4
.set TMP4, %xmm5
.set TMP5, %xmm6
.set CTR0, %xmm7
.set CTR1, %xmm8
.set CTR2, %xmm9
.set CTR3, %xmm10
.set CTR4, %xmm11
.set CTR5, %xmm12
.set CTR6, %xmm13
.set CTR7, %xmm14
.set CTR, %xmm15

    test len, len
    jnz .LbeginDec
    ret

.LbeginDec:

    pushq %rbp
    movq %rsp, %rbp
    sub $128, %rsp
    andq $-16, %rsp
    vmovdqu 288(Gctx), CTR
    vmovdqu 272(Gctx), T
    mov 304(Gctx), KS
    # AESContext->Nr
    mov 244(KS), NR

    vpshufb .Lbswap_mask(%rip), CTR, CTR
    vpshufb .Lbswap_mask(%rip), T, T

    vmovdqu .Lbswap_mask(%rip), TMP3
    jmp .LDECOctets

    # Decrypt 8 blocks each time while hashing them at the same time
.align 64
.LDECOctets:

    cmp $128, len
    jb .LDECSingles
    sub $128, len

    vmovdqa CTR, CTR0
    vpaddd .Lone(%rip), CTR0, CTR1
    vpaddd .Ltwo(%rip), CTR0, CTR2
    vpaddd .Lone(%rip), CTR2, CTR3
    vpaddd .Ltwo(%rip), CTR2, CTR4
    vpaddd .Lone(%rip), CTR4, CTR5
    vpaddd .Ltwo(%rip), CTR4, CTR6
    vpaddd .Lone(%rip), CTR6, CTR7
    vpaddd .Ltwo(%rip), CTR6, CTR

    vpshufb TMP3, CTR0, CTR0
    vpshufb TMP3, CTR1, CTR1
    vpshufb TMP3, CTR2, CTR2
    vpshufb TMP3, CTR3, CTR3
    vpshufb TMP3, CTR4, CTR4
    vpshufb TMP3, CTR5, CTR5
    vpshufb TMP3, CTR6, CTR6
    vpshufb TMP3, CTR7, CTR7

    vmovdqu (KS), TMP3
    vpxor TMP3, CTR0, CTR0
    vpxor TMP3, CTR1, CTR1
    vpxor TMP3, CTR2, CTR2
    vpxor TMP3, CTR3, CTR3
    vpxor TMP3, CTR4, CTR4
    vpxor TMP3, CTR5, CTR5
    vpxor TMP3, CTR6, CTR6
    vpxor TMP3, CTR7, CTR7

    vmovdqu 7*16(CT), TMP5
    vpshufb .Lbswap_mask(%rip), TMP5, TMP5
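    # Start the aggregated GHASH for this batch: the byte-swapped ciphertext
    # block just loaded from 7*16(CT) is multiplied by the first Htbl entry;
    # DEC_KARATSUBA folds in the remaining blocks, interleaved with the AES
    # rounds that encrypt the fresh counters.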
    vmovdqu 16*0(Htbl), TMP3
    vpclmulqdq $0x11, TMP3, TMP5, TMP1
    vpclmulqdq $0x00, TMP3, TMP5, TMP2
    vpshufd $78, TMP5, TMP3
    vpxor TMP5, TMP3, TMP5
    vmovdqu 128+0*16(Htbl), TMP3
    vpclmulqdq $0x00, TMP3, TMP5, TMP0

    ROUND 1
    DEC_KARATSUBA 1

    ROUND 2
    DEC_KARATSUBA 2

    ROUND 3
    DEC_KARATSUBA 3

    ROUND 4
    DEC_KARATSUBA 4

    ROUND 5
    DEC_KARATSUBA 5

    ROUND 6
    DEC_KARATSUBA 6

    ROUND 7

    vmovdqu 0*16(CT), TMP5
    vpshufb .Lbswap_mask(%rip), TMP5, TMP5
    vpxor T, TMP5, TMP5
    vmovdqu 16*7(Htbl), TMP4

    vpclmulqdq $0x11, TMP4, TMP5, TMP3
    vpxor TMP3, TMP1, TMP1
    vpclmulqdq $0x00, TMP4, TMP5, TMP3
    vpxor TMP3, TMP2, TMP2

    vpshufd $78, TMP5, TMP3
    vpxor TMP5, TMP3, TMP5
    vmovdqu 128+7*16(Htbl), TMP4

    vpclmulqdq $0x00, TMP4, TMP5, TMP3
    vpxor TMP3, TMP0, TMP0

    ROUND 8

    vpxor TMP1, TMP0, TMP0
    vpxor TMP2, TMP0, TMP0

    vpsrldq $8, TMP0, TMP3
    vpxor TMP3, TMP1, TMP4
    vpslldq $8, TMP0, TMP3
    vpxor TMP3, TMP2, T
    vmovdqa .Lpoly(%rip), TMP2

    vpalignr $8, T, T, TMP1
    vpclmulqdq $0x10, TMP2, T, T
    vpxor T, TMP1, T

    ROUND 9

    vpalignr $8, T, T, TMP1
    vpclmulqdq $0x10, TMP2, T, T
    vpxor T, TMP1, T

    vmovdqu 160(KS), TMP5
    cmp $10, NR

    jbe .LDECLast1

    ROUND 10
    ROUND 11

    vmovdqu 192(KS), TMP5
    cmp $12, NR

    jbe .LDECLast1

    ROUND 12
    ROUND 13

    vmovdqu 224(KS), TMP5

.LDECLast1:

    vpxor (CT), TMP5, TMP3
    vaesenclast TMP3, CTR0, CTR0
    vpxor 16(CT), TMP5, TMP3
    vaesenclast TMP3, CTR1, CTR1
    vpxor 32(CT), TMP5, TMP3
    vaesenclast TMP3, CTR2, CTR2
    vpxor 48(CT), TMP5, TMP3
    vaesenclast TMP3, CTR3, CTR3
    vpxor 64(CT), TMP5, TMP3
    vaesenclast TMP3, CTR4, CTR4
    vpxor 80(CT), TMP5, TMP3
    vaesenclast TMP3, CTR5, CTR5
    vpxor 96(CT), TMP5, TMP3
    vaesenclast TMP3, CTR6, CTR6
    vpxor 112(CT), TMP5, TMP3
    vaesenclast TMP3, CTR7, CTR7

    vmovdqu .Lbswap_mask(%rip), TMP3

    vmovdqu CTR0, (PT)
    vmovdqu CTR1, 16(PT)
    vmovdqu CTR2, 32(PT)
    vmovdqu CTR3, 48(PT)
    vmovdqu CTR4, 64(PT)
    vmovdqu CTR5, 80(PT)
    vmovdqu CTR6, 96(PT)
    vmovdqu CTR7, 112(PT)

    vpxor TMP4, T, T

    lea 128(CT), CT
    lea 128(PT), PT
    jmp .LDECOctets

# Here we decrypt and hash any remaining whole block
.LDECSingles:

    cmp $16, len
    jb .LDECTail
    sub $16, len

    vmovdqu (CT), TMP1
    vpshufb .Lbswap_mask(%rip), TMP1, TMP1
    vpxor TMP1, T, T
    vmovdqu (Htbl), TMP0
    call GFMUL

    vpshufb .Lbswap_mask(%rip), CTR, TMP1
    vpaddd .Lone(%rip), CTR, CTR

    vpxor (KS), TMP1, TMP1
    vaesenc 16*1(KS), TMP1, TMP1
    vaesenc 16*2(KS), TMP1, TMP1
    vaesenc 16*3(KS), TMP1, TMP1
    vaesenc 16*4(KS), TMP1, TMP1
    vaesenc 16*5(KS), TMP1, TMP1
    vaesenc 16*6(KS), TMP1, TMP1
    vaesenc 16*7(KS), TMP1, TMP1
    vaesenc 16*8(KS), TMP1, TMP1
    vaesenc 16*9(KS), TMP1, TMP1
    vmovdqu 16*10(KS), TMP2
    cmp $10, NR
    je .LDECLast2
    vaesenc 16*10(KS), TMP1, TMP1
    vaesenc 16*11(KS), TMP1, TMP1
    vmovdqu 16*12(KS), TMP2
    cmp $12, NR
    je .LDECLast2
    vaesenc 16*12(KS), TMP1, TMP1
    vaesenc 16*13(KS), TMP1, TMP1
    vmovdqu 16*14(KS), TMP2
.LDECLast2:
    vaesenclast TMP2, TMP1, TMP1
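    # TMP1 now holds the encrypted counter block (the keystream); XOR it with
    # the ciphertext to recover the plaintext.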
    vpxor (CT), TMP1, TMP1
    vmovdqu TMP1, (PT)
    addq $16, CT
    addq $16, PT
    jmp .LDECSingles

# Here we decrypt the final partial block, if there is one
.LDECTail:
    test len, len
    jz .LDEC_END

    vpshufb .Lbswap_mask(%rip), CTR, TMP1
    vpaddd .Lone(%rip), CTR, CTR

    vpxor (KS), TMP1, TMP1
    vaesenc 16*1(KS), TMP1, TMP1
    vaesenc 16*2(KS), TMP1, TMP1
    vaesenc 16*3(KS), TMP1, TMP1
    vaesenc 16*4(KS), TMP1, TMP1
    vaesenc 16*5(KS), TMP1, TMP1
    vaesenc 16*6(KS), TMP1, TMP1
    vaesenc 16*7(KS), TMP1, TMP1
    vaesenc 16*8(KS), TMP1, TMP1
    vaesenc 16*9(KS), TMP1, TMP1
    vmovdqu 16*10(KS), TMP2
    cmp $10, NR
    je .LDECLast3
    vaesenc 16*10(KS), TMP1, TMP1
    vaesenc 16*11(KS), TMP1, TMP1
    vmovdqu 16*12(KS), TMP2
    cmp $12, NR
    je .LDECLast3
    vaesenc 16*12(KS), TMP1, TMP1
    vaesenc 16*13(KS), TMP1, TMP1
    vmovdqu 16*14(KS), TMP2

.LDECLast3:
    vaesenclast TMP2, TMP1, TMP1

    vpxor TMP2, TMP2, TMP2
    vmovdqa TMP2, (%rsp)
    # Copy the required bytes only (could probably use rep movsb)
    xor KS, KS
.LDecCpy:
    cmp KS, len
    je .LDecCpy2
    movb (CT, KS, 1), %r8b
    movb %r8b, (%rsp, KS, 1)
    inc KS
    jmp .LDecCpy
.LDecCpy2:
    cmp $16, KS
    je .LDecCpyEnd
    movb $0, (%rsp, KS, 1)
    inc KS
    jmp .LDecCpy2
.LDecCpyEnd:
    # Xor with the counter block
    vmovdqa (%rsp), TMP0
    vpxor TMP0, TMP1, TMP1
    # Again, store at temp location
    vmovdqa TMP1, (%rsp)
    # Copy only the required bytes to PT, and zero the rest for the hash
    xor KS, KS
.LDecCpy3:
    cmp KS, len
    je .LDecCpyEnd3
    movb (%rsp, KS, 1), %r8b
    movb %r8b, (PT, KS, 1)
    inc KS
    jmp .LDecCpy3
.LDecCpyEnd3:
    vpshufb .Lbswap_mask(%rip), TMP0, TMP0
    vpxor TMP0, T, T
    vmovdqu (Htbl), TMP0
    call GFMUL
.LDEC_END:

    vpshufb .Lbswap_mask(%rip), T, T
    vpshufb .Lbswap_mask(%rip), CTR, CTR
    vmovdqu T, 272(Gctx)
    vmovdqu CTR, 288(Gctx)

    movq %rbp, %rsp
    popq %rbp
    ret
.size intel_aes_gcmDEC, .-intel_aes_gcmDEC
#########################
# a = T
# b = TMP0 - remains unchanged
# res = T
# uses also TMP1,TMP2,TMP3,TMP4
# __m128i GFMUL(__m128i A, __m128i B);
.type GFMUL,@function
.globl GFMUL
GFMUL:
    pushq %rbp
    movq %rsp, %rbp
    vpclmulqdq $0x00, TMP0, T, TMP1
    vpclmulqdq $0x11, TMP0, T, TMP4

    vpshufd $78, T, TMP2
    vpshufd $78, TMP0, TMP3
    vpxor T, TMP2, TMP2
    vpxor TMP0, TMP3, TMP3

    vpclmulqdq $0x00, TMP3, TMP2, TMP2
    vpxor TMP1, TMP2, TMP2
    vpxor TMP4, TMP2, TMP2

    vpslldq $8, TMP2, TMP3
    vpsrldq $8, TMP2, TMP2

    vpxor TMP3, TMP1, TMP1
    vpxor TMP2, TMP4, TMP4

    vpclmulqdq $0x10, .Lpoly(%rip), TMP1, TMP2
    vpshufd $78, TMP1, TMP3
    vpxor TMP3, TMP2, TMP1

    vpclmulqdq $0x10, .Lpoly(%rip), TMP1, TMP2
    vpshufd $78, TMP1, TMP3
    vpxor TMP3, TMP2, TMP1

    vpxor TMP4, TMP1, T
    movq %rbp, %rsp
    popq %rbp
    ret
.size GFMUL, .-GFMUL