tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

intel-gcm-x64-masm.asm (34729B)


      1 ; LICENSE:
      2 ; This submission to NSS is to be made available under the terms of the
      3 ; Mozilla Public License, v. 2.0. You can obtain one at
      4 ; http://mozilla.org/MPL/2.0/.
      5 ;###############################################################################
      6 ; Copyright(c) 2014, Intel Corp.
      7 ; Developers and authors:
      8 ; Shay Gueron and Vlad Krasnov
      9 ; Intel Corporation, Israel Development Centre, Haifa, Israel
     10 ; Please send feedback directly to crypto.feedback.alias@intel.com
     11 
     12 
     13 .DATA
     14 ALIGN 16
     15 Lone            dq 1,0                  ; 128-bit constant 1 (counter increment)
     16 Ltwo            dq 2,0                  ; 128-bit constant 2 (counter increment by two)
     17 Lbswap_mask     db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 ; vpshufb mask: byte-reverse an xmm register
     18 Lshuff_mask     dq 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh  ; nibble mask (not referenced in this part of the file)
     19 Lpoly           dq 01h, 0c200000000000000h               ; GHASH reduction constant used by the vpclmulqdq folding steps
     20 
     21 .CODE
     22 
     23 
     24 GFMUL MACRO DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
        ; GFMUL: one GHASH multiplication, DST = SRC1 * SRC2 in GF(2^128),
        ; reduced via the constant in Lpoly. Uses one-level Karatsuba
        ; (three vpclmulqdq) followed by a two-step folding reduction.
        ; TMP1-TMP4 are clobbered.
     25    vpclmulqdq  TMP1, SRC2, SRC1, 0h       ; low64 x low64
     26    vpclmulqdq  TMP4, SRC2, SRC1, 011h     ; high64 x high64
     27 
     28    vpshufd     TMP2, SRC2, 78             ; swap the qwords of SRC2
     29    vpshufd     TMP3, SRC1, 78             ; swap the qwords of SRC1
     30    vpxor       TMP2, TMP2, SRC2           ; SRC2.hi ^ SRC2.lo
     31    vpxor       TMP3, TMP3, SRC1           ; SRC1.hi ^ SRC1.lo
     32 
     33    vpclmulqdq  TMP2, TMP2, TMP3, 0h       ; Karatsuba middle product
     34    vpxor       TMP2, TMP2, TMP1
     35    vpxor       TMP2, TMP2, TMP4           ; middle ^= (low + high)
     36 
     37    vpslldq     TMP3, TMP2, 8              ; split middle term into the
     38    vpsrldq     TMP2, TMP2, 8              ; low/high 128-bit halves
     39 
     40    vpxor       TMP1, TMP1, TMP3           ; 256-bit product, low half
     41    vpxor       TMP4, TMP4, TMP2           ; 256-bit product, high half
     42 
     43    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h  ; reduction, fold step 1
     44    vpshufd     TMP3, TMP1, 78
     45    vpxor       TMP1, TMP2, TMP3
     46 
     47    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h  ; reduction, fold step 2
     48    vpshufd     TMP3, TMP1, 78
     49    vpxor       TMP1, TMP2, TMP3
     50 
     51    vpxor       DST, TMP1, TMP4            ; combine with the high half
     52 
     53    ENDM
     54 
     55 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     56 ;
     57 ; Generates the final GCM tag
     58 ; void intel_aes_gcmTAG(unsigned char Htbl[16*16],
     59 ;                       unsigned char *Tp,
     60 ;                       unsigned int Mlen,
     61 ;                       unsigned int Alen,
     62 ;                       unsigned char *X0,
     63 ;                       unsigned char *TAG);
     64 ;
     65 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     66 
     67 ALIGN 16
     68 intel_aes_gcmTAG PROC
        ; Produces the final GCM tag: folds the 128-bit length block
        ; (Alen||Mlen in bits) into the GHASH state Tp, multiplies by H,
        ; byte-swaps, XORs with the X0 block, and stores to TAG.
        ; Microsoft x64 ABI: rcx=Htbl, rdx=Tp, r8=Mlen (bytes),
        ; r9=Alen (bytes); args 5 and 6 (X0, TAG) come from the stack.
     69 
     70 Htbl    textequ <rcx>
     71 Tp      textequ <rdx>
     72 Mlen    textequ <r8>
     73 Alen    textequ <r9>
     74 X0      textequ <r10>
     75 TAG     textequ <r11>
     76 
     77 T       textequ <xmm0>
     78 TMP0    textequ <xmm1>
     79 
     80    mov     X0, [rsp + 1*8 + 4*8]          ; 5th arg: above return addr + 4 stack slots
     81    mov     TAG, [rsp + 1*8 + 5*8]         ; 6th arg
     82 
     83    vzeroupper
     84    vmovdqu T, XMMWORD PTR[Tp]             ; running GHASH state
     85    vpxor   TMP0, TMP0, TMP0               ; zero the length block
     86 
     87    shl     Mlen, 3                        ; byte counts -> bit counts
     88    shl     Alen, 3
     89 
     90    ;vpinsrq    TMP0, TMP0, Mlen, 0
     91    ;vpinsrq    TMP0, TMP0, Alen, 1
     92    ; workaround the ml64.exe vpinsrq issue: build each qword from
     93    vpinsrd TMP0, TMP0, r8d, 0             ; its two 32-bit halves instead
     94    vpinsrd TMP0, TMP0, r9d, 2
     95    shr Mlen, 32
     96    shr Alen, 32
     97    vpinsrd TMP0, TMP0, r8d, 1
     98    vpinsrd TMP0, TMP0, r9d, 3
     99 
    100    vpxor   T, T, TMP0                     ; fold length block into the hash
    101    vmovdqu TMP0, XMMWORD PTR[Htbl]        ; H (first table entry)
    102    GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5
    103 
    104    vpshufb T, T, [Lbswap_mask]            ; back to byte order
    105    vpxor   T, T, [X0]                     ; XOR with *X0 (presumably E_K(CTR0) — TODO confirm against caller)
    106    vmovdqu XMMWORD PTR[TAG], T
    107    vzeroupper
    108 
    109    ret
    110 
    111 intel_aes_gcmTAG ENDP
    112 
    113 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    114 ;
    115 ; Generates the H table
    116 ; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR);
    117 ;
    118 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    119 
    120 ALIGN 16
    121 intel_aes_gcmINIT PROC
        ; Builds the GHASH multiplier table from the AES key schedule:
        ;   Htbl[0..7]        = H^1 .. H^8 (byte-reversed representation)
        ;   Htbl[8*16 + i*16] = hi^lo qword of H^(i+1) (Karatsuba precompute)
        ; rcx=Htbl, rdx=KS (expanded AES key), r8d=NR (AES round count).
    122 
    123 Htbl    textequ <rcx>
    124 KS      textequ <rdx>
    125 NR      textequ <r8d>
    126 
    127 T       textequ <xmm0>
    128 TMP0    textequ <xmm1>
    129 
    130    vzeroupper
    131    ; AES-ENC(0): H = AES_K(0); loading round key 0 equals 0 XOR K0
    132    vmovdqu T, XMMWORD PTR[KS]
    133    lea KS, [16 + KS]
    134    dec NR
    135 Lenc_loop:
    136        vaesenc T, T, [KS]
    137        lea KS, [16 + KS]
    138        dec NR
    139        jnz Lenc_loop
    140 
    141    vaesenclast T, T, [KS]
    142    vpshufb T, T, [Lbswap_mask]            ; keep H byte-reversed for GHASH
    143 
    144    ; Calculate H` = GFMUL(H, 2): shift left by 1 with conditional poly fold
    145    vpsrad  xmm3, T, 31
    146    vpshufd xmm3, xmm3, 0ffh               ; broadcast sign of the top bit
    147    vpand   xmm5, xmm3, [Lpoly]            ; Lpoly if the bit shifted out, else 0
    148    vpsrld  xmm3, T, 31                    ; per-dword carry bits
    149    vpslld  xmm4, T, 1
    150    vpslldq xmm3, xmm3, 4                  ; move carries into the next dword
    151    vpxor   T, xmm4, xmm3                  ; full 128-bit shift left by 1
    152    vpxor   T, T, xmm5
    153 
    154    vmovdqu TMP0, T                        ; TMP0 = H, constant multiplier below
    155    vmovdqu XMMWORD PTR[Htbl + 0*16], T
    156 
    157    vpshufd xmm2, T, 78
    158    vpxor   xmm2, xmm2, T                  ; hi^lo half for Karatsuba
    159    vmovdqu XMMWORD PTR[Htbl + 8*16 + 0*16], xmm2
    160 
    161    i = 1
    162    WHILE i LT 8                           ; assembly-time unrolled loop
    163        GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5  ; T = H^(i+1)
    164        vmovdqu XMMWORD PTR[Htbl + i*16], T
    165        vpshufd xmm2, T, 78
    166        vpxor   xmm2, xmm2, T
    167        vmovdqu XMMWORD PTR[Htbl + 8*16 + i*16], xmm2
    168        i = i+1
    169        ENDM
    170    vzeroupper
    171    ret
    172 intel_aes_gcmINIT ENDP
    173 
    174 
    175 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    176 ;
    177 ; Authenticate only
    178 ; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp);
    179 ;
    180 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    181 
    182 ALIGN 16
    183 intel_aes_gcmAAD PROC
        ; GHASH-only pass over the additional authenticated data.
        ; rcx=Htbl, rdx=inp (AAD), r8=len (bytes, multiple of 16 assumed
        ; from the 16-byte stepping — TODO confirm caller pads), r9=Tp
        ; (128-bit GHASH state, updated in place).
        ; Hashes len%128 "prefix" blocks first, then 8 blocks per
        ; iteration using the aggregated (per-8) reduction technique.
    184 
    185 Htbl    textequ <rcx>
    186 inp     textequ <rdx>
    187 len     textequ <r8>
    188 Tp      textequ <r9>
    189 hlp0    textequ <r10>
    190 
    191 DATA    textequ <xmm0>
    192 T       textequ <xmm1>
    193 TMP0    textequ <xmm2>
    194 TMP1    textequ <xmm3>
    195 TMP2    textequ <xmm4>
    196 TMP3    textequ <xmm5>
    197 TMP4    textequ <xmm6>
    198 Xhi     textequ <xmm7>
    199 
    200 KARATSUBA_AAD MACRO i
        ; Accumulate DATA * Htbl[i] into the three Karatsuba partial
        ; products: TMP0 (middle), TMP1 (high), TMP2 (low). TMP3 clobbered.
    201    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 0h
    202    vpxor       TMP0, TMP0, TMP3
    203    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 011h
    204    vpxor       TMP1, TMP1, TMP3
    205    vpshufd     TMP3, DATA, 78
    206    vpxor       TMP3, TMP3, DATA
    207    vpclmulqdq  TMP3, TMP3, [Htbl + 8*16 + i*16], 0h
    208    vpxor       TMP2, TMP2, TMP3
    209 ENDM
    210 
    211    test  len, len
    212    jnz   LbeginAAD
    213    ret                                    ; nothing to hash
    214 
    215 LbeginAAD:
    216    vzeroupper
    217 
    218    sub rsp, 2*16                          ; xmm6/xmm7 are callee-saved (Win64)
    219    vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
    220    vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
    221 
    222    vpxor   Xhi, Xhi, Xhi
    223 
    224    vmovdqu T, XMMWORD PTR[Tp]
    225    ;we hash 8 blocks each iteration; if the total amount of blocks is not a multiple of 8, we hash the first n%8 blocks first
    226    mov hlp0, len
    227    and hlp0, 128-1                        ; hlp0 = len mod 128
    228    jz  Lmod_loop
    229 
    230    and len, -128                          ; len = full 8-block groups only
    231    sub hlp0, 16                           ; hlp0 doubles as the Htbl power index
    232 
    233    ; Prefix block
    234    vmovdqu DATA, XMMWORD PTR[inp]
    235    vpshufb DATA, DATA, [Lbswap_mask]
    236    vpxor   DATA, DATA, T                  ; fold current state into first block
    237 
    238    vpclmulqdq  TMP0, DATA, [Htbl + hlp0], 0h
    239    vpclmulqdq  TMP1, DATA, [Htbl + hlp0], 011h
    240    vpshufd     TMP3, DATA, 78
    241    vpxor       TMP3, TMP3, DATA
    242    vpclmulqdq  TMP2, TMP3, [Htbl + 8*16 + hlp0], 0h
    243 
    244    lea     inp, [inp+16]
    245    test    hlp0, hlp0
    246    jnz     Lpre_loop
    247    jmp     Lred1
    248 
    249    ;hash remaining prefix blocks (up to 7 total prefix blocks)
    250 Lpre_loop:
    251 
    252        sub hlp0, 16                       ; next-lower power of H
    253 
    254        vmovdqu DATA, XMMWORD PTR[inp]
    255        vpshufb DATA, DATA, [Lbswap_mask]
    256 
    257        vpclmulqdq  TMP3, DATA, [Htbl + hlp0], 0h
    258        vpxor       TMP0, TMP0, TMP3
    259        vpclmulqdq  TMP3, DATA, [Htbl + hlp0], 011h
    260        vpxor       TMP1, TMP1, TMP3
    261        vpshufd     TMP3, DATA, 78
    262        vpxor       TMP3, TMP3, DATA
    263        vpclmulqdq  TMP3, TMP3, [Htbl + 8*16 + hlp0], 0h
    264        vpxor       TMP2, TMP2, TMP3
    265 
    266        test    hlp0, hlp0
    267        lea     inp, [inp+16]
    268        jnz     Lpre_loop
    269 
    270 Lred1:
        ; Karatsuba fixup of the prefix: middle ^= low^high, then split
        ; into T (low 128) and Xhi (high 128); reduction deferred.
    271 
    272    vpxor       TMP2, TMP2, TMP0
    273    vpxor       TMP2, TMP2, TMP1
    274    vpsrldq     TMP3, TMP2, 8
    275    vpslldq     TMP2, TMP2, 8
    276 
    277    vpxor       Xhi, TMP1, TMP3
    278    vpxor       T, TMP0, TMP2
    279 
    280 
    281 Lmod_loop:
        ; Main loop: hash 8 blocks per iteration against H^8..H^1, with
        ; the two reduction fold steps of the previous T interleaved.
    282 
    283        sub len, 16*8
    284        jb  Ldone
    285        ; Block #0
    286        vmovdqu DATA, XMMWORD PTR[inp + 16*7]
    287        vpshufb DATA, DATA, [Lbswap_mask]
    288 
    289        vpclmulqdq  TMP0, DATA, [Htbl + 0*16], 0h
    290        vpclmulqdq  TMP1, DATA, [Htbl + 0*16], 011h
    291        vpshufd     TMP3, DATA, 78
    292        vpxor       TMP3, TMP3, DATA
    293        vpclmulqdq  TMP2, TMP3, [Htbl + 8*16 + 0*16], 0h
    294 
    295        ; Block #1
    296        vmovdqu DATA, XMMWORD PTR[inp + 16*6]
    297        vpshufb DATA, DATA, [Lbswap_mask]
    298        KARATSUBA_AAD 1
    299 
    300        ; Block #2
    301        vmovdqu DATA, XMMWORD PTR[inp + 16*5]
    302        vpshufb DATA, DATA, [Lbswap_mask]
    303 
    304        vpclmulqdq  TMP4, T, [Lpoly], 010h         ;reduction stage 1a
    305        vpalignr    T, T, T, 8
    306 
    307        KARATSUBA_AAD 2
    308 
    309        vpxor       T, T, TMP4                          ;reduction stage 1b
    310 
    311        ; Block #3
    312        vmovdqu DATA, XMMWORD PTR[inp + 16*4]
    313        vpshufb DATA, DATA, [Lbswap_mask]
    314        KARATSUBA_AAD 3
    315        ; Block #4
    316        vmovdqu DATA, XMMWORD PTR[inp + 16*3]
    317        vpshufb DATA, DATA, [Lbswap_mask]
    318 
    319        vpclmulqdq  TMP4, T, [Lpoly], 010h        ;reduction stage 2a
    320        vpalignr    T, T, T, 8
    321 
    322        KARATSUBA_AAD 4
    323 
    324        vpxor       T, T, TMP4                          ;reduction stage 2b
    325        ; Block #5
    326        vmovdqu DATA, XMMWORD PTR[inp + 16*2]
    327        vpshufb DATA, DATA, [Lbswap_mask]
    328        KARATSUBA_AAD 5
    329 
    330        vpxor   T, T, Xhi                               ;reduction finalize
    331        ; Block #6
    332        vmovdqu DATA, XMMWORD PTR[inp + 16*1]
    333        vpshufb DATA, DATA, [Lbswap_mask]
    334        KARATSUBA_AAD 6
    335        ; Block #7
    336        vmovdqu DATA, XMMWORD PTR[inp + 16*0]
    337        vpshufb DATA, DATA, [Lbswap_mask]
    338        vpxor   DATA, DATA, T                           ; fold reduced state into oldest block
    339        KARATSUBA_AAD 7
    340        ; Aggregated 8 blocks, now karatsuba fixup
    341        vpxor   TMP2, TMP2, TMP0
    342        vpxor   TMP2, TMP2, TMP1
    343        vpsrldq TMP3, TMP2, 8
    344        vpslldq TMP2, TMP2, 8
    345 
    346        vpxor   Xhi, TMP1, TMP3
    347        vpxor   T, TMP0, TMP2
    348 
    349        lea inp, [inp + 16*8]
    350        jmp Lmod_loop
    351 
    352 Ldone:
        ; Final reduction of the last T/Xhi pair, then store the state.
    353    vpclmulqdq  TMP4, T, [Lpoly], 010h
    354    vpalignr    T, T, T, 8
    355    vpxor       T, T, TMP4
    356 
    357    vpclmulqdq  TMP4, T, [Lpoly], 010h
    358    vpalignr    T, T, T, 8
    359    vpxor       T, T, TMP4
    360 
    361    vpxor       T, T, Xhi
    362    vmovdqu     XMMWORD PTR[Tp], T
    363    vzeroupper
    364 
    365    vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]  ; restore callee-saved xmms
    366    vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
    367    add rsp, 16*2
    368 
    369    ret
    370 
    371 intel_aes_gcmAAD ENDP
    372 
    373 
    374 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    375 ;
    376 ; Encrypt and Authenticate
    377 ; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsigned int len);
    378 ;
    379 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    380 
    381 ALIGN 16
    382 intel_aes_gcmENC PROC
        ; AES-GCM encrypt + authenticate, Microsoft x64 ABI:
        ;   rcx=PT (plaintext), rdx=CT (ciphertext out), r8=Gctx, r9=len.
        ; Gctx layout as used below: Htbl at offset 0 (16*16 bytes), GHASH
        ; state T at 16*16+16, counter block at 16*16+32, key-schedule
        ; pointer at 16*16+48. Htbl and Gctx alias the same register (r8).
    383 
    384 PT      textequ <rcx>
    385 CT      textequ <rdx>
    386 Htbl    textequ <r8>                      ; alias of Gctx: Htbl sits at Gctx offset 0
    387 Gctx    textequ <r8>
    388 len     textequ <r9>
    389 KS      textequ <r10>
    390 NR      textequ <eax>
    391 
    392 aluCTR  textequ <r11d>                    ; counter tracked in GPRs, host byte order
    393 aluKSl  textequ <r12d>
    394 aluTMP  textequ <r13d>
    395 
    396 T       textequ <xmm0>
    397 TMP0    textequ <xmm1>
    398 TMP1    textequ <xmm2>
    399 TMP2    textequ <xmm3>
    400 TMP3    textequ <xmm4>
    401 TMP4    textequ <xmm5>
    402 TMP5    textequ <xmm6>
    403 CTR0    textequ <xmm7>
    404 CTR1    textequ <xmm8>
    405 CTR2    textequ <xmm9>
    406 CTR3    textequ <xmm10>
    407 CTR4    textequ <xmm11>
    408 CTR5    textequ <xmm12>
    409 CTR6    textequ <xmm13>
    410 CTR7    textequ <xmm14>
    411 BSWAPMASK   textequ <xmm15>
    412 
    413 ROUND MACRO i
        ; One AES round with round key i applied to all 8 counter blocks.
    414    vmovdqu TMP3, XMMWORD PTR[i*16 + KS]
    415    vaesenc CTR0, CTR0, TMP3
    416    vaesenc CTR1, CTR1, TMP3
    417    vaesenc CTR2, CTR2, TMP3
    418    vaesenc CTR3, CTR3, TMP3
    419    vaesenc CTR4, CTR4, TMP3
    420    vaesenc CTR5, CTR5, TMP3
    421    vaesenc CTR6, CTR6, TMP3
    422    vaesenc CTR7, CTR7, TMP3
    423 ENDM
    424 ROUNDMUL MACRO i
        ; One AES round (key i) on all 8 counters, interleaved with one
        ; Karatsuba GHASH accumulation of TMP5 against Htbl entry i.
        ; Accumulators: TMP0 (middle), TMP1 (high), TMP2 (low).
    425    vmovdqu TMP3, XMMWORD PTR[i*16 + KS]
    426 
    427    vaesenc CTR0, CTR0, TMP3
    428    vaesenc CTR1, CTR1, TMP3
    429    vaesenc CTR2, CTR2, TMP3
    430    vaesenc CTR3, CTR3, TMP3
    431 
    432    vpshufd TMP4, TMP5, 78
    433    vpxor   TMP4, TMP4, TMP5               ; hi^lo of TMP5 for the middle product
    434 
    435    vaesenc CTR4, CTR4, TMP3
    436    vaesenc CTR5, CTR5, TMP3
    437    vaesenc CTR6, CTR6, TMP3
    438    vaesenc CTR7, CTR7, TMP3
    439 
    440    vpclmulqdq  TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
    441    vpxor       TMP0, TMP0, TMP3
    442    vmovdqu     TMP4, XMMWORD PTR[i*16 + Htbl]
    443    vpclmulqdq  TMP3, TMP5, TMP4, 011h
    444    vpxor       TMP1, TMP1, TMP3
    445    vpclmulqdq  TMP3, TMP5, TMP4, 000h
    446    vpxor       TMP2, TMP2, TMP3
    447 ENDM
    448 KARATSUBA MACRO i
        ; GHASH-only Karatsuba accumulate of TMP5 * Htbl[i] into
        ; TMP0/TMP1/TMP2 (same accumulators as ROUNDMUL, no AES work).
    449    vpshufd TMP4, TMP5, 78
    450    vpxor   TMP4, TMP4, TMP5
    451    vpclmulqdq  TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
    452    vpxor       TMP0, TMP0, TMP3
    453    vmovdqu     TMP4, XMMWORD PTR[i*16 + Htbl]
    454    vpclmulqdq  TMP3, TMP5, TMP4, 011h
    455    vpxor       TMP1, TMP1, TMP3
    456    vpclmulqdq  TMP3, TMP5, TMP4, 000h
    457    vpxor       TMP2, TMP2, TMP3
    458 ENDM
    459 NEXTCTR MACRO i
        ; Advance the GPR-side counter and patch the last dword of the
        ; pre-whitened counter block i kept on the stack. The stored value
        ; is bswap(ctr ^ aluKSl) = bigendian(ctr) ^ K0.dword3, since the
        ; stack blocks are saved already XORed with round key 0.
    460    add aluCTR, 1
    461    mov aluTMP, aluCTR
    462    xor aluTMP, aluKSl
    463    bswap   aluTMP
    464    mov [3*4 + 8*16 + i*16 + rsp], aluTMP
    465 ENDM
    466 
    467 
    468    test  len, len
    469    jnz   LbeginENC
    470    ret                                    ; zero-length: nothing to do
    471 
    472 LbeginENC:
    473 
    474    vzeroupper
    475    push    r11                            ; r12/r13 are callee-saved (Win64);
    476    push    r12                            ; r11 saved too although volatile
    477    push    r13
    478    push    rbp
    479    sub rsp, 10*16                         ; save callee-saved xmm6-xmm15 (Win64)
    480    vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
    481    vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
    482    vmovdqu XMMWORD PTR[rsp + 2*16], xmm8
    483    vmovdqu XMMWORD PTR[rsp + 3*16], xmm9
    484    vmovdqu XMMWORD PTR[rsp + 4*16], xmm10
    485    vmovdqu XMMWORD PTR[rsp + 5*16], xmm11
    486    vmovdqu XMMWORD PTR[rsp + 6*16], xmm12
    487    vmovdqu XMMWORD PTR[rsp + 7*16], xmm13
    488    vmovdqu XMMWORD PTR[rsp + 8*16], xmm14
    489    vmovdqu XMMWORD PTR[rsp + 9*16], xmm15
    490 
    491    mov rbp, rsp                           ; rbp anchors the save area
    492    sub rsp, 16*16                         ; 16-aligned scratch: 8 saved CT blocks
    493    and rsp, -16                           ; at [rsp], 8 counter blocks at [rsp+8*16]
    494 
    495    vmovdqu T, XMMWORD PTR[16*16 + 1*16 + Gctx]       ; GHASH state
    496    vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]    ; current counter block
    497    vmovdqu BSWAPMASK, XMMWORD PTR[Lbswap_mask]
    498    mov     KS, [16*16 + 3*16 + Gctx]                 ; key-schedule pointer
    499    mov     NR, [244 + KS]                 ; round count at schedule offset 244
    500    lea     KS, [KS]                       ; no-op lea (kept as in original)
    501 
    502    vpshufb CTR0, CTR0, BSWAPMASK
    503 
    504    mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx]           ; low dword of the counter
    505    mov aluKSl, [3*4 + KS]                            ; matching dword of round key 0
    506    bswap   aluCTR
    507    bswap   aluKSl
    508 
    509    vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
    510    vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]  ; pre-whiten: K0 ^ counter
    511    vmovdqu XMMWORD PTR[8*16 + 0*16 + rsp], TMP0
    512 
    513    cmp len, 128
    514    jb  LEncDataSingles
    515 ; Prepare the "top" counters (all 8 stack slots start from the same block;
    516    vmovdqu XMMWORD PTR[8*16 + 1*16 + rsp], TMP0      ; NEXTCTR patches each one's
    517    vmovdqu XMMWORD PTR[8*16 + 2*16 + rsp], TMP0      ; last dword later)
    518    vmovdqu XMMWORD PTR[8*16 + 3*16 + rsp], TMP0
    519    vmovdqu XMMWORD PTR[8*16 + 4*16 + rsp], TMP0
    520    vmovdqu XMMWORD PTR[8*16 + 5*16 + rsp], TMP0
    521    vmovdqu XMMWORD PTR[8*16 + 6*16 + rsp], TMP0
    522    vmovdqu XMMWORD PTR[8*16 + 7*16 + rsp], TMP0
    523 
    524 ; Encrypt the initial 8 blocks
    525    sub len, 128
    526    vpaddd  CTR1, CTR0, XMMWORD PTR[Lone]  ; counters built by SIMD adds here;
    527    vpaddd  CTR2, CTR0, XMMWORD PTR[Ltwo]  ; later iterations use the ALU path
    528    vpaddd  CTR3, CTR2, XMMWORD PTR[Lone]
    529    vpaddd  CTR4, CTR2, XMMWORD PTR[Ltwo]
    530    vpaddd  CTR5, CTR4, XMMWORD PTR[Lone]
    531    vpaddd  CTR6, CTR4, XMMWORD PTR[Ltwo]
    532    vpaddd  CTR7, CTR6, XMMWORD PTR[Lone]
    533 
    534    vpshufb CTR0, CTR0, BSWAPMASK          ; back to big-endian wire format
    535    vpshufb CTR1, CTR1, BSWAPMASK
    536    vpshufb CTR2, CTR2, BSWAPMASK
    537    vpshufb CTR3, CTR3, BSWAPMASK
    538    vpshufb CTR4, CTR4, BSWAPMASK
    539    vpshufb CTR5, CTR5, BSWAPMASK
    540    vpshufb CTR6, CTR6, BSWAPMASK
    541    vpshufb CTR7, CTR7, BSWAPMASK
    542 
    543    vmovdqu TMP3, XMMWORD PTR[0*16 + KS]   ; whitening round
    544    vpxor   CTR0, CTR0, TMP3
    545    vpxor   CTR1, CTR1, TMP3
    546    vpxor   CTR2, CTR2, TMP3
    547    vpxor   CTR3, CTR3, TMP3
    548    vpxor   CTR4, CTR4, TMP3
    549    vpxor   CTR5, CTR5, TMP3
    550    vpxor   CTR6, CTR6, TMP3
    551    vpxor   CTR7, CTR7, TMP3
    552 
    553    ROUND   1
    554 
    555    add aluCTR, 8                          ; inline NEXTCTR 0 with a +8 step:
    556    mov aluTMP, aluCTR                     ; stack slot 0 becomes the 9th counter
    557    xor aluTMP, aluKSl
    558    bswap   aluTMP
    559    mov [8*16 + 0*16 + 3*4 + rsp], aluTMP
    560 
    561    ROUND   2                              ; AES rounds interleaved with counter
    562    NEXTCTR 1                              ; preparation for the next iteration
    563    ROUND   3
    564    NEXTCTR 2
    565    ROUND   4
    566    NEXTCTR 3
    567    ROUND   5
    568    NEXTCTR 4
    569    ROUND   6
    570    NEXTCTR 5
    571    ROUND   7
    572    NEXTCTR 6
    573    ROUND   8
    574    NEXTCTR 7
    575    ROUND   9
    576    vmovdqu TMP5, XMMWORD PTR[10*16 + KS]  ; last round key for AES-128
    577    cmp     NR, 10
    578    je      @f
    579 
    580    ROUND   10
    581    ROUND   11
    582    vmovdqu TMP5, XMMWORD PTR[12*16 + KS]  ; last round key for AES-192
    583    cmp     NR, 12
    584    je      @f
    585 
    586    ROUND   12
    587    ROUND   13
    588    vmovdqu TMP5, XMMWORD PTR[14*16 + KS]  ; last round key for AES-256
    589 @@:
    590    vpxor   TMP3, TMP5, XMMWORD PTR[0*16 + PT]  ; fold PT into the last round:
    591    vaesenclast CTR0, CTR0, TMP3                ; vaesenclast output = CT block
    592    vpxor   TMP3, TMP5, XMMWORD PTR[1*16 + PT]
    593    vaesenclast CTR1, CTR1, TMP3
    594    vpxor   TMP3, TMP5, XMMWORD PTR[2*16 + PT]
    595    vaesenclast CTR2, CTR2, TMP3
    596    vpxor   TMP3, TMP5, XMMWORD PTR[3*16 + PT]
    597    vaesenclast CTR3, CTR3, TMP3
    598    vpxor   TMP3, TMP5, XMMWORD PTR[4*16 + PT]
    599    vaesenclast CTR4, CTR4, TMP3
    600    vpxor   TMP3, TMP5, XMMWORD PTR[5*16 + PT]
    601    vaesenclast CTR5, CTR5, TMP3
    602    vpxor   TMP3, TMP5, XMMWORD PTR[6*16 + PT]
    603    vaesenclast CTR6, CTR6, TMP3
    604    vpxor   TMP3, TMP5, XMMWORD PTR[7*16 + PT]
    605    vaesenclast CTR7, CTR7, TMP3
    606 
    607    vmovdqu XMMWORD PTR[0*16 + CT], CTR0   ; write ciphertext, and byte-swap
    608    vpshufb CTR0, CTR0, BSWAPMASK          ; each block for the GHASH pass
    609    vmovdqu XMMWORD PTR[1*16 + CT], CTR1
    610    vpshufb CTR1, CTR1, BSWAPMASK
    611    vmovdqu XMMWORD PTR[2*16 + CT], CTR2
    612    vpshufb CTR2, CTR2, BSWAPMASK
    613    vmovdqu XMMWORD PTR[3*16 + CT], CTR3
    614    vpshufb CTR3, CTR3, BSWAPMASK
    615    vmovdqu XMMWORD PTR[4*16 + CT], CTR4
    616    vpshufb CTR4, CTR4, BSWAPMASK
    617    vmovdqu XMMWORD PTR[5*16 + CT], CTR5
    618    vpshufb CTR5, CTR5, BSWAPMASK
    619    vmovdqu XMMWORD PTR[6*16 + CT], CTR6
    620    vpshufb CTR6, CTR6, BSWAPMASK
    621    vmovdqu XMMWORD PTR[7*16 + CT], CTR7
    622    vpshufb TMP5, CTR7, BSWAPMASK          ; newest CT block goes straight to TMP5
    623 
    624    vmovdqa XMMWORD PTR[1*16 + rsp], CTR6  ; stash the other 7 swapped CT blocks;
    625    vmovdqa XMMWORD PTR[2*16 + rsp], CTR5  ; GHASH is deferred to the next loop
    626    vmovdqa XMMWORD PTR[3*16 + rsp], CTR4  ; iteration (or LEndEncOctets)
    627    vmovdqa XMMWORD PTR[4*16 + rsp], CTR3
    628    vmovdqa XMMWORD PTR[5*16 + rsp], CTR2
    629    vmovdqa XMMWORD PTR[6*16 + rsp], CTR1
    630    vmovdqa XMMWORD PTR[7*16 + rsp], CTR0
    631 
    632    lea CT, [8*16 + CT]
    633    lea PT, [8*16 + PT]
    634    jmp LEncDataOctets
    635 
    636 LEncDataOctets:
        ; Main loop: encrypt the next 8 counter blocks while GHASH-ing the
        ; previous 8 ciphertext blocks (TMP5 + the 7 stack slots), with the
        ; polynomial reduction folded in between AES rounds.
    637        cmp len, 128
    638        jb  LEndEncOctets
    639        sub len, 128
    640 
    641        vmovdqa CTR0, XMMWORD PTR[8*16 + 0*16 + rsp]      ; pre-whitened counters
    642        vmovdqa CTR1, XMMWORD PTR[8*16 + 1*16 + rsp]      ; prepared by NEXTCTR
    643        vmovdqa CTR2, XMMWORD PTR[8*16 + 2*16 + rsp]
    644        vmovdqa CTR3, XMMWORD PTR[8*16 + 3*16 + rsp]
    645        vmovdqa CTR4, XMMWORD PTR[8*16 + 4*16 + rsp]
    646        vmovdqa CTR5, XMMWORD PTR[8*16 + 5*16 + rsp]
    647        vmovdqa CTR6, XMMWORD PTR[8*16 + 6*16 + rsp]
    648        vmovdqa CTR7, XMMWORD PTR[8*16 + 7*16 + rsp]
    649 
    650        vpshufd TMP4, TMP5, 78             ; start accumulators with the newest
    651        vpxor   TMP4, TMP4, TMP5           ; CT block times Htbl[0]
    652        vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
    653        vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
    654        vpclmulqdq  TMP1, TMP5, TMP4, 011h
    655        vpclmulqdq  TMP2, TMP5, TMP4, 000h
    656 
    657        vmovdqu TMP5, XMMWORD PTR[1*16 + rsp]
    658        ROUNDMUL 1                         ; AES round + GHASH accumulate,
    659        NEXTCTR 0                          ; + prepare counter for next pass
    660        vmovdqu TMP5, XMMWORD PTR[2*16 + rsp]
    661        ROUNDMUL 2
    662        NEXTCTR 1
    663        vmovdqu TMP5, XMMWORD PTR[3*16 + rsp]
    664        ROUNDMUL 3
    665        NEXTCTR 2
    666        vmovdqu TMP5, XMMWORD PTR[4*16 + rsp]
    667        ROUNDMUL 4
    668        NEXTCTR 3
    669        vmovdqu TMP5, XMMWORD PTR[5*16 + rsp]
    670        ROUNDMUL 5
    671        NEXTCTR 4
    672        vmovdqu TMP5, XMMWORD PTR[6*16 + rsp]
    673        ROUNDMUL 6
    674        NEXTCTR 5
    675        vpxor   TMP5, T, XMMWORD PTR[7*16 + rsp]  ; fold GHASH state into oldest block
    676        ROUNDMUL 7
    677        NEXTCTR 6
    678 
    679        ROUND 8
    680        NEXTCTR 7
    681 
    682        vpxor   TMP0, TMP0, TMP1           ; Karatsuba fixup: middle ^= high+low
    683        vpxor   TMP0, TMP0, TMP2
    684        vpsrldq TMP3, TMP0, 8
    685        vpxor   TMP4, TMP1, TMP3           ; TMP4 = high 128 bits (Xhi)
    686        vpslldq TMP3, TMP0, 8
    687        vpxor   T, TMP2, TMP3              ; T = low 128 bits
    688 
    689        vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h     ; reduction fold 1
    690        vpalignr    T,T,T,8
    691        vpxor       T, T, TMP1
    692 
    693        ROUND 9
    694 
    695        vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h     ; reduction fold 2
    696        vpalignr    T,T,T,8
    697        vpxor       T, T, TMP1
    698 
    699        vmovdqu     TMP5, XMMWORD PTR[10*16 + KS]         ; AES-128 final key
    700        cmp         NR, 10
    701        je          @f
    702 
    703        ROUND 10
    704        ROUND 11
    705        vmovdqu     TMP5, XMMWORD PTR[12*16 + KS]         ; AES-192 final key
    706        cmp         NR, 12
    707        je          @f
    708 
    709        ROUND 12
    710        ROUND 13
    711        vmovdqu     TMP5, XMMWORD PTR[14*16 + KS]         ; AES-256 final key
    712 @@:
    713        vpxor   TMP3, TMP5, XMMWORD PTR[0*16 + PT]        ; last round merged with PT
    714        vaesenclast CTR0, CTR0, TMP3
    715        vpxor   TMP3, TMP5, XMMWORD PTR[1*16 + PT]
    716        vaesenclast CTR1, CTR1, TMP3
    717        vpxor   TMP3, TMP5, XMMWORD PTR[2*16 + PT]
    718        vaesenclast CTR2, CTR2, TMP3
    719        vpxor   TMP3, TMP5, XMMWORD PTR[3*16 + PT]
    720        vaesenclast CTR3, CTR3, TMP3
    721        vpxor   TMP3, TMP5, XMMWORD PTR[4*16 + PT]
    722        vaesenclast CTR4, CTR4, TMP3
    723        vpxor   TMP3, TMP5, XMMWORD PTR[5*16 + PT]
    724        vaesenclast CTR5, CTR5, TMP3
    725        vpxor   TMP3, TMP5, XMMWORD PTR[6*16 + PT]
    726        vaesenclast CTR6, CTR6, TMP3
    727        vpxor   TMP3, TMP5, XMMWORD PTR[7*16 + PT]
    728        vaesenclast CTR7, CTR7, TMP3
    729 
    730        vmovdqu XMMWORD PTR[0*16 + CT], CTR0              ; emit CT, byte-swap for GHASH
    731        vpshufb CTR0, CTR0, BSWAPMASK
    732        vmovdqu XMMWORD PTR[1*16 + CT], CTR1
    733        vpshufb CTR1, CTR1, BSWAPMASK
    734        vmovdqu XMMWORD PTR[2*16 + CT], CTR2
    735        vpshufb CTR2, CTR2, BSWAPMASK
    736        vmovdqu XMMWORD PTR[3*16 + CT], CTR3
    737        vpshufb CTR3, CTR3, BSWAPMASK
    738        vmovdqu XMMWORD PTR[4*16 + CT], CTR4
    739        vpshufb CTR4, CTR4, BSWAPMASK
    740        vmovdqu XMMWORD PTR[5*16 + CT], CTR5
    741        vpshufb CTR5, CTR5, BSWAPMASK
    742        vmovdqu XMMWORD PTR[6*16 + CT], CTR6
    743        vpshufb CTR6, CTR6, BSWAPMASK
    744        vmovdqu XMMWORD PTR[7*16 + CT], CTR7
    745        vpshufb TMP5, CTR7, BSWAPMASK                     ; newest block stays in TMP5
    746 
    747        vmovdqa XMMWORD PTR[1*16 + rsp], CTR6             ; stash the rest for the
    748        vmovdqa XMMWORD PTR[2*16 + rsp], CTR5             ; next iteration's GHASH
    749        vmovdqa XMMWORD PTR[3*16 + rsp], CTR4
    750        vmovdqa XMMWORD PTR[4*16 + rsp], CTR3
    751        vmovdqa XMMWORD PTR[5*16 + rsp], CTR2
    752        vmovdqa XMMWORD PTR[6*16 + rsp], CTR1
    753        vmovdqa XMMWORD PTR[7*16 + rsp], CTR0
    754 
    755        vpxor   T, T, TMP4                 ; complete the deferred reduction (add Xhi)
    756 
    757        lea CT, [8*16 + CT]
    758        lea PT, [8*16 + PT]
    759        jmp LEncDataOctets
    760 
    761 LEndEncOctets:
        ; Fewer than 8 blocks remain: GHASH the last 8 saved ciphertext
        ; blocks (no AES interleave) and fully reduce the state.
    762 
    763    vpshufd TMP4, TMP5, 78
    764    vpxor   TMP4, TMP4, TMP5
    765    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
    766    vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
    767    vpclmulqdq  TMP1, TMP5, TMP4, 011h
    768    vpclmulqdq  TMP2, TMP5, TMP4, 000h
    769 
    770    vmovdqu TMP5, XMMWORD PTR[1*16 + rsp]
    771    KARATSUBA 1
    772    vmovdqu TMP5, XMMWORD PTR[2*16 + rsp]
    773    KARATSUBA 2
    774    vmovdqu TMP5, XMMWORD PTR[3*16 + rsp]
    775    KARATSUBA 3
    776    vmovdqu TMP5, XMMWORD PTR[4*16 + rsp]
    777    KARATSUBA 4
    778    vmovdqu TMP5, XMMWORD PTR[5*16 + rsp]
    779    KARATSUBA 5
    780    vmovdqu TMP5, XMMWORD PTR[6*16 + rsp]
    781    KARATSUBA 6
    782    vpxor   TMP5, T, XMMWORD PTR[7*16 + rsp]  ; fold state into the oldest block
    783    KARATSUBA 7
    784 
    785    vpxor   TMP0, TMP0, TMP1               ; Karatsuba fixup
    786    vpxor   TMP0, TMP0, TMP2
    787    vpsrldq TMP3, TMP0, 8
    788    vpxor   TMP4, TMP1, TMP3               ; high 128 bits
    789    vpslldq TMP3, TMP0, 8
    790    vpxor   T, TMP2, TMP3                  ; low 128 bits
    791 
    792    vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h     ; reduction fold 1
    793    vpalignr    T,T,T,8
    794    vpxor       T, T, TMP1
    795 
    796    vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h     ; reduction fold 2
    797    vpalignr    T,T,T,8
    798    vpxor       T, T, TMP1
    799 
    800    vpxor       T, T, TMP4
    801 
    802    sub aluCTR, 7                          ; NOTE(review): counter was advanced for a
                                                  ; full next octet; step back before singles
    803 
    804 LEncDataSingles:
        ; Encrypt remaining whole 16-byte blocks one at a time, GHASH-ing
        ; each ciphertext block with a full GFMUL.
    805 
    806        cmp len, 16
    807        jb  LEncDataTail
    808        sub len, 16
    809 
    810        vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + rsp]  ; pre-whitened counter block
    811        NEXTCTR 0                                     ; prepare the next one
    812 
    813        vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    814        vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    815        vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    816        vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    817        vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    818        vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    819        vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    820        vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    821        vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    822        vmovdqu TMP2, XMMWORD PTR[10*16 + KS]         ; AES-128 final key
    823        cmp NR, 10
    824        je  @f
    825        vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    826        vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    827        vmovdqu TMP2, XMMWORD PTR[12*16 + KS]         ; AES-192 final key
    828        cmp NR, 12
    829        je  @f
    830        vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    831        vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    832        vmovdqu TMP2, XMMWORD PTR[14*16 + KS]         ; AES-256 final key
    833 @@:
    834        vaesenclast TMP1, TMP1, TMP2
    835        vpxor   TMP1, TMP1, XMMWORD PTR[PT]           ; CT = E(ctr) ^ PT
    836        vmovdqu XMMWORD PTR[CT], TMP1
    837 
    838        lea PT, [16+PT]
    839        lea CT, [16+CT]
    840 
    841        vpshufb TMP1, TMP1, BSWAPMASK                 ; GHASH the new CT block
    842        vpxor   T, T, TMP1
    843        vmovdqu TMP0, XMMWORD PTR[Htbl]
    844        GFMUL   T, T, TMP0, TMP1, TMP2, TMP3, TMP4
    845 
    846        jmp LEncDataSingles
    847 
    848 LEncDataTail:
        ; Final partial block (1..15 bytes): encrypt one counter block,
        ; copy len plaintext bytes through a zeroed stack scratch, XOR,
        ; write len ciphertext bytes, zero-pad the scratch, then GHASH it.
        ; KS (r10) and al are reused as scratch here — the key schedule
        ; and round count are no longer needed.
    849 
    850    test    len, len
    851    jz  LEncDataEnd
    852 
    853    vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + rsp]      ; pre-whitened counter block
    854 
    855    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    856    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    857    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    858    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    859    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    860    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    861    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    862    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    863    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    864    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]             ; AES-128 final key
    865    cmp NR, 10
    866    je  @f
    867    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    868    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    869    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]             ; AES-192 final key
    870    cmp NR, 12
    871    je  @f
    872    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    873    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    874    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]             ; AES-256 final key
    875 @@:
    876    vaesenclast TMP1, TMP1, TMP2
    877 ; zero a temp location
    878    vpxor   TMP2, TMP2, TMP2
    879    vmovdqa XMMWORD PTR[rsp], TMP2
    880 ; copy as many bytes as needed
    881    xor KS, KS                             ; KS now serves as the byte index
    882 
    883 @@:
    884        cmp len, KS                        ; copy len PT bytes into scratch
    885        je  @f
    886        mov al, [PT + KS]
    887        mov [rsp + KS], al
    888        inc KS
    889        jmp @b
    890 @@:
    891    vpxor   TMP1, TMP1, XMMWORD PTR[rsp]   ; keystream ^ padded plaintext
    892    vmovdqa XMMWORD PTR[rsp], TMP1
    893    xor KS, KS
    894 @@:
    895        cmp len, KS                        ; emit only len ciphertext bytes
    896        je  @f
    897        mov al, [rsp + KS]
    898        mov [CT + KS], al
    899        inc KS
    900        jmp @b
    901 @@:
    902        cmp KS, 16                         ; zero the keystream tail in scratch
    903        je  @f                             ; so GHASH sees CT padded with zeros
    904        mov BYTE PTR[rsp + KS], 0
    905        inc KS
    906        jmp @b
    907 @@:
    908 BAIL:
    909    vmovdqa TMP1, XMMWORD PTR[rsp]         ; GHASH the padded final block
    910    vpshufb TMP1, TMP1, BSWAPMASK
    911    vpxor   T, T, TMP1
    912    vmovdqu TMP0, XMMWORD PTR[Htbl]
    913    GFMUL   T, T, TMP0, TMP1, TMP2, TMP3, TMP4
    914 
    915 LEncDataEnd:
        ; Write the GHASH state and the (big-endian) counter dword back
        ; into Gctx, restore callee-saved registers, and return.
    916 
    917    vmovdqu XMMWORD PTR[16*16 + 1*16 + Gctx], T
    918    bswap   aluCTR
    919    mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR
    920 
    921    mov rsp, rbp                           ; discard the aligned scratch area
    922 
    923    vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]  ; restore callee-saved xmm6-xmm15
    924    vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
    925    vmovdqu xmm8, XMMWORD PTR[rsp + 2*16]
    926    vmovdqu xmm9, XMMWORD PTR[rsp + 3*16]
    927    vmovdqu xmm10, XMMWORD PTR[rsp + 4*16]
    928    vmovdqu xmm11, XMMWORD PTR[rsp + 5*16]
    929    vmovdqu xmm12, XMMWORD PTR[rsp + 6*16]
    930    vmovdqu xmm13, XMMWORD PTR[rsp + 7*16]
    931    vmovdqu xmm14, XMMWORD PTR[rsp + 8*16]
    932    vmovdqu xmm15, XMMWORD PTR[rsp + 9*16]
    933 
    934    add rsp, 10*16
    935    pop rbp
    936    pop r13
    937    pop r12
    938    pop r11
    939 
    940    vzeroupper
    941 
    942    ret
    943 intel_aes_gcmENC ENDP
    944 
    945 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    946 ;
    947 ; Decrypt and Authenticate
    948 ; void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx, unsigned int len);
    949 ;
    950 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    951 
    952 ALIGN 16
    953 intel_aes_gcmDEC PROC
    954 
    955 NEXTCTR MACRO i
    956    add aluCTR, 1
    957    mov aluTMP, aluCTR
    958    xor aluTMP, aluKSl
    959    bswap   aluTMP
    960    mov [3*4 + i*16 + rsp], aluTMP
    961 ENDM
    962 
    963 PT      textequ <rdx>
    964 CT      textequ <rcx>
    965 
    966    test  len, len
    967    jnz   LbeginDEC
    968    ret
    969 
    970 LbeginDEC:
    971 
    972    vzeroupper
    973    push    r11
    974    push    r12
    975    push    r13
    976    push    rbp
    977    sub rsp, 10*16
    978    vmovdqu XMMWORD PTR[rsp + 0*16], xmm6
    979    vmovdqu XMMWORD PTR[rsp + 1*16], xmm7
    980    vmovdqu XMMWORD PTR[rsp + 2*16], xmm8
    981    vmovdqu XMMWORD PTR[rsp + 3*16], xmm9
    982    vmovdqu XMMWORD PTR[rsp + 4*16], xmm10
    983    vmovdqu XMMWORD PTR[rsp + 5*16], xmm11
    984    vmovdqu XMMWORD PTR[rsp + 6*16], xmm12
    985    vmovdqu XMMWORD PTR[rsp + 7*16], xmm13
    986    vmovdqu XMMWORD PTR[rsp + 8*16], xmm14
    987    vmovdqu XMMWORD PTR[rsp + 9*16], xmm15
    988 
    989    mov rbp, rsp
    990    sub rsp, 8*16
    991    and rsp, -16
    992 
    993    vmovdqu T, XMMWORD PTR[16*16 + 1*16 + Gctx]
    994    vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    995    vmovdqu BSWAPMASK, XMMWORD PTR[Lbswap_mask]
    996    mov     KS, [16*16 + 3*16 + Gctx]
    997    mov     NR, [244 + KS]
    998 
    999    vpshufb CTR0, CTR0, BSWAPMASK
   1000 
   1001    mov aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
   1002    mov aluKSl, [3*4 + KS]
   1003    bswap   aluCTR
   1004    bswap   aluKSl
   1005 
   1006    vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
   1007    vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
   1008    vmovdqu XMMWORD PTR[0*16 + rsp], TMP0
   1009 
   1010    cmp len, 128
   1011    jb  LDecDataSingles
   1012 ; Prepare the "top" counters
   1013    vmovdqu XMMWORD PTR[1*16 + rsp], TMP0
   1014    vmovdqu XMMWORD PTR[2*16 + rsp], TMP0
   1015    vmovdqu XMMWORD PTR[3*16 + rsp], TMP0
   1016    vmovdqu XMMWORD PTR[4*16 + rsp], TMP0
   1017    vmovdqu XMMWORD PTR[5*16 + rsp], TMP0
   1018    vmovdqu XMMWORD PTR[6*16 + rsp], TMP0
   1019    vmovdqu XMMWORD PTR[7*16 + rsp], TMP0
   1020 
   1021    NEXTCTR 1
   1022    NEXTCTR 2
   1023    NEXTCTR 3
   1024    NEXTCTR 4
   1025    NEXTCTR 5
   1026    NEXTCTR 6
   1027    NEXTCTR 7
   1028 
   1029 LDecDataOctets:
   1030        cmp len, 128
   1031        jb  LEndDecOctets
   1032        sub len, 128
   1033 
   1034        vmovdqa CTR0, XMMWORD PTR[0*16 + rsp]
   1035        vmovdqa CTR1, XMMWORD PTR[1*16 + rsp]
   1036        vmovdqa CTR2, XMMWORD PTR[2*16 + rsp]
   1037        vmovdqa CTR3, XMMWORD PTR[3*16 + rsp]
   1038        vmovdqa CTR4, XMMWORD PTR[4*16 + rsp]
   1039        vmovdqa CTR5, XMMWORD PTR[5*16 + rsp]
   1040        vmovdqa CTR6, XMMWORD PTR[6*16 + rsp]
   1041        vmovdqa CTR7, XMMWORD PTR[7*16 + rsp]
   1042 
   1043        vmovdqu TMP5, XMMWORD PTR[7*16 + CT]
   1044        vpshufb TMP5, TMP5, BSWAPMASK
   1045        vpshufd TMP4, TMP5, 78
   1046        vpxor   TMP4, TMP4, TMP5
   1047        vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
   1048        vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
   1049        vpclmulqdq  TMP1, TMP5, TMP4, 011h
   1050        vpclmulqdq  TMP2, TMP5, TMP4, 000h
   1051 
   1052        vmovdqu TMP5, XMMWORD PTR[6*16 + CT]
   1053        vpshufb TMP5, TMP5, BSWAPMASK
   1054        ROUNDMUL 1
   1055        NEXTCTR 0
   1056        vmovdqu TMP5, XMMWORD PTR[5*16 + CT]
   1057        vpshufb TMP5, TMP5, BSWAPMASK
   1058        ROUNDMUL 2
   1059        NEXTCTR 1
   1060        vmovdqu TMP5, XMMWORD PTR[4*16 + CT]
   1061        vpshufb TMP5, TMP5, BSWAPMASK
   1062        ROUNDMUL 3
   1063        NEXTCTR 2
   1064        vmovdqu TMP5, XMMWORD PTR[3*16 + CT]
   1065        vpshufb TMP5, TMP5, BSWAPMASK
   1066        ROUNDMUL 4
   1067        NEXTCTR 3
   1068        vmovdqu TMP5, XMMWORD PTR[2*16 + CT]
   1069        vpshufb TMP5, TMP5, BSWAPMASK
   1070        ROUNDMUL 5
   1071        NEXTCTR 4
   1072        vmovdqu TMP5, XMMWORD PTR[1*16 + CT]
   1073        vpshufb TMP5, TMP5, BSWAPMASK
   1074        ROUNDMUL 6
   1075        NEXTCTR 5
   1076        vmovdqu TMP5, XMMWORD PTR[0*16 + CT]
   1077        vpshufb TMP5, TMP5, BSWAPMASK
   1078        vpxor   TMP5, TMP5, T
   1079        ROUNDMUL 7
   1080        NEXTCTR 6
   1081 
   1082        ROUND 8
   1083        NEXTCTR 7
   1084 
   1085        vpxor   TMP0, TMP0, TMP1
   1086        vpxor   TMP0, TMP0, TMP2
   1087        vpsrldq TMP3, TMP0, 8
   1088        vpxor   TMP4, TMP1, TMP3
   1089        vpslldq TMP3, TMP0, 8
   1090        vpxor   T, TMP2, TMP3
   1091 
   1092        vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
   1093        vpalignr    T,T,T,8
   1094        vpxor       T, T, TMP1
   1095 
   1096        ROUND 9
   1097 
   1098        vpclmulqdq  TMP1, T, XMMWORD PTR[Lpoly], 010h
   1099        vpalignr    T,T,T,8
   1100        vpxor       T, T, TMP1
   1101 
   1102        vmovdqu     TMP5, XMMWORD PTR[10*16 + KS]
   1103        cmp         NR, 10
   1104        je          @f
   1105 
   1106        ROUND 10
   1107        ROUND 11
   1108        vmovdqu     TMP5, XMMWORD PTR[12*16 + KS]
   1109        cmp         NR, 12
   1110        je          @f
   1111 
   1112        ROUND 12
   1113        ROUND 13
   1114        vmovdqu     TMP5, XMMWORD PTR[14*16 + KS]
   1115 @@:
   1116        vpxor   TMP3, TMP5, XMMWORD PTR[0*16 + CT]
   1117        vaesenclast CTR0, CTR0, TMP3
   1118        vpxor   TMP3, TMP5, XMMWORD PTR[1*16 + CT]
   1119        vaesenclast CTR1, CTR1, TMP3
   1120        vpxor   TMP3, TMP5, XMMWORD PTR[2*16 + CT]
   1121        vaesenclast CTR2, CTR2, TMP3
   1122        vpxor   TMP3, TMP5, XMMWORD PTR[3*16 + CT]
   1123        vaesenclast CTR3, CTR3, TMP3
   1124        vpxor   TMP3, TMP5, XMMWORD PTR[4*16 + CT]
   1125        vaesenclast CTR4, CTR4, TMP3
   1126        vpxor   TMP3, TMP5, XMMWORD PTR[5*16 + CT]
   1127        vaesenclast CTR5, CTR5, TMP3
   1128        vpxor   TMP3, TMP5, XMMWORD PTR[6*16 + CT]
   1129        vaesenclast CTR6, CTR6, TMP3
   1130        vpxor   TMP3, TMP5, XMMWORD PTR[7*16 + CT]
   1131        vaesenclast CTR7, CTR7, TMP3
   1132 
   1133        vmovdqu XMMWORD PTR[0*16 + PT], CTR0
   1134        vmovdqu XMMWORD PTR[1*16 + PT], CTR1
   1135        vmovdqu XMMWORD PTR[2*16 + PT], CTR2
   1136        vmovdqu XMMWORD PTR[3*16 + PT], CTR3
   1137        vmovdqu XMMWORD PTR[4*16 + PT], CTR4
   1138        vmovdqu XMMWORD PTR[5*16 + PT], CTR5
   1139        vmovdqu XMMWORD PTR[6*16 + PT], CTR6
   1140        vmovdqu XMMWORD PTR[7*16 + PT], CTR7
   1141 
   1142        vpxor   T, T, TMP4
   1143 
   1144        lea CT, [8*16 + CT]
   1145        lea PT, [8*16 + PT]
   1146        jmp LDecDataOctets
   1147 
   1148 LEndDecOctets:
   1149 
   1150    sub aluCTR, 7
   1151 
   1152 LDecDataSingles:
   1153 
   1154        cmp len, 16
   1155        jb  LDecDataTail
   1156        sub len, 16
   1157 
   1158        vmovdqa TMP1, XMMWORD PTR[0*16 + rsp]
   1159        NEXTCTR 0
   1160 
   1161        vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
   1162        vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
   1163        vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
   1164        vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
   1165        vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
   1166        vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
   1167        vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
   1168        vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
   1169        vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
   1170        vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
   1171        cmp NR, 10
   1172        je  @f
   1173        vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
   1174        vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
   1175        vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
   1176        cmp NR, 12
   1177        je  @f
   1178        vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
   1179        vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
   1180        vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
   1181 @@:
   1182        vaesenclast TMP1, TMP1, TMP2
   1183 
   1184        vmovdqu TMP2, XMMWORD PTR[CT]
   1185        vpxor   TMP1, TMP1, TMP2
   1186        vmovdqu XMMWORD PTR[PT], TMP1
   1187 
   1188        lea PT, [16+PT]
   1189        lea CT, [16+CT]
   1190 
   1191        vpshufb TMP2, TMP2, BSWAPMASK
   1192        vpxor   T, T, TMP2
   1193        vmovdqu TMP0, XMMWORD PTR[Htbl]
   1194        GFMUL   T, T, TMP0, TMP1, TMP2, TMP3, TMP4
   1195 
   1196        jmp LDecDataSingles
   1197 
   1198 LDecDataTail:
   1199 
   1200    test    len, len
   1201    jz      LDecDataEnd
   1202 
   1203    vmovdqa TMP1, XMMWORD PTR[0*16 + rsp]
   1204    inc aluCTR
   1205    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
   1206    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
   1207    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
   1208    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
   1209    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
   1210    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
   1211    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
   1212    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
   1213    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
   1214    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
   1215    cmp NR, 10
   1216    je  @f
   1217    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
   1218    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
   1219    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
   1220    cmp NR, 12
   1221    je  @f
   1222    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
   1223    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
   1224    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
   1225 @@:
   1226    vaesenclast TMP1, TMP1, TMP2
   1227 ; copy as many bytes as needed
   1228    xor KS, KS
   1229 @@:
   1230        cmp len, KS
   1231        je  @f
   1232        mov al, [CT + KS]
   1233        mov [rsp + KS], al
   1234        inc KS
   1235        jmp @b
   1236 @@:
   1237        cmp KS, 16
   1238        je  @f
   1239        mov BYTE PTR[rsp + KS], 0
   1240        inc KS
   1241        jmp @b
   1242 @@:
   1243    vmovdqa TMP2, XMMWORD PTR[rsp]
   1244    vpshufb TMP2, TMP2, BSWAPMASK
   1245    vpxor   T, T, TMP2
   1246    vmovdqu TMP0, XMMWORD PTR[Htbl]
   1247    GFMUL   T, T, TMP0, TMP5, TMP2, TMP3, TMP4
   1248 
   1249 
   1250    vpxor   TMP1, TMP1, XMMWORD PTR[rsp]
   1251    vmovdqa XMMWORD PTR[rsp], TMP1
   1252    xor KS, KS
   1253 @@:
   1254        cmp len, KS
   1255        je  @f
   1256        mov al, [rsp + KS]
   1257        mov [PT + KS], al
   1258        inc KS
   1259        jmp @b
   1260 @@:
   1261 
   1262 LDecDataEnd:
   1263 
   1264    vmovdqu XMMWORD PTR[16*16 + 1*16 + Gctx], T
   1265    bswap   aluCTR
   1266    mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR
   1267 
   1268    mov rsp, rbp
   1269 
   1270    vmovdqu xmm6, XMMWORD PTR[rsp + 0*16]
   1271    vmovdqu xmm7, XMMWORD PTR[rsp + 1*16]
   1272    vmovdqu xmm8, XMMWORD PTR[rsp + 2*16]
   1273    vmovdqu xmm9, XMMWORD PTR[rsp + 3*16]
   1274    vmovdqu xmm10, XMMWORD PTR[rsp + 4*16]
   1275    vmovdqu xmm11, XMMWORD PTR[rsp + 5*16]
   1276    vmovdqu xmm12, XMMWORD PTR[rsp + 6*16]
   1277    vmovdqu xmm13, XMMWORD PTR[rsp + 7*16]
   1278    vmovdqu xmm14, XMMWORD PTR[rsp + 8*16]
   1279    vmovdqu xmm15, XMMWORD PTR[rsp + 9*16]
   1280 
   1281    add rsp, 10*16
   1282    pop rbp
   1283    pop r13
   1284    pop r12
   1285    pop r11
   1286 
   1287    vzeroupper
   1288 
   1289    ret
   1290 ret
   1291 intel_aes_gcmDEC ENDP
   1292 
   1293 
   1294 END