tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

intel-gcm-x86-masm.asm (31751B)


      1 ; LICENSE:
      2 ; This submission to NSS is to be made available under the terms of the
      3 ; Mozilla Public License, v. 2.0. You can obtain one at
      4 ; http://mozilla.org/MPL/2.0/.
      5 ;###############################################################################
      6 ; Copyright(c) 2014, Intel Corp.
      7 ; Developers and authors:
      8 ; Shay Gueron and Vlad Krasnov
      9 ; Intel Corporation, Israel Development Centre, Haifa, Israel
     10 ; Please send feedback directly to crypto.feedback.alias@intel.com
     11 
     12 
     13 .MODEL FLAT, C                  ; 32-bit flat memory model, C naming/calling convention
     14 .XMM                            ; allow XMM-register instructions
     15 
     16 .DATA
     17 ALIGN 16
        ; 128-bit constants; the code below references them as XMMWORD memory operands.
     18 Lone            dq 1,0          ; low qword = 1: added to a counter vector with vpaddd
     19 Ltwo            dq 2,0          ; low qword = 2: added to a counter vector with vpaddd
     20 Lbswap_mask     db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0   ; vpshufb mask that reverses the 16 bytes of a vector
     21 Lshuff_mask     dq 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh    ; not referenced in the visible part of this file
     22 Lpoly           dq 01h, 0c200000000000000h                  ; reduction constant used by the vpclmulqdq reduction steps
     23 
     24 .CODE
     25 
     26 
     27 GFMUL MACRO DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
        ; Carry-less multiply of SRC1 by SRC2 followed by a two-step reduction
        ; with Lpoly; the 128-bit result is written to DST.
        ;   DST        - output XMM register; written only by the last
        ;                instruction, so it may be the same register as SRC1
        ;   SRC1, SRC2 - input XMM registers (read only, apart from aliasing DST)
        ;   TMP1..TMP4 - scratch XMM registers, all clobbered
        ; Step 1: three 64x64 products (Karatsuba): low*low, high*high, and
        ; (low^high)*(low^high).
     28    vpclmulqdq  TMP1, SRC2, SRC1, 0h            ; TMP1 = low qword * low qword
     29    vpclmulqdq  TMP4, SRC2, SRC1, 011h          ; TMP4 = high qword * high qword
     30 
     31    vpshufd     TMP2, SRC2, 78                  ; 78 = 01001110b: swap the two qwords
     32    vpshufd     TMP3, SRC1, 78
     33    vpxor       TMP2, TMP2, SRC2                ; low qword = low ^ high of SRC2
     34    vpxor       TMP3, TMP3, SRC1                ; low qword = low ^ high of SRC1
     35 
     36    vpclmulqdq  TMP2, TMP2, TMP3, 0h            ; product of the folded halves
     37    vpxor       TMP2, TMP2, TMP1                ; remove low*low and high*high to
     38    vpxor       TMP2, TMP2, TMP4                ; leave the middle term
     39 
        ; Step 2: split the middle term across the low (TMP1) and high (TMP4)
        ; halves of the 256-bit product.
     40    vpslldq     TMP3, TMP2, 8
     41    vpsrldq     TMP2, TMP2, 8
     42 
     43    vpxor       TMP1, TMP1, TMP3
     44    vpxor       TMP4, TMP4, TMP2
     45 
        ; Step 3: two identical reduction passes over the low half. Each pass
        ; multiplies the low qword of TMP1 by the high qword of Lpoly and xors
        ; the product with the qword-swapped TMP1.
     46    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h
     47    vpshufd     TMP3, TMP1, 78
     48    vpxor       TMP1, TMP2, TMP3
     49 
     50    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h
     51    vpshufd     TMP3, TMP1, 78
     52    vpxor       TMP1, TMP2, TMP3
     53 
     54    vpxor       DST, TMP1, TMP4                 ; fold in the high half -> result
     55 
     56    ENDM
     57 
     58 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     59 ;
     60 ; Generates the final GCM tag
     61 ; void intel_aes_gcmTAG(unsigned char Htbl[16*16],
     62 ;                       unsigned char *Tp,
     63 ;                       unsigned int Mlen,
     64 ;                       unsigned int Alen,
     65 ;                       unsigned char* X0,
     66 ;                       unsigned char* TAG);
     67 ;
     68 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     69 
     70 ALIGN 16
     71 intel_aes_gcmTAG PROC
        ; Arguments are read from the stack (see the offsets below). The routine
        ; xors the two length words (each multiplied by 8) into the hash state
        ; read from Tp, multiplies by the first Htbl entry, byte-reverses the
        ; result, xors it with the 16 bytes at X0 and stores 16 bytes to TAG.
        ; ebx is saved and restored; eax, ecx, edx and xmm0-xmm5 are overwritten.
     72 
     73 Htbl    textequ <eax>
     74 Tp      textequ <ecx>
     75 X0      textequ <edx>
     76 TAG     textequ <ebx>
     77 
     78 T       textequ <xmm0>
     79 TMP0    textequ <xmm1>
     80 
     81    push    ebx
     82 
        ; After the push, argument k lives at [esp + 2*4 + k*4]
        ; (saved ebx + return address precede the arguments).
     83    mov     Htbl,   [esp + 2*4 + 0*4]
     84    mov     Tp,     [esp + 2*4 + 1*4]
     85    mov     X0,     [esp + 2*4 + 4*4]
     86    mov     TAG,    [esp + 2*4 + 5*4]
     87 
     88    vzeroupper
     89    vmovdqu T, XMMWORD PTR[Tp]
     90 
        ; Build the length block: Mlen in dword 0, Alen in dword 2, then shift
        ; each qword left by 3 (multiply by 8; presumably bytes -> bits).
     91    vpxor   TMP0, TMP0, TMP0
     92    vpinsrd TMP0, TMP0, DWORD PTR[esp + 2*4 + 2*4], 0
     93    vpinsrd TMP0, TMP0, DWORD PTR[esp + 2*4 + 3*4], 2
     94    vpsllq  TMP0, TMP0, 3
     95 
     96    vpxor   T, T, TMP0
     97    vmovdqu TMP0, XMMWORD PTR[Htbl]             ; first entry of the H table
     98    GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5
     99 
    100    vpshufb T, T, [Lbswap_mask]                 ; reverse byte order of the hash
    101    vpxor   T, T, [X0]
    102    vmovdqu XMMWORD PTR[TAG], T
    103    vzeroupper
    104 
    105    pop ebx
    106 
    107    ret
    108 
    109 intel_aes_gcmTAG ENDP
    110 
    111 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    112 ;
    113 ; Generates the H table
    114 ; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR);
    115 ;
    116 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    117 
    118 ALIGN 16
    119 intel_aes_gcmINIT PROC
        ; Fills the 16-entry table at Htbl:
        ;   entries 0..7   - H' and its successive GFMUL products with H'
        ;   entries 8..15  - for each of the above, (qword-swapped value) xor value,
        ;                    i.e. the folded operand used by the Karatsuba middle product
        ; H' is derived from the AES encryption of the all-zero block under the
        ; key schedule KS with NR rounds. No registers are pushed; eax, ecx, edx
        ; and xmm0-xmm5 are overwritten.
    120 
    121 Htbl    textequ <eax>
    122 KS      textequ <ecx>
    123 NR      textequ <edx>
    124 
    125 T       textequ <xmm0>
    126 TMP0    textequ <xmm1>
    127 
        ; No pushes yet: argument k lives at [esp + 4 + k*4].
    128    mov     Htbl,   [esp + 4*1 + 0*4]
    129    mov     KS,     [esp + 4*1 + 1*4]
    130    mov     NR,     [esp + 4*1 + 2*4]
    131 
    132    vzeroupper
    133    ; AES-ENC(0)
        ; The zero block xor round key 0 is just round key 0, so start from it.
    134    vmovdqu T, XMMWORD PTR[KS]
    135    lea KS, [16 + KS]
    136    dec NR
        ; NR-1 full rounds, one round key (16 bytes) per iteration.
    137 Lenc_loop:
    138        vaesenc T, T, [KS]
    139        lea KS, [16 + KS]
    140        dec NR
    141        jnz Lenc_loop
    142 
    143    vaesenclast T, T, [KS]
    144    vpshufb T, T, [Lbswap_mask]
    145 
    146    ;Calculate H` = GFMUL(H, 2)
        ; 128-bit shift left by one bit built from 32-bit lanes, with Lpoly
        ; xored in when the bit shifted out of the top is set.
    147    vpsrad  xmm3, T, 31                         ; each dword -> all-ones if its sign bit is set
    148    vpshufd xmm3, xmm3, 0ffh                    ; broadcast the top dword's mask to all lanes
    149    vpand   xmm5, xmm3, [Lpoly]                 ; Lpoly if the top bit of T was set, else 0
    150    vpsrld  xmm3, T, 31                         ; carry-out bit of every dword
    151    vpslld  xmm4, T, 1                          ; shift every dword left by one
    152    vpslldq xmm3, xmm3, 4                       ; move each carry into the next higher dword
    153    vpxor   T, xmm4, xmm3
    154    vpxor   T, T, xmm5
    155 
    156    vmovdqu TMP0, T                             ; keep H' as the constant multiplier
    157    vmovdqu XMMWORD PTR[Htbl + 0*16], T
    158 
    159    vpshufd xmm2, T, 78                         ; swap qwords
    160    vpxor   xmm2, xmm2, T                       ; folded halves of H'
    161    vmovdqu XMMWORD PTR[Htbl + 8*16 + 0*16], xmm2
    162 
        ; Assembly-time loop: expands to seven copies of the body (i = 1..7).
    163    i = 1
    164    WHILE i LT 8
    165        GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5
    166        vmovdqu XMMWORD PTR[Htbl + i*16], T
    167        vpshufd xmm2, T, 78
    168        vpxor   xmm2, xmm2, T
    169        vmovdqu XMMWORD PTR[Htbl + 8*16 + i*16], xmm2
    170        i = i+1
    171        ENDM
    172    vzeroupper
    173    ret
    174 intel_aes_gcmINIT ENDP
    175 
    176 
    177 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    178 ;
    179 ; Authenticate only
    180 ; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp);
    181 ;
    182 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    183 
    184 ALIGN 16
    185 intel_aes_gcmAAD PROC
        ; Hashes Alen bytes at AAD into the 16-byte state at Tp using the
        ; table at Htbl. Arguments are read from the stack (cdecl):
        ;   arg0 Htbl, arg1 AAD (inp), arg2 Alen (len), arg3 Tp.
        ; The code masks len with 128-1 / -128 and steps in 16-byte blocks,
        ; so len is expected to be a multiple of 16.
        ; Returns immediately, leaving the state untouched, when Alen == 0.
        ; ebx and esi are saved and restored; eax, ecx, edx and xmm0-xmm7 are
        ; overwritten.
    186 
    187 Htbl    textequ <eax>
    188 inp     textequ <ecx>
    189 len     textequ <edx>
    190 Tp      textequ <ebx>
    191 hlp0    textequ <esi>
    192 
    193 DATA    textequ <xmm0>
    194 T       textequ <xmm1>
    195 TMP0    textequ <xmm2>
    196 TMP1    textequ <xmm3>
    197 TMP2    textequ <xmm4>
    198 TMP3    textequ <xmm5>
    199 TMP4    textequ <xmm6>
    200 Xhi     textequ <xmm7>
    201 
        ; Accumulate the three Karatsuba partial products of DATA with table
        ; entry i: TMP0 ^= low*low, TMP1 ^= high*high, TMP2 ^= folded middle.
        ; Clobbers TMP3.
    202 KARATSUBA_AAD MACRO i
    203    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 0h
    204    vpxor       TMP0, TMP0, TMP3
    205    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 011h
    206    vpxor       TMP1, TMP1, TMP3
    207    vpshufd     TMP3, DATA, 78
    208    vpxor       TMP3, TMP3, DATA
    209    vpclmulqdq  TMP3, TMP3, [Htbl + 8*16 + i*16], 0h
    210    vpxor       TMP2, TMP2, TMP3
    211 ENDM
    212 
        ; Nothing to do for an empty input. On entry (no pushes yet) the
        ; return address is at [esp] and argument k at [esp + 1*4 + k*4], so
        ; Alen (argument 2) is at [esp + 1*4 + 2*4].
    213    cmp   DWORD PTR[esp + 1*4 + 2*4], 0
    214    jnz   LbeginAAD
    215    ret
    216 
    217 LbeginAAD:
    218    push    ebx
    219    push    esi
    220 
        ; Two saved registers + return address: argument k is at [esp + 4*3 + k*4].
    221    mov     Htbl,   [esp + 4*3 + 0*4]
    222    mov     inp,    [esp + 4*3 + 1*4]
    223    mov     len,    [esp + 4*3 + 2*4]
    224    mov     Tp,     [esp + 4*3 + 3*4]
    225 
    226    vzeroupper
    227 
    228    vpxor   Xhi, Xhi, Xhi
    229 
    230    vmovdqu T, XMMWORD PTR[Tp]
    231    ;we hash 8 block each iteration, if the total amount of blocks is not a multiple of 8, we hash the first n%8 blocks first
    232    mov hlp0, len
    233    and hlp0, 128-1
        ; When len is a multiple of 128, T as loaded from Tp goes straight into
        ; the main loop's reduction stages with Xhi = 0.
        ; NOTE(review): confirm this is intended for a non-zero incoming T.
    234    jz  Lmod_loop
    235 
    236    and len, -128                               ; len = bytes handled by the 8-block loop
    237    sub hlp0, 16                                ; hlp0 = table offset for the first prefix block
    238 
    239    ; Prefix block
    240    vmovdqu DATA, XMMWORD PTR[inp]
    241    vpshufb DATA, DATA, [Lbswap_mask]
    242    vpxor   DATA, DATA, T                       ; fold the running state into the first block
    243 
    244    vpclmulqdq  TMP0, DATA, XMMWORD PTR[Htbl + hlp0], 0h
    245    vpclmulqdq  TMP1, DATA, XMMWORD PTR[Htbl + hlp0], 011h
    246    vpshufd     TMP3, DATA, 78
    247    vpxor       TMP3, TMP3, DATA
    248    vpclmulqdq  TMP2, TMP3, XMMWORD PTR[Htbl + 8*16 + hlp0], 0h
    249 
    250    lea     inp, [inp+16]
    251    test    hlp0, hlp0
    252    jnz     Lpre_loop
    253    jmp     Lred1
    254 
    255    ;hash remaining prefix blocks (up to 7 total prefix blocks)
    256 Lpre_loop:
    257 
    258        sub hlp0, 16
    259 
    260        vmovdqu DATA, XMMWORD PTR[inp]
    261        vpshufb DATA, DATA, [Lbswap_mask]
    262 
    263        vpclmulqdq  TMP3, DATA, XMMWORD PTR[Htbl + hlp0], 0h
    264        vpxor       TMP0, TMP0, TMP3
    265        vpclmulqdq  TMP3, DATA, XMMWORD PTR[Htbl + hlp0], 011h
    266        vpxor       TMP1, TMP1, TMP3
    267        vpshufd     TMP3, DATA, 78
    268        vpxor       TMP3, TMP3, DATA
    269        vpclmulqdq  TMP3, TMP3, XMMWORD PTR[Htbl + 8*16 + hlp0], 0h
    270        vpxor       TMP2, TMP2, TMP3
    271 
    272        test    hlp0, hlp0
    273        lea     inp, [inp+16]                   ; lea leaves the flags from test intact
    274        jnz     Lpre_loop
    275 
    276 Lred1:
    277 
        ; Karatsuba fixup for the prefix blocks: the unreduced product is left
        ; as T (low 128 bits) and Xhi (high 128 bits).
    278    vpxor       TMP2, TMP2, TMP0
    279    vpxor       TMP2, TMP2, TMP1
    280    vpsrldq     TMP3, TMP2, 8
    281    vpslldq     TMP2, TMP2, 8
    282 
    283    vpxor       Xhi, TMP1, TMP3
    284    vpxor       T, TMP0, TMP2
    285 
        ; Main loop: 8 blocks per iteration. The reduction of the previous
        ; product (T, Xhi) is interleaved with the multiplications; the last
        ; block in memory uses table entry 0 and the first (xored with the
        ; reduced T) uses entry 7.
    286 Lmod_loop:
    287 
    288        sub len, 16*8
    289        jb  Ldone
    290        ; Block #0
    291        vmovdqu DATA, XMMWORD PTR[inp + 16*7]
    292        vpshufb DATA, DATA, XMMWORD PTR[Lbswap_mask]
    293 
    294        vpclmulqdq  TMP0, DATA, XMMWORD PTR[Htbl + 0*16], 0h
    295        vpclmulqdq  TMP1, DATA, XMMWORD PTR[Htbl + 0*16], 011h
    296        vpshufd     TMP3, DATA, 78
    297        vpxor       TMP3, TMP3, DATA
    298        vpclmulqdq  TMP2, TMP3, XMMWORD PTR[Htbl + 8*16 + 0*16], 0h
    299 
    300        ; Block #1
    301        vmovdqu DATA, XMMWORD PTR[inp + 16*6]
    302        vpshufb DATA, DATA, [Lbswap_mask]
    303        KARATSUBA_AAD 1
    304 
    305        ; Block #2
    306        vmovdqu DATA, XMMWORD PTR[inp + 16*5]
    307        vpshufb DATA, DATA, [Lbswap_mask]
    308 
    309        vpclmulqdq  TMP4, T, [Lpoly], 010h         ;reduction stage 1a
    310        vpalignr    T, T, T, 8
    311 
    312        KARATSUBA_AAD 2
    313 
    314        vpxor       T, T, TMP4                          ;reduction stage 1b
    315 
    316        ; Block #3
    317        vmovdqu DATA, XMMWORD PTR[inp + 16*4]
    318        vpshufb DATA, DATA, [Lbswap_mask]
    319        KARATSUBA_AAD 3
    320        ; Block #4
    321        vmovdqu DATA, XMMWORD PTR[inp + 16*3]
    322        vpshufb DATA, DATA, [Lbswap_mask]
    323 
    324        vpclmulqdq  TMP4, T, [Lpoly], 010h        ;reduction stage 2a
    325        vpalignr    T, T, T, 8
    326 
    327        KARATSUBA_AAD 4
    328 
    329        vpxor       T, T, TMP4                          ;reduction stage 2b
    330        ; Block #5
    331        vmovdqu DATA, XMMWORD PTR[inp + 16*2]
    332        vpshufb DATA, DATA, [Lbswap_mask]
    333        KARATSUBA_AAD 5
    334 
    335        vpxor   T, T, Xhi                               ;reduction finalize
    336        ; Block #6
    337        vmovdqu DATA, XMMWORD PTR[inp + 16*1]
    338        vpshufb DATA, DATA, [Lbswap_mask]
    339        KARATSUBA_AAD 6
    340        ; Block #7
    341        vmovdqu DATA, XMMWORD PTR[inp + 16*0]
    342        vpshufb DATA, DATA, [Lbswap_mask]
    343        vpxor   DATA, DATA, T
    344        KARATSUBA_AAD 7
    345        ; Aggregated 8 blocks, now karatsuba fixup
    346        vpxor   TMP2, TMP2, TMP0
    347        vpxor   TMP2, TMP2, TMP1
    348        vpsrldq TMP3, TMP2, 8
    349        vpslldq TMP2, TMP2, 8
    350 
    351        vpxor   Xhi, TMP1, TMP3
    352        vpxor   T, TMP0, TMP2
    353 
    354        lea inp, [inp + 16*8]
    355        jmp Lmod_loop
    356 
        ; Final reduction of the pending product (T, Xhi), then store the state.
    357 Ldone:
    358    vpclmulqdq  TMP4, T, [Lpoly], 010h
    359    vpalignr    T, T, T, 8
    360    vpxor       T, T, TMP4
    361 
    362    vpclmulqdq  TMP4, T, [Lpoly], 010h
    363    vpalignr    T, T, T, 8
    364    vpxor       T, T, TMP4
    365 
    366    vpxor       T, T, Xhi
    367    vmovdqu     XMMWORD PTR[Tp], T
    368    vzeroupper
    369 
    370    pop esi
    371    pop ebx
    372    ret
    373 
    374 intel_aes_gcmAAD ENDP
    375 
    376 
    377 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    378 ;
    379 ; Encrypt and Authenticate
    380 ; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsigned int len);
    381 ;
    382 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    383 
    384 ALIGN 16
    385 intel_aes_gcmENC PROC
        ; Arguments are read from the stack: arg0 PT, arg1 CT, arg2 Gctx, arg3 len.
        ; Fields of Gctx as accessed below:
        ;   [Gctx + 0 ...]               H table (Htbl and Gctx are both edx)
        ;   [Gctx + 16*16 + 1*16]        running hash state T (16 bytes)
        ;   [Gctx + 16*16 + 2*16]        counter block (16 bytes; dword 3 is the counter)
        ;   [Gctx + 16*16 + 3*16]        pointer to the key schedule KS
        ;   [KS + 244]                   number of rounds NR
        ; Stack frame (16*16 bytes, 16-byte aligned):
        ;   [esp + 0*16]                 scratch block for the partial tail
        ;   [esp + 1*16 .. 6*16]         byte-reversed ciphertext blocks waiting to be hashed
        ;   [esp + 8*16 .. 8*16 + 6*16]  seven counter blocks already xored with round key 0
        ; Returns immediately when len == 0. ebp, ebx, esi, edi are saved and restored.
    386 
    387 PT      textequ <eax>
    388 CT      textequ <ecx>
    389 Htbl    textequ <edx>
    390 Gctx    textequ <edx>
    391 len     textequ <DWORD PTR[ebp + 5*4 + 3*4]>
    392 KS      textequ <esi>
    393 NR      textequ <DWORD PTR[244+KS]>
    394 
    395 aluCTR  textequ <ebx>
    396 aluTMP  textequ <edi>
    397 
    398 T       textequ <XMMWORD PTR[16*16 + 1*16 + Gctx]>
        ; TMP0..TMP5 name the same registers as CTR1..CTR6 (xmm1..xmm6).
    399 TMP0    textequ <xmm1>
    400 TMP1    textequ <xmm2>
    401 TMP2    textequ <xmm3>
    402 TMP3    textequ <xmm4>
    403 TMP4    textequ <xmm5>
    404 TMP5    textequ <xmm6>
    405 
    406 CTR0    textequ <xmm0>
    407 CTR1    textequ <xmm1>
    408 CTR2    textequ <xmm2>
    409 CTR3    textequ <xmm3>
    410 CTR4    textequ <xmm4>
    411 CTR5    textequ <xmm5>
    412 CTR6    textequ <xmm6>
    413 
        ; One AES round with round key i applied to all seven counter blocks.
        ; Clobbers xmm7.
    414 ROUND MACRO i
    415    vmovdqu xmm7, XMMWORD PTR[i*16 + KS]
    416    vaesenc CTR0, CTR0, xmm7
    417    vaesenc CTR1, CTR1, xmm7
    418    vaesenc CTR2, CTR2, xmm7
    419    vaesenc CTR3, CTR3, xmm7
    420    vaesenc CTR4, CTR4, xmm7
    421    vaesenc CTR5, CTR5, xmm7
    422    vaesenc CTR6, CTR6, xmm7
    423 ENDM
    424 
        ; Accumulate the partial products of TMP5 with table entry i:
        ; TMP0 ^= folded middle, TMP1 ^= high*high, TMP2 ^= low*low.
        ; Clobbers TMP3 and TMP4.
    425 KARATSUBA MACRO i
    426    vpshufd TMP4, TMP5, 78
    427    vpxor   TMP4, TMP4, TMP5
    428    vpclmulqdq  TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
    429    vpxor       TMP0, TMP0, TMP3
    430    vmovdqu     TMP4, XMMWORD PTR[i*16 + Htbl]
    431    vpclmulqdq  TMP3, TMP5, TMP4, 011h
    432    vpxor       TMP1, TMP1, TMP3
    433    vpclmulqdq  TMP3, TMP5, TMP4, 000h
    434    vpxor       TMP2, TMP2, TMP3
    435 ENDM
    436 
        ; Advance the integer counter and write it, byte-swapped and xored with
        ; dword 3 of round key 0, into dword 3 of counter slot i on the stack.
    437 NEXTCTR MACRO i
    438    add     aluCTR, 1
    439    mov     aluTMP, aluCTR
    440    bswap   aluTMP
    441    xor     aluTMP, [3*4 + KS]
    442    mov     [3*4 + 8*16 + i*16 + esp], aluTMP
    443 ENDM
    444 
        ; Early exit when len (argument 3; nothing pushed yet) is zero.
    445    cmp DWORD PTR[1*4 + 3*4 + esp], 0
    446    jne LbeginENC
    447    ret
    448 
    449 LbeginENC:
    450 
    451    vzeroupper
    452    push    ebp
    453    push    ebx
    454    push    esi
    455    push    edi
    456 
        ; ebp keeps the unaligned stack pointer: argument k is at [ebp + 5*4 + k*4]
        ; (four saved registers + return address).
    457    mov ebp, esp
    458    sub esp, 16*16
    459    and esp, -16                                ; 16-byte align so vmovdqa on [esp + ...] is valid
    460 
    461    mov PT, [ebp + 5*4 + 0*4]
    462    mov CT, [ebp + 5*4 + 1*4]
    463    mov Gctx, [ebp + 5*4 + 2*4]
    464 
    465    mov     KS, [16*16 + 3*16 + Gctx]
    466 
    467    mov     aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
    468    bswap   aluCTR                              ; counter as a native integer
    469 
    470 
        ; Counter slot 0 = counter block xor round key 0.
    471    vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
    472    vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    473    vmovdqu XMMWORD PTR[8*16 + 0*16 + esp], TMP0
    474 
    475    cmp len, 16*7
    476    jb  LEncDataSingles
    477 ; Prepare the "top" counters
    478    vmovdqu XMMWORD PTR[8*16 + 1*16 + esp], TMP0
    479    vmovdqu XMMWORD PTR[8*16 + 2*16 + esp], TMP0
    480    vmovdqu XMMWORD PTR[8*16 + 3*16 + esp], TMP0
    481    vmovdqu XMMWORD PTR[8*16 + 4*16 + esp], TMP0
    482    vmovdqu XMMWORD PTR[8*16 + 5*16 + esp], TMP0
    483    vmovdqu XMMWORD PTR[8*16 + 6*16 + esp], TMP0
    484 
    485    vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    486    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
    487 ; Encrypt the initial 7 blocks
    488    sub len, 16*7
        ; CTR1..CTR6 = CTR0 + 1 .. CTR0 + 6 (on the byte-reversed counter).
    489    vpaddd  CTR1, CTR0, XMMWORD PTR[Lone]
    490    vpaddd  CTR2, CTR0, XMMWORD PTR[Ltwo]
    491    vpaddd  CTR3, CTR2, XMMWORD PTR[Lone]
    492    vpaddd  CTR4, CTR2, XMMWORD PTR[Ltwo]
    493    vpaddd  CTR5, CTR4, XMMWORD PTR[Lone]
    494    vpaddd  CTR6, CTR4, XMMWORD PTR[Ltwo]
    495 
    496    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
    497    vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
    498    vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
    499    vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
    500    vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
    501    vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
    502    vpshufb CTR6, CTR6, XMMWORD PTR[Lbswap_mask]
    503 
    504    vmovdqu xmm7, XMMWORD PTR[0*16 + KS]
    505    vpxor   CTR0, CTR0, xmm7
    506    vpxor   CTR1, CTR1, xmm7
    507    vpxor   CTR2, CTR2, xmm7
    508    vpxor   CTR3, CTR3, xmm7
    509    vpxor   CTR4, CTR4, xmm7
    510    vpxor   CTR5, CTR5, xmm7
    511    vpxor   CTR6, CTR6, xmm7
    512 
    513    ROUND   1
    514 
        ; While the AES rounds run, prepare the next seven counters in the
        ; stack slots: slot 0 gets counter + 7 here, NEXTCTR fills slots 1..6.
    515    add aluCTR, 7
    516    mov aluTMP, aluCTR
    517    bswap   aluTMP
    518    xor aluTMP, [KS + 3*4]
    519    mov [8*16 + 0*16 + 3*4 + esp], aluTMP
    520 
    521    ROUND   2
    522    NEXTCTR 1
    523    ROUND   3
    524    NEXTCTR 2
    525    ROUND   4
    526    NEXTCTR 3
    527    ROUND   5
    528    NEXTCTR 4
    529    ROUND   6
    530    NEXTCTR 5
    531    ROUND   7
    532    NEXTCTR 6
    533    ROUND   8
    534    ROUND   9
        ; xmm7 = last round key; which one depends on NR (10, 12, otherwise 14).
    535    vmovdqu xmm7, XMMWORD PTR[10*16 + KS]
    536    cmp     NR, 10
    537    je      @f
    538 
    539    ROUND   10
    540    ROUND   11
    541    vmovdqu xmm7, XMMWORD PTR[12*16 + KS]
    542    cmp     NR, 12
    543    je      @f
    544 
    545    ROUND   12
    546    ROUND   13
    547    vmovdqu xmm7, XMMWORD PTR[14*16 + KS]
    548 @@:
    549    vaesenclast CTR0, CTR0, xmm7
    550    vaesenclast CTR1, CTR1, xmm7
    551    vaesenclast CTR2, CTR2, xmm7
    552    vaesenclast CTR3, CTR3, xmm7
    553    vaesenclast CTR4, CTR4, xmm7
    554    vaesenclast CTR5, CTR5, xmm7
    555    vaesenclast CTR6, CTR6, xmm7
    556 
        ; Keystream xor plaintext -> ciphertext.
    557    vpxor   CTR0, CTR0, XMMWORD PTR[0*16 + PT]
    558    vpxor   CTR1, CTR1, XMMWORD PTR[1*16 + PT]
    559    vpxor   CTR2, CTR2, XMMWORD PTR[2*16 + PT]
    560    vpxor   CTR3, CTR3, XMMWORD PTR[3*16 + PT]
    561    vpxor   CTR4, CTR4, XMMWORD PTR[4*16 + PT]
    562    vpxor   CTR5, CTR5, XMMWORD PTR[5*16 + PT]
    563    vpxor   CTR6, CTR6, XMMWORD PTR[6*16 + PT]
    564 
    565    vmovdqu XMMWORD PTR[0*16 + CT], CTR0
    566    vmovdqu XMMWORD PTR[1*16 + CT], CTR1
    567    vmovdqu XMMWORD PTR[2*16 + CT], CTR2
    568    vmovdqu XMMWORD PTR[3*16 + CT], CTR3
    569    vmovdqu XMMWORD PTR[4*16 + CT], CTR4
    570    vmovdqu XMMWORD PTR[5*16 + CT], CTR5
    571    vmovdqu XMMWORD PTR[6*16 + CT], CTR6
    572 
        ; Byte-reverse the ciphertext for hashing. The last block stays in TMP5
        ; (same register as CTR6); the other six are parked on the stack in
        ; reverse order (slot 1 = block 5 ... slot 6 = block 0).
    573    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
    574    vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
    575    vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
    576    vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
    577    vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
    578    vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
    579    vpshufb TMP5, CTR6, XMMWORD PTR[Lbswap_mask]
    580 
    581    vmovdqa XMMWORD PTR[1*16 + esp], CTR5
    582    vmovdqa XMMWORD PTR[2*16 + esp], CTR4
    583    vmovdqa XMMWORD PTR[3*16 + esp], CTR3
    584    vmovdqa XMMWORD PTR[4*16 + esp], CTR2
    585    vmovdqa XMMWORD PTR[5*16 + esp], CTR1
    586    vmovdqa XMMWORD PTR[6*16 + esp], CTR0
    587 
    588    lea CT, [7*16 + CT]
    589    lea PT, [7*16 + PT]
    590    jmp LEncData7
    591 
        ; Steady state: hash the previous seven ciphertext blocks, then encrypt
        ; the next seven, while at least 7 blocks remain.
    592 LEncData7:
    593        cmp len, 16*7
    594        jb  LEndEnc7
    595        sub len, 16*7
    596 
        ; Most recent block (TMP5) times table entry 0 starts the accumulators.
    597        vpshufd TMP4, TMP5, 78
    598        vpxor   TMP4, TMP4, TMP5
    599        vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
    600        vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
    601        vpclmulqdq  TMP1, TMP5, TMP4, 011h
    602        vpclmulqdq  TMP2, TMP5, TMP4, 000h
    603 
    604        vmovdqu TMP5, XMMWORD PTR[1*16 + esp]
    605        KARATSUBA 1
    606        vmovdqu TMP5, XMMWORD PTR[2*16 + esp]
    607        KARATSUBA 2
    608        vmovdqu TMP5, XMMWORD PTR[3*16 + esp]
    609        KARATSUBA 3
    610        vmovdqu TMP5, XMMWORD PTR[4*16 + esp]
    611        KARATSUBA 4
    612        vmovdqu TMP5, XMMWORD PTR[5*16 + esp]
    613        KARATSUBA 5
    614        vmovdqu TMP5, XMMWORD PTR[6*16 + esp]
    615        vpxor   TMP5, TMP5, T                   ; oldest block absorbs the running hash
    616        KARATSUBA 6
    617 
        ; Karatsuba fixup: TMP4 = high 128 bits, TMP5 = low 128 bits.
    618        vpxor   TMP0, TMP0, TMP1
    619        vpxor   TMP0, TMP0, TMP2
    620        vpsrldq TMP3, TMP0, 8
    621        vpxor   TMP4, TMP1, TMP3
    622        vpslldq TMP3, TMP0, 8
    623        vpxor   TMP5, TMP2, TMP3
    624 
        ; Two reduction passes with Lpoly, then fold in the high half.
    625        vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    626        vpalignr    TMP5,TMP5,TMP5,8
    627        vpxor       TMP5, TMP5, TMP1
    628 
    629        vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    630        vpalignr    TMP5,TMP5,TMP5,8
    631        vpxor       TMP5, TMP5, TMP1
    632 
    633        vpxor       TMP5, TMP5, TMP4
    634        vmovdqu     T, TMP5                     ; store updated hash into Gctx
    635 
        ; Load the seven prepared counter blocks (round key 0 already applied).
    636        vmovdqa CTR0, XMMWORD PTR[8*16 + 0*16 + esp]
    637        vmovdqa CTR1, XMMWORD PTR[8*16 + 1*16 + esp]
    638        vmovdqa CTR2, XMMWORD PTR[8*16 + 2*16 + esp]
    639        vmovdqa CTR3, XMMWORD PTR[8*16 + 3*16 + esp]
    640        vmovdqa CTR4, XMMWORD PTR[8*16 + 4*16 + esp]
    641        vmovdqa CTR5, XMMWORD PTR[8*16 + 5*16 + esp]
    642        vmovdqa CTR6, XMMWORD PTR[8*16 + 6*16 + esp]
    643 
    644        ROUND 1
    645        NEXTCTR 0
    646        ROUND 2
    647        NEXTCTR 1
    648        ROUND 3
    649        NEXTCTR 2
    650        ROUND 4
    651        NEXTCTR 3
    652        ROUND 5
    653        NEXTCTR 4
    654        ROUND 6
    655        NEXTCTR 5
    656        ROUND 7
    657        NEXTCTR 6
    658 
    659        ROUND 8
    660        ROUND 9
    661 
    662        vmovdqu     xmm7, XMMWORD PTR[10*16 + KS]
    663        cmp         NR, 10
    664        je          @f
    665 
    666        ROUND 10
    667        ROUND 11
    668        vmovdqu     xmm7, XMMWORD PTR[12*16 + KS]
    669        cmp         NR, 12
    670        je          @f
    671 
    672        ROUND 12
    673        ROUND 13
    674        vmovdqu     xmm7, XMMWORD PTR[14*16 + KS]
    675 @@:
    676        vaesenclast CTR0, CTR0, xmm7
    677        vaesenclast CTR1, CTR1, xmm7
    678        vaesenclast CTR2, CTR2, xmm7
    679        vaesenclast CTR3, CTR3, xmm7
    680        vaesenclast CTR4, CTR4, xmm7
    681        vaesenclast CTR5, CTR5, xmm7
    682        vaesenclast CTR6, CTR6, xmm7
    683 
    684        vpxor   CTR0, CTR0, XMMWORD PTR[0*16 + PT]
    685        vpxor   CTR1, CTR1, XMMWORD PTR[1*16 + PT]
    686        vpxor   CTR2, CTR2, XMMWORD PTR[2*16 + PT]
    687        vpxor   CTR3, CTR3, XMMWORD PTR[3*16 + PT]
    688        vpxor   CTR4, CTR4, XMMWORD PTR[4*16 + PT]
    689        vpxor   CTR5, CTR5, XMMWORD PTR[5*16 + PT]
    690        vpxor   CTR6, CTR6, XMMWORD PTR[6*16 + PT]
    691 
    692        vmovdqu XMMWORD PTR[0*16 + CT], CTR0
    693        vmovdqu XMMWORD PTR[1*16 + CT], CTR1
    694        vmovdqu XMMWORD PTR[2*16 + CT], CTR2
    695        vmovdqu XMMWORD PTR[3*16 + CT], CTR3
    696        vmovdqu XMMWORD PTR[4*16 + CT], CTR4
    697        vmovdqu XMMWORD PTR[5*16 + CT], CTR5
    698        vmovdqu XMMWORD PTR[6*16 + CT], CTR6
    699 
    700        vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
    701        vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
    702        vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
    703        vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
    704        vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
    705        vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
    706        vpshufb TMP5, CTR6, XMMWORD PTR[Lbswap_mask]
    707 
    708        vmovdqa XMMWORD PTR[1*16 + esp], CTR5
    709        vmovdqa XMMWORD PTR[2*16 + esp], CTR4
    710        vmovdqa XMMWORD PTR[3*16 + esp], CTR3
    711        vmovdqa XMMWORD PTR[4*16 + esp], CTR2
    712        vmovdqa XMMWORD PTR[5*16 + esp], CTR1
    713        vmovdqa XMMWORD PTR[6*16 + esp], CTR0
    714 
    715        lea CT, [7*16 + CT]
    716        lea PT, [7*16 + PT]
    717        jmp LEncData7
    718 
        ; Fewer than 7 blocks remain: hash the last seven ciphertext blocks
        ; produced above (same sequence as at the top of the loop).
    719 LEndEnc7:
    720 
    721    vpshufd TMP4, TMP5, 78
    722    vpxor   TMP4, TMP4, TMP5
    723    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
    724    vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
    725    vpclmulqdq  TMP1, TMP5, TMP4, 011h
    726    vpclmulqdq  TMP2, TMP5, TMP4, 000h
    727 
    728    vmovdqu TMP5, XMMWORD PTR[1*16 + esp]
    729    KARATSUBA 1
    730    vmovdqu TMP5, XMMWORD PTR[2*16 + esp]
    731    KARATSUBA 2
    732    vmovdqu TMP5, XMMWORD PTR[3*16 + esp]
    733    KARATSUBA 3
    734    vmovdqu TMP5, XMMWORD PTR[4*16 + esp]
    735    KARATSUBA 4
    736    vmovdqu TMP5, XMMWORD PTR[5*16 + esp]
    737    KARATSUBA 5
    738    vmovdqu TMP5, XMMWORD PTR[6*16 + esp]
    739    vpxor   TMP5, TMP5, T
    740    KARATSUBA 6
    741 
    742    vpxor   TMP0, TMP0, TMP1
    743    vpxor   TMP0, TMP0, TMP2
    744    vpsrldq TMP3, TMP0, 8
    745    vpxor   TMP4, TMP1, TMP3
    746    vpslldq TMP3, TMP0, 8
    747    vpxor   TMP5, TMP2, TMP3
    748 
    749    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    750    vpalignr    TMP5,TMP5,TMP5,8
    751    vpxor       TMP5, TMP5, TMP1
    752 
    753    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    754    vpalignr    TMP5,TMP5,TMP5,8
    755    vpxor       TMP5, TMP5, TMP1
    756 
    757    vpxor       TMP5, TMP5, TMP4
    758    vmovdqu     T, TMP5
    759 
        ; aluCTR currently matches counter slot 6; step back so it matches
        ; slot 0, which is the slot the single-block path consumes.
    760    sub aluCTR, 6
    761 
        ; One block at a time: encrypt with counter slot 0, then hash the
        ; ciphertext block with a full GFMUL by table entry 0.
    762 LEncDataSingles:
    763 
    764        cmp len, 16
    765        jb  LEncDataTail
    766        sub len, 16
    767 
    768        vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + esp]
    769        NEXTCTR 0                               ; refresh slot 0 for the next block
    770 
    771        vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    772        vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    773        vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    774        vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    775        vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    776        vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    777        vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    778        vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    779        vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    780        vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
    781        cmp NR, 10
    782        je  @f
    783        vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    784        vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    785        vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
    786        cmp NR, 12
    787        je  @f
    788        vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    789        vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    790        vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
    791 @@:
    792        vaesenclast TMP1, TMP1, TMP2
    793        vpxor   TMP1, TMP1, XMMWORD PTR[PT]
    794        vmovdqu XMMWORD PTR[CT], TMP1
    795 
    796        lea PT, [16+PT]
    797        lea CT, [16+CT]
    798 
    799        vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
    800        vpxor   TMP1, TMP1, T
    801 
    802        vmovdqu TMP0, XMMWORD PTR[Htbl]
    803        GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
    804        vmovdqu T, TMP1
    805 
    806        jmp LEncDataSingles
    807 
        ; Partial final block (0 < len < 16): generate one keystream block,
        ; process len bytes through a stack scratch buffer, and hash the
        ; ciphertext padded with zero bytes to 16.
    808 LEncDataTail:
    809 
    810    cmp len, 0
    811    je  LEncDataEnd
    812 
    813    vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + esp]
    814 
    815    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    816    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    817    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    818    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    819    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    820    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    821    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    822    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    823    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    824    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
    825    cmp NR, 10
    826    je  @f
    827    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    828    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    829    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
    830    cmp NR, 12
    831    je  @f
    832    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    833    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    834    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
    835 @@:
    836    vaesenclast TMP1, TMP1, TMP2
    837 ; zero a temp location
    838    vpxor   TMP2, TMP2, TMP2
    839    vmovdqa XMMWORD PTR[esp], TMP2
    840 ; copy as many bytes as needed
        ; From here KS (esi) is reused as a byte index, so the key-schedule
        ; pointer is no longer available. edx (Gctx/Htbl) is parked in aluTMP
        ; because dl serves as the byte temporary.
    841    xor KS, KS
    842    mov aluTMP, edx
    843 @@:
    844        cmp len, KS
    845        je  @f
    846        mov dl, BYTE PTR[PT + KS]
    847        mov BYTE PTR[esp + KS], dl
    848        inc KS
    849        jmp @b
    850 @@:
        ; Scratch = keystream xor (plaintext padded with zeros).
    851    vpxor   TMP1, TMP1, XMMWORD PTR[esp]
    852    vmovdqa XMMWORD PTR[esp], TMP1
    853    xor KS, KS
        ; Write only the first len bytes to CT ...
    854 @@:
    855        cmp len, KS
    856        je  @f
    857        mov dl, BYTE PTR[esp + KS]
    858        mov BYTE PTR[CT + KS], dl
    859        inc KS
    860        jmp @b
        ; ... then clear scratch bytes len..15 so leftover keystream bytes are
        ; not hashed.
    861 @@:
    862        cmp KS, 16
    863        je  @f
    864        mov BYTE PTR[esp + KS], 0
    865        inc KS
    866        jmp @b
    867 @@:
    868    mov edx, aluTMP                             ; restore Gctx/Htbl
    869    vmovdqa TMP1, XMMWORD PTR[esp]
    870    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
    871    vpxor   TMP1, TMP1, T
    872 
    873    vmovdqu TMP0, XMMWORD PTR[Htbl]
    874    GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
    875    vmovdqu T, TMP1
    876 
        ; Write the counter back to Gctx and tear down the frame.
        ; NOTE(review): this inc also runs when no tail block was processed
        ; (len reached 0 in LEncDataSingles, where NEXTCTR already advanced
        ; aluCTR) - confirm that skipping a counter value there is intended.
    877 LEncDataEnd:
    878    inc     aluCTR
    879    bswap   aluCTR
    880    mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR
    881 
    882    mov esp, ebp
    883    pop edi
    884    pop esi
    885    pop ebx
    886    pop ebp
    887 
    888 
    889    vzeroupper
    890 
    891    ret
    892 intel_aes_gcmENC ENDP
    893 
    894 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    895 ;
    896 ; Decrypt and Authenticate
    897 ; void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx, unsigned int len);
    898 ;
    899 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    900 
    901 
    902 NEXTCTR MACRO i
        ; Redefinition of NEXTCTR for the decrypt routine: same counter update
        ; as the earlier definition, but counter slot i is at [i*16 + esp]
        ; (no 8*16 offset). Uses the aluCTR / aluTMP / KS aliases defined
        ; earlier in the file; clobbers aluTMP and the flags.
    903    add     aluCTR, 1
    904    mov     aluTMP, aluCTR
    905    bswap   aluTMP
    906    xor     aluTMP, [3*4 + KS]                  ; apply dword 3 of round key 0
    907    mov     [3*4 + i*16 + esp], aluTMP          ; dword 3 of counter slot i
    908 ENDM
    909 
    910 intel_aes_gcmDEC PROC
        ; Decrypts len bytes in counter mode and folds the ciphertext into the
        ; running GHASH value T.
        ;
        ; Stack arguments (file is assembled with .MODEL FLAT, C), in the
        ; order this code reads them:
        ;   arg0 -> CT    input buffer; only read here
        ;   arg1 -> PT    output buffer; only written here
        ;   arg2 -> Gctx  context; fields touched here:
        ;                   16*16 + 2*16  16-byte counter block (its last dword
        ;                                 is the counter that gets incremented)
        ;                   16*16 + 3*16  dword loaded into KS and used as the
        ;                                 base address of the round keys
        ;   arg3 -> len   byte count; zero returns immediately
        ;
        ; CT, PT, Gctx, KS, len, NR, T, Htbl, aluCTR, aluTMP, TMP0-TMP5 and
        ; CTR0-CTR6 are aliases defined outside this excerpt, as are the
        ; KARATSUBA and ROUND macros.
        ; NOTE(review): len is compared and decremented but never loaded into a
        ; register in this procedure, so the alias presumably names the stack
        ; argument itself (ebp-relative) - confirm at its definition.
        ;
        ; Preserves ebp, ebx, esi, edi. Uses a 16-byte aligned scratch area of
        ; 8*16 bytes below the saved registers; slots 0..6 hold counter blocks
        ; already XORed with the first round key.
        ; Invariant at LDecDataSingles / LDecDataTail: slot 0 holds the block
        ; for counter value aluCTR, i.e. the next block not yet consumed.
        ; NOTE(review): the scratch area is not cleared before esp is restored.
    911 
           ; At entry esp points at the return address, so the arguments start
           ; at esp + 1*4; this tests the fourth one (len).
    912    cmp DWORD PTR[1*4 + 3*4 + esp], 0
    913    jne LbeginDEC
    914    ret
    915 
    916 LbeginDEC:
    917 
    918    vzeroupper
    919    push    ebp
    920    push    ebx
    921    push    esi
    922    push    edi
    923 
           ; ebp keeps the unaligned stack pointer for the epilogue and for
           ; argument access; esp is moved down and rounded to a multiple of 16
           ; so the vmovdqa accesses to [esp] below are legal.
    924    mov ebp, esp
    925    sub esp, 8*16
    926    and esp, -16
    927 
           ; Four pushes plus the return address sit between ebp and arg0,
           ; hence the 5*4 bias.
    928    mov CT, [ebp + 5*4 + 0*4]
    929    mov PT, [ebp + 5*4 + 1*4]
    930    mov Gctx, [ebp + 5*4 + 2*4]
    931 
    932    mov     KS, [16*16 + 3*16 + Gctx]
    933 
           ; Last dword of the counter block, byte-swapped so that it can be
           ; incremented with plain integer arithmetic.
    934    mov     aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
    935    bswap   aluCTR
    936 
    937 
           ; Slot 0 = counter block XOR round key 0 (the initial AddRoundKey is
           ; applied once here; NEXTCTR later patches only the last dword).
    938    vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
    939    vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    940    vmovdqu XMMWORD PTR[0*16 + esp], TMP0
    941 
           ; Fewer than seven full blocks: skip the 7-way path entirely.
    942    cmp len, 16*7
    943    jb  LDecDataSingles
           ; Replicate the whitened counter block into slots 1..6.
    944    vmovdqu XMMWORD PTR[1*16 + esp], TMP0
    945    vmovdqu XMMWORD PTR[2*16 + esp], TMP0
    946    vmovdqu XMMWORD PTR[3*16 + esp], TMP0
    947    vmovdqu XMMWORD PTR[4*16 + esp], TMP0
    948    vmovdqu XMMWORD PTR[5*16 + esp], TMP0
    949    vmovdqu XMMWORD PTR[6*16 + esp], TMP0
           ; Pre-decrement so that the first NEXTCTR 0 in the loop regenerates
           ; the current counter value for slot 0.
    950    dec aluCTR
    951 
           ; ---- Main loop: seven blocks (7*16 bytes) per iteration ----------
    952 LDecData7:
    953    cmp len, 16*7
    954    jb  LDecData7End
    955    sub len, 16*7
    956 
           ; GHASH over the seven ciphertext blocks, computed before they are
           ; decrypted. Block 0 is byte-reversed, XORed with the running hash T
           ; and multiplied by the Htbl entry at 6*16:
           ;   TMP0 = middle term ((hi xor lo) times the entry at 6*16 + 8*16)
           ;   TMP1 = high * high product, TMP2 = low * low product
           ; (vpshufd with 78 swaps the two 64-bit halves.)
    957    vmovdqu TMP5, XMMWORD PTR[0*16 + CT]
    958    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    959    vpxor   TMP5, TMP5, T
    960    vpshufd TMP4, TMP5, 78
    961    vpxor   TMP4, TMP4, TMP5
    962    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[6*16 + 8*16 + Htbl], 000h
    963    vmovdqu     TMP4, XMMWORD PTR[6*16 + Htbl]
    964    vpclmulqdq  TMP1, TMP5, TMP4, 011h
    965    vpclmulqdq  TMP2, TMP5, TMP4, 000h
    966 
           ; Blocks 1..6: each is byte-reversed into TMP5 and handed to
           ; KARATSUBA (defined outside this excerpt; presumably accumulates the
           ; product with Htbl entry n into TMP0/TMP1/TMP2 - confirm).
           ; NEXTCTR n is interleaved to fill counter slot n for this batch.
    967    NEXTCTR 0
    968    vmovdqu TMP5, XMMWORD PTR[1*16 + CT]
    969    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    970    KARATSUBA 5
    971    NEXTCTR 1
    972    vmovdqu TMP5, XMMWORD PTR[2*16 + CT]
    973    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    974    KARATSUBA 4
    975    NEXTCTR 2
    976    vmovdqu TMP5, XMMWORD PTR[3*16 + CT]
    977    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    978    KARATSUBA 3
    979    NEXTCTR 3
    980    vmovdqu TMP5, XMMWORD PTR[4*16 + CT]
    981    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    982    KARATSUBA 2
    983    NEXTCTR 4
    984    vmovdqu TMP5, XMMWORD PTR[5*16 + CT]
    985    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    986    KARATSUBA 1
    987    NEXTCTR 5
    988    vmovdqu TMP5, XMMWORD PTR[6*16 + CT]
    989    vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
    990    KARATSUBA 0
    991    NEXTCTR 6
    992 
           ; Karatsuba recombination: fold high and low products into the
           ; middle term, then split it across the two halves of the 256-bit
           ; result. TMP4 = upper 128 bits, TMP5 = lower 128 bits.
    993    vpxor   TMP0, TMP0, TMP1
    994    vpxor   TMP0, TMP0, TMP2
    995    vpsrldq TMP3, TMP0, 8
    996    vpxor   TMP4, TMP1, TMP3
    997    vpslldq TMP3, TMP0, 8
    998    vpxor   TMP5, TMP2, TMP3
    999 
           ; Reduction of the lower half in two identical steps: multiply by
           ; the Lpoly constant, swap the 64-bit halves (vpalignr by 8 of a
           ; register with itself), XOR.
   1000    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
   1001    vpalignr    TMP5,TMP5,TMP5,8
   1002    vpxor       TMP5, TMP5, TMP1
   1003 
   1004    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
   1005    vpalignr    TMP5,TMP5,TMP5,8
   1006    vpxor       TMP5, TMP5, TMP1
   1007 
           ; Add the upper half and publish the new running hash.
   1008    vpxor       TMP5, TMP5, TMP4
   1009    vmovdqu     T, TMP5
   1010 
           ; Load the seven prepared counter blocks (aligned scratch area).
   1011    vmovdqa CTR0, XMMWORD PTR[0*16 + esp]
   1012    vmovdqa CTR1, XMMWORD PTR[1*16 + esp]
   1013    vmovdqa CTR2, XMMWORD PTR[2*16 + esp]
   1014    vmovdqa CTR3, XMMWORD PTR[3*16 + esp]
   1015    vmovdqa CTR4, XMMWORD PTR[4*16 + esp]
   1016    vmovdqa CTR5, XMMWORD PTR[5*16 + esp]
   1017    vmovdqa CTR6, XMMWORD PTR[6*16 + esp]
   1018 
           ; ROUND n is defined outside this excerpt; presumably one AES round
           ; with round key n applied to CTR0..CTR6 - confirm.
           ; xmm7 receives the final round key: key 10 when NR is 10, key 12
           ; when NR is 12, otherwise key 14.
   1019    ROUND   1
   1020    ROUND   2
   1021    ROUND   3
   1022    ROUND   4
   1023    ROUND   5
   1024    ROUND   6
   1025    ROUND   7
   1026    ROUND   8
   1027    ROUND   9
   1028    vmovdqu xmm7, XMMWORD PTR[10*16 + KS]
   1029    cmp     NR, 10
   1030    je      @f
   1031 
   1032    ROUND   10
   1033    ROUND   11
   1034    vmovdqu xmm7, XMMWORD PTR[12*16 + KS]
   1035    cmp     NR, 12
   1036    je      @f
   1037 
   1038    ROUND   12
   1039    ROUND   13
   1040    vmovdqu xmm7, XMMWORD PTR[14*16 + KS]
   1041 @@:
   1042    vaesenclast CTR0, CTR0, xmm7
   1043    vaesenclast CTR1, CTR1, xmm7
   1044    vaesenclast CTR2, CTR2, xmm7
   1045    vaesenclast CTR3, CTR3, xmm7
   1046    vaesenclast CTR4, CTR4, xmm7
   1047    vaesenclast CTR5, CTR5, xmm7
   1048    vaesenclast CTR6, CTR6, xmm7
   1049 
           ; Keystream XOR input gives the output blocks.
   1050    vpxor   CTR0, CTR0, XMMWORD PTR[0*16 + CT]
   1051    vpxor   CTR1, CTR1, XMMWORD PTR[1*16 + CT]
   1052    vpxor   CTR2, CTR2, XMMWORD PTR[2*16 + CT]
   1053    vpxor   CTR3, CTR3, XMMWORD PTR[3*16 + CT]
   1054    vpxor   CTR4, CTR4, XMMWORD PTR[4*16 + CT]
   1055    vpxor   CTR5, CTR5, XMMWORD PTR[5*16 + CT]
   1056    vpxor   CTR6, CTR6, XMMWORD PTR[6*16 + CT]
   1057 
   1058    vmovdqu XMMWORD PTR[0*16 + PT], CTR0
   1059    vmovdqu XMMWORD PTR[1*16 + PT], CTR1
   1060    vmovdqu XMMWORD PTR[2*16 + PT], CTR2
   1061    vmovdqu XMMWORD PTR[3*16 + PT], CTR3
   1062    vmovdqu XMMWORD PTR[4*16 + PT], CTR4
   1063    vmovdqu XMMWORD PTR[5*16 + PT], CTR5
   1064    vmovdqu XMMWORD PTR[6*16 + PT], CTR6
   1065 
   1066    lea CT, [7*16 + CT]
   1067    lea PT, [7*16 + PT]
   1068    jmp LDecData7
   1069 
   1070 LDecData7End:
   1071 
           ; aluCTR is the last counter consumed by the 7-way loop; step it and
           ; refresh slot 0 so the invariant for the code below holds.
   1072    NEXTCTR 0
   1073 
           ; ---- One full 16-byte block per iteration ------------------------
   1074 LDecDataSingles:
   1075 
   1076        cmp len, 16
   1077        jb  LDecDataTail
   1078        sub len, 16
   1079 
               ; GHASH: T = (byte-reversed block XOR T) times the first Htbl entry.
   1080        vmovdqu TMP1, XMMWORD PTR[CT]
   1081        vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
   1082        vpxor   TMP1, TMP1, T
   1083 
   1084        vmovdqu TMP0, XMMWORD PTR[Htbl]
   1085        GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
   1086        vmovdqu T, TMP1
   1087 
               ; Take the prepared counter block, then immediately prepare the
               ; next one in slot 0.
   1088        vmovdqa TMP1, XMMWORD PTR[0*16 + esp]
   1089        NEXTCTR 0
   1090 
               ; Rounds 1..9 always; two or four more depending on NR. TMP2 ends
               ; up holding the final round key (10, 12 or 14).
   1091        vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
   1092        vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
   1093        vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
   1094        vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
   1095        vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
   1096        vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
   1097        vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
   1098        vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
   1099        vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
   1100        vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
   1101        cmp NR, 10
   1102        je  @f
   1103        vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
   1104        vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
   1105        vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
   1106        cmp NR, 12
   1107        je  @f
   1108        vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
   1109        vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
   1110        vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
   1111 @@:
   1112        vaesenclast TMP1, TMP1, TMP2
   1113        vpxor   TMP1, TMP1, XMMWORD PTR[CT]
   1114        vmovdqu XMMWORD PTR[PT], TMP1
   1115 
   1116        lea PT, [16+PT]
   1117        lea CT, [16+CT]
   1118        jmp LDecDataSingles
   1119 
           ; ---- Final partial block (1..15 bytes), if any --------------------
   1120 LDecDataTail:
   1121 
   1122    cmp len, 0
   1123    je  LDecDataEnd
   1124 
           ; Consume the block in slot 0. The counter is stepped with a plain
           ; inc because no further block follows, so the slot is not refreshed.
   1125    vmovdqa TMP1, XMMWORD PTR[0*16 + esp]
   1126    inc aluCTR
   1127    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
   1128    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
   1129    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
   1130    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
   1131    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
   1132    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
   1133    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
   1134    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
   1135    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
   1136    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
   1137    cmp NR, 10
   1138    je  @f
   1139    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
   1140    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
   1141    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
   1142    cmp NR, 12
   1143    je  @f
   1144    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
   1145    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
   1146    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
   1147 @@:
           ; xmm7 = keystream block for the tail.
   1148    vaesenclast xmm7, TMP1, TMP2
   1149 
   1150 ; copy as many bytes as needed
           ; The round keys are no longer needed, so KS is reused as a byte
           ; index. dl is the byte temporary, therefore edx is parked in aluTMP
           ; and restored after the loops (presumably edx backs Gctx/Htbl, which
           ; T and Htbl reference right after the restore - confirm).
   1151    xor KS, KS
   1152    mov aluTMP, edx
   1153 @@:
               ; Copy len input bytes into scratch slot 0 ...
   1154        cmp len, KS
   1155        je  @f
   1156        mov dl, BYTE PTR[CT + KS]
   1157        mov BYTE PTR[esp + KS], dl
   1158        inc KS
   1159        jmp @b
   1160 @@:
               ; ... and zero-fill the rest of the 16-byte slot.
   1161        cmp KS, 16
   1162        je  @f
   1163        mov BYTE PTR[esp + KS], 0
   1164        inc KS
   1165        jmp @b
   1166 @@:
   1167    mov edx, aluTMP
           ; GHASH the zero-padded input block.
   1168    vmovdqa TMP1, XMMWORD PTR[esp]
   1169    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
   1170    vpxor   TMP1, TMP1, T
   1171 
   1172    vmovdqu TMP0, XMMWORD PTR[Htbl]
   1173    GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
   1174    vmovdqu T, TMP1
   1175 
           ; Decrypt inside the scratch slot, then copy only len bytes out so
           ; nothing is written past the end of the output buffer.
   1176    vpxor   xmm7, xmm7, XMMWORD PTR[esp]
   1177    vmovdqa XMMWORD PTR[esp], xmm7
   1178    xor     KS, KS
   1179    mov aluTMP, edx
   1180 @@:
   1181        cmp len, KS
   1182        je  @f
   1183        mov dl, BYTE PTR[esp + KS]
   1184        mov BYTE PTR[PT + KS], dl
   1185        inc KS
   1186        jmp @b
   1187 @@:
   1188    mov edx, aluTMP
   1189 
   1190 LDecDataEnd:
   1191 
           ; Write the next unused counter value back into the context, in the
           ; same byte order it was read in.
   1192    bswap   aluCTR
   1193    mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR
   1194 
           ; Drop the aligned scratch area and restore the saved registers.
   1195    mov esp, ebp
   1196    pop edi
   1197    pop esi
   1198    pop ebx
   1199    pop ebp
   1200 
   1201    vzeroupper
   1202 
   1203    ret
   1204 intel_aes_gcmDEC ENDP
   1205 
   1206 
   1207 END