tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

intel-gcm.s (32871B)


      1 # LICENSE:                                                                  
      2 # This submission to NSS is to be made available under the terms of the
      3 # Mozilla Public License, v. 2.0. You can obtain one at http:         
      4 # //mozilla.org/MPL/2.0/. 
      5 ################################################################################
      6 # Copyright(c) 2012, Intel Corp.
      7 
      8 .section .rodata
      9 .align  16
     10 .Lone:
     11 .quad 1,0
     12 .Ltwo:
     13 .quad 2,0
     14 .Lbswap_mask:
     15 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
     16 .Lshuff_mask:
     17 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
     18 .Lpoly:
     19 .quad 0x1, 0xc200000000000000 
     20 
     21 .section .text
     22 
     23 ################################################################################
     24 # Generates the final GCM tag
     25 # void intel_aes_gcmTAG(uint8_t Htbl[16*16], uint8_t *Tp, uint64_t Mlen, uint64_t Alen, uint8_t* X0, uint8_t* TAG);
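        # Finishes GCM: the bit lengths of the message (Mlen) and the AAD (Alen) are
        # folded into the running hash loaded from Tp, one more GFMUL with the first
        # Htbl entry is performed, and the result is XORed with X0 (the encrypted
        # initial counter block) to give the 16-byte tag, i.e. TAG = GHASH ^ E_K(CTR0)
        # as in the GCM construction.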
     26 .type intel_aes_gcmTAG,@function
     27 .globl intel_aes_gcmTAG
     28 .align 16
     29 intel_aes_gcmTAG:
     30 
     31 .set  Htbl, %rdi
     32 .set  Tp, %rsi
     33 .set  Mlen, %rdx
     34 .set  Alen, %rcx
     35 .set  X0, %r8
     36 .set  TAG, %r9
     37 
     38 .set T,%xmm0
     39 .set TMP0,%xmm1
     40 
     41   vmovdqu  (Tp), T
     42   vpshufb  .Lbswap_mask(%rip), T, T
     43   vpxor    TMP0, TMP0, TMP0
     44   shl      $3, Mlen
     45   shl      $3, Alen
     46   vpinsrq  $0, Mlen, TMP0, TMP0
     47   vpinsrq  $1, Alen, TMP0, TMP0
     48   vpxor    TMP0, T, T
     49   vmovdqu  (Htbl), TMP0
     50   call     GFMUL
     51   vpshufb  .Lbswap_mask(%rip), T, T
     52   vpxor    (X0), T, T
     53   vmovdqu  T, (TAG)
     54   
     55 ret
     56 .size intel_aes_gcmTAG, .-intel_aes_gcmTAG
     57 ################################################################################
     58 # Generates the H table
     59 # void intel_aes_gcmINIT(uint8_t Htbl[16*16], uint8_t *KS, int NR);
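        # Computes the hash key H = AES_K(0^128): loading round key 0 into T is the
        # initial AddRoundKey of the all-zero block, and the vaesenc chain completes
        # the 10/12/14 rounds. The 256-byte table then receives H^1..H^8 (each
        # premultiplied by x, the "* 2" in the comments below) at offsets 0..112, and
        # the XOR of the two 64-bit halves of each power at offsets 128..240 for the
        # Karatsuba middle products.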
     60 .type intel_aes_gcmINIT,@function
     61 .globl intel_aes_gcmINIT
     62 .align 16
     63 intel_aes_gcmINIT:
     64   
     65 .set  Htbl, %rdi
     66 .set  KS, %rsi
     67 .set  NR, %edx
     68 
     69 .set T,%xmm0
     70 .set TMP0,%xmm1
     71 
     72 CALCULATE_POWERS_OF_H:
     73    vmovdqu      16*0(KS), T
     74    vaesenc      16*1(KS), T, T
     75    vaesenc      16*2(KS), T, T
     76    vaesenc      16*3(KS), T, T
     77    vaesenc      16*4(KS), T, T
     78    vaesenc      16*5(KS), T, T
     79    vaesenc      16*6(KS), T, T
     80    vaesenc      16*7(KS), T, T
     81    vaesenc      16*8(KS), T, T
     82    vaesenc      16*9(KS), T, T
     83    vmovdqu      16*10(KS), TMP0
     84    cmp          $10, NR
     85    je           .LH0done
     86    vaesenc      16*10(KS), T, T
     87    vaesenc      16*11(KS), T, T
     88    vmovdqu      16*12(KS), TMP0
     89    cmp          $12, NR
     90    je           .LH0done
     91    vaesenc      16*12(KS), T, T
     92    vaesenc      16*13(KS), T, T
     93    vmovdqu      16*14(KS), TMP0
     94  
     95 .LH0done:
     96    vaesenclast  TMP0, T, T
     97 
     98    vpshufb      .Lbswap_mask(%rip), T, T  
     99 
    100    vmovdqu	T, TMP0
    101    # Calculate H' = GFMUL(H, 2)
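        # Doubling in GF(2^128): shift the 128-bit value left by one bit and, if the
        # bit shifted out was set, XOR in the reduction constant .Lpoly; the
        # shuffle/mask sequence below selects the constant without a branch.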
    102    vpsrld	$7 , T , %xmm3
    103    vmovdqu	.Lshuff_mask(%rip), %xmm4
    104    vpshufb	%xmm4, %xmm3 , %xmm3
    105    movq	$0xff00 , %rax
    106    vmovq	%rax, %xmm4
    107    vpshufb	%xmm3, %xmm4 , %xmm4
    108    vmovdqu	.Lpoly(%rip), %xmm5
    109    vpand	%xmm4, %xmm5, %xmm5
    110    vpsrld	$31, T, %xmm3
    111    vpslld	$1, T, %xmm4
    112    vpslldq	$4, %xmm3, %xmm3
    113    vpxor	%xmm3, %xmm4, T  #xmm1 holds now p(x)<<1
    114 
    115    #adding p(x)<<1 to xmm5
    116    vpxor     %xmm5, T , T
    117    vmovdqu   T, TMP0
    118    vmovdqu   T, (Htbl)     # H * 2
    119    call  GFMUL
    120    vmovdqu  T, 16(Htbl)    # H^2 * 2
    121    call  GFMUL
    122    vmovdqu  T, 32(Htbl)    # H^3 * 2
    123    call  GFMUL
    124    vmovdqu  T, 48(Htbl)    # H^4 * 2
    125    call  GFMUL
    126    vmovdqu  T, 64(Htbl)    # H^5 * 2
    127    call  GFMUL
    128    vmovdqu  T, 80(Htbl)    # H^6 * 2
    129    call  GFMUL
    130    vmovdqu  T, 96(Htbl)    # H^7 * 2
    131    call  GFMUL
    132    vmovdqu  T, 112(Htbl)   # H^8 * 2  
    133 
    134    # Precalculations for the reduce 4 step
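        # For each stored power H^i, vpshufd $78 swaps the 64-bit halves and the XOR
        # with the original leaves (hi ^ lo) in both halves; storing these at
        # 128(Htbl) onward makes the Karatsuba middle product a single pclmulqdq per
        # block later on.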
    135    vpshufd  $78, (Htbl), %xmm8
    136    vpshufd  $78, 16(Htbl), %xmm9
    137    vpshufd  $78, 32(Htbl), %xmm10
    138    vpshufd  $78, 48(Htbl), %xmm11
    139    vpshufd  $78, 64(Htbl), %xmm12
    140    vpshufd  $78, 80(Htbl), %xmm13
    141    vpshufd  $78, 96(Htbl), %xmm14
    142    vpshufd  $78, 112(Htbl), %xmm15
    143 
    144    vpxor  (Htbl), %xmm8, %xmm8
    145    vpxor  16(Htbl), %xmm9, %xmm9
    146    vpxor  32(Htbl), %xmm10, %xmm10
    147    vpxor  48(Htbl), %xmm11, %xmm11
    148    vpxor  64(Htbl), %xmm12, %xmm12
    149    vpxor  80(Htbl), %xmm13, %xmm13
    150    vpxor  96(Htbl), %xmm14, %xmm14
    151    vpxor  112(Htbl), %xmm15, %xmm15
    152 
    153    vmovdqu   %xmm8, 128(Htbl)
    154    vmovdqu   %xmm9, 144(Htbl)
    155    vmovdqu   %xmm10, 160(Htbl)
    156    vmovdqu   %xmm11, 176(Htbl)
    157    vmovdqu   %xmm12, 192(Htbl)
    158    vmovdqu   %xmm13, 208(Htbl)
    159    vmovdqu   %xmm14, 224(Htbl)
    160    vmovdqu   %xmm15, 240(Htbl)
    161 
    162    ret
    163 .size intel_aes_gcmINIT, .-intel_aes_gcmINIT
    164 ################################################################################
    165 # Authenticate only
    166 # void intel_aes_gcmAAD(uint8_t Htbl[16*16], uint8_t *AAD, uint64_t Alen, uint8_t *Tp);
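        # Folds Alen bytes of AAD into the GHASH state at Tp, consuming whole 16-byte
        # blocks. A prefix of len mod 128 bytes is hashed first, indexed through hlp0
        # so each block meets the matching power of H; the main loop then hashes
        # eight blocks per iteration with aggregated reduction against H^8..H^1.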
    167 
    168 .globl  intel_aes_gcmAAD
    169 .type   intel_aes_gcmAAD,@function
    170 .align  16
    171 intel_aes_gcmAAD:
    172 
    173 .set DATA, %xmm0
    174 .set T, %xmm1
    175 .set BSWAP_MASK, %xmm2
    176 .set TMP0, %xmm3
    177 .set TMP1, %xmm4
    178 .set TMP2, %xmm5
    179 .set TMP3, %xmm6
    180 .set TMP4, %xmm7
    181 .set Xhi, %xmm9
    182 
    183 .set Htbl, %rdi
    184 .set inp, %rsi
    185 .set len, %rdx
    186 .set Tp, %rcx
    187 
    188 .set hlp0, %r11
    189 
    190 .macro KARATSUBA_AAD i
    191    vpclmulqdq  $0x00, 16*\i(Htbl), DATA, TMP3
    192    vpxor       TMP3, TMP0, TMP0
    193    vpclmulqdq  $0x11, 16*\i(Htbl), DATA, TMP3
    194    vpxor       TMP3, TMP1, TMP1
    195    vpshufd     $78,  DATA, TMP3
    196    vpxor       DATA, TMP3, TMP3
    197    vpclmulqdq  $0x00, 16*(\i+8)(Htbl), TMP3, TMP3
    198    vpxor       TMP3, TMP2, TMP2
    199 .endm
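        # KARATSUBA_AAD i: multiply the byte-reflected block in DATA by the power of H
        # at 16*i(Htbl) and accumulate the Karatsuba partial products: low into TMP0,
        # high into TMP1, and the middle term (formed with the precomputed XORed
        # halves at 16*(i+8)(Htbl)) into TMP2.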
    200 
    201    test  len, len
    202    jnz   .LbeginAAD
    203    ret
    204 
    205 .LbeginAAD:
    206 
    207   push  hlp0
    208   vzeroupper
    209   
    210   vmovdqa  .Lbswap_mask(%rip), BSWAP_MASK
    211   
    212   vpxor    Xhi, Xhi, Xhi
    213   
    214   vmovdqu  (Tp),T
    215   vpshufb  BSWAP_MASK,T,T
    216 
    217   # we hash 8 blocks each iteration; if the total number of blocks is not a multiple of 8, we hash the first n%8 blocks (len mod 128 bytes) first
    218    mov     len, hlp0
    219    and	    $~-128, hlp0
    220 
    221    jz      .Lmod_loop
    222 
    223    sub     hlp0, len
    224    sub     $16, hlp0
    225 
    226   #hash first prefix block
    227 vmovdqu (inp), DATA
    228 vpshufb  BSWAP_MASK, DATA, DATA
    229 vpxor    T, DATA, DATA
    230 
    231 vpclmulqdq  $0x00, (Htbl, hlp0), DATA, TMP0
    232 vpclmulqdq  $0x11, (Htbl, hlp0), DATA, TMP1
    233 vpshufd     $78, DATA, TMP2
    234 vpxor       DATA, TMP2, TMP2
    235 vpclmulqdq  $0x00, 16*8(Htbl, hlp0), TMP2, TMP2
    236 
    237 lea	    16(inp), inp
    238 test    hlp0, hlp0
    239 jnz	    .Lpre_loop
    240 jmp	    .Lred1
    241 
    242    #hash remaining prefix blocks (up to 7 total prefix blocks)
    243 .align 64
    244 .Lpre_loop:
    245 
    246    sub	$16, hlp0
    247 
    248    vmovdqu     (inp),DATA           # next data block
    249    vpshufb     BSWAP_MASK,DATA,DATA
    250 
    251    vpclmulqdq  $0x00, (Htbl,hlp0), DATA, TMP3
    252    vpxor       TMP3, TMP0, TMP0
    253    vpclmulqdq  $0x11, (Htbl,hlp0), DATA, TMP3
    254    vpxor       TMP3, TMP1, TMP1
    255    vpshufd	    $78, DATA, TMP3
    256    vpxor       DATA, TMP3, TMP3
    257    vpclmulqdq  $0x00, 16*8(Htbl,hlp0), TMP3, TMP3
    258    vpxor       TMP3, TMP2, TMP2
    259 
    260    test	hlp0, hlp0
    261 
    262    lea	16(inp), inp
    263 
    264    jnz	.Lpre_loop
    265 
    266 .Lred1:
    267    vpxor       TMP0, TMP2, TMP2
    268    vpxor       TMP1, TMP2, TMP2
    269    vpsrldq     $8, TMP2, TMP3
    270    vpslldq     $8, TMP2, TMP2
    271 
    272    vpxor       TMP3, TMP1, Xhi
    273    vpxor       TMP2, TMP0, T
    274 
    275 .align 64
    276 .Lmod_loop:
    277    sub	$0x80, len
    278    jb	.Ldone
    279 
    280    vmovdqu     16*7(inp),DATA		# Ii
    281    vpshufb     BSWAP_MASK,DATA,DATA
    282 
    283    vpclmulqdq  $0x00, (Htbl), DATA, TMP0
    284    vpclmulqdq  $0x11, (Htbl), DATA, TMP1
    285    vpshufd     $78, DATA, TMP2
    286    vpxor       DATA, TMP2, TMP2
    287    vpclmulqdq  $0x00, 16*8(Htbl), TMP2, TMP2
    288    #########################################################
    289    vmovdqu     16*6(inp),DATA
    290    vpshufb     BSWAP_MASK,DATA,DATA
    291    KARATSUBA_AAD 1
    292    #########################################################
    293    vmovdqu     16*5(inp),DATA
    294    vpshufb     BSWAP_MASK,DATA,DATA
    295 
    296    vpclmulqdq  $0x10, .Lpoly(%rip), T, TMP4         #reduction stage 1a
    297    vpalignr    $8, T, T, T
    298 
    299    KARATSUBA_AAD 2
    300 
    301    vpxor       TMP4, T, T                 #reduction stage 1b
    302    #########################################################
    303    vmovdqu		16*4(inp),DATA
    304    vpshufb	    BSWAP_MASK,DATA,DATA
    305 
    306    KARATSUBA_AAD 3
    307    #########################################################
    308    vmovdqu     16*3(inp),DATA
    309    vpshufb     BSWAP_MASK,DATA,DATA
    310 
    311    vpclmulqdq  $0x10, .Lpoly(%rip), T, TMP4         #reduction stage 2a
    312    vpalignr    $8, T, T, T
    313 
    314    KARATSUBA_AAD 4
    315 
    316    vpxor       TMP4, T, T                 #reduction stage 2b
    317    #########################################################
    318    vmovdqu     16*2(inp),DATA
    319    vpshufb     BSWAP_MASK,DATA,DATA
    320 
    321    KARATSUBA_AAD 5
    322 
    323    vpxor       Xhi, T, T                  #reduction finalize
    324    #########################################################
    325    vmovdqu     16*1(inp),DATA
    326    vpshufb     BSWAP_MASK,DATA,DATA
    327 
    328    KARATSUBA_AAD 6
    329    #########################################################
    330    vmovdqu     16*0(inp),DATA
    331    vpshufb     BSWAP_MASK,DATA,DATA
    332    vpxor       T,DATA,DATA
    333 
    334    KARATSUBA_AAD 7
    335    #########################################################
    336    vpxor       TMP0, TMP2, TMP2              # karatsuba fixup
    337    vpxor       TMP1, TMP2, TMP2
    338    vpsrldq     $8, TMP2, TMP3
    339    vpslldq     $8, TMP2, TMP2
    340 
    341    vpxor       TMP3, TMP1, Xhi
    342    vpxor       TMP2, TMP0, T
    343 
    344    lea	16*8(inp), inp
    345    jmp .Lmod_loop
    346    #########################################################
    347 
    348 .Ldone:
    349    vpclmulqdq  $0x10, .Lpoly(%rip), T, TMP3
    350    vpalignr    $8, T, T, T
    351    vpxor       TMP3, T, T
    352 
    353    vpclmulqdq  $0x10, .Lpoly(%rip), T, TMP3
    354    vpalignr    $8, T, T, T
    355    vpxor       TMP3, T, T
    356 
    357    vpxor       Xhi, T, T
    358   
    359 .Lsave:
    360    vpshufb     BSWAP_MASK,T, T
    361    vmovdqu     T,(Tp)
    362    vzeroupper
    363 
    364    pop hlp0
    365    ret
    366 .size   intel_aes_gcmAAD,.-intel_aes_gcmAAD
    367 
    368 ################################################################################
    369 # Encrypt and Authenticate
    370 # void intel_aes_gcmENC(uint8_t* PT, uint8_t* CT, void *Gctx,uint64_t len);
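        # System V arguments: PT = %rdi (plaintext in), CT = %rsi (ciphertext out),
        # Gctx = %rdx, len = %rcx. The GHASH state is read from 272(Gctx), the counter
        # block from 288(Gctx) and a pointer to the expanded AES key schedule from
        # 304(Gctx); the round count is taken from offset 244 of that schedule
        # (AESContext->Nr, see below). Flow: one unrolled 8-block batch to prime the
        # pipeline, the 8-block encrypt-and-hash loop, single whole blocks, and
        # finally an optional zero-padded partial block.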
    371 .type intel_aes_gcmENC,@function
    372 .globl intel_aes_gcmENC
    373 .align 16
    374 intel_aes_gcmENC:
    375 
    376 .set PT,%rdi
    377 .set CT,%rsi
    378 .set Htbl, %rdx
    379 .set len, %rcx
    380 .set KS,%r9
    381 .set NR,%r10d
    382 
    383 .set Gctx, %rdx
    384 
    385 .set T,%xmm0
    386 .set TMP0,%xmm1
    387 .set TMP1,%xmm2
    388 .set TMP2,%xmm3
    389 .set TMP3,%xmm4
    390 .set TMP4,%xmm5
    391 .set TMP5,%xmm6
    392 .set CTR0,%xmm7
    393 .set CTR1,%xmm8
    394 .set CTR2,%xmm9
    395 .set CTR3,%xmm10
    396 .set CTR4,%xmm11
    397 .set CTR5,%xmm12
    398 .set CTR6,%xmm13
    399 .set CTR7,%xmm14
    400 .set CTR,%xmm15
    401 
    402 .macro ROUND i
    403    vmovdqu \i*16(KS), TMP3
    404    vaesenc TMP3, CTR0, CTR0
    405    vaesenc TMP3, CTR1, CTR1
    406    vaesenc TMP3, CTR2, CTR2
    407    vaesenc TMP3, CTR3, CTR3
    408    vaesenc TMP3, CTR4, CTR4
    409    vaesenc TMP3, CTR5, CTR5
    410    vaesenc TMP3, CTR6, CTR6
    411    vaesenc TMP3, CTR7, CTR7
    412 .endm
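        # ROUND i: apply AES round key i to all eight counter blocks.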
    413 
    414 .macro ROUNDMUL i
    415 
    416    vmovdqu \i*16(%rsp), TMP5
    417    vmovdqu \i*16(KS), TMP3
    418 
    419    vaesenc TMP3, CTR0, CTR0
    420    vaesenc TMP3, CTR1, CTR1
    421    vaesenc TMP3, CTR2, CTR2
    422    vaesenc TMP3, CTR3, CTR3
    423 
    424    vpshufd $78, TMP5, TMP4
    425    vpxor   TMP5, TMP4, TMP4
    426 
    427    vaesenc TMP3, CTR4, CTR4
    428    vaesenc TMP3, CTR5, CTR5
    429    vaesenc TMP3, CTR6, CTR6
    430    vaesenc TMP3, CTR7, CTR7
    431 
    432    vpclmulqdq  $0x00, 128+\i*16(Htbl), TMP4, TMP3
    433    vpxor       TMP3, TMP0, TMP0
    434    vmovdqa     \i*16(Htbl), TMP4
    435    vpclmulqdq  $0x11, TMP4, TMP5, TMP3
    436    vpxor       TMP3, TMP1, TMP1
    437    vpclmulqdq  $0x00, TMP4, TMP5, TMP3
    438    vpxor       TMP3, TMP2, TMP2
    439  
    440 .endm
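        # ROUNDMUL i: apply AES round key i to the eight counter blocks while hashing
        # the previous-batch ciphertext block saved at i*16(%rsp); that block is
        # multiplied by the power of H at i*16(Htbl), accumulating the middle
        # Karatsuba product in TMP0, the high product in TMP1 and the low product in
        # TMP2.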
    441 
    442 .macro KARATSUBA i
    443    vmovdqu \i*16(%rsp), TMP5
    444 
    445    vpclmulqdq  $0x11, 16*\i(Htbl), TMP5, TMP3
    446    vpxor       TMP3, TMP1, TMP1
    447    vpclmulqdq  $0x00, 16*\i(Htbl), TMP5, TMP3
    448    vpxor       TMP3, TMP2, TMP2
    449    vpshufd     $78, TMP5, TMP3
    450    vpxor       TMP5, TMP3, TMP5
    451    vpclmulqdq  $0x00, 128+\i*16(Htbl), TMP5, TMP3
    452    vpxor       TMP3, TMP0, TMP0
    453 .endm
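        # KARATSUBA i: GHASH-only variant of ROUNDMUL, used once the last batch has
        # been encrypted; it hashes the stack-saved block i with the same accumulators
        # (TMP0 middle, TMP1 high, TMP2 low) but issues no AES rounds.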
    454 
    455    test len, len
    456    jnz  .Lbegin
    457    ret
    458   
    459 .Lbegin:
    460 
    461    vzeroupper
    462    push %rbp
    463    movq %rsp, %rbp
    464 
    465    sub  $128, %rsp
    466    andq $-16, %rsp
    467 
    468    vmovdqu  288(Gctx), CTR
    469    vmovdqu  272(Gctx), T
    470    mov  304(Gctx), KS
    471 # AESContext->Nr
    472    mov  244(KS), NR
    473 
    474    vpshufb  .Lbswap_mask(%rip), CTR, CTR
    475    vpshufb  .Lbswap_mask(%rip), T, T
    476 
    477    cmp  $128, len
    478    jb   .LDataSingles
    479   
    480 # Encrypt the first eight blocks
    481    sub     $128, len
    482    vmovdqa CTR, CTR0
    483    vpaddd  .Lone(%rip), CTR0, CTR1
    484    vpaddd  .Ltwo(%rip), CTR0, CTR2
    485    vpaddd  .Lone(%rip), CTR2, CTR3
    486    vpaddd  .Ltwo(%rip), CTR2, CTR4
    487    vpaddd  .Lone(%rip), CTR4, CTR5
    488    vpaddd  .Ltwo(%rip), CTR4, CTR6
    489    vpaddd  .Lone(%rip), CTR6, CTR7
    490    vpaddd  .Ltwo(%rip), CTR6, CTR
    491 
    492    vpshufb .Lbswap_mask(%rip), CTR0, CTR0
    493    vpshufb .Lbswap_mask(%rip), CTR1, CTR1
    494    vpshufb .Lbswap_mask(%rip), CTR2, CTR2
    495    vpshufb .Lbswap_mask(%rip), CTR3, CTR3
    496    vpshufb .Lbswap_mask(%rip), CTR4, CTR4
    497    vpshufb .Lbswap_mask(%rip), CTR5, CTR5
    498    vpshufb .Lbswap_mask(%rip), CTR6, CTR6
    499    vpshufb .Lbswap_mask(%rip), CTR7, CTR7
    500 
    501    vpxor   (KS), CTR0, CTR0
    502    vpxor   (KS), CTR1, CTR1
    503    vpxor   (KS), CTR2, CTR2
    504    vpxor   (KS), CTR3, CTR3
    505    vpxor   (KS), CTR4, CTR4
    506    vpxor   (KS), CTR5, CTR5
    507    vpxor   (KS), CTR6, CTR6
    508    vpxor   (KS), CTR7, CTR7
    509 
    510    ROUND 1
    511    ROUND 2
    512    ROUND 3
    513    ROUND 4
    514    ROUND 5
    515    ROUND 6
    516    ROUND 7
    517    ROUND 8
    518    ROUND 9
    519 
    520    vmovdqu 160(KS), TMP5
    521    cmp $12, NR
    522    jb  .LLast1
    523 
    524    ROUND 10
    525    ROUND 11
    526 
    527    vmovdqu 192(KS), TMP5
    528    cmp $14, NR
    529    jb  .LLast1
    530 
    531    ROUND 12
    532    ROUND 13
    533 
    534    vmovdqu 224(KS), TMP5
    535  
    536 .LLast1:
    537 
    538    vpxor       (PT), TMP5, TMP3
    539    vaesenclast TMP3, CTR0, CTR0
    540    vpxor       16(PT), TMP5, TMP3
    541    vaesenclast TMP3, CTR1, CTR1
    542    vpxor       32(PT), TMP5, TMP3
    543    vaesenclast TMP3, CTR2, CTR2
    544    vpxor       48(PT), TMP5, TMP3
    545    vaesenclast TMP3, CTR3, CTR3
    546    vpxor       64(PT), TMP5, TMP3
    547    vaesenclast TMP3, CTR4, CTR4
    548    vpxor       80(PT), TMP5, TMP3
    549    vaesenclast TMP3, CTR5, CTR5
    550    vpxor       96(PT), TMP5, TMP3
    551    vaesenclast TMP3, CTR6, CTR6
    552    vpxor       112(PT), TMP5, TMP3
    553    vaesenclast TMP3, CTR7, CTR7
    554    
    555    vmovdqu     .Lbswap_mask(%rip), TMP3
    556   
    557    vmovdqu CTR0, (CT)
    558    vpshufb TMP3, CTR0, CTR0
    559    vmovdqu CTR1, 16(CT)
    560    vpshufb TMP3, CTR1, CTR1
    561    vmovdqu CTR2, 32(CT)
    562    vpshufb TMP3, CTR2, CTR2
    563    vmovdqu CTR3, 48(CT)
    564    vpshufb TMP3, CTR3, CTR3
    565    vmovdqu CTR4, 64(CT)
    566    vpshufb TMP3, CTR4, CTR4
    567    vmovdqu CTR5, 80(CT)
    568    vpshufb TMP3, CTR5, CTR5
    569    vmovdqu CTR6, 96(CT)
    570    vpshufb TMP3, CTR6, CTR6
    571    vmovdqu CTR7, 112(CT)
    572    vpshufb TMP3, CTR7, CTR7
    573 
    574    lea 128(CT), CT
    575    lea 128(PT), PT
    576    jmp .LDataOctets
    577 
    578 # Encrypt 8 blocks each time while hashing previous 8 blocks
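        # Each iteration spills the previous batch's byte-reflected ciphertext from
        # CTR0..CTR7 to the stack, starts the next eight counter blocks through the
        # AES rounds, and folds the spilled blocks into the GHASH state against
        # H^8..H^1 (aggregated reduction); the two polynomial folds of the reduction
        # are interleaved with the later AES rounds to hide pclmulqdq latency.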
    579 .align 64
    580 .LDataOctets:
    581        cmp $128, len
    582        jb  .LEndOctets
    583        sub $128, len
    584 
    585        vmovdqa CTR7, TMP5
    586        vmovdqa CTR6, 1*16(%rsp)
    587        vmovdqa CTR5, 2*16(%rsp)
    588        vmovdqa CTR4, 3*16(%rsp)
    589        vmovdqa CTR3, 4*16(%rsp)
    590        vmovdqa CTR2, 5*16(%rsp)
    591        vmovdqa CTR1, 6*16(%rsp)
    592        vmovdqa CTR0, 7*16(%rsp)
    593 
    594        vmovdqa CTR, CTR0
    595        vpaddd  .Lone(%rip), CTR0, CTR1
    596        vpaddd  .Ltwo(%rip), CTR0, CTR2
    597        vpaddd  .Lone(%rip), CTR2, CTR3
    598        vpaddd  .Ltwo(%rip), CTR2, CTR4
    599        vpaddd  .Lone(%rip), CTR4, CTR5
    600        vpaddd  .Ltwo(%rip), CTR4, CTR6
    601        vpaddd  .Lone(%rip), CTR6, CTR7
    602        vpaddd  .Ltwo(%rip), CTR6, CTR
    603 
    604        vmovdqu (KS), TMP4
    605        vpshufb TMP3, CTR0, CTR0
    606        vpxor   TMP4, CTR0, CTR0
    607        vpshufb TMP3, CTR1, CTR1
    608        vpxor   TMP4, CTR1, CTR1
    609        vpshufb TMP3, CTR2, CTR2
    610        vpxor   TMP4, CTR2, CTR2
    611        vpshufb TMP3, CTR3, CTR3
    612        vpxor   TMP4, CTR3, CTR3
    613        vpshufb TMP3, CTR4, CTR4
    614        vpxor   TMP4, CTR4, CTR4
    615        vpshufb TMP3, CTR5, CTR5
    616        vpxor   TMP4, CTR5, CTR5
    617        vpshufb TMP3, CTR6, CTR6
    618        vpxor   TMP4, CTR6, CTR6
    619        vpshufb TMP3, CTR7, CTR7
    620        vpxor   TMP4, CTR7, CTR7
    621 
    622        vmovdqu     16*0(Htbl), TMP3
    623        vpclmulqdq  $0x11, TMP3, TMP5, TMP1
    624        vpclmulqdq  $0x00, TMP3, TMP5, TMP2      
    625        vpshufd     $78, TMP5, TMP3
    626        vpxor       TMP5, TMP3, TMP5
    627        vmovdqu     128+0*16(Htbl), TMP3      
    628        vpclmulqdq  $0x00, TMP3, TMP5, TMP0
    629 
    630        ROUNDMUL 1
    631 
    632        ROUNDMUL 2
    633 
    634        ROUNDMUL 3
    635 
    636        ROUNDMUL 4
    637 
    638        ROUNDMUL 5
    639 
    640        ROUNDMUL 6
    641 
    642        vpxor   7*16(%rsp), T, TMP5
    643        vmovdqu 7*16(KS), TMP3
    644 
    645        vaesenc TMP3, CTR0, CTR0
    646        vaesenc TMP3, CTR1, CTR1
    647        vaesenc TMP3, CTR2, CTR2
    648        vaesenc TMP3, CTR3, CTR3
    649 
    650        vpshufd $78, TMP5, TMP4
    651        vpxor   TMP5, TMP4, TMP4
    652 
    653        vaesenc TMP3, CTR4, CTR4
    654        vaesenc TMP3, CTR5, CTR5
    655        vaesenc TMP3, CTR6, CTR6
    656        vaesenc TMP3, CTR7, CTR7
    657 
    658        vpclmulqdq  $0x11, 7*16(Htbl), TMP5, TMP3
    659        vpxor       TMP3, TMP1, TMP1
    660        vpclmulqdq  $0x00, 7*16(Htbl), TMP5, TMP3
    661        vpxor       TMP3, TMP2, TMP2
    662        vpclmulqdq  $0x00, 128+7*16(Htbl), TMP4, TMP3
    663        vpxor       TMP3, TMP0, TMP0
    664 
    665        ROUND 8    
    666        vmovdqa .Lpoly(%rip), TMP5
    667 
    668        vpxor   TMP1, TMP0, TMP0
    669        vpxor   TMP2, TMP0, TMP0
    670        vpsrldq $8, TMP0, TMP3
    671        vpxor   TMP3, TMP1, TMP4
    672        vpslldq $8, TMP0, TMP3
    673        vpxor   TMP3, TMP2, T
    674 
    675        vpclmulqdq  $0x10, TMP5, T, TMP1
    676        vpalignr    $8, T, T, T
    677        vpxor       T, TMP1, T
    678 
    679        ROUND 9
    680 
    681        vpclmulqdq  $0x10, TMP5, T, TMP1
    682        vpalignr    $8, T, T, T
    683        vpxor       T, TMP1, T
    684 
    685        vmovdqu 160(KS), TMP5
    686        cmp     $10, NR
    687        jbe     .LLast2
    688 
    689        ROUND 10
    690        ROUND 11
    691 
    692        vmovdqu 192(KS), TMP5
    693        cmp     $12, NR
    694        jbe     .LLast2
    695 
    696        ROUND 12
    697        ROUND 13
    698 
    699        vmovdqu 224(KS), TMP5
    700 
    701 .LLast2:
    702      
    703        vpxor       (PT), TMP5, TMP3
    704        vaesenclast TMP3, CTR0, CTR0
    705        vpxor       16(PT), TMP5, TMP3
    706        vaesenclast TMP3, CTR1, CTR1
    707        vpxor       32(PT), TMP5, TMP3
    708        vaesenclast TMP3, CTR2, CTR2
    709        vpxor       48(PT), TMP5, TMP3
    710        vaesenclast TMP3, CTR3, CTR3
    711        vpxor       64(PT), TMP5, TMP3
    712        vaesenclast TMP3, CTR4, CTR4
    713        vpxor       80(PT), TMP5, TMP3
    714        vaesenclast TMP3, CTR5, CTR5
    715        vpxor       96(PT), TMP5, TMP3
    716        vaesenclast TMP3, CTR6, CTR6
    717        vpxor       112(PT), TMP5, TMP3
    718        vaesenclast TMP3, CTR7, CTR7
    719 
    720        vmovdqu .Lbswap_mask(%rip), TMP3
    721 
    722        vmovdqu CTR0, (CT)
    723        vpshufb TMP3, CTR0, CTR0
    724        vmovdqu CTR1, 16(CT)
    725        vpshufb TMP3, CTR1, CTR1
    726        vmovdqu CTR2, 32(CT)
    727        vpshufb TMP3, CTR2, CTR2
    728        vmovdqu CTR3, 48(CT)
    729        vpshufb TMP3, CTR3, CTR3
    730        vmovdqu CTR4, 64(CT)
    731        vpshufb TMP3, CTR4, CTR4
    732        vmovdqu CTR5, 80(CT)
    733        vpshufb TMP3, CTR5, CTR5
    734        vmovdqu CTR6, 96(CT)
    735        vpshufb TMP3, CTR6, CTR6
    736        vmovdqu CTR7,112(CT)
    737        vpshufb TMP3, CTR7, CTR7
    738 
    739        vpxor   TMP4, T, T
    740 
    741        lea 128(CT), CT
    742        lea 128(PT), PT
    743    jmp  .LDataOctets
    744 
    745 .LEndOctets:
    746    
    747    vmovdqa CTR7, TMP5
    748    vmovdqa CTR6, 1*16(%rsp)
    749    vmovdqa CTR5, 2*16(%rsp)
    750    vmovdqa CTR4, 3*16(%rsp)
    751    vmovdqa CTR3, 4*16(%rsp)
    752    vmovdqa CTR2, 5*16(%rsp)
    753    vmovdqa CTR1, 6*16(%rsp)
    754    vmovdqa CTR0, 7*16(%rsp)
    755 
    756    vmovdqu     16*0(Htbl), TMP3
    757    vpclmulqdq  $0x11, TMP3, TMP5, TMP1
    758    vpclmulqdq  $0x00, TMP3, TMP5, TMP2      
    759    vpshufd     $78, TMP5, TMP3
    760    vpxor       TMP5, TMP3, TMP5
    761    vmovdqu     128+0*16(Htbl), TMP3      
    762    vpclmulqdq  $0x00, TMP3, TMP5, TMP0
    763 
    764    KARATSUBA 1
    765    KARATSUBA 2
    766    KARATSUBA 3      
    767    KARATSUBA 4
    768    KARATSUBA 5
    769    KARATSUBA 6
    770 
    771    vmovdqu     7*16(%rsp), TMP5
    772    vpxor       T, TMP5, TMP5
    773    vmovdqu     16*7(Htbl), TMP4            
    774    vpclmulqdq  $0x11, TMP4, TMP5, TMP3
    775    vpxor       TMP3, TMP1, TMP1
    776    vpclmulqdq  $0x00, TMP4, TMP5, TMP3
    777    vpxor       TMP3, TMP2, TMP2      
    778    vpshufd     $78, TMP5, TMP3
    779    vpxor       TMP5, TMP3, TMP5
    780    vmovdqu     128+7*16(Htbl), TMP4      
    781    vpclmulqdq  $0x00, TMP4, TMP5, TMP3
    782    vpxor       TMP3, TMP0, TMP0
    783 
    784    vpxor       TMP1, TMP0, TMP0
    785    vpxor       TMP2, TMP0, TMP0
    786 
    787    vpsrldq     $8, TMP0, TMP3
    788    vpxor       TMP3, TMP1, TMP4
    789    vpslldq     $8, TMP0, TMP3
    790    vpxor       TMP3, TMP2, T
    791 
    792    vmovdqa     .Lpoly(%rip), TMP2
    793 
    794    vpalignr    $8, T, T, TMP1
    795    vpclmulqdq  $0x10, TMP2, T, T
    796    vpxor       T, TMP1, T
    797 
    798    vpalignr    $8, T, T, TMP1
    799    vpclmulqdq  $0x10, TMP2, T, T
    800    vpxor       T, TMP1, T
    801 
    802    vpxor       TMP4, T, T
    803 
    804 #Here we encrypt any remaining whole block
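        # One counter block is run through the full key schedule, XORed with 16 bytes
        # of plaintext, and the resulting ciphertext block is folded into the hash
        # with GFMUL against the first Htbl entry.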
    805 .LDataSingles:
    806 
    807    cmp $16, len
    808    jb  .LDataTail
    809    sub $16, len
    810 
    811    vpshufb .Lbswap_mask(%rip), CTR, TMP1
    812    vpaddd  .Lone(%rip), CTR, CTR
    813 
    814    vpxor   (KS), TMP1, TMP1
    815    vaesenc 16*1(KS), TMP1, TMP1
    816    vaesenc 16*2(KS), TMP1, TMP1
    817    vaesenc 16*3(KS), TMP1, TMP1
    818    vaesenc 16*4(KS), TMP1, TMP1
    819    vaesenc 16*5(KS), TMP1, TMP1
    820    vaesenc 16*6(KS), TMP1, TMP1
    821    vaesenc 16*7(KS), TMP1, TMP1
    822    vaesenc 16*8(KS), TMP1, TMP1
    823    vaesenc 16*9(KS), TMP1, TMP1
    824    vmovdqu 16*10(KS), TMP2
    825    cmp     $10, NR
    826    je      .LLast3
    827    vaesenc 16*10(KS), TMP1, TMP1
    828    vaesenc 16*11(KS), TMP1, TMP1
    829    vmovdqu 16*12(KS), TMP2
    830    cmp     $12, NR
    831    je      .LLast3
    832    vaesenc 16*12(KS), TMP1, TMP1
    833    vaesenc 16*13(KS), TMP1, TMP1
    834    vmovdqu 16*14(KS), TMP2
    835 
    836 .LLast3:
    837    vaesenclast TMP2, TMP1, TMP1
    838 
    839    vpxor   (PT), TMP1, TMP1
    840    vmovdqu TMP1, (CT)
    841    addq    $16, CT
    842    addq    $16, PT
    843 
    844    vpshufb .Lbswap_mask(%rip), TMP1, TMP1
    845    vpxor   TMP1, T, T
    846    vmovdqu (Htbl), TMP0
    847    call    GFMUL
    848 
    849    jmp .LDataSingles
    850 
    851 #Here we encrypt the final partial block, if there is one
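        # The final len (< 16) bytes are copied into a zeroed 16-byte stack slot,
        # XORed with the keystream block, and the len ciphertext bytes are written to
        # CT; the stack copy is then zero-padded again so the hash sees the ciphertext
        # padded with zeros, as GCM requires.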
    852 .LDataTail:
    853 
    854    test    len, len
    855    jz      DATA_END
    856 # First prepare the counter block
    857    vpshufb .Lbswap_mask(%rip), CTR, TMP1
    858    vpaddd  .Lone(%rip), CTR, CTR
    859 
    860    vpxor   (KS), TMP1, TMP1
    861    vaesenc 16*1(KS), TMP1, TMP1
    862    vaesenc 16*2(KS), TMP1, TMP1
    863    vaesenc 16*3(KS), TMP1, TMP1
    864    vaesenc 16*4(KS), TMP1, TMP1
    865    vaesenc 16*5(KS), TMP1, TMP1
    866    vaesenc 16*6(KS), TMP1, TMP1
    867    vaesenc 16*7(KS), TMP1, TMP1
    868    vaesenc 16*8(KS), TMP1, TMP1
    869    vaesenc 16*9(KS), TMP1, TMP1
    870    vmovdqu 16*10(KS), TMP2
    871    cmp     $10, NR
    872    je      .LLast4
    873    vaesenc 16*10(KS), TMP1, TMP1
    874    vaesenc 16*11(KS), TMP1, TMP1
    875    vmovdqu 16*12(KS), TMP2
    876    cmp     $12, NR
    877    je      .LLast4
    878    vaesenc 16*12(KS), TMP1, TMP1
    879    vaesenc 16*13(KS), TMP1, TMP1
    880    vmovdqu 16*14(KS), TMP2
    881  
    882 .LLast4:
    883    vaesenclast TMP2, TMP1, TMP1
    884 #Zero a temp location
    885    vpxor   TMP2, TMP2, TMP2
    886    vmovdqa TMP2, (%rsp)
    887    
    888 # Copy the required bytes only (could probably use rep movsb)
    889    xor KS, KS  
    890 .LEncCpy:
    891        cmp     KS, len
    892        je      .LEncCpyEnd
    893        movb    (PT, KS, 1), %r8b
    894        movb    %r8b, (%rsp, KS, 1)
    895        inc     KS
    896        jmp .LEncCpy
    897 .LEncCpyEnd:
    898 # Xor with the counter block
    899    vpxor   (%rsp), TMP1, TMP0
    900 # Again, store at temp location
    901    vmovdqa TMP0, (%rsp)
    902 # Copy only the required bytes to CT, and zero the rest for the hash
    903    xor KS, KS
    904 .LEncCpy2:
    905    cmp     KS, len
    906    je      .LEncCpy3
    907    movb    (%rsp, KS, 1), %r8b
    908    movb    %r8b, (CT, KS, 1)
    909    inc     KS
    910    jmp .LEncCpy2
    911 .LEncCpy3:
    912    cmp     $16, KS
    913    je      .LEndCpy3
    914    movb    $0, (%rsp, KS, 1)
    915    inc     KS
    916    jmp .LEncCpy3
    917 .LEndCpy3:
    918   vmovdqa  (%rsp), TMP0
    919 
    920   vpshufb  .Lbswap_mask(%rip), TMP0, TMP0
    921   vpxor    TMP0, T, T
    922   vmovdqu  (Htbl), TMP0
    923   call     GFMUL
    924 
    925 DATA_END:
    926 
    927   vpshufb  .Lbswap_mask(%rip), T, T
    928   vpshufb  .Lbswap_mask(%rip), CTR, CTR
    929   vmovdqu  T, 272(Gctx)
    930   vmovdqu  CTR, 288(Gctx)
    931 
    932   movq %rbp, %rsp
    933   popq %rbp
    934   ret
    935   .size intel_aes_gcmENC, .-intel_aes_gcmENC
    936  
    937 #########################
    938 # Decrypt and Authenticate
    939 # void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx,uint64_t len);
    940 .type intel_aes_gcmDEC,@function
    941 .globl intel_aes_gcmDEC
    942 .align 16
    943 intel_aes_gcmDEC:
    944 # parameter 1: CT    # input
    945 # parameter 2: PT    # output
    946 # parameter 3: %rdx  # Gctx
    947 # parameter 4: %rcx  # len
    948 
    949 .macro DEC_KARATSUBA i
    950    vmovdqu     (7-\i)*16(CT), TMP5
    951    vpshufb     .Lbswap_mask(%rip), TMP5, TMP5
    952 
    953    vpclmulqdq  $0x11, 16*\i(Htbl), TMP5, TMP3
    954    vpxor       TMP3, TMP1, TMP1
    955    vpclmulqdq  $0x00, 16*\i(Htbl), TMP5, TMP3
    956    vpxor       TMP3, TMP2, TMP2
    957    vpshufd     $78, TMP5, TMP3
    958    vpxor       TMP5, TMP3, TMP5
    959    vpclmulqdq  $0x00, 128+\i*16(Htbl), TMP5, TMP3
    960    vpxor       TMP3, TMP0, TMP0
    961 .endm
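        # DEC_KARATSUBA i: hash ciphertext block (7-i) read directly from CT; it is
        # byte-swapped and multiplied by the power of H at 16*i(Htbl), accumulating
        # the middle Karatsuba product in TMP0, the high product in TMP1 and the low
        # product in TMP2.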
    962 
    963 .set PT,%rsi
    964 .set CT,%rdi
    965 .set Htbl, %rdx
    966 .set len, %rcx
    967 .set KS,%r9
    968 .set NR,%r10d
    969 
    970 .set Gctx, %rdx
    971 
    972 .set T,%xmm0
    973 .set TMP0,%xmm1
    974 .set TMP1,%xmm2
    975 .set TMP2,%xmm3
    976 .set TMP3,%xmm4
    977 .set TMP4,%xmm5
    978 .set TMP5,%xmm6
    979 .set CTR0,%xmm7
    980 .set CTR1,%xmm8
    981 .set CTR2,%xmm9
    982 .set CTR3,%xmm10
    983 .set CTR4,%xmm11
    984 .set CTR5,%xmm12
    985 .set CTR6,%xmm13
    986 .set CTR7,%xmm14
    987 .set CTR,%xmm15
    988 
    989    test  len, len
    990    jnz   .LbeginDec
    991    ret
    992   
    993 .LbeginDec:
    994 
    995    pushq   %rbp
    996    movq    %rsp, %rbp
    997    sub     $128, %rsp
    998    andq    $-16, %rsp
    999    vmovdqu 288(Gctx), CTR
   1000    vmovdqu 272(Gctx), T
   1001    mov     304(Gctx), KS
   1002 # AESContext->Nr
   1003    mov     244(KS), NR
   1004 
   1005    vpshufb .Lbswap_mask(%rip), CTR, CTR
   1006    vpshufb .Lbswap_mask(%rip), T, T
   1007     
   1008    vmovdqu .Lbswap_mask(%rip), TMP3
   1009    jmp     .LDECOctets
   1010      
   1011 # Decrypt 8 blocks each time while hashing them at the same time
   1012 .align 64
   1013 .LDECOctets:
   1014   
   1015        cmp $128, len
   1016        jb  .LDECSingles
   1017        sub $128, len
   1018 
   1019        vmovdqa CTR, CTR0
   1020        vpaddd  .Lone(%rip), CTR0, CTR1
   1021        vpaddd  .Ltwo(%rip), CTR0, CTR2
   1022        vpaddd  .Lone(%rip), CTR2, CTR3
   1023        vpaddd  .Ltwo(%rip), CTR2, CTR4
   1024        vpaddd  .Lone(%rip), CTR4, CTR5
   1025        vpaddd  .Ltwo(%rip), CTR4, CTR6
   1026        vpaddd  .Lone(%rip), CTR6, CTR7
   1027        vpaddd  .Ltwo(%rip), CTR6, CTR
   1028 
   1029        vpshufb TMP3, CTR0, CTR0
   1030        vpshufb TMP3, CTR1, CTR1
   1031        vpshufb TMP3, CTR2, CTR2
   1032        vpshufb TMP3, CTR3, CTR3
   1033        vpshufb TMP3, CTR4, CTR4
   1034        vpshufb TMP3, CTR5, CTR5
   1035        vpshufb TMP3, CTR6, CTR6
   1036        vpshufb TMP3, CTR7, CTR7
   1037 
   1038        vmovdqu (KS), TMP3
   1039        vpxor  TMP3, CTR0, CTR0
   1040        vpxor  TMP3, CTR1, CTR1
   1041        vpxor  TMP3, CTR2, CTR2
   1042        vpxor  TMP3, CTR3, CTR3
   1043        vpxor  TMP3, CTR4, CTR4
   1044        vpxor  TMP3, CTR5, CTR5
   1045        vpxor  TMP3, CTR6, CTR6
   1046        vpxor  TMP3, CTR7, CTR7
   1047 
   1048        vmovdqu     7*16(CT), TMP5
   1049        vpshufb     .Lbswap_mask(%rip), TMP5, TMP5
   1050        vmovdqu     16*0(Htbl), TMP3
   1051        vpclmulqdq  $0x11, TMP3, TMP5, TMP1
   1052        vpclmulqdq  $0x00, TMP3, TMP5, TMP2      
   1053        vpshufd     $78, TMP5, TMP3
   1054        vpxor       TMP5, TMP3, TMP5
   1055        vmovdqu     128+0*16(Htbl), TMP3      
   1056        vpclmulqdq  $0x00, TMP3, TMP5, TMP0
   1057 
   1058        ROUND 1
   1059        DEC_KARATSUBA 1
   1060 
   1061        ROUND 2
   1062        DEC_KARATSUBA 2
   1063 
   1064        ROUND 3
   1065        DEC_KARATSUBA 3
   1066 
   1067        ROUND 4
   1068        DEC_KARATSUBA 4
   1069 
   1070        ROUND 5
   1071        DEC_KARATSUBA 5
   1072 
   1073        ROUND 6
   1074        DEC_KARATSUBA 6
   1075 
   1076        ROUND 7
   1077 
   1078        vmovdqu     0*16(CT), TMP5
   1079        vpshufb     .Lbswap_mask(%rip), TMP5, TMP5
   1080        vpxor       T, TMP5, TMP5
   1081        vmovdqu     16*7(Htbl), TMP4
   1082            
   1083        vpclmulqdq  $0x11, TMP4, TMP5, TMP3
   1084        vpxor       TMP3, TMP1, TMP1
   1085        vpclmulqdq  $0x00, TMP4, TMP5, TMP3
   1086        vpxor       TMP3, TMP2, TMP2
   1087 
   1088        vpshufd     $78, TMP5, TMP3
   1089        vpxor       TMP5, TMP3, TMP5
   1090        vmovdqu     128+7*16(Htbl), TMP4
   1091 
   1092        vpclmulqdq  $0x00, TMP4, TMP5, TMP3
   1093        vpxor       TMP3, TMP0, TMP0
   1094 
   1095        ROUND 8      
   1096 
   1097        vpxor       TMP1, TMP0, TMP0
   1098        vpxor       TMP2, TMP0, TMP0
   1099 
   1100        vpsrldq     $8, TMP0, TMP3
   1101        vpxor       TMP3, TMP1, TMP4
   1102        vpslldq     $8, TMP0, TMP3
   1103        vpxor       TMP3, TMP2, T
   1104        vmovdqa	  .Lpoly(%rip), TMP2
   1105 
   1106        vpalignr    $8, T, T, TMP1
   1107        vpclmulqdq  $0x10, TMP2, T, T
   1108        vpxor       T, TMP1, T
   1109 
   1110        ROUND 9
   1111 
   1112        vpalignr    $8, T, T, TMP1
   1113        vpclmulqdq  $0x10, TMP2, T, T
   1114        vpxor       T, TMP1, T
   1115 
   1116        vmovdqu     160(KS), TMP5
   1117        cmp         $10, NR
   1118 
   1119        jbe  .LDECLast1
   1120 
   1121        ROUND 10
   1122        ROUND 11
   1123 
   1124        vmovdqu     192(KS), TMP5
   1125        cmp         $12, NR       
   1126 
   1127        jbe  .LDECLast1
   1128 
   1129        ROUND 12
   1130        ROUND 13
   1131 
   1132        vmovdqu  224(KS), TMP5
   1133 
   1134 .LDECLast1:      
   1135      
   1136        vpxor   (CT), TMP5, TMP3
   1137        vaesenclast TMP3, CTR0, CTR0
   1138        vpxor   16(CT), TMP5, TMP3
   1139        vaesenclast TMP3, CTR1, CTR1
   1140        vpxor   32(CT), TMP5, TMP3
   1141        vaesenclast TMP3, CTR2, CTR2
   1142        vpxor   48(CT), TMP5, TMP3
   1143        vaesenclast TMP3, CTR3, CTR3
   1144        vpxor   64(CT), TMP5, TMP3
   1145        vaesenclast TMP3, CTR4, CTR4
   1146        vpxor   80(CT), TMP5, TMP3
   1147        vaesenclast TMP3, CTR5, CTR5
   1148        vpxor   96(CT), TMP5, TMP3
   1149        vaesenclast TMP3, CTR6, CTR6
   1150        vpxor   112(CT), TMP5, TMP3
   1151        vaesenclast TMP3, CTR7, CTR7
   1152 
   1153        vmovdqu .Lbswap_mask(%rip), TMP3
   1154 
   1155        vmovdqu CTR0, (PT)
   1156        vmovdqu CTR1, 16(PT)
   1157        vmovdqu CTR2, 32(PT)
   1158        vmovdqu CTR3, 48(PT)
   1159        vmovdqu CTR4, 64(PT)
   1160        vmovdqu CTR5, 80(PT)
   1161        vmovdqu CTR6, 96(PT)
   1162        vmovdqu CTR7,112(PT)
   1163 
   1164        vpxor   TMP4, T, T
   1165 
   1166        lea 128(CT), CT
   1167        lea 128(PT), PT
   1168   jmp  .LDECOctets
   1169   
   1170 #Here we decrypt and hash any remaining whole block
   1171 .LDECSingles:
   1172 
   1173    cmp   $16, len
   1174    jb    .LDECTail
   1175    sub   $16, len
   1176 
   1177    vmovdqu  (CT), TMP1
   1178    vpshufb  .Lbswap_mask(%rip), TMP1, TMP1
   1179    vpxor    TMP1, T, T
   1180    vmovdqu  (Htbl), TMP0
   1181    call     GFMUL
   1182 
   1183 
   1184    vpshufb  .Lbswap_mask(%rip), CTR, TMP1
   1185    vpaddd   .Lone(%rip), CTR, CTR
   1186 
   1187    vpxor    (KS), TMP1, TMP1
   1188    vaesenc  16*1(KS), TMP1, TMP1
   1189    vaesenc  16*2(KS), TMP1, TMP1
   1190    vaesenc  16*3(KS), TMP1, TMP1
   1191    vaesenc  16*4(KS), TMP1, TMP1
   1192    vaesenc  16*5(KS), TMP1, TMP1
   1193    vaesenc  16*6(KS), TMP1, TMP1
   1194    vaesenc  16*7(KS), TMP1, TMP1
   1195    vaesenc  16*8(KS), TMP1, TMP1
   1196    vaesenc  16*9(KS), TMP1, TMP1
   1197    vmovdqu  16*10(KS), TMP2
   1198    cmp      $10, NR
   1199    je       .LDECLast2
   1200    vaesenc  16*10(KS), TMP1, TMP1
   1201    vaesenc  16*11(KS), TMP1, TMP1
   1202    vmovdqu  16*12(KS), TMP2
   1203    cmp      $12, NR
   1204    je       .LDECLast2
   1205    vaesenc  16*12(KS), TMP1, TMP1
   1206    vaesenc  16*13(KS), TMP1, TMP1
   1207    vmovdqu  16*14(KS), TMP2
   1208 .LDECLast2:
   1209    vaesenclast TMP2, TMP1, TMP1
   1210 
   1211    vpxor    (CT), TMP1, TMP1
   1212    vmovdqu  TMP1, (PT)
   1213    addq     $16, CT
   1214    addq     $16, PT  
   1215    jmp   .LDECSingles
   1216 
   1217 #Here we decrypt the final partial block, if there is one
   1218 .LDECTail:
   1219   test   len, len
   1220   jz     .LDEC_END
   1221 
   1222   vpshufb  .Lbswap_mask(%rip), CTR, TMP1
   1223   vpaddd .Lone(%rip), CTR, CTR
   1224 
   1225   vpxor  (KS), TMP1, TMP1
   1226   vaesenc  16*1(KS), TMP1, TMP1
   1227   vaesenc  16*2(KS), TMP1, TMP1
   1228   vaesenc  16*3(KS), TMP1, TMP1
   1229   vaesenc  16*4(KS), TMP1, TMP1
   1230   vaesenc  16*5(KS), TMP1, TMP1
   1231   vaesenc  16*6(KS), TMP1, TMP1
   1232   vaesenc  16*7(KS), TMP1, TMP1
   1233   vaesenc  16*8(KS), TMP1, TMP1
   1234   vaesenc  16*9(KS), TMP1, TMP1
   1235   vmovdqu  16*10(KS), TMP2
   1236   cmp      $10, NR
   1237   je       .LDECLast3
   1238   vaesenc  16*10(KS), TMP1, TMP1
   1239   vaesenc  16*11(KS), TMP1, TMP1
   1240   vmovdqu  16*12(KS), TMP2
   1241   cmp      $12, NR
   1242   je       .LDECLast3
   1243   vaesenc  16*12(KS), TMP1, TMP1
   1244   vaesenc  16*13(KS), TMP1, TMP1
   1245   vmovdqu  16*14(KS), TMP2
   1246 
   1247 .LDECLast3:
   1248   vaesenclast TMP2, TMP1, TMP1
   1249  
   1250   vpxor   TMP2, TMP2, TMP2
   1251   vmovdqa TMP2, (%rsp) 
   1252 # Copy the required bytes only (could probably use rep movsb)
   1253    xor KS, KS  
   1254 .LDecCpy:
   1255        cmp     KS, len
   1256        je      .LDecCpy2
   1257        movb    (CT, KS, 1), %r8b
   1258        movb    %r8b, (%rsp, KS, 1)
   1259        inc     KS
   1260        jmp     .LDecCpy
   1261 .LDecCpy2:
   1262        cmp     $16, KS
   1263        je      .LDecCpyEnd
   1264        movb    $0, (%rsp, KS, 1)
   1265        inc     KS
   1266        jmp     .LDecCpy2
   1267 .LDecCpyEnd:
   1268 # Xor with the counter block
   1269    vmovdqa (%rsp), TMP0
   1270    vpxor   TMP0, TMP1, TMP1
   1271 # Again, store at temp location
   1272    vmovdqa TMP1, (%rsp)
   1273 # Copy only the required bytes to PT, and zero the rest for the hash
   1274    xor KS, KS
   1275 .LDecCpy3:
   1276    cmp     KS, len
   1277    je      .LDecCpyEnd3
   1278    movb    (%rsp, KS, 1), %r8b
   1279    movb    %r8b, (PT, KS, 1)
   1280    inc     KS
   1281    jmp     .LDecCpy3
   1282 .LDecCpyEnd3:
   1283   vpshufb  .Lbswap_mask(%rip), TMP0, TMP0
   1284   vpxor    TMP0, T, T
   1285   vmovdqu  (Htbl), TMP0
   1286   call     GFMUL
   1287 .LDEC_END:
   1288 
   1289   vpshufb  .Lbswap_mask(%rip), T, T
   1290   vpshufb  .Lbswap_mask(%rip), CTR, CTR
   1291   vmovdqu  T, 272(Gctx)
   1292   vmovdqu  CTR, 288(Gctx)
   1293 
   1294   movq %rbp, %rsp
   1295   popq %rbp
   1296   ret
   1297  .size intel_aes_gcmDEC, .-intel_aes_gcmDEC
   1298 #########################
   1299 # a = T
   1300 # b = TMP0 - remains unchanged
   1301 # res = T
   1302 # uses also TMP1,TMP2,TMP3,TMP4
   1303 # __m128i GFMUL(__m128i A, __m128i B);
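        # Three vpclmulqdq instructions (Karatsuba) form the 256-bit carry-less
        # product of T and TMP0; two folding steps against the bit-reflected constant
        # in .Lpoly then reduce it modulo the GCM polynomial x^128 + x^7 + x^2 + x + 1,
        # leaving the result in T.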
   1304 .type GFMUL,@function
   1305 .globl GFMUL
   1306 GFMUL:  
   1307    pushq   %rbp
   1308    movq    %rsp, %rbp
   1309    vpclmulqdq  $0x00, TMP0, T, TMP1
   1310    vpclmulqdq  $0x11, TMP0, T, TMP4
   1311 
   1312    vpshufd     $78, T, TMP2
   1313    vpshufd     $78, TMP0, TMP3
   1314    vpxor       T, TMP2, TMP2
   1315    vpxor       TMP0, TMP3, TMP3
   1316 
   1317    vpclmulqdq  $0x00, TMP3, TMP2, TMP2
   1318    vpxor       TMP1, TMP2, TMP2
   1319    vpxor       TMP4, TMP2, TMP2
   1320 
   1321    vpslldq     $8, TMP2, TMP3
   1322    vpsrldq     $8, TMP2, TMP2
   1323 
   1324    vpxor       TMP3, TMP1, TMP1
   1325    vpxor       TMP2, TMP4, TMP4
   1326 
   1327    vpclmulqdq  $0x10, .Lpoly(%rip), TMP1, TMP2
   1328    vpshufd     $78, TMP1, TMP3
   1329    vpxor       TMP3, TMP2, TMP1
   1330 
   1331    vpclmulqdq  $0x10, .Lpoly(%rip), TMP1, TMP2
   1332    vpshufd     $78, TMP1, TMP3
   1333    vpxor       TMP3, TMP2, TMP1
   1334 
   1335    vpxor       TMP4, TMP1, T
   1336    movq %rbp, %rsp
   1337    popq %rbp
   1338    ret
   1339 .size GFMUL, .-GFMUL