tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

intel-aes-x64-masm.asm (21141B)


      1 ; LICENSE:
      2 ; This submission to NSS is to be made available under the terms of the
      3 ; Mozilla Public License, v. 2.0. You can obtain one at
      4 ; http://mozilla.org/MPL/2.0/.
      5 ;###############################################################################
      6 ; Copyright(c) 2014, Intel Corp.
      7 ; Developers and authors:
      8 ; Shay Gueron and Vlad Krasnov
      9 ; Intel Corporation, Israel Development Centre, Haifa, Israel
     10 ; Please send feedback directly to crypto.feedback.alias@intel.com
     11 
     12 
     13 .DATA
     14 ALIGN 16                                                   ; every table below is 16 bytes, so each stays 16-byte aligned; intel_aes_encrypt_init_128 reads Lcon1, Lmask and Lcon2 with movdqa
     15 Lmask dd 0c0f0e0dh,0c0f0e0dh,0c0f0e0dh,0c0f0e0dh          ; pshufb control for the 128-bit schedule: each dword takes source bytes 13,14,15,12
     16 Lmask192 dd 004070605h, 004070605h, 004070605h, 004070605h  ; pshufb control for the 192-bit schedule: each dword takes source bytes 5,6,7,4
     17 Lmask256 dd 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh  ; pshufb control for the 256-bit schedule (same value as Lmask)
     18 Lcon1 dd 1,1,1,1                                           ; starting round constant; the key schedules double it with pslld
     19 Lcon2 dd 1bh,1bh,1bh,1bh                                   ; round constant loaded by the 128-bit schedule after its loop
     20 
     21 .CODE
     22 ; Register aliases shared by the ECB, CBC and CTR generator macros.
     23 ctx     textequ <rcx>      ; base register for round-key loads of the form [i*16 + ctx]
     24 output  textequ <rdx>      ; destination pointer, advanced as blocks are written
     25 input   textequ <r8>       ; source pointer; each generator macro loads it from [rsp + 40] at entry
     26 inputLen textequ <r9d>     ; remaining byte count (32-bit); loaded from [rsp + 48] at entry
     27 
     28 
     29 aes_rnd MACRO i
     30    ; One aesenc round on the eight blocks held in xmm0..xmm7.
     31    ; The round key is fetched from [i*16 + ctx] into xmm8, and
     32    ; xmm8 is left holding that key afterwards.
     33    ; The eight per-register instructions come from the FOR block,
     34    ; emitted in register order, xmm0 first and xmm7 last.
     35    movdqu  xmm8, [i*16 + ctx]
     36    FOR blk, <xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7>
     37        aesenc  blk, xmm8
     38    ENDM
     39    ENDM
     40 
     41 aes_last_rnd MACRO i
     42    ; Final encryption round (aesenclast) on the eight blocks held
     43    ; in xmm0..xmm7. The round key is fetched from [i*16 + ctx]
     44    ; into xmm8, and xmm8 is left holding that key afterwards.
     45    ; The eight per-register instructions come from the FOR block,
     46    ; emitted in register order, xmm0 first and xmm7 last.
     47    movdqu  xmm8, [i*16 + ctx]
     48    FOR blk, <xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7>
     49        aesenclast  blk, xmm8
     50    ENDM
     51    ENDM
     52 
     53 aes_dec_rnd MACRO i
     54    ; One aesdec round on the eight blocks held in xmm0..xmm7.
     55    ; The round key is fetched from [i*16 + ctx] into xmm8, and
     56    ; xmm8 is left holding that key afterwards.
     57    ; The eight per-register instructions come from the FOR block,
     58    ; emitted in register order, xmm0 first and xmm7 last.
     59    movdqu  xmm8, [i*16 + ctx]
     60    FOR blk, <xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7>
     61        aesdec  blk, xmm8
     62    ENDM
     63    ENDM
     64 
     65 aes_dec_last_rnd MACRO i
     66    ; Final decryption round (aesdeclast) on the eight blocks held
     67    ; in xmm0..xmm7. The round key is fetched from [i*16 + ctx]
     68    ; into xmm8, and xmm8 is left holding that key afterwards.
     69    ; The eight per-register instructions come from the FOR block,
     70    ; emitted in register order, xmm0 first and xmm7 last.
     71    movdqu  xmm8, [i*16 + ctx]
     72    FOR blk, <xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7>
     73        aesdeclast  blk, xmm8
     74    ENDM
     75    ENDM
     76 
     77 
     78 gen_aes_ecb_func MACRO enc, rnds
     79 
     80 LOCAL   loop8
     81 LOCAL   loop1
     82 LOCAL   bail
     83 ; AES-ECB body. enc eq 1 emits aesenc/aesenclast, any other value aesdec/aesdeclast;
     84 ; rnds is the index of the last round key. In: rcx = round keys, rdx = output,
     85 ; [rsp + 40] = input pointer, [rsp + 48] = byte count. Returns rax = 0.
     86 ; Only whole 16-byte blocks are processed; a shorter tail is left untouched.
     87        mov     input,      [rsp + 1*8 + 8*4]
     88        mov     inputLen,   [rsp + 1*8 + 8*5]   ; 32-bit load zero-extends into r9, so no prior clearing is needed
     89        sub     rsp, 3*16                        ; room to preserve xmm6, xmm7 and xmm8
     90        movdqu  [rsp + 0*16], xmm6
     91        movdqu  [rsp + 1*16], xmm7
     92        movdqu  [rsp + 2*16], xmm8
     93 
     94 loop8:
     95        cmp     inputLen, 8*16
     96        jb      loop1                            ; fewer than eight blocks left
     97 
     98        movdqu  xmm0, [0*16 + input]
     99        movdqu  xmm1, [1*16 + input]
    100        movdqu  xmm2, [2*16 + input]
    101        movdqu  xmm3, [3*16 + input]
    102        movdqu  xmm4, [4*16 + input]
    103        movdqu  xmm5, [5*16 + input]
    104        movdqu  xmm6, [6*16 + input]
    105        movdqu  xmm7, [7*16 + input]
    106 ; XOR round key 0 into all eight blocks
    107        movdqu  xmm8, [0*16 + ctx]
    108        pxor    xmm0, xmm8
    109        pxor    xmm1, xmm8
    110        pxor    xmm2, xmm8
    111        pxor    xmm3, xmm8
    112        pxor    xmm4, xmm8
    113        pxor    xmm5, xmm8
    114        pxor    xmm6, xmm8
    115        pxor    xmm7, xmm8
    116 ; select the round macros and instructions for the requested direction
    117 IF enc eq 1
    118        rnd textequ <aes_rnd>
    119        lastrnd textequ <aes_last_rnd>
    120        aesinst textequ <aesenc>
    121        aeslastinst textequ <aesenclast>
    122 ELSE
    123        rnd textequ <aes_dec_rnd>
    124        lastrnd textequ <aes_dec_last_rnd>
    125        aesinst textequ <aesdec>
    126        aeslastinst textequ <aesdeclast>
    127 ENDIF
    128 ; middle rounds on all eight blocks, then the final round
    129        i = 1
    130        WHILE i LT rnds
    131            rnd i
    132            i = i+1
    133            ENDM
    134        lastrnd rnds
    135 
    136        movdqu  [0*16 + output], xmm0
    137        movdqu  [1*16 + output], xmm1
    138        movdqu  [2*16 + output], xmm2
    139        movdqu  [3*16 + output], xmm3
    140        movdqu  [4*16 + output], xmm4
    141        movdqu  [5*16 + output], xmm5
    142        movdqu  [6*16 + output], xmm6
    143        movdqu  [7*16 + output], xmm7
    144 
    145        lea input, [8*16 + input]
    146        lea output, [8*16 + output]
    147        sub inputLen, 8*16
    148        jmp loop8
    149 ; single-block tail: same rounds on xmm0 only, xmm7 holds the round key
    150 loop1:
    151        cmp     inputLen, 1*16
    152        jb      bail
    153 
    154        movdqu  xmm0, [input]
    155        movdqu  xmm7, [0*16 + ctx]
    156        pxor    xmm0, xmm7
    157 
    158        i = 1
    159    WHILE i LT rnds
    160            movdqu  xmm7, [i*16 + ctx]
    161            aesinst  xmm0, xmm7
    162            i = i+1
    163        ENDM
    164        movdqu  xmm7, [rnds*16 + ctx]
    165        aeslastinst xmm0, xmm7
    166 
    167        movdqu  [output], xmm0
    168 
    169        lea input, [1*16 + input]
    170        lea output, [1*16 + output]
    171        sub inputLen, 1*16
    172        jmp loop1
    173 
    174 bail:
    175        xor rax, rax                             ; return value 0
    176 
    177        movdqu  xmm6, [rsp + 0*16]
    178        movdqu  xmm7, [rsp + 1*16]
    179        movdqu  xmm8, [rsp + 2*16]
    180        add     rsp, 3*16
    181        ret
    182 ENDM
    183 
    184 intel_aes_encrypt_ecb_128 PROC        ; ECB encryption; body from gen_aes_ecb_func with enc = 1 and last round key index 10
    185 gen_aes_ecb_func 1, 10
    186 intel_aes_encrypt_ecb_128 ENDP
    187 
    188 intel_aes_encrypt_ecb_192 PROC        ; ECB encryption; body from gen_aes_ecb_func with enc = 1 and last round key index 12
    189 gen_aes_ecb_func 1, 12
    190 intel_aes_encrypt_ecb_192 ENDP
    191 
    192 intel_aes_encrypt_ecb_256 PROC        ; ECB encryption; body from gen_aes_ecb_func with enc = 1 and last round key index 14
    193 gen_aes_ecb_func 1, 14
    194 intel_aes_encrypt_ecb_256 ENDP
    195 
    196 intel_aes_decrypt_ecb_128 PROC        ; ECB decryption; body from gen_aes_ecb_func with enc = 0 and last round key index 10
    197 gen_aes_ecb_func 0, 10
    198 intel_aes_decrypt_ecb_128 ENDP
    199 
    200 intel_aes_decrypt_ecb_192 PROC        ; ECB decryption; body from gen_aes_ecb_func with enc = 0 and last round key index 12
    201 gen_aes_ecb_func 0, 12
    202 intel_aes_decrypt_ecb_192 ENDP
    203 
    204 intel_aes_decrypt_ecb_256 PROC        ; ECB decryption; body from gen_aes_ecb_func with enc = 0 and last round key index 14
    205 gen_aes_ecb_func 0, 14
    206 intel_aes_decrypt_ecb_256 ENDP
    207 
    208 
    209 KEY textequ <rcx>      ; key-schedule routines: pointer to the raw key bytes
    210 KS  textequ <rdx>      ; pointer into the round-key output buffer; the encrypt_init routines advance it
    211 ITR textequ <r8>       ; scratch: constant-table address or key bytes first, then the loop counter
    212 
    213 intel_aes_encrypt_init_128  PROC           ; expand the 16-byte key at [rcx] into round keys 0..10 at [rdx]; changes rdx, r8, xmm0-xmm4 and flags
    214 
    215    movdqu  xmm1, [KEY]
    216    movdqu  [KS], xmm1                       ; round key 0 is the key itself
    217    movdqa  xmm2, xmm1
    218 
    219    lea ITR, Lcon1
    220    movdqa  xmm0, [ITR]                      ; xmm0 = round constant, 1 in every dword
    221    lea ITR, Lmask
    222    movdqa  xmm4, [ITR]                      ; xmm4 = pshufb control
    223 
    224    mov ITR, 8                               ; the loop produces round keys 1..8
    225 
    226 Lenc_128_ks_loop:
    227        lea KS, [16 + KS]                    ; step to the slot of the key being generated
    228        dec ITR                              ; ZF set here feeds the jne at the loop bottom; no instruction in between writes flags
    229 
    230        pshufb  xmm2, xmm4                   ; every dword = last word of the previous key, rotated by one byte
    231        aesenclast  xmm2, xmm0               ; byte substitution, then XOR with the round constant (all columns are equal, so the row shift has no effect)
    232        pslld   xmm0, 1                      ; double the round constant for the next key
    233        movdqa  xmm3, xmm1
    234        pslldq  xmm3, 4
    235        pxor    xmm1, xmm3
    236        pslldq  xmm3, 4
    237        pxor    xmm1, xmm3
    238        pslldq  xmm3, 4
    239        pxor    xmm1, xmm3                   ; word k of xmm1 = XOR of previous key words 0..k
    240        pxor    xmm1, xmm2                   ; xmm1 = new round key
    241        movdqu  [KS], xmm1
    242        movdqa  xmm2, xmm1
    243 
    244        jne Lenc_128_ks_loop
    245 ; Round keys 9 and 10 use the constants 1bh and 36h, which continued doubling of xmm0 would not give.
    246    lea ITR, Lcon2
    247    movdqa  xmm0, [ITR]
    248 
    249    pshufb  xmm2, xmm4
    250    aesenclast  xmm2, xmm0
    251    pslld   xmm0, 1                          ; 1bh becomes 36h for round key 10
    252    movdqa  xmm3, xmm1
    253    pslldq  xmm3, 4
    254    pxor    xmm1, xmm3
    255    pslldq  xmm3, 4
    256    pxor    xmm1, xmm3
    257    pslldq  xmm3, 4
    258    pxor    xmm1, xmm3
    259    pxor    xmm1, xmm2
    260    movdqu  [16 + KS], xmm1                  ; round key 9 (KS still points at round key 8)
    261    movdqa  xmm2, xmm1
    262 
    263    pshufb  xmm2, xmm4
    264    aesenclast  xmm2, xmm0
    265    movdqa  xmm3, xmm1
    266    pslldq  xmm3, 4
    267    pxor    xmm1, xmm3
    268    pslldq  xmm3, 4
    269    pxor    xmm1, xmm3
    270    pslldq  xmm3, 4
    271    pxor    xmm1, xmm3
    272    pxor    xmm1, xmm2
    273    movdqu  [32 + KS], xmm1                  ; round key 10
    274    movdqa  xmm2, xmm1                       ; copy is not used before the ret below
    275 
    276    ret
    277 intel_aes_encrypt_init_128  ENDP
    278 
    279 
    280 intel_aes_decrypt_init_128  PROC           ; build the decryption round keys at [rdx] from the 16-byte key at [rcx]
    281 
    282    push    KS                               ; the callee advances rdx, so keep the original buffer pointer
    283    push    KEY
    284 
    285    call    intel_aes_encrypt_init_128       ; fills round keys 0..10 in encryption order
    286 
    287    pop     KEY
    288    pop     KS
    289 ; swap round keys 0 and 10 without transforming them
    290    movdqu  xmm0, [0*16 + KS]
    291    movdqu  xmm1, [10*16 + KS]
    292    movdqu  [10*16 + KS], xmm0
    293    movdqu  [0*16 + KS], xmm1
    294 ; swap key i with key 10-i for i = 1..4, applying aesimc to both
    295    i = 1
    296    WHILE i LT 5
    297        movdqu  xmm0, [i*16 + KS]
    298        movdqu  xmm1, [(10-i)*16 + KS]
    299 
    300        aesimc  xmm0, xmm0
    301        aesimc  xmm1, xmm1
    302 
    303        movdqu  [(10-i)*16 + KS], xmm0
    304        movdqu  [i*16 + KS], xmm1
    305 
    306        i = i+1
    307    ENDM
    308 ; key 5 is the middle key: transform it in place
    309    movdqu  xmm0, [5*16 + KS]
    310    aesimc  xmm0, xmm0
    311    movdqu  [5*16 + KS], xmm0
    312    ret
    313 intel_aes_decrypt_init_128  ENDP
    314 
    315 
    316 intel_aes_encrypt_init_192  PROC           ; expand the 24-byte key at [rcx] into round keys 0..12 at [rdx]; changes rdx, r8, xmm0-xmm5 and flags
    317 ; xmm6 and xmm7 are used as scratch below, so they are saved on the stack first
    318    sub     rsp, 16*2
    319    movdqu  [16*0 + rsp], xmm6
    320    movdqu  [16*1 + rsp], xmm7
    321 
    322    movdqu  xmm1, [KEY]                      ; xmm1 = key bytes 0..15
    323    mov     ITR, [16 + KEY]                  ; r8 = key bytes 16..23
    324    movd    xmm3, ITR                        ; xmm3 low qword = those 8 bytes (the later code uses both dwords)
    325 
    326    movdqu  [KS], xmm1                       ; schedule bytes 0..15
    327    movdqa  xmm5, xmm3                       ; xmm5 carries the two words not yet stored
    328 
    329    lea ITR, Lcon1
    330    movdqu  xmm0, [ITR]                      ; xmm0 = round constant, starts at 1
    331    lea ITR, Lmask192
    332    movdqu  xmm4, [ITR]                      ; xmm4 = pshufb control
    333 
    334    mov ITR, 4                               ; each pass runs two expansion steps and stores 48 schedule bytes
    335 
    336 Lenc_192_ks_loop:
    337        movdqa  xmm2, xmm3
    338        pshufb  xmm2, xmm4                   ; every dword = dword 1 of xmm3, rotated by one byte
    339        aesenclast xmm2, xmm0                ; byte substitution, then XOR with the round constant
    340        pslld   xmm0, 1                      ; double the round constant
    341 
    342        movdqa  xmm6, xmm1
    343        movdqa  xmm7, xmm3
    344        pslldq  xmm6, 4
    345        pslldq  xmm7, 4
    346        pxor    xmm1, xmm6
    347        pxor    xmm3, xmm7
    348        pslldq  xmm6, 4
    349        pxor    xmm1, xmm6
    350        pslldq  xmm6, 4
    351        pxor    xmm1, xmm6
    352        pxor    xmm1, xmm2                   ; xmm1 = next four schedule words
    353        pshufd  xmm2, xmm1, 0ffh             ; broadcast the last of them
    354        pxor    xmm3, xmm2                   ; xmm3 low qword = the two words after those
    355 
    356        movdqa  xmm6, xmm1
    357        shufpd  xmm5, xmm1, 00h              ; xmm5 = pending two words followed by the low half of xmm1
    358        shufpd  xmm6, xmm3, 01h              ; xmm6 = high half of xmm1 followed by the low half of xmm3
    359 
    360        movdqu  [16 + KS], xmm5
    361        movdqu  [32 + KS], xmm6
    362 ; second expansion step of the pass, same sequence as above
    363        movdqa  xmm2, xmm3
    364        pshufb  xmm2, xmm4
    365        aesenclast  xmm2, xmm0
    366        pslld   xmm0, 1
    367 
    368        movdqa  xmm6, xmm1
    369        movdqa  xmm7, xmm3
    370        pslldq  xmm6, 4
    371        pslldq  xmm7, 4
    372        pxor    xmm1, xmm6
    373        pxor    xmm3, xmm7
    374        pslldq  xmm6, 4
    375        pxor    xmm1, xmm6
    376        pslldq  xmm6, 4
    377        pxor    xmm1, xmm6
    378        pxor    xmm1, xmm2
    379        pshufd  xmm2, xmm1, 0ffh
    380        pxor    xmm3, xmm2
    381 
    382        movdqu  [48 + KS], xmm1
    383        movdqa  xmm5, xmm3                   ; two words left pending for the next pass
    384 
    385        lea KS, [48 + KS]
    386 
    387        dec ITR
    388        jnz Lenc_192_ks_loop
    389 ; NOTE(review): KS is now 192 bytes past its entry value, so this store covers schedule bytes 208..223, past round key 12; the buffer must be at least 224 bytes
    390    movdqu  [16 + KS], xmm5
    391 
    392    movdqu  xmm7, [16*1 + rsp]
    393    movdqu  xmm6, [16*0 + rsp]
    394    add rsp, 16*2
    395    ret
    396 intel_aes_encrypt_init_192  ENDP
    397 
    398 intel_aes_decrypt_init_192  PROC           ; build the decryption round keys at [rdx] from the 24-byte key at [rcx]
    399    push    KS                               ; the callee advances rdx, so keep the original buffer pointer
    400    push    KEY
    401 
    402    call    intel_aes_encrypt_init_192       ; fills round keys 0..12 in encryption order
    403 
    404    pop     KEY
    405    pop     KS
    406 ; swap round keys 0 and 12 without transforming them
    407    movdqu  xmm0, [0*16 + KS]
    408    movdqu  xmm1, [12*16 + KS]
    409    movdqu  [12*16 + KS], xmm0
    410    movdqu  [0*16 + KS], xmm1
    411 ; swap key i with key 12-i for i = 1..5, applying aesimc to both
    412    i = 1
    413    WHILE i LT 6
    414        movdqu  xmm0, [i*16 + KS]
    415        movdqu  xmm1, [(12-i)*16 + KS]
    416 
    417        aesimc  xmm0, xmm0
    418        aesimc  xmm1, xmm1
    419 
    420        movdqu  [(12-i)*16 + KS], xmm0
    421        movdqu  [i*16 + KS], xmm1
    422 
    423        i = i+1
    424    ENDM
    425 ; key 6 is the middle key: transform it in place
    426    movdqu  xmm0, [6*16 + KS]
    427    aesimc  xmm0, xmm0
    428    movdqu  [6*16 + KS], xmm0
    429    ret
    430 intel_aes_decrypt_init_192  ENDP
    431 
    432 
    433 intel_aes_encrypt_init_256  PROC           ; expand the 32-byte key at [rcx] into round keys 0..14 at [rdx]; changes rdx, r8, xmm0-xmm5 and flags
    434    sub     rsp, 16*2                        ; save area for xmm6 and xmm7, restored before ret
    435    movdqu  [16*0 + rsp], xmm6
    436    movdqu  [16*1 + rsp], xmm7
    437 
    438    movdqu  xmm1, [16*0 + KEY]               ; xmm1 = key bytes 0..15
    439    movdqu  xmm3, [16*1 + KEY]               ; xmm3 = key bytes 16..31
    440 
    441    movdqu  [16*0 + KS], xmm1                ; round key 0
    442    movdqu  [16*1 + KS], xmm3                ; round key 1
    443 
    444    lea ITR, Lcon1
    445    movdqu  xmm0, [ITR]                      ; xmm0 = round constant, starts at 1
    446    lea ITR, Lmask256
    447    movdqu  xmm5, [ITR]                      ; xmm5 = pshufb control
    448 
    449    pxor    xmm6, xmm6                       ; all-zero round key for the aesenclast that only substitutes bytes
    450 
    451    mov ITR, 6                               ; each pass produces two round keys: 2..13 in total
    452 
    453 Lenc_256_ks_loop:
    454 ; even-numbered key: rotated and substituted last word of xmm3, XOR round constant
    455        movdqa  xmm2, xmm3
    456        pshufb  xmm2, xmm5
    457        aesenclast  xmm2, xmm0
    458        pslld   xmm0, 1                      ; double the round constant
    459        movdqa  xmm4, xmm1
    460        pslldq  xmm4, 4
    461        pxor    xmm1, xmm4
    462        pslldq  xmm4, 4
    463        pxor    xmm1, xmm4
    464        pslldq  xmm4, 4
    465        pxor    xmm1, xmm4                   ; word k of xmm1 = XOR of its previous words 0..k
    466        pxor    xmm1, xmm2
    467        movdqu  [16*2 + KS], xmm1
    468 ; odd-numbered key: substituted (not rotated) last word of xmm1, no round constant
    469        pshufd  xmm2, xmm1, 0ffh
    470        aesenclast  xmm2, xmm6
    471        movdqa  xmm4, xmm3
    472        pslldq  xmm4, 4
    473        pxor    xmm3, xmm4
    474        pslldq  xmm4, 4
    475        pxor    xmm3, xmm4
    476        pslldq  xmm4, 4
    477        pxor    xmm3, xmm4
    478        pxor    xmm3, xmm2
    479        movdqu  [16*3 + KS], xmm3
    480 
    481        lea KS, [32 + KS]
    482        dec ITR
    483        jnz Lenc_256_ks_loop
    484 ; round key 14: one more even-numbered step; KS is 192 bytes past its entry value here
    485    movdqa  xmm2, xmm3
    486    pshufb  xmm2, xmm5
    487    aesenclast  xmm2, xmm0
    488    movdqa  xmm4, xmm1
    489    pslldq  xmm4, 4
    490    pxor    xmm1, xmm4
    491    pslldq  xmm4, 4
    492    pxor    xmm1, xmm4
    493    pslldq  xmm4, 4
    494    pxor    xmm1, xmm4
    495    pxor    xmm1, xmm2
    496    movdqu  [16*2 + KS], xmm1
    497 
    498    movdqu  xmm7, [16*1 + rsp]
    499    movdqu  xmm6, [16*0 + rsp]
    500    add rsp, 16*2
    501    ret
    502 
    503 intel_aes_encrypt_init_256  ENDP
    504 
    505 
    506 intel_aes_decrypt_init_256  PROC           ; build the decryption round keys at [rdx] from the 32-byte key at [rcx]
    507    push    KS                               ; the callee advances rdx, so keep the original buffer pointer
    508    push    KEY
    509 
    510    call    intel_aes_encrypt_init_256       ; fills round keys 0..14 in encryption order
    511 
    512    pop     KEY
    513    pop     KS
    514 ; swap round keys 0 and 14 without transforming them
    515    movdqu  xmm0, [0*16 + KS]
    516    movdqu  xmm1, [14*16 + KS]
    517    movdqu  [14*16 + KS], xmm0
    518    movdqu  [0*16 + KS], xmm1
    519 ; swap key i with key 14-i for i = 1..6, applying aesimc to both
    520    i = 1
    521    WHILE i LT 7
    522        movdqu  xmm0, [i*16 + KS]
    523        movdqu  xmm1, [(14-i)*16 + KS]
    524 
    525        aesimc  xmm0, xmm0
    526        aesimc  xmm1, xmm1
    527 
    528        movdqu  [(14-i)*16 + KS], xmm0
    529        movdqu  [i*16 + KS], xmm1
    530 
    531        i = i+1
    532    ENDM
    533 ; key 7 is the middle key: transform it in place
    534    movdqu  xmm0, [7*16 + KS]
    535    aesimc  xmm0, xmm0
    536    movdqu  [7*16 + KS], xmm0
    537    ret
    538 intel_aes_decrypt_init_256  ENDP
    539 
    540 
    541 
    542 gen_aes_cbc_enc_func MACRO rnds
    543 
    544 LOCAL   loop1
    545 LOCAL   bail
    546 ; AES-CBC encryption body, one block at a time; rnds is the index of the last round key. In: rcx = context (round keys at +0, chaining block at +256), rdx = output. Returns rax = 0.
    547        mov     input,      [rsp + 1*8 + 8*4]    ; input pointer from [rsp + 40]
    548        mov     inputLen,   [rsp + 1*8 + 8*5]    ; byte count from [rsp + 48]
    549 
    550        sub     rsp, 3*16                        ; save area for xmm6, xmm7 and xmm8
    551 
    552        movdqu  [rsp + 0*16], xmm6
    553        movdqu  [rsp + 1*16], xmm7
    554        movdqu  [rsp + 2*16], xmm8
    555 
    556        movdqu  xmm0, [256+ctx]                  ; xmm0 = chaining block; it also holds each ciphertext block as it is produced
    557 ; round keys 0..5 stay in xmm2..xmm7 for the whole loop; later keys are reloaded through xmm8
    558        movdqu  xmm2, [0*16 + ctx]
    559        movdqu  xmm3, [1*16 + ctx]
    560        movdqu  xmm4, [2*16 + ctx]
    561        movdqu  xmm5, [3*16 + ctx]
    562        movdqu  xmm6, [4*16 + ctx]
    563        movdqu  xmm7, [5*16 + ctx]
    564 
    565 loop1:
    566        cmp     inputLen, 1*16
    567        jb      bail                             ; less than one whole block left
    568 
    569        movdqu  xmm1, [input]
    570        pxor    xmm1, xmm2                       ; plaintext XOR round key 0
    571        pxor    xmm0, xmm1                       ; XOR with the previous ciphertext (or the initial chaining block)
    572 
    573        aesenc  xmm0, xmm3
    574        aesenc  xmm0, xmm4
    575        aesenc  xmm0, xmm5
    576        aesenc  xmm0, xmm6
    577        aesenc  xmm0, xmm7
    578 
    579        i = 6
    580    WHILE i LT rnds
    581            movdqu  xmm8, [i*16 + ctx]
    582            aesenc  xmm0, xmm8
    583            i = i+1
    584        ENDM
    585        movdqu  xmm8, [rnds*16 + ctx]
    586        aesenclast xmm0, xmm8
    587 
    588        movdqu  [output], xmm0
    589 
    590        lea input, [1*16 + input]
    591        lea output, [1*16 + output]
    592        sub inputLen, 1*16
    593        jmp loop1
    594 
    595 bail:
    596        movdqu  [256+ctx], xmm0                  ; store the last ciphertext block as the next chaining block
    597 
    598        xor rax, rax                             ; return value 0
    599 
    600        movdqu  xmm6, [rsp + 0*16]
    601        movdqu  xmm7, [rsp + 1*16]
    602        movdqu  xmm8, [rsp + 2*16]
    603        add     rsp, 3*16
    604        ret
    605 
    606 ENDM
    607 
    608 gen_aes_cbc_dec_func MACRO rnds
    609 
    610 LOCAL   loop8
    611 LOCAL   loop1
    612 LOCAL   dec1
    613 LOCAL   bail
    614 ; AES-CBC decryption body; rnds is the index of the last round key. In: rcx = context (round keys at +0, chaining block at +256), rdx = output. Returns rax = 0.
    615        mov     input,      [rsp + 1*8 + 8*4]    ; input pointer from [rsp + 40]
    616        mov     inputLen,   [rsp + 1*8 + 8*5]    ; byte count from [rsp + 48]
    617 
    618        sub     rsp, 3*16                        ; save area for xmm6, xmm7 and xmm8
    619 
    620        movdqu  [rsp + 0*16], xmm6
    621        movdqu  [rsp + 1*16], xmm7
    622        movdqu  [rsp + 2*16], xmm8
    623 
    624 loop8:
    625        cmp     inputLen, 8*16
    626        jb      dec1                             ; fewer than eight blocks left
    627 
    628        movdqu  xmm0, [0*16 + input]
    629        movdqu  xmm1, [1*16 + input]
    630        movdqu  xmm2, [2*16 + input]
    631        movdqu  xmm3, [3*16 + input]
    632        movdqu  xmm4, [4*16 + input]
    633        movdqu  xmm5, [5*16 + input]
    634        movdqu  xmm6, [6*16 + input]
    635        movdqu  xmm7, [7*16 + input]
    636 ; XOR round key 0 into all eight blocks
    637        movdqu  xmm8, [0*16 + ctx]
    638        pxor    xmm0, xmm8
    639        pxor    xmm1, xmm8
    640        pxor    xmm2, xmm8
    641        pxor    xmm3, xmm8
    642        pxor    xmm4, xmm8
    643        pxor    xmm5, xmm8
    644        pxor    xmm6, xmm8
    645        pxor    xmm7, xmm8
    646 
    647        i = 1
    648        WHILE i LT rnds
    649            aes_dec_rnd i
    650            i = i+1
    651            ENDM
    652        aes_dec_last_rnd rnds
    653 ; undo the chaining: block 0 uses the stored chaining block, block k uses ciphertext block k-1 re-read from input
    654        movdqu  xmm8, [256 + ctx]
    655        pxor    xmm0, xmm8
    656        movdqu  xmm8, [0*16 + input]
    657        pxor    xmm1, xmm8
    658        movdqu  xmm8, [1*16 + input]
    659        pxor    xmm2, xmm8
    660        movdqu  xmm8, [2*16 + input]
    661        pxor    xmm3, xmm8
    662        movdqu  xmm8, [3*16 + input]
    663        pxor    xmm4, xmm8
    664        movdqu  xmm8, [4*16 + input]
    665        pxor    xmm5, xmm8
    666        movdqu  xmm8, [5*16 + input]
    667        pxor    xmm6, xmm8
    668        movdqu  xmm8, [6*16 + input]
    669        pxor    xmm7, xmm8
    670        movdqu  xmm8, [7*16 + input]             ; last ciphertext block of this pass becomes the new chaining block
    671 ; all reads of input for this pass are done before the first store to output
    672        movdqu  [0*16 + output], xmm0
    673        movdqu  [1*16 + output], xmm1
    674        movdqu  [2*16 + output], xmm2
    675        movdqu  [3*16 + output], xmm3
    676        movdqu  [4*16 + output], xmm4
    677        movdqu  [5*16 + output], xmm5
    678        movdqu  [6*16 + output], xmm6
    679        movdqu  [7*16 + output], xmm7
    680        movdqu  [256 + ctx], xmm8
    681 
    682        lea input, [8*16 + input]
    683        lea output, [8*16 + output]
    684        sub inputLen, 8*16
    685        jmp loop8
    686 dec1:
    687 ; single-block tail: xmm3 carries the chaining block from one iteration to the next
    688        movdqu  xmm3, [256 + ctx]
    689 
    690 loop1:
    691        cmp     inputLen, 1*16
    692        jb      bail
    693 
    694        movdqu  xmm0, [input]
    695        movdqa  xmm4, xmm0                       ; keep the ciphertext; it is the chaining block for the next iteration
    696        movdqu  xmm7, [0*16 + ctx]
    697        pxor    xmm0, xmm7
    698 
    699        i = 1
    700    WHILE i LT rnds
    701            movdqu  xmm7, [i*16 + ctx]
    702            aesdec  xmm0, xmm7
    703            i = i+1
    704        ENDM
    705        movdqu  xmm7, [rnds*16 + ctx]
    706        aesdeclast xmm0, xmm7
    707        pxor    xmm3, xmm0                       ; plaintext = decrypted block XOR chaining block
    708 
    709        movdqu  [output], xmm3
    710        movdqa  xmm3, xmm4
    711 
    712        lea input, [1*16 + input]
    713        lea output, [1*16 + output]
    714        sub inputLen, 1*16
    715        jmp loop1
    716 
    717 bail:
    718        movdqu  [256 + ctx], xmm3                ; store the chaining block for the next call
    719        xor rax, rax                             ; return value 0
    720 
    721        movdqu  xmm6, [rsp + 0*16]
    722        movdqu  xmm7, [rsp + 1*16]
    723        movdqu  xmm8, [rsp + 2*16]
    724        add     rsp, 3*16
    725        ret
    726 ENDM
    727 
    728 intel_aes_encrypt_cbc_128 PROC        ; CBC encryption; body from gen_aes_cbc_enc_func with last round key index 10
    729 gen_aes_cbc_enc_func  10
    730 intel_aes_encrypt_cbc_128 ENDP
    731 
    732 intel_aes_encrypt_cbc_192 PROC        ; CBC encryption; body from gen_aes_cbc_enc_func with last round key index 12
    733 gen_aes_cbc_enc_func  12
    734 intel_aes_encrypt_cbc_192 ENDP
    735 
    736 intel_aes_encrypt_cbc_256 PROC        ; CBC encryption; body from gen_aes_cbc_enc_func with last round key index 14
    737 gen_aes_cbc_enc_func  14
    738 intel_aes_encrypt_cbc_256 ENDP
    739 
    740 intel_aes_decrypt_cbc_128 PROC        ; CBC decryption; body from gen_aes_cbc_dec_func with last round key index 10
    741 gen_aes_cbc_dec_func  10
    742 intel_aes_decrypt_cbc_128 ENDP
    743 
    744 intel_aes_decrypt_cbc_192 PROC        ; CBC decryption; body from gen_aes_cbc_dec_func with last round key index 12
    745 gen_aes_cbc_dec_func  12
    746 intel_aes_decrypt_cbc_192 ENDP
    747 
    748 intel_aes_decrypt_cbc_256 PROC        ; CBC decryption; body from gen_aes_cbc_dec_func with last round key index 14
    749 gen_aes_cbc_dec_func  14
    750 intel_aes_decrypt_cbc_256 ENDP
    751 
    752 
    753 
    754 ctrCtx textequ <r10>       ; CTR code: copy of the incoming rcx, used for the loads at +8 and +16
    755 CTR textequ <r11d>         ; scratch for one byte-swapped, key-masked counter dword
    756 CTRSave textequ <eax>      ; running 32-bit counter value after bswap
    757 
    758 gen_aes_ctr_func MACRO rnds
    759 
    760 LOCAL   loop8
    761 LOCAL   loop1
    762 LOCAL   bail
    763 ; AES-CTR body; rnds is the index of the last round key. Whole 16-byte blocks only.
    764 ; In: rcx = counter context (pointer to the round keys at +8, counter block at +16),
    765 ;     rdx = output, [rsp + 40] = input, [rsp + 48] = byte count. Returns rax = 0.
    766        mov     input,      [rsp + 8*1 + 4*8]
    767        mov     inputLen,   [rsp + 8*1 + 5*8]
    768        mov     ctrCtx, ctx                      ; keep the counter context in r10
    769        mov     ctx, [8+ctrCtx]                  ; rcx now addresses the round keys
    770 
    771        sub     rsp, 3*16                        ; save area for xmm6, xmm7 and xmm8
    772        movdqu  [rsp + 0*16], xmm6
    773        movdqu  [rsp + 1*16], xmm7
    774        movdqu  [rsp + 2*16], xmm8
    775 
    776 ; eight 16-byte aligned scratch slots below the frame; rbp remembers the unaligned rsp
    777        push    rbp
    778        mov     rbp, rsp
    779        sub     rsp, 8*16
    780        and     rsp, -16
    781 
    782 ; slot k = (counter block + k) XOR round key 0; only the last dword is incremented, as a big-endian 32-bit value
    783        movdqu  xmm0, [16+ctrCtx]
    784        mov     CTRSave, DWORD PTR [ctrCtx + 16 + 3*4]
    785        bswap   CTRSave
    786        movdqu  xmm1, [ctx + 0*16]
    787 
    788        pxor    xmm0, xmm1
    789 
    790        movdqa  [rsp + 0*16], xmm0
    791        movdqa  [rsp + 1*16], xmm0
    792        movdqa  [rsp + 2*16], xmm0
    793        movdqa  [rsp + 3*16], xmm0
    794        movdqa  [rsp + 4*16], xmm0
    795        movdqa  [rsp + 5*16], xmm0
    796        movdqa  [rsp + 6*16], xmm0
    797        movdqa  [rsp + 7*16], xmm0
    798 ; patch the last dword of slots 1..7 with the incremented counter XOR the last dword of round key 0
    799        inc     CTRSave
    800        mov     CTR, CTRSave
    801        bswap   CTR
    802        xor     CTR, DWORD PTR [ctx + 3*4]
    803        mov     DWORD PTR [rsp + 1*16 + 3*4], CTR
    804 
    805        inc     CTRSave
    806        mov     CTR, CTRSave
    807        bswap   CTR
    808        xor     CTR, DWORD PTR [ctx + 3*4]
    809        mov     DWORD PTR [rsp + 2*16 + 3*4], CTR
    810 
    811        inc     CTRSave
    812        mov     CTR, CTRSave
    813        bswap   CTR
    814        xor     CTR, DWORD PTR [ctx + 3*4]
    815        mov     DWORD PTR [rsp + 3*16 + 3*4], CTR
    816 
    817        inc     CTRSave
    818        mov     CTR, CTRSave
    819        bswap   CTR
    820        xor     CTR, DWORD PTR [ctx + 3*4]
    821        mov     DWORD PTR [rsp + 4*16 + 3*4], CTR
    822 
    823        inc     CTRSave
    824        mov     CTR, CTRSave
    825        bswap   CTR
    826        xor     CTR, DWORD PTR [ctx + 3*4]
    827        mov     DWORD PTR [rsp + 5*16 + 3*4], CTR
    828 
    829        inc     CTRSave
    830        mov     CTR, CTRSave
    831        bswap   CTR
    832        xor     CTR, DWORD PTR [ctx + 3*4]
    833        mov     DWORD PTR [rsp + 6*16 + 3*4], CTR
    834 
    835        inc     CTRSave
    836        mov     CTR, CTRSave
    837        bswap   CTR
    838        xor     CTR, DWORD PTR [ctx + 3*4]
    839        mov     DWORD PTR [rsp + 7*16 + 3*4], CTR
    840 
    841 ; eight blocks per pass; the slots already include round key 0, so the rounds start at key 1
    842 loop8:
    843        cmp     inputLen, 8*16
    844        jb      loop1
    845 
    846        movdqu  xmm0, [0*16 + rsp]
    847        movdqu  xmm1, [1*16 + rsp]
    848        movdqu  xmm2, [2*16 + rsp]
    849        movdqu  xmm3, [3*16 + rsp]
    850        movdqu  xmm4, [4*16 + rsp]
    851        movdqu  xmm5, [5*16 + rsp]
    852        movdqu  xmm6, [6*16 + rsp]
    853        movdqu  xmm7, [7*16 + rsp]
    854 ; the first eight rounds are interleaved with refilling the eight slots for the next pass
    855        i = 1
    856        WHILE i LE 8
    857            aes_rnd i
    858 
    859            inc     CTRSave
    860            mov     CTR, CTRSave
    861            bswap   CTR
    862            xor     CTR, DWORD PTR [ctx + 3*4]
    863            mov     DWORD PTR [rsp + (i-1)*16 + 3*4], CTR
    864 
    865            i = i+1
    866        ENDM
    867        WHILE i LT rnds
    868            aes_rnd i
    869            i = i+1
    870            ENDM
    871        aes_last_rnd rnds
    872 ; XOR the keystream in xmm0..xmm7 with the input blocks
    873        movdqu  xmm8, [0*16 + input]
    874        pxor    xmm0, xmm8
    875        movdqu  xmm8, [1*16 + input]
    876        pxor    xmm1, xmm8
    877        movdqu  xmm8, [2*16 + input]
    878        pxor    xmm2, xmm8
    879        movdqu  xmm8, [3*16 + input]
    880        pxor    xmm3, xmm8
    881        movdqu  xmm8, [4*16 + input]
    882        pxor    xmm4, xmm8
    883        movdqu  xmm8, [5*16 + input]
    884        pxor    xmm5, xmm8
    885        movdqu  xmm8, [6*16 + input]
    886        pxor    xmm6, xmm8
    887        movdqu  xmm8, [7*16 + input]
    888        pxor    xmm7, xmm8
    889 
    890        movdqu  [0*16 + output], xmm0
    891        movdqu  [1*16 + output], xmm1
    892        movdqu  [2*16 + output], xmm2
    893        movdqu  [3*16 + output], xmm3
    894        movdqu  [4*16 + output], xmm4
    895        movdqu  [5*16 + output], xmm5
    896        movdqu  [6*16 + output], xmm6
    897        movdqu  [7*16 + output], xmm7
    898 
    899        lea input, [8*16 + input]
    900        lea output, [8*16 + output]
    901        sub inputLen, 8*16
    902        jmp loop8
    903 
    904 loop1:
    905        cmp     inputLen, 1*16
    906        jb      bail
    907 
    908        movdqu  xmm0, [rsp]                      ; take the next prepared counter slot
    909        add     rsp, 16                          ; and consume it; at most seven are taken here
    910        i = 1
    911    WHILE i LT rnds
    912            movdqu  xmm7, [i*16 + ctx]
    913            aesenc  xmm0, xmm7
    914            i = i+1
    915        ENDM
    916        movdqu  xmm7, [rnds*16 + ctx]
    917        aesenclast xmm0, xmm7
    918 
    919        movdqu  xmm7, [input]
    920        pxor    xmm0, xmm7
    921        movdqu  [output], xmm0
    922        lea input, [1*16 + input]
    923        lea output, [1*16 + output]
    924        sub inputLen, 1*16
    925        jmp loop1
    926 
    927 bail:
    928        movdqu  xmm0, [rsp]                      ; first unused slot = next counter XOR round key 0
    929        movdqu  xmm1, [ctx + 0*16]
    930        pxor    xmm0, xmm1                       ; strip round key 0 again
    931        movdqu  [16+ctrCtx], xmm0                ; write the updated counter block back
    932        lea     rsp, [rbp - 8*16]                ; recompute the scratch base, since loop1 may have advanced rsp
    933        and     rsp, -16
    934        pxor    xmm0, xmm0
    935        i = 0
    936        WHILE i LT 8
    937            movdqa  [rsp + i*16], xmm0           ; clear the slot so that counter XOR round key 0 does not stay on the stack
    938            i = i+1
    939        ENDM
    940        xor     rax, rax                         ; return value 0
    941        mov     rsp, rbp
    942        pop     rbp
    943        movdqu  xmm6, [rsp + 0*16]
    944        movdqu  xmm7, [rsp + 1*16]
    945        movdqu  xmm8, [rsp + 2*16]
    946        add     rsp, 3*16
    947        ret
    948 ENDM
    949 
    950 
    951 intel_aes_encrypt_ctr_128 PROC        ; CTR mode; body from gen_aes_ctr_func with last round key index 10
    952 gen_aes_ctr_func  10
    953 intel_aes_encrypt_ctr_128 ENDP
    954 
    955 intel_aes_encrypt_ctr_192 PROC        ; CTR mode; body from gen_aes_ctr_func with last round key index 12
    956 gen_aes_ctr_func  12
    957 intel_aes_encrypt_ctr_192 ENDP
    958 
    959 intel_aes_encrypt_ctr_256 PROC        ; CTR mode; body from gen_aes_ctr_func with last round key index 14
    960 gen_aes_ctr_func  14
    961 intel_aes_encrypt_ctr_256 ENDP
    962 
    963 
    964 END