tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

intel-aes-x86-masm.asm (20146B)


      1 ; LICENSE:
      2 ; This submission to NSS is to be made available under the terms of the
      3 ; Mozilla Public License, v. 2.0. You can obtain one at
      4 ; http://mozilla.org/MPL/2.0/.
      5 ;###############################################################################
      6 ; Copyright(c) 2014, Intel Corp.
      7 ; Developers and authors:
      8 ; Shay Gueron and Vlad Krasnov
      9 ; Intel Corporation, Israel Development Centre, Haifa, Israel
     10 ; Please send feedback directly to crypto.feedback.alias@intel.com
     11 
     12 
     13 .MODEL FLAT, C                ; 32-bit flat memory model, C naming and calling convention
     14 .XMM                          ; let the assembler accept XMM (SSE) instructions
     15 
     16 .DATA
        ; Constants read by the key-expansion procedures further down.
        ; Every label is four identical dwords (16 bytes), so after ALIGN 16 each
        ; one is 16-byte aligned; intel_aes_encrypt_init_128 reads them with movdqa.
     17 ALIGN 16
        ; pshufb control used by intel_aes_encrypt_init_128: every dword selects
        ; source bytes 13,14,15,12 (the top dword, rotated by one byte).
     18 Lmask dd 0c0f0e0dh,0c0f0e0dh,0c0f0e0dh,0c0f0e0dh
        ; pshufb control used by intel_aes_encrypt_init_192: every dword selects
        ; source bytes 5,6,7,4 (the second dword, rotated by one byte).
     19 Lmask192 dd 004070605h, 004070605h, 004070605h, 004070605h
        ; pshufb control used by intel_aes_encrypt_init_256; same value as Lmask.
     20 Lmask256 dd 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh
        ; Initial round constant (1 in every dword); the init routines double it
        ; with pslld after each use.
     21 Lcon1 dd 1,1,1,1
        ; Round constant 1bh, loaded by intel_aes_encrypt_init_128 after its
        ; eight-iteration loop; shifted once more to give 36h for the last key.
     22 Lcon2 dd 1bh,1bh,1bh,1bh
     23 
     24 .CODE
     25 
        ; Register aliases shared by the ECB / CBC / CTR code generators below.
     26 ctx     textequ <ecx>         ; base address of the round keys, indexed as [i*16 + ctx]
     27 output  textequ <edx>         ; write pointer for processed blocks
     28 input   textequ <eax>         ; read pointer for source blocks (eax is zeroed before ret)
     29 inputLen textequ <edi>        ; bytes left to process; edi is saved/restored by the generators
     30 
     31 
     32 aes_rnd MACRO i
        ; One AESENC round over the seven blocks held in xmm0..xmm6.
        ;   i - index of the round key, read from [i*16 + ctx]
        ; Clobbers xmm7, which receives the round key.
     33    movdqu  xmm7, [i*16 + ctx]
     34    aesenc  xmm0, xmm7
     35    aesenc  xmm1, xmm7
     36    aesenc  xmm2, xmm7
     37    aesenc  xmm3, xmm7
     38    aesenc  xmm4, xmm7
     39    aesenc  xmm5, xmm7
     40    aesenc  xmm6, xmm7
     41    ENDM
     42 
     43 aes_last_rnd MACRO i
        ; Final encryption round (AESENCLAST) over the seven blocks in xmm0..xmm6.
        ;   i - index of the round key, read from [i*16 + ctx]
        ; Clobbers xmm7, which receives the round key.
     44    movdqu  xmm7, [i*16 + ctx]
     45    aesenclast  xmm0, xmm7
     46    aesenclast  xmm1, xmm7
     47    aesenclast  xmm2, xmm7
     48    aesenclast  xmm3, xmm7
     49    aesenclast  xmm4, xmm7
     50    aesenclast  xmm5, xmm7
     51    aesenclast  xmm6, xmm7
     52    ENDM
     53 
     54 aes_dec_rnd MACRO i
        ; One AESDEC round over the seven blocks held in xmm0..xmm6.
        ;   i - index of the round key, read from [i*16 + ctx]
        ; Clobbers xmm7, which receives the round key.
     55    movdqu  xmm7, [i*16 + ctx]
     56    aesdec  xmm0, xmm7
     57    aesdec  xmm1, xmm7
     58    aesdec  xmm2, xmm7
     59    aesdec  xmm3, xmm7
     60    aesdec  xmm4, xmm7
     61    aesdec  xmm5, xmm7
     62    aesdec  xmm6, xmm7
     63    ENDM
     64 
     65 aes_dec_last_rnd MACRO i
        ; Final decryption round (AESDECLAST) over the seven blocks in xmm0..xmm6.
        ;   i - index of the round key, read from [i*16 + ctx]
        ; Clobbers xmm7, which receives the round key.
     66    movdqu  xmm7, [i*16 + ctx]
     67    aesdeclast  xmm0, xmm7
     68    aesdeclast  xmm1, xmm7
     69    aesdeclast  xmm2, xmm7
     70    aesdeclast  xmm3, xmm7
     71    aesdeclast  xmm4, xmm7
     72    aesdeclast  xmm5, xmm7
     73    aesdeclast  xmm6, xmm7
     74    ENDM
     75 
     76 
     77 gen_aes_ecb_func MACRO enc, rnds
     78 
     79 LOCAL   loop7
     80 LOCAL   loop1
     81 LOCAL   bail
     82 
        ; Emits the complete body of an ECB procedure, including its ret.
        ;   enc  - 1 selects aesenc/aesenclast; any other value selects
        ;          aesdec/aesdeclast
        ;   rnds - index of the last round key (round keys 0..rnds are used)
        ; Stack arguments read (the first one sits just above the return address):
        ;   1st: round-key base (ctx)   2nd: output pointer
        ;   5th: input pointer          6th: input length in bytes
        ;   The 3rd and 4th arguments are not read here.
        ; Processes seven blocks per iteration while at least 7*16 bytes remain,
        ; then one block per iteration while at least 16 bytes remain. A tail
        ; shorter than 16 bytes is left unprocessed.
        ; Returns eax = 0. Saves and restores edi; uses eax, ecx, edx, xmm0..xmm7.

     83        push    inputLen
     84 
               ; After the push: [esp] = saved edi, [esp+4] = return address,
               ; so the first argument is at [esp + 2*4].
     85        mov     ctx,    [esp + 2*4 + 0*4]
     86        mov     output,     [esp + 2*4 + 1*4]
     87        mov     input,      [esp + 2*4 + 4*4]
     88        mov     inputLen,   [esp + 2*4 + 5*4]
     89 
     90 loop7:
     91        cmp     inputLen, 7*16
     92        jb      loop1                 ; unsigned compare: fewer than seven blocks left
     93 
     94        movdqu  xmm0, [0*16 + input]
     95        movdqu  xmm1, [1*16 + input]
     96        movdqu  xmm2, [2*16 + input]
     97        movdqu  xmm3, [3*16 + input]
     98        movdqu  xmm4, [4*16 + input]
     99        movdqu  xmm5, [5*16 + input]
    100        movdqu  xmm6, [6*16 + input]
    101 
               ; Initial AddRoundKey with round key 0.
    102        movdqu  xmm7, [0*16 + ctx]
    103        pxor    xmm0, xmm7
    104        pxor    xmm1, xmm7
    105        pxor    xmm2, xmm7
    106        pxor    xmm3, xmm7
    107        pxor    xmm4, xmm7
    108        pxor    xmm5, xmm7
    109        pxor    xmm6, xmm7
    110 
               ; Assembly-time choice of the seven-block round macros (rnd, lastrnd)
               ; and of the single-block instructions (aesinst, aeslastinst) that
               ; loop1 uses further down.
    111 IF enc eq 1
    112        rnd textequ <aes_rnd>
    113        lastrnd textequ <aes_last_rnd>
    114        aesinst textequ <aesenc>
    115        aeslastinst textequ <aesenclast>
    116 ELSE
    117        rnd textequ <aes_dec_rnd>
    118        lastrnd textequ <aes_dec_last_rnd>
    119        aesinst textequ <aesdec>
    120        aeslastinst textequ <aesdeclast>
    121 ENDIF
    122 
               ; Unrolled at assembly time: rounds 1..rnds-1, then the final round.
    123        i = 1
    124        WHILE i LT rnds
    125            rnd i
    126            i = i+1
    127            ENDM
    128        lastrnd rnds
    129 
    130        movdqu  [0*16 + output], xmm0
    131        movdqu  [1*16 + output], xmm1
    132        movdqu  [2*16 + output], xmm2
    133        movdqu  [3*16 + output], xmm3
    134        movdqu  [4*16 + output], xmm4
    135        movdqu  [5*16 + output], xmm5
    136        movdqu  [6*16 + output], xmm6
    137 
    138        lea input, [7*16 + input]
    139        lea output, [7*16 + output]
    140        sub inputLen, 7*16
    141        jmp loop7
    142 
        ; Single-block path for the remaining blocks.
    143 loop1:
    144        cmp     inputLen, 1*16
    145        jb      bail                  ; less than one full block left
    146 
    147        movdqu  xmm0, [input]
    148        movdqu  xmm7, [0*16 + ctx]
    149        pxor    xmm0, xmm7
    150 
    151        i = 1
    152    WHILE i LT rnds
    153            movdqu  xmm7, [i*16 + ctx]
    154            aesinst  xmm0, xmm7
    155            i = i+1
    156        ENDM
    157        movdqu  xmm7, [rnds*16 + ctx]
    158        aeslastinst xmm0, xmm7
    159 
    160        movdqu  [output], xmm0
    161 
    162        lea input, [1*16 + input]
    163        lea output, [1*16 + output]
    164        sub inputLen, 1*16
    165        jmp loop1
    166 
    167 bail:
    168        xor eax, eax                  ; return value 0
    169        pop     inputLen              ; restore caller's edi
    170        ret
    171 
    172 ENDM
    173 
    174 ALIGN 16
        ; ECB encryption with round keys 0..10: body generated by
        ; gen_aes_ecb_func (enc = 1, rnds = 10).
    175 intel_aes_encrypt_ecb_128 PROC
    176 gen_aes_ecb_func 1, 10
    177 intel_aes_encrypt_ecb_128 ENDP
    178 
    179 ALIGN 16
        ; ECB encryption with round keys 0..12: body generated by
        ; gen_aes_ecb_func (enc = 1, rnds = 12).
    180 intel_aes_encrypt_ecb_192 PROC
    181 gen_aes_ecb_func 1, 12
    182 intel_aes_encrypt_ecb_192 ENDP
    183 
    184 ALIGN 16
        ; ECB encryption with round keys 0..14: body generated by
        ; gen_aes_ecb_func (enc = 1, rnds = 14).
    185 intel_aes_encrypt_ecb_256 PROC
    186 gen_aes_ecb_func 1, 14
    187 intel_aes_encrypt_ecb_256 ENDP
    188 
    189 ALIGN 16
        ; ECB decryption with round keys 0..10: body generated by
        ; gen_aes_ecb_func (enc = 0 selects aesdec/aesdeclast, rnds = 10).
    190 intel_aes_decrypt_ecb_128 PROC
    191 gen_aes_ecb_func 0, 10
    192 intel_aes_decrypt_ecb_128 ENDP
    193 
    194 ALIGN 16
        ; ECB decryption with round keys 0..12: body generated by
        ; gen_aes_ecb_func (enc = 0 selects aesdec/aesdeclast, rnds = 12).
    195 intel_aes_decrypt_ecb_192 PROC
    196 gen_aes_ecb_func 0, 12
    197 intel_aes_decrypt_ecb_192 ENDP
    198 
    199 ALIGN 16
        ; ECB decryption with round keys 0..14: body generated by
        ; gen_aes_ecb_func (enc = 0 selects aesdec/aesdeclast, rnds = 14).
    200 intel_aes_decrypt_ecb_256 PROC
    201 gen_aes_ecb_func 0, 14
    202 intel_aes_decrypt_ecb_256 ENDP
    203 
    204 
    205 KEY textequ <ecx>             ; key-expansion routines: pointer to the user key (1st argument)
    206 KS  textequ <edx>             ; key-expansion routines: pointer into the key schedule (2nd argument)
    207 ITR textequ <eax>             ; key-expansion routines: scratch address, then loop counter
    208 
    209 ALIGN 16
        ; Expands the 16-byte key at the 1st stack argument into eleven 16-byte
        ; round keys (0..10) written consecutively at the 2nd stack argument.
        ; Uses eax, ecx, edx and xmm0..xmm4; edx is advanced while storing.
        ; No return value is set up.
    210 intel_aes_encrypt_init_128  PROC
    211 
    212    mov     KEY,        [esp + 1*4 + 0*4]
    213    mov     KS,         [esp + 1*4 + 1*4]
    214 
    215 
    216    movdqu  xmm1, [KEY]
    217    movdqu  [KS], xmm1                ; round key 0 is the key itself
    218    movdqa  xmm2, xmm1
    219 
    220    lea ITR, Lcon1
    221    movdqa  xmm0, [ITR]               ; xmm0 = round constant, starts at 1
    222    lea ITR, Lmask
    223    movdqa  xmm4, [ITR]               ; xmm4 = pshufb control (see Lmask)
    224 
    225    mov ITR, 8                        ; eight iterations: round keys 1..8
    226 
        ; Loop state: xmm1 = xmm2 = previous round key, xmm0 = round constant.
    227 Lenc_128_ks_loop:
    228        lea KS, [16 + KS]
    229        dec ITR                       ; sets ZF for the jne at the bottom of the loop
    230 
               ; Every dword of xmm2 becomes the byte-rotated last word of the
               ; previous key; aesenclast then applies SubBytes and XORs in the
               ; round constant (ShiftRows is a no-op when all four dwords are equal).
    231        pshufb  xmm2, xmm4
    232        aesenclast  xmm2, xmm0
    233        pslld   xmm0, 1               ; double the round constant for the next step
               ; xmm1 ^= xmm1 shifted up by 4, 8 and 12 bytes: running XOR of its dwords.
    234        movdqa  xmm3, xmm1
    235        pslldq  xmm3, 4
    236        pxor    xmm1, xmm3
    237        pslldq  xmm3, 4
    238        pxor    xmm1, xmm3
    239        pslldq  xmm3, 4
    240        pxor    xmm1, xmm3
    241        pxor    xmm1, xmm2            ; xmm1 = new round key
    242        movdqu  [KS], xmm1
    243        movdqa  xmm2, xmm1
    244 
               ; ZF still comes from dec ITR: none of the SSE instructions in
               ; between write EFLAGS.
    245        jne Lenc_128_ks_loop
    246 
        ; Round key 9 uses round constant 1bh, reloaded from Lcon2.
    247    lea ITR, Lcon2
    248    movdqa  xmm0, [ITR]
    249 
    250    pshufb  xmm2, xmm4
    251    aesenclast  xmm2, xmm0
    252    pslld   xmm0, 1                   ; 1bh becomes 36h for round key 10
    253    movdqa  xmm3, xmm1
    254    pslldq  xmm3, 4
    255    pxor    xmm1, xmm3
    256    pslldq  xmm3, 4
    257    pxor    xmm1, xmm3
    258    pslldq  xmm3, 4
    259    pxor    xmm1, xmm3
    260    pxor    xmm1, xmm2
    261    movdqu  [16 + KS], xmm1           ; KS points at round key 8 here, so this is round key 9
    262    movdqa  xmm2, xmm1
    263 
        ; Round key 10.
    264    pshufb  xmm2, xmm4
    265    aesenclast  xmm2, xmm0
    266    movdqa  xmm3, xmm1
    267    pslldq  xmm3, 4
    268    pxor    xmm1, xmm3
    269    pslldq  xmm3, 4
    270    pxor    xmm1, xmm3
    271    pslldq  xmm3, 4
    272    pxor    xmm1, xmm3
    273    pxor    xmm1, xmm2
    274    movdqu  [32 + KS], xmm1           ; round key 10
    275    movdqa  xmm2, xmm1                ; xmm2 is not read again before ret
    276 
    277    ret
    278 intel_aes_encrypt_init_128  ENDP
    279 
    280 
    281 ALIGN 16
        ; Builds the round keys used by the aesdec-based functions: runs
        ; intel_aes_encrypt_init_128 and then, in place, reverses the order of the
        ; eleven round keys and applies aesimc to every key except the two outer ones.
        ; Same stack arguments as intel_aes_encrypt_init_128 (key, key schedule).
    282 intel_aes_decrypt_init_128  PROC
    283 
    284    mov     KEY,        [esp + 1*4 + 0*4]
    285    mov     KS,         [esp + 1*4 + 1*4]
    286 
        ; Pushed in reverse so the callee finds KEY as its 1st and KS as its 2nd argument.
    287    push    KS
    288    push    KEY
    289 
    290    call    intel_aes_encrypt_init_128
    291 
        ; The pops clean the stack and restore KS, which the callee advanced.
    292    pop     KEY
    293    pop     KS
    294 
        ; Swap round keys 0 and 10 unchanged.
    295    movdqu  xmm0, [0*16 + KS]
    296    movdqu  xmm1, [10*16 + KS]
    297    movdqu  [10*16 + KS], xmm0
    298    movdqu  [0*16 + KS], xmm1
    299 
        ; Unrolled at assembly time for i = 1..4: swap round keys i and 10-i,
        ; passing both through aesimc.
    300    i = 1
    301    WHILE i LT 5
    302        movdqu  xmm0, [i*16 + KS]
    303        movdqu  xmm1, [(10-i)*16 + KS]
    304 
    305        aesimc  xmm0, xmm0
    306        aesimc  xmm1, xmm1
    307 
    308        movdqu  [(10-i)*16 + KS], xmm0
    309        movdqu  [i*16 + KS], xmm1
    310 
    311        i = i+1
    312    ENDM
    313 
        ; The middle round key (5) stays in place and only gets aesimc.
    314    movdqu  xmm0, [5*16 + KS]
    315    aesimc  xmm0, xmm0
    316    movdqu  [5*16 + KS], xmm0
    317    ret
    318 intel_aes_decrypt_init_128  ENDP
    319 
    320 
    321 ALIGN 16
        ; Expands the 24-byte key at the 1st stack argument into 16-byte round
        ; keys 0..12 written consecutively at the 2nd stack argument.
        ; Each loop iteration runs two expansion steps (six new dwords each) and
        ; emits three round keys. Uses eax, ecx, edx and xmm0..xmm7; edx is
        ; advanced while storing. No return value is set up.
    322 intel_aes_encrypt_init_192  PROC
    323 
    324    mov     KEY, [esp + 1*4 + 0*4]
    325    mov     KS,  [esp + 1*4 + 1*4]
    326 
    327    pxor    xmm3, xmm3
    328    movdqu  xmm1, [KEY]                       ; xmm1 = key bytes 0..15
    329    pinsrd  xmm3, DWORD PTR [16 + KEY], 0     ; xmm3 low 8 bytes = key bytes 16..23,
    330    pinsrd  xmm3, DWORD PTR [20 + KEY], 1     ; upper 8 bytes stay zero
    331 
    332    movdqu  [KS], xmm1                        ; round key 0
    333    movdqa  xmm5, xmm3                        ; xmm5 keeps the two dwords not yet stored
    334 
    335    lea ITR, Lcon1
    336    movdqu  xmm0, [ITR]                       ; xmm0 = round constant, starts at 1
    337    lea ITR, Lmask192
    338    movdqu  xmm4, [ITR]                       ; xmm4 = pshufb control (see Lmask192)
    339 
    340    mov ITR, 4                                ; four iterations, three round keys each
    341 
        ; Loop state: xmm1 = four key dwords, xmm3 low half = the following two
        ; key dwords, xmm5 low half = two dwords waiting to start the next round key.
    342 Lenc_192_ks_loop:
               ; First expansion step. xmm2 = byte-rotated second dword of xmm3
               ; in every lane, then SubBytes and XOR with the round constant.
    343        movdqa  xmm2, xmm3
    344        pshufb  xmm2, xmm4
    345        aesenclast xmm2, xmm0
    346        pslld   xmm0, 1
    347 
               ; Running XOR across the four dwords of xmm1 and across the two
               ; low dwords of xmm3.
    348        movdqa  xmm6, xmm1
    349        movdqa  xmm7, xmm3
    350        pslldq  xmm6, 4
    351        pslldq  xmm7, 4
    352        pxor    xmm1, xmm6
    353        pxor    xmm3, xmm7
    354        pslldq  xmm6, 4
    355        pxor    xmm1, xmm6
    356        pslldq  xmm6, 4
    357        pxor    xmm1, xmm6
    358        pxor    xmm1, xmm2                    ; xmm1 = next four key dwords
    359        pshufd  xmm2, xmm1, 0ffh              ; broadcast the top dword of xmm1
    360        pxor    xmm3, xmm2                    ; xmm3 low half = the two dwords after those
    361 
               ; Repack into 16-byte round keys:
               ;   xmm5 = low half of old xmm5 followed by low half of xmm1
               ;   xmm6 = high half of xmm1 followed by low half of xmm3
    362        movdqa  xmm6, xmm1
    363        shufpd  xmm5, xmm1, 00h
    364        shufpd  xmm6, xmm3, 01h
    365 
    366        movdqu  [16 + KS], xmm5
    367        movdqu  [32 + KS], xmm6
    368 
               ; Second expansion step, same computation as the first.
    369        movdqa  xmm2, xmm3
    370        pshufb  xmm2, xmm4
    371        aesenclast  xmm2, xmm0
    372        pslld   xmm0, 1
    373 
    374        movdqa  xmm6, xmm1
    375        movdqa  xmm7, xmm3
    376        pslldq  xmm6, 4
    377        pslldq  xmm7, 4
    378        pxor    xmm1, xmm6
    379        pxor    xmm3, xmm7
    380        pslldq  xmm6, 4
    381        pxor    xmm1, xmm6
    382        pslldq  xmm6, 4
    383        pxor    xmm1, xmm6
    384        pxor    xmm1, xmm2
    385        pshufd  xmm2, xmm1, 0ffh
    386        pxor    xmm3, xmm2
    387 
    388        movdqu  [48 + KS], xmm1               ; third round key of this iteration
    389        movdqa  xmm5, xmm3                    ; carry two dwords into the next iteration
    390 
    391        lea KS, [48 + KS]
    392 
    393        dec ITR
    394        jnz Lenc_192_ks_loop
    395 
        ; NOTE(review): KS has advanced 4*48 = 192 bytes, so this store covers
        ; bytes 208..223 of the schedule, the slot after round key 12. The
        ; 12-round code in this file reads round keys 0..12 only, so the slot
        ; looks unused; the destination buffer must still be at least 224
        ; bytes - confirm against the callers.
    396    movdqu  [16 + KS], xmm5
    397 ret
    398 intel_aes_encrypt_init_192  ENDP
    399 
    400 ALIGN 16
        ; Builds the round keys used by the aesdec-based functions: runs
        ; intel_aes_encrypt_init_192 and then, in place, reverses the order of
        ; round keys 0..12 and applies aesimc to every key except the two outer ones.
        ; Same stack arguments as intel_aes_encrypt_init_192 (key, key schedule).
    401 intel_aes_decrypt_init_192  PROC
    402    mov     KEY,        [esp + 1*4 + 0*4]
    403    mov     KS,         [esp + 1*4 + 1*4]
    404 
        ; Pushed in reverse so the callee finds KEY as its 1st and KS as its 2nd argument.
    405    push    KS
    406    push    KEY
    407 
    408    call    intel_aes_encrypt_init_192
    409 
        ; The pops clean the stack and restore KS, which the callee advanced.
    410    pop     KEY
    411    pop     KS
    412 
        ; Swap round keys 0 and 12 unchanged.
    413    movdqu  xmm0, [0*16 + KS]
    414    movdqu  xmm1, [12*16 + KS]
    415    movdqu  [12*16 + KS], xmm0
    416    movdqu  [0*16 + KS], xmm1
    417 
        ; Unrolled at assembly time for i = 1..5: swap round keys i and 12-i,
        ; passing both through aesimc.
    418    i = 1
    419    WHILE i LT 6
    420        movdqu  xmm0, [i*16 + KS]
    421        movdqu  xmm1, [(12-i)*16 + KS]
    422 
    423        aesimc  xmm0, xmm0
    424        aesimc  xmm1, xmm1
    425 
    426        movdqu  [(12-i)*16 + KS], xmm0
    427        movdqu  [i*16 + KS], xmm1
    428 
    429        i = i+1
    430    ENDM
    431 
        ; The middle round key (6) stays in place and only gets aesimc.
    432    movdqu  xmm0, [6*16 + KS]
    433    aesimc  xmm0, xmm0
    434    movdqu  [6*16 + KS], xmm0
    435    ret
    436 intel_aes_decrypt_init_192  ENDP
    437 
    438 ALIGN 16
        ; Expands the 32-byte key at the 1st stack argument into fifteen 16-byte
        ; round keys (0..14) written consecutively at the 2nd stack argument.
        ; Uses eax, ecx, edx and xmm0..xmm6; edx is advanced while storing.
        ; No return value is set up.
    439 intel_aes_encrypt_init_256  PROC
    440 
    441    mov     KEY,    [esp + 1*4 + 0*4]
    442    mov     KS,     [esp + 1*4 + 1*4]
    443    movdqu  xmm1, [16*0 + KEY]        ; xmm1 = key bytes 0..15
    444    movdqu  xmm3, [16*1 + KEY]        ; xmm3 = key bytes 16..31
    445 
    446    movdqu  [16*0 + KS], xmm1         ; round key 0
    447    movdqu  [16*1 + KS], xmm3         ; round key 1
    448 
    449    lea ITR, Lcon1
    450    movdqu  xmm0, [ITR]               ; xmm0 = round constant, starts at 1
    451    lea ITR, Lmask256
    452    movdqu  xmm5, [ITR]               ; xmm5 = pshufb control (see Lmask256)
    453 
    454    pxor    xmm6, xmm6                ; all-zero round key for the second half-step below
    455 
    456    mov ITR, 6                        ; six iterations, two round keys each: round keys 2..13
    457 
        ; Loop state: xmm1 = round key two slots back, xmm3 = previous round key.
    458 Lenc_256_ks_loop:
    459 
               ; Even round key: byte-rotated last dword of xmm3 in every lane,
               ; SubBytes, XOR with the round constant; then running XOR over xmm1.
    460        movdqa  xmm2, xmm3
    461        pshufb  xmm2, xmm5
    462        aesenclast  xmm2, xmm0
    463        pslld   xmm0, 1
    464        movdqa  xmm4, xmm1
    465        pslldq  xmm4, 4
    466        pxor    xmm1, xmm4
    467        pslldq  xmm4, 4
    468        pxor    xmm1, xmm4
    469        pslldq  xmm4, 4
    470        pxor    xmm1, xmm4
    471        pxor    xmm1, xmm2
    472        movdqu  [16*2 + KS], xmm1
    473 
               ; Odd round key: broadcast the last dword of the new xmm1 without
               ; rotation; aesenclast with the zero key applies SubBytes only.
    474        pshufd  xmm2, xmm1, 0ffh
    475        aesenclast  xmm2, xmm6
    476        movdqa  xmm4, xmm3
    477        pslldq  xmm4, 4
    478        pxor    xmm3, xmm4
    479        pslldq  xmm4, 4
    480        pxor    xmm3, xmm4
    481        pslldq  xmm4, 4
    482        pxor    xmm3, xmm4
    483        pxor    xmm3, xmm2
    484        movdqu  [16*3 + KS], xmm3
    485 
    486        lea KS, [32 + KS]
    487        dec ITR
    488        jnz Lenc_256_ks_loop
    489 
        ; Final even step: KS has advanced 6*32 = 192 bytes, so [16*2 + KS]
        ; is round key 14.
    490    movdqa  xmm2, xmm3
    491    pshufb  xmm2, xmm5
    492    aesenclast  xmm2, xmm0
    493    movdqa  xmm4, xmm1
    494    pslldq  xmm4, 4
    495    pxor    xmm1, xmm4
    496    pslldq  xmm4, 4
    497    pxor    xmm1, xmm4
    498    pslldq  xmm4, 4
    499    pxor    xmm1, xmm4
    500    pxor    xmm1, xmm2
    501    movdqu  [16*2 + KS], xmm1
    502 
    503    ret
    504 intel_aes_encrypt_init_256  ENDP
    505 
    506 ALIGN 16
        ; Builds the round keys used by the aesdec-based functions: runs
        ; intel_aes_encrypt_init_256 and then, in place, reverses the order of
        ; round keys 0..14 and applies aesimc to every key except the two outer ones.
        ; Same stack arguments as intel_aes_encrypt_init_256 (key, key schedule).
    507 intel_aes_decrypt_init_256  PROC
    508    mov     KEY,        [esp + 1*4 + 0*4]
    509    mov     KS,         [esp + 1*4 + 1*4]
    510 
        ; Pushed in reverse so the callee finds KEY as its 1st and KS as its 2nd argument.
    511    push    KS
    512    push    KEY
    513 
    514    call    intel_aes_encrypt_init_256
    515 
        ; The pops clean the stack and restore KS, which the callee advanced.
    516    pop     KEY
    517    pop     KS
    518 
        ; Swap round keys 0 and 14 unchanged.
    519    movdqu  xmm0, [0*16 + KS]
    520    movdqu  xmm1, [14*16 + KS]
    521    movdqu  [14*16 + KS], xmm0
    522    movdqu  [0*16 + KS], xmm1
    523 
        ; Unrolled at assembly time for i = 1..6: swap round keys i and 14-i,
        ; passing both through aesimc.
    524    i = 1
    525    WHILE i LT 7
    526        movdqu  xmm0, [i*16 + KS]
    527        movdqu  xmm1, [(14-i)*16 + KS]
    528 
    529        aesimc  xmm0, xmm0
    530        aesimc  xmm1, xmm1
    531 
    532        movdqu  [(14-i)*16 + KS], xmm0
    533        movdqu  [i*16 + KS], xmm1
    534 
    535        i = i+1
    536    ENDM
    537 
        ; The middle round key (7) stays in place and only gets aesimc.
    538    movdqu  xmm0, [7*16 + KS]
    539    aesimc  xmm0, xmm0
    540    movdqu  [7*16 + KS], xmm0
    541    ret
    542 intel_aes_decrypt_init_256  ENDP
    543 
    544 
    545 
    546 gen_aes_cbc_enc_func MACRO rnds
    547 
    548 LOCAL   loop1
    549 LOCAL   bail
    550 
        ; Emits the complete body of a CBC encryption procedure, including its ret.
        ;   rnds - index of the last round key (round keys 0..rnds are used)
        ; Stack arguments read: 1st = ctx, 2nd = output pointer, 5th = input
        ; pointer, 6th = input length in bytes. The 3rd and 4th are not read.
        ; The 16-byte chaining value is read from [252 + ctx] on entry and
        ; written back there on exit (presumably the IV field of the context
        ; structure - the offset is hard-coded, confirm against the C definition).
        ; One block per iteration while at least 16 bytes remain; a shorter tail
        ; is left unprocessed. Returns eax = 0. Saves and restores edi.

    551        push    inputLen
    552 
    553        mov     ctx,    [esp + 2*4 + 0*4]
    554        mov     output,     [esp + 2*4 + 1*4]
    555        mov     input,      [esp + 2*4 + 4*4]
    556        mov     inputLen,   [esp + 2*4 + 5*4]
    557 
    558        movdqu  xmm0, [252+ctx]       ; xmm0 = chaining value
    559 
               ; Round keys 0..4 are kept in registers for the whole loop.
    560        movdqu  xmm2, [0*16 + ctx]
    561        movdqu  xmm3, [1*16 + ctx]
    562        movdqu  xmm4, [2*16 + ctx]
    563        movdqu  xmm5, [3*16 + ctx]
    564        movdqu  xmm6, [4*16 + ctx]
    565 
    566 loop1:
    567        cmp     inputLen, 1*16
    568        jb      bail
    569 
    570        movdqu  xmm1, [input]
    571        pxor    xmm1, xmm2            ; input block XOR round key 0
    572        pxor    xmm0, xmm1            ; ... XOR chaining value
    573 
    574        aesenc  xmm0, xmm3
    575        aesenc  xmm0, xmm4
    576        aesenc  xmm0, xmm5
    577        aesenc  xmm0, xmm6
    578 
               ; Remaining rounds take their keys from memory.
    579        i = 5
    580    WHILE i LT rnds
    581            movdqu  xmm7, [i*16 + ctx]
    582            aesenc  xmm0, xmm7
    583            i = i+1
    584        ENDM
    585        movdqu  xmm7, [rnds*16 + ctx]
    586        aesenclast xmm0, xmm7
    587 
    588        movdqu  [output], xmm0        ; xmm0 also serves as the next chaining value
    589 
    590        lea input, [1*16 + input]
    591        lea output, [1*16 + output]
    592        sub inputLen, 1*16
    593        jmp loop1
    594 
    595 bail:
    596        movdqu  [252+ctx], xmm0       ; store the chaining value back
    597 
    598        xor eax, eax
    599        pop inputLen
    600        ret
    601 
    602 ENDM
    603 
    604 gen_aes_cbc_dec_func MACRO rnds
    605 
    606 LOCAL   loop7
    607 LOCAL   loop1
    608 LOCAL   dec1
    609 LOCAL   bail
    610 
        ; Emits the complete body of a CBC decryption procedure, including its ret.
        ;   rnds - index of the last round key (round keys 0..rnds are used)
        ; Stack arguments read: 1st = ctx, 2nd = output pointer, 5th = input
        ; pointer, 6th = input length in bytes. The 3rd and 4th are not read.
        ; The 16-byte chaining value lives at [252 + ctx] (presumably the IV
        ; field of the context structure - the offset is hard-coded, confirm
        ; against the C definition).
        ; Seven blocks per iteration while at least 7*16 bytes remain, then one
        ; block per iteration; a tail shorter than 16 bytes is left unprocessed.
        ; Returns eax = 0. Saves and restores edi.

    611        push    inputLen
    612 
    613        mov     ctx,    [esp + 2*4 + 0*4]
    614        mov     output,     [esp + 2*4 + 1*4]
    615        mov     input,      [esp + 2*4 + 4*4]
    616        mov     inputLen,   [esp + 2*4 + 5*4]
    617 
    618 loop7:
    619        cmp     inputLen, 7*16
    620        jb      dec1
    621 
    622        movdqu  xmm0, [0*16 + input]
    623        movdqu  xmm1, [1*16 + input]
    624        movdqu  xmm2, [2*16 + input]
    625        movdqu  xmm3, [3*16 + input]
    626        movdqu  xmm4, [4*16 + input]
    627        movdqu  xmm5, [5*16 + input]
    628        movdqu  xmm6, [6*16 + input]
    629 
    630        movdqu  xmm7, [0*16 + ctx]
    631        pxor    xmm0, xmm7
    632        pxor    xmm1, xmm7
    633        pxor    xmm2, xmm7
    634        pxor    xmm3, xmm7
    635        pxor    xmm4, xmm7
    636        pxor    xmm5, xmm7
    637        pxor    xmm6, xmm7
    638 
    639        i = 1
    640        WHILE i LT rnds
    641            aes_dec_rnd i
    642            i = i+1
    643            ENDM
    644        aes_dec_last_rnd rnds
    645 
               ; CBC un-chaining: block 0 is XORed with the stored chaining value,
               ; block k with input block k-1. The input blocks are re-read here,
               ; before any output is stored.
    646        movdqu  xmm7, [252 + ctx]
    647        pxor    xmm0, xmm7
    648        movdqu  xmm7, [0*16 + input]
    649        pxor    xmm1, xmm7
    650        movdqu  xmm7, [1*16 + input]
    651        pxor    xmm2, xmm7
    652        movdqu  xmm7, [2*16 + input]
    653        pxor    xmm3, xmm7
    654        movdqu  xmm7, [3*16 + input]
    655        pxor    xmm4, xmm7
    656        movdqu  xmm7, [4*16 + input]
    657        pxor    xmm5, xmm7
    658        movdqu  xmm7, [5*16 + input]
    659        pxor    xmm6, xmm7
    660        movdqu  xmm7, [6*16 + input]  ; last input block of the batch = next chaining value
    661 
    662        movdqu  [0*16 + output], xmm0
    663        movdqu  [1*16 + output], xmm1
    664        movdqu  [2*16 + output], xmm2
    665        movdqu  [3*16 + output], xmm3
    666        movdqu  [4*16 + output], xmm4
    667        movdqu  [5*16 + output], xmm5
    668        movdqu  [6*16 + output], xmm6
    669        movdqu  [252 + ctx], xmm7
    670 
    671        lea input, [7*16 + input]
    672        lea output, [7*16 + output]
    673        sub inputLen, 7*16
    674        jmp loop7
    675 dec1:
    676 
               ; Single-block path: xmm3 carries the chaining value from here on.
    677        movdqu  xmm3, [252 + ctx]
    678 
    679 loop1:
    680        cmp     inputLen, 1*16
    681        jb      bail
    682 
    683        movdqu  xmm0, [input]
    684        movdqa  xmm4, xmm0            ; keep the input block: it chains into the next block
    685        movdqu  xmm7, [0*16 + ctx]
    686        pxor    xmm0, xmm7
    687 
    688        i = 1
    689    WHILE i LT rnds
    690            movdqu  xmm7, [i*16 + ctx]
    691            aesdec  xmm0, xmm7
    692            i = i+1
    693        ENDM
    694        movdqu  xmm7, [rnds*16 + ctx]
    695        aesdeclast xmm0, xmm7
    696        pxor    xmm3, xmm0            ; chaining value XOR decrypted block
    697 
    698        movdqu  [output], xmm3
    699        movdqa  xmm3, xmm4            ; saved input block becomes the chaining value
    700 
    701        lea input, [1*16 + input]
    702        lea output, [1*16 + output]
    703        sub inputLen, 1*16
    704        jmp loop1
    705 
    706 bail:
    707        movdqu  [252 + ctx], xmm3     ; store the chaining value back
    708        xor eax, eax
    709        pop     inputLen
    710        ret
    711 ENDM
    712 
    713 ALIGN 16
        ; CBC encryption with round keys 0..10: body generated by
        ; gen_aes_cbc_enc_func (rnds = 10).
    714 intel_aes_encrypt_cbc_128 PROC
    715 gen_aes_cbc_enc_func  10
    716 intel_aes_encrypt_cbc_128 ENDP
    717 
    718 ALIGN 16
        ; CBC encryption with round keys 0..12: body generated by
        ; gen_aes_cbc_enc_func (rnds = 12).
    719 intel_aes_encrypt_cbc_192 PROC
    720 gen_aes_cbc_enc_func  12
    721 intel_aes_encrypt_cbc_192 ENDP
    722 
    723 ALIGN 16
        ; CBC encryption with round keys 0..14: body generated by
        ; gen_aes_cbc_enc_func (rnds = 14).
    724 intel_aes_encrypt_cbc_256 PROC
    725 gen_aes_cbc_enc_func  14
    726 intel_aes_encrypt_cbc_256 ENDP
    727 
    728 ALIGN 16
        ; CBC decryption with round keys 0..10: body generated by
        ; gen_aes_cbc_dec_func (rnds = 10).
    729 intel_aes_decrypt_cbc_128 PROC
    730 gen_aes_cbc_dec_func  10
    731 intel_aes_decrypt_cbc_128 ENDP
    732 
    733 ALIGN 16
        ; CBC decryption with round keys 0..12: body generated by
        ; gen_aes_cbc_dec_func (rnds = 12).
    734 intel_aes_decrypt_cbc_192 PROC
    735 gen_aes_cbc_dec_func  12
    736 intel_aes_decrypt_cbc_192 ENDP
    737 
    738 ALIGN 16
        ; CBC decryption with round keys 0..14: body generated by
        ; gen_aes_cbc_dec_func (rnds = 14).
    739 intel_aes_decrypt_cbc_256 PROC
    740 gen_aes_cbc_dec_func  14
    741 intel_aes_decrypt_cbc_256 ENDP
    742 
    743 
    744 
    745 ctrCtx textequ <esi>          ; CTR generator: 1st argument pointer at first, then the running 32-bit counter value
    746 CTR textequ <ebx>             ; CTR generator: scratch for the byte-swapped, pre-XORed counter dword
    747 
    748 gen_aes_ctr_func MACRO rnds
    749 
    750 LOCAL   loop7
    751 LOCAL   loop1
    752 LOCAL   enc1
    753 LOCAL   bail
    754 
        ; Emits the complete body of a CTR-mode procedure, including its ret.
        ;   rnds - index of the last round key (round keys 0..rnds are used)
        ; Stack arguments read: 1st = pointer to a counter context, 2nd = output
        ; pointer, 5th = input pointer, 6th = input length in bytes. The 3rd and
        ; 4th are not read.
        ; Fields read from the counter context (offsets are hard-coded; confirm
        ; against the C structure):
        ;   [4 + p]  pointer used as the round-key base (ctx)
        ;   [8 + p]  16-byte counter block, read on entry and updated on exit
        ; Only the last dword of the counter block is incremented, as a
        ; big-endian 32-bit value; no carry goes into the other twelve bytes.
        ; Seven counter blocks, already XORed with round key 0, are kept in a
        ; 16-byte-aligned scratch area on the stack.
        ; Seven blocks per iteration while at least 7*16 bytes remain, then one
        ; block per iteration; a tail shorter than 16 bytes is left unprocessed.
        ; Returns eax = 0. Saves and restores edi, esi, ebx and ebp.

    755        push    inputLen
    756        push    ctrCtx
    757        push    CTR
    758        push    ebp
    759 
               ; Four saved registers plus the return address precede the
               ; arguments, so the first argument is at [esp + 4*5].
    760        mov     ctrCtx, [esp + 4*5 + 0*4]
    761        mov     output, [esp + 4*5 + 1*4]
    762        mov     input,  [esp + 4*5 + 4*4]
    763        mov     inputLen, [esp + 4*5 + 5*4]
    764 
    765        mov     ctx, [4+ctrCtx]
    766 
               ; ebp remembers the entry stack pointer; esp is lowered and
               ; rounded down to a 16-byte boundary for the scratch area.
    767        mov     ebp, esp
    768        sub     esp, 7*16
    769        and     esp, -16
    770 
    771        movdqu  xmm0, [8+ctrCtx]              ; counter block
    772        mov     ctrCtx, [ctrCtx + 8 + 3*4]    ; its last dword ...
    773        bswap   ctrCtx                        ; ... converted to native byte order
    774        movdqu  xmm1, [ctx + 0*16]
    775 
    776        pxor    xmm0, xmm1                    ; counter block XOR round key 0
    777 
               ; Fill all seven slots with the pre-XORed block; slots 1..6 get
               ; their last dword patched below.
    778        movdqa  [esp + 0*16], xmm0
    779        movdqa  [esp + 1*16], xmm0
    780        movdqa  [esp + 2*16], xmm0
    781        movdqa  [esp + 3*16], xmm0
    782        movdqa  [esp + 4*16], xmm0
    783        movdqa  [esp + 5*16], xmm0
    784        movdqa  [esp + 6*16], xmm0
    785 
               ; Slot k (k = 1..6) holds counter+k: increment, swap back to
               ; big-endian, XOR with the last dword of round key 0, store.
    786        inc     ctrCtx
    787        mov     CTR, ctrCtx
    788        bswap   CTR
    789        xor     CTR, [ctx + 3*4]
    790        mov     [esp + 1*16 + 3*4], CTR
    791 
    792        inc     ctrCtx
    793        mov     CTR, ctrCtx
    794        bswap   CTR
    795        xor     CTR, [ctx + 3*4]
    796        mov     [esp + 2*16 + 3*4], CTR
    797 
    798        inc     ctrCtx
    799        mov     CTR, ctrCtx
    800        bswap   CTR
    801        xor     CTR, [ctx + 3*4]
    802        mov     [esp + 3*16 + 3*4], CTR
    803 
    804        inc     ctrCtx
    805        mov     CTR, ctrCtx
    806        bswap   CTR
    807        xor     CTR, [ctx + 3*4]
    808        mov     [esp + 4*16 + 3*4], CTR
    809 
    810        inc     ctrCtx
    811        mov     CTR, ctrCtx
    812        bswap   CTR
    813        xor     CTR, [ctx + 3*4]
    814        mov     [esp + 5*16 + 3*4], CTR
    815 
    816        inc     ctrCtx
    817        mov     CTR, ctrCtx
    818        bswap   CTR
    819        xor     CTR, [ctx + 3*4]
    820        mov     [esp + 6*16 + 3*4], CTR
    821 
    822 
    823 loop7:
    824        cmp     inputLen, 7*16
    825        jb      loop1
    826 
               ; The slots already include round key 0, so no separate initial pxor.
    827        movdqu  xmm0, [0*16 + esp]
    828        movdqu  xmm1, [1*16 + esp]
    829        movdqu  xmm2, [2*16 + esp]
    830        movdqu  xmm3, [3*16 + esp]
    831        movdqu  xmm4, [4*16 + esp]
    832        movdqu  xmm5, [5*16 + esp]
    833        movdqu  xmm6, [6*16 + esp]
    834 
               ; Rounds 1..7, each followed by refreshing slot i-1 with the
               ; next counter value; every slot ends up advanced by seven.
    835        i = 1
    836        WHILE i LE 7
    837            aes_rnd i
    838 
    839            inc     ctrCtx
    840            mov     CTR, ctrCtx
    841            bswap   CTR
    842            xor     CTR, [ctx + 3*4]
    843            mov     [esp + (i-1)*16 + 3*4], CTR
    844 
    845            i = i+1
    846        ENDM
               ; Rounds 8..rnds-1, then the final round.
    847        WHILE i LT rnds
    848            aes_rnd i
    849            i = i+1
    850            ENDM
    851        aes_last_rnd rnds
    852 
               ; XOR the seven keystream blocks with the input.
    853        movdqu  xmm7, [0*16 + input]
    854        pxor    xmm0, xmm7
    855        movdqu  xmm7, [1*16 + input]
    856        pxor    xmm1, xmm7
    857        movdqu  xmm7, [2*16 + input]
    858        pxor    xmm2, xmm7
    859        movdqu  xmm7, [3*16 + input]
    860        pxor    xmm3, xmm7
    861        movdqu  xmm7, [4*16 + input]
    862        pxor    xmm4, xmm7
    863        movdqu  xmm7, [5*16 + input]
    864        pxor    xmm5, xmm7
    865        movdqu  xmm7, [6*16 + input]
    866        pxor    xmm6, xmm7
    867 
    868        movdqu  [0*16 + output], xmm0
    869        movdqu  [1*16 + output], xmm1
    870        movdqu  [2*16 + output], xmm2
    871        movdqu  [3*16 + output], xmm3
    872        movdqu  [4*16 + output], xmm4
    873        movdqu  [5*16 + output], xmm5
    874        movdqu  [6*16 + output], xmm6
    875 
    876        lea input, [7*16 + input]
    877        lea output, [7*16 + output]
    878        sub inputLen, 7*16
    879        jmp loop7
    880 
    881 
        ; Single-block path. Fewer than 7*16 bytes remain here, so at most six
        ; slots are consumed and [esp] stays inside the scratch area.
    882 loop1:
    883        cmp     inputLen, 1*16
    884        jb      bail
    885 
    886        movdqu  xmm0, [esp]           ; next pre-XORed counter block
    887        add     esp, 16               ; consume the slot by stepping esp past it
    888 
    889        i = 1
    890    WHILE i LT rnds
    891            movdqu  xmm7, [i*16 + ctx]
    892            aesenc  xmm0, xmm7
    893            i = i+1
    894        ENDM
    895        movdqu  xmm7, [rnds*16 + ctx]
    896        aesenclast xmm0, xmm7
    897 
    898        movdqu  xmm7, [input]
    899        pxor    xmm0, xmm7
    900        movdqu  [output], xmm0
    901 
    902        lea input, [1*16 + input]
    903        lea output, [1*16 + output]
    904        sub inputLen, 1*16
    905        jmp loop1
    906 
    907 bail:
    908 
               ; Write the next unused counter block back to the counter context:
               ; reload the 1st argument through ebp, and undo the round-key-0
               ; XOR on the slot at [esp].
    909        mov     ctrCtx, [ebp + 4*5 + 0*4]
    910        movdqu  xmm0, [esp]
    911        movdqu  xmm1, [ctx + 0*16]
    912        pxor    xmm0, xmm1
    913        movdqu  [8+ctrCtx], xmm0
    914 
    915 
    916        xor     eax, eax
    917        mov     esp, ebp              ; release the scratch area
    918        pop     ebp
    919        pop     CTR
    920        pop     ctrCtx
    921        pop     inputLen
    922        ret
    923 ENDM
    924 
    925 
    926 ALIGN 16
        ; CTR mode with round keys 0..10: body generated by
        ; gen_aes_ctr_func (rnds = 10).
    927 intel_aes_encrypt_ctr_128 PROC
    928 gen_aes_ctr_func  10
    929 intel_aes_encrypt_ctr_128 ENDP
    930 
    931 ALIGN 16
        ; CTR mode with round keys 0..12: body generated by
        ; gen_aes_ctr_func (rnds = 12).
    932 intel_aes_encrypt_ctr_192 PROC
    933 gen_aes_ctr_func  12
    934 intel_aes_encrypt_ctr_192 ENDP
    935 
    936 ALIGN 16
        ; CTR mode with round keys 0..14: body generated by
        ; gen_aes_ctr_func (rnds = 14).
    937 intel_aes_encrypt_ctr_256 PROC
    938 gen_aes_ctr_func  14
    939 intel_aes_encrypt_ctr_256 ENDP
    940 
    941 
    942 END