tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

intel-aes.s (101162B)


      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      4 
      5 .text
      6 
      7 #define IV_OFFSET 256
      8 
      9 /*
     10 * Warning: the length values used in this module are "unsigned int"
     11 * in C, which is 32-bit.  When they're passed in registers, use only
     12 * the low 32 bits, because the top half is unspecified.
     13 *
     14 * This is called from C code, so the contents of those bits can
     15 * depend on the C compiler's optimization decisions.  This means that
     16 * mistakes might not be obvious in testing if those bits happen to be
     17 * zero in your build.
     18 *
     19 * Exception: 32-bit lea instructions use a 64-bit address because the
     20 * address size doesn't affect the result, and that form is more
     21 * compactly encoded and preferred by compilers over a 32-bit address.
     22 */
     23 
     24 /* in %rdi : the key
     25   in %rsi : buffer for expanded key
     26 */
     27 .type intel_aes_encrypt_init_128,@function
     28 .globl intel_aes_encrypt_init_128
     29 .align	16
     30 intel_aes_encrypt_init_128:
        /* Build the AES-128 encryption key schedule.
         * Round key 0 is the raw 16-byte key read from (%rdi); each
         * aeskeygenassist / key_expansion128 pair below appends one more
         * round key, giving 11 round keys (176 bytes) written at the
         * original %rsi.
         * Returns 0 in %eax.  Modifies %rsi, %xmm1, %xmm2 and %xmm3. */
     31 movups	(%rdi), %xmm1
     32 movups	%xmm1, (%rsi)		/* round key 0 = the key itself */
     33 leaq	16(%rsi), %rsi		/* %rsi -> slot for round key 1 */
     34 xorl	%eax, %eax		/* return value; also read by key_expansion128 */
     35 
        /* The immediate in each aeskeygenassist is the round constant:
         * 0x01, 0x02, 0x04, ..., 0x80, 0x1b, 0x36. */
     36 .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x01	/* aeskeygenassist $0x01, %xmm1, %xmm2 */
     37 call key_expansion128
     38 .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x02	/* aeskeygenassist $0x02, %xmm1, %xmm2 */
     39 call key_expansion128
     40 .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x04	/* aeskeygenassist $0x04, %xmm1, %xmm2 */
     41 call key_expansion128
     42 .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x08	/* aeskeygenassist $0x08, %xmm1, %xmm2 */
     43 call key_expansion128
     44 .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x10	/* aeskeygenassist $0x10, %xmm1, %xmm2 */
     45 call key_expansion128
     46 .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x20	/* aeskeygenassist $0x20, %xmm1, %xmm2 */
     47 call key_expansion128
     48 .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x40	/* aeskeygenassist $0x40, %xmm1, %xmm2 */
     49 call key_expansion128
     50 .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x80	/* aeskeygenassist $0x80, %xmm1, %xmm2 */
     51 call key_expansion128
     52 .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x1b	/* aeskeygenassist $0x1b, %xmm1, %xmm2 */
     53 call key_expansion128
     54 .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x36	/* aeskeygenassist $0x36, %xmm1, %xmm2 */
     55 call key_expansion128
     56 
     57 ret
     58 .size intel_aes_encrypt_init_128, .-intel_aes_encrypt_init_128
     59 
     60 
     61 /* in %rdi : the key
     62   in %rsi : buffer for expanded key
     63 */
     64 .type intel_aes_decrypt_init_128,@function
     65 .globl intel_aes_decrypt_init_128
     66 .align	16
     67 intel_aes_decrypt_init_128:
        /* Build the AES-128 decryption key schedule.
         * The expansion is the same as in intel_aes_encrypt_init_128, but
         * after each of round keys 1..9 is stored, the stored copy is
         * overwritten with its aesimc (InvMixColumns) transform.  Round
         * keys 0 and 10 are stored untransformed.  %xmm1 always keeps the
         * untransformed key, so the expansion itself is unaffected.
         * Returns 0 in %eax.  Modifies %rsi, %xmm1, %xmm2 and %xmm3. */
     68 movups	(%rdi), %xmm1
     69 movups	%xmm1, (%rsi)		/* round key 0 = the key itself */
     70 leaq	16(%rsi), %rsi		/* %rsi -> slot for round key 1 */
     71 xorl	%eax, %eax		/* return value; also read by key_expansion128 */
     72 
     73 .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x01	/* aeskeygenassist $0x01, %xmm1, %xmm2 */
     74 call key_expansion128
     75 .byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
     76 movups	%xmm2, -16(%rsi)	/* replace the key just stored (%rsi already advanced) */
     77 .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x02	/* aeskeygenassist $0x02, %xmm1, %xmm2 */
     78 call key_expansion128
     79 .byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
     80 movups	%xmm2, -16(%rsi)
     81 .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x04	/* aeskeygenassist $0x04, %xmm1, %xmm2 */
     82 call key_expansion128
     83 .byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
     84 movups	%xmm2, -16(%rsi)
     85 .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x08	/* aeskeygenassist $0x08, %xmm1, %xmm2 */
     86 call key_expansion128
     87 .byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
     88 movups	%xmm2, -16(%rsi)
     89 .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x10	/* aeskeygenassist $0x10, %xmm1, %xmm2 */
     90 call key_expansion128
     91 .byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
     92 movups	%xmm2, -16(%rsi)
     93 .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x20	/* aeskeygenassist $0x20, %xmm1, %xmm2 */
     94 call key_expansion128
     95 .byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
     96 movups	%xmm2, -16(%rsi)
     97 .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x40	/* aeskeygenassist $0x40, %xmm1, %xmm2 */
     98 call key_expansion128
     99 .byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
    100 movups	%xmm2, -16(%rsi)
    101 .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x80	/* aeskeygenassist $0x80, %xmm1, %xmm2 */
    102 call key_expansion128
    103 .byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
    104 movups	%xmm2, -16(%rsi)
    105 .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x1b	/* aeskeygenassist $0x1b, %xmm1, %xmm2 */
    106 call key_expansion128
    107 .byte 0x66,0x0f,0x38,0xdb,0xd1	/* aesimc	%xmm1, %xmm2 */
    108 movups	%xmm2, -16(%rsi)
        /* Last round key: stored as-is, no aesimc. */
    109 .byte 0x66,0x0f,0x3a,0xdf,0xd1,0x36	/* aeskeygenassist $0x36, %xmm1, %xmm2 */
    110 call key_expansion128
    111 
    112 ret
    113 .size intel_aes_decrypt_init_128, .-intel_aes_decrypt_init_128
    114 
    115 
    116 .type key_expansion128,@function
    117 .align	16
    118 key_expansion128:
        /* File-local helper: derive and store the next AES-128 round key.
         * Uses a private register convention, not the C ABI:
         *   in  %xmm1 : previous round key
         *   in  %xmm2 : aeskeygenassist result for %xmm1
         *   in  %eax  : seed for the scratch register %xmm3; both callers
         *               in this file pass 0
         *   in  %rsi  : where to store the new round key
         *   out %xmm1 : new round key (also written to memory)
         *   out %rsi  : advanced by 16
         * Modifies %xmm2 and %xmm3.  With %eax == 0, word i of the new key
         * is the XOR of words 0..i of the old key and the top dword of
         * %xmm2. */
    119 movd	%eax, %xmm3		/* %xmm3 = %eax in the low dword, zero elsewhere */
    120 pshufd	$0xff, %xmm2, %xmm2	/* broadcast dword 3 of %xmm2 to all four lanes */
    121 shufps	$0x10, %xmm1, %xmm3
    122 pxor	%xmm3, %xmm1
    123 shufps	$0x8c, %xmm1, %xmm3
    124 pxor	%xmm2, %xmm1
    125 pxor	%xmm3, %xmm1
    126 movdqu	%xmm1, (%rsi)
    127 addq	$16, %rsi
    128 ret
    129 .size key_expansion128, .-key_expansion128
    130 
    131 
    132 /* in %rdi : cx - context
    133   in %rsi : output - pointer to output buffer
    134   in %rdx : outputLen - pointer to variable for length of output
    135             (already filled in by caller)
    136   in %ecx : maxOutputLen - length of output buffer
    137             (already checked by caller)
    138   in %r8  : input - pointer to input buffer
    139   in %r9d : inputLen - length of input buffer
    140   on stack: blocksize - AES blocksize (always 16, unused)
    141 */
    142 .type intel_aes_encrypt_ecb_128,@function
    143 .globl intel_aes_encrypt_ecb_128
    144 .align	16
    145 intel_aes_encrypt_ecb_128:
        /* AES-128 ECB encryption of inputLen (%r9d) bytes from %r8 to %rsi.
         * The 11 round keys are read from offsets 0..160 of the context.
         * %eax is the running byte offset into both buffers.  Input is
         * processed 8 blocks (128 bytes) at a time while at least 128
         * bytes remain, then one block at a time.  The single-block loop
         * exits only when the offset equals inputLen exactly, so inputLen
         * must be a multiple of 16.  %rdx and %ecx are not used here.
         * Returns 0 in %eax. */
    146 movdqu	(%rdi), %xmm2		/* round key 0 */
    147 movdqu	160(%rdi), %xmm12	/* round key 10 (last) */
    148 xor	%eax, %eax
    149 //	cmpl	$8*16, %r9d
    150 cmpl	$128, %r9d
    151 jb	1f
    152 //	leal	-8*16(%r9), %r11d
    153 leal	-128(%r9), %r11d	/* largest offset with 128 bytes still remaining */
        /* 8-block loop: load 8 blocks and XOR in round key 0. */
    154 2:	movdqu	(%r8, %rax), %xmm3
    155 movdqu	16(%r8, %rax), %xmm4
    156 movdqu	32(%r8, %rax), %xmm5
    157 movdqu	48(%r8, %rax), %xmm6
    158 movdqu	64(%r8, %rax), %xmm7
    159 movdqu	80(%r8, %rax), %xmm8
    160 movdqu	96(%r8, %rax), %xmm9
    161 movdqu	112(%r8, %rax), %xmm10
    162 pxor	%xmm2, %xmm3
    163 pxor	%xmm2, %xmm4
    164 pxor	%xmm2, %xmm5
    165 pxor	%xmm2, %xmm6
    166 pxor	%xmm2, %xmm7
    167 pxor	%xmm2, %xmm8
    168 pxor	%xmm2, %xmm9
    169 pxor	%xmm2, %xmm10
    170 
    171 // complete loop unrolling
        /* Rounds 1 and 2: round keys at offsets 16 and 32. */
    172 movdqu 16(%rdi), %xmm1
    173 movdqu 32(%rdi), %xmm11
    174 .byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
    175 .byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
    176 .byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
    177 .byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
    178 .byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
    179 .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
    180 .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
    181 .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
    182 .byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
    183 .byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
    184 .byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
    185 .byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
    186 .byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
    187 .byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
    188 .byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
    189 .byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
    190 
        /* Rounds 3 and 4: round keys at offsets 48 and 64. */
    191 movdqu 48(%rdi), %xmm1
    192 movdqu 64(%rdi), %xmm11
    193 .byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
    194 .byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
    195 .byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
    196 .byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
    197 .byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
    198 .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
    199 .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
    200 .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
    201 .byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
    202 .byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
    203 .byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
    204 .byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
    205 .byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
    206 .byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
    207 .byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
    208 .byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
    209 
        /* Rounds 5 and 6: round keys at offsets 80 and 96. */
    210 movdqu 80(%rdi), %xmm1
    211 movdqu 96(%rdi), %xmm11
    212 .byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
    213 .byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
    214 .byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
    215 .byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
    216 .byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
    217 .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
    218 .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
    219 .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
    220 .byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
    221 .byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
    222 .byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
    223 .byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
    224 .byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
    225 .byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
    226 .byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
    227 .byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
    228 
        /* Rounds 7 and 8: round keys at offsets 112 and 128. */
    229 movdqu 112(%rdi), %xmm1
    230 movdqu 128(%rdi), %xmm11
    231 .byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
    232 .byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
    233 .byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
    234 .byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
    235 .byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
    236 .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
    237 .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
    238 .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
    239 .byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
    240 .byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
    241 .byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
    242 .byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
    243 .byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
    244 .byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
    245 .byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
    246 .byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
    247 
        /* Round 9 (key at offset 144), then the final round with %xmm12. */
    248 movdqu 144(%rdi), %xmm1
    249 .byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
    250 .byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
    251 .byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
    252 .byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
    253 .byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
    254 .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
    255 .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
    256 .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
    257 .byte 0x66,0x41,0x0f,0x38,0xdd,0xdc 	/* aesenclast 	%xmm12, %xmm3 */
    258 .byte 0x66,0x41,0x0f,0x38,0xdd,0xe4 	/* aesenclast 	%xmm12, %xmm4 */
    259 .byte 0x66,0x41,0x0f,0x38,0xdd,0xec 	/* aesenclast 	%xmm12, %xmm5 */
    260 .byte 0x66,0x41,0x0f,0x38,0xdd,0xf4 	/* aesenclast 	%xmm12, %xmm6 */
    261 .byte 0x66,0x41,0x0f,0x38,0xdd,0xfc 	/* aesenclast 	%xmm12, %xmm7 */
    262 .byte 0x66,0x45,0x0f,0x38,0xdd,0xc4 	/* aesenclast 	%xmm12, %xmm8 */
    263 .byte 0x66,0x45,0x0f,0x38,0xdd,0xcc 	/* aesenclast 	%xmm12, %xmm9 */
    264 .byte 0x66,0x45,0x0f,0x38,0xdd,0xd4 	/* aesenclast 	%xmm12, %xmm10 */
    265 
    266 movdqu	%xmm3, (%rsi, %rax)
    267 movdqu	%xmm4, 16(%rsi, %rax)
    268 movdqu	%xmm5, 32(%rsi, %rax)
    269 movdqu	%xmm6, 48(%rsi, %rax)
    270 movdqu	%xmm7, 64(%rsi, %rax)
    271 movdqu	%xmm8, 80(%rsi, %rax)
    272 movdqu	%xmm9, 96(%rsi, %rax)
    273 movdqu	%xmm10, 112(%rsi, %rax)
    274 //	addl	$8*16, %eax
    275 addl	$128, %eax
    276 cmpl	%r11d, %eax
    277 jbe	2b			/* unsigned: repeat while offset <= inputLen - 128 */
    278 1:	cmpl	%eax, %r9d
    279 je	5f			/* nothing left over */
    280 
        /* Tail: keep round keys 1..9 in %xmm3..%xmm11 and encrypt the
         * remaining blocks one at a time. */
    281 movdqu	16(%rdi), %xmm3
    282 movdqu	32(%rdi), %xmm4
    283 movdqu	48(%rdi), %xmm5
    284 movdqu	64(%rdi), %xmm6
    285 movdqu	80(%rdi), %xmm7
    286 movdqu	96(%rdi), %xmm8
    287 movdqu	112(%rdi), %xmm9
    288 movdqu	128(%rdi), %xmm10
    289 movdqu	144(%rdi), %xmm11
    290 
    291 4:	movdqu	(%r8, %rax), %xmm1
    292 pxor	%xmm2, %xmm1
    293 .byte 0x66,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm3, %xmm1 */
    294 .byte 0x66,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm4, %xmm1 */
    295 .byte 0x66,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm5, %xmm1 */
    296 .byte 0x66,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm6, %xmm1 */
    297 .byte 0x66,0x0f,0x38,0xdc,0xcf	/* aesenc	%xmm7, %xmm1 */
    298 .byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm8, %xmm1 */
    299 .byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
    300 .byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
    301 .byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
    302 .byte 0x66,0x41,0x0f,0x38,0xdd,0xcc	/* aesenclast %xmm12, %xmm1 */
    303 movdqu	%xmm1, (%rsi, %rax)
    304 addl	$16, %eax
    305 cmpl	%eax, %r9d
    306 jne	4b
    307 
    308 5:	xor	%eax, %eax		/* return 0 */
    309 ret
    310 .size intel_aes_encrypt_ecb_128, .-intel_aes_encrypt_ecb_128
    311 
    312 
    313 /* in %rdi : cx - context
    314   in %rsi : output - pointer to output buffer
    315   in %rdx : outputLen - pointer to variable for length of output
    316             (already filled in by caller)
    317   in %ecx : maxOutputLen - length of output buffer
    318             (already checked by caller)
    319   in %r8  : input - pointer to input buffer
    320   in %r9d : inputLen - length of input buffer
    321   on stack: blocksize - AES blocksize (always 16, unused)
    322 */
    323 .type intel_aes_decrypt_ecb_128,@function
    324 .globl intel_aes_decrypt_ecb_128
    325 .align	16
    326 intel_aes_decrypt_ecb_128:
        /* AES-128 ECB decryption of inputLen (%r9d) bytes from %r8 to %rsi.
         * Round keys are applied in reverse: the block is first XORed with
         * the key at offset 160, then aesdec uses the keys at offsets 144
         * down to 16, and aesdeclast uses the key at offset 0.
         * NOTE(review): presumably the context holds the schedule built by
         * intel_aes_decrypt_init_128 -- confirm against the caller.
         * %eax is the running byte offset.  Input is processed 8 blocks at
         * a time while at least 128 bytes remain, then one block at a time.
         * The single-block loop exits only when the offset equals inputLen
         * exactly, so inputLen must be a multiple of 16.  %rdx and %ecx
         * are not used here.  Returns 0 in %eax. */
    327 movdqu	(%rdi), %xmm2		/* key at offset 0, used by aesdeclast */
    328 movdqu	160(%rdi), %xmm12	/* key at offset 160, used for the initial XOR */
    329 xorl	%eax, %eax
    330 //	cmpl	$8*16, %r9d
    331 cmpl	$128, %r9d
    332 jb	1f
    333 //	leal	-8*16(%r9), %r11d
    334 leal	-128(%r9), %r11d	/* largest offset with 128 bytes still remaining */
        /* 8-block loop: load 8 blocks and XOR in the key at offset 160. */
    335 2:	movdqu	(%r8, %rax), %xmm3
    336 movdqu	16(%r8, %rax), %xmm4
    337 movdqu	32(%r8, %rax), %xmm5
    338 movdqu	48(%r8, %rax), %xmm6
    339 movdqu	64(%r8, %rax), %xmm7
    340 movdqu	80(%r8, %rax), %xmm8
    341 movdqu	96(%r8, %rax), %xmm9
    342 movdqu	112(%r8, %rax), %xmm10
    343 pxor	%xmm12, %xmm3
    344 pxor	%xmm12, %xmm4
    345 pxor	%xmm12, %xmm5
    346 pxor	%xmm12, %xmm6
    347 pxor	%xmm12, %xmm7
    348 pxor	%xmm12, %xmm8
    349 pxor	%xmm12, %xmm9
    350 pxor	%xmm12, %xmm10
    351 
    352 // complete loop unrolling
        /* Keys at offsets 144 and 128. */
    353 movdqu 144(%rdi), %xmm1
    354 movdqu 128(%rdi), %xmm11
    355 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
    356 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
    357 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
    358 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
    359 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
    360 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
    361 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
    362 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
    363 .byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
    364 .byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
    365 .byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
    366 .byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
    367 .byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
    368 .byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
    369 .byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
    370 .byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
    371 
        /* Keys at offsets 112 and 96. */
    372 movdqu 112(%rdi), %xmm1
    373 movdqu 96(%rdi), %xmm11
    374 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
    375 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
    376 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
    377 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
    378 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
    379 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
    380 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
    381 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
    382 .byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
    383 .byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
    384 .byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
    385 .byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
    386 .byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
    387 .byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
    388 .byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
    389 .byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
    390 
        /* Keys at offsets 80 and 64. */
    391 movdqu 80(%rdi), %xmm1
    392 movdqu 64(%rdi), %xmm11
    393 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
    394 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
    395 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
    396 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
    397 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
    398 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
    399 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
    400 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
    401 .byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
    402 .byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
    403 .byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
    404 .byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
    405 .byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
    406 .byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
    407 .byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
    408 .byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
    409 
        /* Keys at offsets 48 and 32. */
    410 movdqu 48(%rdi), %xmm1
    411 movdqu 32(%rdi), %xmm11
    412 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
    413 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
    414 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
    415 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
    416 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
    417 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
    418 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
    419 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
    420 .byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
    421 .byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
    422 .byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
    423 .byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
    424 .byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
    425 .byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
    426 .byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
    427 .byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
    428 
        /* Key at offset 16, then the final round with %xmm2. */
    429 movdqu 16(%rdi), %xmm1
    430 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
    431 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
    432 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
    433 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
    434 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
    435 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
    436 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
    437 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
    438 .byte 0x66,0x0f,0x38,0xdf,0xda		/* aesdeclast 	%xmm2, %xmm3 */
    439 .byte 0x66,0x0f,0x38,0xdf,0xe2		/* aesdeclast 	%xmm2, %xmm4 */
    440 .byte 0x66,0x0f,0x38,0xdf,0xea		/* aesdeclast 	%xmm2, %xmm5 */
    441 .byte 0x66,0x0f,0x38,0xdf,0xf2		/* aesdeclast 	%xmm2, %xmm6 */
    442 .byte 0x66,0x0f,0x38,0xdf,0xfa		/* aesdeclast 	%xmm2, %xmm7 */
    443 .byte 0x66,0x44,0x0f,0x38,0xdf,0xc2	/* aesdeclast 	%xmm2, %xmm8 */
    444 .byte 0x66,0x44,0x0f,0x38,0xdf,0xca	/* aesdeclast 	%xmm2, %xmm9 */
    445 .byte 0x66,0x44,0x0f,0x38,0xdf,0xd2	/* aesdeclast 	%xmm2, %xmm10 */
    446 
    447 movdqu	%xmm3, (%rsi, %rax)
    448 movdqu	%xmm4, 16(%rsi, %rax)
    449 movdqu	%xmm5, 32(%rsi, %rax)
    450 movdqu	%xmm6, 48(%rsi, %rax)
    451 movdqu	%xmm7, 64(%rsi, %rax)
    452 movdqu	%xmm8, 80(%rsi, %rax)
    453 movdqu	%xmm9, 96(%rsi, %rax)
    454 movdqu	%xmm10, 112(%rsi, %rax)
    455 //	addl	$8*16, %eax
    456 addl	$128, %eax
    457 cmpl	%r11d, %eax
    458 jbe	2b			/* unsigned: repeat while offset <= inputLen - 128 */
    459 1:	cmpl	%eax, %r9d
    460 je	5f			/* nothing left over */
    461 
        /* Tail: keep the keys at offsets 16..144 in %xmm3..%xmm11 and
         * decrypt the remaining blocks one at a time, applying the keys
         * from %xmm11 down to %xmm3. */
    462 movdqu	16(%rdi), %xmm3
    463 movdqu	32(%rdi), %xmm4
    464 movdqu	48(%rdi), %xmm5
    465 movdqu	64(%rdi), %xmm6
    466 movdqu	80(%rdi), %xmm7
    467 movdqu	96(%rdi), %xmm8
    468 movdqu	112(%rdi), %xmm9
    469 movdqu	128(%rdi), %xmm10
    470 movdqu	144(%rdi), %xmm11
    471 
    472 4:	movdqu	(%r8, %rax), %xmm1
    473 pxor	%xmm12, %xmm1
    474 .byte 0x66,0x41,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm1 */
    475 .byte 0x66,0x41,0x0f,0x38,0xde,0xca	/* aesdec	%xmm10, %xmm1 */
    476 .byte 0x66,0x41,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm9, %xmm1 */
    477 .byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
    478 .byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
    479 .byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
    480 .byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
    481 .byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
    482 .byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
    483 .byte 0x66,0x0f,0x38,0xdf,0xca	/* aesdeclast %xmm2, %xmm1 */
    484 movdqu	%xmm1, (%rsi, %rax)
    485 addl	$16, %eax
    486 cmpl	%eax, %r9d
    487 jne	4b
    488 
    489 5:	xor	%eax, %eax		/* return 0 */
    490 ret
    491 .size intel_aes_decrypt_ecb_128, .-intel_aes_decrypt_ecb_128
    492 
    493 
    494 /* in %rdi : cx - context
    495   in %rsi : output - pointer to output buffer
    496   in %rdx : outputLen - pointer to variable for length of output
    497             (already filled in by caller)
    498   in %ecx : maxOutputLen - length of output buffer
    499             (already checked by caller)
    500   in %r8  : input - pointer to input buffer
    501   in %r9d : inputLen - length of input buffer
    502   on stack: blocksize - AES blocksize (always 16, unused)
    503 */
    504 .type intel_aes_encrypt_cbc_128,@function
    505 .globl intel_aes_encrypt_cbc_128
    506 .align	16
    507 intel_aes_encrypt_cbc_128:
        /* AES-128 CBC encryption of inputLen (%r9d) bytes from %r8 to %rsi.
         * The chaining value is read from offset 256 of the context and the
         * last ciphertext block is written back there on exit, so the chain
         * carries on from there on the next call.  A zero inputLen returns at once
         * and leaves the chaining value untouched.  The loop exits only
         * when the offset equals inputLen exactly, so inputLen must be a
         * multiple of 16.  The incoming %rdx (outputLen) is not read; the
         * register is reused as the chaining-value pointer.  %ecx is not
         * used.  Returns 0 in %eax. */
    508 testl	%r9d, %r9d
    509 je	2f
    510 
    511 //	leaq	IV_OFFSET(%rdi), %rdx
    512 leaq	256(%rdi), %rdx
    513 
    514 movdqu	(%rdx), %xmm0		/* chaining value (IV or previous ciphertext) */
    515 movdqu	(%rdi), %xmm2		/* round keys 0..10 -> %xmm2..%xmm12 */
    516 movdqu	16(%rdi), %xmm3
    517 movdqu	32(%rdi), %xmm4
    518 movdqu	48(%rdi), %xmm5
    519 movdqu	64(%rdi), %xmm6
    520 movdqu	80(%rdi), %xmm7
    521 movdqu	96(%rdi), %xmm8
    522 movdqu	112(%rdi), %xmm9
    523 movdqu	128(%rdi), %xmm10
    524 movdqu	144(%rdi), %xmm11
    525 movdqu	160(%rdi), %xmm12
    526 
    527 xorl	%eax, %eax		/* %eax = running byte offset */
    528 1:	movdqu	(%r8, %rax), %xmm1
    529 pxor	%xmm0, %xmm1		/* XOR with the chaining value */
    530 pxor	%xmm2, %xmm1		/* XOR with round key 0 */
    531 .byte 0x66,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm3, %xmm1 */
    532 .byte 0x66,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm4, %xmm1 */
    533 .byte 0x66,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm5, %xmm1 */
    534 .byte 0x66,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm6, %xmm1 */
    535 .byte 0x66,0x0f,0x38,0xdc,0xcf	/* aesenc	%xmm7, %xmm1 */
    536 .byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm8, %xmm1 */
    537 .byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
    538 .byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
    539 .byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
    540 .byte 0x66,0x41,0x0f,0x38,0xdd,0xcc	/* aesenclast %xmm12, %xmm1 */
    541 movdqu	%xmm1, (%rsi, %rax)
    542 movdqa	%xmm1, %xmm0		/* this ciphertext block chains into the next */
    543 addl	$16, %eax
    544 cmpl	%eax, %r9d
    545 jne	1b
    546 
    547 movdqu	%xmm0, (%rdx)		/* save the chaining value for the next call */
    548 
    549 2:	xor	%eax, %eax		/* return 0 */
    550 ret
    551 .size intel_aes_encrypt_cbc_128, .-intel_aes_encrypt_cbc_128
    552 
    553 
    554 /* in %rdi : cx - context
    555   in %rsi : output - pointer to output buffer
    556   in %rdx : outputLen - pointer to variable for length of output
    557             (already filled in by caller)
    558   in %ecx : maxOutputLen - length of output buffer
    559             (already checked by caller)
    560   in %r8  : input - pointer to input buffer
    561   in %r9d : inputLen - length of input buffer
    562   on stack: blocksize - AES blocksize (always 16, unused)
    563 */
    564 .type intel_aes_decrypt_cbc_128,@function
    565 .globl intel_aes_decrypt_cbc_128
    566 .align	16
    567 intel_aes_decrypt_cbc_128:
    568 //	leaq	IV_OFFSET(%rdi), %rdx
    569 leaq	256(%rdi), %rdx
    570 
    571 movdqu	(%rdx), %xmm0   /* iv */
    572 movdqu	(%rdi), %xmm2   /* first key block */
    573 movdqu	160(%rdi), %xmm12 /* last key block */
    574 xorl	%eax, %eax
    575 cmpl	$128, %r9d
    576 jb	1f
    577 leal	-128(%r9), %r11d
    578 2:	movdqu	(%r8, %rax), %xmm3 /* 1st data block */
    579 movdqu	16(%r8, %rax), %xmm4 /* 2d data block */
    580 movdqu	32(%r8, %rax), %xmm5
    581 movdqu	48(%r8, %rax), %xmm6
    582 movdqu	64(%r8, %rax), %xmm7
    583 movdqu	80(%r8, %rax), %xmm8
    584 movdqu	96(%r8, %rax), %xmm9
    585 movdqu	112(%r8, %rax), %xmm10
    586 pxor	%xmm12, %xmm3
    587 pxor	%xmm12, %xmm4
    588 pxor	%xmm12, %xmm5
    589 pxor	%xmm12, %xmm6
    590 pxor	%xmm12, %xmm7
    591 pxor	%xmm12, %xmm8
    592 pxor	%xmm12, %xmm9
    593 pxor	%xmm12, %xmm10
    594 
    595 // complete loop unrolling
    596 movdqu 144(%rdi), %xmm1
    597 movdqu 128(%rdi), %xmm11
    598 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
    599 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
    600 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
    601 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
    602 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
    603 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
    604 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
    605 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
    606 .byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
    607 .byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
    608 .byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
    609 .byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
    610 .byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
    611 .byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
    612 .byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
    613 .byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
    614 
    615 movdqu 112(%rdi), %xmm1
    616 movdqu 96(%rdi), %xmm11
    617 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
    618 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
    619 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
    620 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
    621 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
    622 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
    623 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
    624 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
    625 .byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
    626 .byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
    627 .byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
    628 .byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
    629 .byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
    630 .byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
    631 .byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
    632 .byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
    633 
    634 movdqu 80(%rdi), %xmm1
    635 movdqu 64(%rdi), %xmm11
    636 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
    637 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
    638 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
    639 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
    640 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
    641 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
    642 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
    643 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
    644 .byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
    645 .byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
    646 .byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
    647 .byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
    648 .byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
    649 .byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
    650 .byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
    651 .byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
    652 
    653 movdqu 48(%rdi), %xmm1
    654 movdqu 32(%rdi), %xmm11
    655 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
    656 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
    657 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
    658 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
    659 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
    660 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
    661 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
    662 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
    663 .byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
    664 .byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
    665 .byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
    666 .byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
    667 .byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
    668 .byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
    669 .byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
    670 .byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
    671 
    672 movdqu 16(%rdi), %xmm1
    673 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
    674 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
    675 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
    676 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
    677 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
    678 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
    679 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
    680 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
    681 .byte 0x66,0x0f,0x38,0xdf,0xda		/* aesdeclast 	%xmm2, %xmm3 */
    682 .byte 0x66,0x0f,0x38,0xdf,0xe2		/* aesdeclast 	%xmm2, %xmm4 */
    683 .byte 0x66,0x0f,0x38,0xdf,0xea		/* aesdeclast 	%xmm2, %xmm5 */
    684 .byte 0x66,0x0f,0x38,0xdf,0xf2		/* aesdeclast 	%xmm2, %xmm6 */
    685 .byte 0x66,0x0f,0x38,0xdf,0xfa		/* aesdeclast 	%xmm2, %xmm7 */
    686 .byte 0x66,0x44,0x0f,0x38,0xdf,0xc2	/* aesdeclast 	%xmm2, %xmm8 */
    687 .byte 0x66,0x44,0x0f,0x38,0xdf,0xca	/* aesdeclast 	%xmm2, %xmm9 */
    688 .byte 0x66,0x44,0x0f,0x38,0xdf,0xd2	/* aesdeclast 	%xmm2, %xmm10 */
    689 
    690 	pxor	%xmm0, %xmm3
    691 movdqu	(%r8, %rax), %xmm0
    692 pxor	%xmm0, %xmm4
    693 movdqu	16(%r8, %rax), %xmm0
    694 pxor	%xmm0, %xmm5
    695 movdqu	32(%r8, %rax), %xmm0
    696 pxor	%xmm0, %xmm6
    697 movdqu	48(%r8, %rax), %xmm0
    698 pxor	%xmm0, %xmm7
    699 movdqu	64(%r8, %rax), %xmm0
    700 pxor	%xmm0, %xmm8
    701 movdqu	80(%r8, %rax), %xmm0
    702 pxor	%xmm0, %xmm9
    703 movdqu	96(%r8, %rax), %xmm0
    704 pxor	%xmm0, %xmm10
    705 movdqu	112(%r8, %rax), %xmm0
    706 movdqu	%xmm3, (%rsi, %rax)
    707 movdqu	%xmm4, 16(%rsi, %rax)
    708 movdqu	%xmm5, 32(%rsi, %rax)
    709 movdqu	%xmm6, 48(%rsi, %rax)
    710 movdqu	%xmm7, 64(%rsi, %rax)
    711 movdqu	%xmm8, 80(%rsi, %rax)
    712 movdqu	%xmm9, 96(%rsi, %rax)
    713 movdqu	%xmm10, 112(%rsi, %rax)
    714 addl	$128, %eax
    715 cmpl	%r11d, %eax
    716 jbe	2b
    717 1:	cmpl	%eax, %r9d
    718 je	5f
    719 
    720 movdqu	16(%rdi), %xmm3
    721 movdqu	32(%rdi), %xmm4
    722 movdqu	48(%rdi), %xmm5
    723 movdqu	64(%rdi), %xmm6
    724 movdqu	80(%rdi), %xmm7
    725 movdqu	96(%rdi), %xmm8
    726 movdqu	112(%rdi), %xmm9
    727 movdqu	128(%rdi), %xmm10
    728 movdqu	144(%rdi), %xmm11
    729 
    730 4:	movdqu	(%r8, %rax), %xmm1
    731 movdqa	%xmm1, %xmm13
    732 pxor	%xmm12, %xmm1
    733 .byte 0x66,0x41,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm1 */
    734 .byte 0x66,0x41,0x0f,0x38,0xde,0xca	/* aesdec	%xmm10, %xmm1 */
    735 .byte 0x66,0x41,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm9, %xmm1 */
    736 .byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
    737 .byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
    738 .byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
    739 .byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
    740 .byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
    741 .byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
    742 .byte 0x66,0x0f,0x38,0xdf,0xca	/* aesdeclast %xmm2, %xmm1 */
    743 pxor	%xmm0, %xmm1
    744 movdqu	%xmm1, (%rsi, %rax)
    745 movdqa	%xmm13, %xmm0
    746 addl	$16, %eax
    747 cmpl	%eax, %r9d
    748 jne	4b
    749 
    750 5:	movdqu	%xmm0, (%rdx)
    751 
    752 xor	%eax, %eax
    753 ret
    754 .size intel_aes_decrypt_cbc_128, .-intel_aes_decrypt_cbc_128
    755        
    /*
     * intel_aes_encrypt_init_192 -- build the AES-192 encryption key schedule.
     *
     *   in %rdi : the key (24 bytes are read)
     *   in %rsi : buffer for expanded key
     *
     * The 24 key bytes are copied to the start of the buffer, then each of
     * the eight key_expansion192 calls appends 24 more bytes, so
     * 24 + 8 * 24 = 216 bytes are written in total.
     *
     * Register contract shared with key_expansion192:
     *   %xmm1 : the 16 schedule bytes most recently produced
     *   %xmm3 : the 8 schedule bytes that follow them (low quadword)
     *   %xmm2 : aeskeygenassist output for the current round constant
     *   %rsi  : write cursor; advanced here by 24 and by each helper call
     *
     * On return %rsi, %xmm1-%xmm5 and flags are modified, and %eax is 0
     * (key_expansion192 clears it).
     *
     * aeskeygenassist is hand-encoded with .byte; the trailing comment shows
     * the instruction that the bytes represent.
     */
    .globl intel_aes_encrypt_init_192
    .type intel_aes_encrypt_init_192,@function
    .align  16
    intel_aes_encrypt_init_192:
            /* Both loads are issued before either store. */
            movq    16(%rdi), %xmm3
            movdqu  (%rdi), %xmm1
            movq    %xmm3, 16(%rsi)
            movdqu  %xmm1, (%rsi)
            leaq    24(%rsi), %rsi

            /* One expansion step per round constant, in schedule order. */
            .irp    rcon, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
            .byte   0x66,0x0f,0x3a,0xdf,0xd3,\rcon  /* aeskeygenassist $rcon, %xmm3, %xmm2 */
            call    key_expansion192
            .endr

            ret
    .size intel_aes_encrypt_init_192, .-intel_aes_encrypt_init_192
    788 
    789 
    /*
     * intel_aes_decrypt_init_192 -- build the AES-192 decryption key schedule.
     *
     *   in %rdi : the key (24 bytes are read)
     *   in %rsi : buffer for expanded key
     *
     * The forward expansion is the same as in intel_aes_encrypt_init_192
     * (24 key bytes followed by eight 24-byte key_expansion192 steps, 216
     * bytes written).  In addition, round keys 1..11 (buffer bytes 16..191)
     * are passed through aesimc in place, which is the form the aesdec
     * instruction consumes.  Round key 0 (bytes 0..15) and round key 12
     * (bytes 192..207) are left untransformed.
     *
     * Because each step yields 24 bytes but round keys are 16 bytes, the
     * conversion alternates between two shapes (cursor = %rsi after the call):
     *
     *   step  cursor  converted bytes        how
     *    1     +48    16..47   (keys 1,2)    reload the 32 bytes behind cursor
     *    2     +72    48..63   (key 3)       from %xmm1, stored at cursor-24
     *    3     +96    64..95   (keys 4,5)    reload
     *    4     +120   96..111  (key 6)       from %xmm1
     *    5     +144   112..143 (keys 7,8)    reload
     *    6     +168   144..159 (key 9)       from %xmm1
     *    7     +192   160..191 (keys 10,11)  reload
     *    8     +216   none
     *
     * Only the copies in memory are converted; %xmm1 and %xmm3 keep the
     * untransformed values, so the forward expansion continues correctly.
     * %xmm2 is used as scratch by the conversion and is regenerated by the
     * next aeskeygenassist.
     *
     * On return %rsi, %xmm1-%xmm5 and flags are modified, and %eax is 0
     * (key_expansion192 clears it).
     */

    /* One forward expansion step for round constant \rcon. */
    .macro dec192_expand rcon
            .byte   0x66,0x0f,0x3a,0xdf,0xd3,\rcon  /* aeskeygenassist $rcon, %xmm3, %xmm2 */
            call    key_expansion192
    .endm

    /* aesimc, in place, on the two round keys in the 32 bytes behind %rsi. */
    .macro dec192_imc_pair
            movups  -32(%rsi), %xmm2
            .byte   0x66,0x0f,0x38,0xdb,0xd2        /* aesimc %xmm2, %xmm2 */
            movups  %xmm2, -32(%rsi)
            movups  -16(%rsi), %xmm4
            .byte   0x66,0x0f,0x38,0xdb,0xe4        /* aesimc %xmm4, %xmm4 */
            movups  %xmm4, -16(%rsi)
    .endm

    /* aesimc of the round key still held in %xmm1; result goes to memory only. */
    .macro dec192_imc_single
            .byte   0x66,0x0f,0x38,0xdb,0xd1        /* aesimc %xmm1, %xmm2 */
            movups  %xmm2, -24(%rsi)
    .endm

    .globl intel_aes_decrypt_init_192
    .type intel_aes_decrypt_init_192,@function
    .align  16
    intel_aes_decrypt_init_192:
            /* Both loads are issued before either store. */
            movq    16(%rdi), %xmm3
            movdqu  (%rdi), %xmm1
            movq    %xmm3, 16(%rsi)
            movdqu  %xmm1, (%rsi)
            leaq    24(%rsi), %rsi

            dec192_expand 0x01
            dec192_imc_pair
            dec192_expand 0x02
            dec192_imc_single
            dec192_expand 0x04
            dec192_imc_pair
            dec192_expand 0x08
            dec192_imc_single
            dec192_expand 0x10
            dec192_imc_pair
            dec192_expand 0x20
            dec192_imc_single
            dec192_expand 0x40
            dec192_imc_pair
            dec192_expand 0x80      /* last round key: no aesimc */

            ret
    .size intel_aes_decrypt_init_192, .-intel_aes_decrypt_init_192

    .purgem dec192_expand
    .purgem dec192_imc_pair
    .purgem dec192_imc_single
    852 
    853 
    /*
     * key_expansion192 -- one 24-byte step of the AES-192 key expansion.
     *
     * File-local helper with a private register contract (not a C ABI
     * function):
     *
     *   in     %xmm1 : previous words a0..a3 (dword lanes 0..3)
     *   in     %xmm3 : previous words b0,b1 in lanes 0,1
     *   in     %xmm2 : aeskeygenassist result; only lane 1 is used (t)
     *   in/out %rsi  : write cursor, advanced by 24
     *
     *   out    %xmm1 : n0..n3 with  n[i] = t ^ a0 ^ ... ^ a[i]
     *   out    %xmm3 : lanes 0,1 =  m0 = n3 ^ b0,  m1 = n3 ^ b0 ^ b1
     *   memory       : n0..n3 stored at the old cursor, m0,m1 right after
     *
     * Also modifies %xmm2 (t broadcast to all lanes), %xmm4, %xmm5, the
     * upper two lanes of %xmm3, flags, and leaves %eax = 0.
     */
    .type key_expansion192,@function
    .align  16
    key_expansion192:
            /* Two zeroed scratch registers, built from a cleared %eax. */
            xor     %eax, %eax
            movd    %eax, %xmm4
            movd    %eax, %xmm5

            pshufd  $0x55, %xmm2, %xmm2     /* xmm2 = (t, t, t, t) */

            /* Prefix-xor of a0..a3 using two shuffles of the zeroed scratch. */
            shufps  $0x10, %xmm1, %xmm4     /* xmm4 = (0, 0, a1, a0) */
            pxor    %xmm4, %xmm1            /* xmm1 = (a0, a1, a2^a1, a3^a0) */
            shufps  $0x8c, %xmm1, %xmm4     /* xmm4 = (0, a0, a0, a2^a1) */
            pxor    %xmm4, %xmm1            /* xmm1 = running xor of a0..a[i] */
            pxor    %xmm2, %xmm1            /* fold in t: xmm1 = (n0, n1, n2, n3) */
            movdqu  %xmm1, (%rsi)
            addq    $16, %rsi

            /* Remaining two words: continue the chain from n3. */
            shufps  $0x00, %xmm3, %xmm5     /* xmm5 = (0, 0, b0, b0) */
            shufps  $0x08, %xmm3, %xmm5     /* xmm5 = (0, b0, b0, b0) */
            pshufd  $0xff, %xmm1, %xmm4     /* xmm4 = (n3, n3, n3, n3) */
            pxor    %xmm5, %xmm3
            pxor    %xmm4, %xmm3            /* lanes 0,1 = (n3^b0, n3^b0^b1) */
            movq    %xmm3, (%rsi)
            addq    $8, %rsi
            ret
    .size key_expansion192, .-key_expansion192
    878 
    879 
    /*
     * intel_aes_encrypt_ecb_192 -- AES-192 encryption in ECB mode (AES-NI).
     *
     *   in %rdi : cx - context; round key i is read from offset 16*i, i = 0..12
     *   in %rsi : output - pointer to output buffer
     *   in %rdx : outputLen - pointer to variable for length of output
     *             (already filled in by caller; not used here)
     *   in %ecx : maxOutputLen - length of output buffer
     *             (already checked by caller; not used here)
     *   in %r8  : input - pointer to input buffer
     *   in %r9d : inputLen - length of input buffer, in bytes
     *   on stack: blocksize - AES blocksize (always 16, unused)
     *
     *   returns : %eax = 0
     *   modifies: %rax, %r11, %xmm1-%xmm14, flags
     *
     * Structure:
     *   - bulk loop : eight blocks (128 bytes) per iteration, run while at
     *                 least 128 bytes remain; round keys are reloaded from
     *                 the context two at a time into %xmm1 / %xmm11.
     *   - tail loop : one block per iteration with round keys 1..11 held in
     *                 %xmm3..%xmm13.
     * Round key 0 stays in %xmm2 and round key 12 in %xmm14 throughout.
     *
     * The byte offset lives in %eax and is only written with 32-bit
     * instructions, so %rax is always its zero-extended value when used as an
     * index.  The tail loop stops only when the offset equals inputLen
     * exactly, so inputLen must be a multiple of 16.
     *
     * AES-NI instructions are hand-encoded with .byte; each trailing comment
     * shows the instruction the bytes represent.
     */

    /* One AES round on all eight lanes (%xmm3..%xmm10), round key in %xmm1. */
    .macro ecb192_enc_x8_key1
            .byte 0x66,0x0f,0x38,0xdc,0xd9          /* aesenc %xmm1, %xmm3 */
            .byte 0x66,0x0f,0x38,0xdc,0xe1          /* aesenc %xmm1, %xmm4 */
            .byte 0x66,0x0f,0x38,0xdc,0xe9          /* aesenc %xmm1, %xmm5 */
            .byte 0x66,0x0f,0x38,0xdc,0xf1          /* aesenc %xmm1, %xmm6 */
            .byte 0x66,0x0f,0x38,0xdc,0xf9          /* aesenc %xmm1, %xmm7 */
            .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1     /* aesenc %xmm1, %xmm8 */
            .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9     /* aesenc %xmm1, %xmm9 */
            .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1     /* aesenc %xmm1, %xmm10 */
    .endm

    /* One AES round on all eight lanes, round key in %xmm11. */
    .macro ecb192_enc_x8_key11
            .byte 0x66,0x41,0x0f,0x38,0xdc,0xdb     /* aesenc %xmm11, %xmm3 */
            .byte 0x66,0x41,0x0f,0x38,0xdc,0xe3     /* aesenc %xmm11, %xmm4 */
            .byte 0x66,0x41,0x0f,0x38,0xdc,0xeb     /* aesenc %xmm11, %xmm5 */
            .byte 0x66,0x41,0x0f,0x38,0xdc,0xf3     /* aesenc %xmm11, %xmm6 */
            .byte 0x66,0x41,0x0f,0x38,0xdc,0xfb     /* aesenc %xmm11, %xmm7 */
            .byte 0x66,0x45,0x0f,0x38,0xdc,0xc3     /* aesenc %xmm11, %xmm8 */
            .byte 0x66,0x45,0x0f,0x38,0xdc,0xcb     /* aesenc %xmm11, %xmm9 */
            .byte 0x66,0x45,0x0f,0x38,0xdc,0xd3     /* aesenc %xmm11, %xmm10 */
    .endm

    /* Final AES round on all eight lanes, last round key in %xmm14. */
    .macro ecb192_enc_x8_last
            .byte 0x66,0x41,0x0f,0x38,0xdd,0xde     /* aesenclast %xmm14, %xmm3 */
            .byte 0x66,0x41,0x0f,0x38,0xdd,0xe6     /* aesenclast %xmm14, %xmm4 */
            .byte 0x66,0x41,0x0f,0x38,0xdd,0xee     /* aesenclast %xmm14, %xmm5 */
            .byte 0x66,0x41,0x0f,0x38,0xdd,0xf6     /* aesenclast %xmm14, %xmm6 */
            .byte 0x66,0x41,0x0f,0x38,0xdd,0xfe     /* aesenclast %xmm14, %xmm7 */
            .byte 0x66,0x45,0x0f,0x38,0xdd,0xc6     /* aesenclast %xmm14, %xmm8 */
            .byte 0x66,0x45,0x0f,0x38,0xdd,0xce     /* aesenclast %xmm14, %xmm9 */
            .byte 0x66,0x45,0x0f,0x38,0xdd,0xd6     /* aesenclast %xmm14, %xmm10 */
    .endm

    .globl intel_aes_encrypt_ecb_192
    .type intel_aes_encrypt_ecb_192,@function
    .align  16
    intel_aes_encrypt_ecb_192:
            xorl    %eax, %eax              /* byte offset into input/output */
            movdqu  (%rdi), %xmm2           /* round key 0 */
            movdqu  192(%rdi), %xmm14       /* round key 12 */
            cmpl    $128, %r9d              /* 128 = 8 blocks * 16 bytes */
            jb      .Lenc_ecb192_tail_check
            leal    -128(%r9), %r11d        /* last offset with 8 blocks left */

    .Lenc_ecb192_bulk:
            /* Load eight blocks and apply the initial round key to each. */
            movdqu  (%r8, %rax), %xmm3
            pxor    %xmm2, %xmm3
            movdqu  16(%r8, %rax), %xmm4
            pxor    %xmm2, %xmm4
            movdqu  32(%r8, %rax), %xmm5
            pxor    %xmm2, %xmm5
            movdqu  48(%r8, %rax), %xmm6
            pxor    %xmm2, %xmm6
            movdqu  64(%r8, %rax), %xmm7
            pxor    %xmm2, %xmm7
            movdqu  80(%r8, %rax), %xmm8
            pxor    %xmm2, %xmm8
            movdqu  96(%r8, %rax), %xmm9
            pxor    %xmm2, %xmm9
            movdqu  112(%r8, %rax), %xmm10
            pxor    %xmm2, %xmm10

            /* Rounds 1 and 2 */
            movdqu  16(%rdi), %xmm1
            movdqu  32(%rdi), %xmm11
            ecb192_enc_x8_key1
            ecb192_enc_x8_key11

            /* Rounds 3 and 4 */
            movdqu  48(%rdi), %xmm1
            movdqu  64(%rdi), %xmm11
            ecb192_enc_x8_key1
            ecb192_enc_x8_key11

            /* Rounds 5 and 6 */
            movdqu  80(%rdi), %xmm1
            movdqu  96(%rdi), %xmm11
            ecb192_enc_x8_key1
            ecb192_enc_x8_key11

            /* Rounds 7 and 8 */
            movdqu  112(%rdi), %xmm1
            movdqu  128(%rdi), %xmm11
            ecb192_enc_x8_key1
            ecb192_enc_x8_key11

            /* Rounds 9 and 10 */
            movdqu  144(%rdi), %xmm1
            movdqu  160(%rdi), %xmm11
            ecb192_enc_x8_key1
            ecb192_enc_x8_key11

            /* Round 11 and the final round */
            movdqu  176(%rdi), %xmm1
            ecb192_enc_x8_key1
            ecb192_enc_x8_last

            /* All loads are done before any store. */
            movdqu  %xmm3, (%rsi, %rax)
            movdqu  %xmm4, 16(%rsi, %rax)
            movdqu  %xmm5, 32(%rsi, %rax)
            movdqu  %xmm6, 48(%rsi, %rax)
            movdqu  %xmm7, 64(%rsi, %rax)
            movdqu  %xmm8, 80(%rsi, %rax)
            movdqu  %xmm9, 96(%rsi, %rax)
            movdqu  %xmm10, 112(%rsi, %rax)
            addl    $128, %eax
            cmpl    %r11d, %eax
            jbe     .Lenc_ecb192_bulk       /* unsigned: offset <= inputLen - 128 */

    .Lenc_ecb192_tail_check:
            cmpl    %eax, %r9d
            je      .Lenc_ecb192_done

            /* Keep round keys 1..11 resident for the one-block loop. */
            movdqu  16(%rdi), %xmm3
            movdqu  32(%rdi), %xmm4
            movdqu  48(%rdi), %xmm5
            movdqu  64(%rdi), %xmm6
            movdqu  80(%rdi), %xmm7
            movdqu  96(%rdi), %xmm8
            movdqu  112(%rdi), %xmm9
            movdqu  128(%rdi), %xmm10
            movdqu  144(%rdi), %xmm11
            movdqu  160(%rdi), %xmm12
            movdqu  176(%rdi), %xmm13

    .Lenc_ecb192_tail:
            movdqu  (%r8, %rax), %xmm1
            pxor    %xmm2, %xmm1
            .byte 0x66,0x0f,0x38,0xdc,0xcb          /* aesenc %xmm3, %xmm1 */
            .byte 0x66,0x0f,0x38,0xdc,0xcc          /* aesenc %xmm4, %xmm1 */
            .byte 0x66,0x0f,0x38,0xdc,0xcd          /* aesenc %xmm5, %xmm1 */
            .byte 0x66,0x0f,0x38,0xdc,0xce          /* aesenc %xmm6, %xmm1 */
            .byte 0x66,0x0f,0x38,0xdc,0xcf          /* aesenc %xmm7, %xmm1 */
            .byte 0x66,0x41,0x0f,0x38,0xdc,0xc8     /* aesenc %xmm8, %xmm1 */
            .byte 0x66,0x41,0x0f,0x38,0xdc,0xc9     /* aesenc %xmm9, %xmm1 */
            .byte 0x66,0x41,0x0f,0x38,0xdc,0xca     /* aesenc %xmm10, %xmm1 */
            .byte 0x66,0x41,0x0f,0x38,0xdc,0xcb     /* aesenc %xmm11, %xmm1 */
            .byte 0x66,0x41,0x0f,0x38,0xdc,0xcc     /* aesenc %xmm12, %xmm1 */
            .byte 0x66,0x41,0x0f,0x38,0xdc,0xcd     /* aesenc %xmm13, %xmm1 */
            .byte 0x66,0x41,0x0f,0x38,0xdd,0xce     /* aesenclast %xmm14, %xmm1 */
            movdqu  %xmm1, (%rsi, %rax)
            addl    $16, %eax
            cmpl    %eax, %r9d
            jne     .Lenc_ecb192_tail

    .Lenc_ecb192_done:
            xor     %eax, %eax
            ret
    .size intel_aes_encrypt_ecb_192, .-intel_aes_encrypt_ecb_192

    .purgem ecb192_enc_x8_key1
    .purgem ecb192_enc_x8_key11
    .purgem ecb192_enc_x8_last
   1082 
   1083 
   /*
    * intel_aes_decrypt_ecb_192 -- AES-192 decryption in ECB mode (AES-NI).
    *
    *   in %rdi : cx - context; round key i is read from offset 16*i, i = 0..12
    *   in %rsi : output - pointer to output buffer
    *   in %rdx : outputLen - pointer to variable for length of output
    *             (already filled in by caller; not used here)
    *   in %ecx : maxOutputLen - length of output buffer
    *             (already checked by caller; not used here)
    *   in %r8  : input - pointer to input buffer
    *   in %r9d : inputLen - length of input buffer, in bytes
    *   on stack: blocksize - AES blocksize (always 16, unused)
    *
    *   returns : %eax = 0
    *   modifies: %rax, %r11, %xmm1-%xmm14, flags
    *
    * Round keys are applied in reverse: key 12 (%xmm14) is xored in first,
    * keys 11 down to 1 feed aesdec, and key 0 (%xmm2) feeds aesdeclast.
    * The schedule is presumably the one written by
    * intel_aes_decrypt_init_192, whose middle keys have been through aesimc.
    *
    * Structure:
    *   - bulk loop : eight blocks (128 bytes) per iteration, run while at
    *                 least 128 bytes remain; round keys are reloaded from
    *                 the context two at a time into %xmm1 / %xmm11.
    *   - tail loop : one block per iteration with round keys 1..11 held in
    *                 %xmm3..%xmm13.
    *
    * The byte offset lives in %eax and is only written with 32-bit
    * instructions, so %rax is always its zero-extended value when used as an
    * index.  The tail loop stops only when the offset equals inputLen
    * exactly, so inputLen must be a multiple of 16.
    *
    * AES-NI instructions are hand-encoded with .byte; each trailing comment
    * shows the instruction the bytes represent.
    */

   /* One inverse round on all eight lanes (%xmm3..%xmm10), key in %xmm1. */
   .macro ecb192_dec_x8_key1
           .byte 0x66,0x0f,0x38,0xde,0xd9          /* aesdec %xmm1, %xmm3 */
           .byte 0x66,0x0f,0x38,0xde,0xe1          /* aesdec %xmm1, %xmm4 */
           .byte 0x66,0x0f,0x38,0xde,0xe9          /* aesdec %xmm1, %xmm5 */
           .byte 0x66,0x0f,0x38,0xde,0xf1          /* aesdec %xmm1, %xmm6 */
           .byte 0x66,0x0f,0x38,0xde,0xf9          /* aesdec %xmm1, %xmm7 */
           .byte 0x66,0x44,0x0f,0x38,0xde,0xc1     /* aesdec %xmm1, %xmm8 */
           .byte 0x66,0x44,0x0f,0x38,0xde,0xc9     /* aesdec %xmm1, %xmm9 */
           .byte 0x66,0x44,0x0f,0x38,0xde,0xd1     /* aesdec %xmm1, %xmm10 */
   .endm

   /* One inverse round on all eight lanes, key in %xmm11. */
   .macro ecb192_dec_x8_key11
           .byte 0x66,0x41,0x0f,0x38,0xde,0xdb     /* aesdec %xmm11, %xmm3 */
           .byte 0x66,0x41,0x0f,0x38,0xde,0xe3     /* aesdec %xmm11, %xmm4 */
           .byte 0x66,0x41,0x0f,0x38,0xde,0xeb     /* aesdec %xmm11, %xmm5 */
           .byte 0x66,0x41,0x0f,0x38,0xde,0xf3     /* aesdec %xmm11, %xmm6 */
           .byte 0x66,0x41,0x0f,0x38,0xde,0xfb     /* aesdec %xmm11, %xmm7 */
           .byte 0x66,0x45,0x0f,0x38,0xde,0xc3     /* aesdec %xmm11, %xmm8 */
           .byte 0x66,0x45,0x0f,0x38,0xde,0xcb     /* aesdec %xmm11, %xmm9 */
           .byte 0x66,0x45,0x0f,0x38,0xde,0xd3     /* aesdec %xmm11, %xmm10 */
   .endm

   /* Final inverse round on all eight lanes, round key 0 in %xmm2. */
   .macro ecb192_dec_x8_last
           .byte 0x66,0x0f,0x38,0xdf,0xda          /* aesdeclast %xmm2, %xmm3 */
           .byte 0x66,0x0f,0x38,0xdf,0xe2          /* aesdeclast %xmm2, %xmm4 */
           .byte 0x66,0x0f,0x38,0xdf,0xea          /* aesdeclast %xmm2, %xmm5 */
           .byte 0x66,0x0f,0x38,0xdf,0xf2          /* aesdeclast %xmm2, %xmm6 */
           .byte 0x66,0x0f,0x38,0xdf,0xfa          /* aesdeclast %xmm2, %xmm7 */
           .byte 0x66,0x44,0x0f,0x38,0xdf,0xc2     /* aesdeclast %xmm2, %xmm8 */
           .byte 0x66,0x44,0x0f,0x38,0xdf,0xca     /* aesdeclast %xmm2, %xmm9 */
           .byte 0x66,0x44,0x0f,0x38,0xdf,0xd2     /* aesdeclast %xmm2, %xmm10 */
   .endm

   .globl intel_aes_decrypt_ecb_192
   .type intel_aes_decrypt_ecb_192,@function
   .align  16
   intel_aes_decrypt_ecb_192:
           xorl    %eax, %eax              /* byte offset into input/output */
           movdqu  (%rdi), %xmm2           /* round key 0 (used last) */
           movdqu  192(%rdi), %xmm14       /* round key 12 (used first) */
           cmpl    $128, %r9d              /* 128 = 8 blocks * 16 bytes */
           jb      .Ldec_ecb192_tail_check
           leal    -128(%r9), %r11d        /* last offset with 8 blocks left */

   .Ldec_ecb192_bulk:
           /* Load eight blocks and xor round key 12 into each. */
           movdqu  (%r8, %rax), %xmm3
           pxor    %xmm14, %xmm3
           movdqu  16(%r8, %rax), %xmm4
           pxor    %xmm14, %xmm4
           movdqu  32(%r8, %rax), %xmm5
           pxor    %xmm14, %xmm5
           movdqu  48(%r8, %rax), %xmm6
           pxor    %xmm14, %xmm6
           movdqu  64(%r8, %rax), %xmm7
           pxor    %xmm14, %xmm7
           movdqu  80(%r8, %rax), %xmm8
           pxor    %xmm14, %xmm8
           movdqu  96(%r8, %rax), %xmm9
           pxor    %xmm14, %xmm9
           movdqu  112(%r8, %rax), %xmm10
           pxor    %xmm14, %xmm10

           /* Round keys 11 and 10 */
           movdqu  176(%rdi), %xmm1
           movdqu  160(%rdi), %xmm11
           ecb192_dec_x8_key1
           ecb192_dec_x8_key11

           /* Round keys 9 and 8 */
           movdqu  144(%rdi), %xmm1
           movdqu  128(%rdi), %xmm11
           ecb192_dec_x8_key1
           ecb192_dec_x8_key11

           /* Round keys 7 and 6 */
           movdqu  112(%rdi), %xmm1
           movdqu  96(%rdi), %xmm11
           ecb192_dec_x8_key1
           ecb192_dec_x8_key11

           /* Round keys 5 and 4 */
           movdqu  80(%rdi), %xmm1
           movdqu  64(%rdi), %xmm11
           ecb192_dec_x8_key1
           ecb192_dec_x8_key11

           /* Round keys 3 and 2 */
           movdqu  48(%rdi), %xmm1
           movdqu  32(%rdi), %xmm11
           ecb192_dec_x8_key1
           ecb192_dec_x8_key11

           /* Round key 1, then the final round with key 0 */
           movdqu  16(%rdi), %xmm1
           ecb192_dec_x8_key1
           ecb192_dec_x8_last

           /* All loads are done before any store. */
           movdqu  %xmm3, (%rsi, %rax)
           movdqu  %xmm4, 16(%rsi, %rax)
           movdqu  %xmm5, 32(%rsi, %rax)
           movdqu  %xmm6, 48(%rsi, %rax)
           movdqu  %xmm7, 64(%rsi, %rax)
           movdqu  %xmm8, 80(%rsi, %rax)
           movdqu  %xmm9, 96(%rsi, %rax)
           movdqu  %xmm10, 112(%rsi, %rax)
           addl    $128, %eax
           cmpl    %r11d, %eax
           jbe     .Ldec_ecb192_bulk       /* unsigned: offset <= inputLen - 128 */

   .Ldec_ecb192_tail_check:
           cmpl    %eax, %r9d
           je      .Ldec_ecb192_done

           /* Keep round keys 1..11 resident for the one-block loop. */
           movdqu  16(%rdi), %xmm3
           movdqu  32(%rdi), %xmm4
           movdqu  48(%rdi), %xmm5
           movdqu  64(%rdi), %xmm6
           movdqu  80(%rdi), %xmm7
           movdqu  96(%rdi), %xmm8
           movdqu  112(%rdi), %xmm9
           movdqu  128(%rdi), %xmm10
           movdqu  144(%rdi), %xmm11
           movdqu  160(%rdi), %xmm12
           movdqu  176(%rdi), %xmm13

   .Ldec_ecb192_tail:
           movdqu  (%r8, %rax), %xmm1
           pxor    %xmm14, %xmm1
           .byte 0x66,0x41,0x0f,0x38,0xde,0xcd     /* aesdec %xmm13, %xmm1 */
           .byte 0x66,0x41,0x0f,0x38,0xde,0xcc     /* aesdec %xmm12, %xmm1 */
           .byte 0x66,0x41,0x0f,0x38,0xde,0xcb     /* aesdec %xmm11, %xmm1 */
           .byte 0x66,0x41,0x0f,0x38,0xde,0xca     /* aesdec %xmm10, %xmm1 */
           .byte 0x66,0x41,0x0f,0x38,0xde,0xc9     /* aesdec %xmm9, %xmm1 */
           .byte 0x66,0x41,0x0f,0x38,0xde,0xc8     /* aesdec %xmm8, %xmm1 */
           .byte 0x66,0x0f,0x38,0xde,0xcf          /* aesdec %xmm7, %xmm1 */
           .byte 0x66,0x0f,0x38,0xde,0xce          /* aesdec %xmm6, %xmm1 */
           .byte 0x66,0x0f,0x38,0xde,0xcd          /* aesdec %xmm5, %xmm1 */
           .byte 0x66,0x0f,0x38,0xde,0xcc          /* aesdec %xmm4, %xmm1 */
           .byte 0x66,0x0f,0x38,0xde,0xcb          /* aesdec %xmm3, %xmm1 */
           .byte 0x66,0x0f,0x38,0xdf,0xca          /* aesdeclast %xmm2, %xmm1 */
           movdqu  %xmm1, (%rsi, %rax)
           addl    $16, %eax
           cmpl    %eax, %r9d
           jne     .Ldec_ecb192_tail

   .Ldec_ecb192_done:
           xor     %eax, %eax
           ret
   .size intel_aes_decrypt_ecb_192, .-intel_aes_decrypt_ecb_192

   .purgem ecb192_dec_x8_key1
   .purgem ecb192_dec_x8_key11
   .purgem ecb192_dec_x8_last
   1286 
   1287 
   /*
    * intel_aes_encrypt_cbc_192 -- AES-192 encryption in CBC mode (AES-NI).
    *
    *   in %rdi : cx - context; round key i is read from offset 16*i,
    *             i = 0..12, and the 16-byte IV from offset 256
    *   in %rsi : output - pointer to output buffer
    *   in %rdx : outputLen - pointer to variable for length of output
    *             (already filled in by caller; the register is reused
    *             here to hold the address of the IV)
    *   in %ecx : maxOutputLen - length of output buffer
    *             (already checked by caller; not used here)
    *   in %r8  : input - pointer to input buffer
    *   in %r9d : inputLen - length of input buffer, in bytes
    *   on stack: blocksize - AES blocksize (always 16, unused)
    *
    *   returns : %eax = 0
    *   modifies: %rax, %rdx, %xmm0-%xmm14, flags (only %eax and flags
    *             when inputLen is 0)
    *
    * Blocks are processed one at a time: each plaintext block is xored with
    * the previous ciphertext block (the IV for the first one) before being
    * encrypted.  After the last block, that block's ciphertext is written
    * back to the context as the new IV.  With inputLen == 0 nothing is read
    * or written.
    *
    * The loop stops only when the offset equals inputLen exactly, so
    * inputLen must be a multiple of 16.  The offset in %eax is only written
    * with 32-bit instructions, so %rax is its zero-extended value.
    *
    * The literal 256 equals the IV_OFFSET value named at the top of the file.
    */
   .globl intel_aes_encrypt_cbc_192
   .type intel_aes_encrypt_cbc_192,@function
   .align  16
   intel_aes_encrypt_cbc_192:
           testl   %r9d, %r9d
           je      .Lenc_cbc192_done

           leaq    256(%rdi), %rdx         /* %rdx = address of the IV */
           xorl    %eax, %eax              /* byte offset into input/output */

           movdqu  (%rdx), %xmm0           /* chaining value, starts as the IV */
           movdqu  (%rdi), %xmm2           /* round keys 0..12 -> %xmm2..%xmm14 */
           movdqu  16(%rdi), %xmm3
           movdqu  32(%rdi), %xmm4
           movdqu  48(%rdi), %xmm5
           movdqu  64(%rdi), %xmm6
           movdqu  80(%rdi), %xmm7
           movdqu  96(%rdi), %xmm8
           movdqu  112(%rdi), %xmm9
           movdqu  128(%rdi), %xmm10
           movdqu  144(%rdi), %xmm11
           movdqu  160(%rdi), %xmm12
           movdqu  176(%rdi), %xmm13
           movdqu  192(%rdi), %xmm14

   .Lenc_cbc192_block:
           movdqu  (%r8, %rax), %xmm1
           pxor    %xmm2, %xmm1            /* initial round key */
           pxor    %xmm0, %xmm1            /* CBC chaining */
           .byte 0x66,0x0f,0x38,0xdc,0xcb          /* aesenc %xmm3, %xmm1 */
           .byte 0x66,0x0f,0x38,0xdc,0xcc          /* aesenc %xmm4, %xmm1 */
           .byte 0x66,0x0f,0x38,0xdc,0xcd          /* aesenc %xmm5, %xmm1 */
           .byte 0x66,0x0f,0x38,0xdc,0xce          /* aesenc %xmm6, %xmm1 */
           .byte 0x66,0x0f,0x38,0xdc,0xcf          /* aesenc %xmm7, %xmm1 */
           .byte 0x66,0x41,0x0f,0x38,0xdc,0xc8     /* aesenc %xmm8, %xmm1 */
           .byte 0x66,0x41,0x0f,0x38,0xdc,0xc9     /* aesenc %xmm9, %xmm1 */
           .byte 0x66,0x41,0x0f,0x38,0xdc,0xca     /* aesenc %xmm10, %xmm1 */
           .byte 0x66,0x41,0x0f,0x38,0xdc,0xcb     /* aesenc %xmm11, %xmm1 */
           .byte 0x66,0x41,0x0f,0x38,0xdc,0xcc     /* aesenc %xmm12, %xmm1 */
           .byte 0x66,0x41,0x0f,0x38,0xdc,0xcd     /* aesenc %xmm13, %xmm1 */
           .byte 0x66,0x41,0x0f,0x38,0xdd,0xce     /* aesenclast %xmm14, %xmm1 */
           movdqa  %xmm1, %xmm0            /* ciphertext chains into next block */
           movdqu  %xmm1, (%rsi, %rax)
           addl    $16, %eax
           cmpl    %eax, %r9d
           jne     .Lenc_cbc192_block

           movdqu  %xmm0, (%rdx)           /* save last ciphertext as new IV */

   .Lenc_cbc192_done:
           xor     %eax, %eax
           ret
   .size intel_aes_encrypt_cbc_192, .-intel_aes_encrypt_cbc_192
   1350 
   1351 
   1352 /* in %rdi : cx - context
   1353   in %rsi : output - pointer to output buffer
   1354   in %rdx : outputLen - pointer to variable for length of output
   1355             (already filled in by caller)
   1356   in %ecx : maxOutputLen - length of output buffer
   1357             (already checked by caller)
   1358   in %r8  : input - pointer to input buffer
   1359   in %r9d : inputLen - length of input buffer
   1360   on stack: blocksize - AES blocksize (always 16, unused)

          CBC decryption with the 13 round keys stored at 0..192(%rdi),
          applied from offset 192 down to offset 0.  The chaining block is
          read from 256(%rdi) and is replaced on exit by the last ciphertext
          block processed (for inputLen == 0 the same value is written back).
          Input is handled eight blocks at a time while at least 128 bytes
          remain, then one block at a time.  outputLen and maxOutputLen are
          not read; %rdx is overwritten with the address of the chaining
          block.  Always returns 0 in %eax.  The single-block loop stops only
          when the offset equals inputLen exactly, so inputLen is presumably
          a multiple of 16 - TODO confirm that callers guarantee this.
   1361 */
   1362 .type intel_aes_decrypt_cbc_192,@function
   1363 .globl intel_aes_decrypt_cbc_192
   1364 .align	16
   1365 intel_aes_decrypt_cbc_192:
   1366 //	leaq	IV_OFFSET(%rdi), %rdx
   1367 leaq	256(%rdi), %rdx	/* rdx = address of the chaining block (IV_OFFSET is 256) */
   1368 
   1369 movdqu	(%rdx), %xmm0	/* xmm0 = chaining value (IV or previous ciphertext block) */
   1370 movdqu	(%rdi), %xmm2	/* xmm2 = round key at offset 0, used by aesdeclast */
   1371 movdqu	192(%rdi), %xmm14	/* xmm14 = round key at offset 192, xored in first */
   1372 xorl	%eax, %eax	/* eax = byte offset into input and output */
   1373 cmpl	$128, %r9d	/* fewer than 8 blocks: go straight to the one-block loop */
   1374 jb	1f
   1375 leal	-128(%r9), %r11d	/* r11d = last offset at which a full 8-block group starts */
   1376 2:	movdqu	(%r8, %rax), %xmm3	/* xmm3..xmm10 = eight ciphertext blocks */
   1377 movdqu	16(%r8, %rax), %xmm4
   1378 movdqu	32(%r8, %rax), %xmm5
   1379 movdqu	48(%r8, %rax), %xmm6
   1380 movdqu	64(%r8, %rax), %xmm7
   1381 movdqu	80(%r8, %rax), %xmm8
   1382 movdqu	96(%r8, %rax), %xmm9
   1383 movdqu	112(%r8, %rax), %xmm10
   1384 pxor	%xmm14, %xmm3	/* xor the round key at offset 192 into all eight blocks */
   1385 pxor	%xmm14, %xmm4
   1386 pxor	%xmm14, %xmm5
   1387 pxor	%xmm14, %xmm6
   1388 pxor	%xmm14, %xmm7
   1389 pxor	%xmm14, %xmm8
   1390 pxor	%xmm14, %xmm9
   1391 pxor	%xmm14, %xmm10
   1392 
   1393 // complete loop unrolling
        /* The remaining round keys are reloaded from the context on every
           iteration, two at a time, into the scratch registers xmm1 and
           xmm11; each key is applied to all eight blocks before the next. */
   1394 movdqu 176(%rdi), %xmm1
   1395 movdqu 160(%rdi), %xmm11
   1396 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   1397 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   1398 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   1399 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   1400 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   1401 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   1402 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   1403 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   1404 .byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   1405 .byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   1406 .byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   1407 .byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   1408 .byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   1409 .byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   1410 .byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   1411 .byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   1412 
   1413 movdqu 144(%rdi), %xmm1
   1414 movdqu 128(%rdi), %xmm11
   1415 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   1416 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   1417 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   1418 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   1419 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   1420 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   1421 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   1422 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   1423 .byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   1424 .byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   1425 .byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   1426 .byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   1427 .byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   1428 .byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   1429 .byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   1430 .byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   1431 
   1432 movdqu 112(%rdi), %xmm1
   1433 movdqu 96(%rdi), %xmm11
   1434 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   1435 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   1436 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   1437 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   1438 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   1439 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   1440 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   1441 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   1442 .byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   1443 .byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   1444 .byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   1445 .byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   1446 .byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   1447 .byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   1448 .byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   1449 .byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   1450 
   1451 movdqu 80(%rdi), %xmm1
   1452 movdqu 64(%rdi), %xmm11
   1453 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   1454 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   1455 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   1456 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   1457 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   1458 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   1459 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   1460 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   1461 .byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   1462 .byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   1463 .byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   1464 .byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   1465 .byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   1466 .byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   1467 .byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   1468 .byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   1469 
   1470 movdqu 48(%rdi), %xmm1
   1471 movdqu 32(%rdi), %xmm11
   1472 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   1473 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   1474 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   1475 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   1476 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   1477 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   1478 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   1479 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   1480 .byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   1481 .byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   1482 .byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   1483 .byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   1484 .byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   1485 .byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   1486 .byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   1487 .byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   1488 
   1489 movdqu 16(%rdi), %xmm1	/* last aesdec round key; aesdeclast then uses xmm2 (offset 0) */
   1490 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   1491 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   1492 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   1493 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   1494 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   1495 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   1496 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   1497 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   1498 .byte 0x66,0x0f,0x38,0xdf,0xda		/* aesdeclast 	%xmm2, %xmm3 */
   1499 .byte 0x66,0x0f,0x38,0xdf,0xe2		/* aesdeclast 	%xmm2, %xmm4 */
   1500 .byte 0x66,0x0f,0x38,0xdf,0xea		/* aesdeclast 	%xmm2, %xmm5 */
   1501 .byte 0x66,0x0f,0x38,0xdf,0xf2		/* aesdeclast 	%xmm2, %xmm6 */
   1502 .byte 0x66,0x0f,0x38,0xdf,0xfa		/* aesdeclast 	%xmm2, %xmm7 */
   1503 .byte 0x66,0x44,0x0f,0x38,0xdf,0xc2	/* aesdeclast 	%xmm2, %xmm8 */
   1504 .byte 0x66,0x44,0x0f,0x38,0xdf,0xca	/* aesdeclast 	%xmm2, %xmm9 */
   1505 .byte 0x66,0x44,0x0f,0x38,0xdf,0xd2	/* aesdeclast 	%xmm2, %xmm10 */
   1506 
        /* CBC step: xor each decrypted block with the ciphertext block that
           preceded it.  The ciphertext is re-read from the input buffer
           through xmm0, and every re-read happens before the first store
           below.  NOTE(review): this ordering looks intended to allow
           output == input (in-place use) - confirm against callers. */
   1507 	pxor	%xmm0, %xmm3	/* first block: xor with the incoming chaining value */
   1508 movdqu	(%r8, %rax), %xmm0
   1509 pxor	%xmm0, %xmm4
   1510 movdqu	16(%r8, %rax), %xmm0
   1511 pxor	%xmm0, %xmm5
   1512 movdqu	32(%r8, %rax), %xmm0
   1513 pxor	%xmm0, %xmm6
   1514 movdqu	48(%r8, %rax), %xmm0
   1515 pxor	%xmm0, %xmm7
   1516 movdqu	64(%r8, %rax), %xmm0
   1517 pxor	%xmm0, %xmm8
   1518 movdqu	80(%r8, %rax), %xmm0
   1519 pxor	%xmm0, %xmm9
   1520 movdqu	96(%r8, %rax), %xmm0
   1521 pxor	%xmm0, %xmm10
   1522 movdqu	112(%r8, %rax), %xmm0	/* xmm0 = last ciphertext block of the group = next chaining value */
   1523 movdqu	%xmm3, (%rsi, %rax)	/* store the eight plaintext blocks */
   1524 movdqu	%xmm4, 16(%rsi, %rax)
   1525 movdqu	%xmm5, 32(%rsi, %rax)
   1526 movdqu	%xmm6, 48(%rsi, %rax)
   1527 movdqu	%xmm7, 64(%rsi, %rax)
   1528 movdqu	%xmm8, 80(%rsi, %rax)
   1529 movdqu	%xmm9, 96(%rsi, %rax)
   1530 movdqu	%xmm10, 112(%rsi, %rax)
   1531 addl	$128, %eax
   1532 cmpl	%r11d, %eax	/* unsigned: repeat while another full 8-block group fits */
   1533 jbe	2b
   1534 1:	cmpl	%eax, %r9d	/* nothing left over: skip the one-block loop */
   1535 je	5f
   1536 
   1537 movdqu	16(%rdi), %xmm3	/* xmm3..xmm13 = round keys at offsets 16..176 */
   1538 movdqu	32(%rdi), %xmm4
   1539 movdqu	48(%rdi), %xmm5
   1540 movdqu	64(%rdi), %xmm6
   1541 movdqu	80(%rdi), %xmm7
   1542 movdqu	96(%rdi), %xmm8
   1543 movdqu	112(%rdi), %xmm9
   1544 movdqu	128(%rdi), %xmm10
   1545 movdqu	144(%rdi), %xmm11
   1546 movdqu	160(%rdi), %xmm12
   1547 movdqu	176(%rdi), %xmm13
   1548 
   1549 4:	movdqu	(%r8, %rax), %xmm1	/* one-block loop: xmm1 = ciphertext block */
   1550 movdqa	%xmm1, %xmm15	/* keep a copy; it is the chaining value for the next block */
   1551 pxor	%xmm14, %xmm1	/* xor in the round key at offset 192 */
   1552 .byte 0x66,0x41,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm13, %xmm1 */
   1553 .byte 0x66,0x41,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm12, %xmm1 */
   1554 .byte 0x66,0x41,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm1 */
   1555 .byte 0x66,0x41,0x0f,0x38,0xde,0xca	/* aesdec	%xmm10, %xmm1 */
   1556 .byte 0x66,0x41,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm9, %xmm1 */
   1557 .byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
   1558 .byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
   1559 .byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
   1560 .byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
   1561 .byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
   1562 .byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
   1563 .byte 0x66,0x0f,0x38,0xdf,0xca	/* aesdeclast %xmm2, %xmm1 */
   1564 pxor	%xmm0, %xmm1	/* CBC step: xor with the previous ciphertext block */
   1565 movdqu	%xmm1, (%rsi, %rax)
   1566 movdqa	%xmm15, %xmm0	/* saved ciphertext becomes the chaining value */
   1567 addl	$16, %eax
   1568 cmpl	%eax, %r9d	/* loop until the offset equals the length exactly */
   1569 jne	4b
   1570 
   1571 5:	movdqu	%xmm0, (%rdx)	/* save the last ciphertext block as the new chaining block */
   1572 
   1573 xor	%eax, %eax	/* return 0 */
   1574 ret
   1575 .size intel_aes_decrypt_cbc_192, .-intel_aes_decrypt_cbc_192
   1576 
   1577 /* in %rdi : the key
   1578   in %rsi : buffer for expanded key

          Expands a 256-bit key for encryption.  Reads 32 bytes at (%rdi) and
          writes 15 round keys (240 bytes) starting at the incoming %rsi:
          the two halves of the key unchanged, twelve keys produced by six
          calls to key_expansion256, and one final key computed inline.
          %rsi is advanced as keys are written.  Uses xmm1..xmm4 and xmm6
          (xmm4 via key_expansion256).  %eax is zeroed before the first call
          and is not written again, so it is 0 at ret.
   1579 */
   1580 .type intel_aes_encrypt_init_256,@function
   1581 .globl intel_aes_encrypt_init_256
   1582 .align	16
   1583 intel_aes_encrypt_init_256:
   1584 movdqu	(%rdi), %xmm1	/* xmm1 = key bytes 0..15  = round key 0 */
   1585 movdqu	16(%rdi), %xmm3	/* xmm3 = key bytes 16..31 = round key 1 */
   1586 movdqu	%xmm1, (%rsi)
   1587 movdqu	%xmm3, 16(%rsi)
   1588 leaq	32(%rsi), %rsi	/* rsi = slot for the next round key */
   1589 xor	%eax, %eax	/* key_expansion256 copies %eax into xmm6 to clear it */
   1590 
        /* Each step: aeskeygenassist on the newest key (xmm3) with the next
           round constant, then key_expansion256 derives and stores two keys. */
   1591 .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x01	/* aeskeygenassist $0x01, %xmm3, %xmm2 */
   1592 call key_expansion256
   1593 .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x02	/* aeskeygenassist $0x02, %xmm3, %xmm2 */
   1594 call key_expansion256
   1595 .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x04	/* aeskeygenassist $0x04, %xmm3, %xmm2 */
   1596 call key_expansion256
   1597 .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x08	/* aeskeygenassist $0x08, %xmm3, %xmm2 */
   1598 call key_expansion256
   1599 .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x10	/* aeskeygenassist $0x10, %xmm3, %xmm2 */
   1600 call key_expansion256
   1601 .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x20	/* aeskeygenassist $0x20, %xmm3, %xmm2 */
   1602 call key_expansion256
        /* Final (15th) round key: same instruction sequence as the first
           half of key_expansion256, done inline because only one more key
           is stored. */
   1603 .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x40	/* aeskeygenassist $0x40, %xmm3, %xmm2 */
   1604 pxor	%xmm6, %xmm6	/* xmm6 = 0 */
   1605 pshufd	$0xff, %xmm2, %xmm2	/* broadcast dword 3 of the keygenassist result */
   1606 shufps	$0x10, %xmm1, %xmm6
   1607 pxor	%xmm6, %xmm1
   1608 shufps	$0x8c, %xmm1, %xmm6
   1609 pxor	%xmm2, %xmm1
   1610 pxor	%xmm6, %xmm1
   1611 movdqu	%xmm1, (%rsi)	/* store the last round key */
   1612 
   1613 ret
   1614 .size intel_aes_encrypt_init_256, .-intel_aes_encrypt_init_256
   1615 
   1616 
   1617 /* in %rdi : the key
   1618   in %rsi : buffer for expanded key

          Expands a 256-bit key for decryption.  Performs the same expansion
          as intel_aes_encrypt_init_256 and stores the 15 round keys at the
          same offsets, but round keys 1..13 are stored after passing them
          through aesimc (the InvMixColumns transform that aesdec expects of
          its round keys).  Round keys 0 and 14 are stored unchanged.  The
          expansion itself always continues from the untransformed keys in
          xmm1/xmm3; only the stored copies are transformed.  Uses
          xmm1..xmm6; %eax is 0 at ret.
   1619 */
   1620 .type intel_aes_decrypt_init_256,@function
   1621 .globl intel_aes_decrypt_init_256
   1622 .align	16
   1623 intel_aes_decrypt_init_256:
   1624 movdqu	(%rdi), %xmm1	/* xmm1 = key bytes 0..15  = round key 0 */
   1625 movdqu	16(%rdi), %xmm3	/* xmm3 = key bytes 16..31 = round key 1 */
   1626 movdqu	%xmm1, (%rsi)	/* round key 0 is stored as is */
   1627 .byte 0x66,0x0f,0x38,0xdb,0xe3	/* aesimc	%xmm3, %xmm4 */
   1628 movdqu	%xmm4, 16(%rsi)	/* round key 1 is stored transformed */
   1629 leaq	32(%rsi), %rsi	/* rsi = slot for the next round key */
   1630 xor	%eax, %eax	/* key_expansion256 copies %eax into xmm6 to clear it */
   1631 
        /* Each step: key_expansion256 stores two new keys and advances %rsi
           by 32; the two aesimc results then overwrite those same two slots
           at -32(%rsi) and -16(%rsi). */
   1632 .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x01	/* aeskeygenassist $0x01, %xmm3, %xmm2 */
   1633 call key_expansion256
   1634 .byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
   1635 .byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
   1636 movdqu	%xmm4, -32(%rsi)
   1637 movdqu	%xmm5, -16(%rsi)
   1638 .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x02	/* aeskeygenassist $0x02, %xmm3, %xmm2 */
   1639 call key_expansion256
   1640 .byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
   1641 .byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
   1642 movdqu	%xmm4, -32(%rsi)
   1643 movdqu	%xmm5, -16(%rsi)
   1644 .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x04	/* aeskeygenassist $0x04, %xmm3, %xmm2 */
   1645 call key_expansion256
   1646 .byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
   1647 .byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
   1648 movdqu	%xmm4, -32(%rsi)
   1649 movdqu	%xmm5, -16(%rsi)
   1650 .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x08	/* aeskeygenassist $0x08, %xmm3, %xmm2 */
   1651 call key_expansion256
   1652 .byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
   1653 .byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
   1654 movdqu	%xmm4, -32(%rsi)
   1655 movdqu	%xmm5, -16(%rsi)
   1656 .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x10	/* aeskeygenassist $0x10, %xmm3, %xmm2 */
   1657 call key_expansion256
   1658 .byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
   1659 .byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
   1660 movdqu	%xmm4, -32(%rsi)
   1661 movdqu	%xmm5, -16(%rsi)
   1662 .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x20	/* aeskeygenassist $0x20, %xmm3, %xmm2 */
   1663 call key_expansion256
   1664 .byte 0x66,0x0f,0x38,0xdb,0xe1	/* aesimc	%xmm1, %xmm4 */
   1665 .byte 0x66,0x0f,0x38,0xdb,0xeb	/* aesimc	%xmm3, %xmm5 */
   1666 movdqu	%xmm4, -32(%rsi)
   1667 movdqu	%xmm5, -16(%rsi)
        /* Final (15th) round key: same instruction sequence as the first
           half of key_expansion256, done inline; it is stored without
           aesimc. */
   1668 .byte 0x66,0x0f,0x3a,0xdf,0xd3,0x40	/* aeskeygenassist $0x40, %xmm3, %xmm2 */
   1669 pxor	%xmm6, %xmm6	/* xmm6 = 0 */
   1670 pshufd	$0xff, %xmm2, %xmm2	/* broadcast dword 3 of the keygenassist result */
   1671 shufps	$0x10, %xmm1, %xmm6
   1672 pxor	%xmm6, %xmm1
   1673 shufps	$0x8c, %xmm1, %xmm6
   1674 pxor	%xmm2, %xmm1
   1675 pxor	%xmm6, %xmm1
   1676 movdqu	%xmm1, (%rsi)	/* store the last round key */
   1677 
   1678 ret
   1679 .size intel_aes_decrypt_init_256, .-intel_aes_decrypt_init_256
   1680 
   1681 
   1682 .type key_expansion256,@function	/* file-local helper (no .globl) for the 256-bit key schedule */
   1683 .align	16
        /* Derives and stores the next two round keys of a 256-bit schedule.
             in  %xmm1 : round key two positions back (becomes the first new key)
             in  %xmm3 : most recent round key (becomes the second new key)
             in  %xmm2 : aeskeygenassist result computed from %xmm3 by the caller
             in  %eax  : copied into %xmm6 to clear it; both callers visible
                         here zero %eax before calling
             in  %rsi  : where to store; advanced by 32 on return
             out %xmm1, %xmm3 : the two new round keys, also written to memory
           Overwrites %xmm2, %xmm4 and %xmm6. */
   1684 key_expansion256:
   1685 movd	%eax, %xmm6	/* xmm6 = %eax zero-extended (0 when %eax is 0) */
   1686 pshufd	$0xff, %xmm2, %xmm2	/* broadcast dword 3 of the keygenassist result */
        /* The two shufps/pxor pairs below xor each dword of xmm1 with all
           lower-numbered dwords of the old xmm1 (a running xor).  Dword 0 of
           xmm6 is left untouched by both shuffles, so it is still zero for
           the second half. */
   1687 shufps	$0x10, %xmm1, %xmm6
   1688 pxor	%xmm6, %xmm1
   1689 shufps	$0x8c, %xmm1, %xmm6
   1690 pxor	%xmm2, %xmm1
   1691 pxor	%xmm6, %xmm1
   1692 movdqu	%xmm1, (%rsi)	/* store the first new round key */
   1693 
   1694 addq	$16, %rsi
   1695 .byte 0x66,0x0f,0x3a,0xdf,0xe1,0x00	/* aeskeygenassist $0, %xmm1, %xmm4 */
   1696 pshufd	$0xaa, %xmm4, %xmm4	/* broadcast dword 2 this time (0xaa, not 0xff as above) */
   1697 shufps	$0x10, %xmm3, %xmm6	/* same running xor, now applied to xmm3 */
   1698 pxor	%xmm6, %xmm3
   1699 shufps	$0x8c, %xmm3, %xmm6
   1700 pxor	%xmm4, %xmm3
   1701 pxor	%xmm6, %xmm3
   1702 movdqu	%xmm3, (%rsi)	/* store the second new round key */
   1703 addq	$16, %rsi
   1704 ret
   1705 .size key_expansion256, .-key_expansion256
   1706 
   1707 
   1708 /* in %rdi : cx - context
   1709   in %rsi : output - pointer to output buffer
   1710   in %rdx : outputLen - pointer to variable for length of output
   1711             (already filled in by caller)
   1712   in %ecx : maxOutputLen - length of output buffer
   1713             (already checked by caller)
   1714   in %r8  : input - pointer to input buffer
   1715   in %r9d : inputLen - length of input buffer
   1716   on stack: blocksize - AES blocksize (always 16, unused)

          Encrypts inputLen bytes with the 15 round keys stored at
          0..224(%rdi), each 16-byte block independently of the others (no
          chaining value is read or written).  Input is handled eight blocks
          at a time while at least 128 bytes remain, then one block at a
          time.  outputLen and maxOutputLen are not referenced.  Always
          returns 0 in %eax.  The single-block loop stops only when the
          offset equals inputLen exactly, so inputLen is presumably a
          multiple of 16 - TODO confirm that callers guarantee this.
   1717 */
   1718 .type intel_aes_encrypt_ecb_256,@function
   1719 .globl intel_aes_encrypt_ecb_256
   1720 .align	16
   1721 intel_aes_encrypt_ecb_256:
   1722 movdqu	(%rdi), %xmm2	/* xmm2 = round key at offset 0, xored in first */
   1723 movdqu	224(%rdi), %xmm15	/* xmm15 = round key at offset 224, used by aesenclast */
   1724 xorl	%eax, %eax	/* eax = byte offset into input and output */
   1725 //	cmpl	$8*16, %r9d
   1726 cmpl	$128, %r9d	/* fewer than 8 blocks: go straight to the one-block loop */
   1727 jb	1f
   1728 //	leal	-8*16(%r9), %r11d
   1729 leal	-128(%r9), %r11d	/* r11d = last offset at which a full 8-block group starts */
   1730 2:	movdqu	(%r8, %rax), %xmm3	/* xmm3..xmm10 = eight plaintext blocks */
   1731 movdqu	16(%r8, %rax), %xmm4
   1732 movdqu	32(%r8, %rax), %xmm5
   1733 movdqu	48(%r8, %rax), %xmm6
   1734 movdqu	64(%r8, %rax), %xmm7
   1735 movdqu	80(%r8, %rax), %xmm8
   1736 movdqu	96(%r8, %rax), %xmm9
   1737 movdqu	112(%r8, %rax), %xmm10
   1738 pxor	%xmm2, %xmm3	/* xor the round key at offset 0 into all eight blocks */
   1739 pxor	%xmm2, %xmm4
   1740 pxor	%xmm2, %xmm5
   1741 pxor	%xmm2, %xmm6
   1742 pxor	%xmm2, %xmm7
   1743 pxor	%xmm2, %xmm8
   1744 pxor	%xmm2, %xmm9
   1745 pxor	%xmm2, %xmm10
   1746 
   1747 // complete loop unrolling
        /* The remaining round keys are reloaded from the context on every
           iteration, two at a time, into the scratch registers xmm1 and
           xmm11; each key is applied to all eight blocks before the next. */
   1748 movdqu 16(%rdi), %xmm1
   1749 movdqu 32(%rdi), %xmm11
   1750 .byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
   1751 .byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
   1752 .byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
   1753 .byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
   1754 .byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
   1755 .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
   1756 .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
   1757 .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
   1758 .byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
   1759 .byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
   1760 .byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
   1761 .byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
   1762 .byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
   1763 .byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
   1764 .byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
   1765 .byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
   1766 
   1767 movdqu 48(%rdi), %xmm1
   1768 movdqu 64(%rdi), %xmm11
   1769 .byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
   1770 .byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
   1771 .byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
   1772 .byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
   1773 .byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
   1774 .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
   1775 .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
   1776 .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
   1777 .byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
   1778 .byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
   1779 .byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
   1780 .byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
   1781 .byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
   1782 .byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
   1783 .byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
   1784 .byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
   1785 
   1786 movdqu 80(%rdi), %xmm1
   1787 movdqu 96(%rdi), %xmm11
   1788 .byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
   1789 .byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
   1790 .byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
   1791 .byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
   1792 .byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
   1793 .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
   1794 .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
   1795 .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
   1796 .byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
   1797 .byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
   1798 .byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
   1799 .byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
   1800 .byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
   1801 .byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
   1802 .byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
   1803 .byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
   1804 
   1805 movdqu 112(%rdi), %xmm1
   1806 movdqu 128(%rdi), %xmm11
   1807 .byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
   1808 .byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
   1809 .byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
   1810 .byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
   1811 .byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
   1812 .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
   1813 .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
   1814 .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
   1815 .byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
   1816 .byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
   1817 .byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
   1818 .byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
   1819 .byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
   1820 .byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
   1821 .byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
   1822 .byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
   1823 
   1824 movdqu 144(%rdi), %xmm1
   1825 movdqu 160(%rdi), %xmm11
   1826 .byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
   1827 .byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
   1828 .byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
   1829 .byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
   1830 .byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
   1831 .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
   1832 .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
   1833 .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
   1834 .byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
   1835 .byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
   1836 .byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
   1837 .byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
   1838 .byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
   1839 .byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
   1840 .byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
   1841 .byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
   1842 
   1843 movdqu 176(%rdi), %xmm1
   1844 movdqu 192(%rdi), %xmm11
   1845 .byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
   1846 .byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
   1847 .byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
   1848 .byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
   1849 .byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
   1850 .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
   1851 .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
   1852 .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
   1853 .byte 0x66,0x41,0x0f,0x38,0xdc,0xdb	/* aesenc	%xmm11, %xmm3 */
   1854 .byte 0x66,0x41,0x0f,0x38,0xdc,0xe3	/* aesenc	%xmm11, %xmm4 */
   1855 .byte 0x66,0x41,0x0f,0x38,0xdc,0xeb	/* aesenc	%xmm11, %xmm5 */
   1856 .byte 0x66,0x41,0x0f,0x38,0xdc,0xf3	/* aesenc	%xmm11, %xmm6 */
   1857 .byte 0x66,0x41,0x0f,0x38,0xdc,0xfb	/* aesenc	%xmm11, %xmm7 */
   1858 .byte 0x66,0x45,0x0f,0x38,0xdc,0xc3	/* aesenc	%xmm11, %xmm8 */
   1859 .byte 0x66,0x45,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm9 */
   1860 .byte 0x66,0x45,0x0f,0x38,0xdc,0xd3	/* aesenc	%xmm11, %xmm10 */
   1861 
   1862 movdqu 208(%rdi), %xmm1	/* last aesenc round key; aesenclast then uses xmm15 (offset 224) */
   1863 .byte 0x66,0x0f,0x38,0xdc,0xd9		/* aesenc	%xmm1, %xmm3 */
   1864 .byte 0x66,0x0f,0x38,0xdc,0xe1		/* aesenc	%xmm1, %xmm4 */
   1865 .byte 0x66,0x0f,0x38,0xdc,0xe9		/* aesenc	%xmm1, %xmm5 */
   1866 .byte 0x66,0x0f,0x38,0xdc,0xf1		/* aesenc	%xmm1, %xmm6 */
   1867 .byte 0x66,0x0f,0x38,0xdc,0xf9		/* aesenc	%xmm1, %xmm7 */
   1868 .byte 0x66,0x44,0x0f,0x38,0xdc,0xc1	/* aesenc	%xmm1, %xmm8 */
   1869 .byte 0x66,0x44,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm1, %xmm9 */
   1870 .byte 0x66,0x44,0x0f,0x38,0xdc,0xd1	/* aesenc	%xmm1, %xmm10 */
   1871 .byte 0x66,0x41,0x0f,0x38,0xdd,0xdf	/* aesenclast 	%xmm15, %xmm3 */
   1872 .byte 0x66,0x41,0x0f,0x38,0xdd,0xe7	/* aesenclast 	%xmm15, %xmm4 */
   1873 .byte 0x66,0x41,0x0f,0x38,0xdd,0xef	/* aesenclast 	%xmm15, %xmm5 */
   1874 .byte 0x66,0x41,0x0f,0x38,0xdd,0xf7	/* aesenclast 	%xmm15, %xmm6 */
   1875 .byte 0x66,0x41,0x0f,0x38,0xdd,0xff	/* aesenclast 	%xmm15, %xmm7 */
   1876 .byte 0x66,0x45,0x0f,0x38,0xdd,0xc7	/* aesenclast 	%xmm15, %xmm8 */
   1877 .byte 0x66,0x45,0x0f,0x38,0xdd,0xcf	/* aesenclast 	%xmm15, %xmm9 */
   1878 .byte 0x66,0x45,0x0f,0x38,0xdd,0xd7	/* aesenclast 	%xmm15, %xmm10 */
   1879 
   1880 movdqu	%xmm3, (%rsi, %rax)	/* store the eight ciphertext blocks */
   1881 movdqu	%xmm4, 16(%rsi, %rax)
   1882 movdqu	%xmm5, 32(%rsi, %rax)
   1883 movdqu	%xmm6, 48(%rsi, %rax)
   1884 movdqu	%xmm7, 64(%rsi, %rax)
   1885 movdqu	%xmm8, 80(%rsi, %rax)
   1886 movdqu	%xmm9, 96(%rsi, %rax)
   1887 movdqu	%xmm10, 112(%rsi, %rax)
   1888 //	addl	$8*16, %eax
   1889 addl	$128, %eax
   1890 cmpl	%r11d, %eax	/* unsigned: repeat while another full 8-block group fits */
   1891 jbe	2b
   1892 1:	cmpl	%eax, %r9d	/* nothing left over: skip the one-block loop */
   1893 je	5f
   1894 
        /* One-block loop setup.  Fourteen of the fifteen round keys get a
           register of their own (xmm2..xmm7 and xmm9..xmm15); xmm8 is shared
           between the keys at offsets 0 and 112 and is reloaded twice per
           iteration.  NOTE(review): xmm0 is not referenced anywhere in this
           routine, so it looks like one of those two keys could live there
           and both reloads be hoisted out of the loop - confirm before
           changing. */
   1895 movdqu	(%rdi), %xmm8	/* xmm8 = round key at offset 0 (for the first pxor) */
   1896 movdqu	16(%rdi), %xmm2	/* xmm2 is reused here: now the round key at offset 16 */
   1897 movdqu	32(%rdi), %xmm3
   1898 movdqu	48(%rdi), %xmm4
   1899 movdqu	64(%rdi), %xmm5
   1900 movdqu	80(%rdi), %xmm6
   1901 movdqu	96(%rdi), %xmm7
   1902 movdqu	128(%rdi), %xmm9	/* offset 112 is skipped here; it is loaded into xmm8 inside the loop */
   1903 movdqu	144(%rdi), %xmm10
   1904 movdqu	160(%rdi), %xmm11
   1905 movdqu	176(%rdi), %xmm12
   1906 movdqu	192(%rdi), %xmm13
   1907 movdqu	208(%rdi), %xmm14
   1908 
   1909 4:	movdqu	(%r8, %rax), %xmm1	/* xmm1 = next plaintext block */
   1910 pxor	%xmm8, %xmm1	/* xor in the round key at offset 0 */
   1911 movdqu	112(%rdi), %xmm8	/* xmm8 = round key at offset 112 */
   1912 .byte 0x66,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm2, %xmm1 */
   1913 .byte 0x66,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm3, %xmm1 */
   1914 .byte 0x66,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm4, %xmm1 */
   1915 .byte 0x66,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm5, %xmm1 */
   1916 .byte 0x66,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm6, %xmm1 */
   1917 .byte 0x66,0x0f,0x38,0xdc,0xcf	/* aesenc	%xmm7, %xmm1 */
   1918 .byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm8, %xmm1 */
   1919 movdqu	(%rdi), %xmm8	/* restore the offset-0 key for the next iteration */
   1920 .byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
   1921 .byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
   1922 .byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
   1923 .byte 0x66,0x41,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm12, %xmm1 */
   1924 .byte 0x66,0x41,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm13, %xmm1 */
   1925 .byte 0x66,0x41,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm14, %xmm1 */
   1926 .byte 0x66,0x41,0x0f,0x38,0xdd,0xcf	/* aesenclast %xmm15, %xmm1 */
   1927 movdqu	%xmm1, (%rsi, %rax)	/* store the ciphertext block */
   1928 addl	$16, %eax
   1929 cmpl	%eax, %r9d	/* loop until the offset equals the length exactly */
   1930 jne	4b
   1931 
   1932 5:	xor	%eax, %eax	/* return 0 */
   1933 ret
   1934 .size intel_aes_encrypt_ecb_256, .-intel_aes_encrypt_ecb_256
   1935 
   1936 
   1937 /* in %rdi : cx - context
   1938   in %rsi : output - pointer to output buffer
   1939   in %rdx : outputLen - pointer to variable for length of output
   1940             (already filled in by caller)
   1941   in %ecx : maxOutputLen - length of output buffer
   1942             (already checked by caller)
   1943   in %r8  : input - pointer to input buffer
   1944   in %r9d : inputLen - length of input buffer
   1945   on stack: blocksize - AES blocksize (always 16, unused)
   1946 */
   1947 .type intel_aes_decrypt_ecb_256,@function
   1948 .globl intel_aes_decrypt_ecb_256
   1949 .align	16
   1950 intel_aes_decrypt_ecb_256:
   1951 movdqu	(%rdi), %xmm2
   1952 movdqu	224(%rdi), %xmm15
   1953 xorl	%eax, %eax
   1954 //	cmpl	$8*16, %r9d
   1955 cmpl	$128, %r9d
   1956 jb	1f
   1957 //	leal	-8*16(%r9), %r11d
   1958 leal	-128(%r9), %r11d
   1959 2:	movdqu	(%r8, %rax), %xmm3
   1960 movdqu	16(%r8, %rax), %xmm4
   1961 movdqu	32(%r8, %rax), %xmm5
   1962 movdqu	48(%r8, %rax), %xmm6
   1963 movdqu	64(%r8, %rax), %xmm7
   1964 movdqu	80(%r8, %rax), %xmm8
   1965 movdqu	96(%r8, %rax), %xmm9
   1966 movdqu	112(%r8, %rax), %xmm10
   1967 pxor	%xmm15, %xmm3
   1968 pxor	%xmm15, %xmm4
   1969 pxor	%xmm15, %xmm5
   1970 pxor	%xmm15, %xmm6
   1971 pxor	%xmm15, %xmm7
   1972 pxor	%xmm15, %xmm8
   1973 pxor	%xmm15, %xmm9
   1974 pxor	%xmm15, %xmm10
   1975 
   1976 // complete loop unrolling
   1977 movdqu 208(%rdi), %xmm1
   1978 movdqu 192(%rdi), %xmm11
   1979 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   1980 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   1981 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   1982 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   1983 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   1984 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   1985 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   1986 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   1987 .byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   1988 .byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   1989 .byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   1990 .byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   1991 .byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   1992 .byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   1993 .byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   1994 .byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   1995 
   1996 movdqu 176(%rdi), %xmm1
   1997 movdqu 160(%rdi), %xmm11
   1998 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   1999 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   2000 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   2001 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   2002 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   2003 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   2004 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   2005 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   2006 .byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   2007 .byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   2008 .byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   2009 .byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   2010 .byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   2011 .byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   2012 .byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   2013 .byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   2014 
   2015 movdqu 144(%rdi), %xmm1
   2016 movdqu 128(%rdi), %xmm11
   2017 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   2018 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   2019 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   2020 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   2021 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   2022 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   2023 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   2024 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   2025 .byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   2026 .byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   2027 .byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   2028 .byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   2029 .byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   2030 .byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   2031 .byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   2032 .byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   2033 
   2034 movdqu 112(%rdi), %xmm1
   2035 movdqu 96(%rdi), %xmm11
   2036 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   2037 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   2038 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   2039 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   2040 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   2041 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   2042 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   2043 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   2044 .byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   2045 .byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   2046 .byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   2047 .byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   2048 .byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   2049 .byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   2050 .byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   2051 .byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   2052 
   2053 movdqu 80(%rdi), %xmm1
   2054 movdqu 64(%rdi), %xmm11
   2055 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   2056 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   2057 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   2058 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   2059 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   2060 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   2061 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   2062 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   2063 .byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   2064 .byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   2065 .byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   2066 .byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   2067 .byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   2068 .byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   2069 .byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   2070 .byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   2071 
   2072 movdqu 48(%rdi), %xmm1
   2073 movdqu 32(%rdi), %xmm11
   2074 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   2075 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   2076 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   2077 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   2078 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   2079 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   2080 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   2081 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   2082 .byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   2083 .byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   2084 .byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   2085 .byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   2086 .byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   2087 .byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   2088 .byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   2089 .byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   2090 
   2091 movdqu 16(%rdi), %xmm1
   2092 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   2093 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   2094 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   2095 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   2096 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   2097 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   2098 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   2099 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   2100 .byte 0x66,0x0f,0x38,0xdf,0xda		/* aesdeclast 	%xmm2, %xmm3 */
   2101 .byte 0x66,0x0f,0x38,0xdf,0xe2		/* aesdeclast 	%xmm2, %xmm4 */
   2102 .byte 0x66,0x0f,0x38,0xdf,0xea		/* aesdeclast 	%xmm2, %xmm5 */
   2103 .byte 0x66,0x0f,0x38,0xdf,0xf2		/* aesdeclast 	%xmm2, %xmm6 */
   2104 .byte 0x66,0x0f,0x38,0xdf,0xfa		/* aesdeclast 	%xmm2, %xmm7 */
   2105 .byte 0x66,0x44,0x0f,0x38,0xdf,0xc2	/* aesdeclast 	%xmm2, %xmm8 */
   2106 .byte 0x66,0x44,0x0f,0x38,0xdf,0xca	/* aesdeclast 	%xmm2, %xmm9 */
   2107 .byte 0x66,0x44,0x0f,0x38,0xdf,0xd2	/* aesdeclast 	%xmm2, %xmm10 */
   2108 
   2109 movdqu	%xmm3, (%rsi, %rax)
   2110 movdqu	%xmm4, 16(%rsi, %rax)
   2111 movdqu	%xmm5, 32(%rsi, %rax)
   2112 movdqu	%xmm6, 48(%rsi, %rax)
   2113 movdqu	%xmm7, 64(%rsi, %rax)
   2114 movdqu	%xmm8, 80(%rsi, %rax)
   2115 movdqu	%xmm9, 96(%rsi, %rax)
   2116 movdqu	%xmm10, 112(%rsi, %rax)
   2117 //	addl	$8*16, %eax
   2118 addl	$128, %eax
   2119 cmpl	%r11d, %eax
   2120 jbe	2b
   2121 1:	cmpl	%eax, %r9d
   2122 je	5f
   2123 
   2124 movdqu	16(%rdi), %xmm2
   2125 movdqu	32(%rdi), %xmm3
   2126 movdqu	48(%rdi), %xmm4
   2127 movdqu	64(%rdi), %xmm5
   2128 movdqu	80(%rdi), %xmm6
   2129 movdqu	96(%rdi), %xmm7
   2130 movdqu	112(%rdi), %xmm8
   2131 movdqu	128(%rdi), %xmm9
   2132 movdqu	144(%rdi), %xmm10
   2133 movdqu	160(%rdi), %xmm11
   2134 movdqu	176(%rdi), %xmm12
   2135 movdqu	192(%rdi), %xmm13
   2136 movdqu	208(%rdi), %xmm14
   2137 
   2138 4:	movdqu	(%r8, %rax), %xmm1
   2139 pxor	%xmm15, %xmm1
   2140 .byte 0x66,0x41,0x0f,0x38,0xde,0xce	/* aesdec	%xmm14, %xmm1 */
   2141 .byte 0x66,0x41,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm13, %xmm1 */
   2142 .byte 0x66,0x41,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm12, %xmm1 */
   2143 .byte 0x66,0x41,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm1 */
   2144 .byte 0x66,0x41,0x0f,0x38,0xde,0xca	/* aesdec	%xmm10, %xmm1 */
   2145 .byte 0x66,0x41,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm9, %xmm1 */
   2146 .byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
   2147 movdqu	(%rdi), %xmm8
   2148 .byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
   2149 .byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
   2150 .byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
   2151 .byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
   2152 .byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
   2153 .byte 0x66,0x0f,0x38,0xde,0xca	/* aesdec	%xmm2, %xmm1 */
   2154 .byte 0x66,0x41,0x0f,0x38,0xdf,0xc8	/* aesdeclast %xmm8, %xmm1 */
   2155 movdqu	112(%rdi), %xmm8
   2156 movdqu	%xmm1, (%rsi, %rax)
   2157 addl	$16, %eax
   2158 cmpl	%eax, %r9d
   2159 jne	4b
   2160 
   2161 5:	xor	%eax, %eax
   2162 ret
   2163 .size intel_aes_decrypt_ecb_256, .-intel_aes_decrypt_ecb_256
   2164 
   2165 
   2166 /* in %rdi : cx - context
   2167   in %rsi : output - pointer to output buffer
   2168   in %rdx : outputLen - pointer to variable for length of output
   2169             (already filled in by caller)
   2170   in %ecx : maxOutputLen - length of output buffer
   2171             (already checked by caller)
   2172   in %r8  : input - pointer to input buffer
   2173   in %r9d : inputLen - length of input buffer
   2174   on stack: blocksize - AES blocksize (always 16, unused)
          out %eax: always 0

          CBC encryption, one 16-byte block per loop iteration.  Each input
          block is xored with the chaining value in %xmm0 and with the key at
          offset 0 of cx, run through 13 aesenc steps with the keys at offsets
          16..208, and finished with aesenclast using the key at offset 224.
          The result is stored to output and becomes the next chaining value.

          The chaining value is read from offset 256 of cx on entry and the
          last ciphertext block is written back there on exit.  When inputLen
          is 0 the function returns immediately and neither cx nor output is
          touched.

          Neither outputLen nor maxOutputLen is read here; %rdx is overwritten
          with the address of the chaining value.

          NOTE(review): the loop exits on an equality compare against
          inputLen, so inputLen is presumably always a multiple of 16 -
          confirm against the caller.
   2175 */
   2176 .type intel_aes_encrypt_cbc_256,@function
   2177 .globl intel_aes_encrypt_cbc_256
   2178 .align	16
   2179 intel_aes_encrypt_cbc_256:
   2180 testl	%r9d, %r9d	/* inputLen == 0: nothing to do */
   2181 je	2f
   2182 /* %rdx = address of the chaining value inside cx (cx + 256) */
   2183 //	leaq	IV_OFFSET(%rdi), %rdx
   2184 leaq	256(%rdi), %rdx
   2185 /* Register roles: xmm0 = chaining value, xmm1 = block being encrypted,
           xmm2..xmm7 = keys at 16..96, xmm9..xmm15 = keys at 128..224.
           All 16 xmm registers are in use, so xmm8 alternates between the key
           at offset 0 and the key at offset 112 (which is why 112 is not
           loaded below). */
   2186 movdqu	(%rdx), %xmm0
   2187 movdqu	(%rdi), %xmm8
   2188 movdqu	16(%rdi), %xmm2
   2189 movdqu	32(%rdi), %xmm3
   2190 movdqu	48(%rdi), %xmm4
   2191 movdqu	64(%rdi), %xmm5
   2192 movdqu	80(%rdi), %xmm6
   2193 movdqu	96(%rdi), %xmm7
   2194 movdqu	128(%rdi), %xmm9
   2195 movdqu	144(%rdi), %xmm10
   2196 movdqu	160(%rdi), %xmm11
   2197 movdqu	176(%rdi), %xmm12
   2198 movdqu	192(%rdi), %xmm13
   2199 movdqu	208(%rdi), %xmm14
   2200 movdqu	224(%rdi), %xmm15
   2201 
   2202 xorl	%eax, %eax	/* %eax = byte offset into input and output; the 32-bit write also clears the top of %rax */
   2203 1:	movdqu	(%r8, %rax), %xmm1
   2204 pxor	%xmm0, %xmm1	/* xor in the chaining value (previous ciphertext block) */
   2205 pxor	%xmm8, %xmm1	/* xor in the key at offset 0 */
   2206 movdqu	112(%rdi), %xmm8	/* xmm8 = key at offset 112, needed by the 7th aesenc below */
   2207 .byte 0x66,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm2, %xmm1 */
   2208 .byte 0x66,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm3, %xmm1 */
   2209 .byte 0x66,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm4, %xmm1 */
   2210 .byte 0x66,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm5, %xmm1 */
   2211 .byte 0x66,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm6, %xmm1 */
   2212 .byte 0x66,0x0f,0x38,0xdc,0xcf	/* aesenc	%xmm7, %xmm1 */
   2213 .byte 0x66,0x41,0x0f,0x38,0xdc,0xc8	/* aesenc	%xmm8, %xmm1 */
   2214 movdqu	(%rdi), %xmm8	/* put the key at offset 0 back in xmm8 for the next iteration */
   2215 .byte 0x66,0x41,0x0f,0x38,0xdc,0xc9	/* aesenc	%xmm9, %xmm1 */
   2216 .byte 0x66,0x41,0x0f,0x38,0xdc,0xca	/* aesenc	%xmm10, %xmm1 */
   2217 .byte 0x66,0x41,0x0f,0x38,0xdc,0xcb	/* aesenc	%xmm11, %xmm1 */
   2218 .byte 0x66,0x41,0x0f,0x38,0xdc,0xcc	/* aesenc	%xmm12, %xmm1 */
   2219 .byte 0x66,0x41,0x0f,0x38,0xdc,0xcd	/* aesenc	%xmm13, %xmm1 */
   2220 .byte 0x66,0x41,0x0f,0x38,0xdc,0xce	/* aesenc	%xmm14, %xmm1 */
   2221 .byte 0x66,0x41,0x0f,0x38,0xdd,0xcf	/* aesenclast %xmm15, %xmm1 */
   2222 movdqu	%xmm1, (%rsi, %rax)
   2223 movdqa	%xmm1, %xmm0	/* this ciphertext block is the chaining value for the next block */
   2224 addl	$16, %eax
   2225 cmpl	%eax, %r9d	/* loop until the offset equals inputLen exactly */
   2226 jne	1b
   2227 
   2228 movdqu	%xmm0, (%rdx)	/* write the last ciphertext block back to cx + 256 */
   2229 
   2230 2:	xor	%eax, %eax	/* return 0 */
   2231 ret
   2232 .size intel_aes_encrypt_cbc_256, .-intel_aes_encrypt_cbc_256
   2233 
   2234 
   2235 /* in %rdi : cx - context
   2236   in %rsi : output - pointer to output buffer
   2237   in %rdx : outputLen - pointer to variable for length of output
   2238             (already filled in by caller)
   2239   in %ecx : maxOutputLen - length of output buffer
   2240             (already checked by caller)
   2241   in %r8  : input - pointer to input buffer
   2242   in %r9d : inputLen - length of input buffer
   2243   on stack: blocksize - AES blocksize (always 16, unused)
          out %eax: always 0

          CBC decryption.  The chaining value is read from offset 256 of cx
          into %xmm0.  At label 5, %xmm0 is written back to the same place: by
          then it holds the last ciphertext block read, or the unchanged
          initial value when inputLen is 0.

          While at least 128 bytes remain, loop 2 handles eight blocks per
          iteration; whatever is left is handled one block at a time by loop 4.
          Each block is xored with the key at offset 224 of cx, run through 13
          aesdec steps with the keys at offsets 208 down to 16, finished with
          aesdeclast using the key at offset 0, and then xored with the
          previous ciphertext block (the chaining value for the first block).

          Both loops reload the ciphertext they still need for chaining before
          storing any plaintext, so the in-place case (input == output) works.

          Neither outputLen nor maxOutputLen is read here; %rdx is overwritten
          with the address of the chaining value.

          NOTE(review): loop 4 exits on an equality compare against inputLen,
          so inputLen is presumably always a multiple of 16 - confirm against
          the caller.
   2244 */
   2245 .type intel_aes_decrypt_cbc_256,@function
   2246 .globl intel_aes_decrypt_cbc_256
   2247 .align	16
   2248 intel_aes_decrypt_cbc_256:
   2249 //	leaq	IV_OFFSET(%rdi), %rdx
   2250 leaq	256(%rdi), %rdx
   2251 /* xmm0 = chaining value, xmm2 = key at offset 0 (for aesdeclast in loop 2), xmm15 = key at offset 224 (xored in first) */
   2252 movdqu	(%rdx), %xmm0
   2253 movdqu	(%rdi), %xmm2
   2254 movdqu	224(%rdi), %xmm15
   2255 xorl	%eax, %eax	/* %eax = byte offset into input and output; the 32-bit write also clears the top of %rax */
   2256 //	cmpl	$8*16, %r9d
   2257 cmpl	$128, %r9d
   2258 jb	1f	/* fewer than eight blocks in total: go straight to the single-block path */
   2259 //	leal	-8*16(%r9), %r11d
   2260 leal	-128(%r9), %r11d	/* %r11d = inputLen - 128: largest offset at which eight whole blocks still remain */
   2261 2:	movdqu  (%r8, %rax), %xmm3
   2262 movdqu	16(%r8, %rax), %xmm4
   2263 movdqu	32(%r8, %rax), %xmm5
   2264 movdqu	48(%r8, %rax), %xmm6
   2265 movdqu	64(%r8, %rax), %xmm7
   2266 movdqu	80(%r8, %rax), %xmm8
   2267 movdqu	96(%r8, %rax), %xmm9
   2268 movdqu	112(%r8, %rax), %xmm10
   2269 pxor	%xmm15, %xmm3	/* xor the key at offset 224 into all eight blocks */
   2270 pxor	%xmm15, %xmm4
   2271 pxor	%xmm15, %xmm5
   2272 pxor	%xmm15, %xmm6
   2273 pxor	%xmm15, %xmm7
   2274 pxor	%xmm15, %xmm8
   2275 pxor	%xmm15, %xmm9
   2276 pxor	%xmm15, %xmm10
   2277 /* Each group below loads two keys into xmm1 and xmm11 and applies aesdec with each of them to all eight blocks (xmm3..xmm10). */
   2278 // complete loop unrolling
   2279 movdqu 208(%rdi), %xmm1	/* keys at offsets 208 and 192 */
   2280 movdqu 192(%rdi), %xmm11
   2281 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   2282 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   2283 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   2284 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   2285 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   2286 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   2287 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   2288 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   2289 .byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   2290 .byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   2291 .byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   2292 .byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   2293 .byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   2294 .byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   2295 .byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   2296 .byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   2297 
   2298 movdqu 176(%rdi), %xmm1	/* keys at offsets 176 and 160 */
   2299 movdqu 160(%rdi), %xmm11
   2300 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   2301 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   2302 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   2303 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   2304 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   2305 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   2306 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   2307 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   2308 .byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   2309 .byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   2310 .byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   2311 .byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   2312 .byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   2313 .byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   2314 .byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   2315 .byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   2316 
   2317 movdqu 144(%rdi), %xmm1	/* keys at offsets 144 and 128 */
   2318 movdqu 128(%rdi), %xmm11
   2319 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   2320 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   2321 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   2322 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   2323 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   2324 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   2325 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   2326 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   2327 .byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   2328 .byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   2329 .byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   2330 .byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   2331 .byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   2332 .byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   2333 .byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   2334 .byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   2335 
   2336 movdqu 112(%rdi), %xmm1	/* keys at offsets 112 and 96 */
   2337 movdqu 96(%rdi), %xmm11
   2338 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   2339 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   2340 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   2341 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   2342 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   2343 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   2344 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   2345 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   2346 .byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   2347 .byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   2348 .byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   2349 .byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   2350 .byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   2351 .byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   2352 .byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   2353 .byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   2354 
   2355 movdqu 80(%rdi), %xmm1	/* keys at offsets 80 and 64 */
   2356 movdqu 64(%rdi), %xmm11
   2357 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   2358 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   2359 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   2360 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   2361 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   2362 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   2363 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   2364 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   2365 .byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   2366 .byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   2367 .byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   2368 .byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   2369 .byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   2370 .byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   2371 .byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   2372 .byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   2373 
   2374 movdqu 48(%rdi), %xmm1	/* keys at offsets 48 and 32 */
   2375 movdqu 32(%rdi), %xmm11
   2376 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   2377 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   2378 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   2379 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   2380 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   2381 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   2382 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   2383 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   2384 .byte 0x66,0x41,0x0f,0x38,0xde,0xdb	/* aesdec	%xmm11, %xmm3 */
   2385 .byte 0x66,0x41,0x0f,0x38,0xde,0xe3	/* aesdec	%xmm11, %xmm4 */
   2386 .byte 0x66,0x41,0x0f,0x38,0xde,0xeb	/* aesdec	%xmm11, %xmm5 */
   2387 .byte 0x66,0x41,0x0f,0x38,0xde,0xf3	/* aesdec	%xmm11, %xmm6 */
   2388 .byte 0x66,0x41,0x0f,0x38,0xde,0xfb	/* aesdec	%xmm11, %xmm7 */
   2389 .byte 0x66,0x45,0x0f,0x38,0xde,0xc3	/* aesdec	%xmm11, %xmm8 */
   2390 .byte 0x66,0x45,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm9 */
   2391 .byte 0x66,0x45,0x0f,0x38,0xde,0xd3	/* aesdec	%xmm11, %xmm10 */
   2392 
   2393 movdqu 16(%rdi), %xmm1	/* last aesdec uses the key at offset 16; aesdeclast then uses xmm2 (key at offset 0) */
   2394 .byte 0x66,0x0f,0x38,0xde,0xd9		/* aesdec	%xmm1, %xmm3 */
   2395 .byte 0x66,0x0f,0x38,0xde,0xe1		/* aesdec	%xmm1, %xmm4 */
   2396 .byte 0x66,0x0f,0x38,0xde,0xe9		/* aesdec	%xmm1, %xmm5 */
   2397 .byte 0x66,0x0f,0x38,0xde,0xf1		/* aesdec	%xmm1, %xmm6 */
   2398 .byte 0x66,0x0f,0x38,0xde,0xf9		/* aesdec	%xmm1, %xmm7 */
   2399 .byte 0x66,0x44,0x0f,0x38,0xde,0xc1	/* aesdec	%xmm1, %xmm8 */
   2400 .byte 0x66,0x44,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm1, %xmm9 */
   2401 .byte 0x66,0x44,0x0f,0x38,0xde,0xd1	/* aesdec	%xmm1, %xmm10 */
   2402 .byte 0x66,0x0f,0x38,0xdf,0xda		/* aesdeclast 	%xmm2, %xmm3 */
   2403 .byte 0x66,0x0f,0x38,0xdf,0xe2		/* aesdeclast 	%xmm2, %xmm4 */
   2404 .byte 0x66,0x0f,0x38,0xdf,0xea		/* aesdeclast 	%xmm2, %xmm5 */
   2405 .byte 0x66,0x0f,0x38,0xdf,0xf2		/* aesdeclast 	%xmm2, %xmm6 */
   2406 .byte 0x66,0x0f,0x38,0xdf,0xfa		/* aesdeclast 	%xmm2, %xmm7 */
   2407 .byte 0x66,0x44,0x0f,0x38,0xdf,0xc2	/* aesdeclast 	%xmm2, %xmm8 */
   2408 .byte 0x66,0x44,0x0f,0x38,0xdf,0xca	/* aesdeclast 	%xmm2, %xmm9 */
   2409 .byte 0x66,0x44,0x0f,0x38,0xdf,0xd2	/* aesdeclast 	%xmm2, %xmm10 */
   2410 /* CBC chaining: block k is xored with ciphertext block k-1, re-read from input through xmm0; the first block uses the incoming xmm0. */
   2411 	pxor	%xmm0, %xmm3
   2412 movdqu	(%r8, %rax), %xmm0
   2413 pxor	%xmm0, %xmm4
   2414 movdqu	16(%r8, %rax), %xmm0
   2415 pxor	%xmm0, %xmm5
   2416 movdqu	32(%r8, %rax), %xmm0
   2417 pxor	%xmm0, %xmm6
   2418 movdqu	48(%r8, %rax), %xmm0
   2419 pxor	%xmm0, %xmm7
   2420 movdqu	64(%r8, %rax), %xmm0
   2421 pxor	%xmm0, %xmm8
   2422 movdqu	80(%r8, %rax), %xmm0
   2423 pxor	%xmm0, %xmm9
   2424 movdqu	96(%r8, %rax), %xmm0
   2425 pxor	%xmm0, %xmm10
   2426 movdqu	112(%r8, %rax), %xmm0	/* eighth ciphertext block: chaining value for whatever is decrypted next */
   2427 movdqu	%xmm3, (%rsi, %rax)	/* stores start only after every input reload above, so input == output is safe */
   2428 movdqu	%xmm4, 16(%rsi, %rax)
   2429 movdqu	%xmm5, 32(%rsi, %rax)
   2430 movdqu	%xmm6, 48(%rsi, %rax)
   2431 movdqu	%xmm7, 64(%rsi, %rax)
   2432 movdqu	%xmm8, 80(%rsi, %rax)
   2433 movdqu	%xmm9, 96(%rsi, %rax)
   2434 movdqu	%xmm10, 112(%rsi, %rax)
   2435 //	addl	$8*16, %eax
   2436 addl	$128, %eax
   2437 cmpl	%r11d, %eax	/* unsigned: repeat while offset <= inputLen - 128 */
   2438 jbe	2b
   2439 1:	cmpl	%eax, %r9d	/* everything consumed (including inputLen == 0)? */
   2440 je	5f
   2441 /* Single-block path, fewer than 128 bytes left: keep the keys at 16..208 in xmm2..xmm14.  xmm2 no longer holds the key at offset 0; loop 4 borrows xmm8 for it. */
   2442 movdqu	16(%rdi), %xmm2
   2443 movdqu	32(%rdi), %xmm3
   2444 movdqu	48(%rdi), %xmm4
   2445 movdqu	64(%rdi), %xmm5
   2446 movdqu	80(%rdi), %xmm6
   2447 movdqu	96(%rdi), %xmm7
   2448 movdqu	112(%rdi), %xmm8
   2449 movdqu	128(%rdi), %xmm9
   2450 movdqu	144(%rdi), %xmm10
   2451 movdqu	160(%rdi), %xmm11
   2452 movdqu	176(%rdi), %xmm12
   2453 movdqu	192(%rdi), %xmm13
   2454 movdqu	208(%rdi), %xmm14
   2455 
   2456 4:	movdqu	(%r8, %rax), %xmm1
   2457 pxor	%xmm15, %xmm1	/* xor in the key at offset 224 */
   2458 .byte 0x66,0x41,0x0f,0x38,0xde,0xce	/* aesdec	%xmm14, %xmm1 */
   2459 .byte 0x66,0x41,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm13, %xmm1 */
   2460 .byte 0x66,0x41,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm12, %xmm1 */
   2461 .byte 0x66,0x41,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm11, %xmm1 */
   2462 .byte 0x66,0x41,0x0f,0x38,0xde,0xca	/* aesdec	%xmm10, %xmm1 */
   2463 .byte 0x66,0x41,0x0f,0x38,0xde,0xc9	/* aesdec	%xmm9, %xmm1 */
   2464 .byte 0x66,0x41,0x0f,0x38,0xde,0xc8	/* aesdec	%xmm8, %xmm1 */
   2465 movdqu	(%rdi), %xmm8	/* the key at offset 112 has been used; reuse xmm8 for the key at offset 0 */
   2466 .byte 0x66,0x0f,0x38,0xde,0xcf	/* aesdec	%xmm7, %xmm1 */
   2467 .byte 0x66,0x0f,0x38,0xde,0xce	/* aesdec	%xmm6, %xmm1 */
   2468 .byte 0x66,0x0f,0x38,0xde,0xcd	/* aesdec	%xmm5, %xmm1 */
   2469 .byte 0x66,0x0f,0x38,0xde,0xcc	/* aesdec	%xmm4, %xmm1 */
   2470 .byte 0x66,0x0f,0x38,0xde,0xcb	/* aesdec	%xmm3, %xmm1 */
   2471 .byte 0x66,0x0f,0x38,0xde,0xca	/* aesdec	%xmm2, %xmm1 */
   2472 .byte 0x66,0x41,0x0f,0x38,0xdf,0xc8	/* aesdeclast %xmm8, %xmm1 */
   2473 movdqu	112(%rdi), %xmm8	/* restore the key at offset 112 for the next iteration */
   2474 pxor	%xmm0, %xmm1	/* CBC chaining: xor with the previous ciphertext block */
   2475 movdqu	(%r8, %rax), %xmm0  /* fetch the IV before we store the block */
   2476 movdqu	%xmm1, (%rsi, %rax) /* in case input buf = output buf */
   2477 addl	$16, %eax
   2478 cmpl	%eax, %r9d	/* loop until the offset equals inputLen exactly */
   2479 jne	4b
   2480 
   2481 5:	movdqu	%xmm0, (%rdx)	/* write the final chaining value back to cx + 256 */
   2482 
   2483 xor	%eax, %eax	/* return 0 */
   2484 ret
   2485 .size intel_aes_decrypt_cbc_256, .-intel_aes_decrypt_cbc_256