jchuff-sse2.asm (31508B)
;
; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2)
;
; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, 2023-2024, D. R. Commander.
; Copyright (C) 2015, Matthieu Darbois.
; Copyright (C) 2018, Matthias Räncker.
; Copyright (C) 2023, Aliaksiej Kandracienka.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.
;
; This file contains an SSE2 implementation for Huffman coding of one block.
; The following code is based on jchuff.c; see jchuff.c for more details.

%include "jsimdext.inc"

; Field offsets used by this file to reach into the caller's working_state.
; Only .cur.put_buffer.simd and .cur.free_bits are accessed by the code in this
; file.
; NOTE(review): presumably this layout must mirror the C structure in
; jchuff.c -- confirm before changing either side.
struc working_state
.next_output_byte:   resp 1     ; => next byte to write in buffer
.free_in_buffer:     resp 1     ; # of byte spaces remaining in buffer
.cur.put_buffer.simd resq 1     ; current bit accumulation buffer
.cur.free_bits       resd 1     ; # of bits available in it
.cur.last_dc_val     resd 4     ; last DC coef for each component
.cinfo:              resp 1     ; dump_buffer needs access to this
endstruc

; Field offsets of a derived Huffman table: 256 dword codes followed by 256
; byte code lengths, both indexed by symbol.
struc c_derived_tbl
.ehufco:             resd 256   ; code for each symbol
.ehufsi:             resb 256   ; length of code for each symbol
; If no code has been allocated for a symbol S, ehufsi[S] contains 0
endstruc

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    ALIGNZ      32
    GLOBAL_DATA(jconst_huff_encode_one_block)

EXTN(jconst_huff_encode_one_block):

; jpeg_mask_bits[n] = (1 << n) - 1 for n = 0..15.  It is read through the
; MASK_BITS() macro below.
jpeg_mask_bits dd 0x0000, 0x0001, 0x0003, 0x0007
               dd 0x000f, 0x001f, 0x003f, 0x007f
               dd 0x00ff, 0x01ff, 0x03ff, 0x07ff
               dd 0x0fff, 0x1fff, 0x3fff, 0x7fff

    ALIGNZ      32

; jpeg_nbits_table is indexed with a SIGNED offset relative to its label (the
; code computes nbits_base + code with a sign-extended code.)  The 32768 bytes
; emitted immediately before the label serve the negative indices:
;   [-1] = 0, [-2] = 1, [-3..-4] = 2, [-5..-8] = 3, ..., [-16385..-32768] = 15
; The ordering and contiguity of the two halves are therefore significant; do
; not insert anything between them.
times 1 << 14 db 15
times 1 << 13 db 14
times 1 << 12 db 13
times 1 << 11 db 12
times 1 << 10 db 11
times 1 <<  9 db 10
times 1 <<  8 db  9
times 1 <<  7 db  8
times 1 <<  6 db  7
times 1 <<  5 db  6
times 1 <<  4 db  5
times 1 <<  3 db  4
times 1 <<  2 db  3
times 1 <<  1 db  2
times 1 <<  0 db  1
times 1       db  0
GLOBAL_DATA(jpeg_nbits_table)
EXTN(jpeg_nbits_table):
; The 65536 bytes from the label onward serve the non-negative indices:
;   [0] = 0, [1] = 1, [2..3] = 2, [4..7] = 3, ..., [32768..65535] = 16
times 1       db  0
times 1 <<  0 db  1
times 1 <<  1 db  2
times 1 <<  2 db  3
times 1 <<  3 db  4
times 1 <<  4 db  5
times 1 <<  5 db  6
times 1 <<  6 db  7
times 1 <<  7 db  8
times 1 <<  8 db  9
times 1 <<  9 db 10
times 1 << 10 db 11
times 1 << 11 db 12
times 1 << 12 db 13
times 1 << 13 db 14
times 1 << 14 db 15
times 1 << 15 db 16

    ALIGNZ      32

; NBITS(x) is the address of jpeg_nbits_table[x], given that the nbits_base
; register holds the address of jpeg_nbits_table (the function loads it with
; lea.)  MASK_BITS(x) is the address of jpeg_mask_bits[x], reached from the
; same base register by adding the assemble-time distance between the two
; tables, so that no second base register is needed.
%define NBITS(x)      nbits_base + x
%define MASK_BITS(x)  NBITS((x) * 4) + (jpeg_mask_bits - EXTN(jpeg_nbits_table))

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64

; Shorthand used to describe SIMD operations:
; wN:     xmmN treated as eight signed 16-bit values
; wN[i]:  perform the same operation on all eight signed 16-bit values, i=0..7
; bN:     xmmN treated as 16 unsigned 8-bit values
; bN[i]:  perform the same operation on all 16 unsigned 8-bit values, i=0..15
; Contents of SIMD registers are shown in memory order.

; Fill the bit buffer to capacity with the leading bits from code, then output
; the bit buffer and put the remaining bits from code into the bit buffer.
;
; Usage:
; code - contains the bits to shift into the bit buffer (LSB-aligned)
; %1 - the label to which to jump when the macro completes
; %2 (optional) - extra instructions to execute after nbits has been set
;
; Upon completion, free_bits will be set to the number of remaining bits from
; code, and put_buffer will contain those remaining bits.  temp and code will
; be clobbered.
;
; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
; macro in jchuff.c.

; EMIT_QWORD done_label [, { extra instructions }]
;
; Entry conditions (established by every call site in this file):
;   free_bits has already had nbits subtracted from it and is <= 0, so
;   nbits + free_bits is the number of leading bits of code that still fit in
;   put_buffer, and -free_bits is the number of bits of code left over.
;   xmm1 is compared byte-wise against the assembled qword; the function sets
;   every byte of xmm1 to 0xFF before the first expansion.
; Registers written: temp, code, nbits, free_bits, put_buffer, buffer, xmm0,
;   flags.
; Control never falls out of the bottom of the macro: both exits jump to %1.
%macro EMIT_QWORD 1-2
    add         nbitsb, free_bitsb      ; nbits += free_bits;
    neg         free_bitsb              ; free_bits = -free_bits;
    mov         tempd, code             ; temp = code;
    shl         put_buffer, nbitsb      ; put_buffer <<= nbits;
    mov         nbitsb, free_bitsb      ; nbits = free_bits;
    neg         free_bitsb              ; free_bits = -free_bits;
    shr         tempd, nbitsb           ; temp >>= nbits;
    or          tempq, put_buffer       ; temp |= put_buffer;
    movq        xmm0, tempq             ; xmm0.u64 = { temp, 0 };
    bswap       tempq                   ; temp = htonl(temp);
                                        ; (64-bit byte swap, so that the qword
                                        ; store below writes the earliest bits
                                        ; at the lowest address)
    mov         put_buffer, codeq       ; put_buffer = code;
    pcmpeqb     xmm0, xmm1              ; b0[i] = (b0[i] == 0xFF ? 0xFF : 0);
    %2
    pmovmskb    code, xmm0              ; code = 0;  code |= ((b0[i] >> 7) << i);
    mov         qword [buffer], tempq   ; memcpy(buffer, &temp, 8);
                                        ; (speculative; will be overwritten if
                                        ; code contains any 0xFF bytes)
    add         free_bitsb, 64          ; free_bits += 64;
    add         bufferp, 8              ; buffer += 8;
    test        code, code              ; if (code == 0)  /* No 0xFF bytes */
    jz          %1                      ;   return;
    ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
    ; bytes in the qword.
    ;
    ; Each step stores one byte, stores a zero right after it, and then
    ; advances buffer by 1 if the byte was < 0xFF (so that the next byte
    ; overwrites the zero) or by 2 if it was 0xFF (so that the zero is kept.)
    ; The advance is done with sbb, which consumes the carry produced by the
    ; preceding cmp; the mov in between does not modify the flags.  The first
    ; byte is already in place from the speculative qword store above, which
    ; is why the first step only rewinds buffer instead of storing.
    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
    mov         byte [buffer-7], 0      ; buffer[-7] = 0;
    sbb         bufferp, 6              ; buffer -= (6 + (temp[0] < 0xFF ? 1 : 0));
    mov         byte [buffer], temph    ; buffer[0] = temp[1];
    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
    shr         tempq, 16               ; temp >>= 16;
    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
    mov         byte [buffer], temph    ; buffer[0] = temp[1];
    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
    shr         tempq, 16               ; temp >>= 16;
    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
    mov         byte [buffer], temph    ; buffer[0] = temp[1];
    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
    shr         tempd, 16               ; temp >>= 16;
                                        ; (only 32 bits of temp remain at this
                                        ; point, so the 32-bit form suffices)
    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
    mov         byte [buffer], temph    ; buffer[0] = temp[1];
    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
    jmp         %1                      ; return;
%endmacro

;
; Encode a single block's worth of coefficients.
;
; GLOBAL(JOCTET *)
; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
;                                  JCOEFPTR block, int last_dc_val,
;                                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
;
; NOTES:
; When shuffling data, we try to avoid pinsrw as much as possible, since it is
; slow on many CPUs.  Its reciprocal throughput (issue latency) is 1 even on
; modern CPUs, so chains of pinsrw instructions (even with different outputs)
; can limit performance.  pinsrw is a VectorPath instruction on AMD K8 and
; requires 2 µops (with memory operand) on Intel.  In either case, only one
; pinsrw instruction can be decoded per cycle (and nothing else if they are
; back-to-back), so out-of-order execution cannot be used to work around long
; pinsrw chains (though for Sandy Bridge and later, this may be less of a
; problem if the code runs from the µop cache.)
;
; We use tzcnt instead of bsf without checking for support.
; The instruction is executed as bsf on CPUs that don't support tzcnt (the
; encoding is equivalent to rep bsf.)  The destination (first) operand of bsf
; (and tzcnt on some CPUs) is an input dependency (although the behavior is
; not formally defined, Intel CPUs usually leave the destination unmodified if
; the source is zero.)  This can prevent out-of-order execution, so we clear
; the destination before invoking tzcnt.
;
; Initial register allocation
; rax - buffer
; rbx - temp
; rcx - nbits
; rdx - code
; rsi - nbits_base
; rdi - t
; r8  - dctbl --> code_temp
; r9  - actbl
; r10 - state
; r11 - index
; r12 - put_buffer
; r15 - block --> free_bits
;
; r8 and r15 are each reused for a second purpose once their first value is
; dead; the switch is made with %undef/%define at the corresponding points in
; the function body.

%define buffer       rax
%ifdef WIN64
%define bufferp      rax
%else
%define bufferp      raxp
%endif
%define tempq        rbx
%define tempd        ebx
%define tempb        bl
%define temph        bh
%define nbitsq       rcx
%define nbits        ecx
%define nbitsb       cl
%define codeq        rdx
%define code         edx
%define nbits_base   rsi
%define t            rdi
%define td           edi
%define dctbl        r8
%define actbl        r9
%define state        r10
%define index        r11
%define indexd       r11d
%define put_buffer   r12
%define put_bufferd  r12d
%define block        r15

; Step 1: Re-arrange input data according to jpeg_natural_order
; xx 01 02 03 04 05 06 07      xx 01 08 16 09 02 03 10
; 08 09 10 11 12 13 14 15      17 24 32 25 18 11 04 05
; 16 17 18 19 20 21 22 23      12 19 26 33 40 48 41 34
; 24 25 26 27 28 29 30 31 ==>  27 20 13 06 07 14 21 28
; 32 33 34 35 36 37 38 39      35 42 49 56 57 50 43 36
; 40 41 42 43 44 45 46 47      29 22 15 23 30 37 44 51
; 48 49 50 51 52 53 54 55      58 59 52 45 38 31 39 46
; 56 57 58 59 60 61 62 63      53 60 61 54 47 55 62 63
;
; Legend for the tags in the comments below: A..H mark the SIMD instructions
; that build rows 0..7 of the re-arranged block, shifted by one coefficient so
; that the DC coefficient is excluded (see the "(Row n, offset 1)" markers.)
; Z marks the scalar instructions that are interleaved with them.  For every
; row, the code
;   1. subtracts 1 from each negative coefficient (pcmpgtw + paddw),
;   2. stores the row into t[], and
;   3. computes a per-coefficient "is zero" mask (pcmpeqw) that is later
;      packed into the 64-bit index register.

    align       32
    GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)

EXTN(jsimd_huff_encode_one_block_sse2):
    ENDBR64
    push        rbp
    mov         rbp, rsp

%ifdef WIN64

; rcx = working_state *state
; rdx = JOCTET *buffer
; r8 = JCOEFPTR block
; r9 = int last_dc_val
; [rbp+48] = c_derived_tbl *dctbl
; [rbp+56] = c_derived_tbl *actbl

                                                           ;X: X = code stream
    mov         buffer, rdx
    push        r15
    mov         block, r8
    movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]    ;D: w3 = xx 01 02 03 04 05 06 07
    push        rbx
    movdqa      xmm0, xmm3                                 ;A: w0 = xx 01 02 03 04 05 06 07
    push        rsi
    push        rdi
    push        r12
    movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD]    ;B: w1 = 08 09 10 11 12 13 14 15
    mov         state, rcx
    movsx       code, word [block]                         ;Z:     code = block[0];
    pxor        xmm4, xmm4                                 ;A: w4[i] = 0;
    sub         code, r9d                                  ;Z:     code -= last_dc_val;
    mov         dctbl, POINTER [rbp+48]
    mov         actbl, POINTER [rbp+56]
    punpckldq   xmm0, xmm1                                 ;A: w0 = xx 01 08 09 02 03 10 11
    lea         nbits_base, [rel EXTN(jpeg_nbits_table)]

%else

; rdi = working_state *state
; rsi = JOCTET *buffer
; rdx = JCOEFPTR block
; rcx = int last_dc_val
; r8 = c_derived_tbl *dctbl
; r9 = c_derived_tbl *actbl

                                                           ;X: X = code stream
    push        r15
    mov         block, rdx
    movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]    ;D: w3 = xx 01 02 03 04 05 06 07
    push        rbx
    movdqa      xmm0, xmm3                                 ;A: w0 = xx 01 02 03 04 05 06 07
    push        r12
    mov         state, rdi
    mov         buffer, rsi
    movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD]    ;B: w1 = 08 09 10 11 12 13 14 15
    movsx       codeq, word [block]                        ;Z:     code = block[0];
    lea         nbits_base, [rel EXTN(jpeg_nbits_table)]
    pxor        xmm4, xmm4                                 ;A: w4[i] = 0;
    ; The subtraction is done at 64-bit width here, but only the low 32 bits
    ; of the result are consumed: the cmp/adc below operate on the 32-bit
    ; code register, and movsxd then re-extends it.
    sub         codeq, rcx                                 ;Z:     code -= last_dc_val;
    punpckldq   xmm0, xmm1                                 ;A: w0 = xx 01 08 09 02 03 10 11

%endif

    ; Allocate stack space for t array, and realign stack.
    ; (t[] is written with movaps below, which requires 16-byte alignment.
    ; The same amount is released just before the registers are popped.)
    add         rsp, -DCTSIZE2 * SIZEOF_WORD - 8
    mov         t, rsp

    pshuflw     xmm0, xmm0, 11001001b                      ;A: w0 = 01 08 xx 09 02 03 10 11
    pinsrw      xmm0, word [block + 16 * SIZEOF_WORD], 2   ;A: w0 = 01 08 16 09 02 03 10 11
    punpckhdq   xmm3, xmm1                                 ;D: w3 = 04 05 12 13 06 07 14 15
    punpcklqdq  xmm1, xmm3                                 ;B: w1 = 08 09 10 11 04 05 12 13
    pinsrw      xmm0, word [block + 17 * SIZEOF_WORD], 7   ;A: w0 = 01 08 16 09 02 03 10 17
                                                           ;A:      (Row 0, offset 1)
    pcmpgtw     xmm4, xmm0                                 ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
    paddw       xmm0, xmm4                                 ;A: w0[i] += w4[i];
    movaps      XMMWORD [t + 0 * SIZEOF_WORD], xmm0        ;A: t[i] = w0[i];

    movq        xmm2, qword [block + 24 * SIZEOF_WORD]     ;B: w2 = 24 25 26 27 -- -- -- --
    pshuflw     xmm2, xmm2, 11011000b                      ;B: w2 = 24 26 25 27 -- -- -- --
    pslldq      xmm1, 1 * SIZEOF_WORD                      ;B: w1 = -- 08 09 10 11 04 05 12
    movups      xmm5, XMMWORD [block + 48 * SIZEOF_WORD]   ;H: w5 = 48 49 50 51 52 53 54 55
    movsd       xmm1, xmm2                                 ;B: w1 = 24 26 25 27 11 04 05 12
    punpcklqdq  xmm2, xmm5                                 ;C: w2 = 24 26 25 27 48 49 50 51
    pinsrw      xmm1, word [block + 32 * SIZEOF_WORD], 1   ;B: w1 = 24 32 25 27 11 04 05 12
    pxor        xmm4, xmm4                                 ;A: w4[i] = 0;
    psrldq      xmm3, 2 * SIZEOF_WORD                      ;D: w3 = 12 13 06 07 14 15 -- --
    pcmpeqw     xmm0, xmm4                                 ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
    pinsrw      xmm1, word [block + 18 * SIZEOF_WORD], 3   ;B: w1 = 24 32 25 18 11 04 05 12
                                                           ;        (Row 1, offset 1)
    pcmpgtw     xmm4, xmm1                                 ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
    paddw       xmm1, xmm4                                 ;B: w1[i] += w4[i];
    movaps      XMMWORD [t + 8 * SIZEOF_WORD], xmm1        ;B: t[i+8] = w1[i];
    pxor        xmm4, xmm4                                 ;B: w4[i] = 0;
    pcmpeqw     xmm1, xmm4                                 ;B: w1[i] = (w1[i] == 0 ? -1 : 0);

    packsswb    xmm0, xmm1                                 ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
                                                           ;    w/ signed saturation

    pinsrw      xmm3, word [block + 20 * SIZEOF_WORD], 0   ;D: w3 = 20 13 06 07 14 15 -- --
    pinsrw      xmm3, word [block + 21 * SIZEOF_WORD], 5   ;D: w3 = 20 13 06 07 14 21 -- --
    pinsrw      xmm3, word [block + 28 * SIZEOF_WORD], 6   ;D: w3 = 20 13 06 07 14 21 28 --
    pinsrw      xmm3, word [block + 35 * SIZEOF_WORD], 7   ;D: w3 = 20 13 06 07 14 21 28 35
                                                           ;        (Row 3, offset 1)
    pcmpgtw     xmm4, xmm3                                 ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
    paddw       xmm3, xmm4                                 ;D: w3[i] += w4[i];
    movaps      XMMWORD [t + 24 * SIZEOF_WORD], xmm3       ;D: t[i+24] = w3[i];
    pxor        xmm4, xmm4                                 ;D: w4[i] = 0;
    pcmpeqw     xmm3, xmm4                                 ;D: w3[i] = (w3[i] == 0 ? -1 : 0);

    pinsrw      xmm2, word [block + 19 * SIZEOF_WORD], 0   ;C: w2 = 19 26 25 27 48 49 50 51
    cmp         code, 1 << 31                              ;Z:     Set CF if code < 0x80000000,
                                                           ;Z:     i.e. if code is positive
    pinsrw      xmm2, word [block + 33 * SIZEOF_WORD], 2   ;C: w2 = 19 26 33 27 48 49 50 51
    pinsrw      xmm2, word [block + 40 * SIZEOF_WORD], 3   ;C: w2 = 19 26 33 40 48 49 50 51
    ; pinsrw does not modify the flags, so CF from the cmp above is still
    ; valid here.
    adc         code, -1                                   ;Z:     code += -1 + (code >= 0 ? 1 : 0);
    pinsrw      xmm2, word [block + 41 * SIZEOF_WORD], 5   ;C: w2 = 19 26 33 40 48 41 50 51
    pinsrw      xmm2, word [block + 34 * SIZEOF_WORD], 6   ;C: w2 = 19 26 33 40 48 41 34 51
    movsxd      codeq, code                                ;Z:     sign extend code
    pinsrw      xmm2, word [block + 27 * SIZEOF_WORD], 7   ;C: w2 = 19 26 33 40 48 41 34 27
                                                           ;        (Row 2, offset 1)
    pcmpgtw     xmm4, xmm2                                 ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
    paddw       xmm2, xmm4                                 ;C: w2[i] += w4[i];
    movaps      XMMWORD [t + 16 * SIZEOF_WORD], xmm2       ;C: t[i+16] = w2[i];
    pxor        xmm4, xmm4                                 ;C: w4[i] = 0;
    pcmpeqw     xmm2, xmm4                                 ;C: w2[i] = (w2[i] == 0 ? -1 : 0);

    packsswb    xmm2, xmm3                                 ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
                                                           ;    w/ signed saturation

    movzx       nbitsq, byte [NBITS(codeq)]                ;Z:     nbits = JPEG_NBITS(code);
    movdqa      xmm3, xmm5                                 ;H: w3 = 48 49 50 51 52 53 54 55
    pmovmskb    tempd, xmm2                                ;Z:     temp = 0;  temp |= ((b2[i] >> 7) << i);
    pmovmskb    put_bufferd, xmm0                          ;Z:     put_buffer = 0;  put_buffer |= ((b0[i] >> 7) << i);
    movups      xmm0, XMMWORD [block + 56 * SIZEOF_WORD]   ;H: w0 = 56 57 58 59 60 61 62 63
    punpckhdq   xmm3, xmm0                                 ;H: w3 = 52 53 60 61 54 55 62 63
    shl         tempd, 16                                  ;Z:     temp <<= 16;
    psrldq      xmm3, 1 * SIZEOF_WORD                      ;H: w3 = 53 60 61 54 55 62 63 --
    pxor        xmm2, xmm2                                 ;H: w2[i] = 0;
    or          put_bufferd, tempd                         ;Z:     put_buffer |= temp;
    pshuflw     xmm3, xmm3, 00111001b                      ;H: w3 = 60 61 54 53 55 62 63 --
    movq        xmm1, qword [block + 44 * SIZEOF_WORD]     ;G: w1 = 44 45 46 47 -- -- -- --
    unpcklps    xmm5, xmm0                                 ;E: w5 = 48 49 56 57 50 51 58 59
    pxor        xmm0, xmm0                                 ;H: w0[i] = 0;
    pinsrw      xmm3, word [block + 47 * SIZEOF_WORD], 3   ;H: w3 = 60 61 54 47 55 62 63 --
                                                           ;        (Row 7, offset 1)
    pcmpgtw     xmm2, xmm3                                 ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
    paddw       xmm3, xmm2                                 ;H: w3[i] += w2[i];
    movaps      XMMWORD [t + 56 * SIZEOF_WORD], xmm3       ;H: t[i+56] = w3[i];
    movq        xmm4, qword [block + 36 * SIZEOF_WORD]     ;G: w4 = 36 37 38 39 -- -- -- --
    pcmpeqw     xmm3, xmm0                                 ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
    punpckldq   xmm4, xmm1                                 ;G: w4 = 36 37 44 45 38 39 46 47
    mov         tempd, [dctbl + c_derived_tbl.ehufco + nbitsq * 4]
                                                           ;Z:     temp = dctbl->ehufco[nbits];
    movdqa      xmm1, xmm4                                 ;F: w1 = 36 37 44 45 38 39 46 47
    psrldq      xmm4, 1 * SIZEOF_WORD                      ;G: w4 = 37 44 45 38 39 46 47 --
    shufpd      xmm1, xmm5, 10b                            ;F: w1 = 36 37 44 45 50 51 58 59
    and         code, dword [MASK_BITS(nbitsq)]            ;Z:     code &= (1 << nbits) - 1;
    pshufhw     xmm4, xmm4, 11010011b                      ;G: w4 = 37 44 45 38 -- 39 46 --
    pslldq      xmm1, 1 * SIZEOF_WORD                      ;F: w1 = -- 36 37 44 45 50 51 58
    shl         tempq, nbitsb                              ;Z:     temp <<= nbits;
    pinsrw      xmm4, word [block + 59 * SIZEOF_WORD], 0   ;G: w4 = 59 44 45 38 -- 39 46 --
    pshufd      xmm1, xmm1, 11011000b                      ;F: w1 = -- 36 45 50 37 44 51 58
    pinsrw      xmm4, word [block + 52 * SIZEOF_WORD], 1   ;G: w4 = 59 52 45 38 -- 39 46 --
    or          code, tempd                                ;Z:     code |= temp;
    movlps      xmm1, qword [block + 20 * SIZEOF_WORD]     ;F: w1 = 20 21 22 23 37 44 51 58
    pinsrw      xmm4, word [block + 31 * SIZEOF_WORD], 4   ;G: w4 = 59 52 45 38 31 39 46 --
    pshuflw     xmm1, xmm1, 01110010b                      ;F: w1 = 22 20 23 21 37 44 51 58
    pinsrw      xmm4, word [block + 53 * SIZEOF_WORD], 7   ;G: w4 = 59 52 45 38 31 39 46 53
                                                           ;        (Row 6, offset 1)
    pxor        xmm2, xmm2                                 ;G: w2[i] = 0;
    pcmpgtw     xmm0, xmm4                                 ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
    pinsrw      xmm1, word [block + 15 * SIZEOF_WORD], 1   ;F: w1 = 22 15 23 21 37 44 51 58
    paddw       xmm4, xmm0                                 ;G: w4[i] += w0[i];
    movaps      XMMWORD [t + 48 * SIZEOF_WORD], xmm4       ;G: t[48+i] = w4[i];
    pinsrw      xmm1, word [block + 30 * SIZEOF_WORD], 3   ;F: w1 = 22 15 23 30 37 44 51 58
                                                           ;        (Row 5, offset 1)
    pcmpeqw     xmm4, xmm2                                 ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
    pinsrw      xmm5, word [block + 42 * SIZEOF_WORD], 0   ;E: w5 = 42 49 56 57 50 51 58 59

    packsswb    xmm4, xmm3                                 ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
                                                           ;    w/ signed saturation

    pxor        xmm0, xmm0                                 ;F: w0[i] = 0;
    pinsrw      xmm5, word [block + 43 * SIZEOF_WORD], 5   ;E: w5 = 42 49 56 57 50 43 58 59
    pcmpgtw     xmm2, xmm1                                 ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
    pmovmskb    tempd, xmm4                                ;Z:     temp = 0;  temp |= ((b4[i] >> 7) << i);
    pinsrw      xmm5, word [block + 36 * SIZEOF_WORD], 6   ;E: w5 = 42 49 56 57 50 43 36 59
    paddw       xmm1, xmm2                                 ;F: w1[i] += w2[i];
    movaps      XMMWORD [t + 40 * SIZEOF_WORD], xmm1       ;F: t[40+i] = w1[i];
    pinsrw      xmm5, word [block + 29 * SIZEOF_WORD], 7   ;E: w5 = 42 49 56 57 50 43 36 29
                                                           ;        (Row 4, offset 1)
; The last read through the block pointer is above; r15 is now reused to hold
; free_bits.
%undef block
%define free_bitsq  r15
%define free_bitsd  r15d
%define free_bitsb  r15b
    pcmpeqw     xmm1, xmm0                                 ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
    shl         tempq, 48                                  ;Z:     temp <<= 48;
    pxor        xmm2, xmm2                                 ;E: w2[i] = 0;
    pcmpgtw     xmm0, xmm5                                 ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
    paddw       xmm5, xmm0                                 ;E: w5[i] += w0[i];
    or          tempq, put_buffer                          ;Z:     temp |= put_buffer;
    movaps      XMMWORD [t + 32 * SIZEOF_WORD], xmm5       ;E: t[32+i] = w5[i];
    ; The "dword" keyword requests a 32-bit displacement encoding.
    ; NOTE(review): presumably this is done to pad the instruction stream for
    ; alignment purposes -- confirm before shortening it.
    lea         t, [dword t - 2]                           ;Z:     t = &t[-1];
    pcmpeqw     xmm5, xmm2                                 ;E: w5[i] = (w5[i] == 0 ? -1 : 0);

    packsswb    xmm5, xmm1                                 ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
                                                           ;    w/ signed saturation

    add         nbitsb, byte [dctbl + c_derived_tbl.ehufsi + nbitsq]
                                                           ;Z:     nbits += dctbl->ehufsi[nbits];
; The last read through dctbl is above; r8 is now reused as the 32-bit scratch
; register code_temp.
%undef dctbl
%define code_temp  r8d
    pmovmskb    indexd, xmm5                               ;Z:     index = 0;  index |= ((b5[i] >> 7) << i);
    mov         free_bitsd, [state+working_state.cur.free_bits]
                                                           ;Z:     free_bits = state->cur.free_bits;
    pcmpeqw     xmm1, xmm1                                 ;Z:     b1[i] = 0xFF;
    shl         index, 32                                  ;Z:     index <<= 32;
    mov         put_buffer, [state+working_state.cur.put_buffer.simd]
                                                           ;Z:     put_buffer = state->cur.put_buffer.simd;
    or          index, tempq                               ;Z:     index |= temp;
    ; Before the inversion, bit i of index is set if t[i] is zero (rows A/B in
    ; bits 0-15, C/D in bits 16-31, E/F in bits 32-47, G/H in bits 48-63.)
    ; After it, a set bit marks a nonzero coefficient.
    not         index                                      ;Z:     index = ~index;
    sub         free_bitsb, nbitsb                         ;Z:     if ((free_bits -= nbits) >= 0)
    jnl         .ENTRY_SKIP_EMIT_CODE                      ;Z:         goto .ENTRY_SKIP_EMIT_CODE;
    align       16
.EMIT_CODE:                                                ;Z:     .EMIT_CODE:
    EMIT_QWORD  .BLOOP_COND                                ;Z:     insert code, flush buffer, goto .BLOOP_COND

; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

; Entered from .EMIT_BRLOOP_CODE_END when nbits (the distance to the next
; nonzero coefficient) exceeds 16.  Each iteration emits the code for actbl
; symbol 0xf0 and reduces nbits by 16.
    align       16
.BRLOOP:                                ; do {
    lea         code_temp, [nbitsq - 16]  ;   code_temp = nbits - 16;
    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
                                        ;   nbits = actbl->ehufsi[0xf0];
    mov         code, [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
                                        ;   code = actbl->ehufco[0xf0];
    sub         free_bitsb, nbitsb      ;   if ((free_bits -= nbits) <= 0)
    jle         .EMIT_BRLOOP_CODE       ;     goto .EMIT_BRLOOP_CODE;
    shl         put_buffer, nbitsb      ;   put_buffer <<= nbits;
    mov         nbits, code_temp        ;   nbits = code_temp;
    or          put_buffer, codeq       ;   put_buffer |= code;
    cmp         nbits, 16               ;   if (nbits <= 16)
    jle         .ERLOOP                 ;     break;
    jmp         .BRLOOP                 ; } while (1);

; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    align       16
    times 5     nop
.ENTRY_SKIP_EMIT_CODE:                  ; .ENTRY_SKIP_EMIT_CODE:
    shl         put_buffer, nbitsb      ; put_buffer <<= nbits;
    or          put_buffer, codeq       ; put_buffer |= code;
.BLOOP_COND:                            ; .BLOOP_COND:
    test        index, index            ; if (index != 0)
    jz          .ELOOP                  ; {
.BLOOP:                                 ;   do {
    xor         nbits, nbits            ;     nbits = 0;  /* kill tzcnt input dependency */
    tzcnt       nbitsq, index           ;     nbits = # of trailing 0 bits in index
    inc         nbits                   ;     ++nbits;
    lea         t, [t + nbitsq * 2]     ;     t = &t[nbits];
    shr         index, nbitsb           ;     index >>= nbits;
.EMIT_BRLOOP_CODE_END:                  ; .EMIT_BRLOOP_CODE_END:
    cmp         nbits, 16               ;     if (nbits > 16)
    jg          .BRLOOP                 ;       goto .BRLOOP;
.ERLOOP:                                ; .ERLOOP:
    movsx       codeq, word [t]         ;     code = *t;
    lea         tempd, [nbitsq * 2]     ;     temp = nbits * 2;
    movzx       nbits, byte [NBITS(codeq)]  ;     nbits = JPEG_NBITS(code);
    lea         tempd, [nbitsq + tempq * 8]  ;     temp = temp * 8 + nbits;
    mov         code_temp, [actbl + c_derived_tbl.ehufco + (tempq - 16) * 4]
                                        ;     code_temp = actbl->ehufco[temp-16];
    shl         code_temp, nbitsb       ;     code_temp <<= nbits;
    and         code, dword [MASK_BITS(nbitsq)]  ;     code &= (1 << nbits) - 1;
    add         nbitsb, [actbl + c_derived_tbl.ehufsi + (tempq - 16)]
                                        ;     nbits += actbl->ehufsi[temp-16];
                                        ;     (free_bits is reduced by the
                                        ;     combined length just below)
    or          code, code_temp         ;     code |= code_temp;
    sub         free_bitsb, nbitsb      ;     if ((free_bits -= nbits) <= 0)
    jle         .EMIT_CODE              ;       goto .EMIT_CODE;
    shl         put_buffer, nbitsb      ;     put_buffer <<= nbits;
    or          put_buffer, codeq       ;     put_buffer |= code;
    test        index, index
    jnz         .BLOOP                  ;   } while (index != 0);
.ELOOP:                                 ; }  /* index != 0 */
    ; t started one word below the array and has been advanced to the last
    ; nonzero coefficient visited, so td - esp is that coefficient's byte
    ; offset within t[] (or -2 if none was visited.)  If it is not the final
    ; slot, the code for actbl symbol 0 is emitted.
    ; NOTE(review): symbol 0 is presumably the end-of-block code -- confirm
    ; against jchuff.c.
    sub         td, esp                 ; t -= &t_[0];
    cmp         td, (DCTSIZE2 - 2) * SIZEOF_WORD  ; if (t != 62)
    je          .EFN                    ; {
    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
                                        ;   nbits = actbl->ehufsi[0];
    mov         code, [actbl + c_derived_tbl.ehufco + 0]  ;   code = actbl->ehufco[0];
    sub         free_bitsb, nbitsb      ;   if ((free_bits -= nbits) <= 0)
    jg          .EFN_SKIP_EMIT_CODE     ;   {
    EMIT_QWORD  .EFN                    ;     insert code, flush buffer
    align       16
.EFN_SKIP_EMIT_CODE:                    ;   } else {
    shl         put_buffer, nbitsb      ;     put_buffer <<= nbits;
    or          put_buffer, codeq       ;     put_buffer |= code;
.EFN:                                   ; } }
    ; Write the bit-buffer state back and return.  rax (buffer) is not
    ; modified past this point, so it holds the advanced output pointer when
    ; the function returns.  Only the low byte of free_bits is stored.
    mov         [state + working_state.cur.put_buffer.simd], put_buffer
                                        ; state->cur.put_buffer.simd = put_buffer;
    mov         byte [state + working_state.cur.free_bits], free_bitsb
                                        ; state->cur.free_bits = free_bits;
    ; Release the t[] allocation, then pop the saved registers in the reverse
    ; of the order in which the prologue pushed them.
    sub         rsp, -DCTSIZE2 * SIZEOF_WORD - 8
    pop         r12
%ifdef WIN64
    pop         rdi
    pop         rsi
    pop         rbx
%else
    pop         rbx
%endif
    pop         r15
    pop         rbp
    ret

; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

; Out-of-line flush path for .BRLOOP.  The extra macro argument restores nbits
; from code_temp (the remaining distance computed at the top of .BRLOOP) after
; the macro has finished using nbits as a shift count.
    align       16
.EMIT_BRLOOP_CODE:
    EMIT_QWORD  .EMIT_BRLOOP_CODE_END, { mov nbits, code_temp }
                                        ; insert code, flush buffer,
                                        ; nbits = code_temp, goto .EMIT_BRLOOP_CODE_END

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32