jchuff-sse2.asm (39232B)
;
; jchuff-sse2.asm - Huffman entropy encoding (SSE2)
;
; Copyright (C) 2009-2011, 2014-2017, 2019, 2024, D. R. Commander.
; Copyright (C) 2015, Matthieu Darbois.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.
;
; This file contains an SSE2 implementation for Huffman coding of one block.
; The following code is based on jchuff.c; see jchuff.c for more details.

%include "jsimdext.inc"

; Field offsets used by the encoder below.  Only .cur.put_buffer.simd and
; .cur.free_bits are accessed by the code in this file.
struc working_state
.next_output_byte:   resp 1     ; => next byte to write in buffer
.free_in_buffer:     resp 1     ; # of byte spaces remaining in buffer
.cur.put_buffer.simd resq 1     ; current bit accumulation buffer
.cur.free_bits       resd 1     ; # of bits available in it
.cur.last_dc_val     resd 4     ; last DC coef for each component
.cinfo:              resp 1     ; dump_buffer needs access to this
endstruc

struc c_derived_tbl
.ehufco:             resd 256   ; code for each symbol
.ehufsi:             resb 256   ; length of code for each symbol
; If no code has been allocated for a symbol S, ehufsi[S] contains 0
endstruc

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    GLOBAL_DATA(jconst_huff_encode_one_block)

EXTN(jconst_huff_encode_one_block):

    ALIGNZ      32

; jpeg_mask_bits[n] = (1 << n) - 1 for n = 0..15.  Each entry is a qword so
; that it can be used directly as the memory operand of an MMX pand (see
; MASK_BITS() below, which scales the index by 8.)
jpeg_mask_bits dq 0x0000, 0x0001, 0x0003, 0x0007
               dq 0x000f, 0x001f, 0x003f, 0x007f
               dq 0x00ff, 0x01ff, 0x03ff, 0x07ff
               dq 0x0fff, 0x1fff, 0x3fff, 0x7fff

; Bit-count lookup table, addressed with a signed index relative to the
; jpeg_nbits_table label.  The entries that precede the label serve negative
; indices: the encoder stores (v - 1) for a negative value v, so index -1 maps
; to 0, index -2 to 1, indices -4..-3 to 2, and so on.  The entries that follow
; the label serve non-negative indices 0..32767.
times 1 << 14 db 15
times 1 << 13 db 14
times 1 << 12 db 13
times 1 << 11 db 12
times 1 << 10 db 11
times 1 <<  9 db 10
times 1 <<  8 db  9
times 1 <<  7 db  8
times 1 <<  6 db  7
times 1 <<  5 db  6
times 1 <<  4 db  5
times 1 <<  3 db  4
times 1 <<  2 db  3
times 1 <<  1 db  2
times 1 <<  0 db  1
times 1       db  0
    GLOBAL_DATA(jpeg_nbits_table)
EXTN(jpeg_nbits_table):
times 1       db  0
times 1 <<  0 db  1
times 1 <<  1 db  2
times 1 <<  2 db  3
times 1 <<  3 db  4
times 1 <<  4 db  5
times 1 <<  5 db  6
times 1 <<  6 db  7
times 1 <<  7 db  8
times 1 <<  8 db  9
times 1 <<  9 db 10
times 1 << 10 db 11
times 1 << 11 db 12
times 1 << 12 db 13
times 1 << 13 db 14
times 1 << 14 db 15

    ALIGNZ      32

; NBITS(x) yields the address expression of jpeg_nbits_table[x].  With PIC,
; the table address is held in the nbits_base register; otherwise the symbol
; is referenced absolutely.
; MASK_BITS(x) yields the address expression of jpeg_mask_bits[x]; it is
; expressed relative to jpeg_nbits_table so that the same base register can be
; used in the PIC case.
%ifdef PIC
%define NBITS(x)      nbits_base + x
%else
%define NBITS(x)      EXTN(jpeg_nbits_table) + x
%endif
%define MASK_BITS(x)  NBITS((x) * 8) + (jpeg_mask_bits - EXTN(jpeg_nbits_table))

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32

; MMX register roles.  NOTE: mm_nbits and mm_code_bits deliberately alias the
; same register (mm3); they are never live at the same time.
%define mm_put_buffer     mm0
%define mm_all_0xff       mm1
%define mm_temp           mm2
%define mm_nbits          mm3
%define mm_code_bits      mm3
%define mm_code           mm4
%define mm_overflow_bits  mm5
%define mm_save_nbits     mm6

; Shorthand used to describe SIMD operations:
; wN:     xmmN treated as eight signed 16-bit values
; wN[i]:  perform the same operation on all eight signed 16-bit values, i=0..7
; bN:     xmmN treated as 16 unsigned 8-bit values, or
;         mmN treated as eight unsigned 8-bit values
; bN[i]:  perform the same operation on all unsigned 8-bit values,
;         i=0..15 (SSE register) or i=0..7 (MMX register)
; Contents of SIMD registers are shown in memory order.

; Fill the bit buffer to capacity with the leading bits from code, then output
; the bit buffer and put the remaining bits from code into the bit buffer.
;
; Usage:
; code - contains the bits to shift into the bit buffer (LSB-aligned)
; %1 - temp register
; %2 - low byte of temp register
; %3 - second byte of temp register
; %4-%8 (optional) - extra instructions to execute before the macro completes
; %9 - the label to which to jump when the macro completes
;
; Upon completion, free_bits will be set to the number of remaining bits from
; code, and put_buffer will contain those remaining bits.  temp and code will
; be clobbered.
;
; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
; macro in jchuff.c.
;
; Implicit operands (must be %define'd at the point of expansion):
;   nbits, nbitsb, nbitsh - size of code in bits on entry; clobbered
;   free_bits             - already decremented by nbits, i.e. <= 0 on entry
;   buffer                - output pointer; advanced past the emitted bytes
;   mm_put_buffer, mm_code, mm_all_0xff - see the register aliases; mm_temp,
;   mm_nbits/mm_code_bits, and mm_overflow_bits are used as scratch.
; If the temp register is the same register as nbits_base (PIC builds), it is
; preserved across the macro by way of mm_save_nbits.

%macro EMIT_QWORD 9
%define %%temp   %1
%define %%tempb  %2
%define %%temph  %3
    add         nbits, free_bits                        ; nbits += free_bits;
    neg         free_bits                               ; free_bits = -free_bits;
    movq        mm_temp, mm_code                        ; temp = code;
    movd        mm_nbits, nbits                         ; nbits --> MMX register
    movd        mm_overflow_bits, free_bits             ; overflow_bits (temp register) = free_bits;
    neg         free_bits                               ; free_bits = -free_bits;
    psllq       mm_put_buffer, mm_nbits                 ; put_buffer <<= nbits;
    psrlq       mm_temp, mm_overflow_bits               ; temp >>= overflow_bits;
    add         free_bits, 64                           ; free_bits += 64;
    por         mm_temp, mm_put_buffer                  ; temp |= put_buffer;
%ifidn %%temp, nbits_base
    movd        mm_save_nbits, nbits_base               ; save nbits_base
%endif
    movq        mm_code_bits, mm_temp                   ; code_bits (temp register) = temp;
    movq        mm_put_buffer, mm_code                  ; put_buffer = code;
    pcmpeqb     mm_temp, mm_all_0xff                    ; b_temp[i] = (b_temp[i] == 0xFF ? 0xFF : 0);
    movq        mm_code, mm_code_bits                   ; code = code_bits;
    psrlq       mm_code_bits, 32                        ; code_bits >>= 32;
    pmovmskb    nbits, mm_temp                          ; nbits = 0;  nbits |= ((b_temp[i] >> 7) << i);
    movd        %%temp, mm_code_bits                    ; temp = code_bits;
    bswap       %%temp                                  ; temp = htonl(temp);
    test        nbits, nbits                            ; if (nbits != 0)  /* Some 0xFF bytes */
    jnz         %%.SLOW                                 ;   goto %%.SLOW
    ; Fast path: no byte of the qword is 0xFF, so it is stored as two
    ; big-endian dwords without byte stuffing.
    mov         dword [buffer], %%temp                  ; *(uint32_t)buffer = temp;
%ifidn %%temp, nbits_base
    movd        nbits_base, mm_save_nbits               ; restore nbits_base
%endif
    %4
    movd        nbits, mm_code                          ; nbits = (uint32_t)(code);
    %5
    bswap       nbits                                   ; nbits = htonl(nbits);
    mov         dword [buffer + 4], nbits               ; *(uint32_t)(buffer + 4) = nbits;
    lea         buffer, [buffer + 8]                    ; buffer += 8;
    %6
    %7
    %8
    jmp         %9                                      ; return
%%.SLOW:
    ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
    ; bytes in the qword.
    ; Each step stores the byte followed by a 0x00, then advances buffer by
    ; 2 - CF, where CF is set by the cmp iff the byte is below 0xFF.  The
    ; stuffed zero is thus kept only after a 0xFF byte.
    mov         byte [buffer], %%tempb                  ; buffer[0] = temp[0];
    cmp         %%tempb, 0xFF                           ; Set CF if temp[0] < 0xFF
    mov         byte [buffer+1], 0                      ; buffer[1] = 0;
    sbb         buffer, -2                              ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
    mov         byte [buffer], %%temph                  ; buffer[0] = temp[1];
    cmp         %%temph, 0xFF                           ; Set CF if temp[1] < 0xFF
    mov         byte [buffer+1], 0                      ; buffer[1] = 0;
    sbb         buffer, -2                              ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
    shr         %%temp, 16                              ; temp >>= 16;
    mov         byte [buffer], %%tempb                  ; buffer[0] = temp[0];
    cmp         %%tempb, 0xFF                           ; Set CF if temp[0] < 0xFF
    mov         byte [buffer+1], 0                      ; buffer[1] = 0;
    sbb         buffer, -2                              ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
    mov         byte [buffer], %%temph                  ; buffer[0] = temp[1];
    cmp         %%temph, 0xFF                           ; Set CF if temp[1] < 0xFF
    mov         byte [buffer+1], 0                      ; buffer[1] = 0;
    sbb         buffer, -2                              ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
    movd        nbits, mm_code                          ; nbits (temp register) = (uint32_t)(code)
%ifidn %%temp, nbits_base
    movd        nbits_base, mm_save_nbits               ; restore nbits_base
%endif
    bswap       nbits                                   ; nbits = htonl(nbits)
    mov         byte [buffer], nbitsb                   ; buffer[0] = nbits[0];
    cmp         nbitsb, 0xFF                            ; Set CF if nbits[0] < 0xFF
    mov         byte [buffer+1], 0                      ; buffer[1] = 0;
    sbb         buffer, -2                              ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
    mov         byte [buffer], nbitsh                   ; buffer[0] = nbits[1];
    cmp         nbitsh, 0xFF                            ; Set CF if nbits[1] < 0xFF
    mov         byte [buffer+1], 0                      ; buffer[1] = 0;
    sbb         buffer, -2                              ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
    shr         nbits, 16                               ; nbits >>= 16;
    mov         byte [buffer], nbitsb                   ; buffer[0] = nbits[0];
    cmp         nbitsb, 0xFF                            ; Set CF if nbits[0] < 0xFF
    mov         byte [buffer+1], 0                      ; buffer[1] = 0;
    sbb         buffer, -2                              ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
    mov         byte [buffer], nbitsh                   ; buffer[0] = nbits[1];
    ; %4 is expanded before the cmp below, so it may alter the flags.
    %4
    cmp         nbitsh, 0xFF                            ; Set CF if nbits[1] < 0xFF
    mov         byte [buffer+1], 0                      ; buffer[1] = 0;
    sbb         buffer, -2                              ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
    %5
    %6
    %7
    %8
    jmp         %9                                      ; return;
%endmacro

; push/pop wrappers that keep the assemble-time stack_offset in step with
; esp, so that the arg_* offsets stay valid after each push or pop.
%macro PUSH 1
    push        %1
%assign stack_offset  stack_offset + 4
%endmacro

%macro POP 1
    pop         %1
%assign stack_offset  stack_offset - 4
%endmacro

; If PIC is defined, load the address of a symbol defined in this file into a
; register.  Equivalent to
;   GET_GOT     %1
;   lea         %1, [GOTOFF(%1, %2)]
; without using the GOT.
;
; Usage:
; %1 - register into which to load the address of the symbol
; %2 - symbol whose address should be loaded
; %3 - optional multi-line macro to execute before the symbol address is loaded
; %4 - optional multi-line macro to execute after the symbol address is loaded
;
; If PIC is not defined, then %3 and %4 are executed in order.

; GET_SYM: see the usage description above.  In the PIC case, %3 is expanded
; inside the called thunk with the argument 4 (the call has pushed a return
; address, so esp-relative operands must be biased by 4 bytes), and %4 is
; expanded after the thunk returns.  In the non-PIC case, %3 is expanded with
; the argument 0.
%macro GET_SYM 2-4
%ifdef PIC
    call        %%.geteip
%%.ref:
    %4
    add         %1, %2 - %%.ref
    jmp         short %%.done
    align       32
%%.geteip:
    %3          4               ; must adjust stack pointer because of call
    mov         %1, POINTER [esp]
    ret
    align       32
%%.done:
%else
    %3          0
    %4
%endif
%endmacro

;
; Encode a single block's worth of coefficients.
;
; GLOBAL(JOCTET *)
; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
;                                  JCOEFPTR block, int last_dc_val,
;                                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
;
; All arguments are read from the stack (see the arg_* offsets below.)  The
; advanced buffer pointer is returned in eax.  ebx, ebp, esi, and edi are
; saved and restored; state->cur.put_buffer.simd and state->cur.free_bits are
; updated before returning, and emms is executed because MMX registers are
; used for the bit buffer.
;
; Stack layout:
; Function args
; Return address
; Saved ebx
; Saved ebp
; Saved esi
; Saved edi <-- esp_save
; ...
; esp_save
; t_ 64*2 bytes (aligned to 128 bytes)
;
; esp is used (as t) to point into t_ (data in lower indices is not used once
; esp passes over them, so this is signal-safe.)  Aligning to 128 bytes allows
; us to find the rest of the data again.
;
; NOTES:
; When shuffling data, we try to avoid pinsrw as much as possible, since it is
; slow on many CPUs.  Its reciprocal throughput (issue latency) is 1 even on
; modern CPUs, so chains of pinsrw instructions (even with different outputs)
; can limit performance.  pinsrw is a VectorPath instruction on AMD K8 and
; requires 2 µops (with memory operand) on Intel.  In either case, only one
; pinsrw instruction can be decoded per cycle (and nothing else if they are
; back-to-back), so out-of-order execution cannot be used to work around long
; pinsrw chains (though for Sandy Bridge and later, this may be less of a
; problem if the code runs from the µop cache.)
;
; We use tzcnt instead of bsf without checking for support.  The instruction is
; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
; rep bsf.)  The destination (first) operand of bsf (and tzcnt on some CPUs) is
; an input dependency (although the behavior is not formally defined, Intel
; CPUs usually leave the destination unmodified if the source is zero.)  This
; can prevent out-of-order execution, so we clear the destination before
; invoking tzcnt.
;
; Initial register allocation
; eax - frame --> buffer
; ebx - nbits_base (PIC) / emit_temp
; ecx - dctbl --> size --> state
; edx - block --> nbits
; esi - code_temp --> state --> actbl
; edi - index_temp --> free_bits
; esp - t
; ebp - index

%define frame       eax
%ifdef PIC
%define nbits_base  ebx
%endif
%define emit_temp   ebx
%define emit_tempb  bl
%define emit_temph  bh
%define dctbl       ecx
%define block       edx
%define code_temp   esi
%define index_temp  edi
%define t           esp
%define index       ebp

; Offset from &t_[0] at which the original esp is stored.
%assign save_frame  DCTSIZE2 * SIZEOF_WORD

; Step 1: Re-arrange input data according to jpeg_natural_order
; xx 01 02 03 04 05 06 07      xx 01 08 16 09 02 03 10
; 08 09 10 11 12 13 14 15      17 24 32 25 18 11 04 05
; 16 17 18 19 20 21 22 23      12 19 26 33 40 48 41 34
; 24 25 26 27 28 29 30 31 ==>  27 20 13 06 07 14 21 28
; 32 33 34 35 36 37 38 39      35 42 49 56 57 50 43 36
; 40 41 42 43 44 45 46 47      29 22 15 23 30 37 44 51
; 48 49 50 51 52 53 54 55      58 59 52 45 38 31 39 46
; 56 57 58 59 60 61 62 63      53 60 61 54 47 55 62 63
;
; Rows are built at "offset 1" (the DC coefficient is handled separately), so
; each stored row starts one element later in zigzag order than shown above.
; For every row, negative values are decremented by 1 before being stored to
; t_, and a per-element "is zero" byte mask is derived for the index bitmaps.

    align       32
    GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)

EXTN(jsimd_huff_encode_one_block_sse2):

; Argument offsets relative to esp; stack_offset is maintained by PUSH/POP.
%assign stack_offset      0
%define arg_state         4 + stack_offset
%define arg_buffer        8 + stack_offset
%define arg_block        12 + stack_offset
%define arg_last_dc_val  16 + stack_offset
%define arg_dctbl        20 + stack_offset
%define arg_actbl        24 + stack_offset

                                                        ;X: X = code stream
    mov         block, [esp + arg_block]
    PUSH        ebx
    PUSH        ebp
    movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07
    PUSH        esi
    PUSH        edi
    movdqa      xmm0, xmm3                              ;A: w0 = xx 01 02 03 04 05 06 07
    mov         frame, esp
    ; Reserve t_ plus 4 bytes for the saved esp, then align t_ down to 128.
    lea         t, [frame - (save_frame + 4)]
    movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15
    and         t, -DCTSIZE2 * SIZEOF_WORD              ; t = &t_[0]
    mov         [t + save_frame], frame
    pxor        xmm4, xmm4                              ;A: w4[i] = 0;
    punpckldq   xmm0, xmm1                              ;A: w0 = xx 01 08 09 02 03 10 11
    pshuflw     xmm0, xmm0, 11001001b                   ;A: w0 = 01 08 xx 09 02 03 10 11
    pinsrw      xmm0, word [block + 16 * SIZEOF_WORD], 2 ;A: w0 = 01 08 16 09 02 03 10 11
    punpckhdq   xmm3, xmm1                              ;D: w3 = 04 05 12 13 06 07 14 15
    punpcklqdq  xmm1, xmm3                              ;B: w1 = 08 09 10 11 04 05 12 13
    pinsrw      xmm0, word [block + 17 * SIZEOF_WORD], 7 ;A: w0 = 01 08 16 09 02 03 10 17
                                                        ;A:      (Row 0, offset 1)
    pcmpgtw     xmm4, xmm0                              ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
    paddw       xmm0, xmm4                              ;A: w0[i] += w4[i];
    movaps      XMMWORD [t + 0 * SIZEOF_WORD], xmm0     ;A: t[i] = w0[i];

    movq        xmm2, qword [block + 24 * SIZEOF_WORD]  ;B: w2 = 24 25 26 27 -- -- -- --
    pshuflw     xmm2, xmm2, 11011000b                   ;B: w2 = 24 26 25 27 -- -- -- --
    pslldq      xmm1, 1 * SIZEOF_WORD                   ;B: w1 = -- 08 09 10 11 04 05 12
    movups      xmm5, XMMWORD [block + 48 * SIZEOF_WORD] ;H: w5 = 48 49 50 51 52 53 54 55
    movsd       xmm1, xmm2                              ;B: w1 = 24 26 25 27 11 04 05 12
    punpcklqdq  xmm2, xmm5                              ;C: w2 = 24 26 25 27 48 49 50 51
    pinsrw      xmm1, word [block + 32 * SIZEOF_WORD], 1 ;B: w1 = 24 32 25 27 11 04 05 12
    pxor        xmm4, xmm4                              ;A: w4[i] = 0;
    psrldq      xmm3, 2 * SIZEOF_WORD                   ;D: w3 = 12 13 06 07 14 15 -- --
    pcmpeqw     xmm0, xmm4                              ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
    pinsrw      xmm1, word [block + 18 * SIZEOF_WORD], 3 ;B: w1 = 24 32 25 18 11 04 05 12
                                                        ;        (Row 1, offset 1)
    pcmpgtw     xmm4, xmm1                              ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
    paddw       xmm1, xmm4                              ;B: w1[i] += w4[i];
    movaps      XMMWORD [t + 8 * SIZEOF_WORD], xmm1     ;B: t[i+8] = w1[i];
    pxor        xmm4, xmm4                              ;B: w4[i] = 0;
    pcmpeqw     xmm1, xmm4                              ;B: w1[i] = (w1[i] == 0 ? -1 : 0);

    packsswb    xmm0, xmm1                              ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
                                                        ;    w/ signed saturation

    pinsrw      xmm3, word [block + 20 * SIZEOF_WORD], 0 ;D: w3 = 20 13 06 07 14 15 -- --
    pinsrw      xmm3, word [block + 21 * SIZEOF_WORD], 5 ;D: w3 = 20 13 06 07 14 21 -- --
    pinsrw      xmm3, word [block + 28 * SIZEOF_WORD], 6 ;D: w3 = 20 13 06 07 14 21 28 --
    pinsrw      xmm3, word [block + 35 * SIZEOF_WORD], 7 ;D: w3 = 20 13 06 07 14 21 28 35
                                                        ;        (Row 3, offset 1)
    pcmpgtw     xmm4, xmm3                              ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
    paddw       xmm3, xmm4                              ;D: w3[i] += w4[i];
    movaps      XMMWORD [t + 24 * SIZEOF_WORD], xmm3    ;D: t[i+24] = w3[i];
    pxor        xmm4, xmm4                              ;D: w4[i] = 0;
    pcmpeqw     xmm3, xmm4                              ;D: w3[i] = (w3[i] == 0 ? -1 : 0);

    pinsrw      xmm2, word [block + 19 * SIZEOF_WORD], 0 ;C: w2 = 19 26 25 27 48 49 50 51
    pinsrw      xmm2, word [block + 33 * SIZEOF_WORD], 2 ;C: w2 = 19 26 33 27 48 49 50 51
    pinsrw      xmm2, word [block + 40 * SIZEOF_WORD], 3 ;C: w2 = 19 26 33 40 48 49 50 51
    pinsrw      xmm2, word [block + 41 * SIZEOF_WORD], 5 ;C: w2 = 19 26 33 40 48 41 50 51
    pinsrw      xmm2, word [block + 34 * SIZEOF_WORD], 6 ;C: w2 = 19 26 33 40 48 41 34 51
    pinsrw      xmm2, word [block + 27 * SIZEOF_WORD], 7 ;C: w2 = 19 26 33 40 48 41 34 27
                                                        ;        (Row 2, offset 1)
    pcmpgtw     xmm4, xmm2                              ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
    paddw       xmm2, xmm4                              ;C: w2[i] += w4[i];
    movsx       code_temp, word [block]                 ;Z:     code_temp = block[0];

; Work that is interleaved with GET_SYM.  In the PIC case, this macro is
; expanded inside the thunk called by GET_SYM, so esp (t) is 4 bytes lower
; than usual there; frame-relative operands are unaffected.
; %1 - stack pointer adjustment
%macro GET_SYM_BEFORE 1
    movaps      XMMWORD [t + 16 * SIZEOF_WORD + %1], xmm2
                                                        ;C: t[i+16] = w2[i];
    pxor        xmm4, xmm4                              ;C: w4[i] = 0;
    pcmpeqw     xmm2, xmm4                              ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
    sub         code_temp, [frame + arg_last_dc_val]    ;Z:     code_temp -= last_dc_val;

    packsswb    xmm2, xmm3                              ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
                                                        ;    w/ signed saturation

    movdqa      xmm3, xmm5                              ;H: w3 = 48 49 50 51 52 53 54 55
    pmovmskb    index_temp, xmm2                        ;Z:     index_temp = 0;  index_temp |= ((b2[i] >> 7) << i);
    pmovmskb    index, xmm0                             ;Z:     index = 0;  index |= ((b0[i] >> 7) << i);
    movups      xmm0, XMMWORD [block + 56 * SIZEOF_WORD] ;H: w0 = 56 57 58 59 60 61 62 63
    punpckhdq   xmm3, xmm0                              ;H: w3 = 52 53 60 61 54 55 62 63
    shl         index_temp, 16                          ;Z:     index_temp <<= 16;
    psrldq      xmm3, 1 * SIZEOF_WORD                   ;H: w3 = 53 60 61 54 55 62 63 --
    pxor        xmm2, xmm2                              ;H: w2[i] = 0;
    pshuflw     xmm3, xmm3, 00111001b                   ;H: w3 = 60 61 54 53 55 62 63 --
    or          index, index_temp                       ;Z:     index |= index_temp;
%undef index_temp
%define free_bits  edi
%endmacro

%macro GET_SYM_AFTER 0
    movq        xmm1, qword [block + 44 * SIZEOF_WORD]  ;G: w1 = 44 45 46 47 -- -- -- --
    unpcklps    xmm5, xmm0                              ;E: w5 = 48 49 56 57 50 51 58 59
    pxor        xmm0, xmm0                              ;H: w0[i] = 0;
    not         index                                   ;Z:     index = ~index;
    pinsrw      xmm3, word [block + 47 * SIZEOF_WORD], 3 ;H: w3 = 60 61 54 47 55 62 63 --
                                                        ;        (Row 7, offset 1)
    pcmpgtw     xmm2, xmm3                              ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
    mov         dctbl, [frame + arg_dctbl]
    paddw       xmm3, xmm2                              ;H: w3[i] += w2[i];
    movaps      XMMWORD [t + 56 * SIZEOF_WORD], xmm3    ;H: t[i+56] = w3[i];
    movq        xmm4, qword [block + 36 * SIZEOF_WORD]  ;G: w4 = 36 37 38 39 -- -- -- --
    pcmpeqw     xmm3, xmm0                              ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
    punpckldq   xmm4, xmm1                              ;G: w4 = 36 37 44 45 38 39 46 47
    movdqa      xmm1, xmm4                              ;F: w1 = 36 37 44 45 38 39 46 47
    pcmpeqw     mm_all_0xff, mm_all_0xff                ;Z:     all_0xff[i] = 0xFF;
%endmacro

    GET_SYM     nbits_base, EXTN(jpeg_nbits_table), GET_SYM_BEFORE, GET_SYM_AFTER

    psrldq      xmm4, 1 * SIZEOF_WORD                   ;G: w4 = 37 44 45 38 39 46 47 --
    shufpd      xmm1, xmm5, 10b                         ;F: w1 = 36 37 44 45 50 51 58 59
    pshufhw     xmm4, xmm4, 11010011b                   ;G: w4 = 37 44 45 38 -- 39 46 --
    pslldq      xmm1, 1 * SIZEOF_WORD                   ;F: w1 = -- 36 37 44 45 50 51 58
    pinsrw      xmm4, word [block + 59 * SIZEOF_WORD], 0 ;G: w4 = 59 44 45 38 -- 39 46 --
    pshufd      xmm1, xmm1, 11011000b                   ;F: w1 = -- 36 45 50 37 44 51 58
    cmp         code_temp, 1 << 31                      ;Z:     Set CF if code_temp < 0x80000000,
                                                        ;Z:     i.e. if code_temp is positive
    pinsrw      xmm4, word [block + 52 * SIZEOF_WORD], 1 ;G: w4 = 59 52 45 38 -- 39 46 --
    movlps      xmm1, qword [block + 20 * SIZEOF_WORD]  ;F: w1 = 20 21 22 23 37 44 51 58
    pinsrw      xmm4, word [block + 31 * SIZEOF_WORD], 4 ;G: w4 = 59 52 45 38 31 39 46 --
    pshuflw     xmm1, xmm1, 01110010b                   ;F: w1 = 22 20 23 21 37 44 51 58
    pinsrw      xmm4, word [block + 53 * SIZEOF_WORD], 7 ;G: w4 = 59 52 45 38 31 39 46 53
                                                        ;        (Row 6, offset 1)
    adc         code_temp, -1                           ;Z:     code_temp += -1 + (code_temp >= 0 ? 1 : 0);
    pxor        xmm2, xmm2                              ;G: w2[i] = 0;
    pcmpgtw     xmm0, xmm4                              ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
    pinsrw      xmm1, word [block + 15 * SIZEOF_WORD], 1 ;F: w1 = 22 15 23 21 37 44 51 58
    paddw       xmm4, xmm0                              ;G: w4[i] += w0[i];
    movaps      XMMWORD [t + 48 * SIZEOF_WORD], xmm4    ;G: t[48+i] = w4[i];
    movd        mm_temp, code_temp                      ;Z:     temp = code_temp
    pinsrw      xmm1, word [block + 30 * SIZEOF_WORD], 3 ;F: w1 = 22 15 23 30 37 44 51 58
                                                        ;        (Row 5, offset 1)
    pcmpeqw     xmm4, xmm2                              ;G: w4[i] = (w4[i] == 0 ? -1 : 0);

    packsswb    xmm4, xmm3                              ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
                                                        ;    w/ signed saturation

    ; From here on, t points one word below the next element, so the
    ; remaining row stores use an offset of (n+1) words.
    lea         t, [t - SIZEOF_WORD]                    ;Z:     t = &t[-1]
    pxor        xmm0, xmm0                              ;F: w0[i] = 0;
    pcmpgtw     xmm2, xmm1                              ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
    paddw       xmm1, xmm2                              ;F: w1[i] += w2[i];
    movaps      XMMWORD [t + (40+1) * SIZEOF_WORD], xmm1 ;F: t[40+i] = w1[i];
    pcmpeqw     xmm1, xmm0                              ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
    pinsrw      xmm5, word [block + 42 * SIZEOF_WORD], 0 ;E: w5 = 42 49 56 57 50 51 58 59
    pinsrw      xmm5, word [block + 43 * SIZEOF_WORD], 5 ;E: w5 = 42 49 56 57 50 43 58 59
    pinsrw      xmm5, word [block + 36 * SIZEOF_WORD], 6 ;E: w5 = 42 49 56 57 50 43 36 59
    pinsrw      xmm5, word [block + 29 * SIZEOF_WORD], 7 ;E: w5 = 42 49 56 57 50 43 36 29
                                                        ;        (Row 4, offset 1)
; block is dead from here on; edx is reused as nbits.
%undef block
%define nbits   edx
%define nbitsb  dl
%define nbitsh  dh
    movzx       nbits, byte [NBITS(code_temp)]          ;Z:     nbits = JPEG_NBITS(code_temp);
%undef code_temp
%define state  esi
    pxor        xmm2, xmm2                              ;E: w2[i] = 0;
    mov         state, [frame + arg_state]
    movd        mm_nbits, nbits                         ;Z:     nbits --> MMX register
    pcmpgtw     xmm0, xmm5                              ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
    movd        mm_code, dword [dctbl + c_derived_tbl.ehufco + nbits * 4]
                                                        ;Z:     code = dctbl->ehufco[nbits];
%define size   ecx
%define sizeb  cl
%define sizeh  ch
    paddw       xmm5, xmm0                              ;E: w5[i] += w0[i];
    movaps      XMMWORD [t + (32+1) * SIZEOF_WORD], xmm5 ;E: t[32+i] = w5[i];
    movzx       size, byte [dctbl + c_derived_tbl.ehufsi + nbits]
                                                        ;Z:     size = dctbl->ehufsi[nbits];
%undef dctbl
    pcmpeqw     xmm5, xmm2                              ;E: w5[i] = (w5[i] == 0 ? -1 : 0);

    packsswb    xmm5, xmm1                              ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
                                                        ;    w/ signed saturation

    movq        mm_put_buffer, [state + working_state.cur.put_buffer.simd]
                                                        ;Z:     put_buffer = state->cur.put_buffer.simd;
    mov         free_bits, [state + working_state.cur.free_bits]
                                                        ;Z:     free_bits = state->cur.free_bits;
%undef state
%define actbl  esi
    mov         actbl, [frame + arg_actbl]
; frame is dead after the next load; eax becomes the output pointer (and the
; return value.)
%define buffer  eax
    mov         buffer, [frame + arg_buffer]
%undef frame
    ; Enter the first loop at .BEGIN to emit the DC code (mm_code, size) and
    ; the DC magnitude bits (mm_temp, nbits).
    jmp         .BEGIN

; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    align       16
; size <= 32, so this is not really a loop
.BRLOOP1:                                               ; .BRLOOP1:
    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
                                                        ; nbits = actbl->ehufsi[0xf0];
    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
                                                        ; code = actbl->ehufco[0xf0];
    and         index, 0x7ffffff                        ; clear index if size == 32
    sub         size, 16                                ; size -= 16;
    sub         free_bits, nbits                        ; if ((free_bits -= nbits) <= 0)
    jle         .EMIT_BRLOOP1                           ;   goto .EMIT_BRLOOP1;
    movd        mm_nbits, nbits                         ; nbits --> MMX register
    psllq       mm_put_buffer, mm_nbits                 ; put_buffer <<= nbits;
    por         mm_put_buffer, mm_code                  ; put_buffer |= code;
    jmp         .ERLOOP1                                ; goto .ERLOOP1;

; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    align       16
%ifdef PIC
    times 6     nop
%else
    times 2     nop
%endif
.BLOOP1:                                                ; do {  /* size = # of zero bits/elements to skip */
; if size == 32, index remains unchanged.  Correct in .BRLOOP.
    shr         index, sizeb                            ;   index >>= size;
    lea         t, [t + size * SIZEOF_WORD]             ;   t += size;
    cmp         size, 16                                ;   if (size > 16)
    jg          .BRLOOP1                                ;     goto .BRLOOP1;
.ERLOOP1:                                               ; .ERLOOP1:
    movsx       nbits, word [t]                         ;   nbits = *t;
%ifdef PIC
    add         size, size                              ;   size += size;
%else
    lea         size, [size * 2]                        ;   size += size;
%endif
    movd        mm_temp, nbits                          ;   temp = nbits;
    movzx       nbits, byte [NBITS(nbits)]              ;   nbits = JPEG_NBITS(nbits);
    lea         size, [size * 8 + nbits]                ;   size = size * 8 + nbits;
    movd        mm_nbits, nbits                         ;   nbits --> MMX register
    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
                                                        ;   code = actbl->ehufco[size-16];
    movzx       size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
                                                        ;   size = actbl->ehufsi[size-16];
.BEGIN:                                                 ; .BEGIN:
    pand        mm_temp, [MASK_BITS(nbits)]             ;   temp &= (1 << nbits) - 1;
    psllq       mm_code, mm_nbits                       ;   code <<= nbits;
    add         nbits, size                             ;   nbits += size;
    por         mm_code, mm_temp                        ;   code |= temp;
    sub         free_bits, nbits                        ;   if ((free_bits -= nbits) <= 0)
    jle         .EMIT_ERLOOP1                           ;     insert code, flush buffer, init size, goto .BLOOP1
    xor         size, size                              ;   size = 0;  /* kill tzcnt input dependency */
    tzcnt       size, index                             ;   size = # of trailing 0 bits in index
    movd        mm_nbits, nbits                         ;   nbits --> MMX register
    psllq       mm_put_buffer, mm_nbits                 ;   put_buffer <<= nbits;
    inc         size                                    ;   ++size;
    por         mm_put_buffer, mm_code                  ;   put_buffer |= code;
    test        index, index
    jnz         .BLOOP1                                 ; } while (index != 0);
; Round 2
; t points to the last used word, possibly below t_ if the previous index had 32 zero bits.
.ELOOP1:                                                ; .ELOOP1:
    pmovmskb    size, xmm4                              ; size = 0;  size |= ((b4[i] >> 7) << i);
    pmovmskb    index, xmm5                             ; index = 0;  index |= ((b5[i] >> 7) << i);
    shl         size, 16                                ; size <<= 16;
    or          index, size                             ; index |= size;
    not         index                                   ; index = ~index;
    lea         nbits, [t + (1 + DCTSIZE2) * SIZEOF_WORD]
                                                        ; nbits = t + 1 + 64;
    and         nbits, -DCTSIZE2 * SIZEOF_WORD          ; nbits &= -128;  /* now points to &t_[64] */
    sub         nbits, t                                ; nbits -= t;
    shr         nbits, 1                                ; nbits >>= 1;  /* # of leading 0 bits in old index + 33 */
    tzcnt       size, index                             ; size = # of trailing 0 bits in index
    inc         size                                    ; ++size;
    test        index, index                            ; if (index == 0)
    jz          .ELOOP2                                 ;   goto .ELOOP2;
; NOTE: size == 32 cannot happen, since the last element is always 0.
    shr         index, sizeb                            ; index >>= size;
    lea         size, [size + nbits - 33]               ; size = size + nbits - 33;
    lea         t, [t + size * SIZEOF_WORD]             ; t += size;
    cmp         size, 16                                ; if (size <= 16)
    jle         .ERLOOP2                                ;   goto .ERLOOP2;
.BRLOOP2:                                               ; do {
    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
                                                        ;   nbits = actbl->ehufsi[0xf0];
    sub         size, 16                                ;   size -= 16;
    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
                                                        ;   code = actbl->ehufco[0xf0];
    sub         free_bits, nbits                        ;   if ((free_bits -= nbits) <= 0)
    jle         .EMIT_BRLOOP2                           ;     insert code and flush put_buffer
    movd        mm_nbits, nbits                         ;   else { nbits --> MMX register
    psllq       mm_put_buffer, mm_nbits                 ;     put_buffer <<= nbits;
    por         mm_put_buffer, mm_code                  ;     put_buffer |= code;
    cmp         size, 16                                ;     if (size <= 16)
    jle         .ERLOOP2                                ;       goto .ERLOOP2;
    jmp         .BRLOOP2                                ; } while (1);

; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    align       16
.BLOOP2:                                                ; do {  /* size = # of zero bits/elements to skip */
    shr         index, sizeb                            ;   index >>= size;
    lea         t, [t + size * SIZEOF_WORD]             ;   t += size;
    cmp         size, 16                                ;   if (size > 16)
    jg          .BRLOOP2                                ;     goto .BRLOOP2;
.ERLOOP2:                                               ; .ERLOOP2:
    movsx       nbits, word [t]                         ;   nbits = *t;
    add         size, size                              ;   size += size;
    movd        mm_temp, nbits                          ;   temp = nbits;
    movzx       nbits, byte [NBITS(nbits)]              ;   nbits = JPEG_NBITS(nbits);
    movd        mm_nbits, nbits                         ;   nbits --> MMX register
    lea         size, [size * 8 + nbits]                ;   size = size * 8 + nbits;
    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
                                                        ;   code = actbl->ehufco[size-16];
    movzx       size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
                                                        ;   size = actbl->ehufsi[size-16];
    psllq       mm_code, mm_nbits                       ;   code <<= nbits;
    pand        mm_temp, [MASK_BITS(nbits)]             ;   temp &= (1 << nbits) - 1;
    lea         nbits, [nbits + size]                   ;   nbits += size;
    por         mm_code, mm_temp                        ;   code |= temp;
    xor         size, size                              ;   size = 0;  /* kill tzcnt input dependency */
    sub         free_bits, nbits                        ;   if ((free_bits -= nbits) <= 0)
    jle         .EMIT_ERLOOP2                           ;     insert code, flush buffer, init size, goto .BLOOP2
    tzcnt       size, index                             ;   size = # of trailing 0 bits in index
    movd        mm_nbits, nbits                         ;   nbits --> MMX register
    psllq       mm_put_buffer, mm_nbits                 ;   put_buffer <<= nbits;
    inc         size                                    ;   ++size;
    por         mm_put_buffer, mm_code                  ;   put_buffer |= code;
    test        index, index
    jnz         .BLOOP2                                 ; } while (index != 0);
.ELOOP2:                                                ; .ELOOP2:
    mov         nbits, t                                ; nbits = t;
    lea         t, [t + SIZEOF_WORD]                    ; t = &t[1];
    and         nbits, DCTSIZE2 * SIZEOF_WORD - 1       ; nbits &= 127;
    and         t, -DCTSIZE2 * SIZEOF_WORD              ; t &= -128;  /* t = &t_[0]; */
    cmp         nbits, (DCTSIZE2 - 2) * SIZEOF_WORD     ; if (nbits != 62 * 2)
    je          .EFN                                    ; {
    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + 0]
                                                        ;   code = actbl->ehufco[0];
    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
                                                        ;   nbits = actbl->ehufsi[0];
    sub         free_bits, nbits                        ;   if ((free_bits -= nbits) <= 0)
    jg          .EFN_SKIP_EMIT_CODE                     ;   {
    EMIT_QWORD  size, sizeb, sizeh, , , , , , .EFN      ;     insert code, flush put_buffer
    align       16
.EFN_SKIP_EMIT_CODE:                                    ;   } else {
    movd        mm_nbits, nbits                         ;     nbits --> MMX register
    psllq       mm_put_buffer, mm_nbits                 ;     put_buffer <<= nbits;
    por         mm_put_buffer, mm_code                  ;     put_buffer |= code;
.EFN:                                                   ; } }
; Epilogue: t has been reset to &t_[0], so the original esp can be reloaded
; from the slot directly above t_.  eax (buffer) is the return value.
%define frame  esp
    mov         frame, [t + save_frame]
%define state  ecx
    mov         state, [frame + arg_state]
    movq        [state + working_state.cur.put_buffer.simd], mm_put_buffer
                                                        ; state->cur.put_buffer.simd = put_buffer;
    emms
    mov         [state + working_state.cur.free_bits], free_bits
                                                        ; state->cur.free_bits = free_bits;
    POP         edi
    POP         esi
    POP         ebp
    POP         ebx
    ret

; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    align       16
.EMIT_BRLOOP1:
    EMIT_QWORD  emit_temp, emit_tempb, emit_temph, , , , , , \
      .ERLOOP1

; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    align       16
.EMIT_ERLOOP1:
    EMIT_QWORD  size, sizeb, sizeh, \
      { xor     size, size }, \
      { tzcnt   size, index }, \
      { inc     size }, \
      { test    index, index }, \
      { jnz     .BLOOP1 }, \
      .ELOOP1

; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    align       16
.EMIT_BRLOOP2:
    EMIT_QWORD  emit_temp, emit_tempb, emit_temph, , , , \
      { cmp     size, 16 }, \
      { jle     .ERLOOP2 }, \
      .BRLOOP2

; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    align       16
.EMIT_ERLOOP2:
    EMIT_QWORD  size, sizeb, sizeh, \
      { xor     size, size }, \
      { tzcnt   size, index }, \
      { inc     size }, \
      { test    index, index }, \
      { jnz     .BLOOP2 }, \
      .ELOOP2

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32