tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jchuff-sse2.asm (39232B)


      1 ;
      2 ; jchuff-sse2.asm - Huffman entropy encoding (SSE2)
      3 ;
      4 ; Copyright (C) 2009-2011, 2014-2017, 2019, 2024, D. R. Commander.
      5 ; Copyright (C) 2015, Matthieu Darbois.
      6 ; Copyright (C) 2018, Matthias Räncker.
      7 ;
      8 ; Based on the x86 SIMD extension for IJG JPEG library
      9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     11 ;
     12 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     13 ;
     14 ; This file contains an SSE2 implementation for Huffman coding of one block.
     15 ; The following code is based on jchuff.c; see jchuff.c for more details.
     16 
     17 %include "jsimdext.inc"
     18 
     19 struc working_state
        ; Field offsets for the working_state pointer that the encoder receives as
        ; its first argument.  The code in this file reads .cur.put_buffer.simd and
        ; .cur.free_bits through these names.
        ; NOTE(review): resp is not defined in this file; presumably it comes from
        ; jsimdext.inc and reserves one pointer-sized slot -- confirm.
     20 .next_output_byte:   resp 1     ; => next byte to write in buffer
     21 .free_in_buffer:     resp 1     ; # of byte spaces remaining in buffer
     22 .cur.put_buffer.simd resq 1     ; current bit accumulation buffer
     23 .cur.free_bits       resd 1     ; # of bits available in it
     24 .cur.last_dc_val     resd 4     ; last DC coef for each component
     25 .cinfo:              resp 1     ; dump_buffer needs access to this
     26 endstruc
     27 
     28 struc c_derived_tbl
        ; Layout of a derived Huffman table: 256 dword codes followed by 256 byte
        ; code lengths, both indexed by symbol.  The encoder addresses them as
        ; [tbl + c_derived_tbl.ehufco + sym * 4] and [tbl + c_derived_tbl.ehufsi + sym].
     29 .ehufco:             resd 256   ; code for each symbol
     30 .ehufsi:             resb 256   ; length of code for each symbol
     31 ; If no code has been allocated for a symbol S, ehufsi[S] contains 0
     32 endstruc
     33 
     34 ; --------------------------------------------------------------------------
     35    SECTION     SEG_CONST
        ; Constant data for the Huffman encoder.  Three tables are laid out
        ; back-to-back so that all of them can be reached from one base address
        ; (the jpeg_nbits_table label):
        ;   jpeg_mask_bits            16 qwords
        ;   mirrored nbits table      32768 bytes (negative offsets from the label)
        ;   jpeg_nbits_table          32768 bytes (offsets 0..32767)
     36 
     37    GLOBAL_DATA(jconst_huff_encode_one_block)
     38 
     39 EXTN(jconst_huff_encode_one_block):
     40 
     41    ALIGNZ      32
     42 
        ; jpeg_mask_bits[n] = (1 << n) - 1 for n = 0..15, one qword per entry.
     43 jpeg_mask_bits dq 0x0000, 0x0001, 0x0003, 0x0007
     44               dq 0x000f, 0x001f, 0x003f, 0x007f
     45               dq 0x00ff, 0x01ff, 0x03ff, 0x07ff
     46               dq 0x0fff, 0x1fff, 0x3fff, 0x7fff
     47 
        ; Mirror image of jpeg_nbits_table, ending immediately before its label.
        ; The byte at offset (-1 - k) from the label equals the byte at offset k,
        ; so the table can be indexed directly with a signed value: the encoder
        ; decrements negative values by one before the lookup (see the cmp/adc
        ; pair on code_temp), which maps -v to offset (-1 - v).
     48 times 1 << 14 db 15
     49 times 1 << 13 db 14
     50 times 1 << 12 db 13
     51 times 1 << 11 db 12
     52 times 1 << 10 db 11
     53 times 1 <<  9 db 10
     54 times 1 <<  8 db  9
     55 times 1 <<  7 db  8
     56 times 1 <<  6 db  7
     57 times 1 <<  5 db  6
     58 times 1 <<  4 db  5
     59 times 1 <<  3 db  4
     60 times 1 <<  2 db  3
     61 times 1 <<  1 db  2
     62 times 1 <<  0 db  1
     63 times 1       db  0
     64 GLOBAL_DATA(jpeg_nbits_table)
     65 EXTN(jpeg_nbits_table):
        ; jpeg_nbits_table[k], k = 0..32767: number of significant bits in k
        ; (0 for k = 0, n for 2^(n-1) <= k < 2^n).
     66 times 1       db  0
     67 times 1 <<  0 db  1
     68 times 1 <<  1 db  2
     69 times 1 <<  2 db  3
     70 times 1 <<  3 db  4
     71 times 1 <<  4 db  5
     72 times 1 <<  5 db  6
     73 times 1 <<  6 db  7
     74 times 1 <<  7 db  8
     75 times 1 <<  8 db  9
     76 times 1 <<  9 db 10
     77 times 1 << 10 db 11
     78 times 1 << 11 db 12
     79 times 1 << 12 db 13
     80 times 1 << 13 db 14
     81 times 1 << 14 db 15
     82 
     83    ALIGNZ      32
     84 
        ; NBITS(x): address expression for the nbits entry at signed offset x.
        ; In PIC builds the table address is held in the nbits_base register
        ; (loaded by GET_SYM); otherwise the absolute symbol is used.
        ; Note that x is pasted in without parentheses, so pass a single term.
     85 %ifdef PIC
     86 %define NBITS(x)      nbits_base + x
     87 %else
     88 %define NBITS(x)      EXTN(jpeg_nbits_table) + x
     89 %endif
        ; MASK_BITS(x): address expression for qword jpeg_mask_bits[x], reached
        ; from the same base by adding the (negative) distance between the two
        ; labels; x is scaled by 8 because each mask entry is a qword.
     90 %define MASK_BITS(x)  NBITS((x) * 8) + (jpeg_mask_bits - EXTN(jpeg_nbits_table))
     91 
     92 ; --------------------------------------------------------------------------
     93    SECTION     SEG_TEXT
     94    BITS        32
     95 
     96 %define mm_put_buffer     mm0
        ; Symbolic names for the MMX registers used by the bit-buffer code.
        ; mm_nbits and mm_code_bits deliberately share mm3: in EMIT_QWORD the
        ; shift count in mm_nbits is consumed before mm_code_bits is written.
     97 %define mm_all_0xff       mm1
     98 %define mm_temp           mm2
     99 %define mm_nbits          mm3
    100 %define mm_code_bits      mm3
    101 %define mm_code           mm4
    102 %define mm_overflow_bits  mm5
    103 %define mm_save_nbits     mm6
    104 
    105 ; Shorthand used to describe SIMD operations:
    106 ; wN:  xmmN treated as eight signed 16-bit values
    107 ; wN[i]:  perform the same operation on all eight signed 16-bit values, i=0..7
    108 ; bN:  xmmN treated as 16 unsigned 8-bit values, or
    109 ;      mmN treated as eight unsigned 8-bit values
    110 ; bN[i]:  perform the same operation on all unsigned 8-bit values,
    111 ;         i=0..15 (SSE register) or i=0..7 (MMX register)
    112 ; Contents of SIMD registers are shown in memory order.
    113 
    114 ; Fill the bit buffer to capacity with the leading bits from code, then output
    115 ; the bit buffer and put the remaining bits from code into the bit buffer.
    116 ;
    117 ; Usage:
    118 ; code - contains the bits to shift into the bit buffer (LSB-aligned)
    119 ; %1 - temp register
    120 ; %2 - low byte of temp register
    121 ; %3 - second byte of temp register
    122 ; %4-%8 (may be empty) - extra instructions to execute before the macro completes
    123 ; %9 - the label to which to jump when the macro completes
    124 ;
    125 ; Upon completion, free_bits will hold its value on entry plus 64, i.e. 64
    126 ; minus the number of bits from code that did not fit, and put_buffer will
    127 ; hold code (whose low bits are those leftover bits).  temp, code and nbits
        ; will be clobbered, as will the MMX scratch registers mm_temp,
        ; mm_nbits/mm_code_bits and mm_overflow_bits (plus mm_save_nbits when temp
        ; is nbits_base).  buffer is advanced past everything written: 8 bytes on
        ; the fast path, 8 bytes plus one per 0xFF byte on the slow path.
        ;
        ; On entry, free_bits is expected to be <= 0, its magnitude being the
        ; number of bits of code that overflow the bit buffer (presumably the
        ; result of the "sub free_bits, nbits" / "jle" sequence at the call
        ; sites -- the invocations are outside this part of the file).
    128 ;
    129 ; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
    130 ; macro in jchuff.c.
    131 
    132 %macro EMIT_QWORD 9
    133 %define %%temp   %1
    134 %define %%tempb  %2
    135 %define %%temph  %3
    136    add         nbits, free_bits             ; nbits += free_bits;
    137    neg         free_bits                    ; free_bits = -free_bits;
    138    movq        mm_temp, mm_code             ; temp = code;
    139    movd        mm_nbits, nbits              ; nbits --> MMX register
    140    movd        mm_overflow_bits, free_bits  ; overflow_bits (temp register) = free_bits;
    141    neg         free_bits                    ; free_bits = -free_bits;
    142    psllq       mm_put_buffer, mm_nbits      ; put_buffer <<= nbits;
    143    psrlq       mm_temp, mm_overflow_bits    ; temp >>= overflow_bits;
    144    add         free_bits, 64                ; free_bits += 64;
    145    por         mm_temp, mm_put_buffer       ; temp |= put_buffer;
           ; When the caller passes nbits_base as the temp register (PIC builds,
           ; where emit_temp and nbits_base are both ebx), park the table base in
           ; an MMX register while the general-purpose register is used as temp.
    146 %ifidn %%temp, nbits_base
    147    movd        mm_save_nbits, nbits_base    ; save nbits_base
    148 %endif
    149    movq        mm_code_bits, mm_temp        ; code_bits (temp register) = temp;
    150    movq        mm_put_buffer, mm_code       ; put_buffer = code;
    151    pcmpeqb     mm_temp, mm_all_0xff         ; b_temp[i] = (b_temp[i] == 0xFF ? 0xFF : 0);
    152    movq        mm_code, mm_code_bits        ; code = code_bits;
    153    psrlq       mm_code_bits, 32             ; code_bits >>= 32;
    154    pmovmskb    nbits, mm_temp               ; nbits = 0;  nbits |= ((b_temp[i] >> 7) << i);
    155    movd        %%temp, mm_code_bits         ; temp = code_bits;
    156    bswap       %%temp                       ; temp = htonl(temp);
    157    test        nbits, nbits                 ; if (nbits != 0)  /* Some 0xFF bytes */
    158    jnz         %%.SLOW                      ;   goto %%.SLOW
           ; Fast path: no byte of the qword is 0xFF, so the upper dword (already
           ; byte-swapped) and then the lower dword are stored as two 32-bit words.
    159    mov         dword [buffer], %%temp       ; *(uint32_t *)buffer = temp;
    160 %ifidn %%temp, nbits_base
    161    movd        nbits_base, mm_save_nbits    ; restore nbits_base
    162 %endif
    163    %4
    164    movd        nbits, mm_code               ; nbits = (uint32_t)(code);
    165    %5
    166    bswap       nbits                        ; nbits = htonl(nbits);
    167    mov         dword [buffer + 4], nbits    ; *(uint32_t *)(buffer + 4) = nbits;
    168    lea         buffer, [buffer + 8]         ; buffer += 8;
    169    %6
    170    %7
    171    %8
    172    jmp %9                                   ; return
    173 %%.SLOW:
    174    ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
    175    ; bytes in the qword.
           ; Each step stores the byte followed by a zero, then advances buffer by
           ; 2 - CF, where CF is set when the byte is below 0xFF.  The pointer thus
           ; moves by one for an ordinary byte (the zero is overwritten by the next
           ; store) and by two for 0xFF (the stuffed zero is kept).
    176    mov         byte [buffer], %%tempb     ; buffer[0] = temp[0];
    177    cmp         %%tempb, 0xFF              ; Set CF if temp[0] < 0xFF
    178    mov         byte [buffer+1], 0         ; buffer[1] = 0;
    179    sbb         buffer, -2                 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
    180    mov         byte [buffer], %%temph     ; buffer[0] = temp[1];
    181    cmp         %%temph, 0xFF              ; Set CF if temp[1] < 0xFF
    182    mov         byte [buffer+1], 0         ; buffer[1] = 0;
    183    sbb         buffer, -2                 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
    184    shr         %%temp, 16                 ; temp >>= 16;
    185    mov         byte [buffer], %%tempb     ; buffer[0] = temp[0];
    186    cmp         %%tempb, 0xFF              ; Set CF if temp[0] < 0xFF
    187    mov         byte [buffer+1], 0         ; buffer[1] = 0;
    188    sbb         buffer, -2                 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
    189    mov         byte [buffer], %%temph     ; buffer[0] = temp[1];
    190    cmp         %%temph, 0xFF              ; Set CF if temp[1] < 0xFF
    191    mov         byte [buffer+1], 0         ; buffer[1] = 0;
    192    sbb         buffer, -2                 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
    193    movd        nbits, mm_code             ; nbits (temp register) = (uint32_t)(code)
    194 %ifidn %%temp, nbits_base
    195    movd        nbits_base, mm_save_nbits  ; restore nbits_base
    196 %endif
    197    bswap       nbits                      ; nbits = htonl(nbits)
    198    mov         byte [buffer], nbitsb      ; buffer[0] = nbits[0];
    199    cmp         nbitsb, 0xFF               ; Set CF if nbits[0] < 0xFF
    200    mov         byte [buffer+1], 0         ; buffer[1] = 0;
    201    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
    202    mov         byte [buffer], nbitsh      ; buffer[0] = nbits[1];
    203    cmp         nbitsh, 0xFF               ; Set CF if nbits[1] < 0xFF
    204    mov         byte [buffer+1], 0         ; buffer[1] = 0;
    205    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
    206    shr         nbits, 16                  ; nbits >>= 16;
    207    mov         byte [buffer], nbitsb      ; buffer[0] = nbits[0];
    208    cmp         nbitsb, 0xFF               ; Set CF if nbits[0] < 0xFF
    209    mov         byte [buffer+1], 0         ; buffer[1] = 0;
    210    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
    211    mov         byte [buffer], nbitsh      ; buffer[0] = nbits[1];
    212    %4
    213    cmp         nbitsh, 0xFF               ; Set CF if nbits[1] < 0xFF
    214    mov         byte [buffer+1], 0         ; buffer[1] = 0;
    215    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
    216    %5
    217    %6
    218    %7
    219    %8
    220    jmp %9                                 ; return;
    221 %endmacro
    222 
    223 %macro PUSH 1
        ; push %1 and record the 4 bytes in stack_offset, so that the arg_*
        ; offsets (defined as N + stack_offset) keep addressing the function
        ; arguments relative to the moved stack pointer.
    224    push        %1
    225 %assign stack_offset  stack_offset + 4
    226 %endmacro
    227 
    228 %macro POP 1
        ; pop %1 and take the 4 bytes back out of stack_offset (counterpart of
        ; PUSH), keeping the assemble-time offset in step with the stack pointer.
    229    pop         %1
    230 %assign stack_offset  stack_offset - 4
    231 %endmacro
    232 
    233 ; If PIC is defined, load the address of a symbol defined in this file into a
    234 ; register.  Equivalent to
    235 ;   GET_GOT     %1
    236 ;   lea         %1, [GOTOFF(%1, %2)]
    237 ; without using the GOT.
    238 ;
    239 ; Usage:
    240 ; %1 - register into which to load the address of the symbol
    241 ; %2 - symbol whose address should be loaded
    242 ; %3 - optional multi-line macro to execute before the symbol address is loaded
        ;      It is invoked with one argument: the number of bytes by which esp is
        ;      temporarily lowered while it runs (4 in the PIC case, because it
        ;      executes inside the call; 0 otherwise).
    243 ; %4 - optional multi-line macro to execute after the symbol address is loaded
        ;      In the PIC case it runs after the return address has been fetched
        ;      into %1 but before the symbol offset is added, so it must leave %1
        ;      untouched.
    244 ;
    245 ; If PIC is not defined, then %3 and %4 are executed in order.
        ; In that case %1 is not written at all.
    246 
    247 %macro GET_SYM 2-4
    248 %ifdef PIC
           ; call pushes the address of %%.ref; %%.geteip copies it from the stack
           ; into %1 and returns to %%.ref, where the assemble-time distance
           ; (%2 - %%.ref) is added to obtain the run-time address of %2.
    249    call        %%.geteip
    250 %%.ref:
    251    %4
    252    add         %1, %2 - %%.ref
    253    jmp         short %%.done
    254    align       32
    255 %%.geteip:
    256    %3          4               ; must adjust stack pointer because of call
    257    mov         %1, POINTER [esp]
    258    ret
    259    align       32
    260 %%.done:
    261 %else
    262    %3          0
    263    %4
    264 %endif
    265 %endmacro
    266 
    267 ;
    268 ; Encode a single block's worth of coefficients.
    269 ;
    270 ; GLOBAL(JOCTET *)
    271 ; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
    272 ;                                  JCOEFPTR block, int last_dc_val,
    273 ;                                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
    274 ;
    275 ; Stack layout:
    276 ; Function args
    277 ; Return address
    278 ; Saved ebx
    279 ; Saved ebp
    280 ; Saved esi
    281 ; Saved edi <-- esp_save
    282 ; ...
    283 ; esp_save
    284 ; t_ 64*2 bytes (aligned to 128 bytes)
    285 ;
    286 ; esp is used (as t) to point into t_ (data in lower indices is not used once
    287 ; esp passes over them, so this is signal-safe.)  Aligning to 128 bytes allows
    288 ; us to find the rest of the data again.
    289 ;
    290 ; NOTES:
    291 ; When shuffling data, we try to avoid pinsrw as much as possible, since it is
    292 ; slow on many CPUs.  Its reciprocal throughput (issue latency) is 1 even on
    293 ; modern CPUs, so chains of pinsrw instructions (even with different outputs)
    294 ; can limit performance.  pinsrw is a VectorPath instruction on AMD K8 and
    295 ; requires 2 µops (with memory operand) on Intel.  In either case, only one
    296 ; pinsrw instruction can be decoded per cycle (and nothing else if they are
    297 ; back-to-back), so out-of-order execution cannot be used to work around long
    298 ; pinsrw chains (though for Sandy Bridge and later, this may be less of a
    299 ; problem if the code runs from the µop cache.)
    300 ;
    301 ; We use tzcnt instead of bsf without checking for support.  The instruction is
    302 ; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
    303 ; rep bsf.)  The destination (first) operand of bsf (and tzcnt on some CPUs) is
    304 ; an input dependency (although the behavior is not formally defined, Intel
    305 ; CPUs usually leave the destination unmodified if the source is zero.)  This
    306 ; can prevent out-of-order execution, so we clear the destination before
    307 ; invoking tzcnt.
    308 ;
    309 ; Initial register allocation
    310 ; eax - frame --> buffer
    311 ; ebx - nbits_base (PIC) / emit_temp
    312 ; ecx - dctbl --> size --> state
    313 ; edx - block --> nbits
    314 ; esi - code_temp --> state --> actbl
    315 ; edi - index_temp --> free_bits
    316 ; esp - t
    317 ; ebp - index
    318 
    319 %define frame       eax
        ; Symbolic register names for the first phase of the encoder; several are
        ; %undef'd and rebound to new roles further down as values die.
    320 %ifdef PIC
        ; Base address used by NBITS()/MASK_BITS().  It shares ebx with emit_temp,
        ; which is why EMIT_QWORD saves and restores it when given it as temp.
    321 %define nbits_base  ebx
    322 %endif
    323 %define emit_temp   ebx
    324 %define emit_tempb  bl
    325 %define emit_temph  bh
    326 %define dctbl       ecx
    327 %define block       edx
    328 %define code_temp   esi
    329 %define index_temp  edi
    330 %define t           esp
    331 %define index       ebp
    332 
        ; Offset from the aligned t_ buffer at which the original stack pointer
        ; (frame) is stored: directly above the DCTSIZE2 words of t_.
    333 %assign save_frame  DCTSIZE2 * SIZEOF_WORD
    334 
    335 ; Step 1: Re-arrange input data according to jpeg_natural_order
    336 ; xx 01 02 03 04 05 06 07      xx 01 08 16 09 02 03 10
    337 ; 08 09 10 11 12 13 14 15      17 24 32 25 18 11 04 05
    338 ; 16 17 18 19 20 21 22 23      12 19 26 33 40 48 41 34
    339 ; 24 25 26 27 28 29 30 31 ==>  27 20 13 06 07 14 21 28
    340 ; 32 33 34 35 36 37 38 39      35 42 49 56 57 50 43 36
    341 ; 40 41 42 43 44 45 46 47      29 22 15 23 30 37 44 51
    342 ; 48 49 50 51 52 53 54 55      58 59 52 45 38 31 39 46
    343 ; 56 57 58 59 60 61 62 63      53 60 61 54 47 55 62 63
    344 
    345    align       32
    346    GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
    347 
    348 EXTN(jsimd_huff_encode_one_block_sse2):
    349 
    350 %assign stack_offset      0
    351 %define arg_state         4 + stack_offset
    352 %define arg_buffer        8 + stack_offset
    353 %define arg_block        12 + stack_offset
    354 %define arg_last_dc_val  16 + stack_offset
    355 %define arg_dctbl        20 + stack_offset
    356 %define arg_actbl        24 + stack_offset
    357 
    358                                                          ;X: X = code stream
    359    mov         block, [esp + arg_block]
    360    PUSH        ebx
    361    PUSH        ebp
    362    movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]   ;D: w3 = xx 01 02 03 04 05 06 07
    363    PUSH        esi
    364    PUSH        edi
    365    movdqa      xmm0, xmm3                                ;A: w0 = xx 01 02 03 04 05 06 07
    366    mov         frame, esp
    367    lea         t, [frame - (save_frame + 4)]
    368    movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD]   ;B: w1 = 08 09 10 11 12 13 14 15
    369    and         t, -DCTSIZE2 * SIZEOF_WORD                                             ; t = &t_[0]
    370    mov         [t + save_frame], frame
    371    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
    372    punpckldq   xmm0, xmm1                                ;A: w0 = xx 01 08 09 02 03 10 11
    373    pshuflw     xmm0, xmm0, 11001001b                     ;A: w0 = 01 08 xx 09 02 03 10 11
    374    pinsrw      xmm0, word [block + 16 * SIZEOF_WORD], 2  ;A: w0 = 01 08 16 09 02 03 10 11
    375    punpckhdq   xmm3, xmm1                                ;D: w3 = 04 05 12 13 06 07 14 15
    376    punpcklqdq  xmm1, xmm3                                ;B: w1 = 08 09 10 11 04 05 12 13
    377    pinsrw      xmm0, word [block + 17 * SIZEOF_WORD], 7  ;A: w0 = 01 08 16 09 02 03 10 17
    378                                                          ;A:      (Row 0, offset 1)
    379    pcmpgtw     xmm4, xmm0                                ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
    380    paddw       xmm0, xmm4                                ;A: w0[i] += w4[i];
    381    movaps      XMMWORD [t + 0 * SIZEOF_WORD], xmm0       ;A: t[i] = w0[i];
    382 
    383    movq        xmm2, qword [block + 24 * SIZEOF_WORD]    ;B: w2 = 24 25 26 27 -- -- -- --
    384    pshuflw     xmm2, xmm2, 11011000b                     ;B: w2 = 24 26 25 27 -- -- -- --
    385    pslldq      xmm1, 1 * SIZEOF_WORD                     ;B: w1 = -- 08 09 10 11 04 05 12
    386    movups      xmm5, XMMWORD [block + 48 * SIZEOF_WORD]  ;H: w5 = 48 49 50 51 52 53 54 55
    387    movsd       xmm1, xmm2                                ;B: w1 = 24 26 25 27 11 04 05 12
    388    punpcklqdq  xmm2, xmm5                                ;C: w2 = 24 26 25 27 48 49 50 51
    389    pinsrw      xmm1, word [block + 32 * SIZEOF_WORD], 1  ;B: w1 = 24 32 25 27 11 04 05 12
    390    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
    391    psrldq      xmm3, 2 * SIZEOF_WORD                     ;D: w3 = 12 13 06 07 14 15 -- --
    392    pcmpeqw     xmm0, xmm4                                ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
    393    pinsrw      xmm1, word [block + 18 * SIZEOF_WORD], 3  ;B: w1 = 24 32 25 18 11 04 05 12
    394                                                          ;        (Row 1, offset 1)
    395    pcmpgtw     xmm4, xmm1                                ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
    396    paddw       xmm1, xmm4                                ;B: w1[i] += w4[i];
    397    movaps      XMMWORD [t + 8 * SIZEOF_WORD], xmm1       ;B: t[i+8] = w1[i];
    398    pxor        xmm4, xmm4                                ;B: w4[i] = 0;
    399    pcmpeqw     xmm1, xmm4                                ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
    400 
    401    packsswb    xmm0, xmm1                                ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
    402                                                          ;    w/ signed saturation
    403 
    404    pinsrw      xmm3, word [block + 20 * SIZEOF_WORD], 0  ;D: w3 = 20 13 06 07 14 15 -- --
    405    pinsrw      xmm3, word [block + 21 * SIZEOF_WORD], 5  ;D: w3 = 20 13 06 07 14 21 -- --
    406    pinsrw      xmm3, word [block + 28 * SIZEOF_WORD], 6  ;D: w3 = 20 13 06 07 14 21 28 --
    407    pinsrw      xmm3, word [block + 35 * SIZEOF_WORD], 7  ;D: w3 = 20 13 06 07 14 21 28 35
    408                                                          ;        (Row 3, offset 1)
    409    pcmpgtw     xmm4, xmm3                                ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
    410    paddw       xmm3, xmm4                                ;D: w3[i] += w4[i];
    411    movaps      XMMWORD [t + 24 * SIZEOF_WORD], xmm3      ;D: t[i+24] = w3[i];
    412    pxor        xmm4, xmm4                                ;D: w4[i] = 0;
    413    pcmpeqw     xmm3, xmm4                                ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
    414 
    415    pinsrw      xmm2, word [block + 19 * SIZEOF_WORD], 0  ;C: w2 = 19 26 25 27 48 49 50 51
    416    pinsrw      xmm2, word [block + 33 * SIZEOF_WORD], 2  ;C: w2 = 19 26 33 27 48 49 50 51
    417    pinsrw      xmm2, word [block + 40 * SIZEOF_WORD], 3  ;C: w2 = 19 26 33 40 48 49 50 51
    418    pinsrw      xmm2, word [block + 41 * SIZEOF_WORD], 5  ;C: w2 = 19 26 33 40 48 41 50 51
    419    pinsrw      xmm2, word [block + 34 * SIZEOF_WORD], 6  ;C: w2 = 19 26 33 40 48 41 34 51
    420    pinsrw      xmm2, word [block + 27 * SIZEOF_WORD], 7  ;C: w2 = 19 26 33 40 48 41 34 27
    421                                                          ;        (Row 2, offset 1)
    422    pcmpgtw     xmm4, xmm2                                ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
    423    paddw       xmm2, xmm4                                ;C: w2[i] += w4[i];
    424    movsx       code_temp, word [block]                   ;Z:     code_temp = block[0];
    425 
    426 ; %1 - stack pointer adjustment
        ; Slice of the encoder's instruction stream that is handed to GET_SYM as
        ; its "before" macro.  Because t is esp, and in PIC builds this runs while
        ; GET_SYM's return address is still on the stack, the store into t_ adds %1
        ; to compensate.  It stores row C, computes the DC difference in code_temp,
        ; and builds in index a 32-bit mask for rows A-D in which a set bit marks a
        ; zero coefficient.  On expansion it also retires the index_temp alias and
        ; rebinds edi as free_bits.
    427 %macro GET_SYM_BEFORE 1
    428    movaps      XMMWORD [t + 16 * SIZEOF_WORD + %1], xmm2
    429                                                          ;C: t[i+16] = w2[i];
    430    pxor        xmm4, xmm4                                ;C: w4[i] = 0;
    431    pcmpeqw     xmm2, xmm4                                ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
    432    sub         code_temp, [frame + arg_last_dc_val]      ;Z:     code_temp -= last_dc_val;
    433 
    434    packsswb    xmm2, xmm3                                ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
    435                                                          ;    w/ signed saturation
    436 
    437    movdqa      xmm3, xmm5                                ;H: w3 = 48 49 50 51 52 53 54 55
    438    pmovmskb    index_temp, xmm2                          ;Z:     index_temp = 0;  index_temp |= ((b2[i] >> 7) << i);
    439    pmovmskb    index, xmm0                               ;Z:     index = 0;  index |= ((b0[i] >> 7) << i);
    440    movups      xmm0, XMMWORD [block + 56 * SIZEOF_WORD]  ;H: w0 = 56 57 58 59 60 61 62 63
    441    punpckhdq   xmm3, xmm0                                ;H: w3 = 52 53 60 61 54 55 62 63
    442    shl         index_temp, 16                            ;Z:     index_temp <<= 16;
    443    psrldq      xmm3, 1 * SIZEOF_WORD                     ;H: w3 = 53 60 61 54 55 62 63 --
    444    pxor        xmm2, xmm2                                ;H: w2[i] = 0;
    445    pshuflw     xmm3, xmm3, 00111001b                     ;H: w3 = 60 61 54 53 55 62 63 --
    446    or          index, index_temp                         ;Z:     index |= index_temp;
    447 %undef index_temp
    448 %define free_bits  edi
    449 %endmacro
    450 
    451 %macro GET_SYM_AFTER 0
        ; Slice of the encoder's instruction stream that is handed to GET_SYM as
        ; its "after" macro.  In PIC builds it runs between fetching the return
        ; address into nbits_base and adding the symbol offset, so it does not
        ; touch that register.  It inverts index (a set bit now marks a non-zero
        ; coefficient), loads dctbl, stores row H, starts assembling rows E-G and
        ; sets every byte of mm_all_0xff to 0xFF for EMIT_QWORD.
    452    movq        xmm1, qword [block + 44 * SIZEOF_WORD]    ;G: w1 = 44 45 46 47 -- -- -- --
    453    unpcklps    xmm5, xmm0                                ;E: w5 = 48 49 56 57 50 51 58 59
    454    pxor        xmm0, xmm0                                ;H: w0[i] = 0;
    455    not         index                                     ;Z:     index = ~index;
    456    pinsrw      xmm3, word [block + 47 * SIZEOF_WORD], 3  ;H: w3 = 60 61 54 47 55 62 63 --
    457                                                          ;        (Row 7, offset 1)
    458    pcmpgtw     xmm2, xmm3                                ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
    459    mov         dctbl, [frame + arg_dctbl]
    460    paddw       xmm3, xmm2                                ;H: w3[i] += w2[i];
    461    movaps      XMMWORD [t + 56 * SIZEOF_WORD], xmm3      ;H: t[i+56] = w3[i];
    462    movq        xmm4, qword [block + 36 * SIZEOF_WORD]    ;G: w4 = 36 37 38 39 -- -- -- --
    463    pcmpeqw     xmm3, xmm0                                ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
    464    punpckldq   xmm4, xmm1                                ;G: w4 = 36 37 44 45 38 39 46 47
    465    movdqa      xmm1, xmm4                                ;F: w1 = 36 37 44 45 38 39 46 47
    466    pcmpeqw     mm_all_0xff, mm_all_0xff                  ;Z:     all_0xff[i] = 0xFF;
    467 %endmacro
    468 
    469    GET_SYM     nbits_base, EXTN(jpeg_nbits_table), GET_SYM_BEFORE, GET_SYM_AFTER
    470 
    471    psrldq      xmm4, 1 * SIZEOF_WORD                     ;G: w4 = 37 44 45 38 39 46 47 --
    472    shufpd      xmm1, xmm5, 10b                           ;F: w1 = 36 37 44 45 50 51 58 59
    473    pshufhw     xmm4, xmm4, 11010011b                     ;G: w4 = 37 44 45 38 -- 39 46 --
    474    pslldq      xmm1, 1 * SIZEOF_WORD                     ;F: w1 = -- 36 37 44 45 50 51 58
    475    pinsrw      xmm4, word [block + 59 * SIZEOF_WORD], 0  ;G: w4 = 59 44 45 38 -- 39 46 --
    476    pshufd      xmm1, xmm1, 11011000b                     ;F: w1 = -- 36 45 50 37 44 51 58
    477    cmp         code_temp, 1 << 31                        ;Z:     Set CF if code_temp < 0x80000000,
    478                                                          ;Z:     i.e. if code_temp is positive
    479    pinsrw      xmm4, word [block + 52 * SIZEOF_WORD], 1  ;G: w4 = 59 52 45 38 -- 39 46 --
    480    movlps      xmm1, qword [block + 20 * SIZEOF_WORD]    ;F: w1 = 20 21 22 23 37 44 51 58
    481    pinsrw      xmm4, word [block + 31 * SIZEOF_WORD], 4  ;G: w4 = 59 52 45 38 31 39 46 --
    482    pshuflw     xmm1, xmm1, 01110010b                     ;F: w1 = 22 20 23 21 37 44 51 58
    483    pinsrw      xmm4, word [block + 53 * SIZEOF_WORD], 7  ;G: w4 = 59 52 45 38 31 39 46 53
    484                                                          ;        (Row 6, offset 1)
    485    adc         code_temp, -1                             ;Z:     code_temp += -1 + (code_temp >= 0 ? 1 : 0);
    486    pxor        xmm2, xmm2                                ;G: w2[i] = 0;
    487    pcmpgtw     xmm0, xmm4                                ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
    488    pinsrw      xmm1, word [block + 15 * SIZEOF_WORD], 1  ;F: w1 = 22 15 23 21 37 44 51 58
    489    paddw       xmm4, xmm0                                ;G: w4[i] += w0[i];
    490    movaps      XMMWORD [t + 48 * SIZEOF_WORD], xmm4      ;G: t[48+i] = w4[i];
    491    movd        mm_temp, code_temp                        ;Z:     temp = code_temp
    492    pinsrw      xmm1, word [block + 30 * SIZEOF_WORD], 3  ;F: w1 = 22 15 23 30 37 44 51 58
    493                                                          ;        (Row 5, offset 1)
    494    pcmpeqw     xmm4, xmm2                                ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
    495 
    496    packsswb    xmm4, xmm3                                ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
    497                                                          ;    w/ signed saturation
    498 
    499    lea         t, [t - SIZEOF_WORD]                      ;Z:     t = &t[-1]
    500    pxor        xmm0, xmm0                                ;F: w0[i] = 0;
    501    pcmpgtw     xmm2, xmm1                                ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
    502    paddw       xmm1, xmm2                                ;F: w1[i] += w2[i];
    503    movaps      XMMWORD [t + (40+1) * SIZEOF_WORD], xmm1  ;F: t[40+i] = w1[i];
    504    pcmpeqw     xmm1, xmm0                                ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
    505    pinsrw      xmm5, word [block + 42 * SIZEOF_WORD], 0  ;E: w5 = 42 49 56 57 50 51 58 59
    506    pinsrw      xmm5, word [block + 43 * SIZEOF_WORD], 5  ;E: w5 = 42 49 56 57 50 43 58 59
    507    pinsrw      xmm5, word [block + 36 * SIZEOF_WORD], 6  ;E: w5 = 42 49 56 57 50 43 36 59
    508    pinsrw      xmm5, word [block + 29 * SIZEOF_WORD], 7  ;E: w5 = 42 49 56 57 50 43 36 29
    509                                                          ;        (Row 4, offset 1)
    510 %undef block
    511 %define nbits  edx
    512 %define nbitsb  dl
    513 %define nbitsh  dh
    514    movzx       nbits, byte [NBITS(code_temp)]            ;Z:     nbits = JPEG_NBITS(code_temp);
    515 %undef code_temp
    516 %define state  esi
    517    pxor        xmm2, xmm2                                ;E: w2[i] = 0;
    518    mov         state, [frame + arg_state]
    519    movd        mm_nbits, nbits                           ;Z:     nbits --> MMX register
    520    pcmpgtw     xmm0, xmm5                                ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
    521    movd        mm_code, dword [dctbl + c_derived_tbl.ehufco + nbits * 4]
    522                                                          ;Z:     code = dctbl->ehufco[nbits];
    523 %define size  ecx
    524 %define sizeb  cl
    525 %define sizeh  ch
    526    paddw       xmm5, xmm0                                ;E: w5[i] += w0[i];
    527    movaps      XMMWORD [t + (32+1) * SIZEOF_WORD], xmm5  ;E: t[32+i] = w5[i];
    528    movzx       size, byte [dctbl + c_derived_tbl.ehufsi + nbits]
    529                                                          ;Z:     size = dctbl->ehufsi[nbits];
    530 %undef dctbl
    531    pcmpeqw     xmm5, xmm2                                ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
    532 
    533    packsswb    xmm5, xmm1                                ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
    534                                                          ;    w/ signed saturation
    535 
    536    movq        mm_put_buffer, [state + working_state.cur.put_buffer.simd]
    537                                                          ;Z:     put_buffer = state->cur.put_buffer.simd;
    538    mov         free_bits, [state + working_state.cur.free_bits]
    539                                                          ;Z:     free_bits = state->cur.free_bits;
    540 %undef state
    541 %define actbl  esi
    542    mov         actbl, [frame + arg_actbl]
    543 %define buffer  eax
    544    mov         buffer, [frame + arg_buffer]
    545 %undef frame
    546    jmp        .BEGIN
    547 
    548 ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    549 
    550    align       16
    551 ; size <= 32, so this is not really a loop
    552 .BRLOOP1:                                                 ; .BRLOOP1:
    553    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
    554                                                          ; nbits = actbl->ehufsi[0xf0];
    555    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
    556                                                          ; code = actbl->ehufco[0xf0];
    557    and         index, 0x7ffffff                          ; clear index if size == 32
    558    sub         size, 16                                  ; size -= 16;
    559    sub         free_bits, nbits                          ; if ((free_bits -= nbits) <= 0)
    560    jle         .EMIT_BRLOOP1                             ;   goto .EMIT_BRLOOP1;
    561    movd        mm_nbits, nbits                           ; nbits --> MMX register
    562    psllq       mm_put_buffer, mm_nbits                   ; put_buffer <<= nbits;
    563    por         mm_put_buffer, mm_code                    ; put_buffer |= code;
    564    jmp         .ERLOOP1                                  ; goto .ERLOOP1;
    565 
    566 ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    567 
    568    align       16
    569 %ifdef PIC
    570    times 6     nop
    571 %else
    572    times 2     nop
    573 %endif
    574 .BLOOP1:                                                  ; do {  /* size = # of zero bits/elements to skip */
    575 ; If size == 32, index remains unchanged (x86 masks a 32-bit shift count to 5 bits).  This is corrected in .BRLOOP1.
    576    shr         index, sizeb                              ;   index >>= size;
    577    lea         t, [t + size * SIZEOF_WORD]               ;   t += size;
    578    cmp         size, 16                                  ;   if (size > 16)
    579    jg          .BRLOOP1                                  ;     goto .BRLOOP1;
    580 .ERLOOP1:                                                 ; .ERLOOP1:
    581    movsx       nbits, word [t]                           ;   nbits = *t;
    582 %ifdef PIC
    583    add         size, size                                ;   size += size;
    584 %else
    585    lea         size, [size * 2]                          ;   size += size;
    586 %endif
    587    movd        mm_temp, nbits                            ;   temp = nbits;
    588    movzx       nbits, byte [NBITS(nbits)]                ;   nbits = JPEG_NBITS(nbits);
    589    lea         size, [size * 8 + nbits]                  ;   size = size * 8 + nbits;
    590    movd        mm_nbits, nbits                           ;   nbits --> MMX register
    591    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
    592                                                          ;   code = actbl->ehufco[size-16];
    593    movzx       size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
    594                                                          ;   size = actbl->ehufsi[size-16];
    595 .BEGIN:                                                   ; .BEGIN:
    596    pand        mm_temp, [MASK_BITS(nbits)]               ;   temp &= (1 << nbits) - 1;
    597    psllq       mm_code, mm_nbits                         ;   code <<= nbits;
    598    add         nbits, size                               ;   nbits += size;
    599    por         mm_code, mm_temp                          ;   code |= temp;
    600    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
    601    jle         .EMIT_ERLOOP1                             ;     insert code, flush buffer, init size, goto .BLOOP1
    602    xor         size, size                                ;   size = 0;  /* kill tzcnt input dependency */
    603    tzcnt       size, index                               ;   size = # of trailing 0 bits in index
    604    movd        mm_nbits, nbits                           ;   nbits --> MMX register
    605    psllq       mm_put_buffer, mm_nbits                   ;   put_buffer <<= nbits;
    606    inc         size                                      ;   ++size;
    607    por         mm_put_buffer, mm_code                    ;   put_buffer |= code;
    608    test        index, index
    609    jnz         .BLOOP1                                   ; } while (index != 0);
    610 ; Round 2
    611 ; t points to the last used word, possibly below t_ if the previous index had 32 zero bits.
    612 .ELOOP1:                                                  ; .ELOOP1:
    613    pmovmskb    size, xmm4                                ; size = 0;  size |= ((b4[i] >> 7) << i);
    614    pmovmskb    index, xmm5                               ; index = 0;  index |= ((b5[i] >> 7) << i);
    615    shl         size, 16                                  ; size <<= 16;
    616    or          index, size                               ; index |= size;
    617    not         index                                     ; index = ~index;
    618    lea         nbits, [t + (1 + DCTSIZE2) * SIZEOF_WORD]
    619                                                          ; nbits = t + 1 + 64;
    620    and         nbits, -DCTSIZE2 * SIZEOF_WORD            ; nbits &= -128;  /* now points to &t_[64] */
    621    sub         nbits, t                                  ; nbits -= t;
    622    shr         nbits, 1                                  ; nbits >>= 1;  /* # of leading 0 bits in old index + 33 */
    623    tzcnt       size, index                               ; size = # of trailing 0 bits in index
    624    inc         size                                      ; ++size;
    625    test        index, index                              ; if (index == 0)
    626    jz          .ELOOP2                                   ;   goto .ELOOP2;
    627 ; NOTE: size == 32 cannot happen, since the last element is always 0.
    628    shr         index, sizeb                              ; index >>= size;
    629    lea         size, [size + nbits - 33]                 ; size = size + nbits - 33;
    630    lea         t, [t + size * SIZEOF_WORD]               ; t += size;
    631    cmp         size, 16                                  ; if (size <= 16)
    632    jle         .ERLOOP2                                  ;   goto .ERLOOP2;
    633 .BRLOOP2:                                                 ; do {
    634    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
    635                                                          ;   nbits = actbl->ehufsi[0xf0];
    636    sub         size, 16                                  ;   size -= 16;
    637    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
    638                                                          ;   code = actbl->ehufco[0xf0];
    639    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
    640    jle         .EMIT_BRLOOP2                             ;     insert code and flush put_buffer
    641    movd        mm_nbits, nbits                           ;   else { nbits --> MMX register
    642    psllq       mm_put_buffer, mm_nbits                   ;     put_buffer <<= nbits;
    643    por         mm_put_buffer, mm_code                    ;     put_buffer |= code;
    644    cmp         size, 16                                  ;     if (size <= 16)
    645    jle        .ERLOOP2                                   ;       goto .ERLOOP2;
    646    jmp        .BRLOOP2                                   ; } while (1);
    647 
    648 ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    649 
    650    align      16
    651 .BLOOP2:                                                  ; do {  /* size = # of zero bits/elements to skip */
    652    shr         index, sizeb                              ;   index >>= size;
    653    lea         t, [t + size * SIZEOF_WORD]               ;   t += size;
    654    cmp         size, 16                                  ;   if (size > 16)
    655    jg          .BRLOOP2                                  ;     goto .BRLOOP2;
    656 .ERLOOP2:                                                 ; .ERLOOP2:
    657    movsx       nbits, word [t]                           ;   nbits = *t;
    658    add         size, size                                ;   size += size;
    659    movd        mm_temp, nbits                            ;   temp = nbits;
    660    movzx       nbits, byte [NBITS(nbits)]                ;   nbits = JPEG_NBITS(nbits);
    661    movd        mm_nbits, nbits                           ;   nbits --> MMX register
    662    lea         size, [size * 8 + nbits]                  ;   size = size * 8 + nbits;
    663    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
    664                                                          ;   code = actbl->ehufco[size-16];
    665    movzx       size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
    666                                                          ;   size = actbl->ehufsi[size-16];
    667    psllq       mm_code, mm_nbits                         ;   code <<= nbits;
    668    pand        mm_temp, [MASK_BITS(nbits)]               ;   temp &= (1 << nbits) - 1;
    669    lea         nbits, [nbits + size]                     ;   nbits += size;
    670    por         mm_code, mm_temp                          ;   code |= temp;
    671    xor         size, size                                ;   size = 0;  /* kill tzcnt input dependency */
    672    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
    673    jle         .EMIT_ERLOOP2                             ;     insert code, flush buffer, init size, goto .BLOOP2
    674    tzcnt       size, index                               ;   size = # of trailing 0 bits in index
    675    movd        mm_nbits, nbits                           ;   nbits --> MMX register
    676    psllq       mm_put_buffer, mm_nbits                   ;   put_buffer <<= nbits;
    677    inc         size                                      ;   ++size;
    678    por         mm_put_buffer, mm_code                    ;   put_buffer |= code;
    679    test        index, index
    680    jnz         .BLOOP2                                   ; } while (index != 0);
    681 .ELOOP2:                                                  ; .ELOOP2:
    682    mov         nbits, t                                  ; nbits = t;
    683    lea         t, [t + SIZEOF_WORD]                      ; t = &t[1];
    684    and         nbits, DCTSIZE2 * SIZEOF_WORD - 1         ; nbits &= 127;
    685    and         t, -DCTSIZE2 * SIZEOF_WORD                ; t &= -128;  /* t = &t_[0]; */
    686    cmp         nbits, (DCTSIZE2 - 2) * SIZEOF_WORD       ; if (nbits != 62 * 2)
    687    je          .EFN                                      ; {
    688    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + 0]
    689                                                          ;   code = actbl->ehufco[0];
    690    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
    691                                                          ;   nbits = actbl->ehufsi[0];
    692    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
    693    jg          .EFN_SKIP_EMIT_CODE                       ;   {
    694    EMIT_QWORD  size, sizeb, sizeh, , , , , , .EFN        ;     insert code, flush put_buffer
    695    align       16
    696 .EFN_SKIP_EMIT_CODE:                                      ;   } else {
    697    movd        mm_nbits, nbits                           ;     nbits --> MMX register
    698    psllq       mm_put_buffer, mm_nbits                   ;     put_buffer <<= nbits;
    699    por         mm_put_buffer, mm_code                    ;     put_buffer |= code;
    700 .EFN:                                                     ; } }
    701 %define frame  esp
    702    mov         frame, [t + save_frame]
    703 %define state  ecx
    704    mov         state, [frame + arg_state]
    705    movq        [state + working_state.cur.put_buffer.simd], mm_put_buffer
    706                                                          ; state->cur.put_buffer.simd = put_buffer;
    707    emms
    708    mov         [state + working_state.cur.free_bits], free_bits
    709                                                          ; state->cur.free_bits = free_bits;
    710    POP         edi
    711    POP         esi
    712    POP         ebp
    713    POP         ebx
    714    ret
    715 
    716 ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    717 
    718    align       16
    719 .EMIT_BRLOOP1:
    720    EMIT_QWORD  emit_temp, emit_tempb, emit_temph, , , , , , \
    721      .ERLOOP1
    722 
    723 ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    724 
    725    align       16
    726 .EMIT_ERLOOP1:
    727    EMIT_QWORD  size, sizeb, sizeh, \
    728      { xor     size, size }, \
    729      { tzcnt   size, index }, \
    730      { inc     size }, \
    731      { test    index, index }, \
    732      { jnz     .BLOOP1 }, \
    733      .ELOOP1
    734 
    735 ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    736 
    737    align       16
    738 .EMIT_BRLOOP2:
    739    EMIT_QWORD  emit_temp, emit_tempb, emit_temph, , , , \
    740      { cmp     size, 16 }, \
    741      { jle     .ERLOOP2 }, \
    742      .BRLOOP2
    743 
    744 ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    745 
    746    align       16
    747 .EMIT_ERLOOP2:
    748    EMIT_QWORD  size, sizeb, sizeh, \
    749      { xor     size, size }, \
    750      { tzcnt   size, index }, \
    751      { inc     size }, \
    752      { test    index, index }, \
    753      { jnz     .BLOOP2 }, \
    754      .ELOOP2
    755 
    756 ; For some reason, the OS X linker does not honor the request to align the
    757 ; segment unless we do this.
    758    align       32