tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jchuff-sse2.asm (31508B)


      1 ;
      2 ; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2)
      3 ;
      4 ; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, 2023-2024, D. R. Commander.
      5 ; Copyright (C) 2015, Matthieu Darbois.
      6 ; Copyright (C) 2018, Matthias Räncker.
      7 ; Copyright (C) 2023, Aliaksiej Kandracienka.
      8 ;
      9 ; Based on the x86 SIMD extension for IJG JPEG library
     10 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     11 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     12 ;
     13 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     14 ;
     15 ; This file contains an SSE2 implementation for Huffman coding of one block.
     16 ; The following code is based on jchuff.c; see jchuff.c for more details.
     17 
     18 %include "jsimdext.inc"
     19 
     20 struc working_state
        ; Field offsets for the encoder state reached through the `state` pointer.
        ; The code visible in this file only accesses .cur.put_buffer.simd and
        ; .cur.free_bits; the other fields are needed to keep those offsets right.
        ; NOTE(review): presumably this must stay in sync with the C-side
        ; working_state declaration (jchuff.c) -- confirm before editing.
     21 .next_output_byte:   resp 1     ; => next byte to write in buffer
     22 .free_in_buffer:     resp 1     ; # of byte spaces remaining in buffer
     23 .cur.put_buffer.simd resq 1     ; current bit accumulation buffer
     24 .cur.free_bits       resd 1     ; # of free bits remaining in put_buffer
     25 .cur.last_dc_val     resd 4     ; last DC coef for each component (4 slots)
     26 .cinfo:              resp 1     ; dump_buffer needs access to this
     27 endstruc
     28 
     29 struc c_derived_tbl
        ; Derived Huffman table: 256 dword codes followed by 256 byte code lengths,
        ; both indexed by symbol value.
     30 .ehufco:             resd 256   ; code for each symbol
     31 .ehufsi:             resb 256   ; length of code for each symbol
     32 ; If no code has been allocated for a symbol S, ehufsi[S] contains 0
     33 endstruc
     34 
     35 ; --------------------------------------------------------------------------
     36    SECTION     SEG_CONST
     37 
     38    ALIGNZ      32
     39    GLOBAL_DATA(jconst_huff_encode_one_block)
     40 
     41 EXTN(jconst_huff_encode_one_block):
     42 
        ; jpeg_mask_bits[n] = (1 << n) - 1 for n = 0..15; used to keep only the low
        ; n bits of a coefficient before it is appended to its Huffman code.
     43 jpeg_mask_bits dd 0x0000, 0x0001, 0x0003, 0x0007
     44               dd 0x000f, 0x001f, 0x003f, 0x007f
     45               dd 0x00ff, 0x01ff, 0x03ff, 0x07ff
     46               dd 0x0fff, 0x1fff, 0x3fff, 0x7fff
     47 
     48    ALIGNZ      32
     49 
        ; Mirror-image half of the nbits table.  It is emitted immediately before
        ; the jpeg_nbits_table label so that negative indices are valid: the byte at
        ; offset -k from the label holds the bit length of k - 1.  The encoder
        ; looks up (v - 1) for a negative value v, which therefore yields the bit
        ; length of |v|.  This half spans offsets -32768..-1.
     50 times 1 << 14 db 15
     51 times 1 << 13 db 14
     52 times 1 << 12 db 13
     53 times 1 << 11 db 12
     54 times 1 << 10 db 11
     55 times 1 <<  9 db 10
     56 times 1 <<  8 db  9
     57 times 1 <<  7 db  8
     58 times 1 <<  6 db  7
     59 times 1 <<  5 db  6
     60 times 1 <<  4 db  5
     61 times 1 <<  3 db  4
     62 times 1 <<  2 db  3
     63 times 1 <<  1 db  2
     64 times 1 <<  0 db  1
     65 times 1       db  0
     66 GLOBAL_DATA(jpeg_nbits_table)
     67 EXTN(jpeg_nbits_table):
        ; Non-negative half: entry i (0..65535) holds the number of bits needed to
        ; represent i (0 for i = 0, up to 16 for i >= 32768).
     68 times 1       db  0
     69 times 1 <<  0 db  1
     70 times 1 <<  1 db  2
     71 times 1 <<  2 db  3
     72 times 1 <<  3 db  4
     73 times 1 <<  4 db  5
     74 times 1 <<  5 db  6
     75 times 1 <<  6 db  7
     76 times 1 <<  7 db  8
     77 times 1 <<  8 db  9
     78 times 1 <<  9 db 10
     79 times 1 << 10 db 11
     80 times 1 << 11 db 12
     81 times 1 << 12 db 13
     82 times 1 << 13 db 14
     83 times 1 << 14 db 15
     84 times 1 << 15 db 16
     85 
     86    ALIGNZ      32
     87 
        ; Address helpers.  nbits_base is a register alias that the function loads
        ; with the address of jpeg_nbits_table, so NBITS(x) addresses table entry x
        ; (x may be a sign-extended register holding a negative value).
        ; MASK_BITS(x) addresses jpeg_mask_bits[x] through the same base register by
        ; adding the assemble-time distance between the two tables, which avoids
        ; dedicating a second register to the mask table.
     88 %define NBITS(x)      nbits_base + x
     89 %define MASK_BITS(x)  NBITS((x) * 4) + (jpeg_mask_bits - EXTN(jpeg_nbits_table))
     90 
     91 ; --------------------------------------------------------------------------
     92    SECTION     SEG_TEXT
     93    BITS        64
     94 
     95 ; Shorthand used to describe SIMD operations:
     96 ; wN:  xmmN treated as eight signed 16-bit values
     97 ; wN[i]:  perform the same operation on all eight signed 16-bit values, i=0..7
     98 ; bN:  xmmN treated as 16 unsigned 8-bit values
     99 ; bN[i]:  perform the same operation on all 16 unsigned 8-bit values, i=0..15
    100 ; Contents of SIMD registers are shown in memory order.
    101 
    102 ; Fill the bit buffer to capacity with the leading bits from code, then output
    103 ; the bit buffer and put the remaining bits from code into the bit buffer.
    104 ;
    105 ; Usage:
    106 ; code - contains the bits to shift into the bit buffer (LSB-aligned)
    107 ; %1 - the label to which to jump when the macro completes
    108 ; %2 (optional) - extra instructions to execute after nbits has been set
    109 ;
    110 ; Upon completion, free_bits will be set to 64 minus the number of remaining
    111 ; bits from code, and put_buffer will contain those remaining bits.  temp,
    112 ; code, nbits, and xmm0 will be clobbered.
    113 ;
    114 ; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
    115 ; macro in jchuff.c.
    116 
    117 %macro EMIT_QWORD 1-2
           ; Expected state on entry, as established by every call site in this file:
           ;   free_bits - result of "free_bits -= nbits", which is <= 0 here
           ;   nbits     - bit length of code
           ;   xmm1      - every byte 0xFF (used to detect 0xFF output bytes)
           ; The first block splits code: the leading (nbits + free_bits) bits top up
           ; put_buffer, and -free_bits bits are left over for the next qword.
    118    add         nbitsb, free_bitsb      ; nbits += free_bits;
    119    neg         free_bitsb              ; free_bits = -free_bits;
    120    mov         tempd, code             ; temp = code;
    121    shl         put_buffer, nbitsb      ; put_buffer <<= nbits;
    122    mov         nbitsb, free_bitsb      ; nbits = free_bits;
    123    neg         free_bitsb              ; free_bits = -free_bits;
    124    shr         tempd, nbitsb           ; temp >>= nbits;
    125    or          tempq, put_buffer       ; temp |= put_buffer;
    126    movq        xmm0, tempq             ; xmm0.u64 = { temp, 0 };
    127    bswap       tempq                   ; temp = bswap64(temp);
    128    mov         put_buffer, codeq       ; put_buffer = code;
    129    pcmpeqb     xmm0, xmm1              ; b0[i] = (b0[i] == 0xFF ? 0xFF : 0);
    130    %2
    131    pmovmskb    code, xmm0              ; code = 0;  code |= ((b0[i] >> 7) << i);
    132    mov         qword [buffer], tempq   ; memcpy(buffer, &temp, 8);
    133                                        ; (speculative; will be overwritten if
    134                                        ; code contains any 0xFF bytes)
    135    add         free_bitsb, 64          ; free_bits += 64;
    136    add         bufferp, 8              ; buffer += 8;
    137    test        code, code              ; if (code == 0)  /* No 0xFF bytes */
    138    jz          %1                      ;   return;
    139    ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
    140    ; bytes in the qword.
           ; After the bswap, the low byte of temp is the first output byte, and the
           ; speculative store above has already placed it at buffer[-8].  Each step
           ; stores the next byte followed by a 0x00 stuff byte, then advances buffer
           ; by 2 if the byte just tested was 0xFF (keeping the stuff byte) or by 1
           ; otherwise (so the next store overwrites it).  sbb folds the carry
           ; produced by cmp into that pointer update.
    141    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
    142    mov         byte [buffer-7], 0      ; buffer[-7] = 0;
    143    sbb         bufferp, 6              ; buffer -= (6 + (temp[0] < 0xFF ? 1 : 0));
    144    mov         byte [buffer], temph    ; buffer[0] = temp[1];
    145    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
    146    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    147    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
    148    shr         tempq, 16               ; temp >>= 16;
    149    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
    150    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
    151    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    152    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
    153    mov         byte [buffer], temph    ; buffer[0] = temp[1];
    154    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
    155    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    156    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
    157    shr         tempq, 16               ; temp >>= 16;
    158    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
    159    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
    160    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    161    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
    162    mov         byte [buffer], temph    ; buffer[0] = temp[1];
    163    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
    164    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    165    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
           ; Only the last two bytes remain in the low 32 bits of temp, so a 32-bit
           ; shift is sufficient for the final step.
    166    shr         tempd, 16               ; temp >>= 16;
    167    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
    168    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
    169    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    170    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
    171    mov         byte [buffer], temph    ; buffer[0] = temp[1];
    172    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
    173    mov         byte [buffer+1], 0      ; buffer[1] = 0;
    174    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
    175    jmp         %1                      ; return;
    176 %endmacro
    177 
    178 ;
    179 ; Encode a single block's worth of coefficients.
    180 ;
    181 ; GLOBAL(JOCTET *)
    182 ; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
    183 ;                                  JCOEFPTR block, int last_dc_val,
    184 ;                                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
    185 ;
    186 ; NOTES:
    187 ; When shuffling data, we try to avoid pinsrw as much as possible, since it is
    188 ; slow on many CPUs.  Its reciprocal throughput (issue latency) is 1 even on
    189 ; modern CPUs, so chains of pinsrw instructions (even with different outputs)
    190 ; can limit performance.  pinsrw is a VectorPath instruction on AMD K8 and
    191 ; requires 2 µops (with memory operand) on Intel.  In either case, only one
    192 ; pinsrw instruction can be decoded per cycle (and nothing else if they are
    193 ; back-to-back), so out-of-order execution cannot be used to work around long
    194 ; pinsrw chains (though for Sandy Bridge and later, this may be less of a
    195 ; problem if the code runs from the µop cache.)
    196 ;
    197 ; We use tzcnt instead of bsf without checking for support.  The instruction is
    198 ; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
    199 ; rep bsf.)  The destination (first) operand of bsf (and tzcnt on some CPUs) is
    200 ; an input dependency (although the behavior is not formally defined, Intel
    201 ; CPUs usually leave the destination unmodified if the source is zero.)  This
    202 ; can prevent out-of-order execution, so we clear the destination before
    203 ; invoking tzcnt.
    204 ;
    205 ; Initial register allocation
    206 ; rax - buffer
    207 ; rbx - temp
    208 ; rcx - nbits
    209 ; rdx - code
    210 ; rsi - nbits_base
    211 ; rdi - t
    212 ; r8  - dctbl --> code_temp
    213 ; r9  - actbl
    214 ; r10 - state
    215 ; r11 - index
    216 ; r12 - put_buffer
    217 ; r15 - block --> free_bits
    218 
    219 %define buffer       rax
        ; bufferp is the form of buffer used for pointer arithmetic (add/sbb).
        ; NOTE(review): raxp is not defined in this file; presumably jsimdext.inc
        ; maps it to the pointer-width view of rax -- confirm.
    220 %ifdef WIN64
    221 %define bufferp      rax
    222 %else
    223 %define bufferp      raxp
    224 %endif
        ; temp lives in rbx so that both of its low bytes (bl and bh) are directly
        ; addressable; EMIT_QWORD relies on this when it walks the output bytes.
    225 %define tempq        rbx
    226 %define tempd        ebx
    227 %define tempb        bl
    228 %define temph        bh
        ; nbits lives in rcx because variable shift counts must be supplied in cl.
    229 %define nbitsq       rcx
    230 %define nbits        ecx
    231 %define nbitsb       cl
    232 %define codeq        rdx
    233 %define code         edx
    234 %define nbits_base   rsi
    235 %define t            rdi
    236 %define td           edi
        ; dctbl (r8) and block (r15) are %undef'd inside the function once they are
        ; dead, and the registers are reused as code_temp and free_bits.
    237 %define dctbl        r8
    238 %define actbl        r9
    239 %define state        r10
    240 %define index        r11
    241 %define indexd       r11d
    242 %define put_buffer   r12
    243 %define put_bufferd  r12d
    244 %define block        r15
    245 
    246 ; Step 1: Re-arrange input data according to jpeg_natural_order
    247 ; xx 01 02 03 04 05 06 07      xx 01 08 16 09 02 03 10
    248 ; 08 09 10 11 12 13 14 15      17 24 32 25 18 11 04 05
    249 ; 16 17 18 19 20 21 22 23      12 19 26 33 40 48 41 34
    250 ; 24 25 26 27 28 29 30 31 ==>  27 20 13 06 07 14 21 28
    251 ; 32 33 34 35 36 37 38 39      35 42 49 56 57 50 43 36
    252 ; 40 41 42 43 44 45 46 47      29 22 15 23 30 37 44 51
    253 ; 48 49 50 51 52 53 54 55      58 59 52 45 38 31 39 46
    254 ; 56 57 58 59 60 61 62 63      53 60 61 54 47 55 62 63
    255 
    256    align       32
    257    GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
    258 
    259 EXTN(jsimd_huff_encode_one_block_sse2):
    260    ENDBR64
    261    push        rbp
    262    mov         rbp, rsp
           ; rbp is kept as a frame pointer; the WIN64 path below reads its two
           ; stack-passed arguments relative to it.
    263 
    264 %ifdef WIN64
    265 
    266 ; rcx = working_state *state
    267 ; rdx = JOCTET *buffer
    268 ; r8 = JCOEFPTR block
    269 ; r9 = int last_dc_val
    270 ; [rbp+48] = c_derived_tbl *dctbl
    271 ; [rbp+56] = c_derived_tbl *actbl
           ; rsi and rdi are saved on this path as well, because the function
           ; overwrites them (nbits_base and t) and pops them again before returning.
           ; block is copied out of r8 and last_dc_val is consumed from r9d before
           ; those registers are reloaded as dctbl and actbl.
    272 
    273                                                          ;X: X = code stream
    274    mov         buffer, rdx
    275    push        r15
    276    mov         block, r8
    277    movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]   ;D: w3 = xx 01 02 03 04 05 06 07
    278    push        rbx
    279    movdqa      xmm0, xmm3                                ;A: w0 = xx 01 02 03 04 05 06 07
    280    push        rsi
    281    push        rdi
    282    push        r12
    283    movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD]   ;B: w1 = 08 09 10 11 12 13 14 15
    284    mov         state, rcx
    285    movsx       code, word [block]                        ;Z:     code = block[0];
    286    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
    287    sub         code, r9d                                 ;Z:     code -= last_dc_val;
    288    mov         dctbl, POINTER [rbp+48]
    289    mov         actbl, POINTER [rbp+56]
    290    punpckldq   xmm0, xmm1                                ;A: w0 = xx 01 08 09 02 03 10 11
    291    lea         nbits_base, [rel EXTN(jpeg_nbits_table)]
    292 
    293 %else
    294 
    295 ; rdi = working_state *state
    296 ; rsi = JOCTET *buffer
    297 ; rdx = JCOEFPTR block
    298 ; rcx = int last_dc_val
    299 ; r8 = c_derived_tbl *dctbl
    300 ; r9 = c_derived_tbl *actbl
           ; Ordering matters on this path: block, state and buffer are copied out of
           ; rdx, rdi and rsi before those registers are reused as code, t and
           ; nbits_base.  dctbl and actbl already arrive in r8 and r9.
    301 
    302                                                          ;X: X = code stream
    303    push        r15
    304    mov         block, rdx
    305    movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]   ;D: w3 = xx 01 02 03 04 05 06 07
    306    push        rbx
    307    movdqa      xmm0, xmm3                                ;A: w0 = xx 01 02 03 04 05 06 07
    308    push        r12
    309    mov         state, rdi
    310    mov         buffer, rsi
    311    movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD]   ;B: w1 = 08 09 10 11 12 13 14 15
    312    movsx       codeq, word [block]                       ;Z:     code = block[0];
    313    lea         nbits_base, [rel EXTN(jpeg_nbits_table)]
    314    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
    315    sub         codeq, rcx                                ;Z:     code -= last_dc_val;
    316    punpckldq   xmm0, xmm1                                ;A: w0 = xx 01 08 09 02 03 10 11
    317 
    318 %endif
    319 
    320    ; Allocate stack space for t array, and realign stack.
           ; The extra 8 bytes compensate for the return address plus the even number
           ; of 8-byte pushes above, so that the movaps stores to t are 16-byte
           ; aligned (assuming rsp was 16-byte aligned at the call site and that
           ; DCTSIZE2 * SIZEOF_WORD is a multiple of 16 -- TODO confirm against
           ; jsimdext.inc).
    321    add         rsp, -DCTSIZE2 * SIZEOF_WORD - 8
    322    mov         t, rsp
           ; Comment tags below: A..H mark the instruction streams that build rows
           ; 0..7 of t (see the "Row n" notes), and Z marks the scalar stream that
           ; codes the DC difference and assembles the zero-coefficient bitmask.  The
           ; streams are interleaved, so follow one letter at a time.
           ; Every row is stored to t as w + (w < 0 ? -1 : 0) and is then compared
           ; with zero; packsswb/pmovmskb turn the comparison results into one mask
           ; bit per coefficient.
    323 
    324    pshuflw     xmm0, xmm0, 11001001b                     ;A: w0 = 01 08 xx 09 02 03 10 11
    325    pinsrw      xmm0, word [block + 16 * SIZEOF_WORD], 2  ;A: w0 = 01 08 16 09 02 03 10 11
    326    punpckhdq   xmm3, xmm1                                ;D: w3 = 04 05 12 13 06 07 14 15
    327    punpcklqdq  xmm1, xmm3                                ;B: w1 = 08 09 10 11 04 05 12 13
    328    pinsrw      xmm0, word [block + 17 * SIZEOF_WORD], 7  ;A: w0 = 01 08 16 09 02 03 10 17
    329                                                          ;A:      (Row 0, offset 1)
    330    pcmpgtw     xmm4, xmm0                                ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
    331    paddw       xmm0, xmm4                                ;A: w0[i] += w4[i];
    332    movaps      XMMWORD [t + 0 * SIZEOF_WORD], xmm0       ;A: t[i] = w0[i];
    333 
    334    movq        xmm2, qword [block + 24 * SIZEOF_WORD]    ;B: w2 = 24 25 26 27 -- -- -- --
    335    pshuflw     xmm2, xmm2, 11011000b                     ;B: w2 = 24 26 25 27 -- -- -- --
    336    pslldq      xmm1, 1 * SIZEOF_WORD                     ;B: w1 = -- 08 09 10 11 04 05 12
    337    movups      xmm5, XMMWORD [block + 48 * SIZEOF_WORD]  ;H: w5 = 48 49 50 51 52 53 54 55
    338    movsd       xmm1, xmm2                                ;B: w1 = 24 26 25 27 11 04 05 12
    339    punpcklqdq  xmm2, xmm5                                ;C: w2 = 24 26 25 27 48 49 50 51
    340    pinsrw      xmm1, word [block + 32 * SIZEOF_WORD], 1  ;B: w1 = 24 32 25 27 11 04 05 12
    341    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
    342    psrldq      xmm3, 2 * SIZEOF_WORD                     ;D: w3 = 12 13 06 07 14 15 -- --
    343    pcmpeqw     xmm0, xmm4                                ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
    344    pinsrw      xmm1, word [block + 18 * SIZEOF_WORD], 3  ;B: w1 = 24 32 25 18 11 04 05 12
    345                                                          ;        (Row 1, offset 1)
    346    pcmpgtw     xmm4, xmm1                                ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
    347    paddw       xmm1, xmm4                                ;B: w1[i] += w4[i];
    348    movaps      XMMWORD [t + 8 * SIZEOF_WORD], xmm1       ;B: t[i+8] = w1[i];
    349    pxor        xmm4, xmm4                                ;B: w4[i] = 0;
    350    pcmpeqw     xmm1, xmm4                                ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
    351 
    352    packsswb    xmm0, xmm1                                ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
    353                                                          ;    w/ signed saturation
    354 
    355    pinsrw      xmm3, word [block + 20 * SIZEOF_WORD], 0  ;D: w3 = 20 13 06 07 14 15 -- --
    356    pinsrw      xmm3, word [block + 21 * SIZEOF_WORD], 5  ;D: w3 = 20 13 06 07 14 21 -- --
    357    pinsrw      xmm3, word [block + 28 * SIZEOF_WORD], 6  ;D: w3 = 20 13 06 07 14 21 28 --
    358    pinsrw      xmm3, word [block + 35 * SIZEOF_WORD], 7  ;D: w3 = 20 13 06 07 14 21 28 35
    359                                                          ;        (Row 3, offset 1)
    360    pcmpgtw     xmm4, xmm3                                ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
    361    paddw       xmm3, xmm4                                ;D: w3[i] += w4[i];
    362    movaps      XMMWORD [t + 24 * SIZEOF_WORD], xmm3      ;D: t[i+24] = w3[i];
    363    pxor        xmm4, xmm4                                ;D: w4[i] = 0;
    364    pcmpeqw     xmm3, xmm4                                ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
    365 
    366    pinsrw      xmm2, word [block + 19 * SIZEOF_WORD], 0  ;C: w2 = 19 26 25 27 48 49 50 51
    367    cmp         code, 1 << 31                             ;Z:     Set CF if code < 0x80000000,
    368                                                          ;Z:     i.e. if code is non-negative
    369    pinsrw      xmm2, word [block + 33 * SIZEOF_WORD], 2  ;C: w2 = 19 26 33 27 48 49 50 51
    370    pinsrw      xmm2, word [block + 40 * SIZEOF_WORD], 3  ;C: w2 = 19 26 33 40 48 49 50 51
    371    adc         code, -1                                  ;Z:     code += -1 + (code >= 0 ? 1 : 0);
    372    pinsrw      xmm2, word [block + 41 * SIZEOF_WORD], 5  ;C: w2 = 19 26 33 40 48 41 50 51
    373    pinsrw      xmm2, word [block + 34 * SIZEOF_WORD], 6  ;C: w2 = 19 26 33 40 48 41 34 51
    374    movsxd      codeq, code                               ;Z:     sign extend code
    375    pinsrw      xmm2, word [block + 27 * SIZEOF_WORD], 7  ;C: w2 = 19 26 33 40 48 41 34 27
    376                                                          ;        (Row 2, offset 1)
    377    pcmpgtw     xmm4, xmm2                                ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
    378    paddw       xmm2, xmm4                                ;C: w2[i] += w4[i];
    379    movaps      XMMWORD [t + 16 * SIZEOF_WORD], xmm2      ;C: t[i+16] = w2[i];
    380    pxor        xmm4, xmm4                                ;C: w4[i] = 0;
    381    pcmpeqw     xmm2, xmm4                                ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
    382 
    383    packsswb    xmm2, xmm3                                ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
    384                                                          ;    w/ signed saturation
    385 
           ; codeq is sign-extended, so a negative DC difference indexes the
           ; mirror-image half of the table that precedes jpeg_nbits_table.
    386    movzx       nbitsq, byte [NBITS(codeq)]               ;Z:     nbits = JPEG_NBITS(code);
    387    movdqa      xmm3, xmm5                                ;H: w3 = 48 49 50 51 52 53 54 55
    388    pmovmskb    tempd, xmm2                               ;Z:     temp = 0;  temp |= ((b2[i] >> 7) << i);
    389    pmovmskb    put_bufferd, xmm0                         ;Z:     put_buffer = 0;  put_buffer |= ((b0[i] >> 7) << i);
    390    movups      xmm0, XMMWORD [block + 56 * SIZEOF_WORD]  ;H: w0 = 56 57 58 59 60 61 62 63
    391    punpckhdq   xmm3, xmm0                                ;H: w3 = 52 53 60 61 54 55 62 63
    392    shl         tempd, 16                                 ;Z:     temp <<= 16;
    393    psrldq      xmm3, 1 * SIZEOF_WORD                     ;H: w3 = 53 60 61 54 55 62 63 --
    394    pxor        xmm2, xmm2                                ;H: w2[i] = 0;
    395    or          put_bufferd, tempd                        ;Z:     put_buffer |= temp;
    396    pshuflw     xmm3, xmm3, 00111001b                     ;H: w3 = 60 61 54 53 55 62 63 --
    397    movq        xmm1, qword [block + 44 * SIZEOF_WORD]    ;G: w1 = 44 45 46 47 -- -- -- --
    398    unpcklps    xmm5, xmm0                                ;E: w5 = 48 49 56 57 50 51 58 59
    399    pxor        xmm0, xmm0                                ;H: w0[i] = 0;
    400    pinsrw      xmm3, word [block + 47 * SIZEOF_WORD], 3  ;H: w3 = 60 61 54 47 55 62 63 --
    401                                                          ;        (Row 7, offset 1)
    402    pcmpgtw     xmm2, xmm3                                ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
    403    paddw       xmm3, xmm2                                ;H: w3[i] += w2[i];
    404    movaps      XMMWORD [t + 56 * SIZEOF_WORD], xmm3      ;H: t[i+56] = w3[i];
    405    movq        xmm4, qword [block + 36 * SIZEOF_WORD]    ;G: w4 = 36 37 38 39 -- -- -- --
    406    pcmpeqw     xmm3, xmm0                                ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
    407    punpckldq   xmm4, xmm1                                ;G: w4 = 36 37 44 45 38 39 46 47
    408    mov         tempd, [dctbl + c_derived_tbl.ehufco + nbitsq * 4]
    409                                                          ;Z:     temp = dctbl->ehufco[nbits];
    410    movdqa      xmm1, xmm4                                ;F: w1 = 36 37 44 45 38 39 46 47
    411    psrldq      xmm4, 1 * SIZEOF_WORD                     ;G: w4 = 37 44 45 38 39 46 47 --
    412    shufpd      xmm1, xmm5, 10b                           ;F: w1 = 36 37 44 45 50 51 58 59
    413    and         code, dword [MASK_BITS(nbitsq)]           ;Z:     code &= (1 << nbits) - 1;
    414    pshufhw     xmm4, xmm4, 11010011b                     ;G: w4 = 37 44 45 38 -- 39 46 --
    415    pslldq      xmm1, 1 * SIZEOF_WORD                     ;F: w1 = -- 36 37 44 45 50 51 58
    416    shl         tempq, nbitsb                             ;Z:     temp <<= nbits;
    417    pinsrw      xmm4, word [block + 59 * SIZEOF_WORD], 0  ;G: w4 = 59 44 45 38 -- 39 46 --
    418    pshufd      xmm1, xmm1, 11011000b                     ;F: w1 = -- 36 45 50 37 44 51 58
    419    pinsrw      xmm4, word [block + 52 * SIZEOF_WORD], 1  ;G: w4 = 59 52 45 38 -- 39 46 --
    420    or          code, tempd                               ;Z:     code |= temp;
    421    movlps      xmm1, qword [block + 20 * SIZEOF_WORD]    ;F: w1 = 20 21 22 23 37 44 51 58
    422    pinsrw      xmm4, word [block + 31 * SIZEOF_WORD], 4  ;G: w4 = 59 52 45 38 31 39 46 --
    423    pshuflw     xmm1, xmm1, 01110010b                     ;F: w1 = 22 20 23 21 37 44 51 58
    424    pinsrw      xmm4, word [block + 53 * SIZEOF_WORD], 7  ;G: w4 = 59 52 45 38 31 39 46 53
    425                                                          ;        (Row 6, offset 1)
    426    pxor        xmm2, xmm2                                ;G: w2[i] = 0;
    427    pcmpgtw     xmm0, xmm4                                ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
    428    pinsrw      xmm1, word [block + 15 * SIZEOF_WORD], 1  ;F: w1 = 22 15 23 21 37 44 51 58
    429    paddw       xmm4, xmm0                                ;G: w4[i] += w0[i];
    430    movaps      XMMWORD [t + 48 * SIZEOF_WORD], xmm4      ;G: t[48+i] = w4[i];
    431    pinsrw      xmm1, word [block + 30 * SIZEOF_WORD], 3  ;F: w1 = 22 15 23 30 37 44 51 58
    432                                                          ;        (Row 5, offset 1)
    433    pcmpeqw     xmm4, xmm2                                ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
    434    pinsrw      xmm5, word [block + 42 * SIZEOF_WORD], 0  ;E: w5 = 42 49 56 57 50 51 58 59
    435 
    436    packsswb    xmm4, xmm3                                ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
    437                                                          ;    w/ signed saturation
    438 
    439    pxor        xmm0, xmm0                                ;F: w0[i] = 0;
    440    pinsrw      xmm5, word [block + 43 * SIZEOF_WORD], 5  ;E: w5 = 42 49 56 57 50 43 58 59
    441    pcmpgtw     xmm2, xmm1                                ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
    442    pmovmskb    tempd, xmm4                               ;Z:     temp = 0;  temp |= ((b4[i] >> 7) << i);
    443    pinsrw      xmm5, word [block + 36 * SIZEOF_WORD], 6  ;E: w5 = 42 49 56 57 50 43 36 59
    444    paddw       xmm1, xmm2                                ;F: w1[i] += w2[i];
    445    movaps      XMMWORD [t + 40 * SIZEOF_WORD], xmm1      ;F: t[40+i] = w1[i];
    446    pinsrw      xmm5, word [block + 29 * SIZEOF_WORD], 7  ;E: w5 = 42 49 56 57 50 43 36 29
    447                                                          ;        (Row 4, offset 1)
           ; The last read through block is the pinsrw above, so r15 is handed over
           ; to free_bits from here on.
    448 %undef block
    449 %define free_bitsq  r15
    450 %define free_bitsd  r15d
    451 %define free_bitsb  r15b
    452    pcmpeqw     xmm1, xmm0                                ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
    453    shl         tempq, 48                                 ;Z:     temp <<= 48;
    454    pxor        xmm2, xmm2                                ;E: w2[i] = 0;
    455    pcmpgtw     xmm0, xmm5                                ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
    456    paddw       xmm5, xmm0                                ;E: w5[i] += w0[i];
    457    or          tempq, put_buffer                         ;Z:     temp |= put_buffer;
    458    movaps      XMMWORD [t + 32 * SIZEOF_WORD], xmm5      ;E: t[32+i] = w5[i];
           ; t[] begins at natural-order position 1 ("offset 1"), so bit i of index
           ; describes t[i].  Backing t up by one word lets the AC loop advance it by
           ; (trailing zero count + 1) words per non-zero coefficient.
    459    lea         t, [dword t - 2]                          ;Z:     t = &t[-1];
    460    pcmpeqw     xmm5, xmm2                                ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
    461 
    462    packsswb    xmm5, xmm1                                ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
    463                                                          ;    w/ signed saturation
    464 
    465    add         nbitsb, byte [dctbl + c_derived_tbl.ehufsi + nbitsq]
    466                                                          ;Z:     nbits += dctbl->ehufsi[nbits];
           ; dctbl is not referenced after the add above; r8 becomes code_temp.
    467 %undef dctbl
    468 %define code_temp  r8d
    469    pmovmskb    indexd, xmm5                              ;Z:     index = 0;  index |= ((b5[i] >> 7) << i);
    470    mov         free_bitsd, [state+working_state.cur.free_bits]
    471                                                          ;Z:     free_bits = state->cur.free_bits;
           ; xmm1 is left as all-ones for the rest of the function; EMIT_QWORD
           ; compares output bytes against it to find 0xFF bytes.
    472    pcmpeqw     xmm1, xmm1                                ;Z:     b1[i] = 0xFF;
    473    shl         index, 32                                 ;Z:     index <<= 32;
    474    mov         put_buffer, [state+working_state.cur.put_buffer.simd]
    475                                                          ;Z:     put_buffer = state->cur.put_buffer.simd;
           ; temp holds the mask bits for rows A-D (bits 0..31) and G-H (bits
           ; 48..63); index supplies rows E-F (bits 32..47).  After the not, a set
           ; bit marks a non-zero coefficient.
    476    or          index, tempq                              ;Z:     index |= temp;
    477    not         index                                     ;Z:     index = ~index;
    478    sub         free_bitsb, nbitsb                        ;Z:     if ((free_bits -= nbits) >= 0)
    479    jnl         .ENTRY_SKIP_EMIT_CODE                     ;Z:       goto .ENTRY_SKIP_EMIT_CODE;
    480    align       16
           ; Reached by fall-through when the DC code does not fit in put_buffer, and
           ; by the jle in the AC loop when an AC code does not fit.
    481 .EMIT_CODE:                                               ;Z:     .EMIT_CODE:
    482    EMIT_QWORD  .BLOOP_COND                               ;Z:     insert code, flush buffer, goto .BLOOP_COND
    483 
    484 ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    485 
    486    align       16
           ; Entered from .EMIT_BRLOOP_CODE_END when nbits > 16, i.e. when 16 or more
           ; zero coefficients precede the next non-zero one.  Each pass emits the
           ; actbl code for symbol 0xF0 and reduces the pending count by 16, with the
           ; reduced count parked in code_temp while nbits is used for the code length.
    487 .BRLOOP:                                                  ; do {
    488    lea         code_temp, [nbitsq - 16]                  ;   code_temp = nbits - 16;
    489    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
    490                                                          ;   nbits = actbl->ehufsi[0xf0];
    491    mov         code, [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
    492                                                          ;   code = actbl->ehufco[0xf0];
    493    sub         free_bitsb, nbitsb                        ;   if ((free_bits -= nbits) <= 0)
    494    jle         .EMIT_BRLOOP_CODE                         ;     goto .EMIT_BRLOOP_CODE;
    495    shl         put_buffer, nbitsb                        ;   put_buffer <<= nbits;
    496    mov         nbits, code_temp                          ;   nbits = code_temp;
    497    or          put_buffer, codeq                         ;   put_buffer |= code;
    498    cmp         nbits, 16                                 ;   if (nbits <= 16)
    499    jle         .ERLOOP                                   ;     break;
    500    jmp         .BRLOOP                                   ; } while (1);
    501 
    502 ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    503 
    504    align       16
    505    times 5     nop
           ; NOTE(review): the source gives no rationale for the 5 nops; presumably
           ; they position the labels below relative to the 16-byte boundary --
           ; confirm before removing.
    506 .ENTRY_SKIP_EMIT_CODE:                                    ; .ENTRY_SKIP_EMIT_CODE:
    507    shl         put_buffer, nbitsb                        ; put_buffer <<= nbits;
    508    or          put_buffer, codeq                         ; put_buffer |= code;
    509 .BLOOP_COND:                                              ; .BLOOP_COND:
    510    test        index, index                              ; if (index != 0)
    511    jz          .ELOOP                                    ; {
           ; AC loop: one iteration per non-zero coefficient.  nbits first holds the
           ; distance to that coefficient (zero run + 1), then is reused for the
           ; bit length of the coefficient and finally for the total emitted length.
    512 .BLOOP:                                                   ;   do {
    513    xor         nbits, nbits                              ;     nbits = 0;  /* kill tzcnt input dependency */
    514    tzcnt       nbitsq, index                             ;     nbits = # of trailing 0 bits in index
    515    inc         nbits                                     ;     ++nbits;
    516    lea         t, [t + nbitsq * 2]                       ;     t = &t[nbits];
    517    shr         index, nbitsb                             ;     index >>= nbits;
    518 .EMIT_BRLOOP_CODE_END:                                    ; .EMIT_BRLOOP_CODE_END:
    519    cmp         nbits, 16                                 ;     if (nbits > 16)
    520    jg          .BRLOOP                                   ;       goto .BRLOOP;
    521 .ERLOOP:                                                  ; .ERLOOP:
           ; The two lea instructions compute temp = nbits_old * 16 + nbits_new, so
           ; temp - 16 == ((zero run) << 4) + JPEG_NBITS(code), which is the symbol
           ; used to index actbl below.
    522    movsx       codeq, word [t]                           ;     code = *t;
    523    lea         tempd, [nbitsq * 2]                       ;     temp = nbits * 2;
    524    movzx       nbits, byte [NBITS(codeq)]                ;     nbits = JPEG_NBITS(code);
    525    lea         tempd, [nbitsq + tempq * 8]               ;     temp = temp * 8 + nbits;
    526    mov         code_temp, [actbl + c_derived_tbl.ehufco + (tempq - 16) * 4]
    527                                                          ;     code_temp = actbl->ehufco[temp-16];
    528    shl         code_temp, nbitsb                         ;     code_temp <<= nbits;
    529    and         code, dword [MASK_BITS(nbitsq)]           ;     code &= (1 << nbits) - 1;
    530    add         nbitsb, [actbl + c_derived_tbl.ehufsi + (tempq - 16)]
    531                                                          ;     nbits += actbl->ehufsi[temp-16];
    532    or          code, code_temp                           ;     code |= code_temp;
    533    sub         free_bitsb, nbitsb                        ;     if ((free_bits -= nbits) <= 0)
    534    jle         .EMIT_CODE                                ;       goto .EMIT_CODE;
    535    shl         put_buffer, nbitsb                        ;     put_buffer <<= nbits;
    536    or          put_buffer, codeq                         ;     put_buffer |= code;
    537    test        index, index
    538    jnz         .BLOOP                                    ;   } while (index != 0);
           ; t now addresses the last non-zero AC coefficient found, or the word just
           ; before the array if there was none.  Unless that is the final entry
           ; (word index DCTSIZE2 - 2, because t omits the DC term), actbl symbol 0
           ; is emitted to close the block.
    539 .ELOOP:                                                   ; }  /* index != 0 */
    540    sub         td, esp                                   ; t -= &t_[0];
    541    cmp         td, (DCTSIZE2 - 2) * SIZEOF_WORD          ; if (t != 62)
    542    je          .EFN                                      ; {
    543    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
    544                                                          ;   nbits = actbl->ehufsi[0];
    545    mov         code, [actbl + c_derived_tbl.ehufco + 0]  ;   code = actbl->ehufco[0];
    546    sub         free_bitsb, nbitsb                        ;   if ((free_bits -= nbits) <= 0)
    547    jg          .EFN_SKIP_EMIT_CODE                       ;   {
    548    EMIT_QWORD  .EFN                                      ;     insert code, flush buffer
    549    align       16
    550 .EFN_SKIP_EMIT_CODE:                                      ;   } else {
    551    shl         put_buffer, nbitsb                        ;     put_buffer <<= nbits;
    552    or          put_buffer, codeq                         ;     put_buffer |= code;
           ; Epilogue: write the bit buffer state back, release the t array and
           ; restore the saved registers in reverse push order.  rax still holds
           ; buffer, which is the JOCTET * result named in the prototype comment.
    553 .EFN:                                                     ; } }
    554    mov         [state + working_state.cur.put_buffer.simd], put_buffer
    555                                                          ; state->cur.put_buffer.simd = put_buffer;
           ; NOTE(review): free_bits was loaded as a dword but only its low byte is
           ; written back, so the upper three bytes of the field keep their previous
           ; contents -- confirm this is intended.
    556    mov         byte [state + working_state.cur.free_bits], free_bitsb
    557                                                          ; state->cur.free_bits = free_bits;
    558    sub         rsp, -DCTSIZE2 * SIZEOF_WORD - 8
    559    pop         r12
    560 %ifdef WIN64
    561    pop         rdi
    562    pop         rsi
    563    pop         rbx
    564 %else
    565    pop         rbx
    566 %endif
    567    pop         r15
    568    pop         rbp
    569    ret
    570 
    571 ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    572 
    573    align       16
    574 .EMIT_BRLOOP_CODE:
           ; Out-of-line flush for .BRLOOP: identical to .EMIT_CODE except that the
           ; optional macro argument reloads nbits from code_temp (the zero-run count
           ; still pending) before control returns to the run-length test.
    575    EMIT_QWORD  .EMIT_BRLOOP_CODE_END, { mov nbits, code_temp }
    576                                                          ; insert code, flush buffer,
    577                                                          ; nbits = code_temp, goto .EMIT_BRLOOP_CODE_END
    578 
    579 ; For some reason, the OS X linker does not honor the request to align the
    580 ; segment unless we do this.
    581    align       32