tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jcphuff-sse2.asm (18380B)


      1 ;
      2 ; jcphuff-sse2.asm - prepare data for progressive Huffman encoding (SSE2)
      3 ;
      4 ; Copyright (C) 2016, 2018, Matthieu Darbois
      5 ;
      6 ; Based on the x86 SIMD extension for IJG JPEG library
      7 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      8 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
      9 ;
     10 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     11 ;
     12 ; This file contains an SSE2 implementation of data preparation for progressive
     13 ; Huffman encoding.  See jcphuff.c for more details.
     14 
     15 %include "jsimdext.inc"
     16 
     17 ; --------------------------------------------------------------------------
     18    SECTION     SEG_TEXT
     19    BITS        32
     20 
     21 ; --------------------------------------------------------------------------
     22 ; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
     23 ; jsimd_encode_mcu_AC_refine_prepare_sse2()
     24 
     25 %macro LOAD16 0                 ; gather BLOCK[LUT[0..15]] -> X0 (coefs 0-7), X1 (coefs 8-15); N0/N1 := 0
     26    pxor        N0, N0
     27    pxor        N1, N1
     28 
     29    mov         T0, INT [LUT +  0*SIZEOF_INT]
     30    mov         T1, INT [LUT +  8*SIZEOF_INT]
     31    pinsrw      X0, word [BLOCK + T0 * 2], 0
     32    pinsrw      X1, word [BLOCK + T1 * 2], 0
     33 
     34    mov         T0, INT [LUT +  1*SIZEOF_INT]
     35    mov         T1, INT [LUT +  9*SIZEOF_INT]
     36    pinsrw      X0, word [BLOCK + T0 * 2], 1
     37    pinsrw      X1, word [BLOCK + T1 * 2], 1
     38 
     39    mov         T0, INT [LUT +  2*SIZEOF_INT]
     40    mov         T1, INT [LUT + 10*SIZEOF_INT]
     41    pinsrw      X0, word [BLOCK + T0 * 2], 2
     42    pinsrw      X1, word [BLOCK + T1 * 2], 2
     43 
     44    mov         T0, INT [LUT +  3*SIZEOF_INT]
     45    mov         T1, INT [LUT + 11*SIZEOF_INT]
     46    pinsrw      X0, word [BLOCK + T0 * 2], 3
     47    pinsrw      X1, word [BLOCK + T1 * 2], 3
     48 
     49    mov         T0, INT [LUT +  4*SIZEOF_INT]
     50    mov         T1, INT [LUT + 12*SIZEOF_INT]
     51    pinsrw      X0, word [BLOCK + T0 * 2], 4
     52    pinsrw      X1, word [BLOCK + T1 * 2], 4
     53 
     54    mov         T0, INT [LUT +  5*SIZEOF_INT]
     55    mov         T1, INT [LUT + 13*SIZEOF_INT]
     56    pinsrw      X0, word [BLOCK + T0 * 2], 5
     57    pinsrw      X1, word [BLOCK + T1 * 2], 5
     58 
     59    mov         T0, INT [LUT +  6*SIZEOF_INT]
     60    mov         T1, INT [LUT + 14*SIZEOF_INT]
     61    pinsrw      X0, word [BLOCK + T0 * 2], 6
     62    pinsrw      X1, word [BLOCK + T1 * 2], 6
     63 
     64    mov         T0, INT [LUT +  7*SIZEOF_INT]
     65    mov         T1, INT [LUT + 15*SIZEOF_INT]
     66    pinsrw      X0, word [BLOCK + T0 * 2], 7
     67    pinsrw      X1, word [BLOCK + T1 * 2], 7
     68 %endmacro
     69 
     70 %macro LOAD15 0                 ; gather 8 + LENEND (LENEND = 1..7) coefs; X0 full, X1 words 0..LENEND-1; rest of X1 stays 0
     71    pxor        N0, N0
     72    pxor        N1, N1
     73    pxor        X1, X1           ; clear X1 so unloaded tail words read as 0
     74 
     75    mov         T0, INT [LUT +  0*SIZEOF_INT]
     76    mov         T1, INT [LUT +  8*SIZEOF_INT]
     77    pinsrw      X0, word [BLOCK + T0 * 2], 0
     78    pinsrw      X1, word [BLOCK + T1 * 2], 0
     79 
     80    mov         T0, INT [LUT +  1*SIZEOF_INT]
     81    pinsrw      X0, word [BLOCK + T0 * 2], 1
     82 
     83    mov         T0, INT [LUT +  2*SIZEOF_INT]
     84    pinsrw      X0, word [BLOCK + T0 * 2], 2
     85 
     86    mov         T0, INT [LUT +  3*SIZEOF_INT]
     87    pinsrw      X0, word [BLOCK + T0 * 2], 3
     88 
     89    mov         T0, INT [LUT +  4*SIZEOF_INT]
     90    pinsrw      X0, word [BLOCK + T0 * 2], 4
     91 
     92    mov         T0, INT [LUT +  5*SIZEOF_INT]
     93    pinsrw      X0, word [BLOCK + T0 * 2], 5
     94 
     95    mov         T0, INT [LUT +  6*SIZEOF_INT]
     96    pinsrw      X0, word [BLOCK + T0 * 2], 6
     97 
     98    mov         T0, INT [LUT +  7*SIZEOF_INT]
     99    pinsrw      X0, word [BLOCK + T0 * 2], 7
    100 
    101    cmp         LENEND, 2        ; load X1 word k only when LENEND > k
    102    jl          %%.ELOAD15
    103    mov         T1, INT [LUT +  9*SIZEOF_INT]
    104    pinsrw      X1, word [BLOCK + T1 * 2], 1
    105 
    106    cmp         LENEND, 3
    107    jl          %%.ELOAD15
    108    mov         T1, INT [LUT + 10*SIZEOF_INT]
    109    pinsrw      X1, word [BLOCK + T1 * 2], 2
    110 
    111    cmp         LENEND, 4
    112    jl          %%.ELOAD15
    113    mov         T1, INT [LUT + 11*SIZEOF_INT]
    114    pinsrw      X1, word [BLOCK + T1 * 2], 3
    115 
    116    cmp         LENEND, 5
    117    jl          %%.ELOAD15
    118    mov         T1, INT [LUT + 12*SIZEOF_INT]
    119    pinsrw      X1, word [BLOCK + T1 * 2], 4
    120 
    121    cmp         LENEND, 6
    122    jl          %%.ELOAD15
    123    mov         T1, INT [LUT + 13*SIZEOF_INT]
    124    pinsrw      X1, word [BLOCK + T1 * 2], 5
    125 
    126    cmp         LENEND, 7
    127    jl          %%.ELOAD15
    128    mov         T1, INT [LUT + 14*SIZEOF_INT]
    129    pinsrw      X1, word [BLOCK + T1 * 2], 6
    130 %%.ELOAD15:
    131 %endmacro
    132 
    133 %macro LOAD8 0                  ; gather BLOCK[LUT[0..7]] into X0; N0 := 0
    134    pxor        N0, N0
    135 
    136    mov         T0, INT [LUT +  0*SIZEOF_INT]
    137    pinsrw      X0, word [BLOCK + T0 * 2], 0
    138 
    139    mov         T0, INT [LUT +  1*SIZEOF_INT]
    140    pinsrw      X0, word [BLOCK + T0 * 2], 1
    141 
    142    mov         T0, INT [LUT +  2*SIZEOF_INT]
    143    pinsrw      X0, word [BLOCK + T0 * 2], 2
    144 
    145    mov         T0, INT [LUT +  3*SIZEOF_INT]
    146    pinsrw      X0, word [BLOCK + T0 * 2], 3
    147 
    148    mov         T0, INT [LUT +  4*SIZEOF_INT]
    149    pinsrw      X0, word [BLOCK + T0 * 2], 4
    150 
    151    mov         T0, INT [LUT +  5*SIZEOF_INT]
    152    pinsrw      X0, word [BLOCK + T0 * 2], 5
    153 
    154    mov         T0, INT [LUT +  6*SIZEOF_INT]
    155    pinsrw      X0, word [BLOCK + T0 * 2], 6
    156 
    157    mov         T0, INT [LUT +  7*SIZEOF_INT]
    158    pinsrw      X0, word [BLOCK + T0 * 2], 7
    159 %endmacro
    160 
    161 %macro LOAD7 0                  ; gather LENEND (1..7) coefs into X0 words 0..LENEND-1; rest stays 0; T1 is scratch
    162    pxor        N0, N0
    163    pxor        X0, X0           ; clear X0 so unloaded tail words read as 0
    164 
    165    mov         T1, INT [LUT +  0*SIZEOF_INT]
    166    pinsrw      X0, word [BLOCK + T1 * 2], 0
    167 
    168    cmp         LENEND, 2        ; load word k only when LENEND > k
    169    jl          %%.ELOAD7
    170    mov         T1, INT [LUT +  1*SIZEOF_INT]
    171    pinsrw      X0, word [BLOCK + T1 * 2], 1
    172 
    173    cmp         LENEND, 3
    174    jl          %%.ELOAD7
    175    mov         T1, INT [LUT +  2*SIZEOF_INT]
    176    pinsrw      X0, word [BLOCK + T1 * 2], 2
    177 
    178    cmp         LENEND, 4
    179    jl          %%.ELOAD7
    180    mov         T1, INT [LUT +  3*SIZEOF_INT]
    181    pinsrw      X0, word [BLOCK + T1 * 2], 3
    182 
    183    cmp         LENEND, 5
    184    jl          %%.ELOAD7
    185    mov         T1, INT [LUT +  4*SIZEOF_INT]
    186    pinsrw      X0, word [BLOCK + T1 * 2], 4
    187 
    188    cmp         LENEND, 6
    189    jl          %%.ELOAD7
    190    mov         T1, INT [LUT +  5*SIZEOF_INT]
    191    pinsrw      X0, word [BLOCK + T1 * 2], 5
    192 
    193    cmp         LENEND, 7
    194    jl          %%.ELOAD7
    195    mov         T1, INT [LUT +  6*SIZEOF_INT]
    196    pinsrw      X0, word [BLOCK + T1 * 2], 6
    197 %%.ELOAD7:
    198 %endmacro
    199 
    200 %macro REDUCE0 0                ; write 64-bit "coef != 0" bitmap of VALUES[0..63] to *ZEROBITS; assumes ZERO (xmm7) == 0 on entry
    201    movdqa      xmm0, XMMWORD [VALUES + ( 0*2)]
    202    movdqa      xmm1, XMMWORD [VALUES + ( 8*2)]
    203    movdqa      xmm2, XMMWORD [VALUES + (16*2)]
    204    movdqa      xmm3, XMMWORD [VALUES + (24*2)]
    205    movdqa      xmm4, XMMWORD [VALUES + (32*2)]
    206    movdqa      xmm5, XMMWORD [VALUES + (40*2)]
    207    movdqa      xmm6, XMMWORD [VALUES + (48*2)]
    208 
    209    pcmpeqw     xmm0, ZERO       ; word = 0xFFFF where coef == 0
    210    pcmpeqw     xmm1, ZERO
    211    pcmpeqw     xmm2, ZERO
    212    pcmpeqw     xmm3, ZERO
    213    pcmpeqw     xmm4, ZERO
    214    pcmpeqw     xmm5, ZERO
    215    pcmpeqw     xmm6, ZERO
    216    pcmpeqw     xmm7, XMMWORD [VALUES + (56*2)]  ; xmm7 is ZERO here, so this tests VALUES[56..63] (saves a load)
    217 
    218    packsswb    xmm0, xmm1       ; narrow word masks to byte masks (16 coefs per xmm)
    219    packsswb    xmm2, xmm3
    220    packsswb    xmm4, xmm5
    221    packsswb    xmm6, xmm7
    222 
    223    pmovmskb    eax, xmm0        ; one bit per coefficient
    224    pmovmskb    ecx, xmm2
    225    pmovmskb    edx, xmm4
    226    pmovmskb    esi, xmm6
    227 
    228    shl         ecx, 16
    229    shl         esi, 16
    230 
    231    or          eax, ecx         ; eax = zero-mask of coefs 0..31, edx = 32..63
    232    or          edx, esi
    233 
    234    not         eax              ; invert: bit set where coef != 0
    235    not         edx
    236 
    237    mov         edi, ZEROBITS
    238 
    239    mov         INT [edi], eax
    240    mov         INT [edi+SIZEOF_INT], edx
    241 %endmacro
    242 
    243 ;
    244 ; Prepare data for jsimd_encode_mcu_AC_first().
    245 ;
    246 ; GLOBAL(void)
    247 ; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
    248 ;                                        const int *jpeg_natural_order_start,
    249 ;                                        int Sl, int Al, JCOEF *values,
    250 ;                                        size_t *zerobits)
    251 ;
    252 ; eax + 8 = const JCOEF *block
    253 ; eax + 12 = const int *jpeg_natural_order_start
    254 ; eax + 16 = int Sl
    255 ; eax + 20 = int Al
    256 ; eax + 24 = JCOEF *values
    257 ; eax + 28 = size_t *zerobits
    258 
    259 %define ZERO    xmm7
    260 %define X0      xmm0
    261 %define X1      xmm1
    262 %define N0      xmm2
    263 %define N1      xmm3
    264 %define AL      xmm4
    265 %define K       eax
    266 %define LENEND  eax
    267 %define LUT     ebx
    268 %define T0      ecx
    269 %define T1      edx
    270 %define BLOCK   esi
    271 %define VALUES  edi
    272 %define LEN     ebp
    273 
    274 %define ZEROBITS  INT [esp + 5 * 4]
    275 
    276    align       32
    277    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)
    278 
    279 EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
    280    push        ebp
    281    mov         eax, esp                     ; eax = original ebp
    282    sub         esp, byte 4
    283    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    284    mov         [esp], eax
    285    mov         ebp, esp                     ; ebp = aligned ebp
    286    sub         esp, 4
    287    push        ebx
    288    push        ecx
    289 ;   push        edx                     ; need not be preserved
    290    push        esi
    291    push        edi
    292    push        ebp
    293 
    294    mov         BLOCK, INT [eax + 8]
    295    mov         LUT, INT [eax + 12]
    296    mov         VALUES, INT [eax + 24]
    297    movd        AL, INT [eax + 20]      ; AL = Al (point-transform shift count)
    298    mov         T0, INT [eax + 28]
    299    mov         ZEROBITS, T0
    300    mov         LEN, INT [eax + 16]
    301    pxor        ZERO, ZERO
    302    mov         K, LEN
    303    and         K, -16
    304    shr         K, 4                    ; K = LEN / 16 = number of full 16-coef groups
    305    jz          .ELOOP16                ; LEN < 16: go straight to the remainder handler
    306 .BLOOP16:
    307    LOAD16
    308    pcmpgtw     N0, X0                  ; N0 = 0xFFFF per word where coef < 0
    309    pcmpgtw     N1, X1
    310    paddw       X0, N0
    311    paddw       X1, N1
    312    pxor        X0, N0                  ; (x + mask) ^ mask: X0 = abs(coef)
    313    pxor        X1, N1
    314    psrlw       X0, AL                  ; X0 = abs(coef) >> Al
    315    psrlw       X1, AL
    316    pxor        N0, X0                  ; N0 = (coef < 0) ? ~(abs >> Al) : (abs >> Al)
    317    pxor        N1, X1
    318    movdqa      XMMWORD [VALUES + (0) * 2], X0
    319    movdqa      XMMWORD [VALUES + (8) * 2], X1
    320    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0  ; complemented copies live at values[k + DCTSIZE2]
    321    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    322    add         VALUES, 16*2
    323    add         LUT, 16*SIZEOF_INT
    324    dec         K
    325    jnz         .BLOOP16
    326    test        LEN, 15                 ; no remainder -> skip straight to padding
    327    je          .PADDING
    328 .ELOOP16:
    329    mov         LENEND, LEN
    330    and         LENEND, 7               ; LENEND = LEN mod 8
    331 
    332    test        LEN, 8                  ; dispatch on remainder size: 1-7, 8, or 9-15
    333    jz          .TRY7
    334    test        LEN, 7
    335    jz          .TRY8
    336 
    337    LOAD15
    338    pcmpgtw     N0, X0
    339    pcmpgtw     N1, X1
    340    paddw       X0, N0
    341    paddw       X1, N1
    342    pxor        X0, N0
    343    pxor        X1, N1
    344    psrlw       X0, AL
    345    psrlw       X1, AL
    346    pxor        N0, X0
    347    pxor        N1, X1
    348    movdqa      XMMWORD [VALUES + (0) * 2], X0
    349    movdqa      XMMWORD [VALUES + (8) * 2], X1
    350    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    351    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    352    add         VALUES, 16*2
    353    jmp         .PADDING
    354 .TRY8:
    355    LOAD8
    356    pcmpgtw     N0, X0
    357    paddw       X0, N0
    358    pxor        X0, N0
    359    psrlw       X0, AL
    360    pxor        N0, X0
    361    movdqa      XMMWORD [VALUES + (0) * 2], X0
    362    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    363    add         VALUES, 8*2
    364    jmp         .PADDING
    365 .TRY7:
    366    LOAD7
    367    pcmpgtw     N0, X0
    368    paddw       X0, N0
    369    pxor        X0, N0
    370    psrlw       X0, AL
    371    pxor        N0, X0
    372    movdqa      XMMWORD [VALUES + (0) * 2], X0
    373    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    374    add         VALUES, 8*2
    375 .PADDING:
    376    mov         K, LEN
    377    add         K, 7
    378    and         K, -8
    379    shr         K, 3
    380    sub         K, DCTSIZE2/8           ; K = ceil(LEN/8) - DCTSIZE2/8 (negative count of 8-word groups still to zero)
    381    jz          .EPADDING
    382    align       16
    383 .ZEROLOOP:
    384    movdqa      XMMWORD [VALUES + 0], ZERO  ; zero-pad values[] out to DCTSIZE2 entries
    385    add         VALUES, 8*2
    386    inc         K
    387    jnz         .ZEROLOOP
    388 .EPADDING:
    389    sub         VALUES, DCTSIZE2*2      ; rewind VALUES to the start of the array for REDUCE0
    390 
    391    REDUCE0
    392 
    393    pop         ebp
    394    pop         edi
    395    pop         esi
    396 ;   pop         edx                     ; need not be preserved
    397    pop         ecx
    398    pop         ebx
    399    mov         esp, ebp                ; esp <- aligned ebp
    400    pop         esp                     ; esp <- original ebp
    401    pop         ebp
    402    ret
    403 
    404 %undef ZERO
    405 %undef X0
    406 %undef X1
    407 %undef N0
    408 %undef N1
    409 %undef AL
    410 %undef K
    411 %undef LUT
    412 %undef T0
    413 %undef T1
    414 %undef BLOCK
    415 %undef VALUES
    416 %undef LEN
    417 
    418 ;
    419 ; Prepare data for jsimd_encode_mcu_AC_refine().
    420 ;
    421 ; GLOBAL(int)
    422 ; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
    423 ;                                         const int *jpeg_natural_order_start,
    424 ;                                         int Sl, int Al, JCOEF *absvalues,
    425 ;                                         size_t *bits)
    426 ;
    427 ; eax + 8 = const JCOEF *block
    428 ; eax + 12 = const int *jpeg_natural_order_start
    429 ; eax + 16 = int Sl
    430 ; eax + 20 = int Al
    431 ; eax + 24 = JCOEF *values
    432 ; eax + 28 = size_t *bits
    433 
    434 %define ZERO    xmm7
    435 %define ONE     xmm5
    436 %define X0      xmm0
    437 %define X1      xmm1
    438 %define N0      xmm2
    439 %define N1      xmm3
    440 %define AL      xmm4
    441 %define K       eax
    442 %define LENEND  eax
    443 %define LUT     ebx
    444 %define T0      ecx
    445 %define T0w      cx
    446 %define T1      edx
    447 %define BLOCK   esi
    448 %define VALUES  edi
    449 %define KK      ebp
    450 
    451 %define ZEROBITS  INT [esp + 5 * 4]
    452 %define EOB       INT [esp + 5 * 4 + 4]
    453 %define LEN       INT [esp + 5 * 4 + 8]
    454 
    455    align       32
    456    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)
    457 
    458 EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
    459    push        ebp
    460    mov         eax, esp                     ; eax = original ebp
    461    sub         esp, byte 4
    462    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    463    mov         [esp], eax
    464    mov         ebp, esp                     ; ebp = aligned ebp
    465    sub         esp, 16
    466    push        ebx
    467    push        ecx
    468 ;   push        edx                     ; need not be preserved
    469    push        esi
    470    push        edi
    471    push        ebp
    472 
    473    pcmpeqw     ONE, ONE
    474    psrlw       ONE, 15                 ; ONE = 0x0001 in every word
    475    mov         BLOCK, INT [eax + 8]
    476    mov         LUT, INT [eax + 12]
    477    mov         VALUES, INT [eax + 24]
    478    movd        AL, INT [eax + 20]      ; AL = Al (point-transform shift count)
    479    mov         T0, INT [eax + 28]
    480    mov         K,  INT [eax + 16]
    481    mov         INT [T0 + 2 * SIZEOF_INT], -1  ; preset sign-bit words (bits[2..3]) to all-ones
    482    mov         INT [T0 + 3 * SIZEOF_INT], -1
    483    mov         ZEROBITS, T0
    484    mov         LEN, K
    485    pxor        ZERO, ZERO
    486    and         K, -16
    487    mov         EOB, 0
    488    xor         KK, KK                  ; KK = k/8: byte offset into the sign-bit words
    489    shr         K, 4                    ; K = LEN / 16 full groups
    490    jz          .ELOOPR16               ; LEN < 16: go straight to the remainder handler
    491 .BLOOPR16:
    492    LOAD16
    493    pcmpgtw     N0, X0                  ; N0 = 0xFFFF per word where coef < 0
    494    pcmpgtw     N1, X1
    495    paddw       X0, N0
    496    paddw       X1, N1
    497    pxor        X0, N0                  ; (x + mask) ^ mask: X0 = abs(coef)
    498    pxor        X1, N1
    499    psrlw       X0, AL                  ; X0 = abs(coef) >> Al
    500    psrlw       X1, AL
    501    movdqa      XMMWORD [VALUES + (0) * 2], X0
    502    movdqa      XMMWORD [VALUES + (8) * 2], X1
    503    pcmpeqw     X0, ONE                 ; flag words where (abs >> Al) == 1 (refinement-bit candidates)
    504    pcmpeqw     X1, ONE
    505    packsswb    N0, N1
    506    packsswb    X0, X1
    507    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    508    mov         T1, ZEROBITS
    509    not         T0
    510    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w  ; store 16 sign bits at byte offset KK
    511    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
    512    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1); BSR sets ZF=1 iff mask == 0
    513    jz          .CONTINUER16            ; if (idx) {
    514    lea         T1, [T1+KK*8]           ; KK*8 == k, so EOB candidate = k + bit index
    515    mov         EOB, T1                 ; EOB = k + idx;
    516 .CONTINUER16:
    517    add         VALUES, 16*2
    518    add         LUT, 16*SIZEOF_INT
    519    add         KK, 2
    520    dec         K
    521    jnz         .BLOOPR16
    522    test        LEN, 15                 ; no remainder -> skip straight to padding
    523    je          .PADDINGR
    524 .ELOOPR16:
    525    mov         LENEND, LEN
    526 
    527    test        LENEND, 8               ; dispatch on remainder size: 1-7, 8, or 9-15
    528    jz          .TRYR7
    529    test        LENEND, 7
    530    jz          .TRYR8
    531 
    532    and         LENEND, 7               ; LENEND = count of coefs beyond the first 8
    533    LOAD15
    534    pcmpgtw     N0, X0
    535    pcmpgtw     N1, X1
    536    paddw       X0, N0
    537    paddw       X1, N1
    538    pxor        X0, N0
    539    pxor        X1, N1
    540    psrlw       X0, AL
    541    psrlw       X1, AL
    542    movdqa      XMMWORD [VALUES + (0) * 2], X0
    543    movdqa      XMMWORD [VALUES + (8) * 2], X1
    544    pcmpeqw     X0, ONE
    545    pcmpeqw     X1, ONE
    546    packsswb    N0, N1
    547    packsswb    X0, X1
    548    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    549    mov         T1, ZEROBITS
    550    not         T0
    551    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    552    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
    553    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
    554    jz          .CONTINUER15            ; if (idx) {
    555    lea         T1, [T1+KK*8]
    556    mov         EOB, T1                 ; EOB = k + idx;
    557 .CONTINUER15:
    558    add         VALUES, 16*2
    559    jmp         .PADDINGR
    560 .TRYR8:
    561    LOAD8
    562 
    563    pcmpgtw     N0, X0
    564    paddw       X0, N0
    565    pxor        X0, N0
    566    psrlw       X0, AL
    567    movdqa      XMMWORD [VALUES + (0) * 2], X0
    568    pcmpeqw     X0, ONE
    569    packsswb    N0, ZERO                ; only 8 coefs: high half of the masks is zero
    570    packsswb    X0, ZERO
    571    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    572    mov         T1, ZEROBITS
    573    not         T0
    574    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    575    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
    576    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
    577    jz          .CONTINUER8             ; if (idx) {
    578    lea         T1, [T1+KK*8]
    579    mov         EOB, T1                 ; EOB = k + idx;
    580 .CONTINUER8:
    581    add         VALUES, 8*2
    582    jmp         .PADDINGR
    583 .TRYR7:
    584    and         LENEND, 7
    585    LOAD7
    586 
    587    pcmpgtw     N0, X0
    588    paddw       X0, N0
    589    pxor        X0, N0
    590    psrlw       X0, AL
    591    movdqa      XMMWORD [VALUES + (0) * 2], X0
    592    pcmpeqw     X0, ONE
    593    packsswb    N0, ZERO
    594    packsswb    X0, ZERO
    595    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
    596    mov         T1, ZEROBITS
    597    not         T0
    598    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
    599    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
    600    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
    601    jz          .CONTINUER7             ; if (idx) {
    602    lea         T1, [T1+KK*8]
    603    mov         EOB, T1                 ; EOB = k + idx;
    604 .CONTINUER7:
    605    add         VALUES, 8*2
    606 .PADDINGR:
    607    mov         K, LEN
    608    add         K, 7
    609    and         K, -8
    610    shr         K, 3
    611    sub         K, DCTSIZE2/8           ; K = ceil(LEN/8) - DCTSIZE2/8 (negative count of 8-word groups still to zero)
    612    jz          .EPADDINGR
    613    align       16
    614 .ZEROLOOPR:
    615    movdqa      XMMWORD [VALUES + 0], ZERO  ; zero-pad values[] out to DCTSIZE2 entries
    616    add         VALUES, 8*2
    617    inc         K
    618    jnz         .ZEROLOOPR
    619 .EPADDINGR:
    620    sub         VALUES, DCTSIZE2*2      ; rewind VALUES to the start of the array for REDUCE0
    621 
    622    REDUCE0
    623 
    624    mov         eax, EOB                ; return value: index of last refinement-bit coefficient
    625 
    626    pop         ebp
    627    pop         edi
    628    pop         esi
    629 ;   pop         edx                     ; need not be preserved
    630    pop         ecx
    631    pop         ebx
    632    mov         esp, ebp                ; esp <- aligned ebp
    633    pop         esp                     ; esp <- original ebp
    634    pop         ebp
    635    ret
    636 
    637 %undef ZERO
    638 %undef ONE
    639 %undef X0
    640 %undef X1
    641 %undef N0
    642 %undef N1
    643 %undef AL
    644 %undef K
    645 %undef KK
    646 %undef EOB
    647 %undef SIGN
    648 %undef LUT
    649 %undef T0
    650 %undef T1
    651 %undef BLOCK
    652 %undef VALUES
    653 %undef LEN
    654 %undef LENEND
    655 
    656 ; For some reason, the OS X linker does not honor the request to align the
    657 ; segment unless we do this.
    658    align       32