tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jcphuff-sse2.asm (17152B)


      1 ;
      2 ; jcphuff-sse2.asm - prepare data for progressive Huffman encoding
      3 ; (64-bit SSE2)
      4 ;
      5 ; Copyright (C) 2016, 2018, Matthieu Darbois
      6 ; Copyright (C) 2023, Aliaksiej Kandracienka.
      7 ; Copyright (C) 2024, D. R. Commander.
      8 ;
      9 ; Based on the x86 SIMD extension for IJG JPEG library
     10 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     11 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     12 ;
     13 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     14 ;
     15 ; This file contains an SSE2 implementation of data preparation for progressive
     16 ; Huffman encoding.  See jcphuff.c for more details.
     17 
     18 %include "jsimdext.inc"
     19 
     20 ; --------------------------------------------------------------------------
     21    SECTION     SEG_TEXT
     22    BITS        64
     23 
     24 ; --------------------------------------------------------------------------
     25 ; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
     26 ; jsimd_encode_mcu_AC_refine_prepare_sse2()
     27 
     28 %macro LOAD16 0
        ; Gather 16 words from BLOCK, at the element indices stored in LUT[0..15],
        ; into X0 (LUT[0..7] -> lanes 0..7) and X1 (LUT[8..15] -> lanes 0..7).
        ; N0 and N1 are cleared so the invoking code can build sign masks in them.
        ; All eight lanes of X0 and X1 are overwritten, so neither is cleared first.
        ; Relies on the invoking function's %defines (N0, N1, X0, X1, T0, T1, LUT,
        ; BLOCK); clobbers T0 and T1.
     29    pxor        N0, N0
     30    pxor        N1, N1
     31 
     32    mov         T0d, INT [LUT +  0*SIZEOF_INT]  ; 32-bit write zero-extends into T0
     33    mov         T1d, INT [LUT +  8*SIZEOF_INT]
     34    pinsrw      X0, word [BLOCK + T0 * 2], 0  ; index * 2: each element is read as a 16-bit word
     35    pinsrw      X1, word [BLOCK + T1 * 2], 0
     36 
     37    mov         T0d, INT [LUT +  1*SIZEOF_INT]
     38    mov         T1d, INT [LUT +  9*SIZEOF_INT]
     39    pinsrw      X0, word [BLOCK + T0 * 2], 1
     40    pinsrw      X1, word [BLOCK + T1 * 2], 1
     41 
     42    mov         T0d, INT [LUT +  2*SIZEOF_INT]
     43    mov         T1d, INT [LUT + 10*SIZEOF_INT]
     44    pinsrw      X0, word [BLOCK + T0 * 2], 2
     45    pinsrw      X1, word [BLOCK + T1 * 2], 2
     46 
     47    mov         T0d, INT [LUT +  3*SIZEOF_INT]
     48    mov         T1d, INT [LUT + 11*SIZEOF_INT]
     49    pinsrw      X0, word [BLOCK + T0 * 2], 3
     50    pinsrw      X1, word [BLOCK + T1 * 2], 3
     51 
     52    mov         T0d, INT [LUT +  4*SIZEOF_INT]
     53    mov         T1d, INT [LUT + 12*SIZEOF_INT]
     54    pinsrw      X0, word [BLOCK + T0 * 2], 4
     55    pinsrw      X1, word [BLOCK + T1 * 2], 4
     56 
     57    mov         T0d, INT [LUT +  5*SIZEOF_INT]
     58    mov         T1d, INT [LUT + 13*SIZEOF_INT]
     59    pinsrw      X0, word [BLOCK + T0 * 2], 5
     60    pinsrw      X1, word [BLOCK + T1 * 2], 5
     61 
     62    mov         T0d, INT [LUT +  6*SIZEOF_INT]
     63    mov         T1d, INT [LUT + 14*SIZEOF_INT]
     64    pinsrw      X0, word [BLOCK + T0 * 2], 6
     65    pinsrw      X1, word [BLOCK + T1 * 2], 6
     66 
     67    mov         T0d, INT [LUT +  7*SIZEOF_INT]
     68    mov         T1d, INT [LUT + 15*SIZEOF_INT]
     69    pinsrw      X0, word [BLOCK + T0 * 2], 7
     70    pinsrw      X1, word [BLOCK + T1 * 2], 7
     71 %endmacro
     72 
     73 %macro LOAD15 0
        ; Gather a tail of 9..15 words: X0 is filled completely from LUT[0..7];
        ; X1 receives LUT[8] in lane 0 unconditionally and LUT[9..14] in lanes 1..6
        ; only while LENEND (set by the invoking code to LEN & 7) is large enough.
        ; Lane 7 of X1 is never loaded.  N0 and N1 are cleared.
        ; Relies on the invoking function's %defines; clobbers T0 and T1.
     74    pxor        N0, N0
     75    pxor        N1, N1
     76    pxor        X1, X1  ; lanes of X1 that are not loaded below must read as zero
     77 
     78    mov         T0d, INT [LUT +  0*SIZEOF_INT]
     79    mov         T1d, INT [LUT +  8*SIZEOF_INT]
     80    pinsrw      X0, word [BLOCK + T0 * 2], 0
     81    pinsrw      X1, word [BLOCK + T1 * 2], 0
     82 
     83    mov         T0d, INT [LUT +  1*SIZEOF_INT]
     84    pinsrw      X0, word [BLOCK + T0 * 2], 1
     85 
     86    mov         T0d, INT [LUT +  2*SIZEOF_INT]
     87    pinsrw      X0, word [BLOCK + T0 * 2], 2
     88 
     89    mov         T0d, INT [LUT +  3*SIZEOF_INT]
     90    pinsrw      X0, word [BLOCK + T0 * 2], 3
     91 
     92    mov         T0d, INT [LUT +  4*SIZEOF_INT]
     93    pinsrw      X0, word [BLOCK + T0 * 2], 4
     94 
     95    mov         T0d, INT [LUT +  5*SIZEOF_INT]
     96    pinsrw      X0, word [BLOCK + T0 * 2], 5
     97 
     98    mov         T0d, INT [LUT +  6*SIZEOF_INT]
     99    pinsrw      X0, word [BLOCK + T0 * 2], 6
    100 
    101    mov         T0d, INT [LUT +  7*SIZEOF_INT]
    102    pinsrw      X0, word [BLOCK + T0 * 2], 7
    103 
    104    cmp         LENEND, 2  ; each check below: stop once the tail holds no further word
    105    jl          %%.ELOAD15
    106    mov         T1d, INT [LUT +  9*SIZEOF_INT]
    107    pinsrw      X1, word [BLOCK + T1 * 2], 1
    108 
    109    cmp         LENEND, 3
    110    jl          %%.ELOAD15
    111    mov         T1d, INT [LUT + 10*SIZEOF_INT]
    112    pinsrw      X1, word [BLOCK + T1 * 2], 2
    113 
    114    cmp         LENEND, 4
    115    jl          %%.ELOAD15
    116    mov         T1d, INT [LUT + 11*SIZEOF_INT]
    117    pinsrw      X1, word [BLOCK + T1 * 2], 3
    118 
    119    cmp         LENEND, 5
    120    jl          %%.ELOAD15
    121    mov         T1d, INT [LUT + 12*SIZEOF_INT]
    122    pinsrw      X1, word [BLOCK + T1 * 2], 4
    123 
    124    cmp         LENEND, 6
    125    jl          %%.ELOAD15
    126    mov         T1d, INT [LUT + 13*SIZEOF_INT]
    127    pinsrw      X1, word [BLOCK + T1 * 2], 5
    128 
    129    cmp         LENEND, 7
    130    jl          %%.ELOAD15
    131    mov         T1d, INT [LUT + 14*SIZEOF_INT]
    132    pinsrw      X1, word [BLOCK + T1 * 2], 6
    133 %%.ELOAD15:  ; macro-local label, unique per expansion
    134 %endmacro
    135 
    136 %macro LOAD8 0
        ; Gather exactly 8 words from BLOCK, at the element indices stored in
        ; LUT[0..7], into lanes 0..7 of X0, and clear N0.  Every lane of X0 is
        ; overwritten, so X0 is not cleared first.  X1/N1 are not touched.
        ; Relies on the invoking function's %defines; clobbers T0.
    137    pxor        N0, N0
    138 
    139    mov         T0d, INT [LUT +  0*SIZEOF_INT]  ; 32-bit write zero-extends into T0
    140    pinsrw      X0, word [BLOCK + T0 * 2], 0  ; index * 2: each element is read as a 16-bit word
    141 
    142    mov         T0d, INT [LUT +  1*SIZEOF_INT]
    143    pinsrw      X0, word [BLOCK + T0 * 2], 1
    144 
    145    mov         T0d, INT [LUT +  2*SIZEOF_INT]
    146    pinsrw      X0, word [BLOCK + T0 * 2], 2
    147 
    148    mov         T0d, INT [LUT +  3*SIZEOF_INT]
    149    pinsrw      X0, word [BLOCK + T0 * 2], 3
    150 
    151    mov         T0d, INT [LUT +  4*SIZEOF_INT]
    152    pinsrw      X0, word [BLOCK + T0 * 2], 4
    153 
    154    mov         T0d, INT [LUT +  5*SIZEOF_INT]
    155    pinsrw      X0, word [BLOCK + T0 * 2], 5
    156 
    157    mov         T0d, INT [LUT +  6*SIZEOF_INT]
    158    pinsrw      X0, word [BLOCK + T0 * 2], 6
    159 
    160    mov         T0d, INT [LUT +  7*SIZEOF_INT]
    161    pinsrw      X0, word [BLOCK + T0 * 2], 7
    162 %endmacro
    163 
    164 %macro LOAD7 0
        ; Gather a tail of up to 7 words into X0: lane 0 is loaded from LUT[0]
        ; unconditionally, lanes 1..6 from LUT[1..6] only while LENEND (set by the
        ; invoking code to LEN & 7) is large enough.  Lane 7 is never loaded.
        ; N0 is cleared.  Relies on the invoking function's %defines; clobbers T1.
        ; NOTE(review): because lane 0 is always loaded, this presumably expects
        ; LENEND >= 1 -- confirm that callers never reach it with an empty tail.
    165    pxor        N0, N0
    166    pxor        X0, X0  ; lanes of X0 that are not loaded below must read as zero
    167 
    168    mov         T1d, INT [LUT +  0*SIZEOF_INT]
    169    pinsrw      X0, word [BLOCK + T1 * 2], 0
    170 
    171    cmp         LENEND, 2  ; each check below: stop once the tail holds no further word
    172    jl          %%.ELOAD7
    173    mov         T1d, INT [LUT +  1*SIZEOF_INT]
    174    pinsrw      X0, word [BLOCK + T1 * 2], 1
    175 
    176    cmp         LENEND, 3
    177    jl          %%.ELOAD7
    178    mov         T1d, INT [LUT +  2*SIZEOF_INT]
    179    pinsrw      X0, word [BLOCK + T1 * 2], 2
    180 
    181    cmp         LENEND, 4
    182    jl          %%.ELOAD7
    183    mov         T1d, INT [LUT +  3*SIZEOF_INT]
    184    pinsrw      X0, word [BLOCK + T1 * 2], 3
    185 
    186    cmp         LENEND, 5
    187    jl          %%.ELOAD7
    188    mov         T1d, INT [LUT +  4*SIZEOF_INT]
    189    pinsrw      X0, word [BLOCK + T1 * 2], 4
    190 
    191    cmp         LENEND, 6
    192    jl          %%.ELOAD7
    193    mov         T1d, INT [LUT +  5*SIZEOF_INT]
    194    pinsrw      X0, word [BLOCK + T1 * 2], 5
    195 
    196    cmp         LENEND, 7
    197    jl          %%.ELOAD7
    198    mov         T1d, INT [LUT +  6*SIZEOF_INT]
    199    pinsrw      X0, word [BLOCK + T1 * 2], 6
    200 %%.ELOAD7:  ; macro-local label, unique per expansion
    201 %endmacro
    202 
    203 %macro REDUCE0 0
        ; Build a 64-bit bitmap of the 64 words at VALUES (bit i set <=> word i is
        ; non-zero) and store it at [r15].  VALUES must be 16-byte aligned (movdqa)
        ; and ZERO must hold all zeros.
        ; Uses raw register names rather than the %define aliases: clobbers
        ; xmm0-xmm7, rax, rcx, rdx and rsi (and therefore X0/X1/N0/N1/AL, K, T0, T1
        ; of the invoking functions).
    204    movdqa      xmm0, XMMWORD [VALUES + ( 0*2)]
    205    movdqa      xmm1, XMMWORD [VALUES + ( 8*2)]
    206    movdqa      xmm2, XMMWORD [VALUES + (16*2)]
    207    movdqa      xmm3, XMMWORD [VALUES + (24*2)]
    208    movdqa      xmm4, XMMWORD [VALUES + (32*2)]
    209    movdqa      xmm5, XMMWORD [VALUES + (40*2)]
    210    movdqa      xmm6, XMMWORD [VALUES + (48*2)]
    211    movdqa      xmm7, XMMWORD [VALUES + (56*2)]
    212 
    213    pcmpeqw     xmm0, ZERO  ; each word becomes 0xFFFF where the value is zero, else 0
    214    pcmpeqw     xmm1, ZERO
    215    pcmpeqw     xmm2, ZERO
    216    pcmpeqw     xmm3, ZERO
    217    pcmpeqw     xmm4, ZERO
    218    pcmpeqw     xmm5, ZERO
    219    pcmpeqw     xmm6, ZERO
    220    pcmpeqw     xmm7, ZERO
    221 
    222    packsswb    xmm0, xmm1  ; narrow word masks to byte masks: 16 values per register
    223    packsswb    xmm2, xmm3
    224    packsswb    xmm4, xmm5
    225    packsswb    xmm6, xmm7
    226 
    227    pmovmskb    eax, xmm0  ; "is zero" bits for values  0..15
    228    pmovmskb    ecx, xmm2  ; "is zero" bits for values 16..31
    229    pmovmskb    edx, xmm4  ; "is zero" bits for values 32..47
    230    pmovmskb    esi, xmm6  ; "is zero" bits for values 48..63
    231 
    232    shl         rcx, 16  ; move each 16-bit mask to its position in the 64-bit result
    233    shl         rdx, 32
    234    shl         rsi, 48
    235 
    236    or          rax, rcx
    237    or          rdx, rsi
    238    or          rax, rdx
    239 
    240    not         rax  ; invert: a set bit now marks a non-zero value
    241 
    242    mov         MMWORD [r15], rax  ; r15 is used directly, not through a %define
    243 %endmacro
    244 
    245 ;
    246 ; Prepare data for jsimd_encode_mcu_AC_first().
    247 ;
    248 ; GLOBAL(void)
    249 ; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
    250 ;                                        const int *jpeg_natural_order_start,
    251 ;                                        int Sl, int Al, JCOEF *values,
    252 ;                                        size_t *zerobits)
    253 ;
    254 ; r10 = const JCOEF *block
    255 ; r11 = const int *jpeg_natural_order_start
    256 ; r12 = int Sl
    257 ; r13 = int Al
    258 ; r14 = JCOEF *values
    259 ; r15 = size_t *zerobits
    260 
    261 %define ZERO    xmm9  ; all-zero vector; original xmm9 is saved on the stack
    262 %define X0      xmm0  ; first 8 coefficients of the current group
    263 %define X1      xmm1  ; second 8 coefficients of the current group
    264 %define N0      xmm2  ; sign mask, then sign-adjusted value, for X0
    265 %define N1      xmm3  ; sign mask, then sign-adjusted value, for X1
    266 %define AL      xmm4  ; right-shift count (Al)
    267 %define K       eax  ; loop counter
    268 %define LUT     r11  ; cursor into jpeg_natural_order_start
    269 %define T0      rcx  ; scratch index used by the LOAD* macros
    270 %define T0d     ecx
    271 %define T1      rdx  ; scratch index used by the LOAD* macros
    272 %define T1d     edx
    273 %define BLOCK   r10  ; source coefficient block
    274 %define VALUES  r14  ; output cursor
    275 %define LEN     r12d  ; Sl
    276 %define LENEND  r13d  ; Sl & 7; reuses the Al register once Al is copied to AL
    277 
    278    align       32
    279    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)
    280 
    281 EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
           ; For each of the Sl coefficients selected through the LUT, computes
           ; |coef| >> Al and stores it at VALUES[i]; stores the same value, bitwise
           ; complemented when the coefficient was negative, at VALUES[i + DCTSIZE2].
           ; The first array is zero-padded to DCTSIZE2 words, and REDUCE0 then
           ; writes a bitmap of its non-zero entries through r15.
           ; NOTE(review): the tail handling assumes Sl >= 1; with Sl == 0 the
           ; .TRY7 path would still emit one group -- confirm callers never pass 0.
    282    ENDBR64
    283    push        rbp
    284    mov         rbp, rsp
    285    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    286    sub         rsp, SIZEOF_XMMWORD
    287    movdqa      XMMWORD [rsp], ZERO  ; save caller's xmm9; restored before returning
    288    COLLECT_ARGS 6  ; macro defined outside this file; presumably loads r10-r15 as listed above
    289 
    290    movd        AL, r13d  ; capture Al before r13d is reused as LENEND
    291    pxor        ZERO, ZERO
    292    mov         K, LEN
    293    mov         LENEND, LEN
    294    and         K, -16
    295    and         LENEND, 7  ; LENEND = Sl mod 8
    296    shr         K, 4  ; K = number of full 16-coefficient groups
    297    jz          .ELOOP16  ; no full group: go straight to the tail
    298 .BLOOP16:
    299    LOAD16
    300    pcmpgtw     N0, X0  ; N = 0xFFFF where the coefficient is negative (N was 0)
    301    pcmpgtw     N1, X1
    302    paddw       X0, N0  ; together with the pxor below: X = abs(X)
    303    paddw       X1, N1
    304    pxor        X0, N0
    305    pxor        X1, N1
    306    psrlw       X0, AL  ; X = abs(X) >> Al
    307    psrlw       X1, AL
    308    pxor        N0, X0  ; N = X for non-negative inputs, ~X for negative inputs
    309    pxor        N1, X1
    310    movdqa      XMMWORD [VALUES + (0) * 2], X0
    311    movdqa      XMMWORD [VALUES + (8) * 2], X1
    312    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0  ; second array starts DCTSIZE2 words in
    313    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    314    add         VALUES, 16*2
    315    add         LUT, 16*SIZEOF_INT
    316    dec         K
    317    jnz         .BLOOP16
    318    test        LEN, 15
    319    je          .PADDING  ; Sl is a multiple of 16: no tail to process
    320 .ELOOP16:
    321    test        LEN, 8
    322    jz          .TRY7  ; fewer than 8 coefficients remain
    323    test        LEN, 7
    324    jz          .TRY8  ; exactly 8 coefficients remain
    325 
           ; 9..15 coefficients remain
    326    LOAD15
    327    pcmpgtw     N0, X0
    328    pcmpgtw     N1, X1
    329    paddw       X0, N0
    330    paddw       X1, N1
    331    pxor        X0, N0
    332    pxor        X1, N1
    333    psrlw       X0, AL
    334    psrlw       X1, AL
    335    pxor        N0, X0
    336    pxor        N1, X1
    337    movdqa      XMMWORD [VALUES + (0) * 2], X0
    338    movdqa      XMMWORD [VALUES + (8) * 2], X1
    339    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    340    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
    341    add         VALUES, 16*2
    342    jmp         .PADDING
    343 .TRY8:
    344    LOAD8
    345    pcmpgtw     N0, X0
    346    paddw       X0, N0
    347    pxor        X0, N0
    348    psrlw       X0, AL
    349    pxor        N0, X0
    350    movdqa      XMMWORD [VALUES + (0) * 2], X0
    351    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    352    add         VALUES, 8*2
    353    jmp         .PADDING
    354 .TRY7:
    355    LOAD7
    356    pcmpgtw     N0, X0
    357    paddw       X0, N0
    358    pxor        X0, N0
    359    psrlw       X0, AL
    360    pxor        N0, X0
    361    movdqa      XMMWORD [VALUES + (0) * 2], X0
    362    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
    363    add         VALUES, 8*2
    364 .PADDING:
    365    mov         K, LEN
    366    add         K, 7
    367    and         K, -8
    368    shr         K, 3  ; K = ceil(Sl / 8): 8-word groups written so far
    369    sub         K, DCTSIZE2/8  ; K = -(groups still to be zeroed)
    370    jz          .EPADDING
    371    align       16
    372 .ZEROLOOP:
    373    movdqa      XMMWORD [VALUES + 0], ZERO  ; only the first array is padded here
    374    add         VALUES, 8*2
    375    inc         K  ; negative counter runs up to zero
    376    jnz         .ZEROLOOP
    377 .EPADDING:
    378    sub         VALUES, DCTSIZE2*2  ; rewind VALUES to the start of the output buffer
    379 
    380    REDUCE0  ; reads 64 words at VALUES (assumes DCTSIZE2 == 64 -- TODO confirm)
    381 
    382    UNCOLLECT_ARGS 6
    383    movdqa      ZERO, XMMWORD [rsp]  ; restore caller's xmm9
    384    mov         rsp, rbp
    385    pop         rbp
    386    ret
    387 
    388 %undef ZERO
    389 %undef X0
    390 %undef X1
    391 %undef N0
    392 %undef N1
    393 %undef AL
    394 %undef K
    395 %undef LUT
    396 %undef T0
    397 %undef T0d
    398 %undef T1
    399 %undef T1d
    400 %undef BLOCK
    401 %undef VALUES
    402 %undef LEN
    403 %undef LENEND
    404 
    405 ;
    406 ; Prepare data for jsimd_encode_mcu_AC_refine().
    407 ;
    408 ; GLOBAL(int)
    409 ; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
    410 ;                                         const int *jpeg_natural_order_start,
    411 ;                                         int Sl, int Al, JCOEF *absvalues,
    412 ;                                         size_t *bits)
    413 ;
    414 ; r10 = const JCOEF *block
    415 ; r11 = const int *jpeg_natural_order_start
    416 ; r12 = int Sl
    417 ; r13 = int Al
    418 ; r14 = JCOEF *absvalues
    419 ; r15 = size_t *bits
    420 
    421 %define ZERO    xmm9  ; all-zero vector; original xmm9 is saved on the stack
    422 %define ONE     xmm5  ; every 16-bit lane holds 1
    423 %define X0      xmm0  ; first 8 coefficients of the current group
    424 %define X1      xmm1  ; second 8 coefficients of the current group
    425 %define N0      xmm2  ; sign mask for X0
    426 %define N1      xmm3  ; sign mask for X1
    427 %define AL      xmm4  ; right-shift count (Al)
    428 %define K       eax  ; loop counter
    429 %define KK      r9d  ; index of the first coefficient of the current group
    430 %define EOB     r8d  ; return value: index of the last coefficient whose shifted magnitude is 1
    431 %define SIGN    rdi  ; accumulated sign bitmap
    432 %define LUT     r11  ; cursor into jpeg_natural_order_start
    433 %define T0      rcx  ; scratch index / sign mask
    434 %define T0d     ecx
    435 %define T1      rdx  ; scratch index / "equals 1" mask
    436 %define T1d     edx
    437 %define BLOCK   r10  ; source coefficient block
    438 %define VALUES  r14  ; output cursor
    439 %define LEN     r12d  ; Sl
    440 %define LENEND  r13d  ; Sl & 7; reuses the Al register once Al is copied to AL
    441 
    442    align       32
    443    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)
    444 
    445 EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
           ; For each of the Sl coefficients selected through the LUT, stores
           ; |coef| >> Al at VALUES[i] and zero-pads the array to DCTSIZE2 words.
           ; Writes two 64-bit bitmaps through r15: at [r15] the non-zero bitmap
           ; produced by REDUCE0, and at [r15 + SIZEOF_MMWORD] the inverted sign
           ; bitmap (bit set <=> coefficient was not negative).
           ; Returns in eax the index of the last coefficient whose shifted
           ; magnitude equals 1, or 0 if there is none.
    446    ENDBR64
    447    push        rbp
    448    mov         rbp, rsp
    449    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    450    sub         rsp, SIZEOF_XMMWORD
    451    movdqa      XMMWORD [rsp], ZERO  ; save caller's xmm9; restored before returning
    452    COLLECT_ARGS 6  ; macro defined outside this file; presumably loads r10-r15 as listed above
    453 
    454    xor         SIGN, SIGN
    455    xor         EOB, EOB
    456    xor         KK, KK
    457    movd        AL, r13d  ; capture Al before r13d is reused as LENEND
    458    pxor        ZERO, ZERO
    459    pcmpeqw     ONE, ONE  ; all bits set ...
    460    psrlw       ONE, 15  ; ... then shifted so every word equals 1
    461    mov         K, LEN
    462    mov         LENEND, LEN
    463    and         K, -16
    464    and         LENEND, 7  ; LENEND = Sl mod 8
    465    shr         K, 4  ; K = number of full 16-coefficient groups
    466    jz          .ELOOPR16  ; no full group: go straight to the tail
    467 .BLOOPR16:
    468    LOAD16
    469    pcmpgtw     N0, X0  ; N = 0xFFFF where the coefficient is negative (N was 0)
    470    pcmpgtw     N1, X1
    471    paddw       X0, N0  ; together with the pxor below: X = abs(X)
    472    paddw       X1, N1
    473    pxor        X0, N0
    474    pxor        X1, N1
    475    psrlw       X0, AL  ; X = abs(X) >> Al
    476    psrlw       X1, AL
    477    movdqa      XMMWORD [VALUES + (0) * 2], X0
    478    movdqa      XMMWORD [VALUES + (8) * 2], X1
    479    pcmpeqw     X0, ONE  ; X = 0xFFFF where the shifted magnitude equals 1
    480    pcmpeqw     X1, ONE
    481    packsswb    N0, N1  ; narrow both word masks to 16 byte masks
    482    packsswb    X0, X1
    483    pmovmskb    T0d, N0                 ; T0d = 16 sign bits, one per coefficient
    484    pmovmskb    T1d, X0                 ; T1d = 16 "magnitude == 1" bits
    485    shr         SIGN, 16                ; make room for the next 16 sign bits
    486    shl         T0, 48  ; new sign bits enter at the top of SIGN
    487    or          SIGN, T0
    488    bsr         T1d, T1d                ; T1d = position of the highest set bit; ZF=1 if none
    489    jz          .CONTINUER16            ; no coefficient in this group has magnitude 1
    490    mov         EOB, KK
    491    add         EOB, T1d                ; EOB = k + idx;
    492 .CONTINUER16:
    493    add         VALUES, 16*2
    494    add         LUT, 16*SIZEOF_INT
    495    add         KK, 16
    496    dec         K
    497    jnz         .BLOOPR16
    498    test        LEN, 15
    499    je          .PADDINGR  ; Sl is a multiple of 16: no tail to process
    500 .ELOOPR16:
    501    test        LEN, 8
    502    jz          .TRYR7  ; fewer than 8 coefficients remain
    503    test        LEN, 7
    504    jz          .TRYR8  ; exactly 8 coefficients remain
    505 
           ; 9..15 coefficients remain
    506    LOAD15
    507    pcmpgtw     N0, X0
    508    pcmpgtw     N1, X1
    509    paddw       X0, N0
    510    paddw       X1, N1
    511    pxor        X0, N0
    512    pxor        X1, N1
    513    psrlw       X0, AL
    514    psrlw       X1, AL
    515    movdqa      XMMWORD [VALUES + (0) * 2], X0
    516    movdqa      XMMWORD [VALUES + (8) * 2], X1
    517    pcmpeqw     X0, ONE
    518    pcmpeqw     X1, ONE
    519    packsswb    N0, N1
    520    packsswb    X0, X1
    521    pmovmskb    T0d, N0                 ; T0d = 16 sign bits, one per coefficient
    522    pmovmskb    T1d, X0                 ; T1d = 16 "magnitude == 1" bits
    523    shr         SIGN, 16                ; make room for the next 16 sign bits
    524    shl         T0, 48
    525    or          SIGN, T0
    526    bsr         T1d, T1d                ; T1d = position of the highest set bit; ZF=1 if none
    527    jz          .CONTINUER15            ; no coefficient in this group has magnitude 1
    528    mov         EOB, KK
    529    add         EOB, T1d                ; EOB = k + idx;
    530 .CONTINUER15:
    531    add         VALUES, 16*2
    532    jmp         .PADDINGR
    533 .TRYR8:
    534    LOAD8
    535 
    536    pcmpgtw     N0, X0
    537    paddw       X0, N0
    538    pxor        X0, N0
    539    psrlw       X0, AL
    540    movdqa      XMMWORD [VALUES + (0) * 2], X0
    541    pcmpeqw     X0, ONE
    542    packsswb    N0, ZERO  ; upper 8 bytes come from ZERO, so only 8 mask bits can be set
    543    packsswb    X0, ZERO
    544    pmovmskb    T0d, N0                 ; T0d = 8 sign bits, one per coefficient
    545    pmovmskb    T1d, X0                 ; T1d = 8 "magnitude == 1" bits
    546    shr         SIGN, 8                 ; make room for the next 8 sign bits
    547    shl         T0, 56
    548    or          SIGN, T0
    549    bsr         T1d, T1d                ; T1d = position of the highest set bit; ZF=1 if none
    550    jz          .CONTINUER8             ; no coefficient in this group has magnitude 1
    551    mov         EOB, KK
    552    add         EOB, T1d                ; EOB = k + idx;
    553 .CONTINUER8:
    554    add         VALUES, 8*2
    555    jmp         .PADDINGR
    556 .TRYR7:
    557    LOAD7
    558 
    559    pcmpgtw     N0, X0
    560    paddw       X0, N0
    561    pxor        X0, N0
    562    psrlw       X0, AL
    563    movdqa      XMMWORD [VALUES + (0) * 2], X0
    564    pcmpeqw     X0, ONE
    565    packsswb    N0, ZERO  ; upper 8 bytes come from ZERO, so only 8 mask bits can be set
    566    packsswb    X0, ZERO
    567    pmovmskb    T0d, N0                 ; T0d = 8 sign bits, one per coefficient
    568    pmovmskb    T1d, X0                 ; T1d = 8 "magnitude == 1" bits
    569    shr         SIGN, 8                 ; make room for the next 8 sign bits
    570    shl         T0, 56
    571    or          SIGN, T0
    572    bsr         T1d, T1d                ; T1d = position of the highest set bit; ZF=1 if none
    573    jz          .CONTINUER7             ; no coefficient in this group has magnitude 1
    574    mov         EOB, KK
    575    add         EOB, T1d                ; EOB = k + idx;
    576 .CONTINUER7:
    577    add         VALUES, 8*2
    578 .PADDINGR:
    579    mov         K, LEN
    580    add         K, 7
    581    and         K, -8
    582    shr         K, 3  ; K = ceil(Sl / 8): 8-word groups written so far
    583    sub         K, DCTSIZE2/8  ; K = -(groups still to be zeroed)
    584    jz          .EPADDINGR
    585    align       16
    586 .ZEROLOOPR:
    587    movdqa      XMMWORD [VALUES + 0], ZERO
    588    shr         SIGN, 8  ; each padded group moves the sign bits 8 positions toward bit 0
    589    add         VALUES, 8*2
    590    inc         K  ; negative counter runs up to zero
    591    jnz         .ZEROLOOPR
    592 .EPADDINGR:
    593    not         SIGN  ; invert: a set bit now marks a coefficient that was not negative
    594    sub         VALUES, DCTSIZE2*2  ; rewind VALUES to the start of the output buffer
    595    mov         MMWORD [r15+SIZEOF_MMWORD], SIGN  ; stored before REDUCE0, which clobbers several registers
    596 
    597    REDUCE0  ; reads 64 words at VALUES (assumes DCTSIZE2 == 64 -- TODO confirm)
    598 
    599    mov         eax, EOB  ; set the return value only now: REDUCE0 overwrites rax
    600    UNCOLLECT_ARGS 6
    601    movdqa      ZERO, XMMWORD [rsp]  ; restore caller's xmm9
    602    mov         rsp, rbp
    603    pop         rbp
    604    ret
    605 
    606 %undef ZERO
    607 %undef ONE
    608 %undef X0
    609 %undef X1
    610 %undef N0
    611 %undef N1
    612 %undef AL
    613 %undef K
    614 %undef KK
    615 %undef EOB
    616 %undef SIGN
    617 %undef LUT
    618 %undef T0
    619 %undef T0d
    620 %undef T1
    621 %undef T1d
    622 %undef BLOCK
    623 %undef VALUES
    624 %undef LEN
    625 %undef LENEND
    626 
    627 ; For some reason, the OS X linker does not honor the request to align the
    628 ; segment unless we do this.
    629    align       32