tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jidctflt-sse2.asm (20576B)


      1 ;
      2 ; jidctflt-sse2.asm - floating-point IDCT (64-bit SSE & SSE2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2009, 2016, 2024, D. R. Commander.
      6 ; Copyright (C) 2018, Matthias Räncker.
      7 ; Copyright (C) 2023, Aliaksiej Kandracienka.
      8 ;
      9 ; Based on the x86 SIMD extension for IJG JPEG library
     10 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     11 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     12 ;
     13 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     14 ;
     15 ; This file contains a floating-point implementation of the inverse DCT
     16 ; (Discrete Cosine Transform). The following code is based directly on
     17 ; the IJG's original jidctflt.c; see jidctflt.c for more details.
     18 
     19 %include "jsimdext.inc"
     20 %include "jdct.inc"
     21 
     22 ; --------------------------------------------------------------------------
     23 
     24 %macro UNPCKLPS2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
     25    shufps      %1, %2, 0x44            ; imm 01_00_01_00b: low pair of %1, then low pair of %2
     26 %endmacro
     27 
     28 %macro UNPCKHPS2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
     29    shufps      %1, %2, 0xEE            ; imm 11_10_11_10b: high pair of %1, then high pair of %2
     30 %endmacro
     31 
     32 ; --------------------------------------------------------------------------
     33    SECTION     SEG_CONST
     34 
     35    ALIGNZ      32
     36    GLOBAL_DATA(jconst_idct_float_sse2)
     37 
     38 EXTN(jconst_idct_float_sse2):
     39 
     40 PD_1_414        times 4  dd  1.414213562373095048801689  ; sqrt(2); scales tmp12 and tmp11
     41 PD_1_847        times 4  dd  1.847759065022573512256366  ; multiplier for z5=(z10+z12)*1.847759065
     42 PD_1_082        times 4  dd  1.082392200292393968799446  ; multiplier for z12 in tmp10
     43 PD_M2_613       times 4  dd -2.613125929752753055713286  ; multiplier for z10 in tmp12 (negative)
     44 PD_RNDINT_MAGIC times 4  dd  100663296.0  ; (float)(0x00C00000 << 3); added in pass 2 for roundint(x/8)
     45 PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE  ; added with paddb to the packed output bytes
     46 
     47    ALIGNZ      32
     48 
     49 ; --------------------------------------------------------------------------
     50    SECTION     SEG_TEXT
     51    BITS        64
     52 ;
     53 ; Perform dequantization and inverse DCT on one block of coefficients.
     54 ;
     55 ; GLOBAL(void)
     56 ; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
     57 ;                       JSAMPARRAY output_buf, JDIMENSION output_col)
     58 ;
     59 
     60 ; r10 = void *dct_table
     61 ; r11 = JCOEFPTR coef_block
     62 ; r12 = JSAMPARRAY output_buf
     63 ; r13d = JDIMENSION output_col
     64 
     65 %define wk(i)         r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD
     66                                        ; xmmword wk[WK_NUM], ending at r15
     67 %define WK_NUM        2
     68 %define workspace     wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
     69                                        ; FAST_FLOAT workspace[DCTSIZE2], directly below wk[]
     70 
     71    align       32
     72    GLOBAL_FUNCTION(jsimd_idct_float_sse2)
     73 
     74 EXTN(jsimd_idct_float_sse2):
     75    ENDBR64
     76    push        rbp
     77    mov         rbp, rsp                ; rbp=frame base; the epilogue restores rsp from it
     78    push        r15                     ; saved at [rbp-8]; r15 becomes the wk/workspace base
     79    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
     80    ; Allocate stack space for wk array and workspace.  r15 is used to access them.
     81    mov         r15, rsp                ; wk[] ends at r15 (see wk(i) above)
     82    lea         rsp, [workspace]        ; rsp=&workspace[0], the lowest address of the locals
     83    COLLECT_ARGS 4                      ; args -> r10-r13 per the list above (macro defined elsewhere)
     84    push        rbx                     ; rbx is used as a row pointer in pass 2
     85 
     86    ; ---- Pass 1: process columns from input, store into work array.
     87 
     88    mov         rdx, r10                ; quantptr
     89    mov         rsi, r11                ; inptr
     90    lea         rdi, [workspace]        ; FAST_FLOAT *wsptr
     91    mov         rcx, DCTSIZE/4          ; ctr (4 columns per iteration)
     92 .columnloop:
     93 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
     94    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]  ; cheap pre-test on one dword each of
     95    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]  ; rows 1 and 2 (assuming DWBLOCK indexes like MMBLOCK)
     96    jnz         near .columnDCT         ; nonzero AC term found -> full IDCT
     97 
     98    movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]  ; load 4 coefs from each of rows 1-7
     99    movq        xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    100    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    101    movq        xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
    102    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    103    movq        xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
    104    movq        xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
    105    por         xmm1, xmm2              ; OR rows 1-7 together into xmm1
    106    por         xmm3, xmm4
    107    por         xmm5, xmm6
    108    por         xmm1, xmm3
    109    por         xmm5, xmm7
    110    por         xmm1, xmm5
    111    packsswb    xmm1, xmm1              ; words->bytes; signed saturation keeps nonzero words nonzero
    112    movd        eax, xmm1               ; eax=4 bytes, one per column
    113    test        rax, rax                ; movd zero-extended, so rax==eax
    114    jnz         short .columnDCT        ; some AC term is nonzero -> full IDCT
    115 
    116    ; -- AC terms all zero
    117 
    118    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
    119 
    120    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
    121    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
    122    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
    123 
    124    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]  ; dequantize row 0
    125 
    126    movaps      xmm1, xmm0              ; copies for the four broadcasts below
    127    movaps      xmm2, xmm0
    128    movaps      xmm3, xmm0
    129 
    130    shufps      xmm0, xmm0, 0x00        ; xmm0=(00 00 00 00)
    131    shufps      xmm1, xmm1, 0x55        ; xmm1=(01 01 01 01)
    132    shufps      xmm2, xmm2, 0xAA        ; xmm2=(02 02 02 02)
    133    shufps      xmm3, xmm3, 0xFF        ; xmm3=(03 03 03 03)
    134 
    135    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0  ; each column's 8 outputs all
    136    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0  ; equal its dequantized row-0 term
    137    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
    138    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
    139    movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
    140    movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
    141    movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
    142    movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
    143    jmp         near .nextcolumn
    144 %endif
    145 .columnDCT:
    146 
    147    ; -- Even part
    148 
    149    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
    150    movq        xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    151    movq        xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
    152    movq        xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
    153 
    154    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
    155    punpcklwd   xmm1, xmm1                  ; xmm1=(20 20 21 21 22 22 23 23)
    156    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
    157    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in2=(20 21 22 23)
    158    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
    159    cvtdq2ps    xmm1, xmm1                  ; xmm1=in2=(20 21 22 23)
    160 
    161    punpcklwd   xmm2, xmm2                  ; xmm2=(40 40 41 41 42 42 43 43)
    162    punpcklwd   xmm3, xmm3                  ; xmm3=(60 60 61 61 62 62 63 63)
    163    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in4=(40 41 42 43)
    164    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in6=(60 61 62 63)
    165    cvtdq2ps    xmm2, xmm2                  ; xmm2=in4=(40 41 42 43)
    166    cvtdq2ps    xmm3, xmm3                  ; xmm3=in6=(60 61 62 63)
    167 
    168    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]  ; dequantize rows 0/2/4/6
    169    mulps       xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    170    mulps       xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    171    mulps       xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    172 
    173    movaps      xmm4, xmm0
    174    movaps      xmm5, xmm1
    175    subps       xmm0, xmm2              ; xmm0=tmp11
    176    subps       xmm1, xmm3              ; xmm1=in2-in6
    177    addps       xmm4, xmm2              ; xmm4=tmp10
    178    addps       xmm5, xmm3              ; xmm5=tmp13
    179 
    180    mulps       xmm1, [rel PD_1_414]    ; xmm1=(in2-in6)*1.414213562
    181    subps       xmm1, xmm5              ; xmm1=tmp12
    182 
    183    movaps      xmm6, xmm4
    184    movaps      xmm7, xmm0
    185    subps       xmm4, xmm5              ; xmm4=tmp3
    186    subps       xmm0, xmm1              ; xmm0=tmp2
    187    addps       xmm6, xmm5              ; xmm6=tmp0
    188    addps       xmm7, xmm1              ; xmm7=tmp1
    189 
    190    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
    191    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
    192 
    193    ; -- Odd part
    194 
    195    movq        xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    196    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    197    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    198    movq        xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
    199 
    200    punpcklwd   xmm2, xmm2                  ; xmm2=(10 10 11 11 12 12 13 13)
    201    punpcklwd   xmm3, xmm3                  ; xmm3=(30 30 31 31 32 32 33 33)
    202    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in1=(10 11 12 13)
    203    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in3=(30 31 32 33)
    204    cvtdq2ps    xmm2, xmm2                  ; xmm2=in1=(10 11 12 13)
    205    cvtdq2ps    xmm3, xmm3                  ; xmm3=in3=(30 31 32 33)
    206 
    207    punpcklwd   xmm5, xmm5                  ; xmm5=(50 50 51 51 52 52 53 53)
    208    punpcklwd   xmm1, xmm1                  ; xmm1=(70 70 71 71 72 72 73 73)
    209    psrad       xmm5, (DWORD_BIT-WORD_BIT)  ; xmm5=in5=(50 51 52 53)
    210    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in7=(70 71 72 73)
    211    cvtdq2ps    xmm5, xmm5                  ; xmm5=in5=(50 51 52 53)
    212    cvtdq2ps    xmm1, xmm1                  ; xmm1=in7=(70 71 72 73)
    213 
    214    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]  ; dequantize rows 1/3/5/7
    215    mulps       xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    216    mulps       xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    217    mulps       xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    218 
    219    movaps      xmm4, xmm2
    220    movaps      xmm0, xmm5
    221    addps       xmm2, xmm1              ; xmm2=z11
    222    addps       xmm5, xmm3              ; xmm5=z13
    223    subps       xmm4, xmm1              ; xmm4=z12
    224    subps       xmm0, xmm3              ; xmm0=z10
    225 
    226    movaps      xmm1, xmm2
    227    subps       xmm2, xmm5              ; xmm2=z11-z13
    228    addps       xmm1, xmm5              ; xmm1=tmp7
    229 
    230    mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11
    231 
    232    movaps      xmm3, xmm0
    233    addps       xmm0, xmm4              ; xmm0=z10+z12
    234    mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
    235    mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
    236    mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
    237    addps       xmm3, xmm0              ; xmm3=tmp12
    238    subps       xmm4, xmm0              ; xmm4=tmp10
    239 
    240    ; -- Final output stage
    241 
    242    subps       xmm3, xmm1              ; xmm3=tmp6
    243    movaps      xmm5, xmm6
    244    movaps      xmm0, xmm7
    245    addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
    246    addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
    247    subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
    248    subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
    249    subps       xmm2, xmm3              ; xmm2=tmp5
    250 
    251    movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
    252    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
    253    unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
    254    movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
    255    unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
    256    unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)
    257 
    258    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
    259    movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
    260 
    261    movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
    262    movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
    263 
    264    addps       xmm4, xmm2              ; xmm4=tmp4
    265    movaps      xmm0, xmm7
    266    movaps      xmm3, xmm5
    267    addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
    268    addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
    269    subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
    270    subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)
    271 
    272    movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
    273    unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
    274    unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
    275    movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
    276    unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
    277    unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)
    278 
    279    movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
    280    UNPCKLPS2   xmm6, xmm7              ; xmm6=(00 10 20 30)
    281    UNPCKHPS2   xmm3, xmm7              ; xmm3=(01 11 21 31)
    282    movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
    283    UNPCKLPS2   xmm1, xmm2              ; xmm1=(02 12 22 32)
    284    UNPCKHPS2   xmm0, xmm2              ; xmm0=(03 13 23 33)
    285 
    286    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
    287    movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
    288 
    289    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6  ; store transposed: rows 0-3 of
    290    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3  ; input columns 0, 1, 2, 3
    291    movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
    292    movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
    293 
    294    movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
    295    UNPCKLPS2   xmm5, xmm7              ; xmm5=(40 50 60 70)
    296    UNPCKHPS2   xmm6, xmm7              ; xmm6=(41 51 61 71)
    297    movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
    298    UNPCKLPS2   xmm4, xmm2              ; xmm4=(42 52 62 72)
    299    UNPCKHPS2   xmm3, xmm2              ; xmm3=(43 53 63 73)
    300 
    301    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5  ; store transposed: rows 4-7 of
    302    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6  ; input columns 0, 1, 2, 3
    303    movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
    304    movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
    305 
    306 .nextcolumn:
    307    add         rsi, byte 4*SIZEOF_JCOEF               ; coef_block
    308    add         rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
    309    add         rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
    310    dec         rcx                                    ; ctr
    311    jnz         near .columnloop
    312 
    313    ; -- Prefetch the next coefficient block
    314 
    315    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]  ; rsi is 8 coefs past coef_block here if DCTSIZE=8,
    316    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]  ; so the base is coef_block + DCTSIZE2 coefs
    317    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
    318    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
    319 
    320    ; ---- Pass 2: process rows from work array, store into output array.
    321 
    322    lea         rsi, [workspace]        ; FAST_FLOAT *wsptr
    323    mov         rdi, r12                ; (JSAMPROW *)
    324    mov         eax, r13d               ; eax=output_col (zero-extended; rax indexes the rows below)
    325    mov         rcx, DCTSIZE/4          ; ctr (4 output rows per iteration)
    326 .rowloop:
    327 
    328    ; -- Even part
    329 
    330    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
    331    movaps      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
    332    movaps      xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
    333    movaps      xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
    334 
    335    movaps      xmm4, xmm0
    336    movaps      xmm5, xmm1
    337    subps       xmm0, xmm2              ; xmm0=tmp11
    338    subps       xmm1, xmm3              ; xmm1=(block 2)-(block 6)
    339    addps       xmm4, xmm2              ; xmm4=tmp10
    340    addps       xmm5, xmm3              ; xmm5=tmp13
    341 
    342    mulps       xmm1, [rel PD_1_414]    ; scale the difference by 1.414213562
    343    subps       xmm1, xmm5              ; xmm1=tmp12
    344 
    345    movaps      xmm6, xmm4
    346    movaps      xmm7, xmm0
    347    subps       xmm4, xmm5              ; xmm4=tmp3
    348    subps       xmm0, xmm1              ; xmm0=tmp2
    349    addps       xmm6, xmm5              ; xmm6=tmp0
    350    addps       xmm7, xmm1              ; xmm7=tmp1
    351 
    352    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
    353    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
    354 
    355    ; -- Odd part
    356 
    357    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
    358    movaps      xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
    359    movaps      xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
    360    movaps      xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
    361 
    362    movaps      xmm4, xmm2
    363    movaps      xmm0, xmm5
    364    addps       xmm2, xmm1              ; xmm2=z11
    365    addps       xmm5, xmm3              ; xmm5=z13
    366    subps       xmm4, xmm1              ; xmm4=z12
    367    subps       xmm0, xmm3              ; xmm0=z10
    368 
    369    movaps      xmm1, xmm2
    370    subps       xmm2, xmm5              ; xmm2=z11-z13
    371    addps       xmm1, xmm5              ; xmm1=tmp7
    372 
    373    mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11
    374 
    375    movaps      xmm3, xmm0
    376    addps       xmm0, xmm4              ; xmm0=z10+z12
    377    mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
    378    mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
    379    mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
    380    addps       xmm3, xmm0              ; xmm3=tmp12
    381    subps       xmm4, xmm0              ; xmm4=tmp10
    382 
    383    ; -- Final output stage
    384 
    385    subps       xmm3, xmm1              ; xmm3=tmp6
    386    movaps      xmm5, xmm6
    387    movaps      xmm0, xmm7
    388    addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
    389    addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
    390    subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
    391    subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
    392    subps       xmm2, xmm3              ; xmm2=tmp5
    393 
    394    movaps      xmm1, [rel PD_RNDINT_MAGIC]  ; xmm1=[rel PD_RNDINT_MAGIC]
    395    pcmpeqd     xmm3, xmm3              ; xmm3=all ones
    396    psrld       xmm3, WORD_BIT          ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
    397 
    398    addps       xmm6, xmm1              ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
    399    addps       xmm7, xmm1              ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
    400    addps       xmm0, xmm1              ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
    401    addps       xmm5, xmm1              ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
    402 
    403    pand        xmm6, xmm3              ; xmm6=(00 -- 10 -- 20 -- 30 --)
    404    pslld       xmm7, WORD_BIT          ; xmm7=(-- 01 -- 11 -- 21 -- 31)
    405    pand        xmm0, xmm3              ; xmm0=(06 -- 16 -- 26 -- 36 --)
    406    pslld       xmm5, WORD_BIT          ; xmm5=(-- 07 -- 17 -- 27 -- 37)
    407    por         xmm6, xmm7              ; xmm6=(00 01 10 11 20 21 30 31)
    408    por         xmm0, xmm5              ; xmm0=(06 07 16 17 26 27 36 37)
    409 
    410    movaps      xmm1,  XMMWORD [wk(0)]  ; xmm1=tmp2
    411    movaps      xmm3,  XMMWORD [wk(1)]  ; xmm3=tmp3
    412 
    413    addps       xmm4, xmm2              ; xmm4=tmp4
    414    movaps      xmm7, xmm1
    415    movaps      xmm5, xmm3
    416    addps       xmm1, xmm2              ; xmm1=data2=(02 12 22 32)
    417    addps       xmm3, xmm4              ; xmm3=data4=(04 14 24 34)
    418    subps       xmm7, xmm2              ; xmm7=data5=(05 15 25 35)
    419    subps       xmm5, xmm4              ; xmm5=data3=(03 13 23 33)
    420 
    421    movaps      xmm2, [rel PD_RNDINT_MAGIC]  ; xmm2=[rel PD_RNDINT_MAGIC]
    422    pcmpeqd     xmm4, xmm4              ; xmm4=all ones
    423    psrld       xmm4, WORD_BIT          ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
    424 
    425    addps       xmm3, xmm2              ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
    426    addps       xmm7, xmm2              ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
    427    addps       xmm1, xmm2              ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
    428    addps       xmm5, xmm2              ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
    429 
    430    pand        xmm3, xmm4              ; xmm3=(04 -- 14 -- 24 -- 34 --)
    431    pslld       xmm7, WORD_BIT          ; xmm7=(-- 05 -- 15 -- 25 -- 35)
    432    pand        xmm1, xmm4              ; xmm1=(02 -- 12 -- 22 -- 32 --)
    433    pslld       xmm5, WORD_BIT          ; xmm5=(-- 03 -- 13 -- 23 -- 33)
    434    por         xmm3, xmm7              ; xmm3=(04 05 14 15 24 25 34 35)
    435    por         xmm1, xmm5              ; xmm1=(02 03 12 13 22 23 32 33)
    436 
    437    movdqa      xmm2, [rel PB_CENTERJSAMP]  ; xmm2=[rel PB_CENTERJSAMP]
    438 
    439    packsswb    xmm6, xmm3        ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
    440    packsswb    xmm1, xmm0        ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
    441    paddb       xmm6, xmm2        ; add CENTERJSAMPLE to every packed byte
    442    paddb       xmm1, xmm2        ; add CENTERJSAMPLE to every packed byte
    443 
    444    movdqa      xmm4, xmm6        ; transpose coefficients(phase 2)
    445    punpcklwd   xmm6, xmm1        ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
    446    punpckhwd   xmm4, xmm1        ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
    447 
    448    movdqa      xmm7, xmm6        ; transpose coefficients(phase 3)
    449    punpckldq   xmm6, xmm4        ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
    450    punpckhdq   xmm7, xmm4        ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
    451 
    452    pshufd      xmm5, xmm6, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
    453    pshufd      xmm3, xmm7, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
    454 
    455    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; row pointers +0 and +2
    456    mov         rbxp, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]  ; (rdxp/rbxp presumably alias rdx/rbx)
    457    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6  ; low 8 bytes=(00..07), at output_col
    458    movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7  ; low 8 bytes=(20..27)
    459    mov         rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; row pointers +1 and +3
    460    mov         rbxp, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
    461    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5  ; low 8 bytes=(10..17)
    462    movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3  ; low 8 bytes=(30..37)
    463 
    464    add         rsi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
    465    add         rdi, byte 4*SIZEOF_JSAMPROW    ; next 4 output row pointers
    466    dec         rcx                            ; ctr
    467    jnz         near .rowloop
    468 
    469    pop         rbx                     ; undo the push that followed COLLECT_ARGS
    470    UNCOLLECT_ARGS 4                    ; counterpart of COLLECT_ARGS 4 (macro defined elsewhere)
    471    lea         rsp, [rbp-8]            ; drop locals and alignment padding; rsp -> saved r15
    472    pop         r15
    473    pop         rbp
    474    ret
    475 
    476 ; For some reason, the OS X linker does not honor the request to align the
    477 ; segment unless we do this.
    478    align       32