tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jidctflt-sse2.asm (21570B)


      1 ;
      2 ; jidctflt.asm - floating-point IDCT (SSE & SSE2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2016, 2024, D. R. Commander.
      6 ;
      7 ; Based on the x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     12 ;
     13 ; This file contains a floating-point implementation of the inverse DCT
     14 ; (Discrete Cosine Transform). The following code is based directly on
     15 ; the IJG's original jidctflt.c; see jidctflt.c for more details.
     16 
     17 %include "jsimdext.inc"
     18 %include "jdct.inc"
     19 
     20 ; --------------------------------------------------------------------------
     21 
     22 %macro UNPCKLPS2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
     23    shufps      %1, %2, 0x44            ; 0x44: low two floats of %1, then low two floats of %2
     24 %endmacro
     25 
     26 %macro UNPCKHPS2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
     27    shufps      %1, %2, 0xEE            ; 0xEE: high two floats of %1, then high two floats of %2
     28 %endmacro
     29 
     30 ; --------------------------------------------------------------------------
     31    SECTION     SEG_CONST
     32 
     33    ALIGNZ      32
     34    GLOBAL_DATA(jconst_idct_float_sse2)
     35 
     36 EXTN(jconst_idct_float_sse2):
     37 
     38 PD_1_414        times 4  dd  1.414213562373095048801689  ; sqrt(2), replicated in all 4 lanes
     39 PD_1_847        times 4  dd  1.847759065022573512256366  ; 2*cos(pi/8)
     40 PD_1_082        times 4  dd  1.082392200292393968799446  ; 2*(cos(pi/8) - cos(3*pi/8))
     41 PD_M2_613       times 4  dd -2.613125929752753055713286  ; -2*(cos(pi/8) + cos(3*pi/8))
     42 PD_RNDINT_MAGIC times 4  dd  100663296.0  ; (float)(0x00C00000 << 3) = 1.5 * 2^26; float spacing there is 8, so x + magic leaves roundint(x/8) in the low mantissa bits
     43 PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE  ; bias added to every packed output byte in pass 2
     44 
     45    ALIGNZ      32
     46 
     47 ; --------------------------------------------------------------------------
     48    SECTION     SEG_TEXT
     49    BITS        32
     50 ; jsimd_idct_float_sse2: 32-bit x86 entry point; arguments are read from the stack.
     51 ; Perform dequantization and inverse DCT on one block of coefficients.
     52 ; Pass 1 transforms columns into a float work array; pass 2 transforms rows into the output array.
     53 ; GLOBAL(void)
     54 ; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
     55 ;                       JSAMPARRAY output_buf, JDIMENSION output_col)
     56 ; ebx, esi, edi and ebp are saved and restored; eax, ecx, edx and xmm0-xmm7 are used as scratch.
     57 
     58 %define dct_table(b)   (b) + 8          ; void *dct_table (b = original ebp, i.e. esp after "push ebp")
     59 %define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
     60 %define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
     61 %define output_col(b)  (b) + 20         ; JDIMENSION output_col
     62 
     63 %define original_ebp   ebp + 0          ; dword at aligned ebp: the pointer stored there by the prologue
     64 %define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
     65                                        ; xmmword wk[WK_NUM], directly below aligned ebp
     66 %define WK_NUM         2                ; two xmmword spill slots (tmp2/tmp3 and transpose temporaries)
     67 %define workspace      wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
     68                                        ; FAST_FLOAT workspace[DCTSIZE2], directly below wk[]
     69 
     70    align       32
     71    GLOBAL_FUNCTION(jsimd_idct_float_sse2)
     72 
     73 EXTN(jsimd_idct_float_sse2):
     74    push        ebp
     75    mov         eax, esp                     ; eax = original ebp (esp after the push; [eax] = saved ebp)
     76    sub         esp, byte 4                  ; leave room for one dword before aligning down
     77    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
     78    mov         [esp], eax                   ; [original_ebp] = eax (reloaded for pass 2, popped in the epilogue)
     79    mov         ebp, esp                     ; ebp = aligned ebp
     80    lea         esp, [workspace]             ; reserve wk[] and workspace[] below the aligned ebp
     81    push        ebx
     82 ;   push        ecx                     ; need not be preserved
     83 ;   push        edx                     ; need not be preserved
     84    push        esi
     85    push        edi
     86 
     87    GET_GOT     ebx                     ; get GOT address
     88 
     89    ; ---- Pass 1: process columns from input, store into work array.
     90 
     91 ;   mov         eax, [original_ebp]     ; left disabled: eax is expected to still hold it from the prologue
     92    mov         edx, POINTER [dct_table(eax)]    ; quantptr
     93    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
     94    lea         edi, [workspace]                 ; FAST_FLOAT *wsptr
     95    mov         ecx, DCTSIZE/4                   ; ctr (4 columns per iteration)
     96    ALIGNX      16, 7
     97 .columnloop:
     98 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
     99    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]  ; cheap pre-check on two dwords (clobbers eax)
    100    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
    101    jnz         near .columnDCT                             ; nonzero -> full IDCT for these columns
    102 
    103    movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]  ; full test: load rows 1..7 of these 4 columns
    104    movq        xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    105    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    106    movq        xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
    107    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    108    movq        xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
    109    movq        xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
    110    por         xmm1, xmm2              ; OR all seven rows together into xmm1
    111    por         xmm3, xmm4
    112    por         xmm5, xmm6
    113    por         xmm1, xmm3
    114    por         xmm5, xmm7
    115    por         xmm1, xmm5
    116    packsswb    xmm1, xmm1              ; saturating pack: a nonzero word stays a nonzero byte
    117    movd        eax, xmm1               ; eax = the packed bytes of the 4 columns
    118    test        eax, eax
    119    jnz         short .columnDCT
    120 
    121    ; -- AC terms all zero
    122 
    123    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    124 
    125    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
    126    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03), sign-extended to dwords
    127    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03), converted to float
    128 
    129    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]  ; dequantize in0
    130 
    131    movaps      xmm1, xmm0              ; three more copies, one per column to broadcast
    132    movaps      xmm2, xmm0
    133    movaps      xmm3, xmm0
    134 
    135    shufps      xmm0, xmm0, 0x00        ; xmm0=(00 00 00 00)
    136    shufps      xmm1, xmm1, 0x55        ; xmm1=(01 01 01 01)
    137    shufps      xmm2, xmm2, 0xAA        ; xmm2=(02 02 02 02)
    138    shufps      xmm3, xmm3, 0xFF        ; xmm3=(03 03 03 03)
    139 
    140    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0  ; DC-only: each column's 8 outputs equal its in0
    141    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
    142    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
    143    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
    144    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
    145    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
    146    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
    147    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
    148    jmp         near .nextcolumn
    149    ALIGNX      16, 7
    150 %endif
    151 .columnDCT:
    152 
    153    ; -- Even part
    154 
    155    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    156    movq        xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    157    movq        xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
    158    movq        xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
    159 
    160    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
    161    punpcklwd   xmm1, xmm1                  ; xmm1=(20 20 21 21 22 22 23 23)
    162    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
    163    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in2=(20 21 22 23)
    164    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
    165    cvtdq2ps    xmm1, xmm1                  ; xmm1=in2=(20 21 22 23)
    166 
    167    punpcklwd   xmm2, xmm2                  ; xmm2=(40 40 41 41 42 42 43 43)
    168    punpcklwd   xmm3, xmm3                  ; xmm3=(60 60 61 61 62 62 63 63)
    169    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in4=(40 41 42 43)
    170    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in6=(60 61 62 63)
    171    cvtdq2ps    xmm2, xmm2                  ; xmm2=in4=(40 41 42 43)
    172    cvtdq2ps    xmm3, xmm3                  ; xmm3=in6=(60 61 62 63)
    173 
    174    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]  ; dequantize in0/in2/in4/in6
    175    mulps       xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    176    mulps       xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    177    mulps       xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    178 
    179    movaps      xmm4, xmm0
    180    movaps      xmm5, xmm1
    181    subps       xmm0, xmm2              ; xmm0=tmp11
    182    subps       xmm1, xmm3              ; xmm1=in2-in6
    183    addps       xmm4, xmm2              ; xmm4=tmp10
    184    addps       xmm5, xmm3              ; xmm5=tmp13
    185 
    186    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]  ; xmm1=(in2-in6)*1.414213562
    187    subps       xmm1, xmm5              ; xmm1=tmp12
    188 
    189    movaps      xmm6, xmm4
    190    movaps      xmm7, xmm0
    191    subps       xmm4, xmm5              ; xmm4=tmp3
    192    subps       xmm0, xmm1              ; xmm0=tmp2
    193    addps       xmm6, xmm5              ; xmm6=tmp0
    194    addps       xmm7, xmm1              ; xmm7=tmp1
    195 
    196    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
    197    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
    198 
    199    ; -- Odd part
    200 
    201    movq        xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    202    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    203    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    204    movq        xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
    205 
    206    punpcklwd   xmm2, xmm2                  ; xmm2=(10 10 11 11 12 12 13 13)
    207    punpcklwd   xmm3, xmm3                  ; xmm3=(30 30 31 31 32 32 33 33)
    208    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in1=(10 11 12 13)
    209    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in3=(30 31 32 33)
    210    cvtdq2ps    xmm2, xmm2                  ; xmm2=in1=(10 11 12 13)
    211    cvtdq2ps    xmm3, xmm3                  ; xmm3=in3=(30 31 32 33)
    212 
    213    punpcklwd   xmm5, xmm5                  ; xmm5=(50 50 51 51 52 52 53 53)
    214    punpcklwd   xmm1, xmm1                  ; xmm1=(70 70 71 71 72 72 73 73)
    215    psrad       xmm5, (DWORD_BIT-WORD_BIT)  ; xmm5=in5=(50 51 52 53)
    216    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in7=(70 71 72 73)
    217    cvtdq2ps    xmm5, xmm5                  ; xmm5=in5=(50 51 52 53)
    218    cvtdq2ps    xmm1, xmm1                  ; xmm1=in7=(70 71 72 73)
    219 
    220    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]  ; dequantize in1/in3/in5/in7
    221    mulps       xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    222    mulps       xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    223    mulps       xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    224 
    225    movaps      xmm4, xmm2
    226    movaps      xmm0, xmm5
    227    addps       xmm2, xmm1              ; xmm2=z11
    228    addps       xmm5, xmm3              ; xmm5=z13
    229    subps       xmm4, xmm1              ; xmm4=z12
    230    subps       xmm0, xmm3              ; xmm0=z10
    231 
    232    movaps      xmm1, xmm2
    233    subps       xmm2, xmm5              ; xmm2=z11-z13
    234    addps       xmm1, xmm5              ; xmm1=tmp7
    235 
    236    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11
    237 
    238    movaps      xmm3, xmm0
    239    addps       xmm0, xmm4              ; xmm0=z10+z12
    240    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
    241    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
    242    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
    243    addps       xmm3, xmm0                     ; xmm3=tmp12
    244    subps       xmm4, xmm0                     ; xmm4=tmp10
    245 
    246    ; -- Final output stage
    247 
    248    subps       xmm3, xmm1              ; xmm3=tmp6
    249    movaps      xmm5, xmm6
    250    movaps      xmm0, xmm7
    251    addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
    252    addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
    253    subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
    254    subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
    255    subps       xmm2, xmm3              ; xmm2=tmp5
    256 
    257    movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
    258    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
    259    unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
    260    movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
    261    unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
    262    unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)
    263 
    264    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
    265    movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
    266 
    267    movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
    268    movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
    269 
    270    addps       xmm4, xmm2              ; xmm4=tmp4
    271    movaps      xmm0, xmm7
    272    movaps      xmm3, xmm5
    273    addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
    274    addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
    275    subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
    276    subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)
    277 
    278    movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
    279    unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
    280    unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
    281    movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
    282    unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
    283    unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)
    284 
    285    movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
    286    UNPCKLPS2   xmm6, xmm7              ; xmm6=(00 10 20 30)
    287    UNPCKHPS2   xmm3, xmm7              ; xmm3=(01 11 21 31)
    288    movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
    289    UNPCKLPS2   xmm1, xmm2              ; xmm1=(02 12 22 32)
    290    UNPCKHPS2   xmm0, xmm2              ; xmm0=(03 13 23 33)
    291 
    292    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
    293    movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
    294 
    295    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6  ; store (00 10 20 30)
    296    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3  ; store (01 11 21 31)
    297    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1  ; store (02 12 22 32)
    298    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0  ; store (03 13 23 33)
    299 
    300    movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
    301    UNPCKLPS2   xmm5, xmm7              ; xmm5=(40 50 60 70)
    302    UNPCKHPS2   xmm6, xmm7              ; xmm6=(41 51 61 71)
    303    movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
    304    UNPCKLPS2   xmm4, xmm2              ; xmm4=(42 52 62 72)
    305    UNPCKHPS2   xmm3, xmm2              ; xmm3=(43 53 63 73)
    306 
    307    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5  ; store (40 50 60 70)
    308    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6  ; store (41 51 61 71)
    309    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4  ; store (42 52 62 72)
    310    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3  ; store (43 53 63 73)
    311 
    312 .nextcolumn:
    313    add         esi, byte 4*SIZEOF_JCOEF               ; coef_block
    314    add         edx, byte 4*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
    315    add         edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
    316    dec         ecx                                    ; ctr
    317    jnz         near .columnloop
    318 
    319    ; -- Prefetch the next coefficient block
    320 
    321    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
    322    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
    323    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
    324    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
    325 
    326    ; ---- Pass 2: process rows from work array, store into output array.
    327 
    328    mov         eax, [original_ebp]                ; reload: the zero-column test in pass 1 overwrites eax
    329    lea         esi, [workspace]                   ; FAST_FLOAT *wsptr
    330    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
    331    mov         eax, JDIMENSION [output_col(eax)]  ; eax = output_col for the rest of the function
    332    mov         ecx, DCTSIZE/4                     ; ctr (4 output rows per iteration)
    333    ALIGNX      16, 7
    334 .rowloop:
    335 
    336    ; -- Even part
    337 
    338    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
    339    movaps      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
    340    movaps      xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
    341    movaps      xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
    342 
    343    movaps      xmm4, xmm0
    344    movaps      xmm5, xmm1
    345    subps       xmm0, xmm2              ; xmm0=tmp11
    346    subps       xmm1, xmm3              ; xmm1=(block 2 - block 6)
    347    addps       xmm4, xmm2              ; xmm4=tmp10
    348    addps       xmm5, xmm3              ; xmm5=tmp13
    349 
    350    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]  ; xmm1=(block 2 - block 6)*1.414213562
    351    subps       xmm1, xmm5              ; xmm1=tmp12
    352 
    353    movaps      xmm6, xmm4
    354    movaps      xmm7, xmm0
    355    subps       xmm4, xmm5              ; xmm4=tmp3
    356    subps       xmm0, xmm1              ; xmm0=tmp2
    357    addps       xmm6, xmm5              ; xmm6=tmp0
    358    addps       xmm7, xmm1              ; xmm7=tmp1
    359 
    360    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
    361    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
    362 
    363    ; -- Odd part
    364 
    365    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
    366    movaps      xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
    367    movaps      xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
    368    movaps      xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
    369 
    370    movaps      xmm4, xmm2
    371    movaps      xmm0, xmm5
    372    addps       xmm2, xmm1              ; xmm2=z11
    373    addps       xmm5, xmm3              ; xmm5=z13
    374    subps       xmm4, xmm1              ; xmm4=z12
    375    subps       xmm0, xmm3              ; xmm0=z10
    376 
    377    movaps      xmm1, xmm2
    378    subps       xmm2, xmm5              ; xmm2=z11-z13
    379    addps       xmm1, xmm5              ; xmm1=tmp7
    380 
    381    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11
    382 
    383    movaps      xmm3, xmm0
    384    addps       xmm0, xmm4              ; xmm0=z10+z12
    385    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
    386    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
    387    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
    388    addps       xmm3, xmm0                     ; xmm3=tmp12
    389    subps       xmm4, xmm0                     ; xmm4=tmp10
    390 
    391    ; -- Final output stage
    392 
    393    subps       xmm3, xmm1              ; xmm3=tmp6
    394    movaps      xmm5, xmm6
    395    movaps      xmm0, xmm7
    396    addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
    397    addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
    398    subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
    399    subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
    400    subps       xmm2, xmm3              ; xmm2=tmp5
    401 
    402    movaps      xmm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; xmm1=[PD_RNDINT_MAGIC]
    403    pcmpeqd     xmm3, xmm3              ; xmm3=all ones
    404    psrld       xmm3, WORD_BIT          ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
    405 
    406    addps       xmm6, xmm1              ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
    407    addps       xmm7, xmm1              ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
    408    addps       xmm0, xmm1              ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
    409    addps       xmm5, xmm1              ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
    410 
    411    pand        xmm6, xmm3              ; xmm6=(00 -- 10 -- 20 -- 30 --)
    412    pslld       xmm7, WORD_BIT          ; xmm7=(-- 01 -- 11 -- 21 -- 31)
    413    pand        xmm0, xmm3              ; xmm0=(06 -- 16 -- 26 -- 36 --)
    414    pslld       xmm5, WORD_BIT          ; xmm5=(-- 07 -- 17 -- 27 -- 37)
    415    por         xmm6, xmm7              ; xmm6=(00 01 10 11 20 21 30 31)
    416    por         xmm0, xmm5              ; xmm0=(06 07 16 17 26 27 36 37)
    417 
    418    movaps      xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
    419    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3
    420 
    421    addps       xmm4, xmm2              ; xmm4=tmp4
    422    movaps      xmm7, xmm1
    423    movaps      xmm5, xmm3
    424    addps       xmm1, xmm2              ; xmm1=data2=(02 12 22 32)
    425    addps       xmm3, xmm4              ; xmm3=data4=(04 14 24 34)
    426    subps       xmm7, xmm2              ; xmm7=data5=(05 15 25 35)
    427    subps       xmm5, xmm4              ; xmm5=data3=(03 13 23 33)
    428 
    429    movaps      xmm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; xmm2=[PD_RNDINT_MAGIC]
    430    pcmpeqd     xmm4, xmm4              ; xmm4=all ones
    431    psrld       xmm4, WORD_BIT          ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
    432 
    433    addps       xmm3, xmm2              ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
    434    addps       xmm7, xmm2              ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
    435    addps       xmm1, xmm2              ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
    436    addps       xmm5, xmm2              ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
    437 
    438    pand        xmm3, xmm4              ; xmm3=(04 -- 14 -- 24 -- 34 --)
    439    pslld       xmm7, WORD_BIT          ; xmm7=(-- 05 -- 15 -- 25 -- 35)
    440    pand        xmm1, xmm4              ; xmm1=(02 -- 12 -- 22 -- 32 --)
    441    pslld       xmm5, WORD_BIT          ; xmm5=(-- 03 -- 13 -- 23 -- 33)
    442    por         xmm3, xmm7              ; xmm3=(04 05 14 15 24 25 34 35)
    443    por         xmm1, xmm5              ; xmm1=(02 03 12 13 22 23 32 33)
    444 
    445    movdqa      xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; xmm2=[PB_CENTERJSAMP]
    446 
    447    packsswb    xmm6, xmm3        ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
    448    packsswb    xmm1, xmm0        ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
    449    paddb       xmm6, xmm2        ; add CENTERJSAMPLE to every packed byte
    450    paddb       xmm1, xmm2
    451 
    452    movdqa      xmm4, xmm6        ; transpose coefficients(phase 2)
    453    punpcklwd   xmm6, xmm1        ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
    454    punpckhwd   xmm4, xmm1        ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
    455 
    456    movdqa      xmm7, xmm6        ; transpose coefficients(phase 3)
    457    punpckldq   xmm6, xmm4        ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
    458    punpckhdq   xmm7, xmm4        ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
    459 
    460    pshufd      xmm5, xmm6, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
    461    pshufd      xmm3, xmm7, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
    462 
    463    PUSHPIC     ebx                     ; save GOT address
    464 
    465    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; edx = row pointer 0 of this group
    466    mov         ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]  ; ebx = row pointer 2 (ebx reused; GOT address saved above)
    467    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6  ; row 0 <- (00 .. 07), low 8 bytes of xmm6
    468    movq        XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7  ; row 2 <- (20 .. 27)
    469    mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; edx = row pointer 1
    470    mov         ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]  ; ebx = row pointer 3
    471    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5  ; row 1 <- (10 .. 17)
    472    movq        XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3  ; row 3 <- (30 .. 37)
    473 
    474    POPPIC      ebx                     ; restore GOT address
    475 
    476    add         esi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
    477    add         edi, byte 4*SIZEOF_JSAMPROW    ; advance to the next 4 row pointers
    478    dec         ecx                            ; ctr
    479    jnz         near .rowloop
    480 
    481    pop         edi
    482    pop         esi
    483 ;   pop         edx                     ; need not be preserved
    484 ;   pop         ecx                     ; need not be preserved
    485    pop         ebx
    486    mov         esp, ebp                ; esp <- aligned ebp
    487    pop         esp                     ; esp <- original ebp
    488    pop         ebp
    489    ret
    490 
    491 ; For some reason, the OS X linker does not honor the request to align the
    492 ; segment unless we do this.
    493    align       32                      ; pad the end of this section to a 32-byte boundary