tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jidctflt-sse.asm (24713B)


      1 ;
      2 ; jidctflt-sse.asm - floating-point IDCT (SSE & MMX)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2016, 2024, D. R. Commander.
      6 ;
      7 ; Based on the x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     12 ;
     13 ; This file contains a floating-point implementation of the inverse DCT
     14 ; (Discrete Cosine Transform). The following code is based directly on
     15 ; the IJG's original jidctflt.c; see jidctflt.c for more details.
     16 
     17 %include "jsimdext.inc"
     18 %include "jdct.inc"
     19 
     20 ; --------------------------------------------------------------------------
     21 
     22 %macro UNPCKLPS2 2  ; in: %1=(0 1 2 3), %2=(4 5 6 7)   out: %1=(0 1 4 5)
           ; Low-half interleave of two 4-float vectors at 64-bit granularity.
           ; shufps selector fields (highest lane first): 01 00 | 01 00
           ;   result lanes 0,1 <- lanes 0,1 of %1
           ;   result lanes 2,3 <- lanes 0,1 of %2
     23    shufps      %1, %2, 01000100b
     24 %endmacro
     25 
     26 %macro UNPCKHPS2 2  ; in: %1=(0 1 2 3), %2=(4 5 6 7)   out: %1=(2 3 6 7)
           ; High-half interleave of two 4-float vectors at 64-bit granularity.
           ; shufps selector fields (highest lane first): 11 10 | 11 10
           ;   result lanes 0,1 <- lanes 2,3 of %1
           ;   result lanes 2,3 <- lanes 2,3 of %2
     27    shufps      %1, %2, 11101110b
     28 %endmacro
     29 
     30 ; --------------------------------------------------------------------------
     31    SECTION     SEG_CONST               ; constant table used by the IDCT code in this file
     32 
     33    ALIGNZ      32
     34    GLOBAL_DATA(jconst_idct_float_sse)
     35 
     36 EXTN(jconst_idct_float_sse):
     37 
           ; Each PD_* entry repeats one single-precision value four times
           ; (16 bytes), so it can be used directly as a packed memory operand
           ; of mulps/movaps.  The code addresses them as GOTOFF(ebx, <label>).
     38 PD_1_414       times 4 dd  1.414213562373095048801689  ; sqrt(2); scales the tmp12/tmp11 differences
     39 PD_1_847       times 4 dd  1.847759065022573512256366  ; z5 = (z10 + z12) * 1.847759065
     40 PD_1_082       times 4 dd  1.082392200292393968799446  ; multiplies z12 (tmp10 = z12 * 1.082392200 - z5)
     41 PD_M2_613      times 4 dd -2.613125929752753055713286  ; multiplies z10; sign is folded into the constant
     42 PD_0_125       times 4 dd  0.125        ; 1/8, the descale factor applied in pass 2
     43 PB_CENTERJSAMP times 8 db  CENTERJSAMPLE  ; 8 bytes = one MMX qword, added to packed samples with paddb
     44 
     45    ALIGNZ      32
     46 
     47 ; --------------------------------------------------------------------------
     48    SECTION     SEG_TEXT
     49    BITS        32
     50 ;
     51 ; Perform dequantization and inverse DCT on one block of coefficients.
     52 ;
     53 ; GLOBAL(void)
     54 ; jsimd_idct_float_sse(void *dct_table, JCOEFPTR coef_block,
     55 ;                      JSAMPARRAY output_buf, JDIMENSION output_col)
     56 ;
        ; Arguments are read from the stack relative to the "original ebp" value
        ; captured in the prologue (esp right after "push ebp"): the saved ebp is
        ; at +0, the return address at +4, and the first argument at +8.
        ;
        ; Stack frame relative to the XMMWORD-aligned ebp set up in the prologue,
        ; from high to low addresses:
        ;   [ebp + 0]     saved "original ebp" (the unaligned frame pointer)
        ;   wk(1), wk(0)  WK_NUM xmmword scratch slots directly below ebp
        ;   workspace     DCTSIZE2 FAST_FLOATs below wk(0); esp is moved here
        ;   below esp     pushed ebx, esi, edi
        ;
     57 
     58 %define dct_table(b)   (b) + 8          ; void *dct_table
     59 %define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
     60 %define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
     61 %define output_col(b)  (b) + 20         ; JDIMENSION output_col
     62 
     63 %define original_ebp   ebp + 0
     64 %define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
     65                                        ; xmmword wk[WK_NUM]
                                               ; (WK_NUM is defined just below;
                                               ;  %define bodies are expanded
                                               ;  where the macro is used)
     66 %define WK_NUM         2
     67 %define workspace      wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
     68                                        ; FAST_FLOAT workspace[DCTSIZE2]
     69 
     70    align       32
     71    GLOBAL_FUNCTION(jsimd_idct_float_sse)
     72 
     73 EXTN(jsimd_idct_float_sse):
     74    push        ebp
     75    mov         eax, esp                     ; eax = original ebp (address of the saved ebp; args at eax+8..)
     76    sub         esp, byte 4                  ; guarantee a slot below the saved ebp even if esp is already aligned
     77    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
     78    mov         [esp], eax                   ; [original_ebp] = unaligned frame pointer
     79    mov         ebp, esp                     ; ebp = aligned ebp
     80    lea         esp, [workspace]             ; reserve wk[] and workspace[] below ebp
     81    push        ebx
     82 ;   push        ecx                     ; need not be preserved
     83 ;   push        edx                     ; need not be preserved
     84    push        esi
     85    push        edi
     86 
     87    GET_GOT     ebx                     ; get GOT address (ebx is the base of every GOTOFF() access below)
     88 
     89    ; ---- Pass 1: process columns from input, store into work array.
           ; Each loop iteration handles a group of four adjacent columns.
     90 
     91 ;   mov         eax, [original_ebp]
           ; (Reload left disabled: eax still holds the value set in the prologue.
           ;  NOTE(review): this relies on GET_GOT leaving eax intact -- confirm
           ;  against the macro definition, which is not visible in this file.)
     92    mov         edx, POINTER [dct_table(eax)]    ; quantptr
     93    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
     94    lea         edi, [workspace]                 ; FAST_FLOAT *wsptr
     95    mov         ecx, DCTSIZE/4                   ; ctr (one count per group of 4 columns)
     96    ALIGNX      16, 7
     97 .columnloop:
     98 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
           ; Shortcut for column groups whose AC coefficients are all zero.
           ; Cheap early-out first: OR two dwords taken from rows 1 and 2
           ; (presumably the first two coefficients of each row -- DWBLOCK is
           ; defined elsewhere) and go straight to the full IDCT if any bit is set.
     99    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
    100    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
    101    jnz         near .columnDCT
    102 
           ; Full test: OR rows 1..7 of this column group into mm1.
    103    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    104    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    105    por         mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    106    por         mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
    107    por         mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    108    por         mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
    109    por         mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
    110    por         mm1, mm0
    111    packsswb    mm1, mm1                   ; saturating pack: a nonzero word stays a nonzero byte
    112    movd        eax, mm1                   ; so all four words fit in one 32-bit test
    113    test        eax, eax
    114    jnz         short .columnDCT
    115 
    116    ; -- AC terms all zero
           ; Only row 0 (DC) contributes: dequantize it and replicate each
           ; column's value across all eight positions of its workspace row.
    117 
    118    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    119 
           ; The ** fields are don't-care low words; the psrad below shifts the
           ; coefficient word down with sign extension and discards them.
    120    punpckhwd   mm1, mm0                   ; mm1=(** 02 ** 03)
    121    punpcklwd   mm0, mm0                   ; mm0=(00 00 01 01)
    122    psrad       mm1, (DWORD_BIT-WORD_BIT)  ; mm1=in0H=(02 03)
    123    psrad       mm0, (DWORD_BIT-WORD_BIT)  ; mm0=in0L=(00 01)
    124    cvtpi2ps    xmm3, mm1                  ; xmm3=(02 03 ** **)
    125    cvtpi2ps    xmm0, mm0                  ; xmm0=(00 01 ** **)
    126    movlhps     xmm0, xmm3                 ; xmm0=in0=(00 01 02 03)
    127 
    128    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]  ; dequantize
    129 
    130    movaps      xmm1, xmm0
    131    movaps      xmm2, xmm0
    132    movaps      xmm3, xmm0
    133 
           ; Broadcast one lane per register.
    134    shufps      xmm0, xmm0, 0x00        ; xmm0=(00 00 00 00)
    135    shufps      xmm1, xmm1, 0x55        ; xmm1=(01 01 01 01)
    136    shufps      xmm2, xmm2, 0xAA        ; xmm2=(02 02 02 02)
    137    shufps      xmm3, xmm3, 0xFF        ; xmm3=(03 03 03 03)
    138 
           ; Both halves (blocks 0 and 1) of each of the four workspace rows
           ; receive the same broadcast value.
    139    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
    140    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
    141    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
    142    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
    143    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
    144    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
    145    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
    146    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
    147    jmp         near .nextcolumn
    148    ALIGNX      16, 7
    149 %endif
    150 .columnDCT:
           ; Full path for one group of four columns: convert the 16-bit
           ; coefficients to float, dequantize them with the table at edx, run
           ; the 1-D IDCT with one column per XMM lane, then transpose the
           ; eight 4-lane results into four 8-float rows of the workspace at edi.
    151 
    152    ; -- Even part
    153 
    154    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    155    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    156    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
    157    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
    158 
           ; The ** fields are stale low words of the destination register; the
           ; psrad that follows shifts the coefficient word down with sign
           ; extension, so they never reach the converted value.
    159    punpckhwd   mm4, mm0                ; mm4=(** 02 ** 03)
    160    punpcklwd   mm0, mm0                ; mm0=(00 00 01 01)
    161    punpckhwd   mm5, mm1                ; mm5=(** 22 ** 23)
    162    punpcklwd   mm1, mm1                ; mm1=(20 20 21 21)
    163 
    164    psrad       mm4, (DWORD_BIT-WORD_BIT)  ; mm4=in0H=(02 03)
    165    psrad       mm0, (DWORD_BIT-WORD_BIT)  ; mm0=in0L=(00 01)
    166    cvtpi2ps    xmm4, mm4                  ; xmm4=(02 03 ** **)
    167    cvtpi2ps    xmm0, mm0                  ; xmm0=(00 01 ** **)
    168    psrad       mm5, (DWORD_BIT-WORD_BIT)  ; mm5=in2H=(22 23)
    169    psrad       mm1, (DWORD_BIT-WORD_BIT)  ; mm1=in2L=(20 21)
    170    cvtpi2ps    xmm5, mm5                  ; xmm5=(22 23 ** **)
    171    cvtpi2ps    xmm1, mm1                  ; xmm1=(20 21 ** **)
    172 
    173    punpckhwd   mm6, mm2                ; mm6=(** 42 ** 43)
    174    punpcklwd   mm2, mm2                ; mm2=(40 40 41 41)
    175    punpckhwd   mm7, mm3                ; mm7=(** 62 ** 63)
    176    punpcklwd   mm3, mm3                ; mm3=(60 60 61 61)
    177 
    178    psrad       mm6, (DWORD_BIT-WORD_BIT)  ; mm6=in4H=(42 43)
    179    psrad       mm2, (DWORD_BIT-WORD_BIT)  ; mm2=in4L=(40 41)
    180    cvtpi2ps    xmm6, mm6                  ; xmm6=(42 43 ** **)
    181    cvtpi2ps    xmm2, mm2                  ; xmm2=(40 41 ** **)
    182    psrad       mm7, (DWORD_BIT-WORD_BIT)  ; mm7=in6H=(62 63)
    183    psrad       mm3, (DWORD_BIT-WORD_BIT)  ; mm3=in6L=(60 61)
    184    cvtpi2ps    xmm7, mm7                  ; xmm7=(62 63 ** **)
    185    cvtpi2ps    xmm3, mm3                  ; xmm3=(60 61 ** **)
    186 
           ; Join low/high pairs and dequantize (multiply by the table rows).
    187    movlhps     xmm0, xmm4              ; xmm0=in0=(00 01 02 03)
    188    movlhps     xmm1, xmm5              ; xmm1=in2=(20 21 22 23)
    189    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    190    mulps       xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    191 
    192    movlhps     xmm2, xmm6              ; xmm2=in4=(40 41 42 43)
    193    movlhps     xmm3, xmm7              ; xmm3=in6=(60 61 62 63)
    194    mulps       xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    195    mulps       xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    196 
    197    movaps      xmm4, xmm0
    198    movaps      xmm5, xmm1
    199    subps       xmm0, xmm2              ; xmm0=tmp11
    200    subps       xmm1, xmm3              ; xmm1=in2-in6 (dequantized)
    201    addps       xmm4, xmm2              ; xmm4=tmp10
    202    addps       xmm5, xmm3              ; xmm5=tmp13
    203 
    204    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
    205    subps       xmm1, xmm5              ; xmm1=tmp12
    206 
    207    movaps      xmm6, xmm4
    208    movaps      xmm7, xmm0
    209    subps       xmm4, xmm5              ; xmm4=tmp3
    210    subps       xmm0, xmm1              ; xmm0=tmp2
    211    addps       xmm6, xmm5              ; xmm6=tmp0
    212    addps       xmm7, xmm1              ; xmm7=tmp1
    213 
           ; Spill tmp3/tmp2 so xmm4/xmm0 can be reused by the odd part;
           ; tmp0/tmp1 stay live in xmm6/xmm7.
    214    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
    215    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
    216 
    217    ; -- Odd part
    218 
    219    movq        mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    220    movq        mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    221    movq        mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    222    movq        mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
    223 
    224    punpckhwd   mm6, mm4                ; mm6=(** 12 ** 13)
    225    punpcklwd   mm4, mm4                ; mm4=(10 10 11 11)
    226    punpckhwd   mm2, mm0                ; mm2=(** 32 ** 33)
    227    punpcklwd   mm0, mm0                ; mm0=(30 30 31 31)
    228 
    229    psrad       mm6, (DWORD_BIT-WORD_BIT)  ; mm6=in1H=(12 13)
    230    psrad       mm4, (DWORD_BIT-WORD_BIT)  ; mm4=in1L=(10 11)
    231    cvtpi2ps    xmm4, mm6                  ; xmm4=(12 13 ** **)
    232    cvtpi2ps    xmm2, mm4                  ; xmm2=(10 11 ** **)
    233    psrad       mm2, (DWORD_BIT-WORD_BIT)  ; mm2=in3H=(32 33)
    234    psrad       mm0, (DWORD_BIT-WORD_BIT)  ; mm0=in3L=(30 31)
    235    cvtpi2ps    xmm0, mm2                  ; xmm0=(32 33 ** **)
    236    cvtpi2ps    xmm3, mm0                  ; xmm3=(30 31 ** **)
    237 
    238    punpckhwd   mm7, mm5                ; mm7=(** 52 ** 53)
    239    punpcklwd   mm5, mm5                ; mm5=(50 50 51 51)
    240    punpckhwd   mm3, mm1                ; mm3=(** 72 ** 73)
    241    punpcklwd   mm1, mm1                ; mm1=(70 70 71 71)
    242 
    243    movlhps     xmm2, xmm4              ; xmm2=in1=(10 11 12 13)
    244    movlhps     xmm3, xmm0              ; xmm3=in3=(30 31 32 33)
    245 
    246    psrad       mm7, (DWORD_BIT-WORD_BIT)  ; mm7=in5H=(52 53)
    247    psrad       mm5, (DWORD_BIT-WORD_BIT)  ; mm5=in5L=(50 51)
    248    cvtpi2ps    xmm4, mm7                  ; xmm4=(52 53 ** **)
    249    cvtpi2ps    xmm5, mm5                  ; xmm5=(50 51 ** **)
    250    psrad       mm3, (DWORD_BIT-WORD_BIT)  ; mm3=in7H=(72 73)
    251    psrad       mm1, (DWORD_BIT-WORD_BIT)  ; mm1=in7L=(70 71)
    252    cvtpi2ps    xmm0, mm3                  ; xmm0=(72 73 ** **)
    253    cvtpi2ps    xmm1, mm1                  ; xmm1=(70 71 ** **)
    254 
    255    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    256    mulps       xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    257 
    258    movlhps     xmm5, xmm4              ; xmm5=in5=(50 51 52 53)
    259    movlhps     xmm1, xmm0              ; xmm1=in7=(70 71 72 73)
    260    mulps       xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    261    mulps       xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    262 
    263    movaps      xmm4, xmm2
    264    movaps      xmm0, xmm5
    265    addps       xmm2, xmm1              ; xmm2=z11
    266    addps       xmm5, xmm3              ; xmm5=z13
    267    subps       xmm4, xmm1              ; xmm4=z12
    268    subps       xmm0, xmm3              ; xmm0=z10
    269 
    270    movaps      xmm1, xmm2
    271    subps       xmm2, xmm5              ; xmm2=z11-z13
    272    addps       xmm1, xmm5              ; xmm1=tmp7
    273 
    274    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11
    275 
    276    movaps      xmm3, xmm0
    277    addps       xmm0, xmm4              ; xmm0=z10+z12
    278    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
    279    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
    280    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
    281    addps       xmm3, xmm0                     ; xmm3=tmp12
    282    subps       xmm4, xmm0                     ; xmm4=tmp10
    283 
    284    ; -- Final output stage
    285 
    286    subps       xmm3, xmm1              ; xmm3=tmp6 (tmp12-tmp7)
    287    movaps      xmm5, xmm6
    288    movaps      xmm0, xmm7
    289    addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
    290    addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
    291    subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
    292    subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
    293    subps       xmm2, xmm3              ; xmm2=tmp5 (tmp11-tmp6)
    294 
    295    movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
    296    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
    297    unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
    298    movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
    299    unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
    300    unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)
    301 
    302    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
    303    movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
    304 
           ; The wk slots are free again; reuse them to park the half-transposed
           ; rows 6/7 until phase 2 needs them.
    305    movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
    306    movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
    307 
    308    addps       xmm4, xmm2              ; xmm4=tmp4 (tmp10+tmp5)
    309    movaps      xmm0, xmm7
    310    movaps      xmm3, xmm5
    311    addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
    312    addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
    313    subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
    314    subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)
    315 
    316    movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
    317    unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
    318    unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
    319    movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
    320    unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
    321    unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)
    322 
    323    movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
    324    UNPCKLPS2   xmm6, xmm7              ; xmm6=(00 10 20 30)
    325    UNPCKHPS2   xmm3, xmm7              ; xmm3=(01 11 21 31)
    326    movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
    327    UNPCKLPS2   xmm1, xmm2              ; xmm1=(02 12 22 32)
    328    UNPCKHPS2   xmm0, xmm2              ; xmm0=(03 13 23 33)
    329 
    330    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
    331    movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
    332 
           ; First half (input rows 0..3) of the four workspace rows.
    333    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
    334    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
    335    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
    336    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
    337 
    338    movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
    339    UNPCKLPS2   xmm5, xmm7              ; xmm5=(40 50 60 70)
    340    UNPCKHPS2   xmm6, xmm7              ; xmm6=(41 51 61 71)
    341    movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
    342    UNPCKLPS2   xmm4, xmm2              ; xmm4=(42 52 62 72)
    343    UNPCKHPS2   xmm3, xmm2              ; xmm3=(43 53 63 73)
    344 
           ; Second half (input rows 4..7) of the same four workspace rows.
    345    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
    346    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
    347    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
    348    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
    349 
    350 .nextcolumn:
           ; Step every pointer forward by one group of four columns and loop
           ; until the counter in ecx reaches zero.
    351    add         esi, byte 4*SIZEOF_JCOEF               ; coef_block: +4 coefficients
    352    add         edx, byte 4*SIZEOF_FLOAT_MULT_TYPE     ; quantptr: +4 multipliers
    353    add         edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr: +4 workspace rows (no "byte": larger immediate)
    354    dec         ecx                                    ; ctr
    355    jnz         near .columnloop
    356 
    357    ; -- Prefetch the next coefficient block
           ; esi has moved past the columns handled above, so the (DCTSIZE2-8)
           ; coefficient offset targets the data that follows this block in
           ; memory (exact if 8 coefficients were consumed, i.e. DCTSIZE = 8 --
           ; NOTE(review): DCTSIZE is defined elsewhere; confirm).  Four hints
           ; at 32-byte steps cover 128 bytes.
    358 
    359    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
    360    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
    361    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
    362    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
    363 
    364    ; ---- Pass 2: process rows from work array, store into output array.
           ; Each loop iteration produces four output rows (one per XMM lane):
           ; it runs the 1-D IDCT on workspace data, scales by 1/8, converts to
           ; saturated bytes, re-centers them and writes 8 samples per row.
    365 
    366    mov         eax, [original_ebp]                ; reload frame pointer (eax is scratch in the pass-1 zero test)
    367    lea         esi, [workspace]                   ; FAST_FLOAT *wsptr
    368    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
    369    mov         eax, JDIMENSION [output_col(eax)]  ; eax = output_col from here on
    370    mov         ecx, DCTSIZE/4                     ; ctr
    371    ALIGNX      16, 7
    372 .rowloop:
    373 
    374    ; -- Even part
    375 
    376    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
    377    movaps      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
    378    movaps      xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
    379    movaps      xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
    380 
    381    movaps      xmm4, xmm0
    382    movaps      xmm5, xmm1
    383    subps       xmm0, xmm2              ; xmm0=tmp11
    384    subps       xmm1, xmm3
    385    addps       xmm4, xmm2              ; xmm4=tmp10
    386    addps       xmm5, xmm3              ; xmm5=tmp13
    387 
    388    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
    389    subps       xmm1, xmm5              ; xmm1=tmp12
    390 
    391    movaps      xmm6, xmm4
    392    movaps      xmm7, xmm0
    393    subps       xmm4, xmm5              ; xmm4=tmp3
    394    subps       xmm0, xmm1              ; xmm0=tmp2
    395    addps       xmm6, xmm5              ; xmm6=tmp0
    396    addps       xmm7, xmm1              ; xmm7=tmp1
    397 
           ; Spill tmp3/tmp2 so xmm4/xmm0 can be reused by the odd part.
    398    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
    399    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
    400 
    401    ; -- Odd part
    402 
    403    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
    404    movaps      xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
    405    movaps      xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
    406    movaps      xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
    407 
    408    movaps      xmm4, xmm2
    409    movaps      xmm0, xmm5
    410    addps       xmm2, xmm1              ; xmm2=z11
    411    addps       xmm5, xmm3              ; xmm5=z13
    412    subps       xmm4, xmm1              ; xmm4=z12
    413    subps       xmm0, xmm3              ; xmm0=z10
    414 
    415    movaps      xmm1, xmm2
    416    subps       xmm2, xmm5
    417    addps       xmm1, xmm5              ; xmm1=tmp7
    418 
    419    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11
    420 
    421    movaps      xmm3, xmm0
    422    addps       xmm0, xmm4
    423    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
    424    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
    425    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
    426    addps       xmm3, xmm0                     ; xmm3=tmp12
    427    subps       xmm4, xmm0                     ; xmm4=tmp10
    428 
    429    ; -- Final output stage
    430 
    431    subps       xmm3, xmm1              ; xmm3=tmp6
    432    movaps      xmm5, xmm6
    433    movaps      xmm0, xmm7
    434    addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
    435    addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
    436    subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
    437    subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
    438    subps       xmm2, xmm3              ; xmm2=tmp5
    439 
    440    movaps      xmm1, [GOTOFF(ebx,PD_0_125)]  ; xmm1=[PD_0_125]
    441 
    442    mulps       xmm6, xmm1              ; descale(1/8)
    443    mulps       xmm7, xmm1              ; descale(1/8)
    444    mulps       xmm5, xmm1              ; descale(1/8)
    445    mulps       xmm0, xmm1              ; descale(1/8)
    446 
           ; cvtps2pi converts only the two low lanes, so the high pair is first
           ; copied down with movhlps; the conversion rounds according to the
           ; current MXCSR rounding mode.
    447    movhlps     xmm3, xmm6
    448    movhlps     xmm1, xmm7
    449    cvtps2pi    mm0, xmm6               ; round to int32, mm0=data0L=(00 10)
    450    cvtps2pi    mm1, xmm7               ; round to int32, mm1=data1L=(01 11)
    451    cvtps2pi    mm2, xmm3               ; round to int32, mm2=data0H=(20 30)
    452    cvtps2pi    mm3, xmm1               ; round to int32, mm3=data1H=(21 31)
    453    packssdw    mm0, mm2                ; mm0=data0=(00 10 20 30)
    454    packssdw    mm1, mm3                ; mm1=data1=(01 11 21 31)
    455 
    456    movhlps     xmm6, xmm5
    457    movhlps     xmm7, xmm0
    458    cvtps2pi    mm4, xmm5               ; round to int32, mm4=data7L=(07 17)
    459    cvtps2pi    mm5, xmm0               ; round to int32, mm5=data6L=(06 16)
    460    cvtps2pi    mm6, xmm6               ; round to int32, mm6=data7H=(27 37)
    461    cvtps2pi    mm7, xmm7               ; round to int32, mm7=data6H=(26 36)
    462    packssdw    mm4, mm6                ; mm4=data7=(07 17 27 37)
    463    packssdw    mm5, mm7                ; mm5=data6=(06 16 26 36)
    464 
    465    packsswb    mm0, mm5                ; mm0=(00 10 20 30 06 16 26 36)
    466    packsswb    mm1, mm4                ; mm1=(01 11 21 31 07 17 27 37)
    467 
    468    movaps      xmm3, XMMWORD [wk(0)]   ; xmm3=tmp2
    469    movaps      xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3
    470 
    471    movaps      xmm6, [GOTOFF(ebx,PD_0_125)]  ; xmm6=[PD_0_125]
    472 
    473    addps       xmm4, xmm2              ; xmm4=tmp4 (xmm4 still holds tmp10 from the odd part)
    474    movaps      xmm5, xmm3
    475    movaps      xmm0, xmm1
    476    addps       xmm3, xmm2              ; xmm3=data2=(02 12 22 32)
    477    addps       xmm1, xmm4              ; xmm1=data4=(04 14 24 34)
    478    subps       xmm5, xmm2              ; xmm5=data5=(05 15 25 35)
    479    subps       xmm0, xmm4              ; xmm0=data3=(03 13 23 33)
    480 
    481    mulps       xmm3, xmm6              ; descale(1/8)
    482    mulps       xmm1, xmm6              ; descale(1/8)
    483    mulps       xmm5, xmm6              ; descale(1/8)
    484    mulps       xmm0, xmm6              ; descale(1/8)
    485 
    486    movhlps     xmm7, xmm3
    487    movhlps     xmm2, xmm1
    488    cvtps2pi    mm2, xmm3               ; round to int32, mm2=data2L=(02 12)
    489    cvtps2pi    mm3, xmm1               ; round to int32, mm3=data4L=(04 14)
    490    cvtps2pi    mm6, xmm7               ; round to int32, mm6=data2H=(22 32)
    491    cvtps2pi    mm7, xmm2               ; round to int32, mm7=data4H=(24 34)
    492    packssdw    mm2, mm6                ; mm2=data2=(02 12 22 32)
    493    packssdw    mm3, mm7                ; mm3=data4=(04 14 24 34)
    494 
    495    movhlps     xmm4, xmm5
    496    movhlps     xmm6, xmm0
    497    cvtps2pi    mm5, xmm5               ; round to int32, mm5=data5L=(05 15)
    498    cvtps2pi    mm4, xmm0               ; round to int32, mm4=data3L=(03 13)
    499    cvtps2pi    mm6, xmm4               ; round to int32, mm6=data5H=(25 35)
    500    cvtps2pi    mm7, xmm6               ; round to int32, mm7=data3H=(23 33)
    501    packssdw    mm5, mm6                ; mm5=data5=(05 15 25 35)
    502    packssdw    mm4, mm7                ; mm4=data3=(03 13 23 33)
    503 
    504    movq        mm6, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; mm6=[PB_CENTERJSAMP]
    505 
    506    packsswb    mm2, mm3                ; mm2=(02 12 22 32 04 14 24 34)
    507    packsswb    mm4, mm5                ; mm4=(03 13 23 33 05 15 25 35)
    508 
           ; The packs above saturate to signed bytes; adding CENTERJSAMPLE with
           ; byte wraparound re-centers them (presumably onto the unsigned
           ; sample range -- CENTERJSAMPLE is defined elsewhere; confirm).
    509    paddb       mm0, mm6
    510    paddb       mm1, mm6
    511    paddb       mm2, mm6
    512    paddb       mm4, mm6
    513 
    514    movq        mm7, mm0                ; transpose coefficients(phase 1)
    515    punpcklbw   mm0, mm1                ; mm0=(00 01 10 11 20 21 30 31)
    516    punpckhbw   mm7, mm1                ; mm7=(06 07 16 17 26 27 36 37)
    517    movq        mm3, mm2                ; transpose coefficients(phase 1)
    518    punpcklbw   mm2, mm4                ; mm2=(02 03 12 13 22 23 32 33)
    519    punpckhbw   mm3, mm4                ; mm3=(04 05 14 15 24 25 34 35)
    520 
    521    movq        mm5, mm0                ; transpose coefficients(phase 2)
    522    punpcklwd   mm0, mm2                ; mm0=(00 01 02 03 10 11 12 13)
    523    punpckhwd   mm5, mm2                ; mm5=(20 21 22 23 30 31 32 33)
    524    movq        mm6, mm3                ; transpose coefficients(phase 2)
    525    punpcklwd   mm3, mm7                ; mm3=(04 05 06 07 14 15 16 17)
    526    punpckhwd   mm6, mm7                ; mm6=(24 25 26 27 34 35 36 37)
    527 
    528    movq        mm1, mm0                ; transpose coefficients(phase 3)
    529    punpckldq   mm0, mm3                ; mm0=(00 01 02 03 04 05 06 07)
    530    punpckhdq   mm1, mm3                ; mm1=(10 11 12 13 14 15 16 17)
    531    movq        mm4, mm5                ; transpose coefficients(phase 3)
    532    punpckldq   mm5, mm6                ; mm5=(20 21 22 23 24 25 26 27)
    533    punpckhdq   mm4, mm6                ; mm4=(30 31 32 33 34 35 36 37)
    534 
           ; ebx is borrowed as a second row pointer for the stores below, so
           ; the GOT address it carries is saved and restored around them.
    535    PUSHPIC     ebx                     ; save GOT address
    536 
           ; Store 8 samples to each of four rows at row_pointer + output_col.
    537    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
    538    mov         ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
    539    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
    540    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
    541    mov         edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
    542    mov         ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
    543    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
    544    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
    545 
    546    POPPIC      ebx                     ; restore GOT address
    547 
    548    add         esi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr: next 4 lanes of every workspace row
    549    add         edi, byte 4*SIZEOF_JSAMPROW    ; next 4 output row pointers
    550    dec         ecx                            ; ctr
    551    jnz         near .rowloop
    552 
    553    emms                                ; empty MMX state (MMX registers were used above)
    554 
           ; Restore registers in the reverse order of the prologue pushes,
           ; then undo the stack realignment.
    555    pop         edi
    556    pop         esi
    557 ;   pop         edx                     ; need not be preserved
    558 ;   pop         ecx                     ; need not be preserved
    559    pop         ebx
    560    mov         esp, ebp                ; esp <- aligned ebp
    561    pop         esp                     ; esp <- original ebp (value the prologue stored at [ebp+0])
    562    pop         ebp                     ; restore the caller's ebp saved by "push ebp"
    563    ret
    564 
    565 ; For some reason, the OS X linker does not honor the request to align the
    566 ; segment unless we do this.
    567    align       32