tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jfdctint-mmx.asm (24349B)


      1 ;
      2 ; jfdctint-mmx.asm - accurate integer FDCT (MMX)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2016, 2020, 2024, D. R. Commander.
      6 ;
      7 ; Based on the x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     12 ;
     13 ; This file contains a slower but more accurate integer implementation of the
     14 ; forward DCT (Discrete Cosine Transform). The following code is based
     15 ; directly on the IJG's original jfdctint.c; see jfdctint.c for
     16 ; more details.
     17 
     18 %include "jsimdext.inc"
     19 %include "jdct.inc"
     20 
     21 ; --------------------------------------------------------------------------
     22 ; Fixed-point scaling used by the transform in this file.
        ;   CONST_BITS  fraction bits carried by the F_* multipliers:
        ;               F_x_xxx = FIX(x.xxx) = x.xxx * 2^CONST_BITS, rounded.
        ;   PASS1_BITS  extra fraction bits that pass 1 leaves in its output and
        ;               pass 2 removes again.
        ;   DESCALE_P1 / DESCALE_P2  right-shift counts applied to the 32-bit
        ;               products in pass 1 / pass 2.
     23 %define CONST_BITS  13
     24 %define PASS1_BITS  2
     25 
     26 %define DESCALE_P1  (CONST_BITS - PASS1_BITS)
     27 %define DESCALE_P2  (CONST_BITS + PASS1_BITS)
     28 
     29 %if CONST_BITS == 13
     30 F_0_298 equ  2446  ; FIX(0.298631336)
     31 F_0_390 equ  3196  ; FIX(0.390180644)
     32 F_0_541 equ  4433  ; FIX(0.541196100)
     33 F_0_765 equ  6270  ; FIX(0.765366865)
     34 F_0_899 equ  7373  ; FIX(0.899976223)
     35 F_1_175 equ  9633  ; FIX(1.175875602)
     36 F_1_501 equ 12299  ; FIX(1.501321110)
     37 F_1_847 equ 15137  ; FIX(1.847759065)
     38 F_1_961 equ 16069  ; FIX(1.961570560)
     39 F_2_053 equ 16819  ; FIX(2.053119869)
     40 F_2_562 equ 20995  ; FIX(2.562915447)
     41 F_3_072 equ 25172  ; FIX(3.072711026)
     42 %else
     43 ; NASM cannot do compile-time arithmetic on floating-point constants.
        ; Each constant is therefore written as an integer pre-scaled by 2^30 and
        ; reduced to CONST_BITS fraction bits by DESCALE, a right shift by n that
        ; first adds half of the discarded range so the result is rounded.
     44 %define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
     45 F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
     46 F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
     47 F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
     48 F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
     49 F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
     50 F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
     51 F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
     52 F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
     53 F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
     54 F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
     55 F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
     56 F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
     57 %endif
     58 
     59 ; --------------------------------------------------------------------------
     60    SECTION     SEG_CONST
     61 ; Constant table read by jsimd_fdct_islow_mmx through GOTOFF(ebx, ...).
        ;   PW_*  one mmword holding a pair of 16-bit multipliers (c0, c1) twice.
        ;         pmaddwd against words interleaved as (a0, b0, a1, b1) yields the
        ;         two dwords a0*c0 + b0*c1 and a1*c0 + b1*c1.
        ;   PD_DESCALE_Pn   per-dword rounding term added before psrad by DESCALE_Pn.
        ;   PW_DESCALE_P2X  per-word rounding term added before psraw by PASS1_BITS.
        ; The trailing (c0, c1) notes give the nominal multipliers before FIX().
     62    ALIGNZ      32
     63    GLOBAL_DATA(jconst_fdct_islow_mmx)
     64 
     65 EXTN(jconst_fdct_islow_mmx):
     66 
     67 PW_F130_F054   times 2 dw  (F_0_541 + F_0_765),  F_0_541     ; ( 1.306563,  0.541196) -> data2
     68 PW_F054_MF130  times 2 dw  F_0_541, (F_0_541 - F_1_847)      ; ( 0.541196, -1.306563) -> data6
     69 PW_MF078_F117  times 2 dw  (F_1_175 - F_1_961),  F_1_175     ; (-0.785695,  1.175876) -> z3
     70 PW_F117_F078   times 2 dw  F_1_175, (F_1_175 - F_0_390)      ; ( 1.175876,  0.785695) -> z4
     71 PW_MF060_MF089 times 2 dw  (F_0_298 - F_0_899), -F_0_899     ; (-0.601345, -0.899976) -> tmp4
     72 PW_MF089_F060  times 2 dw -F_0_899, (F_1_501 - F_0_899)      ; (-0.899976,  0.601345) -> tmp7
     73 PW_MF050_MF256 times 2 dw  (F_2_053 - F_2_562), -F_2_562     ; (-0.509796, -2.562915) -> tmp5
     74 PW_MF256_F050  times 2 dw -F_2_562, (F_3_072 - F_2_562)      ; (-2.562915,  0.509796) -> tmp6
     75 PD_DESCALE_P1  times 2 dd  1 << (DESCALE_P1 - 1)             ; half of 2^DESCALE_P1
     76 PD_DESCALE_P2  times 2 dd  1 << (DESCALE_P2 - 1)             ; half of 2^DESCALE_P2
     77 PW_DESCALE_P2X times 4 dw  1 << (PASS1_BITS - 1)             ; half of 2^PASS1_BITS
     78 
     79    ALIGNZ      32
     80 
     81 ; --------------------------------------------------------------------------
     82    SECTION     SEG_TEXT
     83    BITS        32
     84 ;
     85 ; Perform the forward DCT on one block of samples.
     86 ;
     87 ; GLOBAL(void)
     88 ; jsimd_fdct_islow_mmx(DCTELEM *data)
     89 ;
        ; The block is transformed in place, in two passes over the same memory:
        ;   Pass 1 runs the 1-D transform along the rows, four rows per loop
        ;          iteration, and leaves its results scaled up by 2^PASS1_BITS.
        ;   Pass 2 runs it along the columns, four columns per loop iteration,
        ;          and removes that extra scaling.
        ; Pass 1 writes each 4x4 group of results back transposed (see the register
        ; maps at the top of each loop); the transposes at the start of pass 2 put
        ; the data back in natural order.
        ;
        ; In:   data is read from the stack at [eax + 8], where eax = esp right
        ;       after `push ebp` (32-bit code, argument passed on the stack).
        ; Uses: eax, ecx, edx, mm0-mm7, and WK_NUM mmwords of aligned stack scratch.
        ;       ebx is handled through PUSHPIC / GET_GOT / POPPIC, which are macros
        ;       from the included headers -- presumably a save/restore pair around
        ;       the GOT base; confirm in jsimdext.inc.
        ; Out:  no return value; emms is executed before returning.
        ;
     90 
     91 %define data(b)       (b) + 8           ; DCTELEM *data
     92 ; Frame layout after the prologue: [ebp + 0] holds the address of the saved
        ; ebp, and wk(0)..wk(WK_NUM-1) are the mmword scratch slots just below ebp.
     93 %define original_ebp  ebp + 0
     94 %define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
     95 %define WK_NUM        2
     96 
     97    align       32
     98    GLOBAL_FUNCTION(jsimd_fdct_islow_mmx)
     99 
    100 EXTN(jsimd_fdct_islow_mmx):
    101    push        ebp
    102    mov         eax, esp                    ; eax = address of the saved ebp (args start at eax + 8)
    103    sub         esp, byte 4                 ; make sure one dword fits below it after aligning
    104    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
    105    mov         [esp], eax                  ; remember that address at the aligned frame base
    106    mov         ebp, esp                    ; ebp = aligned ebp
    107    lea         esp, [wk(0)]                ; reserve wk(0)..wk(WK_NUM-1) below ebp
    108    PUSHPIC     ebx
    109 ;   push        ecx                     ; need not be preserved
    110 ;   push        edx                     ; need not be preserved
    111 ;   push        esi                     ; unused
    112 ;   push        edi                     ; unused
    113 
    114    GET_GOT     ebx                     ; get GOT address
    115 
    116    ; ---- Pass 1: process rows.
    117    ; Four rows per iteration: the 4x4 word transposes below gather element N
           ; of each row into dataN, so every SIMD lane carries one row.
    118    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
    119    mov         ecx, DCTSIZE/4            ; one iteration per group of 4 rows
    120    ALIGNX      16, 7
    121 .rowloop:
    122 
    123    movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
    124    movq        mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
    125    movq        mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
    126    movq        mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
    127 
    128    ; mm0=(20 21 22 23), mm2=(24 25 26 27)
    129    ; mm1=(30 31 32 33), mm3=(34 35 36 37)
    130 
    131    movq        mm4, mm0                ; transpose coefficients(phase 1)
    132    punpcklwd   mm0, mm1                ; mm0=(20 30 21 31)
    133    punpckhwd   mm4, mm1                ; mm4=(22 32 23 33)
    134    movq        mm5, mm2                ; transpose coefficients(phase 1)
    135    punpcklwd   mm2, mm3                ; mm2=(24 34 25 35)
    136    punpckhwd   mm5, mm3                ; mm5=(26 36 27 37)
    137 
    138    movq        mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
    139    movq        mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
    140    movq        mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
    141    movq        mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
    142 
    143    ; mm6=(00 01 02 03), mm1=(04 05 06 07)
    144    ; mm7=(10 11 12 13), mm3=(14 15 16 17)
    145 
    146    movq        MMWORD [wk(0)], mm4     ; wk(0)=(22 32 23 33)
    147    movq        MMWORD [wk(1)], mm2     ; wk(1)=(24 34 25 35)
    148 
    149    movq        mm4, mm6                ; transpose coefficients(phase 1)
    150    punpcklwd   mm6, mm7                ; mm6=(00 10 01 11)
    151    punpckhwd   mm4, mm7                ; mm4=(02 12 03 13)
    152    movq        mm2, mm1                ; transpose coefficients(phase 1)
    153    punpcklwd   mm1, mm3                ; mm1=(04 14 05 15)
    154    punpckhwd   mm2, mm3                ; mm2=(06 16 07 17)
    155 
    156    movq        mm7, mm6                ; transpose coefficients(phase 2)
    157    punpckldq   mm6, mm0                ; mm6=(00 10 20 30)=data0
    158    punpckhdq   mm7, mm0                ; mm7=(01 11 21 31)=data1
    159    movq        mm3, mm2                ; transpose coefficients(phase 2)
    160    punpckldq   mm2, mm5                ; mm2=(06 16 26 36)=data6
    161    punpckhdq   mm3, mm5                ; mm3=(07 17 27 37)=data7
    162 
    163    movq        mm0, mm7
    164    movq        mm5, mm6
    165    psubw       mm7, mm2                ; mm7=data1-data6=tmp6
    166    psubw       mm6, mm3                ; mm6=data0-data7=tmp7
    167    paddw       mm0, mm2                ; mm0=data1+data6=tmp1
    168    paddw       mm5, mm3                ; mm5=data0+data7=tmp0
    169 
    170    movq        mm2, MMWORD [wk(0)]     ; mm2=(22 32 23 33)
    171    movq        mm3, MMWORD [wk(1)]     ; mm3=(24 34 25 35)
    172    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp6
    173    movq        MMWORD [wk(1)], mm6     ; wk(1)=tmp7
    174 
    175    movq        mm7, mm4                ; transpose coefficients(phase 2)
    176    punpckldq   mm4, mm2                ; mm4=(02 12 22 32)=data2
    177    punpckhdq   mm7, mm2                ; mm7=(03 13 23 33)=data3
    178    movq        mm6, mm1                ; transpose coefficients(phase 2)
    179    punpckldq   mm1, mm3                ; mm1=(04 14 24 34)=data4
    180    punpckhdq   mm6, mm3                ; mm6=(05 15 25 35)=data5
    181 
    182    movq        mm2, mm7
    183    movq        mm3, mm4
    184    paddw       mm7, mm1                ; mm7=data3+data4=tmp3
    185    paddw       mm4, mm6                ; mm4=data2+data5=tmp2
    186    psubw       mm2, mm1                ; mm2=data3-data4=tmp4
    187    psubw       mm3, mm6                ; mm3=data2-data5=tmp5
    188 
    189    ; -- Even part
    190 
    191    movq        mm1, mm5
    192    movq        mm6, mm0
    193    paddw       mm5, mm7                ; mm5=tmp10
    194    paddw       mm0, mm4                ; mm0=tmp11
    195    psubw       mm1, mm7                ; mm1=tmp13
    196    psubw       mm6, mm4                ; mm6=tmp12
    197 
    198    movq        mm7, mm5
    199    paddw       mm5, mm0                ; mm5=tmp10+tmp11
    200    psubw       mm7, mm0                ; mm7=tmp10-tmp11
    201    ; These two outputs involve no multiply; shift left so they carry the same
           ; 2^PASS1_BITS scaling as the descaled products below.
    202    psllw       mm5, PASS1_BITS         ; mm5=data0
    203    psllw       mm7, PASS1_BITS         ; mm7=data4
    204 
    205    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
    206    movq        MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7
    207 
    208    ; (Original)
    209    ; z1 = (tmp12 + tmp13) * 0.541196100;
    210    ; data2 = z1 + tmp13 * 0.765366865;
    211    ; data6 = z1 + tmp12 * -1.847759065;
    212    ;
    213    ; (This implementation)
    214    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
    215    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
    216    ; Interleave tmp13/tmp12 words so each pmaddwd yields one 32-bit sum per element.
    217    movq        mm4, mm1                ; mm1=tmp13
    218    movq        mm0, mm1
    219    punpcklwd   mm4, mm6                ; mm6=tmp12
    220    punpckhwd   mm0, mm6
    221    movq        mm1, mm4
    222    movq        mm6, mm0
    223    pmaddwd     mm4, [GOTOFF(ebx,PW_F130_F054)]   ; mm4=data2L
    224    pmaddwd     mm0, [GOTOFF(ebx,PW_F130_F054)]   ; mm0=data2H
    225    pmaddwd     mm1, [GOTOFF(ebx,PW_F054_MF130)]  ; mm1=data6L
    226    pmaddwd     mm6, [GOTOFF(ebx,PW_F054_MF130)]  ; mm6=data6H
    227    ; Add the rounding term, drop DESCALE_P1 bits, then pack with signed saturation.
    228    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P1)]
    229    paddd       mm0, [GOTOFF(ebx,PD_DESCALE_P1)]
    230    psrad       mm4, DESCALE_P1
    231    psrad       mm0, DESCALE_P1
    232    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
    233    paddd       mm6, [GOTOFF(ebx,PD_DESCALE_P1)]
    234    psrad       mm1, DESCALE_P1
    235    psrad       mm6, DESCALE_P1
    236 
    237    packssdw    mm4, mm0                ; mm4=data2
    238    packssdw    mm1, mm6                ; mm1=data6
    239 
    240    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
    241    movq        MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1
    242 
    243    ; -- Odd part
    244 
    245    movq        mm5, MMWORD [wk(0)]     ; mm5=tmp6
    246    movq        mm7, MMWORD [wk(1)]     ; mm7=tmp7
    247 
    248    movq        mm0, mm2                ; mm2=tmp4
    249    movq        mm6, mm3                ; mm3=tmp5
    250    paddw       mm0, mm5                ; mm0=z3
    251    paddw       mm6, mm7                ; mm6=z4
    252 
    253    ; (Original)
    254    ; z5 = (z3 + z4) * 1.175875602;
    255    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
    256    ; z3 += z5;  z4 += z5;
    257    ;
    258    ; (This implementation)
    259    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
    260    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
    261 
    262    movq        mm4, mm0
    263    movq        mm1, mm0
    264    punpcklwd   mm4, mm6
    265    punpckhwd   mm1, mm6
    266    movq        mm0, mm4
    267    movq        mm6, mm1
    268    pmaddwd     mm4, [GOTOFF(ebx,PW_MF078_F117)]  ; mm4=z3L
    269    pmaddwd     mm1, [GOTOFF(ebx,PW_MF078_F117)]  ; mm1=z3H
    270    pmaddwd     mm0, [GOTOFF(ebx,PW_F117_F078)]   ; mm0=z4L
    271    pmaddwd     mm6, [GOTOFF(ebx,PW_F117_F078)]   ; mm6=z4H
    272 
    273    movq        MMWORD [wk(0)], mm4     ; wk(0)=z3L
    274    movq        MMWORD [wk(1)], mm1     ; wk(1)=z3H
    275 
    276    ; (Original)
    277    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
    278    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
    279    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
    280    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
    281    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
    282    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
    283    ;
    284    ; (This implementation)
    285    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
    286    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
    287    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
    288    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
    289    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
    290    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
    291 
    292    movq        mm4, mm2
    293    movq        mm1, mm2
    294    punpcklwd   mm4, mm7
    295    punpckhwd   mm1, mm7
    296    movq        mm2, mm4
    297    movq        mm7, mm1
    298    pmaddwd     mm4, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm4=tmp4L
    299    pmaddwd     mm1, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm1=tmp4H
    300    pmaddwd     mm2, [GOTOFF(ebx,PW_MF089_F060)]   ; mm2=tmp7L
    301    pmaddwd     mm7, [GOTOFF(ebx,PW_MF089_F060)]   ; mm7=tmp7H
    302 
    303    paddd       mm4, MMWORD [wk(0)]     ; mm4=data7L
    304    paddd       mm1, MMWORD [wk(1)]     ; mm1=data7H
    305    paddd       mm2, mm0                ; mm2=data1L
    306    paddd       mm7, mm6                ; mm7=data1H
    307 
    308    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P1)]
    309    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
    310    psrad       mm4, DESCALE_P1
    311    psrad       mm1, DESCALE_P1
    312    paddd       mm2, [GOTOFF(ebx,PD_DESCALE_P1)]
    313    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
    314    psrad       mm2, DESCALE_P1
    315    psrad       mm7, DESCALE_P1
    316 
    317    packssdw    mm4, mm1                ; mm4=data7
    318    packssdw    mm2, mm7                ; mm2=data1
    319 
    320    movq        MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
    321    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
    322 
    323    movq        mm1, mm3
    324    movq        mm7, mm3
    325    punpcklwd   mm1, mm5
    326    punpckhwd   mm7, mm5
    327    movq        mm3, mm1
    328    movq        mm5, mm7
    329    pmaddwd     mm1, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm1=tmp5L
    330    pmaddwd     mm7, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm7=tmp5H
    331    pmaddwd     mm3, [GOTOFF(ebx,PW_MF256_F050)]   ; mm3=tmp6L
    332    pmaddwd     mm5, [GOTOFF(ebx,PW_MF256_F050)]   ; mm5=tmp6H
    333 
    334    paddd       mm1, mm0                ; mm1=data5L
    335    paddd       mm7, mm6                ; mm7=data5H
    336    paddd       mm3, MMWORD [wk(0)]     ; mm3=data3L
    337    paddd       mm5, MMWORD [wk(1)]     ; mm5=data3H
    338 
    339    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
    340    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
    341    psrad       mm1, DESCALE_P1
    342    psrad       mm7, DESCALE_P1
    343    paddd       mm3, [GOTOFF(ebx,PD_DESCALE_P1)]
    344    paddd       mm5, [GOTOFF(ebx,PD_DESCALE_P1)]
    345    psrad       mm3, DESCALE_P1
    346    psrad       mm5, DESCALE_P1
    347 
    348    packssdw    mm1, mm7                ; mm1=data5
    349    packssdw    mm3, mm5                ; mm3=data3
    350 
    351    movq        MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
    352    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
    353 
    354    add         edx, byte 4*DCTSIZE*SIZEOF_DCTELEM  ; step to the next 4 rows
    355    dec         ecx
    356    jnz         near .rowloop
    357 
    358    ; ---- Pass 2: process columns.
    359    ; Four columns per iteration.  Pass 1 left each 4x4 group transposed, so the
           ; transposes below yield dataN = row N of the 4 columns being processed.
    360    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
    361    mov         ecx, DCTSIZE/4            ; one iteration per group of 4 columns
    362    ALIGNX      16, 7
    363 .columnloop:
    364 
    365    movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
    366    movq        mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
    367    movq        mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
    368    movq        mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
    369 
    370    ; mm0=(02 12 22 32), mm2=(42 52 62 72)
    371    ; mm1=(03 13 23 33), mm3=(43 53 63 73)
    372 
    373    movq        mm4, mm0                ; transpose coefficients(phase 1)
    374    punpcklwd   mm0, mm1                ; mm0=(02 03 12 13)
    375    punpckhwd   mm4, mm1                ; mm4=(22 23 32 33)
    376    movq        mm5, mm2                ; transpose coefficients(phase 1)
    377    punpcklwd   mm2, mm3                ; mm2=(42 43 52 53)
    378    punpckhwd   mm5, mm3                ; mm5=(62 63 72 73)
    379 
    380    movq        mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
    381    movq        mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
    382    movq        mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
    383    movq        mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
    384 
    385    ; mm6=(00 10 20 30), mm1=(40 50 60 70)
    386    ; mm7=(01 11 21 31), mm3=(41 51 61 71)
    387 
    388    movq        MMWORD [wk(0)], mm4     ; wk(0)=(22 23 32 33)
    389    movq        MMWORD [wk(1)], mm2     ; wk(1)=(42 43 52 53)
    390 
    391    movq        mm4, mm6                ; transpose coefficients(phase 1)
    392    punpcklwd   mm6, mm7                ; mm6=(00 01 10 11)
    393    punpckhwd   mm4, mm7                ; mm4=(20 21 30 31)
    394    movq        mm2, mm1                ; transpose coefficients(phase 1)
    395    punpcklwd   mm1, mm3                ; mm1=(40 41 50 51)
    396    punpckhwd   mm2, mm3                ; mm2=(60 61 70 71)
    397 
    398    movq        mm7, mm6                ; transpose coefficients(phase 2)
    399    punpckldq   mm6, mm0                ; mm6=(00 01 02 03)=data0
    400    punpckhdq   mm7, mm0                ; mm7=(10 11 12 13)=data1
    401    movq        mm3, mm2                ; transpose coefficients(phase 2)
    402    punpckldq   mm2, mm5                ; mm2=(60 61 62 63)=data6
    403    punpckhdq   mm3, mm5                ; mm3=(70 71 72 73)=data7
    404 
    405    movq        mm0, mm7
    406    movq        mm5, mm6
    407    psubw       mm7, mm2                ; mm7=data1-data6=tmp6
    408    psubw       mm6, mm3                ; mm6=data0-data7=tmp7
    409    paddw       mm0, mm2                ; mm0=data1+data6=tmp1
    410    paddw       mm5, mm3                ; mm5=data0+data7=tmp0
    411 
    412    movq        mm2, MMWORD [wk(0)]     ; mm2=(22 23 32 33)
    413    movq        mm3, MMWORD [wk(1)]     ; mm3=(42 43 52 53)
    414    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp6
    415    movq        MMWORD [wk(1)], mm6     ; wk(1)=tmp7
    416 
    417    movq        mm7, mm4                ; transpose coefficients(phase 2)
    418    punpckldq   mm4, mm2                ; mm4=(20 21 22 23)=data2
    419    punpckhdq   mm7, mm2                ; mm7=(30 31 32 33)=data3
    420    movq        mm6, mm1                ; transpose coefficients(phase 2)
    421    punpckldq   mm1, mm3                ; mm1=(40 41 42 43)=data4
    422    punpckhdq   mm6, mm3                ; mm6=(50 51 52 53)=data5
    423 
    424    movq        mm2, mm7
    425    movq        mm3, mm4
    426    paddw       mm7, mm1                ; mm7=data3+data4=tmp3
    427    paddw       mm4, mm6                ; mm4=data2+data5=tmp2
    428    psubw       mm2, mm1                ; mm2=data3-data4=tmp4
    429    psubw       mm3, mm6                ; mm3=data2-data5=tmp5
    430 
    431    ; -- Even part
    432 
    433    movq        mm1, mm5
    434    movq        mm6, mm0
    435    paddw       mm5, mm7                ; mm5=tmp10
    436    paddw       mm0, mm4                ; mm0=tmp11
    437    psubw       mm1, mm7                ; mm1=tmp13
    438    psubw       mm6, mm4                ; mm6=tmp12
    439 
    440    movq        mm7, mm5
    441    paddw       mm5, mm0                ; mm5=tmp10+tmp11
    442    psubw       mm7, mm0                ; mm7=tmp10-tmp11
    443    ; Round, then shift right to remove the PASS1_BITS scaling left by pass 1.
    444    paddw       mm5, [GOTOFF(ebx,PW_DESCALE_P2X)]
    445    paddw       mm7, [GOTOFF(ebx,PW_DESCALE_P2X)]
    446    psraw       mm5, PASS1_BITS         ; mm5=data0
    447    psraw       mm7, PASS1_BITS         ; mm7=data4
    448 
    449    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
    450    movq        MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7
    451 
    452    ; (Original)
    453    ; z1 = (tmp12 + tmp13) * 0.541196100;
    454    ; data2 = z1 + tmp13 * 0.765366865;
    455    ; data6 = z1 + tmp12 * -1.847759065;
    456    ;
    457    ; (This implementation)
    458    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
    459    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
    460    ; Interleave tmp13/tmp12 words so each pmaddwd yields one 32-bit sum per element.
    461    movq        mm4, mm1                ; mm1=tmp13
    462    movq        mm0, mm1
    463    punpcklwd   mm4, mm6                ; mm6=tmp12
    464    punpckhwd   mm0, mm6
    465    movq        mm1, mm4
    466    movq        mm6, mm0
    467    pmaddwd     mm4, [GOTOFF(ebx,PW_F130_F054)]   ; mm4=data2L
    468    pmaddwd     mm0, [GOTOFF(ebx,PW_F130_F054)]   ; mm0=data2H
    469    pmaddwd     mm1, [GOTOFF(ebx,PW_F054_MF130)]  ; mm1=data6L
    470    pmaddwd     mm6, [GOTOFF(ebx,PW_F054_MF130)]  ; mm6=data6H
    471    ; Add the rounding term, drop DESCALE_P2 bits, then pack with signed saturation.
    472    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P2)]
    473    paddd       mm0, [GOTOFF(ebx,PD_DESCALE_P2)]
    474    psrad       mm4, DESCALE_P2
    475    psrad       mm0, DESCALE_P2
    476    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
    477    paddd       mm6, [GOTOFF(ebx,PD_DESCALE_P2)]
    478    psrad       mm1, DESCALE_P2
    479    psrad       mm6, DESCALE_P2
    480 
    481    packssdw    mm4, mm0                ; mm4=data2
    482    packssdw    mm1, mm6                ; mm1=data6
    483 
    484    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
    485    movq        MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1
    486 
    487    ; -- Odd part
    488 
    489    movq        mm5, MMWORD [wk(0)]     ; mm5=tmp6
    490    movq        mm7, MMWORD [wk(1)]     ; mm7=tmp7
    491 
    492    movq        mm0, mm2                ; mm2=tmp4
    493    movq        mm6, mm3                ; mm3=tmp5
    494    paddw       mm0, mm5                ; mm0=z3
    495    paddw       mm6, mm7                ; mm6=z4
    496 
    497    ; (Original)
    498    ; z5 = (z3 + z4) * 1.175875602;
    499    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
    500    ; z3 += z5;  z4 += z5;
    501    ;
    502    ; (This implementation)
    503    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
    504    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
    505 
    506    movq        mm4, mm0
    507    movq        mm1, mm0
    508    punpcklwd   mm4, mm6
    509    punpckhwd   mm1, mm6
    510    movq        mm0, mm4
    511    movq        mm6, mm1
    512    pmaddwd     mm4, [GOTOFF(ebx,PW_MF078_F117)]  ; mm4=z3L
    513    pmaddwd     mm1, [GOTOFF(ebx,PW_MF078_F117)]  ; mm1=z3H
    514    pmaddwd     mm0, [GOTOFF(ebx,PW_F117_F078)]   ; mm0=z4L
    515    pmaddwd     mm6, [GOTOFF(ebx,PW_F117_F078)]   ; mm6=z4H
    516 
    517    movq        MMWORD [wk(0)], mm4     ; wk(0)=z3L
    518    movq        MMWORD [wk(1)], mm1     ; wk(1)=z3H
    519 
    520    ; (Original)
    521    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
    522    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
    523    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
    524    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
    525    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
    526    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
    527    ;
    528    ; (This implementation)
    529    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
    530    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
    531    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
    532    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
    533    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
    534    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
    535 
    536    movq        mm4, mm2
    537    movq        mm1, mm2
    538    punpcklwd   mm4, mm7
    539    punpckhwd   mm1, mm7
    540    movq        mm2, mm4
    541    movq        mm7, mm1
    542    pmaddwd     mm4, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm4=tmp4L
    543    pmaddwd     mm1, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm1=tmp4H
    544    pmaddwd     mm2, [GOTOFF(ebx,PW_MF089_F060)]   ; mm2=tmp7L
    545    pmaddwd     mm7, [GOTOFF(ebx,PW_MF089_F060)]   ; mm7=tmp7H
    546 
    547    paddd       mm4, MMWORD [wk(0)]     ; mm4=data7L
    548    paddd       mm1, MMWORD [wk(1)]     ; mm1=data7H
    549    paddd       mm2, mm0                ; mm2=data1L
    550    paddd       mm7, mm6                ; mm7=data1H
    551 
    552    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P2)]
    553    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
    554    psrad       mm4, DESCALE_P2
    555    psrad       mm1, DESCALE_P2
    556    paddd       mm2, [GOTOFF(ebx,PD_DESCALE_P2)]
    557    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
    558    psrad       mm2, DESCALE_P2
    559    psrad       mm7, DESCALE_P2
    560 
    561    packssdw    mm4, mm1                ; mm4=data7
    562    packssdw    mm2, mm7                ; mm2=data1
    563 
    564    movq        MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
    565    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
    566 
    567    movq        mm1, mm3
    568    movq        mm7, mm3
    569    punpcklwd   mm1, mm5
    570    punpckhwd   mm7, mm5
    571    movq        mm3, mm1
    572    movq        mm5, mm7
    573    pmaddwd     mm1, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm1=tmp5L
    574    pmaddwd     mm7, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm7=tmp5H
    575    pmaddwd     mm3, [GOTOFF(ebx,PW_MF256_F050)]   ; mm3=tmp6L
    576    pmaddwd     mm5, [GOTOFF(ebx,PW_MF256_F050)]   ; mm5=tmp6H
    577 
    578    paddd       mm1, mm0                ; mm1=data5L
    579    paddd       mm7, mm6                ; mm7=data5H
    580    paddd       mm3, MMWORD [wk(0)]     ; mm3=data3L
    581    paddd       mm5, MMWORD [wk(1)]     ; mm5=data3H
    582 
    583    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
    584    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
    585    psrad       mm1, DESCALE_P2
    586    psrad       mm7, DESCALE_P2
    587    paddd       mm3, [GOTOFF(ebx,PD_DESCALE_P2)]
    588    paddd       mm5, [GOTOFF(ebx,PD_DESCALE_P2)]
    589    psrad       mm3, DESCALE_P2
    590    psrad       mm5, DESCALE_P2
    591 
    592    packssdw    mm1, mm7                ; mm1=data5
    593    packssdw    mm3, mm5                ; mm3=data3
    594 
    595    movq        MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
    596    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
    597 
    598    add         edx, byte 4*SIZEOF_DCTELEM  ; step to the next 4 columns
    599    dec         ecx
    600    jnz         near .columnloop
    601 
    602    emms                                ; empty MMX state
    603 
    604 ;   pop         edi                     ; unused
    605 ;   pop         esi                     ; unused
    606 ;   pop         edx                     ; need not be preserved
    607 ;   pop         ecx                     ; need not be preserved
    608    POPPIC      ebx
    609    mov         esp, ebp                ; esp <- aligned ebp
    610    pop         esp                     ; esp <- address of the saved ebp (stored at [ebp] by the prologue)
    611    pop         ebp                     ; restore the caller's ebp
    612    ret
    613 
    614 ; For some reason, the OS X linker does not honor the request to align the
    615 ; segment unless we do this.
    616    align       32