tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

simple_idct.asm (37993B)


      1 ;
      2 ; Simple IDCT MMX
      3 ;
      4 ; Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
      5 ;
      6 ; Conversion from gcc syntax to x264asm syntax with minimal modifications
      7 ; by James Darnley <jdarnley@obe.tv>.
      8 ;
      9 ; This file is part of FFmpeg.
     10 ;
     11 ; FFmpeg is free software; you can redistribute it and/or
     12 ; modify it under the terms of the GNU Lesser General Public
     13 ; License as published by the Free Software Foundation; either
     14 ; version 2.1 of the License, or (at your option) any later version.
     15 ;
     16 ; FFmpeg is distributed in the hope that it will be useful,
     17 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
     18 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     19 ; Lesser General Public License for more details.
     20 ;
     21 ; You should have received a copy of the GNU Lesser General Public
     22 ; License along with FFmpeg; if not, write to the Free Software
     23 ; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     24 ;/
     25 
     26 %include "libavutil/x86/x86util.asm"
     27 
     28 SECTION_RODATA
     29 
     30 %if ARCH_X86_32
     31 cextern pb_80
     32 
        ; wm1010: word mask that keeps words 1 and 3 of a quadword and clears words 0 and 2.
        ; DC_COND_IDCT ANDs it with the R4/R0 quadword so r0/R0 are left out of its zero test.
     33 wm1010: dw 0, 0xffff, 0, 0xffff
        ; d40000: dwords {4 << 16, 0}; added on the DC-only path of DC_COND_IDCT before its psrad by 13.
     34 d40000: dd 4 << 16, 0
     35 
     36 ; 23170.475006
     37 ; 22725.260826
     38 ; 21406.727617
     39 ; 19265.545870
     40 ; 16384.000000
     41 ; 12872.826198
     42 ; 8866.956905
     43 ; 4520.335430
     44 
        ; Integer cosine constants; the values listed above are the unrounded products.
     45 %define C0 23170 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
     46 %define C1 22725 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
     47 %define C2 21407 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
     48 %define C3 19266 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
     49 %define C4 16383 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
     50 %define C5 12873 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
     51 %define C6 8867  ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
     52 %define C7 4520  ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
     53 
     54 %define ROW_SHIFT 11
     55 %define COL_SHIFT 20 ; 6
     56 
        ; coeffs: constant table addressed by byte offset from the macros below.
        ; Words are listed low to high here; the operand comments inside the macros list them high to low.
        ; The first 16 bytes are dword biases added before the right shift in DC_COND_IDCT
        ; (coeffs + 8) and Z_COND_IDCT (coeffs + 0); presumably rounding for a shift of
        ; ROW_SHIFT -- confirm at the call sites.
     57 coeffs:
     58    dw 1 << (ROW_SHIFT - 1), 0        ; coeffs +   0: both dwords are 1 << (ROW_SHIFT - 1)
     59    dw 1 << (ROW_SHIFT - 1), 0
     60    dw 1 << (ROW_SHIFT - 1), 1        ; coeffs +   8: low dword additionally has bit 16 set
     61    dw 1 << (ROW_SHIFT - 1), 0
     62 
     63    dw C4,  C4,  C4,  C4              ; coeffs +  16
     64    dw C4, -C4,  C4, -C4              ; coeffs +  24
     65 
     66    dw C2,  C6,  C2,  C6              ; coeffs +  32
     67    dw C6, -C2,  C6, -C2              ; coeffs +  40
     68 
     69    dw C1,  C3,  C1,  C3              ; coeffs +  48
     70    dw C5,  C7,  C5,  C7              ; coeffs +  56
     71 
     72    dw C3, -C7,  C3, -C7              ; coeffs +  64
     73    dw -C1, -C5, -C1, -C5             ; coeffs +  72
     74 
     75    dw C5, -C1,  C5, -C1              ; coeffs +  80
     76    dw C7,  C3,  C7,  C3              ; coeffs +  88
     77 
     78    dw C7, -C5,  C7, -C5              ; coeffs +  96
     79    dw C3, -C1,  C3, -C1              ; coeffs + 104
     80 
     81 SECTION .text
     82 
     83 %macro DC_COND_IDCT 7
        ; DC_COND_IDCT src0_off, src1_off, src2_off, src3_off, dst, (unused), shift
        ;   arg 1-4  byte offsets from blockq of the four input quadwords
        ;            (R4/R0, R6/R2, R3/R1, R7/R5; two lanes each, see the line comments)
        ;   arg 5    address expression of the output; 32 bytes are written at [dst] .. [24 + dst]
        ;   arg 6    not referenced anywhere in this macro
        ;   arg 7    arithmetic right-shift count applied to every sum/difference before packing
        ; When every input word other than r0/R0 is zero, the transform is skipped: the shifted
        ; DC value is computed once and replicated to all four output quadwords.
        ; Otherwise A0..A3 (from R0/R2/R4/R6) and B0..B3 (from R1/R3/R5/R7) are formed and
        ; An+Bn / An-Bn are shifted, packed with signed saturation and stored.
        ; Writes mm0-mm7, t0d and the flags.
     84    movq            mm0, [blockq + %1]  ; R4     R0      r4      r0
     85    movq            mm1, [blockq + %2]  ; R6     R2      r6      r2
     86    movq            mm2, [blockq + %3]  ; R3     R1      r3      r1
     87    movq            mm3, [blockq + %4]  ; R7     R5      r7      r5
     88    movq            mm4, [wm1010]
     89    pand            mm4, mm0            ; keep r4/R4 only: r0/R0 are excluded from the test
     90    por             mm4, mm1
     91    por             mm4, mm2
     92    por             mm4, mm3            ; mm4 != 0 iff any tested word is non-zero
     93    packssdw        mm4, mm4            ; saturating pack: a non-zero dword stays non-zero
     94    movd            t0d, mm4
     95    or              t0d, t0d
     96    jz              %%1                 ; nothing but r0/R0 present: take the short path
     97    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
     98    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
     99    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
    100    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
    101    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
    102    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
    103    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
    104    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
    105    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
    106    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
    107    paddd           mm4, [coeffs + 8]   ; add the bias stored at coeffs + 8
    108    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
    109    paddd           mm4, mm5            ; A0             a0
    110    psubd           mm6, mm5            ; A3             a3
    111    movq            mm5, [coeffs + 56]  ; C7     C5      C7      C5
    112    pmaddwd         mm5, mm3            ; C7R7+C5R5      C7r7+C5r5
    113    paddd           mm0, [coeffs + 8]   ; same bias for the A1/A2 base term
    114    paddd           mm1, mm0            ; A1             a1
    115    paddd           mm0, mm0
    116    psubd           mm0, mm1            ; A2             a2
    117    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
    118    paddd           mm7, mm5            ; B0             b0
    119    movq            mm5, [coeffs + 72]  ; -C5    -C1     -C5     -C1
    120    pmaddwd         mm5, mm3            ; -C5R7-C1R5     -C5r7-C1r5
    121    paddd           mm7, mm4            ; A0+B0          a0+b0
    122    paddd           mm4, mm4            ; 2A0            2a0
    123    psubd           mm4, mm7            ; A0-B0          a0-b0
    124    paddd           mm5, mm2            ; B1             b1
    125    psrad           mm7, %7
    126    psrad           mm4, %7
    127    movq            mm2, mm1            ; A1             a1
    128    paddd           mm1, mm5            ; A1+B1          a1+b1
    129    psubd           mm2, mm5            ; A1-B1          a1-b1
    130    psrad           mm1, %7
    131    psrad           mm2, %7
    132    packssdw        mm7, mm1            ; A1+B1  a1+b1   A0+B0   a0+b0
    133    packssdw        mm2, mm4            ; A0-B0  a0-b0   A1-B1   a1-b1
    134    movq           [%5], mm7
    135    movq            mm1, [blockq + %3]  ; R3     R1      r3      r1
    136    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
    137    movq      [24 + %5], mm2
    138    pmaddwd         mm4, mm1            ; -C1R3+C5R1     -C1r3+C5r1
    139    movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
    140    pmaddwd         mm1, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
    141    pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
    142    movq            mm2, mm0            ; A2             a2
    143    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
    144    paddd           mm4, mm7            ; B2             b2
    145    paddd           mm2, mm4            ; A2+B2          a2+b2
    146    psubd           mm0, mm4            ; A2-B2          a2-b2
    147    psrad           mm2, %7
    148    psrad           mm0, %7
    149    movq            mm4, mm6            ; A3             a3
    150    paddd           mm3, mm1            ; B3             b3
    151    paddd           mm6, mm3            ; A3+B3          a3+b3
    152    psubd           mm4, mm3            ; A3-B3          a3-b3
    153    psrad           mm6, %7
    154    packssdw        mm2, mm6            ; A3+B3  a3+b3   A2+B2   a2+b2
    155    movq       [8 + %5], mm2
    156    psrad           mm4, %7
    157    packssdw        mm4, mm0            ; A2-B2  a2-b2   A3-B3   a3-b3
    158    movq      [16 + %5], mm4
    159    jmp             %%2                 ; full path done: step over the DC-only path
    160 %%1:
    161    pslld           mm0, 16             ; move r0/R0 into the upper half of each dword
    162    paddd           mm0, [d40000]       ; adds 4 << 16 to the low dword only
    163    psrad           mm0, 13
    164    packssdw        mm0, mm0            ; words become: low, high, low, high
    165    movq           [%5], mm0
    166    movq       [8 + %5], mm0
    167    movq      [16 + %5], mm0
    168    movq      [24 + %5], mm0
    169 %%2:
    170 %endmacro
    171 
    172 %macro Z_COND_IDCT 8
        ; Z_COND_IDCT src0_off, src1_off, src2_off, src3_off, dst, (unused), shift, zero_label
        ;   arg 1-4  byte offsets from blockq of the four input quadwords
        ;            (R4/R0, R6/R2, R3/R1, R7/R5; two lanes each, see the line comments)
        ;   arg 5    address expression of the output; 32 bytes are written at [dst] .. [24 + dst]
        ;   arg 6    not referenced anywhere in this macro
        ;   arg 7    arithmetic right-shift count applied to every sum/difference before packing
        ;   arg 8    label jumped to when all four input quadwords are zero; nothing is stored
        ;            to dst in that case
        ; Same arithmetic as the full path of DC_COND_IDCT, except that the bias is read from
        ; coeffs + 0 instead of coeffs + 8 and the zero test covers every input word.
        ; Writes mm0-mm7, t0d and the flags.
    173    movq            mm0, [blockq + %1]  ; R4     R0      r4      r0
    174    movq            mm1, [blockq + %2]  ; R6     R2      r6      r2
    175    movq            mm2, [blockq + %3]  ; R3     R1      r3      r1
    176    movq            mm3, [blockq + %4]  ; R7     R5      r7      r5
    177    movq            mm4, mm0
    178    por             mm4, mm1
    179    por             mm4, mm2
    180    por             mm4, mm3            ; mm4 != 0 iff any input word is non-zero
    181    packssdw        mm4, mm4            ; saturating pack: a non-zero dword stays non-zero
    182    movd            t0d, mm4
    183    or              t0d, t0d
    184    jz               %8
    185    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
    186    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
    187    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
    188    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
    189    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
    190    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
    191    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
    192    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
    193    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
    194    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
    195    paddd           mm4, [coeffs]       ; add the bias stored at coeffs + 0
    196    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
    197    paddd           mm4, mm5            ; A0             a0
    198    psubd           mm6, mm5            ; A3             a3
    199    movq            mm5, [coeffs + 56]  ; C7     C5      C7      C5
    200    pmaddwd         mm5, mm3            ; C7R7+C5R5      C7r7+C5r5
    201    paddd           mm0, [coeffs]       ; same bias for the A1/A2 base term
    202    paddd           mm1, mm0            ; A1             a1
    203    paddd           mm0, mm0
    204    psubd           mm0, mm1            ; A2             a2
    205    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
    206    paddd           mm7, mm5            ; B0             b0
    207    movq            mm5, [coeffs + 72]  ; -C5    -C1     -C5     -C1
    208    pmaddwd         mm5, mm3            ; -C5R7-C1R5     -C5r7-C1r5
    209    paddd           mm7, mm4            ; A0+B0          a0+b0
    210    paddd           mm4, mm4            ; 2A0            2a0
    211    psubd           mm4, mm7            ; A0-B0          a0-b0
    212    paddd           mm5, mm2            ; B1             b1
    213    psrad           mm7, %7
    214    psrad           mm4, %7
    215    movq            mm2, mm1            ; A1             a1
    216    paddd           mm1, mm5            ; A1+B1          a1+b1
    217    psubd           mm2, mm5            ; A1-B1          a1-b1
    218    psrad           mm1, %7
    219    psrad           mm2, %7
    220    packssdw        mm7, mm1            ; A1+B1  a1+b1   A0+B0   a0+b0
    221    packssdw        mm2, mm4            ; A0-B0  a0-b0   A1-B1   a1-b1
    222    movq           [%5], mm7
    223    movq            mm1, [blockq + %3]  ; R3     R1      r3      r1
    224    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
    225    movq      [24 + %5], mm2
    226    pmaddwd         mm4, mm1            ; -C1R3+C5R1     -C1r3+C5r1
    227    movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
    228    pmaddwd         mm1, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
    229    pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
    230    movq            mm2, mm0            ; A2             a2
    231    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
    232    paddd           mm4, mm7            ; B2             b2
    233    paddd           mm2, mm4            ; A2+B2          a2+b2
    234    psubd           mm0, mm4            ; A2-B2          a2-b2
    235    psrad           mm2, %7
    236    psrad           mm0, %7
    237    movq            mm4, mm6            ; A3             a3
    238    paddd           mm3, mm1            ; B3             b3
    239    paddd           mm6, mm3            ; A3+B3          a3+b3
    240    psubd           mm4, mm3            ; A3-B3          a3-b3
    241    psrad           mm6, %7
    242    packssdw        mm2, mm6            ; A3+B3  a3+b3   A2+B2   a2+b2
    243    movq       [8 + %5], mm2
    244    psrad           mm4, %7
    245    packssdw        mm4, mm0            ; A2-B2  a2-b2   A3-B3   a3-b3
    246    movq      [16 + %5], mm4
    247 %endmacro
    248 
    249 %macro IDCT1 6
        ; IDCT1 src0, src1, src2, src3, dst, shift
        ;   arg 1-4  movq source operands holding R4/R0, R6/R2, R3/R1 and R7/R5
        ;            (two lanes each, see the line comments); arg 3 is read twice
        ;   arg 5    address expression of the output
        ;   arg 6    arithmetic right-shift count applied to every sum/difference before packing
        ; Variant that uses all four inputs. Each result is packed to two words and stored with
        ; movd (4 bytes) at a 16-byte stride from dst:
        ;   +0 A0+B0   +16 A1+B1   +32 A2+B2   +48 A3+B3
        ;   +64 A3-B3  +80 A2-B2   +96 A1-B1   +112 A0-B0
        ; No bias constant is added in this macro. Writes mm0-mm7.
    250    movq            mm0, %1             ; R4     R0      r4      r0
    251    movq            mm1, %2             ; R6     R2      r6      r2
    252    movq            mm2, %3             ; R3     R1      r3      r1
    253    movq            mm3, %4             ; R7     R5      r7      r5
    254    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
    255    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
    256    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
    257    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
    258    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
    259    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
    260    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
    261    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
    262    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
    263    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
    264    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
    265    paddd           mm4, mm5            ; A0             a0
    266    psubd           mm6, mm5            ; A3             a3
    267    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
    268    paddd           mm0, mm1            ; A1             a1
    269    psubd           mm5, mm1            ; A2             a2
    270    movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
    271    pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
    272    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
    273    paddd           mm7, mm1            ; B0             b0
    274    movq            mm1, [coeffs + 72]  ; -C5    -C1     -C5     -C1
    275    pmaddwd         mm1, mm3            ; -C5R7-C1R5     -C5r7-C1r5
    276    paddd           mm7, mm4            ; A0+B0          a0+b0
    277    paddd           mm4, mm4            ; 2A0            2a0
    278    psubd           mm4, mm7            ; A0-B0          a0-b0
    279    paddd           mm1, mm2            ; B1             b1
    280    psrad           mm7, %6
    281    psrad           mm4, %6
    282    movq            mm2, mm0            ; A1             a1
    283    paddd           mm0, mm1            ; A1+B1          a1+b1
    284    psubd           mm2, mm1            ; A1-B1          a1-b1
    285    psrad           mm0, %6
    286    psrad           mm2, %6
    287    packssdw        mm7, mm7            ; A0+B0  a0+b0
    288    movd           [%5], mm7
    289    packssdw        mm0, mm0            ; A1+B1  a1+b1
    290    movd      [16 + %5], mm0
    291    packssdw        mm2, mm2            ; A1-B1  a1-b1
    292    movd      [96 + %5], mm2
    293    packssdw        mm4, mm4            ; A0-B0  a0-b0
    294    movd     [112 + %5], mm4
    295    movq            mm0, %3             ; R3     R1      r3      r1
    296    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
    297    pmaddwd         mm4, mm0            ; -C1R3+C5R1     -C1r3+C5r1
    298    movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
    299    pmaddwd         mm0, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
    300    pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
    301    movq            mm2, mm5            ; A2             a2
    302    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
    303    paddd           mm4, mm7            ; B2             b2
    304    paddd           mm2, mm4            ; A2+B2          a2+b2
    305    psubd           mm5, mm4            ; A2-B2          a2-b2
    306    psrad           mm2, %6
    307    psrad           mm5, %6
    308    movq            mm4, mm6            ; A3             a3
    309    paddd           mm3, mm0            ; B3             b3
    310    paddd           mm6, mm3            ; A3+B3          a3+b3
    311    psubd           mm4, mm3            ; A3-B3          a3-b3
    312    psrad           mm6, %6
    313    psrad           mm4, %6
    314    packssdw        mm2, mm2            ; A2+B2  a2+b2
    315    packssdw        mm6, mm6            ; A3+B3  a3+b3
    316    movd      [32 + %5], mm2
    317    packssdw        mm4, mm4            ; A3-B3  a3-b3
    318    packssdw        mm5, mm5            ; A2-B2  a2-b2
    319    movd      [48 + %5], mm6
    320    movd      [64 + %5], mm4
    321    movd      [80 + %5], mm5
    322 %endmacro
    323 
    324 %macro IDCT2 6
        ; IDCT2 src0, src1, (unused), src3, dst, shift
        ;   arg 1,2  movq source operands holding R4/R0 and R6/R2
        ;   arg 3    not referenced in this macro (no R3/R1 contribution is computed)
        ;   arg 4    movq source operand holding R7/R5
        ;   arg 5    address expression of the output
        ;   arg 6    arithmetic right-shift count applied to every sum/difference before packing
        ; Same output layout as IDCT1 (movd stores at +0, +16, ... +112 from dst), but the
        ; B terms are built from the R7/R5 input alone. Writes mm0-mm7.
    325    movq            mm0, %1             ; R4     R0      r4      r0
    326    movq            mm1, %2             ; R6     R2      r6      r2
    327    movq            mm3, %4             ; R7     R5      r7      r5
    328    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
    329    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
    330    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
    331    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
    332    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
    333    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
    334    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
    335    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
    336    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
    337    paddd           mm4, mm5            ; A0             a0
    338    psubd           mm6, mm5            ; A3             a3
    339    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
    340    paddd           mm0, mm1            ; A1             a1
    341    psubd           mm5, mm1            ; A2             a2
    342    movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
    343    pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
    344    movq            mm7, [coeffs + 72]  ; -C5    -C1     -C5     -C1
    345    pmaddwd         mm7, mm3            ; -C5R7-C1R5     -C5r7-C1r5
    346    paddd           mm1, mm4            ; A0+B0          a0+b0
    347    paddd           mm4, mm4            ; 2A0            2a0
    348    psubd           mm4, mm1            ; A0-B0          a0-b0
    349    psrad           mm1, %6
    350    psrad           mm4, %6
    351    movq            mm2, mm0            ; A1             a1
    352    paddd           mm0, mm7            ; A1+B1          a1+b1
    353    psubd           mm2, mm7            ; A1-B1          a1-b1
    354    psrad           mm0, %6
    355    psrad           mm2, %6
    356    packssdw        mm1, mm1            ; A0+B0  a0+b0
    357    movd           [%5], mm1
    358    packssdw        mm0, mm0            ; A1+B1  a1+b1
    359    movd      [16 + %5], mm0
    360    packssdw        mm2, mm2            ; A1-B1  a1-b1
    361    movd      [96 + %5], mm2
    362    packssdw        mm4, mm4            ; A0-B0  a0-b0
    363    movd     [112 + %5], mm4
    364    movq            mm1, [coeffs + 88]  ; C3     C7      C3      C7
    365    pmaddwd         mm1, mm3            ; C3R7+C7R5      C3r7+C7r5
    366    movq            mm2, mm5            ; A2             a2
    367    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
    368    paddd           mm2, mm1            ; A2+B2          a2+b2
    369    psubd           mm5, mm1            ; A2-B2          a2-b2
    370    psrad           mm2, %6
    371    psrad           mm5, %6
    372    movq            mm1, mm6            ; A3             a3
    373    paddd           mm6, mm3            ; A3+B3          a3+b3
    374    psubd           mm1, mm3            ; A3-B3          a3-b3
    375    psrad           mm6, %6
    376    psrad           mm1, %6
    377    packssdw        mm2, mm2            ; A2+B2  a2+b2
    378    packssdw        mm6, mm6            ; A3+B3  a3+b3
    379    movd      [32 + %5], mm2
    380    packssdw        mm1, mm1            ; A3-B3  a3-b3
    381    packssdw        mm5, mm5            ; A2-B2  a2-b2
    382    movd      [48 + %5], mm6
    383    movd      [64 + %5], mm1
    384    movd      [80 + %5], mm5
    385 %endmacro
    386 
    387 %macro IDCT3 6
        ; IDCT3 src0, (unused), (unused), src3, dst, shift
        ;   arg 1    movq source operand holding R4/R0
        ;   arg 2,3  not referenced in this macro (no R6/R2 or R3/R1 contribution is computed)
        ;   arg 4    movq source operand holding R7/R5
        ;   arg 5    address expression of the output
        ;   arg 6    arithmetic right-shift count applied to every sum/difference before packing
        ; Same output layout as IDCT1 (movd stores at +0, +16, ... +112 from dst). With no
        ; R6/R2 term, A3 is a copy of A0 and A2 a copy of A1; the B terms come from R7/R5 alone.
        ; Writes mm0-mm7.
    388    movq            mm0, %1             ; R4     R0      r4      r0
    389    movq            mm3, %4             ; R7     R5      r7      r5
    390    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
    391    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
    392    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
    393    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
    394    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
    395    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
    396    movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
    397    pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
    398    movq            mm7, [coeffs + 72]  ; -C5    -C1     -C5     -C1
    399    pmaddwd         mm7, mm3            ; -C5R7-C1R5     -C5r7-C1r5
    400    paddd           mm1, mm4            ; A0+B0          a0+b0
    401    paddd           mm4, mm4            ; 2A0            2a0
    402    psubd           mm4, mm1            ; A0-B0          a0-b0
    403    psrad           mm1, %6
    404    psrad           mm4, %6
    405    movq            mm2, mm0            ; A1             a1
    406    paddd           mm0, mm7            ; A1+B1          a1+b1
    407    psubd           mm2, mm7            ; A1-B1          a1-b1
    408    psrad           mm0, %6
    409    psrad           mm2, %6
    410    packssdw        mm1, mm1            ; A0+B0  a0+b0
    411    movd           [%5], mm1
    412    packssdw        mm0, mm0            ; A1+B1  a1+b1
    413    movd      [16 + %5], mm0
    414    packssdw        mm2, mm2            ; A1-B1  a1-b1
    415    movd      [96 + %5], mm2
    416    packssdw        mm4, mm4            ; A0-B0  a0-b0
    417    movd     [112 + %5], mm4
    418    movq            mm1, [coeffs + 88]  ; C3     C7      C3      C7
    419    pmaddwd         mm1, mm3            ; C3R7+C7R5      C3r7+C7r5
    420    movq            mm2, mm5            ; A2             a2
    421    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
    422    paddd           mm2, mm1            ; A2+B2          a2+b2
    423    psubd           mm5, mm1            ; A2-B2          a2-b2
    424    psrad           mm2, %6
    425    psrad           mm5, %6
    426    movq            mm1, mm6            ; A3             a3
    427    paddd           mm6, mm3            ; A3+B3          a3+b3
    428    psubd           mm1, mm3            ; A3-B3          a3-b3
    429    psrad           mm6, %6
    430    psrad           mm1, %6
    431    packssdw        mm2, mm2            ; A2+B2  a2+b2
    432    packssdw        mm6, mm6            ; A3+B3  a3+b3
    433    movd      [32 + %5], mm2
    434    packssdw        mm1, mm1            ; A3-B3  a3-b3
    435    packssdw        mm5, mm5            ; A2-B2  a2-b2
    436    movd      [48 + %5], mm6
    437    movd      [64 + %5], mm1
    438    movd      [80 + %5], mm5
    439 %endmacro
    440 
    441 %macro IDCT4 6
        ; IDCT4 src0, (unused), src2, src3, dst, shift
        ;   arg 1    movq source operand holding R4/R0
        ;   arg 2    not referenced in this macro (no R6/R2 contribution is computed)
        ;   arg 3,4  movq source operands holding R3/R1 and R7/R5; arg 3 is read twice
        ;   arg 5    address expression of the output
        ;   arg 6    arithmetic right-shift count applied to every sum/difference before packing
        ; Same output layout as IDCT1 (movd stores at +0, +16, ... +112 from dst). With no
        ; R6/R2 term, A3 is a copy of A0 and A2 a copy of A1; the B terms use both odd inputs.
        ; Writes mm0-mm7.
    442    movq            mm0, %1             ; R4     R0      r4      r0
    443    movq            mm2, %3             ; R3     R1      r3      r1
    444    movq            mm3, %4             ; R7     R5      r7      r5
    445    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
    446    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
    447    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
    448    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
    449    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
    450    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
    451    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
    452    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
    453    movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
    454    pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
    455    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
    456    paddd           mm7, mm1            ; B0             b0
    457    movq            mm1, [coeffs + 72]  ; -C5    -C1     -C5     -C1
    458    pmaddwd         mm1, mm3            ; -C5R7-C1R5     -C5r7-C1r5
    459    paddd           mm7, mm4            ; A0+B0          a0+b0
    460    paddd           mm4, mm4            ; 2A0            2a0
    461    psubd           mm4, mm7            ; A0-B0          a0-b0
    462    paddd           mm1, mm2            ; B1             b1
    463    psrad           mm7, %6
    464    psrad           mm4, %6
    465    movq            mm2, mm0            ; A1             a1
    466    paddd           mm0, mm1            ; A1+B1          a1+b1
    467    psubd           mm2, mm1            ; A1-B1          a1-b1
    468    psrad           mm0, %6
    469    psrad           mm2, %6
    470    packssdw        mm7, mm7            ; A0+B0  a0+b0
    471    movd           [%5], mm7
    472    packssdw        mm0, mm0            ; A1+B1  a1+b1
    473    movd      [16 + %5], mm0
    474    packssdw        mm2, mm2            ; A1-B1  a1-b1
    475    movd      [96 + %5], mm2
    476    packssdw        mm4, mm4            ; A0-B0  a0-b0
    477    movd     [112 + %5], mm4
    478    movq            mm0, %3             ; R3     R1      r3      r1
    479    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
    480    pmaddwd         mm4, mm0            ; -C1R3+C5R1     -C1r3+C5r1
    481    movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
    482    pmaddwd         mm0, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
    483    pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
    484    movq            mm2, mm5            ; A2             a2
    485    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
    486    paddd           mm4, mm7            ; B2             b2
    487    paddd           mm2, mm4            ; A2+B2          a2+b2
    488    psubd           mm5, mm4            ; A2-B2          a2-b2
    489    psrad           mm2, %6
    490    psrad           mm5, %6
    491    movq            mm4, mm6            ; A3             a3
    492    paddd           mm3, mm0            ; B3             b3
    493    paddd           mm6, mm3            ; A3+B3          a3+b3
    494    psubd           mm4, mm3            ; A3-B3          a3-b3
    495    psrad           mm6, %6
    496    psrad           mm4, %6
    497    packssdw        mm2, mm2            ; A2+B2  a2+b2
    498    packssdw        mm6, mm6            ; A3+B3  a3+b3
    499    movd      [32 + %5], mm2
    500    packssdw        mm4, mm4            ; A3-B3  a3-b3
    501    packssdw        mm5, mm5            ; A2-B2  a2-b2
    502    movd      [48 + %5], mm6
    503    movd      [64 + %5], mm4
    504    movd      [80 + %5], mm5
    505 %endmacro
    506 
    507 %macro IDCT5 6
        ; IDCT5 src0, (unused), src2, (unused), dst, shift
        ;   arg 1    movq source operand holding R4/R0
        ;   arg 2,4  not referenced in this macro (no R6/R2 or R7/R5 contribution is computed)
        ;   arg 3    movq source operand holding R3/R1
        ;   arg 5    address expression of the output
        ;   arg 6    arithmetic right-shift count applied to every sum/difference before packing
        ; Same output layout as IDCT1 (movd stores at +0, +16, ... +112 from dst). With no
        ; R6/R2 term, A3 is a copy of A0 and A2 a copy of A1; the B terms come from R3/R1 alone.
        ; Writes mm0-mm7.
    508    movq            mm0, %1             ; R4     R0      r4      r0
    509    movq            mm2, %3             ; R3     R1      r3      r1
    510    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
    511    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
    512    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
    513    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
    514    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
    515    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
    516    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
    517    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
    518    movq            mm3, [coeffs + 64]  ; -C7    C3      -C7     C3
    519    pmaddwd         mm3, mm2            ; -C7R3+C3R1     -C7r3+C3r1
    520    paddd           mm7, mm4            ; A0+B0          a0+b0
    521    paddd           mm4, mm4            ; 2A0            2a0
    522    psubd           mm4, mm7            ; A0-B0          a0-b0
    523    psrad           mm7, %6
    524    psrad           mm4, %6
    525    movq            mm1, mm0            ; A1             a1
    526    paddd           mm0, mm3            ; A1+B1          a1+b1
    527    psubd           mm1, mm3            ; A1-B1          a1-b1
    528    psrad           mm0, %6
    529    psrad           mm1, %6
    530    packssdw        mm7, mm7            ; A0+B0  a0+b0
    531    movd           [%5], mm7
    532    packssdw        mm0, mm0            ; A1+B1  a1+b1
    533    movd      [16 + %5], mm0
    534    packssdw        mm1, mm1            ; A1-B1  a1-b1
    535    movd      [96 + %5], mm1
    536    packssdw        mm4, mm4            ; A0-B0  a0-b0
    537    movd     [112 + %5], mm4
    538    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
    539    pmaddwd         mm4, mm2            ; -C1R3+C5R1     -C1r3+C5r1
    540    pmaddwd         mm2, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
    541    movq            mm1, mm5            ; A2             a2
    542    paddd           mm1, mm4            ; A2+B2          a2+b2
    543    psubd           mm5, mm4            ; A2-B2          a2-b2
    544    psrad           mm1, %6
    545    psrad           mm5, %6
    546    movq            mm4, mm6            ; A3             a3
    547    paddd           mm6, mm2            ; A3+B3          a3+b3
    548    psubd           mm4, mm2            ; A3-B3          a3-b3
    549    psrad           mm6, %6
    550    psrad           mm4, %6
    551    packssdw        mm1, mm1            ; A2+B2  a2+b2
    552    packssdw        mm6, mm6            ; A3+B3  a3+b3
    553    movd      [32 + %5], mm1
    554    packssdw        mm4, mm4            ; A3-B3  a3-b3
    555    packssdw        mm5, mm5            ; A2-B2  a2-b2
    556    movd      [48 + %5], mm6
    557    movd      [64 + %5], mm4
    558    movd      [80 + %5], mm5
    559 %endmacro
    560 
    561 %macro IDCT6 6
        ; IDCT6 src0_addr, src1_addr, (unused), (unused), dst, shift
        ;   arg 1,2  address expressions (used inside brackets) of the R4/R0 and R6/R2 inputs;
        ;            16 bytes are read from each, at [addr] and [8 + addr]
        ;   arg 3,4  not referenced in this macro (no B terms are computed)
        ;   arg 5    address expression of the output
        ;   arg 6    arithmetic right-shift count applied to every A term before packing
        ; Computes A0..A3 for both 8-byte halves of the inputs, packs the two halves together
        ; and stores full quadwords with movq. With no B terms the outputs are mirrored:
        ;   +0 and +112 = A0,  +16 and +96 = A1,  +32 and +80 = A2,  +48 and +64 = A3
        ; Writes mm0-mm7.
    562    movq            mm0, [%1]           ; R4     R0      r4      r0
    563    movq            mm1, [%2]           ; R6     R2      r6      r2
    564    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
    565    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
    566    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
    567    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
    568    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
    569    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
    570    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
    571    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
    572    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
    573    paddd           mm4, mm5            ; A0             a0
    574    psubd           mm6, mm5            ; A3             a3
    575    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
    576    paddd           mm0, mm1            ; A1             a1
    577    psubd           mm5, mm1            ; A2             a2
    578    movq            mm2, [8 + %1]       ; R4     R0      r4      r0
    579    movq            mm3, [8 + %2]       ; R6     R2      r6      r2
    580    movq            mm1, [coeffs + 16]  ; C4     C4      C4      C4
    581    pmaddwd         mm1, mm2            ; C4R4+C4R0      C4r4+C4r0
    582    movq            mm7, [coeffs + 24]  ; -C4    C4      -C4     C4
    583    pmaddwd         mm2, mm7            ; -C4R4+C4R0     -C4r4+C4r0
    584    movq            mm7, [coeffs + 32]  ; C6     C2      C6      C2
    585    pmaddwd         mm7, mm3            ; C6R6+C2R2      C6r6+C2r2
    586    pmaddwd         mm3, [coeffs + 40]  ; -C2R6+C6R2     -C2r6+C6r2
    587    paddd           mm7, mm1            ; A0             a0
    588    paddd           mm1, mm1            ; 2C0            2c0
    589    psubd           mm1, mm7            ; A3             a3
    590    paddd           mm3, mm2            ; A1             a1
    591    paddd           mm2, mm2            ; 2C1            2c1
    592    psubd           mm2, mm3            ; A2             a2
    593    psrad           mm4, %6
    594    psrad           mm7, %6
    595    psrad           mm3, %6
    596    packssdw        mm4, mm7            ; A0     a0
    597    movq           [%5], mm4
    598    psrad           mm0, %6
    599    packssdw        mm0, mm3            ; A1     a1
    600    movq      [16 + %5], mm0
    601    movq      [96 + %5], mm0
    602    movq     [112 + %5], mm4
    603    psrad           mm5, %6
    604    psrad           mm6, %6
    605    psrad           mm2, %6
    606    packssdw        mm5, mm2            ; A2     a2
    607    movq      [32 + %5], mm5
    608    psrad           mm1, %6
    609    packssdw        mm6, mm1            ; A3     a3
    610    movq      [48 + %5], mm6
    611    movq      [64 + %5], mm6
    612    movq      [80 + %5], mm5
    613 %endmacro
    614 
    615 %macro IDCT7 6
        ; IDCT7 src_r4r0, src_r6r2, src_r3r1, (unused), dst, shift
        ;
        ; Second-stage variant that combines the even terms from %1 (R4 R0)
        ; and %2 (R6 R2) with odd terms derived from %3 (R3 R1) only; %4 is
        ; never referenced in the body.
        ;   %1-%3   complete operands, used as-is (the expansions in this file
        ;           pass bracketed memory references)
        ;   %5      bare address expression of the destination
        ;   %6      arithmetic right-shift count applied to every 32-bit result
        ; One expansion consumes one quadword per input and emits eight 4-byte
        ; stores (two packed 16-bit values each) at 16-byte steps from %5:
        ;   +0  A0+B0   +16 A1+B1   +32 A2+B2   +48  A3+B3
        ;   +64 A3-B3   +80 A2-B2   +96 A1-B1   +112 A0-B0
        ; Clobbers mm0-mm7.
    616    movq            mm0, %1             ; R4     R0      r4      r0
    617    movq            mm1, %2             ; R6     R2      r6      r2
    618    movq            mm2, %3             ; R3     R1      r3      r1
    619    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
    620    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
    621    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
    622    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
    623    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
    624    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
    625    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
    626    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
    627    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
    628    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
    629    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
    630    paddd           mm4, mm5            ; A0             a0
    631    psubd           mm6, mm5            ; A3             a3
    632    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
    633    paddd           mm0, mm1            ; A1             a1
    634    psubd           mm5, mm1            ; A2             a2
    635    movq            mm1, [coeffs + 64]  ; presumably -C7 C3 -C7 C3 (see product below)
    636    pmaddwd         mm1, mm2            ; -C7R3+C3R1     -C7r3+C3r1
    637    paddd           mm7, mm4            ; A0+B0          a0+b0
    638    paddd           mm4, mm4            ; 2A0            2a0
    639    psubd           mm4, mm7            ; A0-B0          a0-b0
    640    psrad           mm7, %6
    641    psrad           mm4, %6
    642    movq            mm3, mm0            ; A1             a1
    643    paddd           mm0, mm1            ; A1+B1          a1+b1
    644    psubd           mm3, mm1            ; A1-B1          a1-b1
    645    psrad           mm0, %6
    646    psrad           mm3, %6
           ; packssdw reg, reg duplicates the two saturated words into both
           ; halves; movd then stores only the low 4 bytes.
    647    packssdw        mm7, mm7            ; A0+B0  a0+b0
    648    movd           [%5], mm7
    649    packssdw        mm0, mm0            ; A1+B1  a1+b1
    650    movd      [16 + %5], mm0
    651    packssdw        mm3, mm3            ; A1-B1  a1-b1
    652    movd      [96 + %5], mm3
    653    packssdw        mm4, mm4            ; A0-B0  a0-b0
    654    movd     [112 + %5], mm4
           ; mm2 still holds R3 R1; derive the remaining two odd terms from it.
    655    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
    656    pmaddwd         mm4, mm2            ; -C1R3+C5R1     -C1r3+C5r1
    657    pmaddwd         mm2, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
    658    movq            mm3, mm5            ; A2             a2
    659    paddd           mm3, mm4            ; A2+B2          a2+b2
    660    psubd           mm5, mm4            ; a2-B2          a2-b2
    661    psrad           mm3, %6
    662    psrad           mm5, %6
    663    movq            mm4, mm6            ; A3             a3
    664    paddd           mm6, mm2            ; A3+B3          a3+b3
    665    psubd           mm4, mm2            ; a3-B3          a3-b3
    666    psrad           mm6, %6
    667    packssdw        mm3, mm3            ; A2+B2  a2+b2
    668    movd      [32 + %5], mm3
    669    psrad           mm4, %6
    670    packssdw        mm6, mm6            ; A3+B3  a3+b3
    671    movd      [48 + %5], mm6
    672    packssdw        mm4, mm4            ; A3-B3  a3-b3
    673    packssdw        mm5, mm5            ; A2-B2  a2-b2
    674    movd      [64 + %5], mm4
    675    movd      [80 + %5], mm5
    676 %endmacro
    677 
    678 %macro IDCT8 6
        ; IDCT8 src, (unused), (unused), (unused), dst, shift
        ;
        ; Second-stage variant that reads only [%1] and [8 + %1] (R4 R0
        ; pairs); %2, %3 and %4 are never referenced in the body.
        ;   %1      bare address expression; the macro adds the brackets
        ;   %5      bare address expression of the destination
        ;   %6      arithmetic right-shift count applied to every 32-bit result
        ; Only two distinct result vectors exist: mm4 (C4R4+C4R0) and
        ; mm0 (-C4R4+C4R0). One expansion consumes two quadwords and emits
        ; eight 8-byte stores at 16-byte steps from %5:
        ;   mm4 -> offsets 0, 48, 64, 112      mm0 -> offsets 16, 32, 80, 96
        ; Clobbers mm0, mm1, mm2, mm4, mm5, mm7.
        ;
        ; --- first quadword ---
    679    movq            mm0, [%1]           ; R4     R0      r4      r0
    680    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
    681    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
    682    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
    683    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
    684    psrad           mm4, %6
    685    psrad           mm0, %6
        ; --- second quadword ---
    686    movq            mm2, [8 + %1]       ; R4     R0      r4      r0
    687    movq            mm1, [coeffs + 16]  ; C4     C4      C4      C4
    688    pmaddwd         mm1, mm2            ; C4R4+C4R0      C4r4+C4r0
    689    movq            mm7, [coeffs + 24]  ; -C4    C4      -C4     C4
    690    pmaddwd         mm2, mm7            ; -C4R4+C4R0     -C4r4+C4r0
        ; --- pack both quadwords' results to 16 bits with signed saturation
        ;     (packssdw) and store ---
    692    psrad           mm1, %6
    693    packssdw        mm4, mm1            ; A0     a0
    694    movq           [%5], mm4
    695    psrad           mm2, %6
    696    packssdw        mm0, mm2            ; A1     a1
    697    movq      [16 + %5], mm0
    698    movq      [96 + %5], mm0
    699    movq     [112 + %5], mm4
    700    movq      [32 + %5], mm0
    701    movq      [48 + %5], mm4
    702    movq      [64 + %5], mm4
    703    movq      [80 + %5], mm0
    704 %endmacro
    705 
    706 %macro IDCT 0
        ; IDCT (no parameters)
        ;
        ; Two-stage transform driver. The first stage is four expansions of
        ; DC_COND_IDCT / Z_COND_IDCT (defined elsewhere in this file); each is
        ; passed an rsp-relative address (rsp + 0, 32, 64, 96) that the second
        ; stage later reads from. The second stage is exactly one of
        ; IDCT1..IDCT8, which reads that scratch area and writes to blockq.
        ;
        ; NOTE(review): the last Z_COND_IDCT argument is a macro-local label.
        ; Judging by which inputs IDCT6/IDCT7/IDCT8 actually read, the branch
        ; appears to be taken when the tested source group is entirely zero -
        ; confirm against the Z_COND_IDCT definition.
        ;
        ; Requirements visible here: blockq must be defined, and 128 bytes at
        ; rsp + 0 .. rsp + 127 must be usable as scratch.
        ; Every path ends at %%9, so code following the expansion always runs.
    707    DC_COND_IDCT  0,   8,  16,  24, rsp +  0, null, 11
    708    Z_COND_IDCT  32,  40,  48,  56, rsp + 32, null, 11, %%4
    709    Z_COND_IDCT  64,  72,  80,  88, rsp + 64, null, 11, %%2
    710    Z_COND_IDCT  96, 104, 112, 120, rsp + 96, null, 11, %%1
    711 
           ; No branch taken. Four expansions, each advancing the scratch
           ; offset by 8 bytes and the destination by 4 bytes.
    712    IDCT1 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    713    IDCT1 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    714    IDCT1 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    715    IDCT1 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    716    jmp %%9
    717 
    718    ALIGN 16
           ; Reached from the Z_COND_IDCT on source offsets 32..56; the two
           ; remaining groups are still tested here.
    719    %%4:
    720    Z_COND_IDCT 64,  72,  80,  88, rsp + 64, null, 11, %%6
    721    Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%5
    722 
    723    IDCT2 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    724    IDCT2 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    725    IDCT2 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    726    IDCT2 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    727    jmp %%9
    728 
    729    ALIGN 16
           ; Reached from %%4's test of source offsets 64..88.
    730    %%6:
    731    Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%7
    732 
    733    IDCT3 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    734    IDCT3 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    735    IDCT3 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    736    IDCT3 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    737    jmp %%9
    738 
    739    ALIGN 16
           ; Reached from the top-level test of source offsets 64..88.
    740    %%2:
    741    Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%3
    742 
    743    IDCT4 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    744    IDCT4 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    745    IDCT4 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    746    IDCT4 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    747    jmp %%9
    748 
    749    ALIGN 16
           ; Reached from %%2's test of source offsets 96..120.
    750    %%3:
    751 
    752    IDCT5 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    753    IDCT5 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    754    IDCT5 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    755    IDCT5 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    756    jmp %%9
    757 
    758    ALIGN 16
           ; Reached from %%4's test of source offsets 96..120.
           ; IDCT6 takes bare address expressions and handles two quadwords
           ; per expansion, hence two expansions stepping 16 / 8 bytes.
    759    %%5:
    760 
    761    IDCT6 rsp +  0, rsp + 64, rsp + 32, rsp +  96, blockq +  0, 20
    762    IDCT6 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq +  8, 20
    763    jmp %%9
    764 
    765    ALIGN 16
           ; Reached from the top-level test of source offsets 96..120.
           ; IDCT7 does not read its fourth argument.
    766    %%1:
    767 
    768    IDCT7 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    769    IDCT7 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    770    IDCT7 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    771    IDCT7 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    772    jmp %%9
    773 
    774    ALIGN 16
           ; Reached from %%6's test of source offsets 96..120.
           ; IDCT8 reads only its first argument; like IDCT6 it takes bare
           ; address expressions and is expanded twice. Falls through to %%9.
    775    %%7:
    776 
    777    IDCT8 rsp +  0, rsp + 64, rsp + 32, rsp +  96, blockq +  0, 20
    778    IDCT8 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq +  8, 20
    779 
           ; Common exit for every path.
    780    %%9:
    781 %endmacro
    782 
    783 %macro PUT_PIXELS_CLAMPED_HALF 1
        ; PUT_PIXELS_CLAMPED_HALF byte_offset
        ;
        ; Converts four consecutive mmsize-byte vectors of 16-bit values at
        ; blockq + %1 to bytes with unsigned saturation (packuswb) and stores
        ; them as four 8-byte rows at pixelsq, pixelsq + lsizeq,
        ; pixelsq + 2*lsizeq and pixelsq + lsize3q.
        ; Each packed register holds two rows: movq stores its low 8 bytes and
        ; movhps its high 8 bytes, so this layout needs 16-byte registers (the
        ; file switches to INIT_XMM before the only user).
        ; lsize3q must already hold the offset of the fourth row; the caller
        ; in this file sets it to 3 * lsizeq.
        ; Clobbers m0 and m1.
        ; NOTE(review): mova and the packuswb memory operands presumably need
        ; blockq + %1 to be 16-byte aligned - confirm with the callers.
    784    mova     m0, [blockq+mmsize*0+%1]   ; vector 0 (words)
    785    mova     m1, [blockq+mmsize*2+%1]   ; vector 2 (words)
    786    packuswb m0, [blockq+mmsize*1+%1]   ; m0 = bytes(vector 0) | bytes(vector 1)
    787    packuswb m1, [blockq+mmsize*3+%1]   ; m1 = bytes(vector 2) | bytes(vector 3)
    788    movq           [pixelsq], m0
    789    movhps  [lsizeq+pixelsq], m0
    790    movq  [2*lsizeq+pixelsq], m1
    791    movhps [lsize3q+pixelsq], m1
    792 %endmacro
    793 
    794 %macro ADD_PIXELS_CLAMPED 1
        ; ADD_PIXELS_CLAMPED byte_offset
        ;
        ; Adds two mmsize-byte vectors of 16-bit values at blockq + %1 to the
        ; two 8-byte pixel rows at pixelsq and pixelsq + lsizeq, and writes the
        ; sums back to those rows as bytes with unsigned saturation.
        ; Requires m4 == 0 on entry: it is the zero source that punpcklbw
        ; interleaves with the pixel bytes to widen them to words (the caller
        ; in this file clears it with pxor before the first expansion).
        ; The movq/movhps pair stores the low and high 8 bytes of m0, so this
        ; layout needs 16-byte registers.
        ; Clobbers m0-m3.
        ; NOTE(review): mova presumably needs blockq + %1 to be 16-byte
        ; aligned - confirm with the callers.
    795    mova       m0, [blockq+mmsize*0+%1]
    796    mova       m1, [blockq+mmsize*1+%1]
    797    movq       m2, [pixelsq]            ; 8 pixels of the first row
    798    movq       m3, [pixelsq+lsizeq]     ; 8 pixels of the second row
    799    punpcklbw  m2, m4                   ; bytes -> words (high byte from m4)
    800    punpcklbw  m3, m4
    801    paddsw     m0, m2                   ; signed saturating word add
    802    paddsw     m1, m3
    803    packuswb   m0, m1                   ; low half = first row, high half = second row
    804    movq       [pixelsq], m0
    805    movhps     [pixelsq+lsizeq], m0
    806 %endmacro
    807 
    808 INIT_MMX mmx
    809 
        ; simple_idct(block)
        ;
        ; Expands IDCT, whose second stage writes its results to blockq + 0 ..
        ; blockq + 127, i.e. the transform result replaces the block contents.
        ; cglobal arguments (x86inc convention): 1 argument, 2 general-purpose
        ; registers (block, t0), 8 vector registers, 128 bytes of stack. The
        ; 128 bytes are the rsp-relative scratch area that IDCT reads.
        ; t0 is not referenced in this function body - presumably it is used
        ; inside the first-stage macros; confirm against their definitions.
    810 cglobal simple_idct, 1, 2, 8, 128, block, t0
    811    IDCT
    812    emms                                ; leave MMX state before returning
    813 RET
    814 
    815 INIT_XMM sse2
    816 
        ; simple_idct_put(pixels, lsize, block)
        ;
        ; Expands IDCT (results land in blockq + 0 .. blockq + 127), then
        ; stores those 128 bytes of 16-bit values as eight rows of 8 bytes at
        ; pixelsq, one row every lsizeq bytes, with unsigned saturation.
        ; cglobal arguments (x86inc convention): 3 arguments, 5 general-purpose
        ; registers (pixels, lsize, block, lsize3, t0), 8 vector registers,
        ; 128 bytes of stack used by IDCT as rsp-relative scratch.
        ; NOTE(review): the IDCT6/7/8 bodies use mm registers explicitly and
        ; there is no emms on this path - presumably callers are expected to
        ; clear MMX state themselves; confirm.
    817 cglobal simple_idct_put, 3, 5, 8, 128, pixels, lsize, block, lsize3, t0
    818    IDCT
    819    lea lsize3q, [lsizeq*3]             ; offset of the fourth row
    820    PUT_PIXELS_CLAMPED_HALF 0           ; block bytes 0..63  -> rows 0-3
    821    lea pixelsq, [pixelsq+lsizeq*4]     ; advance four rows
    822    PUT_PIXELS_CLAMPED_HALF 64          ; block bytes 64..127 -> rows 4-7
    823 RET
    824 
    825 cglobal simple_idct_add, 3, 4, 8, 128, pixels, lsize, block, t0
        ; simple_idct_add(pixels, lsize, block)
        ;
        ; Expands IDCT (results land in blockq + 0 .. blockq + 127), then adds
        ; those 16-bit values to the eight 8-byte rows at pixelsq (one row
        ; every lsizeq bytes) and writes the sums back with unsigned
        ; saturation. Each ADD_PIXELS_CLAMPED expansion handles two rows, so
        ; pixelsq advances by 2 * lsizeq between expansions.
        ; cglobal arguments (x86inc convention): 3 arguments, 4 general-purpose
        ; registers (pixels, lsize, block, t0), 8 vector registers, 128 bytes
        ; of stack used by IDCT as rsp-relative scratch.
        ; NOTE(review): the IDCT6/7/8 bodies use mm registers explicitly and
        ; there is no emms on this path - presumably callers are expected to
        ; clear MMX state themselves; confirm.
    826    IDCT
    827    pxor       m4, m4                   ; zero register required by ADD_PIXELS_CLAMPED
    828    ADD_PIXELS_CLAMPED 0                ; rows 0-1
    829    lea        pixelsq, [pixelsq+lsizeq*2]
    830    ADD_PIXELS_CLAMPED 32               ; rows 2-3
    831    lea        pixelsq, [pixelsq+lsizeq*2]
    832    ADD_PIXELS_CLAMPED 64               ; rows 4-5
    833    lea        pixelsq, [pixelsq+lsizeq*2]
    834    ADD_PIXELS_CLAMPED 96               ; rows 6-7
    835 RET
    836 %endif