tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

imdct36.asm (17862B)


      1 ;******************************************************************************
      2 ;* 36 point SSE-optimized IMDCT transform
      3 ;* Copyright (c) 2011 Vitor Sessak
      4 ;*
      5 ;* This file is part of FFmpeg.
      6 ;*
      7 ;* FFmpeg is free software; you can redistribute it and/or
      8 ;* modify it under the terms of the GNU Lesser General Public
      9 ;* License as published by the Free Software Foundation; either
     10 ;* version 2.1 of the License, or (at your option) any later version.
     11 ;*
     12 ;* FFmpeg is distributed in the hope that it will be useful,
     13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     15 ;* Lesser General Public License for more details.
     16 ;*
     17 ;* You should have received a copy of the GNU Lesser General Public
     18 ;* License along with FFmpeg; if not, write to the Free Software
     19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     20 ;******************************************************************************
     21 
     22 %include "libavutil/x86/x86util.asm"
     23 
     24 SECTION_RODATA
     25 
        ; Lane masks applied with andps in imdct36_float: a ~0 dword keeps the lane,
        ; a 0 dword clears it. The first dword listed is lane 0.
     26 ps_mask:  dd 0, ~0, ~0, ~0
     27 ps_mask2: dd 0, ~0,  0, ~0
     28 ps_mask3: dd 0,  0,  0, ~0
     29 ps_mask4: dd 0, ~0,  0,  0
     30 
        ; Multipliers used with mulps in the "Populate tmp[]" stage of imdct36_float.
        ; Each row holds two coefficients, each duplicated into two adjacent lanes.
     31 ps_val1:  dd          -0.5,          -0.5, -0.8660254038, -0.8660254038
     32 ps_val2:  dd           1.0,           1.0,  0.8660254038,  0.8660254038
     33 ps_val3:  dd  0.1736481777,  0.1736481777,  0.3420201433,  0.3420201433
     34 ps_val4:  dd -0.7660444431, -0.7660444431,  0.8660254038,  0.8660254038
     35 ps_val5:  dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530
     36 ps_val6:  dd           0.5,           0.5, -0.6427876097, -0.6427876097
     37 ps_val7:  dd           1.0,           1.0, -0.6427876097, -0.6427876097
     38 
        ; Sign-bit masks for xorps: 0x80000000 flips the sign of that lane.
        ; p1p1m1m1 negates lanes 2 and 3; p1m1p1m1 negates lanes 1 and 3.
     39 ps_p1p1m1m1: dd 0,          0, 0x80000000, 0x80000000
     40 ps_p1m1p1m1: dd 0, 0x80000000,          0, 0x80000000
     41 
        ; Five 16-byte rows of butterfly multipliers. BUTTERF/BUTTERF2 select a row
        ; through the byte offset (0, 16, 32, 48, 64) passed as their third argument.
     42 ps_cosh:       dd 1.0, 0.50190991877167369479,  1.0,  5.73685662283492756461
     43               dd 1.0, 0.51763809020504152469,  1.0,  1.93185165257813657349
     44               dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896
     45               dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991
     46               dd 1.0, 0.70710678118654752439,  0.0,  0.0
     47 
        ; Same rows as ps_cosh with lanes 1 and 3 negated (the 0.0 entries are left
        ; as 0.0). Used on the sse3 path, where addsubps replaces the
        ; xorps [ps_p1m1p1m1] + addps pair of the non-sse3 path.
     48 ps_cosh_sse3:  dd 1.0, -0.50190991877167369479,  1.0, -5.73685662283492756461
     49               dd 1.0, -0.51763809020504152469,  1.0, -1.93185165257813657349
     50               dd 1.0, -0.55168895948124587824, -1.0,  1.18310079157624925896
     51               dd 1.0, -0.61038729438072803416, -1.0,  0.87172339781054900991
     52               dd 1.0, -0.70710677118654752439,  0.0,  0.0
     53 
        ; 18 constants, each replicated into all four lanes by "times 4".
        ; four_imdct36_float reads entry k as [costabs + 16*k], k = 0..17.
     54 costabs:  times 4 dd  0.98480773
     55          times 4 dd  0.93969262
     56          times 4 dd  0.86602539
     57          times 4 dd -0.76604444
     58          times 4 dd -0.64278764
     59          times 4 dd  0.50000000
     60          times 4 dd -0.50000000
     61          times 4 dd -0.34202015
     62          times 4 dd -0.17364818
     63          times 4 dd  0.50190992
     64          times 4 dd  0.51763808
     65          times 4 dd  0.55168896
     66          times 4 dd  0.61038726
     67          times 4 dd  0.70710677
     68          times 4 dd  0.87172341
     69          times 4 dd  1.18310082
     70          times 4 dd  1.93185163
     71          times 4 dd  5.73685646
     72 
        ; Scale factor for output addressing in imdct36_float: consecutive output
        ; samples are written 4*SBLIMIT bytes apart (see the STORE invocations).
     73 %define SBLIMIT 32
     74 SECTION .text
     75 
     76 %macro PSHUFD 3
           ; PSHUFD dst, src, imm8
           ; Writes the four dwords of src to dst, permuted by imm8.
           ; Builds with sse2 but without avx use the integer pshufd; every other
           ; build uses shufps with src as both inputs, which gives the same
           ; permutation.
     77 %if cpuflag(sse2) && notcpuflag(avx)
     78    pshufd %1, %2, %3
     79 %else
     80    shufps %1, %2, %2, %3
     81 %endif
     82 %endmacro
     83 
     84 ; input  %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
     85 ; output %1={x3,x4,y1,y2}
        ; BUILDINVHIGHLOW dst, x, y: high half of x followed by low half of y.
        ; The avx path is a single shufps. The other path fills dst one half at a
        ; time (high half from y first, then low half from x), so dst must not be
        ; the same register as x there: x's high half would be overwritten before
        ; it is read.
     86 %macro BUILDINVHIGHLOW 3
     87 %if cpuflag(avx)
     88    shufps %1, %2, %3, 0x4e
     89 %else
     90    movlhps %1, %3
     91    movhlps %1, %2
     92 %endif
     93 %endmacro
     94 
     95 ; input  %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
     96 ; output %1={x4,y1,y2,y3}
        ; ROTLEFT dst, x, y: last lane of x followed by the first three lanes of y.
        ; ssse3 and later: one palignr by 12 bytes.
        ; Otherwise: BUILDINVHIGHLOW forms {x3,x4,y1,y2}, then shufps 0x99 picks
        ; lanes 1,2 of that intermediate and lanes 1,2 of y.
     97 %macro ROTLEFT 3
     98 %if cpuflag(ssse3)
     99    palignr  %1, %3, %2, 12
    100 %else
    101    BUILDINVHIGHLOW %1, %2, %3
    102    shufps  %1, %1, %3, 0x99
    103 %endif
    104 %endmacro
    105 
    106 %macro INVERTHL 2
           ; INVERTHL dst, src
           ; dst = src with its two 64-bit halves exchanged:
           ; {x1,x2,x3,x4} -> {x3,x4,x1,x2}.
           ; The fallback branch builds the same result with movhlps + movlhps.
    107 %if cpuflag(sse2)
    108    PSHUFD  %1, %2, 0x4e
    109 %else
    110    movhlps %1, %2
    111    movlhps %1, %2
    112 %endif
    113 %endmacro
    114 
    115 %macro BUTTERF 3
           ; BUTTERF reg, tmp, rowoffset
           ; Two butterfly passes on reg = {a0,a1,a2,a3}; tmp is clobbered.
           ;   pass 1: s = {a0+a2, a1+a3, a0-a2, a1-a3}
           ;   scale : b = s * (row of ps_cosh at byte offset rowoffset)
           ;   pass 2: reg = {b0+b1, b0-b1, b2+b3, b2-b3}
           ; Pass 1: tmp gets the half-swapped copy, reg gets lanes 2,3 negated.
    116    INVERTHL %2, %1
    117    xorps    %1, [ps_p1p1m1m1]
    118    addps    %1, %2
    119 %if cpuflag(sse3)
           ; ps_cosh_sse3 already carries the negation of lanes 1 and 3, so
           ; addsubps (subtract in even lanes, add in odd lanes) yields pass 2.
    120    mulps    %1, %1, [ps_cosh_sse3 + %3]
    121    PSHUFD   %2, %1, 0xb1
    122    addsubps %1, %1, %2
    123 %else
           ; tmp = b with adjacent lanes swapped; reg gets lanes 1,3 negated.
    124    mulps    %1, [ps_cosh + %3]
    125    PSHUFD   %2, %1, 0xb1
    126    xorps    %1, [ps_p1m1p1m1]
    127    addps    %1, %2
    128 %endif
    129 %endmacro
    130 
    131 %macro BUTTERF2 3
           ; BUTTERF2 reg, tmp, rowoffset
           ; Second butterfly pass only, and only on the low pair; tmp is clobbered.
           ;   b   = reg * (row of ps_cosh at byte offset rowoffset)
           ;   reg = {b0+b1, b0-b1, ..}
           ; Selector 0xe1 swaps lanes 0 and 1 and leaves lanes 2 and 3 in place, so
           ; the upper two result lanes are not a butterfly. The only invocation in
           ; this file passes row offset 64, whose upper two multipliers are 0.0.
    132 %if cpuflag(sse3)
    133    mulps    %1, %1, [ps_cosh_sse3 + %3]
    134    PSHUFD   %2, %1, 0xe1
    135    addsubps %1, %1, %2
    136 %else
    137    mulps    %1, [ps_cosh + %3]
    138    PSHUFD   %2, %1, 0xe1
    139    xorps    %1, [ps_p1m1p1m1]
    140    addps    %1, %2
    141 %endif
    142 %endmacro
    143 
    144 %macro STORE 4
           ; STORE src, tmp, base, stride
           ; Scatters the four floats of src: lane k goes to [base + k*stride].
           ; sse4 path: src is left intact and tmp is not used.
           ; Other path: tmp is clobbered and src is left with adjacent lanes swapped.
    145 %if cpuflag(sse4)
    146    movss     [%3       ], %1
    147    extractps dword [%3 +   %4], %1, 1
    148    extractps dword [%3 + 2*%4], %1, 2
    149    extractps dword [%3 + 3*%4], %1, 3
    150 %else
           ; tmp low lane = lane 2 of src
    151    movhlps %2, %1
    152    movss   [%3       ], %1
    153    movss   [%3 + 2*%4], %2
           ; swap adjacent lanes so that lanes 1 and 3 move to positions 0 and 2
    154    shufps  %1, %1, 0xb1
    155    movss   [%3 +   %4], %1
    156    movhlps %2, %1
    157    movss   [%3 + 3*%4], %2
    158 %endif
    159 %endmacro
    160 
    161 %macro LOAD 4
           ; LOAD dst, tmp, base, stride
           ; Gathers four strided floats: lane k of dst = dword at [base + k*stride].
           ; tmp is clobbered. Each movlps/movhps reads 8 bytes, so the dword that
           ; follows every element is read as well and then dropped by the shufps.
    162    movlps  %1, [%3       ]
    163    movhps  %1, [%3 +   %4]
    164    movlps  %2, [%3 + 2*%4]
    165    movhps  %2, [%3 + 3*%4]
           ; 0x88 keeps lanes 0 and 2 of each register
    166    shufps  %1, %2, 0x88
    167 %endmacro
    168 
    169 %macro LOADA64 2
          ; LOADA64 dst, addr
          ; Loads 16 bytes from addr into dst: one movu on avx builds, two 8-byte
          ; halves (movlps + movhps) otherwise.
          ; NOTE(review): the name suggests addr is only assumed 8-byte aligned --
          ; confirm against the callers' buffer alignment.
    170 %if cpuflag(avx)
    171   movu     %1, [%2]
    172 %else
    173   movlps   %1, [%2]
    174   movhps   %1, [%2 + 8]
    175 %endif
    176 %endmacro
    177 
    178 %macro DEFINE_IMDCT 0
           ; Expands to imdct36_float(out, buf, in, win) for the instruction set
           ; selected by the preceding INIT_XMM (36-point IMDCT, per the file header).
           ;   in  : 18 floats, read as 72 contiguous bytes starting at inq.
           ;   win : floats 0..17 scale the values added into out, floats 20..37
           ;         scale the values saved to buf; the 16-byte reads reach float 39.
           ;   buf : 18 floats at a 16-byte stride (bufq + 16*k); each one is added
           ;         into the matching output and then overwritten.
           ;   out : 18 floats at a stride of 4*SBLIMIT bytes (outq + k*4*SBLIMIT).
           ; cglobal arguments 4,4,9 are presumably args/GPRs/XMM registers as in
           ; x86inc -- m8 is referenced only inside %if ARCH_X86_64.
    179 cglobal imdct36_float, 4,4,9, out, buf, in, win
    180 
    181    ; for(i=17;i>=1;i--) in[i] += in[i-1];
           ; m0..m3 = in[0..15], m4 = {in[16], in[17], 0, 0}
    182    LOADA64 m0, inq
    183    LOADA64 m1, inq + 16
    184 
    185    ROTLEFT m5, m0, m1
    186 
           ; m6 = {0, in[0], in[1], in[2]}: in[0] has no predecessor
    187    PSHUFD  m6, m0, 0x93
    188    andps   m6, m6, [ps_mask]
    189    addps   m0, m0, m6
    190 
    191    LOADA64 m2, inq + 32
    192 
    193    ROTLEFT m7, m1, m2
    194 
    195    addps   m1, m1, m5
    196    LOADA64 m3, inq + 48
    197 
    198    ROTLEFT m5, m2, m3
    199 
    200    xorps   m4, m4, m4
    201    movlps  m4, [inq+64]
           ; m6 = {in[15], in[16], 0, 0}
    202    BUILDINVHIGHLOW m6, m3, m4
    203    shufps  m6, m6, m4, 0xa9
    204 
    205    addps   m4, m4, m6
    206    addps   m2, m2, m7
    207    addps   m3, m3, m5
    208 
    209    ; for(i=17;i>=3;i-=2) in[i] += in[i-2];
           ; Each addend vector is built from its source register before that
           ; register is updated, which matches the descending loop order.
    210    movlhps m5, m5, m0
    211    andps   m5, m5, [ps_mask3]
    212 
    213    BUILDINVHIGHLOW m7, m0, m1
    214    andps   m7, m7, [ps_mask2]
    215 
    216    addps   m0, m0, m5
    217 
    218    BUILDINVHIGHLOW m6, m1, m2
    219    andps   m6, m6, [ps_mask2]
    220 
    221    addps  m1, m1, m7
    222 
    223    BUILDINVHIGHLOW m7, m2, m3
    224    andps   m7, m7, [ps_mask2]
    225 
    226    addps   m2, m2, m6
    227 
    228    movhlps m6, m6, m3
    229    andps   m6, m6, [ps_mask4]
    230 
    231    addps  m3, m3, m7
    232    addps  m4, m4, m6
    233 
    234    ; Populate tmp[]
    235    movlhps m6, m1, m5    ; zero out high values
    236    subps   m6, m6, m4
    237 
    238    subps  m5, m0, m3
    239 
           ; x86_64 only: keep m0-m3 under the name m8 so it can be reused below;
           ; the 32-bit build recomputes it instead.
    240 %if ARCH_X86_64
    241    SWAP   m5, m8
    242 %endif
    243 
    244    mulps  m7, m2, [ps_val1]
    245 
    246 %if ARCH_X86_64
    247    mulps  m5, m8, [ps_val2]
    248 %else
    249    mulps  m5, m5, [ps_val2]
    250 %endif
    251    addps  m7, m7, m5
    252 
    253    mulps  m5, m6, [ps_val1]
    254    subps  m7, m7, m5
    255 
           ; m5 = m0 - m3 again: renamed back on x86_64, recomputed on x86_32
    256 %if ARCH_X86_64
    257    SWAP   m5, m8
    258 %else
    259    subps  m5, m0, m3
    260 %endif
    261 
    262    subps  m5, m5, m6
    263    addps  m5, m5, m2
    264 
    265    shufps m6, m4, m3, 0xe4
    266    subps  m6, m6, m2
    267    mulps  m6, m6, [ps_val3]
    268 
    269    addps  m4, m4, m1
    270    mulps  m4, m4, [ps_val4]
    271 
    272    shufps m1, m1, m0, 0xe4
    273    addps  m1, m1, m2
    274    mulps  m1, m1, [ps_val5]
    275 
    276    mulps  m3, m3, [ps_val6]
    277    mulps  m0, m0, [ps_val7]
    278    addps  m0, m0, m3
    279 
    280    xorps  m2, m1, [ps_p1p1m1m1]
    281    subps  m2, m2, m4
    282    addps  m2, m2, m0
    283 
    284    addps  m3, m4, m0
    285    subps  m3, m3, m6
    286    xorps  m3, m3, [ps_p1p1m1m1]
    287 
    288    shufps m0, m0, m4, 0xe4
    289    subps  m0, m0, m1
    290    addps  m0, m0, m6
    291 
    292    BUILDINVHIGHLOW m4, m2, m3
    293    shufps  m3, m3, m2, 0x4e
    294 
    295    ; we have tmp = {SWAPLH(m0), SWAPLH(m7), m3, m4, m5}
    296 
           ; third argument = byte offset of the ps_cosh / ps_cosh_sse3 row
    297    BUTTERF  m0, m1, 0
    298    BUTTERF  m7, m2, 16
    299    BUTTERF  m3, m6, 32
    300    BUTTERF  m4, m1, 48
    301    BUTTERF2 m5, m1, 64
    302 
    303    ; permutation:
    304    ; m0    0  1  2  3     =>     2  6 10 14   m1
    305    ; m7    4  5  6  7     =>     3  7 11 15   m2
    306    ; m3    8  9 10 11     =>    17 13  9  5   m3
    307    ; m4   12 13 14 15     =>    16 12  8  4   m5
    308    ; m5   16 17 xx xx     =>     0  1 xx xx   m0
    309 
    310    unpckhps m1, m0, m7
    311    unpckhps m6, m3, m4
    312    movhlps  m2, m6, m1
    313    movlhps  m1, m1, m6
    314 
    315    unpcklps m5, m5, m4
    316    unpcklps m3, m3, m7
    317    movhlps  m4, m3, m5
    318    movlhps  m5, m5, m3
    319    SWAP m4, m3
    320    ; permutation done
    321 
           ; Windowing and overlap, first half: each result is multiplied by win,
           ; the saved value from buf is added, and the sum is written to out.
           ; Outputs 16 and 17 are handled as scalars from lanes 0 and 1.
    322    PSHUFD  m6, m2, 0xb1
    323    movss   m4, [bufq + 4*68]
    324    movss   m7, [bufq + 4*64]
    325    unpcklps  m7, m7, m4
    326    mulps   m6, m6, [winq + 16*4]
    327    addps   m6, m6, m7
    328    movss   [outq + 64*SBLIMIT], m6
    329    shufps  m6, m6, m6, 0xb1
    330    movss   [outq + 68*SBLIMIT], m6
    331 
    332    mulps   m6, m3, [winq + 4*4]
    333    LOAD    m4, m7, bufq + 4*16, 16
    334    addps   m6, m6, m4
    335    STORE   m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT
    336 
    337    shufps  m4, m0, m3, 0xb5
    338    mulps   m4, m4, [winq + 8*4]
    339    LOAD    m7, m6, bufq + 4*32, 16
    340    addps   m4, m4, m7
    341    STORE   m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT
    342 
    343    shufps  m3, m3, m2, 0xb1
    344    mulps   m3, m3, [winq + 12*4]
    345    LOAD    m7, m6, bufq + 4*48, 16
    346    addps   m3, m3, m7
    347    STORE   m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT
    348 
    349    mulps   m2, m2, [winq]
    350    LOAD    m6, m7, bufq, 16
    351    addps   m2, m2, m6
    352    STORE   m2, m7, outq, 4*SBLIMIT
    353 
           ; Second half: results multiplied by win floats 20 and up are stored
           ; back into buf, overwriting the values consumed above.
    354    mulps    m4, m1, [winq + 20*4]
    355    STORE    m4, m7, bufq, 16
    356 
    357    mulps    m3, m5, [winq + 24*4]
    358    STORE    m3, m7, bufq + 4*16, 16
    359 
    360    shufps   m0, m0, m5, 0xb0
    361    mulps    m0, m0, [winq + 28*4]
    362    STORE    m0, m7, bufq + 4*32, 16
    363 
    364    shufps   m5, m5, m1, 0xb1
    365    mulps    m5, m5, [winq + 32*4]
    366    STORE    m5, m7, bufq + 4*48, 16
    367 
    368    shufps   m1, m1, m1, 0xb1
    369    mulps    m1, m1, [winq + 36*4]
    370    movss    [bufq + 4*64], m1
    371    shufps   m1, m1, 0xb1
    372    movss    [bufq + 4*68], m1
    373    RET
    374 %endmacro
    375 
    376 INIT_XMM sse2
        ; Each INIT_XMM / DEFINE_IMDCT pair expands imdct36_float once, with the
        ; cpuflag() tests in the helper macros evaluated for that instruction set.
        ; NOTE(review): x86inc's cglobal is expected to give each expansion a
        ; CPU-specific symbol name -- confirm against x86inc.asm.
    377 DEFINE_IMDCT
    378 
    379 INIT_XMM sse3
    380 DEFINE_IMDCT
    381 
    382 INIT_XMM ssse3
    383 DEFINE_IMDCT
    384 
        ; The avx variant is only assembled when HAVE_AVX_EXTERNAL is set.
    385 %if HAVE_AVX_EXTERNAL
    386 INIT_XMM avx
    387 DEFINE_IMDCT
    388 %endif
    389 
    390 INIT_XMM sse
    391 
        ; Spill helpers for four_imdct36_float, which addresses "slots" 8..15.
        ;   SPILL   r, s : park register m<r> in slot s
        ;   UNSPILL r, s : bring slot s back into register m<r>
        ;   SPILLED(s)   : operand naming slot s (a register or a memory reference)
        ; x86_64: both helpers are SWAP, so a slot is simply register m<s> and the
        ;   two names are exchanged; nothing is copied.
        ; x86_32: a slot is 16 bytes of memory at tmpq + (s-8)*16 + 32*4, i.e.
        ;   tmpq+128 .. tmpq+255, accessed with movaps.
        ; Portable rule that follows from the two definitions: treat m<r> as dead
        ; after SPILL and treat the slot as dead after UNSPILL.
    392 %if ARCH_X86_64
    393 %define SPILL SWAP
    394 %define UNSPILL SWAP
    395 %define SPILLED(x) m %+ x
    396 %else
    397 %define SPILLED(x) [tmpq+(x-8)*16 + 32*4]
    398 %macro SPILL 2 ; xmm#, mempos
    399    movaps SPILLED(%2), m%1
    400 %endmacro
    401 %macro UNSPILL 2
    402    movaps m%1, SPILLED(%2)
    403 %endmacro
    404 %endif
    405 
    406 %macro DEFINE_FOUR_IMDCT 0
           ; Expands to four_imdct36_float(out, buf, in, win, tmp): four transforms
           ; processed side by side, one per XMM lane.
           ;   in  : four blocks 72 bytes apart; blocks 0 and 2 are read with mova,
           ;         blocks 1 and 3 with movu.
           ;   win : 16-byte vectors at winq+4*k; k = 0,4,..,68 feed out and
           ;         k = 80,84,..,148 feed buf.
           ;   buf : 18 vectors at bufq+4*k, k = 0,4,..,68; each is added into out
           ;         and then overwritten.
           ;   out : 18 vectors written with mova at outq + 128*k bytes, k = 0..17.
           ;   tmp : scratch vectors at tmpq+4*k, k = 0,4,..,28, accessed with mova;
           ;         on x86_32 the SPILLED() slots use tmpq+128..tmpq+255 as well.
    407 cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
           ; After the two shuffles: m0 = floats 16 and m5 = floats 17 of the four
           ; blocks (lane n = block n).
    408    movlps  m0, [inq+64]
    409    movhps  m0, [inq+64 +   72]
    410    movlps  m3, [inq+64 + 2*72]
    411    movhps  m3, [inq+64 + 3*72]
    412 
    413    shufps  m5, m0, m3, 0xdd
    414    shufps  m0, m0, m3, 0x88
    415 
           ; Floats 12..15 of each block. TRANSPOSE4x4PS is not defined in this
           ; file (presumably x86util.asm); it is presumably a 4x4 transpose of
           ; the first four registers with the fifth as scratch -- confirm there.
    416    mova     m1, [inq+48]
    417    movu     m6, [inq+48 +   72]
    418    mova     m7, [inq+48 + 2*72]
    419    movu     m3, [inq+48 + 3*72]
    420 
    421    TRANSPOSE4x4PS 1, 6, 7, 3, 4
    422 
    423    addps   m4, m6, m7
    424    mova    [tmpq+4*28], m4
    425 
    426    addps    m7, m3
    427    addps    m6, m1
    428    addps    m3, m0
    429    addps    m0, m5
    430    addps    m0, m7
    431    addps    m7, m6
    432    mova    [tmpq+4*12], m7
    433    SPILL   3, 12
    434 
           ; Floats 8..11 of each block.
    435    mova     m4, [inq+32]
    436    movu     m5, [inq+32 +   72]
    437    mova     m2, [inq+32 + 2*72]
    438    movu     m7, [inq+32 + 3*72]
    439 
    440    TRANSPOSE4x4PS 4, 5, 2, 7, 3
    441 
    442    addps   m1, m7
    443    SPILL   1, 11
    444 
    445    addps   m3, m5, m2
    446    SPILL   3, 13
    447 
    448    addps    m7, m2
    449    addps    m5, m4
    450    addps    m6, m7
    451    mova    [tmpq], m6
    452    addps   m7, m5
    453    mova    [tmpq+4*16], m7
    454 
           ; Floats 4..7 of each block.
    455    mova    m2, [inq+16]
    456    movu    m7, [inq+16 +   72]
    457    mova    m1, [inq+16 + 2*72]
    458    movu    m6, [inq+16 + 3*72]
    459 
    460    TRANSPOSE4x4PS 2, 7, 1, 6, 3
    461 
    462    addps   m4, m6
    463    addps   m6, m1
    464    addps   m1, m7
    465    addps   m7, m2
    466    addps   m5, m6
    467    SPILL   5, 15
    468    addps   m6, m7
    469    mulps   m6, [costabs + 16*2]
    470    mova    [tmpq+4*8], m6
    471    SPILL   1, 10
    472    SPILL   0, 14
    473 
           ; Floats 0..3 of each block.
    474    mova    m1, [inq]
    475    movu    m6, [inq +   72]
    476    mova    m3, [inq + 2*72]
    477    movu    m5, [inq + 3*72]
    478 
    479    TRANSPOSE4x4PS 1, 6, 3, 5, 0
    480 
    481    addps    m2, m5
    482    addps    m5, m3
    483    addps    m7, m5
    484    addps    m3, m6
    485    addps    m6, m1
    486    SPILL    7, 8
    487    addps    m5, m6
    488    SPILL    6, 9
           ; From here on the code combines the sums above with costabs entries;
           ; intermediate vectors live in tmp and in the SPILLED() slots.
    489    addps    m6, m4, SPILLED(12)
    490    subps    m6, m2
    491    UNSPILL  7, 11
    492    SPILL    5, 11
    493    subps    m5, m1, m7
    494    mulps    m7, [costabs + 16*5]
    495    addps    m7, m1
    496    mulps    m0, m6, [costabs + 16*6]
    497    addps    m0, m5
    498    mova     [tmpq+4*24], m0
    499    addps    m6, m5
    500    mova     [tmpq+4*4], m6
    501    addps    m6, m4, m2
    502    mulps    m6, [costabs + 16*1]
    503    subps    m4, SPILLED(12)
    504    mulps    m4, [costabs + 16*8]
    505    addps    m2, SPILLED(12)
    506    mulps    m2, [costabs + 16*3]
    507    subps    m5, m7, m6
    508    subps    m5, m2
    509    addps    m6, m7
    510    addps    m6, m4
    511    addps    m7, m2
    512    subps    m7, m4
    513    mova     [tmpq+4*20], m7
    514    mova     m2, [tmpq+4*28]
    515    mova     [tmpq+4*28], m5
    516    UNSPILL  7, 13
    517    subps    m5, m7, m2
    518    mulps    m5, [costabs + 16*7]
    519    UNSPILL  1, 10
    520    mulps    m1, [costabs + 16*2]
    521    addps    m4, m3, m2
    522    mulps    m4, [costabs + 16*4]
    523    addps    m2, m7
    524    addps    m7, m3
    525    mulps    m7, [costabs]
    526    subps    m3, m2
    527    mulps    m3, [costabs + 16*2]
    528    addps    m2, m7, m5
    529    addps    m2, m1
    530    SPILL    2, 10
    531    addps    m7, m4
    532    subps    m7, m1
    533    SPILL    7, 12
    534    subps    m5, m4
    535    subps    m5, m1
    536    UNSPILL  0, 14
    537    SPILL    5, 13
    538    addps    m1, m0, SPILLED(15)
    539    subps    m1, SPILLED(8)
    540    mova     m4, [costabs + 16*5]
    541    mulps    m4, [tmpq]
    542    UNSPILL  2, 9
    543    addps    m4, m2
    544    subps    m2, [tmpq]
    545    mulps    m5, m1, [costabs + 16*6]
    546    addps    m5, m2
    547    SPILL    5, 9
    548    addps    m2, m1
    549    SPILL    2, 14
    550    UNSPILL  5, 15
    551    subps    m7, m5, m0
    552    addps    m5, SPILLED(8)
    553    mulps    m5, [costabs + 16*1]
    554    mulps    m7, [costabs + 16*8]
    555    addps    m0, SPILLED(8)
    556    mulps    m0, [costabs + 16*3]
    557    subps    m2, m4, m5
    558    subps    m2, m0
    559    SPILL    2, 15
    560    addps    m5, m4
    561    addps    m5, m7
    562    addps    m4, m0
    563    subps    m4, m7
    564    SPILL    4, 8
    565    mova     m7, [tmpq+4*16]
    566    mova     m2, [tmpq+4*12]
    567    addps    m0, m7, m2
    568    subps    m0, SPILLED(11)
    569    mulps    m0, [costabs + 16*2]
    570    addps    m4, m7, SPILLED(11)
    571    mulps    m4, [costabs]
    572    subps    m7, m2
    573    mulps    m7, [costabs + 16*7]
    574    addps    m2, SPILLED(11)
    575    mulps    m2, [costabs + 16*4]
    576    addps    m1, m7, [tmpq+4*8]
    577    addps    m1, m4
    578    addps    m4, m2
    579    subps    m4, [tmpq+4*8]
    580    SPILL    4, 11
    581    subps    m7, m2
    582    subps    m7, [tmpq+4*8]
           ; Output stage. The same four-store pattern repeats for each pair of
           ; values: a sum and a difference are formed after scaling by a costabs
           ; entry (9..17); the difference is multiplied by win and added to buf
           ; to produce two out vectors, and the sum is multiplied by win
           ; (offsets 80 and up) and stored back into buf.
    583    addps    m4, m6, SPILLED(10)
    584    subps    m6, SPILLED(10)
    585    addps    m2, m5, m1
    586    mulps    m2, [costabs + 16*9]
    587    subps    m5, m1
    588    mulps    m5, [costabs + 16*17]
    589    subps    m1, m4, m2
    590    addps    m4, m2
    591    mulps    m2, m1, [winq+4*36]
    592    addps    m2, [bufq+4*36]
    593    mova     [outq+1152], m2
    594    mulps    m1, [winq+4*32]
    595    addps    m1, [bufq+4*32]
    596    mova     [outq+1024], m1
    597    mulps    m1, m4, [winq+4*116]
    598    mova     [bufq+4*36], m1
    599    mulps    m4, [winq+4*112]
    600    mova     [bufq+4*32], m4
    601    addps    m2, m6, m5
    602    subps    m6, m5
    603    mulps    m1, m6, [winq+4*68]
    604    addps    m1, [bufq+4*68]
    605    mova     [outq+2176], m1
    606    mulps    m6, [winq]
    607    addps    m6, [bufq]
    608    mova     [outq], m6
    609    mulps    m1, m2, [winq+4*148]
    610    mova     [bufq+4*68], m1
    611    mulps    m2, [winq+4*80]
    612    mova     [bufq], m2
    613    addps    m5, m3, [tmpq+4*24]
    614    mova     m2, [tmpq+4*24]
    615    subps    m2, m3
    616    mova     m1, SPILLED(9)
    617    subps    m1, m0
    618    mulps    m1, [costabs + 16*10]
    619    addps    m0, SPILLED(9)
    620    mulps    m0, [costabs + 16*16]
    621    addps    m6, m5, m1
    622    subps    m5, m1
    623    mulps    m3, m5, [winq+4*40]
    624    addps    m3, [bufq+4*40]
    625    mova     [outq+1280], m3
    626    mulps    m5, [winq+4*28]
    627    addps    m5, [bufq+4*28]
    628    mova     [outq+896], m5
    629    mulps    m1, m6, [winq+4*120]
    630    mova     [bufq+4*40], m1
    631    mulps    m6, [winq+4*108]
    632    mova     [bufq+4*28], m6
    633    addps    m1, m2, m0
    634    subps    m2, m0
    635    mulps    m5, m2, [winq+4*64]
    636    addps    m5, [bufq+4*64]
    637    mova     [outq+2048], m5
    638    mulps    m2, [winq+4*4]
    639    addps    m2, [bufq+4*4]
    640    mova     [outq+128], m2
    641    mulps    m0, m1, [winq+4*144]
    642    mova     [bufq+4*64], m0
    643    mulps    m1, [winq+4*84]
    644    mova     [bufq+4*4], m1
    645    mova     m1, [tmpq+4*28]
    646    mova     m5, m1
    647    addps    m1, SPILLED(13)
    648    subps    m5, SPILLED(13)
    649    UNSPILL  3, 15
    650    addps    m2, m7, m3
    651    mulps    m2, [costabs + 16*11]
    652    subps    m3, m7
    653    mulps    m3, [costabs + 16*15]
    654    addps    m0, m2, m1
    655    subps    m1, m2
           ; register rename only: from here m2 names the sum computed into m0
    656    SWAP     m0, m2
    657    mulps    m6, m1, [winq+4*44]
    658    addps    m6, [bufq+4*44]
    659    mova     [outq+1408], m6
    660    mulps    m1, [winq+4*24]
    661    addps    m1, [bufq+4*24]
    662    mova     [outq+768], m1
    663    mulps    m0, m2, [winq+4*124]
    664    mova     [bufq+4*44], m0
    665    mulps    m2, [winq+4*104]
    666    mova     [bufq+4*24], m2
    667    addps    m0, m5, m3
    668    subps    m5, m3
    669    mulps    m1, m5, [winq+4*60]
    670    addps    m1, [bufq+4*60]
    671    mova     [outq+1920], m1
    672    mulps    m5, [winq+4*8]
    673    addps    m5, [bufq+4*8]
    674    mova     [outq+256], m5
    675    mulps    m1, m0, [winq+4*140]
    676    mova     [bufq+4*60], m1
    677    mulps    m0, [winq+4*88]
    678    mova     [bufq+4*8], m0
    679    mova     m1, [tmpq+4*20]
    680    addps    m1, SPILLED(12)
    681    mova     m2, [tmpq+4*20]
    682    subps    m2, SPILLED(12)
    683    UNSPILL  7, 8
    684    subps    m0, m7, SPILLED(11)
    685    addps    m7, SPILLED(11)
    686    mulps    m4, m7, [costabs + 16*12]
    687    mulps    m0, [costabs + 16*14]
    688    addps    m5, m1, m4
    689    subps    m1, m4
    690    mulps    m7, m1, [winq+4*48]
    691    addps    m7, [bufq+4*48]
    692    mova     [outq+1536], m7
    693    mulps    m1, [winq+4*20]
    694    addps    m1, [bufq+4*20]
    695    mova     [outq+640], m1
    696    mulps    m1, m5, [winq+4*128]
    697    mova     [bufq+4*48], m1
    698    mulps    m5, [winq+4*100]
    699    mova     [bufq+4*20], m5
    700    addps    m6, m2, m0
    701    subps    m2, m0
    702    mulps    m1, m2, [winq+4*56]
    703    addps    m1, [bufq+4*56]
    704    mova     [outq+1792], m1
    705    mulps    m2, [winq+4*12]
    706    addps    m2, [bufq+4*12]
    707    mova     [outq+384], m2
    708    mulps    m0, m6, [winq+4*136]
    709    mova    [bufq+4*56], m0
    710    mulps    m6, [winq+4*92]
    711    mova     [bufq+4*12], m6
           ; Last pair: a single value scaled by costabs entry 13, combined with the
           ; vector saved at tmpq+4*4.
    712    UNSPILL  0, 14
    713    mulps    m0, [costabs + 16*13]
    714    mova     m3, [tmpq+4*4]
    715    addps    m2, m0, m3
    716    subps    m3, m0
    717    mulps    m0, m3, [winq+4*52]
    718    addps    m0, [bufq+4*52]
    719    mova     [outq+1664], m0
    720    mulps    m3, [winq+4*16]
    721    addps    m3, [bufq+4*16]
    722    mova     [outq+512], m3
    723    mulps    m0, m2, [winq+4*132]
    724    mova     [bufq+4*52], m0
    725    mulps    m2, [winq+4*96]
    726    mova     [bufq+4*16], m2
    727    RET
    728 %endmacro
    729 
    730 INIT_XMM sse
        ; four_imdct36_float is expanded for sse and, when HAVE_AVX_EXTERNAL is
        ; set, for avx.
    731 DEFINE_FOUR_IMDCT
    732 
    733 %if HAVE_AVX_EXTERNAL
    734 INIT_XMM avx
    735 DEFINE_FOUR_IMDCT
    736 %endif