tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

dct32.asm (11595B)


      1 ;******************************************************************************
      2 ;* 32 point SSE-optimized DCT transform
      3 ;* Copyright (c) 2010 Vitor Sessak
      4 ;*
      5 ;* This file is part of FFmpeg.
      6 ;*
      7 ;* FFmpeg is free software; you can redistribute it and/or
      8 ;* modify it under the terms of the GNU Lesser General Public
      9 ;* License as published by the Free Software Foundation; either
     10 ;* version 2.1 of the License, or (at your option) any later version.
     11 ;*
     12 ;* FFmpeg is distributed in the hope that it will be useful,
     13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     15 ;* Lesser General Public License for more details.
     16 ;*
     17 ;* You should have received a copy of the GNU Lesser General Public
     18 ;* License along with FFmpeg; if not, write to the Free Software
     19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     20 ;******************************************************************************
     21 
     22 %include "libavutil/x86/x86util.asm"
     23 
     24 SECTION_RODATA 32
     25 
     26 ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000 ; sign-bit XOR mask (+,+,-,-) repeated for both 128-bit halves: xorps with it negates lanes 2 and 3
     27 
     28 ps_cos_vec: dd   0.500603,  0.505471,  0.515447,  0.531043 ; byte offset +0:   pass 1 coefficients (+0 .. +48)
     29            dd   0.553104,  0.582935,  0.622504,  0.674808 ; +16
     30            dd -10.190008, -3.407609, -2.057781, -1.484165 ; +32
     31            dd  -1.169440, -0.972568, -0.839350, -0.744536 ; +48
     32            dd   0.502419,  0.522499,  0.566944,  0.646822 ; +64:  pass 2 coefficients (+64 .. +80)
     33            dd   0.788155,  1.060678,  1.722447,  5.101149 ; +80
     34            dd   0.509796,  0.601345,  0.899976,  2.562916 ; +96:  pass 3; row is repeated so a 32-byte (ymm) load gets it in both halves
     35            dd   0.509796,  0.601345,  0.899976,  2.562916 ; +112
     36            dd   1.000000,  1.000000,  1.306563,  0.541196 ; +128: pass 4; repeated for ymm loads
     37            dd   1.000000,  1.000000,  1.306563,  0.541196 ; +144
     38            dd   1.000000,  0.707107,  1.000000, -0.707107 ; +160: pass 5; repeated for ymm loads
     39            dd   1.000000,  0.707107,  1.000000, -0.707107 ; +176
     40            dd   0.707107,  0.707107,  0.707107,  0.707107 ; +192: scale factor read by BUTTERFLY3V
     41 
     42 %macro BUTTERFLY 4 ; a, b, coeff, tmp
        ; Packed-float butterfly with a scaled difference. On exit:
        ;   %2 = a + b
        ;   %1 = (a - b) * coeff
        ;   %4 = a - b (scratch, clobbered)
        ; %3 may be a register or a memory operand.
        ; NOTE(review): uses three-operand syntax; for the non-AVX build this
        ; presumably relies on the instruction wrappers provided through the
        ; included x86util.asm - confirm.
     43    subps  %4, %1, %2              ; tmp = a - b
     44    addps  %2, %2, %1              ; b   = b + a
     45    mulps  %1, %4, %3              ; a   = tmp * coeff
     46 %endmacro
     47 
     48 %macro BUTTERFLY0 5 ; data, sign_mask, coeff, tmp, shuffle_imm
        ; In-register butterfly: combines each lane of %1 with the lane selected
        ; by the shuffle immediate %5, with per-lane signs taken from the XOR mask %2.
        ; On exit: %1 = ((%1 xor %2) + shuffle(%1, %5)) * %3
        ; %4 is scratch and is clobbered (its final content differs between the
        ; two branches); %2 and %3 are only read.
     49 %if cpuflag(sse2) && notcpuflag(avx)
        ; SSE2 without AVX: pshufd copies and permutes in one instruction,
        ; the remaining operations are destructive two-operand forms.
     50    pshufd %4, %1, %5              ; tmp  = permuted copy of data
     51    xorps  %1, %2                  ; flip the sign of the masked lanes
     52    addps  %1, %4                  ; data = signed data + permuted data
     53    mulps  %1, %3                  ; scale by the coefficients
     54 %else
        ; Other targets: same result in %1, written in three-operand form.
     55    shufps %4, %1, %1, %5          ; tmp  = permuted copy of data
     56    xorps  %1, %1, %2              ; flip the sign of the masked lanes
     57    addps  %4, %4, %1              ; tmp  = permuted data + signed data
     58    mulps  %1, %4, %3              ; data = tmp * coefficients
     59 %endif
     60 %endmacro
     61 
     62 %macro BUTTERFLY2 4 ; data, sign_mask, coeff, tmp
        ; BUTTERFLY0 with shuffle immediate 0x1b, which reverses the four lanes of
        ; each 128-bit group: lane i is combined with lane 3-i.
     63    BUTTERFLY0 %1, %2, %3, %4, 0x1b
     64 %endmacro
     65 
     66 %macro BUTTERFLY3 4 ; data, sign_mask, coeff, tmp
        ; BUTTERFLY0 with shuffle immediate 0xb1, which swaps the two lanes of each
        ; adjacent pair: lane 0 is combined with lane 1, lane 2 with lane 3.
     67    BUTTERFLY0 %1, %2, %3, %4, 0xb1
     68 %endmacro
     69 
     70 %macro BUTTERFLY3V 5 ; a, b, c, d, tmp  (register NUMBERS, used as m%N)
        ; Two register-to-register butterflies, each scaling its difference by the
        ; vector at ps_cos_vec+192. In terms of the values on entry:
        ;   m%1 = a + b          m%2 = (a - b) * [ps_cos_vec+192]
        ;   m%3 = c + d          m%4 = (d - c) * [ps_cos_vec+192]
        ; Note the second difference is d - c, not c - d.
        ; m%5 is scratch. The SWAP exchanges the names %2 and %5, so after this
        ; macro the register mapping of the caller has changed as well.
     71    movaps m%5, m%1                ; tmp = a
     72    addps  m%1, m%2                ; a += b
     73    subps  m%5, m%2                ; tmp = a(old) - b
     74    SWAP %2, %5                    ; rename: m%2 is now the difference, m%5 the old b
     75    mulps  m%2, [ps_cos_vec+192]
     76    movaps m%5, m%3                ; tmp = c
     77    addps  m%3, m%4                ; c += d
     78    subps  m%4, m%5                ; d = d - c(old)
     79    mulps  m%4, [ps_cos_vec+192]
     80 %endmacro
     81 
     82 %macro PASS6_AND_PERMUTE 0
        ; Final scalar stage (movss/addss only), working in place on the 32 floats
        ; at outq: values are loaded from out, summed in lane 0 of m0-m7 and stored
        ; back at other byte offsets. Loads and stores to the same buffer are
        ; interleaved, so the statement order must be preserved.
        ; On entry lane 0 of m0, m1, m4, m5 and m6 must already hold live values
        ; (they are added to before this macro ever loads them); m2, m3 and m7
        ; are loaded from out before their first use.
        ; tmpd carries four 32-bit words from one slot to another bit-for-bit
        ; (byte offsets 4->64, 12->96, 108->112, 60->120).
        ; Clobbers m0-m7 and tmpd.
     83    mov         tmpd, [outq+4]     ; keep the word at +4; it is stored to +64 below
     84    movss         m7, [outq+72]
     85    addss         m7, [outq+76]
     86    movss         m3, [outq+56]
     87    addss         m3, [outq+60]
     88    addss         m4, m3
     89    movss         m2, [outq+52]
     90    addss         m2, m3
     91    movss         m3, [outq+104]
     92    addss         m3, [outq+108]
     93    addss         m1, m3
     94    addss         m5, m4
     95    movss [outq+ 16], m1
     96    movss         m1, [outq+100]
     97    addss         m1, m3
     98    movss         m3, [outq+40]
     99    movss [outq+ 48], m1
    100    addss         m3, [outq+44]
    101    movss         m1, [outq+100]
    102    addss         m4, m3
    103    addss         m3, m2
    104    addss         m1, [outq+108]
    105    movss [outq+ 40], m3
    106    addss         m2, [outq+36]
    107    movss         m3, [outq+8]
    108    movss [outq+ 56], m2
    109    addss         m3, [outq+12]
    110    movss [outq+ 32], m3
    111    movss         m3, [outq+80]
    112    movss [outq+  8], m5
    113    movss [outq+ 80], m1
    114    movss         m2, [outq+52]
    115    movss         m5, [outq+120]
    116    addss         m5, [outq+124]
    117    movss         m1, [outq+64]
    118    addss         m2, [outq+60]
    119    addss         m0, m5
    120    addss         m5, [outq+116]
    121    mov    [outq+64], tmpd         ; +64 was read into m1 just above; now it receives the word saved from +4
    122    addss         m6, m0
    123    addss         m1, m6
    124    mov         tmpd, [outq+12]    ; copy the word at +12 to +96
    125    mov   [outq+ 96], tmpd
    126    movss [outq+  4], m1
    127    movss         m1, [outq+24]
    128    movss [outq+ 24], m4
    129    movss         m4, [outq+88]
    130    addss         m4, [outq+92]
    131    addss         m3, m4
    132    addss         m4, [outq+84]
    133    mov         tmpd, [outq+108]   ; keep the word at +108; stored to +112 below
    134    addss         m1, [outq+28]
    135    addss         m0, m1
    136    addss         m1, m5
    137    addss         m6, m3
    138    addss         m3, m0
    139    addss         m0, m7
    140    addss         m5, [outq+20]
    141    addss         m7, m1
    142    movss [outq+ 12], m6
    143    mov   [outq+112], tmpd
    144    movss         m6, [outq+28]
    145    movss [outq+ 28], m0
    146    movss         m0, [outq+36]
    147    movss [outq+ 36], m7
    148    addss         m1, m4
    149    movss         m7, [outq+116]
    150    addss         m0, m2
    151    addss         m7, [outq+124]
    152    movss [outq+ 72], m0
    153    movss         m0, [outq+44]
    154    addss         m2, m0
    155    movss [outq+ 44], m1
    156    movss [outq+ 88], m2
    157    addss         m0, [outq+60]
    158    mov         tmpd, [outq+60]    ; copy the word at +60 to +120 before +60 is overwritten
    159    mov   [outq+120], tmpd
    160    movss [outq+104], m0
    161    addss         m4, m5
    162    addss         m5, [outq+68]
    163    movss  [outq+52], m4
    164    movss  [outq+60], m5
    165    movss         m4, [outq+68]
    166    movss         m5, [outq+20]
    167    movss [outq+ 20], m3
    168    addss         m5, m7
    169    addss         m7, m6
    170    addss         m4, m5
    171    movss         m2, [outq+84]
    172    addss         m2, [outq+92]
    173    addss         m5, m2
    174    movss [outq+ 68], m4
    175    addss         m2, m7
    176    movss         m4, [outq+76]
    177    movss [outq+ 84], m2
    178    movss [outq+ 76], m5
    179    addss         m7, m4
    180    addss         m6, [outq+124]
    181    addss         m4, m6
    182    addss         m6, [outq+92]
    183    movss [outq+100], m4
    184    movss [outq+108], m6
    185    movss         m6, [outq+92]
    186    movss  [outq+92], m7
    187    addss         m6, [outq+124]
    188    movss [outq+116], m6
    189 %endmacro
    190 
    191 INIT_YMM avx
    192 SECTION .text
    193 %if HAVE_AVX_EXTERNAL
    194 ; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
        ; 32-point DCT, AVX build: passes 1-5 work on 8-float ymm vectors, pass 6
        ; is the scalar PASS6_AND_PERMUTE macro.
        ; cglobal arguments: 2 parameters (out, in), 3 GPRs (tmp is scratch),
        ; 8 vector registers.
        ; in and out are accessed with full-width vmovaps, so both must be 32-byte
        ; aligned. out is also the working buffer of pass 6.
        ; The "in[..]" / "out[..]" index comments below count 4-byte floats.
    195 cglobal dct32_float, 2,3,8, out, in, tmp
    196    ; pass 1
    197    vmovaps     m4, [inq+0]                 ; m4 = in[0..7]
    198    vinsertf128 m5, m5, [inq+96], 1         ; upper half = in[24..27]
    199    vinsertf128 m5, m5, [inq+112], 0        ; lower half = in[28..31]
    200    vshufps     m5, m5, m5, 0x1b            ; reverse each half -> m5 = in[31], in[30], ..., in[24]
    201    BUTTERFLY   m4, m5, [ps_cos_vec], m6    ; m5 = in[i] + in[31-i], m4 = (in[i] - in[31-i]) * coeff, i = 0..7
    202 
    203    vmovaps     m2, [inq+64]                ; m2 = in[16..23]
    204    vinsertf128 m6, m6, [inq+32], 1         ; upper half = in[8..11]
    205    vinsertf128 m6, m6, [inq+48], 0         ; lower half = in[12..15]
    206    vshufps     m6, m6, m6, 0x1b            ; m6 = in[15], in[14], ..., in[8]
    207    BUTTERFLY   m2, m6, [ps_cos_vec+32], m0 ; m6 = sums, m2 = scaled differences
    208 
    209    ; pass 2
    210 
    211    BUTTERFLY  m5, m6, [ps_cos_vec+64], m0  ; combine the two sum vectors
    212    BUTTERFLY  m4, m2, [ps_cos_vec+64], m7  ; combine the two difference vectors
    213 
    214 
    215    ; pass 3
    216    vperm2f128  m3, m6, m4, 0x31            ; m3 = upper halves of m6 and m4
    217    vperm2f128  m1, m6, m4, 0x20            ; m1 = lower halves of m6 and m4
    218    vshufps     m3, m3, m3, 0x1b            ; reverse lanes within each half
    219 
    220    BUTTERFLY   m1, m3, [ps_cos_vec+96], m6
    221 
    222 
    223    vperm2f128  m4, m5, m2, 0x20            ; m4 = lower halves of m5 and m2
    224    vperm2f128  m5, m5, m2, 0x31            ; m5 = upper halves of m5 and m2
    225    vshufps     m5, m5, m5, 0x1b            ; reverse lanes within each half
    226 
    227    BUTTERFLY   m4, m5, [ps_cos_vec+96], m6
    228 
    229    ; pass 4
    230    vmovaps m6, [ps_p1p1m1m1+0]             ; sign mask (+,+,-,-)
    231    vmovaps m2, [ps_cos_vec+128]            ; pass-4 coefficients
    232 
    233    BUTTERFLY2  m5, m6, m2, m7
    234    BUTTERFLY2  m4, m6, m2, m7
    235    BUTTERFLY2  m1, m6, m2, m7
    236    BUTTERFLY2  m3, m6, m2, m7
    237 
    238 
    239    ; pass 5
    240    vshufps m6, m6, m6, 0xcc                ; lanes 0,3,0,3 of the mask -> alternating (+,-,+,-) sign mask
    241    vmovaps m2, [ps_cos_vec+160]            ; pass-5 coefficients
    242 
    243    BUTTERFLY3  m5, m6, m2, m7
    244    BUTTERFLY3  m4, m6, m2, m7
    245    BUTTERFLY3  m1, m6, m2, m7
    246    BUTTERFLY3  m3, m6, m2, m7
    247 
        ; Store the pass-5 results to out and leave the values pass 6 expects in
        ; the low lanes of m0, m1, m4, m5 and m6.
    248    vperm2f128  m6, m3, m3, 0x31            ; low half of m6 = upper half of m3
    249    vmovaps [outq], m3                      ; out[0..7]
    250 
    251    vextractf128  [outq+64], m5, 1          ; out[16..19] = upper half of m5
    252    vextractf128  [outq+32], m5, 0          ; out[8..11]  = lower half of m5
    253 
    254    vextractf128  [outq+80], m4, 1          ; out[20..23] = upper half of m4
    255    vextractf128  [outq+48], m4, 0          ; out[12..15] = lower half of m4
    256 
    257    vperm2f128  m0, m1, m1, 0x31            ; low half of m0 = upper half of m1
    258    vmovaps [outq+96], m1                   ; out[24..31]
    259 
    260    vzeroupper                              ; clear the upper ymm halves before the xmm-only scalar tail
    261 
    262    ;    pass 6, no SIMD...
    263 INIT_XMM
        ; From here on mN names xmm registers. NOTE(review): INIT_XMM is given no
        ; cpu name, so the macro below is presumably assembled as legacy SSE - confirm.
    264    PASS6_AND_PERMUTE
    265    RET
    266 %endif
    267 
    268 %if ARCH_X86_64
    269 %define SPILL SWAP
        ; x86-64 branch: SPILL/UNSPILL take (register, slot) with slots 8..15, so
        ; aliasing them to SWAP exchanges the register name with m8-m15 instead of
        ; storing to memory (compare the x86-32 definitions in the %else branch).
    270 %define UNSPILL SWAP
    271 
    272 %macro PASS5 0
        ; x86-64 variant of pass 5: instead of the in-register BUTTERFLY3 used by
        ; the x86-32 variant, the eight vectors are gathered in m8-m15, transposed
        ; in two groups of four and processed with BUTTERFLY3V.
        ; NOTE(review): SWAP, PERMUTE and TRANSPOSE4x4PS are defined outside this
        ; file (x86inc/x86util); the descriptions below assume their usual
        ; meaning (register renaming / 4x4 float transpose with m0 as scratch) - confirm.
    273    nop ; FIXME code alignment
    274    SWAP 5, 8                      ; move the vectors still named m5, m4, m6, m7, m0 ...
    275    SWAP 4, 12
    276    SWAP 6, 14
    277    SWAP 7, 13
    278    SWAP 0, 15                     ; ... into the m8-m15 name range
    279    PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13
    280    TRANSPOSE4x4PS 8, 9, 10, 11, 0
    281    BUTTERFLY3V    8, 9, 10, 11, 0
    282    addps   m10, m11
    283    TRANSPOSE4x4PS 12, 13, 14, 15, 0
    284    BUTTERFLY3V    12, 13, 14, 15, 0
    285    addps   m14, m15
    286    addps   m12, m14
    287    addps   m14, m13
    288    addps   m13, m15
    289 %endmacro
    290 
    291 %macro PASS6 0
        ; x86-64 variant of the final pass: consumes m8-m15 as left by PASS5 and
        ; writes all 32 output floats. Three phases:
        ;   1. lane 0 of m8-m14 goes to out at 0x00, 0x10, ..., 0x60 and m15 is
        ;      stored whole at 0x70; sums of neighbouring lane-1 values go to
        ;      0x08, 0x18, ..., 0x68 and lane 1 of m15 to 0x78;
        ;   2. the remaining lanes are extracted into lane 0 of separate registers
        ;      (movhlps / pshufd) while register names are rotated with SWAP;
        ;   3. a 15-step chain of neighbour sums writes the byte offsets
        ;      4, 12, ..., 116.
        ; Clobbers m0-m15.
    292    SWAP 9, 12
    293    SWAP 11, 14
    294    movss [outq+0x00], m8
    295    pshuflw m0, m8, 0xe            ; lane 0 of m0 = lane 1 of m8 (same pattern for m1-m7 below)
    296    movss [outq+0x10], m9
    297    pshuflw m1, m9, 0xe
    298    movss [outq+0x20], m10
    299    pshuflw m2, m10, 0xe
    300    movss [outq+0x30], m11
    301    pshuflw m3, m11, 0xe
    302    movss [outq+0x40], m12
    303    pshuflw m4, m12, 0xe
    304    movss [outq+0x50], m13
    305    pshuflw m5, m13, 0xe
    306    movss [outq+0x60], m14
    307    pshuflw m6, m14, 0xe
    308    movaps [outq+0x70], m15        ; full 16-byte store: 0x74 and 0x78 are overwritten later, 0x70 and 0x7c keep these values
    309    pshuflw m7, m15, 0xe
    310    addss   m0, m1                 ; chain of neighbour sums on the lane-1 values
    311    addss   m1, m2
    312    movss [outq+0x08], m0
    313    addss   m2, m3
    314    movss [outq+0x18], m1
    315    addss   m3, m4
    316    movss [outq+0x28], m2
    317    addss   m4, m5
    318    movss [outq+0x38], m3
    319    addss   m5, m6
    320    movss [outq+0x48], m4
    321    addss   m6, m7
    322    movss [outq+0x58], m5
    323    movss [outq+0x68], m6
    324    movss [outq+0x78], m7          ; last element of the chain is stored without a partner
    325 
    326    PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
    327    movhlps m0, m1                 ; low half of m0 = high half of m1 (lane 2 lands in lane 0)
    328    pshufd  m1, m1, 3              ; lane 0 of m1 = its lane 3
    329    SWAP 0, 2, 4, 6, 8, 10, 12, 14 ; rotate the even / odd register names so the next
    330    SWAP 1, 3, 5, 7, 9, 11, 13, 15 ; step works on the next pair
    331 %rep 7
        ; Same extraction for the remaining seven pairs; each step also
        ; accumulates the extracted lane-3 value into whichever register is
        ; named m15 at that point.
    332    movhlps m0, m1
    333    pshufd  m1, m1, 3
    334    addss   m15, m1
    335    SWAP 0, 2, 4, 6, 8, 10, 12, 14
    336    SWAP 1, 3, 5, 7, 9, 11, 13, 15
    337 %endrep
    338 %assign i 4
    339 %rep 15
        ; out at byte offset i = lane 0 of m0 + lane 0 of m1, then rotate all 16
        ; register names by one; i runs over 4, 12, ..., 116.
    340    addss m0, m1
    341    movss [outq+i], m0
    342    SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    343    %assign i i+8
    344 %endrep
    345 %endmacro
    346 
    347 %else ; ARCH_X86_32
    348 %macro SPILL 2 ; xmm#, mempos
        ; x86-32: save register m%1 in the output buffer, which doubles as scratch
        ; space. Slot numbers 8..15 map to byte offsets 0..112 from outq.
        ; This is a plain aligned store, so m%1 still holds the value afterwards.
    349    movaps [outq+(%2-8)*16], m%1
    350 %endmacro
    351 %macro UNSPILL 2 ; xmm#, mempos
        ; x86-32: reload register m%1 from the slot written by SPILL; slot numbers
        ; 8..15 map to byte offsets 0..112 from outq.
    352    movaps m%1, [outq+(%2-8)*16]
    353 %endmacro
    354 
    355 %define PASS6 PASS6_AND_PERMUTE
        ; (define above) x86-32: the final pass is the scalar PASS6_AND_PERMUTE
        ; macro, which works on the values already stored in out.
    356 %macro PASS5 0
        ; x86-32 variant of pass 5: runs BUTTERFLY3 on all eight vectors using
        ; only m0-m7, reloading and storing through the out buffer with
        ; UNSPILL/SPILL. After this macro every vector has been stored to its
        ; slot (8..15) in out; SPILL does not destroy the register, so the
        ; registers stored last still hold their values for PASS6.
        ; Expects m3 to hold the pass-4 sign mask (loaded from ps_p1p1m1m1 in
        ; DCT32_FUNC).
    357    movaps      m2, [ps_cos_vec+160] ; pass-5 coefficients
    358    shufps      m3, m3, 0xcc         ; lanes 0,3,0,3 of the mask -> alternating (+,-,+,-) sign mask
    359 
    360    BUTTERFLY3  m5, m3, m2, m1
    361    SPILL 5, 8
    362 
    363    UNSPILL 1, 9
    364    BUTTERFLY3  m1, m3, m2, m5
    365    SPILL 1, 14
    366 
    367    BUTTERFLY3  m4, m3, m2, m5
    368    SPILL 4, 12
    369 
    370    BUTTERFLY3  m7, m3, m2, m5
    371    SPILL 7, 13
    372 
    373    UNSPILL 5, 10
    374    BUTTERFLY3  m5, m3, m2, m7
    375    SPILL 5, 10
    376 
    377    UNSPILL 4, 11
    378    BUTTERFLY3  m4, m3, m2, m7
    379    SPILL 4, 11
    380 
    381    BUTTERFLY3  m6, m3, m2, m7
    382    SPILL 6, 9
    383 
    384    BUTTERFLY3  m0, m3, m2, m7
    385    SPILL 0, 15
    386 %endmacro
    387 %endif
    388 
    389 
    390 ; void ff_dct32_float(FFTSample *out, const FFTSample *in)
    391 %macro DCT32_FUNC 0
        ; Emits the 128-bit (xmm) dct32_float function for the instruction set
        ; selected by the preceding INIT_XMM.
        ; cglobal arguments: 2 parameters (out, in), 3 GPRs (tmp is scratch),
        ; 16 vector registers requested.
        ; in is read with movaps / pshufd memory operands (16 bytes each), so it
        ; must be 16-byte aligned; out likewise (movaps in SPILL and PASS6).
        ; Passes 1-4 are done here; PASS5 and PASS6 expand to the x86-64 or x86-32
        ; variants defined earlier. SPILL/UNSPILL slot numbers are 8..15: register
        ; renames on x86-64, stores to / loads from out on x86-32.
        ; The "in[..]" index comments below count 4-byte floats.
    392 cglobal dct32_float, 2, 3, 16, out, in, tmp
    393    ; pass 1
    394 
    395    movaps      m0, [inq+0]                 ; m0 = in[0..3]
    396    LOAD_INV    m1, [inq+112]               ; m1 = in[31], in[30], in[29], in[28]
    397    BUTTERFLY   m0, m1, [ps_cos_vec], m3    ; m1 = sums, m0 = scaled differences
    398 
    399    movaps      m7, [inq+64]                ; m7 = in[16..19]
    400    LOAD_INV    m4, [inq+48]                ; m4 = in[15], in[14], in[13], in[12]
    401    BUTTERFLY   m7, m4, [ps_cos_vec+32], m3
    402 
    403    ; pass 2
    404    movaps      m2, [ps_cos_vec+64]
    405    BUTTERFLY   m1, m4, m2, m3              ; combine the two sum vectors from above
    406    SPILL 1, 11
    407    SPILL 4, 8
    408 
    409    ; pass 1
    410    movaps      m1, [inq+16]                ; m1 = in[4..7]
    411    LOAD_INV    m6, [inq+96]                ; m6 = in[27], in[26], in[25], in[24]
    412    BUTTERFLY   m1, m6, [ps_cos_vec+16], m3
    413 
    414    movaps      m4, [inq+80]                ; m4 = in[20..23]
    415    LOAD_INV    m5, [inq+32]                ; m5 = in[11], in[10], in[9], in[8]
    416    BUTTERFLY   m4, m5, [ps_cos_vec+48], m3
    417 
    418    ; pass 2
    419    BUTTERFLY   m0, m7, m2, m3              ; m2 still holds [ps_cos_vec+64]
    420 
    421    movaps      m2, [ps_cos_vec+80]
    422    BUTTERFLY   m6, m5, m2, m3
    423 
    424    BUTTERFLY   m1, m4, m2, m3
    425 
    426    ; pass 3
    427    movaps      m2, [ps_cos_vec+96]
    428    shufps      m1, m1, 0x1b                ; reverse the lanes of the second operand before each butterfly
    429    BUTTERFLY   m0, m1, m2, m3
    430    SPILL 0, 15
    431    SPILL 1, 14
    432 
    433    UNSPILL 0, 8
    434    shufps      m5, m5, 0x1b
    435    BUTTERFLY   m0, m5, m2, m3
    436 
    437    UNSPILL 1, 11
    438    shufps      m6, m6, 0x1b
    439    BUTTERFLY   m1, m6, m2, m3
    440    SPILL 1, 11
    441 
    442    shufps      m4, m4, 0x1b
    443    BUTTERFLY   m7, m4, m2, m3
    444 
    445    ; pass 4
    446    movaps      m3, [ps_p1p1m1m1+0]         ; sign mask (+,+,-,-); kept in m3 for PASS5 as well
    447    movaps      m2, [ps_cos_vec+128]        ; pass-4 coefficients
    448 
        ; BUTTERFLY2 is applied to each of the eight vectors in turn, with m1 as
        ; scratch; vectors that are not in a free register go through slots 9-15.
    449    BUTTERFLY2  m5, m3, m2, m1
    450 
    451    BUTTERFLY2  m0, m3, m2, m1
    452    SPILL 0, 9
    453 
    454    BUTTERFLY2  m6, m3, m2, m1
    455    SPILL 6, 10
    456 
    457    UNSPILL 0, 11
    458    BUTTERFLY2  m0, m3, m2, m1
    459    SPILL 0, 11
    460 
    461    BUTTERFLY2  m4, m3, m2, m1
    462 
    463    BUTTERFLY2  m7, m3, m2, m1
    464 
    465    UNSPILL 6, 14
    466    BUTTERFLY2  m6, m3, m2, m1
    467 
    468    UNSPILL 0, 15
    469    BUTTERFLY2  m0, m3, m2, m1
    470 
    471    PASS5
    472    PASS6
    473    RET
    474 %endmacro
    475 
    476 %macro LOAD_INV 2 ; dst, src
        ; Load four floats from %2 into %1 in reversed lane order (shuffle
        ; immediate 0x1b: dst lanes 0..3 = src lanes 3..0).
        ; It is defined after DCT32_FUNC but before DCT32_FUNC is invoked, which
        ; is when the name is actually expanded.
    477    pshufd      %1, %2, 0x1b
    478 %endmacro
    479 
    480 INIT_XMM sse2
        ; Instantiate the xmm implementation for SSE2.
        ; NOTE(review): by analogy with the AVX function's prototype comment, the
        ; exported symbol is presumably ff_dct32_float_sse2 - confirm against x86inc.
    481 DCT32_FUNC