tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

flacdsp.asm (9431B)


      1 ;******************************************************************************
      2 ;* FLAC DSP SIMD optimizations
      3 ;*
      4 ;* Copyright (C) 2014 Loren Merritt
      5 ;* Copyright (C) 2014 James Almer
      6 ;*
      7 ;* This file is part of FFmpeg.
      8 ;*
      9 ;* FFmpeg is free software; you can redistribute it and/or
     10 ;* modify it under the terms of the GNU Lesser General Public
     11 ;* License as published by the Free Software Foundation; either
     12 ;* version 2.1 of the License, or (at your option) any later version.
     13 ;*
     14 ;* FFmpeg is distributed in the hope that it will be useful,
     15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
     16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     17 ;* Lesser General Public License for more details.
     18 ;*
     19 ;* You should have received a copy of the GNU Lesser General Public
     20 ;* License along with FFmpeg; if not, write to the Free Software
     21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     22 ;******************************************************************************
     23 
     24 %include "libavutil/x86/x86util.asm"
     25 
     26 SECTION_RODATA
     27 
     28 vector:  db 0,1,4,5,8,9,12,13,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,1,4,5,8,9,12,13, ; byte-shuffle table: loaded into m2 and used as the pshufb mask by the indep2 variant of FLAC_DECORRELATE_16
     29 
     30 SECTION .text
     31 
     32 %macro PMACSDQL 3
        ; PMACSDQL acc, a, b -- accumulate the product of %2 and %3 into %1.
        ;   %1 = accumulator register (read and written)
        ;   %2 = first multiplicand; overwritten with the product on the non-XOP path,
        ;        so callers must reload it before using it again
        ;   %3 = second multiplicand (only read)
     33 %if cpuflag(xop)
     34    pmacsdql %1, %2, %3, %1 ; XOP: one multiply-accumulate, %1 = %2*%3 + %1
     35 %else
     36    pmuldq   %2, %3         ; %2 = %2 * %3 (destroys %2)
     37    paddq    %1, %2         ; %1 += product, 64-bit lanes
     38 %endif
     39 %endmacro
     40 
     41 %macro LPC_32 3
        ; LPC_32 cpu, suffix, shift_insn
        ;   %1 = instruction set handed to INIT_XMM
        ;   %2 = suffix of the emitted symbol (flac_lpc_%2)
        ;   %3 = shift instruction applied to each accumulator, with qlevel as the count
        ;
        ; Emitted function arguments: decoded, coeffs, pred_order, qlevel, len (j is a scratch GPR).
        ; For every position i in [pred_order, len) it adds to decoded[i] the shifted sum of
        ; coeffs[k] * decoded[i - pred_order + k] over k in [0, pred_order), in place.
        ; Two output samples are produced per .loop_sample iteration: m2 accumulates the
        ; first, m3 the second. The second needs the freshly written first sample, so its
        ; final multiply-accumulate is delayed until after the first store.
     42 INIT_XMM %1
     43 cglobal flac_lpc_%2, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
     44    sub    lend, pred_orderd                      ; len = number of samples still to predict
     45    jle .ret                                      ; nothing to do when len <= pred_order
     46    movsxdifnidn pred_orderq, pred_orderd
     47    lea    decodedq, [decodedq+pred_orderq*4-8]   ; 8 bytes before decoded[pred_order]; the loop pre-increments by 8
     48    lea    coeffsq, [coeffsq+pred_orderq*4]       ; one past the last coefficient
     49    neg    pred_orderq                            ; negative index that counts up towards zero
     50    movd   m4, qlevelm                            ; m4 = shift count for %3
     51 ALIGN 16
     52 .loop_sample:
     53    movd   m0, [decodedq+pred_orderq*4+8]         ; oldest history sample for this pair of outputs
     54    add    decodedq, 8                            ; decodedq -> first of the two samples being produced
     55    movd   m1, [coeffsq+pred_orderq*4]            ; coeffs[0]
     56    pxor   m2, m2                                 ; accumulator for sample 0
     57    pxor   m3, m3                                 ; accumulator for sample 1
     58    lea    jq, [pred_orderq+1]                    ; j = 1 - pred_order
     59    test   jq, jq
     60    jz .end_order                                 ; pred_order == 1: no inner iterations
     61 .loop_order:
     62    PMACSDQL m2, m0, m1                           ; sample 0 += history * coeff (may destroy m0)
     63    movd   m0, [decodedq+jq*4]                    ; next history sample
     64    PMACSDQL m3, m1, m0                           ; sample 1 += same coeff * next history (may destroy m1)
     65    movd   m1, [coeffsq+jq*4]                     ; next coefficient
     66    inc    jq
     67    jl .loop_order                                ; continue while j < 0
     68 .end_order:
     69    PMACSDQL m2, m0, m1                           ; last term of sample 0
     70    %3     m2, m4
     71    movd   m0, [decodedq]
     72    paddd  m0, m2                                 ; decoded value + prediction
     73    movd   [decodedq], m0
     74    sub  lend, 2                                  ; flags from here are consumed by both jl and jg below
     75    jl .ret                                       ; only one sample was left: skip sample 1
     76    PMACSDQL m3, m1, m0                           ; last term of sample 1 uses the sample just written (still in m0)
     77    %3     m3, m4
     78    movd   m1, [decodedq+4]
     79    paddd  m1, m3
     80    movd   [decodedq+4], m1
     81    jg .loop_sample                               ; still the flags of `sub lend, 2`; the SIMD ops in between leave EFLAGS alone
     82 .ret:
     83    RET
     84 %endmacro
     85 
     86 LPC_32 sse4, 16, psrad ; flac_lpc_16: accumulators shifted with psrad
     87 LPC_32 sse4, 32, psrlq ; flac_lpc_32: accumulators shifted with psrlq
     88 %if HAVE_XOP_EXTERNAL
     89 LPC_32 xop,  32, psrlq ; XOP build of flac_lpc_32 (PMACSDQL expands to pmacsdql)
     90 %endif
     91 
     92 INIT_XMM sse2
        ; flac_wasted_32(decoded, wasted, len)
        ; Shifts every 32-bit element of decoded left by `wasted` bits, in place.
        ; Each iteration handles 4*mmsize bytes with aligned loads/stores (mova), and the
        ; loop body runs before len is tested.
        ; NOTE(review): this assumes decoded is mmsize-aligned and the buffer may be
        ; touched up to the next multiple of 4*mmsize bytes -- confirm against the caller.
     93 cglobal flac_wasted_32, 3,3,5, decoded, wasted, len
     94    shl   lend, 2                    ; len in bytes (4 bytes per sample)
     95    add   decodedq, lenq             ; decodedq -> end of buffer
     96    neg   lenq                       ; negative byte offset that counts up to zero
     97    movd  m4, wastedd                ; m4 = shift count
     98 ALIGN 16
     99 .loop:
    100    mova  m0, [decodedq+lenq+mmsize*0]
    101    mova  m1, [decodedq+lenq+mmsize*1]
    102    mova  m2, [decodedq+lenq+mmsize*2]
    103    mova  m3, [decodedq+lenq+mmsize*3]
    104    pslld m0, m4
    105    pslld m1, m4
    106    pslld m2, m4
    107    pslld m3, m4
    108    mova  [decodedq+lenq+mmsize*0], m0
    109    mova  [decodedq+lenq+mmsize*1], m1
    110    mova  [decodedq+lenq+mmsize*2], m2
    111    mova  [decodedq+lenq+mmsize*3], m3
    112    add lenq, mmsize * 4
    113    jl .loop                         ; until the offset reaches or passes zero
    114    RET
    115 
    116 INIT_XMM sse4
        ; flac_wasted_33(decoded, residuals, wasted, len)
        ; Reads 32-bit values from residuals, sign-extends each to 64 bits (pmovsxdq),
        ; shifts it left by `wasted` bits and stores it to decoded as a 64-bit element.
        ; Each iteration consumes 2*mmsize bytes of residuals and writes 4*mmsize bytes
        ; of decoded; the loop body runs before len is tested.
        ; NOTE(review): stores use mova, so decoded is assumed mmsize-aligned and both
        ; buffers padded to a whole iteration -- confirm against the caller.
    117 cglobal flac_wasted_33, 4,4,5, decoded, residuals, wasted, len
    118    shl   lend, 2                          ; len in input bytes (4 per residual)
    119    lea   decodedq, [decodedq+lenq*2]      ; end of output: 8 bytes per element
    120    add   residualsq, lenq                 ; end of input
    121    neg   lenq                             ; negative input byte offset counting up to zero
    122    movd  m4, wastedd                      ; m4 = shift count
    123 ALIGN 16
    124 .loop:
    125    pmovsxdq  m0, [residualsq+lenq+mmsize*0]            ; each load widens mmsize/2 bytes of dwords to qwords
    126    pmovsxdq  m1, [residualsq+lenq+mmsize/2]
    127    pmovsxdq  m2, [residualsq+lenq+mmsize*1]
    128    pmovsxdq  m3, [residualsq+lenq+mmsize*1+mmsize/2]
    129    psllq m0, m4
    130    psllq m1, m4
    131    psllq m2, m4
    132    psllq m3, m4
    133    mova  [decodedq+lenq*2+mmsize*0], m0   ; output offset is twice the input offset
    134    mova  [decodedq+lenq*2+mmsize*1], m1
    135    mova  [decodedq+lenq*2+mmsize*2], m2
    136    mova  [decodedq+lenq*2+mmsize*3], m3
    137    add lenq, mmsize * 2
    138    jl .loop
    139    RET
    140 
    141 ;----------------------------------------------------------------------------------
    142 ;void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels,
    143 ;                                                   int len, int shift);
    144 ;----------------------------------------------------------------------------------
    145 %macro FLAC_DECORRELATE_16 3-4
        ; FLAC_DECORRELATE_16 name, a, b [, op]
        ;   %1 = variant name used in the symbol flac_decorrelate_%1_16 (ls, rs, ms or indep2)
        ;   %2 = index of the register that supplies the first sample of each output pair
        ;   %3 = index of the register that supplies the second sample of each output pair
        ;   %4 = "add" or "sub"; forms p%4d, which writes m0 <op> m1 to m2 (unused for indep2)
        ;
        ; The emitted function reads two int32 input channels (in[0], in[1]), optionally
        ; combines them, and writes them to out[0] as interleaved 16-bit samples shifted
        ; left by the `shift` argument. Four samples per channel are handled per iteration
        ; with aligned accesses; the loop body runs before len is tested.
        ; With the instantiations in this file the stored pairs are:
        ;   ls:     (m0, m0 - m1)
        ;   rs:     (m0 + m1, m1)
        ;   ms:     with t = m0 - (m1 >> 1): (t + m1, t)
        ;   indep2: (m0, m1)
    146 cglobal flac_decorrelate_%1_16, 2, 4, 4, out, in0, in1, len
    147 %ifidn %1, indep2
    148    VBROADCASTI128 m2, [vector]       ; m2 = pshufb mask (m2 is not needed as a temporary in this variant)
    149 %endif
    150 %if ARCH_X86_32
    151    mov      lend, lenm               ; len is only auto-loaded for the first two arguments on x86-32
    152 %endif
    153    movd       m3, r4m                ; m3 = shift (fifth argument)
    154    shl      lend, 2                  ; len in bytes of one int32 input channel
    155    mov      in1q, [in0q + gprsize]   ; in1 = in[1]
    156    mov      in0q, [in0q]             ; in0 = in[0]
    157    mov      outq, [outq]             ; out = out[0]
    158    add      in1q, lenq               ; all three pointers -> end of their buffers;
    159    add      in0q, lenq               ; the output also uses 4 bytes per sample index
    160    add      outq, lenq               ; (two 16-bit channels)
    161    neg      lenq                     ; negative byte offset counting up to zero
    162 
    163 align 16
    164 .loop:
    165    mova       m0, [in0q + lenq]
    166    mova       m1, [in1q + lenq]
    167 %ifidn %1, ms
    168    psrad      m2, m1, 1              ; m2 = m1 >> 1 (arithmetic)
    169    psubd      m0, m2                 ; m0 -= m1 >> 1
    170 %endif
    171 %ifnidn %1, indep2
    172    p%4d       m2, m0, m1             ; m2 = m0 +/- m1
    173    packssdw   m%2, m%2               ; dwords -> words with signed saturation
    174    packssdw   m%3, m%3
    175    punpcklwd  m%2, m%3               ; interleave: %2 sample first, then %3 sample
    176    psllw      m%2, m3                ; apply shift on the 16-bit lanes
    177 %else
    178    pslld      m%2, m3                ; indep2: shift while still 32-bit wide
    179    pslld      m%3, m3
    180    pshufb     m%2, m%2, m2           ; gather the low word of every dword
    181    pshufb     m%3, m%3, m2
    182    punpcklwd  m%2, m%3               ; interleave the two channels
    183 %endif
    184    mova [outq + lenq], m%2
    185    add      lenq, 16
    186    jl .loop
    187    RET
    188 %endmacro
    189 
    190 INIT_XMM sse2
    191 FLAC_DECORRELATE_16 ls, 0, 2, sub ; stores (m0, m0 - m1)
    192 FLAC_DECORRELATE_16 rs, 2, 1, add ; stores (m0 + m1, m1)
    193 FLAC_DECORRELATE_16 ms, 2, 0, add ; stores (t + m1, t) with t = m0 - (m1 >> 1)
    194 
    195 ;----------------------------------------------------------------------------------
    196 ;void ff_flac_decorrelate_[lrm]s_32_sse2(uint8_t **out, int32_t **in, int channels,
    197 ;                                        int len, int shift);
    198 ;----------------------------------------------------------------------------------
    199 %macro FLAC_DECORRELATE_32 5
        ; FLAC_DECORRELATE_32 name, a, b, tmp, op
        ;   %1 = variant name used in the symbol flac_decorrelate_%1_32 (ls, rs or ms)
        ;   %2 = index of the register that supplies the first sample of each output pair
        ;   %3 = index of the register that supplies the second sample of each output pair
        ;   %4 = index of a free register passed to SBUTTERFLY as its temporary
        ;   %5 = "add" or "sub"; forms p%5d, which writes m0 <op> m1 to m2
        ;
        ; Same channel arithmetic as the 16-bit macro, but samples stay 32 bits wide:
        ; both results are shifted left by `shift`, interleaved, and written to out[0]
        ; as 2*mmsize bytes per iteration. The loop body runs before len is tested.
    200 cglobal flac_decorrelate_%1_32, 2, 4, 4, out, in0, in1, len
    201 %if ARCH_X86_32
    202    mov      lend, lenm
    203 %endif
    204    movd       m3, r4m                ; m3 = shift (fifth argument)
    205    mov      in1q, [in0q + gprsize]   ; in1 = in[1]
    206    mov      in0q, [in0q]             ; in0 = in[0]
    207    mov      outq, [outq]             ; out = out[0]
    208    sub      in1q, in0q               ; in1 becomes an offset from in0, so one pointer increment serves both
    209 
    210 align 16
    211 .loop:
    212    mova       m0, [in0q]
    213    mova       m1, [in0q + in1q]
    214 %ifidn %1, ms
    215    psrad      m2, m1, 1              ; m2 = m1 >> 1 (arithmetic)
    216    psubd      m0, m2                 ; m0 -= m1 >> 1
    217 %endif
    218    p%5d       m2, m0, m1             ; m2 = m0 +/- m1
    219    pslld     m%2, m3
    220    pslld     m%3, m3
    221 
    222    SBUTTERFLY dq, %2, %3, %4         ; interleave the dwords of m%2 and m%3 (helper from x86util.asm, not shown here)
    223 
    224    mova  [outq         ], m%2
    225    mova  [outq + mmsize], m%3
    226 
    227    add      in0q, mmsize
    228    add      outq, mmsize*2
    229    sub      lend, mmsize/4           ; mmsize/4 samples per channel consumed
    230    jg .loop
    231    RET
    232 %endmacro
    233 
    234 INIT_XMM sse2
    235 FLAC_DECORRELATE_32 ls, 0, 2, 1, sub ; stores (m0, m0 - m1); m1 is free for the butterfly by then
    236 FLAC_DECORRELATE_32 rs, 2, 1, 0, add ; stores (m0 + m1, m1); m0 is free for the butterfly by then
    237 FLAC_DECORRELATE_32 ms, 2, 0, 1, add ; stores (t + m1, t) with t = m0 - (m1 >> 1)
    238 
    239 ;-----------------------------------------------------------------------------------------
    240 ;void ff_flac_decorrelate_indep<ch>_<bps>_<opt>(uint8_t **out, int32_t **in, int channels,
    241 ;                                            int len, int shift);
    242 ;-----------------------------------------------------------------------------------------
    243 ;%1 = bps
    244 ;%2 = channels
    245 ;%3 = last xmm reg used
    246 ;%4 = word/dword (shift instruction)
    247 %macro FLAC_DECORRELATE_INDEP 4
        ; Emits flac_decorrelate_indep<%2>_<%1>: interleaves %2 independent int32 input
        ; channels into out[0], shifting every sample left by the `shift` argument.
        ; For %1 == 16 the samples are first narrowed to 16 bits with signed saturation.
        ; Each iteration consumes mmsize bytes from every input channel and writes
        ; REPCOUNT*mmsize bytes; the loop body runs before len is tested.
    248 %define REPCOUNT %2/(32/%1) ; 16bits = channels / 2; 32bits = channels
    249 cglobal flac_decorrelate_indep%2_%1, 2, %2+2, %3+1, out, in0, in1, len, in2, in3, in4, in5, in6, in7
    250 %if ARCH_X86_32
    251 %if %2 == 6
        ; Six channels on x86-32: the argument names are redefined without len, and len is
        ; used directly from its stack slot (r3m) instead -- presumably because there are
        ; not enough GPRs for seven pointers plus a counter.
    252    DEFINE_ARGS out, in0, in1, in2, in3, in4, in5
    253    %define  lend  dword r3m
    254 %else
    255    mov      lend, lenm
    256 %endif
    257 %endif
    258    movd      m%3, r4m                ; m%3 = shift (fifth argument)
    259 
        ; Load the pointers in[1] .. in[%2-1]; in0q still points at the `in` array here.
    260 %assign %%i 1
    261 %rep %2-1
    262    mov      in %+ %%i %+ q, [in0q+%%i*gprsize]
    263 %assign %%i %%i+1
    264 %endrep
    265 
    266    mov      in0q, [in0q]             ; in0 = in[0]
    267    mov      outq, [outq]             ; out = out[0]
    268 
        ; Turn in1..in(%2-1) into offsets from in0 so only in0q has to advance in the loop.
    269 %assign %%i 1
    270 %rep %2-1
    271    sub      in %+ %%i %+ q, in0q
    272 %assign %%i %%i+1
    273 %endrep
    274 
    275 align 16
    276 .loop:
    277    mova       m0, [in0q]
    278 
        ; Load channels 1 .. REPCOUNT-1 into m1 .. m(REPCOUNT-1). In the 16-bit case the
        ; remaining channels are folded in from memory by the packssdw instructions below.
    279 %assign %%i 1
    280 %rep REPCOUNT-1
    281    mova     m %+ %%i, [in0q + in %+ %%i %+ q]
    282 %assign %%i %%i+1
    283 %endrep
    284 
    285 %if %1 == 32
    286 
        ; 32-bit output: transpose so each register holds consecutive output dwords.
        ; The TRANSPOSE*/SBUTTERFLY helpers come from x86util.asm (not shown here).
    287 %if %2 == 8
    288    TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8
    289 %elif %2 == 6
        ; Six channels: three dword butterflies followed by qword-level recombination;
        ; the final SWAP renames registers so that m0..m5 are in store order.
    290    SBUTTERFLY dq, 0, 1, 6
    291    SBUTTERFLY dq, 2, 3, 6
    292    SBUTTERFLY dq, 4, 5, 6
    293 
    294    punpcklqdq m6, m0, m2
    295    punpckhqdq m2, m4
    296    shufps     m4, m0, 0xe4
    297    punpcklqdq m0, m1, m3
    298    punpckhqdq m3, m5
    299    shufps     m5, m1, 0xe4
    300    SWAP 0,6,1,4,5,3
    301 %elif %2 == 4
    302    TRANSPOSE4x4D 0, 1, 2, 3, 4
    303 %else ; %2 == 2
    304    SBUTTERFLY dq, 0, 1, 2
    305 %endif
    306 
    307 %else ; %1 == 16
    308 
        ; 16-bit output: packssdw narrows register i (channel i) together with channel
        ; i+REPCOUNT read from memory, then the words are shuffled into interleaved order.
    309 %if %2 == 8
    310    packssdw   m0, [in0q + in4q]
    311    packssdw   m1, [in0q + in5q]
    312    packssdw   m2, [in0q + in6q]
    313    packssdw   m3, [in0q + in7q]
    314    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
    315 %elif %2 == 6
    316    packssdw   m0, [in0q + in3q]
    317    packssdw   m1, [in0q + in4q]
    318    packssdw   m2, [in0q + in5q]
    319    pshufd     m3, m0,     q1032
    320    punpcklwd  m0, m1
    321    punpckhwd  m1, m2
    322    punpcklwd  m2, m3
    323 
    324    shufps     m3, m0, m2, q2020
    325    shufps     m0, m1,     q2031
    326    shufps     m2, m1,     q3131
    327    shufps     m1, m2, m3, q3120
    328    shufps     m3, m0,     q0220
    329    shufps     m0, m2,     q3113
    330    SWAP 2, 0, 3                      ; rename so that m0..m2 are in store order
    331 %else ; %2 == 4
    332    packssdw   m0, [in0q + in2q]
    333    packssdw   m1, [in0q + in3q]
    334    SBUTTERFLY wd, 0, 1, 2
    335    SBUTTERFLY dq, 0, 1, 2
    336 %endif
    337 
    338 %endif
    339 
        ; Apply the shift to every output register (psllw or pslld, selected by %4).
    340 %assign %%i 0
    341 %rep REPCOUNT
    342    psll%4   m %+ %%i, m%3
    343 %assign %%i %%i+1
    344 %endrep
    345 
        ; Store the REPCOUNT output registers back to back.
    346 %assign %%i 0
    347 %rep REPCOUNT
    348    mova [outq + %%i*mmsize], m %+ %%i
    349 %assign %%i %%i+1
    350 %endrep
    351 
    352    add      in0q, mmsize
    353    add      outq, mmsize*REPCOUNT
    354    sub      lend, mmsize/4           ; mmsize/4 samples per channel consumed
    355    jg .loop
    356    RET
    357 %endmacro
    358 
    359 INIT_XMM ssse3
    360 FLAC_DECORRELATE_16 indep2, 0, 1 ; Reuse stereo 16bits macro
    361 FLAC_DECORRELATE_INDEP 32, 2, 3, d ; arguments: bps, channels, last xmm reg (holds the shift), w/d shift suffix
    362 FLAC_DECORRELATE_INDEP 16, 4, 3, w
    363 FLAC_DECORRELATE_INDEP 32, 4, 5, d
    364 FLAC_DECORRELATE_INDEP 16, 6, 4, w
    365 FLAC_DECORRELATE_INDEP 32, 6, 7, d
    366 %if ARCH_X86_64
        ; The 8-channel variants request 10 GPRs (channels+2) and are built for x86-64 only.
    367 FLAC_DECORRELATE_INDEP 16, 8, 5, w
    368 FLAC_DECORRELATE_INDEP 32, 8, 9, d
    369 %endif
    370 
    371 INIT_XMM avx
        ; Same macro expanded again with the avx instruction set selected.
    372 FLAC_DECORRELATE_INDEP 32, 4, 5, d
    373 FLAC_DECORRELATE_INDEP 32, 6, 7, d
    374 %if ARCH_X86_64
    375 FLAC_DECORRELATE_INDEP 16, 8, 5, w
    376 FLAC_DECORRELATE_INDEP 32, 8, 9, d
    377 %endif
    377 %endif