tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

float_dsp.asm (16959B)


      1 ;*****************************************************************************
      2 ;* x86-optimized Float DSP functions
      3 ;*
      4 ;* Copyright 2006 Loren Merritt
      5 ;*
      6 ;* This file is part of FFmpeg.
      7 ;*
      8 ;* FFmpeg is free software; you can redistribute it and/or
      9 ;* modify it under the terms of the GNU Lesser General Public
     10 ;* License as published by the Free Software Foundation; either
     11 ;* version 2.1 of the License, or (at your option) any later version.
     12 ;*
     13 ;* FFmpeg is distributed in the hope that it will be useful,
     14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
     15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     16 ;* Lesser General Public License for more details.
     17 ;*
     18 ;* You should have received a copy of the GNU Lesser General Public
     19 ;* License along with FFmpeg; if not, write to the Free Software
     20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     21 ;******************************************************************************
     22 
     23 %include "libavutil/x86/x86util.asm"
     24 
     25 SECTION_RODATA 32
     26 pd_reverse: dd 7, 6, 5, 4, 3, 2, 1, 0
     27 
     28 SECTION .text
     29 
     30 ;-----------------------------------------------------------------------------
     31 ; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
     32 ;-----------------------------------------------------------------------------
     33 %macro VECTOR_FMUL 0
     34 cglobal vector_fmul, 4,4,2, dst, src0, src1, len
     35    lea       lenq, [lend*4 - 64]
     36 ALIGN 16
     37 .loop:
     38 %assign a 0
     39 %rep 32/mmsize
     40    mova      m0,   [src0q + lenq + (a+0)*mmsize]
     41    mova      m1,   [src0q + lenq + (a+1)*mmsize]
     42    mulps     m0, m0, [src1q + lenq + (a+0)*mmsize]
     43    mulps     m1, m1, [src1q + lenq + (a+1)*mmsize]
     44    mova      [dstq + lenq + (a+0)*mmsize], m0
     45    mova      [dstq + lenq + (a+1)*mmsize], m1
     46 %assign a a+2
     47 %endrep
     48 
     49    sub       lenq, 64
     50    jge       .loop
     51    RET
     52 %endmacro
     53 
     54 INIT_XMM sse
     55 VECTOR_FMUL
     56 %if HAVE_AVX_EXTERNAL
     57 INIT_YMM avx
     58 VECTOR_FMUL
     59 %endif
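
For reference, the elementwise product this kernel computes can be sketched in plain C as follows. The function name and the length constraint are mine, inferred from the 64-byte loop stride and the aligned loads, not stated in this file:

    /* Illustrative scalar equivalent of vector_fmul (assumed contract:
     * suitably aligned buffers, len a positive multiple of 16 floats). */
    static void vector_fmul_ref(float *dst, const float *src0,
                                const float *src1, int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] = src0[i] * src1[i];
    }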
     60 
     61 ;-----------------------------------------------------------------------------
     62 ; void vector_dmul(double *dst, const double *src0, const double *src1, int len)
     63 ;-----------------------------------------------------------------------------
     64 %macro VECTOR_DMUL 0
     65 cglobal vector_dmul, 4,4,4, dst, src0, src1, len
     66    lea       lend, [lenq*8 - mmsize*4]
     67 ALIGN 16
     68 .loop:
     69    movaps    m0,     [src0q + lenq + 0*mmsize]
     70    movaps    m1,     [src0q + lenq + 1*mmsize]
     71    movaps    m2,     [src0q + lenq + 2*mmsize]
     72    movaps    m3,     [src0q + lenq + 3*mmsize]
     73    mulpd     m0, m0, [src1q + lenq + 0*mmsize]
     74    mulpd     m1, m1, [src1q + lenq + 1*mmsize]
     75    mulpd     m2, m2, [src1q + lenq + 2*mmsize]
     76    mulpd     m3, m3, [src1q + lenq + 3*mmsize]
     77    movaps    [dstq + lenq + 0*mmsize], m0
     78    movaps    [dstq + lenq + 1*mmsize], m1
     79    movaps    [dstq + lenq + 2*mmsize], m2
     80    movaps    [dstq + lenq + 3*mmsize], m3
     81 
     82    sub       lenq, mmsize*4
     83    jge       .loop
     84    RET
     85 %endmacro
     86 
     87 INIT_XMM sse2
     88 VECTOR_DMUL
     89 %if HAVE_AVX_EXTERNAL
     90 INIT_YMM avx
     91 VECTOR_DMUL
     92 %endif
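
The double-precision variant follows the same pattern; a minimal C sketch (name and length assumption are mine; each pass of the loop consumes mmsize*4 bytes, i.e. 8 doubles for SSE2 and 16 for AVX):

    /* Illustrative scalar equivalent of vector_dmul. */
    static void vector_dmul_ref(double *dst, const double *src0,
                                const double *src1, int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] = src0[i] * src1[i];
    }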
     93 
     94 ;------------------------------------------------------------------------------
     95 ; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
     96 ;------------------------------------------------------------------------------
     97 
     98 %macro VECTOR_FMAC_SCALAR 0
     99 %if UNIX64
    100 cglobal vector_fmac_scalar, 3,3,5, dst, src, len
    101 %else
    102 cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len
    103 %endif
    104 %if ARCH_X86_32
    105    VBROADCASTSS m0, mulm
    106 %else
    107 %if WIN64
    108    SWAP 0, 2
    109 %endif
    110    shufps      xm0, xm0, 0
    111 %if cpuflag(avx)
    112    vinsertf128  m0, m0, xm0, 1
    113 %endif
    114 %endif
    115    lea    lenq, [lend*4-64]
    116 .loop:
    117 %if cpuflag(fma3)
    118    mova     m1,     [dstq+lenq]
    119    mova     m2,     [dstq+lenq+1*mmsize]
    120    fmaddps  m1, m0, [srcq+lenq], m1
    121    fmaddps  m2, m0, [srcq+lenq+1*mmsize], m2
    122 %else ; cpuflag
    123    mulps    m1, m0, [srcq+lenq]
    124    mulps    m2, m0, [srcq+lenq+1*mmsize]
    125 %if mmsize < 32
    126    mulps    m3, m0, [srcq+lenq+2*mmsize]
    127    mulps    m4, m0, [srcq+lenq+3*mmsize]
    128 %endif ; mmsize
    129    addps    m1, m1, [dstq+lenq]
    130    addps    m2, m2, [dstq+lenq+1*mmsize]
    131 %if mmsize < 32
    132    addps    m3, m3, [dstq+lenq+2*mmsize]
    133    addps    m4, m4, [dstq+lenq+3*mmsize]
    134 %endif ; mmsize
    135 %endif ; cpuflag
    136    mova  [dstq+lenq], m1
    137    mova  [dstq+lenq+1*mmsize], m2
    138 %if mmsize < 32
    139    mova  [dstq+lenq+2*mmsize], m3
    140    mova  [dstq+lenq+3*mmsize], m4
    141 %endif ; mmsize
    142    sub    lenq, 64
    143    jge .loop
    144    RET
    145 %endmacro
    146 
    147 INIT_XMM sse
    148 VECTOR_FMAC_SCALAR
    149 %if HAVE_AVX_EXTERNAL
    150 INIT_YMM avx
    151 VECTOR_FMAC_SCALAR
    152 %endif
    153 %if HAVE_FMA3_EXTERNAL
    154 INIT_YMM fma3
    155 VECTOR_FMAC_SCALAR
    156 %endif
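
In scalar terms this is a multiply-accumulate into dst; a hedged C sketch (function name assumed; the 64-byte stride implies len should be a positive multiple of 16 floats):

    /* Illustrative scalar equivalent of ff_vector_fmac_scalar. */
    static void vector_fmac_scalar_ref(float *dst, const float *src,
                                       float mul, int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] += src[i] * mul;
    }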
    157 
    158 ;------------------------------------------------------------------------------
    159 ; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
    160 ;------------------------------------------------------------------------------
    161 
    162 %macro VECTOR_FMUL_SCALAR 0
    163 %if UNIX64
    164 cglobal vector_fmul_scalar, 3,3,2, dst, src, len
    165 %else
    166 cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
    167 %endif
    168 %if ARCH_X86_32
    169    movss    m0, mulm
    170 %elif WIN64
    171    SWAP 0, 2
    172 %endif
    173    shufps   m0, m0, 0
    174    lea    lenq, [lend*4-mmsize]
    175 .loop:
    176    mova     m1, [srcq+lenq]
    177    mulps    m1, m0
    178    mova  [dstq+lenq], m1
    179    sub    lenq, mmsize
    180    jge .loop
    181    RET
    182 %endmacro
    183 
    184 INIT_XMM sse
    185 VECTOR_FMUL_SCALAR
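
A C sketch of the same operation (illustrative only; since only the SSE build exists here, len is assumed to be a positive multiple of 4):

    /* Illustrative scalar equivalent of ff_vector_fmul_scalar. */
    static void vector_fmul_scalar_ref(float *dst, const float *src,
                                       float mul, int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] = src[i] * mul;
    }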
    186 
    187 ;------------------------------------------------------------------------------
    188 ; void ff_vector_dmac_scalar(double *dst, const double *src, double mul,
    189 ;                            int len)
    190 ;------------------------------------------------------------------------------
    191 
    192 %macro VECTOR_DMAC_SCALAR 0
    193 %if ARCH_X86_32
    194 cglobal vector_dmac_scalar, 2,4,5, dst, src, mul, len, lenaddr
    195    mov          lenq, lenaddrm
    196    VBROADCASTSD m0, mulm
    197 %else
    198 %if UNIX64
    199 cglobal vector_dmac_scalar, 3,3,5, dst, src, len
    200 %else
    201 cglobal vector_dmac_scalar, 4,4,5, dst, src, mul, len
    202    SWAP 0, 2
    203 %endif
    204    movlhps     xm0, xm0
    205 %if cpuflag(avx)
    206    vinsertf128  m0, m0, xm0, 1
    207 %endif
    208 %endif
    209    lea    lenq, [lend*8-mmsize*4]
    210 .loop:
    211 %if cpuflag(fma3)
    212    movaps   m1,     [dstq+lenq]
    213    movaps   m2,     [dstq+lenq+1*mmsize]
    214    movaps   m3,     [dstq+lenq+2*mmsize]
    215    movaps   m4,     [dstq+lenq+3*mmsize]
    216    fmaddpd  m1, m0, [srcq+lenq], m1
    217    fmaddpd  m2, m0, [srcq+lenq+1*mmsize], m2
    218    fmaddpd  m3, m0, [srcq+lenq+2*mmsize], m3
    219    fmaddpd  m4, m0, [srcq+lenq+3*mmsize], m4
    220 %else ; cpuflag
    221    mulpd    m1, m0, [srcq+lenq]
    222    mulpd    m2, m0, [srcq+lenq+1*mmsize]
    223    mulpd    m3, m0, [srcq+lenq+2*mmsize]
    224    mulpd    m4, m0, [srcq+lenq+3*mmsize]
    225    addpd    m1, m1, [dstq+lenq]
    226    addpd    m2, m2, [dstq+lenq+1*mmsize]
    227    addpd    m3, m3, [dstq+lenq+2*mmsize]
    228    addpd    m4, m4, [dstq+lenq+3*mmsize]
    229 %endif ; cpuflag
    230    movaps [dstq+lenq], m1
    231    movaps [dstq+lenq+1*mmsize], m2
    232    movaps [dstq+lenq+2*mmsize], m3
    233    movaps [dstq+lenq+3*mmsize], m4
    234    sub    lenq, mmsize*4
    235    jge .loop
    236    RET
    237 %endmacro
    238 
    239 INIT_XMM sse2
    240 VECTOR_DMAC_SCALAR
    241 %if HAVE_AVX_EXTERNAL
    242 INIT_YMM avx
    243 VECTOR_DMAC_SCALAR
    244 %endif
    245 %if HAVE_FMA3_EXTERNAL
    246 INIT_YMM fma3
    247 VECTOR_DMAC_SCALAR
    248 %endif
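
The double-precision multiply-accumulate, sketched in C (name assumed; the unrolled loop covers mmsize*4 bytes per pass, so 8 doubles for SSE2 and 16 for the AVX/FMA3 builds):

    /* Illustrative scalar equivalent of ff_vector_dmac_scalar. */
    static void vector_dmac_scalar_ref(double *dst, const double *src,
                                       double mul, int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] += src[i] * mul;
    }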
    249 
    250 ;------------------------------------------------------------------------------
    251 ; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
    252 ;                            int len)
    253 ;------------------------------------------------------------------------------
    254 
    255 %macro VECTOR_DMUL_SCALAR 0
    256 %if ARCH_X86_32
    257 cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
    258    mov          lenq, lenaddrm
    259 %elif UNIX64
    260 cglobal vector_dmul_scalar, 3,3,3, dst, src, len
    261 %else
    262 cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
    263 %endif
    264 %if ARCH_X86_32
    265    VBROADCASTSD   m0, mulm
    266 %else
    267 %if WIN64
    268    SWAP 0, 2
    269 %endif
    270    movlhps       xm0, xm0
    271 %if cpuflag(avx)
    272    vinsertf128   ym0, ym0, xm0, 1
    273 %endif
    274 %endif
    275    lea          lenq, [lend*8-2*mmsize]
    276 .loop:
    277    mulpd          m1, m0, [srcq+lenq       ]
    278    mulpd          m2, m0, [srcq+lenq+mmsize]
    279    movaps [dstq+lenq       ], m1
    280    movaps [dstq+lenq+mmsize], m2
    281    sub          lenq, 2*mmsize
    282    jge .loop
    283    RET
    284 %endmacro
    285 
    286 INIT_XMM sse2
    287 VECTOR_DMUL_SCALAR
    288 %if HAVE_AVX_EXTERNAL
    289 INIT_YMM avx
    290 VECTOR_DMUL_SCALAR
    291 %endif
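
And the corresponding scale-only version, as an illustrative C sketch (name assumed):

    /* Illustrative scalar equivalent of ff_vector_dmul_scalar. */
    static void vector_dmul_scalar_ref(double *dst, const double *src,
                                       double mul, int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] = src[i] * mul;
    }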
    292 
    293 ;-----------------------------------------------------------------------------
    294 ; vector_fmul_window(float *dst, const float *src0,
    295 ;                    const float *src1, const float *win, int len);
    296 ;-----------------------------------------------------------------------------
    297 INIT_XMM sse
    298 cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1
    299    shl     lend, 2
    300    lea    len1q, [lenq - mmsize]
    301    add    src0q, lenq
    302    add     dstq, lenq
    303    add     winq, lenq
    304    neg     lenq
    305 .loop:
    306    mova      m0, [winq  + lenq]
    307    mova      m4, [src0q + lenq]
    308    mova      m1, [winq  + len1q]
    309    mova      m5, [src1q + len1q]
    310    shufps    m1, m1, 0x1b
    311    shufps    m5, m5, 0x1b
    312    mova      m2, m0
    313    mova      m3, m1
    314    mulps     m2, m4
    315    mulps     m3, m5
    316    mulps     m1, m4
    317    mulps     m0, m5
    318    addps     m2, m3
    319    subps     m1, m0
    320    shufps    m2, m2, 0x1b
    321    mova      [dstq + lenq], m1
    322    mova      [dstq + len1q], m2
    323    sub       len1q, mmsize
    324    add       lenq,  mmsize
    325    jl .loop
    326    RET
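
This kernel walks the buffers from both ends at once (the shufps 0x1b reverses the descending half). Reading the loop back into scalar C gives roughly the following; the name and pointer offsets are my reconstruction, so treat it as illustrative:

    /* Illustrative scalar equivalent of vector_fmul_window:
     * windowed overlap producing 2*len outputs in dst. */
    static void vector_fmul_window_ref(float *dst, const float *src0,
                                       const float *src1, const float *win,
                                       int len)
    {
        dst  += len;
        win  += len;
        src0 += len;
        for (int i = -len, j = len - 1; i < 0; i++, j--) {
            float s0 = src0[i], s1 = src1[j];
            float wi = win[i],  wj = win[j];
            dst[i] = s0 * wj - s1 * wi;
            dst[j] = s0 * wi + s1 * wj;
        }
    }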
    327 
    328 ;-----------------------------------------------------------------------------
    329 ; vector_fmul_add(float *dst, const float *src0, const float *src1,
    330 ;                 const float *src2, int len)
    331 ;-----------------------------------------------------------------------------
    332 %macro VECTOR_FMUL_ADD 0
    333 cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
    334    lea       lenq, [lend*4 - 2*mmsize]
    335 ALIGN 16
    336 .loop:
    337    mova    m0,   [src0q + lenq]
    338    mova    m1,   [src0q + lenq + mmsize]
    339 %if cpuflag(fma3)
    340    mova    m2,     [src2q + lenq]
    341    mova    m3,     [src2q + lenq + mmsize]
    342    fmaddps m0, m0, [src1q + lenq], m2
    343    fmaddps m1, m1, [src1q + lenq + mmsize], m3
    344 %else
    345    mulps   m0, m0, [src1q + lenq]
    346    mulps   m1, m1, [src1q + lenq + mmsize]
    347    addps   m0, m0, [src2q + lenq]
    348    addps   m1, m1, [src2q + lenq + mmsize]
    349 %endif
    350    mova    [dstq + lenq], m0
    351    mova    [dstq + lenq + mmsize], m1
    352 
    353    sub     lenq,   2*mmsize
    354    jge     .loop
    355    RET
    356 %endmacro
    357 
    358 INIT_XMM sse
    359 VECTOR_FMUL_ADD
    360 %if HAVE_AVX_EXTERNAL
    361 INIT_YMM avx
    362 VECTOR_FMUL_ADD
    363 %endif
    364 %if HAVE_FMA3_EXTERNAL
    365 INIT_YMM fma3
    366 VECTOR_FMUL_ADD
    367 %endif
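
A C sketch of the multiply-add across three source vectors (illustrative; the 2*mmsize stride means len should be a positive multiple of 8 floats for SSE and 16 for the AVX/FMA3 builds):

    /* Illustrative scalar equivalent of vector_fmul_add. */
    static void vector_fmul_add_ref(float *dst, const float *src0,
                                    const float *src1, const float *src2,
                                    int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] = src0[i] * src1[i] + src2[i];
    }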
    368 
    369 ;-----------------------------------------------------------------------------
    370 ; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
    371 ;                          int len)
    372 ;-----------------------------------------------------------------------------
    373 %macro VECTOR_FMUL_REVERSE 0
    374 cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
    375 %if cpuflag(avx2)
    376    movaps  m2, [pd_reverse]
    377 %endif
    378    lea       lenq, [lend*4 - 2*mmsize]
    379 ALIGN 16
    380 .loop:
    381 %if cpuflag(avx2)
    382    vpermps m0, m2, [src1q]
    383    vpermps m1, m2, [src1q+mmsize]
    384 %elif cpuflag(avx)
    385    vmovaps     xmm0, [src1q + 16]
    386    vinsertf128 m0, m0, [src1q], 1
    387    vshufps     m0, m0, m0, q0123
    388    vmovaps     xmm1, [src1q + mmsize + 16]
    389    vinsertf128 m1, m1, [src1q + mmsize], 1
    390    vshufps     m1, m1, m1, q0123
    391 %else
    392    mova    m0, [src1q]
    393    mova    m1, [src1q + mmsize]
    394    shufps  m0, m0, q0123
    395    shufps  m1, m1, q0123
    396 %endif
    397    mulps   m0, m0, [src0q + lenq + mmsize]
    398    mulps   m1, m1, [src0q + lenq]
    399    movaps  [dstq + lenq + mmsize], m0
    400    movaps  [dstq + lenq], m1
    401    add     src1q, 2*mmsize
    402    sub     lenq,  2*mmsize
    403    jge     .loop
    404    RET
    405 %endmacro
    406 
    407 INIT_XMM sse
    408 VECTOR_FMUL_REVERSE
    409 %if HAVE_AVX_EXTERNAL
    410 INIT_YMM avx
    411 VECTOR_FMUL_REVERSE
    412 %endif
    413 %if HAVE_AVX2_EXTERNAL
    414 INIT_YMM avx2
    415 VECTOR_FMUL_REVERSE
    416 %endif
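
Here src1 is consumed back to front (pd_reverse and the q0123 shuffles do the per-vector reversal); in scalar C the operation is roughly the following (illustrative, name assumed):

    /* Illustrative scalar equivalent of vector_fmul_reverse. */
    static void vector_fmul_reverse_ref(float *dst, const float *src0,
                                        const float *src1, int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] = src0[i] * src1[len - 1 - i];
    }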
    417 
    418 ; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
    419 INIT_XMM sse
    420 cglobal scalarproduct_float, 3,3,2, v1, v2, offset
    421    shl   offsetd, 2
    422    add       v1q, offsetq
    423    add       v2q, offsetq
    424    neg   offsetq
    425    xorps    xmm0, xmm0
    426 .loop:
    427    movaps   xmm1, [v1q+offsetq]
    428    mulps    xmm1, [v2q+offsetq]
    429    addps    xmm0, xmm1
    430    add   offsetq, 16
    431    js .loop
    432    movhlps  xmm1, xmm0
    433    addps    xmm0, xmm1
    434    movss    xmm1, xmm0
    435    shufps   xmm0, xmm0, 1
    436    addss    xmm0, xmm1
    437 %if ARCH_X86_64 == 0
    438    movss     r0m,  xmm0
    439    fld dword r0m
    440 %endif
    441    RET
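
Both this SSE version and the FMA3 variant below return the same plain dot product; a C sketch of the contract (illustrative; the SSE loop uses aligned 16-byte loads, so len is assumed to be a positive multiple of 4). Note that the vector code keeps several partial sums, so float rounding can differ slightly from this left-to-right loop:

    /* Illustrative scalar equivalent of scalarproduct_float. */
    static float scalarproduct_float_ref(const float *v1, const float *v2,
                                         int len)
    {
        float sum = 0.0f;
        for (int i = 0; i < len; i++)
            sum += v1[i] * v2[i];
        return sum;
    }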
    442 
    443 INIT_YMM fma3
    444 cglobal scalarproduct_float, 3,5,8, v1, v2, size, len, offset
    445    xor   offsetq, offsetq
    446    xorps      m0, m0, m0
    447    shl     sized, 2
    448    mov      lenq, sizeq
    449    cmp      lenq, 32
    450    jl   .l16
    451    cmp      lenq, 64
    452    jl   .l32
    453    xorps    m1, m1, m1
    454    cmp      lenq, 128
    455    jl   .l64
    456    and    lenq, ~127
    457    xorps    m2, m2, m2
    458    xorps    m3, m3, m3
    459 .loop128:
    460    movups   m4, [v1q+offsetq]
    461    movups   m5, [v1q+offsetq + 32]
    462    movups   m6, [v1q+offsetq + 64]
    463    movups   m7, [v1q+offsetq + 96]
    464    fmaddps  m0, m4, [v2q+offsetq     ], m0
    465    fmaddps  m1, m5, [v2q+offsetq + 32], m1
    466    fmaddps  m2, m6, [v2q+offsetq + 64], m2
    467    fmaddps  m3, m7, [v2q+offsetq + 96], m3
    468    add   offsetq, 128
    469    cmp   offsetq, lenq
    470    jl .loop128
    471    addps    m0, m0, m2
    472    addps    m1, m1, m3
    473    mov      lenq, sizeq
    474    and      lenq, 127
    475    cmp      lenq, 64
    476    jge .l64
    477    addps    m0, m0, m1
    478    cmp      lenq, 32
    479    jge .l32
    480    vextractf128 xmm2, m0, 1
    481    addps    xmm0, xmm2
    482    cmp      lenq, 16
    483    jge .l16
    484    movhlps  xmm1, xmm0
    485    addps    xmm0, xmm1
    486    movss    xmm1, xmm0
    487    shufps   xmm0, xmm0, 1
    488    addss    xmm0, xmm1
    489 %if ARCH_X86_64 == 0
    490    movss r0m, xm0
    491    fld dword r0m
    492 %endif
    493    RET
    494 .l64:
    495    and    lenq, ~63
    496    add    lenq, offsetq
    497 .loop64:
    498    movups   m4, [v1q+offsetq]
    499    movups   m5, [v1q+offsetq + 32]
    500    fmaddps  m0, m4, [v2q+offsetq], m0
    501    fmaddps  m1, m5, [v2q+offsetq + 32], m1
    502    add   offsetq, 64
    503    cmp   offsetq, lenq
    504    jl .loop64
    505    addps    m0, m0, m1
    506    mov      lenq, sizeq
    507    and      lenq, 63
    508    cmp      lenq, 32
    509    jge .l32
    510    vextractf128 xmm2, m0, 1
    511    addps    xmm0, xmm2
    512    cmp      lenq, 16
    513    jge .l16
    514    movhlps  xmm1, xmm0
    515    addps    xmm0, xmm1
    516    movss    xmm1, xmm0
    517    shufps   xmm0, xmm0, 1
    518    addss    xmm0, xmm1
    519 %if ARCH_X86_64 == 0
    520    movss r0m, xm0
    521    fld dword r0m
    522 %endif
    523    RET
    524 .l32:
    525    and    lenq, ~31
    526    add    lenq, offsetq
    527 .loop32:
    528    movups   m4, [v1q+offsetq]
    529    fmaddps  m0, m4, [v2q+offsetq], m0
    530    add   offsetq, 32
    531    cmp   offsetq, lenq
    532    jl .loop32
    533    vextractf128 xmm2, m0, 1
    534    addps    xmm0, xmm2
    535    mov      lenq, sizeq
    536    and      lenq, 31
    537    cmp      lenq, 16
    538    jge .l16
    539    movhlps  xmm1, xmm0
    540    addps    xmm0, xmm1
    541    movss    xmm1, xmm0
    542    shufps   xmm0, xmm0, 1
    543    addss    xmm0, xmm1
    544 %if ARCH_X86_64 == 0
    545    movss r0m, xm0
    546    fld dword r0m
    547 %endif
    548    RET
    549 .l16:
    550    and    lenq, ~15
    551    add    lenq, offsetq
    552 .loop16:
    553    movaps   xmm1, [v1q+offsetq]
    554    mulps    xmm1, [v2q+offsetq]
    555    addps    xmm0, xmm1
    556    add   offsetq, 16
    557    cmp   offsetq, lenq
    558    jl .loop16
    559    movhlps  xmm1, xmm0
    560    addps    xmm0, xmm1
    561    movss    xmm1, xmm0
    562    shufps   xmm0, xmm0, 1
    563    addss    xmm0, xmm1
    564 %if ARCH_X86_64 == 0
    565    movss r0m, xm0
    566    fld dword r0m
    567 %endif
    568    RET
    569 
    570 ;---------------------------------------------------------------------------------
    571 ; double scalarproduct_double(const double *v1, const double *v2, size_t len)
    572 ;---------------------------------------------------------------------------------
    573 %macro SCALARPRODUCT_DOUBLE 0
    574 cglobal scalarproduct_double, 3,3,8, v1, v2, offset
    575    shl offsetq, 3
    576    add     v1q, offsetq
    577    add     v2q, offsetq
    578    neg offsetq
    579    xorpd    m0, m0
    580    xorpd    m1, m1
    581    movapd   m2, m0
    582    movapd   m3, m1
    583 align 16
    584 .loop:
    585    movapd   m4, [v1q+offsetq+mmsize*0]
    586    movapd   m5, [v1q+offsetq+mmsize*1]
    587    movapd   m6, [v1q+offsetq+mmsize*2]
    588    movapd   m7, [v1q+offsetq+mmsize*3]
    589    mulpd    m4, [v2q+offsetq+mmsize*0]
    590    mulpd    m5, [v2q+offsetq+mmsize*1]
    591    mulpd    m6, [v2q+offsetq+mmsize*2]
    592    mulpd    m7, [v2q+offsetq+mmsize*3]
    593    addpd    m0, m4
    594    addpd    m1, m5
    595    addpd    m2, m6
    596    addpd    m3, m7
    597    add offsetq, mmsize*4
    598    jl .loop
    599    addpd    m0, m1
    600    addpd    m2, m3
    601    addpd    m0, m2
    602 %if mmsize == 32
    603    vextractf128 xm1, m0, 1
    604    addpd   xm0, xm1
    605 %endif
    606    movhlps xm1, xm0
    607    addsd   xm0, xm1
    608 %if ARCH_X86_64 == 0
    609    movsd   r0m, xm0
    610    fld qword r0m
    611 %endif
    612    RET
    613 %endmacro
    614 
    615 INIT_XMM sse2
    616 SCALARPRODUCT_DOUBLE
    617 %if HAVE_AVX_EXTERNAL
    618 INIT_YMM avx
    619 SCALARPRODUCT_DOUBLE
    620 %endif
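
The double-precision dot product has the same shape; a hedged C sketch (name assumed; the loop reads mmsize*4 aligned bytes per pass and keeps four partial sums, so rounding may differ from a sequential loop):

    /* Illustrative scalar equivalent of scalarproduct_double. */
    static double scalarproduct_double_ref(const double *v1, const double *v2,
                                           size_t len)
    {
        double sum = 0.0;
        for (size_t i = 0; i < len; i++)
            sum += v1[i] * v2[i];
        return sum;
    }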
    621 
    622 ;-----------------------------------------------------------------------------
    623 ; void ff_butterflies_float(float *src0, float *src1, int len);
    624 ;-----------------------------------------------------------------------------
    625 INIT_XMM sse
    626 cglobal butterflies_float, 3,3,3, src0, src1, len
    627    shl       lend, 2
    628    add      src0q, lenq
    629    add      src1q, lenq
    630    neg       lenq
    631 .loop:
    632    mova        m0, [src0q + lenq]
    633    mova        m1, [src1q + lenq]
    634    subps       m2, m0, m1
    635    addps       m0, m0, m1
    636    mova        [src1q + lenq], m2
    637    mova        [src0q + lenq], m0
    638    add       lenq, mmsize
    639    jl .loop
    640    RET
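
The butterfly writes the sum back into src0 and the difference into src1, in place; in C terms (illustrative sketch, name assumed, len a positive multiple of 4 given the 16-byte SSE stride):

    /* Illustrative scalar equivalent of ff_butterflies_float. */
    static void butterflies_float_ref(float *src0, float *src1, int len)
    {
        for (int i = 0; i < len; i++) {
            float t  = src0[i] - src1[i];
            src0[i] += src1[i];
            src1[i]  = t;
        }
    }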