tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

tx_float.asm (58015B)


      1 ;******************************************************************************
      2 ;* Copyright (c) Lynne
      3 ;*
      4 ;* This file is part of FFmpeg.
      5 ;*
      6 ;* FFmpeg is free software; you can redistribute it and/or
      7 ;* modify it under the terms of the GNU Lesser General Public
      8 ;* License as published by the Free Software Foundation; either
      9 ;* version 2.1 of the License, or (at your option) any later version.
     10 ;*
     11 ;* FFmpeg is distributed in the hope that it will be useful,
     12 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     14 ;* Lesser General Public License for more details.
     15 ;*
     16 ;* You should have received a copy of the GNU Lesser General Public
     17 ;* License along with FFmpeg; if not, write to the Free Software
     18 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     19 ;******************************************************************************
     20 
     21 ; Open `doc/transforms.md` to see the code upon which the transforms here
     22 ; were based, and to compare.
     23 
     24 ; Intra-asm call convention:
     25 ;       320 bytes of stack available
     26 ;       14 GPRs available (last 4 must not be clobbered)
     27 ;       Additionally, don't clobber ctx, in, out, stride, len, lut
     28 ;       All vector regs available
     29 
     30 ; TODO:
     31 ;       carry over registers from smaller transforms to save on ~8 loads/stores
     33 ;       check if vinsertf could be faster than vperm2f128 for duplication
     33 ;       even faster FFT8 (current one is very #instructions optimized)
     34 ;       replace some xors with blends + addsubs?
     35 ;       replace some shuffles with vblends?
     36 ;       avx512 split-radix
     37 
     38 %include "libavutil/x86/x86util.asm"
     39 
     40 %define private_prefix ff_tx
     41
        ; Pointer-sized field reservation for the context struct below:
        ; 8 bytes (resq) on x86-64, 4 bytes (resd) on x86-32.
     42 %if ARCH_X86_64
     43 %define ptr resq
     44 %else
     45 %define ptr resd
     46 %endif
     47
        ; Declare the external power-of-two twiddle tables
        ; ff_tab_16_float .. ff_tab_2097152_float (18 tables, sizes 16 << 0..17).
     48 %assign i 16
     49 %rep 18
     50 cextern tab_ %+ i %+ _float ; ff_tab_i_float...
     51 %assign i (i << 1)
     52 %endrep
     53
        ; Twiddles presumably shared by the 5-point/3-point (15-point) transforms
        ; — used via m9/m10 in FFT15 below; confirm against the setup code.
     54 cextern tab_53_float
     55 
        ; Assembly-side mirror of the leading fields of the C AVTXContext.
        ; NOTE(review): field order and sizes must stay in sync with the C
        ; struct definition — confirm against libavutil/tx_priv.h when editing.
     56 struc AVTXContext
     57    .len:          resd 1 ; Length
     58    .inv           resd 1 ; Inverse flag
     59    .map:           ptr 1 ; Lookup table(s)
     60    .exp:           ptr 1 ; Exponentiation factors
     61    .tmp:           ptr 1 ; Temporary data
     62
     63    .sub:           ptr 1 ; Subcontexts
     64    .fn:            ptr 4 ; Subcontext functions
     65    .nb_sub:       resd 1 ; Subcontext count
     66
     67    ; Everything else is inaccessible
     68 endstruc
     69 
     70 SECTION_RODATA 32
     71
        ; Per-lane IEEE-754 single-precision sign masks: xorps with NEG flips a
        ; lane's sign, POS leaves it untouched.
     72 %define POS 0x00000000
     73 %define NEG 0x80000000
     74
        ; COS16_1 = cos(pi/8), COS16_3 = cos(3*pi/8) — 16-point twiddle factors.
     75 %define M_SQRT1_2 0.707106781186547524401
     76 %define COS16_1   0.92387950420379638671875
     77 %define COS16_3   0.3826834261417388916015625
     78
     79 d8_mult_odd:   dd M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, \
     80                  M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2
     81
     82 s8_mult_odd:   dd 1.0, 1.0, -1.0, 1.0, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
     83 s8_perm_even:  dd 1, 3, 0, 2, 1, 3, 2, 0
     84 s8_perm_odd1:  dd 3, 3, 1, 1, 1, 1, 3, 3
     85 s8_perm_odd2:  dd 1, 2, 0, 3, 1, 0, 0, 1
     86
     87 s16_mult_even: dd 1.0, 1.0, M_SQRT1_2, M_SQRT1_2, 1.0, -1.0, M_SQRT1_2, -M_SQRT1_2
     88 s16_mult_odd1: dd COS16_1,  COS16_1,  COS16_3,  COS16_3,  COS16_1, -COS16_1,  COS16_3, -COS16_3
     89 s16_mult_odd2: dd COS16_3, -COS16_3,  COS16_1, -COS16_1, -COS16_3, -COS16_3, -COS16_1, -COS16_1
     90 s16_perm:      dd 0, 1, 2, 3, 1, 0, 3, 2
     91
     92 s15_perm:      dd 0, 6, 5, 3, 2, 4, 7, 1
     93
        ; Sign-flip masks for xorps; the name lists lanes 0..7 low-to-high,
        ; p = keep (+0.0 mask), m = negate (sign bit set).
     94 mask_mmppmmmm: dd NEG, NEG, POS, POS, NEG, NEG, NEG, NEG
     95 mask_mmmmpppm: dd NEG, NEG, NEG, NEG, POS, POS, POS, NEG
     96 mask_ppmpmmpm: dd POS, POS, NEG, POS, NEG, NEG, POS, NEG
     97 mask_mppmmpmp: dd NEG, POS, POS, NEG, NEG, POS, NEG, POS
     98 mask_mpmppmpm: dd NEG, POS, NEG, POS, POS, NEG, POS, NEG
     99 mask_pmmppmmp: dd POS, NEG, NEG, POS, POS, NEG, NEG, POS
    100 mask_pmpmpmpm: times 4 dd POS, NEG
    101 
    102 SECTION .text
    103 
    104 ; Load complex values (64 bits) via a lookup table
    105 ; %1 - output register
    106 ; %2 - GPR of base input memory address
    107 ; %3 - GPR of LUT (int32_t indices) address
    108 ; %4 - LUT offset
    109 ; %5 - temporary GPR (only used if vgather is not used)
    110 ; %6 - temporary register (for avx only)
    111 ; %7 - temporary register (for avx only, enables vgatherdpd (AVX2) if FMA3 is set)
    112 %macro LOAD64_LUT 5-7
    113 %if %0 > 6 && cpuflag(avx2)
           ; AVX2 path: an all-ones mask selects every gather lane
    114    pcmpeqd %7, %7 ; pcmpeqq has a 0.5 throughput on Zen 3, this has 0.25
    115    movupd xmm%6, [%3 + %4] ; float mov since vgatherdpd is a float instruction
    116    vgatherdpd %1, [%2 + xmm%6*8], %7 ; must use separate registers for args
    117 %else
           ; scalar fallback: fetch each int32 LUT index, then load the 8-byte
           ; complex value it addresses (low half via movsd, high via movhps)
    118    mov      %5d, [%3 + %4 + 0]
    119    movsd  xmm%1, [%2 + %5q*8]
    120 %if sizeof%1 > 16 && %0 > 5
    121    mov      %5d, [%3 + %4 + 8]
    122    movsd  xmm%6, [%2 + %5q*8]
    123 %endif
    124    mov      %5d, [%3 + %4 + 4]
    125    movhps xmm%1, [%2 + %5q*8]
    126 %if sizeof%1 > 16 && %0 > 5
    127    mov      %5d, [%3 + %4 + 12]
    128    movhps xmm%6, [%2 + %5q*8]
           ; merge the two upper complexes into the high lane of the ymm dest
    129    vinsertf128 %1, %1, xmm%6, 1
    130 %endif
    131 %endif
    132 %endmacro
    133 
    134 ; Single 2-point in-place complex FFT (will do 2 transforms at once in AVX mode)
    135 ; %1 - coefficients (r0.reim, r1.reim)
    136 ; %2 - temporary
    137 %macro FFT2 2
           ; %2 = r1.re, r1.re, r1.im, r1.im
    138    shufps   %2, %1, %1, q3322
           ; %1 = r0.re, r0.re, r0.im, r0.im
    139    shufps   %1, %1, %1, q1100
    140
           ; even lanes subtract, odd lanes add:
           ; %1 = r0.re-r1.re, r0.re+r1.re, r0.im-r1.im, r0.im+r1.im
    141    addsubps %1, %1, %2
    142
           ; reorder to out0.reim (sums), out1.reim (differences)
    143    shufps   %1, %1, %1, q2031
    144 %endmacro
    145 
    146 ; Single 4-point in-place complex FFT (will do 2 transforms at once in [AVX] mode)
    147 ; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim)
    148 ; %2 - odd coefficients  (r1.reim, r3.reim, r5.reim, r7.reim)
    149 ; %3 - temporary
    150 %macro FFT4 3
           ; bracketed values in the comments belong to the second transform
           ; carried in the upper AVX lane
           ; on exit: %1 = even coeffs (a1234), %2 = odd coeffs (b1234), %3 = scratch
    151    subps  %3, %1, %2         ;  r1234, [r5678]
    152    addps  %1, %1, %2         ;  t1234, [t5678]
    153
    154    shufps %2, %1, %3, q1010  ;  t12, r12
    155    shufps %1, %1, %3, q2332  ;  t34, r43
    156
    157    subps  %3, %2, %1         ;  a34, b32
    158    addps  %2, %2, %1         ;  a12, b14
    159
    160    shufps %1, %2, %3, q1010  ;  a1234     even
    161
    162    shufps %2, %2, %3, q2332  ;  b1423
    163    shufps %2, %2, %2, q1320  ;  b1234     odd
    164 %endmacro
    165 
    166 ; Single/Dual 8-point in-place complex FFT (will do 2 transforms in [AVX] mode)
    167 ; %1 - even coefficients (a0.reim, a2.reim, [b0.reim, b2.reim])
    168 ; %2 - even coefficients (a4.reim, a6.reim, [b4.reim, b6.reim])
    169 ; %3 - odd coefficients  (a1.reim, a3.reim, [b1.reim, b3.reim])
    170 ; %4 - odd coefficients  (a5.reim, a7.reim, [b5.reim, b7.reim])
    171 ; %5 - temporary
    172 ; %6 - temporary
    173 %macro FFT8 6
           ; Outputs (in place): even coeffs in %1 (w1234) and %2 (h1234),
           ; odd coeffs in %3 (o1234) and %4 (u1234); %5/%6 hold non-output
           ; intermediates (t1234/g1234) on exit.
    174    addps    %5, %1, %3               ; q1-8
    175    addps    %6, %2, %4               ; k1-8
    176
    177    subps    %1, %1, %3               ; r1-8
    178    subps    %2, %2, %4               ; j1-8
    179
    180    shufps   %4, %1, %1, q2323        ; r4343
    181    shufps   %3, %5, %6, q3032        ; q34, k14
    182
    183    shufps   %1, %1, %1, q1010        ; r1212
    184    shufps   %5, %5, %6, q1210        ; q12, k32
    185
    186    xorps    %4, %4, [mask_pmmppmmp]  ; r4343 * pmmp
    187    addps    %6, %5, %3               ; s12, g12
    188
    189    mulps    %2, %2, [d8_mult_odd]    ; r8 * d8_mult_odd
    190    subps    %5, %5, %3               ; s34, g43
    191
    192    addps    %3, %1, %4               ; z1234
    193    unpcklpd %1, %6, %5               ; s1234
    194
    195    shufps   %4, %2, %2, q2301        ; j2143
    196    shufps   %6, %6, %5, q2332        ; g1234
    197
    198    addsubps %2, %2, %4               ; l2143
    199    shufps   %5, %2, %2, q0123        ; l3412
    200    addsubps %5, %5, %2               ; t1234
    201
    202    subps    %2, %1, %6               ; h1234 even
    203    subps    %4, %3, %5               ; u1234 odd
    204
    205    addps    %1, %1, %6               ; w1234 even
    206    addps    %3, %3, %5               ; o1234 odd
    207 %endmacro
    208 
    209 ; Single 8-point in-place complex FFT in 20 instructions
    210 ; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim)
    211 ; %2 - odd coefficients  (r1.reim, r3.reim, r5.reim, r7.reim)
    212 ; %3 - temporary
    213 ; %4 - temporary
    214 %macro FFT8_AVX 4
           ; NOTE(review): a trailing '!' in the lane comments below appears to
           ; mark values deliberately kept in a permuted (5687) order that the
           ; later vperm2f128/subps steps compensate for — confirm against
           ; doc/transforms.md
    215    subps      %3, %1, %2               ;  r1234, r5678
    216    addps      %1, %1, %2               ;  q1234, q5678
    217
    218    vpermilps  %2, %3, [s8_perm_odd1]   ;  r4422, r6688
    219    shufps     %4, %1, %1, q3322        ;  q1122, q5566
    220
    221    movsldup   %3, %3                   ;  r1133, r5577
    222    shufps     %1, %1, %1, q1100        ;  q3344, q7788
    223
    224    addsubps   %3, %3, %2               ;  z1234, z5678
    225    addsubps   %1, %1, %4               ;  s3142, s7586
    226
    227    mulps      %3, %3, [s8_mult_odd]    ;  z * s8_mult_odd
    228    vpermilps  %1, %1, [s8_perm_even]   ;  s1234, s5687 !
    229
    230    shufps     %2, %3, %3, q2332        ;   junk, z7887
    231    xorps      %4, %1, [mask_mmmmpppm]  ;  e1234, e5687 !
    232
    233    vpermilps  %3, %3, [s8_perm_odd2]   ;  z2314, z6556
    234    vperm2f128 %1, %1, %4, 0x03         ;  e5687, s1234
    235
    236    addsubps   %2, %2, %3               ;   junk, t5678
    237    subps      %1, %1, %4               ;  w1234, w5678 even
    238
    239    vperm2f128 %2, %2, %2, 0x11         ;  t5678, t5678
    240    vperm2f128 %3, %3, %3, 0x00         ;  z2314, z2314
    241
    242    xorps      %2, %2, [mask_ppmpmmpm]  ;  t * ppmpmmpm
    243    addps      %2, %3, %2               ;  u1234, u5678 odd
    244 %endmacro
    245 
    246 ; Single 16-point in-place complex FFT
    247 ; %1 - even coefficients (r0.reim, r2.reim,  r4.reim,  r6.reim)
    248 ; %2 - even coefficients (r8.reim, r10.reim, r12.reim, r14.reim)
    249 ; %3 - odd coefficients  (r1.reim, r3.reim,  r5.reim,  r7.reim)
    250 ; %4 - odd coefficients  (r9.reim, r11.reim, r13.reim, r15.reim)
    251 ; %5, %6 - temporary
    252 ; %7, %8 - temporary (optional)
    253 %macro FFT16 6-8
           ; With 8 args both the sign mask and the permute vector are cached in
           ; registers (%8/%7); with 7 args only the permute is cached in %7;
           ; otherwise both remain memory operands on every use.
    254    FFT4       %3, %4, %5
    255 %if %0 > 7
    256    FFT8_AVX   %1, %2, %6, %7
    257    movaps     %8, [mask_mpmppmpm]
    258    movaps     %7, [s16_perm]
    259 %define mask %8
    260 %define perm %7
    261 %elif %0 > 6
    262    FFT8_AVX   %1, %2, %6, %7
    263    movaps     %7, [s16_perm]
    264 %define mask [mask_mpmppmpm]
    265 %define perm %7
    266 %else
    267    FFT8_AVX   %1, %2, %6, %5
    268 %define mask [mask_mpmppmpm]
    269 %define perm [s16_perm]
    270 %endif
    271    xorps      %5, %5, %5                   ; 0
    272
    273    shufps     %6, %4, %4, q2301            ; z12.imre, z13.imre...
    274    shufps     %5, %5, %3, q2301            ; 0, 0, z8.imre...
    275
    276    mulps      %4, %4, [s16_mult_odd1]      ; z.reim * costab
    277    xorps      %5, %5, [mask_mppmmpmp]
    278 %if cpuflag(fma3)
    279    fmaddps    %6, %6, [s16_mult_odd2], %4  ; s[8..15]
    280    addps      %5, %3, %5                   ; s[0...7]
    281 %else
    282    mulps      %6, %6, [s16_mult_odd2]      ; z.imre * costab
    283
    284    addps      %5, %3, %5                   ; s[0...7]
    285    addps      %6, %4, %6                   ; s[8..15]
    286 %endif
    287    mulps      %5, %5, [s16_mult_even]      ; s[0...7]*costab
    288
    289    xorps      %4, %6, mask                 ; s[8..15]*mpmppmpm
    290    xorps      %3, %5, mask                 ; s[0...7]*mpmppmpm
    291
    292    vperm2f128 %4, %4, %4, 0x01             ; s[12..15, 8..11]
    293    vperm2f128 %3, %3, %3, 0x01             ; s[4..7, 0..3]
    294
    295    addps      %6, %6, %4                   ; y56, u56, y34, u34
    296    addps      %5, %5, %3                   ; w56, x56, w34, x34
    297
    298    vpermilps  %6, %6, perm                 ; y56, u56, y43, u43
    299    vpermilps  %5, %5, perm                 ; w56, x56, w43, x43
    300
    301    subps      %4, %2, %6                   ; odd  part 2
    302    addps      %3, %2, %6                   ; odd  part 1
    303
    304    subps      %2, %1, %5                   ; even part 2
    305    addps      %1, %1, %5                   ; even part 1
           ; drop the macro-local aliases so later code can't use them stale
    306 %undef mask
    307 %undef perm
    308 %endmacro
    309 
    310 ; Single 15-point complex FFT
    311 ; Input:
    312 ; xm0 must contain in[0,1].reim
    313 ; m2 - in[3-6].reim
    314 ; m3 - in[7-11].reim
    315 ; m4 - in[12-15].reim
    316 ; xm5 must contain in[2].reimreim
    317 ;
    318 ; Output:
    319 ; m0, m1, m2 - ACs
    320 ; xm14 - out[0]
    321 ; xm15 - out[10, 5]
    322 %macro FFT15 0
           ; NOTE(review): besides the registers documented above, this also
           ; reads caller-preloaded constants: m8 (s15 permute indices via
           ; vpermps), xm9/m9 and m10 (twiddle tables), m11 (sign masks) —
           ; confirm against the transform setup code
    323    shufps xm1, xm0, xm0, q3223      ; in[1].imrereim
    324    shufps xm0, xm0, xm0, q1001      ; in[0].imrereim
    325
    326    xorps xm1, xm11
    327    addps xm1, xm0                   ; pc[0,1].imre
    328
    329    shufps xm0, xm1, xm1, q3232      ; pc[1].reimreim
    330    addps xm0, xm5                   ; dc[0].reimreim
    331
    332    mulps xm1, xm9                   ; tab[0123]*pc[01]
    333
    334    shufpd xm6, xm1, xm1, 01b        ; pc[1,0].reim
    335    xorps xm1, xm11
    336    addps xm1, xm1, xm6
    337    addsubps xm1, xm5, xm1           ; dc[1,2].reim
    338
           ; 5-point sub-transforms on the remaining inputs
    339    subps m7, m2, m3                 ; q[0-3].imre
    340    addps m6, m2, m3                 ; q[4-7]
    341    shufps m7, m7, m7, q2301         ; q[0-3].reim
    342
    343    addps m5, m4, m6                 ; y[0-3]
    344
    345    vperm2f128 m14, m9, m9, 0x11     ; tab[23232323]
    346    vbroadcastsd m15, xm9            ; tab[01010101]
    347
    348    mulps m6, m14
    349    mulps m7, m15
    350
    351    subps m2, m6, m7                 ; k[0-3]
    352    addps m3, m6, m7                 ; k[4-7]
    353
    354    shufps m12, m11, m11, q3232      ; ppppmmmm
    355
    356    addsubps m6, m4, m2              ; k[0-3]
    357    addsubps m7, m4, m3              ; k[4-7]
    358
    359    ; 15pt from here on
    360    vpermpd m2, m5, q0123            ; y[3-0]
    361    vpermpd m3, m6, q0123            ; k[3-0]
    362    vpermpd m4, m7, q0123            ; k[7-4]
    363
    364    xorps m5, m12
    365    xorps m6, m12
    366    xorps m7, m12
    367
    368    addps m2, m5                     ; t[0-3]
    369    addps m3, m6                     ; t[4-7]
    370    addps m4, m7                     ; t[8-11]
    371
    372    movlhps xm14, xm2                ; out[0]
    373    unpcklpd xm15, xm3, xm4          ; out[10,5]
    374    unpckhpd xm5, xm3, xm4           ; out[10,5]
    375
    376    addps xm14, xm2                  ; out[0]
    377    addps xm15, xm5                  ; out[10,5]
    378    addps xm14, xm0                  ; out[0]
    379    addps xm15, xm1                  ; out[10,5]
    380
    381    shufps m12, m10, m10, q3232      ; tab5 4 5 4 5  8  9  8  9
    382    shufps m13, m10, m10, q1010      ; tab5 6 7 6 7 10 11 10 11
    383
    384    mulps m5, m2, m12                ; t[0-3]
    385    mulps m6, m3, m12                ; t[4-7]
    386    mulps m7, m4, m12                ; t[8-11]
    387
    388    mulps m2, m13                    ; r[0-3]
    389    mulps m3, m13                    ; r[4-7]
    390    mulps m4, m13                    ; r[8-11]
    391
    392    shufps m5, m5, m5, q1032         ; t[1,0,3,2].reim
    393    shufps m6, m6, m6, q1032         ; t[5,4,7,6].reim
    394    shufps m7, m7, m7, q1032         ; t[9,8,11,10].reim
    395
    396    vperm2f128 m13, m11, m11, 0x01   ; mmmmmmpp
    397    shufps m12, m11, m11, q3232      ; ppppmmmm
    398
    399    xorps m5, m13
    400    xorps m6, m13
    401    xorps m7, m13
    402
    403    addps m2, m5                     ; r[0,1,2,3]
    404    addps m3, m6                     ; r[4,5,6,7]
    405    addps m4, m7                     ; r[8,9,10,11]
    406
    407    shufps m5, m2, m2, q2301
    408    shufps m6, m3, m3, q2301
    409    shufps m7, m4, m4, q2301
    410
    411    xorps m2, m12
    412    xorps m3, m12
    413    xorps m4, m12
    414
    415    vpermpd m5, m5, q0123
    416    vpermpd m6, m6, q0123
    417    vpermpd m7, m7, q0123
    418
    419    addps m5, m2
    420    addps m6, m3
    421    addps m7, m4
    422
           ; apply the s15 output permutation (indices preloaded in m8)
    423    vpermps m5, m8, m5
    424    vpermps m6, m8, m6
    425    vpermps m7, m8, m7
    426
    427    vbroadcastsd m0, xm0             ; dc[0]
    428    vpermpd m2, m1, q1111            ; dc[2]
    429    vbroadcastsd m1, xm1             ; dc[1]
    430
    431    addps m0, m5
    432    addps m1, m6
    433    addps m2, m7
    434 %endmacro
    435 
    436 ; Combines m0...m8 (tx1[even, even, odd, odd], tx2,3[even], tx2,3[odd]) coeffs
    437 ; Uses all 16 registers.
    438 ; Output is slightly permuted such that tx2,3's coefficients are interleaved
    439 ; on a 2-point basis (look at `doc/transforms.md`)
    440 %macro SPLIT_RADIX_COMBINE 17
           ; %1: when set (and mmsize == 32) the tx2,3 inputs are first split
           ;     across 128-bit lanes with vperm2f128
           ; %2-%5:  tx1 even/odd coeffs, updated in place to the m0/m1 outputs
           ; %6-%9:  tx2,3 even/odd coeffs, updated in place to the m2,3 outputs
           ; %10/%11: cos/wim twiddles (both clobbered)
           ; %12-%17: temporaries
    441 %if %1 && mmsize == 32
    442    vperm2f128 %14, %6, %7, 0x20     ; m2[0], m2[1], m3[0], m3[1] even
    443    vperm2f128 %16, %9, %8, 0x20     ; m2[0], m2[1], m3[0], m3[1] odd
    444    vperm2f128 %15, %6, %7, 0x31     ; m2[2], m2[3], m3[2], m3[3] even
    445    vperm2f128 %17, %9, %8, 0x31     ; m2[2], m2[3], m3[2], m3[3] odd
    446 %endif
    447
    448    shufps     %12, %10, %10, q2200  ; cos00224466
    449    shufps     %13, %11, %11, q1133  ; wim77553311
    450    movshdup   %10, %10              ; cos11335577
    451    shufps     %11, %11, %11, q0022  ; wim66442200
    452
    453 %if %1 && mmsize == 32
    454    shufps     %6, %14, %14, q2301   ; m2[0].imre, m2[1].imre, m2[2].imre, m2[3].imre even
    455    shufps     %8, %16, %16, q2301   ; m2[0].imre, m2[1].imre, m2[2].imre, m2[3].imre odd
    456    shufps     %7, %15, %15, q2301   ; m3[0].imre, m3[1].imre, m3[2].imre, m3[3].imre even
    457    shufps     %9, %17, %17, q2301   ; m3[0].imre, m3[1].imre, m3[2].imre, m3[3].imre odd
    458
    459    mulps      %14, %14, %13         ; m2[0123]reim * wim7531 even
    460    mulps      %16, %16, %11         ; m2[0123]reim * wim7531 odd
    461    mulps      %15, %15, %13         ; m3[0123]reim * wim7531 even
    462    mulps      %17, %17, %11         ; m3[0123]reim * wim7531 odd
    463 %else
    464    mulps      %14, %6, %13          ; m2,3[01]reim * wim7531 even
    465    mulps      %16, %8, %11          ; m2,3[01]reim * wim7531 odd
    466    mulps      %15, %7, %13          ; m2,3[23]reim * wim7531 even
    467    mulps      %17, %9, %11          ; m2,3[23]reim * wim7531 odd
    468    ; reorder the multiplies to save movs reg, reg in the %if above
    469    shufps     %6, %6, %6, q2301     ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even
    470    shufps     %8, %8, %8, q2301     ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre odd
    471    shufps     %7, %7, %7, q2301     ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even
    472    shufps     %9, %9, %9, q2301     ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre odd
    473 %endif
    474
    475 %if cpuflag(fma3) ; 11 - 5 = 6 instructions saved through FMA!
    476    fmaddsubps %6, %6, %12, %14      ; w[0..8] even
    477    fmaddsubps %8, %8, %10, %16      ; w[0..8] odd
    478    fmsubaddps %7, %7, %12, %15      ; j[0..8] even
    479    fmsubaddps %9, %9, %10, %17      ; j[0..8] odd
    480    movaps     %13, [mask_pmpmpmpm]  ; "subaddps? pfft, who needs that!"
    481 %else
    482    mulps      %6, %6, %12           ; m2,3[01]imre * cos0246
    483    mulps      %8, %8, %10           ; m2,3[01]imre * cos0246
    484    movaps     %13, [mask_pmpmpmpm]  ; "subaddps? pfft, who needs that!"
    485    mulps      %7, %7, %12           ; m2,3[23]reim * cos0246
    486    mulps      %9, %9, %10           ; m2,3[23]reim * cos0246
    487    addsubps   %6, %6, %14           ; w[0..8]
    488    addsubps   %8, %8, %16           ; w[0..8]
    489    xorps      %15, %15, %13         ; +-m2,3[23]imre * wim7531
    490    xorps      %17, %17, %13         ; +-m2,3[23]imre * wim7531
    491    addps      %7, %7, %15           ; j[0..8]
    492    addps      %9, %9, %17           ; j[0..8]
    493 %endif
    494
    495    addps      %14, %6, %7           ; t10235476 even
    496    addps      %16, %8, %9           ; t10235476 odd
    497    subps      %15, %6, %7           ; +-r[0..7] even
    498    subps      %17, %8, %9           ; +-r[0..7] odd
    499
    500    shufps     %14, %14, %14, q2301  ; t[0..7] even
    501    shufps     %16, %16, %16, q2301  ; t[0..7] odd
    502    xorps      %15, %15, %13         ; r[0..7] even
    503    xorps      %17, %17, %13         ; r[0..7] odd
    504
    505    subps      %6, %2, %14           ; m2,3[01] even
    506    subps      %8, %4, %16           ; m2,3[01] odd
    507    subps      %7, %3, %15           ; m2,3[23] even
    508    subps      %9, %5, %17           ; m2,3[23] odd
    509
    510    addps      %2, %2, %14           ; m0 even
    511    addps      %4, %4, %16           ; m0 odd
    512    addps      %3, %3, %15           ; m1 even
    513    addps      %5, %5, %17           ; m1 odd
    514 %endmacro
    515 
    516 ; Same as above, only does one parity at a time, takes 3 temporary registers,
    517 ; however, if the twiddles aren't needed after this, the registers they use
    518 ; can be used as any of the temporary registers.
    519 %macro SPLIT_RADIX_COMBINE_HALF 10
           ; %1: parity select — 1 uses the even-lane twiddles (cos0022..,
           ;     wim7755..), 0 uses the odd-lane twiddles (cos1133.., wim6644..)
    520 %if %1
    521    shufps     %8, %6, %6, q2200     ; cos00224466
    522    shufps     %9, %7, %7, q1133     ; wim77553311
    523 %else
    524    shufps     %8, %6, %6, q3311     ; cos11335577
    525    shufps     %9, %7, %7, q0022     ; wim66442200
    526 %endif
    527
    528    mulps      %10, %4, %9           ; m2,3[01]reim * wim7531 even
    529    mulps      %9, %9, %5            ; m2,3[23]reim * wim7531 even
    530
    531    shufps     %4, %4, %4, q2301     ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even
    532    shufps     %5, %5, %5, q2301     ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even
    533
    534 %if cpuflag(fma3)
    535    fmaddsubps %4, %4, %8, %10       ; w[0..8] even
    536    fmsubaddps %5, %5, %8, %9        ; j[0..8] even
    537    movaps     %10, [mask_pmpmpmpm]
    538 %else
    539    mulps      %4, %4, %8            ; m2,3[01]imre * cos0246
    540    mulps      %5, %5, %8            ; m2,3[23]reim * cos0246
    541    addsubps   %4, %4, %10           ; w[0..8]
    542    movaps     %10, [mask_pmpmpmpm]
    543    xorps      %9, %9, %10           ; +-m2,3[23]imre * wim7531
    544    addps      %5, %5, %9            ; j[0..8]
    545 %endif
    546
    547    addps      %8, %4, %5            ; t10235476
    548    subps      %9, %4, %5            ; +-r[0..7]
    549
    550    shufps     %8, %8, %8, q2301     ; t[0..7]
    551    xorps      %9, %9, %10           ; r[0..7]
    552
    553    subps      %4, %2, %8            ; m2,3[01]
    554    subps      %5, %3, %9            ; m2,3[23]
    555
    556    addps      %2, %2, %8            ; m0
    557    addps      %3, %3, %9            ; m1
    558 %endmacro
    559 
    560 ; Same as above, tries REALLY hard to use 2 temporary registers.
    561 %macro SPLIT_RADIX_COMBINE_LITE 9
           ; %1: parity select, as in SPLIT_RADIX_COMBINE_HALF; the wim shuffle
           ; is redone mid-macro so only %8/%9 are needed as temporaries
    562 %if %1
    563    shufps     %8, %6, %6, q2200        ; cos00224466
    564    shufps     %9, %7, %7, q1133        ; wim77553311
    565 %else
    566    shufps     %8, %6, %6, q3311        ; cos11335577
    567    shufps     %9, %7, %7, q0022        ; wim66442200
    568 %endif
    569
    570    mulps      %9, %9, %4               ; m2,3[01]reim * wim7531 even
    571    shufps     %4, %4, %4, q2301        ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even
    572
    573 %if cpuflag(fma3)
    574    fmaddsubps %4, %4, %8, %9           ; w[0..8] even
    575 %else
    576    mulps      %4, %4, %8               ; m2,3[01]imre * cos0246
    577    addsubps   %4, %4, %9               ; w[0..8]
    578 %endif
    579
    580 %if %1
    581    shufps     %9, %7, %7, q1133        ; wim77553311
    582 %else
    583    shufps     %9, %7, %7, q0022        ; wim66442200
    584 %endif
    585
    586    mulps      %9, %9, %5               ; m2,3[23]reim * wim7531 even
    587    shufps     %5, %5, %5, q2301        ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even
           ; NOTE(review): `cpuflag (fma3)` has a stray space, unlike the
           ; `cpuflag(fma3)` spelling used elsewhere in this file — verify the
           ; preprocessor still expands it, then normalize
    588 %if cpuflag (fma3)
    589    fmsubaddps %5, %5, %8, %9           ; j[0..8] even
    590 %else
    591    mulps      %5, %5, %8               ; m2,3[23]reim * cos0246
    592    xorps      %9, %9, [mask_pmpmpmpm]  ; +-m2,3[23]imre * wim7531
    593    addps      %5, %5, %9               ; j[0..8]
    594 %endif
    595
    596    addps      %8, %4, %5               ; t10235476
    597    subps      %9, %4, %5               ; +-r[0..7]
    598
    599    shufps     %8, %8, %8, q2301        ; t[0..7]
    600    xorps      %9, %9, [mask_pmpmpmpm]  ; r[0..7]
    601
    602    subps      %4, %2, %8               ; m2,3[01]
    603    subps      %5, %3, %9               ; m2,3[23]
    604
    605    addps      %2, %2, %8               ; m0
    606    addps      %3, %3, %9               ; m1
    607 %endmacro
    608 
    609 %macro SPLIT_RADIX_COMBINE_64 0
           ; NOTE(review): relies on register aliases defined at the use site
           ; (tx1_e0/tx1_o0/tx2_e0/tx2_o0, their _1 counterparts, tw_e, tw_o,
           ; tmp1, tmp2) and on outq pointing at the output buffer — confirm
           ; against the caller
    610    SPLIT_RADIX_COMBINE_LITE 1, m0, m1, tx1_e0, tx2_e0, tw_e, tw_o, tmp1, tmp2
    611
    612    movaps [outq +  0*mmsize], m0
    613    movaps [outq +  4*mmsize], m1
    614    movaps [outq +  8*mmsize], tx1_e0
    615    movaps [outq + 12*mmsize], tx2_e0
    616
    617    SPLIT_RADIX_COMBINE_HALF 0, m2, m3, tx1_o0, tx2_o0, tw_e, tw_o, tmp1, tmp2, m0
    618
    619    movaps [outq +  2*mmsize], m2
    620    movaps [outq +  6*mmsize], m3
    621    movaps [outq + 10*mmsize], tx1_o0
    622    movaps [outq + 14*mmsize], tx2_o0
    623
           ; reload the next set of twiddles from the 64-point table
    624    movaps tw_e,           [tab_64_float + mmsize]
    625    vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23
    626
    627    movaps m0, [outq +  1*mmsize]
    628    movaps m1, [outq +  3*mmsize]
    629    movaps m2, [outq +  5*mmsize]
    630    movaps m3, [outq +  7*mmsize]
    631
    632    SPLIT_RADIX_COMBINE 0, m0, m2, m1, m3, tx1_e1, tx2_e1, tx1_o1, tx2_o1, tw_e, tw_o, \
    633                           tmp1, tmp2, tx2_o0, tx1_o0, tx2_e0, tx1_e0 ; temporary registers
    634
    635    movaps [outq +  1*mmsize], m0
    636    movaps [outq +  3*mmsize], m1
    637    movaps [outq +  5*mmsize], m2
    638    movaps [outq +  7*mmsize], m3
    639
    640    movaps [outq +  9*mmsize], tx1_e1
    641    movaps [outq + 11*mmsize], tx1_o1
    642    movaps [outq + 13*mmsize], tx2_e1
    643    movaps [outq + 15*mmsize], tx2_o1
    644 %endmacro
    645 
    646 ; Perform a single even/odd split radix combination with loads and stores
    647 ; The _4 indicates this is a quarter of the iterations required to complete a full
    648 ; combine loop
    649 ; %1 must contain len*2, %2 must contain len*4, %3 must contain len*6
    650 %macro SPLIT_RADIX_LOAD_COMBINE_4 8
           ; %4 - output block index, %5 - twiddle table index
           ; %6/%7/%8 - extra byte offsets for outq/rtabq/itabq respectively
    651    movaps m8,         [rtabq + (%5)*mmsize + %7]
    652    vperm2f128 m9, m9, [itabq - (%5)*mmsize + %8], 0x23
    653
    654    movaps m0, [outq +      (0 + %4)*mmsize + %6]
    655    movaps m2, [outq +      (2 + %4)*mmsize + %6]
    656    movaps m1, [outq + %1 + (0 + %4)*mmsize + %6]
    657    movaps m3, [outq + %1 + (2 + %4)*mmsize + %6]
    658
    659    movaps m4, [outq + %2 + (0 + %4)*mmsize + %6]
    660    movaps m6, [outq + %2 + (2 + %4)*mmsize + %6]
    661    movaps m5, [outq + %3 + (0 + %4)*mmsize + %6]
    662    movaps m7, [outq + %3 + (2 + %4)*mmsize + %6]
    663
    664    SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
    665                           m4, m5, m6, m7, \
    666                           m8, m9, \
    667                           m10, m11, m12, m13, m14, m15
    668
           ; store the combined coefficients back in place
    669    movaps [outq +      (0 + %4)*mmsize + %6], m0
    670    movaps [outq +      (2 + %4)*mmsize + %6], m2
    671    movaps [outq + %1 + (0 + %4)*mmsize + %6], m1
    672    movaps [outq + %1 + (2 + %4)*mmsize + %6], m3
    673
    674    movaps [outq + %2 + (0 + %4)*mmsize + %6], m4
    675    movaps [outq + %2 + (2 + %4)*mmsize + %6], m6
    676    movaps [outq + %3 + (0 + %4)*mmsize + %6], m5
    677    movaps [outq + %3 + (2 + %4)*mmsize + %6], m7
    678 %endmacro
    679 
    680 %macro SPLIT_RADIX_LOAD_COMBINE_FULL 2-5
        ; %1 - len*2, %2 - len*6 (len*4 is derived below as 2*%1)
        ; optional: %3 - coefficient (outq) offset, %4 - rtabq offset,
        ;           %5 - itabq offset; each defaults to 0 when omitted
    681 %if %0 > 2
    682 %define offset_c %3
    683 %else
    684 %define offset_c 0
    685 %endif
    686 %if %0 > 3
    687 %define offset_r %4
    688 %else
    689 %define offset_r 0
    690 %endif
    691 %if %0 > 4
    692 %define offset_i %5
    693 %else
    694 %define offset_i 0
    695 %endif
    696
    697    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 0, 0, offset_c, offset_r, offset_i
    698    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 1, 1, offset_c, offset_r, offset_i
    699    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 4, 2, offset_c, offset_r, offset_i
    700    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 5, 3, offset_c, offset_r, offset_i
    701 %endmacro
    702 
; Perform a single even/odd split radix combination with loads, deinterleaves and
; stores. The _2 indicates this is a half of the iterations required to complete
; a full combine+deinterleave loop
; %1 = output offset, in units of mmsize
; %2 = twiddle-table offset, in units of mmsize (rtabq read forwards, itabq backwards)
; %3 must contain len*2, %4 must contain len*4, %5 must contain len*6
; %6 = extra byte offset applied to every output address
; Clobbers m0-m15; reads rtabq/itabq, reads and writes [outq + ...]
%macro SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 6
    movaps m8,         [rtabq + (0 + %2)*mmsize]       ; even twiddles
    vperm2f128 m9, m9, [itabq - (0 + %2)*mmsize], 0x23 ; odd twiddles, lanes swapped

    ; first set of 8 registers: even (m0-m3) and odd (m4-m7) halves
    movaps m0, [outq +      (0 + 0 + %1)*mmsize + %6]
    movaps m2, [outq +      (2 + 0 + %1)*mmsize + %6]
    movaps m1, [outq + %3 + (0 + 0 + %1)*mmsize + %6]
    movaps m3, [outq + %3 + (2 + 0 + %1)*mmsize + %6]

    movaps m4, [outq + %4 + (0 + 0 + %1)*mmsize + %6]
    movaps m6, [outq + %4 + (2 + 0 + %1)*mmsize + %6]
    movaps m5, [outq + %5 + (0 + 0 + %1)*mmsize + %6]
    movaps m7, [outq + %5 + (2 + 0 + %1)*mmsize + %6]

    SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
       m4, m5, m6, m7, \
       m8, m9, \
       m10, m11, m12, m13, m14, m15 ; temporary registers

    ; deinterleave: split each combined register into low/high complex pairs
    unpckhpd m10, m0, m2
    unpckhpd m11, m1, m3
    unpckhpd m12, m4, m6
    unpckhpd m13, m5, m7
    unpcklpd m0, m0, m2
    unpcklpd m1, m1, m3
    unpcklpd m4, m4, m6
    unpcklpd m5, m5, m7

    ; store only the lower lanes now; the upper lanes are kept for the
    ; second pass below (they land in the "+ 1" output slots)
    vextractf128 [outq +      (0 + 0 + %1)*mmsize + %6 +  0], m0,  0
    vextractf128 [outq +      (0 + 0 + %1)*mmsize + %6 + 16], m10, 0
    vextractf128 [outq + %3 + (0 + 0 + %1)*mmsize + %6 +  0], m1,  0
    vextractf128 [outq + %3 + (0 + 0 + %1)*mmsize + %6 + 16], m11, 0

    vextractf128 [outq + %4 + (0 + 0 + %1)*mmsize + %6 +  0], m4,  0
    vextractf128 [outq + %4 + (0 + 0 + %1)*mmsize + %6 + 16], m12, 0
    vextractf128 [outq + %5 + (0 + 0 + %1)*mmsize + %6 +  0], m5,  0
    vextractf128 [outq + %5 + (0 + 0 + %1)*mmsize + %6 + 16], m13, 0

    ; move the remaining upper lanes into the low halves of m10-m13
    vperm2f128 m10, m10, m0, 0x13
    vperm2f128 m11, m11, m1, 0x13
    vperm2f128 m12, m12, m4, 0x13
    vperm2f128 m13, m13, m5, 0x13

    ; second iteration: next twiddle pair
    movaps m8,         [rtabq + (1 + %2)*mmsize]
    vperm2f128 m9, m9, [itabq - (1 + %2)*mmsize], 0x23

    movaps m0, [outq +      (0 + 1 + %1)*mmsize + %6]
    movaps m2, [outq +      (2 + 1 + %1)*mmsize + %6]
    movaps m1, [outq + %3 + (0 + 1 + %1)*mmsize + %6]
    movaps m3, [outq + %3 + (2 + 1 + %1)*mmsize + %6]

    ; flush the deferred stores from the first pass now that the
    ; overlapping inputs have been loaded
    movaps [outq +      (0 + 1 + %1)*mmsize + %6], m10 ; m0 conflict
    movaps [outq + %3 + (0 + 1 + %1)*mmsize + %6], m11 ; m1 conflict

    movaps m4, [outq + %4 + (0 + 1 + %1)*mmsize + %6]
    movaps m6, [outq + %4 + (2 + 1 + %1)*mmsize + %6]
    movaps m5, [outq + %5 + (0 + 1 + %1)*mmsize + %6]
    movaps m7, [outq + %5 + (2 + 1 + %1)*mmsize + %6]

    movaps [outq + %4 + (0 + 1 + %1)*mmsize + %6], m12 ; m4 conflict
    movaps [outq + %5 + (0 + 1 + %1)*mmsize + %6], m13 ; m5 conflict

    SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
                           m4, m5, m6, m7, \
                           m8, m9, \
                           m10, m11, m12, m13, m14, m15 ; temporary registers

    unpcklpd m8,  m0, m2
    unpcklpd m9,  m1, m3
    unpcklpd m10, m4, m6
    unpcklpd m11, m5, m7
    unpckhpd m0, m0, m2
    unpckhpd m1, m1, m3
    unpckhpd m4, m4, m6
    unpckhpd m5, m5, m7

    ; this time both lanes can be stored directly
    vextractf128 [outq +      (2 + 0 + %1)*mmsize + %6 +  0], m8,  0
    vextractf128 [outq +      (2 + 0 + %1)*mmsize + %6 + 16], m0,  0
    vextractf128 [outq +      (2 + 1 + %1)*mmsize + %6 +  0], m8,  1
    vextractf128 [outq +      (2 + 1 + %1)*mmsize + %6 + 16], m0,  1

    vextractf128 [outq + %3 + (2 + 0 + %1)*mmsize + %6 +  0], m9,  0
    vextractf128 [outq + %3 + (2 + 0 + %1)*mmsize + %6 + 16], m1,  0
    vextractf128 [outq + %3 + (2 + 1 + %1)*mmsize + %6 +  0], m9,  1
    vextractf128 [outq + %3 + (2 + 1 + %1)*mmsize + %6 + 16], m1,  1

    vextractf128 [outq + %4 + (2 + 0 + %1)*mmsize + %6 +  0], m10, 0
    vextractf128 [outq + %4 + (2 + 0 + %1)*mmsize + %6 + 16], m4,  0
    vextractf128 [outq + %4 + (2 + 1 + %1)*mmsize + %6 +  0], m10, 1
    vextractf128 [outq + %4 + (2 + 1 + %1)*mmsize + %6 + 16], m4,  1

    vextractf128 [outq + %5 + (2 + 0 + %1)*mmsize + %6 +  0], m11, 0
    vextractf128 [outq + %5 + (2 + 0 + %1)*mmsize + %6 + 16], m5,  0
    vextractf128 [outq + %5 + (2 + 1 + %1)*mmsize + %6 +  0], m11, 1
    vextractf128 [outq + %5 + (2 + 1 + %1)*mmsize + %6 + 16], m5,  1
%endmacro
    803 
; Run both halves of the combine+deinterleave loop.
; %1 = len*2, %2 = len*6 (forwarded to SPLIT_RADIX_COMBINE_DEINTERLEAVE_2)
; %3 (optional) = extra byte offset applied to every output address (default 0)
%macro SPLIT_RADIX_COMBINE_DEINTERLEAVE_FULL 2-3
%if %0 > 2
%define offset %3
%else
%define offset 0
%endif
    SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 0, 0, %1, %1*2, %2, offset ; first half
    SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 4, 2, %1, %1*2, %2, offset ; second half
%endmacro
    813 
INIT_XMM sse3
; 2-point FFT, intra-asm version: called with pointers already set up,
; returns via bare ret (no prologue/epilogue, see the calling convention
; at the top of the file)
cglobal fft2_asm_float, 0, 0, 0, ctx, out, in, stride
    movaps m0, [inq]    ; both complex points in one register
    FFT2 m0, m1
    movaps [outq], m0
    ret

; 2-point FFT, C-callable version
cglobal fft2_float, 4, 4, 2, ctx, out, in, stride
    movaps m0, [inq]
    FFT2 m0, m1
    movaps [outq], m0
    RET
    826 
; 4-point FFT
; %1 = name suffix (fwd/inv)
; %2 = 1 to reorder the loaded inputs (inverse transform), 0 for forward
; %3 = 1 to emit the intra-asm (_asm, bare-ret) version, 0 for the C-callable one
%macro FFT4_FN 3
INIT_XMM sse2
%if %3
cglobal fft4_ %+ %1 %+ _asm_float, 0, 0, 0, ctx, out, in, stride
%else
cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride
%endif
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]

%if %2
    ; inverse: reorder the inputs across both registers
    shufps m2, m1, m0, q3210
    shufps m0, m0, m1, q3210
    movaps m1, m2
%endif

    FFT4 m0, m1, m2

    ; deinterleave the result before storing
    unpcklpd m2, m0, m1
    unpckhpd m0, m0, m1

    movaps [outq + 0*mmsize], m2
    movaps [outq + 1*mmsize], m0

%if %3
    ret       ; intra-asm: caller continues in assembly
%else
    RET
%endif
%endmacro

FFT4_FN fwd, 0, 0
FFT4_FN fwd, 0, 1
FFT4_FN inv, 1, 0
FFT4_FN inv, 1, 1
    862 
; 8-point FFT (SSE3)
; %1 = 1 to emit the intra-asm (_asm) version with contiguous loads plus the
;      fft8_ns (no-scatter) wrapper; 0 for the C-callable version that gathers
;      its input through the context's map
%macro FFT8_SSE_FN 1
INIT_XMM sse3
%if %1
cglobal fft8_asm_float, 0, 0, 0, ctx, out, in, stride, tmp
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
    movaps m2, [inq + 2*mmsize]
    movaps m3, [inq + 3*mmsize]
%else
cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp
    mov ctxq, [ctxq + AVTXContext.map]   ; ctx no longer needed, reuse as LUT ptr
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq
    LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq
    LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq
%endif

    FFT8 m0, m1, m2, m3, m4, m5

    ; deinterleave into output order
    unpcklpd m4, m0, m3
    unpcklpd m5, m1, m2
    unpckhpd m0, m0, m3
    unpckhpd m1, m1, m2

    movups [outq + 0*mmsize], m4
    movups [outq + 1*mmsize], m0
    movups [outq + 2*mmsize], m5
    movups [outq + 3*mmsize], m1

%if %1
    ret
%else
    RET
%endif

%if %1
; no-scatter entry point: thin C-callable wrapper around the _asm version
cglobal fft8_ns_float, 4, 5, 6, ctx, out, in, stride, tmp
    call mangle(ff_tx_fft8_asm_float_sse3)
    RET
%endif
%endmacro

FFT8_SSE_FN 0
FFT8_SSE_FN 1
    907 
; 8-point FFT (AVX, 256-bit registers)
; %1 = 1 to emit the intra-asm (_asm) version plus the fft8_ns wrapper;
;      0 for the C-callable version that gathers input through the map
%macro FFT8_AVX_FN 1
INIT_YMM avx
%if %1
cglobal fft8_asm_float, 0, 0, 0, ctx, out, in, stride, tmp
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
%else
cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp
    mov ctxq, [ctxq + AVTXContext.map]   ; ctx no longer needed, reuse as LUT ptr
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m2
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m3
%endif

    FFT8_AVX m0, m1, m2, m3

    unpcklpd m2, m0, m1
    unpckhpd m0, m0, m1

    ; Around 2% faster than 2x vperm2f128 + 2x movapd
    vextractf128 [outq + 16*0], m2, 0
    vextractf128 [outq + 16*1], m0, 0
    vextractf128 [outq + 16*2], m2, 1
    vextractf128 [outq + 16*3], m0, 1

%if %1
    ret
%else
    RET
%endif

%if %1
; no-scatter entry point: thin C-callable wrapper around the _asm version
cglobal fft8_ns_float, 4, 5, 4, ctx, out, in, stride, tmp
    call mangle(ff_tx_fft8_asm_float_avx)
    RET
%endif
%endmacro

FFT8_AVX_FN 0
FFT8_AVX_FN 1
    947 
; 16-point FFT
; %1 = instruction set (avx/fma3)
; %2 = 1 to emit the intra-asm (_asm) version plus the fft16_ns wrapper;
;      0 for the C-callable version that gathers input through the map
%macro FFT16_FN 2
INIT_YMM %1
%if %2
cglobal fft16_asm_float, 0, 0, 0, ctx, out, in, stride, tmp
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
    movaps m2, [inq + 2*mmsize]
    movaps m3, [inq + 3*mmsize]
%else
cglobal fft16_float, 4, 4, 8, ctx, out, in, tmp
    mov ctxq, [ctxq + AVTXContext.map]   ; ctx no longer needed, reuse as LUT ptr
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m4
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m5
    LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m6
    LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m7
%endif

    FFT16 m0, m1, m2, m3, m4, m5, m6, m7

    ; deinterleave
    unpcklpd m5, m1, m3
    unpcklpd m4, m0, m2
    unpckhpd m1, m1, m3
    unpckhpd m0, m0, m2

    vextractf128 [outq + 16*0], m4, 0
    vextractf128 [outq + 16*1], m0, 0
    vextractf128 [outq + 16*2], m4, 1
    vextractf128 [outq + 16*3], m0, 1
    vextractf128 [outq + 16*4], m5, 0
    vextractf128 [outq + 16*5], m1, 0
    vextractf128 [outq + 16*6], m5, 1
    vextractf128 [outq + 16*7], m1, 1

%if %2
    ret
%else
    RET
%endif

%if %2
; no-scatter entry point: thin C-callable wrapper around the _asm version
cglobal fft16_ns_float, 4, 5, 8, ctx, out, in, stride, tmp
    call mangle(ff_tx_fft16_asm_float_ %+ %1)
    RET
%endif
%endmacro

FFT16_FN avx,  0
FFT16_FN avx,  1
FFT16_FN fma3, 0
FFT16_FN fma3, 1
    998 
; 32-point FFT: an 8-point transform on the odd half combined with a 16-point
; transform on the even half (split radix), followed by deinterleaving.
; %1 = instruction set (avx/fma3)
; %2 = 1 to emit the intra-asm (_asm) version plus the fft32_ns wrapper;
;      0 for the C-callable version that gathers input through the map
%macro FFT32_FN 2
INIT_YMM %1
%if %2
cglobal fft32_asm_float, 0, 0, 0, ctx, out, in, stride, tmp
    movaps m4, [inq + 4*mmsize]
    movaps m5, [inq + 5*mmsize]
    movaps m6, [inq + 6*mmsize]
    movaps m7, [inq + 7*mmsize]
%else
cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp
    mov ctxq, [ctxq + AVTXContext.map]   ; ctx no longer needed, reuse as LUT ptr
    LOAD64_LUT m4, inq, ctxq, (mmsize/2)*4, tmpq,  m8, m12
    LOAD64_LUT m5, inq, ctxq, (mmsize/2)*5, tmpq,  m9, m13
    LOAD64_LUT m6, inq, ctxq, (mmsize/2)*6, tmpq, m10, m14
    LOAD64_LUT m7, inq, ctxq, (mmsize/2)*7, tmpq, m11, m15
%endif

    FFT8 m4, m5, m6, m7, m8, m9        ; odd half

%if %2
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
    movaps m2, [inq + 2*mmsize]
    movaps m3, [inq + 3*mmsize]
%else
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq,  m8, m12
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq,  m9, m13
    LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m10, m14
    LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m11, m15
%endif

    ; twiddles: forward table at the start, inverse lanes from the end
    movaps m8,         [tab_32_float]
    vperm2f128 m9, m9, [tab_32_float + 4*8 - 4*7], 0x23

    FFT16 m0, m1, m2, m3, m10, m11, m12, m13   ; even half

    SPLIT_RADIX_COMBINE 1, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, \
                           m10, m11, m12, m13, m14, m15 ; temporary registers

    ; deinterleave
    unpcklpd  m9, m1, m3
    unpcklpd m10, m5, m7
    unpcklpd  m8, m0, m2
    unpcklpd m11, m4, m6
    unpckhpd  m1, m1, m3
    unpckhpd  m5, m5, m7
    unpckhpd  m0, m0, m2
    unpckhpd  m4, m4, m6

    vextractf128 [outq + 16* 0],  m8, 0
    vextractf128 [outq + 16* 1],  m0, 0
    vextractf128 [outq + 16* 2],  m8, 1
    vextractf128 [outq + 16* 3],  m0, 1
    vextractf128 [outq + 16* 4],  m9, 0
    vextractf128 [outq + 16* 5],  m1, 0
    vextractf128 [outq + 16* 6],  m9, 1
    vextractf128 [outq + 16* 7],  m1, 1

    vextractf128 [outq + 16* 8], m11, 0
    vextractf128 [outq + 16* 9],  m4, 0
    vextractf128 [outq + 16*10], m11, 1
    vextractf128 [outq + 16*11],  m4, 1
    vextractf128 [outq + 16*12], m10, 0
    vextractf128 [outq + 16*13],  m5, 0
    vextractf128 [outq + 16*14], m10, 1
    vextractf128 [outq + 16*15],  m5, 1

%if %2
    ret
%else
    RET
%endif

%if %2
; no-scatter entry point: thin C-callable wrapper around the _asm version
cglobal fft32_ns_float, 4, 5, 16, ctx, out, in, stride, tmp
    call mangle(ff_tx_fft32_asm_float_ %+ %1)
    RET
%endif
%endmacro

%if ARCH_X86_64
FFT32_FN avx,  0
FFT32_FN avx,  1
FFT32_FN fma3, 0
FFT32_FN fma3, 1
%endif
   1084 
; Emit one large power-of-two synthesis step of the split-radix FFT (used for
; sizes 2048 and above inside FFT_SPLIT_RADIX_FN below).
; %1 = transform length completed by this step
; %2 (optional) = label of the next larger step to jump to when the requested
;     length is bigger; when omitted this is the largest supported size
%macro FFT_SPLIT_RADIX_DEF 1-2
ALIGN 16
.%1 %+ pt:
    PUSH lenq
    mov lenq, (%1/4)          ; quarter length for the two recursive calls

    add outq, (%1*4) - (%1/1)
    call .32pt

    add outq, (%1*2) - (%1/2) ; the synth loops also increment outq
    call .32pt

    POP lenq
    sub outq, (%1*4) + (%1*2) + (%1/2) ; rewind outq to the start of this block

    lea rtabq, [tab_ %+ %1 %+ _float]            ; twiddles, read forwards
    lea itabq, [tab_ %+ %1 %+ _float + %1 - 4*7] ; twiddles, read backwards

%if %0 > 1
    cmp tgtq, %1
    je .deinterleave          ; final pass: combine + deinterleave instead

    mov tmpq, %1              ; synth loop counter

.synth_ %+ %1:
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*%1, 6*%1, 0, 0, 0
    add outq, 8*mmsize
    add rtabq, 4*mmsize
    sub itabq, 4*mmsize
    sub tmpq, 4*mmsize
    jg .synth_ %+ %1

    cmp lenq, %1
    jg %2 ; can't do math here, nasm doesn't get it
    ret
%endif
%endmacro
   1122 
; Main split-radix FFT entry point, sizes 32 up to 2097152.
; %1 = instruction set (avx/fma3/avx2)
; %2 = 1 to emit the intra-asm (_asm) version (contiguous input, bare ret)
;      plus the fft_sr_ns (no-scatter) wrapper; 0 for the C-callable version
;      that gathers input through the context's map.
; Register roles throughout: lenq = current sub-transform length, tgtq = full
; transform length, lutq = input map, rtabq/itabq = twiddle tables read
; forwards/backwards.
%macro FFT_SPLIT_RADIX_FN 2
INIT_YMM %1
%if %2
cglobal fft_sr_asm_float, 0, 0, 0, ctx, out, in, stride, len, lut, itab, rtab, tgt, tmp
%else
cglobal fft_sr_float, 4, 10, 16, 272, ctx, out, in, stride, len, lut, itab, rtab, tgt, tmp
    movsxd lenq, dword [ctxq + AVTXContext.len]
    mov lutq, [ctxq + AVTXContext.map]
%endif
    mov tgtq, lenq            ; keep the full length for the final-pass checks

; Bottom-most/32-point transform ===============================================
ALIGN 16
.32pt:
%if %2
    movaps m4, [inq + 4*mmsize]
    movaps m5, [inq + 5*mmsize]
    movaps m6, [inq + 6*mmsize]
    movaps m7, [inq + 7*mmsize]
%else
    LOAD64_LUT m4, inq, lutq, (mmsize/2)*4, tmpq,  m8, m12
    LOAD64_LUT m5, inq, lutq, (mmsize/2)*5, tmpq,  m9, m13
    LOAD64_LUT m6, inq, lutq, (mmsize/2)*6, tmpq, m10, m14
    LOAD64_LUT m7, inq, lutq, (mmsize/2)*7, tmpq, m11, m15
%endif

    FFT8 m4, m5, m6, m7, m8, m9         ; odd half

%if %2
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
    movaps m2, [inq + 2*mmsize]
    movaps m3, [inq + 3*mmsize]
%else
    LOAD64_LUT m0, inq, lutq, (mmsize/2)*0, tmpq,  m8, m12
    LOAD64_LUT m1, inq, lutq, (mmsize/2)*1, tmpq,  m9, m13
    LOAD64_LUT m2, inq, lutq, (mmsize/2)*2, tmpq, m10, m14
    LOAD64_LUT m3, inq, lutq, (mmsize/2)*3, tmpq, m11, m15
%endif

    ; 32-point twiddles: forward from the start, reversed lanes from the end
    movaps m8,         [tab_32_float]
    vperm2f128 m9, m9, [tab_32_float + 32 - 4*7], 0x23

    FFT16 m0, m1, m2, m3, m10, m11, m12, m13   ; even half

    SPLIT_RADIX_COMBINE 1, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, \
                           m10, m11, m12, m13, m14, m15 ; temporary registers

    ; odd-index outputs are always stored; the even ones are kept in registers
    ; when continuing to the 64-point stage
    movaps [outq + 1*mmsize], m1
    movaps [outq + 3*mmsize], m3
    movaps [outq + 5*mmsize], m5
    movaps [outq + 7*mmsize], m7

%if %2
    add inq, 8*mmsize
%else
    add lutq, (mmsize/2)*8
%endif
    cmp lenq, 32
    jg .64pt

    ; 32 points was all that was asked for; store the rest and return
    movaps [outq + 0*mmsize], m0
    movaps [outq + 2*mmsize], m2
    movaps [outq + 4*mmsize], m4
    movaps [outq + 6*mmsize], m6

    ret

; 64-point transform ===========================================================
ALIGN 16
.64pt:
; Helper defines, these make it easier to track what's happening
%define tx1_e0 m4
%define tx1_e1 m5
%define tx1_o0 m6
%define tx1_o1 m7
%define tx2_e0 m8
%define tx2_e1 m9
%define tx2_o0 m10
%define tx2_o1 m11
%define tw_e m12
%define tw_o m13
%define tmp1 m14
%define tmp2 m15

    SWAP m4, m1
    SWAP m6, m3

%if %2
    movaps tx1_e0, [inq + 0*mmsize]
    movaps tx1_e1, [inq + 1*mmsize]
    movaps tx1_o0, [inq + 2*mmsize]
    movaps tx1_o1, [inq + 3*mmsize]
%else
    LOAD64_LUT tx1_e0, inq, lutq, (mmsize/2)*0, tmpq, tw_e, tmp1
    LOAD64_LUT tx1_e1, inq, lutq, (mmsize/2)*1, tmpq, tw_o, tmp2
    LOAD64_LUT tx1_o0, inq, lutq, (mmsize/2)*2, tmpq, tw_e, tmp1
    LOAD64_LUT tx1_o1, inq, lutq, (mmsize/2)*3, tmpq, tw_o, tmp2
%endif

    FFT16 tx1_e0, tx1_e1, tx1_o0, tx1_o1, tw_e, tw_o, tx2_o0, tx2_o1

%if %2
    movaps tx2_e0, [inq + 4*mmsize]
    movaps tx2_e1, [inq + 5*mmsize]
    movaps tx2_o0, [inq + 6*mmsize]
    movaps tx2_o1, [inq + 7*mmsize]
%else
    LOAD64_LUT tx2_e0, inq, lutq, (mmsize/2)*4, tmpq, tw_e, tmp1
    LOAD64_LUT tx2_e1, inq, lutq, (mmsize/2)*5, tmpq, tw_o, tmp2
    LOAD64_LUT tx2_o0, inq, lutq, (mmsize/2)*6, tmpq, tw_e, tmp1
    LOAD64_LUT tx2_o1, inq, lutq, (mmsize/2)*7, tmpq, tw_o, tmp2
%endif

    FFT16 tx2_e0, tx2_e1, tx2_o0, tx2_o1, tmp1, tmp2, tw_e, tw_o

    movaps tw_e,           [tab_64_float]
    vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7], 0x23

%if %2
    add inq, 8*mmsize
%else
    add lutq, (mmsize/2)*8
%endif
    cmp tgtq, 64
    je .64pt_deint           ; 64 is the final size: use the deint fast path

    SPLIT_RADIX_COMBINE_64

    cmp lenq, 64
    jg .128pt
    ret

; 128-point transform ==========================================================
ALIGN 16
.128pt:
    PUSH lenq
    mov lenq, 32             ; two quarter-length (32-point) sub-transforms

    add outq, 16*mmsize
    call .32pt

    add outq, 8*mmsize
    call .32pt

    POP lenq
    sub outq, 24*mmsize      ; rewind outq to the start of this block

    lea rtabq, [tab_128_float]
    lea itabq, [tab_128_float + 128 - 4*7]

    cmp tgtq, 128
    je .deinterleave

    SPLIT_RADIX_LOAD_COMBINE_FULL 2*128, 6*128

    cmp lenq, 128
    jg .256pt
    ret

; 256-point transform ==========================================================
ALIGN 16
.256pt:
    PUSH lenq
    mov lenq, 64

    add outq, 32*mmsize
    call .32pt

    add outq, 16*mmsize
    call .32pt

    POP lenq
    sub outq, 48*mmsize

    lea rtabq, [tab_256_float]
    lea itabq, [tab_256_float + 256 - 4*7]

    cmp tgtq, 256
    je .deinterleave

    SPLIT_RADIX_LOAD_COMBINE_FULL 2*256, 6*256
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*256, 6*256, 8*mmsize, 4*mmsize, -4*mmsize

    cmp lenq, 256
    jg .512pt
    ret

; 512-point transform ==========================================================
ALIGN 16
.512pt:
    PUSH lenq
    mov lenq, 128

    add outq, 64*mmsize
    call .32pt

    add outq, 32*mmsize
    call .32pt

    POP lenq
    sub outq, 96*mmsize

    lea rtabq, [tab_512_float]
    lea itabq, [tab_512_float + 512 - 4*7]

    cmp tgtq, 512
    je .deinterleave

    mov tmpq, 4              ; 4 synth iterations

.synth_512:
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*512, 6*512
    add outq, 8*mmsize
    add rtabq, 4*mmsize
    sub itabq, 4*mmsize
    sub tmpq, 1
    jg .synth_512

    cmp lenq, 512
    jg .1024pt
    ret

; 1024-point transform ==========================================================
ALIGN 16
.1024pt:
    PUSH lenq
    mov lenq, 256

    add outq, 96*mmsize
    call .32pt

    add outq, 64*mmsize
    call .32pt

    POP lenq
    sub outq, 192*mmsize

    lea rtabq, [tab_1024_float]
    lea itabq, [tab_1024_float + 1024 - 4*7]

    cmp tgtq, 1024
    je .deinterleave

    mov tmpq, 8              ; 8 synth iterations

.synth_1024:
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*1024, 6*1024
    add outq, 8*mmsize
    add rtabq, 4*mmsize
    sub itabq, 4*mmsize
    sub tmpq, 1
    jg .synth_1024

    cmp lenq, 1024
    jg .2048pt
    ret

; 2048 to 2097152-point transforms =============================================
FFT_SPLIT_RADIX_DEF 2048,  .4096pt
FFT_SPLIT_RADIX_DEF 4096,  .8192pt
FFT_SPLIT_RADIX_DEF 8192,  .16384pt
FFT_SPLIT_RADIX_DEF 16384, .32768pt
FFT_SPLIT_RADIX_DEF 32768, .65536pt
FFT_SPLIT_RADIX_DEF 65536, .131072pt
FFT_SPLIT_RADIX_DEF 131072, .262144pt
FFT_SPLIT_RADIX_DEF 262144, .524288pt
FFT_SPLIT_RADIX_DEF 524288, .1048576pt
FFT_SPLIT_RADIX_DEF 1048576, .2097152pt
FFT_SPLIT_RADIX_DEF 2097152

;===============================================================================
; Final synthesis + deinterleaving code
;===============================================================================
.deinterleave:
%if %2
    PUSH strideq             ; strideq is repurposed as len*6 below
%endif
    mov tgtq, lenq
    imul tmpq, lenq, 2       ; tmpq = len*2
    lea strideq, [4*lenq + tmpq] ; strideq = len*6

.synth_deinterleave:
    SPLIT_RADIX_COMBINE_DEINTERLEAVE_FULL tmpq, strideq
    add outq, 8*mmsize
    add rtabq, 4*mmsize
    sub itabq, 4*mmsize
    sub tgtq, 4*mmsize
    jg .synth_deinterleave

%if %2
    ; intra-asm version: restore the pointers the caller expects
    POP strideq
    sub outq, tmpq
    neg tmpq
    lea inq, [inq + tmpq*4]
    ret
%else
    RET
%endif

; 64-point deinterleave which only has to load 4 registers =====================
.64pt_deint:
    SPLIT_RADIX_COMBINE_LITE 1, m0, m1, tx1_e0, tx2_e0, tw_e, tw_o, tmp1, tmp2
    SPLIT_RADIX_COMBINE_HALF 0, m2, m3, tx1_o0, tx2_o0, tw_e, tw_o, tmp1, tmp2, tw_e

    unpcklpd tmp1, m0, m2
    unpcklpd tmp2, m1, m3
    unpcklpd tw_o, tx1_e0, tx1_o0
    unpcklpd tw_e, tx2_e0, tx2_o0
    unpckhpd m0, m0, m2
    unpckhpd m1, m1, m3
    unpckhpd tx1_e0, tx1_e0, tx1_o0
    unpckhpd tx2_e0, tx2_e0, tx2_o0

    vextractf128 [outq +  0*mmsize +  0], tmp1,   0
    vextractf128 [outq +  0*mmsize + 16], m0,     0
    vextractf128 [outq +  4*mmsize +  0], tmp2,   0
    vextractf128 [outq +  4*mmsize + 16], m1,     0

    vextractf128 [outq +  8*mmsize +  0], tw_o,   0
    vextractf128 [outq +  8*mmsize + 16], tx1_e0, 0
    vextractf128 [outq +  9*mmsize +  0], tw_o,   1
    vextractf128 [outq +  9*mmsize + 16], tx1_e0, 1

    ; keep the upper lanes of tmp1/tmp2 for the stores further below
    vperm2f128 tmp1, tmp1, m0, 0x31
    vperm2f128 tmp2, tmp2, m1, 0x31

    vextractf128 [outq + 12*mmsize +  0], tw_e,   0
    vextractf128 [outq + 12*mmsize + 16], tx2_e0, 0
    vextractf128 [outq + 13*mmsize +  0], tw_e,   1
    vextractf128 [outq + 13*mmsize + 16], tx2_e0, 1

    ; second half of the 64-point twiddles
    movaps tw_e,           [tab_64_float + mmsize]
    vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23

    movaps m0, [outq +  1*mmsize]
    movaps m1, [outq +  3*mmsize]
    movaps m2, [outq +  5*mmsize]
    movaps m3, [outq +  7*mmsize]

    movaps [outq +  1*mmsize], tmp1
    movaps [outq +  5*mmsize], tmp2

    SPLIT_RADIX_COMBINE 0, m0, m2, m1, m3, tx1_e1, tx2_e1, tx1_o1, tx2_o1, tw_e, tw_o, \
                           tmp1, tmp2, tx2_o0, tx1_o0, tx2_e0, tx1_e0 ; temporary registers

    unpcklpd tmp1, m0, m1
    unpcklpd tmp2, m2, m3
    unpcklpd tw_e, tx1_e1, tx1_o1
    unpcklpd tw_o, tx2_e1, tx2_o1
    unpckhpd m0, m0, m1
    unpckhpd m2, m2, m3
    unpckhpd tx1_e1, tx1_e1, tx1_o1
    unpckhpd tx2_e1, tx2_e1, tx2_o1

    vextractf128 [outq +  2*mmsize +  0], tmp1,   0
    vextractf128 [outq +  2*mmsize + 16], m0,     0
    vextractf128 [outq +  3*mmsize +  0], tmp1,   1
    vextractf128 [outq +  3*mmsize + 16], m0,     1

    vextractf128 [outq +  6*mmsize +  0], tmp2,   0
    vextractf128 [outq +  6*mmsize + 16], m2,     0
    vextractf128 [outq +  7*mmsize +  0], tmp2,   1
    vextractf128 [outq +  7*mmsize + 16], m2,     1

    vextractf128 [outq + 10*mmsize +  0], tw_e,   0
    vextractf128 [outq + 10*mmsize + 16], tx1_e1, 0
    vextractf128 [outq + 11*mmsize +  0], tw_e,   1
    vextractf128 [outq + 11*mmsize + 16], tx1_e1, 1

    vextractf128 [outq + 14*mmsize +  0], tw_o,   0
    vextractf128 [outq + 14*mmsize + 16], tx2_e1, 0
    vextractf128 [outq + 15*mmsize +  0], tw_o,   1
    vextractf128 [outq + 15*mmsize + 16], tx2_e1, 1

%if %2
    sub inq, 16*mmsize       ; rewind inq for the intra-asm caller
    ret
%else
    RET
%endif

%if %2
; no-scatter entry point: sets up len/lut and tail-calls the _asm version
cglobal fft_sr_ns_float, 4, 10, 16, 272, ctx, out, in, tmp, len, lut, itab, rtab, tgt, off
    movsxd lenq, dword [ctxq + AVTXContext.len]
    mov lutq, [ctxq + AVTXContext.map]

    call mangle(ff_tx_fft_sr_asm_float_ %+ %1)
    RET
%endif
%endmacro

%if ARCH_X86_64
FFT_SPLIT_RADIX_FN avx, 0
FFT_SPLIT_RADIX_FN avx, 1
FFT_SPLIT_RADIX_FN fma3, 0
FFT_SPLIT_RADIX_FN fma3, 1
FFT_SPLIT_RADIX_FN avx2, 0
FFT_SPLIT_RADIX_FN avx2, 1
%endif
   1523 
; 15-point transform (AVX2 only)
; %1 = 1 for contiguous (no-scatter) input loads, 0 to gather through the map
; %2 = function name suffix (float/ns_float)
%macro FFT15_FN 2
INIT_YMM avx2
cglobal fft15_ %+ %2, 4, 10, 16, ctx, out, in, stride, len, lut, tmp, tgt5, stride3, stride5
    mov lutq, [ctxq + AVTXContext.map]

    ; strided output needs stride*3 and stride*5 offsets
    imul stride3q, strideq, 3
    imul stride5q, strideq, 5

    movaps m11, [mask_mmppmmmm]      ; mmppmmmm
    movaps m10, [tab_53_float]       ; tab5
    movaps xm9, [tab_53_float + 32]  ; tab3
    vpermpd m9, m9, q1110            ; tab[23232323]
    movaps m8, [s15_perm]

%if %1
    movups  xm0, [inq]
    movddup xm5, [inq + 16]
    movups  m2, [inq + mmsize*0 + 24]
    movups  m3, [inq + mmsize*1 + 24]
    movups  m4, [inq + mmsize*2 + 24]
%else
    LOAD64_LUT xm0, inq, lutq, 0, tmpq, m14, xm15
    LOAD64_LUT  m2, inq, lutq, (mmsize/2)*0 + 12, tmpq, m6, m7
    LOAD64_LUT  m3, inq, lutq, (mmsize/2)*1 + 12, tmpq, m14, m15
    LOAD64_LUT  m4, inq, lutq, (mmsize/2)*2 + 12, tmpq, m6, m7
    mov tmpd, [lutq + 8]             ; map entry for the third point
    movddup xm5, [inq + tmpq*8]
%endif

    FFT15

    lea tgt5q, [outq + stride5q]
    lea tmpq,  [outq + stride5q*2]

    movhps [outq], xm14              ; out[0]
    movhps [outq + stride5q*1], xm15 ; out[5]
    movlps [outq + stride5q*2], xm15 ; out[10]

    vextractf128 xm3, m0, 1
    vextractf128 xm4, m1, 1
    vextractf128 xm5, m2, 1

    ; scatter the remaining 12 outputs at their strided positions
    movlps [outq  + strideq*1],  xm1
    movhps [outq  + strideq*2],  xm2
    movlps [outq  + stride3q*1], xm3
    movhps [outq  + strideq*4],  xm4
    movlps [outq  + stride3q*2], xm0
    movlps [outq  + strideq*8],  xm5
    movhps [outq  + stride3q*4], xm0
    movhps [tgt5q + strideq*2],  xm1
    movhps [tgt5q + strideq*4],  xm3
    movlps [tmpq  + strideq*1],  xm2
    movlps [tmpq  + stride3q*1], xm4
    movhps [tmpq  + strideq*4],  xm5

    RET
%endmacro

%if ARCH_X86_64
FFT15_FN 0, float
FFT15_FN 1, ns_float
%endif
   1586 
   1587 %macro IMDCT_FN 1
   1588 INIT_YMM %1
   1589 cglobal mdct_inv_float, 4, 14, 16, 320, ctx, out, in, stride, len, lut, exp, t1, t2, t3, \
   1590                                        t4, t5, btmp
   1591    movsxd lenq, dword [ctxq + AVTXContext.len]
   1592    mov expq, [ctxq + AVTXContext.exp]
   1593 
   1594    lea t1d, [lend - 1]
   1595    imul t1d, strided
   1596 
   1597    mov btmpq, ctxq                    ; backup original context
   1598    mov lutq, [ctxq + AVTXContext.map] ; load map
   1599 
   1600    cmp strideq, 4
   1601    je .stride4
   1602 
   1603    shl strideq, 1
   1604    movd xm4, strided
   1605    vpbroadcastd m4, xm4             ; stride splatted
   1606    movd xm5, t1d
   1607    vpbroadcastd m5, xm5             ; offset splatted
   1608 
   1609    mov t2q, outq                    ; don't modify the original output
   1610    pcmpeqd m15, m15                 ; set all bits to 1
   1611 
   1612 .stridex_pre:
   1613    pmulld m2, m4, [lutq]            ; multiply by stride
   1614    movaps m0, m15
   1615    psubd m3, m5, m2                 ; subtract from offset
   1616    movaps m1, m15
   1617    vgatherdps m6, [inq + m2], m0    ; im
   1618    vgatherdps m7, [inq + m3], m1    ; re
   1619 
   1620    movaps m8, [expq + 0*mmsize]     ; tab 1
   1621    movaps m9, [expq + 1*mmsize]     ; tab 2
   1622 
   1623    unpcklps m0, m7, m6              ; re, im, re, im
   1624    unpckhps m1, m7, m6              ; re, im, re, im
   1625 
   1626    vperm2f128 m2, m1, m0, 0x02      ; output order
   1627    vperm2f128 m3, m1, m0, 0x13      ; output order
   1628 
   1629    movshdup m10, m8                 ; tab 1 imim
   1630    movshdup m11, m9                 ; tab 2 imim
   1631    movsldup m12, m8                 ; tab 1 rere
   1632    movsldup m13, m9                 ; tab 2 rere
   1633 
   1634    mulps m10, m2                    ; 1 reim * imim
   1635    mulps m11, m3                    ; 2 reim * imim
   1636 
   1637    shufps m10, m10, q2301
   1638    shufps m11, m11, q2301
   1639 
   1640    fmaddsubps m10, m12, m2, m10
   1641    fmaddsubps m11, m13, m3, m11
   1642 
   1643    movups [t2q + 0*mmsize], m10
   1644    movups [t2q + 1*mmsize], m11
   1645 
   1646    add expq, mmsize*2
   1647    add lutq, mmsize
   1648    add t2q, mmsize*2
   1649    sub lenq, mmsize/2
   1650    jg .stridex_pre
   1651    jmp .transform
   1652 
   1653 .stride4:
   1654    lea expq, [expq + lenq*4]
   1655    lea lutq, [lutq + lenq*2]
   1656    lea t1q, [inq + t1q]
   1657    lea t1q, [t1q + strideq - mmsize]
   1658    lea t2q, [lenq*2 - mmsize/2]
   1659 
   1660 .stride4_pre:
   1661    movups       m4, [inq]
   1662    movups       m3, [t1q]
   1663 
   1664    movsldup     m1, m4              ; im im, im im
   1665    movshdup     m0, m3              ; re re, re re
   1666    movshdup     m4, m4              ; re re, re re (2)
   1667    movsldup     m3, m3              ; im im, im im (2)
   1668 
   1669    movups       m2, [expq]          ; tab
   1670    movups       m5, [expq + 2*t2q]  ; tab (2)
   1671 
   1672    vpermpd      m0, m0, q0123       ; flip
   1673    shufps       m7, m2, m2, q2301
   1674    vpermpd      m4, m4, q0123       ; flip (2)
   1675    shufps       m8, m5, m5, q2301
   1676 
   1677    mulps        m1, m7              ; im im * tab.reim
   1678    mulps        m3, m8              ; im im * tab.reim (2)
   1679 
   1680    fmaddsubps   m0, m0, m2, m1
   1681    fmaddsubps   m4, m4, m5, m3
   1682 
   1683    vextractf128 xm3, m0, 1
   1684    vextractf128 xm6, m4, 1
   1685 
   1686    ; scatter
   1687    movsxd strideq, dword [lutq + 0*4]
   1688    movsxd lenq,    dword [lutq + 1*4]
   1689    movsxd t3q,     dword [lutq + 2*4]
   1690    movsxd t4q,     dword [lutq + 3*4]
   1691 
   1692    movlps [outq + strideq*8], xm0
   1693    movhps [outq + lenq*8],    xm0
   1694    movlps [outq + t3q*8],     xm3
   1695    movhps [outq + t4q*8],     xm3
   1696 
   1697    movsxd strideq, dword [lutq + 0*4 + t2q]
   1698    movsxd lenq,    dword [lutq + 1*4 + t2q]
   1699    movsxd t3q,     dword [lutq + 2*4 + t2q]
   1700    movsxd t4q,     dword [lutq + 3*4 + t2q]
   1701 
   1702    movlps [outq + strideq*8], xm4
   1703    movhps [outq + lenq*8],    xm4
   1704    movlps [outq + t3q*8],     xm6
   1705    movhps [outq + t4q*8],     xm6
   1706 
   1707    add lutq, mmsize/2
   1708    add expq, mmsize
   1709    add inq, mmsize
   1710    sub t1q, mmsize
   1711    sub t2q, mmsize
   1712    jge .stride4_pre
   1713 
   1714 .transform:
   1715    mov strideq, 2*4
   1716    mov t4q, ctxq                      ; backup original context
   1717    mov t5q, [ctxq + AVTXContext.fn]   ; subtransform's jump point
   1718    mov ctxq, [ctxq + AVTXContext.sub]
   1719    mov lutq, [ctxq + AVTXContext.map]
   1720    movsxd lenq, dword [ctxq + AVTXContext.len]
   1721 
   1722    mov inq, outq                    ; in-place transform
   1723    call t5q                         ; call the FFT
   1724 
   1725    mov ctxq, t4q                    ; restore original context
   1726    movsxd lenq, dword [ctxq + AVTXContext.len]
   1727    mov expq, [ctxq + AVTXContext.exp]
   1728    lea expq, [expq + lenq*4]
   1729 
   1730    xor t1q, t1q                     ; low
   1731    lea t2q, [lenq*4 - mmsize]       ; high
   1732 
   1733 .post:
   1734    movaps m2, [expq + t2q]          ; tab h
   1735    movaps m3, [expq + t1q]          ; tab l
   1736    movups m0, [outq + t2q]          ; in h
   1737    movups m1, [outq + t1q]          ; in l
   1738 
   1739    movshdup m4, m2                  ; tab h imim
   1740    movshdup m5, m3                  ; tab l imim
   1741    movsldup m6, m2                  ; tab h rere
   1742    movsldup m7, m3                  ; tab l rere
   1743 
   1744    shufps m2, m0, m0, q2301         ; in h imre
   1745    shufps m3, m1, m1, q2301         ; in l imre
   1746 
   1747    mulps m6, m0
   1748    mulps m7, m1
   1749 
   1750    fmaddsubps m4, m4, m2, m6
   1751    fmaddsubps m5, m5, m3, m7
   1752 
   1753    vpermpd m3, m5, q0123            ; flip
   1754    vpermpd m2, m4, q0123            ; flip
   1755 
   1756    blendps m1, m2, m5, 01010101b
   1757    blendps m0, m3, m4, 01010101b
   1758 
   1759    movups [outq + t2q], m0
   1760    movups [outq + t1q], m1
   1761 
   1762    add t1q, mmsize
   1763    sub t2q, mmsize
   1764    sub lenq, mmsize/2
   1765    jg .post
   1766 
   1767    RET
   1768 %endmacro
   1769 
; Instantiate the inverse MDCT for AVX2. 64-bit only: the implementation
; relies on the full 16-GPR/16-YMM register set.
%if ARCH_X86_64
IMDCT_FN avx2
%endif
   1773 
;------------------------------------------------------------------------------
; 15xM prime-factor FFT (single-precision complex):
; the transform is decomposed into len/15 radix-15 base transforms (FFT15)
; written strided into a temporary buffer, followed by M-point subtransforms,
; followed by a LUT-driven permutation into the final output.
;   %1 = instruction set to emit for (e.g. avx2)
;   %2 = 1: emit the assembly-callable version (intra-asm convention from the
;           file header, plus the thin C-callable _ns wrapper at the bottom)
;        0: emit the standalone C-callable version
;------------------------------------------------------------------------------
%macro PFA_15_FN 2
INIT_YMM %1
%if %2
cglobal fft_pfa_15xM_asm_float, 0, 0, 0, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, \
                                         tgt5, stride3, stride5, btmp
%else
cglobal fft_pfa_15xM_float, 4, 14, 16, 320, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, \
                                            tgt5, stride3, stride5, btmp
%endif

%if %2
    ; Assembly-callable entry: preserve the registers the intra-asm convention
    ; forbids clobbering (see file header: last 4 GPRs), plus inq which the
    ; caller expects back.
    PUSH inq
    PUSH tgt5q
    PUSH stride3q
    PUSH stride5q
    PUSH btmpq
%endif

    PUSH strideq                     ; caller's stride; re-read in .post after
                                     ; strideq is repurposed below

    mov btmpq, outq                  ; keep the real output pointer for .post

    mov outq, [ctxq + AVTXContext.tmp] ; dim1/dim2 run in the ctx tmp buffer
%if !%2
    movsxd lenq, dword [ctxq + AVTXContext.len]
    mov lutq, [ctxq + AVTXContext.map]
%endif

    ; Load stride (second transform's length) and second transform's LUT
    mov tmpq, [ctxq + AVTXContext.sub]
    movsxd strideq, dword [tmpq + AVTXContext.len]
    mov mapq, [tmpq + AVTXContext.map]

    shl strideq, 3                   ; stride in bytes (1 complex float = 8 B)
    imul stride3q, strideq, 3        ; precomputed 3x and 5x strides for the
    imul stride5q, strideq, 5        ; strided radix-15 output scatter

    ; Constants consumed by the FFT15 macro (kept live across .dim1)
    movaps m11, [mask_mmppmmmm]      ; mmppmmmm
    movaps m10, [tab_53_float]       ; tab5
    movaps xm9, [tab_53_float + 32]  ; tab3
    vpermpd m9, m9, q1110            ; tab[23232323]
    movaps m8, [s15_perm]

; First pass: one radix-15 transform per iteration, gathered via the combined
; LUT (or read linearly in the %2 version) and scattered with stride into tmp.
.dim1:
    mov tmpd, [mapq]                 ; base offset of this block's outputs
    lea tgtq, [outq + tmpq*8]

%if %2
    ; No-LUT variant: inputs are already contiguous
    movups  xm0, [inq]                                            ; in[0,1].reim
    movddup xm5, [inq + 16]                                       ; in[2].reimreim
    movups  m2, [inq + mmsize*0 + 24]                             ; in[3-6].reim
    movups  m3, [inq + mmsize*1 + 24]                             ; in[7-11].reim
    movups  m4, [inq + mmsize*2 + 24]                             ; in[12-15].reim
%else
    ; Gather 15 complex inputs through the LUT
    LOAD64_LUT xm0, inq, lutq, 0, tmpq, m14, xm15                 ; in[0,1].reim
    LOAD64_LUT  m2, inq, lutq, (mmsize/2)*0 + 12, tmpq, m6, m7
    LOAD64_LUT  m3, inq, lutq, (mmsize/2)*1 + 12, tmpq, m14, m15
    LOAD64_LUT  m4, inq, lutq, (mmsize/2)*2 + 12, tmpq, m6, m7
    mov tmpd, [lutq + 8]
    movddup xm5, [inq + tmpq*8]     ; in[2].reimreim
%endif

    FFT15                            ; 15-point FFT; results in m0-m2/xm14/xm15

    lea tgt5q, [tgtq + stride5q]
    lea tmpq,  [tgtq + stride5q*2]

    ; Scatter the 15 outputs at multiples of the subtransform stride
    movhps [tgtq], xm14              ; out[0]
    movhps [tgtq + stride5q*1], xm15 ; out[5]
    movlps [tgtq + stride5q*2], xm15 ; out[10]

    vextractf128 xm3, m0, 1
    vextractf128 xm4, m1, 1
    vextractf128 xm5, m2, 1

    movlps [tgtq  + strideq*1],  xm1
    movhps [tgtq  + strideq*2],  xm2
    movlps [tgtq  + stride3q*1], xm3
    movhps [tgtq  + strideq*4],  xm4
    movlps [tgtq  + stride3q*2], xm0
    movlps [tgtq  + strideq*8],  xm5
    movhps [tgtq  + stride3q*4], xm0
    movhps [tgt5q + strideq*2],  xm1
    movhps [tgt5q + strideq*4],  xm3
    movlps [tmpq  + strideq*1],  xm2
    movlps [tmpq  + stride3q*1], xm4
    movhps [tmpq  + strideq*4],  xm5

%if %2
    add inq, mmsize*3 + 24           ; 15 complex floats consumed
%else
    add lutq, (mmsize/2)*3 + 12      ; 15 LUT entries consumed
%endif
    add mapq, 4
    sub lenq, 15
    jg .dim1

    ; Second transform setup
    mov stride5q, ctxq                              ; backup original context
    movsxd stride3q, dword [ctxq + AVTXContext.len] ; full length
    mov tgt5q, [ctxq + AVTXContext.fn]              ; subtransform's jump point

    mov inq, outq                                   ; in-place transform
    mov ctxq, [ctxq + AVTXContext.sub]              ; load subtransform's context
    mov lutq, [ctxq + AVTXContext.map]              ; load subtransform's map
    movsxd lenq, dword [ctxq + AVTXContext.len]     ; load subtransform's length

; Second pass: run the M-point subtransform over the whole tmp buffer,
; in place, one length-M segment per iteration.
.dim2:
    call tgt5q                                      ; call the FFT
    lea inq,  [inq  + lenq*8]                       ; advance by len complex
    lea outq, [outq + lenq*8]
    sub stride3q, lenq                              ; full-length counter
    jg .dim2

    mov ctxq, stride5q                              ; restore original context
    mov lutq, [ctxq + AVTXContext.map]
    mov inq,  [ctxq + AVTXContext.tmp]              ; read back from tmp buffer
    movsxd lenq, dword [ctxq + AVTXContext.len]     ; full length

    lea stride3q, [lutq + lenq*4]                   ; second part of the LUT
    mov stride5q, lenq                              ; loop counter for .post
    mov tgt5q, btmpq                                ; real output pointer
    POP strideq                                     ; caller's original stride
    lea tmpq, [strideq + 2*strideq]                 ; tmpq = 3*stride

; Final pass: gather from tmp through the output LUT and store 4 complex
; values per iteration at the caller's stride.
.post:
    LOAD64_LUT m0, inq, stride3q, 0, tmpq, m8, m9
    vextractf128 xm1, m0, 1
    movlps [tgt5q], xm0
    movhps [tgt5q + strideq], xm0
    movlps [tgt5q + strideq*2], xm1
    movhps [tgt5q + tmpq], xm1

    lea tgt5q, [tgt5q + 4*strideq]
    add stride3q, mmsize/2           ; 4 LUT entries
    sub stride5q, mmsize/8           ; 4 complex floats done
    jg .post

%if %2
    ; Assembly-callable epilogue: restore outq and the saved registers,
    ; plain `ret` per the intra-asm convention (no x86inc RET cleanup).
    mov outq, btmpq
    POP btmpq
    POP stride5q
    POP stride3q
    POP tgt5q
    POP inq
    ret
%else
    RET
%endif

%if %2
; Thin C-callable wrapper around the asm entry point: loads len and the map
; from the context (which the asm version expects preloaded), then calls it.
cglobal fft_pfa_15xM_ns_float, 4, 14, 16, 320, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, \
                                               tgt5, stride3, stride5, btmp
    movsxd lenq, dword [ctxq + AVTXContext.len]
    mov lutq, [ctxq + AVTXContext.map]

    call mangle(ff_tx_fft_pfa_15xM_asm_float)
    RET
%endif
%endmacro
   1934 
; Instantiate both 15xM PFA variants for AVX2 (64-bit only):
; %2=0 -> standalone C-callable version, %2=1 -> assembly-callable + _ns wrapper.
%if ARCH_X86_64
PFA_15_FN avx2, 0
PFA_15_FN avx2, 1
%endif