;******************************************************************************
;* Copyright (c) Lynne
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; Open `doc/transforms.md` to see the code the transforms here are based on,
; and to compare.

; Intra-asm call convention:
;     320 bytes of stack available
;     14 GPRs available (last 4 must not be clobbered)
;     Additionally, don't clobber ctx, in, out, stride, len, lut
;     All vector regs available

; TODO:
;     carry over registers from smaller transforms to save on ~8 loads/stores
;     check if vinsertf128 could be faster than vperm2f128 for duplication
;     even faster FFT8 (current one is very #instructions optimized)
;     replace some xors with blends + addsubs?
;     replace some shuffles with vblends?
;     avx512 split-radix

%include "libavutil/x86/x86util.asm"

%define private_prefix ff_tx

%if ARCH_X86_64
%define ptr resq
%else
%define ptr resd
%endif

%assign i 16
%rep 18
cextern tab_ %+ i %+ _float ; ff_tab_i_float...
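
; The loop above declares tab_16_float through tab_2097152_float (18
; doublings starting at 16): one cos/sin twiddle table per transform length.
; tab_53_float packs the factors shared by the 5-point and 3-point (and thus
; 15-point) transforms. The tables themselves are filled in on the C side of
; libavutil/tx (see tx_template.c).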
%assign i (i << 1)
%endrep

cextern tab_53_float

struc AVTXContext
    .len:    resd 1 ; Length
    .inv     resd 1 ; Inverse flag
    .map:    ptr 1  ; Lookup table(s)
    .exp:    ptr 1  ; Exponentiation factors
    .tmp:    ptr 1  ; Temporary data

    .sub:    ptr 1  ; Subcontexts
    .fn:     ptr 4  ; Subcontext functions
    .nb_sub: resd 1 ; Subcontext count

    ; Everything else is inaccessible
endstruc

SECTION_RODATA 32

%define POS 0x00000000
%define NEG 0x80000000

%define M_SQRT1_2 0.707106781186547524401
%define COS16_1   0.92387950420379638671875
%define COS16_3   0.3826834261417388916015625

d8_mult_odd:   dd M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2, \
                  M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2

s8_mult_odd:   dd 1.0, 1.0, -1.0, 1.0, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
s8_perm_even:  dd 1, 3, 0, 2, 1, 3, 2, 0
s8_perm_odd1:  dd 3, 3, 1, 1, 1, 1, 3, 3
s8_perm_odd2:  dd 1, 2, 0, 3, 1, 0, 0, 1

s16_mult_even: dd 1.0, 1.0, M_SQRT1_2, M_SQRT1_2, 1.0, -1.0, M_SQRT1_2, -M_SQRT1_2
s16_mult_odd1: dd COS16_1,  COS16_1,  COS16_3,  COS16_3,  COS16_1, -COS16_1,  COS16_3, -COS16_3
s16_mult_odd2: dd COS16_3, -COS16_3,  COS16_1, -COS16_1, -COS16_3, -COS16_3, -COS16_1, -COS16_1
s16_perm:      dd 0, 1, 2, 3, 1, 0, 3, 2

s15_perm:      dd 0, 6, 5, 3, 2, 4, 7, 1

mask_mmppmmmm: dd NEG, NEG, POS, POS, NEG, NEG, NEG, NEG
mask_mmmmpppm: dd NEG, NEG, NEG, NEG, POS, POS, POS, NEG
mask_ppmpmmpm: dd POS, POS, NEG, POS, NEG, NEG, POS, NEG
mask_mppmmpmp: dd NEG, POS, POS, NEG, NEG, POS, NEG, POS
mask_mpmppmpm: dd NEG, POS, NEG, POS, POS, NEG, POS, NEG
mask_pmmppmmp: dd POS, NEG, NEG, POS, POS, NEG, NEG, POS
mask_pmpmpmpm: times 4 dd POS, NEG

SECTION .text

; Load complex values (64 bits) via a lookup table
; %1 - output register
; %2 - GPR of base input memory address
; %3 - GPR of LUT (int32_t indices) address
; %4 - LUT offset
; %5 - temporary GPR (only used if vgather is not used)
; %6 - temporary register (for avx only)
; %7 - temporary register (for avx only, enables vgatherdpd (AVX2) if FMA3 is set)
%macro LOAD64_LUT 5-7
%if %0 > 6 && cpuflag(avx2)
    pcmpeqd %7, %7                    ; pcmpeqq has a 0.5 throughput on Zen 3, this has 0.25
    movupd xmm%6, [%3 + %4]           ; float mov since vgatherdpd is a float instruction
    vgatherdpd %1, [%2 + xmm%6*8], %7 ; must use separate registers for args
%else
    mov %5d, [%3 + %4 + 0]
    movsd xmm%1, [%2 + %5q*8]
%if sizeof%1 > 16 && %0 > 5
    mov %5d, [%3 + %4 + 8]
    movsd xmm%6, [%2 + %5q*8]
%endif
    mov %5d, [%3 + %4 + 4]
    movhps xmm%1, [%2 + %5q*8]
%if sizeof%1 > 16 && %0 > 5
    mov %5d, [%3 + %4 + 12]
    movhps xmm%6, [%2 + %5q*8]
    vinsertf128 %1, %1, xmm%6, 1
%endif
%endif
%endmacro

; Single 2-point in-place complex FFT (will do 2 transforms at once in AVX mode)
; %1 - coefficients (r0.reim, r1.reim)
; %2 - temporary
%macro FFT2 2
    shufps %2, %1, %1, q3322
    shufps %1, %1, %1, q1100

    addsubps %1, %1, %2

    shufps %1, %1, %1, q2031
%endmacro

; Single 4-point in-place complex FFT (will do 2 transforms at once in [AVX] mode)
; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim)
; %2 - odd coefficients (r1.reim, r3.reim, r5.reim, r7.reim)
; %3 - temporary
%macro FFT4 3
    subps %3, %1, %2        ; r1234, [r5678]
    addps %1, %1, %2        ; t1234, [t5678]

    shufps %2, %1, %3, q1010 ; t12, r12
    shufps %1, %1, %3, q2332 ; t34, r43

    subps %3, %2, %1        ; a34, b32
    addps %2, %2, %1        ; a12, b14

    shufps %1, %2, %3, q1010 ; a1234 even

    shufps %2, %2, %3, q2332 ; b1423
    shufps %2, %2, %2, q1320 ; b1234 odd
%endmacro

; Single/Dual 8-point in-place complex FFT (will do 2 transforms in [AVX] mode)
; %1 - even coefficients (a0.reim, a2.reim, [b0.reim, b2.reim])
; %2 - even coefficients (a4.reim, a6.reim, [b4.reim, b6.reim])
; %3 - odd coefficients (a1.reim, a3.reim, [b1.reim, b3.reim])
; %4 - odd coefficients (a5.reim, a7.reim, [b5.reim, b7.reim])
; %5 - temporary
; %6 - temporary
%macro FFT8 6
    addps %5, %1, %3        ; q1-8
    addps %6, %2, %4        ; k1-8

    subps %1, %1, %3        ; r1-8
    subps %2, %2, %4        ; j1-8

    shufps %4, %1, %1, q2323 ; r4343
    shufps %3, %5, %6, q3032 ; q34, k14

    shufps %1, %1, %1, q1010 ; r1212
    shufps %5, %5, %6, q1210 ; q12, k32

    xorps %4, %4, [mask_pmmppmmp] ; r4343 * pmmp
    addps %6, %5, %3        ; s12, g12

    mulps %2, %2, [d8_mult_odd] ; r8 * d8_mult_odd
    subps %5, %5, %3        ; s34, g43

    addps %3, %1, %4        ; z1234
    unpcklpd %1, %6, %5     ; s1234

    shufps %4, %2, %2, q2301 ; j2143
    shufps %6, %6, %5, q2332 ; g1234

    addsubps %2, %2, %4     ; l2143
    shufps %5, %2, %2, q0123 ; l3412
    addsubps %5, %5, %2     ; t1234

    subps %2, %1, %6        ; h1234 even
    subps %4, %3, %5        ; u1234 odd

    addps %1, %1, %6        ; w1234 even
    addps %3, %3, %5        ; o1234 odd
%endmacro

; Single 8-point in-place complex FFT in 20 instructions
; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim)
; %2 - odd coefficients (r1.reim, r3.reim, r5.reim, r7.reim)
; %3 - temporary
; %4 - temporary
%macro FFT8_AVX 4
    subps %3, %1, %2        ; r1234, r5678
    addps %1, %1, %2        ; q1234, q5678

    vpermilps %2, %3, [s8_perm_odd1] ; r4422, r6688
    shufps %4, %1, %1, q3322 ; q1122, q5566

    movsldup %3, %3         ; r1133, r5577
    shufps %1, %1, %1, q1100 ; q3344, q7788

    addsubps %3, %3, %2     ; z1234, z5678
    addsubps %1, %1, %4     ; s3142, s7586

    mulps %3, %3, [s8_mult_odd] ; z * s8_mult_odd
    vpermilps %1, %1, [s8_perm_even] ; s1234, s5687 !

    shufps %2, %3, %3, q2332 ; junk, z7887
    xorps %4, %1, [mask_mmmmpppm] ; e1234, e5687 !

    vpermilps %3, %3, [s8_perm_odd2] ; z2314, z6556
    vperm2f128 %1, %1, %4, 0x03 ; e5687, s1234

    addsubps %2, %2, %3     ; junk, t5678
    subps %1, %1, %4        ; w1234, w5678 even

    vperm2f128 %2, %2, %2, 0x11 ; t5678, t5678
    vperm2f128 %3, %3, %3, 0x00 ; z2314, z2314

    xorps %2, %2, [mask_ppmpmmpm] ; t * ppmpmmpm
    addps %2, %3, %2        ; u1234, u5678 odd
%endmacro

; Single 16-point in-place complex FFT
; %1 - even coefficients (r0.reim, r2.reim, r4.reim, r6.reim)
; %2 - even coefficients (r8.reim, r10.reim, r12.reim, r14.reim)
; %3 - odd coefficients (r1.reim, r3.reim, r5.reim, r7.reim)
; %4 - odd coefficients (r9.reim, r11.reim, r13.reim, r15.reim)
; %5, %6 - temporary
; %7, %8 - temporary (optional)
%macro FFT16 6-8
    FFT4 %3, %4, %5
%if %0 > 7
    FFT8_AVX %1, %2, %6, %7
    movaps %8, [mask_mpmppmpm]
    movaps %7, [s16_perm]
%define mask %8
%define perm %7
%elif %0 > 6
    FFT8_AVX %1, %2, %6, %7
    movaps %7, [s16_perm]
%define mask [mask_mpmppmpm]
%define perm %7
%else
    FFT8_AVX %1, %2, %6, %5
%define mask [mask_mpmppmpm]
%define perm [s16_perm]
%endif
    xorps %5, %5, %5        ; 0

    shufps %6, %4, %4, q2301 ; z12.imre, z13.imre...
    shufps %5, %5, %3, q2301 ; 0, 0, z8.imre...

    mulps %4, %4, [s16_mult_odd1] ; z.reim * costab
    xorps %5, %5, [mask_mppmmpmp]
%if cpuflag(fma3)
    fmaddps %6, %6, [s16_mult_odd2], %4 ; s[8..15]
    addps %5, %3, %5        ; s[0...7]
%else
    mulps %6, %6, [s16_mult_odd2] ; z.imre * costab

    addps %5, %3, %5        ; s[0...7]
    addps %6, %4, %6        ; s[8..15]
%endif
    mulps %5, %5, [s16_mult_even] ; s[0...7]*costab

    xorps %4, %6, mask      ; s[8..15]*mpmppmpm
    xorps %3, %5, mask      ; s[0...7]*mpmppmpm

    vperm2f128 %4, %4, %4, 0x01 ; s[12..15, 8..11]
    vperm2f128 %3, %3, %3, 0x01 ; s[4..7, 0..3]

    addps %6, %6, %4        ; y56, u56, y34, u34
    addps %5, %5, %3        ; w56, x56, w34, x34

    vpermilps %6, %6, perm  ; y56, u56, y43, u43
    vpermilps %5, %5, perm  ; w56, x56, w43, x43

    subps %4, %2, %6        ; odd  part 2
    addps %3, %2, %6        ; odd  part 1

    subps %2, %1, %5        ; even part 2
    addps %1, %1, %5        ; even part 1
%undef mask
%undef perm
%endmacro
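
; For orientation, a scalar model of the 4-point butterfly the macros above
; vectorize (hedged sketch in the forward-transform convention; the inverse
; flips the sign of i below). Illustrative only, not part of the build:
;     t1 = in0 + in2;      t2 = in0 - in2;
;     t3 = in1 + in3;      t4 = in1 - in3;
;     out0 = t1 + t3;      out2 = t1 - t3;
;     out1 = t2 - i*t4;    out3 = t2 + i*t4;
; FFT8/FFT16 are the same butterflies on more points, with the twiddle
; factors folded into the constant tables (d8_mult_odd, s16_mult_even, etc.).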

; Single 15-point complex FFT
; Input:
; xm0 must contain in[0,1].reim
; m2 - in[3-6].reim
; m3 - in[7-11].reim
; m4 - in[12-15].reim
; xm5 must contain in[2].reimreim
;
; Output:
; m0, m1, m2 - ACs
; xm14 - out[0]
; xm15 - out[10, 5]
%macro FFT15 0
    shufps xm1, xm0, xm0, q3223 ; in[1].imrereim
    shufps xm0, xm0, xm0, q1001 ; in[0].imrereim

    xorps xm1, xm11
    addps xm1, xm0          ; pc[0,1].imre

    shufps xm0, xm1, xm1, q3232 ; pc[1].reimreim
    addps xm0, xm5          ; dc[0].reimreim

    mulps xm1, xm9          ; tab[0123]*pc[01]

    shufpd xm6, xm1, xm1, 01b ; pc[1,0].reim
    xorps xm1, xm11
    addps xm1, xm1, xm6
    addsubps xm1, xm5, xm1  ; dc[1,2].reim

    subps m7, m2, m3        ; q[0-3].imre
    addps m6, m2, m3        ; q[4-7]
    shufps m7, m7, m7, q2301 ; q[0-3].reim

    addps m5, m4, m6        ; y[0-3]

    vperm2f128 m14, m9, m9, 0x11 ; tab[23232323]
    vbroadcastsd m15, xm9   ; tab[01010101]

    mulps m6, m14
    mulps m7, m15

    subps m2, m6, m7        ; k[0-3]
    addps m3, m6, m7        ; k[4-7]

    shufps m12, m11, m11, q3232 ; ppppmmmm

    addsubps m6, m4, m2     ; k[0-3]
    addsubps m7, m4, m3     ; k[4-7]

    ; 15pt from here on
    vpermpd m2, m5, q0123   ; y[3-0]
    vpermpd m3, m6, q0123   ; k[3-0]
    vpermpd m4, m7, q0123   ; k[7-4]

    xorps m5, m12
    xorps m6, m12
    xorps m7, m12

    addps m2, m5            ; t[0-3]
    addps m3, m6            ; t[4-7]
    addps m4, m7            ; t[8-11]

    movlhps xm14, xm2       ; out[0]
    unpcklpd xm15, xm3, xm4 ; out[10,5]
    unpckhpd xm5, xm3, xm4  ; out[10,5]

    addps xm14, xm2         ; out[0]
    addps xm15, xm5         ; out[10,5]
    addps xm14, xm0         ; out[0]
    addps xm15, xm1         ; out[10,5]

    shufps m12, m10, m10, q3232 ; tab5 4 5 4 5 8 9 8 9
    shufps m13, m10, m10, q1010 ; tab5 6 7 6 7 10 11 10 11

    mulps m5, m2, m12       ; t[0-3]
    mulps m6, m3, m12       ; t[4-7]
    mulps m7, m4, m12       ; t[8-11]

    mulps m2, m13           ; r[0-3]
    mulps m3, m13           ; r[4-7]
    mulps m4, m13           ; r[8-11]

    shufps m5, m5, m5, q1032 ; t[1,0,3,2].reim
    shufps m6, m6, m6, q1032 ; t[5,4,7,6].reim
    shufps m7, m7, m7, q1032 ; t[9,8,11,10].reim

    vperm2f128 m13, m11, m11, 0x01 ; mmmmmmpp
    shufps m12, m11, m11, q3232 ; ppppmmmm

    xorps m5, m13
    xorps m6, m13
    xorps m7, m13

    addps m2, m5            ; r[0,1,2,3]
    addps m3, m6            ; r[4,5,6,7]
    addps m4, m7            ; r[8,9,10,11]

    shufps m5, m2, m2, q2301
    shufps m6, m3, m3, q2301
    shufps m7, m4, m4, q2301

    xorps m2, m12
    xorps m3, m12
    xorps m4, m12

    vpermpd m5, m5, q0123
    vpermpd m6, m6, q0123
    vpermpd m7, m7, q0123

    addps m5, m2
    addps m6, m3
    addps m7, m4

    vpermps m5, m8, m5
    vpermps m6, m8, m6
    vpermps m7, m8, m7

    vbroadcastsd m0, xm0    ; dc[0]
    vpermpd m2, m1, q1111   ; dc[2]
    vbroadcastsd m1, xm1    ; dc[1]

    addps m0, m5
    addps m1, m6
    addps m2, m7
%endmacro

; Combines m0...m8 (tx1[even, even, odd, odd], tx2,3[even], tx2,3[odd]) coeffs
; Uses all 16 registers.
; Output is slightly permuted such that tx2,3's coefficients are interleaved
; on a 2-point basis (look at `doc/transforms.md`)
%macro SPLIT_RADIX_COMBINE 17
%if %1 && mmsize == 32
    vperm2f128 %14, %6, %7, 0x20 ; m2[0], m2[1], m3[0], m3[1] even
    vperm2f128 %16, %9, %8, 0x20 ; m2[0], m2[1], m3[0], m3[1] odd
    vperm2f128 %15, %6, %7, 0x31 ; m2[2], m2[3], m3[2], m3[3] even
    vperm2f128 %17, %9, %8, 0x31 ; m2[2], m2[3], m3[2], m3[3] odd
%endif

    shufps %12, %10, %10, q2200 ; cos00224466
    shufps %13, %11, %11, q1133 ; wim77553311
    movshdup %10, %10       ; cos11335577
    shufps %11, %11, %11, q0022 ; wim66442200

%if %1 && mmsize == 32
    shufps %6, %14, %14, q2301 ; m2[0].imre, m2[1].imre, m2[2].imre, m2[3].imre even
    shufps %8, %16, %16, q2301 ; m2[0].imre, m2[1].imre, m2[2].imre, m2[3].imre odd
    shufps %7, %15, %15, q2301 ; m3[0].imre, m3[1].imre, m3[2].imre, m3[3].imre even
    shufps %9, %17, %17, q2301 ; m3[0].imre, m3[1].imre, m3[2].imre, m3[3].imre odd

    mulps %14, %14, %13     ; m2[0123]reim * wim7531 even
    mulps %16, %16, %11     ; m2[0123]reim * wim7531 odd
    mulps %15, %15, %13     ; m3[0123]reim * wim7531 even
    mulps %17, %17, %11     ; m3[0123]reim * wim7531 odd
%else
    mulps %14, %6, %13      ; m2,3[01]reim * wim7531 even
    mulps %16, %8, %11      ; m2,3[01]reim * wim7531 odd
    mulps %15, %7, %13      ; m2,3[23]reim * wim7531 even
    mulps %17, %9, %11      ; m2,3[23]reim * wim7531 odd
    ; reorder the multiplies to save movs reg, reg in the %if above
    shufps %6, %6, %6, q2301 ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even
    shufps %8, %8, %8, q2301 ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre odd
    shufps %7, %7, %7, q2301 ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even
    shufps %9, %9, %9, q2301 ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre odd
%endif

%if cpuflag(fma3) ; 11 - 5 = 6 instructions saved through FMA!
    fmaddsubps %6, %6, %12, %14 ; w[0..8] even
    fmaddsubps %8, %8, %10, %16 ; w[0..8] odd
    fmsubaddps %7, %7, %12, %15 ; j[0..8] even
    fmsubaddps %9, %9, %10, %17 ; j[0..8] odd
    movaps %13, [mask_pmpmpmpm] ; "subaddps? pfft, who needs that!"
%else
    mulps %6, %6, %12       ; m2,3[01]imre * cos0246
    mulps %8, %8, %10       ; m2,3[01]imre * cos0246
    movaps %13, [mask_pmpmpmpm] ; "subaddps? pfft, who needs that!"
    mulps %7, %7, %12       ; m2,3[23]reim * cos0246
    mulps %9, %9, %10       ; m2,3[23]reim * cos0246
    addsubps %6, %6, %14    ; w[0..8]
    addsubps %8, %8, %16    ; w[0..8]
    xorps %15, %15, %13     ; +-m2,3[23]imre * wim7531
    xorps %17, %17, %13     ; +-m2,3[23]imre * wim7531
    addps %7, %7, %15       ; j[0..8]
    addps %9, %9, %17       ; j[0..8]
%endif

    addps %14, %6, %7       ; t10235476 even
    addps %16, %8, %9       ; t10235476 odd
    subps %15, %6, %7       ; +-r[0..7] even
    subps %17, %8, %9       ; +-r[0..7] odd

    shufps %14, %14, %14, q2301 ; t[0..7] even
    shufps %16, %16, %16, q2301 ; t[0..7] odd
    xorps %15, %15, %13     ; r[0..7] even
    xorps %17, %17, %13     ; r[0..7] odd

    subps %6, %2, %14       ; m2,3[01] even
    subps %8, %4, %16       ; m2,3[01] odd
    subps %7, %3, %15       ; m2,3[23] even
    subps %9, %5, %17       ; m2,3[23] odd

    addps %2, %2, %14       ; m0 even
    addps %4, %4, %16       ; m0 odd
    addps %3, %3, %15       ; m1 even
    addps %5, %5, %17       ; m1 odd
%endmacro
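
; What the combine macros compute, in scalar terms (a hedged sketch of the
; conventional split-radix recursion; see doc/transforms.md for the exact
; derivation and the permuted register layout used here). With E the
; half-length FFT of the even samples and Z1/Z2 the quarter-length FFTs of
; samples 4n+1 and 4n+3, for each k in [0, N/4), forward convention:
;     t = w*Z1[k] + w3*Z2[k]      ; w = e^(-2*pi*i*k/N), w3 = w^3
;     r = w*Z1[k] - w3*Z2[k]
;     out[k]          = E[k]        + t
;     out[k + N/2]    = E[k]        - t
;     out[k + N/4]    = E[k + N/4]  - i*r
;     out[k + 3*N/4]  = E[k + N/4]  + i*r
; The cos/wim shuffles above split w into its real/imaginary parts so each
; complex multiply becomes mulps + addsubps (or a single fmaddsubps).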

; Same as above, but only does one parity at a time and takes 3 temporary
; registers. However, if the twiddles aren't needed after this, the registers
; they occupy can be used as any of the temporaries.
%macro SPLIT_RADIX_COMBINE_HALF 10
%if %1
    shufps %8, %6, %6, q2200 ; cos00224466
    shufps %9, %7, %7, q1133 ; wim77553311
%else
    shufps %8, %6, %6, q3311 ; cos11335577
    shufps %9, %7, %7, q0022 ; wim66442200
%endif

    mulps %10, %4, %9       ; m2,3[01]reim * wim7531 even
    mulps %9, %9, %5        ; m2,3[23]reim * wim7531 even

    shufps %4, %4, %4, q2301 ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even
    shufps %5, %5, %5, q2301 ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even

%if cpuflag(fma3)
    fmaddsubps %4, %4, %8, %10 ; w[0..8] even
    fmsubaddps %5, %5, %8, %9  ; j[0..8] even
    movaps %10, [mask_pmpmpmpm]
%else
    mulps %4, %4, %8        ; m2,3[01]imre * cos0246
    mulps %5, %5, %8        ; m2,3[23]reim * cos0246
    addsubps %4, %4, %10    ; w[0..8]
    movaps %10, [mask_pmpmpmpm]
    xorps %9, %9, %10       ; +-m2,3[23]imre * wim7531
    addps %5, %5, %9        ; j[0..8]
%endif

    addps %8, %4, %5        ; t10235476
    subps %9, %4, %5        ; +-r[0..7]

    shufps %8, %8, %8, q2301 ; t[0..7]
    xorps %9, %9, %10       ; r[0..7]

    subps %4, %2, %8        ; %3,3[01]
    subps %5, %3, %9        ; %3,3[23]

    addps %2, %2, %8        ; m0
    addps %3, %3, %9        ; m1
%endmacro
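
; Reminder on the FMA ops used throughout (lanewise; the destination may
; alias the first source):
;     fmaddsubps d, a, b, c  ->  d.even = a*b - c,  d.odd = a*b + c
;     fmsubaddps d, a, b, c  ->  d.even = a*b + c,  d.odd = a*b - c
; i.e. each fuses the mulps + addsubps (or mulps + xorps + addps) pair of
; the non-FMA path into one instruction.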

; Same as above, tries REALLY hard to use 2 temporary registers.
%macro SPLIT_RADIX_COMBINE_LITE 9
%if %1
    shufps %8, %6, %6, q2200 ; cos00224466
    shufps %9, %7, %7, q1133 ; wim77553311
%else
    shufps %8, %6, %6, q3311 ; cos11335577
    shufps %9, %7, %7, q0022 ; wim66442200
%endif

    mulps %9, %9, %4        ; m2,3[01]reim * wim7531 even
    shufps %4, %4, %4, q2301 ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even

%if cpuflag(fma3)
    fmaddsubps %4, %4, %8, %9 ; w[0..8] even
%else
    mulps %4, %4, %8        ; m2,3[01]imre * cos0246
    addsubps %4, %4, %9     ; w[0..8]
%endif

%if %1
    shufps %9, %7, %7, q1133 ; wim77553311
%else
    shufps %9, %7, %7, q0022 ; wim66442200
%endif

    mulps %9, %9, %5        ; m2,3[23]reim * wim7531 even
    shufps %5, %5, %5, q2301 ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even
%if cpuflag(fma3)
    fmsubaddps %5, %5, %8, %9 ; j[0..8] even
%else
    mulps %5, %5, %8        ; m2,3[23]reim * cos0246
    xorps %9, %9, [mask_pmpmpmpm] ; +-m2,3[23]imre * wim7531
    addps %5, %5, %9        ; j[0..8]
%endif

    addps %8, %4, %5        ; t10235476
    subps %9, %4, %5        ; +-r[0..7]

    shufps %8, %8, %8, q2301 ; t[0..7]
    xorps %9, %9, [mask_pmpmpmpm] ; r[0..7]

    subps %4, %2, %8        ; %3,3[01]
    subps %5, %3, %9        ; %3,3[23]

    addps %2, %2, %8        ; m0
    addps %3, %3, %9        ; m1
%endmacro

%macro SPLIT_RADIX_COMBINE_64 0
    SPLIT_RADIX_COMBINE_LITE 1, m0, m1, tx1_e0, tx2_e0, tw_e, tw_o, tmp1, tmp2

    movaps [outq +  0*mmsize], m0
    movaps [outq +  4*mmsize], m1
    movaps [outq +  8*mmsize], tx1_e0
    movaps [outq + 12*mmsize], tx2_e0

    SPLIT_RADIX_COMBINE_HALF 0, m2, m3, tx1_o0, tx2_o0, tw_e, tw_o, tmp1, tmp2, m0

    movaps [outq +  2*mmsize], m2
    movaps [outq +  6*mmsize], m3
    movaps [outq + 10*mmsize], tx1_o0
    movaps [outq + 14*mmsize], tx2_o0

    movaps tw_e, [tab_64_float + mmsize]
    vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23

    movaps m0, [outq + 1*mmsize]
    movaps m1, [outq + 3*mmsize]
    movaps m2, [outq + 5*mmsize]
    movaps m3, [outq + 7*mmsize]

    SPLIT_RADIX_COMBINE 0, m0, m2, m1, m3, tx1_e1, tx2_e1, tx1_o1, tx2_o1, tw_e, tw_o, \
                        tmp1, tmp2, tx2_o0, tx1_o0, tx2_e0, tx1_e0 ; temporary registers

    movaps [outq +  1*mmsize], m0
    movaps [outq +  3*mmsize], m1
    movaps [outq +  5*mmsize], m2
    movaps [outq +  7*mmsize], m3

    movaps [outq +  9*mmsize], tx1_e1
    movaps [outq + 11*mmsize], tx1_o1
    movaps [outq + 13*mmsize], tx2_e1
    movaps [outq + 15*mmsize], tx2_o1
%endmacro

; Perform a single even/odd split radix combination with loads and stores
; The _4 indicates this is a quarter of the iterations required to complete a full
; combine loop
; %1 must contain len*2, %2 must contain len*4, %3 must contain len*6
%macro SPLIT_RADIX_LOAD_COMBINE_4 8
    movaps m8, [rtabq + (%5)*mmsize + %7]
    vperm2f128 m9, m9, [itabq - (%5)*mmsize + %8], 0x23

    movaps m0, [outq +      (0 + %4)*mmsize + %6]
    movaps m2, [outq +      (2 + %4)*mmsize + %6]
    movaps m1, [outq + %1 + (0 + %4)*mmsize + %6]
    movaps m3, [outq + %1 + (2 + %4)*mmsize + %6]

    movaps m4, [outq + %2 + (0 + %4)*mmsize + %6]
    movaps m6, [outq + %2 + (2 + %4)*mmsize + %6]
    movaps m5, [outq + %3 + (0 + %4)*mmsize + %6]
    movaps m7, [outq + %3 + (2 + %4)*mmsize + %6]

    SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
                           m4, m5, m6, m7, \
                           m8, m9, \
                           m10, m11, m12, m13, m14, m15

    movaps [outq +      (0 + %4)*mmsize + %6], m0
    movaps [outq +      (2 + %4)*mmsize + %6], m2
    movaps [outq + %1 + (0 + %4)*mmsize + %6], m1
    movaps [outq + %1 + (2 + %4)*mmsize + %6], m3

    movaps [outq + %2 + (0 + %4)*mmsize + %6], m4
    movaps [outq + %2 + (2 + %4)*mmsize + %6], m6
    movaps [outq + %3 + (0 + %4)*mmsize + %6], m5
    movaps [outq + %3 + (2 + %4)*mmsize + %6], m7
%endmacro

%macro SPLIT_RADIX_LOAD_COMBINE_FULL 2-5
%if %0 > 2
%define offset_c %3
%else
%define offset_c 0
%endif
%if %0 > 3
%define offset_r %4
%else
%define offset_r 0
%endif
%if %0 > 4
%define offset_i %5
%else
%define offset_i 0
%endif

    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 0, 0, offset_c, offset_r, offset_i
    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 1, 1, offset_c, offset_r, offset_i
    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 4, 2, offset_c, offset_r, offset_i
    SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 5, 3, offset_c, offset_r, offset_i
%endmacro

; Perform a single even/odd split radix combination with loads, deinterleaves and
; stores. The _2 indicates this is a half of the iterations required to complete
; a full combine+deinterleave loop
; %3 must contain len*2, %4 must contain len*4, %5 must contain len*6
%macro SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 6
    movaps m8, [rtabq + (0 + %2)*mmsize]
    vperm2f128 m9, m9, [itabq - (0 + %2)*mmsize], 0x23

    movaps m0, [outq +      (0 + 0 + %1)*mmsize + %6]
    movaps m2, [outq +      (2 + 0 + %1)*mmsize + %6]
    movaps m1, [outq + %3 + (0 + 0 + %1)*mmsize + %6]
    movaps m3, [outq + %3 + (2 + 0 + %1)*mmsize + %6]

    movaps m4, [outq + %4 + (0 + 0 + %1)*mmsize + %6]
    movaps m6, [outq + %4 + (2 + 0 + %1)*mmsize + %6]
    movaps m5, [outq + %5 + (0 + 0 + %1)*mmsize + %6]
    movaps m7, [outq + %5 + (2 + 0 + %1)*mmsize + %6]

    SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
                           m4, m5, m6, m7, \
                           m8, m9, \
                           m10, m11, m12, m13, m14, m15

    unpckhpd m10, m0, m2
    unpckhpd m11, m1, m3
    unpckhpd m12, m4, m6
    unpckhpd m13, m5, m7
    unpcklpd m0, m0, m2
    unpcklpd m1, m1, m3
    unpcklpd m4, m4, m6
    unpcklpd m5, m5, m7

    vextractf128 [outq +      (0 + 0 + %1)*mmsize + %6 +  0], m0, 0
    vextractf128 [outq +      (0 + 0 + %1)*mmsize + %6 + 16], m10, 0
    vextractf128 [outq + %3 + (0 + 0 + %1)*mmsize + %6 +  0], m1, 0
    vextractf128 [outq + %3 + (0 + 0 + %1)*mmsize + %6 + 16], m11, 0

    vextractf128 [outq + %4 + (0 + 0 + %1)*mmsize + %6 +  0], m4, 0
    vextractf128 [outq + %4 + (0 + 0 + %1)*mmsize + %6 + 16], m12, 0
    vextractf128 [outq + %5 + (0 + 0 + %1)*mmsize + %6 +  0], m5, 0
    vextractf128 [outq + %5 + (0 + 0 + %1)*mmsize + %6 + 16], m13, 0

    vperm2f128 m10, m10, m0, 0x13
    vperm2f128 m11, m11, m1, 0x13
    vperm2f128 m12, m12, m4, 0x13
    vperm2f128 m13, m13, m5, 0x13

    movaps m8, [rtabq + (1 + %2)*mmsize]
    vperm2f128 m9, m9, [itabq - (1 + %2)*mmsize], 0x23

    movaps m0, [outq +      (0 + 1 + %1)*mmsize + %6]
    movaps m2, [outq +      (2 + 1 + %1)*mmsize + %6]
    movaps m1, [outq + %3 + (0 + 1 + %1)*mmsize + %6]
    movaps m3, [outq + %3 + (2 + 1 + %1)*mmsize + %6]

    movaps [outq +      (0 + 1 + %1)*mmsize + %6], m10 ; m0 conflict
    movaps [outq + %3 + (0 + 1 + %1)*mmsize + %6], m11 ; m1 conflict

    movaps m4, [outq + %4 + (0 + 1 + %1)*mmsize + %6]
    movaps m6, [outq + %4 + (2 + 1 + %1)*mmsize + %6]
    movaps m5, [outq + %5 + (0 + 1 + %1)*mmsize + %6]
    movaps m7, [outq + %5 + (2 + 1 + %1)*mmsize + %6]

    movaps [outq + %4 + (0 + 1 + %1)*mmsize + %6], m12 ; m4 conflict
    movaps [outq + %5 + (0 + 1 + %1)*mmsize + %6], m13 ; m5 conflict

    SPLIT_RADIX_COMBINE 0, m0, m1, m2, m3, \
                           m4, m5, m6, m7, \
                           m8, m9, \
                           m10, m11, m12, m13, m14, m15 ; temporary registers

    unpcklpd m8, m0, m2
    unpcklpd m9, m1, m3
    unpcklpd m10, m4, m6
    unpcklpd m11, m5, m7
    unpckhpd m0, m0, m2
    unpckhpd m1, m1, m3
    unpckhpd m4, m4, m6
    unpckhpd m5, m5, m7

    vextractf128 [outq +      (2 + 0 + %1)*mmsize + %6 +  0], m8, 0
    vextractf128 [outq +      (2 + 0 + %1)*mmsize + %6 + 16], m0, 0
    vextractf128 [outq +      (2 + 1 + %1)*mmsize + %6 +  0], m8, 1
    vextractf128 [outq +      (2 + 1 + %1)*mmsize + %6 + 16], m0, 1

    vextractf128 [outq + %3 + (2 + 0 + %1)*mmsize + %6 +  0], m9, 0
    vextractf128 [outq + %3 + (2 + 0 + %1)*mmsize + %6 + 16], m1, 0
    vextractf128 [outq + %3 + (2 + 1 + %1)*mmsize + %6 +  0], m9, 1
    vextractf128 [outq + %3 + (2 + 1 + %1)*mmsize + %6 + 16], m1, 1

    vextractf128 [outq + %4 + (2 + 0 + %1)*mmsize + %6 +  0], m10, 0
    vextractf128 [outq + %4 + (2 + 0 + %1)*mmsize + %6 + 16], m4, 0
    vextractf128 [outq + %4 + (2 + 1 + %1)*mmsize + %6 +  0], m10, 1
    vextractf128 [outq + %4 + (2 + 1 + %1)*mmsize + %6 + 16], m4, 1

    vextractf128 [outq + %5 + (2 + 0 + %1)*mmsize + %6 +  0], m11, 0
    vextractf128 [outq + %5 + (2 + 0 + %1)*mmsize + %6 + 16], m5, 0
    vextractf128 [outq + %5 + (2 + 1 + %1)*mmsize + %6 +  0], m11, 1
    vextractf128 [outq + %5 + (2 + 1 + %1)*mmsize + %6 + 16], m5, 1
%endmacro

%macro SPLIT_RADIX_COMBINE_DEINTERLEAVE_FULL 2-3
%if %0 > 2
%define offset %3
%else
%define offset 0
%endif
    SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 0, 0, %1, %1*2, %2, offset
    SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 4, 2, %1, %1*2, %2, offset
%endmacro

INIT_XMM sse3
cglobal fft2_asm_float, 0, 0, 0, ctx, out, in, stride
    movaps m0, [inq]
    FFT2 m0, m1
    movaps [outq], m0
    ret

cglobal fft2_float, 4, 4, 2, ctx, out, in, stride
    movaps m0, [inq]
    FFT2 m0, m1
    movaps [outq], m0
    RET

%macro FFT4_FN 3
INIT_XMM sse2
%if %3
cglobal fft4_ %+ %1 %+ _asm_float, 0, 0, 0, ctx, out, in, stride
%else
cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride
%endif
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]

%if %2
    shufps m2, m1, m0, q3210
    shufps m0, m0, m1, q3210
    movaps m1, m2
%endif

    FFT4 m0, m1, m2

    unpcklpd m2, m0, m1
    unpckhpd m0, m0, m1

    movaps [outq + 0*mmsize], m2
    movaps [outq + 1*mmsize], m0

%if %3
    ret
%else
    RET
%endif
%endmacro

FFT4_FN fwd, 0, 0
FFT4_FN fwd, 0, 1
FFT4_FN inv, 1, 0
FFT4_FN inv, 1, 1

%macro FFT8_SSE_FN 1
INIT_XMM sse3
%if %1
cglobal fft8_asm_float, 0, 0, 0, ctx, out, in, stride, tmp
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
    movaps m2, [inq + 2*mmsize]
    movaps m3, [inq + 3*mmsize]
%else
cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp
    mov ctxq, [ctxq + AVTXContext.map]
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq
    LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq
    LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq
%endif

    FFT8 m0, m1, m2, m3, m4, m5

    unpcklpd m4, m0, m3
    unpcklpd m5, m1, m2
    unpckhpd m0, m0, m3
    unpckhpd m1, m1, m2

    movups [outq + 0*mmsize], m4
    movups [outq + 1*mmsize], m0
    movups [outq + 2*mmsize], m5
    movups [outq + 3*mmsize], m1

%if %1
    ret
%else
    RET
%endif

%if %1
cglobal fft8_ns_float, 4, 5, 6, ctx, out, in, stride, tmp
    call mangle(ff_tx_fft8_asm_float_sse3)
    RET
%endif
%endmacro

FFT8_SSE_FN 0
FFT8_SSE_FN 1

%macro FFT8_AVX_FN 1
INIT_YMM avx
%if %1
cglobal fft8_asm_float, 0, 0, 0, ctx, out, in, stride, tmp
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
%else
cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp
    mov ctxq, [ctxq + AVTXContext.map]
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m2
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m3
%endif

    FFT8_AVX m0, m1, m2, m3

    unpcklpd m2, m0, m1
    unpckhpd m0, m0, m1

    ; Around 2% faster than 2x vperm2f128 + 2x movapd
    vextractf128 [outq + 16*0], m2, 0
    vextractf128 [outq + 16*1], m0, 0
    vextractf128 [outq + 16*2], m2, 1
    vextractf128 [outq + 16*3], m0, 1

%if %1
    ret
%else
    RET
%endif

%if %1
cglobal fft8_ns_float, 4, 5, 4, ctx, out, in, stride, tmp
    call mangle(ff_tx_fft8_asm_float_avx)
    RET
%endif
%endmacro

FFT8_AVX_FN 0
FFT8_AVX_FN 1

%macro FFT16_FN 2
INIT_YMM %1
%if %2
cglobal fft16_asm_float, 0, 0, 0, ctx, out, in, stride, tmp
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
    movaps m2, [inq + 2*mmsize]
    movaps m3, [inq + 3*mmsize]
%else
cglobal fft16_float, 4, 4, 8, ctx, out, in, tmp
    mov ctxq, [ctxq + AVTXContext.map]
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m4
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m5
    LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m6
    LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m7
%endif

    FFT16 m0, m1, m2, m3, m4, m5, m6, m7

    unpcklpd m5, m1, m3
    unpcklpd m4, m0, m2
    unpckhpd m1, m1, m3
    unpckhpd m0, m0, m2

    vextractf128 [outq + 16*0], m4, 0
    vextractf128 [outq + 16*1], m0, 0
    vextractf128 [outq + 16*2], m4, 1
    vextractf128 [outq + 16*3], m0, 1
    vextractf128 [outq + 16*4], m5, 0
    vextractf128 [outq + 16*5], m1, 0
    vextractf128 [outq + 16*6], m5, 1
    vextractf128 [outq + 16*7], m1, 1

%if %2
    ret
%else
    RET
%endif

%if %2
cglobal fft16_ns_float, 4, 5, 8, ctx, out, in, stride, tmp
    call mangle(ff_tx_fft16_asm_float_ %+ %1)
    RET
%endif
%endmacro

FFT16_FN avx, 0
FFT16_FN avx, 1
FFT16_FN fma3, 0
FFT16_FN fma3, 1

%macro FFT32_FN 2
INIT_YMM %1
%if %2
cglobal fft32_asm_float, 0, 0, 0, ctx, out, in, stride, tmp
    movaps m4, [inq + 4*mmsize]
    movaps m5, [inq + 5*mmsize]
    movaps m6, [inq + 6*mmsize]
    movaps m7, [inq + 7*mmsize]
%else
cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp
    mov ctxq, [ctxq + AVTXContext.map]
    LOAD64_LUT m4, inq, ctxq, (mmsize/2)*4, tmpq, m8, m12
    LOAD64_LUT m5, inq, ctxq, (mmsize/2)*5, tmpq, m9, m13
    LOAD64_LUT m6, inq, ctxq, (mmsize/2)*6, tmpq, m10, m14
    LOAD64_LUT m7, inq, ctxq, (mmsize/2)*7, tmpq, m11, m15
%endif

    FFT8 m4, m5, m6, m7, m8, m9

%if %2
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
    movaps m2, [inq + 2*mmsize]
    movaps m3, [inq + 3*mmsize]
%else
    LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m8, m12
    LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m9, m13
    LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m10, m14
    LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m11, m15
%endif

    movaps m8, [tab_32_float]
    vperm2f128 m9, m9, [tab_32_float + 4*8 - 4*7], 0x23

    FFT16 m0, m1, m2, m3, m10, m11, m12, m13

    SPLIT_RADIX_COMBINE 1, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, \
                        m10, m11, m12, m13, m14, m15 ; temporary registers

    unpcklpd m9, m1, m3
    unpcklpd m10, m5, m7
    unpcklpd m8, m0, m2
    unpcklpd m11, m4, m6
    unpckhpd m1, m1, m3
    unpckhpd m5, m5, m7
    unpckhpd m0, m0, m2
    unpckhpd m4, m4, m6

    vextractf128 [outq + 16* 0], m8, 0
    vextractf128 [outq + 16* 1], m0, 0
    vextractf128 [outq + 16* 2], m8, 1
    vextractf128 [outq + 16* 3], m0, 1
    vextractf128 [outq + 16* 4], m9, 0
    vextractf128 [outq + 16* 5], m1, 0
    vextractf128 [outq + 16* 6], m9, 1
    vextractf128 [outq + 16* 7], m1, 1

    vextractf128 [outq + 16* 8], m11, 0
    vextractf128 [outq + 16* 9], m4, 0
    vextractf128 [outq + 16*10], m11, 1
    vextractf128 [outq + 16*11], m4, 1
    vextractf128 [outq + 16*12], m10, 0
    vextractf128 [outq + 16*13], m5, 0
    vextractf128 [outq + 16*14], m10, 1
    vextractf128 [outq + 16*15], m5, 1

%if %2
    ret
%else
    RET
%endif

%if %2
cglobal fft32_ns_float, 4, 5, 16, ctx, out, in, stride, tmp
    call mangle(ff_tx_fft32_asm_float_ %+ %1)
    RET
%endif
%endmacro

%if ARCH_X86_64
FFT32_FN avx, 0
FFT32_FN avx, 1
FFT32_FN fma3, 0
FFT32_FN fma3, 1
%endif

%macro FFT_SPLIT_RADIX_DEF 1-2
ALIGN 16
.%1 %+ pt:
    PUSH lenq
    mov lenq, (%1/4)

    add outq, (%1*4) - (%1/1)
    call .32pt

    add outq, (%1*2) - (%1/2) ; the synth loops also increment outq
    call .32pt

    POP lenq
    sub outq, (%1*4) + (%1*2) + (%1/2)

    lea rtabq, [tab_ %+ %1 %+ _float]
    lea itabq, [tab_ %+ %1 %+ _float + %1 - 4*7]

%if %0 > 1
    cmp tgtq, %1
    je .deinterleave

    mov tmpq, %1

.synth_ %+ %1:
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*%1, 6*%1, 0, 0, 0
    add outq, 8*mmsize
    add rtabq, 4*mmsize
    sub itabq, 4*mmsize
    sub tmpq, 4*mmsize
    jg .synth_ %+ %1

    cmp lenq, %1
    jg %2 ; can't do math here, nasm doesn't get it
    ret
%endif
%endmacro

%macro FFT_SPLIT_RADIX_FN 2
INIT_YMM %1
%if %2
cglobal fft_sr_asm_float, 0, 0, 0, ctx, out, in, stride, len, lut, itab, rtab, tgt, tmp
%else
cglobal fft_sr_float, 4, 10, 16, 272, ctx, out, in, stride, len, lut, itab, rtab, tgt, tmp
    movsxd lenq, dword [ctxq + AVTXContext.len]
    mov lutq, [ctxq + AVTXContext.map]
%endif
    mov tgtq, lenq

; Bottom-most/32-point transform ===============================================
ALIGN 16
.32pt:
%if %2
    movaps m4, [inq + 4*mmsize]
    movaps m5, [inq + 5*mmsize]
    movaps m6, [inq + 6*mmsize]
    movaps m7, [inq + 7*mmsize]
%else
    LOAD64_LUT m4, inq, lutq, (mmsize/2)*4, tmpq, m8, m12
    LOAD64_LUT m5, inq, lutq, (mmsize/2)*5, tmpq, m9, m13
    LOAD64_LUT m6, inq, lutq, (mmsize/2)*6, tmpq, m10, m14
    LOAD64_LUT m7, inq, lutq, (mmsize/2)*7, tmpq, m11, m15
%endif

    FFT8 m4, m5, m6, m7, m8, m9

%if %2
    movaps m0, [inq + 0*mmsize]
    movaps m1, [inq + 1*mmsize]
    movaps m2, [inq + 2*mmsize]
    movaps m3, [inq + 3*mmsize]
%else
    LOAD64_LUT m0, inq, lutq, (mmsize/2)*0, tmpq, m8, m12
    LOAD64_LUT m1, inq, lutq, (mmsize/2)*1, tmpq, m9, m13
    LOAD64_LUT m2, inq, lutq, (mmsize/2)*2, tmpq, m10, m14
    LOAD64_LUT m3, inq, lutq, (mmsize/2)*3, tmpq, m11, m15
%endif

    movaps m8, [tab_32_float]
    vperm2f128 m9, m9, [tab_32_float + 32 - 4*7], 0x23

    FFT16 m0, m1, m2, m3, m10, m11, m12, m13

    SPLIT_RADIX_COMBINE 1, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, \
                        m10, m11, m12, m13, m14, m15 ; temporary registers

    movaps [outq + 1*mmsize], m1
    movaps [outq + 3*mmsize], m3
    movaps [outq + 5*mmsize], m5
    movaps [outq + 7*mmsize], m7

%if %2
    add inq, 8*mmsize
%else
    add lutq, (mmsize/2)*8
%endif
    cmp lenq, 32
    jg .64pt

    movaps [outq + 0*mmsize], m0
    movaps [outq + 2*mmsize], m2
    movaps [outq + 4*mmsize], m4
    movaps [outq + 6*mmsize], m6

    ret

; 64-point transform ===========================================================
ALIGN 16
.64pt:
; Helper defines, these make it easier to track what's happening
%define tx1_e0 m4
%define tx1_e1 m5
%define tx1_o0 m6
%define tx1_o1 m7
%define tx2_e0 m8
%define tx2_e1 m9
%define tx2_o0 m10
%define tx2_o1 m11
%define tw_e m12
%define tw_o m13
%define tmp1 m14
%define tmp2 m15

    SWAP m4, m1
    SWAP m6, m3

%if %2
    movaps tx1_e0, [inq + 0*mmsize]
    movaps tx1_e1, [inq + 1*mmsize]
    movaps tx1_o0, [inq + 2*mmsize]
    movaps tx1_o1, [inq + 3*mmsize]
%else
    LOAD64_LUT tx1_e0, inq, lutq, (mmsize/2)*0, tmpq, tw_e, tmp1
    LOAD64_LUT tx1_e1, inq, lutq, (mmsize/2)*1, tmpq, tw_o, tmp2
    LOAD64_LUT tx1_o0, inq, lutq, (mmsize/2)*2, tmpq, tw_e, tmp1
    LOAD64_LUT tx1_o1, inq, lutq, (mmsize/2)*3, tmpq, tw_o, tmp2
%endif

    FFT16 tx1_e0, tx1_e1, tx1_o0, tx1_o1, tw_e, tw_o, tx2_o0, tx2_o1

%if %2
    movaps tx2_e0, [inq + 4*mmsize]
    movaps tx2_e1, [inq + 5*mmsize]
    movaps tx2_o0, [inq + 6*mmsize]
    movaps tx2_o1, [inq + 7*mmsize]
%else
    LOAD64_LUT tx2_e0, inq, lutq, (mmsize/2)*4, tmpq, tw_e, tmp1
    LOAD64_LUT tx2_e1, inq, lutq, (mmsize/2)*5, tmpq, tw_o, tmp2
    LOAD64_LUT tx2_o0, inq, lutq, (mmsize/2)*6, tmpq, tw_e, tmp1
    LOAD64_LUT tx2_o1, inq, lutq, (mmsize/2)*7, tmpq, tw_o, tmp2
%endif

    FFT16 tx2_e0, tx2_e1, tx2_o0, tx2_o1, tmp1, tmp2, tw_e, tw_o

    movaps tw_e, [tab_64_float]
    vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7], 0x23

%if %2
    add inq, 8*mmsize
%else
    add lutq, (mmsize/2)*8
%endif
    cmp tgtq, 64
    je .64pt_deint

    SPLIT_RADIX_COMBINE_64

    cmp lenq, 64
    jg .128pt
    ret

; 128-point transform ==========================================================
ALIGN 16
.128pt:
    PUSH lenq
    mov lenq, 32

    add outq, 16*mmsize
    call .32pt

    add outq, 8*mmsize
    call .32pt

    POP lenq
    sub outq, 24*mmsize

    lea rtabq, [tab_128_float]
    lea itabq, [tab_128_float + 128 - 4*7]

    cmp tgtq, 128
    je .deinterleave

    SPLIT_RADIX_LOAD_COMBINE_FULL 2*128, 6*128

    cmp lenq, 128
    jg .256pt
    ret

; 256-point transform ==========================================================
ALIGN 16
.256pt:
    PUSH lenq
    mov lenq, 64

    add outq, 32*mmsize
    call .32pt

    add outq, 16*mmsize
    call .32pt

    POP lenq
    sub outq, 48*mmsize

    lea rtabq, [tab_256_float]
    lea itabq, [tab_256_float + 256 - 4*7]

    cmp tgtq, 256
    je .deinterleave

    SPLIT_RADIX_LOAD_COMBINE_FULL 2*256, 6*256
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*256, 6*256, 8*mmsize, 4*mmsize, -4*mmsize

    cmp lenq, 256
    jg .512pt
    ret

; 512-point transform ==========================================================
ALIGN 16
.512pt:
    PUSH lenq
    mov lenq, 128

    add outq, 64*mmsize
    call .32pt

    add outq, 32*mmsize
    call .32pt

    POP lenq
    sub outq, 96*mmsize

    lea rtabq, [tab_512_float]
    lea itabq, [tab_512_float + 512 - 4*7]

    cmp tgtq, 512
    je .deinterleave

    mov tmpq, 4

.synth_512:
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*512, 6*512
    add outq, 8*mmsize
    add rtabq, 4*mmsize
    sub itabq, 4*mmsize
    sub tmpq, 1
    jg .synth_512

    cmp lenq, 512
    jg .1024pt
    ret

; 1024-point transform =========================================================
ALIGN 16
.1024pt:
    PUSH lenq
    mov lenq, 256

    add outq, 96*mmsize
    call .32pt

    add outq, 64*mmsize
    call .32pt

    POP lenq
    sub outq, 192*mmsize

    lea rtabq, [tab_1024_float]
    lea itabq, [tab_1024_float + 1024 - 4*7]

    cmp tgtq, 1024
    je .deinterleave

    mov tmpq, 8

.synth_1024:
    SPLIT_RADIX_LOAD_COMBINE_FULL 2*1024, 6*1024
    add outq, 8*mmsize
    add rtabq, 4*mmsize
    sub itabq, 4*mmsize
    sub tmpq, 1
    jg .synth_1024

    cmp lenq, 1024
    jg .2048pt
    ret

; 2048 to 2097152-point transforms =============================================
FFT_SPLIT_RADIX_DEF 2048,    .4096pt
FFT_SPLIT_RADIX_DEF 4096,    .8192pt
FFT_SPLIT_RADIX_DEF 8192,    .16384pt
FFT_SPLIT_RADIX_DEF 16384,   .32768pt
FFT_SPLIT_RADIX_DEF 32768,   .65536pt
FFT_SPLIT_RADIX_DEF 65536,   .131072pt
FFT_SPLIT_RADIX_DEF 131072,  .262144pt
FFT_SPLIT_RADIX_DEF 262144,  .524288pt
FFT_SPLIT_RADIX_DEF 524288,  .1048576pt
FFT_SPLIT_RADIX_DEF 1048576, .2097152pt
FFT_SPLIT_RADIX_DEF 2097152

;===============================================================================
; Final synthesis + deinterleaving code
;===============================================================================
.deinterleave:
%if %2
    PUSH strideq
%endif
    mov tgtq, lenq
    imul tmpq, lenq, 2
    lea strideq, [4*lenq + tmpq]

.synth_deinterleave:
    SPLIT_RADIX_COMBINE_DEINTERLEAVE_FULL tmpq, strideq
    add outq, 8*mmsize
    add rtabq, 4*mmsize
    sub itabq, 4*mmsize
    sub tgtq, 4*mmsize
    jg .synth_deinterleave

%if %2
    POP strideq
    sub outq, tmpq
    neg tmpq
    lea inq, [inq + tmpq*4]
    ret
%else
    RET
%endif

; 64-point deinterleave which only has to load 4 registers =====================
.64pt_deint:
    SPLIT_RADIX_COMBINE_LITE 1, m0, m1, tx1_e0, tx2_e0, tw_e, tw_o, tmp1, tmp2
    SPLIT_RADIX_COMBINE_HALF 0, m2, m3, tx1_o0, tx2_o0, tw_e, tw_o, tmp1, tmp2, tw_e

    unpcklpd tmp1, m0, m2
    unpcklpd tmp2, m1, m3
    unpcklpd tw_o, tx1_e0, tx1_o0
    unpcklpd tw_e, tx2_e0, tx2_o0
    unpckhpd m0, m0, m2
    unpckhpd m1, m1, m3
    unpckhpd tx1_e0, tx1_e0, tx1_o0
    unpckhpd tx2_e0, tx2_e0, tx2_o0

    vextractf128 [outq +  0*mmsize +  0], tmp1, 0
    vextractf128 [outq +  0*mmsize + 16], m0, 0
    vextractf128 [outq +  4*mmsize +  0], tmp2, 0
    vextractf128 [outq +  4*mmsize + 16], m1, 0

    vextractf128 [outq +  8*mmsize +  0], tw_o, 0
    vextractf128 [outq +  8*mmsize + 16], tx1_e0, 0
    vextractf128 [outq +  9*mmsize +  0], tw_o, 1
    vextractf128 [outq +  9*mmsize + 16], tx1_e0, 1

    vperm2f128 tmp1, tmp1, m0, 0x31
    vperm2f128 tmp2, tmp2, m1, 0x31

    vextractf128 [outq + 12*mmsize +  0], tw_e, 0
    vextractf128 [outq + 12*mmsize + 16], tx2_e0, 0
    vextractf128 [outq + 13*mmsize +  0], tw_e, 1
    vextractf128 [outq + 13*mmsize + 16], tx2_e0, 1

    movaps tw_e, [tab_64_float + mmsize]
    vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7 - mmsize], 0x23

    movaps m0, [outq + 1*mmsize]
    movaps m1, [outq + 3*mmsize]
    movaps m2, [outq + 5*mmsize]
    movaps m3, [outq + 7*mmsize]

    movaps [outq + 1*mmsize], tmp1
    movaps [outq + 5*mmsize], tmp2

    SPLIT_RADIX_COMBINE 0, m0, m2, m1, m3, tx1_e1, tx2_e1, tx1_o1, tx2_o1, tw_e, tw_o, \
                        tmp1, tmp2, tx2_o0, tx1_o0, tx2_e0, tx1_e0 ; temporary registers

    unpcklpd tmp1, m0, m1
    unpcklpd tmp2, m2, m3
    unpcklpd tw_e, tx1_e1, tx1_o1
    unpcklpd tw_o, tx2_e1, tx2_o1
    unpckhpd m0, m0, m1
    unpckhpd m2, m2, m3
    unpckhpd tx1_e1, tx1_e1, tx1_o1
    unpckhpd tx2_e1, tx2_e1, tx2_o1

    vextractf128 [outq +  2*mmsize +  0], tmp1, 0
    vextractf128 [outq +  2*mmsize + 16], m0, 0
    vextractf128 [outq +  3*mmsize +  0], tmp1, 1
    vextractf128 [outq +  3*mmsize + 16], m0, 1

    vextractf128 [outq +  6*mmsize +  0], tmp2, 0
    vextractf128 [outq +  6*mmsize + 16], m2, 0
    vextractf128 [outq +  7*mmsize +  0], tmp2, 1
    vextractf128 [outq +  7*mmsize + 16], m2, 1

    vextractf128 [outq + 10*mmsize +  0], tw_e, 0
    vextractf128 [outq + 10*mmsize + 16], tx1_e1, 0
    vextractf128 [outq + 11*mmsize +  0], tw_e, 1
    vextractf128 [outq + 11*mmsize + 16], tx1_e1, 1

    vextractf128 [outq + 14*mmsize +  0], tw_o, 0
    vextractf128 [outq + 14*mmsize + 16], tx2_e1, 0
    vextractf128 [outq + 15*mmsize +  0], tw_o, 1
    vextractf128 [outq + 15*mmsize + 16], tx2_e1, 1

%if %2
    sub inq, 16*mmsize
    ret
%else
    RET
%endif

%if %2
cglobal fft_sr_ns_float, 4, 10, 16, 272, ctx, out, in, tmp, len, lut, itab, rtab, tgt, off
    movsxd lenq, dword [ctxq + AVTXContext.len]
    mov lutq, [ctxq + AVTXContext.map]

    call mangle(ff_tx_fft_sr_asm_float_ %+ %1)
    RET
%endif
%endmacro

%if ARCH_X86_64
FFT_SPLIT_RADIX_FN avx, 0
FFT_SPLIT_RADIX_FN avx, 1
FFT_SPLIT_RADIX_FN fma3, 0
FFT_SPLIT_RADIX_FN fma3, 1
FFT_SPLIT_RADIX_FN avx2, 0
FFT_SPLIT_RADIX_FN avx2, 1
%endif

%macro FFT15_FN 2
INIT_YMM avx2
cglobal fft15_ %+ %2, 4, 10, 16, ctx, out, in, stride, len, lut, tmp, tgt5, stride3, stride5
    mov lutq, [ctxq + AVTXContext.map]

    imul stride3q, strideq, 3
    imul stride5q, strideq, 5

    movaps m11, [mask_mmppmmmm]    ; mmppmmmm
    movaps m10, [tab_53_float]     ; tab5
    movaps xm9, [tab_53_float + 32] ; tab3
    vpermpd m9, m9, q1110          ; tab[23232323]
    movaps m8, [s15_perm]

%if %1
    movups xm0, [inq]
    movddup xm5, [inq + 16]
    movups m2, [inq + mmsize*0 + 24]
    movups m3, [inq + mmsize*1 + 24]
    movups m4, [inq + mmsize*2 + 24]
%else
    LOAD64_LUT xm0, inq, lutq, 0, tmpq, m14, xm15
    LOAD64_LUT m2, inq, lutq, (mmsize/2)*0 + 12, tmpq, m6, m7
    LOAD64_LUT m3, inq, lutq, (mmsize/2)*1 + 12, tmpq, m14, m15
    LOAD64_LUT m4, inq, lutq, (mmsize/2)*2 + 12, tmpq, m6, m7
    mov tmpd, [lutq + 8]
    movddup xm5, [inq + tmpq*8]
%endif

    FFT15

    lea tgt5q, [outq + stride5q]
    lea tmpq, [outq + stride5q*2]

    movhps [outq], xm14             ; out[0]
    movhps [outq + stride5q*1], xm15 ; out[5]
    movlps [outq + stride5q*2], xm15 ; out[10]

    vextractf128 xm3, m0, 1
    vextractf128 xm4, m1, 1
    vextractf128 xm5, m2, 1

    movlps [outq + strideq*1], xm1
    movhps [outq + strideq*2], xm2
    movlps [outq + stride3q*1], xm3
    movhps [outq + strideq*4], xm4
    movlps [outq + stride3q*2], xm0
    movlps [outq + strideq*8], xm5
    movhps [outq + stride3q*4], xm0
    movhps [tgt5q + strideq*2], xm1
    movhps [tgt5q + strideq*4], xm3
    movlps [tmpq + strideq*1], xm2
    movlps [tmpq + stride3q*1], xm4
    movhps [tmpq + strideq*4], xm5

    RET
%endmacro

%if ARCH_X86_64
FFT15_FN 0, float
FFT15_FN 1, ns_float
%endif

%macro IMDCT_FN 1
INIT_YMM %1
cglobal mdct_inv_float, 4, 14, 16, 320, ctx, out, in, stride, len, lut, exp, t1, t2, t3, \
                                        t4, t5, btmp
    movsxd lenq, dword [ctxq + AVTXContext.len]
    mov expq, [ctxq + AVTXContext.exp]

    lea t1d, [lend - 1]
    imul t1d, strided

    mov btmpq, ctxq                    ; backup original context
    mov lutq, [ctxq + AVTXContext.map] ; load map

    cmp strideq, 4
    je .stride4

    shl strideq, 1
    movd xm4, strided
    vpbroadcastd m4, xm4 ; stride splatted
    movd xm5, t1d
    vpbroadcastd m5, xm5 ; offset splatted

    mov t2q, outq        ; don't modify the original output
    pcmpeqd m15, m15     ; set all bits to 1

.stridex_pre:
    pmulld m2, m4, [lutq] ; multiply by stride
    movaps m0, m15
    psubd m3, m5, m2      ; subtract from offset
    movaps m1, m15
    vgatherdps m6, [inq + m2], m0 ; im
    vgatherdps m7, [inq + m3], m1 ; re

    movaps m8, [expq + 0*mmsize]  ; tab 1
    movaps m9, [expq + 1*mmsize]  ; tab 2

    unpcklps m0, m7, m6           ; re, im, re, im
    unpckhps m1, m7, m6           ; re, im, re, im

    vperm2f128 m2, m1, m0, 0x02   ; output order
    vperm2f128 m3, m1, m0, 0x13   ; output order

    movshdup m10, m8              ; tab 1 imim
    movshdup m11, m9              ; tab 2 imim
    movsldup m12, m8              ; tab 1 rere
    movsldup m13, m9              ; tab 2 rere

    mulps m10, m2                 ; 1 reim * imim
    mulps m11, m3                 ; 2 reim * imim

    shufps m10, m10, m10, q2301
    shufps m11, m11, m11, q2301

    fmaddsubps m10, m12, m2, m10
    fmaddsubps m11, m13, m3, m11

    movups [t2q + 0*mmsize], m10
    movups [t2q + 1*mmsize], m11

    add expq, mmsize*2
    add lutq, mmsize
    add t2q, mmsize*2
    sub lenq, mmsize/2
    jg .stridex_pre
    jmp .transform

.stride4:
    lea expq, [expq + lenq*4]
    lea lutq, [lutq + lenq*2]
    lea t1q, [inq + t1q]
    lea t1q, [t1q + strideq - mmsize]
    lea t2q, [lenq*2 - mmsize/2]

.stride4_pre:
    movups m4, [inq]
    movups m3, [t1q]

    movsldup m1, m4 ; im im, im im
    movshdup m0, m3 ; re re, re re
    movshdup m4, m4 ; re re, re re (2)
    movsldup m3, m3 ; im im, im im (2)

    movups m2, [expq]         ; tab
    movups m5, [expq + 2*t2q] ; tab (2)

    vpermpd m0, m0, q0123 ; flip
    shufps m7, m2, m2, q2301
    vpermpd m4, m4, q0123 ; flip (2)
    shufps m8, m5, m5, q2301

    mulps m1, m7 ; im im * tab.reim
    mulps m3, m8 ; im im * tab.reim (2)

    fmaddsubps m0, m0, m2, m1
    fmaddsubps m4, m4, m5, m3

    vextractf128 xm3, m0, 1
    vextractf128 xm6, m4, 1

    ; scatter
    movsxd strideq, dword [lutq + 0*4]
    movsxd lenq, dword [lutq + 1*4]
    movsxd t3q, dword [lutq + 2*4]
    movsxd t4q, dword [lutq + 3*4]

    movlps [outq + strideq*8], xm0
    movhps [outq + lenq*8], xm0
    movlps [outq + t3q*8], xm3
    movhps [outq + t4q*8], xm3

    movsxd strideq, dword [lutq + 0*4 + t2q]
    movsxd lenq, dword [lutq + 1*4 + t2q]
    movsxd t3q, dword [lutq + 2*4 + t2q]
    movsxd t4q, dword [lutq + 3*4 + t2q]

    movlps [outq + strideq*8], xm4
    movhps [outq + lenq*8], xm4
    movlps [outq + t3q*8], xm6
    movhps [outq + t4q*8], xm6

    add lutq, mmsize/2
    add expq, mmsize
    add inq, mmsize
    sub t1q, mmsize
    sub t2q, mmsize
    jge .stride4_pre

.transform:
    mov strideq, 2*4
    mov t4q, ctxq                      ; backup original context
    mov t5q, [ctxq + AVTXContext.fn]   ; subtransform's jump point
    mov ctxq, [ctxq + AVTXContext.sub]
    mov lutq, [ctxq + AVTXContext.map]
    movsxd lenq, dword [ctxq + AVTXContext.len]

    mov inq, outq                      ; in-place transform
    call t5q                           ; call the FFT

    mov ctxq, t4q                      ; restore original context
    movsxd lenq, dword [ctxq + AVTXContext.len]
    mov expq, [ctxq + AVTXContext.exp]
    lea expq, [expq + lenq*4]

    xor t1q, t1q                       ; low
    lea t2q, [lenq*4 - mmsize]         ; high

.post:
    movaps m2, [expq + t2q] ; tab h
    movaps m3, [expq + t1q] ; tab l
    movups m0, [outq + t2q] ; in h
    movups m1, [outq + t1q] ; in l

    movshdup m4, m2 ; tab h imim
    movshdup m5, m3 ; tab l imim
    movsldup m6, m2 ; tab h rere
    movsldup m7, m3 ; tab l rere

    shufps m2, m0, m0, q2301 ; in h imre
    shufps m3, m1, m1, q2301 ; in l imre

    mulps m6, m0
    mulps m7, m1

    fmaddsubps m4, m4, m2, m6
    fmaddsubps m5, m5, m3, m7

    vpermpd m3, m5, q0123 ; flip
    vpermpd m2, m4, q0123 ; flip

    blendps m1, m2, m5, 01010101b
    blendps m0, m3, m4, 01010101b

    movups [outq + t2q], m0
    movups [outq + t1q], m1

    add t1q, mmsize
    sub t2q, mmsize
    sub lenq, mmsize/2
    jg .post

    RET
%endmacro

%if ARCH_X86_64
IMDCT_FN avx2
%endif

%macro PFA_15_FN 2
INIT_YMM %1
%if %2
cglobal fft_pfa_15xM_asm_float, 0, 0, 0, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, \
                                         tgt5, stride3, stride5, btmp
%else
cglobal fft_pfa_15xM_float, 4, 14, 16, 320, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, \
                                            tgt5, stride3, stride5, btmp
%endif

%if %2
    PUSH inq
    PUSH tgt5q
    PUSH stride3q
    PUSH stride5q
    PUSH btmpq
%endif

    PUSH strideq

    mov btmpq, outq

    mov outq, [ctxq + AVTXContext.tmp]
%if !%2
    movsxd lenq, dword [ctxq + AVTXContext.len]
    mov lutq, [ctxq + AVTXContext.map]
%endif

    ; Load stride (second transform's length) and second transform's LUT
    mov tmpq, [ctxq + AVTXContext.sub]
    movsxd strideq, dword [tmpq + AVTXContext.len]
    mov mapq, [tmpq + AVTXContext.map]

    shl strideq, 3
    imul stride3q, strideq, 3
    imul stride5q, strideq, 5

    movaps m11, [mask_mmppmmmm]     ; mmppmmmm
    movaps m10, [tab_53_float]      ; tab5
    movaps xm9, [tab_53_float + 32] ; tab3
    vpermpd m9, m9, q1110           ; tab[23232323]
    movaps m8, [s15_perm]

.dim1:
    mov tmpd, [mapq]
    lea tgtq, [outq + tmpq*8]

%if %2
    movups xm0, [inq]                ; in[0,1].reim
    movddup xm5, [inq + 16]          ; in[2].reimreim
    movups m2, [inq + mmsize*0 + 24] ; in[3-6].reim
    movups m3, [inq + mmsize*1 + 24] ; in[7-11].reim
    movups m4, [inq + mmsize*2 + 24] ; in[12-15].reim
%else
    LOAD64_LUT xm0, inq, lutq, 0, tmpq, m14, xm15 ; in[0,1].reim
    LOAD64_LUT m2, inq, lutq, (mmsize/2)*0 + 12, tmpq, m6, m7
    LOAD64_LUT m3, inq, lutq, (mmsize/2)*1 + 12, tmpq, m14, m15
    LOAD64_LUT m4, inq, lutq, (mmsize/2)*2 + 12, tmpq, m6, m7
    mov tmpd, [lutq + 8]
    movddup xm5, [inq + tmpq*8] ; in[2].reimreim
%endif

    FFT15

    lea tgt5q, [tgtq + stride5q]
    lea tmpq, [tgtq + stride5q*2]

    movhps [tgtq], xm14              ; out[0]
    movhps [tgtq + stride5q*1], xm15 ; out[5]
    movlps [tgtq + stride5q*2], xm15 ; out[10]

    vextractf128 xm3, m0, 1
    vextractf128 xm4, m1, 1
    vextractf128 xm5, m2, 1

    movlps [tgtq + strideq*1], xm1
    movhps [tgtq + strideq*2], xm2
    movlps [tgtq + stride3q*1], xm3
    movhps [tgtq + strideq*4], xm4
    movlps [tgtq + stride3q*2], xm0
    movlps [tgtq + strideq*8], xm5
    movhps [tgtq + stride3q*4], xm0
    movhps [tgt5q + strideq*2], xm1
    movhps [tgt5q + strideq*4], xm3
    movlps [tmpq + strideq*1], xm2
    movlps [tmpq + stride3q*1], xm4
    movhps [tmpq + strideq*4], xm5

%if %2
    add inq, mmsize*3 + 24
%else
    add lutq, (mmsize/2)*3 + 12
%endif
    add mapq, 4
    sub lenq, 15
    jg .dim1

    ; Second transform setup
    mov stride5q, ctxq                              ; backup original context
    movsxd stride3q, dword [ctxq + AVTXContext.len] ; full length
    mov tgt5q, [ctxq + AVTXContext.fn]              ; subtransform's jump point

    mov inq, outq                                   ; in-place transform
    mov ctxq, [ctxq + AVTXContext.sub]              ; load subtransform's context
    mov lutq, [ctxq + AVTXContext.map]              ; load subtransform's map
    movsxd lenq, dword [ctxq + AVTXContext.len]     ; load subtransform's length

.dim2:
    call tgt5q ; call the FFT
    lea inq, [inq + lenq*8]
    lea outq, [outq + lenq*8]
    sub stride3q, lenq
    jg .dim2

    mov ctxq, stride5q ; restore original context
    mov lutq, [ctxq + AVTXContext.map]
    mov inq, [ctxq + AVTXContext.tmp]
    movsxd lenq, dword [ctxq + AVTXContext.len] ; full length

    lea stride3q, [lutq + lenq*4] ; second part of the LUT
    mov stride5q, lenq
    mov tgt5q, btmpq
    POP strideq
    lea tmpq, [strideq + 2*strideq]

.post:
    LOAD64_LUT m0, inq, stride3q, 0, tmpq, m8, m9
    vextractf128 xm1, m0, 1
    movlps [tgt5q], xm0
    movhps [tgt5q + strideq], xm0
    movlps [tgt5q + strideq*2], xm1
    movhps [tgt5q + tmpq], xm1

    lea tgt5q, [tgt5q + 4*strideq]
    add stride3q, mmsize/2
    sub stride5q, mmsize/8
    jg .post

%if %2
    mov outq, btmpq
    POP btmpq
    POP stride5q
    POP stride3q
    POP tgt5q
    POP inq
    ret
%else
    RET
%endif

%if %2
cglobal fft_pfa_15xM_ns_float, 4, 14, 16, 320, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, \
                                               tgt5, stride3, stride5, btmp
    movsxd lenq, dword [ctxq + AVTXContext.len]
    mov lutq, [ctxq + AVTXContext.map]

    call mangle(ff_tx_fft_pfa_15xM_asm_float)
    RET
%endif
%endmacro

%if ARCH_X86_64
PFA_15_FN avx2, 0
PFA_15_FN avx2, 1
%endif
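
; For reference, the 15xM PFA driver above follows the usual two-stage
; prime-factor structure (a hedged scalar sketch; the actual index maps are
; precomputed into ctx->map on the C side of libavutil/tx):
;     for (i = 0; i < M; i++)        // .dim1: 15-point FFTs into ctx->tmp,
;         fft15(tmp, in, map, ...);  //        scattered via the first map
;     for (i = 0; i < 15; i++)       // .dim2: M-point sub-FFTs, in-place
;         subfft(tmp + i*M, ...);
;     for (i = 0; i < 15*M; i++)     // .post: reorder via the second half
;         out[i] = tmp[lut[i]];      //        of the LUT
; Since 15 and M are coprime, no twiddles are needed between the two stages.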