/*
 * Copyright (c) Lynne
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

/* See `doc/transforms.md` for the code upon which the transforms here are
 * based.
 *
 * File conventions:
 * GPRs: x0-x3   - arguments, untouched
 *       x4      - Lookup table base pointer
 *       x5-x6   - macro ld1 temps/function scratch
 *       x7-x9   - FFT table state
 *       x10-x17 - lookup table/macro scratch
 *       w19-w20 - current/target length when needed
 *       x21-x22 - len*2, len*6
 *
 * Vectors: v0-v7   - coefficients
 *          v8-v15  - coefficients when needed, otherwise untouched
 *          v16-v30 - used as needed
 *          v31     - -1.0, +1.0, -1.0, +1.0. Never touched after loading.
 *
 * Stack: backup for v8-v15 and x19-x22 when needed, and transform lengths
 */

#define M_SQRT1_2 0.707106781186547524401
#define COS16_1   0.92387950420379638671875
#define COS16_3   0.3826834261417388916015625

/* We only ever load this once at the start, and then live with losing an
 * entire register, as we need to lug this around everywhere.
 * Clearly should be integrated into an fsadd and fmlsa, but "muh RISC!". */
const subadd, align=4
        .float -1.0, 1.0, -1.0, 1.0
endconst

.macro LOAD_SUBADD
        movrel          x5, subadd
        ld1             { v31.4s }, [x5]
.endm

.macro SETUP_LUT no_lut=0
.if \no_lut == 0
        ldr             x4, [x0, #8]
.endif
.endm

.macro LOAD_INPUT dst1, dst2, dst3, dst4, src, no_lut=0, discont=0
.if \no_lut == 1
.if \discont == 1
        ldp             q\dst1\(), q\dst2\(), [\src\()]
        ldp             q\dst3\(), q\dst4\(), [\src\(), #32]
        add             \src\(), \src\(), #64
.else
        ld1             { v\dst1\().4s, v\dst2\().4s, v\dst3\().4s, v\dst4\().4s }, [\src], #64
.endif
.else
        ldp             w10, w11, [x4, #0 ]
        ldp             w12, w13, [x4, #8 ]
        ldp             w14, w15, [x4, #16]
        ldp             w16, w17, [x4, #24]

        add             x4, x4, #32

        ldr             d\dst1, [\src, x10, lsl #3]
        add             x11, \src, x11, lsl #3
        ldr             d\dst2, [\src, x12, lsl #3]
        add             x13, \src, x13, lsl #3
        ldr             d\dst3, [\src, x14, lsl #3]
        add             x15, \src, x15, lsl #3
        ldr             d\dst4, [\src, x16, lsl #3]
        add             x17, \src, x17, lsl #3

        ld1             { v\dst1\().d }[1], [x11]
        ld1             { v\dst2\().d }[1], [x13]
        ld1             { v\dst3\().d }[1], [x15]
        ld1             { v\dst4\().d }[1], [x17]
.endif
.endm

.macro FFT4 e0, o0, standalone
        fadd            v16.4s, \e0\().4s, \o0\().4s    // r1..4
        fsub            \e0\().4s, \e0\().4s, \o0\().4s // t1..4

        rev64           v18.4s, \e0\().4s

        zip2            \o0\().2d, v16.2d, \e0\().2d
        zip1            v17.2d, v16.2d, \e0\().2d

        mov             \o0\().d[1], v18.d[1]

        fadd            \e0\().4s, v17.4s, \o0\().4s // a1,2 b1,4
        fsub            v16.4s, v17.4s, \o0\().4s    // a3,4 b3,2

        mov             \o0\().16b, v16.16b // Swap once again...
        mov             \o0\().s[3], \e0\().s[3]
        mov             \e0\().s[3], v16.s[3]

.if \standalone == 0
        uzp2            \o0\().2d, \e0\().2d, \o0\().2d
        uzp1            \e0\().2d, \e0\().2d, v16.2d
.endif
.endm
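
/* For reference, a scalar sketch of what FFT4 above computes, assuming
 * \e0 = { z0, z1 } and \o0 = { z2, z3 } packed as re,im pairs (the names
 * below are illustrative only, not part of the FFmpeg API):
 *
 *     typedef struct { float re, im; } cf;
 *     static void fft4(cf *z) // forward 4-point DFT, in-place
 *     {
 *         cf r0 = { z[0].re + z[2].re, z[0].im + z[2].im }; // r pairs
 *         cf r1 = { z[1].re + z[3].re, z[1].im + z[3].im };
 *         cf t0 = { z[0].re - z[2].re, z[0].im - z[2].im }; // t pairs
 *         cf t1 = { z[1].re - z[3].re, z[1].im - z[3].im };
 *         z[0] = (cf){ r0.re + r1.re, r0.im + r1.im };      // a1,2
 *         z[2] = (cf){ r0.re - r1.re, r0.im - r1.im };      // a3,4
 *         z[1] = (cf){ t0.re + t1.im, t0.im - t1.re };      // b1,2 (-i * t1)
 *         z[3] = (cf){ t0.re - t1.im, t0.im + t1.re };      // b3,4 (+i * t1)
 *     }
 */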
const shuf_4pt_x2, align=4
        .byte           24, 25, 26, 27 // reg2, 3
        .byte           12, 13, 14, 15 // reg1, 4
        .byte            8,  9, 10, 11 // reg1, 3
        .byte           28, 29, 30, 31 // reg2, 4
endconst

// Identical to FFT4, but does 2 transforms in parallel, with no deinterleaving
.macro FFT4_X2 e0, o0, e1, o1, \
               t0=v16, t1=v17, t2=v18, t3=v19, t4=v20, t5=v21, t6=v22

        fadd            \t0\().4s, \e0\().4s, \o0\().4s // r1234
        fadd            \t2\().4s, \e1\().4s, \o1\().4s // r1234
        fsub            \e0\().4s, \e0\().4s, \o0\().4s // t1234
        fsub            \e1\().4s, \e1\().4s, \o1\().4s // t1234

        movrel          x5, shuf_4pt_x2

        rev64           \t4\().4s, \e0\().4s
        rev64           \t5\().4s, \e1\().4s

        zip2            \o0\().2d, \t0\().2d, \e0\().2d // t3,4 r3,4
        zip2            \o1\().2d, \t2\().2d, \e1\().2d // t3,4 r3,4

        ld1             { \t6\().16b }, [x5]

        mov             \o0\().d[1], \t4\().d[1]
        mov             \o1\().d[1], \t5\().d[1]

        zip1            \t1\().2d, \t0\().2d, \e0\().2d // t1,2 r1,2
        zip1            \t3\().2d, \t2\().2d, \e1\().2d // t1,2 r1,2

        fsub            \t4\().4s, \t1\().4s, \o0\().4s // a34 b32
        fadd            \t5\().4s, \t1\().4s, \o0\().4s // a12 b14
        fsub            \t2\().4s, \t3\().4s, \o1\().4s // a34 b32
        fadd            \t3\().4s, \t3\().4s, \o1\().4s // a12 b14

        // TODO: experiment with movs instead of tables here
        tbl             \o0\().16b, { \t4\().16b, \t5\().16b }, \t6\().16b // b1234
        tbl             \o1\().16b, { \t2\().16b, \t3\().16b }, \t6\().16b // b1234

        zip1            \e0\().2d, \t5\().2d, \t4\().2d // a1234
//      zip2            \o0\().2d, \t5\().2d, \t4\().2d // b1432
        zip1            \e1\().2d, \t3\().2d, \t2\().2d // a1234
//      zip2            \o1\().2d, \t3\().2d, \t2\().2d // b1432
//      rev64           \o0\().4s, \o0\().4s // b4123
//      rev64           \o1\().4s, \o1\().4s // b4123
//      ext             \o0\().16b, \o0\().16b, \o0\().16b, #4 // b1234
//      ext             \o1\().16b, \o1\().16b, \o1\().16b, #4 // b1234
.endm

const tab_8pt, align=4
        .float          M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2
endconst

.macro FFT8 e0, e1, o0, o1, \
            t0=v16, t1=v17, t2=v18, t3=v19, t4=v20, t5=v21, t6=v22

        movrel          x5, tab_8pt

        fsub            \t1\().4s, \e1\().4s, \o1\().4s // j1234
        fadd            \o1\().4s, \e1\().4s, \o1\().4s // k1234
        fsub            \t0\().4s, \e0\().4s, \o0\().4s // r1234
        fadd            \o0\().4s, \e0\().4s, \o0\().4s // q1234

        ld1             { \t5\().4s }, [x5]

        ext             \t4\().16b, \o1\().16b, \o1\().16b, #12
        rev64           \t4\().4s, \t4\().4s

        ext             \t2\().16b, \o0\().16b, \t4\().16b, #8 // o0[0,1], o1[3,2]
        mov             \o0\().d[1], \t4\().d[1]               // o0[3, 4]; o1[1, 4]

        fsub            \e1\().4s, \o0\().4s, \t2\().4s // s34, g43
        fadd            \t2\().4s, \o0\().4s, \t2\().4s // s12, g12

        rev64           \t6\().4s, v31.4s      // 1, -1, 1, -1
        dup             \o0\().2d, \t0\().d[0] // r1212
        dup             \o1\().2d, \t0\().d[1] // r3434

        rev64           \t4\().4s, \e1\().4s // xxg34
        rev64           \o1\().4s, \o1\().4s // r4343

        ext             \t6\().16b, v31.16b, \t6\().16b, #8 // -1, 1, 1, -1
        zip1            \t3\().2d, \t2\().2d, \e1\().2d     // s1234
        zip2            \t2\().2d, \t2\().2d, \t4\().2d     // g1234

        fadd            \e0\().4s, \t3\().4s, \t2\().4s // out_e1
        fsub            \e1\().4s, \t3\().4s, \t2\().4s // out_e2

        fmul            \t1\().4s, \t1\().4s, \t5\().4s // j * +--+M_SQRT1_2
        fmls            \o0\().4s, \o1\().4s, \t6\().4s // z1234

        rev64           \t4\().4s, \t1\().4s         // j2143
        fmla            \t1\().4s, \t4\().4s, v31.4s // l2143

        rev64           \t4\().4s, \t1\().4s // l1234
        ext             \t4\().16b, \t4\().16b, \t4\().16b, #8 // l3412

        fmla            \t4\().4s, \t1\().4s, v31.4s // t1234

        fadd            \o1\().4s, \o0\().4s, \t4\().4s // out_o2
        fsub            \o0\().4s, \o0\().4s, \t4\().4s // out_o1
.endm
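
/* The tab_8pt constants above implement the w8 = exp(-i*pi/4) twiddle applied
 * in FFT8's odd half: since cos(pi/4) = sin(pi/4) = M_SQRT1_2, no general
 * complex multiply is needed. A scalar sketch (illustrative names only):
 *
 *     // z * (1 - i)/sqrt(2)
 *     out.re = (z.re + z.im) * M_SQRT1_2;
 *     out.im = (z.im - z.re) * M_SQRT1_2;
 */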
// Identical to FFT8, but does 2 transforms in parallel
.macro FFT8_X2 e0, e1, o0, o1, e2, e3, o2, o3

        movrel          x5, tab_8pt

        fadd            v19.4s, \e3\().4s, \o3\().4s // k1234
        fadd            v17.4s, \e1\().4s, \o1\().4s // k1234
        fadd            v18.4s, \e2\().4s, \o2\().4s // q1234
        fadd            v16.4s, \e0\().4s, \o0\().4s // q1234

        ld1             { v23.4s }, [x5]

        ext             v22.16b, v19.16b, v19.16b, #12
        ext             v21.16b, v17.16b, v17.16b, #12

        rev64           v22.4s, v22.4s
        rev64           v21.4s, v21.4s

        ext             v19.16b, v18.16b, v22.16b, #8
        ext             v17.16b, v16.16b, v21.16b, #8

        mov             v18.d[1], v22.d[1]
        mov             v21.d[0], v16.d[0]

        fadd            v22.4s, v18.4s, v19.4s // s12, g12
        fsub            v19.4s, v18.4s, v19.4s // s34, g43
        fsub            v18.4s, v21.4s, v17.4s // s34, g43
        fadd            v16.4s, v21.4s, v17.4s // s12, g12

        fsub            \e0\().4s, \e0\().4s, \o0\().4s // r1234
        fsub            v20.4s, \e1\().4s, \o1\().4s    // j1234
        fsub            \e2\().4s, \e2\().4s, \o2\().4s // r1234
        fsub            v21.4s, \e3\().4s, \o3\().4s    // j1234

        rev64           v24.4s, v31.4s            // 1, -1, 1, -1
        zip1            v17.2d, v16.2d, v18.2d    // s1234
        zip1            \e1\().2d, v22.2d, v19.2d // s1234

        rev64           v18.4s, v18.4s // xxg34
        rev64           v19.4s, v19.4s // xxg34

        zip2            v16.2d, v16.2d, v18.2d    // g1234
        zip2            \e3\().2d, v22.2d, v19.2d // g1234

        dup             \o0\().2d, \e0\().d[0] // r1212
        dup             \o1\().2d, \e0\().d[1] // r3434
        dup             \o2\().2d, \e2\().d[0] // r1212
        dup             \o3\().2d, \e2\().d[1] // r3434

        fadd            \e2\().4s, \e1\().4s, \e3\().4s // out_e1
        fsub            \e3\().4s, \e1\().4s, \e3\().4s // out_e2
        fadd            \e0\().4s, v17.4s, v16.4s       // out_e1
        fsub            \e1\().4s, v17.4s, v16.4s       // out_e2

        ext             v24.16b, v31.16b, v24.16b, #8 // -1, 1, 1, -1
        rev64           \o1\().4s, \o1\().4s // r4343
        rev64           \o3\().4s, \o3\().4s // r4343

        fmul            v19.4s, v20.4s, v23.4s // j * +--+M_SQRT1_2
        fmul            v21.4s, v21.4s, v23.4s // j * +--+M_SQRT1_2

        rev64           v20.4s, v19.4s // j2143
        rev64           v18.4s, v21.4s // j2143

        fmls            \o0\().4s, \o1\().4s, v24.4s // z1234
        fmls            \o2\().4s, \o3\().4s, v24.4s // z1234

        fmla            v19.4s, v20.4s, v31.4s // l2143
        fmla            v21.4s, v18.4s, v31.4s // l2143

        rev64           v20.4s, v19.4s // l1234
        rev64           v18.4s, v21.4s // l1234
        ext             v20.16b, v20.16b, v20.16b, #8 // l3412
        ext             v18.16b, v18.16b, v18.16b, #8 // l3412

        fmla            v20.4s, v19.4s, v31.4s // t1234
        fmla            v18.4s, v21.4s, v31.4s // t1234

        fadd            \o1\().4s, \o0\().4s, v20.4s // out_o2
        fadd            \o3\().4s, \o2\().4s, v18.4s // out_o2
        fsub            \o0\().4s, \o0\().4s, v20.4s // out_o1
        fsub            \o2\().4s, \o2\().4s, v18.4s // out_o1
.endm

const tab_16pt, align=4
        .float          -COS16_1, COS16_1, -COS16_3, COS16_3 // Could be +-+- too
        .float           COS16_3, COS16_3,  COS16_1, COS16_1
        .float           1.0, 1.0, M_SQRT1_2, M_SQRT1_2
endconst

// 16-point FFT
// t3, t4, t5, t6 must be sequential
.macro FFT16 e0, e1, e2, e3, o0, o1, o2, o3, \
             t0=v16, t1=v17, t2=v18, t3=v19, t4=v20, t5=v21, t6=v22

        FFT8            \e0, \e1, \e2, \e3, \t0, \t1, \t2, \t3, \t4, \t5, \t6
        FFT4_X2         \o0, \o1, \o2, \o3, \t0, \t1, \t2, \t3, \t4, \t5, \t6

        movrel          x5, tab_16pt

        rev64           \t0\().4s, \o0\().4s // z[ 8, 9].imre
        rev64           \t1\().4s, \o2\().4s // z[10,11].imre

        ins             \t0\().d[0], xzr
        ins             \t1\().d[0], xzr

        ld1             { \t4\().4s, \t5\().4s, \t6\().4s }, [x5]
        // TODO: We could derive \t4\() or \t5\() from either, but it seems cheaper to load

        fmla            \o2\().4s, \t1\().4s, v31.4s // s[4567]
        fmls            \o0\().4s, \t0\().4s, v31.4s // s[0123]

        fmul            \t2\().4s, \o1\().4s, \t4\().4s
        fmul            \t3\().4s, \o3\().4s, \t4\().4s

        rev64           \o3\().4s, \o3\().4s
        rev64           \o1\().4s, \o1\().4s

        fmla            \t3\().4s, \o3\().4s, \t5\().4s // s[12, 13, 14, 15]
        fmls            \t2\().4s, \o1\().4s, \t5\().4s // s[ 8,  9, 10, 11]

        fmul            \t1\().4s, \o2\().4s, \t6\().4s // s[4567] * mult
        fmul            \t0\().4s, \o0\().4s, \t6\().4s // s[0123] * mult

        mov             \o1\().16b, \t3\().16b
        mov             \o2\().16b, \t1\().16b

        fsub            \t3\().4s, \t3\().4s, \t2\().4s // y34, u34
        fsub            \t1\().4s, \t1\().4s, \t0\().4s // w34, x34

        fadd            \t2\().4s, \t2\().4s, \o1\().4s // y56, u56
        rev64           \t3\().4s, \t3\().4s
        fadd            \t0\().4s, \t0\().4s, \o2\().4s // w56, x56
        rev64           \t1\().4s, \t1\().4s

        fmul            \t2\().4s, \t2\().4s, v31.4s
        fmul            \t1\().4s, \t1\().4s, v31.4s

        fadd            \o3\().4s, \e3\().4s, \t3\().4s
        fsub            \o2\().4s, \e3\().4s, \t3\().4s
        fsub            \o1\().4s, \e2\().4s, \t2\().4s
        fadd            \o0\().4s, \e2\().4s, \t2\().4s

        fsub            \e2\().4s, \e0\().4s, \t0\().4s
        fadd            \e0\().4s, \e0\().4s, \t0\().4s
        fsub            \e3\().4s, \e1\().4s, \t1\().4s
        fadd            \e1\().4s, \e1\().4s, \t1\().4s
.endm
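
/* tab_16pt above holds the length-16 twiddles: COS16_1 = cos(pi/8) and
 * COS16_3 = cos(3*pi/8) = sin(pi/8), rounded to single precision.
 * A quick check of the constants (illustrative only, compile with -lm):
 *
 *     #include <math.h>
 *     #include <stdio.h>
 *     int main(void)
 *     {
 *         printf("%.25f\n", cos(M_PI / 8));     // ~0.9238795325... (COS16_1)
 *         printf("%.25f\n", cos(3 * M_PI / 8)); // ~0.3826834324... (COS16_3)
 *         return 0;
 *     }
 */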
function ff_tx_fft2_float_neon, export=1
        ld2r            { v0.2d, v1.2d }, [x2]

        fneg            v2.2s, v1.2s
        mov             v2.d[1], v1.d[0]

        fsub            v2.4s, v0.4s, v2.4s

        st1             { v2.4s }, [x1]
        ret
endfunc

.macro FFT4_FN name, inv
function ff_tx_fft4_\name\()_float_neon, export=1
        ld1             { v0.4s, v1.4s }, [x2]

.if \inv == 1
        mov             v2.d[0], v0.d[1]
        mov             v0.d[1], v1.d[1]
        mov             v1.d[1], v2.d[0]
.endif

        FFT4            v0, v1, 1

        st1             { v0.4s, v1.4s }, [x1]
        ret
endfunc
.endm

FFT4_FN fwd, 0
FFT4_FN inv, 1

.macro FFT8_FN name, no_perm
function ff_tx_fft8_\name\()_neon, export=1
        SETUP_LUT       \no_perm
        LOAD_INPUT      0, 1, 2, 3, x2, \no_perm

        LOAD_SUBADD
        FFT8            v0, v1, v2, v3

        zip1            v16.2d, v0.2d, v2.2d
        zip2            v17.2d, v0.2d, v2.2d
        zip1            v18.2d, v1.2d, v3.2d
        zip2            v19.2d, v1.2d, v3.2d
        st1             { v16.4s, v17.4s, v18.4s, v19.4s }, [x1]

        ret
endfunc
.endm

FFT8_FN float, 0
FFT8_FN ns_float, 1

.macro FFT16_FN name, no_perm
function ff_tx_fft16_\name\()_neon, export=1
        SETUP_LUT       \no_perm
        LOAD_INPUT      0, 1, 2, 3, x2, \no_perm
        LOAD_INPUT      4, 5, 6, 7, x2, \no_perm

        LOAD_SUBADD
        FFT16           v0, v1, v2, v3, v4, v5, v6, v7

        zip1            v20.2d, v0.2d, v4.2d
        zip2            v21.2d, v0.2d, v4.2d
        zip1            v22.2d, v1.2d, v6.2d
        zip2            v23.2d, v1.2d, v6.2d
        st1             { v20.4s, v21.4s, v22.4s, v23.4s }, [x1], #64

        zip1            v24.2d, v2.2d, v5.2d
        zip2            v25.2d, v2.2d, v5.2d
        zip1            v26.2d, v3.2d, v7.2d
        zip2            v27.2d, v3.2d, v7.2d
        st1             { v24.4s, v25.4s, v26.4s, v27.4s }, [x1]

        ret
endfunc
.endm

FFT16_FN float, 0
FFT16_FN ns_float, 1

.macro SETUP_SR_RECOMB len, re, im, dec
        ldr             w5, =(\len - 4*7)
        movrel          \re, X(ff_tx_tab_\len\()_float)
        add             \im, \re, x5
        mov             \dec, #-32

.if \len > 32
        mov             x21, #2*\len
        add             x22, x21, x21, lsl #1
.endif
.endm
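
/* SR_COMBINE below implements the standard split-radix recombination step.
 * With U the length-N/2 FFT of the even samples, Z and Z' the length-N/4
 * FFTs of x[4n+1] and x[4n+3], and w = exp(-2*pi*i/N), the textbook
 * identities are (a sketch, not code):
 *
 *     X[k        ] = U[k      ] + (w^k Z[k] + w^(3k) Z'[k])
 *     X[k +   N/2] = U[k      ] - (w^k Z[k] + w^(3k) Z'[k])
 *     X[k +   N/4] = U[k + N/4] - i*(w^k Z[k] - w^(3k) Z'[k])
 *     X[k + 3*N/4] = U[k + N/4] + i*(w^k Z[k] - w^(3k) Z'[k])
 *
 * The ff_tx_tab_*_float tables supply the twiddle factors (read forwards for
 * the real parts, backwards for the imaginary parts), and v31 supplies the
 * sign flips for the +-i multiplies.
 */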
.macro SR_COMBINE e0, e1, e2, e3, e4, e5, e6, e7, \
                  o0, o1, o2, o3, o4, o5, o6, o7, \
                  re, im, dec, swap_im, \
                  t0=v16, t1=v17, t2=v18, t3=v19, t4=v20, t5=v21, \
                  t6=v22, t7=v23, t8=v24, t9=v25, ta=v26, tb=v27

        ld1             { \t8\().4s, \t9\().4s }, [\im], \dec
        ld1             { \t0\().4s, \t1\().4s }, [\re], #32

.if \swap_im == 1
        ext             \t2\().16b, \t9\().16b, \t9\().16b, #8
        ext             \t3\().16b, \t8\().16b, \t8\().16b, #8
.else
        ext             \t2\().16b, \t8\().16b, \t8\().16b, #8
        ext             \t3\().16b, \t9\().16b, \t9\().16b, #8
.endif

        trn1            \t4\().4s, \t0\().4s, \t0\().4s // cos0022
        trn2            \t0\().4s, \t0\().4s, \t0\().4s // cos4466
        trn1            \t5\().4s, \t1\().4s, \t1\().4s // cos1133
        trn2            \t1\().4s, \t1\().4s, \t1\().4s // cos5577

        rev64           \t6\().4s, \o0\().4s // E m2[0,1].imre
        rev64           \t7\().4s, \o2\().4s // O m2[0,1].imre
        rev64           \t8\().4s, \o4\().4s // E m2[2,3].imre
        rev64           \t9\().4s, \o6\().4s // O m2[2,3].imre

        fmul            \t6\().4s, \t6\().4s, \t4\().4s // E m2[0,1].imre*t1[0,2]
        fmul            \t7\().4s, \t7\().4s, \t0\().4s // O m2[0,1].imre*t1[0,2]
        fmul            \t8\().4s, \t8\().4s, \t4\().4s // E m2[2,3].imre*t1[0,2]
        fmul            \t9\().4s, \t9\().4s, \t0\().4s // O m2[2,3].imre*t1[0,2]

        rev64           \ta\().4s, \o1\().4s // E m3[0,1].imre
        rev64           \tb\().4s, \o3\().4s // O m3[0,1].imre
        rev64           \t4\().4s, \o5\().4s // E m3[2,3].imre
        rev64           \t0\().4s, \o7\().4s // O m3[2,3].imre

        fmul            \ta\().4s, \ta\().4s, \t5\().4s // E m3[0,1].imre*t1[4,6]
        fmul            \tb\().4s, \tb\().4s, \t1\().4s // O m3[0,1].imre*t1[4,6]
        fmul            \t4\().4s, \t4\().4s, \t5\().4s // E m3[2,3].imre*t1[4,6]
        fmul            \t0\().4s, \t0\().4s, \t1\().4s // O m3[2,3].imre*t1[4,6]

        trn1            \t5\().4s, \t3\().4s, \t3\().4s // wim2200
        trn2            \t3\().4s, \t3\().4s, \t3\().4s // wim3311
        trn1            \t1\().4s, \t2\().4s, \t2\().4s // wim6644
        trn2            \t2\().4s, \t2\().4s, \t2\().4s // wim7755

        fmul            \t5\().4s, \t5\().4s, v31.4s
        fmul            \t3\().4s, \t3\().4s, v31.4s
        fmul            \t1\().4s, \t1\().4s, v31.4s
        fmul            \t2\().4s, \t2\().4s, v31.4s

        fmla            \t7\().4s, \o2\().4s, \t5\().4s // O w0123
        fmls            \t9\().4s, \o6\().4s, \t5\().4s // O j0123
        fmla            \t6\().4s, \o0\().4s, \t3\().4s // E w0123
        fmls            \t8\().4s, \o4\().4s, \t3\().4s // E j0123

        fmla            \ta\().4s, \o1\().4s, \t2\().4s // E w4567
        fmla            \tb\().4s, \o3\().4s, \t1\().4s // O w4567
        fmls            \t4\().4s, \o5\().4s, \t2\().4s // E j4567
        fmls            \t0\().4s, \o7\().4s, \t1\().4s // O j4567

        fsub            \t2\().4s, \t7\().4s, \t9\().4s
        fsub            \t1\().4s, \t8\().4s, \t6\().4s
        fsub            \t3\().4s, \t4\().4s, \ta\().4s
        fsub            \t5\().4s, \t0\().4s, \tb\().4s

        fadd            \t6\().4s, \t8\().4s, \t6\().4s
        fadd            \t7\().4s, \t9\().4s, \t7\().4s
        fadd            \t8\().4s, \t4\().4s, \ta\().4s
        fadd            \t9\().4s, \t0\().4s, \tb\().4s

        fmul            \t1\().4s, \t1\().4s, v31.4s
        fmul            \t2\().4s, \t2\().4s, v31.4s
        fmul            \t3\().4s, \t3\().4s, v31.4s
        fmul            \t5\().4s, \t5\().4s, v31.4s

        rev64           \t6\().4s, \t6\().4s
        rev64           \t8\().4s, \t8\().4s
        rev64           \t7\().4s, \t7\().4s
        rev64           \t9\().4s, \t9\().4s

        fsub            \o0\().4s, \e0\().4s, \t6\().4s
        fsub            \o1\().4s, \e1\().4s, \t8\().4s
        fsub            \o2\().4s, \e2\().4s, \t1\().4s
        fsub            \o3\().4s, \e3\().4s, \t3\().4s

        fsub            \o4\().4s, \e4\().4s, \t7\().4s
        fsub            \o5\().4s, \e6\().4s, \t9\().4s
        fadd            \o6\().4s, \e5\().4s, \t2\().4s
        fsub            \o7\().4s, \e7\().4s, \t5\().4s

        fadd            \e0\().4s, \e0\().4s, \t6\().4s
        fadd            \e1\().4s, \e1\().4s, \t8\().4s
        fadd            \e2\().4s, \e2\().4s, \t1\().4s
        fadd            \e3\().4s, \e3\().4s, \t3\().4s

        fadd            \e4\().4s, \e4\().4s, \t7\().4s
        fsub            \e5\().4s, \e5\().4s, \t2\().4s // swapped
        fadd            \e6\().4s, \e6\().4s, \t9\().4s // swapped
        fadd            \e7\().4s, \e7\().4s, \t5\().4s
.endm
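
/* SR_COMBINE_HALF below performs the same recombination on half as many
 * registers. The \part argument selects trn1 vs. trn2 when splaying the
 * coefficients, so a caller can reuse one loaded set of cos/wim registers
 * for both halves of a combine. */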
.macro SR_COMBINE_HALF e0, e1, e2, e3, \
                       o0, o1, o2, o3, \
                       c0, c1, c2, c3, \
                       t0, t1, t2, t3, t4, t5, part

.if \part == 0
        trn1            \t4\().4s, \c0\().4s, \c0\().4s // cos0022
        trn1            \c1\().4s, \c1\().4s, \c1\().4s // cos1133
.else
        trn2            \t4\().4s, \c0\().4s, \c0\().4s // cos0022
        trn2            \c1\().4s, \c1\().4s, \c1\().4s // cos1133
.endif
.if \part == 0
        trn2            \t5\().4s, \c2\().4s, \c2\().4s // wim7755
        trn2            \c3\().4s, \c3\().4s, \c3\().4s // wim3311
.else
        trn1            \t5\().4s, \c2\().4s, \c2\().4s // wim7755
        trn1            \c3\().4s, \c3\().4s, \c3\().4s // wim3311
.endif

        fmul            \t5\().4s, \t5\().4s, v31.4s
        fmul            \c3\().4s, \c3\().4s, v31.4s

        rev64           \t0\().4s, \o0\().4s // E m2[0,1].imre
        rev64           \t1\().4s, \o2\().4s // E m2[2,3].imre
        rev64           \t2\().4s, \o1\().4s // E m3[0,1].imre
        rev64           \t3\().4s, \o3\().4s // E m3[2,3].imre

        fmul            \o0\().4s, \o0\().4s, \c3\().4s // E m2[0,1].imre*t1[0,2]
        fmul            \o1\().4s, \o1\().4s, \t5\().4s // E m3[0,1].imre*t1[4,6]
        fmla            \o0\().4s, \t0\().4s, \t4\().4s // E w0123
        fmla            \o1\().4s, \t2\().4s, \c1\().4s // E w4567

        fmul            \t1\().4s, \t1\().4s, \t4\().4s // E m2[2,3].imre*t1[0,2]
        fmul            \t3\().4s, \t3\().4s, \c1\().4s // E m3[2,3].imre*t1[4,6]
        fmls            \t1\().4s, \o2\().4s, \c3\().4s // E j0123
        fmls            \t3\().4s, \o3\().4s, \t5\().4s // E j4567

        fsub            \t0\().4s, \t1\().4s, \o0\().4s
        fadd            \t1\().4s, \t1\().4s, \o0\().4s
        fadd            \t2\().4s, \t3\().4s, \o1\().4s
        fsub            \t3\().4s, \t3\().4s, \o1\().4s

        fmul            \t0\().4s, \t0\().4s, v31.4s
        fmul            \t3\().4s, \t3\().4s, v31.4s

        rev64           \t1\().4s, \t1\().4s
        rev64           \t2\().4s, \t2\().4s

.if \part == 0
        fsub            \o0\().4s, \e0\().4s, \t1\().4s
        fsub            \o1\().4s, \e1\().4s, \t2\().4s
        fsub            \o2\().4s, \e2\().4s, \t0\().4s
        fsub            \o3\().4s, \e3\().4s, \t3\().4s
.else
        fsub            \o0\().4s, \e0\().4s, \t1\().4s
        fadd            \o2\().4s, \e1\().4s, \t2\().4s
        fsub            \o1\().4s, \e2\().4s, \t0\().4s
        fadd            \o3\().4s, \e3\().4s, \t3\().4s
.endif

.if \part == 0
        fadd            \e0\().4s, \e0\().4s, \t1\().4s
        fadd            \e1\().4s, \e1\().4s, \t2\().4s
        fadd            \e2\().4s, \e2\().4s, \t0\().4s
        fadd            \e3\().4s, \e3\().4s, \t3\().4s
.else
        fadd            \e0\().4s, \e0\().4s, \t1\().4s
        fsub            \e1\().4s, \e1\().4s, \t2\().4s // swapped
        fadd            \e2\().4s, \e2\().4s, \t0\().4s // swapped
        fsub            \e3\().4s, \e3\().4s, \t3\().4s
.endif
.endm
/* Same as SR_COMBINE_HALF, but heroically tries to use 3 temporary registers
 * without touching the tables. */
.macro SR_COMBINE_LITE e0, e1, e2, e3, \
                       o0, o1, o2, o3, \
                       c0, c1, c2, c3, \
                       t0, t1, t2, part

        rev64           \t0\().4s, \o0\().4s // E m2[0,1].imre
        rev64           \t1\().4s, \o2\().4s // E m2[2,3].imre
.if \part == 0
        trn2            \t2\().4s, \c3\().4s, \c3\().4s // wim3311
.else
        trn1            \t2\().4s, \c3\().4s, \c3\().4s // wim3311
.endif
        fmul            \t2\().4s, \t2\().4s, v31.4s
        fmul            \o2\().4s, \o2\().4s, \t2\().4s
        fmul            \o0\().4s, \o0\().4s, \t2\().4s // E m2[0,1].imre*t1[0,2]
.if \part == 0
        trn1            \t2\().4s, \c0\().4s, \c0\().4s // cos0022
.else
        trn2            \t2\().4s, \c0\().4s, \c0\().4s // cos0022
.endif
        fmul            \t1\().4s, \t1\().4s, \t2\().4s // E m2[2,3].imre*t1[0,2]
        fmla            \o0\().4s, \t0\().4s, \t2\().4s // E w0123
        fsub            \t1\().4s, \t1\().4s, \o2\().4s // E j0123

        rev64           \t2\().4s, \o1\().4s // E m3[0,1].imre
        rev64           \o2\().4s, \o3\().4s // E m3[2,3].imre

.if \part == 0
        trn2            \t0\().4s, \c2\().4s, \c2\().4s // wim7755
.else
        trn1            \t0\().4s, \c2\().4s, \c2\().4s // wim7755
.endif
        fmul            \t0\().4s, \t0\().4s, v31.4s

        fmul            \o1\().4s, \o1\().4s, \t0\().4s // E m3[0,1].imre*t1[4,6]
        fmul            \o3\().4s, \o3\().4s, \t0\().4s

.if \part == 0
        trn1            \t0\().4s, \c1\().4s, \c1\().4s // cos1133
.else
        trn2            \t0\().4s, \c1\().4s, \c1\().4s // cos1133
.endif
        fmul            \o2\().4s, \o2\().4s, \t0\().4s // E m3[2,3].imre*t1[4,6]
        fmla            \o1\().4s, \t2\().4s, \t0\().4s // E w4567
        fsub            \o2\().4s, \o2\().4s, \o3\().4s // E j4567

        fsub            \t0\().4s, \t1\().4s, \o0\().4s
        fadd            \o0\().4s, \t1\().4s, \o0\().4s
        fadd            \t2\().4s, \o2\().4s, \o1\().4s
        fsub            \t1\().4s, \o2\().4s, \o1\().4s

        fmul            \t0\().4s, \t0\().4s, v31.4s
        fmul            \t1\().4s, \t1\().4s, v31.4s

        rev64           \t2\().4s, \t2\().4s
        rev64           \o0\().4s, \o0\().4s

.if \part == 0
        fsub            \o1\().4s, \e1\().4s, \t2\().4s
        fsub            \o2\().4s, \e2\().4s, \t0\().4s
        fsub            \o3\().4s, \e3\().4s, \t1\().4s
.else
        fadd            \o2\().4s, \e1\().4s, \t0\().4s
        fsub            \o1\().4s, \e2\().4s, \t2\().4s
        fadd            \o3\().4s, \e3\().4s, \t1\().4s
.endif

.if \part == 0
        fadd            \e1\().4s, \e1\().4s, \t2\().4s
        fadd            \e2\().4s, \e2\().4s, \t0\().4s
        fadd            \e3\().4s, \e3\().4s, \t1\().4s
.else
        fsub            \e1\().4s, \e1\().4s, \t0\().4s // swapped
        fadd            \e2\().4s, \e2\().4s, \t2\().4s // swapped
        fsub            \e3\().4s, \e3\().4s, \t1\().4s
.endif

        mov             \t1\().16b, \o0\().16b

        fsub            \o0\().4s, \e0\().4s, \t1\().4s
        fadd            \e0\().4s, \e0\().4s, \t1\().4s
.endm

.macro SR_COMBINE_4 len, part, off
        add             x10, x1, x21
        add             x11, x1, x21, lsl #1
        add             x12, x1, x22

        ldp             q0,  q1,  [x1,  #((0 + \part)*32 + \off)]
        ldp             q4,  q5,  [x1,  #((2 + \part)*32 + \off)]
        ldp             q2,  q3,  [x10, #((0 + \part)*32 + \off)]
        ldp             q6,  q7,  [x10, #((2 + \part)*32 + \off)]

        ldp             q8,  q9,  [x11, #((0 + \part)*32 + \off)]
        ldp             q10, q11, [x11, #((2 + \part)*32 + \off)]
        ldp             q12, q13, [x12, #((0 + \part)*32 + \off)]
        ldp             q14, q15, [x12, #((2 + \part)*32 + \off)]

        SR_COMBINE      v0, v1, v2,  v3,  v4,  v6,  v5,  v7, \
                        v8, v9, v10, v11, v12, v13, v14, v15, \
                        x7, x8, x9, 0

        stp             q0,  q1,  [x1,  #((0 + \part)*32 + \off)]
        stp             q4,  q5,  [x1,  #((2 + \part)*32 + \off)]
        stp             q2,  q3,  [x10, #((0 + \part)*32 + \off)]
        stp             q6,  q7,  [x10, #((2 + \part)*32 + \off)]

        stp             q8,  q9,  [x11, #((0 + \part)*32 + \off)]
        stp             q12, q13, [x11, #((2 + \part)*32 + \off)]
        stp             q10, q11, [x12, #((0 + \part)*32 + \off)]
        stp             q14, q15, [x12, #((2 + \part)*32 + \off)]
.endm
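
/* The x21/x22 strides set up by SETUP_SR_RECOMB place the four transform
 * quarters at x1, x1 + len*2, x1 + len*4 and x1 + len*6 bytes (a quarter is
 * len/4 complex floats, i.e. len*2 bytes), which is what the x10-x12
 * pointers above and in SR_COMBINE_FULL below index. */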
.macro SR_COMBINE_FULL len, off=0
        add             x10, x1, x21
        add             x11, x1, x21, lsl #1
        add             x12, x1, x22

        SR_COMBINE_4    \len, 0, \off
        SR_COMBINE_4    \len, 1, \off
        SR_COMBINE_4    \len, 4, \off
        SR_COMBINE_4    \len, 5, \off
.endm

.macro SR_COMBINE_D2 part, off
        add             x10, x1,  #((\part)*32 + \off)
        add             x11, x14, #((\part)*32 + \off)
        add             x12, x15, #((\part)*32 + \off)
        add             x13, x16, #((\part)*32 + \off)

        ldp             q0,  q1,  [x10]
        ldp             q4,  q5,  [x10, #(2*32)]
        ldp             q2,  q3,  [x11]
        ldp             q6,  q7,  [x11, #(2*32)]

        ldp             q8,  q9,  [x12]
        ldp             q10, q11, [x12, #(2*32)]
        ldp             q12, q13, [x13]
        ldp             q14, q15, [x13, #(2*32)]

        SR_COMBINE      v0, v1, v2,  v3,  v4,  v6,  v5,  v7, \
                        v8, v9, v10, v11, v12, v13, v14, v15, \
                        x7, x8, x9, 0, \
                        v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27

        zip1            v16.2d, v0.2d, v4.2d
        zip2            v17.2d, v0.2d, v4.2d
        zip1            v18.2d, v1.2d, v5.2d
        zip2            v19.2d, v1.2d, v5.2d

        zip1            v20.2d, v2.2d, v6.2d
        zip2            v21.2d, v2.2d, v6.2d
        zip1            v22.2d, v3.2d, v7.2d
        zip2            v23.2d, v3.2d, v7.2d

        ldp             q0, q1, [x10, #(1*32)]
        ldp             q4, q5, [x10, #(3*32)]
        ldp             q2, q3, [x11, #(1*32)]
        ldp             q6, q7, [x11, #(3*32)]

        st1             { v16.4s, v17.4s, v18.4s, v19.4s }, [x10], #64
        st1             { v20.4s, v21.4s, v22.4s, v23.4s }, [x11], #64

        zip1            v20.2d, v8.2d,  v12.2d
        zip2            v21.2d, v8.2d,  v12.2d
        zip1            v22.2d, v9.2d,  v13.2d
        zip2            v23.2d, v9.2d,  v13.2d
        zip1            v24.2d, v10.2d, v14.2d
        zip2            v25.2d, v10.2d, v14.2d
        zip1            v26.2d, v11.2d, v15.2d
        zip2            v27.2d, v11.2d, v15.2d

        ldp             q8,  q9,  [x12, #(1*32)]
        ldp             q10, q11, [x12, #(3*32)]
        ldp             q12, q13, [x13, #(1*32)]
        ldp             q14, q15, [x13, #(3*32)]

        st1             { v20.4s, v21.4s, v22.4s, v23.4s }, [x12], #64
        st1             { v24.4s, v25.4s, v26.4s, v27.4s }, [x13], #64

        SR_COMBINE      v0, v1, v2,  v3,  v4,  v6,  v5,  v7, \
                        v8, v9, v10, v11, v12, v13, v14, v15, \
                        x7, x8, x9, 0, \
                        v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27

        zip1            v16.2d, v0.2d, v4.2d
        zip2            v17.2d, v0.2d, v4.2d
        zip1            v18.2d, v1.2d, v5.2d
        zip2            v19.2d, v1.2d, v5.2d
        st1             { v16.4s, v17.4s, v18.4s, v19.4s }, [x10]

        zip1            v16.2d, v2.2d, v6.2d
        zip2            v17.2d, v2.2d, v6.2d
        zip1            v18.2d, v3.2d, v7.2d
        zip2            v19.2d, v3.2d, v7.2d
        st1             { v16.4s, v17.4s, v18.4s, v19.4s }, [x11]

        zip1            v20.2d, v8.2d, v12.2d
        zip2            v21.2d, v8.2d, v12.2d
        zip1            v22.2d, v9.2d, v13.2d
        zip2            v23.2d, v9.2d, v13.2d
        st1             { v20.4s, v21.4s, v22.4s, v23.4s }, [x12]

        zip1            v24.2d, v10.2d, v14.2d
        zip2            v25.2d, v10.2d, v14.2d
        zip1            v26.2d, v11.2d, v15.2d
        zip2            v27.2d, v11.2d, v15.2d
        st1             { v24.4s, v25.4s, v26.4s, v27.4s }, [x13]
.endm

.macro SR_COMBINE_DINT off=0
        add             x14, x1, x21
        add             x15, x1, x21, lsl #1
        add             x16, x1, x22

        SR_COMBINE_D2   0, \off
        SR_COMBINE_D2   4, \off
.endm
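
/* 32-point transform: a 16-point FFT on the even half plus two 8-point FFTs
 * (done in parallel via FFT8_X2) on the odd quarters, recombined with
 * SR_COMBINE and deinterleaved with zip1/zip2 on the way out. */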
.macro FFT32_FN name, no_perm
function ff_tx_fft32_\name\()_neon, export=1
        stp             d14, d15, [sp, #-16*4]!
        stp             d8,  d9,  [sp, #16*3]
        stp             d10, d11, [sp, #16*2]
        stp             d12, d13, [sp, #16]

        LOAD_SUBADD
        SETUP_SR_RECOMB 32, x7, x8, x9

        SETUP_LUT       \no_perm
        LOAD_INPUT      0,  1,  2,  3,  x2, \no_perm
        LOAD_INPUT      4,  5,  6,  7,  x2, \no_perm
        LOAD_INPUT      8,  9,  10, 11, x2, \no_perm
        LOAD_INPUT      12, 13, 14, 15, x2, \no_perm

        FFT8_X2         v8, v9, v10, v11, v12, v13, v14, v15
        FFT16           v0, v1, v2,  v3,  v4,  v5,  v6,  v7

        SR_COMBINE      v0, v1, v2,  v3,  v4,  v5,  v6,  v7, \
                        v8, v9, v10, v11, v12, v13, v14, v15, \
                        x7, x8, x9, 0

        zip1            v16.2d, v0.2d, v4.2d
        zip2            v17.2d, v0.2d, v4.2d
        zip1            v18.2d, v1.2d, v6.2d
        zip2            v19.2d, v1.2d, v6.2d
        st1             { v16.4s, v17.4s, v18.4s, v19.4s }, [x1], #64

        zip1            v20.2d, v2.2d, v5.2d
        zip2            v21.2d, v2.2d, v5.2d
        zip1            v22.2d, v3.2d, v7.2d
        zip2            v23.2d, v3.2d, v7.2d
        st1             { v20.4s, v21.4s, v22.4s, v23.4s }, [x1], #64

        zip1            v24.2d, v8.2d, v12.2d
        zip2            v25.2d, v8.2d, v12.2d
        zip1            v26.2d, v9.2d, v13.2d
        zip2            v27.2d, v9.2d, v13.2d
        st1             { v24.4s, v25.4s, v26.4s, v27.4s }, [x1], #64

        zip1            v28.2d, v10.2d, v14.2d
        zip2            v29.2d, v10.2d, v14.2d
        zip1            v30.2d, v11.2d, v15.2d
        zip2            v31.2d, v11.2d, v15.2d
        st1             { v28.4s, v29.4s, v30.4s, v31.4s }, [x1]

        ldp             d12, d13, [sp, #16]
        ldp             d10, d11, [sp, #16*2]
        ldp             d8,  d9,  [sp, #16*3]
        ldp             d14, d15, [sp], #16*4

        ret
endfunc
.endm

FFT32_FN float, 0
FFT32_FN ns_float, 1

.macro cmp_imm reg, imm
.if \imm >= 4096
        cmp             \reg, #((\imm)/4096), lsl #12
.else
        cmp             \reg, #(\imm)
.endif
.endm

.macro SR_TRANSFORM_DEF len, next=0
\len:
        stp             x20, x30, [sp, #-16]!
        mov             w20, #(\len/4)
        mov             x5, #((\len*4) - (\len/1))
        add             x1, x1, x5
        bl              32b
        mov             x5, #((\len*2) - (\len/2))
        add             x1, x1, x5
        bl              32b
        ldp             x20, x30, [sp], #16
        ldr             w5, =(\len*6 + \len/2)
        sub             x1, x1, x5

        SETUP_SR_RECOMB \len, x7, x8, x9

.if \next\() != 0
        cmp_imm         w19, \len
        b.eq            0f

        mov             w5, #(\len/128)
\len\()5:
        SR_COMBINE_FULL \len
        add             x1, x1, 8*32
        subs            w5, w5, 1
        b.gt            \len\()5b

        cmp_imm         w20, \len
        b.gt            \next\()f
        ret
.endif
.endm
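
/* The single-function split-radix driver below is, structurally, the usual
 * recursion, implemented as a ladder of labelled blocks. In C-like pseudocode
 * (a sketch; the names are ours, offsets in complex elements):
 *
 *     void sr_fft(complex float *z, int len)
 *     {
 *         if (len == 32) { fft32(z); return; } // the "32:" base case
 *         sr_fft(z,             len / 2);      // done by falling through
 *                                              // the rungs below this one
 *         sr_fft(z + len/2,     len / 4);      // first  "bl 32b"
 *         sr_fft(z + 3*len/4,   len / 4);      // second "bl 32b"
 *         sr_combine(z, len);                  // SR_COMBINE_FULL loop(s)
 *     }
 *
 * Each rung handles the half-length transform by falling through from the
 * rung below it, then calls back into the 32-point entry (with w20 shortened
 * to len/4) for the two quarter transforms. The odd-looking x1 adjustments
 * account for how far the combine loops of inner rungs have already advanced
 * the output pointer. */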
.macro FFT_SPLIT_RADIX_FN name, no_perm
function ff_tx_fft_sr_\name\()_neon, export=1
        stp             x21, x22, [sp, #-16*6]!
        stp             d8,  d9,  [sp, #16*5]
        stp             d10, d11, [sp, #16*4]
        stp             d12, d13, [sp, #16*3]
        stp             d14, d15, [sp, #16*2]
        stp             x19, x20, [sp, #16]

        ldr             w19, [x0, #0] // global target
        mov             w20, w19      // local length

        LOAD_SUBADD
        SETUP_LUT       \no_perm

32:
        SETUP_SR_RECOMB 32, x7, x8, x9

        LOAD_INPUT      0,  1,  2,  3,  x2, \no_perm
        LOAD_INPUT      4,  6,  5,  7,  x2, \no_perm, 1
        LOAD_INPUT      8,  9,  10, 11, x2, \no_perm
        LOAD_INPUT      12, 13, 14, 15, x2, \no_perm

        FFT8_X2         v8, v9, v10, v11, v12, v13, v14, v15
        FFT16           v0, v1, v2,  v3,  v4,  v6,  v5,  v7

        SR_COMBINE      v0, v1, v2,  v3,  v4,  v6,  v5,  v7, \
                        v8, v9, v10, v11, v12, v13, v14, v15, \
                        x7, x8, x9, 0

        stp             q2,  q3,  [x1, #32*1]
        stp             q6,  q7,  [x1, #32*3]
        stp             q10, q11, [x1, #32*5]
        stp             q14, q15, [x1, #32*7]

        cmp             w20, #32
        b.gt            64f

        stp             q0,  q1,  [x1, #32*0]
        stp             q4,  q5,  [x1, #32*2]
        stp             q8,  q9,  [x1, #32*4]
        stp             q12, q13, [x1, #32*6]

        ret
64:
        SETUP_SR_RECOMB 64, x7, x8, x9

        LOAD_INPUT      2, 3,  10, 11, x2, \no_perm, 1
        LOAD_INPUT      6, 14, 7,  15, x2, \no_perm, 1

        FFT16           v2, v3, v10, v11, v6, v14, v7, v15

        LOAD_INPUT      16, 17, 18, 19, x2, \no_perm
        LOAD_INPUT      20, 22, 21, 23, x2, \no_perm, 1

        FFT16           v16, v17, v18, v19, v20, v22, v21, v23, \
                        v24, v25, v26, v27, v28, v29, v30

        ld1             { v26.4s, v27.4s }, [x8], x9
        ldp             q24, q25, [x7], #32

        ext             v26.16b, v26.16b, v26.16b, #8
        ext             v27.16b, v27.16b, v27.16b, #8

        cmp             w19, #64
        b.eq            2f // custom deinterleave

        // TODO: investigate doing the 2 combines like in deinterleave
        // TODO: experiment with spilling to gprs and converting to HALF or full
        SR_COMBINE_LITE v0,  v1,  v8,  v9, \
                        v2,  v3,  v16, v17, \
                        v24, v25, v26, v27, \
                        v28, v29, v30, 0

        stp             q0,  q1,  [x1, #32* 0]
        stp             q8,  q9,  [x1, #32* 4]
        stp             q2,  q3,  [x1, #32* 8]
        stp             q16, q17, [x1, #32*12]

        SR_COMBINE_HALF v4,  v5,  v12, v13, \
                        v6,  v7,  v20, v21, \
                        v24, v25, v26, v27, \
                        v28, v29, v30, v0, v1, v8, 1

        stp             q4,  q20, [x1, #32* 2]
        stp             q12, q21, [x1, #32* 6]
        stp             q6,  q5,  [x1, #32*10]
        stp             q7,  q13, [x1, #32*14]

        ldp             q2,  q3,  [x1, #32*1]
        ldp             q6,  q7,  [x1, #32*3]
        ldp             q12, q13, [x1, #32*5]
        ldp             q16, q17, [x1, #32*7]

        SR_COMBINE      v2,  v3,  v12, v13, v6,  v16, v7,  v17, \
                        v10, v11, v14, v15, v18, v19, v22, v23, \
                        x7, x8, x9, 0, \
                        v24, v25, v26, v27, v28, v29, v30, v8, v0, v1, v4, v5

        stp             q2,  q3,  [x1, #32* 1]
        stp             q6,  q7,  [x1, #32* 3]
        stp             q12, q13, [x1, #32* 5]
        stp             q16, q17, [x1, #32* 7]

        stp             q10, q11, [x1, #32* 9]
        stp             q18, q19, [x1, #32*11]
        stp             q14, q15, [x1, #32*13]
        stp             q22, q23, [x1, #32*15]

        cmp             w20, #64
        b.gt            128f
        ret
128:
        stp             x20, x30, [sp, #-16]!
        mov             w20, #32
        add             x1, x1, #16*32
        bl              32b
        add             x1, x1, #8*32
        bl              32b
        ldp             x20, x30, [sp], #16
        sub             x1, x1, #24*32

        SETUP_SR_RECOMB 128, x7, x8, x9

        cmp             w19, #128
        b.eq            0f

        SR_COMBINE_FULL 128

        cmp             w20, #128
        b.gt            256f
        ret
256:
        stp             x20, x30, [sp, #-16]!
        mov             w20, #64
        add             x1, x1, #32*32
        bl              32b
        add             x1, x1, #16*32
        bl              32b
        ldp             x20, x30, [sp], #16
        sub             x1, x1, #48*32

        SETUP_SR_RECOMB 256, x7, x8, x9

        cmp             w19, #256
        b.eq            0f

        SR_COMBINE_FULL 256
        SR_COMBINE_FULL 256, 8*32

        cmp             w20, #256
        b.gt            512f
        ret
512:
        stp             x20, x30, [sp, #-16]!
        mov             w20, #128
        add             x1, x1, #64*32
        bl              32b
        add             x1, x1, #32*32
        bl              32b
        ldp             x20, x30, [sp], #16
        sub             x1, x1, #96*32

        SETUP_SR_RECOMB 512, x7, x8, x9

        cmp             w19, #512
        b.eq            0f

        mov             x5, 4
5125:
        SR_COMBINE_FULL 512
        add             x1, x1, 8*32
        subs            w5, w5, 1
        b.gt            5125b

        cmp             w20, #512
        b.gt            1024f

        ret
1024:
        stp             x20, x30, [sp, #-16]!
        mov             w20, #256
        add             x1, x1, #96*32
        bl              32b
        add             x1, x1, #64*32
        bl              32b
        ldp             x20, x30, [sp], #16
        mov             x5, #192*32
        sub             x1, x1, x5

        SETUP_SR_RECOMB 1024, x7, x8, x9

        cmp             w19, #1024
        b.eq            0f

        mov             w5, 8
10245:
        SR_COMBINE_FULL 1024
        add             x1, x1, 8*32
        subs            w5, w5, 1
        b.gt            10245b

        cmp             w20, #1024
        b.gt            2048f

        ret

        SR_TRANSFORM_DEF 2048,   4096
        SR_TRANSFORM_DEF 4096,   8192
        SR_TRANSFORM_DEF 8192,   16384
        SR_TRANSFORM_DEF 16384,  32768
        SR_TRANSFORM_DEF 32768,  65536
        SR_TRANSFORM_DEF 65536,  131072
        SR_TRANSFORM_DEF 131072

0: // general deinterleave loop
        SR_COMBINE_DINT
        add             x1, x1, #32*8
        subs            w19, w19, #32*4
        b.gt            0b

        ldp             x19, x20, [sp, #16]
        ldp             d14, d15, [sp, #16*2]
        ldp             d12, d13, [sp, #16*3]
        ldp             d10, d11, [sp, #16*4]
        ldp             d8,  d9,  [sp, #16*5]
        ldp             x21, x22, [sp], #16*6

        ret

2: // special case for 64 point deinterleave
        mov             x10, v23.d[0]
        mov             x11, v23.d[1]

        SR_COMBINE_LITE v0,  v1,  v8,  v9, \
                        v2,  v3,  v16, v17, \
                        v24, v25, v26, v27, \
                        v28, v29, v30, 0

        SR_COMBINE_HALF v4,  v5,  v12, v13, \
                        v6,  v7,  v20, v21, \
                        v24, v25, v26, v27, \
                        v28, v29, v30, v23, v24, v26, 1

        zip1            v23.2d, v0.2d, v4.2d
        zip2            v24.2d, v0.2d, v4.2d
        zip1            v25.2d, v1.2d, v20.2d
        zip2            v26.2d, v1.2d, v20.2d

        zip1            v27.2d, v8.2d, v12.2d
        zip2            v28.2d, v8.2d, v12.2d
        zip1            v29.2d, v9.2d, v21.2d
        zip2            v30.2d, v9.2d, v21.2d

        mov             v20.16b, v5.16b
        mov             v21.16b, v7.16b
        mov             x12, x1
        add             x13, x1, #32* 4
        add             x14, x1, #32* 8
        add             x15, x1, #32*12

        zip1            v4.2d, v2.2d, v6.2d
        zip2            v5.2d, v2.2d, v6.2d
        zip1            v6.2d, v3.2d, v20.2d
        zip2            v7.2d, v3.2d, v20.2d

        zip1            v0.2d, v16.2d, v21.2d
        zip2            v1.2d, v16.2d, v21.2d
        zip1            v2.2d, v17.2d, v13.2d
        zip2            v3.2d, v17.2d, v13.2d

        // stp is slightly faster on A53, but st1 is faster on M1s (in theory)
        ldp             q8,  q9,  [x1, #32*1]
        ldp             q12, q13, [x1, #32*5]

        st1             { v23.4s, v24.4s, v25.4s, v26.4s }, [x12], #64 // 32* 0...1
        st1             { v27.4s, v28.4s, v29.4s, v30.4s }, [x13], #64 // 32* 4...5
        st1             { v4.4s,  v5.4s,  v6.4s,  v7.4s  }, [x14], #64 // 32* 8...9
        st1             { v0.4s,  v1.4s,  v2.4s,  v3.4s  }, [x15], #64 // 32*12..13

        mov             v23.d[0], x10
        mov             v23.d[1], x11

        ldp             q6,  q7,  [x1, #32*3]
        ldp             q16, q17, [x1, #32*7]

        SR_COMBINE      v8,  v9,  v12, v13, v6,  v16, v7,  v17, \
                        v10, v11, v14, v15, v18, v19, v22, v23, \
                        x7, x8, x9, 0, \
                        v24, v25, v26, v27, v28, v29, v30, v4, v0, v1, v5, v20

        zip1            v0.2d, v8.2d, v6.2d
        zip2            v1.2d, v8.2d, v6.2d
        zip1            v2.2d, v9.2d, v7.2d
        zip2            v3.2d, v9.2d, v7.2d
        st1             { v0.4s, v1.4s, v2.4s, v3.4s }, [x12]

        zip1            v4.2d, v12.2d, v16.2d
        zip2            v5.2d, v12.2d, v16.2d
        zip1            v6.2d, v13.2d, v17.2d
        zip2            v7.2d, v13.2d, v17.2d
        st1             { v4.4s, v5.4s, v6.4s, v7.4s }, [x13]

        zip1            v0.2d, v10.2d, v18.2d
        zip2            v1.2d, v10.2d, v18.2d
        zip1            v2.2d, v11.2d, v19.2d
        zip2            v3.2d, v11.2d, v19.2d
        st1             { v0.4s, v1.4s, v2.4s, v3.4s }, [x14]

        zip1            v4.2d, v14.2d, v22.2d
        zip2            v5.2d, v14.2d, v22.2d
        zip1            v6.2d, v15.2d, v23.2d
        zip2            v7.2d, v15.2d, v23.2d
        st1             { v4.4s, v5.4s, v6.4s, v7.4s }, [x15]

        ldp             x19, x20, [sp, #16]
        ldp             d14, d15, [sp, #16*2]
        ldp             d12, d13, [sp, #16*3]
        ldp             d10, d11, [sp, #16*4]
        ldp             d8,  d9,  [sp, #16*5]
        ldp             x21, x22, [sp], #16*6

        ret
endfunc
.endm

FFT_SPLIT_RADIX_FN float, 0
FFT_SPLIT_RADIX_FN ns_float, 1