tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

tx_float_neon.S (50120B)


      1 /*
      2 * Copyright (c) Lynne
      3 *
      4 * This file is part of FFmpeg.
      5 *
      6 * FFmpeg is free software; you can redistribute it and/or
      7 * modify it under the terms of the GNU Lesser General Public
      8 * License as published by the Free Software Foundation; either
      9 * version 2.1 of the License, or (at your option) any later version.
     10 *
     11 * FFmpeg is distributed in the hope that it will be useful,
     12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     14 * Lesser General Public License for more details.
     15 *
     16 * You should have received a copy of the GNU Lesser General Public
     17 * License along with FFmpeg; if not, write to the Free Software
     18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     19 */
     20 
     21 #include "libavutil/aarch64/asm.S"
     22 
     23 /* Open `doc/transforms.md` to see the code on which the transforms here were
     24 * based.
     25 *
     26 * File conventions:
     27 * GPRs:    x0-x3   - arguments, untouched
     28 *          x4      - Lookup table base pointer
     29 *          x5-x6   - macro ld1 temps/function scratch
     30 *          x7-x9   - FFT table state
     31 *          x10-x17 - lookup table/macro scratch
     32 *          w19-w20 - current/target length when needed
     33 *          x21-x22 - len*2, len*6
     34 *
     35 * Vectors: v0-v7   - coefficients
     36 *          v8-v15  - coefficients when needed, otherwise untouched
     37 *          v16-v30 - used as needed
     38 *          v31     - -1.0, +1.0, -1.0, +1.0. Never touched after loading.
     39 *
     40 * Stack:   backup for v8-v15 and x19-x22 when needed, and transform lengths
     41 */
     42 
     43 #define M_SQRT1_2 0.707106781186547524401
     44 #define COS16_1   0.92387950420379638671875
     45 #define COS16_3   0.3826834261417388916015625
     46 
     47 /* We only ever load this once at the start, and then live with losing an
     48 * entire register as we need to lug this all the time everywhere.
     49 * Clearly should be integrated into an fsadd and fmlsa, but "muh RISC!". */
/* Constant vector { -1.0, +1.0, -1.0, +1.0 }, kept resident in v31 for the
 * whole transform (see register conventions above): fmla/fmls against it
 * turns a plain multiply-accumulate into the alternating sub/add pattern
 * the split-radix butterflies need. */
     50 const subadd, align=4
     51        .float -1.0,  1.0, -1.0,  1.0
     52 endconst
     53 
/* Loads the subadd constant into v31. Clobbers x5. Must run once before
 * any macro below that reads v31 (FFT8, FFT16, SR_COMBINE*, ...). */
     54 .macro LOAD_SUBADD
     55        movrel          x5, subadd
     56        ld1             { v31.4s }, [x5]
     57 .endm
     58 
/* Loads the revtab lookup-table base pointer into x4 from [x0, #8]
 * (presumably the map/LUT field of the tx context — confirm offset against
 * the C struct). With no_lut=1 the macro emits nothing. */
     59 .macro SETUP_LUT no_lut=0
     60 .if \no_lut == 0
     61        ldr             x4, [x0, #8]
     62 .endif
     63 .endm
     64 
/* Loads 8 complex floats (2 per vector) from \src into v\dst1..v\dst4.
 *
 * no_lut=1: sequential load, \src is advanced by 64 bytes.
 *           discont=1 uses two ldp pairs instead of one ld1 so the four
 *           destination registers need not be sequentially numbered.
 * no_lut=0: gather through the 32-bit index LUT at x4: 8 indices are
 *           consumed and x4 is advanced by 32 bytes; each index selects one
 *           8-byte complex element (lsl #3). \src is NOT advanced in this
 *           path. Clobbers x10-x17.
 */
     65 .macro LOAD_INPUT dst1, dst2, dst3, dst4, src, no_lut=0, discont=0
     66 .if \no_lut == 1
     67 .if \discont == 1
     68        ldp             q\dst1\(), q\dst2\(), [\src\()]
     69        ldp             q\dst3\(), q\dst4\(), [\src\(), #32]
     70        add             \src\(), \src\(), #64
     71 .else
     72        ld1             { v\dst1\().4s, v\dst2\().4s, v\dst3\().4s, v\dst4\().4s }, [\src], #64
     73 .endif
     74 .else
     75        ldp             w10, w11, [x4, #0 ]
     76        ldp             w12, w13, [x4, #8 ]
     77        ldp             w14, w15, [x4, #16]
     78        ldp             w16, w17, [x4, #24]
     79 
     80        add             x4, x4, #32
     81 
     82        ldr             d\dst1, [\src, x10, lsl #3]
     83        add             x11, \src, x11, lsl #3
     84        ldr             d\dst2, [\src, x12, lsl #3]
     85        add             x13, \src, x13, lsl #3
     86        ldr             d\dst3, [\src, x14, lsl #3]
     87        add             x15, \src, x15, lsl #3
     88        ldr             d\dst4, [\src, x16, lsl #3]
     89        add             x17, \src, x17, lsl #3
     90 
     91        ld1             { v\dst1\().d }[1], [x11]
     92        ld1             { v\dst2\().d }[1], [x13]
     93        ld1             { v\dst3\().d }[1], [x15]
     94        ld1             { v\dst4\().d }[1], [x17]
     95 .endif
     96 .endm
     97 
/* In-place 4-point FFT butterfly. \e0 holds z[0],z[1] and \o0 holds
 * z[2],z[3] (interleaved re,im). With standalone=0 the trailing uzp pair
 * leaves the result in the deinterleaved even/odd layout that the larger
 * transforms consume. Clobbers v16-v18. */
     98 .macro FFT4 e0, o0, standalone
     99        fadd            v16.4s, \e0\().4s, \o0\().4s         // r1..4
    100        fsub            \e0\().4s, \e0\().4s, \o0\().4s      // t1..4
    101 
    102        rev64           v18.4s, \e0\().4s
    103 
    104        zip2            \o0\().2d, v16.2d, \e0\().2d
    105        zip1            v17.2d, v16.2d, \e0\().2d
    106 
    107        mov             \o0\().d[1], v18.d[1]
    108 
    109        fadd            \e0\().4s, v17.4s, \o0\().4s         // a1,2 b1,4
    110        fsub            v16.4s,    v17.4s, \o0\().4s         // a3,4 b3,2
    111 
    112        mov             \o0\().16b, v16.16b                  // Swap once again...
    113        mov             \o0\().s[3], \e0\().s[3]
    114        mov             \e0\().s[3], v16.s[3]
    115 
    116 .if \standalone == 0
    117        uzp2            \o0\().2d, \e0\().2d, \o0\().2d
    118        uzp1            \e0\().2d, \e0\().2d, v16.2d
    119 .endif
    120 .endm
    121 
/* tbl byte-shuffle indices used by FFT4_X2 to assemble its b1234 output
 * lane-by-lane from the two source registers of a {t,t+1} pair. */
    122 const shuf_4pt_x2, align=4
    123        .byte   24, 25, 26, 27 // reg2, 3
    124        .byte   12, 13, 14, 15 // reg1, 4
    125        .byte    8,  9, 10, 11 // reg1, 3
    126        .byte   28, 29, 30, 31 // reg2, 4
    127 endconst
    128 
    129 // Identical to FFT4, but does 2 transforms in parallel, with no deinterleaving
/* Transform 1 is (\e0,\o0), transform 2 is (\e1,\o1). Clobbers x5 and the
 * \t temporaries (v16-v22 by default). The commented-out zip2/rev64/ext
 * tail below is the table-free equivalent of the two tbl instructions and
 * is kept for reference/benchmarking (see the TODO). */
    130 .macro FFT4_X2 e0, o0, e1, o1, \
    131               t0=v16, t1=v17, t2=v18, t3=v19, t4=v20, t5=v21, t6=v22
    132 
    133        fadd            \t0\().4s, \e0\().4s, \o0\().4s                     // r1234
    134        fadd            \t2\().4s, \e1\().4s, \o1\().4s                     // r1234
    135        fsub            \e0\().4s, \e0\().4s, \o0\().4s                     // t1234
    136        fsub            \e1\().4s, \e1\().4s, \o1\().4s                     // t1234
    137 
    138        movrel          x5, shuf_4pt_x2
    139 
    140        rev64           \t4\().4s, \e0\().4s
    141        rev64           \t5\().4s, \e1\().4s
    142 
    143        zip2            \o0\().2d, \t0\().2d, \e0\().2d                     // t3,4 r3,4
    144        zip2            \o1\().2d, \t2\().2d, \e1\().2d                     // t3,4 r3,4
    145 
    146        ld1             { \t6\().16b }, [x5]
    147 
    148        mov             \o0\().d[1], \t4\().d[1]
    149        mov             \o1\().d[1], \t5\().d[1]
    150 
    151        zip1            \t1\().2d, \t0\().2d, \e0\().2d                     // t1,2 r1,2
    152        zip1            \t3\().2d, \t2\().2d, \e1\().2d                     // t1,2 r1,2
    153 
    154        fsub            \t4\().4s, \t1\().4s, \o0\().4s                     // a34 b32
    155        fadd            \t5\().4s, \t1\().4s, \o0\().4s                     // a12 b14
    156        fsub            \t2\().4s, \t3\().4s, \o1\().4s                     // a34 b32
    157        fadd            \t3\().4s, \t3\().4s, \o1\().4s                     // a12 b14
    158 
    159        // TODO: experiment with movs instead of tables here
    160        tbl             \o0\().16b, { \t4\().16b, \t5\().16b }, \t6\().16b  // b1234
    161        tbl             \o1\().16b, { \t2\().16b, \t3\().16b }, \t6\().16b  // b1234
    162 
    163        zip1            \e0\().2d, \t5\().2d, \t4\().2d                     // a1234
    164 //        zip2            \o0\().2d, \t5\().2d, \t4\().2d                     // b1432
    165        zip1            \e1\().2d, \t3\().2d, \t2\().2d                     // a1234
    166 //        zip2            \o1\().2d, \t3\().2d, \t2\().2d                     // b1432
    167 //        rev64           \o0\().4s, \o0\().4s                                // b4123
    168 //        rev64           \o1\().4s, \o1\().4s                                // b4123
    169 //        ext             \o0\().16b, \o0\().16b, \o0\().16b, #4              // b1234
    170 //        ext             \o1\().16b, \o1\().16b, \o1\().16b, #4              // b1234
    171 .endm
    172 
/* Sign-alternating sqrt(1/2) factors for the odd half of the 8-point FFT
 * (the "j * +--+M_SQRT1_2" multiply in FFT8/FFT8_X2). */
    173 const tab_8pt, align=4
    174        .float M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2,  M_SQRT1_2
    175 endconst
    176 
/* Single 8-point FFT. \e0,\e1 hold z[0..3] and \o0,\o1 hold z[4..7]
 * (interleaved re,im). Requires v31 loaded (LOAD_SUBADD). Clobbers x5 and
 * the \t temporaries (v16-v22 by default). Outputs stay in the even/odd
 * split layout; callers re-interleave with zip1/zip2 (see FFT8_FN). */
    177 .macro FFT8 e0, e1, o0, o1, \
    178            t0=v16, t1=v17, t2=v18, t3=v19, t4=v20, t5=v21, t6=v22
    179 
    180        movrel          x5, tab_8pt
    181 
    182        fsub            \t1\().4s, \e1\().4s, \o1\().4s             // j1234
    183        fadd            \o1\().4s, \e1\().4s, \o1\().4s             // k1234
    184        fsub            \t0\().4s, \e0\().4s, \o0\().4s             // r1234
    185        fadd            \o0\().4s, \e0\().4s, \o0\().4s             // q1234
    186 
    187        ld1             { \t5\().4s }, [x5]
    188 
    189        ext             \t4\().16b, \o1\().16b, \o1\().16b, #12
    190        rev64           \t4\().4s,  \t4\().4s
    191 
    192        ext             \t2\().16b, \o0\().16b, \t4\().16b, #8      // o0[0,1], o1[3,2]
    193        mov             \o0\().d[1], \t4\().d[1]                    // o0[3, 4]; o1[1, 4]
    194 
    195        fsub            \e1\().4s, \o0\().4s, \t2\().4s             // s34, g43
    196        fadd            \t2\().4s, \o0\().4s, \t2\().4s             // s12, g12
    197 
    198        rev64           \t6\().4s, v31.4s                           // 1, -1, 1, -1
    199        dup             \o0\().2d, \t0\().d[0]                      // r1212
    200        dup             \o1\().2d, \t0\().d[1]                      // r3434
    201 
    202        rev64           \t4\().4s, \e1\().4s                        // xxg34
    203        rev64           \o1\().4s, \o1\().4s                        // r4343
    204 
    205        ext             \t6\().16b, v31.16b, \t6\().16b, #8         // -1, 1, 1, -1
    206        zip1            \t3\().2d, \t2\().2d, \e1\().2d             // s1234
    207        zip2            \t2\().2d, \t2\().2d, \t4\().2d             // g1234
    208 
    209        fadd            \e0\().4s, \t3\().4s, \t2\().4s             // out_e1
    210        fsub            \e1\().4s, \t3\().4s, \t2\().4s             // out_e2
    211 
    212        fmul            \t1\().4s, \t1\().4s, \t5\().4s             // j * +--+M_SQRT1_2
    213        fmls            \o0\().4s, \o1\().4s, \t6\().4s             // z1234
    214 
    215        rev64           \t4\().4s, \t1\().4s                        // j2143
    216        fmla            \t1\().4s, \t4\().4s, v31.4s                // l2143
    217 
    218        rev64           \t4\().4s, \t1\().4s                        // l1234
    219        ext             \t4\().16b, \t4\().16b, \t4\().16b, #8      // l3412
    220 
    221        fmla            \t4\().4s, \t1\().4s, v31.4s                // t1234
    222 
    223        fadd            \o1\().4s, \o0\().4s, \t4\().4s             // out_o2
    224        fsub            \o0\().4s, \o0\().4s, \t4\().4s             // out_o1
    225 .endm
    226 
    227 // Identical to FFT8, but does 2 transforms in parallel
/* Transform 1 is (\e0,\e1,\o0,\o1), transform 2 is (\e2,\e3,\o2,\o3).
 * Requires v31 (LOAD_SUBADD). Clobbers x5 and v16-v24; uses fixed
 * temporaries rather than \t parameters. */
    228 .macro FFT8_X2 e0, e1, o0, o1, e2, e3, o2, o3
    229 
    230        movrel          x5, tab_8pt
    231 
    232        fadd            v19.4s, \e3\().4s, \o3\().4s             // k1234
    233        fadd            v17.4s, \e1\().4s, \o1\().4s             // k1234
    234        fadd            v18.4s, \e2\().4s, \o2\().4s             // q1234
    235        fadd            v16.4s, \e0\().4s, \o0\().4s             // q1234
    236 
    237        ld1             { v23.4s }, [x5]
    238 
    239        ext             v22.16b, v19.16b, v19.16b, #12
    240        ext             v21.16b, v17.16b, v17.16b, #12
    241 
    242        rev64           v22.4s,  v22.4s
    243        rev64           v21.4s,  v21.4s
    244 
    245        ext             v19.16b, v18.16b, v22.16b, #8
    246        ext             v17.16b, v16.16b, v21.16b, #8
    247 
    248        mov             v18.d[1], v22.d[1]
    249        mov             v21.d[0], v16.d[0]
    250 
    251        fadd            v22.4s, v18.4s, v19.4s                   // s12, g12
    252        fsub            v19.4s, v18.4s, v19.4s                   // s34, g43
    253        fsub            v18.4s, v21.4s, v17.4s                   // s34, g43
    254        fadd            v16.4s, v21.4s, v17.4s                   // s12, g12
    255 
    256        fsub            \e0\().4s, \e0\().4s, \o0\().4s          // r1234
    257        fsub            v20.4s, \e1\().4s, \o1\().4s             // j1234
    258        fsub            \e2\().4s, \e2\().4s, \o2\().4s          // r1234
    259        fsub            v21.4s, \e3\().4s, \o3\().4s             // j1234
    260 
    261        rev64           v24.4s, v31.4s                           // 1, -1, 1, -1
    262        zip1            v17.2d, v16.2d, v18.2d                   // s1234
    263        zip1            \e1\().2d, v22.2d, v19.2d                // s1234
    264 
    265        rev64           v18.4s, v18.4s                           // xxg34
    266        rev64           v19.4s, v19.4s                           // xxg34
    267 
    268        zip2            v16.2d, v16.2d, v18.2d                   // g1234
    269        zip2            \e3\().2d, v22.2d, v19.2d                // g1234
    270 
    271        dup             \o0\().2d, \e0\().d[0]                   // r1212
    272        dup             \o1\().2d, \e0\().d[1]                   // r3434
    273        dup             \o2\().2d, \e2\().d[0]                   // r1212
    274        dup             \o3\().2d, \e2\().d[1]                   // r3434
    275 
    276        fadd            \e2\().4s, \e1\().4s, \e3\().4s          // out_e1
    277        fsub            \e3\().4s, \e1\().4s, \e3\().4s          // out_e2
    278        fadd            \e0\().4s, v17.4s, v16.4s                // out_e1
    279        fsub            \e1\().4s, v17.4s, v16.4s                // out_e2
    280 
    281        ext             v24.16b, v31.16b, v24.16b, #8            // -1, 1, 1, -1
    282        rev64           \o1\().4s, \o1\().4s                     // r4343
    283        rev64           \o3\().4s, \o3\().4s                     // r4343
    284 
    285        fmul            v19.4s, v20.4s, v23.4s                   // j * +--+M_SQRT1_2
    286        fmul            v21.4s, v21.4s, v23.4s                   // j * +--+M_SQRT1_2
    287 
    288        rev64           v20.4s, v19.4s                           // j2143
    289        rev64           v18.4s, v21.4s                           // j2143
    290 
    291        fmls            \o0\().4s, \o1\().4s, v24.4s             // z1234
    292        fmls            \o2\().4s, \o3\().4s, v24.4s             // z1234
    293 
    294        fmla            v19.4s, v20.4s, v31.4s                   // l2143
    295        fmla            v21.4s, v18.4s, v31.4s                   // l2143
    296 
    297        rev64           v20.4s, v19.4s                           // l1234
    298        rev64           v18.4s, v21.4s                           // l1234
    299        ext             v20.16b, v20.16b, v20.16b, #8            // l3412
    300        ext             v18.16b, v18.16b, v18.16b, #8            // l3412
    301 
    302        fmla            v20.4s, v19.4s, v31.4s                   // t1234
    303        fmla            v18.4s, v21.4s, v31.4s                   // t1234
    304 
    305        fadd            \o1\().4s, \o0\().4s, v20.4s             // out_o2
    306        fadd            \o3\().4s, \o2\().4s, v18.4s             // out_o2
    307        fsub            \o0\().4s, \o0\().4s, v20.4s             // out_o1
    308        fsub            \o2\().4s, \o2\().4s, v18.4s             // out_o1
    309 .endm
    310 
/* Twiddle factors for the 16-point recombination in FFT16: cos(pi/8) and
 * cos(3pi/8) pairs plus the {1, 1, sqrt(1/2), sqrt(1/2)} scaling row. */
    311 const tab_16pt, align=4
    312        .float   -COS16_1,    COS16_1,   -COS16_3,    COS16_3    // Could be +-+- too
    313        .float    COS16_3,    COS16_3,    COS16_1,    COS16_1
    314        .float        1.0,        1.0,  M_SQRT1_2,  M_SQRT1_2
    315 endconst
    316 
    317 // 16-point FFT
    318 // t3, t4, t5, t6 must be sequential
/* Built from one FFT8 on \e0-\e3 and a parallel pair of 4-point transforms
 * (FFT4_X2) on \o0-\o3, then a recombination using the tab_16pt twiddles.
 * \t4-\t6 are filled by a single multi-register ld1, hence the sequential-
 * register requirement above. Requires v31; clobbers x5 and all \t regs. */
    319 .macro FFT16 e0, e1, e2, e3, o0, o1, o2, o3, \
    320             t0=v16, t1=v17, t2=v18, t3=v19, t4=v20, t5=v21, t6=v22
    321 
    322        FFT8            \e0, \e1, \e2, \e3, \t0, \t1, \t2, \t3, \t4, \t5, \t6
    323        FFT4_X2         \o0, \o1, \o2, \o3, \t0, \t1, \t2, \t3, \t4, \t5, \t6
    324 
    325        movrel          x5, tab_16pt
    326 
    327        rev64           \t0\().4s, \o0\().4s                     // z[ 8, 9].imre
    328        rev64           \t1\().4s, \o2\().4s                     // z[10,11].imre
    329 
    330        ins             \t0\().d[0], xzr
    331        ins             \t1\().d[0], xzr
    332 
    333        ld1             { \t4\().4s, \t5\().4s, \t6\().4s }, [x5]
    334        // TODO: We could derive \t4\() or \t5\() from either, but it seems cheaper to load
    335 
    336        fmla            \o2\().4s, \t1\().4s, v31.4s             // s[4567]
    337        fmls            \o0\().4s, \t0\().4s, v31.4s             // s[0123]
    338 
    339        fmul            \t2\().4s, \o1\().4s, \t4\().4s
    340        fmul            \t3\().4s, \o3\().4s, \t4\().4s
    341 
    342        rev64           \o3\().4s, \o3\().4s
    343        rev64           \o1\().4s, \o1\().4s
    344 
    345        fmla            \t3\().4s, \o3\().4s, \t5\().4s          // s[12, 13, 14, 15]
    346        fmls            \t2\().4s, \o1\().4s, \t5\().4s          // s[ 8,  9, 10, 11]
    347 
    348        fmul            \t1\().4s, \o2\().4s, \t6\().4s          // s[4567] * mult
    349        fmul            \t0\().4s, \o0\().4s, \t6\().4s          // s[0123] * mult
    350 
    351        mov             \o1\().16b, \t3\().16b
    352        mov             \o2\().16b, \t1\().16b
    353 
    354        fsub            \t3\().4s, \t3\().4s, \t2\().4s          // y34, u34
    355        fsub            \t1\().4s, \t1\().4s, \t0\().4s          // w34, x34
    356 
    357        fadd            \t2\().4s, \t2\().4s, \o1\().4s          // y56, u56
    358        rev64           \t3\().4s, \t3\().4s
    359        fadd            \t0\().4s, \t0\().4s, \o2\().4s          // w56, x56
    360        rev64           \t1\().4s, \t1\().4s
    361 
    362        fmul            \t2\().4s, \t2\().4s, v31.4s
    363        fmul            \t1\().4s, \t1\().4s, v31.4s
    364 
    365        fadd            \o3\().4s, \e3\().4s, \t3\().4s
    366        fsub            \o2\().4s, \e3\().4s, \t3\().4s
    367        fsub            \o1\().4s, \e2\().4s, \t2\().4s
    368        fadd            \o0\().4s, \e2\().4s, \t2\().4s
    369 
    370        fsub            \e2\().4s, \e0\().4s, \t0\().4s
    371        fadd            \e0\().4s, \e0\().4s, \t0\().4s
    372        fsub            \e3\().4s, \e1\().4s, \t1\().4s
    373        fadd            \e1\().4s, \e1\().4s, \t1\().4s
    374 .endm
    375 
/* 2-point complex-float FFT. Per the file conventions x0-x3 are the
 * arguments (presumably ctx/out/in/stride per the av_tx calling convention
 * — confirm against the C side); only x1 (out) and x2 (in) are used.
 *   out[0] = in[0] + in[1]
 *   out[1] = in[0] - in[1]
 * Done branchlessly: v2 = { -in[1], +in[1] }, result = in[0] - v2. */
    376 function ff_tx_fft2_float_neon, export=1
    377        ld2r            { v0.2d, v1.2d }, [x2]
    378 
    379        fneg            v2.2s, v1.2s
    380        mov             v2.d[1], v1.d[0]
    381 
    382        fsub            v2.4s, v0.4s, v2.4s
    383 
    384        st1             { v2.4s }, [x1]
    385        ret
    386 endfunc
    387 
/* Emits ff_tx_fft4_{fwd,inv}_float_neon. The inverse variant exchanges the
 * high halves of v0 and v1 (i.e. swaps z[1] and z[3]) before running the
 * shared FFT4 butterfly in standalone mode. */
    388 .macro FFT4_FN name, inv
    389 function ff_tx_fft4_\name\()_float_neon, export=1
    390        ld1             {v0.4s, v1.4s}, [x2]
    391 
    392 .if \inv == 1
    393        mov             v2.d[0], v0.d[1]
    394        mov             v0.d[1], v1.d[1]
    395        mov             v1.d[1], v2.d[0]
    396 .endif
    397 
    398        FFT4            v0, v1, 1
    399 
    400        st1             { v0.4s, v1.4s }, [x1]
    401        ret
    402 endfunc
    403 .endm
    404 
// Instantiate the forward and inverse 4-point entry points.
    405 FFT4_FN fwd, 0
    406 FFT4_FN inv, 1
    407 
/* Emits ff_tx_fft8_{float,ns_float}_neon. no_perm=1 (the ns_float variant)
 * skips the LUT permute and loads the input sequentially. The zip1/zip2
 * pairs re-interleave FFT8's even/odd split output into natural order
 * before the single 64-byte store. */
    408 .macro FFT8_FN name, no_perm
    409 function ff_tx_fft8_\name\()_neon, export=1
    410        SETUP_LUT       \no_perm
    411        LOAD_INPUT      0, 1, 2, 3, x2, \no_perm
    412 
    413        LOAD_SUBADD
    414        FFT8            v0, v1, v2, v3
    415 
    416        zip1            v16.2d, v0.2d, v2.2d
    417        zip2            v17.2d, v0.2d, v2.2d
    418        zip1            v18.2d, v1.2d, v3.2d
    419        zip2            v19.2d, v1.2d, v3.2d
    420        st1             { v16.4s, v17.4s, v18.4s, v19.4s }, [x1]
    421 
    422        ret
    423 endfunc
    424 .endm
    425 
// Instantiate the permuted (float) and no-permute (ns_float) 8-point entry points.
    426 FFT8_FN float,    0
    427 FFT8_FN ns_float, 1
    428 
/* Emits ff_tx_fft16_{float,ns_float}_neon. Same scheme as FFT8_FN: optional
 * LUT permute on load, FFT16 on v0-v7, then zip-interleaving of the split
 * result (note the v4/v6 and v5/v7 pairing, which follows FFT16's output
 * register layout) into two 64-byte stores. */
    429 .macro FFT16_FN name, no_perm
    430 function ff_tx_fft16_\name\()_neon, export=1
    431        SETUP_LUT       \no_perm
    432        LOAD_INPUT      0, 1, 2, 3, x2, \no_perm
    433        LOAD_INPUT      4, 5, 6, 7, x2, \no_perm
    434 
    435        LOAD_SUBADD
    436        FFT16           v0, v1, v2, v3, v4, v5, v6, v7
    437 
    438        zip1            v20.2d, v0.2d, v4.2d
    439        zip2            v21.2d, v0.2d, v4.2d
    440        zip1            v22.2d, v1.2d, v6.2d
    441        zip2            v23.2d, v1.2d, v6.2d
    442        st1             { v20.4s, v21.4s, v22.4s, v23.4s }, [x1], #64
    443 
    444        zip1            v24.2d, v2.2d, v5.2d
    445        zip2            v25.2d, v2.2d, v5.2d
    446        zip1            v26.2d, v3.2d, v7.2d
    447        zip2            v27.2d, v3.2d, v7.2d
    448        st1             { v24.4s, v25.4s, v26.4s, v27.4s }, [x1]
    449 
    450        ret
    451 endfunc
    452 .endm
    453 
// Instantiate the permuted (float) and no-permute (ns_float) 16-point entry points.
    454 FFT16_FN float,    0
    455 FFT16_FN ns_float, 1
    456 
/* Sets up split-radix recombination state for a transform of length \len:
 *   \re  <- ff_tx_tab_<len>_float (cos/twiddle table base, read forwards)
 *   \im  <- \re + (\len - 28) bytes (im twiddles, read backwards)
 *   \dec <- -32, the per-iteration decrement applied to \im
 * For \len > 32 also sets x21 = 2*\len and x22 = 6*\len (x21 + x21*2),
 * matching the "len*2, len*6" register convention in the file header.
 * Clobbers w5. */
    457 .macro SETUP_SR_RECOMB len, re, im, dec
    458        ldr             w5, =(\len - 4*7)
    459        movrel          \re, X(ff_tx_tab_\len\()_float)
    460        add             \im, \re, x5
    461        mov             \dec, #-32
    462 
    463 .if \len > 32
    464        mov             x21, #2*\len
    465        add             x22, x21, x21, lsl #1
    466 .endif
    467 .endm
    468 
/* Full split-radix combine of 8 even (\e0-\e7) and 8 odd (\o0-\o7)
 * coefficient registers. Twiddles are streamed from \re (advanced by 32)
 * and \im (advanced by \dec, normally -32 — the im table is walked
 * backwards, see SETUP_SR_RECOMB); swap_im selects which of the two im
 * vectors pairs with which half. Requires v31 (LOAD_SUBADD). Clobbers the
 * \t temporaries (v16-v27 by default). Note the deliberately swapped
 * o5/o6 and e5/e6 lanes at the end, marked "swapped" below. */
    469 .macro SR_COMBINE e0, e1, e2, e3, e4, e5, e6, e7, \
    470                  o0, o1, o2, o3, o4, o5, o6, o7, \
    471                  re, im, dec, swap_im, \
    472                  t0=v16, t1=v17, t2=v18, t3=v19, t4=v20, t5=v21, \
    473                  t6=v22, t7=v23, t8=v24, t9=v25, ta=v26, tb=v27
    474 
    475        ld1             { \t8\().4s, \t9\().4s }, [\im], \dec
    476        ld1             { \t0\().4s, \t1\().4s }, [\re], #32
    477 
    478 .if \swap_im == 1
    479        ext             \t2\().16b, \t9\().16b, \t9\().16b, #8
    480        ext             \t3\().16b, \t8\().16b, \t8\().16b, #8
    481 .else
    482        ext             \t2\().16b, \t8\().16b, \t8\().16b, #8
    483        ext             \t3\().16b, \t9\().16b, \t9\().16b, #8
    484 .endif
    485 
    486        trn1            \t4\().4s, \t0\().4s, \t0\().4s      // cos0022
    487        trn2            \t0\().4s, \t0\().4s, \t0\().4s      // cos4466
    488        trn1            \t5\().4s, \t1\().4s, \t1\().4s      // cos1133
    489        trn2            \t1\().4s, \t1\().4s, \t1\().4s      // cos5577
    490 
    491        rev64           \t6\().4s, \o0\().4s                 // E m2[0,1].imre
    492        rev64           \t7\().4s, \o2\().4s                 // O m2[0,1].imre
    493        rev64           \t8\().4s, \o4\().4s                 // E m2[2,3].imre
    494        rev64           \t9\().4s, \o6\().4s                 // O m2[2,3].imre
    495 
    496        fmul            \t6\().4s, \t6\().4s, \t4\().4s      // E m2[0,1].imre*t1[0,2]
    497        fmul            \t7\().4s, \t7\().4s, \t0\().4s      // O m2[0,1].imre*t1[0,2]
    498        fmul            \t8\().4s, \t8\().4s, \t4\().4s      // E m2[2,3].imre*t1[0,2]
    499        fmul            \t9\().4s, \t9\().4s, \t0\().4s      // O m2[2,3].imre*t1[0,2]
    500 
    501        rev64           \ta\().4s, \o1\().4s                 // E m3[0,1].imre
    502        rev64           \tb\().4s, \o3\().4s                 // O m3[0,1].imre
    503        rev64           \t4\().4s, \o5\().4s                 // E m3[2,3].imre
    504        rev64           \t0\().4s, \o7\().4s                 // O m3[2,3].imre
    505 
    506        fmul            \ta\().4s, \ta\().4s, \t5\().4s      // E m3[0,1].imre*t1[4,6]
    507        fmul            \tb\().4s, \tb\().4s, \t1\().4s      // O m3[0,1].imre*t1[4,6]
    508        fmul            \t4\().4s, \t4\().4s, \t5\().4s      // E m3[2,3].imre*t1[4,6]
    509        fmul            \t0\().4s, \t0\().4s, \t1\().4s      // O m3[2,3].imre*t1[4,6]
    510 
    511        trn1            \t5\().4s, \t3\().4s, \t3\().4s      // wim2200
    512        trn2            \t3\().4s, \t3\().4s, \t3\().4s      // wim3311
    513        trn1            \t1\().4s, \t2\().4s, \t2\().4s      // wim6644
    514        trn2            \t2\().4s, \t2\().4s, \t2\().4s      // wim7755
    515 
    516        fmul            \t5\().4s, \t5\().4s, v31.4s
    517        fmul            \t3\().4s, \t3\().4s, v31.4s
    518        fmul            \t1\().4s, \t1\().4s, v31.4s
    519        fmul            \t2\().4s, \t2\().4s, v31.4s
    520 
    521        fmla            \t7\().4s, \o2\().4s, \t5\().4s      // O w0123
    522        fmls            \t9\().4s, \o6\().4s, \t5\().4s      // O j0123
    523        fmla            \t6\().4s, \o0\().4s, \t3\().4s      // E w0123
    524        fmls            \t8\().4s, \o4\().4s, \t3\().4s      // E j0123
    525 
    526        fmla            \ta\().4s, \o1\().4s, \t2\().4s      // E w4567
    527        fmla            \tb\().4s, \o3\().4s, \t1\().4s      // O w4567
    528        fmls            \t4\().4s, \o5\().4s, \t2\().4s      // E j4567
    529        fmls            \t0\().4s, \o7\().4s, \t1\().4s      // O j4567
    530 
    531        fsub            \t2\().4s, \t7\().4s, \t9\().4s
    532        fsub            \t1\().4s, \t8\().4s, \t6\().4s
    533        fsub            \t3\().4s, \t4\().4s, \ta\().4s
    534        fsub            \t5\().4s, \t0\().4s, \tb\().4s
    535 
    536        fadd            \t6\().4s, \t8\().4s, \t6\().4s
    537        fadd            \t7\().4s, \t9\().4s, \t7\().4s
    538        fadd            \t8\().4s, \t4\().4s, \ta\().4s
    539        fadd            \t9\().4s, \t0\().4s, \tb\().4s
    540 
    541        fmul            \t1\().4s, \t1\().4s, v31.4s
    542        fmul            \t2\().4s, \t2\().4s, v31.4s
    543        fmul            \t3\().4s, \t3\().4s, v31.4s
    544        fmul            \t5\().4s, \t5\().4s, v31.4s
    545 
    546        rev64           \t6\().4s, \t6\().4s
    547        rev64           \t8\().4s, \t8\().4s
    548        rev64           \t7\().4s, \t7\().4s
    549        rev64           \t9\().4s, \t9\().4s
    550 
    551        fsub            \o0\().4s, \e0\().4s, \t6\().4s
    552        fsub            \o1\().4s, \e1\().4s, \t8\().4s
    553        fsub            \o2\().4s, \e2\().4s, \t1\().4s
    554        fsub            \o3\().4s, \e3\().4s, \t3\().4s
    555 
    556        fsub            \o4\().4s, \e4\().4s, \t7\().4s
    557        fsub            \o5\().4s, \e6\().4s, \t9\().4s
    558        fadd            \o6\().4s, \e5\().4s, \t2\().4s
    559        fsub            \o7\().4s, \e7\().4s, \t5\().4s
    560 
    561        fadd            \e0\().4s, \e0\().4s, \t6\().4s
    562        fadd            \e1\().4s, \e1\().4s, \t8\().4s
    563        fadd            \e2\().4s, \e2\().4s, \t1\().4s
    564        fadd            \e3\().4s, \e3\().4s, \t3\().4s
    565 
    566        fadd            \e4\().4s, \e4\().4s, \t7\().4s
    567        fsub            \e5\().4s, \e5\().4s, \t2\().4s      // swapped
    568        fadd            \e6\().4s, \e6\().4s, \t9\().4s      // swapped
    569        fadd            \e7\().4s, \e7\().4s, \t5\().4s
    570 .endm
    571 
/* Half of SR_COMBINE: combines 4 even (\e0-\e3) and 4 odd (\o0-\o3)
 * registers using twiddles already resident in \c0-\c3 (no table loads).
 * part=0/1 selects which lane half of each twiddle vector is used (trn1 vs
 * trn2) and the matching add/sub output pattern, so the two parts together
 * cover what one SR_COMBINE does. \c0 and \c2 are only read; \c1 and \c3
 * are clobbered, as are \t0-\t5. Requires v31 (LOAD_SUBADD). */
    572 .macro SR_COMBINE_HALF e0, e1, e2, e3, \
    573                       o0, o1, o2, o3, \
    574                       c0, c1, c2, c3, \
    575                       t0, t1, t2, t3, t4, t5, part
    576 
    577 .if \part == 0
    578        trn1            \t4\().4s, \c0\().4s, \c0\().4s   // cos0022
    579        trn1            \c1\().4s, \c1\().4s, \c1\().4s   // cos1133
    580 .else
    581        trn2            \t4\().4s, \c0\().4s, \c0\().4s   // cos0022
    582        trn2            \c1\().4s, \c1\().4s, \c1\().4s   // cos1133
    583 .endif
    584 .if \part == 0
    585        trn2            \t5\().4s, \c2\().4s, \c2\().4s   // wim7755
    586        trn2            \c3\().4s, \c3\().4s, \c3\().4s   // wim3311
    587 .else
    588        trn1            \t5\().4s, \c2\().4s, \c2\().4s   // wim7755
    589        trn1            \c3\().4s, \c3\().4s, \c3\().4s   // wim3311
    590 .endif
    591 
    592        fmul            \t5\().4s, \t5\().4s, v31.4s
    593        fmul            \c3\().4s, \c3\().4s, v31.4s
    594 
    595        rev64           \t0\().4s, \o0\().4s              // E m2[0,1].imre
    596        rev64           \t1\().4s, \o2\().4s              // E m2[2,3].imre
    597        rev64           \t2\().4s, \o1\().4s              // E m3[0,1].imre
    598        rev64           \t3\().4s, \o3\().4s              // E m3[2,3].imre
    599 
    600        fmul            \o0\().4s, \o0\().4s, \c3\().4s   // E m2[0,1].imre*t1[0,2]
    601        fmul            \o1\().4s, \o1\().4s, \t5\().4s   // E m3[0,1].imre*t1[4,6]
    602        fmla            \o0\().4s, \t0\().4s, \t4\().4s   // E w0123
    603        fmla            \o1\().4s, \t2\().4s, \c1\().4s   // E w4567
    604 
    605        fmul            \t1\().4s, \t1\().4s, \t4\().4s   // E m2[2,3].imre*t1[0,2]
    606        fmul            \t3\().4s, \t3\().4s, \c1\().4s   // E m3[2,3].imre*t1[4,6]
    607        fmls            \t1\().4s, \o2\().4s, \c3\().4s   // E j0123
    608        fmls            \t3\().4s, \o3\().4s, \t5\().4s   // E j4567
    609 
    610        fsub            \t0\().4s, \t1\().4s, \o0\().4s
    611        fadd            \t1\().4s, \t1\().4s, \o0\().4s
    612        fadd            \t2\().4s, \t3\().4s, \o1\().4s
    613        fsub            \t3\().4s, \t3\().4s, \o1\().4s
    614 
    615        fmul            \t0\().4s, \t0\().4s, v31.4s
    616        fmul            \t3\().4s, \t3\().4s, v31.4s
    617 
    618        rev64           \t1\().4s, \t1\().4s
    619        rev64           \t2\().4s, \t2\().4s
    620 
    621 .if \part == 0
    622        fsub            \o0\().4s, \e0\().4s, \t1\().4s
    623        fsub            \o1\().4s, \e1\().4s, \t2\().4s
    624        fsub            \o2\().4s, \e2\().4s, \t0\().4s
    625        fsub            \o3\().4s, \e3\().4s, \t3\().4s
    626 .else
    627        fsub            \o0\().4s, \e0\().4s, \t1\().4s
    628        fadd            \o2\().4s, \e1\().4s, \t2\().4s
    629        fsub            \o1\().4s, \e2\().4s, \t0\().4s
    630        fadd            \o3\().4s, \e3\().4s, \t3\().4s
    631 .endif
    632 
    633 .if \part == 0
    634        fadd            \e0\().4s, \e0\().4s, \t1\().4s
    635        fadd            \e1\().4s, \e1\().4s, \t2\().4s
    636        fadd            \e2\().4s, \e2\().4s, \t0\().4s
    637        fadd            \e3\().4s, \e3\().4s, \t3\().4s
    638 .else
    639        fadd            \e0\().4s, \e0\().4s, \t1\().4s
    640        fsub            \e1\().4s, \e1\().4s, \t2\().4s   // swapped
    641        fadd            \e2\().4s, \e2\().4s, \t0\().4s   // swapped
    642        fsub            \e3\().4s, \e3\().4s, \t3\().4s
    643 .endif
    644 .endm
    645 
/* Same as SR_COMBINE_HALF, but heroically tries to use 3 temporary registers
 * without touching the tables. */
/* Split-radix recombination step:
 *   e0-e3 - even-half coefficients, updated in place
 *   o0-o3 - odd-half coefficients, overwritten with (even - rotated odd)
 *   c0-c3 - twiddle-factor registers (interleaved cos/wim lanes)
 *   t0-t2 - scratch; t1/t2 also carry intermediates between phases
 *   part  - selects which interleaved twiddle lanes to use (trn1 vs trn2)
 * v31 holds {-1,+1,-1,+1} and is used to flip signs lane-wise (see file
 * header). Statement order is load-bearing: o2/o0 etc. are reused as
 * scratch once their original contents have been consumed. */
.macro SR_COMBINE_LITE e0, e1, e2, e3, \
                      o0, o1, o2, o3, \
                      c0, c1, c2, c3, \
                      t0, t1, t2, part

        // Phase 1: complex multiply odd pair 0 (o0/o2) by the twiddles.
        rev64           \t0\().4s, \o0\().4s              // E m2[0,1].imre
        rev64           \t1\().4s, \o2\().4s              // E m2[2,3].imre
.if \part == 0
        trn2            \t2\().4s, \c3\().4s, \c3\().4s   // wim3311
.else
        trn1            \t2\().4s, \c3\().4s, \c3\().4s   // wim3311
.endif
        fmul            \t2\().4s, \t2\().4s, v31.4s      // alternate signs of wim lanes
        fmul            \o2\().4s, \o2\().4s, \t2\().4s
        fmul            \o0\().4s, \o0\().4s, \t2\().4s   // E m2[0,1].imre*t1[0,2]
.if \part == 0
        trn1            \t2\().4s, \c0\().4s, \c0\().4s   // cos0022
.else
        trn2            \t2\().4s, \c0\().4s, \c0\().4s   // cos0022
.endif
        fmul            \t1\().4s, \t1\().4s, \t2\().4s   // E m2[2,3].imre*t1[0,2]
        fmla            \o0\().4s, \t0\().4s, \t2\().4s   // E w0123
        fsub            \t1\().4s, \t1\().4s, \o2\().4s   // E j0123

        // Phase 2: same for odd pair 1 (o1/o3); o2 is free to reuse now.
        rev64           \t2\().4s, \o1\().4s              // E m3[0,1].imre
        rev64           \o2\().4s, \o3\().4s              // E m3[2,3].imre

.if \part == 0
        trn2            \t0\().4s, \c2\().4s, \c2\().4s   // wim7755
.else
        trn1            \t0\().4s, \c2\().4s, \c2\().4s   // wim7755
.endif
        fmul            \t0\().4s, \t0\().4s, v31.4s

        fmul            \o1\().4s, \o1\().4s, \t0\().4s   // E m3[0,1].imre*t1[4,6]
        fmul            \o3\().4s, \o3\().4s, \t0\().4s

.if \part == 0
        trn1            \t0\().4s, \c1\().4s, \c1\().4s   // cos1133
.else
        trn2            \t0\().4s, \c1\().4s, \c1\().4s   // cos1133
.endif
        fmul            \o2\().4s, \o2\().4s, \t0\().4s   // E m3[2,3].imre*t1[4,6]
        fmla            \o1\().4s, \t2\().4s, \t0\().4s   // E w4567
        fsub            \o2\().4s, \o2\().4s, \o3\().4s   // E j4567

        // Phase 3: cross sums/differences of the rotated odd halves.
        fsub            \t0\().4s, \t1\().4s, \o0\().4s
        fadd            \o0\().4s, \t1\().4s, \o0\().4s
        fadd            \t2\().4s, \o2\().4s, \o1\().4s
        fsub            \t1\().4s, \o2\().4s, \o1\().4s

        fmul            \t0\().4s, \t0\().4s, v31.4s
        fmul            \t1\().4s, \t1\().4s, v31.4s

        rev64           \t2\().4s, \t2\().4s
        rev64           \o0\().4s, \o0\().4s

        // Phase 4: final butterflies with the even half. The two part
        // variants differ in add/sub polarity and in which outputs land
        // in o1/o2 (lane ordering differs between halves).
.if \part == 0
        fsub            \o1\().4s, \e1\().4s, \t2\().4s
        fsub            \o2\().4s, \e2\().4s, \t0\().4s
        fsub            \o3\().4s, \e3\().4s, \t1\().4s
.else
        fadd            \o2\().4s, \e1\().4s, \t0\().4s
        fsub            \o1\().4s, \e2\().4s, \t2\().4s
        fadd            \o3\().4s, \e3\().4s, \t1\().4s
.endif

.if \part == 0
        fadd            \e1\().4s, \e1\().4s, \t2\().4s
        fadd            \e2\().4s, \e2\().4s, \t0\().4s
        fadd            \e3\().4s, \e3\().4s, \t1\().4s
.else
        fsub            \e1\().4s, \e1\().4s, \t0\().4s   // swapped
        fadd            \e2\().4s, \e2\().4s, \t2\().4s   // swapped
        fsub            \e3\().4s, \e3\().4s, \t1\().4s
.endif

        // e0/o0 butterfly is done last; o0 must survive until here,
        // hence the copy through t1.
        mov             \t1\().16b, \o0\().16b

        fsub            \o0\().4s, \e0\().4s, \t1\().4s
        fadd            \e0\().4s, \e0\().4s, \t1\().4s
.endm
    730 
/* One in-place recombination pass over 4 quarters of the output buffer:
 * loads 16 vectors (v0-v15) from x1, x1+len*2, x1+len*4 and x1+len*6
 * (byte distances held in x21/x22, see file header), runs SR_COMBINE,
 * and stores the results back. Note the store swaps q10/q11 with
 * q12/q13 between the 3rd and 4th quarter, as required by SR_COMBINE's
 * output ordering. Clobbers x10-x12, v0-v15 and SR_COMBINE scratch. */
.macro SR_COMBINE_4 len, part, off
        add             x10, x1, x21              // x1 + len*2 bytes (2nd quarter)
        add             x11, x1, x21, lsl #1      // x1 + len*4 bytes (3rd quarter)
        add             x12, x1, x22              // x1 + len*6 bytes (4th quarter)

        ldp             q0,  q1,  [x1,  #((0 + \part)*32 + \off)]
        ldp             q4,  q5,  [x1,  #((2 + \part)*32 + \off)]
        ldp             q2,  q3,  [x10, #((0 + \part)*32 + \off)]
        ldp             q6,  q7,  [x10, #((2 + \part)*32 + \off)]

        ldp             q8,  q9,  [x11, #((0 + \part)*32 + \off)]
        ldp             q10, q11, [x11, #((2 + \part)*32 + \off)]
        ldp             q12, q13, [x12, #((0 + \part)*32 + \off)]
        ldp             q14, q15, [x12, #((2 + \part)*32 + \off)]

        SR_COMBINE      v0,  v1,  v2,  v3,  v4,  v6,  v5,  v7, \
                        v8,  v9, v10, v11, v12, v13, v14, v15, \
                        x7,  x8, x9, 0

        stp             q0,  q1,  [x1,  #((0 + \part)*32 + \off)]
        stp             q4,  q5,  [x1,  #((2 + \part)*32 + \off)]
        stp             q2,  q3,  [x10, #((0 + \part)*32 + \off)]
        stp             q6,  q7,  [x10, #((2 + \part)*32 + \off)]

        stp             q8,  q9,  [x11, #((0 + \part)*32 + \off)]
        stp             q12, q13, [x11, #((2 + \part)*32 + \off)]  // swapped with q10/q11
        stp             q10, q11, [x12, #((0 + \part)*32 + \off)]
        stp             q14, q15, [x12, #((2 + \part)*32 + \off)]
.endm
    760 
/* Full recombination of 8*32 bytes per quarter at offset \off: runs
 * SR_COMBINE_4 for sub-parts 0, 1, 4 and 5. Does not advance x1;
 * callers step it explicitly.
 * NOTE(review): x10-x12 are recomputed inside SR_COMBINE_4, so the
 * three adds below appear redundant — kept to match upstream; confirm
 * before removing. */
.macro SR_COMBINE_FULL len, off=0
        add             x10, x1, x21
        add             x11, x1, x21, lsl #1
        add             x12, x1, x22

        SR_COMBINE_4    \len, 0, \off
        SR_COMBINE_4    \len, 1, \off
        SR_COMBINE_4    \len, 4, \off
        SR_COMBINE_4    \len, 5, \off
.endm
    771 
/* Combine + deinterleave for one sub-part: like SR_COMBINE_4 but the
 * results are zipped pairwise (64-bit lanes) before being stored, so
 * the output ends up in interleaved order. Loads for the next chunk
 * are interleaved with stores of the previous one to hide latency.
 * Expects x14/x15/x16 to point at the 2nd/3rd/4th quarter (set by
 * SR_COMBINE_DINT). Clobbers x10-x13, v0-v27 and SR_COMBINE scratch. */
.macro SR_COMBINE_D2 part, off
        add             x10,  x1, #((\part)*32 + \off)
        add             x11, x14, #((\part)*32 + \off)
        add             x12, x15, #((\part)*32 + \off)
        add             x13, x16, #((\part)*32 + \off)

        ldp             q0,  q1,  [x10]
        ldp             q4,  q5,  [x10, #(2*32)]
        ldp             q2,  q3,  [x11]
        ldp             q6,  q7,  [x11, #(2*32)]

        ldp             q8,  q9,  [x12]
        ldp             q10, q11, [x12, #(2*32)]
        ldp             q12, q13, [x13]
        ldp             q14, q15, [x13, #(2*32)]

        SR_COMBINE      v0,  v1,  v2,  v3,  v4,  v6,  v5,  v7, \
                        v8,  v9, v10, v11, v12, v13, v14, v15, \
                        x7,  x8, x9, 0, \
                        v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27

        // Deinterleave the first half of the results into v16-v23.
        zip1            v16.2d, v0.2d, v4.2d
        zip2            v17.2d, v0.2d, v4.2d
        zip1            v18.2d, v1.2d, v5.2d
        zip2            v19.2d, v1.2d, v5.2d

        zip1            v20.2d, v2.2d, v6.2d
        zip2            v21.2d, v2.2d, v6.2d
        zip1            v22.2d, v3.2d, v7.2d
        zip2            v23.2d, v3.2d, v7.2d

        // Start loading the next chunk before storing, to overlap memory ops.
        ldp             q0,  q1,  [x10, #(1*32)]
        ldp             q4,  q5,  [x10, #(3*32)]
        ldp             q2,  q3,  [x11, #(1*32)]
        ldp             q6,  q7,  [x11, #(3*32)]

        st1             { v16.4s, v17.4s, v18.4s, v19.4s }, [x10], #64
        st1             { v20.4s, v21.4s, v22.4s, v23.4s }, [x11], #64

        zip1            v20.2d, v8.2d, v12.2d
        zip2            v21.2d, v8.2d, v12.2d
        zip1            v22.2d, v9.2d, v13.2d
        zip2            v23.2d, v9.2d, v13.2d
        zip1            v24.2d, v10.2d, v14.2d
        zip2            v25.2d, v10.2d, v14.2d
        zip1            v26.2d, v11.2d, v15.2d
        zip2            v27.2d, v11.2d, v15.2d

        ldp             q8,  q9,  [x12, #(1*32)]
        ldp             q10, q11, [x12, #(3*32)]
        ldp             q12, q13, [x13, #(1*32)]
        ldp             q14, q15, [x13, #(3*32)]

        st1             { v20.4s, v21.4s, v22.4s, v23.4s }, [x12], #64
        st1             { v24.4s, v25.4s, v26.4s, v27.4s }, [x13], #64

        // Second chunk: combine then deinterleave/store quarter by quarter.
        SR_COMBINE      v0,  v1,  v2,  v3,  v4,  v6,  v5,  v7, \
                        v8,  v9, v10, v11, v12, v13, v14, v15, \
                        x7,  x8, x9, 0, \
                        v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27

        zip1            v16.2d, v0.2d, v4.2d
        zip2            v17.2d, v0.2d, v4.2d
        zip1            v18.2d, v1.2d, v5.2d
        zip2            v19.2d, v1.2d, v5.2d
        st1             { v16.4s, v17.4s, v18.4s, v19.4s }, [x10]

        zip1            v16.2d, v2.2d, v6.2d
        zip2            v17.2d, v2.2d, v6.2d
        zip1            v18.2d, v3.2d, v7.2d
        zip2            v19.2d, v3.2d, v7.2d
        st1             { v16.4s, v17.4s, v18.4s, v19.4s }, [x11]

        zip1            v20.2d, v8.2d, v12.2d
        zip2            v21.2d, v8.2d, v12.2d
        zip1            v22.2d, v9.2d, v13.2d
        zip2            v23.2d, v9.2d, v13.2d
        st1             { v20.4s, v21.4s, v22.4s, v23.4s }, [x12]

        zip1            v24.2d, v10.2d, v14.2d
        zip2            v25.2d, v10.2d, v14.2d
        zip1            v26.2d, v11.2d, v15.2d
        zip2            v27.2d, v11.2d, v15.2d
        st1             { v24.4s, v25.4s, v26.4s, v27.4s }, [x13]
.endm
    857 
/* Deinterleaving recombination driver: sets up the quarter pointers
 * (x14-x16, via the len*2/len*6 byte distances in x21/x22) and runs
 * SR_COMBINE_D2 for sub-parts 0 and 4. Used by the generic
 * deinterleave loop at label 0 in FFT_SPLIT_RADIX_FN. */
.macro SR_COMBINE_DINT off=0
        add             x14, x1, x21
        add             x15, x1, x21, lsl #1
        add             x16, x1, x22

        SR_COMBINE_D2   0, \off
        SR_COMBINE_D2   4, \off
.endm
    866 
/* Generates a standalone 32-point FFT:
 *   ff_tx_fft32_<name>_neon(AVTXContext *x0, float *dst (x1),
 *                           float *src (x2), ptrdiff_t stride (x3))
 * no_perm selects whether LOAD_INPUT applies the revtab permutation.
 * v8-v15 are callee-saved (AAPCS64, low 64 bits) and are spilled to
 * the stack around the transform. The final zip/st1 sequence writes
 * the results out in deinterleaved order.
 * NOTE(review): the last block reuses v31 (the sign-mask constant) as
 * plain output scratch — fine here since nothing reads it before ret. */
.macro FFT32_FN name, no_perm
function ff_tx_fft32_\name\()_neon, export=1
        stp             d14, d15, [sp, #-16*4]!
        stp             d8,  d9,  [sp, #16*3]
        stp             d10, d11, [sp, #16*2]
        stp             d12, d13, [sp, #16]

        LOAD_SUBADD
        SETUP_SR_RECOMB 32, x7, x8, x9

        SETUP_LUT       \no_perm
        LOAD_INPUT      0,  1,  2,  3,  x2, \no_perm
        LOAD_INPUT      4,  5,  6,  7,  x2, \no_perm
        LOAD_INPUT      8,  9,  10, 11, x2, \no_perm
        LOAD_INPUT      12, 13, 14, 15, x2, \no_perm

        // Two 8-point FFTs on the odd half, one 16-point on the even half,
        // then a single split-radix combine.
        FFT8_X2         v8, v9, v10, v11, v12, v13, v14, v15
        FFT16           v0, v1, v2, v3, v4, v5, v6, v7

        SR_COMBINE      v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, \
                        v8,  v9, v10, v11, v12, v13, v14, v15, \
                        x7,  x8,  x9, 0

        // Deinterleave 64-bit lane pairs and store 4 vectors at a time.
        zip1            v16.2d, v0.2d, v4.2d
        zip2            v17.2d, v0.2d, v4.2d
        zip1            v18.2d, v1.2d, v6.2d
        zip2            v19.2d, v1.2d, v6.2d
        st1             { v16.4s, v17.4s, v18.4s, v19.4s }, [x1], #64

        zip1            v20.2d, v2.2d, v5.2d
        zip2            v21.2d, v2.2d, v5.2d
        zip1            v22.2d, v3.2d, v7.2d
        zip2            v23.2d, v3.2d, v7.2d
        st1             { v20.4s, v21.4s, v22.4s, v23.4s }, [x1], #64

        zip1            v24.2d, v8.2d, v12.2d
        zip2            v25.2d, v8.2d, v12.2d
        zip1            v26.2d, v9.2d, v13.2d
        zip2            v27.2d, v9.2d, v13.2d
        st1             { v24.4s, v25.4s, v26.4s, v27.4s }, [x1], #64

        zip1            v28.2d, v10.2d, v14.2d
        zip2            v29.2d, v10.2d, v14.2d
        zip1            v30.2d, v11.2d, v15.2d
        zip2            v31.2d, v11.2d, v15.2d
        st1             { v28.4s, v29.4s, v30.4s, v31.4s }, [x1]

        ldp             d12, d13, [sp, #16]
        ldp             d10, d11, [sp, #16*2]
        ldp             d8,  d9,  [sp, #16*3]
        ldp             d14, d15, [sp], #16*4

        ret
endfunc
.endm
    922 
// Instantiate the 32-point FFT: generic (input permuted via LUT) and
// "ns" no-permute variant (input already in split-radix order).
FFT32_FN float,    0
FFT32_FN ns_float, 1
    925 
/* cmp with a possibly large immediate: AArch64 cmp only encodes a
 * 12-bit immediate, optionally shifted left by 12. For imm >= 4096
 * use the shifted form (valid here because all callers pass powers of
 * two >= 4096, which are exact multiples of 4096). */
.macro cmp_imm reg, imm
.if \imm >= 4096
        cmp             \reg, #((\imm)/4096), lsl #12
.else
        cmp             \reg, #(\imm)
.endif
.endm
    933 
/* Generates the handler for one transform size (len >= 2048) inside
 * FFT_SPLIT_RADIX_FN: recurses into the two len/4 quarter transforms
 * via the subroutine at label 32, then recombines, then falls through
 * to \next if the local target (w20) is larger.
 *
 * x1 bookkeeping (all in bytes; one complex float = 8 bytes, so a
 * len-point transform spans len*8 = "len" units of 8 bytes below):
 *   - on entry x1 = base + len, because the previous (len/2) stage's
 *     combine loop advanced it by (len/2)/128 iterations * 8*32 bytes;
 *   - first add brings x1 to base + len*4 (start of quarter 3),
 *     accounting for that entry offset: len*4 - len;
 *   - each recursive call (target len/4 >= 512) nets +len/2 on x1
 *     from its own combine loop, so the second add is len*2 - len/2
 *     to land on base + len*6 (quarter 4);
 *   - after the second call x1 = base + len*6 + len/2, hence the
 *     ldr/sub of exactly that amount restores x1 = base. The constant
 *     is loaded from a literal pool (ldr =) as it can exceed mov range. */
.macro SR_TRANSFORM_DEF len, next=0
\len:
        stp             x20, x30, [sp, #-16]!
        mov             w20, #(\len/4)                    // recursion target
        mov             x5, #((\len*4) - (\len/1))
        add             x1, x1, x5
        bl              32b                               // quarter 3, len/4 points
        mov             x5, #((\len*2) - (\len/2))
        add             x1, x1, x5
        bl              32b                               // quarter 4, len/4 points
        ldp             x20, x30, [sp], #16
        ldr             w5, =(\len*6 + \len/2)
        sub             x1, x1, x5                        // back to base

        SETUP_SR_RECOMB \len, x7, x8, x9

.if \next\() != 0
        cmp_imm         w19, \len
        b.eq            0f                                // global target: deinterleave

        // Combine loop: each iteration handles 8*32 bytes per quarter,
        // net-advancing x1 by len bytes in total (callers rely on this).
        mov             w5, #(\len/128)
\len\()5:
        SR_COMBINE_FULL \len
        add             x1, x1, 8*32
        subs            w5, w5, 1
        b.gt            \len\()5b

        cmp_imm         w20, \len
        b.gt            \next\()f                         // keep growing
        ret
.endif
.endm
    966 
/* Generates the variable-size split-radix FFT entry point:
 *   ff_tx_fft_sr_<name>_neon(AVTXContext *x0, float *dst (x1),
 *                            float *src (x2), ptrdiff_t stride (x3))
 * The transform length is read from the context (w19 = global target);
 * w20 tracks the local target of the current recursion level. The code
 * is laid out as a chain of size handlers (32, 64, 128, ... 131072):
 * each handler either returns (local target reached), branches to the
 * deinterleave code (global target reached), or falls through/branches
 * to the next size. Label 32 doubles as a subroutine (bl 32b) for the
 * quarter-transform recursion; see SR_TRANSFORM_DEF for the x1
 * bookkeeping contract. v8-v15 and x19-x22 are saved per AAPCS64. */
.macro FFT_SPLIT_RADIX_FN name, no_perm
function ff_tx_fft_sr_\name\()_neon, export=1
        stp             x21, x22, [sp, #-16*6]!
        stp             d8,  d9,  [sp, #16*5]
        stp             d10, d11, [sp, #16*4]
        stp             d12, d13, [sp, #16*3]
        stp             d14, d15, [sp, #16*2]
        stp             x19, x20, [sp, #16]

        ldr             w19, [x0, #0] // global target
        mov             w20, w19      // local length

        LOAD_SUBADD
        SETUP_LUT       \no_perm

32:     // 32-point base case; also the recursion subroutine (bl 32b)
        SETUP_SR_RECOMB 32, x7, x8, x9

        LOAD_INPUT      0,  1,  2,  3,  x2, \no_perm
        LOAD_INPUT      4,  6,  5,  7,  x2, \no_perm, 1
        LOAD_INPUT      8,  9,  10, 11, x2, \no_perm
        LOAD_INPUT      12, 13, 14, 15, x2, \no_perm

        FFT8_X2         v8, v9, v10, v11, v12, v13, v14, v15
        FFT16           v0, v1, v2, v3, v4, v6, v5, v7

        SR_COMBINE      v0,  v1,  v2,  v3,  v4,  v6,  v5,  v7,  \
                        v8,  v9,  v10, v11, v12, v13, v14, v15, \
                        x7,  x8,  x9,  0

        // Odd-index vectors are always stored; even ones are kept live
        // in registers for the 64-point stage below.
        stp             q2,  q3,  [x1, #32*1]
        stp             q6,  q7,  [x1, #32*3]
        stp             q10, q11, [x1, #32*5]
        stp             q14, q15, [x1, #32*7]

        cmp             w20, #32
        b.gt            64f

        stp             q0,  q1,  [x1, #32*0]
        stp             q4,  q5,  [x1, #32*2]
        stp             q8,  q9,  [x1, #32*4]
        stp             q12, q13, [x1, #32*6]

        ret
64:     // 64-point stage: two more 16-point FFTs + recombination
        SETUP_SR_RECOMB 64, x7, x8, x9

        LOAD_INPUT      2,  3,  10, 11, x2, \no_perm, 1
        LOAD_INPUT      6,  14, 7,  15, x2, \no_perm, 1

        FFT16           v2, v3, v10, v11, v6, v14, v7, v15

        LOAD_INPUT      16, 17, 18, 19, x2, \no_perm
        LOAD_INPUT      20, 22, 21, 23, x2, \no_perm, 1

        FFT16           v16, v17, v18, v19, v20, v22, v21, v23, \
                        v24, v25, v26, v27, v28, v29, v30

        // Load the 64-point twiddles (v24-v27) for the LITE/HALF combines.
        ld1             { v26.4s, v27.4s }, [x8], x9
        ldp             q24, q25, [x7], #32

        ext             v26.16b, v26.16b, v26.16b, #8
        ext             v27.16b, v27.16b, v27.16b, #8

        cmp             w19, #64
        b.eq            2f // custom deinterleave

        // TODO: investigate doing the 2 combines like in deinterleave
        // TODO: experiment with spilling to gprs and converting to HALF or full
        SR_COMBINE_LITE v0,  v1,  v8,  v9,  \
                        v2,  v3,  v16, v17, \
                        v24, v25, v26, v27, \
                        v28, v29, v30, 0

        stp             q0,  q1,  [x1, #32* 0]
        stp             q8,  q9,  [x1, #32* 4]
        stp             q2,  q3,  [x1, #32* 8]
        stp             q16, q17, [x1, #32*12]

        SR_COMBINE_HALF v4,  v5,  v12, v13, \
                        v6,  v7,  v20, v21, \
                        v24, v25, v26, v27, \
                        v28, v29, v30, v0, v1, v8, 1

        stp             q4,  q20, [x1, #32* 2]
        stp             q12, q21, [x1, #32* 6]
        stp             q6,  q5,  [x1, #32*10]
        stp             q7,  q13, [x1, #32*14]

        // Reload the odd-index vectors stored by the 32-point stage.
        ldp             q2,  q3,  [x1, #32*1]
        ldp             q6,  q7,  [x1, #32*3]
        ldp             q12, q13, [x1, #32*5]
        ldp             q16, q17, [x1, #32*7]

        SR_COMBINE      v2,  v3,  v12, v13, v6,  v16, v7,  v17, \
                        v10, v11, v14, v15, v18, v19, v22, v23, \
                        x7,  x8,  x9,  0, \
                        v24, v25, v26, v27, v28, v29, v30, v8, v0, v1, v4, v5

        stp             q2,  q3,  [x1, #32* 1]
        stp             q6,  q7,  [x1, #32* 3]
        stp             q12, q13, [x1, #32* 5]
        stp             q16, q17, [x1, #32* 7]

        stp             q10, q11, [x1, #32* 9]
        stp             q18, q19, [x1, #32*11]
        stp             q14, q15, [x1, #32*13]
        stp             q22, q23, [x1, #32*15]

        cmp             w20, #64
        b.gt            128f
        ret
128:    // 128-point: two 32-point quarters via bl 32b, then combine
        stp             x20, x30, [sp, #-16]!
        mov             w20, #32
        add             x1, x1, #16*32            // quarter 3
        bl              32b
        add             x1, x1, #8*32             // quarter 4
        bl              32b
        ldp             x20, x30, [sp], #16
        sub             x1, x1, #24*32            // back to base

        SETUP_SR_RECOMB 128, x7, x8, x9

        cmp             w19, #128
        b.eq            0f

        SR_COMBINE_FULL 128

        cmp             w20, #128
        b.gt            256f
        ret
256:    // 256-point: two 64-point quarters, two offset combines
        stp             x20, x30, [sp, #-16]!
        mov             w20, #64
        add             x1, x1, #32*32
        bl              32b
        add             x1, x1, #16*32
        bl              32b
        ldp             x20, x30, [sp], #16
        sub             x1, x1, #48*32

        SETUP_SR_RECOMB 256, x7, x8, x9

        cmp             w19, #256
        b.eq            0f

        SR_COMBINE_FULL 256
        SR_COMBINE_FULL 256, 8*32

        cmp             w20, #256
        b.gt            512f
        ret
512:    // 512-point: combine loop advances x1 (net +32*32; callers rely on it)
        stp             x20, x30, [sp, #-16]!
        mov             w20, #128
        add             x1, x1, #64*32
        bl              32b
        add             x1, x1, #32*32
        bl              32b
        ldp             x20, x30, [sp], #16
        sub             x1, x1, #96*32

        SETUP_SR_RECOMB 512, x7, x8, x9

        cmp             w19, #512
        b.eq            0f

        mov             x5, 4
5125:
        SR_COMBINE_FULL 512
        add             x1, x1, 8*32
        subs            w5, w5, 1
        b.gt            5125b

        cmp             w20, #512
        b.gt            1024f

        ret
1024:   // 1024-point; entry x1 = base + 32*32 (left over from the 512 loop)
        stp             x20, x30, [sp, #-16]!
        mov             w20, #256
        add             x1, x1, #96*32            // base + 128*32 = quarter 3
        bl              32b
        add             x1, x1, #64*32            // base + 192*32 = quarter 4
        bl              32b
        ldp             x20, x30, [sp], #16
        mov             x5, #192*32               // too large for an add/sub immediate
        sub             x1, x1, x5

        SETUP_SR_RECOMB 1024, x7, x8, x9

        cmp             w19, #1024
        b.eq            0f

        mov             w5, 8
10245:
        SR_COMBINE_FULL 1024
        add             x1, x1, 8*32
        subs            w5, w5, 1
        b.gt            10245b

        cmp             w20, #1024
        b.gt            2048f

        ret

// Larger sizes share the same structure and are macro-generated;
// each links to the next so fall-through chaining keeps working.
SR_TRANSFORM_DEF        2048, 4096
SR_TRANSFORM_DEF        4096, 8192
SR_TRANSFORM_DEF        8192, 16384
SR_TRANSFORM_DEF        16384, 32768
SR_TRANSFORM_DEF        32768, 65536
SR_TRANSFORM_DEF        65536, 131072
SR_TRANSFORM_DEF        131072

0: // general deinterleave loop
        SR_COMBINE_DINT
        add             x1, x1, #32*8
        subs            w19, w19, #32*4           // w19 counts down remaining points
        b.gt            0b

        ldp             x19, x20, [sp, #16]
        ldp             d14, d15, [sp, #16*2]
        ldp             d12, d13, [sp, #16*3]
        ldp             d10, d11, [sp, #16*4]
        ldp             d8,  d9,  [sp, #16*5]
        ldp             x21, x22, [sp], #16*6

        ret

2: // special case for 64 point deinterleave
        // v23 is needed both as combine input and scratch; park it in GPRs.
        mov             x10, v23.d[0]
        mov             x11, v23.d[1]

        SR_COMBINE_LITE v0,  v1,  v8,  v9,  \
                        v2,  v3,  v16, v17, \
                        v24, v25, v26, v27, \
                        v28, v29, v30, 0

        SR_COMBINE_HALF v4,  v5,  v12, v13, \
                        v6,  v7,  v20, v21, \
                        v24, v25, v26, v27, \
                        v28, v29, v30, v23, v24, v26, 1

        zip1            v23.2d, v0.2d, v4.2d
        zip2            v24.2d, v0.2d, v4.2d
        zip1            v25.2d, v1.2d, v20.2d
        zip2            v26.2d, v1.2d, v20.2d

        zip1            v27.2d, v8.2d, v12.2d
        zip2            v28.2d, v8.2d, v12.2d
        zip1            v29.2d, v9.2d, v21.2d
        zip2            v30.2d, v9.2d, v21.2d

        mov             v20.16b, v5.16b
        mov             v21.16b, v7.16b
        mov             x12, x1
        add             x13, x1, #32* 4
        add             x14, x1, #32* 8
        add             x15, x1, #32*12

        zip1            v4.2d,  v2.2d, v6.2d
        zip2            v5.2d,  v2.2d, v6.2d
        zip1            v6.2d,  v3.2d, v20.2d
        zip2            v7.2d,  v3.2d, v20.2d

        zip1            v0.2d, v16.2d, v21.2d
        zip2            v1.2d, v16.2d, v21.2d
        zip1            v2.2d, v17.2d, v13.2d
        zip2            v3.2d, v17.2d, v13.2d

        // stp is faster by a little on A53, but this is faster on M1s (theory)
        ldp             q8,  q9, [x1, #32*1]
        ldp             q12, q13, [x1, #32*5]

        st1             { v23.4s, v24.4s, v25.4s, v26.4s }, [x12], #64  // 32* 0...1
        st1             { v27.4s, v28.4s, v29.4s, v30.4s }, [x13], #64  // 32* 4...5
        st1             {  v4.4s,  v5.4s,  v6.4s,  v7.4s }, [x14], #64  // 32* 8...9
        st1             {  v0.4s,  v1.4s,  v2.4s,  v3.4s }, [x15], #64  // 32*12..13

        // Restore v23 from the GPR spill for the final SR_COMBINE.
        mov             v23.d[0], x10
        mov             v23.d[1], x11

        ldp             q6,  q7, [x1, #32*3]
        ldp             q16, q17, [x1, #32*7]

        SR_COMBINE      v8,  v9,  v12, v13, v6,  v16, v7,  v17, \
                        v10, v11, v14, v15, v18, v19, v22, v23, \
                        x7,  x8,  x9,  0, \
                        v24, v25, v26, v27, v28, v29, v30, v4, v0, v1, v5, v20

        zip1            v0.2d, v8.2d, v6.2d
        zip2            v1.2d, v8.2d, v6.2d
        zip1            v2.2d, v9.2d, v7.2d
        zip2            v3.2d, v9.2d, v7.2d
        st1             {  v0.4s,  v1.4s,  v2.4s,  v3.4s }, [x12]

        zip1            v4.2d, v12.2d, v16.2d
        zip2            v5.2d, v12.2d, v16.2d
        zip1            v6.2d, v13.2d, v17.2d
        zip2            v7.2d, v13.2d, v17.2d
        st1             {  v4.4s,  v5.4s,  v6.4s,  v7.4s }, [x13]

        zip1            v0.2d, v10.2d, v18.2d
        zip2            v1.2d, v10.2d, v18.2d
        zip1            v2.2d, v11.2d, v19.2d
        zip2            v3.2d, v11.2d, v19.2d
        st1             {  v0.4s,  v1.4s,  v2.4s,  v3.4s }, [x14]

        zip1            v4.2d, v14.2d, v22.2d
        zip2            v5.2d, v14.2d, v22.2d
        zip1            v6.2d, v15.2d, v23.2d
        zip2            v7.2d, v15.2d, v23.2d
        st1             {  v4.4s,  v5.4s,  v6.4s,  v7.4s }, [x15]

        ldp             x19, x20, [sp, #16]
        ldp             d14, d15, [sp, #16*2]
        ldp             d12, d13, [sp, #16*3]
        ldp             d10, d11, [sp, #16*4]
        ldp             d8,  d9,  [sp, #16*5]
        ldp             x21, x22, [sp], #16*6

        ret
endfunc
.endm
   1292 
// Instantiate the variable-size split-radix FFT: generic (permuted
// input) and "ns" no-permute variant.
FFT_SPLIT_RADIX_FN float, 0
FFT_SPLIT_RADIX_FN ns_float, 1