tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

tx_template.c (91045B)


      1 /*
      2 * Copyright (c) Lynne
      3 *
      4 * Power of two FFT:
      5 * Copyright (c) Lynne
      6 * Copyright (c) 2008 Loren Merritt
      7 * Copyright (c) 2002 Fabrice Bellard
      8 * Partly based on libdjbfft by D. J. Bernstein
      9 *
     10 * This file is part of FFmpeg.
     11 *
     12 * FFmpeg is free software; you can redistribute it and/or
     13 * modify it under the terms of the GNU Lesser General Public
     14 * License as published by the Free Software Foundation; either
     15 * version 2.1 of the License, or (at your option) any later version.
     16 *
     17 * FFmpeg is distributed in the hope that it will be useful,
     18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     20 * Lesser General Public License for more details.
     21 *
     22 * You should have received a copy of the GNU Lesser General Public
     23 * License along with FFmpeg; if not, write to the Free Software
     24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     25 */
     26 
     27 #include "mem.h"
     28 
     29 #define TABLE_DEF(name, size) \
     30    DECLARE_ALIGNED(32, TXSample, TX_TAB(ff_tx_tab_ ##name))[size]
     31 
     32 #define SR_POW2_TABLES \
     33    SR_TABLE(8)        \
     34    SR_TABLE(16)       \
     35    SR_TABLE(32)       \
     36    SR_TABLE(64)       \
     37    SR_TABLE(128)      \
     38    SR_TABLE(256)      \
     39    SR_TABLE(512)      \
     40    SR_TABLE(1024)     \
     41    SR_TABLE(2048)     \
     42    SR_TABLE(4096)     \
     43    SR_TABLE(8192)     \
     44    SR_TABLE(16384)    \
     45    SR_TABLE(32768)    \
     46    SR_TABLE(65536)    \
     47    SR_TABLE(131072)   \
     48    SR_TABLE(262144)   \
     49    SR_TABLE(524288)   \
     50    SR_TABLE(1048576)   \
     51    SR_TABLE(2097152)   \
     52 
     53 #define SR_TABLE(len) \
     54    TABLE_DEF(len, len/4 + 1);
     55 /* Power of two tables */
     56 SR_POW2_TABLES
     57 #undef SR_TABLE
     58 
     59 /* Other factors' tables */
     60 TABLE_DEF(53, 12);
     61 TABLE_DEF( 7,  6);
     62 TABLE_DEF( 9,  8);
     63 
     64 typedef struct FFTabInitData {
     65    void (*func)(void);
     66    int factors[TX_MAX_SUB]; /* Must be sorted high -> low */
     67 } FFTabInitData;
     68 
     69 #define SR_TABLE(len)                                              \
     70 static av_cold void TX_TAB(ff_tx_init_tab_ ##len)(void)            \
     71 {                                                                  \
     72    double freq = 2*M_PI/len;                                      \
     73    TXSample *tab = TX_TAB(ff_tx_tab_ ##len);                      \
     74                                                                   \
     75    for (int i = 0; i < len/4; i++)                                \
     76        *tab++ = RESCALE(cos(i*freq));                             \
     77                                                                   \
     78    *tab = 0;                                                      \
     79 }
     80 SR_POW2_TABLES
     81 #undef SR_TABLE
     82 
     83 static void (*const sr_tabs_init_funcs[])(void) = {
     84 #define SR_TABLE(len) TX_TAB(ff_tx_init_tab_ ##len),
     85    SR_POW2_TABLES
     86 #undef SR_TABLE
     87 };
     88 
     89 static AVOnce sr_tabs_init_once[] = {
     90 #define SR_TABLE(len) AV_ONCE_INIT,
     91    SR_POW2_TABLES
     92 #undef SR_TABLE
     93 };
     94 
     95 static av_cold void TX_TAB(ff_tx_init_tab_53)(void)
     96 {
     97    /* 5pt, doubled to eliminate AVX lane shuffles */
     98    TX_TAB(ff_tx_tab_53)[0] = RESCALE(cos(2 * M_PI /  5));
     99    TX_TAB(ff_tx_tab_53)[1] = RESCALE(cos(2 * M_PI /  5));
    100    TX_TAB(ff_tx_tab_53)[2] = RESCALE(cos(2 * M_PI / 10));
    101    TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(2 * M_PI / 10));
    102    TX_TAB(ff_tx_tab_53)[4] = RESCALE(sin(2 * M_PI /  5));
    103    TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(2 * M_PI /  5));
    104    TX_TAB(ff_tx_tab_53)[6] = RESCALE(sin(2 * M_PI / 10));
    105    TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(2 * M_PI / 10));
    106 
    107    /* 3pt */
    108    TX_TAB(ff_tx_tab_53)[ 8] = RESCALE(cos(2 * M_PI / 12));
    109    TX_TAB(ff_tx_tab_53)[ 9] = RESCALE(cos(2 * M_PI / 12));
    110    TX_TAB(ff_tx_tab_53)[10] = RESCALE(cos(2 * M_PI /  6));
    111    TX_TAB(ff_tx_tab_53)[11] = RESCALE(cos(8 * M_PI /  6));
    112 }
    113 
    114 static av_cold void TX_TAB(ff_tx_init_tab_7)(void)
    115 {
    116    TX_TAB(ff_tx_tab_7)[0] = RESCALE(cos(2 * M_PI /  7));
    117    TX_TAB(ff_tx_tab_7)[1] = RESCALE(sin(2 * M_PI /  7));
    118    TX_TAB(ff_tx_tab_7)[2] = RESCALE(sin(2 * M_PI / 28));
    119    TX_TAB(ff_tx_tab_7)[3] = RESCALE(cos(2 * M_PI / 28));
    120    TX_TAB(ff_tx_tab_7)[4] = RESCALE(cos(2 * M_PI / 14));
    121    TX_TAB(ff_tx_tab_7)[5] = RESCALE(sin(2 * M_PI / 14));
    122 }
    123 
    124 static av_cold void TX_TAB(ff_tx_init_tab_9)(void)
    125 {
    126    TX_TAB(ff_tx_tab_9)[0] = RESCALE(cos(2 * M_PI /  3));
    127    TX_TAB(ff_tx_tab_9)[1] = RESCALE(sin(2 * M_PI /  3));
    128    TX_TAB(ff_tx_tab_9)[2] = RESCALE(cos(2 * M_PI /  9));
    129    TX_TAB(ff_tx_tab_9)[3] = RESCALE(sin(2 * M_PI /  9));
    130    TX_TAB(ff_tx_tab_9)[4] = RESCALE(cos(2 * M_PI / 36));
    131    TX_TAB(ff_tx_tab_9)[5] = RESCALE(sin(2 * M_PI / 36));
    132    TX_TAB(ff_tx_tab_9)[6] = TX_TAB(ff_tx_tab_9)[2] + TX_TAB(ff_tx_tab_9)[5];
    133    TX_TAB(ff_tx_tab_9)[7] = TX_TAB(ff_tx_tab_9)[3] - TX_TAB(ff_tx_tab_9)[4];
    134 }
    135 
    136 static const FFTabInitData nptwo_tabs_init_data[] = {
    137    { TX_TAB(ff_tx_init_tab_53),      { 15, 5, 3 } },
    138    { TX_TAB(ff_tx_init_tab_9),       {  9 }       },
    139    { TX_TAB(ff_tx_init_tab_7),       {  7 }       },
    140 };
    141 
    142 static AVOnce nptwo_tabs_init_once[] = {
    143    AV_ONCE_INIT,
    144    AV_ONCE_INIT,
    145    AV_ONCE_INIT,
    146 };
    147 
    148 av_cold void TX_TAB(ff_tx_init_tabs)(int len)
    149 {
    150    int factor_2 = ff_ctz(len);
    151    if (factor_2) {
    152        int idx = factor_2 - 3;
    153        for (int i = 0; i <= idx; i++)
    154            ff_thread_once(&sr_tabs_init_once[i],
    155                            sr_tabs_init_funcs[i]);
    156        len >>= factor_2;
    157    }
    158 
    159    for (int i = 0; i < FF_ARRAY_ELEMS(nptwo_tabs_init_data); i++) {
    160        int f, f_idx = 0;
    161 
    162        if (len <= 1)
    163            return;
    164 
    165        while ((f = nptwo_tabs_init_data[i].factors[f_idx++])) {
    166            if (f % len)
    167                continue;
    168 
    169            ff_thread_once(&nptwo_tabs_init_once[i],
    170                            nptwo_tabs_init_data[i].func);
    171            len /= f;
    172            break;
    173        }
    174    }
    175 }
    176 
    177 static av_always_inline void fft3(TXComplex *out, TXComplex *in,
    178                                  ptrdiff_t stride)
    179 {
    180    TXComplex tmp[3];
    181    const TXSample *tab = TX_TAB(ff_tx_tab_53);
    182 #ifdef TX_INT32
    183    int64_t mtmp[4];
    184 #endif
    185 
    186    tmp[0] = in[0];
    187    BF(tmp[1].re, tmp[2].im, in[1].im, in[2].im);
    188    BF(tmp[1].im, tmp[2].re, in[1].re, in[2].re);
    189 
    190 #ifdef TX_INT32
    191    out[0*stride].re = (int64_t)tmp[0].re + tmp[2].re;
    192    out[0*stride].im = (int64_t)tmp[0].im + tmp[2].im;
    193    mtmp[0] = (int64_t)tab[ 8] * tmp[1].re;
    194    mtmp[1] = (int64_t)tab[ 9] * tmp[1].im;
    195    mtmp[2] = (int64_t)tab[10] * tmp[2].re;
    196    mtmp[3] = (int64_t)tab[10] * tmp[2].im;
    197    out[1*stride].re = tmp[0].re - (mtmp[2] + mtmp[0] + 0x40000000 >> 31);
    198    out[1*stride].im = tmp[0].im - (mtmp[3] - mtmp[1] + 0x40000000 >> 31);
    199    out[2*stride].re = tmp[0].re - (mtmp[2] - mtmp[0] + 0x40000000 >> 31);
    200    out[2*stride].im = tmp[0].im - (mtmp[3] + mtmp[1] + 0x40000000 >> 31);
    201 #else
    202    out[0*stride].re = tmp[0].re + tmp[2].re;
    203    out[0*stride].im = tmp[0].im + tmp[2].im;
    204    tmp[1].re = tab[ 8] * tmp[1].re;
    205    tmp[1].im = tab[ 9] * tmp[1].im;
    206    tmp[2].re = tab[10] * tmp[2].re;
    207    tmp[2].im = tab[10] * tmp[2].im;
    208    out[1*stride].re = tmp[0].re - tmp[2].re + tmp[1].re;
    209    out[1*stride].im = tmp[0].im - tmp[2].im - tmp[1].im;
    210    out[2*stride].re = tmp[0].re - tmp[2].re - tmp[1].re;
    211    out[2*stride].im = tmp[0].im - tmp[2].im + tmp[1].im;
    212 #endif
    213 }
    214 
    215 #define DECL_FFT5(NAME, D0, D1, D2, D3, D4)                         \
    216 static av_always_inline void NAME(TXComplex *out, TXComplex *in,    \
    217                                  ptrdiff_t stride)                 \
    218 {                                                                   \
    219    TXComplex dc, z0[4], t[6];                                      \
    220    const TXSample *tab = TX_TAB(ff_tx_tab_53);                     \
    221                                                                    \
    222    dc = in[0];                                                     \
    223    BF(t[1].im, t[0].re, in[1].re, in[4].re);                       \
    224    BF(t[1].re, t[0].im, in[1].im, in[4].im);                       \
    225    BF(t[3].im, t[2].re, in[2].re, in[3].re);                       \
    226    BF(t[3].re, t[2].im, in[2].im, in[3].im);                       \
    227                                                                    \
    228    out[D0*stride].re = dc.re + (TXUSample)t[0].re + t[2].re;        \
    229    out[D0*stride].im = dc.im + (TXUSample)t[0].im + t[2].im;        \
    230                                                                    \
    231    SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re);       \
    232    SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im);       \
    233    CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re);       \
    234    CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im);       \
    235                                                                    \
    236    BF(z0[0].re, z0[3].re, t[0].re, t[1].re);                       \
    237    BF(z0[0].im, z0[3].im, t[0].im, t[1].im);                       \
    238    BF(z0[2].re, z0[1].re, t[4].re, t[5].re);                       \
    239    BF(z0[2].im, z0[1].im, t[4].im, t[5].im);                       \
    240                                                                    \
    241    out[D1*stride].re = dc.re + (TXUSample)z0[3].re;                 \
    242    out[D1*stride].im = dc.im + (TXUSample)z0[0].im;                 \
    243    out[D2*stride].re = dc.re + (TXUSample)z0[2].re;                 \
    244    out[D2*stride].im = dc.im + (TXUSample)z0[1].im;                 \
    245    out[D3*stride].re = dc.re + (TXUSample)z0[1].re;                 \
    246    out[D3*stride].im = dc.im + (TXUSample)z0[2].im;                 \
    247    out[D4*stride].re = dc.re + (TXUSample)z0[0].re;                 \
    248    out[D4*stride].im = dc.im + (TXUSample)z0[3].im;                 \
    249 }
    250 
    251 DECL_FFT5(fft5,     0,  1,  2,  3,  4)
    252 DECL_FFT5(fft5_m1,  0,  6, 12,  3,  9)
    253 DECL_FFT5(fft5_m2, 10,  1,  7, 13,  4)
    254 DECL_FFT5(fft5_m3,  5, 11,  2,  8, 14)
    255 
    256 static av_always_inline void fft7(TXComplex *out, TXComplex *in,
    257                                  ptrdiff_t stride)
    258 {
    259    TXComplex dc, t[6], z[3];
    260    const TXComplex *tab = (const TXComplex *)TX_TAB(ff_tx_tab_7);
    261 #ifdef TX_INT32
    262    int64_t mtmp[12];
    263 #endif
    264 
    265    dc = in[0];
    266    BF(t[1].re, t[0].re, in[1].re, in[6].re);
    267    BF(t[1].im, t[0].im, in[1].im, in[6].im);
    268    BF(t[3].re, t[2].re, in[2].re, in[5].re);
    269    BF(t[3].im, t[2].im, in[2].im, in[5].im);
    270    BF(t[5].re, t[4].re, in[3].re, in[4].re);
    271    BF(t[5].im, t[4].im, in[3].im, in[4].im);
    272 
    273    out[0*stride].re = dc.re + t[0].re + t[2].re + t[4].re;
    274    out[0*stride].im = dc.im + t[0].im + t[2].im + t[4].im;
    275 
    276 #ifdef TX_INT32 /* NOTE: it's possible to do this with 16 mults but 72 adds */
    277    mtmp[ 0] = ((int64_t)tab[0].re)*t[0].re - ((int64_t)tab[2].re)*t[4].re;
    278    mtmp[ 1] = ((int64_t)tab[0].re)*t[4].re - ((int64_t)tab[1].re)*t[0].re;
    279    mtmp[ 2] = ((int64_t)tab[0].re)*t[2].re - ((int64_t)tab[2].re)*t[0].re;
    280    mtmp[ 3] = ((int64_t)tab[0].re)*t[0].im - ((int64_t)tab[1].re)*t[2].im;
    281    mtmp[ 4] = ((int64_t)tab[0].re)*t[4].im - ((int64_t)tab[1].re)*t[0].im;
    282    mtmp[ 5] = ((int64_t)tab[0].re)*t[2].im - ((int64_t)tab[2].re)*t[0].im;
    283 
    284    mtmp[ 6] = ((int64_t)tab[2].im)*t[1].im + ((int64_t)tab[1].im)*t[5].im;
    285    mtmp[ 7] = ((int64_t)tab[0].im)*t[5].im + ((int64_t)tab[2].im)*t[3].im;
    286    mtmp[ 8] = ((int64_t)tab[2].im)*t[5].im + ((int64_t)tab[1].im)*t[3].im;
    287    mtmp[ 9] = ((int64_t)tab[0].im)*t[1].re + ((int64_t)tab[1].im)*t[3].re;
    288    mtmp[10] = ((int64_t)tab[2].im)*t[3].re + ((int64_t)tab[0].im)*t[5].re;
    289    mtmp[11] = ((int64_t)tab[2].im)*t[1].re + ((int64_t)tab[1].im)*t[5].re;
    290 
    291    z[0].re = (int32_t)(mtmp[ 0] - ((int64_t)tab[1].re)*t[2].re + 0x40000000 >> 31);
    292    z[1].re = (int32_t)(mtmp[ 1] - ((int64_t)tab[2].re)*t[2].re + 0x40000000 >> 31);
    293    z[2].re = (int32_t)(mtmp[ 2] - ((int64_t)tab[1].re)*t[4].re + 0x40000000 >> 31);
    294    z[0].im = (int32_t)(mtmp[ 3] - ((int64_t)tab[2].re)*t[4].im + 0x40000000 >> 31);
    295    z[1].im = (int32_t)(mtmp[ 4] - ((int64_t)tab[2].re)*t[2].im + 0x40000000 >> 31);
    296    z[2].im = (int32_t)(mtmp[ 5] - ((int64_t)tab[1].re)*t[4].im + 0x40000000 >> 31);
    297 
    298    t[0].re = (int32_t)(mtmp[ 6] - ((int64_t)tab[0].im)*t[3].im + 0x40000000 >> 31);
    299    t[2].re = (int32_t)(mtmp[ 7] - ((int64_t)tab[1].im)*t[1].im + 0x40000000 >> 31);
    300    t[4].re = (int32_t)(mtmp[ 8] + ((int64_t)tab[0].im)*t[1].im + 0x40000000 >> 31);
    301    t[0].im = (int32_t)(mtmp[ 9] + ((int64_t)tab[2].im)*t[5].re + 0x40000000 >> 31);
    302    t[2].im = (int32_t)(mtmp[10] - ((int64_t)tab[1].im)*t[1].re + 0x40000000 >> 31);
    303    t[4].im = (int32_t)(mtmp[11] - ((int64_t)tab[0].im)*t[3].re + 0x40000000 >> 31);
    304 #else
    305    z[0].re = tab[0].re*t[0].re - tab[2].re*t[4].re - tab[1].re*t[2].re;
    306    z[1].re = tab[0].re*t[4].re - tab[1].re*t[0].re - tab[2].re*t[2].re;
    307    z[2].re = tab[0].re*t[2].re - tab[2].re*t[0].re - tab[1].re*t[4].re;
    308    z[0].im = tab[0].re*t[0].im - tab[1].re*t[2].im - tab[2].re*t[4].im;
    309    z[1].im = tab[0].re*t[4].im - tab[1].re*t[0].im - tab[2].re*t[2].im;
    310    z[2].im = tab[0].re*t[2].im - tab[2].re*t[0].im - tab[1].re*t[4].im;
    311 
    312    /* It's possible to do t[4].re and t[0].im with 2 multiplies only by
    313     * multiplying the sum of all with the average of the twiddles */
    314 
    315    t[0].re = tab[2].im*t[1].im + tab[1].im*t[5].im - tab[0].im*t[3].im;
    316    t[2].re = tab[0].im*t[5].im + tab[2].im*t[3].im - tab[1].im*t[1].im;
    317    t[4].re = tab[2].im*t[5].im + tab[1].im*t[3].im + tab[0].im*t[1].im;
    318    t[0].im = tab[0].im*t[1].re + tab[1].im*t[3].re + tab[2].im*t[5].re;
    319    t[2].im = tab[2].im*t[3].re + tab[0].im*t[5].re - tab[1].im*t[1].re;
    320    t[4].im = tab[2].im*t[1].re + tab[1].im*t[5].re - tab[0].im*t[3].re;
    321 #endif
    322 
    323    BF(t[1].re, z[0].re, z[0].re, t[4].re);
    324    BF(t[3].re, z[1].re, z[1].re, t[2].re);
    325    BF(t[5].re, z[2].re, z[2].re, t[0].re);
    326    BF(t[1].im, z[0].im, z[0].im, t[0].im);
    327    BF(t[3].im, z[1].im, z[1].im, t[2].im);
    328    BF(t[5].im, z[2].im, z[2].im, t[4].im);
    329 
    330    out[1*stride].re = dc.re + z[0].re;
    331    out[1*stride].im = dc.im + t[1].im;
    332    out[2*stride].re = dc.re + t[3].re;
    333    out[2*stride].im = dc.im + z[1].im;
    334    out[3*stride].re = dc.re + z[2].re;
    335    out[3*stride].im = dc.im + t[5].im;
    336    out[4*stride].re = dc.re + t[5].re;
    337    out[4*stride].im = dc.im + z[2].im;
    338    out[5*stride].re = dc.re + z[1].re;
    339    out[5*stride].im = dc.im + t[3].im;
    340    out[6*stride].re = dc.re + t[1].re;
    341    out[6*stride].im = dc.im + z[0].im;
    342 }
    343 
    344 static av_always_inline void fft9(TXComplex *out, TXComplex *in,
    345                                  ptrdiff_t stride)
    346 {
    347    const TXComplex *tab = (const TXComplex *)TX_TAB(ff_tx_tab_9);
    348    TXComplex dc, t[16], w[4], x[5], y[5], z[2];
    349 #ifdef TX_INT32
    350    int64_t mtmp[12];
    351 #endif
    352 
    353    dc = in[0];
    354    BF(t[1].re, t[0].re, in[1].re, in[8].re);
    355    BF(t[1].im, t[0].im, in[1].im, in[8].im);
    356    BF(t[3].re, t[2].re, in[2].re, in[7].re);
    357    BF(t[3].im, t[2].im, in[2].im, in[7].im);
    358    BF(t[5].re, t[4].re, in[3].re, in[6].re);
    359    BF(t[5].im, t[4].im, in[3].im, in[6].im);
    360    BF(t[7].re, t[6].re, in[4].re, in[5].re);
    361    BF(t[7].im, t[6].im, in[4].im, in[5].im);
    362 
    363    w[0].re = t[0].re - t[6].re;
    364    w[0].im = t[0].im - t[6].im;
    365    w[1].re = t[2].re - t[6].re;
    366    w[1].im = t[2].im - t[6].im;
    367    w[2].re = t[1].re - t[7].re;
    368    w[2].im = t[1].im - t[7].im;
    369    w[3].re = t[3].re + t[7].re;
    370    w[3].im = t[3].im + t[7].im;
    371 
    372    z[0].re = dc.re + t[4].re;
    373    z[0].im = dc.im + t[4].im;
    374 
    375    z[1].re = t[0].re + t[2].re + t[6].re;
    376    z[1].im = t[0].im + t[2].im + t[6].im;
    377 
    378    out[0*stride].re = z[0].re + z[1].re;
    379    out[0*stride].im = z[0].im + z[1].im;
    380 
    381 #ifdef TX_INT32
    382    mtmp[0] = t[1].re - t[3].re + t[7].re;
    383    mtmp[1] = t[1].im - t[3].im + t[7].im;
    384 
    385    y[3].re = (int32_t)(((int64_t)tab[0].im)*mtmp[0] + 0x40000000 >> 31);
    386    y[3].im = (int32_t)(((int64_t)tab[0].im)*mtmp[1] + 0x40000000 >> 31);
    387 
    388    mtmp[0] = (int32_t)(((int64_t)tab[0].re)*z[1].re + 0x40000000 >> 31);
    389    mtmp[1] = (int32_t)(((int64_t)tab[0].re)*z[1].im + 0x40000000 >> 31);
    390    mtmp[2] = (int32_t)(((int64_t)tab[0].re)*t[4].re + 0x40000000 >> 31);
    391    mtmp[3] = (int32_t)(((int64_t)tab[0].re)*t[4].im + 0x40000000 >> 31);
    392 
    393    x[3].re = z[0].re  + (int32_t)mtmp[0];
    394    x[3].im = z[0].im  + (int32_t)mtmp[1];
    395    z[0].re = in[0].re + (int32_t)mtmp[2];
    396    z[0].im = in[0].im + (int32_t)mtmp[3];
    397 
    398    mtmp[0] = ((int64_t)tab[1].re)*w[0].re;
    399    mtmp[1] = ((int64_t)tab[1].re)*w[0].im;
    400    mtmp[2] = ((int64_t)tab[2].im)*w[0].re;
    401    mtmp[3] = ((int64_t)tab[2].im)*w[0].im;
    402    mtmp[4] = ((int64_t)tab[1].im)*w[2].re;
    403    mtmp[5] = ((int64_t)tab[1].im)*w[2].im;
    404    mtmp[6] = ((int64_t)tab[2].re)*w[2].re;
    405    mtmp[7] = ((int64_t)tab[2].re)*w[2].im;
    406 
    407    x[1].re = (int32_t)(mtmp[0] + ((int64_t)tab[2].im)*w[1].re + 0x40000000 >> 31);
    408    x[1].im = (int32_t)(mtmp[1] + ((int64_t)tab[2].im)*w[1].im + 0x40000000 >> 31);
    409    x[2].re = (int32_t)(mtmp[2] - ((int64_t)tab[3].re)*w[1].re + 0x40000000 >> 31);
    410    x[2].im = (int32_t)(mtmp[3] - ((int64_t)tab[3].re)*w[1].im + 0x40000000 >> 31);
    411    y[1].re = (int32_t)(mtmp[4] + ((int64_t)tab[2].re)*w[3].re + 0x40000000 >> 31);
    412    y[1].im = (int32_t)(mtmp[5] + ((int64_t)tab[2].re)*w[3].im + 0x40000000 >> 31);
    413    y[2].re = (int32_t)(mtmp[6] - ((int64_t)tab[3].im)*w[3].re + 0x40000000 >> 31);
    414    y[2].im = (int32_t)(mtmp[7] - ((int64_t)tab[3].im)*w[3].im + 0x40000000 >> 31);
    415 
    416    y[0].re = (int32_t)(((int64_t)tab[0].im)*t[5].re + 0x40000000 >> 31);
    417    y[0].im = (int32_t)(((int64_t)tab[0].im)*t[5].im + 0x40000000 >> 31);
    418 
    419 #else
    420    y[3].re = tab[0].im*(t[1].re - t[3].re + t[7].re);
    421    y[3].im = tab[0].im*(t[1].im - t[3].im + t[7].im);
    422 
    423    x[3].re = z[0].re  + tab[0].re*z[1].re;
    424    x[3].im = z[0].im  + tab[0].re*z[1].im;
    425    z[0].re = dc.re + tab[0].re*t[4].re;
    426    z[0].im = dc.im + tab[0].re*t[4].im;
    427 
    428    x[1].re = tab[1].re*w[0].re + tab[2].im*w[1].re;
    429    x[1].im = tab[1].re*w[0].im + tab[2].im*w[1].im;
    430    x[2].re = tab[2].im*w[0].re - tab[3].re*w[1].re;
    431    x[2].im = tab[2].im*w[0].im - tab[3].re*w[1].im;
    432    y[1].re = tab[1].im*w[2].re + tab[2].re*w[3].re;
    433    y[1].im = tab[1].im*w[2].im + tab[2].re*w[3].im;
    434    y[2].re = tab[2].re*w[2].re - tab[3].im*w[3].re;
    435    y[2].im = tab[2].re*w[2].im - tab[3].im*w[3].im;
    436 
    437    y[0].re = tab[0].im*t[5].re;
    438    y[0].im = tab[0].im*t[5].im;
    439 #endif
    440 
    441    x[4].re = x[1].re + x[2].re;
    442    x[4].im = x[1].im + x[2].im;
    443 
    444    y[4].re = y[1].re - y[2].re;
    445    y[4].im = y[1].im - y[2].im;
    446    x[1].re = z[0].re + x[1].re;
    447    x[1].im = z[0].im + x[1].im;
    448    y[1].re = y[0].re + y[1].re;
    449    y[1].im = y[0].im + y[1].im;
    450    x[2].re = z[0].re + x[2].re;
    451    x[2].im = z[0].im + x[2].im;
    452    y[2].re = y[2].re - y[0].re;
    453    y[2].im = y[2].im - y[0].im;
    454    x[4].re = z[0].re - x[4].re;
    455    x[4].im = z[0].im - x[4].im;
    456    y[4].re = y[0].re - y[4].re;
    457    y[4].im = y[0].im - y[4].im;
    458 
    459    out[1*stride] = (TXComplex){ x[1].re + y[1].im, x[1].im - y[1].re };
    460    out[2*stride] = (TXComplex){ x[2].re + y[2].im, x[2].im - y[2].re };
    461    out[3*stride] = (TXComplex){ x[3].re + y[3].im, x[3].im - y[3].re };
    462    out[4*stride] = (TXComplex){ x[4].re + y[4].im, x[4].im - y[4].re };
    463    out[5*stride] = (TXComplex){ x[4].re - y[4].im, x[4].im + y[4].re };
    464    out[6*stride] = (TXComplex){ x[3].re - y[3].im, x[3].im + y[3].re };
    465    out[7*stride] = (TXComplex){ x[2].re - y[2].im, x[2].im + y[2].re };
    466    out[8*stride] = (TXComplex){ x[1].re - y[1].im, x[1].im + y[1].re };
    467 }
    468 
    469 static av_always_inline void fft15(TXComplex *out, TXComplex *in,
    470                                   ptrdiff_t stride)
    471 {
    472    TXComplex tmp[15];
    473 
    474    for (int i = 0; i < 5; i++)
    475        fft3(tmp + i, in + i*3, 5);
    476 
    477    fft5_m1(out, tmp +  0, stride);
    478    fft5_m2(out, tmp +  5, stride);
    479    fft5_m3(out, tmp + 10, stride);
    480 }
    481 
    482 static av_cold int TX_NAME(ff_tx_fft_factor_init)(AVTXContext *s,
    483                                                  const FFTXCodelet *cd,
    484                                                  uint64_t flags,
    485                                                  FFTXCodeletOptions *opts,
    486                                                  int len, int inv,
    487                                                  const void *scale)
    488 {
    489    int ret = 0;
    490    TX_TAB(ff_tx_init_tabs)(len);
    491 
    492    if (len == 15)
    493        ret = ff_tx_gen_pfa_input_map(s, opts, 3, 5);
    494    else if (flags & FF_TX_PRESHUFFLE)
    495        ret = ff_tx_gen_default_map(s, opts);
    496 
    497    return ret;
    498 }
    499 
    500 #define DECL_FACTOR_S(n)                                                       \
    501 static void TX_NAME(ff_tx_fft##n)(AVTXContext *s, void *dst,                   \
    502                                  void *src, ptrdiff_t stride)                 \
    503 {                                                                              \
    504    fft##n((TXComplex *)dst, (TXComplex *)src, stride / sizeof(TXComplex));    \
    505 }                                                                              \
    506 static const FFTXCodelet TX_NAME(ff_tx_fft##n##_ns_def) = {                    \
    507    .name       = TX_NAME_STR("fft" #n "_ns"),                                 \
    508    .function   = TX_NAME(ff_tx_fft##n),                                       \
    509    .type       = TX_TYPE(FFT),                                                \
    510    .flags      = AV_TX_INPLACE | FF_TX_OUT_OF_PLACE |                         \
    511                  AV_TX_UNALIGNED | FF_TX_PRESHUFFLE,                          \
    512    .factors[0] = n,                                                           \
    513    .nb_factors = 1,                                                           \
    514    .min_len    = n,                                                           \
    515    .max_len    = n,                                                           \
    516    .init       = TX_NAME(ff_tx_fft_factor_init),                              \
    517    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
    518    .prio       = FF_TX_PRIO_BASE,                                             \
    519 };
    520 
    521 #define DECL_FACTOR_F(n)                                                       \
    522 DECL_FACTOR_S(n)                                                               \
    523 static const FFTXCodelet TX_NAME(ff_tx_fft##n##_fwd_def) = {                   \
    524    .name       = TX_NAME_STR("fft" #n "_fwd"),                                \
    525    .function   = TX_NAME(ff_tx_fft##n),                                       \
    526    .type       = TX_TYPE(FFT),                                                \
    527    .flags      = AV_TX_INPLACE | FF_TX_OUT_OF_PLACE |                         \
    528                  AV_TX_UNALIGNED | FF_TX_FORWARD_ONLY,                        \
    529    .factors[0] = n,                                                           \
    530    .nb_factors = 1,                                                           \
    531    .min_len    = n,                                                           \
    532    .max_len    = n,                                                           \
    533    .init       = TX_NAME(ff_tx_fft_factor_init),                              \
    534    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
    535    .prio       = FF_TX_PRIO_BASE,                                             \
    536 };
    537 
    538 DECL_FACTOR_F(3)
    539 DECL_FACTOR_F(5)
    540 DECL_FACTOR_F(7)
    541 DECL_FACTOR_F(9)
    542 DECL_FACTOR_S(15)
    543 
    544 #define BUTTERFLIES(a0, a1, a2, a3)            \
    545    do {                                       \
    546        r0=a0.re;                              \
    547        i0=a0.im;                              \
    548        r1=a1.re;                              \
    549        i1=a1.im;                              \
    550        BF(t3, t5, t5, t1);                    \
    551        BF(a2.re, a0.re, r0, t5);              \
    552        BF(a3.im, a1.im, i1, t3);              \
    553        BF(t4, t6, t2, t6);                    \
    554        BF(a3.re, a1.re, r1, t4);              \
    555        BF(a2.im, a0.im, i0, t6);              \
    556    } while (0)
    557 
    558 #define TRANSFORM(a0, a1, a2, a3, wre, wim)    \
    559    do {                                       \
    560        CMUL(t1, t2, a2.re, a2.im, wre, -wim); \
    561        CMUL(t5, t6, a3.re, a3.im, wre,  wim); \
    562        BUTTERFLIES(a0, a1, a2, a3);           \
    563    } while (0)
    564 
    565 /* z[0...8n-1], w[1...2n-1] */
    566 static inline void TX_NAME(ff_tx_fft_sr_combine)(TXComplex *z,
    567                                                 const TXSample *cos, int len)
    568 {
    569    int o1 = 2*len;
    570    int o2 = 4*len;
    571    int o3 = 6*len;
    572    const TXSample *wim = cos + o1 - 7;
    573    TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
    574 
    575    for (int i = 0; i < len; i += 4) {
    576        TRANSFORM(z[0], z[o1 + 0], z[o2 + 0], z[o3 + 0], cos[0], wim[7]);
    577        TRANSFORM(z[2], z[o1 + 2], z[o2 + 2], z[o3 + 2], cos[2], wim[5]);
    578        TRANSFORM(z[4], z[o1 + 4], z[o2 + 4], z[o3 + 4], cos[4], wim[3]);
    579        TRANSFORM(z[6], z[o1 + 6], z[o2 + 6], z[o3 + 6], cos[6], wim[1]);
    580 
    581        TRANSFORM(z[1], z[o1 + 1], z[o2 + 1], z[o3 + 1], cos[1], wim[6]);
    582        TRANSFORM(z[3], z[o1 + 3], z[o2 + 3], z[o3 + 3], cos[3], wim[4]);
    583        TRANSFORM(z[5], z[o1 + 5], z[o2 + 5], z[o3 + 5], cos[5], wim[2]);
    584        TRANSFORM(z[7], z[o1 + 7], z[o2 + 7], z[o3 + 7], cos[7], wim[0]);
    585 
    586        z   += 2*4;
    587        cos += 2*4;
    588        wim -= 2*4;
    589    }
    590 }
    591 
    592 static av_cold int TX_NAME(ff_tx_fft_sr_codelet_init)(AVTXContext *s,
    593                                                      const FFTXCodelet *cd,
    594                                                      uint64_t flags,
    595                                                      FFTXCodeletOptions *opts,
    596                                                      int len, int inv,
    597                                                      const void *scale)
    598 {
    599    TX_TAB(ff_tx_init_tabs)(len);
    600    return ff_tx_gen_ptwo_revtab(s, opts);
    601 }
    602 
    603 #define DECL_SR_CODELET_DEF(n)                              \
    604 static const FFTXCodelet TX_NAME(ff_tx_fft##n##_ns_def) = { \
    605    .name       = TX_NAME_STR("fft" #n "_ns"),              \
    606    .function   = TX_NAME(ff_tx_fft##n##_ns),               \
    607    .type       = TX_TYPE(FFT),                             \
    608    .flags      = FF_TX_OUT_OF_PLACE | AV_TX_INPLACE |      \
    609                  AV_TX_UNALIGNED | FF_TX_PRESHUFFLE,       \
    610    .factors[0] = 2,                                        \
    611    .nb_factors = 1,                                        \
    612    .min_len    = n,                                        \
    613    .max_len    = n,                                        \
    614    .init       = TX_NAME(ff_tx_fft_sr_codelet_init),       \
    615    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                      \
    616    .prio       = FF_TX_PRIO_BASE,                          \
    617 };
    618 
    619 #define DECL_SR_CODELET(n, n2, n4)                                    \
    620 static void TX_NAME(ff_tx_fft##n##_ns)(AVTXContext *s, void *_dst,    \
    621                                        void *_src, ptrdiff_t stride) \
    622 {                                                                     \
    623    TXComplex *src = _src;                                            \
    624    TXComplex *dst = _dst;                                            \
    625    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
    626                                                                      \
    627    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
    628    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
    629    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
    630    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
    631 }                                                                     \
    632                                                                      \
    633 DECL_SR_CODELET_DEF(n)
    634 
    635 static void TX_NAME(ff_tx_fft2_ns)(AVTXContext *s, void *_dst,
    636                                   void *_src, ptrdiff_t stride)
    637 {
    638    TXComplex *src = _src;
    639    TXComplex *dst = _dst;
    640    TXComplex tmp;
    641 
    642    BF(tmp.re, dst[0].re, src[0].re, src[1].re);
    643    BF(tmp.im, dst[0].im, src[0].im, src[1].im);
    644    dst[1] = tmp;
    645 }
    646 
    647 static void TX_NAME(ff_tx_fft4_ns)(AVTXContext *s, void *_dst,
    648                                   void *_src, ptrdiff_t stride)
    649 {
    650    TXComplex *src = _src;
    651    TXComplex *dst = _dst;
    652    TXSample t1, t2, t3, t4, t5, t6, t7, t8;
    653 
    654    BF(t3, t1, src[0].re, src[1].re);
    655    BF(t8, t6, src[3].re, src[2].re);
    656    BF(dst[2].re, dst[0].re, t1, t6);
    657    BF(t4, t2, src[0].im, src[1].im);
    658    BF(t7, t5, src[2].im, src[3].im);
    659    BF(dst[3].im, dst[1].im, t4, t8);
    660    BF(dst[3].re, dst[1].re, t3, t7);
    661    BF(dst[2].im, dst[0].im, t2, t5);
    662 }
    663 
    664 static void TX_NAME(ff_tx_fft8_ns)(AVTXContext *s, void *_dst,
    665                                   void *_src, ptrdiff_t stride)
    666 {
    667    TXComplex *src = _src;
    668    TXComplex *dst = _dst;
    669    TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
    670    const TXSample cos = TX_TAB(ff_tx_tab_8)[1];
    671 
    672    TX_NAME(ff_tx_fft4_ns)(s, dst, src, stride);
    673 
    674    BF(t1, dst[5].re, src[4].re, -src[5].re);
    675    BF(t2, dst[5].im, src[4].im, -src[5].im);
    676    BF(t5, dst[7].re, src[6].re, -src[7].re);
    677    BF(t6, dst[7].im, src[6].im, -src[7].im);
    678 
    679    BUTTERFLIES(dst[0], dst[2], dst[4], dst[6]);
    680    TRANSFORM(dst[1], dst[3], dst[5], dst[7], cos, cos);
    681 }
    682 
    683 static void TX_NAME(ff_tx_fft16_ns)(AVTXContext *s, void *_dst,
    684                                    void *_src, ptrdiff_t stride)
    685 {
    686    TXComplex *src = _src;
    687    TXComplex *dst = _dst;
    688    const TXSample *cos = TX_TAB(ff_tx_tab_16);
    689 
    690    TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
    691    TXSample cos_16_1 = cos[1];
    692    TXSample cos_16_2 = cos[2];
    693    TXSample cos_16_3 = cos[3];
    694 
    695    TX_NAME(ff_tx_fft8_ns)(s, dst +  0, src +  0, stride);
    696    TX_NAME(ff_tx_fft4_ns)(s, dst +  8, src +  8, stride);
    697    TX_NAME(ff_tx_fft4_ns)(s, dst + 12, src + 12, stride);
    698 
    699    t1 = dst[ 8].re;
    700    t2 = dst[ 8].im;
    701    t5 = dst[12].re;
    702    t6 = dst[12].im;
    703    BUTTERFLIES(dst[0], dst[4], dst[8], dst[12]);
    704 
    705    TRANSFORM(dst[ 2], dst[ 6], dst[10], dst[14], cos_16_2, cos_16_2);
    706    TRANSFORM(dst[ 1], dst[ 5], dst[ 9], dst[13], cos_16_1, cos_16_3);
    707    TRANSFORM(dst[ 3], dst[ 7], dst[11], dst[15], cos_16_3, cos_16_1);
    708 }
    709 
    710 DECL_SR_CODELET_DEF(2)
    711 DECL_SR_CODELET_DEF(4)
    712 DECL_SR_CODELET_DEF(8)
    713 DECL_SR_CODELET_DEF(16)
    714 DECL_SR_CODELET(32,16,8)
    715 DECL_SR_CODELET(64,32,16)
    716 DECL_SR_CODELET(128,64,32)
    717 DECL_SR_CODELET(256,128,64)
    718 DECL_SR_CODELET(512,256,128)
    719 DECL_SR_CODELET(1024,512,256)
    720 DECL_SR_CODELET(2048,1024,512)
    721 DECL_SR_CODELET(4096,2048,1024)
    722 DECL_SR_CODELET(8192,4096,2048)
    723 DECL_SR_CODELET(16384,8192,4096)
    724 DECL_SR_CODELET(32768,16384,8192)
    725 DECL_SR_CODELET(65536,32768,16384)
    726 DECL_SR_CODELET(131072,65536,32768)
    727 DECL_SR_CODELET(262144,131072,65536)
    728 DECL_SR_CODELET(524288,262144,131072)
    729 DECL_SR_CODELET(1048576,524288,262144)
    730 DECL_SR_CODELET(2097152,1048576,524288)
    731 
    732 static av_cold int TX_NAME(ff_tx_fft_init)(AVTXContext *s,
    733                                           const FFTXCodelet *cd,
    734                                           uint64_t flags,
    735                                           FFTXCodeletOptions *opts,
    736                                           int len, int inv,
    737                                           const void *scale)
    738 {
    739    int ret;
    740    int is_inplace = !!(flags & AV_TX_INPLACE);
    741    FFTXCodeletOptions sub_opts = {
    742        .map_dir = is_inplace ? FF_TX_MAP_SCATTER : FF_TX_MAP_GATHER,
    743    };
    744 
    745    flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
    746    flags |=  AV_TX_INPLACE;      /* in-place */
    747    flags |=  FF_TX_PRESHUFFLE;   /* This function handles the permute step */
    748 
    749    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len, inv, scale)))
    750        return ret;
    751 
    752    if (is_inplace && (ret = ff_tx_gen_inplace_map(s, len)))
    753        return ret;
    754 
    755    return 0;
    756 }
    757 
    758 static av_cold int TX_NAME(ff_tx_fft_inplace_small_init)(AVTXContext *s,
    759                                                         const FFTXCodelet *cd,
    760                                                         uint64_t flags,
    761                                                         FFTXCodeletOptions *opts,
    762                                                         int len, int inv,
    763                                                         const void *scale)
    764 {
    765    if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
    766        return AVERROR(ENOMEM);
    767    flags &= ~AV_TX_INPLACE;
    768    return TX_NAME(ff_tx_fft_init)(s, cd, flags, opts, len, inv, scale);
    769 }
    770 
    771 static void TX_NAME(ff_tx_fft)(AVTXContext *s, void *_dst,
    772                               void *_src, ptrdiff_t stride)
    773 {
    774    TXComplex *src = _src;
    775    TXComplex *dst1 = s->flags & AV_TX_INPLACE ? s->tmp : _dst;
    776    TXComplex *dst2 = _dst;
    777    int *map = s->sub[0].map;
    778    int len = s->len;
    779 
    780    /* Compilers can't vectorize this anyway without assuming AVX2, which they
    781     * generally don't, at least without -march=native -mtune=native */
    782    for (int i = 0; i < len; i++)
    783        dst1[i] = src[map[i]];
    784 
    785    s->fn[0](&s->sub[0], dst2, dst1, stride);
    786 }
    787 
    788 static void TX_NAME(ff_tx_fft_inplace)(AVTXContext *s, void *_dst,
    789                                       void *_src, ptrdiff_t stride)
    790 {
    791    TXComplex *src = _src;
    792    TXComplex *dst = _dst;
    793    TXComplex tmp;
    794    const int *map = s->sub->map;
    795    const int *inplace_idx = s->map;
    796    int src_idx, dst_idx;
    797 
    798    src_idx = *inplace_idx++;
    799    do {
    800        tmp = src[src_idx];
    801        dst_idx = map[src_idx];
    802        do {
    803            FFSWAP(TXComplex, tmp, src[dst_idx]);
    804            dst_idx = map[dst_idx];
    805        } while (dst_idx != src_idx); /* Can be > as well, but was less predictable */
    806        src[dst_idx] = tmp;
    807    } while ((src_idx = *inplace_idx++));
    808 
    809    s->fn[0](&s->sub[0], dst, src, stride);
    810 }
    811 
    812 static const FFTXCodelet TX_NAME(ff_tx_fft_def) = {
    813    .name       = TX_NAME_STR("fft"),
    814    .function   = TX_NAME(ff_tx_fft),
    815    .type       = TX_TYPE(FFT),
    816    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE,
    817    .factors[0] = TX_FACTOR_ANY,
    818    .nb_factors = 1,
    819    .min_len    = 2,
    820    .max_len    = TX_LEN_UNLIMITED,
    821    .init       = TX_NAME(ff_tx_fft_init),
    822    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
    823    .prio       = FF_TX_PRIO_BASE,
    824 };
    825 
    826 static const FFTXCodelet TX_NAME(ff_tx_fft_inplace_small_def) = {
    827    .name       = TX_NAME_STR("fft_inplace_small"),
    828    .function   = TX_NAME(ff_tx_fft),
    829    .type       = TX_TYPE(FFT),
    830    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | AV_TX_INPLACE,
    831    .factors[0] = TX_FACTOR_ANY,
    832    .nb_factors = 1,
    833    .min_len    = 2,
    834    .max_len    = 65536,
    835    .init       = TX_NAME(ff_tx_fft_inplace_small_init),
    836    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
    837    .prio       = FF_TX_PRIO_BASE - 256,
    838 };
    839 
    840 static const FFTXCodelet TX_NAME(ff_tx_fft_inplace_def) = {
    841    .name       = TX_NAME_STR("fft_inplace"),
    842    .function   = TX_NAME(ff_tx_fft_inplace),
    843    .type       = TX_TYPE(FFT),
    844    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | AV_TX_INPLACE,
    845    .factors[0] = TX_FACTOR_ANY,
    846    .nb_factors = 1,
    847    .min_len    = 2,
    848    .max_len    = TX_LEN_UNLIMITED,
    849    .init       = TX_NAME(ff_tx_fft_init),
    850    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
    851    .prio       = FF_TX_PRIO_BASE - 512,
    852 };
    853 
    854 static av_cold int TX_NAME(ff_tx_fft_init_naive_small)(AVTXContext *s,
    855                                                       const FFTXCodelet *cd,
    856                                                       uint64_t flags,
    857                                                       FFTXCodeletOptions *opts,
    858                                                       int len, int inv,
    859                                                       const void *scale)
    860 {
    861    const double phase = s->inv ? 2.0*M_PI/len : -2.0*M_PI/len;
    862 
    863    if (!(s->exp = av_malloc(len*len*sizeof(*s->exp))))
    864        return AVERROR(ENOMEM);
    865 
    866    for (int i = 0; i < len; i++) {
    867        for (int j = 0; j < len; j++) {
    868            const double factor = phase*i*j;
    869            s->exp[i*j] = (TXComplex){
    870                RESCALE(cos(factor)),
    871                RESCALE(sin(factor)),
    872            };
    873        }
    874    }
    875 
    876    return 0;
    877 }
    878 
    879 static void TX_NAME(ff_tx_fft_naive)(AVTXContext *s, void *_dst, void *_src,
    880                                     ptrdiff_t stride)
    881 {
    882    TXComplex *src = _src;
    883    TXComplex *dst = _dst;
    884    const int n = s->len;
    885    double phase = s->inv ? 2.0*M_PI/n : -2.0*M_PI/n;
    886 
    887    stride /= sizeof(*dst);
    888 
    889    for (int i = 0; i < n; i++) {
    890        TXComplex tmp = { 0 };
    891        for (int j = 0; j < n; j++) {
    892            const double factor = phase*i*j;
    893            const TXComplex mult = {
    894                RESCALE(cos(factor)),
    895                RESCALE(sin(factor)),
    896            };
    897            TXComplex res;
    898            CMUL3(res, src[j], mult);
    899            tmp.re += res.re;
    900            tmp.im += res.im;
    901        }
    902        dst[i*stride] = tmp;
    903    }
    904 }
    905 
    906 static void TX_NAME(ff_tx_fft_naive_small)(AVTXContext *s, void *_dst, void *_src,
    907                                           ptrdiff_t stride)
    908 {
    909    TXComplex *src = _src;
    910    TXComplex *dst = _dst;
    911    const int n = s->len;
    912 
    913    stride /= sizeof(*dst);
    914 
    915    for (int i = 0; i < n; i++) {
    916        TXComplex tmp = { 0 };
    917        for (int j = 0; j < n; j++) {
    918            TXComplex res;
    919            const TXComplex mult = s->exp[i*j];
    920            CMUL3(res, src[j], mult);
    921            tmp.re += res.re;
    922            tmp.im += res.im;
    923        }
    924        dst[i*stride] = tmp;
    925    }
    926 }
    927 
    928 static const FFTXCodelet TX_NAME(ff_tx_fft_naive_small_def) = {
    929    .name       = TX_NAME_STR("fft_naive_small"),
    930    .function   = TX_NAME(ff_tx_fft_naive_small),
    931    .type       = TX_TYPE(FFT),
    932    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE,
    933    .factors[0] = TX_FACTOR_ANY,
    934    .nb_factors = 1,
    935    .min_len    = 2,
    936    .max_len    = 1024,
    937    .init       = TX_NAME(ff_tx_fft_init_naive_small),
    938    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
    939    .prio       = FF_TX_PRIO_MIN/2,
    940 };
    941 
    942 static const FFTXCodelet TX_NAME(ff_tx_fft_naive_def) = {
    943    .name       = TX_NAME_STR("fft_naive"),
    944    .function   = TX_NAME(ff_tx_fft_naive),
    945    .type       = TX_TYPE(FFT),
    946    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE,
    947    .factors[0] = TX_FACTOR_ANY,
    948    .nb_factors = 1,
    949    .min_len    = 2,
    950    .max_len    = TX_LEN_UNLIMITED,
    951    .init       = NULL,
    952    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
    953    .prio       = FF_TX_PRIO_MIN,
    954 };
    955 
    956 static av_cold int TX_NAME(ff_tx_fft_pfa_init)(AVTXContext *s,
    957                                               const FFTXCodelet *cd,
    958                                               uint64_t flags,
    959                                               FFTXCodeletOptions *opts,
    960                                               int len, int inv,
    961                                               const void *scale)
    962 {
    963    int ret, *tmp, ps = flags & FF_TX_PRESHUFFLE;
    964    FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER };
    965    size_t extra_tmp_len = 0;
    966    int len_list[TX_MAX_DECOMPOSITIONS];
    967 
    968    if ((ret = ff_tx_decompose_length(len_list, TX_TYPE(FFT), len, inv)) < 0)
    969        return ret;
    970 
    971    /* Two iterations to test both orderings. */
    972    for (int i = 0; i < ret; i++) {
    973        int len1 = len_list[i];
    974        int len2 = len / len1;
    975 
    976        /* Our ptwo transforms don't support striding the output. */
    977        if (len2 & (len2 - 1))
    978            FFSWAP(int, len1, len2);
    979 
    980        ff_tx_clear_ctx(s);
    981 
    982        /* First transform */
    983        sub_opts.map_dir = FF_TX_MAP_GATHER;
    984        flags &= ~AV_TX_INPLACE;
    985        flags |=  FF_TX_OUT_OF_PLACE;
    986        flags |=  FF_TX_PRESHUFFLE; /* This function handles the permute step */
    987        ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
    988                               len1, inv, scale);
    989 
    990        if (ret == AVERROR(ENOMEM)) {
    991            return ret;
    992        } else if (ret < 0) { /* Try again without a preshuffle flag */
    993            flags &= ~FF_TX_PRESHUFFLE;
    994            ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
    995                                   len1, inv, scale);
    996            if (ret == AVERROR(ENOMEM))
    997                return ret;
    998            else if (ret < 0)
    999                continue;
   1000        }
   1001 
   1002        /* Second transform. */
   1003        sub_opts.map_dir = FF_TX_MAP_SCATTER;
   1004        flags |=  FF_TX_PRESHUFFLE;
   1005 retry:
   1006        flags &= ~FF_TX_OUT_OF_PLACE;
   1007        flags |=  AV_TX_INPLACE;
   1008        ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
   1009                               len2, inv, scale);
   1010 
   1011        if (ret == AVERROR(ENOMEM)) {
   1012            return ret;
   1013        } else if (ret < 0) { /* Try again with an out-of-place transform */
   1014            flags |= FF_TX_OUT_OF_PLACE;
   1015            flags &= ~AV_TX_INPLACE;
   1016            ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
   1017                                   len2, inv, scale);
   1018            if (ret == AVERROR(ENOMEM)) {
   1019                return ret;
   1020            } else if (ret < 0) {
   1021                if (flags & FF_TX_PRESHUFFLE) { /* Retry again without a preshuf flag */
   1022                    flags &= ~FF_TX_PRESHUFFLE;
   1023                    goto retry;
   1024                } else {
   1025                    continue;
   1026                }
   1027            }
   1028        }
   1029 
   1030        /* Success */
   1031        break;
   1032    }
   1033 
   1034    /* If nothing was sucessful, error out */
   1035    if (ret < 0)
   1036        return ret;
   1037 
   1038    /* Generate PFA map */
   1039    if ((ret = ff_tx_gen_compound_mapping(s, opts, 0,
   1040                                          s->sub[0].len, s->sub[1].len)))
   1041        return ret;
   1042 
   1043    if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
   1044        return AVERROR(ENOMEM);
   1045 
   1046    /* Flatten input map */
   1047    tmp = (int *)s->tmp;
   1048    for (int k = 0; k < len; k += s->sub[0].len) {
   1049        memcpy(tmp, &s->map[k], s->sub[0].len*sizeof(*tmp));
   1050        for (int i = 0; i < s->sub[0].len; i++)
   1051            s->map[k + i] = tmp[s->sub[0].map[i]];
   1052    }
   1053 
   1054    /* Only allocate extra temporary memory if we need it */
   1055    if (!(s->sub[1].flags & AV_TX_INPLACE))
   1056        extra_tmp_len = len;
   1057    else if (!ps)
   1058        extra_tmp_len = s->sub[0].len;
   1059 
   1060    if (extra_tmp_len && !(s->exp = av_malloc(extra_tmp_len*sizeof(*s->exp))))
   1061        return AVERROR(ENOMEM);
   1062 
   1063    return 0;
   1064 }
   1065 
   1066 static void TX_NAME(ff_tx_fft_pfa)(AVTXContext *s, void *_out,
   1067                                   void *_in, ptrdiff_t stride)
   1068 {
   1069    const int n = s->sub[0].len, m = s->sub[1].len, l = s->len;
   1070    const int *in_map = s->map, *out_map = in_map + l;
   1071    const int *sub_map = s->sub[1].map;
   1072    TXComplex *tmp1 = s->sub[1].flags & AV_TX_INPLACE ? s->tmp : s->exp;
   1073    TXComplex *in = _in, *out = _out;
   1074 
   1075    stride /= sizeof(*out);
   1076 
   1077    for (int i = 0; i < m; i++) {
   1078        for (int j = 0; j < n; j++)
   1079            s->exp[j] = in[in_map[i*n + j]];
   1080        s->fn[0](&s->sub[0], &s->tmp[sub_map[i]], s->exp, m*sizeof(TXComplex));
   1081    }
   1082 
   1083    for (int i = 0; i < n; i++)
   1084        s->fn[1](&s->sub[1], &tmp1[m*i], &s->tmp[m*i], sizeof(TXComplex));
   1085 
   1086    for (int i = 0; i < l; i++)
   1087        out[i*stride] = tmp1[out_map[i]];
   1088 }
   1089 
   1090 static void TX_NAME(ff_tx_fft_pfa_ns)(AVTXContext *s, void *_out,
   1091                                      void *_in, ptrdiff_t stride)
   1092 {
   1093    const int n = s->sub[0].len, m = s->sub[1].len, l = s->len;
   1094    const int *in_map = s->map, *out_map = in_map + l;
   1095    const int *sub_map = s->sub[1].map;
   1096    TXComplex *tmp1 = s->sub[1].flags & AV_TX_INPLACE ? s->tmp : s->exp;
   1097    TXComplex *in = _in, *out = _out;
   1098 
   1099    stride /= sizeof(*out);
   1100 
   1101    for (int i = 0; i < m; i++)
   1102        s->fn[0](&s->sub[0], &s->tmp[sub_map[i]], &in[i*n], m*sizeof(TXComplex));
   1103 
   1104    for (int i = 0; i < n; i++)
   1105        s->fn[1](&s->sub[1], &tmp1[m*i], &s->tmp[m*i], sizeof(TXComplex));
   1106 
   1107    for (int i = 0; i < l; i++)
   1108        out[i*stride] = tmp1[out_map[i]];
   1109 }
   1110 
   1111 static const FFTXCodelet TX_NAME(ff_tx_fft_pfa_def) = {
   1112    .name       = TX_NAME_STR("fft_pfa"),
   1113    .function   = TX_NAME(ff_tx_fft_pfa),
   1114    .type       = TX_TYPE(FFT),
   1115    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE,
   1116    .factors    = { 7, 5, 3, 2, TX_FACTOR_ANY },
   1117    .nb_factors = 2,
   1118    .min_len    = 2*3,
   1119    .max_len    = TX_LEN_UNLIMITED,
   1120    .init       = TX_NAME(ff_tx_fft_pfa_init),
   1121    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
   1122    .prio       = FF_TX_PRIO_BASE,
   1123 };
   1124 
   1125 static const FFTXCodelet TX_NAME(ff_tx_fft_pfa_ns_def) = {
   1126    .name       = TX_NAME_STR("fft_pfa_ns"),
   1127    .function   = TX_NAME(ff_tx_fft_pfa_ns),
   1128    .type       = TX_TYPE(FFT),
   1129    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE |
   1130                  FF_TX_PRESHUFFLE,
   1131    .factors    = { 7, 5, 3, 2, TX_FACTOR_ANY },
   1132    .nb_factors = 2,
   1133    .min_len    = 2*3,
   1134    .max_len    = TX_LEN_UNLIMITED,
   1135    .init       = TX_NAME(ff_tx_fft_pfa_init),
   1136    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
   1137    .prio       = FF_TX_PRIO_BASE,
   1138 };
   1139 
   1140 static av_cold int TX_NAME(ff_tx_mdct_naive_init)(AVTXContext *s,
   1141                                                  const FFTXCodelet *cd,
   1142                                                  uint64_t flags,
   1143                                                  FFTXCodeletOptions *opts,
   1144                                                  int len, int inv,
   1145                                                  const void *scale)
   1146 {
   1147    s->scale_d = *((SCALE_TYPE *)scale);
   1148    s->scale_f = s->scale_d;
   1149    return 0;
   1150 }
   1151 
   1152 static void TX_NAME(ff_tx_mdct_naive_fwd)(AVTXContext *s, void *_dst,
   1153                                          void *_src, ptrdiff_t stride)
   1154 {
   1155    TXSample *src = _src;
   1156    TXSample *dst = _dst;
   1157    double scale = s->scale_d;
   1158    int len = s->len;
   1159    const double phase = M_PI/(4.0*len);
   1160 
   1161    stride /= sizeof(*dst);
   1162 
   1163    for (int i = 0; i < len; i++) {
   1164        double sum = 0.0;
   1165        for (int j = 0; j < len*2; j++) {
   1166            int a = (2*j + 1 + len) * (2*i + 1);
   1167            sum += UNSCALE(src[j]) * cos(a * phase);
   1168        }
   1169        dst[i*stride] = RESCALE(sum*scale);
   1170    }
   1171 }
   1172 
   1173 static void TX_NAME(ff_tx_mdct_naive_inv)(AVTXContext *s, void *_dst,
   1174                                          void *_src, ptrdiff_t stride)
   1175 {
   1176    TXSample *src = _src;
   1177    TXSample *dst = _dst;
   1178    double scale = s->scale_d;
   1179    int len = s->len >> 1;
   1180    int len2 = len*2;
   1181    const double phase = M_PI/(4.0*len2);
   1182 
   1183    stride /= sizeof(*src);
   1184 
   1185    for (int i = 0; i < len; i++) {
   1186        double sum_d = 0.0;
   1187        double sum_u = 0.0;
   1188        double i_d = phase * (4*len  - 2*i - 1);
   1189        double i_u = phase * (3*len2 + 2*i + 1);
   1190        for (int j = 0; j < len2; j++) {
   1191            double a = (2 * j + 1);
   1192            double a_d = cos(a * i_d);
   1193            double a_u = cos(a * i_u);
   1194            double val = UNSCALE(src[j*stride]);
   1195            sum_d += a_d * val;
   1196            sum_u += a_u * val;
   1197        }
   1198        dst[i +   0] = RESCALE( sum_d*scale);
   1199        dst[i + len] = RESCALE(-sum_u*scale);
   1200    }
   1201 }
   1202 
   1203 static const FFTXCodelet TX_NAME(ff_tx_mdct_naive_fwd_def) = {
   1204    .name       = TX_NAME_STR("mdct_naive_fwd"),
   1205    .function   = TX_NAME(ff_tx_mdct_naive_fwd),
   1206    .type       = TX_TYPE(MDCT),
   1207    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY,
   1208    .factors    = { 2, TX_FACTOR_ANY }, /* MDCTs need an even length */
   1209    .nb_factors = 2,
   1210    .min_len    = 2,
   1211    .max_len    = TX_LEN_UNLIMITED,
   1212    .init       = TX_NAME(ff_tx_mdct_naive_init),
   1213    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
   1214    .prio       = FF_TX_PRIO_MIN,
   1215 };
   1216 
   1217 static const FFTXCodelet TX_NAME(ff_tx_mdct_naive_inv_def) = {
   1218    .name       = TX_NAME_STR("mdct_naive_inv"),
   1219    .function   = TX_NAME(ff_tx_mdct_naive_inv),
   1220    .type       = TX_TYPE(MDCT),
   1221    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY,
   1222    .factors    = { 2, TX_FACTOR_ANY },
   1223    .nb_factors = 2,
   1224    .min_len    = 2,
   1225    .max_len    = TX_LEN_UNLIMITED,
   1226    .init       = TX_NAME(ff_tx_mdct_naive_init),
   1227    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
   1228    .prio       = FF_TX_PRIO_MIN,
   1229 };
   1230 
   1231 static av_cold int TX_NAME(ff_tx_mdct_init)(AVTXContext *s,
   1232                                            const FFTXCodelet *cd,
   1233                                            uint64_t flags,
   1234                                            FFTXCodeletOptions *opts,
   1235                                            int len, int inv,
   1236                                            const void *scale)
   1237 {
   1238    int ret;
   1239    FFTXCodeletOptions sub_opts = {
   1240        .map_dir = !inv ? FF_TX_MAP_SCATTER : FF_TX_MAP_GATHER,
   1241    };
   1242 
   1243    s->scale_d = *((SCALE_TYPE *)scale);
   1244    s->scale_f = s->scale_d;
   1245 
   1246    flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
   1247    flags |=  AV_TX_INPLACE;      /* in-place */
   1248    flags |=  FF_TX_PRESHUFFLE;   /* First try with an in-place transform */
   1249 
   1250    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1,
   1251                                inv, scale))) {
   1252        flags &= ~FF_TX_PRESHUFFLE; /* Now try with a generic FFT */
   1253        if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1,
   1254                                    inv, scale)))
   1255            return ret;
   1256    }
   1257 
   1258    s->map = av_malloc((len >> 1)*sizeof(*s->map));
   1259    if (!s->map)
   1260        return AVERROR(ENOMEM);
   1261 
   1262    /* If we need to preshuffle copy the map from the subcontext */
   1263    if (s->sub[0].flags & FF_TX_PRESHUFFLE) {
   1264        memcpy(s->map, s->sub->map, (len >> 1)*sizeof(*s->map));
   1265    } else {
   1266        for (int i = 0; i < len >> 1; i++)
   1267            s->map[i] = i;
   1268    }
   1269 
   1270    if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
   1271        return ret;
   1272 
   1273    /* Saves a multiply in a hot path. */
   1274    if (inv)
   1275        for (int i = 0; i < (s->len >> 1); i++)
   1276            s->map[i] <<= 1;
   1277 
   1278    return 0;
   1279 }
   1280 
   1281 static void TX_NAME(ff_tx_mdct_fwd)(AVTXContext *s, void *_dst, void *_src,
   1282                                    ptrdiff_t stride)
   1283 {
   1284    TXSample *src = _src, *dst = _dst;
   1285    TXComplex *exp = s->exp, tmp, *z = _dst;
   1286    const int len2 = s->len >> 1;
   1287    const int len4 = s->len >> 2;
   1288    const int len3 = len2 * 3;
   1289    const int *sub_map = s->map;
   1290 
   1291    stride /= sizeof(*dst);
   1292 
   1293    for (int i = 0; i < len2; i++) { /* Folding and pre-reindexing */
   1294        const int k = 2*i;
   1295        const int idx = sub_map[i];
   1296        if (k < len2) {
   1297            tmp.re = FOLD(-src[ len2 + k],  src[1*len2 - 1 - k]);
   1298            tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]);
   1299        } else {
   1300            tmp.re = FOLD(-src[ len2 + k], -src[5*len2 - 1 - k]);
   1301            tmp.im = FOLD( src[-len2 + k], -src[1*len3 - 1 - k]);
   1302        }
   1303        CMUL(z[idx].im, z[idx].re, tmp.re, tmp.im, exp[i].re, exp[i].im);
   1304    }
   1305 
   1306    s->fn[0](&s->sub[0], z, z, sizeof(TXComplex));
   1307 
   1308    for (int i = 0; i < len4; i++) {
   1309        const int i0 = len4 + i, i1 = len4 - i - 1;
   1310        TXComplex src1 = { z[i1].re, z[i1].im };
   1311        TXComplex src0 = { z[i0].re, z[i0].im };
   1312 
   1313        CMUL(dst[2*i1*stride + stride], dst[2*i0*stride], src0.re, src0.im,
   1314             exp[i0].im, exp[i0].re);
   1315        CMUL(dst[2*i0*stride + stride], dst[2*i1*stride], src1.re, src1.im,
   1316             exp[i1].im, exp[i1].re);
   1317    }
   1318 }
   1319 
   1320 static void TX_NAME(ff_tx_mdct_inv)(AVTXContext *s, void *_dst, void *_src,
   1321                                    ptrdiff_t stride)
   1322 {
   1323    TXComplex *z = _dst, *exp = s->exp;
   1324    const TXSample *src = _src, *in1, *in2;
   1325    const int len2 = s->len >> 1;
   1326    const int len4 = s->len >> 2;
   1327    const int *sub_map = s->map;
   1328 
   1329    stride /= sizeof(*src);
   1330    in1 = src;
   1331    in2 = src + ((len2*2) - 1) * stride;
   1332 
   1333    for (int i = 0; i < len2; i++) {
   1334        int k = sub_map[i];
   1335        TXComplex tmp = { in2[-k*stride], in1[k*stride] };
   1336        CMUL3(z[i], tmp, exp[i]);
   1337    }
   1338 
   1339    s->fn[0](&s->sub[0], z, z, sizeof(TXComplex));
   1340 
   1341    exp += len2;
   1342    for (int i = 0; i < len4; i++) {
   1343        const int i0 = len4 + i, i1 = len4 - i - 1;
   1344        TXComplex src1 = { z[i1].im, z[i1].re };
   1345        TXComplex src0 = { z[i0].im, z[i0].re };
   1346 
   1347        CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re);
   1348        CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re);
   1349    }
   1350 }
   1351 
   1352 static const FFTXCodelet TX_NAME(ff_tx_mdct_fwd_def) = {
   1353    .name       = TX_NAME_STR("mdct_fwd"),
   1354    .function   = TX_NAME(ff_tx_mdct_fwd),
   1355    .type       = TX_TYPE(MDCT),
   1356    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY,
   1357    .factors    = { 2, TX_FACTOR_ANY },
   1358    .nb_factors = 2,
   1359    .min_len    = 2,
   1360    .max_len    = TX_LEN_UNLIMITED,
   1361    .init       = TX_NAME(ff_tx_mdct_init),
   1362    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
   1363    .prio       = FF_TX_PRIO_BASE,
   1364 };
   1365 
   1366 static const FFTXCodelet TX_NAME(ff_tx_mdct_inv_def) = {
   1367    .name       = TX_NAME_STR("mdct_inv"),
   1368    .function   = TX_NAME(ff_tx_mdct_inv),
   1369    .type       = TX_TYPE(MDCT),
   1370    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY,
   1371    .factors    = { 2, TX_FACTOR_ANY },
   1372    .nb_factors = 2,
   1373    .min_len    = 2,
   1374    .max_len    = TX_LEN_UNLIMITED,
   1375    .init       = TX_NAME(ff_tx_mdct_init),
   1376    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
   1377    .prio       = FF_TX_PRIO_BASE,
   1378 };
   1379 
   1380 static av_cold int TX_NAME(ff_tx_mdct_inv_full_init)(AVTXContext *s,
   1381                                                     const FFTXCodelet *cd,
   1382                                                     uint64_t flags,
   1383                                                     FFTXCodeletOptions *opts,
   1384                                                     int len, int inv,
   1385                                                     const void *scale)
   1386 {
   1387    int ret;
   1388 
   1389    s->scale_d = *((SCALE_TYPE *)scale);
   1390    s->scale_f = s->scale_d;
   1391 
   1392    flags &= ~AV_TX_FULL_IMDCT;
   1393 
   1394    if ((ret = ff_tx_init_subtx(s, TX_TYPE(MDCT), flags, NULL, len, 1, scale)))
   1395        return ret;
   1396 
   1397    return 0;
   1398 }
   1399 
   1400 static void TX_NAME(ff_tx_mdct_inv_full)(AVTXContext *s, void *_dst,
   1401                                         void *_src, ptrdiff_t stride)
   1402 {
   1403    int len  = s->len << 1;
   1404    int len2 = len >> 1;
   1405    int len4 = len >> 2;
   1406    TXSample *dst = _dst;
   1407 
   1408    s->fn[0](&s->sub[0], dst + len4, _src, stride);
   1409 
   1410    stride /= sizeof(*dst);
   1411 
   1412    for (int i = 0; i < len4; i++) {
   1413        dst[            i*stride] = -dst[(len2 - i - 1)*stride];
   1414        dst[(len - i - 1)*stride] =  dst[(len2 + i + 0)*stride];
   1415    }
   1416 }
   1417 
   1418 static const FFTXCodelet TX_NAME(ff_tx_mdct_inv_full_def) = {
   1419    .name       = TX_NAME_STR("mdct_inv_full"),
   1420    .function   = TX_NAME(ff_tx_mdct_inv_full),
   1421    .type       = TX_TYPE(MDCT),
   1422    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE |
   1423                  FF_TX_OUT_OF_PLACE | AV_TX_FULL_IMDCT,
   1424    .factors    = { 2, TX_FACTOR_ANY },
   1425    .nb_factors = 2,
   1426    .min_len    = 2,
   1427    .max_len    = TX_LEN_UNLIMITED,
   1428    .init       = TX_NAME(ff_tx_mdct_inv_full_init),
   1429    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
   1430    .prio       = FF_TX_PRIO_BASE,
   1431 };
   1432 
   1433 static av_cold int TX_NAME(ff_tx_mdct_pfa_init)(AVTXContext *s,
   1434                                                const FFTXCodelet *cd,
   1435                                                uint64_t flags,
   1436                                                FFTXCodeletOptions *opts,
   1437                                                int len, int inv,
   1438                                                const void *scale)
   1439 {
   1440    int ret, sub_len;
   1441    FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_SCATTER };
   1442 
   1443    len >>= 1;
   1444    sub_len = len / cd->factors[0];
   1445 
   1446    s->scale_d = *((SCALE_TYPE *)scale);
   1447    s->scale_f = s->scale_d;
   1448 
   1449    flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
   1450    flags |=  AV_TX_INPLACE;      /* in-place */
   1451    flags |=  FF_TX_PRESHUFFLE;   /* This function handles the permute step */
   1452 
   1453    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
   1454                                sub_len, inv, scale)))
   1455        return ret;
   1456 
   1457    if ((ret = ff_tx_gen_compound_mapping(s, opts, s->inv, cd->factors[0], sub_len)))
   1458        return ret;
   1459 
   1460    /* Our 15-point transform is also a compound one, so embed its input map */
   1461    if (cd->factors[0] == 15)
   1462        TX_EMBED_INPUT_PFA_MAP(s->map, len, 3, 5);
   1463 
   1464    if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
   1465        return ret;
   1466 
   1467    /* Saves multiplies in loops. */
   1468    for (int i = 0; i < len; i++)
   1469        s->map[i] <<= 1;
   1470 
   1471    if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
   1472        return AVERROR(ENOMEM);
   1473 
   1474    TX_TAB(ff_tx_init_tabs)(len / sub_len);
   1475 
   1476    return 0;
   1477 }
   1478 
   1479 #define DECL_COMP_IMDCT(N)                                                     \
   1480 static void TX_NAME(ff_tx_mdct_pfa_##N##xM_inv)(AVTXContext *s, void *_dst,    \
   1481                                                void *_src, ptrdiff_t stride)  \
   1482 {                                                                              \
   1483    TXComplex fft##N##in[N];                                                   \
   1484    TXComplex *z = _dst, *exp = s->exp;                                        \
   1485    const TXSample *src = _src, *in1, *in2;                                    \
   1486    const int len4 = s->len >> 2;                                              \
   1487    const int len2 = s->len >> 1;                                              \
   1488    const int m = s->sub->len;                                                 \
   1489    const int *in_map = s->map, *out_map = in_map + N*m;                       \
   1490    const int *sub_map = s->sub->map;                                          \
   1491                                                                               \
   1492    stride /= sizeof(*src); /* To convert it from bytes */                     \
   1493    in1 = src;                                                                 \
   1494    in2 = src + ((N*m*2) - 1) * stride;                                        \
   1495                                                                               \
   1496    for (int i = 0; i < len2; i += N) {                                        \
   1497        for (int j = 0; j < N; j++) {                                          \
   1498            const int k = in_map[j];                                           \
   1499            TXComplex tmp = { in2[-k*stride], in1[k*stride] };                 \
   1500            CMUL3(fft##N##in[j], tmp, exp[j]);                                 \
   1501        }                                                                      \
   1502        fft##N(s->tmp + *(sub_map++), fft##N##in, m);                          \
   1503        exp += N;                                                              \
   1504        in_map += N;                                                           \
   1505    }                                                                          \
   1506                                                                               \
   1507    for (int i = 0; i < N; i++)                                                \
   1508        s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex));   \
   1509                                                                               \
   1510    for (int i = 0; i < len4; i++) {                                           \
   1511        const int i0 = len4 + i, i1 = len4 - i - 1;                            \
   1512        const int s0 = out_map[i0], s1 = out_map[i1];                          \
   1513        TXComplex src1 = { s->tmp[s1].im, s->tmp[s1].re };                     \
   1514        TXComplex src0 = { s->tmp[s0].im, s->tmp[s0].re };                     \
   1515                                                                               \
   1516        CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re);    \
   1517        CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re);    \
   1518    }                                                                          \
   1519 }                                                                              \
   1520                                                                               \
   1521 static const FFTXCodelet TX_NAME(ff_tx_mdct_pfa_##N##xM_inv_def) = {           \
   1522    .name       = TX_NAME_STR("mdct_pfa_" #N "xM_inv"),                        \
   1523    .function   = TX_NAME(ff_tx_mdct_pfa_##N##xM_inv),                         \
   1524    .type       = TX_TYPE(MDCT),                                               \
   1525    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY,   \
   1526    .factors    = { N, TX_FACTOR_ANY },                                        \
   1527    .nb_factors = 2,                                                           \
   1528    .min_len    = N*2,                                                         \
   1529    .max_len    = TX_LEN_UNLIMITED,                                            \
   1530    .init       = TX_NAME(ff_tx_mdct_pfa_init),                                \
   1531    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
   1532    .prio       = FF_TX_PRIO_BASE,                                             \
   1533 };
   1534 
   1535 DECL_COMP_IMDCT(3)
   1536 DECL_COMP_IMDCT(5)
   1537 DECL_COMP_IMDCT(7)
   1538 DECL_COMP_IMDCT(9)
   1539 DECL_COMP_IMDCT(15)
   1540 
   1541 #define DECL_COMP_MDCT(N)                                                      \
   1542 static void TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd)(AVTXContext *s, void *_dst,    \
   1543                                                void *_src, ptrdiff_t stride)  \
   1544 {                                                                              \
   1545    TXComplex fft##N##in[N];                                                   \
   1546    TXSample *src = _src, *dst = _dst;                                         \
   1547    TXComplex *exp = s->exp, tmp;                                              \
   1548    const int m = s->sub->len;                                                 \
   1549    const int len4 = N*m;                                                      \
   1550    const int len3 = len4 * 3;                                                 \
   1551    const int len8 = s->len >> 2;                                              \
   1552    const int *in_map = s->map, *out_map = in_map + N*m;                       \
   1553    const int *sub_map = s->sub->map;                                          \
   1554                                                                               \
   1555    stride /= sizeof(*dst);                                                    \
   1556                                                                               \
   1557    for (int i = 0; i < m; i++) { /* Folding and pre-reindexing */             \
   1558        for (int j = 0; j < N; j++) {                                          \
   1559            const int k = in_map[i*N + j];                                     \
   1560            if (k < len4) {                                                    \
   1561                tmp.re = FOLD(-src[ len4 + k],  src[1*len4 - 1 - k]);          \
   1562                tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]);          \
   1563            } else {                                                           \
   1564                tmp.re = FOLD(-src[ len4 + k], -src[5*len4 - 1 - k]);          \
   1565                tmp.im = FOLD( src[-len4 + k], -src[1*len3 - 1 - k]);          \
   1566            }                                                                  \
   1567            CMUL(fft##N##in[j].im, fft##N##in[j].re, tmp.re, tmp.im,           \
   1568                 exp[k >> 1].re, exp[k >> 1].im);                              \
   1569        }                                                                      \
   1570        fft##N(s->tmp + sub_map[i], fft##N##in, m);                            \
   1571    }                                                                          \
   1572                                                                               \
   1573    for (int i = 0; i < N; i++)                                                \
   1574        s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex));   \
   1575                                                                               \
   1576    for (int i = 0; i < len8; i++) {                                           \
   1577        const int i0 = len8 + i, i1 = len8 - i - 1;                            \
   1578        const int s0 = out_map[i0], s1 = out_map[i1];                          \
   1579        TXComplex src1 = { s->tmp[s1].re, s->tmp[s1].im };                     \
   1580        TXComplex src0 = { s->tmp[s0].re, s->tmp[s0].im };                     \
   1581                                                                               \
   1582        CMUL(dst[2*i1*stride + stride], dst[2*i0*stride], src0.re, src0.im,    \
   1583             exp[i0].im, exp[i0].re);                                          \
   1584        CMUL(dst[2*i0*stride + stride], dst[2*i1*stride], src1.re, src1.im,    \
   1585             exp[i1].im, exp[i1].re);                                          \
   1586    }                                                                          \
   1587 }                                                                              \
   1588                                                                               \
   1589 static const FFTXCodelet TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd_def) = {           \
   1590    .name       = TX_NAME_STR("mdct_pfa_" #N "xM_fwd"),                        \
   1591    .function   = TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd),                         \
   1592    .type       = TX_TYPE(MDCT),                                               \
   1593    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY,   \
   1594    .factors    = { N, TX_FACTOR_ANY },                                        \
   1595    .nb_factors = 2,                                                           \
   1596    .min_len    = N*2,                                                         \
   1597    .max_len    = TX_LEN_UNLIMITED,                                            \
   1598    .init       = TX_NAME(ff_tx_mdct_pfa_init),                                \
   1599    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
   1600    .prio       = FF_TX_PRIO_BASE,                                             \
   1601 };
   1602 
   1603 DECL_COMP_MDCT(3)
   1604 DECL_COMP_MDCT(5)
   1605 DECL_COMP_MDCT(7)
   1606 DECL_COMP_MDCT(9)
   1607 DECL_COMP_MDCT(15)
   1608 
   1609 static av_cold int TX_NAME(ff_tx_rdft_init)(AVTXContext *s,
   1610                                            const FFTXCodelet *cd,
   1611                                            uint64_t flags,
   1612                                            FFTXCodeletOptions *opts,
   1613                                            int len, int inv,
   1614                                            const void *scale)
   1615 {
   1616    int ret;
   1617    double f, m;
   1618    TXSample *tab;
   1619    uint64_t r2r = flags & AV_TX_REAL_TO_REAL;
   1620    int len4 = FFALIGN(len, 4) / 4;
   1621 
   1622    s->scale_d = *((SCALE_TYPE *)scale);
   1623    s->scale_f = s->scale_d;
   1624 
   1625    flags &= ~(AV_TX_REAL_TO_REAL | AV_TX_REAL_TO_IMAGINARY);
   1626 
   1627    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, NULL, len >> 1, inv, scale)))
   1628        return ret;
   1629 
   1630    if (!(s->exp = av_mallocz((8 + 2*len4)*sizeof(*s->exp))))
   1631        return AVERROR(ENOMEM);
   1632 
   1633    tab = (TXSample *)s->exp;
   1634 
   1635    f = 2*M_PI/len;
   1636 
   1637    m = (inv ? 2*s->scale_d : s->scale_d);
   1638 
   1639    *tab++ = RESCALE((inv ? 0.5 : 1.0) * m);
   1640    *tab++ = RESCALE(inv ? 0.5*m : 1.0*m);
   1641    *tab++ = RESCALE( m);
   1642    *tab++ = RESCALE(-m);
   1643 
   1644    *tab++ = RESCALE( (0.5 - 0.0) * m);
   1645    if (r2r)
   1646        *tab++ = 1 / s->scale_f;
   1647    else
   1648        *tab++ = RESCALE( (0.0 - 0.5) * m);
   1649    *tab++ = RESCALE( (0.5 - inv) * m);
   1650    *tab++ = RESCALE(-(0.5 - inv) * m);
   1651 
   1652    for (int i = 0; i < len4; i++)
   1653        *tab++ = RESCALE(cos(i*f));
   1654 
   1655    tab = ((TXSample *)s->exp) + len4 + 8;
   1656 
   1657    for (int i = 0; i < len4; i++)
   1658        *tab++ = RESCALE(cos(((len - i*4)/4.0)*f)) * (inv ? 1 : -1);
   1659 
   1660    return 0;
   1661 }
   1662 
   1663 #define DECL_RDFT(n, inv)                                                      \
   1664 static void TX_NAME(ff_tx_rdft_ ##n)(AVTXContext *s, void *_dst,               \
   1665                                     void *_src, ptrdiff_t stride)             \
   1666 {                                                                              \
   1667    const int len2 = s->len >> 1;                                              \
   1668    const int len4 = s->len >> 2;                                              \
   1669    const TXSample *fact = (void *)s->exp;                                     \
   1670    const TXSample *tcos = fact + 8;                                           \
   1671    const TXSample *tsin = tcos + len4;                                        \
   1672    TXComplex *data = inv ? _src : _dst;                                       \
   1673    TXComplex t[3];                                                            \
   1674                                                                               \
   1675    if (!inv)                                                                  \
   1676        s->fn[0](&s->sub[0], data, _src, sizeof(TXComplex));                   \
   1677    else                                                                       \
   1678        data[0].im = data[len2].re;                                            \
   1679                                                                               \
   1680    /* The DC value's both components are real, but we need to change them     \
   1681     * into complex values. Also, the middle of the array is special-cased.    \
   1682     * These operations can be done before or after the loop. */               \
   1683    t[0].re = data[0].re;                                                      \
   1684    data[0].re = t[0].re + data[0].im;                                         \
   1685    data[0].im = t[0].re - data[0].im;                                         \
   1686    data[   0].re = MULT(fact[0], data[   0].re);                              \
   1687    data[   0].im = MULT(fact[1], data[   0].im);                              \
   1688    data[len4].re = MULT(fact[2], data[len4].re);                              \
   1689    data[len4].im = MULT(fact[3], data[len4].im);                              \
   1690                                                                               \
   1691    for (int i = 1; i < len4; i++) {                                           \
   1692        /* Separate even and odd FFTs */                                       \
   1693        t[0].re = MULT(fact[4], (data[i].re + data[len2 - i].re));             \
   1694        t[0].im = MULT(fact[5], (data[i].im - data[len2 - i].im));             \
   1695        t[1].re = MULT(fact[6], (data[i].im + data[len2 - i].im));             \
   1696        t[1].im = MULT(fact[7], (data[i].re - data[len2 - i].re));             \
   1697                                                                               \
   1698        /* Apply twiddle factors to the odd FFT and add to the even FFT */     \
   1699        CMUL(t[2].re, t[2].im, t[1].re, t[1].im, tcos[i], tsin[i]);            \
   1700                                                                               \
   1701        data[       i].re = t[0].re + t[2].re;                                 \
   1702        data[       i].im = t[2].im - t[0].im;                                 \
   1703        data[len2 - i].re = t[0].re - t[2].re;                                 \
   1704        data[len2 - i].im = t[2].im + t[0].im;                                 \
   1705    }                                                                          \
   1706                                                                               \
   1707    if (inv) {                                                                 \
   1708        s->fn[0](&s->sub[0], _dst, data, sizeof(TXComplex));                   \
   1709    } else {                                                                   \
   1710        /* Move [0].im to the last position, as convention requires */         \
   1711        data[len2].re = data[0].im;                                            \
   1712        data[   0].im = data[len2].im = 0;                                     \
   1713    }                                                                          \
   1714 }                                                                              \
   1715                                                                               \
   1716 static const FFTXCodelet TX_NAME(ff_tx_rdft_ ##n## _def) = {                   \
   1717    .name       = TX_NAME_STR("rdft_" #n),                                     \
   1718    .function   = TX_NAME(ff_tx_rdft_ ##n),                                    \
   1719    .type       = TX_TYPE(RDFT),                                               \
   1720    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE |       \
   1721                  (inv ? FF_TX_INVERSE_ONLY : FF_TX_FORWARD_ONLY),             \
   1722    .factors    = { 4, TX_FACTOR_ANY },                                        \
   1723    .nb_factors = 2,                                                           \
   1724    .min_len    = 4,                                                           \
   1725    .max_len    = TX_LEN_UNLIMITED,                                            \
   1726    .init       = TX_NAME(ff_tx_rdft_init),                                    \
   1727    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
   1728    .prio       = FF_TX_PRIO_BASE,                                             \
   1729 };
   1730 
   1731 DECL_RDFT(r2c,  0)
   1732 DECL_RDFT(c2r,  1)
   1733 
   1734 #define DECL_RDFT_HALF(n, mode, mod2)                                          \
   1735 static void TX_NAME(ff_tx_rdft_ ##n)(AVTXContext *s, void *_dst,               \
   1736                                        void *_src, ptrdiff_t stride)          \
   1737 {                                                                              \
   1738    const int len = s->len;                                                    \
   1739    const int len2 = len >> 1;                                                 \
   1740    const int len4 = len >> 2;                                                 \
   1741    const int aligned_len4 = FFALIGN(len, 4)/4;                                \
   1742    const TXSample *fact = (void *)s->exp;                                     \
   1743    const TXSample *tcos = fact + 8;                                           \
   1744    const TXSample *tsin = tcos + aligned_len4;                                \
   1745    TXComplex *data = _dst;                                                    \
   1746    TXSample *out = _dst; /* Half-complex is forward-only */                   \
   1747    TXSample tmp_dc;                                                           \
   1748    av_unused TXSample tmp_mid;                                                \
   1749    TXSample tmp[4];                                                           \
   1750    TXComplex sf, sl;                                                          \
   1751                                                                               \
   1752    s->fn[0](&s->sub[0], _dst, _src, sizeof(TXComplex));                       \
   1753                                                                               \
   1754    tmp_dc = data[0].re;                                                       \
   1755    data[   0].re = tmp_dc + data[0].im;                                       \
   1756    tmp_dc        = tmp_dc - data[0].im;                                       \
   1757                                                                               \
   1758    data[   0].re = MULT(fact[0], data[   0].re);                              \
   1759    tmp_dc        = MULT(fact[1],        tmp_dc);                              \
   1760    data[len4].re = MULT(fact[2], data[len4].re);                              \
   1761                                                                               \
   1762    if (!mod2) {                                                               \
   1763        data[len4].im = MULT(fact[3], data[len4].im);                          \
   1764    } else {                                                                   \
   1765        sf = data[len4];                                                       \
   1766        sl = data[len4 + 1];                                                   \
   1767        if (mode == AV_TX_REAL_TO_REAL)                                        \
   1768            tmp[0] = MULT(fact[4], (sf.re + sl.re));                           \
   1769        else                                                                   \
   1770            tmp[0] = MULT(fact[5], (sf.im - sl.im));                           \
   1771        tmp[1] = MULT(fact[6], (sf.im + sl.im));                               \
   1772        tmp[2] = MULT(fact[7], (sf.re - sl.re));                               \
   1773                                                                               \
   1774        if (mode == AV_TX_REAL_TO_REAL) {                                      \
   1775            tmp[3]  = tmp[1]*tcos[len4] - tmp[2]*tsin[len4];                   \
   1776            tmp_mid = (tmp[0] - tmp[3]);                                       \
   1777        } else {                                                               \
   1778            tmp[3]  = tmp[1]*tsin[len4] + tmp[2]*tcos[len4];                   \
   1779            tmp_mid = (tmp[0] + tmp[3]);                                       \
   1780        }                                                                      \
   1781    }                                                                          \
   1782                                                                               \
   1783    /* NOTE: unrolling this breaks non-mod8 lengths */                         \
   1784    for (int i = 1; i <= len4; i++) {                                          \
   1785        TXSample tmp[4];                                                       \
   1786        TXComplex sf = data[i];                                                \
   1787        TXComplex sl = data[len2 - i];                                         \
   1788                                                                               \
   1789        if (mode == AV_TX_REAL_TO_REAL)                                        \
   1790            tmp[0] = MULT(fact[4], (sf.re + sl.re));                           \
   1791        else                                                                   \
   1792            tmp[0] = MULT(fact[5], (sf.im - sl.im));                           \
   1793                                                                               \
   1794        tmp[1] = MULT(fact[6], (sf.im + sl.im));                               \
   1795        tmp[2] = MULT(fact[7], (sf.re - sl.re));                               \
   1796                                                                               \
   1797        if (mode == AV_TX_REAL_TO_REAL) {                                      \
   1798            tmp[3]           = tmp[1]*tcos[i] - tmp[2]*tsin[i];                \
   1799            out[i]           = (tmp[0] + tmp[3]);                              \
   1800            out[len - i]     = (tmp[0] - tmp[3]);                              \
   1801        } else {                                                               \
   1802            tmp[3]           = tmp[1]*tsin[i] + tmp[2]*tcos[i];                \
   1803            out[i - 1]       = (tmp[3] - tmp[0]);                              \
   1804            out[len - i - 1] = (tmp[0] + tmp[3]);                              \
   1805        }                                                                      \
   1806    }                                                                          \
   1807                                                                               \
   1808    for (int i = 1; i < (len4 + (mode == AV_TX_REAL_TO_IMAGINARY)); i++)       \
   1809        out[len2 - i] = out[len - i];                                          \
   1810                                                                               \
   1811    if (mode == AV_TX_REAL_TO_REAL) {                                          \
   1812        out[len2] = tmp_dc;                                                    \
   1813        if (mod2)                                                              \
   1814            out[len4 + 1] = tmp_mid * fact[5];                                 \
   1815    } else if (mod2) {                                                         \
   1816        out[len4] = tmp_mid;                                                   \
   1817    }                                                                          \
   1818 }                                                                              \
   1819                                                                               \
   1820 static const FFTXCodelet TX_NAME(ff_tx_rdft_ ##n## _def) = {                   \
   1821    .name       = TX_NAME_STR("rdft_" #n),                                     \
   1822    .function   = TX_NAME(ff_tx_rdft_ ##n),                                    \
   1823    .type       = TX_TYPE(RDFT),                                               \
   1824    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | mode |                     \
   1825                  FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY,                     \
   1826    .factors    = { 2 + 2*(!mod2), TX_FACTOR_ANY },                            \
   1827    .nb_factors = 2,                                                           \
   1828    .min_len    = 2 + 2*(!mod2),                                               \
   1829    .max_len    = TX_LEN_UNLIMITED,                                            \
   1830    .init       = TX_NAME(ff_tx_rdft_init),                                    \
   1831    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
   1832    .prio       = FF_TX_PRIO_BASE,                                             \
   1833 };
   1834 
   1835 DECL_RDFT_HALF(r2r,      AV_TX_REAL_TO_REAL,      0)
   1836 DECL_RDFT_HALF(r2r_mod2, AV_TX_REAL_TO_REAL,      1)
   1837 DECL_RDFT_HALF(r2i,      AV_TX_REAL_TO_IMAGINARY, 0)
   1838 DECL_RDFT_HALF(r2i_mod2, AV_TX_REAL_TO_IMAGINARY, 1)
   1839 
   1840 static av_cold int TX_NAME(ff_tx_dct_init)(AVTXContext *s,
   1841                                           const FFTXCodelet *cd,
   1842                                           uint64_t flags,
   1843                                           FFTXCodeletOptions *opts,
   1844                                           int len, int inv,
   1845                                           const void *scale)
   1846 {
   1847    int ret;
   1848    double freq;
   1849    TXSample *tab;
   1850    SCALE_TYPE rsc = *((SCALE_TYPE *)scale);
   1851 
   1852    if (inv) {
   1853        len *= 2;
   1854        s->len *= 2;
   1855        rsc *= 0.5;
   1856    }
   1857 
   1858    if ((ret = ff_tx_init_subtx(s, TX_TYPE(RDFT), flags, NULL, len, inv, &rsc)))
   1859        return ret;
   1860 
   1861    s->exp = av_malloc((len/2)*3*sizeof(TXSample));
   1862    if (!s->exp)
   1863        return AVERROR(ENOMEM);
   1864 
   1865    tab = (TXSample *)s->exp;
   1866 
   1867    freq = M_PI/(len*2);
   1868 
   1869    for (int i = 0; i < len; i++)
   1870        tab[i] = RESCALE(cos(i*freq)*(!inv + 1));
   1871 
   1872    if (inv) {
   1873        for (int i = 0; i < len/2; i++)
   1874            tab[len + i] = RESCALE(0.5 / sin((2*i + 1)*freq));
   1875    } else {
   1876        for (int i = 0; i < len/2; i++)
   1877            tab[len + i] = RESCALE(cos((len - 2*i - 1)*freq));
   1878    }
   1879 
   1880    return 0;
   1881 }
   1882 
   1883 static void TX_NAME(ff_tx_dctII)(AVTXContext *s, void *_dst,
   1884                                 void *_src, ptrdiff_t stride)
   1885 {
   1886    TXSample *dst = _dst;
   1887    TXSample *src = _src;
   1888    const int len = s->len;
   1889    const int len2 = len >> 1;
   1890    const TXSample *exp = (void *)s->exp;
   1891    TXSample next;
   1892 #ifdef TX_INT32
   1893    int64_t tmp1, tmp2;
   1894 #else
   1895    TXSample tmp1, tmp2;
   1896 #endif
   1897 
   1898    for (int i = 0; i < len2; i++) {
   1899        TXSample in1 = src[i];
   1900        TXSample in2 = src[len - i - 1];
   1901        TXSample s    = exp[len + i];
   1902 
   1903 #ifdef TX_INT32
   1904        tmp1 = in1 + in2;
   1905        tmp2 = in1 - in2;
   1906 
   1907        tmp1 >>= 1;
   1908        tmp2 *= s;
   1909 
   1910        tmp2 = (tmp2 + 0x40000000) >> 31;
   1911 #else
   1912        tmp1 = (in1 + in2)*0.5;
   1913        tmp2 = (in1 - in2)*s;
   1914 #endif
   1915 
   1916        src[i]           = tmp1 + tmp2;
   1917        src[len - i - 1] = tmp1 - tmp2;
   1918    }
   1919 
   1920    s->fn[0](&s->sub[0], dst, src, sizeof(TXComplex));
   1921 
   1922    next = dst[len];
   1923 
   1924    for (int i = len - 2; i > 0; i -= 2) {
   1925        TXSample tmp;
   1926 
   1927        CMUL(tmp, dst[i], exp[len - i], exp[i], dst[i + 0], dst[i + 1]);
   1928 
   1929        dst[i + 1] = next;
   1930 
   1931        next += tmp;
   1932    }
   1933 
   1934 #ifdef TX_INT32
   1935    tmp1 = ((int64_t)exp[0]) * ((int64_t)dst[0]);
   1936    dst[0] = (tmp1 + 0x40000000) >> 31;
   1937 #else
   1938    dst[0] = exp[0] * dst[0];
   1939 #endif
   1940    dst[1] = next;
   1941 }
   1942 
   1943 static void TX_NAME(ff_tx_dctIII)(AVTXContext *s, void *_dst,
   1944                                  void *_src, ptrdiff_t stride)
   1945 {
   1946    TXSample *dst = _dst;
   1947    TXSample *src = _src;
   1948    const int len = s->len;
   1949    const int len2 = len >> 1;
   1950    const TXSample *exp = (void *)s->exp;
   1951 #ifdef TX_INT32
   1952    int64_t  tmp1, tmp2 = src[len - 1];
   1953    tmp2 = (2*tmp2 + 0x40000000) >> 31;
   1954 #else
   1955    TXSample tmp1, tmp2 = 2*src[len - 1];
   1956 #endif
   1957 
   1958    src[len] = tmp2;
   1959 
   1960    for (int i = len - 2; i >= 2; i -= 2) {
   1961        TXSample val1 = src[i - 0];
   1962        TXSample val2 = src[i - 1] - src[i + 1];
   1963 
   1964        CMUL(src[i + 1], src[i], exp[len - i], exp[i], val1, val2);
   1965    }
   1966 
   1967    s->fn[0](&s->sub[0], dst, src, sizeof(float));
   1968 
   1969    for (int i = 0; i < len2; i++) {
   1970        TXSample in1 = dst[i];
   1971        TXSample in2 = dst[len - i - 1];
   1972        TXSample c   = exp[len + i];
   1973 
   1974        tmp1 = in1 + in2;
   1975        tmp2 = in1 - in2;
   1976        tmp2 *= c;
   1977 #ifdef TX_INT32
   1978        tmp2 = (tmp2 + 0x40000000) >> 31;
   1979 #endif
   1980 
   1981        dst[i]            = tmp1 + tmp2;
   1982        dst[len - i - 1]  = tmp1 - tmp2;
   1983    }
   1984 }
   1985 
   1986 static const FFTXCodelet TX_NAME(ff_tx_dctII_def) = {
   1987    .name       = TX_NAME_STR("dctII"),
   1988    .function   = TX_NAME(ff_tx_dctII),
   1989    .type       = TX_TYPE(DCT),
   1990    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE |
   1991                  FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY,
   1992    .factors    = { 2, TX_FACTOR_ANY },
   1993    .min_len    = 2,
   1994    .max_len    = TX_LEN_UNLIMITED,
   1995    .init       = TX_NAME(ff_tx_dct_init),
   1996    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
   1997    .prio       = FF_TX_PRIO_BASE,
   1998 };
   1999 
   2000 static const FFTXCodelet TX_NAME(ff_tx_dctIII_def) = {
   2001    .name       = TX_NAME_STR("dctIII"),
   2002    .function   = TX_NAME(ff_tx_dctIII),
   2003    .type       = TX_TYPE(DCT),
   2004    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE |
   2005                  FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY,
   2006    .factors    = { 2, TX_FACTOR_ANY },
   2007    .min_len    = 2,
   2008    .max_len    = TX_LEN_UNLIMITED,
   2009    .init       = TX_NAME(ff_tx_dct_init),
   2010    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
   2011    .prio       = FF_TX_PRIO_BASE,
   2012 };
   2013 
   2014 static av_cold int TX_NAME(ff_tx_dcstI_init)(AVTXContext *s,
   2015                                             const FFTXCodelet *cd,
   2016                                             uint64_t flags,
   2017                                             FFTXCodeletOptions *opts,
   2018                                             int len, int inv,
   2019                                             const void *scale)
   2020 {
   2021    int ret;
   2022    SCALE_TYPE rsc = *((SCALE_TYPE *)scale);
   2023 
   2024    if (inv) {
   2025        len *= 2;
   2026        s->len *= 2;
   2027        rsc *= 0.5;
   2028    }
   2029 
   2030    /* We want a half-complex RDFT */
   2031    flags |= cd->type == TX_TYPE(DCT_I) ? AV_TX_REAL_TO_REAL :
   2032                                          AV_TX_REAL_TO_IMAGINARY;
   2033 
   2034    if ((ret = ff_tx_init_subtx(s, TX_TYPE(RDFT), flags, NULL,
   2035                                (len - 1 + 2*(cd->type == TX_TYPE(DST_I)))*2,
   2036                                0, &rsc)))
   2037        return ret;
   2038 
   2039    s->tmp = av_mallocz((len + 1)*2*sizeof(TXSample));
   2040    if (!s->tmp)
   2041        return AVERROR(ENOMEM);
   2042 
   2043    return 0;
   2044 }
   2045 
   2046 static void TX_NAME(ff_tx_dctI)(AVTXContext *s, void *_dst,
   2047                                void *_src, ptrdiff_t stride)
   2048 {
   2049    TXSample *dst = _dst;
   2050    TXSample *src = _src;
   2051    const int len = s->len - 1;
   2052    TXSample *tmp = (TXSample *)s->tmp;
   2053 
   2054    stride /= sizeof(TXSample);
   2055 
   2056    for (int i = 0; i < len; i++)
   2057        tmp[i] = tmp[2*len - i] = src[i * stride];
   2058 
   2059    tmp[len] = src[len * stride]; /* Middle */
   2060 
   2061    s->fn[0](&s->sub[0], dst, tmp, sizeof(TXSample));
   2062 }
   2063 
   2064 static void TX_NAME(ff_tx_dstI)(AVTXContext *s, void *_dst,
   2065                                void *_src, ptrdiff_t stride)
   2066 {
   2067    TXSample *dst = _dst;
   2068    TXSample *src = _src;
   2069    const int len = s->len + 1;
   2070    TXSample *tmp = (void *)s->tmp;
   2071 
   2072    stride /= sizeof(TXSample);
   2073 
   2074    tmp[0] = 0;
   2075 
   2076    for (int i = 1; i < len; i++) {
   2077        TXSample a = src[(i - 1) * stride];
   2078        tmp[i] = -a;
   2079        tmp[2*len - i] = a;
   2080    }
   2081 
   2082    tmp[len] = 0; /* i == n, Nyquist */
   2083 
   2084    s->fn[0](&s->sub[0], dst, tmp, sizeof(float));
   2085 }
   2086 
   2087 static const FFTXCodelet TX_NAME(ff_tx_dctI_def) = {
   2088    .name       = TX_NAME_STR("dctI"),
   2089    .function   = TX_NAME(ff_tx_dctI),
   2090    .type       = TX_TYPE(DCT_I),
   2091    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE,
   2092    .factors    = { 2, TX_FACTOR_ANY },
   2093    .nb_factors = 2,
   2094    .min_len    = 2,
   2095    .max_len    = TX_LEN_UNLIMITED,
   2096    .init       = TX_NAME(ff_tx_dcstI_init),
   2097    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
   2098    .prio       = FF_TX_PRIO_BASE,
   2099 };
   2100 
   2101 static const FFTXCodelet TX_NAME(ff_tx_dstI_def) = {
   2102    .name       = TX_NAME_STR("dstI"),
   2103    .function   = TX_NAME(ff_tx_dstI),
   2104    .type       = TX_TYPE(DST_I),
   2105    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE,
   2106    .factors    = { 2, TX_FACTOR_ANY },
   2107    .nb_factors = 2,
   2108    .min_len    = 2,
   2109    .max_len    = TX_LEN_UNLIMITED,
   2110    .init       = TX_NAME(ff_tx_dcstI_init),
   2111    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
   2112    .prio       = FF_TX_PRIO_BASE,
   2113 };
   2114 
   2115 int TX_TAB(ff_tx_mdct_gen_exp)(AVTXContext *s, int *pre_tab)
   2116 {
   2117    int off = 0;
   2118    int len4 = s->len >> 1;
   2119    double scale = s->scale_d;
   2120    const double theta = (scale < 0 ? len4 : 0) + 1.0/8.0;
   2121    size_t alloc = pre_tab ? 2*len4 : len4;
   2122 
   2123    if (!(s->exp = av_malloc_array(alloc, sizeof(*s->exp))))
   2124        return AVERROR(ENOMEM);
   2125 
   2126    scale = sqrt(fabs(scale));
   2127 
   2128    if (pre_tab)
   2129        off = len4;
   2130 
   2131    for (int i = 0; i < len4; i++) {
   2132        const double alpha = M_PI_2 * (i + theta) / len4;
   2133        s->exp[off + i] = (TXComplex){ RESCALE(cos(alpha) * scale),
   2134                                       RESCALE(sin(alpha) * scale) };
   2135    }
   2136 
   2137    if (pre_tab)
   2138        for (int i = 0; i < len4; i++)
   2139            s->exp[i] = s->exp[len4 + pre_tab[i]];
   2140 
   2141    return 0;
   2142 }
   2143 
   2144 const FFTXCodelet * const TX_NAME(ff_tx_codelet_list)[] = {
   2145    /* Split-Radix codelets */
   2146    &TX_NAME(ff_tx_fft2_ns_def),
   2147    &TX_NAME(ff_tx_fft4_ns_def),
   2148    &TX_NAME(ff_tx_fft8_ns_def),
   2149    &TX_NAME(ff_tx_fft16_ns_def),
   2150    &TX_NAME(ff_tx_fft32_ns_def),
   2151    &TX_NAME(ff_tx_fft64_ns_def),
   2152    &TX_NAME(ff_tx_fft128_ns_def),
   2153    &TX_NAME(ff_tx_fft256_ns_def),
   2154    &TX_NAME(ff_tx_fft512_ns_def),
   2155    &TX_NAME(ff_tx_fft1024_ns_def),
   2156    &TX_NAME(ff_tx_fft2048_ns_def),
   2157    &TX_NAME(ff_tx_fft4096_ns_def),
   2158    &TX_NAME(ff_tx_fft8192_ns_def),
   2159    &TX_NAME(ff_tx_fft16384_ns_def),
   2160    &TX_NAME(ff_tx_fft32768_ns_def),
   2161    &TX_NAME(ff_tx_fft65536_ns_def),
   2162    &TX_NAME(ff_tx_fft131072_ns_def),
   2163    &TX_NAME(ff_tx_fft262144_ns_def),
   2164    &TX_NAME(ff_tx_fft524288_ns_def),
   2165    &TX_NAME(ff_tx_fft1048576_ns_def),
   2166    &TX_NAME(ff_tx_fft2097152_ns_def),
   2167 
   2168    /* Prime factor codelets */
   2169    &TX_NAME(ff_tx_fft3_ns_def),
   2170    &TX_NAME(ff_tx_fft5_ns_def),
   2171    &TX_NAME(ff_tx_fft7_ns_def),
   2172    &TX_NAME(ff_tx_fft9_ns_def),
   2173    &TX_NAME(ff_tx_fft15_ns_def),
   2174 
   2175    /* We get these for free */
   2176    &TX_NAME(ff_tx_fft3_fwd_def),
   2177    &TX_NAME(ff_tx_fft5_fwd_def),
   2178    &TX_NAME(ff_tx_fft7_fwd_def),
   2179    &TX_NAME(ff_tx_fft9_fwd_def),
   2180 
   2181    /* Standalone transforms */
   2182    &TX_NAME(ff_tx_fft_def),
   2183    &TX_NAME(ff_tx_fft_inplace_def),
   2184    &TX_NAME(ff_tx_fft_inplace_small_def),
   2185    &TX_NAME(ff_tx_fft_pfa_def),
   2186    &TX_NAME(ff_tx_fft_pfa_ns_def),
   2187    &TX_NAME(ff_tx_fft_naive_def),
   2188    &TX_NAME(ff_tx_fft_naive_small_def),
   2189    &TX_NAME(ff_tx_mdct_fwd_def),
   2190    &TX_NAME(ff_tx_mdct_inv_def),
   2191    &TX_NAME(ff_tx_mdct_pfa_3xM_fwd_def),
   2192    &TX_NAME(ff_tx_mdct_pfa_5xM_fwd_def),
   2193    &TX_NAME(ff_tx_mdct_pfa_7xM_fwd_def),
   2194    &TX_NAME(ff_tx_mdct_pfa_9xM_fwd_def),
   2195    &TX_NAME(ff_tx_mdct_pfa_15xM_fwd_def),
   2196    &TX_NAME(ff_tx_mdct_pfa_3xM_inv_def),
   2197    &TX_NAME(ff_tx_mdct_pfa_5xM_inv_def),
   2198    &TX_NAME(ff_tx_mdct_pfa_7xM_inv_def),
   2199    &TX_NAME(ff_tx_mdct_pfa_9xM_inv_def),
   2200    &TX_NAME(ff_tx_mdct_pfa_15xM_inv_def),
   2201    &TX_NAME(ff_tx_mdct_naive_fwd_def),
   2202    &TX_NAME(ff_tx_mdct_naive_inv_def),
   2203    &TX_NAME(ff_tx_mdct_inv_full_def),
   2204    &TX_NAME(ff_tx_rdft_r2c_def),
   2205    &TX_NAME(ff_tx_rdft_r2r_def),
   2206    &TX_NAME(ff_tx_rdft_r2r_mod2_def),
   2207    &TX_NAME(ff_tx_rdft_r2i_def),
   2208    &TX_NAME(ff_tx_rdft_r2i_mod2_def),
   2209    &TX_NAME(ff_tx_rdft_c2r_def),
   2210    &TX_NAME(ff_tx_dctII_def),
   2211    &TX_NAME(ff_tx_dctIII_def),
   2212    &TX_NAME(ff_tx_dctI_def),
   2213    &TX_NAME(ff_tx_dstI_def),
   2214 
   2215    NULL,
   2216 };