tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

tx_float_init.c (13321B)


      1 /*
      2 * This file is part of FFmpeg.
      3 *
      4 * FFmpeg is free software; you can redistribute it and/or
      5 * modify it under the terms of the GNU Lesser General Public
      6 * License as published by the Free Software Foundation; either
      7 * version 2.1 of the License, or (at your option) any later version.
      8 *
      9 * FFmpeg is distributed in the hope that it will be useful,
     10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     12 * Lesser General Public License for more details.
     13 *
     14 * You should have received a copy of the GNU Lesser General Public
     15 * License along with FFmpeg; if not, write to the Free Software
     16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     17 */
     18 
     19 #define TX_FLOAT
     20 #include "libavutil/tx_priv.h"
     21 #include "libavutil/attributes.h"
     22 #include "libavutil/mem.h"
     23 #include "libavutil/x86/cpu.h"
     24 
     25 #include "config.h"
     26 
     27 TX_DECL_FN(fft2,      sse3)
     28 TX_DECL_FN(fft4_fwd,  sse2)
     29 TX_DECL_FN(fft4_inv,  sse2)
     30 TX_DECL_FN(fft8,      sse3)
     31 TX_DECL_FN(fft8_ns,   sse3)
     32 TX_DECL_FN(fft8,      avx)
     33 TX_DECL_FN(fft8_ns,   avx)
     34 TX_DECL_FN(fft15,     avx2)
     35 TX_DECL_FN(fft15_ns,  avx2)
     36 TX_DECL_FN(fft16,     avx)
     37 TX_DECL_FN(fft16_ns,  avx)
     38 TX_DECL_FN(fft16,     fma3)
     39 TX_DECL_FN(fft16_ns,  fma3)
     40 TX_DECL_FN(fft32,     avx)
     41 TX_DECL_FN(fft32_ns,  avx)
     42 TX_DECL_FN(fft32,     fma3)
     43 TX_DECL_FN(fft32_ns,  fma3)
     44 TX_DECL_FN(fft_sr,    avx)
     45 TX_DECL_FN(fft_sr_ns, avx)
     46 TX_DECL_FN(fft_sr,    fma3)
     47 TX_DECL_FN(fft_sr_ns, fma3)
     48 TX_DECL_FN(fft_sr,    avx2)
     49 TX_DECL_FN(fft_sr_ns, avx2)
     50 
     51 TX_DECL_FN(fft_pfa_15xM, avx2)
     52 TX_DECL_FN(fft_pfa_15xM_ns, avx2)
     53 
     54 TX_DECL_FN(mdct_inv, avx2)
     55 
     56 TX_DECL_FN(fft2_asm, sse3)
     57 TX_DECL_FN(fft4_fwd_asm, sse2)
     58 TX_DECL_FN(fft4_inv_asm, sse2)
     59 TX_DECL_FN(fft8_asm, sse3)
     60 TX_DECL_FN(fft8_asm, avx)
     61 TX_DECL_FN(fft16_asm, avx)
     62 TX_DECL_FN(fft16_asm, fma3)
     63 TX_DECL_FN(fft32_asm, avx)
     64 TX_DECL_FN(fft32_asm, fma3)
     65 TX_DECL_FN(fft_sr_asm, avx)
     66 TX_DECL_FN(fft_sr_asm, fma3)
     67 TX_DECL_FN(fft_sr_asm, avx2)
     68 
     69 TX_DECL_FN(fft_pfa_15xM_asm, avx2)
     70 
     71 #define DECL_INIT_FN(basis, interleave)                                        \
     72 static av_cold int b ##basis## _i ##interleave(AVTXContext *s,                 \
     73                                               const FFTXCodelet *cd,          \
     74                                               uint64_t flags,                 \
     75                                               FFTXCodeletOptions *opts,       \
     76                                               int len, int inv,               \
     77                                               const void *scale)              \
     78 {                                                                              \
     79    ff_tx_init_tabs_float(len);                                                \
     80    if (cd->max_len == 2)                                                      \
     81        return ff_tx_gen_ptwo_revtab(s, opts);                                 \
     82    else                                                                       \
     83        return ff_tx_gen_split_radix_parity_revtab(s, len, inv, opts,          \
     84                                                   basis, interleave);         \
     85 }
     86 
     87 DECL_INIT_FN(8, 0)
     88 DECL_INIT_FN(8, 2)
     89 
     90 static av_cold int factor_init(AVTXContext *s, const FFTXCodelet *cd,
     91                               uint64_t flags, FFTXCodeletOptions *opts,
     92                               int len, int inv, const void *scale)
     93 {
     94    int ret;
     95 
     96    /* The transformations below are performed in the gather domain,
     97     * so override the option and let the infrastructure convert the map
     98     * to SCATTER if needed. */
     99    FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER };
    100 
    101    TX_TAB(ff_tx_init_tabs)(len);
    102 
    103    if (len == 15)
    104        ret = ff_tx_gen_pfa_input_map(s, &sub_opts, 3, 5);
    105    else
    106        ret = ff_tx_gen_default_map(s, &sub_opts);
    107 
    108    if (ret < 0)
    109        return ret;
    110 
    111    if (len == 15) {
    112        int cnt = 0, tmp[15];
    113 
    114        /* Special permutation to simplify loads in the pre-permuted version */
    115        memcpy(tmp, s->map, 15*sizeof(*tmp));
    116        for (int i = 1; i < 15; i += 3) {
    117            s->map[cnt] = tmp[i];
    118            cnt++;
    119        }
    120        for (int i = 2; i < 15; i += 3) {
    121            s->map[cnt] = tmp[i];
    122            cnt++;
    123        }
    124        for (int i = 0; i < 15; i += 3) {
    125            s->map[cnt] = tmp[i];
    126            cnt++;
    127        }
    128        memmove(&s->map[7], &s->map[6], 4*sizeof(int));
    129        memmove(&s->map[3], &s->map[1], 4*sizeof(int));
    130        s->map[1] = tmp[2];
    131        s->map[2] = tmp[0];
    132    }
    133 
    134    return 0;
    135 }
    136 
    137 static av_cold int m_inv_init(AVTXContext *s, const FFTXCodelet *cd,
    138                              uint64_t flags, FFTXCodeletOptions *opts,
    139                              int len, int inv, const void *scale)
    140 {
    141    int ret;
    142    FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER };
    143 
    144    s->scale_d = *((SCALE_TYPE *)scale);
    145    s->scale_f = s->scale_d;
    146 
    147    flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
    148    flags |=  AV_TX_INPLACE;      /* in-place */
    149    flags |=  FF_TX_PRESHUFFLE;   /* This function handles the permute step */
    150    flags |=  FF_TX_ASM_CALL;     /* We want an assembly function, not C */
    151 
    152    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1,
    153                                inv, scale)))
    154        return ret;
    155 
    156    s->map = av_malloc(len*sizeof(*s->map));
    157    if (!s->map)
    158        return AVERROR(ENOMEM);
    159 
    160    memcpy(s->map, s->sub->map, (len >> 1)*sizeof(*s->map));
    161    /* Invert lookup table for unstrided path */
    162    for (int i = 0; i < (len >> 1); i++)
    163       s->map[(len >> 1) + s->map[i]] = i;
    164 
    165    if ((ret = ff_tx_mdct_gen_exp_float(s, s->map)))
    166        return ret;
    167 
    168    return 0;
    169 }
    170 
    171 static av_cold int fft_pfa_init(AVTXContext *s,
    172                                const FFTXCodelet *cd,
    173                                uint64_t flags,
    174                                FFTXCodeletOptions *opts,
    175                                int len, int inv,
    176                                const void *scale)
    177 {
    178    int ret;
    179    int sub_len = len / cd->factors[0];
    180    FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_SCATTER };
    181 
    182    flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
    183    flags |=  AV_TX_INPLACE;      /* in-place */
    184    flags |=  FF_TX_PRESHUFFLE;   /* This function handles the permute step */
    185    flags |=  FF_TX_ASM_CALL;     /* We want an assembly function, not C */
    186 
    187    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
    188                                sub_len, inv, scale)))
    189        return ret;
    190 
    191    if ((ret = ff_tx_gen_compound_mapping(s, opts, s->inv, cd->factors[0], sub_len)))
    192        return ret;
    193 
    194    if (cd->factors[0] == 15) {
    195        int tmp[15];
    196 
    197        /* Our 15-point transform is also a compound one, so embed its input map */
    198        TX_EMBED_INPUT_PFA_MAP(s->map, len, 3, 5);
    199 
    200        /* Special permutation to simplify loads in the pre-permuted version */
    201        for (int k = 0; k < s->sub[0].len; k++) {
    202            int cnt = 0;
    203            memcpy(tmp, &s->map[k*15], 15*sizeof(*tmp));
    204            for (int i = 1; i < 15; i += 3) {
    205                s->map[k*15 + cnt] = tmp[i];
    206                cnt++;
    207            }
    208            for (int i = 2; i < 15; i += 3) {
    209                s->map[k*15 + cnt] = tmp[i];
    210                cnt++;
    211            }
    212            for (int i = 0; i < 15; i += 3) {
    213                s->map[k*15 + cnt] = tmp[i];
    214                cnt++;
    215            }
    216            memmove(&s->map[k*15 + 7], &s->map[k*15 + 6], 4*sizeof(int));
    217            memmove(&s->map[k*15 + 3], &s->map[k*15 + 1], 4*sizeof(int));
    218            s->map[k*15 + 1] = tmp[2];
    219            s->map[k*15 + 2] = tmp[0];
    220        }
    221    }
    222 
    223    if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
    224        return AVERROR(ENOMEM);
    225 
    226    TX_TAB(ff_tx_init_tabs)(len / sub_len);
    227 
    228    return 0;
    229 }
    230 
    231 const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = {
    232    TX_DEF(fft2,     FFT,  2,  2, 2, 0, 128, NULL,  sse3, SSE3, AV_TX_INPLACE, 0),
    233    TX_DEF(fft2_asm, FFT,  2,  2, 2, 0, 192, b8_i0, sse3, SSE3,
    234           AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, 0),
    235    TX_DEF(fft2,     FFT,  2,  2, 2, 0, 192, b8_i0, sse3, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
    236    TX_DEF(fft4_fwd, FFT,  4,  4, 2, 0, 128, NULL,  sse2, SSE2, AV_TX_INPLACE | FF_TX_FORWARD_ONLY, 0),
    237    TX_DEF(fft4_fwd_asm, FFT,  4,  4, 2, 0, 192, b8_i0, sse2, SSE2,
    238           AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, 0),
    239    TX_DEF(fft4_inv_asm, FFT,  4,  4, 2, 0, 128, NULL,  sse2, SSE2,
    240           AV_TX_INPLACE | FF_TX_INVERSE_ONLY | FF_TX_ASM_CALL, 0),
    241    TX_DEF(fft4_fwd, FFT,  4,  4, 2, 0, 192, b8_i0, sse2, SSE2, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
    242    TX_DEF(fft4_inv, FFT,  4,  4, 2, 0, 128, NULL,  sse2, SSE2, AV_TX_INPLACE | FF_TX_INVERSE_ONLY, 0),
    243    TX_DEF(fft8,     FFT,  8,  8, 2, 0, 128, b8_i0, sse3, SSE3, AV_TX_INPLACE, 0),
    244    TX_DEF(fft8_asm, FFT,  8,  8, 2, 0, 192, b8_i0, sse3, SSE3,
    245           AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, 0),
    246    TX_DEF(fft8_ns,  FFT,  8,  8, 2, 0, 192, b8_i0, sse3, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
    247    TX_DEF(fft8,     FFT,  8,  8, 2, 0, 256, b8_i0, avx,  AVX,  AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
    248    TX_DEF(fft8_asm, FFT,  8,  8, 2, 0, 320, b8_i0, avx,  AVX,
    249           AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW),
    250    TX_DEF(fft8_ns,  FFT,  8,  8, 2, 0, 320, b8_i0, avx,  AVX,  AV_TX_INPLACE | FF_TX_PRESHUFFLE,
    251           AV_CPU_FLAG_AVXSLOW),
    252    TX_DEF(fft16,    FFT, 16, 16, 2, 0, 256, b8_i2, avx,  AVX,  AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
    253    TX_DEF(fft16_asm, FFT, 16, 16, 2, 0, 320, b8_i2, avx,  AVX,
    254           AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW),
    255    TX_DEF(fft16_ns, FFT, 16, 16, 2, 0, 320, b8_i2, avx,  AVX,  AV_TX_INPLACE | FF_TX_PRESHUFFLE,
    256           AV_CPU_FLAG_AVXSLOW),
    257    TX_DEF(fft16,    FFT, 16, 16, 2, 0, 288, b8_i2, fma3, FMA3, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
    258    TX_DEF(fft16_asm, FFT, 16, 16, 2, 0, 352, b8_i2, fma3, FMA3,
    259           AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW),
    260    TX_DEF(fft16_ns, FFT, 16, 16, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
    261           AV_CPU_FLAG_AVXSLOW),
    262 
    263 #if ARCH_X86_64
    264    TX_DEF(fft32,    FFT, 32, 32, 2, 0, 256, b8_i2, avx,  AVX,  AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
    265    TX_DEF(fft32_asm, FFT, 32, 32, 2, 0, 320, b8_i2, avx,  AVX,
    266           AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW),
    267    TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 320, b8_i2, avx,  AVX,  AV_TX_INPLACE | FF_TX_PRESHUFFLE,
    268           AV_CPU_FLAG_AVXSLOW),
    269    TX_DEF(fft32,    FFT, 32, 32, 2, 0, 288, b8_i2, fma3, FMA3, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
    270    TX_DEF(fft32_asm, FFT, 32, 32, 2, 0, 352, b8_i2, fma3, FMA3,
    271           AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW),
    272    TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
    273           AV_CPU_FLAG_AVXSLOW),
    274    TX_DEF(fft_sr,    FFT, 64, 2097152, 2, 0, 256, b8_i2, avx, AVX,  0, AV_CPU_FLAG_AVXSLOW),
    275    TX_DEF(fft_sr_asm, FFT, 64, 2097152, 2, 0, 320, b8_i2, avx, AVX,
    276           AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW),
    277    TX_DEF(fft_sr_ns, FFT, 64, 2097152, 2, 0, 320, b8_i2, avx, AVX,  AV_TX_INPLACE | FF_TX_PRESHUFFLE,
    278           AV_CPU_FLAG_AVXSLOW),
    279    TX_DEF(fft_sr,    FFT, 64, 2097152, 2, 0, 288, b8_i2, fma3,  FMA3,  0, AV_CPU_FLAG_AVXSLOW),
    280    TX_DEF(fft_sr_asm, FFT, 64, 2097152, 2, 0, 352, b8_i2, fma3,  FMA3,
    281           AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW),
    282    TX_DEF(fft_sr_ns, FFT, 64, 2097152, 2, 0, 352, b8_i2, fma3,  FMA3,  AV_TX_INPLACE | FF_TX_PRESHUFFLE,
    283           AV_CPU_FLAG_AVXSLOW),
    284 
    285    TX_DEF(fft15, FFT, 15, 15, 15, 0, 320, factor_init, avx2, AVX2,
    286           AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
    287    TX_DEF(fft15_ns, FFT, 15, 15, 15, 0, 384, factor_init, avx2, AVX2,
    288           AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW),
    289 
    290    TX_DEF(fft_sr,    FFT, 64, 2097152, 2, 0, 320, b8_i2, avx2, AVX2, 0,
    291           AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
    292    TX_DEF(fft_sr_asm, FFT, 64, 2097152, 2, 0, 384, b8_i2, avx2, AVX2,
    293           AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
    294    TX_DEF(fft_sr_ns, FFT, 64, 2097152, 2, 0, 384, b8_i2, avx2, AVX2, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
    295           AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
    296 
    297    TX_DEF(fft_pfa_15xM, FFT, 60, TX_LEN_UNLIMITED, 15, 2, 320, fft_pfa_init, avx2, AVX2,
    298           AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
    299    TX_DEF(fft_pfa_15xM_asm, FFT, 60, TX_LEN_UNLIMITED, 15, 2, 384, fft_pfa_init, avx2, AVX2,
    300           AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
    301    TX_DEF(fft_pfa_15xM_ns, FFT, 60, TX_LEN_UNLIMITED, 15, 2, 384, fft_pfa_init, avx2, AVX2,
    302           AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
    303 
    304    TX_DEF(mdct_inv, MDCT, 16, TX_LEN_UNLIMITED, 2, TX_FACTOR_ANY, 384, m_inv_init, avx2, AVX2,
    305           FF_TX_INVERSE_ONLY, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER),
    306 #endif
    307 
    308    NULL,
    309 };