tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

tx.c (29491B)


      1 /*
      2 * This file is part of FFmpeg.
      3 *
      4 * FFmpeg is free software; you can redistribute it and/or
      5 * modify it under the terms of the GNU Lesser General Public
      6 * License as published by the Free Software Foundation; either
      7 * version 2.1 of the License, or (at your option) any later version.
      8 *
      9 * FFmpeg is distributed in the hope that it will be useful,
     10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     12 * Lesser General Public License for more details.
     13 *
     14 * You should have received a copy of the GNU Lesser General Public
     15 * License along with FFmpeg; if not, write to the Free Software
     16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     17 */
     18 
     19 #include "avassert.h"
     20 #include "intmath.h"
     21 #include "cpu.h"
     22 #include "mem.h"
     23 #include "qsort.h"
     24 #include "bprint.h"
     25 
     26 #include "tx_priv.h"
     27 
     28 #define TYPE_IS(type, x)               \
     29    (((x) == AV_TX_FLOAT_ ## type)  || \
     30     ((x) == AV_TX_DOUBLE_ ## type) || \
     31     ((x) == AV_TX_INT32_ ## type))
     32 
     33 /* Calculates the modular multiplicative inverse */
     34 static av_always_inline int mulinv(int n, int m)
     35 {
     36    n = n % m;
     37    for (int x = 1; x < m; x++)
     38        if (((n * x) % m) == 1)
     39            return x;
     40    av_assert0(0); /* Never reached */
     41    return 0;
     42 }
     43 
     44 int ff_tx_gen_pfa_input_map(AVTXContext *s, FFTXCodeletOptions *opts,
     45                            int d1, int d2)
     46 {
     47    const int sl = d1*d2;
     48 
     49    s->map = av_malloc(s->len*sizeof(*s->map));
     50    if (!s->map)
     51        return AVERROR(ENOMEM);
     52 
     53    for (int k = 0; k < s->len; k += sl) {
     54        if (s->inv || (opts && opts->map_dir == FF_TX_MAP_SCATTER)) {
     55            for (int m = 0; m < d2; m++)
     56                for (int n = 0; n < d1; n++)
     57                    s->map[k + ((m*d1 + n*d2) % (sl))] = m*d1 + n;
     58        } else {
     59            for (int m = 0; m < d2; m++)
     60                for (int n = 0; n < d1; n++)
     61                    s->map[k + m*d1 + n] = (m*d1 + n*d2) % (sl);
     62        }
     63 
     64        if (s->inv)
     65            for (int w = 1; w <= ((sl) >> 1); w++)
     66                FFSWAP(int, s->map[k + w], s->map[k + sl - w]);
     67    }
     68 
     69    s->map_dir = opts ? opts->map_dir : FF_TX_MAP_GATHER;
     70 
     71    return 0;
     72 }
     73 
     74 /* Guaranteed to work for any n, m where gcd(n, m) == 1 */
     75 int ff_tx_gen_compound_mapping(AVTXContext *s, FFTXCodeletOptions *opts,
     76                               int inv, int n, int m)
     77 {
     78    int *in_map, *out_map;
     79    const int len = n*m;    /* Will not be equal to s->len for MDCTs */
     80    int m_inv, n_inv;
     81 
     82    /* Make sure the numbers are coprime */
     83    if (av_gcd(n, m) != 1)
     84        return AVERROR(EINVAL);
     85 
     86    m_inv = mulinv(m, n);
     87    n_inv = mulinv(n, m);
     88 
     89    if (!(s->map = av_malloc(2*len*sizeof(*s->map))))
     90        return AVERROR(ENOMEM);
     91 
     92    in_map  = s->map;
     93    out_map = s->map + len;
     94 
     95    /* Ruritanian map for input, CRT map for output, can be swapped */
     96    if (opts && opts->map_dir == FF_TX_MAP_SCATTER) {
     97        for (int j = 0; j < m; j++) {
     98            for (int i = 0; i < n; i++) {
     99                in_map[(i*m + j*n) % len] = j*n + i;
    100                out_map[(i*m*m_inv + j*n*n_inv) % len] = i*m + j;
    101            }
    102        }
    103    } else {
    104        for (int j = 0; j < m; j++) {
    105            for (int i = 0; i < n; i++) {
    106                in_map[j*n + i] = (i*m + j*n) % len;
    107                out_map[(i*m*m_inv + j*n*n_inv) % len] = i*m + j;
    108            }
    109        }
    110    }
    111 
    112    if (inv) {
    113        for (int i = 0; i < m; i++) {
    114            int *in = &in_map[i*n + 1]; /* Skip the DC */
    115            for (int j = 0; j < ((n - 1) >> 1); j++)
    116                FFSWAP(int, in[j], in[n - j - 2]);
    117        }
    118    }
    119 
    120    s->map_dir = opts ? opts->map_dir : FF_TX_MAP_GATHER;
    121 
    122    return 0;
    123 }
    124 
    125 static inline int split_radix_permutation(int i, int len, int inv)
    126 {
    127    len >>= 1;
    128    if (len <= 1)
    129        return i & 1;
    130    if (!(i & len))
    131        return split_radix_permutation(i, len, inv) * 2;
    132    len >>= 1;
    133    return split_radix_permutation(i, len, inv) * 4 + 1 - 2*(!(i & len) ^ inv);
    134 }
    135 
    136 int ff_tx_gen_ptwo_revtab(AVTXContext *s, FFTXCodeletOptions *opts)
    137 {
    138    int len = s->len;
    139 
    140    if (!(s->map = av_malloc(len*sizeof(*s->map))))
    141        return AVERROR(ENOMEM);
    142 
    143    if (opts && opts->map_dir == FF_TX_MAP_SCATTER) {
    144        for (int i = 0; i < s->len; i++)
    145            s->map[-split_radix_permutation(i, len, s->inv) & (len - 1)] = i;
    146    } else {
    147        for (int i = 0; i < s->len; i++)
    148            s->map[i] = -split_radix_permutation(i, len, s->inv) & (len - 1);
    149    }
    150 
    151    s->map_dir = opts ? opts->map_dir : FF_TX_MAP_GATHER;
    152 
    153    return 0;
    154 }
    155 
    156 int ff_tx_gen_inplace_map(AVTXContext *s, int len)
    157 {
    158    int *src_map, out_map_idx = 0;
    159 
    160    if (!s->sub || !s->sub->map)
    161        return AVERROR(EINVAL);
    162 
    163    if (!(s->map = av_mallocz(len*sizeof(*s->map))))
    164        return AVERROR(ENOMEM);
    165 
    166    src_map = s->sub->map;
    167 
    168    /* The first coefficient is always already in-place */
    169    for (int src = 1; src < s->len; src++) {
    170        int dst = src_map[src];
    171        int found = 0;
    172 
    173        if (dst <= src)
    174            continue;
    175 
    176        /* This just checks if a closed loop has been encountered before,
    177         * and if so, skips it, since to fully permute a loop we must only
    178         * enter it once. */
    179        do {
    180            for (int j = 0; j < out_map_idx; j++) {
    181                if (dst == s->map[j]) {
    182                    found = 1;
    183                    break;
    184                }
    185            }
    186            dst = src_map[dst];
    187        } while (dst != src && !found);
    188 
    189        if (!found)
    190            s->map[out_map_idx++] = src;
    191    }
    192 
    193    s->map[out_map_idx++] = 0;
    194 
    195    return 0;
    196 }
    197 
    198 static void parity_revtab_generator(int *revtab, int n, int inv, int offset,
    199                                    int is_dual, int dual_high, int len,
    200                                    int basis, int dual_stride, int inv_lookup)
    201 {
    202    len >>= 1;
    203 
    204    if (len <= basis) {
    205        int k1, k2, stride, even_idx, odd_idx;
    206 
    207        is_dual = is_dual && dual_stride;
    208        dual_high = is_dual & dual_high;
    209        stride = is_dual ? FFMIN(dual_stride, len) : 0;
    210 
    211        even_idx = offset + dual_high*(stride - 2*len);
    212        odd_idx  = even_idx + len + (is_dual && !dual_high)*len + dual_high*len;
    213 
    214        for (int i = 0; i < len; i++) {
    215            k1 = -split_radix_permutation(offset + i*2 + 0, n, inv) & (n - 1);
    216            k2 = -split_radix_permutation(offset + i*2 + 1, n, inv) & (n - 1);
    217            if (inv_lookup) {
    218                revtab[even_idx++] = k1;
    219                revtab[odd_idx++]  = k2;
    220            } else {
    221                revtab[k1] = even_idx++;
    222                revtab[k2] = odd_idx++;
    223            }
    224            if (stride && !((i + 1) % stride)) {
    225                even_idx += stride;
    226                odd_idx  += stride;
    227            }
    228        }
    229 
    230        return;
    231    }
    232 
    233    parity_revtab_generator(revtab, n, inv, offset,
    234                            0, 0, len >> 0, basis, dual_stride, inv_lookup);
    235    parity_revtab_generator(revtab, n, inv, offset + (len >> 0),
    236                            1, 0, len >> 1, basis, dual_stride, inv_lookup);
    237    parity_revtab_generator(revtab, n, inv, offset + (len >> 0) + (len >> 1),
    238                            1, 1, len >> 1, basis, dual_stride, inv_lookup);
    239 }
    240 
    241 int ff_tx_gen_split_radix_parity_revtab(AVTXContext *s, int len, int inv,
    242                                        FFTXCodeletOptions *opts,
    243                                        int basis, int dual_stride)
    244 {
    245    basis >>= 1;
    246    if (len < basis)
    247        return AVERROR(EINVAL);
    248 
    249    if (!(s->map = av_mallocz(len*sizeof(*s->map))))
    250        return AVERROR(ENOMEM);
    251 
    252    av_assert0(!dual_stride || !(dual_stride & (dual_stride - 1)));
    253    av_assert0(dual_stride <= basis);
    254 
    255    parity_revtab_generator(s->map, len, inv, 0, 0, 0, len,
    256                            basis, dual_stride,
    257                            opts ? opts->map_dir == FF_TX_MAP_GATHER : FF_TX_MAP_GATHER);
    258 
    259    s->map_dir = opts ? opts->map_dir : FF_TX_MAP_GATHER;
    260 
    261    return 0;
    262 }
    263 
    264 static void reset_ctx(AVTXContext *s, int free_sub)
    265 {
    266    if (!s)
    267        return;
    268 
    269    if (s->sub)
    270        for (int i = 0; i < TX_MAX_SUB; i++)
    271            reset_ctx(&s->sub[i], free_sub + 1);
    272 
    273    if (s->cd_self && s->cd_self->uninit)
    274        s->cd_self->uninit(s);
    275 
    276    if (free_sub)
    277        av_freep(&s->sub);
    278 
    279    av_freep(&s->map);
    280    av_freep(&s->exp);
    281    av_freep(&s->tmp);
    282 
    283    /* Nothing else needs to be reset, it gets overwritten if another
    284     * ff_tx_init_subtx() call is made. */
    285    s->nb_sub = 0;
    286    s->opaque = NULL;
    287    memset(s->fn, 0, sizeof(*s->fn));
    288 }
    289 
    290 void ff_tx_clear_ctx(AVTXContext *s)
    291 {
    292    reset_ctx(s, 0);
    293 }
    294 
    295 av_cold void av_tx_uninit(AVTXContext **ctx)
    296 {
    297    if (!(*ctx))
    298        return;
    299 
    300    reset_ctx(*ctx, 1);
    301    av_freep(ctx);
    302 }
    303 
    304 static av_cold int ff_tx_null_init(AVTXContext *s, const FFTXCodelet *cd,
    305                                   uint64_t flags, FFTXCodeletOptions *opts,
    306                                   int len, int inv, const void *scale)
    307 {
    308    /* Can only handle one sample+type to one sample+type transforms */
    309    if (TYPE_IS(MDCT, s->type) || TYPE_IS(RDFT, s->type))
    310        return AVERROR(EINVAL);
    311    return 0;
    312 }
    313 
    314 /* Null transform when the length is 1 */
    315 static void ff_tx_null(AVTXContext *s, void *_out, void *_in, ptrdiff_t stride)
    316 {
    317    memcpy(_out, _in, stride);
    318 }
    319 
    320 static const FFTXCodelet ff_tx_null_def = {
    321    .name       = NULL_IF_CONFIG_SMALL("null"),
    322    .function   = ff_tx_null,
    323    .type       = TX_TYPE_ANY,
    324    .flags      = AV_TX_UNALIGNED | FF_TX_ALIGNED |
    325                  FF_TX_OUT_OF_PLACE | AV_TX_INPLACE,
    326    .factors[0] = TX_FACTOR_ANY,
    327    .min_len    = 1,
    328    .max_len    = 1,
    329    .init       = ff_tx_null_init,
    330    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
    331    .prio       = FF_TX_PRIO_MAX,
    332 };
    333 
    334 static const FFTXCodelet * const ff_tx_null_list[] = {
    335    &ff_tx_null_def,
    336    NULL,
    337 };
    338 
    339 /* Array of all compiled codelet lists. Order is irrelevant. */
    340 static const FFTXCodelet * const * const codelet_list[] = {
    341    ff_tx_codelet_list_float_c,
    342    ff_tx_codelet_list_double_c,
    343    ff_tx_codelet_list_int32_c,
    344    ff_tx_null_list,
    345 #if HAVE_X86ASM
    346    ff_tx_codelet_list_float_x86,
    347 #endif
    348 #if ARCH_AARCH64
    349    ff_tx_codelet_list_float_aarch64,
    350 #endif
    351 };
    352 static const int codelet_list_num = FF_ARRAY_ELEMS(codelet_list);
    353 
    354 static const int cpu_slow_mask = AV_CPU_FLAG_SSE2SLOW | AV_CPU_FLAG_SSE3SLOW  |
    355                                 AV_CPU_FLAG_ATOM     | AV_CPU_FLAG_SSSE3SLOW |
    356                                 AV_CPU_FLAG_AVXSLOW  | AV_CPU_FLAG_SLOW_GATHER;
    357 
    358 static const int cpu_slow_penalties[][2] = {
    359    { AV_CPU_FLAG_SSE2SLOW,    1 + 64  },
    360    { AV_CPU_FLAG_SSE3SLOW,    1 + 64  },
    361    { AV_CPU_FLAG_SSSE3SLOW,   1 + 64  },
    362    { AV_CPU_FLAG_ATOM,        1 + 128 },
    363    { AV_CPU_FLAG_AVXSLOW,     1 + 128 },
    364    { AV_CPU_FLAG_SLOW_GATHER, 1 + 32  },
    365 };
    366 
    367 static int get_codelet_prio(const FFTXCodelet *cd, int cpu_flags, int len)
    368 {
    369    int prio = cd->prio;
    370    int max_factor = 0;
    371 
    372    /* If the CPU has a SLOW flag, and the instruction is also flagged
    373     * as being slow for such, reduce its priority */
    374    for (int i = 0; i < FF_ARRAY_ELEMS(cpu_slow_penalties); i++) {
    375        if ((cpu_flags & cd->cpu_flags) & cpu_slow_penalties[i][0])
    376            prio -= cpu_slow_penalties[i][1];
    377    }
    378 
    379    /* Prioritize aligned-only codelets */
    380    if ((cd->flags & FF_TX_ALIGNED) && !(cd->flags & AV_TX_UNALIGNED))
    381        prio += 64;
    382 
    383    /* Codelets for specific lengths are generally faster */
    384    if ((len == cd->min_len) && (len == cd->max_len))
    385        prio += 64;
    386 
    387    /* Forward-only or inverse-only transforms are generally better */
    388    if ((cd->flags & (FF_TX_FORWARD_ONLY | FF_TX_INVERSE_ONLY)))
    389        prio += 64;
    390 
    391    /* Larger factors are generally better */
    392    for (int i = 0; i < TX_MAX_SUB; i++)
    393        max_factor = FFMAX(cd->factors[i], max_factor);
    394    if (max_factor)
    395        prio += 16*max_factor;
    396 
    397    return prio;
    398 }
    399 
    400 typedef struct FFTXLenDecomp {
    401    int len;
    402    int len2;
    403    int prio;
    404    const FFTXCodelet *cd;
    405 } FFTXLenDecomp;
    406 
    407 static int cmp_decomp(FFTXLenDecomp *a, FFTXLenDecomp *b)
    408 {
    409    return FFDIFFSIGN(b->prio, a->prio);
    410 }
    411 
    412 int ff_tx_decompose_length(int dst[TX_MAX_DECOMPOSITIONS], enum AVTXType type,
    413                           int len, int inv)
    414 {
    415    int nb_decomp = 0;
    416    FFTXLenDecomp ld[TX_MAX_DECOMPOSITIONS];
    417    int codelet_list_idx = codelet_list_num;
    418 
    419    const int cpu_flags = av_get_cpu_flags();
    420 
    421    /* Loop through all codelets in all codelet lists to find matches
    422     * to the requirements */
    423    while (codelet_list_idx--) {
    424        const FFTXCodelet * const * list = codelet_list[codelet_list_idx];
    425        const FFTXCodelet *cd = NULL;
    426 
    427        while ((cd = *list++)) {
    428            int fl = len;
    429            int skip = 0, prio;
    430            int factors_product = 1, factors_mod = 0;
    431 
    432            if (nb_decomp >= TX_MAX_DECOMPOSITIONS)
    433                goto sort;
    434 
    435            /* Check if the type matches */
    436            if (cd->type != TX_TYPE_ANY && type != cd->type)
    437                continue;
    438 
    439            /* Check direction for non-orthogonal codelets */
    440            if (((cd->flags & FF_TX_FORWARD_ONLY) && inv) ||
    441                ((cd->flags & (FF_TX_INVERSE_ONLY | AV_TX_FULL_IMDCT)) && !inv) ||
    442                ((cd->flags & (FF_TX_FORWARD_ONLY | AV_TX_REAL_TO_REAL)) && inv) ||
    443                ((cd->flags & (FF_TX_FORWARD_ONLY | AV_TX_REAL_TO_IMAGINARY)) && inv))
    444                continue;
    445 
    446            /* Check if the CPU supports the required ISA */
    447            if (cd->cpu_flags != FF_TX_CPU_FLAGS_ALL &&
    448                !(cpu_flags & (cd->cpu_flags & ~cpu_slow_mask)))
    449                continue;
    450 
    451            for (int i = 0; i < TX_MAX_FACTORS; i++) {
    452                if (!cd->factors[i] || (fl == 1))
    453                    break;
    454 
    455                if (cd->factors[i] == TX_FACTOR_ANY) {
    456                    factors_mod++;
    457                    factors_product *= fl;
    458                } else if (!(fl % cd->factors[i])) {
    459                    factors_mod++;
    460                    if (cd->factors[i] == 2) {
    461                        int b = ff_ctz(fl);
    462                        fl >>= b;
    463                        factors_product <<= b;
    464                    } else {
    465                        do {
    466                            fl /= cd->factors[i];
    467                            factors_product *= cd->factors[i];
    468                        } while (!(fl % cd->factors[i]));
    469                    }
    470                }
    471            }
    472 
    473            /* Disqualify if factor requirements are not satisfied or if trivial */
    474            if ((factors_mod < cd->nb_factors) || (len == factors_product))
    475                continue;
    476 
    477            if (av_gcd(factors_product, fl) != 1)
    478                continue;
    479 
    480            /* Check if length is supported and factorization was successful */
    481            if ((factors_product < cd->min_len) ||
    482                (cd->max_len != TX_LEN_UNLIMITED && (factors_product > cd->max_len)))
    483                continue;
    484 
    485            prio = get_codelet_prio(cd, cpu_flags, factors_product) * factors_product;
    486 
    487            /* Check for duplicates */
    488            for (int i = 0; i < nb_decomp; i++) {
    489                if (factors_product == ld[i].len) {
    490                    /* Update priority if new one is higher */
    491                    if (prio > ld[i].prio)
    492                        ld[i].prio = prio;
    493                    skip = 1;
    494                    break;
    495                }
    496            }
    497 
    498            /* Add decomposition if unique */
    499            if (!skip) {
    500                ld[nb_decomp].cd = cd;
    501                ld[nb_decomp].len = factors_product;
    502                ld[nb_decomp].len2 = fl;
    503                ld[nb_decomp].prio = prio;
    504                nb_decomp++;
    505            }
    506        }
    507    }
    508 
    509    if (!nb_decomp)
    510        return AVERROR(EINVAL);
    511 
    512 sort:
    513    AV_QSORT(ld, nb_decomp, FFTXLenDecomp, cmp_decomp);
    514 
    515    for (int i = 0; i < nb_decomp; i++) {
    516        if (ld[i].cd->nb_factors > 1)
    517            dst[i] = ld[i].len2;
    518        else
    519            dst[i] = ld[i].len;
    520    }
    521 
    522    return nb_decomp;
    523 }
    524 
    525 int ff_tx_gen_default_map(AVTXContext *s, FFTXCodeletOptions *opts)
    526 {
    527    s->map = av_malloc(s->len*sizeof(*s->map));
    528    if (!s->map)
    529        return AVERROR(ENOMEM);
    530 
    531    s->map[0] = 0; /* DC is always at the start */
    532    if (s->inv) /* Reversing the ACs flips the transform direction */
    533        for (int i = 1; i < s->len; i++)
    534            s->map[i] = s->len - i;
    535    else
    536        for (int i = 1; i < s->len; i++)
    537            s->map[i] = i;
    538 
    539    s->map_dir = FF_TX_MAP_GATHER;
    540 
    541    return 0;
    542 }
    543 
    544 #if !CONFIG_SMALL
    545 static void print_flags(AVBPrint *bp, uint64_t f)
    546 {
    547    int prev = 0;
    548    const char *sep = ", ";
    549    av_bprintf(bp, "flags: [");
    550    if ((f & FF_TX_ALIGNED) && ++prev)
    551        av_bprintf(bp, "aligned");
    552    if ((f & AV_TX_UNALIGNED) && ++prev)
    553        av_bprintf(bp, "%sunaligned", prev > 1 ? sep : "");
    554    if ((f & AV_TX_INPLACE) && ++prev)
    555        av_bprintf(bp, "%sinplace", prev > 1 ? sep : "");
    556    if ((f & FF_TX_OUT_OF_PLACE) && ++prev)
    557        av_bprintf(bp, "%sout_of_place", prev > 1 ? sep : "");
    558    if ((f & FF_TX_FORWARD_ONLY) && ++prev)
    559        av_bprintf(bp, "%sfwd_only", prev > 1 ? sep : "");
    560    if ((f & FF_TX_INVERSE_ONLY) && ++prev)
    561        av_bprintf(bp, "%sinv_only", prev > 1 ? sep : "");
    562    if ((f & FF_TX_PRESHUFFLE) && ++prev)
    563        av_bprintf(bp, "%spreshuf", prev > 1 ? sep : "");
    564    if ((f & AV_TX_FULL_IMDCT) && ++prev)
    565        av_bprintf(bp, "%simdct_full", prev > 1 ? sep : "");
    566    if ((f & AV_TX_REAL_TO_REAL) && ++prev)
    567        av_bprintf(bp, "%sreal_to_real", prev > 1 ? sep : "");
    568    if ((f & AV_TX_REAL_TO_IMAGINARY) && ++prev)
    569        av_bprintf(bp, "%sreal_to_imaginary", prev > 1 ? sep : "");
    570    if ((f & FF_TX_ASM_CALL) && ++prev)
    571        av_bprintf(bp, "%sasm_call", prev > 1 ? sep : "");
    572    av_bprintf(bp, "]");
    573 }
    574 
    575 static void print_type(AVBPrint *bp, enum AVTXType type)
    576 {
    577    av_bprintf(bp, "%s",
    578               type == TX_TYPE_ANY       ? "any"         :
    579               type == AV_TX_FLOAT_FFT   ? "fft_float"   :
    580               type == AV_TX_FLOAT_MDCT  ? "mdct_float"  :
    581               type == AV_TX_FLOAT_RDFT  ? "rdft_float"  :
    582               type == AV_TX_FLOAT_DCT_I ? "dctI_float"  :
    583               type == AV_TX_FLOAT_DST_I ? "dstI_float"  :
    584               type == AV_TX_DOUBLE_FFT  ? "fft_double"  :
    585               type == AV_TX_DOUBLE_MDCT ? "mdct_double" :
    586               type == AV_TX_DOUBLE_RDFT ? "rdft_double" :
    587               type == AV_TX_DOUBLE_DCT_I ? "dctI_double" :
    588               type == AV_TX_DOUBLE_DST_I ? "dstI_double" :
    589               type == AV_TX_INT32_FFT   ? "fft_int32"   :
    590               type == AV_TX_INT32_MDCT  ? "mdct_int32"  :
    591               type == AV_TX_INT32_RDFT  ? "rdft_int32"  :
    592               type == AV_TX_INT32_DCT_I ? "dctI_int32" :
    593               type == AV_TX_INT32_DST_I ? "dstI_int32" :
    594               "unknown");
    595 }
    596 
    597 static void print_cd_info(const FFTXCodelet *cd, int prio, int len, int print_prio,
    598                          int log_level)
    599 {
    600    AVBPrint bp;
    601    av_bprint_init(&bp, 0, AV_BPRINT_SIZE_AUTOMATIC);
    602 
    603    av_bprintf(&bp, "%s - type: ", cd->name);
    604 
    605    print_type(&bp, cd->type);
    606 
    607    av_bprintf(&bp, ", len: ");
    608    if (!len) {
    609        if (cd->min_len != cd->max_len)
    610            av_bprintf(&bp, "[%i, ", cd->min_len);
    611 
    612        if (cd->max_len == TX_LEN_UNLIMITED)
    613            av_bprintf(&bp, "unlimited");
    614        else
    615            av_bprintf(&bp, "%i", cd->max_len);
    616    } else {
    617        av_bprintf(&bp, "%i", len);
    618    }
    619 
    620    if (cd->factors[1]) {
    621        av_bprintf(&bp, "%s, factors", !len && cd->min_len != cd->max_len ? "]" : "");
    622        if (!cd->nb_factors)
    623            av_bprintf(&bp, ": [");
    624        else
    625            av_bprintf(&bp, "[%i]: [", cd->nb_factors);
    626 
    627        for (int i = 0; i < TX_MAX_FACTORS; i++) {
    628            if (i && cd->factors[i])
    629                av_bprintf(&bp, ", ");
    630            if (cd->factors[i] == TX_FACTOR_ANY)
    631                av_bprintf(&bp, "any");
    632            else if (cd->factors[i])
    633                av_bprintf(&bp, "%i", cd->factors[i]);
    634            else
    635                break;
    636        }
    637 
    638        av_bprintf(&bp, "], ");
    639    } else {
    640        av_bprintf(&bp, "%s, factor: %i, ",
    641                   !len && cd->min_len != cd->max_len ? "]" : "", cd->factors[0]);
    642    }
    643    print_flags(&bp, cd->flags);
    644 
    645    if (print_prio)
    646        av_bprintf(&bp, ", prio: %i", prio);
    647 
    648    av_log(NULL, log_level, "%s\n", bp.str);
    649 }
    650 
    651 static void print_tx_structure(AVTXContext *s, int depth)
    652 {
    653    const FFTXCodelet *cd = s->cd_self;
    654 
    655    for (int i = 0; i <= depth; i++)
    656        av_log(NULL, AV_LOG_DEBUG, "    ");
    657 
    658    print_cd_info(cd, cd->prio, s->len, 0, AV_LOG_DEBUG);
    659 
    660    for (int i = 0; i < s->nb_sub; i++)
    661        print_tx_structure(&s->sub[i], depth + 1);
    662 }
    663 #endif /* CONFIG_SMALL */
    664 
    665 typedef struct TXCodeletMatch {
    666    const FFTXCodelet *cd;
    667    int prio;
    668 } TXCodeletMatch;
    669 
    670 static int cmp_matches(TXCodeletMatch *a, TXCodeletMatch *b)
    671 {
    672    return FFDIFFSIGN(b->prio, a->prio);
    673 }
    674 
    675 /* We want all factors to completely cover the length */
    676 static inline int check_cd_factors(const FFTXCodelet *cd, int len)
    677 {
    678    int matches = 0, any_flag = 0;
    679 
    680    for (int i = 0; i < TX_MAX_FACTORS; i++) {
    681        int factor = cd->factors[i];
    682 
    683        if (factor == TX_FACTOR_ANY) {
    684            any_flag = 1;
    685            matches++;
    686            continue;
    687        } else if (len <= 1 || !factor) {
    688            break;
    689        } else if (factor == 2) { /* Fast path */
    690            int bits_2 = ff_ctz(len);
    691            if (!bits_2)
    692                continue; /* Factor not supported */
    693 
    694            len >>= bits_2;
    695            matches++;
    696        } else {
    697            int res = len % factor;
    698            if (res)
    699                continue; /* Factor not supported */
    700 
    701            while (!res) {
    702                len /= factor;
    703                res = len % factor;
    704            }
    705            matches++;
    706        }
    707    }
    708 
    709    return (cd->nb_factors <= matches) && (any_flag || len == 1);
    710 }
    711 
    712 av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type,
    713                             uint64_t flags, FFTXCodeletOptions *opts,
    714                             int len, int inv, const void *scale)
    715 {
    716    int ret = 0;
    717    AVTXContext *sub = NULL;
    718    TXCodeletMatch *cd_tmp, *cd_matches = NULL;
    719    unsigned int cd_matches_size = 0;
    720    int codelet_list_idx = codelet_list_num;
    721    int nb_cd_matches = 0;
    722 #if !CONFIG_SMALL
    723    AVBPrint bp;
    724 #endif
    725 
    726    /* We still accept functions marked with SLOW, even if the CPU is
    727     * marked with the same flag, but we give them lower priority. */
    728    const int cpu_flags = av_get_cpu_flags();
    729 
    730    /* Flags the transform wants */
    731    uint64_t req_flags = flags;
    732 
    733    /* Flags the codelet may require to be present */
    734    uint64_t inv_req_mask = AV_TX_FULL_IMDCT |
    735                            AV_TX_REAL_TO_REAL |
    736                            AV_TX_REAL_TO_IMAGINARY |
    737                            FF_TX_PRESHUFFLE |
    738                            FF_TX_ASM_CALL;
    739 
    740    /* Unaligned codelets are compatible with the aligned flag */
    741    if (req_flags & FF_TX_ALIGNED)
    742        req_flags |= AV_TX_UNALIGNED;
    743 
    744    /* If either flag is set, both are okay, so don't check for an exact match */
    745    if ((req_flags & AV_TX_INPLACE) && (req_flags & FF_TX_OUT_OF_PLACE))
    746        req_flags &= ~(AV_TX_INPLACE | FF_TX_OUT_OF_PLACE);
    747    if ((req_flags & FF_TX_ALIGNED) && (req_flags & AV_TX_UNALIGNED))
    748        req_flags &= ~(FF_TX_ALIGNED | AV_TX_UNALIGNED);
    749 
    750    /* Loop through all codelets in all codelet lists to find matches
    751     * to the requirements */
    752    while (codelet_list_idx--) {
    753        const FFTXCodelet * const * list = codelet_list[codelet_list_idx];
    754        const FFTXCodelet *cd = NULL;
    755 
    756        while ((cd = *list++)) {
    757            /* Check if the type matches */
    758            if (cd->type != TX_TYPE_ANY && type != cd->type)
    759                continue;
    760 
    761            /* Check direction for non-orthogonal codelets */
    762            if (((cd->flags & FF_TX_FORWARD_ONLY) && inv) ||
    763                ((cd->flags & (FF_TX_INVERSE_ONLY | AV_TX_FULL_IMDCT)) && !inv) ||
    764                ((cd->flags & (FF_TX_FORWARD_ONLY | AV_TX_REAL_TO_REAL)) && inv) ||
    765                ((cd->flags & (FF_TX_FORWARD_ONLY | AV_TX_REAL_TO_IMAGINARY)) && inv))
    766                continue;
    767 
    768            /* Check if the requested flags match from both sides */
    769            if (((req_flags    & cd->flags) != (req_flags)) ||
    770                ((inv_req_mask & cd->flags) != (req_flags & inv_req_mask)))
    771                continue;
    772 
    773            /* Check if length is supported */
    774            if ((len < cd->min_len) || (cd->max_len != -1 && (len > cd->max_len)))
    775                continue;
    776 
    777            /* Check if the CPU supports the required ISA */
    778            if (cd->cpu_flags != FF_TX_CPU_FLAGS_ALL &&
    779                !(cpu_flags & (cd->cpu_flags & ~cpu_slow_mask)))
    780                continue;
    781 
    782            /* Check for factors */
    783            if (!check_cd_factors(cd, len))
    784                continue;
    785 
    786            /* Realloc array and append */
    787            cd_tmp = av_fast_realloc(cd_matches, &cd_matches_size,
    788                                     sizeof(*cd_tmp) * (nb_cd_matches + 1));
    789            if (!cd_tmp) {
    790                av_free(cd_matches);
    791                return AVERROR(ENOMEM);
    792            }
    793 
    794            cd_matches                     = cd_tmp;
    795            cd_matches[nb_cd_matches].cd   = cd;
    796            cd_matches[nb_cd_matches].prio = get_codelet_prio(cd, cpu_flags, len);
    797            nb_cd_matches++;
    798        }
    799    }
    800 
    801 #if !CONFIG_SMALL
    802    /* Print debugging info */
    803    av_bprint_init(&bp, 0, AV_BPRINT_SIZE_AUTOMATIC);
    804    av_bprintf(&bp, "For transform of length %i, %s, ", len,
    805               inv ? "inverse" : "forward");
    806    print_type(&bp, type);
    807    av_bprintf(&bp, ", ");
    808    print_flags(&bp, flags);
    809    av_bprintf(&bp, ", found %i matches%s", nb_cd_matches,
    810               nb_cd_matches ? ":" : ".");
    811 #endif
    812 
    813    /* No matches found */
    814    if (!nb_cd_matches)
    815        return AVERROR(ENOSYS);
    816 
    817    /* Sort the list */
    818    AV_QSORT(cd_matches, nb_cd_matches, TXCodeletMatch, cmp_matches);
    819 
    820 #if !CONFIG_SMALL
    821    av_log(NULL, AV_LOG_TRACE, "%s\n", bp.str);
    822 
    823    for (int i = 0; i < nb_cd_matches; i++) {
    824        av_log(NULL, AV_LOG_TRACE, "    %i: ", i + 1);
    825        print_cd_info(cd_matches[i].cd, cd_matches[i].prio, 0, 1, AV_LOG_TRACE);
    826    }
    827 #endif
    828 
    829    if (!s->sub) {
    830        s->sub = sub = av_mallocz(TX_MAX_SUB*sizeof(*sub));
    831        if (!sub) {
    832            ret = AVERROR(ENOMEM);
    833            goto end;
    834        }
    835    }
    836 
    837    /* Attempt to initialize each */
    838    for (int i = 0; i < nb_cd_matches; i++) {
    839        const FFTXCodelet *cd = cd_matches[i].cd;
    840        AVTXContext *sctx = &s->sub[s->nb_sub];
    841 
    842        sctx->len        = len;
    843        sctx->inv        = inv;
    844        sctx->type       = type;
    845        sctx->flags      = cd->flags | flags;
    846        sctx->cd_self    = cd;
    847 
    848        s->fn[s->nb_sub] = cd->function;
    849        s->cd[s->nb_sub] = cd;
    850 
    851        ret = 0;
    852        if (cd->init)
    853            ret = cd->init(sctx, cd, flags, opts, len, inv, scale);
    854 
    855        if (ret >= 0) {
    856            if (opts && opts->map_dir != FF_TX_MAP_NONE &&
    857                sctx->map_dir == FF_TX_MAP_NONE) {
    858                /* If a specific map direction was requested, and it doesn't
    859                 * exist, create one.*/
    860                sctx->map = av_malloc(len*sizeof(*sctx->map));
    861                if (!sctx->map) {
    862                    ret = AVERROR(ENOMEM);
    863                    goto end;
    864                }
    865 
    866                for (int i = 0; i < len; i++)
    867                    sctx->map[i] = i;
    868            } else if (opts && (opts->map_dir != sctx->map_dir)) {
    869                int *tmp = av_malloc(len*sizeof(*sctx->map));
    870                if (!tmp) {
    871                    ret = AVERROR(ENOMEM);
    872                    goto end;
    873                }
    874 
    875                memcpy(tmp, sctx->map, len*sizeof(*sctx->map));
    876 
    877                for (int i = 0; i < len; i++)
    878                    sctx->map[tmp[i]] = i;
    879 
    880                av_free(tmp);
    881            }
    882 
    883            s->nb_sub++;
    884            goto end;
    885        }
    886 
    887        s->fn[s->nb_sub] = NULL;
    888        s->cd[s->nb_sub] = NULL;
    889 
    890        reset_ctx(sctx, 0);
    891        if (ret == AVERROR(ENOMEM))
    892            break;
    893    }
    894 
    895    if (!s->nb_sub)
    896        av_freep(&s->sub);
    897 
    898 end:
    899    av_free(cd_matches);
    900    return ret;
    901 }
    902 
    903 av_cold int av_tx_init(AVTXContext **ctx, av_tx_fn *tx, enum AVTXType type,
    904                       int inv, int len, const void *scale, uint64_t flags)
    905 {
    906    int ret;
    907    AVTXContext tmp = { 0 };
    908    const double default_scale_d = 1.0;
    909    const float  default_scale_f = 1.0f;
    910 
    911    if (!len || type >= AV_TX_NB || !ctx || !tx)
    912        return AVERROR(EINVAL);
    913 
    914    if (!(flags & AV_TX_UNALIGNED))
    915        flags |= FF_TX_ALIGNED;
    916    if (!(flags & AV_TX_INPLACE))
    917        flags |= FF_TX_OUT_OF_PLACE;
    918 
    919    if (!scale && ((type == AV_TX_DOUBLE_MDCT) || (type == AV_TX_DOUBLE_DCT) ||
    920                   (type == AV_TX_DOUBLE_DCT_I) || (type == AV_TX_DOUBLE_DST_I) ||
    921                   (type == AV_TX_DOUBLE_RDFT)))
    922        scale = &default_scale_d;
    923    else if (!scale && !TYPE_IS(FFT, type))
    924        scale = &default_scale_f;
    925 
    926    ret = ff_tx_init_subtx(&tmp, type, flags, NULL, len, inv, scale);
    927    if (ret < 0)
    928        return ret;
    929 
    930    *ctx = &tmp.sub[0];
    931    *tx  = tmp.fn[0];
    932 
    933 #if !CONFIG_SMALL
    934    av_log(NULL, AV_LOG_DEBUG, "Transform tree:\n");
    935    print_tx_structure(*ctx, 0);
    936 #endif
    937 
    938    return ret;
    939 }