tx.c (29491B)
1 /* 2 * This file is part of FFmpeg. 3 * 4 * FFmpeg is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU Lesser General Public 6 * License as published by the Free Software Foundation; either 7 * version 2.1 of the License, or (at your option) any later version. 8 * 9 * FFmpeg is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 * Lesser General Public License for more details. 13 * 14 * You should have received a copy of the GNU Lesser General Public 15 * License along with FFmpeg; if not, write to the Free Software 16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 17 */ 18 19 #include "avassert.h" 20 #include "intmath.h" 21 #include "cpu.h" 22 #include "mem.h" 23 #include "qsort.h" 24 #include "bprint.h" 25 26 #include "tx_priv.h" 27 28 #define TYPE_IS(type, x) \ 29 (((x) == AV_TX_FLOAT_ ## type) || \ 30 ((x) == AV_TX_DOUBLE_ ## type) || \ 31 ((x) == AV_TX_INT32_ ## type)) 32 33 /* Calculates the modular multiplicative inverse */ 34 static av_always_inline int mulinv(int n, int m) 35 { 36 n = n % m; 37 for (int x = 1; x < m; x++) 38 if (((n * x) % m) == 1) 39 return x; 40 av_assert0(0); /* Never reached */ 41 return 0; 42 } 43 44 int ff_tx_gen_pfa_input_map(AVTXContext *s, FFTXCodeletOptions *opts, 45 int d1, int d2) 46 { 47 const int sl = d1*d2; 48 49 s->map = av_malloc(s->len*sizeof(*s->map)); 50 if (!s->map) 51 return AVERROR(ENOMEM); 52 53 for (int k = 0; k < s->len; k += sl) { 54 if (s->inv || (opts && opts->map_dir == FF_TX_MAP_SCATTER)) { 55 for (int m = 0; m < d2; m++) 56 for (int n = 0; n < d1; n++) 57 s->map[k + ((m*d1 + n*d2) % (sl))] = m*d1 + n; 58 } else { 59 for (int m = 0; m < d2; m++) 60 for (int n = 0; n < d1; n++) 61 s->map[k + m*d1 + n] = (m*d1 + n*d2) % (sl); 62 } 63 64 if (s->inv) 65 for (int w = 1; w <= ((sl) >> 1); w++) 66 FFSWAP(int, s->map[k + w], s->map[k + sl - w]); 67 } 68 69 s->map_dir = opts ? opts->map_dir : FF_TX_MAP_GATHER; 70 71 return 0; 72 } 73 74 /* Guaranteed to work for any n, m where gcd(n, m) == 1 */ 75 int ff_tx_gen_compound_mapping(AVTXContext *s, FFTXCodeletOptions *opts, 76 int inv, int n, int m) 77 { 78 int *in_map, *out_map; 79 const int len = n*m; /* Will not be equal to s->len for MDCTs */ 80 int m_inv, n_inv; 81 82 /* Make sure the numbers are coprime */ 83 if (av_gcd(n, m) != 1) 84 return AVERROR(EINVAL); 85 86 m_inv = mulinv(m, n); 87 n_inv = mulinv(n, m); 88 89 if (!(s->map = av_malloc(2*len*sizeof(*s->map)))) 90 return AVERROR(ENOMEM); 91 92 in_map = s->map; 93 out_map = s->map + len; 94 95 /* Ruritanian map for input, CRT map for output, can be swapped */ 96 if (opts && opts->map_dir == FF_TX_MAP_SCATTER) { 97 for (int j = 0; j < m; j++) { 98 for (int i = 0; i < n; i++) { 99 in_map[(i*m + j*n) % len] = j*n + i; 100 out_map[(i*m*m_inv + j*n*n_inv) % len] = i*m + j; 101 } 102 } 103 } else { 104 for (int j = 0; j < m; j++) { 105 for (int i = 0; i < n; i++) { 106 in_map[j*n + i] = (i*m + j*n) % len; 107 out_map[(i*m*m_inv + j*n*n_inv) % len] = i*m + j; 108 } 109 } 110 } 111 112 if (inv) { 113 for (int i = 0; i < m; i++) { 114 int *in = &in_map[i*n + 1]; /* Skip the DC */ 115 for (int j = 0; j < ((n - 1) >> 1); j++) 116 FFSWAP(int, in[j], in[n - j - 2]); 117 } 118 } 119 120 s->map_dir = opts ? opts->map_dir : FF_TX_MAP_GATHER; 121 122 return 0; 123 } 124 125 static inline int split_radix_permutation(int i, int len, int inv) 126 { 127 len >>= 1; 128 if (len <= 1) 129 return i & 1; 130 if (!(i & len)) 131 return split_radix_permutation(i, len, inv) * 2; 132 len >>= 1; 133 return split_radix_permutation(i, len, inv) * 4 + 1 - 2*(!(i & len) ^ inv); 134 } 135 136 int ff_tx_gen_ptwo_revtab(AVTXContext *s, FFTXCodeletOptions *opts) 137 { 138 int len = s->len; 139 140 if (!(s->map = av_malloc(len*sizeof(*s->map)))) 141 return AVERROR(ENOMEM); 142 143 if (opts && opts->map_dir == FF_TX_MAP_SCATTER) { 144 for (int i = 0; i < s->len; i++) 145 s->map[-split_radix_permutation(i, len, s->inv) & (len - 1)] = i; 146 } else { 147 for (int i = 0; i < s->len; i++) 148 s->map[i] = -split_radix_permutation(i, len, s->inv) & (len - 1); 149 } 150 151 s->map_dir = opts ? opts->map_dir : FF_TX_MAP_GATHER; 152 153 return 0; 154 } 155 156 int ff_tx_gen_inplace_map(AVTXContext *s, int len) 157 { 158 int *src_map, out_map_idx = 0; 159 160 if (!s->sub || !s->sub->map) 161 return AVERROR(EINVAL); 162 163 if (!(s->map = av_mallocz(len*sizeof(*s->map)))) 164 return AVERROR(ENOMEM); 165 166 src_map = s->sub->map; 167 168 /* The first coefficient is always already in-place */ 169 for (int src = 1; src < s->len; src++) { 170 int dst = src_map[src]; 171 int found = 0; 172 173 if (dst <= src) 174 continue; 175 176 /* This just checks if a closed loop has been encountered before, 177 * and if so, skips it, since to fully permute a loop we must only 178 * enter it once. */ 179 do { 180 for (int j = 0; j < out_map_idx; j++) { 181 if (dst == s->map[j]) { 182 found = 1; 183 break; 184 } 185 } 186 dst = src_map[dst]; 187 } while (dst != src && !found); 188 189 if (!found) 190 s->map[out_map_idx++] = src; 191 } 192 193 s->map[out_map_idx++] = 0; 194 195 return 0; 196 } 197 198 static void parity_revtab_generator(int *revtab, int n, int inv, int offset, 199 int is_dual, int dual_high, int len, 200 int basis, int dual_stride, int inv_lookup) 201 { 202 len >>= 1; 203 204 if (len <= basis) { 205 int k1, k2, stride, even_idx, odd_idx; 206 207 is_dual = is_dual && dual_stride; 208 dual_high = is_dual & dual_high; 209 stride = is_dual ? FFMIN(dual_stride, len) : 0; 210 211 even_idx = offset + dual_high*(stride - 2*len); 212 odd_idx = even_idx + len + (is_dual && !dual_high)*len + dual_high*len; 213 214 for (int i = 0; i < len; i++) { 215 k1 = -split_radix_permutation(offset + i*2 + 0, n, inv) & (n - 1); 216 k2 = -split_radix_permutation(offset + i*2 + 1, n, inv) & (n - 1); 217 if (inv_lookup) { 218 revtab[even_idx++] = k1; 219 revtab[odd_idx++] = k2; 220 } else { 221 revtab[k1] = even_idx++; 222 revtab[k2] = odd_idx++; 223 } 224 if (stride && !((i + 1) % stride)) { 225 even_idx += stride; 226 odd_idx += stride; 227 } 228 } 229 230 return; 231 } 232 233 parity_revtab_generator(revtab, n, inv, offset, 234 0, 0, len >> 0, basis, dual_stride, inv_lookup); 235 parity_revtab_generator(revtab, n, inv, offset + (len >> 0), 236 1, 0, len >> 1, basis, dual_stride, inv_lookup); 237 parity_revtab_generator(revtab, n, inv, offset + (len >> 0) + (len >> 1), 238 1, 1, len >> 1, basis, dual_stride, inv_lookup); 239 } 240 241 int ff_tx_gen_split_radix_parity_revtab(AVTXContext *s, int len, int inv, 242 FFTXCodeletOptions *opts, 243 int basis, int dual_stride) 244 { 245 basis >>= 1; 246 if (len < basis) 247 return AVERROR(EINVAL); 248 249 if (!(s->map = av_mallocz(len*sizeof(*s->map)))) 250 return AVERROR(ENOMEM); 251 252 av_assert0(!dual_stride || !(dual_stride & (dual_stride - 1))); 253 av_assert0(dual_stride <= basis); 254 255 parity_revtab_generator(s->map, len, inv, 0, 0, 0, len, 256 basis, dual_stride, 257 opts ? opts->map_dir == FF_TX_MAP_GATHER : FF_TX_MAP_GATHER); 258 259 s->map_dir = opts ? opts->map_dir : FF_TX_MAP_GATHER; 260 261 return 0; 262 } 263 264 static void reset_ctx(AVTXContext *s, int free_sub) 265 { 266 if (!s) 267 return; 268 269 if (s->sub) 270 for (int i = 0; i < TX_MAX_SUB; i++) 271 reset_ctx(&s->sub[i], free_sub + 1); 272 273 if (s->cd_self && s->cd_self->uninit) 274 s->cd_self->uninit(s); 275 276 if (free_sub) 277 av_freep(&s->sub); 278 279 av_freep(&s->map); 280 av_freep(&s->exp); 281 av_freep(&s->tmp); 282 283 /* Nothing else needs to be reset, it gets overwritten if another 284 * ff_tx_init_subtx() call is made. */ 285 s->nb_sub = 0; 286 s->opaque = NULL; 287 memset(s->fn, 0, sizeof(*s->fn)); 288 } 289 290 void ff_tx_clear_ctx(AVTXContext *s) 291 { 292 reset_ctx(s, 0); 293 } 294 295 av_cold void av_tx_uninit(AVTXContext **ctx) 296 { 297 if (!(*ctx)) 298 return; 299 300 reset_ctx(*ctx, 1); 301 av_freep(ctx); 302 } 303 304 static av_cold int ff_tx_null_init(AVTXContext *s, const FFTXCodelet *cd, 305 uint64_t flags, FFTXCodeletOptions *opts, 306 int len, int inv, const void *scale) 307 { 308 /* Can only handle one sample+type to one sample+type transforms */ 309 if (TYPE_IS(MDCT, s->type) || TYPE_IS(RDFT, s->type)) 310 return AVERROR(EINVAL); 311 return 0; 312 } 313 314 /* Null transform when the length is 1 */ 315 static void ff_tx_null(AVTXContext *s, void *_out, void *_in, ptrdiff_t stride) 316 { 317 memcpy(_out, _in, stride); 318 } 319 320 static const FFTXCodelet ff_tx_null_def = { 321 .name = NULL_IF_CONFIG_SMALL("null"), 322 .function = ff_tx_null, 323 .type = TX_TYPE_ANY, 324 .flags = AV_TX_UNALIGNED | FF_TX_ALIGNED | 325 FF_TX_OUT_OF_PLACE | AV_TX_INPLACE, 326 .factors[0] = TX_FACTOR_ANY, 327 .min_len = 1, 328 .max_len = 1, 329 .init = ff_tx_null_init, 330 .cpu_flags = FF_TX_CPU_FLAGS_ALL, 331 .prio = FF_TX_PRIO_MAX, 332 }; 333 334 static const FFTXCodelet * const ff_tx_null_list[] = { 335 &ff_tx_null_def, 336 NULL, 337 }; 338 339 /* Array of all compiled codelet lists. Order is irrelevant. */ 340 static const FFTXCodelet * const * const codelet_list[] = { 341 ff_tx_codelet_list_float_c, 342 ff_tx_codelet_list_double_c, 343 ff_tx_codelet_list_int32_c, 344 ff_tx_null_list, 345 #if HAVE_X86ASM 346 ff_tx_codelet_list_float_x86, 347 #endif 348 #if ARCH_AARCH64 349 ff_tx_codelet_list_float_aarch64, 350 #endif 351 }; 352 static const int codelet_list_num = FF_ARRAY_ELEMS(codelet_list); 353 354 static const int cpu_slow_mask = AV_CPU_FLAG_SSE2SLOW | AV_CPU_FLAG_SSE3SLOW | 355 AV_CPU_FLAG_ATOM | AV_CPU_FLAG_SSSE3SLOW | 356 AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER; 357 358 static const int cpu_slow_penalties[][2] = { 359 { AV_CPU_FLAG_SSE2SLOW, 1 + 64 }, 360 { AV_CPU_FLAG_SSE3SLOW, 1 + 64 }, 361 { AV_CPU_FLAG_SSSE3SLOW, 1 + 64 }, 362 { AV_CPU_FLAG_ATOM, 1 + 128 }, 363 { AV_CPU_FLAG_AVXSLOW, 1 + 128 }, 364 { AV_CPU_FLAG_SLOW_GATHER, 1 + 32 }, 365 }; 366 367 static int get_codelet_prio(const FFTXCodelet *cd, int cpu_flags, int len) 368 { 369 int prio = cd->prio; 370 int max_factor = 0; 371 372 /* If the CPU has a SLOW flag, and the instruction is also flagged 373 * as being slow for such, reduce its priority */ 374 for (int i = 0; i < FF_ARRAY_ELEMS(cpu_slow_penalties); i++) { 375 if ((cpu_flags & cd->cpu_flags) & cpu_slow_penalties[i][0]) 376 prio -= cpu_slow_penalties[i][1]; 377 } 378 379 /* Prioritize aligned-only codelets */ 380 if ((cd->flags & FF_TX_ALIGNED) && !(cd->flags & AV_TX_UNALIGNED)) 381 prio += 64; 382 383 /* Codelets for specific lengths are generally faster */ 384 if ((len == cd->min_len) && (len == cd->max_len)) 385 prio += 64; 386 387 /* Forward-only or inverse-only transforms are generally better */ 388 if ((cd->flags & (FF_TX_FORWARD_ONLY | FF_TX_INVERSE_ONLY))) 389 prio += 64; 390 391 /* Larger factors are generally better */ 392 for (int i = 0; i < TX_MAX_SUB; i++) 393 max_factor = FFMAX(cd->factors[i], max_factor); 394 if (max_factor) 395 prio += 16*max_factor; 396 397 return prio; 398 } 399 400 typedef struct FFTXLenDecomp { 401 int len; 402 int len2; 403 int prio; 404 const FFTXCodelet *cd; 405 } FFTXLenDecomp; 406 407 static int cmp_decomp(FFTXLenDecomp *a, FFTXLenDecomp *b) 408 { 409 return FFDIFFSIGN(b->prio, a->prio); 410 } 411 412 int ff_tx_decompose_length(int dst[TX_MAX_DECOMPOSITIONS], enum AVTXType type, 413 int len, int inv) 414 { 415 int nb_decomp = 0; 416 FFTXLenDecomp ld[TX_MAX_DECOMPOSITIONS]; 417 int codelet_list_idx = codelet_list_num; 418 419 const int cpu_flags = av_get_cpu_flags(); 420 421 /* Loop through all codelets in all codelet lists to find matches 422 * to the requirements */ 423 while (codelet_list_idx--) { 424 const FFTXCodelet * const * list = codelet_list[codelet_list_idx]; 425 const FFTXCodelet *cd = NULL; 426 427 while ((cd = *list++)) { 428 int fl = len; 429 int skip = 0, prio; 430 int factors_product = 1, factors_mod = 0; 431 432 if (nb_decomp >= TX_MAX_DECOMPOSITIONS) 433 goto sort; 434 435 /* Check if the type matches */ 436 if (cd->type != TX_TYPE_ANY && type != cd->type) 437 continue; 438 439 /* Check direction for non-orthogonal codelets */ 440 if (((cd->flags & FF_TX_FORWARD_ONLY) && inv) || 441 ((cd->flags & (FF_TX_INVERSE_ONLY | AV_TX_FULL_IMDCT)) && !inv) || 442 ((cd->flags & (FF_TX_FORWARD_ONLY | AV_TX_REAL_TO_REAL)) && inv) || 443 ((cd->flags & (FF_TX_FORWARD_ONLY | AV_TX_REAL_TO_IMAGINARY)) && inv)) 444 continue; 445 446 /* Check if the CPU supports the required ISA */ 447 if (cd->cpu_flags != FF_TX_CPU_FLAGS_ALL && 448 !(cpu_flags & (cd->cpu_flags & ~cpu_slow_mask))) 449 continue; 450 451 for (int i = 0; i < TX_MAX_FACTORS; i++) { 452 if (!cd->factors[i] || (fl == 1)) 453 break; 454 455 if (cd->factors[i] == TX_FACTOR_ANY) { 456 factors_mod++; 457 factors_product *= fl; 458 } else if (!(fl % cd->factors[i])) { 459 factors_mod++; 460 if (cd->factors[i] == 2) { 461 int b = ff_ctz(fl); 462 fl >>= b; 463 factors_product <<= b; 464 } else { 465 do { 466 fl /= cd->factors[i]; 467 factors_product *= cd->factors[i]; 468 } while (!(fl % cd->factors[i])); 469 } 470 } 471 } 472 473 /* Disqualify if factor requirements are not satisfied or if trivial */ 474 if ((factors_mod < cd->nb_factors) || (len == factors_product)) 475 continue; 476 477 if (av_gcd(factors_product, fl) != 1) 478 continue; 479 480 /* Check if length is supported and factorization was successful */ 481 if ((factors_product < cd->min_len) || 482 (cd->max_len != TX_LEN_UNLIMITED && (factors_product > cd->max_len))) 483 continue; 484 485 prio = get_codelet_prio(cd, cpu_flags, factors_product) * factors_product; 486 487 /* Check for duplicates */ 488 for (int i = 0; i < nb_decomp; i++) { 489 if (factors_product == ld[i].len) { 490 /* Update priority if new one is higher */ 491 if (prio > ld[i].prio) 492 ld[i].prio = prio; 493 skip = 1; 494 break; 495 } 496 } 497 498 /* Add decomposition if unique */ 499 if (!skip) { 500 ld[nb_decomp].cd = cd; 501 ld[nb_decomp].len = factors_product; 502 ld[nb_decomp].len2 = fl; 503 ld[nb_decomp].prio = prio; 504 nb_decomp++; 505 } 506 } 507 } 508 509 if (!nb_decomp) 510 return AVERROR(EINVAL); 511 512 sort: 513 AV_QSORT(ld, nb_decomp, FFTXLenDecomp, cmp_decomp); 514 515 for (int i = 0; i < nb_decomp; i++) { 516 if (ld[i].cd->nb_factors > 1) 517 dst[i] = ld[i].len2; 518 else 519 dst[i] = ld[i].len; 520 } 521 522 return nb_decomp; 523 } 524 525 int ff_tx_gen_default_map(AVTXContext *s, FFTXCodeletOptions *opts) 526 { 527 s->map = av_malloc(s->len*sizeof(*s->map)); 528 if (!s->map) 529 return AVERROR(ENOMEM); 530 531 s->map[0] = 0; /* DC is always at the start */ 532 if (s->inv) /* Reversing the ACs flips the transform direction */ 533 for (int i = 1; i < s->len; i++) 534 s->map[i] = s->len - i; 535 else 536 for (int i = 1; i < s->len; i++) 537 s->map[i] = i; 538 539 s->map_dir = FF_TX_MAP_GATHER; 540 541 return 0; 542 } 543 544 #if !CONFIG_SMALL 545 static void print_flags(AVBPrint *bp, uint64_t f) 546 { 547 int prev = 0; 548 const char *sep = ", "; 549 av_bprintf(bp, "flags: ["); 550 if ((f & FF_TX_ALIGNED) && ++prev) 551 av_bprintf(bp, "aligned"); 552 if ((f & AV_TX_UNALIGNED) && ++prev) 553 av_bprintf(bp, "%sunaligned", prev > 1 ? sep : ""); 554 if ((f & AV_TX_INPLACE) && ++prev) 555 av_bprintf(bp, "%sinplace", prev > 1 ? sep : ""); 556 if ((f & FF_TX_OUT_OF_PLACE) && ++prev) 557 av_bprintf(bp, "%sout_of_place", prev > 1 ? sep : ""); 558 if ((f & FF_TX_FORWARD_ONLY) && ++prev) 559 av_bprintf(bp, "%sfwd_only", prev > 1 ? sep : ""); 560 if ((f & FF_TX_INVERSE_ONLY) && ++prev) 561 av_bprintf(bp, "%sinv_only", prev > 1 ? sep : ""); 562 if ((f & FF_TX_PRESHUFFLE) && ++prev) 563 av_bprintf(bp, "%spreshuf", prev > 1 ? sep : ""); 564 if ((f & AV_TX_FULL_IMDCT) && ++prev) 565 av_bprintf(bp, "%simdct_full", prev > 1 ? sep : ""); 566 if ((f & AV_TX_REAL_TO_REAL) && ++prev) 567 av_bprintf(bp, "%sreal_to_real", prev > 1 ? sep : ""); 568 if ((f & AV_TX_REAL_TO_IMAGINARY) && ++prev) 569 av_bprintf(bp, "%sreal_to_imaginary", prev > 1 ? sep : ""); 570 if ((f & FF_TX_ASM_CALL) && ++prev) 571 av_bprintf(bp, "%sasm_call", prev > 1 ? sep : ""); 572 av_bprintf(bp, "]"); 573 } 574 575 static void print_type(AVBPrint *bp, enum AVTXType type) 576 { 577 av_bprintf(bp, "%s", 578 type == TX_TYPE_ANY ? "any" : 579 type == AV_TX_FLOAT_FFT ? "fft_float" : 580 type == AV_TX_FLOAT_MDCT ? "mdct_float" : 581 type == AV_TX_FLOAT_RDFT ? "rdft_float" : 582 type == AV_TX_FLOAT_DCT_I ? "dctI_float" : 583 type == AV_TX_FLOAT_DST_I ? "dstI_float" : 584 type == AV_TX_DOUBLE_FFT ? "fft_double" : 585 type == AV_TX_DOUBLE_MDCT ? "mdct_double" : 586 type == AV_TX_DOUBLE_RDFT ? "rdft_double" : 587 type == AV_TX_DOUBLE_DCT_I ? "dctI_double" : 588 type == AV_TX_DOUBLE_DST_I ? "dstI_double" : 589 type == AV_TX_INT32_FFT ? "fft_int32" : 590 type == AV_TX_INT32_MDCT ? "mdct_int32" : 591 type == AV_TX_INT32_RDFT ? "rdft_int32" : 592 type == AV_TX_INT32_DCT_I ? "dctI_int32" : 593 type == AV_TX_INT32_DST_I ? "dstI_int32" : 594 "unknown"); 595 } 596 597 static void print_cd_info(const FFTXCodelet *cd, int prio, int len, int print_prio, 598 int log_level) 599 { 600 AVBPrint bp; 601 av_bprint_init(&bp, 0, AV_BPRINT_SIZE_AUTOMATIC); 602 603 av_bprintf(&bp, "%s - type: ", cd->name); 604 605 print_type(&bp, cd->type); 606 607 av_bprintf(&bp, ", len: "); 608 if (!len) { 609 if (cd->min_len != cd->max_len) 610 av_bprintf(&bp, "[%i, ", cd->min_len); 611 612 if (cd->max_len == TX_LEN_UNLIMITED) 613 av_bprintf(&bp, "unlimited"); 614 else 615 av_bprintf(&bp, "%i", cd->max_len); 616 } else { 617 av_bprintf(&bp, "%i", len); 618 } 619 620 if (cd->factors[1]) { 621 av_bprintf(&bp, "%s, factors", !len && cd->min_len != cd->max_len ? "]" : ""); 622 if (!cd->nb_factors) 623 av_bprintf(&bp, ": ["); 624 else 625 av_bprintf(&bp, "[%i]: [", cd->nb_factors); 626 627 for (int i = 0; i < TX_MAX_FACTORS; i++) { 628 if (i && cd->factors[i]) 629 av_bprintf(&bp, ", "); 630 if (cd->factors[i] == TX_FACTOR_ANY) 631 av_bprintf(&bp, "any"); 632 else if (cd->factors[i]) 633 av_bprintf(&bp, "%i", cd->factors[i]); 634 else 635 break; 636 } 637 638 av_bprintf(&bp, "], "); 639 } else { 640 av_bprintf(&bp, "%s, factor: %i, ", 641 !len && cd->min_len != cd->max_len ? "]" : "", cd->factors[0]); 642 } 643 print_flags(&bp, cd->flags); 644 645 if (print_prio) 646 av_bprintf(&bp, ", prio: %i", prio); 647 648 av_log(NULL, log_level, "%s\n", bp.str); 649 } 650 651 static void print_tx_structure(AVTXContext *s, int depth) 652 { 653 const FFTXCodelet *cd = s->cd_self; 654 655 for (int i = 0; i <= depth; i++) 656 av_log(NULL, AV_LOG_DEBUG, " "); 657 658 print_cd_info(cd, cd->prio, s->len, 0, AV_LOG_DEBUG); 659 660 for (int i = 0; i < s->nb_sub; i++) 661 print_tx_structure(&s->sub[i], depth + 1); 662 } 663 #endif /* CONFIG_SMALL */ 664 665 typedef struct TXCodeletMatch { 666 const FFTXCodelet *cd; 667 int prio; 668 } TXCodeletMatch; 669 670 static int cmp_matches(TXCodeletMatch *a, TXCodeletMatch *b) 671 { 672 return FFDIFFSIGN(b->prio, a->prio); 673 } 674 675 /* We want all factors to completely cover the length */ 676 static inline int check_cd_factors(const FFTXCodelet *cd, int len) 677 { 678 int matches = 0, any_flag = 0; 679 680 for (int i = 0; i < TX_MAX_FACTORS; i++) { 681 int factor = cd->factors[i]; 682 683 if (factor == TX_FACTOR_ANY) { 684 any_flag = 1; 685 matches++; 686 continue; 687 } else if (len <= 1 || !factor) { 688 break; 689 } else if (factor == 2) { /* Fast path */ 690 int bits_2 = ff_ctz(len); 691 if (!bits_2) 692 continue; /* Factor not supported */ 693 694 len >>= bits_2; 695 matches++; 696 } else { 697 int res = len % factor; 698 if (res) 699 continue; /* Factor not supported */ 700 701 while (!res) { 702 len /= factor; 703 res = len % factor; 704 } 705 matches++; 706 } 707 } 708 709 return (cd->nb_factors <= matches) && (any_flag || len == 1); 710 } 711 712 av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type, 713 uint64_t flags, FFTXCodeletOptions *opts, 714 int len, int inv, const void *scale) 715 { 716 int ret = 0; 717 AVTXContext *sub = NULL; 718 TXCodeletMatch *cd_tmp, *cd_matches = NULL; 719 unsigned int cd_matches_size = 0; 720 int codelet_list_idx = codelet_list_num; 721 int nb_cd_matches = 0; 722 #if !CONFIG_SMALL 723 AVBPrint bp; 724 #endif 725 726 /* We still accept functions marked with SLOW, even if the CPU is 727 * marked with the same flag, but we give them lower priority. */ 728 const int cpu_flags = av_get_cpu_flags(); 729 730 /* Flags the transform wants */ 731 uint64_t req_flags = flags; 732 733 /* Flags the codelet may require to be present */ 734 uint64_t inv_req_mask = AV_TX_FULL_IMDCT | 735 AV_TX_REAL_TO_REAL | 736 AV_TX_REAL_TO_IMAGINARY | 737 FF_TX_PRESHUFFLE | 738 FF_TX_ASM_CALL; 739 740 /* Unaligned codelets are compatible with the aligned flag */ 741 if (req_flags & FF_TX_ALIGNED) 742 req_flags |= AV_TX_UNALIGNED; 743 744 /* If either flag is set, both are okay, so don't check for an exact match */ 745 if ((req_flags & AV_TX_INPLACE) && (req_flags & FF_TX_OUT_OF_PLACE)) 746 req_flags &= ~(AV_TX_INPLACE | FF_TX_OUT_OF_PLACE); 747 if ((req_flags & FF_TX_ALIGNED) && (req_flags & AV_TX_UNALIGNED)) 748 req_flags &= ~(FF_TX_ALIGNED | AV_TX_UNALIGNED); 749 750 /* Loop through all codelets in all codelet lists to find matches 751 * to the requirements */ 752 while (codelet_list_idx--) { 753 const FFTXCodelet * const * list = codelet_list[codelet_list_idx]; 754 const FFTXCodelet *cd = NULL; 755 756 while ((cd = *list++)) { 757 /* Check if the type matches */ 758 if (cd->type != TX_TYPE_ANY && type != cd->type) 759 continue; 760 761 /* Check direction for non-orthogonal codelets */ 762 if (((cd->flags & FF_TX_FORWARD_ONLY) && inv) || 763 ((cd->flags & (FF_TX_INVERSE_ONLY | AV_TX_FULL_IMDCT)) && !inv) || 764 ((cd->flags & (FF_TX_FORWARD_ONLY | AV_TX_REAL_TO_REAL)) && inv) || 765 ((cd->flags & (FF_TX_FORWARD_ONLY | AV_TX_REAL_TO_IMAGINARY)) && inv)) 766 continue; 767 768 /* Check if the requested flags match from both sides */ 769 if (((req_flags & cd->flags) != (req_flags)) || 770 ((inv_req_mask & cd->flags) != (req_flags & inv_req_mask))) 771 continue; 772 773 /* Check if length is supported */ 774 if ((len < cd->min_len) || (cd->max_len != -1 && (len > cd->max_len))) 775 continue; 776 777 /* Check if the CPU supports the required ISA */ 778 if (cd->cpu_flags != FF_TX_CPU_FLAGS_ALL && 779 !(cpu_flags & (cd->cpu_flags & ~cpu_slow_mask))) 780 continue; 781 782 /* Check for factors */ 783 if (!check_cd_factors(cd, len)) 784 continue; 785 786 /* Realloc array and append */ 787 cd_tmp = av_fast_realloc(cd_matches, &cd_matches_size, 788 sizeof(*cd_tmp) * (nb_cd_matches + 1)); 789 if (!cd_tmp) { 790 av_free(cd_matches); 791 return AVERROR(ENOMEM); 792 } 793 794 cd_matches = cd_tmp; 795 cd_matches[nb_cd_matches].cd = cd; 796 cd_matches[nb_cd_matches].prio = get_codelet_prio(cd, cpu_flags, len); 797 nb_cd_matches++; 798 } 799 } 800 801 #if !CONFIG_SMALL 802 /* Print debugging info */ 803 av_bprint_init(&bp, 0, AV_BPRINT_SIZE_AUTOMATIC); 804 av_bprintf(&bp, "For transform of length %i, %s, ", len, 805 inv ? "inverse" : "forward"); 806 print_type(&bp, type); 807 av_bprintf(&bp, ", "); 808 print_flags(&bp, flags); 809 av_bprintf(&bp, ", found %i matches%s", nb_cd_matches, 810 nb_cd_matches ? ":" : "."); 811 #endif 812 813 /* No matches found */ 814 if (!nb_cd_matches) 815 return AVERROR(ENOSYS); 816 817 /* Sort the list */ 818 AV_QSORT(cd_matches, nb_cd_matches, TXCodeletMatch, cmp_matches); 819 820 #if !CONFIG_SMALL 821 av_log(NULL, AV_LOG_TRACE, "%s\n", bp.str); 822 823 for (int i = 0; i < nb_cd_matches; i++) { 824 av_log(NULL, AV_LOG_TRACE, " %i: ", i + 1); 825 print_cd_info(cd_matches[i].cd, cd_matches[i].prio, 0, 1, AV_LOG_TRACE); 826 } 827 #endif 828 829 if (!s->sub) { 830 s->sub = sub = av_mallocz(TX_MAX_SUB*sizeof(*sub)); 831 if (!sub) { 832 ret = AVERROR(ENOMEM); 833 goto end; 834 } 835 } 836 837 /* Attempt to initialize each */ 838 for (int i = 0; i < nb_cd_matches; i++) { 839 const FFTXCodelet *cd = cd_matches[i].cd; 840 AVTXContext *sctx = &s->sub[s->nb_sub]; 841 842 sctx->len = len; 843 sctx->inv = inv; 844 sctx->type = type; 845 sctx->flags = cd->flags | flags; 846 sctx->cd_self = cd; 847 848 s->fn[s->nb_sub] = cd->function; 849 s->cd[s->nb_sub] = cd; 850 851 ret = 0; 852 if (cd->init) 853 ret = cd->init(sctx, cd, flags, opts, len, inv, scale); 854 855 if (ret >= 0) { 856 if (opts && opts->map_dir != FF_TX_MAP_NONE && 857 sctx->map_dir == FF_TX_MAP_NONE) { 858 /* If a specific map direction was requested, and it doesn't 859 * exist, create one.*/ 860 sctx->map = av_malloc(len*sizeof(*sctx->map)); 861 if (!sctx->map) { 862 ret = AVERROR(ENOMEM); 863 goto end; 864 } 865 866 for (int i = 0; i < len; i++) 867 sctx->map[i] = i; 868 } else if (opts && (opts->map_dir != sctx->map_dir)) { 869 int *tmp = av_malloc(len*sizeof(*sctx->map)); 870 if (!tmp) { 871 ret = AVERROR(ENOMEM); 872 goto end; 873 } 874 875 memcpy(tmp, sctx->map, len*sizeof(*sctx->map)); 876 877 for (int i = 0; i < len; i++) 878 sctx->map[tmp[i]] = i; 879 880 av_free(tmp); 881 } 882 883 s->nb_sub++; 884 goto end; 885 } 886 887 s->fn[s->nb_sub] = NULL; 888 s->cd[s->nb_sub] = NULL; 889 890 reset_ctx(sctx, 0); 891 if (ret == AVERROR(ENOMEM)) 892 break; 893 } 894 895 if (!s->nb_sub) 896 av_freep(&s->sub); 897 898 end: 899 av_free(cd_matches); 900 return ret; 901 } 902 903 av_cold int av_tx_init(AVTXContext **ctx, av_tx_fn *tx, enum AVTXType type, 904 int inv, int len, const void *scale, uint64_t flags) 905 { 906 int ret; 907 AVTXContext tmp = { 0 }; 908 const double default_scale_d = 1.0; 909 const float default_scale_f = 1.0f; 910 911 if (!len || type >= AV_TX_NB || !ctx || !tx) 912 return AVERROR(EINVAL); 913 914 if (!(flags & AV_TX_UNALIGNED)) 915 flags |= FF_TX_ALIGNED; 916 if (!(flags & AV_TX_INPLACE)) 917 flags |= FF_TX_OUT_OF_PLACE; 918 919 if (!scale && ((type == AV_TX_DOUBLE_MDCT) || (type == AV_TX_DOUBLE_DCT) || 920 (type == AV_TX_DOUBLE_DCT_I) || (type == AV_TX_DOUBLE_DST_I) || 921 (type == AV_TX_DOUBLE_RDFT))) 922 scale = &default_scale_d; 923 else if (!scale && !TYPE_IS(FFT, type)) 924 scale = &default_scale_f; 925 926 ret = ff_tx_init_subtx(&tmp, type, flags, NULL, len, inv, scale); 927 if (ret < 0) 928 return ret; 929 930 *ctx = &tmp.sub[0]; 931 *tx = tmp.fn[0]; 932 933 #if !CONFIG_SMALL 934 av_log(NULL, AV_LOG_DEBUG, "Transform tree:\n"); 935 print_tx_structure(*ctx, 0); 936 #endif 937 938 return ret; 939 }