tx_template.c (91045B)
1 /* 2 * Copyright (c) Lynne 3 * 4 * Power of two FFT: 5 * Copyright (c) Lynne 6 * Copyright (c) 2008 Loren Merritt 7 * Copyright (c) 2002 Fabrice Bellard 8 * Partly based on libdjbfft by D. J. Bernstein 9 * 10 * This file is part of FFmpeg. 11 * 12 * FFmpeg is free software; you can redistribute it and/or 13 * modify it under the terms of the GNU Lesser General Public 14 * License as published by the Free Software Foundation; either 15 * version 2.1 of the License, or (at your option) any later version. 16 * 17 * FFmpeg is distributed in the hope that it will be useful, 18 * but WITHOUT ANY WARRANTY; without even the implied warranty of 19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 * Lesser General Public License for more details. 21 * 22 * You should have received a copy of the GNU Lesser General Public 23 * License along with FFmpeg; if not, write to the Free Software 24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 25 */ 26 27 #include "mem.h" 28 29 #define TABLE_DEF(name, size) \ 30 DECLARE_ALIGNED(32, TXSample, TX_TAB(ff_tx_tab_ ##name))[size] 31 32 #define SR_POW2_TABLES \ 33 SR_TABLE(8) \ 34 SR_TABLE(16) \ 35 SR_TABLE(32) \ 36 SR_TABLE(64) \ 37 SR_TABLE(128) \ 38 SR_TABLE(256) \ 39 SR_TABLE(512) \ 40 SR_TABLE(1024) \ 41 SR_TABLE(2048) \ 42 SR_TABLE(4096) \ 43 SR_TABLE(8192) \ 44 SR_TABLE(16384) \ 45 SR_TABLE(32768) \ 46 SR_TABLE(65536) \ 47 SR_TABLE(131072) \ 48 SR_TABLE(262144) \ 49 SR_TABLE(524288) \ 50 SR_TABLE(1048576) \ 51 SR_TABLE(2097152) \ 52 53 #define SR_TABLE(len) \ 54 TABLE_DEF(len, len/4 + 1); 55 /* Power of two tables */ 56 SR_POW2_TABLES 57 #undef SR_TABLE 58 59 /* Other factors' tables */ 60 TABLE_DEF(53, 12); 61 TABLE_DEF( 7, 6); 62 TABLE_DEF( 9, 8); 63 64 typedef struct FFTabInitData { 65 void (*func)(void); 66 int factors[TX_MAX_SUB]; /* Must be sorted high -> low */ 67 } FFTabInitData; 68 69 #define SR_TABLE(len) \ 70 static av_cold void TX_TAB(ff_tx_init_tab_ ##len)(void) \ 71 { \ 72 double freq = 2*M_PI/len; \ 73 TXSample *tab = TX_TAB(ff_tx_tab_ ##len); \ 74 \ 75 for (int i = 0; i < len/4; i++) \ 76 *tab++ = RESCALE(cos(i*freq)); \ 77 \ 78 *tab = 0; \ 79 } 80 SR_POW2_TABLES 81 #undef SR_TABLE 82 83 static void (*const sr_tabs_init_funcs[])(void) = { 84 #define SR_TABLE(len) TX_TAB(ff_tx_init_tab_ ##len), 85 SR_POW2_TABLES 86 #undef SR_TABLE 87 }; 88 89 static AVOnce sr_tabs_init_once[] = { 90 #define SR_TABLE(len) AV_ONCE_INIT, 91 SR_POW2_TABLES 92 #undef SR_TABLE 93 }; 94 95 static av_cold void TX_TAB(ff_tx_init_tab_53)(void) 96 { 97 /* 5pt, doubled to eliminate AVX lane shuffles */ 98 TX_TAB(ff_tx_tab_53)[0] = RESCALE(cos(2 * M_PI / 5)); 99 TX_TAB(ff_tx_tab_53)[1] = RESCALE(cos(2 * M_PI / 5)); 100 TX_TAB(ff_tx_tab_53)[2] = RESCALE(cos(2 * M_PI / 10)); 101 TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(2 * M_PI / 10)); 102 TX_TAB(ff_tx_tab_53)[4] = RESCALE(sin(2 * M_PI / 5)); 103 TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(2 * M_PI / 5)); 104 TX_TAB(ff_tx_tab_53)[6] = RESCALE(sin(2 * M_PI / 10)); 105 TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(2 * M_PI / 10)); 106 107 /* 3pt */ 108 TX_TAB(ff_tx_tab_53)[ 8] = RESCALE(cos(2 * M_PI / 12)); 109 TX_TAB(ff_tx_tab_53)[ 9] = RESCALE(cos(2 * M_PI / 12)); 110 TX_TAB(ff_tx_tab_53)[10] = RESCALE(cos(2 * M_PI / 6)); 111 TX_TAB(ff_tx_tab_53)[11] = RESCALE(cos(8 * M_PI / 6)); 112 } 113 114 static av_cold void TX_TAB(ff_tx_init_tab_7)(void) 115 { 116 TX_TAB(ff_tx_tab_7)[0] = RESCALE(cos(2 * M_PI / 7)); 117 TX_TAB(ff_tx_tab_7)[1] = RESCALE(sin(2 * M_PI / 7)); 118 TX_TAB(ff_tx_tab_7)[2] = RESCALE(sin(2 * M_PI / 28)); 119 TX_TAB(ff_tx_tab_7)[3] = RESCALE(cos(2 * M_PI / 28)); 120 TX_TAB(ff_tx_tab_7)[4] = RESCALE(cos(2 * M_PI / 14)); 121 TX_TAB(ff_tx_tab_7)[5] = RESCALE(sin(2 * M_PI / 14)); 122 } 123 124 static av_cold void TX_TAB(ff_tx_init_tab_9)(void) 125 { 126 TX_TAB(ff_tx_tab_9)[0] = RESCALE(cos(2 * M_PI / 3)); 127 TX_TAB(ff_tx_tab_9)[1] = RESCALE(sin(2 * M_PI / 3)); 128 TX_TAB(ff_tx_tab_9)[2] = RESCALE(cos(2 * M_PI / 9)); 129 TX_TAB(ff_tx_tab_9)[3] = RESCALE(sin(2 * M_PI / 9)); 130 TX_TAB(ff_tx_tab_9)[4] = RESCALE(cos(2 * M_PI / 36)); 131 TX_TAB(ff_tx_tab_9)[5] = RESCALE(sin(2 * M_PI / 36)); 132 TX_TAB(ff_tx_tab_9)[6] = TX_TAB(ff_tx_tab_9)[2] + TX_TAB(ff_tx_tab_9)[5]; 133 TX_TAB(ff_tx_tab_9)[7] = TX_TAB(ff_tx_tab_9)[3] - TX_TAB(ff_tx_tab_9)[4]; 134 } 135 136 static const FFTabInitData nptwo_tabs_init_data[] = { 137 { TX_TAB(ff_tx_init_tab_53), { 15, 5, 3 } }, 138 { TX_TAB(ff_tx_init_tab_9), { 9 } }, 139 { TX_TAB(ff_tx_init_tab_7), { 7 } }, 140 }; 141 142 static AVOnce nptwo_tabs_init_once[] = { 143 AV_ONCE_INIT, 144 AV_ONCE_INIT, 145 AV_ONCE_INIT, 146 }; 147 148 av_cold void TX_TAB(ff_tx_init_tabs)(int len) 149 { 150 int factor_2 = ff_ctz(len); 151 if (factor_2) { 152 int idx = factor_2 - 3; 153 for (int i = 0; i <= idx; i++) 154 ff_thread_once(&sr_tabs_init_once[i], 155 sr_tabs_init_funcs[i]); 156 len >>= factor_2; 157 } 158 159 for (int i = 0; i < FF_ARRAY_ELEMS(nptwo_tabs_init_data); i++) { 160 int f, f_idx = 0; 161 162 if (len <= 1) 163 return; 164 165 while ((f = nptwo_tabs_init_data[i].factors[f_idx++])) { 166 if (f % len) 167 continue; 168 169 ff_thread_once(&nptwo_tabs_init_once[i], 170 nptwo_tabs_init_data[i].func); 171 len /= f; 172 break; 173 } 174 } 175 } 176 177 static av_always_inline void fft3(TXComplex *out, TXComplex *in, 178 ptrdiff_t stride) 179 { 180 TXComplex tmp[3]; 181 const TXSample *tab = TX_TAB(ff_tx_tab_53); 182 #ifdef TX_INT32 183 int64_t mtmp[4]; 184 #endif 185 186 tmp[0] = in[0]; 187 BF(tmp[1].re, tmp[2].im, in[1].im, in[2].im); 188 BF(tmp[1].im, tmp[2].re, in[1].re, in[2].re); 189 190 #ifdef TX_INT32 191 out[0*stride].re = (int64_t)tmp[0].re + tmp[2].re; 192 out[0*stride].im = (int64_t)tmp[0].im + tmp[2].im; 193 mtmp[0] = (int64_t)tab[ 8] * tmp[1].re; 194 mtmp[1] = (int64_t)tab[ 9] * tmp[1].im; 195 mtmp[2] = (int64_t)tab[10] * tmp[2].re; 196 mtmp[3] = (int64_t)tab[10] * tmp[2].im; 197 out[1*stride].re = tmp[0].re - (mtmp[2] + mtmp[0] + 0x40000000 >> 31); 198 out[1*stride].im = tmp[0].im - (mtmp[3] - mtmp[1] + 0x40000000 >> 31); 199 out[2*stride].re = tmp[0].re - (mtmp[2] - mtmp[0] + 0x40000000 >> 31); 200 out[2*stride].im = tmp[0].im - (mtmp[3] + mtmp[1] + 0x40000000 >> 31); 201 #else 202 out[0*stride].re = tmp[0].re + tmp[2].re; 203 out[0*stride].im = tmp[0].im + tmp[2].im; 204 tmp[1].re = tab[ 8] * tmp[1].re; 205 tmp[1].im = tab[ 9] * tmp[1].im; 206 tmp[2].re = tab[10] * tmp[2].re; 207 tmp[2].im = tab[10] * tmp[2].im; 208 out[1*stride].re = tmp[0].re - tmp[2].re + tmp[1].re; 209 out[1*stride].im = tmp[0].im - tmp[2].im - tmp[1].im; 210 out[2*stride].re = tmp[0].re - tmp[2].re - tmp[1].re; 211 out[2*stride].im = tmp[0].im - tmp[2].im + tmp[1].im; 212 #endif 213 } 214 215 #define DECL_FFT5(NAME, D0, D1, D2, D3, D4) \ 216 static av_always_inline void NAME(TXComplex *out, TXComplex *in, \ 217 ptrdiff_t stride) \ 218 { \ 219 TXComplex dc, z0[4], t[6]; \ 220 const TXSample *tab = TX_TAB(ff_tx_tab_53); \ 221 \ 222 dc = in[0]; \ 223 BF(t[1].im, t[0].re, in[1].re, in[4].re); \ 224 BF(t[1].re, t[0].im, in[1].im, in[4].im); \ 225 BF(t[3].im, t[2].re, in[2].re, in[3].re); \ 226 BF(t[3].re, t[2].im, in[2].im, in[3].im); \ 227 \ 228 out[D0*stride].re = dc.re + (TXUSample)t[0].re + t[2].re; \ 229 out[D0*stride].im = dc.im + (TXUSample)t[0].im + t[2].im; \ 230 \ 231 SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re); \ 232 SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im); \ 233 CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re); \ 234 CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im); \ 235 \ 236 BF(z0[0].re, z0[3].re, t[0].re, t[1].re); \ 237 BF(z0[0].im, z0[3].im, t[0].im, t[1].im); \ 238 BF(z0[2].re, z0[1].re, t[4].re, t[5].re); \ 239 BF(z0[2].im, z0[1].im, t[4].im, t[5].im); \ 240 \ 241 out[D1*stride].re = dc.re + (TXUSample)z0[3].re; \ 242 out[D1*stride].im = dc.im + (TXUSample)z0[0].im; \ 243 out[D2*stride].re = dc.re + (TXUSample)z0[2].re; \ 244 out[D2*stride].im = dc.im + (TXUSample)z0[1].im; \ 245 out[D3*stride].re = dc.re + (TXUSample)z0[1].re; \ 246 out[D3*stride].im = dc.im + (TXUSample)z0[2].im; \ 247 out[D4*stride].re = dc.re + (TXUSample)z0[0].re; \ 248 out[D4*stride].im = dc.im + (TXUSample)z0[3].im; \ 249 } 250 251 DECL_FFT5(fft5, 0, 1, 2, 3, 4) 252 DECL_FFT5(fft5_m1, 0, 6, 12, 3, 9) 253 DECL_FFT5(fft5_m2, 10, 1, 7, 13, 4) 254 DECL_FFT5(fft5_m3, 5, 11, 2, 8, 14) 255 256 static av_always_inline void fft7(TXComplex *out, TXComplex *in, 257 ptrdiff_t stride) 258 { 259 TXComplex dc, t[6], z[3]; 260 const TXComplex *tab = (const TXComplex *)TX_TAB(ff_tx_tab_7); 261 #ifdef TX_INT32 262 int64_t mtmp[12]; 263 #endif 264 265 dc = in[0]; 266 BF(t[1].re, t[0].re, in[1].re, in[6].re); 267 BF(t[1].im, t[0].im, in[1].im, in[6].im); 268 BF(t[3].re, t[2].re, in[2].re, in[5].re); 269 BF(t[3].im, t[2].im, in[2].im, in[5].im); 270 BF(t[5].re, t[4].re, in[3].re, in[4].re); 271 BF(t[5].im, t[4].im, in[3].im, in[4].im); 272 273 out[0*stride].re = dc.re + t[0].re + t[2].re + t[4].re; 274 out[0*stride].im = dc.im + t[0].im + t[2].im + t[4].im; 275 276 #ifdef TX_INT32 /* NOTE: it's possible to do this with 16 mults but 72 adds */ 277 mtmp[ 0] = ((int64_t)tab[0].re)*t[0].re - ((int64_t)tab[2].re)*t[4].re; 278 mtmp[ 1] = ((int64_t)tab[0].re)*t[4].re - ((int64_t)tab[1].re)*t[0].re; 279 mtmp[ 2] = ((int64_t)tab[0].re)*t[2].re - ((int64_t)tab[2].re)*t[0].re; 280 mtmp[ 3] = ((int64_t)tab[0].re)*t[0].im - ((int64_t)tab[1].re)*t[2].im; 281 mtmp[ 4] = ((int64_t)tab[0].re)*t[4].im - ((int64_t)tab[1].re)*t[0].im; 282 mtmp[ 5] = ((int64_t)tab[0].re)*t[2].im - ((int64_t)tab[2].re)*t[0].im; 283 284 mtmp[ 6] = ((int64_t)tab[2].im)*t[1].im + ((int64_t)tab[1].im)*t[5].im; 285 mtmp[ 7] = ((int64_t)tab[0].im)*t[5].im + ((int64_t)tab[2].im)*t[3].im; 286 mtmp[ 8] = ((int64_t)tab[2].im)*t[5].im + ((int64_t)tab[1].im)*t[3].im; 287 mtmp[ 9] = ((int64_t)tab[0].im)*t[1].re + ((int64_t)tab[1].im)*t[3].re; 288 mtmp[10] = ((int64_t)tab[2].im)*t[3].re + ((int64_t)tab[0].im)*t[5].re; 289 mtmp[11] = ((int64_t)tab[2].im)*t[1].re + ((int64_t)tab[1].im)*t[5].re; 290 291 z[0].re = (int32_t)(mtmp[ 0] - ((int64_t)tab[1].re)*t[2].re + 0x40000000 >> 31); 292 z[1].re = (int32_t)(mtmp[ 1] - ((int64_t)tab[2].re)*t[2].re + 0x40000000 >> 31); 293 z[2].re = (int32_t)(mtmp[ 2] - ((int64_t)tab[1].re)*t[4].re + 0x40000000 >> 31); 294 z[0].im = (int32_t)(mtmp[ 3] - ((int64_t)tab[2].re)*t[4].im + 0x40000000 >> 31); 295 z[1].im = (int32_t)(mtmp[ 4] - ((int64_t)tab[2].re)*t[2].im + 0x40000000 >> 31); 296 z[2].im = (int32_t)(mtmp[ 5] - ((int64_t)tab[1].re)*t[4].im + 0x40000000 >> 31); 297 298 t[0].re = (int32_t)(mtmp[ 6] - ((int64_t)tab[0].im)*t[3].im + 0x40000000 >> 31); 299 t[2].re = (int32_t)(mtmp[ 7] - ((int64_t)tab[1].im)*t[1].im + 0x40000000 >> 31); 300 t[4].re = (int32_t)(mtmp[ 8] + ((int64_t)tab[0].im)*t[1].im + 0x40000000 >> 31); 301 t[0].im = (int32_t)(mtmp[ 9] + ((int64_t)tab[2].im)*t[5].re + 0x40000000 >> 31); 302 t[2].im = (int32_t)(mtmp[10] - ((int64_t)tab[1].im)*t[1].re + 0x40000000 >> 31); 303 t[4].im = (int32_t)(mtmp[11] - ((int64_t)tab[0].im)*t[3].re + 0x40000000 >> 31); 304 #else 305 z[0].re = tab[0].re*t[0].re - tab[2].re*t[4].re - tab[1].re*t[2].re; 306 z[1].re = tab[0].re*t[4].re - tab[1].re*t[0].re - tab[2].re*t[2].re; 307 z[2].re = tab[0].re*t[2].re - tab[2].re*t[0].re - tab[1].re*t[4].re; 308 z[0].im = tab[0].re*t[0].im - tab[1].re*t[2].im - tab[2].re*t[4].im; 309 z[1].im = tab[0].re*t[4].im - tab[1].re*t[0].im - tab[2].re*t[2].im; 310 z[2].im = tab[0].re*t[2].im - tab[2].re*t[0].im - tab[1].re*t[4].im; 311 312 /* It's possible to do t[4].re and t[0].im with 2 multiplies only by 313 * multiplying the sum of all with the average of the twiddles */ 314 315 t[0].re = tab[2].im*t[1].im + tab[1].im*t[5].im - tab[0].im*t[3].im; 316 t[2].re = tab[0].im*t[5].im + tab[2].im*t[3].im - tab[1].im*t[1].im; 317 t[4].re = tab[2].im*t[5].im + tab[1].im*t[3].im + tab[0].im*t[1].im; 318 t[0].im = tab[0].im*t[1].re + tab[1].im*t[3].re + tab[2].im*t[5].re; 319 t[2].im = tab[2].im*t[3].re + tab[0].im*t[5].re - tab[1].im*t[1].re; 320 t[4].im = tab[2].im*t[1].re + tab[1].im*t[5].re - tab[0].im*t[3].re; 321 #endif 322 323 BF(t[1].re, z[0].re, z[0].re, t[4].re); 324 BF(t[3].re, z[1].re, z[1].re, t[2].re); 325 BF(t[5].re, z[2].re, z[2].re, t[0].re); 326 BF(t[1].im, z[0].im, z[0].im, t[0].im); 327 BF(t[3].im, z[1].im, z[1].im, t[2].im); 328 BF(t[5].im, z[2].im, z[2].im, t[4].im); 329 330 out[1*stride].re = dc.re + z[0].re; 331 out[1*stride].im = dc.im + t[1].im; 332 out[2*stride].re = dc.re + t[3].re; 333 out[2*stride].im = dc.im + z[1].im; 334 out[3*stride].re = dc.re + z[2].re; 335 out[3*stride].im = dc.im + t[5].im; 336 out[4*stride].re = dc.re + t[5].re; 337 out[4*stride].im = dc.im + z[2].im; 338 out[5*stride].re = dc.re + z[1].re; 339 out[5*stride].im = dc.im + t[3].im; 340 out[6*stride].re = dc.re + t[1].re; 341 out[6*stride].im = dc.im + z[0].im; 342 } 343 344 static av_always_inline void fft9(TXComplex *out, TXComplex *in, 345 ptrdiff_t stride) 346 { 347 const TXComplex *tab = (const TXComplex *)TX_TAB(ff_tx_tab_9); 348 TXComplex dc, t[16], w[4], x[5], y[5], z[2]; 349 #ifdef TX_INT32 350 int64_t mtmp[12]; 351 #endif 352 353 dc = in[0]; 354 BF(t[1].re, t[0].re, in[1].re, in[8].re); 355 BF(t[1].im, t[0].im, in[1].im, in[8].im); 356 BF(t[3].re, t[2].re, in[2].re, in[7].re); 357 BF(t[3].im, t[2].im, in[2].im, in[7].im); 358 BF(t[5].re, t[4].re, in[3].re, in[6].re); 359 BF(t[5].im, t[4].im, in[3].im, in[6].im); 360 BF(t[7].re, t[6].re, in[4].re, in[5].re); 361 BF(t[7].im, t[6].im, in[4].im, in[5].im); 362 363 w[0].re = t[0].re - t[6].re; 364 w[0].im = t[0].im - t[6].im; 365 w[1].re = t[2].re - t[6].re; 366 w[1].im = t[2].im - t[6].im; 367 w[2].re = t[1].re - t[7].re; 368 w[2].im = t[1].im - t[7].im; 369 w[3].re = t[3].re + t[7].re; 370 w[3].im = t[3].im + t[7].im; 371 372 z[0].re = dc.re + t[4].re; 373 z[0].im = dc.im + t[4].im; 374 375 z[1].re = t[0].re + t[2].re + t[6].re; 376 z[1].im = t[0].im + t[2].im + t[6].im; 377 378 out[0*stride].re = z[0].re + z[1].re; 379 out[0*stride].im = z[0].im + z[1].im; 380 381 #ifdef TX_INT32 382 mtmp[0] = t[1].re - t[3].re + t[7].re; 383 mtmp[1] = t[1].im - t[3].im + t[7].im; 384 385 y[3].re = (int32_t)(((int64_t)tab[0].im)*mtmp[0] + 0x40000000 >> 31); 386 y[3].im = (int32_t)(((int64_t)tab[0].im)*mtmp[1] + 0x40000000 >> 31); 387 388 mtmp[0] = (int32_t)(((int64_t)tab[0].re)*z[1].re + 0x40000000 >> 31); 389 mtmp[1] = (int32_t)(((int64_t)tab[0].re)*z[1].im + 0x40000000 >> 31); 390 mtmp[2] = (int32_t)(((int64_t)tab[0].re)*t[4].re + 0x40000000 >> 31); 391 mtmp[3] = (int32_t)(((int64_t)tab[0].re)*t[4].im + 0x40000000 >> 31); 392 393 x[3].re = z[0].re + (int32_t)mtmp[0]; 394 x[3].im = z[0].im + (int32_t)mtmp[1]; 395 z[0].re = in[0].re + (int32_t)mtmp[2]; 396 z[0].im = in[0].im + (int32_t)mtmp[3]; 397 398 mtmp[0] = ((int64_t)tab[1].re)*w[0].re; 399 mtmp[1] = ((int64_t)tab[1].re)*w[0].im; 400 mtmp[2] = ((int64_t)tab[2].im)*w[0].re; 401 mtmp[3] = ((int64_t)tab[2].im)*w[0].im; 402 mtmp[4] = ((int64_t)tab[1].im)*w[2].re; 403 mtmp[5] = ((int64_t)tab[1].im)*w[2].im; 404 mtmp[6] = ((int64_t)tab[2].re)*w[2].re; 405 mtmp[7] = ((int64_t)tab[2].re)*w[2].im; 406 407 x[1].re = (int32_t)(mtmp[0] + ((int64_t)tab[2].im)*w[1].re + 0x40000000 >> 31); 408 x[1].im = (int32_t)(mtmp[1] + ((int64_t)tab[2].im)*w[1].im + 0x40000000 >> 31); 409 x[2].re = (int32_t)(mtmp[2] - ((int64_t)tab[3].re)*w[1].re + 0x40000000 >> 31); 410 x[2].im = (int32_t)(mtmp[3] - ((int64_t)tab[3].re)*w[1].im + 0x40000000 >> 31); 411 y[1].re = (int32_t)(mtmp[4] + ((int64_t)tab[2].re)*w[3].re + 0x40000000 >> 31); 412 y[1].im = (int32_t)(mtmp[5] + ((int64_t)tab[2].re)*w[3].im + 0x40000000 >> 31); 413 y[2].re = (int32_t)(mtmp[6] - ((int64_t)tab[3].im)*w[3].re + 0x40000000 >> 31); 414 y[2].im = (int32_t)(mtmp[7] - ((int64_t)tab[3].im)*w[3].im + 0x40000000 >> 31); 415 416 y[0].re = (int32_t)(((int64_t)tab[0].im)*t[5].re + 0x40000000 >> 31); 417 y[0].im = (int32_t)(((int64_t)tab[0].im)*t[5].im + 0x40000000 >> 31); 418 419 #else 420 y[3].re = tab[0].im*(t[1].re - t[3].re + t[7].re); 421 y[3].im = tab[0].im*(t[1].im - t[3].im + t[7].im); 422 423 x[3].re = z[0].re + tab[0].re*z[1].re; 424 x[3].im = z[0].im + tab[0].re*z[1].im; 425 z[0].re = dc.re + tab[0].re*t[4].re; 426 z[0].im = dc.im + tab[0].re*t[4].im; 427 428 x[1].re = tab[1].re*w[0].re + tab[2].im*w[1].re; 429 x[1].im = tab[1].re*w[0].im + tab[2].im*w[1].im; 430 x[2].re = tab[2].im*w[0].re - tab[3].re*w[1].re; 431 x[2].im = tab[2].im*w[0].im - tab[3].re*w[1].im; 432 y[1].re = tab[1].im*w[2].re + tab[2].re*w[3].re; 433 y[1].im = tab[1].im*w[2].im + tab[2].re*w[3].im; 434 y[2].re = tab[2].re*w[2].re - tab[3].im*w[3].re; 435 y[2].im = tab[2].re*w[2].im - tab[3].im*w[3].im; 436 437 y[0].re = tab[0].im*t[5].re; 438 y[0].im = tab[0].im*t[5].im; 439 #endif 440 441 x[4].re = x[1].re + x[2].re; 442 x[4].im = x[1].im + x[2].im; 443 444 y[4].re = y[1].re - y[2].re; 445 y[4].im = y[1].im - y[2].im; 446 x[1].re = z[0].re + x[1].re; 447 x[1].im = z[0].im + x[1].im; 448 y[1].re = y[0].re + y[1].re; 449 y[1].im = y[0].im + y[1].im; 450 x[2].re = z[0].re + x[2].re; 451 x[2].im = z[0].im + x[2].im; 452 y[2].re = y[2].re - y[0].re; 453 y[2].im = y[2].im - y[0].im; 454 x[4].re = z[0].re - x[4].re; 455 x[4].im = z[0].im - x[4].im; 456 y[4].re = y[0].re - y[4].re; 457 y[4].im = y[0].im - y[4].im; 458 459 out[1*stride] = (TXComplex){ x[1].re + y[1].im, x[1].im - y[1].re }; 460 out[2*stride] = (TXComplex){ x[2].re + y[2].im, x[2].im - y[2].re }; 461 out[3*stride] = (TXComplex){ x[3].re + y[3].im, x[3].im - y[3].re }; 462 out[4*stride] = (TXComplex){ x[4].re + y[4].im, x[4].im - y[4].re }; 463 out[5*stride] = (TXComplex){ x[4].re - y[4].im, x[4].im + y[4].re }; 464 out[6*stride] = (TXComplex){ x[3].re - y[3].im, x[3].im + y[3].re }; 465 out[7*stride] = (TXComplex){ x[2].re - y[2].im, x[2].im + y[2].re }; 466 out[8*stride] = (TXComplex){ x[1].re - y[1].im, x[1].im + y[1].re }; 467 } 468 469 static av_always_inline void fft15(TXComplex *out, TXComplex *in, 470 ptrdiff_t stride) 471 { 472 TXComplex tmp[15]; 473 474 for (int i = 0; i < 5; i++) 475 fft3(tmp + i, in + i*3, 5); 476 477 fft5_m1(out, tmp + 0, stride); 478 fft5_m2(out, tmp + 5, stride); 479 fft5_m3(out, tmp + 10, stride); 480 } 481 482 static av_cold int TX_NAME(ff_tx_fft_factor_init)(AVTXContext *s, 483 const FFTXCodelet *cd, 484 uint64_t flags, 485 FFTXCodeletOptions *opts, 486 int len, int inv, 487 const void *scale) 488 { 489 int ret = 0; 490 TX_TAB(ff_tx_init_tabs)(len); 491 492 if (len == 15) 493 ret = ff_tx_gen_pfa_input_map(s, opts, 3, 5); 494 else if (flags & FF_TX_PRESHUFFLE) 495 ret = ff_tx_gen_default_map(s, opts); 496 497 return ret; 498 } 499 500 #define DECL_FACTOR_S(n) \ 501 static void TX_NAME(ff_tx_fft##n)(AVTXContext *s, void *dst, \ 502 void *src, ptrdiff_t stride) \ 503 { \ 504 fft##n((TXComplex *)dst, (TXComplex *)src, stride / sizeof(TXComplex)); \ 505 } \ 506 static const FFTXCodelet TX_NAME(ff_tx_fft##n##_ns_def) = { \ 507 .name = TX_NAME_STR("fft" #n "_ns"), \ 508 .function = TX_NAME(ff_tx_fft##n), \ 509 .type = TX_TYPE(FFT), \ 510 .flags = AV_TX_INPLACE | FF_TX_OUT_OF_PLACE | \ 511 AV_TX_UNALIGNED | FF_TX_PRESHUFFLE, \ 512 .factors[0] = n, \ 513 .nb_factors = 1, \ 514 .min_len = n, \ 515 .max_len = n, \ 516 .init = TX_NAME(ff_tx_fft_factor_init), \ 517 .cpu_flags = FF_TX_CPU_FLAGS_ALL, \ 518 .prio = FF_TX_PRIO_BASE, \ 519 }; 520 521 #define DECL_FACTOR_F(n) \ 522 DECL_FACTOR_S(n) \ 523 static const FFTXCodelet TX_NAME(ff_tx_fft##n##_fwd_def) = { \ 524 .name = TX_NAME_STR("fft" #n "_fwd"), \ 525 .function = TX_NAME(ff_tx_fft##n), \ 526 .type = TX_TYPE(FFT), \ 527 .flags = AV_TX_INPLACE | FF_TX_OUT_OF_PLACE | \ 528 AV_TX_UNALIGNED | FF_TX_FORWARD_ONLY, \ 529 .factors[0] = n, \ 530 .nb_factors = 1, \ 531 .min_len = n, \ 532 .max_len = n, \ 533 .init = TX_NAME(ff_tx_fft_factor_init), \ 534 .cpu_flags = FF_TX_CPU_FLAGS_ALL, \ 535 .prio = FF_TX_PRIO_BASE, \ 536 }; 537 538 DECL_FACTOR_F(3) 539 DECL_FACTOR_F(5) 540 DECL_FACTOR_F(7) 541 DECL_FACTOR_F(9) 542 DECL_FACTOR_S(15) 543 544 #define BUTTERFLIES(a0, a1, a2, a3) \ 545 do { \ 546 r0=a0.re; \ 547 i0=a0.im; \ 548 r1=a1.re; \ 549 i1=a1.im; \ 550 BF(t3, t5, t5, t1); \ 551 BF(a2.re, a0.re, r0, t5); \ 552 BF(a3.im, a1.im, i1, t3); \ 553 BF(t4, t6, t2, t6); \ 554 BF(a3.re, a1.re, r1, t4); \ 555 BF(a2.im, a0.im, i0, t6); \ 556 } while (0) 557 558 #define TRANSFORM(a0, a1, a2, a3, wre, wim) \ 559 do { \ 560 CMUL(t1, t2, a2.re, a2.im, wre, -wim); \ 561 CMUL(t5, t6, a3.re, a3.im, wre, wim); \ 562 BUTTERFLIES(a0, a1, a2, a3); \ 563 } while (0) 564 565 /* z[0...8n-1], w[1...2n-1] */ 566 static inline void TX_NAME(ff_tx_fft_sr_combine)(TXComplex *z, 567 const TXSample *cos, int len) 568 { 569 int o1 = 2*len; 570 int o2 = 4*len; 571 int o3 = 6*len; 572 const TXSample *wim = cos + o1 - 7; 573 TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1; 574 575 for (int i = 0; i < len; i += 4) { 576 TRANSFORM(z[0], z[o1 + 0], z[o2 + 0], z[o3 + 0], cos[0], wim[7]); 577 TRANSFORM(z[2], z[o1 + 2], z[o2 + 2], z[o3 + 2], cos[2], wim[5]); 578 TRANSFORM(z[4], z[o1 + 4], z[o2 + 4], z[o3 + 4], cos[4], wim[3]); 579 TRANSFORM(z[6], z[o1 + 6], z[o2 + 6], z[o3 + 6], cos[6], wim[1]); 580 581 TRANSFORM(z[1], z[o1 + 1], z[o2 + 1], z[o3 + 1], cos[1], wim[6]); 582 TRANSFORM(z[3], z[o1 + 3], z[o2 + 3], z[o3 + 3], cos[3], wim[4]); 583 TRANSFORM(z[5], z[o1 + 5], z[o2 + 5], z[o3 + 5], cos[5], wim[2]); 584 TRANSFORM(z[7], z[o1 + 7], z[o2 + 7], z[o3 + 7], cos[7], wim[0]); 585 586 z += 2*4; 587 cos += 2*4; 588 wim -= 2*4; 589 } 590 } 591 592 static av_cold int TX_NAME(ff_tx_fft_sr_codelet_init)(AVTXContext *s, 593 const FFTXCodelet *cd, 594 uint64_t flags, 595 FFTXCodeletOptions *opts, 596 int len, int inv, 597 const void *scale) 598 { 599 TX_TAB(ff_tx_init_tabs)(len); 600 return ff_tx_gen_ptwo_revtab(s, opts); 601 } 602 603 #define DECL_SR_CODELET_DEF(n) \ 604 static const FFTXCodelet TX_NAME(ff_tx_fft##n##_ns_def) = { \ 605 .name = TX_NAME_STR("fft" #n "_ns"), \ 606 .function = TX_NAME(ff_tx_fft##n##_ns), \ 607 .type = TX_TYPE(FFT), \ 608 .flags = FF_TX_OUT_OF_PLACE | AV_TX_INPLACE | \ 609 AV_TX_UNALIGNED | FF_TX_PRESHUFFLE, \ 610 .factors[0] = 2, \ 611 .nb_factors = 1, \ 612 .min_len = n, \ 613 .max_len = n, \ 614 .init = TX_NAME(ff_tx_fft_sr_codelet_init), \ 615 .cpu_flags = FF_TX_CPU_FLAGS_ALL, \ 616 .prio = FF_TX_PRIO_BASE, \ 617 }; 618 619 #define DECL_SR_CODELET(n, n2, n4) \ 620 static void TX_NAME(ff_tx_fft##n##_ns)(AVTXContext *s, void *_dst, \ 621 void *_src, ptrdiff_t stride) \ 622 { \ 623 TXComplex *src = _src; \ 624 TXComplex *dst = _dst; \ 625 const TXSample *cos = TX_TAB(ff_tx_tab_##n); \ 626 \ 627 TX_NAME(ff_tx_fft##n2##_ns)(s, dst, src, stride); \ 628 TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride); \ 629 TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride); \ 630 TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1); \ 631 } \ 632 \ 633 DECL_SR_CODELET_DEF(n) 634 635 static void TX_NAME(ff_tx_fft2_ns)(AVTXContext *s, void *_dst, 636 void *_src, ptrdiff_t stride) 637 { 638 TXComplex *src = _src; 639 TXComplex *dst = _dst; 640 TXComplex tmp; 641 642 BF(tmp.re, dst[0].re, src[0].re, src[1].re); 643 BF(tmp.im, dst[0].im, src[0].im, src[1].im); 644 dst[1] = tmp; 645 } 646 647 static void TX_NAME(ff_tx_fft4_ns)(AVTXContext *s, void *_dst, 648 void *_src, ptrdiff_t stride) 649 { 650 TXComplex *src = _src; 651 TXComplex *dst = _dst; 652 TXSample t1, t2, t3, t4, t5, t6, t7, t8; 653 654 BF(t3, t1, src[0].re, src[1].re); 655 BF(t8, t6, src[3].re, src[2].re); 656 BF(dst[2].re, dst[0].re, t1, t6); 657 BF(t4, t2, src[0].im, src[1].im); 658 BF(t7, t5, src[2].im, src[3].im); 659 BF(dst[3].im, dst[1].im, t4, t8); 660 BF(dst[3].re, dst[1].re, t3, t7); 661 BF(dst[2].im, dst[0].im, t2, t5); 662 } 663 664 static void TX_NAME(ff_tx_fft8_ns)(AVTXContext *s, void *_dst, 665 void *_src, ptrdiff_t stride) 666 { 667 TXComplex *src = _src; 668 TXComplex *dst = _dst; 669 TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1; 670 const TXSample cos = TX_TAB(ff_tx_tab_8)[1]; 671 672 TX_NAME(ff_tx_fft4_ns)(s, dst, src, stride); 673 674 BF(t1, dst[5].re, src[4].re, -src[5].re); 675 BF(t2, dst[5].im, src[4].im, -src[5].im); 676 BF(t5, dst[7].re, src[6].re, -src[7].re); 677 BF(t6, dst[7].im, src[6].im, -src[7].im); 678 679 BUTTERFLIES(dst[0], dst[2], dst[4], dst[6]); 680 TRANSFORM(dst[1], dst[3], dst[5], dst[7], cos, cos); 681 } 682 683 static void TX_NAME(ff_tx_fft16_ns)(AVTXContext *s, void *_dst, 684 void *_src, ptrdiff_t stride) 685 { 686 TXComplex *src = _src; 687 TXComplex *dst = _dst; 688 const TXSample *cos = TX_TAB(ff_tx_tab_16); 689 690 TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1; 691 TXSample cos_16_1 = cos[1]; 692 TXSample cos_16_2 = cos[2]; 693 TXSample cos_16_3 = cos[3]; 694 695 TX_NAME(ff_tx_fft8_ns)(s, dst + 0, src + 0, stride); 696 TX_NAME(ff_tx_fft4_ns)(s, dst + 8, src + 8, stride); 697 TX_NAME(ff_tx_fft4_ns)(s, dst + 12, src + 12, stride); 698 699 t1 = dst[ 8].re; 700 t2 = dst[ 8].im; 701 t5 = dst[12].re; 702 t6 = dst[12].im; 703 BUTTERFLIES(dst[0], dst[4], dst[8], dst[12]); 704 705 TRANSFORM(dst[ 2], dst[ 6], dst[10], dst[14], cos_16_2, cos_16_2); 706 TRANSFORM(dst[ 1], dst[ 5], dst[ 9], dst[13], cos_16_1, cos_16_3); 707 TRANSFORM(dst[ 3], dst[ 7], dst[11], dst[15], cos_16_3, cos_16_1); 708 } 709 710 DECL_SR_CODELET_DEF(2) 711 DECL_SR_CODELET_DEF(4) 712 DECL_SR_CODELET_DEF(8) 713 DECL_SR_CODELET_DEF(16) 714 DECL_SR_CODELET(32,16,8) 715 DECL_SR_CODELET(64,32,16) 716 DECL_SR_CODELET(128,64,32) 717 DECL_SR_CODELET(256,128,64) 718 DECL_SR_CODELET(512,256,128) 719 DECL_SR_CODELET(1024,512,256) 720 DECL_SR_CODELET(2048,1024,512) 721 DECL_SR_CODELET(4096,2048,1024) 722 DECL_SR_CODELET(8192,4096,2048) 723 DECL_SR_CODELET(16384,8192,4096) 724 DECL_SR_CODELET(32768,16384,8192) 725 DECL_SR_CODELET(65536,32768,16384) 726 DECL_SR_CODELET(131072,65536,32768) 727 DECL_SR_CODELET(262144,131072,65536) 728 DECL_SR_CODELET(524288,262144,131072) 729 DECL_SR_CODELET(1048576,524288,262144) 730 DECL_SR_CODELET(2097152,1048576,524288) 731 732 static av_cold int TX_NAME(ff_tx_fft_init)(AVTXContext *s, 733 const FFTXCodelet *cd, 734 uint64_t flags, 735 FFTXCodeletOptions *opts, 736 int len, int inv, 737 const void *scale) 738 { 739 int ret; 740 int is_inplace = !!(flags & AV_TX_INPLACE); 741 FFTXCodeletOptions sub_opts = { 742 .map_dir = is_inplace ? FF_TX_MAP_SCATTER : FF_TX_MAP_GATHER, 743 }; 744 745 flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */ 746 flags |= AV_TX_INPLACE; /* in-place */ 747 flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */ 748 749 if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len, inv, scale))) 750 return ret; 751 752 if (is_inplace && (ret = ff_tx_gen_inplace_map(s, len))) 753 return ret; 754 755 return 0; 756 } 757 758 static av_cold int TX_NAME(ff_tx_fft_inplace_small_init)(AVTXContext *s, 759 const FFTXCodelet *cd, 760 uint64_t flags, 761 FFTXCodeletOptions *opts, 762 int len, int inv, 763 const void *scale) 764 { 765 if (!(s->tmp = av_malloc(len*sizeof(*s->tmp)))) 766 return AVERROR(ENOMEM); 767 flags &= ~AV_TX_INPLACE; 768 return TX_NAME(ff_tx_fft_init)(s, cd, flags, opts, len, inv, scale); 769 } 770 771 static void TX_NAME(ff_tx_fft)(AVTXContext *s, void *_dst, 772 void *_src, ptrdiff_t stride) 773 { 774 TXComplex *src = _src; 775 TXComplex *dst1 = s->flags & AV_TX_INPLACE ? s->tmp : _dst; 776 TXComplex *dst2 = _dst; 777 int *map = s->sub[0].map; 778 int len = s->len; 779 780 /* Compilers can't vectorize this anyway without assuming AVX2, which they 781 * generally don't, at least without -march=native -mtune=native */ 782 for (int i = 0; i < len; i++) 783 dst1[i] = src[map[i]]; 784 785 s->fn[0](&s->sub[0], dst2, dst1, stride); 786 } 787 788 static void TX_NAME(ff_tx_fft_inplace)(AVTXContext *s, void *_dst, 789 void *_src, ptrdiff_t stride) 790 { 791 TXComplex *src = _src; 792 TXComplex *dst = _dst; 793 TXComplex tmp; 794 const int *map = s->sub->map; 795 const int *inplace_idx = s->map; 796 int src_idx, dst_idx; 797 798 src_idx = *inplace_idx++; 799 do { 800 tmp = src[src_idx]; 801 dst_idx = map[src_idx]; 802 do { 803 FFSWAP(TXComplex, tmp, src[dst_idx]); 804 dst_idx = map[dst_idx]; 805 } while (dst_idx != src_idx); /* Can be > as well, but was less predictable */ 806 src[dst_idx] = tmp; 807 } while ((src_idx = *inplace_idx++)); 808 809 s->fn[0](&s->sub[0], dst, src, stride); 810 } 811 812 static const FFTXCodelet TX_NAME(ff_tx_fft_def) = { 813 .name = TX_NAME_STR("fft"), 814 .function = TX_NAME(ff_tx_fft), 815 .type = TX_TYPE(FFT), 816 .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE, 817 .factors[0] = TX_FACTOR_ANY, 818 .nb_factors = 1, 819 .min_len = 2, 820 .max_len = TX_LEN_UNLIMITED, 821 .init = TX_NAME(ff_tx_fft_init), 822 .cpu_flags = FF_TX_CPU_FLAGS_ALL, 823 .prio = FF_TX_PRIO_BASE, 824 }; 825 826 static const FFTXCodelet TX_NAME(ff_tx_fft_inplace_small_def) = { 827 .name = TX_NAME_STR("fft_inplace_small"), 828 .function = TX_NAME(ff_tx_fft), 829 .type = TX_TYPE(FFT), 830 .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | AV_TX_INPLACE, 831 .factors[0] = TX_FACTOR_ANY, 832 .nb_factors = 1, 833 .min_len = 2, 834 .max_len = 65536, 835 .init = TX_NAME(ff_tx_fft_inplace_small_init), 836 .cpu_flags = FF_TX_CPU_FLAGS_ALL, 837 .prio = FF_TX_PRIO_BASE - 256, 838 }; 839 840 static const FFTXCodelet TX_NAME(ff_tx_fft_inplace_def) = { 841 .name = TX_NAME_STR("fft_inplace"), 842 .function = TX_NAME(ff_tx_fft_inplace), 843 .type = TX_TYPE(FFT), 844 .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | AV_TX_INPLACE, 845 .factors[0] = TX_FACTOR_ANY, 846 .nb_factors = 1, 847 .min_len = 2, 848 .max_len = TX_LEN_UNLIMITED, 849 .init = TX_NAME(ff_tx_fft_init), 850 .cpu_flags = FF_TX_CPU_FLAGS_ALL, 851 .prio = FF_TX_PRIO_BASE - 512, 852 }; 853 854 static av_cold int TX_NAME(ff_tx_fft_init_naive_small)(AVTXContext *s, 855 const FFTXCodelet *cd, 856 uint64_t flags, 857 FFTXCodeletOptions *opts, 858 int len, int inv, 859 const void *scale) 860 { 861 const double phase = s->inv ? 2.0*M_PI/len : -2.0*M_PI/len; 862 863 if (!(s->exp = av_malloc(len*len*sizeof(*s->exp)))) 864 return AVERROR(ENOMEM); 865 866 for (int i = 0; i < len; i++) { 867 for (int j = 0; j < len; j++) { 868 const double factor = phase*i*j; 869 s->exp[i*j] = (TXComplex){ 870 RESCALE(cos(factor)), 871 RESCALE(sin(factor)), 872 }; 873 } 874 } 875 876 return 0; 877 } 878 879 static void TX_NAME(ff_tx_fft_naive)(AVTXContext *s, void *_dst, void *_src, 880 ptrdiff_t stride) 881 { 882 TXComplex *src = _src; 883 TXComplex *dst = _dst; 884 const int n = s->len; 885 double phase = s->inv ? 2.0*M_PI/n : -2.0*M_PI/n; 886 887 stride /= sizeof(*dst); 888 889 for (int i = 0; i < n; i++) { 890 TXComplex tmp = { 0 }; 891 for (int j = 0; j < n; j++) { 892 const double factor = phase*i*j; 893 const TXComplex mult = { 894 RESCALE(cos(factor)), 895 RESCALE(sin(factor)), 896 }; 897 TXComplex res; 898 CMUL3(res, src[j], mult); 899 tmp.re += res.re; 900 tmp.im += res.im; 901 } 902 dst[i*stride] = tmp; 903 } 904 } 905 906 static void TX_NAME(ff_tx_fft_naive_small)(AVTXContext *s, void *_dst, void *_src, 907 ptrdiff_t stride) 908 { 909 TXComplex *src = _src; 910 TXComplex *dst = _dst; 911 const int n = s->len; 912 913 stride /= sizeof(*dst); 914 915 for (int i = 0; i < n; i++) { 916 TXComplex tmp = { 0 }; 917 for (int j = 0; j < n; j++) { 918 TXComplex res; 919 const TXComplex mult = s->exp[i*j]; 920 CMUL3(res, src[j], mult); 921 tmp.re += res.re; 922 tmp.im += res.im; 923 } 924 dst[i*stride] = tmp; 925 } 926 } 927 928 static const FFTXCodelet TX_NAME(ff_tx_fft_naive_small_def) = { 929 .name = TX_NAME_STR("fft_naive_small"), 930 .function = TX_NAME(ff_tx_fft_naive_small), 931 .type = TX_TYPE(FFT), 932 .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE, 933 .factors[0] = TX_FACTOR_ANY, 934 .nb_factors = 1, 935 .min_len = 2, 936 .max_len = 1024, 937 .init = TX_NAME(ff_tx_fft_init_naive_small), 938 .cpu_flags = FF_TX_CPU_FLAGS_ALL, 939 .prio = FF_TX_PRIO_MIN/2, 940 }; 941 942 static const FFTXCodelet TX_NAME(ff_tx_fft_naive_def) = { 943 .name = TX_NAME_STR("fft_naive"), 944 .function = TX_NAME(ff_tx_fft_naive), 945 .type = TX_TYPE(FFT), 946 .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE, 947 .factors[0] = TX_FACTOR_ANY, 948 .nb_factors = 1, 949 .min_len = 2, 950 .max_len = TX_LEN_UNLIMITED, 951 .init = NULL, 952 .cpu_flags = FF_TX_CPU_FLAGS_ALL, 953 .prio = FF_TX_PRIO_MIN, 954 }; 955 956 static av_cold int TX_NAME(ff_tx_fft_pfa_init)(AVTXContext *s, 957 const FFTXCodelet *cd, 958 uint64_t flags, 959 FFTXCodeletOptions *opts, 960 int len, int inv, 961 const void *scale) 962 { 963 int ret, *tmp, ps = flags & FF_TX_PRESHUFFLE; 964 FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER }; 965 size_t extra_tmp_len = 0; 966 int len_list[TX_MAX_DECOMPOSITIONS]; 967 968 if ((ret = ff_tx_decompose_length(len_list, TX_TYPE(FFT), len, inv)) < 0) 969 return ret; 970 971 /* Two iterations to test both orderings. */ 972 for (int i = 0; i < ret; i++) { 973 int len1 = len_list[i]; 974 int len2 = len / len1; 975 976 /* Our ptwo transforms don't support striding the output. */ 977 if (len2 & (len2 - 1)) 978 FFSWAP(int, len1, len2); 979 980 ff_tx_clear_ctx(s); 981 982 /* First transform */ 983 sub_opts.map_dir = FF_TX_MAP_GATHER; 984 flags &= ~AV_TX_INPLACE; 985 flags |= FF_TX_OUT_OF_PLACE; 986 flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */ 987 ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, 988 len1, inv, scale); 989 990 if (ret == AVERROR(ENOMEM)) { 991 return ret; 992 } else if (ret < 0) { /* Try again without a preshuffle flag */ 993 flags &= ~FF_TX_PRESHUFFLE; 994 ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, 995 len1, inv, scale); 996 if (ret == AVERROR(ENOMEM)) 997 return ret; 998 else if (ret < 0) 999 continue; 1000 } 1001 1002 /* Second transform. */ 1003 sub_opts.map_dir = FF_TX_MAP_SCATTER; 1004 flags |= FF_TX_PRESHUFFLE; 1005 retry: 1006 flags &= ~FF_TX_OUT_OF_PLACE; 1007 flags |= AV_TX_INPLACE; 1008 ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, 1009 len2, inv, scale); 1010 1011 if (ret == AVERROR(ENOMEM)) { 1012 return ret; 1013 } else if (ret < 0) { /* Try again with an out-of-place transform */ 1014 flags |= FF_TX_OUT_OF_PLACE; 1015 flags &= ~AV_TX_INPLACE; 1016 ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, 1017 len2, inv, scale); 1018 if (ret == AVERROR(ENOMEM)) { 1019 return ret; 1020 } else if (ret < 0) { 1021 if (flags & FF_TX_PRESHUFFLE) { /* Retry again without a preshuf flag */ 1022 flags &= ~FF_TX_PRESHUFFLE; 1023 goto retry; 1024 } else { 1025 continue; 1026 } 1027 } 1028 } 1029 1030 /* Success */ 1031 break; 1032 } 1033 1034 /* If nothing was sucessful, error out */ 1035 if (ret < 0) 1036 return ret; 1037 1038 /* Generate PFA map */ 1039 if ((ret = ff_tx_gen_compound_mapping(s, opts, 0, 1040 s->sub[0].len, s->sub[1].len))) 1041 return ret; 1042 1043 if (!(s->tmp = av_malloc(len*sizeof(*s->tmp)))) 1044 return AVERROR(ENOMEM); 1045 1046 /* Flatten input map */ 1047 tmp = (int *)s->tmp; 1048 for (int k = 0; k < len; k += s->sub[0].len) { 1049 memcpy(tmp, &s->map[k], s->sub[0].len*sizeof(*tmp)); 1050 for (int i = 0; i < s->sub[0].len; i++) 1051 s->map[k + i] = tmp[s->sub[0].map[i]]; 1052 } 1053 1054 /* Only allocate extra temporary memory if we need it */ 1055 if (!(s->sub[1].flags & AV_TX_INPLACE)) 1056 extra_tmp_len = len; 1057 else if (!ps) 1058 extra_tmp_len = s->sub[0].len; 1059 1060 if (extra_tmp_len && !(s->exp = av_malloc(extra_tmp_len*sizeof(*s->exp)))) 1061 return AVERROR(ENOMEM); 1062 1063 return 0; 1064 } 1065 1066 static void TX_NAME(ff_tx_fft_pfa)(AVTXContext *s, void *_out, 1067 void *_in, ptrdiff_t stride) 1068 { 1069 const int n = s->sub[0].len, m = s->sub[1].len, l = s->len; 1070 const int *in_map = s->map, *out_map = in_map + l; 1071 const int *sub_map = s->sub[1].map; 1072 TXComplex *tmp1 = s->sub[1].flags & AV_TX_INPLACE ? s->tmp : s->exp; 1073 TXComplex *in = _in, *out = _out; 1074 1075 stride /= sizeof(*out); 1076 1077 for (int i = 0; i < m; i++) { 1078 for (int j = 0; j < n; j++) 1079 s->exp[j] = in[in_map[i*n + j]]; 1080 s->fn[0](&s->sub[0], &s->tmp[sub_map[i]], s->exp, m*sizeof(TXComplex)); 1081 } 1082 1083 for (int i = 0; i < n; i++) 1084 s->fn[1](&s->sub[1], &tmp1[m*i], &s->tmp[m*i], sizeof(TXComplex)); 1085 1086 for (int i = 0; i < l; i++) 1087 out[i*stride] = tmp1[out_map[i]]; 1088 } 1089 1090 static void TX_NAME(ff_tx_fft_pfa_ns)(AVTXContext *s, void *_out, 1091 void *_in, ptrdiff_t stride) 1092 { 1093 const int n = s->sub[0].len, m = s->sub[1].len, l = s->len; 1094 const int *in_map = s->map, *out_map = in_map + l; 1095 const int *sub_map = s->sub[1].map; 1096 TXComplex *tmp1 = s->sub[1].flags & AV_TX_INPLACE ? s->tmp : s->exp; 1097 TXComplex *in = _in, *out = _out; 1098 1099 stride /= sizeof(*out); 1100 1101 for (int i = 0; i < m; i++) 1102 s->fn[0](&s->sub[0], &s->tmp[sub_map[i]], &in[i*n], m*sizeof(TXComplex)); 1103 1104 for (int i = 0; i < n; i++) 1105 s->fn[1](&s->sub[1], &tmp1[m*i], &s->tmp[m*i], sizeof(TXComplex)); 1106 1107 for (int i = 0; i < l; i++) 1108 out[i*stride] = tmp1[out_map[i]]; 1109 } 1110 1111 static const FFTXCodelet TX_NAME(ff_tx_fft_pfa_def) = { 1112 .name = TX_NAME_STR("fft_pfa"), 1113 .function = TX_NAME(ff_tx_fft_pfa), 1114 .type = TX_TYPE(FFT), 1115 .flags = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE, 1116 .factors = { 7, 5, 3, 2, TX_FACTOR_ANY }, 1117 .nb_factors = 2, 1118 .min_len = 2*3, 1119 .max_len = TX_LEN_UNLIMITED, 1120 .init = TX_NAME(ff_tx_fft_pfa_init), 1121 .cpu_flags = FF_TX_CPU_FLAGS_ALL, 1122 .prio = FF_TX_PRIO_BASE, 1123 }; 1124 1125 static const FFTXCodelet TX_NAME(ff_tx_fft_pfa_ns_def) = { 1126 .name = TX_NAME_STR("fft_pfa_ns"), 1127 .function = TX_NAME(ff_tx_fft_pfa_ns), 1128 .type = TX_TYPE(FFT), 1129 .flags = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE | 1130 FF_TX_PRESHUFFLE, 1131 .factors = { 7, 5, 3, 2, TX_FACTOR_ANY }, 1132 .nb_factors = 2, 1133 .min_len = 2*3, 1134 .max_len = TX_LEN_UNLIMITED, 1135 .init = TX_NAME(ff_tx_fft_pfa_init), 1136 .cpu_flags = FF_TX_CPU_FLAGS_ALL, 1137 .prio = FF_TX_PRIO_BASE, 1138 }; 1139 1140 static av_cold int TX_NAME(ff_tx_mdct_naive_init)(AVTXContext *s, 1141 const FFTXCodelet *cd, 1142 uint64_t flags, 1143 FFTXCodeletOptions *opts, 1144 int len, int inv, 1145 const void *scale) 1146 { 1147 s->scale_d = *((SCALE_TYPE *)scale); 1148 s->scale_f = s->scale_d; 1149 return 0; 1150 } 1151 1152 static void TX_NAME(ff_tx_mdct_naive_fwd)(AVTXContext *s, void *_dst, 1153 void *_src, ptrdiff_t stride) 1154 { 1155 TXSample *src = _src; 1156 TXSample *dst = _dst; 1157 double scale = s->scale_d; 1158 int len = s->len; 1159 const double phase = M_PI/(4.0*len); 1160 1161 stride /= sizeof(*dst); 1162 1163 for (int i = 0; i < len; i++) { 1164 double sum = 0.0; 1165 for (int j = 0; j < len*2; j++) { 1166 int a = (2*j + 1 + len) * (2*i + 1); 1167 sum += UNSCALE(src[j]) * cos(a * phase); 1168 } 1169 dst[i*stride] = RESCALE(sum*scale); 1170 } 1171 } 1172 1173 static void TX_NAME(ff_tx_mdct_naive_inv)(AVTXContext *s, void *_dst, 1174 void *_src, ptrdiff_t stride) 1175 { 1176 TXSample *src = _src; 1177 TXSample *dst = _dst; 1178 double scale = s->scale_d; 1179 int len = s->len >> 1; 1180 int len2 = len*2; 1181 const double phase = M_PI/(4.0*len2); 1182 1183 stride /= sizeof(*src); 1184 1185 for (int i = 0; i < len; i++) { 1186 double sum_d = 0.0; 1187 double sum_u = 0.0; 1188 double i_d = phase * (4*len - 2*i - 1); 1189 double i_u = phase * (3*len2 + 2*i + 1); 1190 for (int j = 0; j < len2; j++) { 1191 double a = (2 * j + 1); 1192 double a_d = cos(a * i_d); 1193 double a_u = cos(a * i_u); 1194 double val = UNSCALE(src[j*stride]); 1195 sum_d += a_d * val; 1196 sum_u += a_u * val; 1197 } 1198 dst[i + 0] = RESCALE( sum_d*scale); 1199 dst[i + len] = RESCALE(-sum_u*scale); 1200 } 1201 } 1202 1203 static const FFTXCodelet TX_NAME(ff_tx_mdct_naive_fwd_def) = { 1204 .name = TX_NAME_STR("mdct_naive_fwd"), 1205 .function = TX_NAME(ff_tx_mdct_naive_fwd), 1206 .type = TX_TYPE(MDCT), 1207 .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY, 1208 .factors = { 2, TX_FACTOR_ANY }, /* MDCTs need an even length */ 1209 .nb_factors = 2, 1210 .min_len = 2, 1211 .max_len = TX_LEN_UNLIMITED, 1212 .init = TX_NAME(ff_tx_mdct_naive_init), 1213 .cpu_flags = FF_TX_CPU_FLAGS_ALL, 1214 .prio = FF_TX_PRIO_MIN, 1215 }; 1216 1217 static const FFTXCodelet TX_NAME(ff_tx_mdct_naive_inv_def) = { 1218 .name = TX_NAME_STR("mdct_naive_inv"), 1219 .function = TX_NAME(ff_tx_mdct_naive_inv), 1220 .type = TX_TYPE(MDCT), 1221 .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY, 1222 .factors = { 2, TX_FACTOR_ANY }, 1223 .nb_factors = 2, 1224 .min_len = 2, 1225 .max_len = TX_LEN_UNLIMITED, 1226 .init = TX_NAME(ff_tx_mdct_naive_init), 1227 .cpu_flags = FF_TX_CPU_FLAGS_ALL, 1228 .prio = FF_TX_PRIO_MIN, 1229 }; 1230 1231 static av_cold int TX_NAME(ff_tx_mdct_init)(AVTXContext *s, 1232 const FFTXCodelet *cd, 1233 uint64_t flags, 1234 FFTXCodeletOptions *opts, 1235 int len, int inv, 1236 const void *scale) 1237 { 1238 int ret; 1239 FFTXCodeletOptions sub_opts = { 1240 .map_dir = !inv ? FF_TX_MAP_SCATTER : FF_TX_MAP_GATHER, 1241 }; 1242 1243 s->scale_d = *((SCALE_TYPE *)scale); 1244 s->scale_f = s->scale_d; 1245 1246 flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */ 1247 flags |= AV_TX_INPLACE; /* in-place */ 1248 flags |= FF_TX_PRESHUFFLE; /* First try with an in-place transform */ 1249 1250 if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1, 1251 inv, scale))) { 1252 flags &= ~FF_TX_PRESHUFFLE; /* Now try with a generic FFT */ 1253 if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1, 1254 inv, scale))) 1255 return ret; 1256 } 1257 1258 s->map = av_malloc((len >> 1)*sizeof(*s->map)); 1259 if (!s->map) 1260 return AVERROR(ENOMEM); 1261 1262 /* If we need to preshuffle copy the map from the subcontext */ 1263 if (s->sub[0].flags & FF_TX_PRESHUFFLE) { 1264 memcpy(s->map, s->sub->map, (len >> 1)*sizeof(*s->map)); 1265 } else { 1266 for (int i = 0; i < len >> 1; i++) 1267 s->map[i] = i; 1268 } 1269 1270 if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL))) 1271 return ret; 1272 1273 /* Saves a multiply in a hot path. */ 1274 if (inv) 1275 for (int i = 0; i < (s->len >> 1); i++) 1276 s->map[i] <<= 1; 1277 1278 return 0; 1279 } 1280 1281 static void TX_NAME(ff_tx_mdct_fwd)(AVTXContext *s, void *_dst, void *_src, 1282 ptrdiff_t stride) 1283 { 1284 TXSample *src = _src, *dst = _dst; 1285 TXComplex *exp = s->exp, tmp, *z = _dst; 1286 const int len2 = s->len >> 1; 1287 const int len4 = s->len >> 2; 1288 const int len3 = len2 * 3; 1289 const int *sub_map = s->map; 1290 1291 stride /= sizeof(*dst); 1292 1293 for (int i = 0; i < len2; i++) { /* Folding and pre-reindexing */ 1294 const int k = 2*i; 1295 const int idx = sub_map[i]; 1296 if (k < len2) { 1297 tmp.re = FOLD(-src[ len2 + k], src[1*len2 - 1 - k]); 1298 tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]); 1299 } else { 1300 tmp.re = FOLD(-src[ len2 + k], -src[5*len2 - 1 - k]); 1301 tmp.im = FOLD( src[-len2 + k], -src[1*len3 - 1 - k]); 1302 } 1303 CMUL(z[idx].im, z[idx].re, tmp.re, tmp.im, exp[i].re, exp[i].im); 1304 } 1305 1306 s->fn[0](&s->sub[0], z, z, sizeof(TXComplex)); 1307 1308 for (int i = 0; i < len4; i++) { 1309 const int i0 = len4 + i, i1 = len4 - i - 1; 1310 TXComplex src1 = { z[i1].re, z[i1].im }; 1311 TXComplex src0 = { z[i0].re, z[i0].im }; 1312 1313 CMUL(dst[2*i1*stride + stride], dst[2*i0*stride], src0.re, src0.im, 1314 exp[i0].im, exp[i0].re); 1315 CMUL(dst[2*i0*stride + stride], dst[2*i1*stride], src1.re, src1.im, 1316 exp[i1].im, exp[i1].re); 1317 } 1318 } 1319 1320 static void TX_NAME(ff_tx_mdct_inv)(AVTXContext *s, void *_dst, void *_src, 1321 ptrdiff_t stride) 1322 { 1323 TXComplex *z = _dst, *exp = s->exp; 1324 const TXSample *src = _src, *in1, *in2; 1325 const int len2 = s->len >> 1; 1326 const int len4 = s->len >> 2; 1327 const int *sub_map = s->map; 1328 1329 stride /= sizeof(*src); 1330 in1 = src; 1331 in2 = src + ((len2*2) - 1) * stride; 1332 1333 for (int i = 0; i < len2; i++) { 1334 int k = sub_map[i]; 1335 TXComplex tmp = { in2[-k*stride], in1[k*stride] }; 1336 CMUL3(z[i], tmp, exp[i]); 1337 } 1338 1339 s->fn[0](&s->sub[0], z, z, sizeof(TXComplex)); 1340 1341 exp += len2; 1342 for (int i = 0; i < len4; i++) { 1343 const int i0 = len4 + i, i1 = len4 - i - 1; 1344 TXComplex src1 = { z[i1].im, z[i1].re }; 1345 TXComplex src0 = { z[i0].im, z[i0].re }; 1346 1347 CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re); 1348 CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re); 1349 } 1350 } 1351 1352 static const FFTXCodelet TX_NAME(ff_tx_mdct_fwd_def) = { 1353 .name = TX_NAME_STR("mdct_fwd"), 1354 .function = TX_NAME(ff_tx_mdct_fwd), 1355 .type = TX_TYPE(MDCT), 1356 .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY, 1357 .factors = { 2, TX_FACTOR_ANY }, 1358 .nb_factors = 2, 1359 .min_len = 2, 1360 .max_len = TX_LEN_UNLIMITED, 1361 .init = TX_NAME(ff_tx_mdct_init), 1362 .cpu_flags = FF_TX_CPU_FLAGS_ALL, 1363 .prio = FF_TX_PRIO_BASE, 1364 }; 1365 1366 static const FFTXCodelet TX_NAME(ff_tx_mdct_inv_def) = { 1367 .name = TX_NAME_STR("mdct_inv"), 1368 .function = TX_NAME(ff_tx_mdct_inv), 1369 .type = TX_TYPE(MDCT), 1370 .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY, 1371 .factors = { 2, TX_FACTOR_ANY }, 1372 .nb_factors = 2, 1373 .min_len = 2, 1374 .max_len = TX_LEN_UNLIMITED, 1375 .init = TX_NAME(ff_tx_mdct_init), 1376 .cpu_flags = FF_TX_CPU_FLAGS_ALL, 1377 .prio = FF_TX_PRIO_BASE, 1378 }; 1379 1380 static av_cold int TX_NAME(ff_tx_mdct_inv_full_init)(AVTXContext *s, 1381 const FFTXCodelet *cd, 1382 uint64_t flags, 1383 FFTXCodeletOptions *opts, 1384 int len, int inv, 1385 const void *scale) 1386 { 1387 int ret; 1388 1389 s->scale_d = *((SCALE_TYPE *)scale); 1390 s->scale_f = s->scale_d; 1391 1392 flags &= ~AV_TX_FULL_IMDCT; 1393 1394 if ((ret = ff_tx_init_subtx(s, TX_TYPE(MDCT), flags, NULL, len, 1, scale))) 1395 return ret; 1396 1397 return 0; 1398 } 1399 1400 static void TX_NAME(ff_tx_mdct_inv_full)(AVTXContext *s, void *_dst, 1401 void *_src, ptrdiff_t stride) 1402 { 1403 int len = s->len << 1; 1404 int len2 = len >> 1; 1405 int len4 = len >> 2; 1406 TXSample *dst = _dst; 1407 1408 s->fn[0](&s->sub[0], dst + len4, _src, stride); 1409 1410 stride /= sizeof(*dst); 1411 1412 for (int i = 0; i < len4; i++) { 1413 dst[ i*stride] = -dst[(len2 - i - 1)*stride]; 1414 dst[(len - i - 1)*stride] = dst[(len2 + i + 0)*stride]; 1415 } 1416 } 1417 1418 static const FFTXCodelet TX_NAME(ff_tx_mdct_inv_full_def) = { 1419 .name = TX_NAME_STR("mdct_inv_full"), 1420 .function = TX_NAME(ff_tx_mdct_inv_full), 1421 .type = TX_TYPE(MDCT), 1422 .flags = AV_TX_UNALIGNED | AV_TX_INPLACE | 1423 FF_TX_OUT_OF_PLACE | AV_TX_FULL_IMDCT, 1424 .factors = { 2, TX_FACTOR_ANY }, 1425 .nb_factors = 2, 1426 .min_len = 2, 1427 .max_len = TX_LEN_UNLIMITED, 1428 .init = TX_NAME(ff_tx_mdct_inv_full_init), 1429 .cpu_flags = FF_TX_CPU_FLAGS_ALL, 1430 .prio = FF_TX_PRIO_BASE, 1431 }; 1432 1433 static av_cold int TX_NAME(ff_tx_mdct_pfa_init)(AVTXContext *s, 1434 const FFTXCodelet *cd, 1435 uint64_t flags, 1436 FFTXCodeletOptions *opts, 1437 int len, int inv, 1438 const void *scale) 1439 { 1440 int ret, sub_len; 1441 FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_SCATTER }; 1442 1443 len >>= 1; 1444 sub_len = len / cd->factors[0]; 1445 1446 s->scale_d = *((SCALE_TYPE *)scale); 1447 s->scale_f = s->scale_d; 1448 1449 flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */ 1450 flags |= AV_TX_INPLACE; /* in-place */ 1451 flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */ 1452 1453 if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, 1454 sub_len, inv, scale))) 1455 return ret; 1456 1457 if ((ret = ff_tx_gen_compound_mapping(s, opts, s->inv, cd->factors[0], sub_len))) 1458 return ret; 1459 1460 /* Our 15-point transform is also a compound one, so embed its input map */ 1461 if (cd->factors[0] == 15) 1462 TX_EMBED_INPUT_PFA_MAP(s->map, len, 3, 5); 1463 1464 if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL))) 1465 return ret; 1466 1467 /* Saves multiplies in loops. */ 1468 for (int i = 0; i < len; i++) 1469 s->map[i] <<= 1; 1470 1471 if (!(s->tmp = av_malloc(len*sizeof(*s->tmp)))) 1472 return AVERROR(ENOMEM); 1473 1474 TX_TAB(ff_tx_init_tabs)(len / sub_len); 1475 1476 return 0; 1477 } 1478 1479 #define DECL_COMP_IMDCT(N) \ 1480 static void TX_NAME(ff_tx_mdct_pfa_##N##xM_inv)(AVTXContext *s, void *_dst, \ 1481 void *_src, ptrdiff_t stride) \ 1482 { \ 1483 TXComplex fft##N##in[N]; \ 1484 TXComplex *z = _dst, *exp = s->exp; \ 1485 const TXSample *src = _src, *in1, *in2; \ 1486 const int len4 = s->len >> 2; \ 1487 const int len2 = s->len >> 1; \ 1488 const int m = s->sub->len; \ 1489 const int *in_map = s->map, *out_map = in_map + N*m; \ 1490 const int *sub_map = s->sub->map; \ 1491 \ 1492 stride /= sizeof(*src); /* To convert it from bytes */ \ 1493 in1 = src; \ 1494 in2 = src + ((N*m*2) - 1) * stride; \ 1495 \ 1496 for (int i = 0; i < len2; i += N) { \ 1497 for (int j = 0; j < N; j++) { \ 1498 const int k = in_map[j]; \ 1499 TXComplex tmp = { in2[-k*stride], in1[k*stride] }; \ 1500 CMUL3(fft##N##in[j], tmp, exp[j]); \ 1501 } \ 1502 fft##N(s->tmp + *(sub_map++), fft##N##in, m); \ 1503 exp += N; \ 1504 in_map += N; \ 1505 } \ 1506 \ 1507 for (int i = 0; i < N; i++) \ 1508 s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex)); \ 1509 \ 1510 for (int i = 0; i < len4; i++) { \ 1511 const int i0 = len4 + i, i1 = len4 - i - 1; \ 1512 const int s0 = out_map[i0], s1 = out_map[i1]; \ 1513 TXComplex src1 = { s->tmp[s1].im, s->tmp[s1].re }; \ 1514 TXComplex src0 = { s->tmp[s0].im, s->tmp[s0].re }; \ 1515 \ 1516 CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re); \ 1517 CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re); \ 1518 } \ 1519 } \ 1520 \ 1521 static const FFTXCodelet TX_NAME(ff_tx_mdct_pfa_##N##xM_inv_def) = { \ 1522 .name = TX_NAME_STR("mdct_pfa_" #N "xM_inv"), \ 1523 .function = TX_NAME(ff_tx_mdct_pfa_##N##xM_inv), \ 1524 .type = TX_TYPE(MDCT), \ 1525 .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY, \ 1526 .factors = { N, TX_FACTOR_ANY }, \ 1527 .nb_factors = 2, \ 1528 .min_len = N*2, \ 1529 .max_len = TX_LEN_UNLIMITED, \ 1530 .init = TX_NAME(ff_tx_mdct_pfa_init), \ 1531 .cpu_flags = FF_TX_CPU_FLAGS_ALL, \ 1532 .prio = FF_TX_PRIO_BASE, \ 1533 }; 1534 1535 DECL_COMP_IMDCT(3) 1536 DECL_COMP_IMDCT(5) 1537 DECL_COMP_IMDCT(7) 1538 DECL_COMP_IMDCT(9) 1539 DECL_COMP_IMDCT(15) 1540 1541 #define DECL_COMP_MDCT(N) \ 1542 static void TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd)(AVTXContext *s, void *_dst, \ 1543 void *_src, ptrdiff_t stride) \ 1544 { \ 1545 TXComplex fft##N##in[N]; \ 1546 TXSample *src = _src, *dst = _dst; \ 1547 TXComplex *exp = s->exp, tmp; \ 1548 const int m = s->sub->len; \ 1549 const int len4 = N*m; \ 1550 const int len3 = len4 * 3; \ 1551 const int len8 = s->len >> 2; \ 1552 const int *in_map = s->map, *out_map = in_map + N*m; \ 1553 const int *sub_map = s->sub->map; \ 1554 \ 1555 stride /= sizeof(*dst); \ 1556 \ 1557 for (int i = 0; i < m; i++) { /* Folding and pre-reindexing */ \ 1558 for (int j = 0; j < N; j++) { \ 1559 const int k = in_map[i*N + j]; \ 1560 if (k < len4) { \ 1561 tmp.re = FOLD(-src[ len4 + k], src[1*len4 - 1 - k]); \ 1562 tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]); \ 1563 } else { \ 1564 tmp.re = FOLD(-src[ len4 + k], -src[5*len4 - 1 - k]); \ 1565 tmp.im = FOLD( src[-len4 + k], -src[1*len3 - 1 - k]); \ 1566 } \ 1567 CMUL(fft##N##in[j].im, fft##N##in[j].re, tmp.re, tmp.im, \ 1568 exp[k >> 1].re, exp[k >> 1].im); \ 1569 } \ 1570 fft##N(s->tmp + sub_map[i], fft##N##in, m); \ 1571 } \ 1572 \ 1573 for (int i = 0; i < N; i++) \ 1574 s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex)); \ 1575 \ 1576 for (int i = 0; i < len8; i++) { \ 1577 const int i0 = len8 + i, i1 = len8 - i - 1; \ 1578 const int s0 = out_map[i0], s1 = out_map[i1]; \ 1579 TXComplex src1 = { s->tmp[s1].re, s->tmp[s1].im }; \ 1580 TXComplex src0 = { s->tmp[s0].re, s->tmp[s0].im }; \ 1581 \ 1582 CMUL(dst[2*i1*stride + stride], dst[2*i0*stride], src0.re, src0.im, \ 1583 exp[i0].im, exp[i0].re); \ 1584 CMUL(dst[2*i0*stride + stride], dst[2*i1*stride], src1.re, src1.im, \ 1585 exp[i1].im, exp[i1].re); \ 1586 } \ 1587 } \ 1588 \ 1589 static const FFTXCodelet TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd_def) = { \ 1590 .name = TX_NAME_STR("mdct_pfa_" #N "xM_fwd"), \ 1591 .function = TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd), \ 1592 .type = TX_TYPE(MDCT), \ 1593 .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY, \ 1594 .factors = { N, TX_FACTOR_ANY }, \ 1595 .nb_factors = 2, \ 1596 .min_len = N*2, \ 1597 .max_len = TX_LEN_UNLIMITED, \ 1598 .init = TX_NAME(ff_tx_mdct_pfa_init), \ 1599 .cpu_flags = FF_TX_CPU_FLAGS_ALL, \ 1600 .prio = FF_TX_PRIO_BASE, \ 1601 }; 1602 1603 DECL_COMP_MDCT(3) 1604 DECL_COMP_MDCT(5) 1605 DECL_COMP_MDCT(7) 1606 DECL_COMP_MDCT(9) 1607 DECL_COMP_MDCT(15) 1608 1609 static av_cold int TX_NAME(ff_tx_rdft_init)(AVTXContext *s, 1610 const FFTXCodelet *cd, 1611 uint64_t flags, 1612 FFTXCodeletOptions *opts, 1613 int len, int inv, 1614 const void *scale) 1615 { 1616 int ret; 1617 double f, m; 1618 TXSample *tab; 1619 uint64_t r2r = flags & AV_TX_REAL_TO_REAL; 1620 int len4 = FFALIGN(len, 4) / 4; 1621 1622 s->scale_d = *((SCALE_TYPE *)scale); 1623 s->scale_f = s->scale_d; 1624 1625 flags &= ~(AV_TX_REAL_TO_REAL | AV_TX_REAL_TO_IMAGINARY); 1626 1627 if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, NULL, len >> 1, inv, scale))) 1628 return ret; 1629 1630 if (!(s->exp = av_mallocz((8 + 2*len4)*sizeof(*s->exp)))) 1631 return AVERROR(ENOMEM); 1632 1633 tab = (TXSample *)s->exp; 1634 1635 f = 2*M_PI/len; 1636 1637 m = (inv ? 2*s->scale_d : s->scale_d); 1638 1639 *tab++ = RESCALE((inv ? 0.5 : 1.0) * m); 1640 *tab++ = RESCALE(inv ? 0.5*m : 1.0*m); 1641 *tab++ = RESCALE( m); 1642 *tab++ = RESCALE(-m); 1643 1644 *tab++ = RESCALE( (0.5 - 0.0) * m); 1645 if (r2r) 1646 *tab++ = 1 / s->scale_f; 1647 else 1648 *tab++ = RESCALE( (0.0 - 0.5) * m); 1649 *tab++ = RESCALE( (0.5 - inv) * m); 1650 *tab++ = RESCALE(-(0.5 - inv) * m); 1651 1652 for (int i = 0; i < len4; i++) 1653 *tab++ = RESCALE(cos(i*f)); 1654 1655 tab = ((TXSample *)s->exp) + len4 + 8; 1656 1657 for (int i = 0; i < len4; i++) 1658 *tab++ = RESCALE(cos(((len - i*4)/4.0)*f)) * (inv ? 1 : -1); 1659 1660 return 0; 1661 } 1662 1663 #define DECL_RDFT(n, inv) \ 1664 static void TX_NAME(ff_tx_rdft_ ##n)(AVTXContext *s, void *_dst, \ 1665 void *_src, ptrdiff_t stride) \ 1666 { \ 1667 const int len2 = s->len >> 1; \ 1668 const int len4 = s->len >> 2; \ 1669 const TXSample *fact = (void *)s->exp; \ 1670 const TXSample *tcos = fact + 8; \ 1671 const TXSample *tsin = tcos + len4; \ 1672 TXComplex *data = inv ? _src : _dst; \ 1673 TXComplex t[3]; \ 1674 \ 1675 if (!inv) \ 1676 s->fn[0](&s->sub[0], data, _src, sizeof(TXComplex)); \ 1677 else \ 1678 data[0].im = data[len2].re; \ 1679 \ 1680 /* The DC value's both components are real, but we need to change them \ 1681 * into complex values. Also, the middle of the array is special-cased. \ 1682 * These operations can be done before or after the loop. */ \ 1683 t[0].re = data[0].re; \ 1684 data[0].re = t[0].re + data[0].im; \ 1685 data[0].im = t[0].re - data[0].im; \ 1686 data[ 0].re = MULT(fact[0], data[ 0].re); \ 1687 data[ 0].im = MULT(fact[1], data[ 0].im); \ 1688 data[len4].re = MULT(fact[2], data[len4].re); \ 1689 data[len4].im = MULT(fact[3], data[len4].im); \ 1690 \ 1691 for (int i = 1; i < len4; i++) { \ 1692 /* Separate even and odd FFTs */ \ 1693 t[0].re = MULT(fact[4], (data[i].re + data[len2 - i].re)); \ 1694 t[0].im = MULT(fact[5], (data[i].im - data[len2 - i].im)); \ 1695 t[1].re = MULT(fact[6], (data[i].im + data[len2 - i].im)); \ 1696 t[1].im = MULT(fact[7], (data[i].re - data[len2 - i].re)); \ 1697 \ 1698 /* Apply twiddle factors to the odd FFT and add to the even FFT */ \ 1699 CMUL(t[2].re, t[2].im, t[1].re, t[1].im, tcos[i], tsin[i]); \ 1700 \ 1701 data[ i].re = t[0].re + t[2].re; \ 1702 data[ i].im = t[2].im - t[0].im; \ 1703 data[len2 - i].re = t[0].re - t[2].re; \ 1704 data[len2 - i].im = t[2].im + t[0].im; \ 1705 } \ 1706 \ 1707 if (inv) { \ 1708 s->fn[0](&s->sub[0], _dst, data, sizeof(TXComplex)); \ 1709 } else { \ 1710 /* Move [0].im to the last position, as convention requires */ \ 1711 data[len2].re = data[0].im; \ 1712 data[ 0].im = data[len2].im = 0; \ 1713 } \ 1714 } \ 1715 \ 1716 static const FFTXCodelet TX_NAME(ff_tx_rdft_ ##n## _def) = { \ 1717 .name = TX_NAME_STR("rdft_" #n), \ 1718 .function = TX_NAME(ff_tx_rdft_ ##n), \ 1719 .type = TX_TYPE(RDFT), \ 1720 .flags = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE | \ 1721 (inv ? FF_TX_INVERSE_ONLY : FF_TX_FORWARD_ONLY), \ 1722 .factors = { 4, TX_FACTOR_ANY }, \ 1723 .nb_factors = 2, \ 1724 .min_len = 4, \ 1725 .max_len = TX_LEN_UNLIMITED, \ 1726 .init = TX_NAME(ff_tx_rdft_init), \ 1727 .cpu_flags = FF_TX_CPU_FLAGS_ALL, \ 1728 .prio = FF_TX_PRIO_BASE, \ 1729 }; 1730 1731 DECL_RDFT(r2c, 0) 1732 DECL_RDFT(c2r, 1) 1733 1734 #define DECL_RDFT_HALF(n, mode, mod2) \ 1735 static void TX_NAME(ff_tx_rdft_ ##n)(AVTXContext *s, void *_dst, \ 1736 void *_src, ptrdiff_t stride) \ 1737 { \ 1738 const int len = s->len; \ 1739 const int len2 = len >> 1; \ 1740 const int len4 = len >> 2; \ 1741 const int aligned_len4 = FFALIGN(len, 4)/4; \ 1742 const TXSample *fact = (void *)s->exp; \ 1743 const TXSample *tcos = fact + 8; \ 1744 const TXSample *tsin = tcos + aligned_len4; \ 1745 TXComplex *data = _dst; \ 1746 TXSample *out = _dst; /* Half-complex is forward-only */ \ 1747 TXSample tmp_dc; \ 1748 av_unused TXSample tmp_mid; \ 1749 TXSample tmp[4]; \ 1750 TXComplex sf, sl; \ 1751 \ 1752 s->fn[0](&s->sub[0], _dst, _src, sizeof(TXComplex)); \ 1753 \ 1754 tmp_dc = data[0].re; \ 1755 data[ 0].re = tmp_dc + data[0].im; \ 1756 tmp_dc = tmp_dc - data[0].im; \ 1757 \ 1758 data[ 0].re = MULT(fact[0], data[ 0].re); \ 1759 tmp_dc = MULT(fact[1], tmp_dc); \ 1760 data[len4].re = MULT(fact[2], data[len4].re); \ 1761 \ 1762 if (!mod2) { \ 1763 data[len4].im = MULT(fact[3], data[len4].im); \ 1764 } else { \ 1765 sf = data[len4]; \ 1766 sl = data[len4 + 1]; \ 1767 if (mode == AV_TX_REAL_TO_REAL) \ 1768 tmp[0] = MULT(fact[4], (sf.re + sl.re)); \ 1769 else \ 1770 tmp[0] = MULT(fact[5], (sf.im - sl.im)); \ 1771 tmp[1] = MULT(fact[6], (sf.im + sl.im)); \ 1772 tmp[2] = MULT(fact[7], (sf.re - sl.re)); \ 1773 \ 1774 if (mode == AV_TX_REAL_TO_REAL) { \ 1775 tmp[3] = tmp[1]*tcos[len4] - tmp[2]*tsin[len4]; \ 1776 tmp_mid = (tmp[0] - tmp[3]); \ 1777 } else { \ 1778 tmp[3] = tmp[1]*tsin[len4] + tmp[2]*tcos[len4]; \ 1779 tmp_mid = (tmp[0] + tmp[3]); \ 1780 } \ 1781 } \ 1782 \ 1783 /* NOTE: unrolling this breaks non-mod8 lengths */ \ 1784 for (int i = 1; i <= len4; i++) { \ 1785 TXSample tmp[4]; \ 1786 TXComplex sf = data[i]; \ 1787 TXComplex sl = data[len2 - i]; \ 1788 \ 1789 if (mode == AV_TX_REAL_TO_REAL) \ 1790 tmp[0] = MULT(fact[4], (sf.re + sl.re)); \ 1791 else \ 1792 tmp[0] = MULT(fact[5], (sf.im - sl.im)); \ 1793 \ 1794 tmp[1] = MULT(fact[6], (sf.im + sl.im)); \ 1795 tmp[2] = MULT(fact[7], (sf.re - sl.re)); \ 1796 \ 1797 if (mode == AV_TX_REAL_TO_REAL) { \ 1798 tmp[3] = tmp[1]*tcos[i] - tmp[2]*tsin[i]; \ 1799 out[i] = (tmp[0] + tmp[3]); \ 1800 out[len - i] = (tmp[0] - tmp[3]); \ 1801 } else { \ 1802 tmp[3] = tmp[1]*tsin[i] + tmp[2]*tcos[i]; \ 1803 out[i - 1] = (tmp[3] - tmp[0]); \ 1804 out[len - i - 1] = (tmp[0] + tmp[3]); \ 1805 } \ 1806 } \ 1807 \ 1808 for (int i = 1; i < (len4 + (mode == AV_TX_REAL_TO_IMAGINARY)); i++) \ 1809 out[len2 - i] = out[len - i]; \ 1810 \ 1811 if (mode == AV_TX_REAL_TO_REAL) { \ 1812 out[len2] = tmp_dc; \ 1813 if (mod2) \ 1814 out[len4 + 1] = tmp_mid * fact[5]; \ 1815 } else if (mod2) { \ 1816 out[len4] = tmp_mid; \ 1817 } \ 1818 } \ 1819 \ 1820 static const FFTXCodelet TX_NAME(ff_tx_rdft_ ##n## _def) = { \ 1821 .name = TX_NAME_STR("rdft_" #n), \ 1822 .function = TX_NAME(ff_tx_rdft_ ##n), \ 1823 .type = TX_TYPE(RDFT), \ 1824 .flags = AV_TX_UNALIGNED | AV_TX_INPLACE | mode | \ 1825 FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY, \ 1826 .factors = { 2 + 2*(!mod2), TX_FACTOR_ANY }, \ 1827 .nb_factors = 2, \ 1828 .min_len = 2 + 2*(!mod2), \ 1829 .max_len = TX_LEN_UNLIMITED, \ 1830 .init = TX_NAME(ff_tx_rdft_init), \ 1831 .cpu_flags = FF_TX_CPU_FLAGS_ALL, \ 1832 .prio = FF_TX_PRIO_BASE, \ 1833 }; 1834 1835 DECL_RDFT_HALF(r2r, AV_TX_REAL_TO_REAL, 0) 1836 DECL_RDFT_HALF(r2r_mod2, AV_TX_REAL_TO_REAL, 1) 1837 DECL_RDFT_HALF(r2i, AV_TX_REAL_TO_IMAGINARY, 0) 1838 DECL_RDFT_HALF(r2i_mod2, AV_TX_REAL_TO_IMAGINARY, 1) 1839 1840 static av_cold int TX_NAME(ff_tx_dct_init)(AVTXContext *s, 1841 const FFTXCodelet *cd, 1842 uint64_t flags, 1843 FFTXCodeletOptions *opts, 1844 int len, int inv, 1845 const void *scale) 1846 { 1847 int ret; 1848 double freq; 1849 TXSample *tab; 1850 SCALE_TYPE rsc = *((SCALE_TYPE *)scale); 1851 1852 if (inv) { 1853 len *= 2; 1854 s->len *= 2; 1855 rsc *= 0.5; 1856 } 1857 1858 if ((ret = ff_tx_init_subtx(s, TX_TYPE(RDFT), flags, NULL, len, inv, &rsc))) 1859 return ret; 1860 1861 s->exp = av_malloc((len/2)*3*sizeof(TXSample)); 1862 if (!s->exp) 1863 return AVERROR(ENOMEM); 1864 1865 tab = (TXSample *)s->exp; 1866 1867 freq = M_PI/(len*2); 1868 1869 for (int i = 0; i < len; i++) 1870 tab[i] = RESCALE(cos(i*freq)*(!inv + 1)); 1871 1872 if (inv) { 1873 for (int i = 0; i < len/2; i++) 1874 tab[len + i] = RESCALE(0.5 / sin((2*i + 1)*freq)); 1875 } else { 1876 for (int i = 0; i < len/2; i++) 1877 tab[len + i] = RESCALE(cos((len - 2*i - 1)*freq)); 1878 } 1879 1880 return 0; 1881 } 1882 1883 static void TX_NAME(ff_tx_dctII)(AVTXContext *s, void *_dst, 1884 void *_src, ptrdiff_t stride) 1885 { 1886 TXSample *dst = _dst; 1887 TXSample *src = _src; 1888 const int len = s->len; 1889 const int len2 = len >> 1; 1890 const TXSample *exp = (void *)s->exp; 1891 TXSample next; 1892 #ifdef TX_INT32 1893 int64_t tmp1, tmp2; 1894 #else 1895 TXSample tmp1, tmp2; 1896 #endif 1897 1898 for (int i = 0; i < len2; i++) { 1899 TXSample in1 = src[i]; 1900 TXSample in2 = src[len - i - 1]; 1901 TXSample s = exp[len + i]; 1902 1903 #ifdef TX_INT32 1904 tmp1 = in1 + in2; 1905 tmp2 = in1 - in2; 1906 1907 tmp1 >>= 1; 1908 tmp2 *= s; 1909 1910 tmp2 = (tmp2 + 0x40000000) >> 31; 1911 #else 1912 tmp1 = (in1 + in2)*0.5; 1913 tmp2 = (in1 - in2)*s; 1914 #endif 1915 1916 src[i] = tmp1 + tmp2; 1917 src[len - i - 1] = tmp1 - tmp2; 1918 } 1919 1920 s->fn[0](&s->sub[0], dst, src, sizeof(TXComplex)); 1921 1922 next = dst[len]; 1923 1924 for (int i = len - 2; i > 0; i -= 2) { 1925 TXSample tmp; 1926 1927 CMUL(tmp, dst[i], exp[len - i], exp[i], dst[i + 0], dst[i + 1]); 1928 1929 dst[i + 1] = next; 1930 1931 next += tmp; 1932 } 1933 1934 #ifdef TX_INT32 1935 tmp1 = ((int64_t)exp[0]) * ((int64_t)dst[0]); 1936 dst[0] = (tmp1 + 0x40000000) >> 31; 1937 #else 1938 dst[0] = exp[0] * dst[0]; 1939 #endif 1940 dst[1] = next; 1941 } 1942 1943 static void TX_NAME(ff_tx_dctIII)(AVTXContext *s, void *_dst, 1944 void *_src, ptrdiff_t stride) 1945 { 1946 TXSample *dst = _dst; 1947 TXSample *src = _src; 1948 const int len = s->len; 1949 const int len2 = len >> 1; 1950 const TXSample *exp = (void *)s->exp; 1951 #ifdef TX_INT32 1952 int64_t tmp1, tmp2 = src[len - 1]; 1953 tmp2 = (2*tmp2 + 0x40000000) >> 31; 1954 #else 1955 TXSample tmp1, tmp2 = 2*src[len - 1]; 1956 #endif 1957 1958 src[len] = tmp2; 1959 1960 for (int i = len - 2; i >= 2; i -= 2) { 1961 TXSample val1 = src[i - 0]; 1962 TXSample val2 = src[i - 1] - src[i + 1]; 1963 1964 CMUL(src[i + 1], src[i], exp[len - i], exp[i], val1, val2); 1965 } 1966 1967 s->fn[0](&s->sub[0], dst, src, sizeof(float)); 1968 1969 for (int i = 0; i < len2; i++) { 1970 TXSample in1 = dst[i]; 1971 TXSample in2 = dst[len - i - 1]; 1972 TXSample c = exp[len + i]; 1973 1974 tmp1 = in1 + in2; 1975 tmp2 = in1 - in2; 1976 tmp2 *= c; 1977 #ifdef TX_INT32 1978 tmp2 = (tmp2 + 0x40000000) >> 31; 1979 #endif 1980 1981 dst[i] = tmp1 + tmp2; 1982 dst[len - i - 1] = tmp1 - tmp2; 1983 } 1984 } 1985 1986 static const FFTXCodelet TX_NAME(ff_tx_dctII_def) = { 1987 .name = TX_NAME_STR("dctII"), 1988 .function = TX_NAME(ff_tx_dctII), 1989 .type = TX_TYPE(DCT), 1990 .flags = AV_TX_UNALIGNED | AV_TX_INPLACE | 1991 FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY, 1992 .factors = { 2, TX_FACTOR_ANY }, 1993 .min_len = 2, 1994 .max_len = TX_LEN_UNLIMITED, 1995 .init = TX_NAME(ff_tx_dct_init), 1996 .cpu_flags = FF_TX_CPU_FLAGS_ALL, 1997 .prio = FF_TX_PRIO_BASE, 1998 }; 1999 2000 static const FFTXCodelet TX_NAME(ff_tx_dctIII_def) = { 2001 .name = TX_NAME_STR("dctIII"), 2002 .function = TX_NAME(ff_tx_dctIII), 2003 .type = TX_TYPE(DCT), 2004 .flags = AV_TX_UNALIGNED | AV_TX_INPLACE | 2005 FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY, 2006 .factors = { 2, TX_FACTOR_ANY }, 2007 .min_len = 2, 2008 .max_len = TX_LEN_UNLIMITED, 2009 .init = TX_NAME(ff_tx_dct_init), 2010 .cpu_flags = FF_TX_CPU_FLAGS_ALL, 2011 .prio = FF_TX_PRIO_BASE, 2012 }; 2013 2014 static av_cold int TX_NAME(ff_tx_dcstI_init)(AVTXContext *s, 2015 const FFTXCodelet *cd, 2016 uint64_t flags, 2017 FFTXCodeletOptions *opts, 2018 int len, int inv, 2019 const void *scale) 2020 { 2021 int ret; 2022 SCALE_TYPE rsc = *((SCALE_TYPE *)scale); 2023 2024 if (inv) { 2025 len *= 2; 2026 s->len *= 2; 2027 rsc *= 0.5; 2028 } 2029 2030 /* We want a half-complex RDFT */ 2031 flags |= cd->type == TX_TYPE(DCT_I) ? AV_TX_REAL_TO_REAL : 2032 AV_TX_REAL_TO_IMAGINARY; 2033 2034 if ((ret = ff_tx_init_subtx(s, TX_TYPE(RDFT), flags, NULL, 2035 (len - 1 + 2*(cd->type == TX_TYPE(DST_I)))*2, 2036 0, &rsc))) 2037 return ret; 2038 2039 s->tmp = av_mallocz((len + 1)*2*sizeof(TXSample)); 2040 if (!s->tmp) 2041 return AVERROR(ENOMEM); 2042 2043 return 0; 2044 } 2045 2046 static void TX_NAME(ff_tx_dctI)(AVTXContext *s, void *_dst, 2047 void *_src, ptrdiff_t stride) 2048 { 2049 TXSample *dst = _dst; 2050 TXSample *src = _src; 2051 const int len = s->len - 1; 2052 TXSample *tmp = (TXSample *)s->tmp; 2053 2054 stride /= sizeof(TXSample); 2055 2056 for (int i = 0; i < len; i++) 2057 tmp[i] = tmp[2*len - i] = src[i * stride]; 2058 2059 tmp[len] = src[len * stride]; /* Middle */ 2060 2061 s->fn[0](&s->sub[0], dst, tmp, sizeof(TXSample)); 2062 } 2063 2064 static void TX_NAME(ff_tx_dstI)(AVTXContext *s, void *_dst, 2065 void *_src, ptrdiff_t stride) 2066 { 2067 TXSample *dst = _dst; 2068 TXSample *src = _src; 2069 const int len = s->len + 1; 2070 TXSample *tmp = (void *)s->tmp; 2071 2072 stride /= sizeof(TXSample); 2073 2074 tmp[0] = 0; 2075 2076 for (int i = 1; i < len; i++) { 2077 TXSample a = src[(i - 1) * stride]; 2078 tmp[i] = -a; 2079 tmp[2*len - i] = a; 2080 } 2081 2082 tmp[len] = 0; /* i == n, Nyquist */ 2083 2084 s->fn[0](&s->sub[0], dst, tmp, sizeof(float)); 2085 } 2086 2087 static const FFTXCodelet TX_NAME(ff_tx_dctI_def) = { 2088 .name = TX_NAME_STR("dctI"), 2089 .function = TX_NAME(ff_tx_dctI), 2090 .type = TX_TYPE(DCT_I), 2091 .flags = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE, 2092 .factors = { 2, TX_FACTOR_ANY }, 2093 .nb_factors = 2, 2094 .min_len = 2, 2095 .max_len = TX_LEN_UNLIMITED, 2096 .init = TX_NAME(ff_tx_dcstI_init), 2097 .cpu_flags = FF_TX_CPU_FLAGS_ALL, 2098 .prio = FF_TX_PRIO_BASE, 2099 }; 2100 2101 static const FFTXCodelet TX_NAME(ff_tx_dstI_def) = { 2102 .name = TX_NAME_STR("dstI"), 2103 .function = TX_NAME(ff_tx_dstI), 2104 .type = TX_TYPE(DST_I), 2105 .flags = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE, 2106 .factors = { 2, TX_FACTOR_ANY }, 2107 .nb_factors = 2, 2108 .min_len = 2, 2109 .max_len = TX_LEN_UNLIMITED, 2110 .init = TX_NAME(ff_tx_dcstI_init), 2111 .cpu_flags = FF_TX_CPU_FLAGS_ALL, 2112 .prio = FF_TX_PRIO_BASE, 2113 }; 2114 2115 int TX_TAB(ff_tx_mdct_gen_exp)(AVTXContext *s, int *pre_tab) 2116 { 2117 int off = 0; 2118 int len4 = s->len >> 1; 2119 double scale = s->scale_d; 2120 const double theta = (scale < 0 ? len4 : 0) + 1.0/8.0; 2121 size_t alloc = pre_tab ? 2*len4 : len4; 2122 2123 if (!(s->exp = av_malloc_array(alloc, sizeof(*s->exp)))) 2124 return AVERROR(ENOMEM); 2125 2126 scale = sqrt(fabs(scale)); 2127 2128 if (pre_tab) 2129 off = len4; 2130 2131 for (int i = 0; i < len4; i++) { 2132 const double alpha = M_PI_2 * (i + theta) / len4; 2133 s->exp[off + i] = (TXComplex){ RESCALE(cos(alpha) * scale), 2134 RESCALE(sin(alpha) * scale) }; 2135 } 2136 2137 if (pre_tab) 2138 for (int i = 0; i < len4; i++) 2139 s->exp[i] = s->exp[len4 + pre_tab[i]]; 2140 2141 return 0; 2142 } 2143 2144 const FFTXCodelet * const TX_NAME(ff_tx_codelet_list)[] = { 2145 /* Split-Radix codelets */ 2146 &TX_NAME(ff_tx_fft2_ns_def), 2147 &TX_NAME(ff_tx_fft4_ns_def), 2148 &TX_NAME(ff_tx_fft8_ns_def), 2149 &TX_NAME(ff_tx_fft16_ns_def), 2150 &TX_NAME(ff_tx_fft32_ns_def), 2151 &TX_NAME(ff_tx_fft64_ns_def), 2152 &TX_NAME(ff_tx_fft128_ns_def), 2153 &TX_NAME(ff_tx_fft256_ns_def), 2154 &TX_NAME(ff_tx_fft512_ns_def), 2155 &TX_NAME(ff_tx_fft1024_ns_def), 2156 &TX_NAME(ff_tx_fft2048_ns_def), 2157 &TX_NAME(ff_tx_fft4096_ns_def), 2158 &TX_NAME(ff_tx_fft8192_ns_def), 2159 &TX_NAME(ff_tx_fft16384_ns_def), 2160 &TX_NAME(ff_tx_fft32768_ns_def), 2161 &TX_NAME(ff_tx_fft65536_ns_def), 2162 &TX_NAME(ff_tx_fft131072_ns_def), 2163 &TX_NAME(ff_tx_fft262144_ns_def), 2164 &TX_NAME(ff_tx_fft524288_ns_def), 2165 &TX_NAME(ff_tx_fft1048576_ns_def), 2166 &TX_NAME(ff_tx_fft2097152_ns_def), 2167 2168 /* Prime factor codelets */ 2169 &TX_NAME(ff_tx_fft3_ns_def), 2170 &TX_NAME(ff_tx_fft5_ns_def), 2171 &TX_NAME(ff_tx_fft7_ns_def), 2172 &TX_NAME(ff_tx_fft9_ns_def), 2173 &TX_NAME(ff_tx_fft15_ns_def), 2174 2175 /* We get these for free */ 2176 &TX_NAME(ff_tx_fft3_fwd_def), 2177 &TX_NAME(ff_tx_fft5_fwd_def), 2178 &TX_NAME(ff_tx_fft7_fwd_def), 2179 &TX_NAME(ff_tx_fft9_fwd_def), 2180 2181 /* Standalone transforms */ 2182 &TX_NAME(ff_tx_fft_def), 2183 &TX_NAME(ff_tx_fft_inplace_def), 2184 &TX_NAME(ff_tx_fft_inplace_small_def), 2185 &TX_NAME(ff_tx_fft_pfa_def), 2186 &TX_NAME(ff_tx_fft_pfa_ns_def), 2187 &TX_NAME(ff_tx_fft_naive_def), 2188 &TX_NAME(ff_tx_fft_naive_small_def), 2189 &TX_NAME(ff_tx_mdct_fwd_def), 2190 &TX_NAME(ff_tx_mdct_inv_def), 2191 &TX_NAME(ff_tx_mdct_pfa_3xM_fwd_def), 2192 &TX_NAME(ff_tx_mdct_pfa_5xM_fwd_def), 2193 &TX_NAME(ff_tx_mdct_pfa_7xM_fwd_def), 2194 &TX_NAME(ff_tx_mdct_pfa_9xM_fwd_def), 2195 &TX_NAME(ff_tx_mdct_pfa_15xM_fwd_def), 2196 &TX_NAME(ff_tx_mdct_pfa_3xM_inv_def), 2197 &TX_NAME(ff_tx_mdct_pfa_5xM_inv_def), 2198 &TX_NAME(ff_tx_mdct_pfa_7xM_inv_def), 2199 &TX_NAME(ff_tx_mdct_pfa_9xM_inv_def), 2200 &TX_NAME(ff_tx_mdct_pfa_15xM_inv_def), 2201 &TX_NAME(ff_tx_mdct_naive_fwd_def), 2202 &TX_NAME(ff_tx_mdct_naive_inv_def), 2203 &TX_NAME(ff_tx_mdct_inv_full_def), 2204 &TX_NAME(ff_tx_rdft_r2c_def), 2205 &TX_NAME(ff_tx_rdft_r2r_def), 2206 &TX_NAME(ff_tx_rdft_r2r_mod2_def), 2207 &TX_NAME(ff_tx_rdft_r2i_def), 2208 &TX_NAME(ff_tx_rdft_r2i_mod2_def), 2209 &TX_NAME(ff_tx_rdft_c2r_def), 2210 &TX_NAME(ff_tx_dctII_def), 2211 &TX_NAME(ff_tx_dctIII_def), 2212 &TX_NAME(ff_tx_dctI_def), 2213 &TX_NAME(ff_tx_dstI_def), 2214 2215 NULL, 2216 };