tx_float_init.c (13321B)
1 /* 2 * This file is part of FFmpeg. 3 * 4 * FFmpeg is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU Lesser General Public 6 * License as published by the Free Software Foundation; either 7 * version 2.1 of the License, or (at your option) any later version. 8 * 9 * FFmpeg is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 * Lesser General Public License for more details. 13 * 14 * You should have received a copy of the GNU Lesser General Public 15 * License along with FFmpeg; if not, write to the Free Software 16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 17 */ 18 19 #define TX_FLOAT 20 #include "libavutil/tx_priv.h" 21 #include "libavutil/attributes.h" 22 #include "libavutil/mem.h" 23 #include "libavutil/x86/cpu.h" 24 25 #include "config.h" 26 27 TX_DECL_FN(fft2, sse3) 28 TX_DECL_FN(fft4_fwd, sse2) 29 TX_DECL_FN(fft4_inv, sse2) 30 TX_DECL_FN(fft8, sse3) 31 TX_DECL_FN(fft8_ns, sse3) 32 TX_DECL_FN(fft8, avx) 33 TX_DECL_FN(fft8_ns, avx) 34 TX_DECL_FN(fft15, avx2) 35 TX_DECL_FN(fft15_ns, avx2) 36 TX_DECL_FN(fft16, avx) 37 TX_DECL_FN(fft16_ns, avx) 38 TX_DECL_FN(fft16, fma3) 39 TX_DECL_FN(fft16_ns, fma3) 40 TX_DECL_FN(fft32, avx) 41 TX_DECL_FN(fft32_ns, avx) 42 TX_DECL_FN(fft32, fma3) 43 TX_DECL_FN(fft32_ns, fma3) 44 TX_DECL_FN(fft_sr, avx) 45 TX_DECL_FN(fft_sr_ns, avx) 46 TX_DECL_FN(fft_sr, fma3) 47 TX_DECL_FN(fft_sr_ns, fma3) 48 TX_DECL_FN(fft_sr, avx2) 49 TX_DECL_FN(fft_sr_ns, avx2) 50 51 TX_DECL_FN(fft_pfa_15xM, avx2) 52 TX_DECL_FN(fft_pfa_15xM_ns, avx2) 53 54 TX_DECL_FN(mdct_inv, avx2) 55 56 TX_DECL_FN(fft2_asm, sse3) 57 TX_DECL_FN(fft4_fwd_asm, sse2) 58 TX_DECL_FN(fft4_inv_asm, sse2) 59 TX_DECL_FN(fft8_asm, sse3) 60 TX_DECL_FN(fft8_asm, avx) 61 TX_DECL_FN(fft16_asm, avx) 62 TX_DECL_FN(fft16_asm, fma3) 63 TX_DECL_FN(fft32_asm, avx) 64 TX_DECL_FN(fft32_asm, fma3) 65 TX_DECL_FN(fft_sr_asm, avx) 66 TX_DECL_FN(fft_sr_asm, fma3) 67 TX_DECL_FN(fft_sr_asm, avx2) 68 69 TX_DECL_FN(fft_pfa_15xM_asm, avx2) 70 71 #define DECL_INIT_FN(basis, interleave) \ 72 static av_cold int b ##basis## _i ##interleave(AVTXContext *s, \ 73 const FFTXCodelet *cd, \ 74 uint64_t flags, \ 75 FFTXCodeletOptions *opts, \ 76 int len, int inv, \ 77 const void *scale) \ 78 { \ 79 ff_tx_init_tabs_float(len); \ 80 if (cd->max_len == 2) \ 81 return ff_tx_gen_ptwo_revtab(s, opts); \ 82 else \ 83 return ff_tx_gen_split_radix_parity_revtab(s, len, inv, opts, \ 84 basis, interleave); \ 85 } 86 87 DECL_INIT_FN(8, 0) 88 DECL_INIT_FN(8, 2) 89 90 static av_cold int factor_init(AVTXContext *s, const FFTXCodelet *cd, 91 uint64_t flags, FFTXCodeletOptions *opts, 92 int len, int inv, const void *scale) 93 { 94 int ret; 95 96 /* The transformations below are performed in the gather domain, 97 * so override the option and let the infrastructure convert the map 98 * to SCATTER if needed. */ 99 FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER }; 100 101 TX_TAB(ff_tx_init_tabs)(len); 102 103 if (len == 15) 104 ret = ff_tx_gen_pfa_input_map(s, &sub_opts, 3, 5); 105 else 106 ret = ff_tx_gen_default_map(s, &sub_opts); 107 108 if (ret < 0) 109 return ret; 110 111 if (len == 15) { 112 int cnt = 0, tmp[15]; 113 114 /* Special permutation to simplify loads in the pre-permuted version */ 115 memcpy(tmp, s->map, 15*sizeof(*tmp)); 116 for (int i = 1; i < 15; i += 3) { 117 s->map[cnt] = tmp[i]; 118 cnt++; 119 } 120 for (int i = 2; i < 15; i += 3) { 121 s->map[cnt] = tmp[i]; 122 cnt++; 123 } 124 for (int i = 0; i < 15; i += 3) { 125 s->map[cnt] = tmp[i]; 126 cnt++; 127 } 128 memmove(&s->map[7], &s->map[6], 4*sizeof(int)); 129 memmove(&s->map[3], &s->map[1], 4*sizeof(int)); 130 s->map[1] = tmp[2]; 131 s->map[2] = tmp[0]; 132 } 133 134 return 0; 135 } 136 137 static av_cold int m_inv_init(AVTXContext *s, const FFTXCodelet *cd, 138 uint64_t flags, FFTXCodeletOptions *opts, 139 int len, int inv, const void *scale) 140 { 141 int ret; 142 FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER }; 143 144 s->scale_d = *((SCALE_TYPE *)scale); 145 s->scale_f = s->scale_d; 146 147 flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */ 148 flags |= AV_TX_INPLACE; /* in-place */ 149 flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */ 150 flags |= FF_TX_ASM_CALL; /* We want an assembly function, not C */ 151 152 if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1, 153 inv, scale))) 154 return ret; 155 156 s->map = av_malloc(len*sizeof(*s->map)); 157 if (!s->map) 158 return AVERROR(ENOMEM); 159 160 memcpy(s->map, s->sub->map, (len >> 1)*sizeof(*s->map)); 161 /* Invert lookup table for unstrided path */ 162 for (int i = 0; i < (len >> 1); i++) 163 s->map[(len >> 1) + s->map[i]] = i; 164 165 if ((ret = ff_tx_mdct_gen_exp_float(s, s->map))) 166 return ret; 167 168 return 0; 169 } 170 171 static av_cold int fft_pfa_init(AVTXContext *s, 172 const FFTXCodelet *cd, 173 uint64_t flags, 174 FFTXCodeletOptions *opts, 175 int len, int inv, 176 const void *scale) 177 { 178 int ret; 179 int sub_len = len / cd->factors[0]; 180 FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_SCATTER }; 181 182 flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */ 183 flags |= AV_TX_INPLACE; /* in-place */ 184 flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */ 185 flags |= FF_TX_ASM_CALL; /* We want an assembly function, not C */ 186 187 if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, 188 sub_len, inv, scale))) 189 return ret; 190 191 if ((ret = ff_tx_gen_compound_mapping(s, opts, s->inv, cd->factors[0], sub_len))) 192 return ret; 193 194 if (cd->factors[0] == 15) { 195 int tmp[15]; 196 197 /* Our 15-point transform is also a compound one, so embed its input map */ 198 TX_EMBED_INPUT_PFA_MAP(s->map, len, 3, 5); 199 200 /* Special permutation to simplify loads in the pre-permuted version */ 201 for (int k = 0; k < s->sub[0].len; k++) { 202 int cnt = 0; 203 memcpy(tmp, &s->map[k*15], 15*sizeof(*tmp)); 204 for (int i = 1; i < 15; i += 3) { 205 s->map[k*15 + cnt] = tmp[i]; 206 cnt++; 207 } 208 for (int i = 2; i < 15; i += 3) { 209 s->map[k*15 + cnt] = tmp[i]; 210 cnt++; 211 } 212 for (int i = 0; i < 15; i += 3) { 213 s->map[k*15 + cnt] = tmp[i]; 214 cnt++; 215 } 216 memmove(&s->map[k*15 + 7], &s->map[k*15 + 6], 4*sizeof(int)); 217 memmove(&s->map[k*15 + 3], &s->map[k*15 + 1], 4*sizeof(int)); 218 s->map[k*15 + 1] = tmp[2]; 219 s->map[k*15 + 2] = tmp[0]; 220 } 221 } 222 223 if (!(s->tmp = av_malloc(len*sizeof(*s->tmp)))) 224 return AVERROR(ENOMEM); 225 226 TX_TAB(ff_tx_init_tabs)(len / sub_len); 227 228 return 0; 229 } 230 231 const FFTXCodelet * const ff_tx_codelet_list_float_x86[] = { 232 TX_DEF(fft2, FFT, 2, 2, 2, 0, 128, NULL, sse3, SSE3, AV_TX_INPLACE, 0), 233 TX_DEF(fft2_asm, FFT, 2, 2, 2, 0, 192, b8_i0, sse3, SSE3, 234 AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, 0), 235 TX_DEF(fft2, FFT, 2, 2, 2, 0, 192, b8_i0, sse3, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), 236 TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2, AV_TX_INPLACE | FF_TX_FORWARD_ONLY, 0), 237 TX_DEF(fft4_fwd_asm, FFT, 4, 4, 2, 0, 192, b8_i0, sse2, SSE2, 238 AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, 0), 239 TX_DEF(fft4_inv_asm, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2, 240 AV_TX_INPLACE | FF_TX_INVERSE_ONLY | FF_TX_ASM_CALL, 0), 241 TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 192, b8_i0, sse2, SSE2, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), 242 TX_DEF(fft4_inv, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2, AV_TX_INPLACE | FF_TX_INVERSE_ONLY, 0), 243 TX_DEF(fft8, FFT, 8, 8, 2, 0, 128, b8_i0, sse3, SSE3, AV_TX_INPLACE, 0), 244 TX_DEF(fft8_asm, FFT, 8, 8, 2, 0, 192, b8_i0, sse3, SSE3, 245 AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, 0), 246 TX_DEF(fft8_ns, FFT, 8, 8, 2, 0, 192, b8_i0, sse3, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), 247 TX_DEF(fft8, FFT, 8, 8, 2, 0, 256, b8_i0, avx, AVX, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW), 248 TX_DEF(fft8_asm, FFT, 8, 8, 2, 0, 320, b8_i0, avx, AVX, 249 AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW), 250 TX_DEF(fft8_ns, FFT, 8, 8, 2, 0, 320, b8_i0, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 251 AV_CPU_FLAG_AVXSLOW), 252 TX_DEF(fft16, FFT, 16, 16, 2, 0, 256, b8_i2, avx, AVX, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW), 253 TX_DEF(fft16_asm, FFT, 16, 16, 2, 0, 320, b8_i2, avx, AVX, 254 AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW), 255 TX_DEF(fft16_ns, FFT, 16, 16, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 256 AV_CPU_FLAG_AVXSLOW), 257 TX_DEF(fft16, FFT, 16, 16, 2, 0, 288, b8_i2, fma3, FMA3, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW), 258 TX_DEF(fft16_asm, FFT, 16, 16, 2, 0, 352, b8_i2, fma3, FMA3, 259 AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW), 260 TX_DEF(fft16_ns, FFT, 16, 16, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 261 AV_CPU_FLAG_AVXSLOW), 262 263 #if ARCH_X86_64 264 TX_DEF(fft32, FFT, 32, 32, 2, 0, 256, b8_i2, avx, AVX, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW), 265 TX_DEF(fft32_asm, FFT, 32, 32, 2, 0, 320, b8_i2, avx, AVX, 266 AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW), 267 TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 268 AV_CPU_FLAG_AVXSLOW), 269 TX_DEF(fft32, FFT, 32, 32, 2, 0, 288, b8_i2, fma3, FMA3, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW), 270 TX_DEF(fft32_asm, FFT, 32, 32, 2, 0, 352, b8_i2, fma3, FMA3, 271 AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW), 272 TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 273 AV_CPU_FLAG_AVXSLOW), 274 TX_DEF(fft_sr, FFT, 64, 2097152, 2, 0, 256, b8_i2, avx, AVX, 0, AV_CPU_FLAG_AVXSLOW), 275 TX_DEF(fft_sr_asm, FFT, 64, 2097152, 2, 0, 320, b8_i2, avx, AVX, 276 AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW), 277 TX_DEF(fft_sr_ns, FFT, 64, 2097152, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 278 AV_CPU_FLAG_AVXSLOW), 279 TX_DEF(fft_sr, FFT, 64, 2097152, 2, 0, 288, b8_i2, fma3, FMA3, 0, AV_CPU_FLAG_AVXSLOW), 280 TX_DEF(fft_sr_asm, FFT, 64, 2097152, 2, 0, 352, b8_i2, fma3, FMA3, 281 AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW), 282 TX_DEF(fft_sr_ns, FFT, 64, 2097152, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 283 AV_CPU_FLAG_AVXSLOW), 284 285 TX_DEF(fft15, FFT, 15, 15, 15, 0, 320, factor_init, avx2, AVX2, 286 AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW), 287 TX_DEF(fft15_ns, FFT, 15, 15, 15, 0, 384, factor_init, avx2, AVX2, 288 AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW), 289 290 TX_DEF(fft_sr, FFT, 64, 2097152, 2, 0, 320, b8_i2, avx2, AVX2, 0, 291 AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER), 292 TX_DEF(fft_sr_asm, FFT, 64, 2097152, 2, 0, 384, b8_i2, avx2, AVX2, 293 AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER), 294 TX_DEF(fft_sr_ns, FFT, 64, 2097152, 2, 0, 384, b8_i2, avx2, AVX2, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 295 AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER), 296 297 TX_DEF(fft_pfa_15xM, FFT, 60, TX_LEN_UNLIMITED, 15, 2, 320, fft_pfa_init, avx2, AVX2, 298 AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER), 299 TX_DEF(fft_pfa_15xM_asm, FFT, 60, TX_LEN_UNLIMITED, 15, 2, 384, fft_pfa_init, avx2, AVX2, 300 AV_TX_INPLACE | FF_TX_PRESHUFFLE | FF_TX_ASM_CALL, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER), 301 TX_DEF(fft_pfa_15xM_ns, FFT, 60, TX_LEN_UNLIMITED, 15, 2, 384, fft_pfa_init, avx2, AVX2, 302 AV_TX_INPLACE | FF_TX_PRESHUFFLE, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER), 303 304 TX_DEF(mdct_inv, MDCT, 16, TX_LEN_UNLIMITED, 2, TX_FACTOR_ANY, 384, m_inv_init, avx2, AVX2, 305 FF_TX_INVERSE_ONLY, AV_CPU_FLAG_AVXSLOW | AV_CPU_FLAG_SLOW_GATHER), 306 #endif 307 308 NULL, 309 };