mpegaudiodsp.c (10688B)
1 /* 2 * SIMD-optimized MP3 decoding functions 3 * Copyright (c) 2010 Vitor Sessak 4 * 5 * This file is part of FFmpeg. 6 * 7 * FFmpeg is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public 9 * License as published by the Free Software Foundation; either 10 * version 2.1 of the License, or (at your option) any later version. 11 * 12 * FFmpeg is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Lesser General Public License for more details. 16 * 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with FFmpeg; if not, write to the Free Software 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20 */ 21 22 #include <stddef.h> 23 24 #include "config.h" 25 #include "libavutil/attributes.h" 26 #include "libavutil/cpu.h" 27 #include "libavutil/mem_internal.h" 28 #include "libavutil/x86/asm.h" 29 #include "libavutil/x86/cpu.h" 30 #include "libavcodec/mpegaudiodsp.h" 31 32 #define DECL(CPU)\ 33 static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\ 34 void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win); 35 36 #if HAVE_X86ASM 37 DECL(sse2) 38 DECL(sse3) 39 DECL(ssse3) 40 DECL(avx) 41 #endif /* HAVE_X86ASM */ 42 43 void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win, 44 float *tmpbuf); 45 void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win, 46 float *tmpbuf); 47 48 void ff_dct32_float_sse2(float *out, const float *in); 49 void ff_dct32_float_avx (float *out, const float *in); 50 51 DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40]; 52 53 #if HAVE_6REGS && HAVE_SSE_INLINE 54 55 #define MACS(rt, ra, rb) rt+=(ra)*(rb) 56 #define MLSS(rt, ra, rb) rt-=(ra)*(rb) 57 58 #define SUM8(op, sum, w, p) \ 59 { \ 60 op(sum, (w)[0 * 64], (p)[0 * 64]); \ 61 op(sum, (w)[1 * 64], (p)[1 * 64]); \ 62 op(sum, (w)[2 * 64], (p)[2 * 64]); \ 63 op(sum, (w)[3 * 64], (p)[3 * 64]); \ 64 op(sum, (w)[4 * 64], (p)[4 * 64]); \ 65 op(sum, (w)[5 * 64], (p)[5 * 64]); \ 66 op(sum, (w)[6 * 64], (p)[6 * 64]); \ 67 op(sum, (w)[7 * 64], (p)[7 * 64]); \ 68 } 69 70 static void apply_window(const float *buf, const float *win1, 71 const float *win2, float *sum1, float *sum2, int len) 72 { 73 x86_reg count = - 4*len; 74 const float *win1a = win1+len; 75 const float *win2a = win2+len; 76 const float *bufa = buf+len; 77 float *sum1a = sum1+len; 78 float *sum2a = sum2+len; 79 80 81 #define MULT(a, b) \ 82 "movaps " #a "(%1,%0), %%xmm1 \n\t" \ 83 "movaps " #a "(%3,%0), %%xmm2 \n\t" \ 84 "mulps %%xmm2, %%xmm1 \n\t" \ 85 "subps %%xmm1, %%xmm0 \n\t" \ 86 "mulps " #b "(%2,%0), %%xmm2 \n\t" \ 87 "subps %%xmm2, %%xmm4 \n\t" \ 88 89 __asm__ volatile( 90 "1: \n\t" 91 "xorps %%xmm0, %%xmm0 \n\t" 92 "xorps %%xmm4, %%xmm4 \n\t" 93 94 MULT( 0, 0) 95 MULT( 256, 64) 96 MULT( 512, 128) 97 MULT( 768, 192) 98 MULT(1024, 256) 99 MULT(1280, 320) 100 MULT(1536, 384) 101 MULT(1792, 448) 102 103 "movaps %%xmm0, (%4,%0) \n\t" 104 "movaps %%xmm4, (%5,%0) \n\t" 105 "add $16, %0 \n\t" 106 "jl 1b \n\t" 107 :"+&r"(count) 108 :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a) 109 ); 110 111 #undef MULT 112 } 113 114 static void apply_window_mp3(float *in, float *win, int *unused, float *out, 115 ptrdiff_t incr) 116 { 117 LOCAL_ALIGNED_16(float, suma, [17]); 118 LOCAL_ALIGNED_16(float, sumb, [17]); 119 LOCAL_ALIGNED_16(float, sumc, [17]); 120 LOCAL_ALIGNED_16(float, sumd, [17]); 121 122 float sum; 123 124 /* copy to avoid wrap */ 125 __asm__ volatile( 126 "movaps 0(%0), %%xmm0 \n\t" \ 127 "movaps 16(%0), %%xmm1 \n\t" \ 128 "movaps 32(%0), %%xmm2 \n\t" \ 129 "movaps 48(%0), %%xmm3 \n\t" \ 130 "movaps %%xmm0, 0(%1) \n\t" \ 131 "movaps %%xmm1, 16(%1) \n\t" \ 132 "movaps %%xmm2, 32(%1) \n\t" \ 133 "movaps %%xmm3, 48(%1) \n\t" \ 134 "movaps 64(%0), %%xmm0 \n\t" \ 135 "movaps 80(%0), %%xmm1 \n\t" \ 136 "movaps 96(%0), %%xmm2 \n\t" \ 137 "movaps 112(%0), %%xmm3 \n\t" \ 138 "movaps %%xmm0, 64(%1) \n\t" \ 139 "movaps %%xmm1, 80(%1) \n\t" \ 140 "movaps %%xmm2, 96(%1) \n\t" \ 141 "movaps %%xmm3, 112(%1) \n\t" 142 ::"r"(in), "r"(in+512) 143 :"memory" 144 ); 145 146 apply_window(in + 16, win , win + 512, suma, sumc, 16); 147 apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16); 148 149 SUM8(MACS, suma[0], win + 32, in + 48); 150 151 sumc[ 0] = 0; 152 sumb[16] = 0; 153 sumd[16] = 0; 154 155 #define SUMS(suma, sumb, sumc, sumd, out1, out2) \ 156 "movups " #sumd "(%4), %%xmm0 \n\t" \ 157 "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \ 158 "subps " #suma "(%1), %%xmm0 \n\t" \ 159 "movaps %%xmm0," #out1 "(%0) \n\t" \ 160 \ 161 "movups " #sumc "(%3), %%xmm0 \n\t" \ 162 "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \ 163 "addps " #sumb "(%2), %%xmm0 \n\t" \ 164 "movaps %%xmm0," #out2 "(%0) \n\t" 165 166 if (incr == 1) { 167 __asm__ volatile( 168 SUMS( 0, 48, 4, 52, 0, 112) 169 SUMS(16, 32, 20, 36, 16, 96) 170 SUMS(32, 16, 36, 20, 32, 80) 171 SUMS(48, 0, 52, 4, 48, 64) 172 173 :"+&r"(out) 174 :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0]) 175 :"memory" 176 ); 177 out += 16*incr; 178 } else { 179 int j; 180 float *out2 = out + 32 * incr; 181 out[0 ] = -suma[ 0]; 182 out += incr; 183 out2 -= incr; 184 for(j=1;j<16;j++) { 185 *out = -suma[ j] + sumd[16-j]; 186 *out2 = sumb[16-j] + sumc[ j]; 187 out += incr; 188 out2 -= incr; 189 } 190 } 191 192 sum = 0; 193 SUM8(MLSS, sum, win + 16 + 32, in + 32); 194 *out = sum; 195 } 196 197 #endif /* HAVE_6REGS && HAVE_SSE_INLINE */ 198 199 #if HAVE_X86ASM 200 #define DECL_IMDCT_BLOCKS(CPU1, CPU2) \ 201 static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \ 202 int count, int switch_point, int block_type) \ 203 { \ 204 int align_end = count - (count & 3); \ 205 int j; \ 206 for (j = 0; j < align_end; j+= 4) { \ 207 LOCAL_ALIGNED_16(float, tmpbuf, [1024]); \ 208 float *win = mdct_win_sse[switch_point && j < 4][block_type]; \ 209 /* apply window & overlap with previous buffer */ \ 210 \ 211 /* select window */ \ 212 ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf); \ 213 in += 4*18; \ 214 buf += 4*18; \ 215 out += 4; \ 216 } \ 217 for (; j < count; j++) { \ 218 /* apply window & overlap with previous buffer */ \ 219 \ 220 /* select window */ \ 221 int win_idx = (switch_point && j < 2) ? 0 : block_type; \ 222 float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))]; \ 223 \ 224 ff_imdct36_float_ ## CPU1(out, buf, in, win); \ 225 \ 226 in += 18; \ 227 buf++; \ 228 out++; \ 229 } \ 230 } 231 232 #if HAVE_SSE 233 DECL_IMDCT_BLOCKS(sse2,sse) 234 DECL_IMDCT_BLOCKS(sse3,sse) 235 DECL_IMDCT_BLOCKS(ssse3,sse) 236 #endif 237 #if HAVE_AVX_EXTERNAL 238 DECL_IMDCT_BLOCKS(avx,avx) 239 #endif 240 #endif /* HAVE_X86ASM */ 241 242 av_cold void ff_mpadsp_init_x86_tabs(void) 243 { 244 int i, j; 245 for (j = 0; j < 4; j++) { 246 for (i = 0; i < 40; i ++) { 247 mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i]; 248 mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i]; 249 mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i]; 250 mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i]; 251 mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i]; 252 mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i]; 253 mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i]; 254 mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i]; 255 } 256 } 257 } 258 259 av_cold void ff_mpadsp_init_x86(MPADSPContext *s) 260 { 261 av_unused int cpu_flags = av_get_cpu_flags(); 262 263 #if HAVE_6REGS && HAVE_SSE_INLINE 264 if (INLINE_SSE(cpu_flags)) { 265 s->apply_window_float = apply_window_mp3; 266 } 267 #endif /* HAVE_SSE_INLINE */ 268 269 #if HAVE_X86ASM 270 #if HAVE_SSE 271 if (EXTERNAL_SSE2(cpu_flags)) { 272 s->imdct36_blocks_float = imdct36_blocks_sse2; 273 s->dct32_float = ff_dct32_float_sse2; 274 } 275 if (EXTERNAL_SSE3(cpu_flags)) { 276 s->imdct36_blocks_float = imdct36_blocks_sse3; 277 } 278 if (EXTERNAL_SSSE3(cpu_flags)) { 279 s->imdct36_blocks_float = imdct36_blocks_ssse3; 280 } 281 #endif 282 #if HAVE_AVX_EXTERNAL 283 if (EXTERNAL_AVX(cpu_flags)) { 284 s->imdct36_blocks_float = imdct36_blocks_avx; 285 } 286 if (EXTERNAL_AVX_FAST(cpu_flags)) 287 s->dct32_float = ff_dct32_float_avx; 288 #endif 289 #endif /* HAVE_X86ASM */ 290 }