mdct.c (12338B)
1 /* Copyright (c) 2007-2008 CSIRO 2 Copyright (c) 2007-2008 Xiph.Org Foundation 3 Written by Jean-Marc Valin */ 4 /* 5 Redistribution and use in source and binary forms, with or without 6 modification, are permitted provided that the following conditions 7 are met: 8 9 - Redistributions of source code must retain the above copyright 10 notice, this list of conditions and the following disclaimer. 11 12 - Redistributions in binary form must reproduce the above copyright 13 notice, this list of conditions and the following disclaimer in the 14 documentation and/or other materials provided with the distribution. 15 16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 20 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 24 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 25 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /* This is a simple MDCT implementation that uses a N/4 complex FFT 30 to do most of the work. It should be relatively straightforward to 31 plug in pretty much and FFT here. 32 33 This replaces the Vorbis FFT (and uses the exact same API), which 34 was a bit too messy and that was ending up duplicating code 35 (might as well use the same FFT everywhere). 36 37 The algorithm is similar to (and inspired from) Fabrice Bellard's 38 MDCT implementation in FFMPEG, but has differences in signs, ordering 39 and scaling in many places. 40 */ 41 42 #ifndef SKIP_CONFIG_H 43 #ifdef HAVE_CONFIG_H 44 #include "config.h" 45 #endif 46 #endif 47 48 #include "mdct.h" 49 #include "kiss_fft.h" 50 #include "_kiss_fft_guts.h" 51 #include <math.h> 52 #include "os_support.h" 53 #include "mathops.h" 54 #include "stack_alloc.h" 55 56 #if defined(FIXED_POINT) && defined(__mips) && __mips == 32 57 #include "mips/mdct_mipsr1.h" 58 #endif 59 60 #ifndef M_PI 61 #define M_PI 3.141592653 62 #endif 63 64 #ifdef CUSTOM_MODES 65 66 int clt_mdct_init(mdct_lookup *l,int N, int maxshift, int arch) 67 { 68 int i; 69 kiss_twiddle_scalar *trig; 70 int shift; 71 int N2=N>>1; 72 l->n = N; 73 l->maxshift = maxshift; 74 for (i=0;i<=maxshift;i++) 75 { 76 if (i==0) 77 l->kfft[i] = opus_fft_alloc(N>>2>>i, 0, 0, arch); 78 else 79 l->kfft[i] = opus_fft_alloc_twiddles(N>>2>>i, 0, 0, l->kfft[0], arch); 80 #ifndef ENABLE_TI_DSPLIB55 81 if (l->kfft[i]==NULL) 82 return 0; 83 #endif 84 } 85 l->trig = trig = (kiss_twiddle_scalar*)opus_alloc((N-(N2>>maxshift))*sizeof(kiss_twiddle_scalar)); 86 if (l->trig==NULL) 87 return 0; 88 for (shift=0;shift<=maxshift;shift++) 89 { 90 /* We have enough points that sine isn't necessary */ 91 #if defined(FIXED_POINT) 92 #ifndef ENABLE_QEXT 93 for (i=0;i<N2;i++) 94 trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2+16384),N)); 95 #else 96 for (i=0;i<N2;i++) 97 trig[i] = (kiss_twiddle_scalar)MAX32(-2147483647,MIN32(2147483647,floor(.5+2147483648*cos(2*M_PI*(i+.125)/N)))); 98 #endif 99 #else 100 for (i=0;i<N2;i++) 101 trig[i] = (kiss_twiddle_scalar)cos(2*PI*(i+.125)/N); 102 #endif 103 trig += N2; 104 N2 >>= 1; 105 N >>= 1; 106 } 107 return 1; 108 } 109 110 void clt_mdct_clear(mdct_lookup *l, int arch) 111 { 112 int i; 113 for (i=0;i<=l->maxshift;i++) 114 opus_fft_free(l->kfft[i], arch); 115 opus_free((kiss_twiddle_scalar*)l->trig); 116 } 117 118 #endif /* CUSTOM_MODES */ 119 120 /* Forward MDCT trashes the input array */ 121 #ifndef OVERRIDE_clt_mdct_forward 122 void clt_mdct_forward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out, 123 const celt_coef *window, int overlap, int shift, int stride, int arch) 124 { 125 int i; 126 int N, N2, N4; 127 VARDECL(kiss_fft_scalar, f); 128 VARDECL(kiss_fft_cpx, f2); 129 const kiss_fft_state *st = l->kfft[shift]; 130 const kiss_twiddle_scalar *trig; 131 celt_coef scale; 132 #ifdef FIXED_POINT 133 /* Allows us to scale with MULT16_32_Q16(), which is faster than 134 MULT16_32_Q15() on ARM. */ 135 int scale_shift = st->scale_shift-1; 136 int headroom; 137 #endif 138 SAVE_STACK; 139 (void)arch; 140 scale = st->scale; 141 142 N = l->n; 143 trig = l->trig; 144 for (i=0;i<shift;i++) 145 { 146 N >>= 1; 147 trig += N; 148 } 149 N2 = N>>1; 150 N4 = N>>2; 151 152 ALLOC(f, N2, kiss_fft_scalar); 153 ALLOC(f2, N4, kiss_fft_cpx); 154 155 /* Consider the input to be composed of four blocks: [a, b, c, d] */ 156 /* Window, shuffle, fold */ 157 { 158 /* Temp pointers to make it really clear to the compiler what we're doing */ 159 const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1); 160 const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1); 161 kiss_fft_scalar * OPUS_RESTRICT yp = f; 162 const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1); 163 const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1; 164 for(i=0;i<((overlap+3)>>2);i++) 165 { 166 /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/ 167 *yp++ = S_MUL(xp1[N2], *wp2) + S_MUL(*xp2, *wp1); 168 *yp++ = S_MUL(*xp1, *wp1) - S_MUL(xp2[-N2], *wp2); 169 xp1+=2; 170 xp2-=2; 171 wp1+=2; 172 wp2-=2; 173 } 174 wp1 = window; 175 wp2 = window+overlap-1; 176 for(;i<N4-((overlap+3)>>2);i++) 177 { 178 /* Real part arranged as a-bR, Imag part arranged as -c-dR */ 179 *yp++ = *xp2; 180 *yp++ = *xp1; 181 xp1+=2; 182 xp2-=2; 183 } 184 for(;i<N4;i++) 185 { 186 /* Real part arranged as a-bR, Imag part arranged as -c-dR */ 187 *yp++ = -S_MUL(xp1[-N2], *wp1) + S_MUL(*xp2, *wp2); 188 *yp++ = S_MUL(*xp1, *wp2) + S_MUL(xp2[N2], *wp1); 189 xp1+=2; 190 xp2-=2; 191 wp1+=2; 192 wp2-=2; 193 } 194 } 195 /* Pre-rotation */ 196 { 197 kiss_fft_scalar * OPUS_RESTRICT yp = f; 198 const kiss_twiddle_scalar *t = &trig[0]; 199 #ifdef FIXED_POINT 200 opus_val32 maxval=1; 201 #endif 202 for(i=0;i<N4;i++) 203 { 204 kiss_fft_cpx yc; 205 kiss_twiddle_scalar t0, t1; 206 kiss_fft_scalar re, im, yr, yi; 207 t0 = t[i]; 208 t1 = t[N4+i]; 209 re = *yp++; 210 im = *yp++; 211 yr = S_MUL(re,t0) - S_MUL(im,t1); 212 yi = S_MUL(im,t0) + S_MUL(re,t1); 213 /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after. 214 For floating-point it doesn't matter. */ 215 #ifdef ENABLE_QEXT 216 yc.r = yr; 217 yc.i = yi; 218 #else 219 yc.r = S_MUL2(yr, scale); 220 yc.i = S_MUL2(yi, scale); 221 #endif 222 #ifdef FIXED_POINT 223 maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i))); 224 #endif 225 f2[st->bitrev[i]] = yc; 226 } 227 #ifdef FIXED_POINT 228 headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval))); 229 #endif 230 } 231 232 /* N/4 complex FFT, does not downscale anymore */ 233 opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom)); 234 235 /* Post-rotate */ 236 { 237 /* Temp pointers to make it really clear to the compiler what we're doing */ 238 const kiss_fft_cpx * OPUS_RESTRICT fp = f2; 239 kiss_fft_scalar * OPUS_RESTRICT yp1 = out; 240 kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1); 241 const kiss_twiddle_scalar *t = &trig[0]; 242 /* Temp pointers to make it really clear to the compiler what we're doing */ 243 for(i=0;i<N4;i++) 244 { 245 kiss_fft_scalar yr, yi; 246 kiss_fft_scalar t0, t1; 247 #ifdef ENABLE_QEXT 248 t0 = S_MUL2(t[i], scale); 249 t1 = S_MUL2(t[N4+i], scale); 250 #else 251 t0 = t[i]; 252 t1 = t[N4+i]; 253 #endif 254 yr = PSHR32(S_MUL(fp->i,t1) - S_MUL(fp->r,t0), headroom); 255 yi = PSHR32(S_MUL(fp->r,t1) + S_MUL(fp->i,t0), headroom); 256 *yp1 = yr; 257 *yp2 = yi; 258 fp++; 259 yp1 += 2*stride; 260 yp2 -= 2*stride; 261 } 262 } 263 RESTORE_STACK; 264 } 265 #endif /* OVERRIDE_clt_mdct_forward */ 266 267 #ifndef OVERRIDE_clt_mdct_backward 268 void clt_mdct_backward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out, 269 const celt_coef * OPUS_RESTRICT window, int overlap, int shift, int stride, int arch) 270 { 271 int i; 272 int N, N2, N4; 273 const kiss_twiddle_scalar *trig; 274 #ifdef FIXED_POINT 275 int pre_shift, post_shift, fft_shift; 276 #endif 277 (void) arch; 278 279 N = l->n; 280 trig = l->trig; 281 for (i=0;i<shift;i++) 282 { 283 N >>= 1; 284 trig += N; 285 } 286 N2 = N>>1; 287 N4 = N>>2; 288 289 #ifdef FIXED_POINT 290 { 291 opus_val32 sumval=N2; 292 opus_val32 maxval=0; 293 for (i=0;i<N2;i++) { 294 maxval = MAX32(maxval, ABS32(in[i*stride])); 295 sumval = ADD32_ovflw(sumval, ABS32(SHR32(in[i*stride],11))); 296 } 297 pre_shift = IMAX(0, 29-celt_zlog2(1+maxval)); 298 /* Worst-case where all the energy goes to a single sample. */ 299 post_shift = IMAX(0, 19-celt_ilog2(ABS32(sumval))); 300 post_shift = IMIN(post_shift, pre_shift); 301 fft_shift = pre_shift - post_shift; 302 } 303 #endif 304 /* Pre-rotate */ 305 { 306 /* Temp pointers to make it really clear to the compiler what we're doing */ 307 const kiss_fft_scalar * OPUS_RESTRICT xp1 = in; 308 const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1); 309 kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1); 310 const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0]; 311 const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev; 312 for(i=0;i<N4;i++) 313 { 314 int rev; 315 kiss_fft_scalar yr, yi; 316 opus_val32 x1, x2; 317 rev = *bitrev++; 318 x1 = SHL32_ovflw(*xp1, pre_shift); 319 x2 = SHL32_ovflw(*xp2, pre_shift); 320 yr = ADD32_ovflw(S_MUL(x2, t[i]), S_MUL(x1, t[N4+i])); 321 yi = SUB32_ovflw(S_MUL(x1, t[i]), S_MUL(x2, t[N4+i])); 322 /* We swap real and imag because we use an FFT instead of an IFFT. */ 323 yp[2*rev+1] = yr; 324 yp[2*rev] = yi; 325 /* Storing the pre-rotation directly in the bitrev order. */ 326 xp1+=2*stride; 327 xp2-=2*stride; 328 } 329 } 330 331 opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)) ARG_FIXED(fft_shift)); 332 333 /* Post-rotate and de-shuffle from both ends of the buffer at once to make 334 it in-place. */ 335 { 336 kiss_fft_scalar * yp0 = out+(overlap>>1); 337 kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2; 338 const kiss_twiddle_scalar *t = &trig[0]; 339 /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the 340 middle pair will be computed twice. */ 341 for(i=0;i<(N4+1)>>1;i++) 342 { 343 kiss_fft_scalar re, im, yr, yi; 344 kiss_twiddle_scalar t0, t1; 345 /* We swap real and imag because we're using an FFT instead of an IFFT. */ 346 re = yp0[1]; 347 im = yp0[0]; 348 t0 = t[i]; 349 t1 = t[N4+i]; 350 /* We'd scale up by 2 here, but instead it's done when mixing the windows */ 351 yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift); 352 yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift); 353 /* We swap real and imag because we're using an FFT instead of an IFFT. */ 354 re = yp1[1]; 355 im = yp1[0]; 356 yp0[0] = yr; 357 yp1[1] = yi; 358 359 t0 = t[(N4-i-1)]; 360 t1 = t[(N2-i-1)]; 361 /* We'd scale up by 2 here, but instead it's done when mixing the windows */ 362 yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift); 363 yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift); 364 yp1[0] = yr; 365 yp0[1] = yi; 366 yp0 += 2; 367 yp1 -= 2; 368 } 369 } 370 371 /* Mirror on both sides for TDAC */ 372 { 373 kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1; 374 kiss_fft_scalar * OPUS_RESTRICT yp1 = out; 375 const celt_coef * OPUS_RESTRICT wp1 = window; 376 const celt_coef * OPUS_RESTRICT wp2 = window+overlap-1; 377 378 for(i = 0; i < overlap/2; i++) 379 { 380 kiss_fft_scalar x1, x2; 381 x1 = *xp1; 382 x2 = *yp1; 383 *yp1++ = SUB32_ovflw(S_MUL(x2, *wp2), S_MUL(x1, *wp1)); 384 *xp1-- = ADD32_ovflw(S_MUL(x2, *wp1), S_MUL(x1, *wp2)); 385 wp1++; 386 wp2--; 387 } 388 } 389 } 390 #endif /* OVERRIDE_clt_mdct_backward */