mdct_mipsr1.h (11932B)
/* Copyright (c) 2007-2008 CSIRO
   Copyright (c) 2007-2008 Xiph.Org Foundation
   Written by Jean-Marc Valin */
/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* This is a simple MDCT implementation that uses a N/4 complex FFT
   to do most of the work. It should be relatively straightforward to
   plug in pretty much any FFT here.

   This replaces the Vorbis FFT (and uses the exact same API), which
   was a bit too messy and that was ending up duplicating code
   (might as well use the same FFT everywhere).

   The algorithm is similar to (and inspired from) Fabrice Bellard's
   MDCT implementation in FFMPEG, but has differences in signs, ordering
   and scaling in many places.
*/
#ifndef MDCT_MIPSR1_H__
#define MDCT_MIPSR1_H__

#ifndef SKIP_CONFIG_H
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#endif

#include "mdct.h"
#include "kiss_fft.h"
#include "_kiss_fft_guts.h"
#include <math.h>
#include "os_support.h"
#include "mathops.h"
#include "stack_alloc.h"

#if defined (__mips_dsp)
/* MIPS DSP ASE variant: compute (a*b + c*d) >> (15+shift) through the
   hi/lo accumulator using the DSP built-ins (mult/madd/extr).
   NOTE(review): arguments are 'int', so this override only makes sense
   for FIXED_POINT builds where kiss_fft_scalar is an integer type —
   confirm the build system never enables this header for float. */
static inline int S_MUL_ADD_PSR(int a, int b, int c, int d, int shift) {
    long long acc = __builtin_mips_mult(a, b);
    acc = __builtin_mips_madd(acc, c, d);
    return __builtin_mips_extr_w(acc, 15+shift);
}

/* MIPS DSP ASE variant: compute (a*b - c*d) >> (15+shift) through the
   hi/lo accumulator (mult followed by msub). */
static inline int S_MUL_SUB_PSR(int a, int b, int c, int d, int shift) {
    long long acc = __builtin_mips_mult(a, b);
    acc = __builtin_mips_msub(acc, c, d);
    return __builtin_mips_extr_w(acc, 15+shift);
}

#define OVERRIDE_clt_mdct_forward
#define OVERRIDE_clt_mdct_backward

#elif defined(__mips_isa_rev) && __mips_isa_rev < 6

/* Pre-R6 variant: same (a*b + c*d) >> (15+shift) computation expressed
   as mult/madd inline asm (madd/msub were removed in MIPS32r6).
   The "x" constraint presumably binds acc to the hi/lo accumulator
   pair written by mult/madd — verify against the GCC MIPS constraints
   for the toolchains this is built with. */
static inline int S_MUL_ADD_PSR(int a, int b, int c, int d, int shift) {
    long long acc;

    asm volatile (
        "mult %[a], %[b] \n"
        "madd %[c], %[d] \n"
        : [acc] "=x"(acc)
        : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d)
        :
    );
    return (int)(acc >> (15 + shift));
}

/* Pre-R6 variant of (a*b - c*d) >> (15+shift) via mult/msub. */
static inline int S_MUL_SUB_PSR(int a, int b, int c, int d, int shift) {
    long long acc;

    asm volatile (
        "mult %[a], %[b] \n"
        "msub %[c], %[d] \n"
        : [acc] "=x"(acc)
        : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d)
        :
    );
    return (int)(acc >> (15 + shift));
}

#define OVERRIDE_clt_mdct_forward
#define OVERRIDE_clt_mdct_backward

#endif

#if defined (OVERRIDE_clt_mdct_forward)

/* Forward MDCT trashes the input array.
   l       : MDCT lookup (FFT states and twiddle table for each shift level)
   in      : time-domain input of N+overlap samples (clobbered)
   out     : N/2 MDCT coefficients, written with the given stride
   window  : overlap window (first half; second half read mirrored)
   overlap : TDAC overlap length in samples
   shift   : how many times to halve l->n to get this transform size
   stride  : output interleaving stride (for short-block interleaving)
   NOTE(review): headroom and scale_shift are declared only under
   FIXED_POINT but used unconditionally below (ARG_FIXED and the *_PSR
   calls), so this override appears to require a fixed-point build —
   confirm. */
void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
      const celt_coef *window, int overlap, int shift, int stride, int arch)
{
   int i;
   int N, N2, N4;
   VARDECL(kiss_fft_scalar, f);
   VARDECL(kiss_fft_cpx, f2);
   const kiss_fft_state *st = l->kfft[shift];
   const kiss_twiddle_scalar *trig;
   celt_coef scale;
#ifdef FIXED_POINT
   /* Allows us to scale with MULT16_32_Q16(), which is faster than
      MULT16_32_Q15() on ARM. */
   int scale_shift = st->scale_shift-1;
   int headroom;
#endif
   SAVE_STACK;
   (void)arch;
   scale = st->scale;

   N = l->n;
   trig = l->trig;
   /* Walk down to the requested transform size; each level's twiddles
      are stored consecutively after the previous level's. */
   for (i=0;i<shift;i++)
   {
      N >>= 1;
      trig += N;
   }
   N2 = N>>1;
   N4 = N>>2;

   ALLOC(f, N2, kiss_fft_scalar);
   ALLOC(f2, N4, kiss_fft_cpx);

   /* Consider the input to be composed of four blocks: [a, b, c, d] */
   /* Window, shuffle, fold: reduce N+overlap input samples to N/2
      folded values in f, applying the overlap window at both ends. */
   {
      /* Temp pointers to make it really clear to the compiler what we're doing */
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
      const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1);
      const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
      for(i=0;i<((overlap+3)>>2);i++)
      {
         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
         *yp++ = S_MUL_ADD(*wp2, xp1[N2],*wp1,*xp2);
         *yp++ = S_MUL_SUB(*wp1, *xp1,*wp2, xp2[-N2]);
         xp1+=2;
         xp2-=2;
         wp1+=2;
         wp2-=2;
      }
      wp1 = window;
      wp2 = window+overlap-1;
      /* Middle region: outside the overlap, no windowing needed. */
      for(;i<N4-((overlap+3)>>2);i++)
      {
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
         *yp++ = *xp2;
         *yp++ = *xp1;
         xp1+=2;
         xp2-=2;
      }
      for(;i<N4;i++)
      {
         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
         *yp++ = S_MUL_SUB(*wp2, *xp2, *wp1, xp1[-N2]);
         *yp++ = S_MUL_ADD(*wp2, *xp1, *wp1, xp2[N2]);
         xp1+=2;
         xp2-=2;
         wp1+=2;
         wp2-=2;
      }
   }
   /* Pre-rotation: twiddle each folded complex pair and store it
      directly in bit-reversed order for the FFT, while tracking the
      maximum magnitude so we can pick the fixed-point headroom. */
   {
      kiss_fft_scalar * OPUS_RESTRICT yp = f;
      const kiss_twiddle_scalar *t = &trig[0];
#ifdef FIXED_POINT
      opus_val32 maxval=1;
#endif
      for(i=0;i<N4;i++)
      {
         kiss_fft_cpx yc;
         kiss_twiddle_scalar t0, t1;
         kiss_fft_scalar re, im, yr, yi;
         t0 = t[i];
         t1 = t[N4+i];
         re = *yp++;
         im = *yp++;
         yr = S_MUL_SUB(re,t0,im,t1);
         yi = S_MUL_ADD(im,t0,re,t1);
         /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after.
            For floating-point it doesn't matter. */
#ifdef ENABLE_QEXT
         yc.r = yr;
         yc.i = yi;
#else
         yc.r = S_MUL2(yr, scale);
         yc.i = S_MUL2(yi, scale);
#endif
#ifdef FIXED_POINT
         maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i)));
#endif
         f2[st->bitrev[i]] = yc;
      }
#ifdef FIXED_POINT
      /* Keep values below 2^28 inside the FFT, without exceeding the
         scale_shift budget. */
      headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval)));
#endif
   }

   /* N/4 complex FFT, does not downscale anymore */
   opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom));

   /* Post-rotate: apply the second twiddle and write the N/2 outputs
      from both ends of the output array at once. The *_PSR variants
      fold the headroom compensation into the final shift. */
   {
      /* Temp pointers to make it really clear to the compiler what we're doing */
      const kiss_fft_cpx * OPUS_RESTRICT fp = f2;
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
      const kiss_twiddle_scalar *t = &trig[0];
      /* Temp pointers to make it really clear to the compiler what we're doing */
      for(i=0;i<N4;i++)
      {
         kiss_fft_scalar yr, yi;
         kiss_fft_scalar t0, t1;
#ifdef ENABLE_QEXT
         t0 = S_MUL2(t[i], scale);
         t1 = S_MUL2(t[N4+i], scale);
#else
         t0 = t[i];
         t1 = t[N4+i];
#endif
         yr = S_MUL_SUB_PSR(fp->i,t1 , fp->r,t0, headroom);
         yi = S_MUL_ADD_PSR(fp->r,t1 , fp->i,t0, headroom);
         *yp1 = yr;
         *yp2 = yi;
         fp++;
         yp1 += 2*stride;
         yp2 -= 2*stride;
      }
   }
   RESTORE_STACK;
}

#endif /* OVERRIDE_clt_mdct_forward */

#if defined(OVERRIDE_clt_mdct_backward)

/* Inverse MDCT. Reads N/2 coefficients from 'in' (with the given
   stride) and produces N/2+overlap time-domain samples in 'out',
   windowed for TDAC overlap-add. Parameters mirror clt_mdct_forward.
   NOTE(review): pre_shift/post_shift/fft_shift are declared only under
   FIXED_POINT yet passed unconditionally to SHL32_ovflw, ARG_FIXED and
   the *_PSR helpers below — same fixed-point-only assumption as the
   forward transform; confirm against the build configuration. */
void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
      const celt_coef * OPUS_RESTRICT window, int overlap, int shift, int stride, int arch)
{
   int i;
   int N, N2, N4;
   const kiss_twiddle_scalar *trig;
#ifdef FIXED_POINT
   int pre_shift, post_shift, fft_shift;
#endif
   (void) arch;

   N = l->n;
   trig = l->trig;
   /* Walk down to the requested transform size (see forward). */
   for (i=0;i<shift;i++)
   {
      N >>= 1;
      trig += N;
   }
   N2 = N>>1;
   N4 = N>>2;

#ifdef FIXED_POINT
   /* Choose dynamic scaling: pre_shift maximizes input headroom,
      post_shift bounds the worst-case output growth, and the FFT
      absorbs the difference. */
   {
      opus_val32 sumval=N2;
      opus_val32 maxval=0;
      for (i=0;i<N2;i++) {
         maxval = MAX32(maxval, ABS32(in[i*stride]));
         sumval = ADD32_ovflw(sumval, ABS32(SHR32(in[i*stride],11)));
      }
      pre_shift = IMAX(0, 29-celt_zlog2(1+maxval));
      /* Worst-case where all the energy goes to a single sample. */
      post_shift = IMAX(0, 19-celt_ilog2(ABS32(sumval)));
      post_shift = IMIN(post_shift, pre_shift);
      fft_shift = pre_shift - post_shift;
   }
#endif
   /* Pre-rotate */
   {
      /* Temp pointers to make it really clear to the compiler what we're doing */
      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
      kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1);
      const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
      const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev;
      for(i=0;i<N4;i++)
      {
         int rev;
         kiss_fft_scalar yr, yi;
         opus_val32 x1, x2;
         rev = *bitrev++;
         x1 = SHL32_ovflw(*xp1, pre_shift);
         x2 = SHL32_ovflw(*xp2, pre_shift);
         yr = S_MUL_ADD(x2,t[i] , x1,t[N4+i]);
         yi = S_MUL_SUB(x1,t[i] , x2,t[N4+i]);
         /* We swap real and imag because we use an FFT instead of an IFFT. */
         yp[2*rev+1] = yr;
         yp[2*rev] = yi;
         /* Storing the pre-rotation directly in the bitrev order. */
         xp1+=2*stride;
         xp2-=2*stride;
      }
   }

   /* In-place FFT on the middle of the output buffer. */
   opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)) ARG_FIXED(fft_shift));

   /* Post-rotate and de-shuffle from both ends of the buffer at once to make
      it in-place. */
   {
      kiss_fft_scalar * yp0 = out+(overlap>>1);
      kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
      const kiss_twiddle_scalar *t = &trig[0];
      /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
         middle pair will be computed twice. */
      for(i=0;i<(N4+1)>>1;i++)
      {
         kiss_fft_scalar re, im, yr, yi;
         kiss_twiddle_scalar t0, t1;
         /* We swap real and imag because we're using an FFT instead of an IFFT. */
         re = yp0[1];
         im = yp0[0];
         t0 = t[i];
         t1 = t[N4+i];
         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
         yr = S_MUL_ADD_PSR(re,t0 , im,t1, post_shift);
         yi = S_MUL_SUB_PSR(re,t1 , im,t0, post_shift);
         /* We swap real and imag because we're using an FFT instead of an IFFT. */
         /* yp1's pair must be read before yr/yi are stored: yp0 and yp1
            alias when the loop reaches the middle. */
         re = yp1[1];
         im = yp1[0];
         yp0[0] = yr;
         yp1[1] = yi;

         t0 = t[(N4-i-1)];
         t1 = t[(N2-i-1)];
         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
         yr = S_MUL_ADD_PSR(re,t0,im,t1, post_shift);
         yi = S_MUL_SUB_PSR(re,t1,im,t0, post_shift);
         yp1[0] = yr;
         yp0[1] = yi;
         yp0 += 2;
         yp1 -= 2;
      }
   }

   /* Mirror on both sides for TDAC */
   {
      kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
      const celt_coef * OPUS_RESTRICT wp1 = window;
      const celt_coef * OPUS_RESTRICT wp2 = window+overlap-1;

      for(i = 0; i < overlap/2; i++)
      {
         kiss_fft_scalar x1, x2;
         x1 = *xp1;
         x2 = *yp1;
         *yp1++ = S_MUL_SUB(x2, *wp2, x1, *wp1);
         *xp1-- = S_MUL_ADD(x2, *wp1, x1, *wp2);
         wp1++;
         wp2--;
      }
   }
}

#endif /* OVERRIDE_clt_mdct_backward */

#endif /* MDCT_MIPSR1_H__ */