tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

mdct_mipsr1.h (11932B)


      1 /* Copyright (c) 2007-2008 CSIRO
      2   Copyright (c) 2007-2008 Xiph.Org Foundation
      3   Written by Jean-Marc Valin */
      4 /*
      5   Redistribution and use in source and binary forms, with or without
      6   modification, are permitted provided that the following conditions
      7   are met:
      8 
      9   - Redistributions of source code must retain the above copyright
     10   notice, this list of conditions and the following disclaimer.
     11 
     12   - Redistributions in binary form must reproduce the above copyright
     13   notice, this list of conditions and the following disclaimer in the
     14   documentation and/or other materials provided with the distribution.
     15 
     16   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     18   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     19   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
     20   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     21   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     22   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     23   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     24   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     25   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     26   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27 */
     28 
     29 /* This is a simple MDCT implementation that uses a N/4 complex FFT
     30   to do most of the work. It should be relatively straightforward to
     31   plug in pretty much and FFT here.
     32 
     33   This replaces the Vorbis FFT (and uses the exact same API), which
     34   was a bit too messy and that was ending up duplicating code
     35   (might as well use the same FFT everywhere).
     36 
     37   The algorithm is similar to (and inspired from) Fabrice Bellard's
     38   MDCT implementation in FFMPEG, but has differences in signs, ordering
     39   and scaling in many places.
     40 */
     41 #ifndef MDCT_MIPSR1_H__
     42 #define MDCT_MIPSR1_H__
     43 
     44 #ifndef SKIP_CONFIG_H
     45 #ifdef HAVE_CONFIG_H
     46 #include "config.h"
     47 #endif
     48 #endif
     49 
     50 #include "mdct.h"
     51 #include "kiss_fft.h"
     52 #include "_kiss_fft_guts.h"
     53 #include <math.h>
     54 #include "os_support.h"
     55 #include "mathops.h"
     56 #include "stack_alloc.h"
     57 
     58 #if defined (__mips_dsp)
     59 static inline int S_MUL_ADD_PSR(int a, int b, int c, int d, int shift) {
     60    long long acc = __builtin_mips_mult(a, b);
     61    acc = __builtin_mips_madd(acc, c, d);
     62    return __builtin_mips_extr_w(acc, 15+shift);
     63 }
     64 
     65 static inline int S_MUL_SUB_PSR(int a, int b, int c, int d, int shift) {
     66    long long acc = __builtin_mips_mult(a, b);
     67    acc = __builtin_mips_msub(acc, c, d);
     68    return __builtin_mips_extr_w(acc, 15+shift);
     69 }
     70 
     71 #define OVERRIDE_clt_mdct_forward
     72 #define OVERRIDE_clt_mdct_backward
     73 
     74 #elif defined(__mips_isa_rev) && __mips_isa_rev < 6
     75 
     76 static inline int S_MUL_ADD_PSR(int a, int b, int c, int d, int shift) {
     77    long long acc;
     78 
     79    asm volatile (
     80            "mult %[a], %[b]  \n"
     81            "madd %[c], %[d]  \n"
     82        : [acc] "=x"(acc)
     83        : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d)
     84        :
     85    );
     86    return (int)(acc >> (15 + shift));
     87 }
     88 
     89 static inline int S_MUL_SUB_PSR(int a, int b, int c, int d, int shift) {
     90    long long acc;
     91 
     92    asm volatile (
     93            "mult %[a], %[b]  \n"
     94            "msub %[c], %[d]  \n"
     95        : [acc] "=x"(acc)
     96        : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d)
     97        :
     98    );
     99    return (int)(acc >> (15 + shift));
    100 }
    101 
    102 #define OVERRIDE_clt_mdct_forward
    103 #define OVERRIDE_clt_mdct_backward
    104 
    105 #endif
    106 
    107 #if defined (OVERRIDE_clt_mdct_forward)
    108 
    109 /* Forward MDCT trashes the input array */
    110 void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
    111      const celt_coef *window, int overlap, int shift, int stride, int arch)
    112 {
    113   int i;
    114   int N, N2, N4;
    115   VARDECL(kiss_fft_scalar, f);
    116   VARDECL(kiss_fft_cpx, f2);
    117   const kiss_fft_state *st = l->kfft[shift];
    118   const kiss_twiddle_scalar *trig;
    119   celt_coef scale;
    120 #ifdef FIXED_POINT
    121   /* Allows us to scale with MULT16_32_Q16(), which is faster than
    122      MULT16_32_Q15() on ARM. */
    123   int scale_shift = st->scale_shift-1;
    124   int headroom;
    125 #endif
    126   SAVE_STACK;
    127   (void)arch;
    128   scale = st->scale;
    129 
    130   N = l->n;
    131   trig = l->trig;
    132   for (i=0;i<shift;i++)
    133   {
    134      N >>= 1;
    135      trig += N;
    136   }
    137   N2 = N>>1;
    138   N4 = N>>2;
    139 
    140   ALLOC(f, N2, kiss_fft_scalar);
    141   ALLOC(f2, N4, kiss_fft_cpx);
    142 
    143   /* Consider the input to be composed of four blocks: [a, b, c, d] */
    144   /* Window, shuffle, fold */
    145   {
    146      /* Temp pointers to make it really clear to the compiler what we're doing */
    147      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
    148      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
    149      kiss_fft_scalar * OPUS_RESTRICT yp = f;
    150      const celt_coef * OPUS_RESTRICT wp1 = window+(overlap>>1);
    151      const celt_coef * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
    152      for(i=0;i<((overlap+3)>>2);i++)
    153      {
    154         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
    155          *yp++ = S_MUL_ADD(*wp2, xp1[N2],*wp1,*xp2);
    156          *yp++ = S_MUL_SUB(*wp1, *xp1,*wp2, xp2[-N2]);
    157         xp1+=2;
    158         xp2-=2;
    159         wp1+=2;
    160         wp2-=2;
    161      }
    162      wp1 = window;
    163      wp2 = window+overlap-1;
    164      for(;i<N4-((overlap+3)>>2);i++)
    165      {
    166         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
    167         *yp++ = *xp2;
    168         *yp++ = *xp1;
    169         xp1+=2;
    170         xp2-=2;
    171      }
    172      for(;i<N4;i++)
    173      {
    174         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
    175          *yp++ = S_MUL_SUB(*wp2, *xp2, *wp1, xp1[-N2]);
    176          *yp++ = S_MUL_ADD(*wp2, *xp1, *wp1, xp2[N2]);
    177         xp1+=2;
    178         xp2-=2;
    179         wp1+=2;
    180         wp2-=2;
    181      }
    182   }
    183   /* Pre-rotation */
    184   {
    185      kiss_fft_scalar * OPUS_RESTRICT yp = f;
    186      const kiss_twiddle_scalar *t = &trig[0];
    187 #ifdef FIXED_POINT
    188      opus_val32 maxval=1;
    189 #endif
    190      for(i=0;i<N4;i++)
    191      {
    192         kiss_fft_cpx yc;
    193         kiss_twiddle_scalar t0, t1;
    194         kiss_fft_scalar re, im, yr, yi;
    195         t0 = t[i];
    196         t1 = t[N4+i];
    197         re = *yp++;
    198         im = *yp++;
    199         yr = S_MUL_SUB(re,t0,im,t1);
    200         yi = S_MUL_ADD(im,t0,re,t1);
    201         /* For QEXT, it's best to scale before the FFT, but otherwise it's best to scale after.
    202            For floating-point it doesn't matter. */
    203 #ifdef ENABLE_QEXT
    204         yc.r = yr;
    205         yc.i = yi;
    206 #else
    207         yc.r = S_MUL2(yr, scale);
    208         yc.i = S_MUL2(yi, scale);
    209 #endif
    210 #ifdef FIXED_POINT
    211         maxval = MAX32(maxval, MAX32(ABS32(yc.r), ABS32(yc.i)));
    212 #endif
    213         f2[st->bitrev[i]] = yc;
    214      }
    215 #ifdef FIXED_POINT
    216      headroom = IMAX(0, IMIN(scale_shift, 28-celt_ilog2(maxval)));
    217 #endif
    218   }
    219 
    220   /* N/4 complex FFT, does not downscale anymore */
    221   opus_fft_impl(st, f2 ARG_FIXED(scale_shift-headroom));
    222 
    223   /* Post-rotate */
    224   {
    225      /* Temp pointers to make it really clear to the compiler what we're doing */
    226      const kiss_fft_cpx * OPUS_RESTRICT fp = f2;
    227      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
    228      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
    229      const kiss_twiddle_scalar *t = &trig[0];
    230      /* Temp pointers to make it really clear to the compiler what we're doing */
    231      for(i=0;i<N4;i++)
    232      {
    233         kiss_fft_scalar yr, yi;
    234         kiss_fft_scalar t0, t1;
    235 #ifdef ENABLE_QEXT
    236         t0 = S_MUL2(t[i], scale);
    237         t1 = S_MUL2(t[N4+i], scale);
    238 #else
    239         t0 = t[i];
    240         t1 = t[N4+i];
    241 #endif
    242         yr = S_MUL_SUB_PSR(fp->i,t1 , fp->r,t0, headroom);
    243         yi = S_MUL_ADD_PSR(fp->r,t1 , fp->i,t0, headroom);
    244         *yp1 = yr;
    245         *yp2 = yi;
    246         fp++;
    247         yp1 += 2*stride;
    248         yp2 -= 2*stride;
    249      }
    250   }
    251   RESTORE_STACK;
    252 }
    253 
    254 #endif /* OVERRIDE_clt_mdct_forward */
    255 
    256 #if defined(OVERRIDE_clt_mdct_backward)
    257 
    258 void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
    259      const celt_coef * OPUS_RESTRICT window, int overlap, int shift, int stride, int arch)
    260 {
    261   int i;
    262   int N, N2, N4;
    263   const kiss_twiddle_scalar *trig;
    264 #ifdef FIXED_POINT
    265   int pre_shift, post_shift, fft_shift;
    266 #endif
    267   (void) arch;
    268 
    269   N = l->n;
    270   trig = l->trig;
    271   for (i=0;i<shift;i++)
    272   {
    273      N >>= 1;
    274      trig += N;
    275   }
    276   N2 = N>>1;
    277   N4 = N>>2;
    278 
    279 #ifdef FIXED_POINT
    280   {
    281      opus_val32 sumval=N2;
    282      opus_val32 maxval=0;
    283      for (i=0;i<N2;i++) {
    284         maxval = MAX32(maxval, ABS32(in[i*stride]));
    285         sumval = ADD32_ovflw(sumval, ABS32(SHR32(in[i*stride],11)));
    286      }
    287      pre_shift = IMAX(0, 29-celt_zlog2(1+maxval));
    288      /* Worst-case where all the energy goes to a single sample. */
    289      post_shift = IMAX(0, 19-celt_ilog2(ABS32(sumval)));
    290      post_shift = IMIN(post_shift, pre_shift);
    291      fft_shift = pre_shift - post_shift;
    292   }
    293 #endif
    294   /* Pre-rotate */
    295   {
    296      /* Temp pointers to make it really clear to the compiler what we're doing */
    297      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
    298      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
    299      kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1);
    300      const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
    301      const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev;
    302      for(i=0;i<N4;i++)
    303      {
    304         int rev;
    305         kiss_fft_scalar yr, yi;
    306         opus_val32 x1, x2;
    307         rev = *bitrev++;
    308         x1 = SHL32_ovflw(*xp1, pre_shift);
    309         x2 = SHL32_ovflw(*xp2, pre_shift);
    310         yr = S_MUL_ADD(x2,t[i] , x1,t[N4+i]);
    311         yi = S_MUL_SUB(x1,t[i] , x2,t[N4+i]);
    312         /* We swap real and imag because we use an FFT instead of an IFFT. */
    313         yp[2*rev+1] = yr;
    314         yp[2*rev] = yi;
    315         /* Storing the pre-rotation directly in the bitrev order. */
    316         xp1+=2*stride;
    317         xp2-=2*stride;
    318      }
    319   }
    320 
    321   opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)) ARG_FIXED(fft_shift));
    322 
    323   /* Post-rotate and de-shuffle from both ends of the buffer at once to make
    324      it in-place. */
    325   {
    326      kiss_fft_scalar * yp0 = out+(overlap>>1);
    327      kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
    328      const kiss_twiddle_scalar *t = &trig[0];
    329      /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
    330         middle pair will be computed twice. */
    331      for(i=0;i<(N4+1)>>1;i++)
    332      {
    333         kiss_fft_scalar re, im, yr, yi;
    334         kiss_twiddle_scalar t0, t1;
    335         /* We swap real and imag because we're using an FFT instead of an IFFT. */
    336         re = yp0[1];
    337         im = yp0[0];
    338         t0 = t[i];
    339         t1 = t[N4+i];
    340         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
    341         yr = S_MUL_ADD_PSR(re,t0 , im,t1, post_shift);
    342         yi = S_MUL_SUB_PSR(re,t1 , im,t0, post_shift);
    343         /* We swap real and imag because we're using an FFT instead of an IFFT. */
    344         re = yp1[1];
    345         im = yp1[0];
    346         yp0[0] = yr;
    347         yp1[1] = yi;
    348 
    349         t0 = t[(N4-i-1)];
    350         t1 = t[(N2-i-1)];
    351         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
    352         yr = S_MUL_ADD_PSR(re,t0,im,t1, post_shift);
    353         yi = S_MUL_SUB_PSR(re,t1,im,t0, post_shift);
    354         yp1[0] = yr;
    355         yp0[1] = yi;
    356         yp0 += 2;
    357         yp1 -= 2;
    358      }
    359   }
    360 
    361   /* Mirror on both sides for TDAC */
    362   {
    363      kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
    364      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
    365      const celt_coef * OPUS_RESTRICT wp1 = window;
    366      const celt_coef * OPUS_RESTRICT wp2 = window+overlap-1;
    367 
    368      for(i = 0; i < overlap/2; i++)
    369      {
    370         kiss_fft_scalar x1, x2;
    371         x1 = *xp1;
    372         x2 = *yp1;
    373         *yp1++ = S_MUL_SUB(x2, *wp2, x1, *wp1);
    374         *xp1-- = S_MUL_ADD(x2, *wp1, x1, *wp2);
    375         wp1++;
    376         wp2--;
    377      }
    378   }
    379 }
    380 
    381 #endif /* OVERRIDE_clt_mdct_backward */
    382 
    383 #endif /* MDCT_MIPSR1_H__ */