[ tor-browser ].git.dasho

celt_mdct_ne10.c (8118B)
      1 /* Copyright (c) 2015 Xiph.Org Foundation
      2   Written by Viswanath Puttagunta */
      3 /**
      4   @file celt_mdct_ne10.c
      5   @brief ARM Neon optimizations for mdct using NE10 library
      6 */
      7 
      8 /*
      9   Redistribution and use in source and binary forms, with or without
     10   modification, are permitted provided that the following conditions
     11   are met:
     12 
     13   - Redistributions of source code must retain the above copyright
     14   notice, this list of conditions and the following disclaimer.
     15 
     16   - Redistributions in binary form must reproduce the above copyright
     17   notice, this list of conditions and the following disclaimer in the
     18   documentation and/or other materials provided with the distribution.
     19 
     20   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     21   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     22   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     23   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
     24   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     25   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     26   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     27   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     28   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     29   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     30   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     31 */
     32 
     33 #ifndef SKIP_CONFIG_H
     34 #ifdef HAVE_CONFIG_H
     35 #include "config.h"
     36 #endif
     37 #endif
     38 
     39 #include "kiss_fft.h"
     40 #include "_kiss_fft_guts.h"
     41 #include "mdct.h"
     42 #include "stack_alloc.h"
     43 
     44 void clt_mdct_forward_neon(const mdct_lookup *l,
     45                           kiss_fft_scalar *in,
     46                           kiss_fft_scalar * OPUS_RESTRICT out,
     47                           const opus_val16 *window,
     48                           int overlap, int shift, int stride, int arch)
     49 {
     50   int i;
     51   int N, N2, N4;
     52   VARDECL(kiss_fft_scalar, f);
     53   VARDECL(kiss_fft_cpx, f2);
     54   const kiss_fft_state *st = l->kfft[shift];
     55   const kiss_twiddle_scalar *trig;
     56 
     57   SAVE_STACK;
     58 
     59   N = l->n;
     60   trig = l->trig;
     61   for (i=0;i<shift;i++)
     62   {
     63      N >>= 1;
     64      trig += N;
     65   }
     66   N2 = N>>1;
     67   N4 = N>>2;
     68 
     69   ALLOC(f, N2, kiss_fft_scalar);
     70   ALLOC(f2, N4, kiss_fft_cpx);
     71 
     72   /* Consider the input to be composed of four blocks: [a, b, c, d] */
     73   /* Window, shuffle, fold */
     74   {
     75      /* Temp pointers to make it really clear to the compiler what we're doing */
     76      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
     77      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
     78      kiss_fft_scalar * OPUS_RESTRICT yp = f;
     79      const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1);
     80      const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
     81      for(i=0;i<((overlap+3)>>2);i++)
     82      {
     83         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
     84         *yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2);
     85         *yp++ = MULT16_32_Q15(*wp1, *xp1)    - MULT16_32_Q15(*wp2, xp2[-N2]);
     86         xp1+=2;
     87         xp2-=2;
     88         wp1+=2;
     89         wp2-=2;
     90      }
     91      wp1 = window;
     92      wp2 = window+overlap-1;
     93      for(;i<N4-((overlap+3)>>2);i++)
     94      {
     95         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
     96         *yp++ = *xp2;
     97         *yp++ = *xp1;
     98         xp1+=2;
     99         xp2-=2;
    100      }
    101      for(;i<N4;i++)
    102      {
    103         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
    104         *yp++ =  -MULT16_32_Q15(*wp1, xp1[-N2]) + MULT16_32_Q15(*wp2, *xp2);
    105         *yp++ = MULT16_32_Q15(*wp2, *xp1)     + MULT16_32_Q15(*wp1, xp2[N2]);
    106         xp1+=2;
    107         xp2-=2;
    108         wp1+=2;
    109         wp2-=2;
    110      }
    111   }
    112   /* Pre-rotation */
    113   {
    114      kiss_fft_scalar * OPUS_RESTRICT yp = f;
    115      const kiss_twiddle_scalar *t = &trig[0];
    116      for(i=0;i<N4;i++)
    117      {
    118         kiss_fft_cpx yc;
    119         kiss_twiddle_scalar t0, t1;
    120         kiss_fft_scalar re, im, yr, yi;
    121         t0 = t[i];
    122         t1 = t[N4+i];
    123         re = *yp++;
    124         im = *yp++;
    125         yr = S_MUL(re,t0)  -  S_MUL(im,t1);
    126         yi = S_MUL(im,t0)  +  S_MUL(re,t1);
    127         yc.r = yr;
    128         yc.i = yi;
    129         f2[i] = yc;
    130      }
    131   }
    132 
    133   opus_fft(st, f2, (kiss_fft_cpx *)f, arch);
    134 
    135   /* Post-rotate */
    136   {
    137      /* Temp pointers to make it really clear to the compiler what we're doing */
    138      const kiss_fft_cpx * OPUS_RESTRICT fp = (kiss_fft_cpx *)f;
    139      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
    140      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
    141      const kiss_twiddle_scalar *t = &trig[0];
    142      /* Temp pointers to make it really clear to the compiler what we're doing */
    143      for(i=0;i<N4;i++)
    144      {
    145         kiss_fft_scalar yr, yi;
    146         yr = S_MUL(fp->i,t[N4+i]) - S_MUL(fp->r,t[i]);
    147         yi = S_MUL(fp->r,t[N4+i]) + S_MUL(fp->i,t[i]);
    148         *yp1 = yr;
    149         *yp2 = yi;
    150         fp++;
    151         yp1 += 2*stride;
    152         yp2 -= 2*stride;
    153      }
    154   }
    155   RESTORE_STACK;
    156 }
    157 
    158 void clt_mdct_backward_neon(const mdct_lookup *l,
    159                            kiss_fft_scalar *in,
    160                            kiss_fft_scalar * OPUS_RESTRICT out,
    161                            const opus_val16 * OPUS_RESTRICT window,
    162                            int overlap, int shift, int stride, int arch)
    163 {
    164   int i;
    165   int N, N2, N4;
    166   VARDECL(kiss_fft_scalar, f);
    167   const kiss_twiddle_scalar *trig;
    168   const kiss_fft_state *st = l->kfft[shift];
    169 
    170   N = l->n;
    171   trig = l->trig;
    172   for (i=0;i<shift;i++)
    173   {
    174      N >>= 1;
    175      trig += N;
    176   }
    177   N2 = N>>1;
    178   N4 = N>>2;
    179 
    180   ALLOC(f, N2, kiss_fft_scalar);
    181 
    182   /* Pre-rotate */
    183   {
    184      /* Temp pointers to make it really clear to the compiler what we're doing */
    185      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
    186      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
    187      kiss_fft_scalar * OPUS_RESTRICT yp = f;
    188      const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
    189      for(i=0;i<N4;i++)
    190      {
    191         kiss_fft_scalar yr, yi;
    192         yr = S_MUL(*xp2, t[i]) + S_MUL(*xp1, t[N4+i]);
    193         yi = S_MUL(*xp1, t[i]) - S_MUL(*xp2, t[N4+i]);
    194         yp[2*i] = yr;
    195         yp[2*i+1] = yi;
    196         xp1+=2*stride;
    197         xp2-=2*stride;
    198      }
    199   }
    200 
    201   opus_ifft(st, (kiss_fft_cpx *)f, (kiss_fft_cpx*)(out+(overlap>>1)), arch);
    202 
    203   /* Post-rotate and de-shuffle from both ends of the buffer at once to make
    204      it in-place. */
    205   {
    206      kiss_fft_scalar * yp0 = out+(overlap>>1);
    207      kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;
    208      const kiss_twiddle_scalar *t = &trig[0];
    209      /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
    210         middle pair will be computed twice. */
    211      for(i=0;i<(N4+1)>>1;i++)
    212      {
    213         kiss_fft_scalar re, im, yr, yi;
    214         kiss_twiddle_scalar t0, t1;
    215         re = yp0[0];
    216         im = yp0[1];
    217         t0 = t[i];
    218         t1 = t[N4+i];
    219         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
    220         yr = S_MUL(re,t0) + S_MUL(im,t1);
    221         yi = S_MUL(re,t1) - S_MUL(im,t0);
    222         re = yp1[0];
    223         im = yp1[1];
    224         yp0[0] = yr;
    225         yp1[1] = yi;
    226 
    227         t0 = t[(N4-i-1)];
    228         t1 = t[(N2-i-1)];
    229         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
    230         yr = S_MUL(re,t0) + S_MUL(im,t1);
    231         yi = S_MUL(re,t1) - S_MUL(im,t0);
    232         yp1[0] = yr;
    233         yp0[1] = yi;
    234         yp0 += 2;
    235         yp1 -= 2;
    236      }
    237   }
    238 
    239   /* Mirror on both sides for TDAC */
    240   {
    241      kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
    242      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
    243      const opus_val16 * OPUS_RESTRICT wp1 = window;
    244      const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1;
    245 
    246      for(i = 0; i < overlap/2; i++)
    247      {
    248         kiss_fft_scalar x1, x2;
    249         x1 = *xp1;
    250         x2 = *yp1;
    251         *yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1);
    252         *xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1);
    253         wp1++;
    254         wp2--;
    255      }
    256   }
    257   RESTORE_STACK;
    258 }
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE