tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

vq.c (25048B)


      1 /* Copyright (c) 2007-2008 CSIRO
      2   Copyright (c) 2007-2009 Xiph.Org Foundation
      3   Written by Jean-Marc Valin */
      4 /*
      5   Redistribution and use in source and binary forms, with or without
      6   modification, are permitted provided that the following conditions
      7   are met:
      8 
      9   - Redistributions of source code must retain the above copyright
     10   notice, this list of conditions and the following disclaimer.
     11 
     12   - Redistributions in binary form must reproduce the above copyright
     13   notice, this list of conditions and the following disclaimer in the
     14   documentation and/or other materials provided with the distribution.
     15 
     16   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     17   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     18   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     19   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
     20   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     21   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     22   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     23   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     24   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     25   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     26   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27 */
     28 
     29 #ifdef HAVE_CONFIG_H
     30 #include "config.h"
     31 #endif
     32 
     33 #include "mathops.h"
     34 #include "cwrs.h"
     35 #include "vq.h"
     36 #include "arch.h"
     37 #include "os_support.h"
     38 #include "bands.h"
     39 #include "rate.h"
     40 #include "pitch.h"
     41 #include "SigProc_FIX.h"
     42 
     43 #if defined(FIXED_POINT)
     44 void norm_scaleup(celt_norm *X, int N, int shift) {
     45   int i;
     46   celt_assert(shift >= 0);
     47   if (shift <= 0) return;
     48   for (i=0;i<N;i++) X[i] = SHL32(X[i], shift);
     49 }
     50 
     51 void norm_scaledown(celt_norm *X, int N, int shift) {
     52   int i;
     53   celt_assert(shift >= 0);
     54   if (shift <= 0) return;
     55   for (i=0;i<N;i++) X[i] = PSHR32(X[i], shift);
     56 }
     57 
     58 opus_val32 celt_inner_prod_norm(const celt_norm *x, const celt_norm *y, int len, int arch) {
     59   int i;
     60   opus_val32 sum = 0;
     61   (void)arch;
     62   for (i=0;i<len;i++) sum += x[i]*y[i];
     63   return sum;
     64 }
     65 opus_val32 celt_inner_prod_norm_shift(const celt_norm *x, const celt_norm *y, int len, int arch) {
     66   int i;
     67   opus_val64 sum = 0;
     68   (void)arch;
     69   for (i=0;i<len;i++) sum += x[i]*(opus_val64)y[i];
     70   return sum>>2*(NORM_SHIFT-14);
     71 }
     72 #endif
     73 
     74 #ifndef OVERRIDE_vq_exp_rotation1
/** One pass of the spreading rotation, applied in place.
    Rotates each sample pair (Xptr[0], Xptr[stride]) by the angle whose
    cosine/sine are c and s (Q15 in fixed point). A forward sweep is
    followed by a backward sweep over the overlapping pairs, which makes
    the transform invertible by calling it again with the sign of s
    flipped (as exp_rotation() does for dir<0). */
static void exp_rotation1(celt_norm *X, int len, int stride, opus_val16 c, opus_val16 s)
{
   int i;
   opus_val16 ms;
   celt_norm *Xptr;
   Xptr = X;
   ms = NEG16(s);
   /* Drop to Q14 so the 16x16 rotation arithmetic below has headroom;
      the inverse scaling is applied at the end. */
   norm_scaledown(X, len, NORM_SHIFT-14);
   /* Forward sweep: rotate pairs front to back. */
   for (i=0;i<len-stride;i++)
   {
      celt_norm x1, x2;
      x1 = Xptr[0];
      x2 = Xptr[stride];
      Xptr[stride] = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x2),  s, x1), 15));
      *Xptr++      = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x1), ms, x2), 15));
   }
   /* Backward sweep over the pairs that have both elements inside the
      vector, back to front. */
   Xptr = &X[len-2*stride-1];
   for (i=len-2*stride-1;i>=0;i--)
   {
      celt_norm x1, x2;
      x1 = Xptr[0];
      x2 = Xptr[stride];
      Xptr[stride] = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x2),  s, x1), 15));
      *Xptr--      = EXTRACT16(PSHR32(MAC16_16(MULT16_16(c, x1), ms, x2), 15));
   }
   norm_scaleup(X, len, NORM_SHIFT-14);
}
    102 #endif /* OVERRIDE_vq_exp_rotation1 */
    103 
/** Applies (dir=+1) or undoes (dir=-1) the "spreading" rotation used to
    avoid tonal artifacts from sparse pulse vectors. The rotation angle
    shrinks as the pulse count K grows relative to the band size len; the
    spread setting selects how aggressive the rotation is. Does nothing
    when the band is already dense (2*K>=len) or spreading is disabled.
    For wide bands (len>=8*stride) a second rotation pass with a larger
    stride is applied as well. */
void exp_rotation(celt_norm *X, int len, int dir, int stride, int K, int spread)
{
   static const int SPREAD_FACTOR[3]={15,10,5};
   int i;
   opus_val16 c, s;
   opus_val16 gain, theta;
   int stride2=0;
   int factor;

   if (2*K>=len || spread==SPREAD_NONE)
      return;
   factor = SPREAD_FACTOR[spread-1];

   /* gain = len/(len+factor*K) in Q15; theta = gain^2/2. */
   gain = celt_div((opus_val32)MULT16_16(Q15_ONE,len),(opus_val32)(len+factor*K));
   theta = HALF16(MULT16_16_Q15(gain,gain));

   c = celt_cos_norm(EXTEND32(theta));
   s = celt_cos_norm(EXTEND32(SUB16(Q15ONE,theta))); /*  sin(theta) */

   if (len>=8*stride)
   {
      stride2 = 1;
      /* This is just a simple (equivalent) way of computing sqrt(len/stride) with rounding.
         It's basically incrementing long as (stride2+0.5)^2 < len/stride. */
      while ((stride2*stride2+stride2)*stride + (stride>>2) < len)
         stride2++;
   }
   /*NOTE: As a minor optimization, we could be passing around log2(B), not B, for both this and for
      extract_collapse_mask().*/
   len = celt_udiv(len, stride);
   for (i=0;i<stride;i++)
   {
      if (dir < 0)
      {
         /* Inverse: undo the two forward passes in reverse order, with
            the sine sign flipped relative to the encoder. */
         if (stride2)
            exp_rotation1(X+i*len, len, stride2, s, c);
         exp_rotation1(X+i*len, len, 1, c, s);
      } else {
         exp_rotation1(X+i*len, len, 1, c, -s);
         if (stride2)
            exp_rotation1(X+i*len, len, stride2, s, -c);
      }
   }
}
    148 
/** Normalizes the decoded integer pvq codeword to unit norm.
    Scales the integer pulse vector iy (with squared norm Ryy) by
    gain/sqrt(Ryy) and writes the result into X. In fixed point, k is
    half the magnitude exponent of Ryy, used to keep the reciprocal
    square-root argument in range; shift is the extra up-scaling used by
    the QEXT refinement path and is ignored otherwise. NOTE(review): in
    the float build k is never assigned — presumably the VSHR32/shift
    macros reduce to plain expressions there so k never expands; confirm
    against arch.h. */
static void normalise_residual(int * OPUS_RESTRICT iy, celt_norm * OPUS_RESTRICT X,
     int N, opus_val32 Ryy, opus_val32 gain, int shift)
{
   int i;
#ifdef FIXED_POINT
   int k;
#endif
   opus_val32 t;
   opus_val32 g;

#ifdef FIXED_POINT
   k = celt_ilog2(Ryy)>>1;
#endif
   /* Bring Ryy into the range expected by celt_rsqrt_norm32, then fold
      the requested gain into the reciprocal square root. */
   t = VSHR32(Ryy, 2*(k-7)-15);
   g = MULT32_32_Q31(celt_rsqrt_norm32(t),gain);
   i=0;
   (void)shift;
#if defined(FIXED_POINT) && defined(ENABLE_QEXT)
   if (shift>0) {
      /* QEXT path: compensate for the down-scaling applied to Ryy. */
      int tot_shift = NORM_SHIFT+1-k-shift;
      if (tot_shift >= 0) {
         do X[i] = MULT32_32_Q31(g, SHL32(iy[i], tot_shift));
         while (++i < N);
      } else {
         do X[i] = MULT32_32_Q31(g, PSHR32(iy[i], -tot_shift));
         while (++i < N);
      }
   } else
#endif
   do X[i] = VSHR32(MULT16_32_Q15(iy[i], g), k+15-NORM_SHIFT);
   while (++i < N);
}
    182 
    183 static unsigned extract_collapse_mask(int *iy, int N, int B)
    184 {
    185   unsigned collapse_mask;
    186   int N0;
    187   int i;
    188   if (B<=1)
    189      return 1;
    190   /*NOTE: As a minor optimization, we could be passing around log2(B), not B, for both this and for
    191      exp_rotation().*/
    192   N0 = celt_udiv(N, B);
    193   collapse_mask = 0;
    194   i=0; do {
    195      int j;
    196      unsigned tmp=0;
    197      j=0; do {
    198         tmp |= iy[i*N0+j];
    199      } while (++j<N0);
    200      collapse_mask |= (tmp!=0)<<i;
    201   } while (++i<B);
    202   return collapse_mask;
    203 }
    204 
/** Finds the length-N pulse vector iy with exactly K total pulses that
    best matches the direction of X (the PVQ codebook search).
    X is modified in place: its sign is stripped (and in fixed point it
    may be rescaled for headroom). A fast projection onto the pyramid
    places most pulses, then the remaining ones are added one at a time
    by greedily maximising the correlation-to-energy ratio.
    Returns the (scaled) squared norm of the chosen codeword, which the
    caller passes to normalise_residual(). */
opus_val16 op_pvq_search_c(celt_norm *X, int *iy, int K, int N, int arch)
{
   VARDECL(celt_norm, y);
   VARDECL(int, signx);
   int i, j;
   int pulsesLeft;
   opus_val32 sum;
   opus_val32 xy;
   opus_val16 yy;
   SAVE_STACK;

   (void)arch;
   ALLOC(y, N, celt_norm);
   ALLOC(signx, N, int);
#ifdef FIXED_POINT
   {
      /* Rescale X so the 16-bit accumulators below cannot overflow. */
      int shift = (celt_ilog2(1+celt_inner_prod_norm_shift(X, X, N, arch))+1)/2;
      shift = IMAX(0, shift+(NORM_SHIFT-14)-14);
      norm_scaledown(X, N, shift);
   }
#endif
   /* Get rid of the sign */
   sum = 0;
   j=0; do {
      signx[j] = X[j]<0;
      /* OPT: Make sure the compiler doesn't use a branch on ABS16(). */
      X[j] = ABS16(X[j]);
      iy[j] = 0;
      y[j] = 0;
   } while (++j<N);

   /* xy = running <X, y>; yy = running <y, y>. */
   xy = yy = 0;

   pulsesLeft = K;

   /* Do a pre-search by projecting on the pyramid */
   if (K > (N>>1))
   {
      opus_val16 rcp;
      j=0; do {
         sum += X[j];
      }  while (++j<N);

      /* If X is too small, just replace it with a pulse at 0 */
#ifdef FIXED_POINT
      if (sum <= K)
#else
      /* Prevents infinities and NaNs from causing too many pulses
         to be allocated. 64 is an approximation of infinity here. */
      if (!(sum > EPSILON && sum < 64))
#endif
      {
         X[0] = QCONST16(1.f,14);
         j=1; do
            X[j]=0;
         while (++j<N);
         sum = QCONST16(1.f,14);
      }
#ifdef FIXED_POINT
      rcp = EXTRACT16(MULT16_32_Q16(K, celt_rcp(sum)));
#else
      /* Using K+e with e < 1 guarantees we cannot get more than K pulses. */
      rcp = EXTRACT16(MULT16_32_Q16(K+0.8f, celt_rcp(sum)));
#endif
      j=0; do {
#ifdef FIXED_POINT
         /* It's really important to round *towards zero* here */
         iy[j] = MULT16_16_Q15(X[j],rcp);
#else
         iy[j] = (int)floor(rcp*X[j]);
#endif
         y[j] = (celt_norm)iy[j];
         yy = MAC16_16(yy, y[j],y[j]);
         xy = MAC16_16(xy, X[j],y[j]);
         /* y is kept at twice the codeword so the per-pulse update below
            avoids a multiply. */
         y[j] *= 2;
         pulsesLeft -= iy[j];
      }  while (++j<N);
   }
   celt_sig_assert(pulsesLeft>=0);

   /* This should never happen, but just in case it does (e.g. on silence)
      we fill the first bin with pulses. */
#ifdef FIXED_POINT_DEBUG
   celt_sig_assert(pulsesLeft<=N+3);
#endif
   if (pulsesLeft > N+3)
   {
      opus_val16 tmp = (opus_val16)pulsesLeft;
      yy = MAC16_16(yy, tmp, tmp);
      yy = MAC16_16(yy, tmp, y[0]);
      iy[0] += pulsesLeft;
      pulsesLeft=0;
   }

   /* Greedy search: place the remaining pulses one at a time in the
      position that maximises (xy+X[j])^2 / (yy+y[j]). */
   for (i=0;i<pulsesLeft;i++)
   {
      opus_val16 Rxy, Ryy;
      int best_id;
      opus_val32 best_num;
      opus_val16 best_den;
#ifdef FIXED_POINT
      int rshift;
#endif
#ifdef FIXED_POINT
      rshift = 1+celt_ilog2(K-pulsesLeft+i+1);
#endif
      best_id = 0;
      /* The squared magnitude term gets added anyway, so we might as well
         add it outside the loop */
      yy = ADD16(yy, 1);

      /* Calculations for position 0 are out of the loop, in part to reduce
         mispredicted branches (since the if condition is usually false)
         in the loop. */
      /* Temporary sums of the new pulse(s) */
      Rxy = EXTRACT16(SHR32(ADD32(xy, EXTEND32(X[0])),rshift));
      /* We're multiplying y[j] by two so we don't have to do it here */
      Ryy = ADD16(yy, y[0]);

      /* Approximate score: we maximise Rxy/sqrt(Ryy) (we're guaranteed that
         Rxy is positive because the sign is pre-computed) */
      Rxy = MULT16_16_Q15(Rxy,Rxy);
      best_den = Ryy;
      best_num = Rxy;
      j=1;
      do {
         /* Temporary sums of the new pulse(s) */
         Rxy = EXTRACT16(SHR32(ADD32(xy, EXTEND32(X[j])),rshift));
         /* We're multiplying y[j] by two so we don't have to do it here */
         Ryy = ADD16(yy, y[j]);

         /* Approximate score: we maximise Rxy/sqrt(Ryy) (we're guaranteed that
            Rxy is positive because the sign is pre-computed) */
         Rxy = MULT16_16_Q15(Rxy,Rxy);
         /* The idea is to check for num/den >= best_num/best_den, but that way
            we can do it without any division */
         /* OPT: It's not clear whether a cmov is faster than a branch here
            since the condition is more often false than true and using
            a cmov introduces data dependencies across iterations. The optimal
            choice may be architecture-dependent. */
         if (opus_unlikely(MULT16_16(best_den, Rxy) > MULT16_16(Ryy, best_num)))
         {
            best_den = Ryy;
            best_num = Rxy;
            best_id = j;
         }
      } while (++j<N);

      /* Updating the sums of the new pulse(s) */
      xy = ADD32(xy, EXTEND32(X[best_id]));
      /* We're multiplying y[j] by two so we don't have to do it here */
      yy = ADD16(yy, y[best_id]);

      /* Only now that we've made the final choice, update y/iy */
      /* Multiplying y[j] by 2 so we don't have to do it everywhere else */
      y[best_id] += 2;
      iy[best_id]++;
   }

   /* Put the original sign back */
   j=0;
   do {
      /*iy[j] = signx[j] ? -iy[j] : iy[j];*/
      /* OPT: The is more likely to be compiled without a branch than the code above
         but has the same performance otherwise. */
      iy[j] = (iy[j]^-signx[j]) + signx[j];
   } while (++j<N);
   RESTORE_STACK;
   return yy;
}
    375 
    376 #ifdef ENABLE_QEXT
    377 #include "macros.h"
    378 
/** Specialised PVQ search for N==2 with QEXT extra resolution.
    Produces both the base codeword iy (K pulses) and an up-scaled
    codeword up_iy (up*K pulses); *refine receives the signed offset
    between up_iy[0] and up*iy[0], which is transmitted separately.
    Returns the squared norm of up_iy (scaled down by 2*shift in fixed
    point). */
static opus_val32 op_pvq_search_N2(const celt_norm *X, int *iy, int *up_iy, int K, int up, int *refine, int shift) {
   opus_val32 sum;
   opus_val32 rcp_sum;
   int offset;
   sum = ABS32(X[0]) + ABS32(X[1]);
   if (sum < EPSILON) {
      /* Degenerate (near-silent) input: put all pulses on the first
         dimension. */
      iy[0] = K;
      up_iy[0] = up*K;
      iy[1]=up_iy[1]=0;
      *refine=0;
#ifdef FIXED_POINT
      return (opus_val64)K*K*up*up>>2*shift;
#else
      (void)shift;
      return K*(float)K*up*up;
#endif
   }
#ifdef FIXED_POINT
   int sum_shift;
   opus_val32 X0;
   sum_shift = 30-celt_ilog2(sum);
   rcp_sum = celt_rcp_norm32(SHL32(sum, sum_shift));
   /* X0 = X[0]/(|X[0]|+|X[1]|), computed via a normalised reciprocal. */
   X0 = MULT32_32_Q31(SHL32(X[0], sum_shift), rcp_sum);
   iy[0] = PSHR32(MULT32_32_Q31(SHL32(K, 8), X0), 7);
   up_iy[0] = PSHR32(MULT32_32_Q31(SHL32(up*K, 8), X0), 7);
#else
   rcp_sum = 1.f/sum;
   iy[0] = (int)floor(.5f+K*X[0]*rcp_sum);
   up_iy[0] = (int)floor(.5f+up*K*X[0]*rcp_sum);
#endif
   /* Clamp so the high-resolution value stays within (up-1)/2 of
      up*iy[0], keeping the refinement offset codable. */
   up_iy[0] = IMAX(up*iy[0] - (up-1)/2, IMIN(up*iy[0] + (up-1)/2, up_iy[0]));
   offset = up_iy[0] - up*iy[0];
   /* All remaining pulses go to the second dimension. */
   iy[1] = K-abs(iy[0]);
   up_iy[1] = up*K-abs(up_iy[0]);
   if (X[1] < 0) {
      iy[1] = -iy[1];
      up_iy[1] = -up_iy[1];
      offset = -offset;
   }
   *refine = offset;
#ifdef FIXED_POINT
   return (up_iy[0]*(opus_val64)up_iy[0] + up_iy[1]*(opus_val64)up_iy[1] + (1<<2*shift>>1))>>2*shift;
#else
   return up_iy[0]*(opus_val64)up_iy[0] + up_iy[1]*(opus_val64)up_iy[1];
#endif
}
    425 
/** Rounds the normalised magnitudes Xn to an integer pulse vector iy
    summing to exactly K, greedily adjusting the entries with the
    largest rounding error until the budget is met. When iy != iy0, iy
    is additionally confined to within (up-1) of up*iy0 (the QEXT
    refinement constraint); margin bounds how far each entry may move.
    Returns 1 on failure (sum too far off budget), 0 on success. */
static int op_pvq_refine(const opus_val32 *Xn, int *iy, int *iy0, int K, int up, int margin, int N) {
   int i;
   int dir;
   VARDECL(opus_val32, rounding);
   int iysum = 0;
   SAVE_STACK;
   ALLOC(rounding, N, opus_val32);
   for (i=0;i<N;i++) {
      opus_val32 tmp;
      tmp = MULT32_32_Q31(SHL32(K, 8), Xn[i]);
#ifdef FIXED_POINT
      iy[i] = (tmp+64) >> 7;
#else
      iy[i] = (int)floor(.5+tmp);
#endif
      /* Keep each entry's rounding error so we know which ones are the
         cheapest to adjust later. */
      rounding[i] = tmp - SHL32(iy[i], 7);
   }
   if (iy != iy0) {
      /* Refinement pass: stay within the codable window around up*iy0. */
      for (i=0;i<N;i++) iy[i] = IMIN(up*iy0[i]+up-1, IMAX(up*iy0[i]-up+1, iy[i]));
   }
   for (i=0;i<N;i++) iysum += iy[i];
   if (abs(iysum - K) > 32) {
      /* Way off budget (e.g. pathological input): report failure and let
         the caller fall back to a safe codeword. */
      RESTORE_STACK;
      return 1;
   }
   dir = iysum < K ? 1 : -1;
   while (iysum != K) {
      opus_val32 roundval=-1000000*dir;
      int roundpos=0;
      /* Pick the entry with the largest rounding error in the direction
         we need to move, subject to the margin and to never driving an
         entry negative. */
      for (i=0;i<N;i++) {
         if ((rounding[i]-roundval)*dir > 0 && abs(iy[i]-up*iy0[i]) < (margin-1) && !(dir==-1 && iy[i] == 0)) {
            roundval = rounding[i];
            roundpos = i;
         }
      }
      iy[roundpos] += dir;
      rounding[roundpos] -= SHL32(dir, 15);
      iysum+=dir;
   }
   RESTORE_STACK;
   return 0;
}
    468 
/** General-N QEXT search: computes the base codeword iy (K pulses), the
    up-scaled codeword up_iy (up*K pulses) and the per-dimension
    refinement offsets refine[i] = up_iy[i] - up*iy[i]. Returns the
    squared norm of up_iy, scaled down by 2*shift in fixed point. */
static opus_val32 op_pvq_search_extra(const celt_norm *X, int *iy, int *up_iy, int K, int up, int *refine, int N, int shift) {
   opus_val32 rcp_sum;
   opus_val32 sum=0;
   int i;
   int failed=0;
   opus_val64 yy=0;
   VARDECL(opus_val32, Xn);
   SAVE_STACK;
   for (i=0;i<N;i++) sum += ABS32(X[i]);
   ALLOC(Xn, N, opus_val32);
   if (sum < EPSILON)
      failed = 1;
   else {
      /* Normalise |X| to sum to one (Q31 in fixed point). */
#ifdef FIXED_POINT
      int sum_shift = 30-celt_ilog2(sum);
      rcp_sum = celt_rcp_norm32(SHL32(sum, sum_shift));
      for (i=0;i<N;i++) {
         Xn[i] = MULT32_32_Q31(SHL32(ABS32(X[i]), sum_shift), rcp_sum);
      }
#else
      rcp_sum = celt_rcp(sum);
      for (i=0;i<N;i++) {
         Xn[i] = ABS32(X[i])*rcp_sum;
      }
#endif
   }
   /* The short-circuit keeps op_pvq_refine from reading Xn when it was
      never filled in (sum < EPSILON above). */
   failed = failed || op_pvq_refine(Xn, iy, iy, K, 1, K+1, N);
   failed = failed || op_pvq_refine(Xn, up_iy, iy, up*K, up, up, N);
   if (failed) {
      /* Safe fallback: all pulses on the first dimension. */
      iy[0] = K;
      for (i=1;i<N;i++) iy[i] = 0;
      up_iy[0] = up*K;
      for (i=1;i<N;i++) up_iy[i] = 0;
   }
   for (i=0;i<N;i++) {
      yy += up_iy[i]*(opus_val64)up_iy[i];
      if (X[i] < 0) {
         /* Re-apply the signs dropped by ABS32() above. */
         iy[i] = -iy[i];
         up_iy[i] = -up_iy[i];
      }
      refine[i] = up_iy[i]-up*iy[i];
   }
   RESTORE_STACK;
#ifdef FIXED_POINT
   return (yy + (1<<2*shift>>1))>>2*shift;
#else
   (void)shift;
   return yy;
#endif
}
    519 #endif
    520 
    521 #ifdef ENABLE_QEXT
    522 /* Take advantage of the fact that "large" refine values are much less likely
    523   than smaller ones. */
    524 static void ec_enc_refine(ec_enc *enc, opus_int32 refine, opus_int32 up, int extra_bits, int use_entropy) {
    525   int large;
    526   large = abs(refine)>up/2;
    527   ec_enc_bit_logp(enc, large, use_entropy ? 3 : 1);
    528   if (large) {
    529      ec_enc_bits(enc, refine < 0, 1);
    530      ec_enc_bits(enc, abs(refine)-up/2-1, extra_bits-1);
    531   } else {
    532      ec_enc_bits(enc, refine+up/2, extra_bits);
    533   }
    534 }
    535 
    536 static int ec_dec_refine(ec_enc *dec, opus_int32 up, int extra_bits, int use_entropy) {
    537   int large, refine;
    538   large = ec_dec_bit_logp(dec, use_entropy ? 3 : 1);
    539   if (large) {
    540      int sign = ec_dec_bits(dec, 1);
    541      refine = ec_dec_bits(dec, extra_bits-1) + up/2+1;
    542      if (sign) refine = -refine;
    543   } else {
    544      refine = (opus_int32)ec_dec_bits(dec, extra_bits)-up/2;
    545   }
    546   return refine;
    547 }
    548 #endif
    549 
/** Quantises the unit-norm vector X with K pulses, writing the coded
    pulse vector to enc. The spreading rotation is applied first; when
    resynth is non-zero the quantised vector is reconstructed back into
    X (and the rotation undone). With ENABLE_QEXT and extra_bits >= 2, a
    higher-resolution refinement is additionally coded into ext_enc.
    Returns the collapse mask: one bit per block of B indicating whether
    that block received any pulses. */
unsigned alg_quant(celt_norm *X, int N, int K, int spread, int B, ec_enc *enc,
     opus_val32 gain, int resynth
     ARG_QEXT(ec_enc *ext_enc) ARG_QEXT(int extra_bits), int arch)
{
   VARDECL(int, iy);
   opus_val32 yy;
   unsigned collapse_mask;
#ifdef ENABLE_QEXT
   int yy_shift = 0;
#endif
   SAVE_STACK;

   celt_assert2(K>0, "alg_quant() needs at least one pulse");
   celt_assert2(N>1, "alg_quant() needs at least two dimensions");

   /* Covers vectorization by up to 4. */
   ALLOC(iy, N+3, int);

   exp_rotation(X, N, 1, B, K, spread);

#ifdef ENABLE_QEXT
   if (N==2 && extra_bits >= 2) {
      /* N==2 QEXT path: a single refinement value is coded uniformly. */
      int refine;
      int up_iy[2];
      int up;
      yy_shift = IMAX(0, extra_bits-7);
      up = (1<<extra_bits)-1;
      yy = op_pvq_search_N2(X, iy, up_iy, K, up, &refine, yy_shift);
      collapse_mask = extract_collapse_mask(up_iy, N, B);
      encode_pulses(iy, N, K, enc);
      ec_enc_uint(ext_enc, refine+(up-1)/2, up);
      if (resynth)
         normalise_residual(up_iy, X, N, yy, gain, yy_shift);
   } else if (extra_bits >= 2) {
      /* General-N QEXT path: one refinement per dimension except the
         last, which is implied by the pulse budget. */
      int i;
      VARDECL(int, up_iy);
      VARDECL(int, refine);
      int up, use_entropy;
      ALLOC(up_iy, N, int);
      ALLOC(refine, N, int);
      yy_shift = IMAX(0, extra_bits-7);
      up = (1<<extra_bits)-1;
      yy = op_pvq_search_extra(X, iy, up_iy, K, up, refine, N, yy_shift);
      collapse_mask = extract_collapse_mask(up_iy, N, B);
      encode_pulses(iy, N, K, enc);
      /* Only use the entropy-coded flag if there is room for the
         worst case; otherwise fall back to raw bits. */
      use_entropy = (ext_enc->storage*8 - ec_tell(ext_enc)) > (unsigned)(N-1)*(extra_bits+3)+1;
      for (i=0;i<N-1;i++) ec_enc_refine(ext_enc, refine[i], up, extra_bits, use_entropy);
      /* The last dimension's sign is ambiguous when its base pulse count
         is zero, so code it explicitly. */
      if (iy[N-1]==0) ec_enc_bits(ext_enc, up_iy[N-1]<0, 1);
      if (resynth)
         normalise_residual(up_iy, X, N, yy, gain, yy_shift);
   } else
#endif
   {
      yy = op_pvq_search(X, iy, K, N, arch);
      collapse_mask = extract_collapse_mask(iy, N, B);
      encode_pulses(iy, N, K, enc);
      if (resynth)
         normalise_residual(iy, X, N, yy, gain, 0);
   }

   if (resynth)
      exp_rotation(X, N, -1, B, K, spread);

   RESTORE_STACK;
   return collapse_mask;
}
    616 
/** Decode pulse vector and combine the result with the pitch vector to produce
    the final normalised signal in the current band.
    Mirrors alg_quant(): decodes the pulse vector (plus the QEXT
    refinement when extra_bits >= 2), normalises it to the requested
    gain and undoes the spreading rotation. Returns the collapse mask. */
unsigned alg_unquant(celt_norm *X, int N, int K, int spread, int B,
     ec_dec *dec, opus_val32 gain
     ARG_QEXT(ec_enc *ext_dec) ARG_QEXT(int extra_bits))
{
   opus_val32 Ryy;
   unsigned collapse_mask;
   VARDECL(int, iy);
   int yy_shift=0;
   SAVE_STACK;

   celt_assert2(K>0, "alg_unquant() needs at least one pulse");
   celt_assert2(N>1, "alg_unquant() needs at least two dimensions");
   ALLOC(iy, N, int);
   Ryy = decode_pulses(iy, N, K, dec);
#ifdef ENABLE_QEXT
   if (N==2 && extra_bits >= 2) {
      int up;
      int refine;
      yy_shift = IMAX(0, extra_bits-7);
      up = (1<<extra_bits)-1;
      refine = (opus_int32)ec_dec_uint(ext_dec, up) - (up-1)/2;
      iy[0] *= up;
      iy[1] *= up;
      /* Apply the refinement while preserving the total pulse count,
         mirroring the encoder-side construction in op_pvq_search_N2(). */
      if (iy[1] == 0) {
         iy[1] = (iy[0] > 0) ? -refine : refine;
         iy[0] += (refine*(opus_int64)iy[0] > 0) ? -refine : refine;
      } else if (iy[1] > 0) {
         iy[0] += refine;
         iy[1] -= refine*(iy[0]>0?1:-1);
      } else {
         iy[0] -= refine;
         iy[1] -= refine*(iy[0]>0?1:-1);
      }
#ifdef FIXED_POINT
      Ryy = (iy[0]*(opus_val64)iy[0] + iy[1]*(opus_val64)iy[1] + (1<<2*yy_shift>>1)) >> 2*yy_shift;
#else
      Ryy = iy[0]*(opus_val64)iy[0] + iy[1]*(opus_val64)iy[1];
#endif
   } else if (extra_bits >= 2) {
      int i;
      opus_val64 yy64;
      VARDECL(int, refine);
      int up, use_entropy;
      int sign=0;
      ALLOC(refine, N, int);
      yy_shift = IMAX(0, extra_bits-7);
      up = (1<<extra_bits)-1;
      /* Must match the encoder's decision in alg_quant() exactly. */
      use_entropy = (ext_dec->storage*8 - ec_tell(ext_dec)) > (unsigned)(N-1)*(extra_bits+3)+1;
      for (i=0;i<N-1;i++) refine[i] = ec_dec_refine(ext_dec, up, extra_bits, use_entropy);
      /* The last dimension's sign is explicit only when its base pulse
         count is zero; otherwise it follows the decoded pulses. */
      if (iy[N-1]==0) sign = ec_dec_bits(ext_dec, 1);
      else sign = iy[N-1] < 0;
      for (i=0;i<N-1;i++) {
         iy[i] = iy[i]*up + refine[i];
      }
      /* The last dimension absorbs whatever pulses remain in the budget. */
      iy[N-1] = up*K;
      for (i=0;i<N-1;i++) iy[N-1] -= abs(iy[i]);
      if (sign) iy[N-1] = -iy[N-1];
      yy64 = 0;
      for (i=0;i<N;i++) yy64 += iy[i]*(opus_val64)iy[i];
#ifdef FIXED_POINT
      Ryy = (yy64 + (1<<2*yy_shift>>1)) >> 2*yy_shift;
#else
      Ryy = yy64;
#endif
   }
#endif
   normalise_residual(iy, X, N, Ryy, gain, yy_shift);
   exp_rotation(X, N, -1, B, K, spread);
   collapse_mask = extract_collapse_mask(iy, N, B);
   RESTORE_STACK;
   return collapse_mask;
}
    691 
    692 #ifndef OVERRIDE_renormalise_vector
/** Rescales X in place so its L2 norm equals gain.
    NOTE(review): in the float build k is never assigned — presumably
    VSHR32/PSHR32 ignore their shift argument there so k never expands;
    confirm against arch.h. */
void renormalise_vector(celt_norm *X, int N, opus_val32 gain, int arch)
{
   int i;
#ifdef FIXED_POINT
   int k;
#endif
   opus_val32 E;
   opus_val16 g;
   opus_val32 t;
   celt_norm *xptr;
   /* Work in Q14 so the energy accumulation and the 16x16 multiply
      below have headroom; undo the scaling at the end. */
   norm_scaledown(X, N, NORM_SHIFT-14);
   /* EPSILON guards against a zero vector (division by zero in rsqrt). */
   E = EPSILON + celt_inner_prod_norm(X, X, N, arch);
#ifdef FIXED_POINT
   k = celt_ilog2(E)>>1;
#endif
   /* Bring E into the range expected by celt_rsqrt_norm, then fold the
      requested gain into the reciprocal square root. */
   t = VSHR32(E, 2*(k-7));
   g = MULT32_32_Q31(celt_rsqrt_norm(t),gain);

   xptr = X;
   for (i=0;i<N;i++)
   {
      *xptr = EXTRACT16(PSHR32(MULT16_16(g, *xptr), k+15-14));
      xptr++;
   }
   norm_scaleup(X, N, NORM_SHIFT-14);
   /*return celt_sqrt(E);*/
}
    721 
/** Computes the stereo angle parameter itheta = atan2(side, mid) from
    the energies of two vectors. When stereo is set, X/Y are the two
    channels and are folded to mid/side first; otherwise their energies
    are compared directly. The float build rescales the result to match
    the fixed-point representation of celt_atan2p_norm. */
opus_int32 stereo_itheta(const celt_norm *X, const celt_norm *Y, int stereo, int N, int arch)
{
   int i;
   int itheta;
   opus_val32 mid, side;
   opus_val32 Emid, Eside;

   Emid = Eside = 0;
   if (stereo)
   {
      for (i=0;i<N;i++)
      {
         celt_norm m, s;
         /* Downscale the sum/difference so the 16x16 MACs below have
            headroom. */
         m = PSHR32(ADD32(X[i], Y[i]), NORM_SHIFT-13);
         s = PSHR32(SUB32(X[i], Y[i]), NORM_SHIFT-13);
         Emid = MAC16_16(Emid, m, m);
         Eside = MAC16_16(Eside, s, s);
      }
   } else {
      Emid += celt_inner_prod_norm_shift(X, X, N, arch);
      Eside += celt_inner_prod_norm_shift(Y, Y, N, arch);
   }
   mid = celt_sqrt32(Emid);
   side = celt_sqrt32(Eside);
#if defined(FIXED_POINT)
   itheta = celt_atan2p_norm(side, mid);
#else
   /* Scale the float atan2 result to the fixed-point integer range. */
   itheta = (int)floor(.5f+65536.f*16384*celt_atan2p_norm(side,mid));
#endif

   return itheta;
}
    754 
    755 #ifdef ENABLE_QEXT
    756 
/** Reconstructs a normalised vector X from the per-dimension cubic
    quantisation indices iy. The "face" dimension is set to +/-K
    according to sign; every other dimension maps its index onto the odd
    grid (1+2*iy[i])-K. The vector is then normalised to unit norm and
    scaled by gain. NOTE(review): shift only exists in the fixed-point
    build; presumably PSHR32 ignores its shift argument in float so the
    reference below never expands — confirm against arch.h. */
static void cubic_synthesis(celt_norm *X, int *iy, int N, int K, int face, int sign, opus_val32 gain) {
   int i;
   opus_val32 sum=0;
   opus_val32 mag;
#ifdef FIXED_POINT
   int sum_shift;
   /* Headroom so the energy accumulation below cannot overflow. */
   int shift = IMAX(celt_ilog2(K) + celt_ilog2(N)/2 - 13, 0);
#endif
   for (i=0;i<N;i++) {
      X[i] = (1+2*iy[i])-K;
   }
   X[face] = sign ? -K : K;
   for (i=0;i<N;i++) {
      sum += PSHR32(MULT16_16(X[i],X[i]), 2*shift);
   }
#ifdef FIXED_POINT
   /* Normalise sum into rsqrt range, then apply 1/sqrt(sum) and gain. */
   sum_shift = (29-celt_ilog2(sum))>>1;
   mag = celt_rsqrt_norm32(SHL32(sum, 2*sum_shift+1));
   for (i=0;i<N;i++) {
      X[i] = VSHR32(MULT16_32_Q15(X[i],MULT32_32_Q31(mag,gain)), shift-sum_shift+29-NORM_SHIFT);
   }
#else
   mag = 1.f/sqrt(sum);
   for (i=0;i<N;i++) {
      X[i] *= mag*gain;
   }
#endif
}
    785 
/** Encodes X with a per-dimension "cubic" quantiser at res bits per
    dimension: the index and sign of the largest-magnitude component
    (the cube face) are coded first, then every other component is coded
    with res raw bits relative to it. When resynth is set, X is replaced
    by the decoded vector. Returns the collapse mask (all B blocks
    marked active, or 0 when nothing was coded). */
unsigned cubic_quant(celt_norm *X, int N, int res, int B, ec_enc *enc, opus_val32 gain, int resynth) {
   int i;
   int face=0;
   int K;
   VARDECL(int, iy);
   celt_norm faceval=-1;
   opus_val32 norm;
   int sign;
   SAVE_STACK;
   ALLOC(iy, N, int);
   K = 1<<res;
   /* Using odd K on transients to avoid adding pre-echo. */
   if (B!=1) K=IMAX(1, K-1);
   if (K==1) {
      /* Nothing codable at this resolution: the band is silent. */
      if (resynth) OPUS_CLEAR(X, N);
      RESTORE_STACK;
      return 0;
   }
   /* Find the largest-magnitude dimension; it defines the cube face. */
   for (i=0;i<N;i++) {
      if (ABS32(X[i]) > faceval) {
         faceval = ABS32(X[i]);
         face = i;
      }
   }
   sign = X[face]<0;
   ec_enc_uint(enc, face, N);
   ec_enc_bits(enc, sign, 1);
#ifdef FIXED_POINT
   if (faceval != 0) {
      /* norm ~= K/(2*faceval); maps X[i]+faceval into [0, K). */
      int face_shift = 30-celt_ilog2(faceval);
      norm = celt_rcp_norm32(SHL32(faceval, face_shift));
      norm = MULT16_32_Q15(K, norm);
      for (i=0;i<N;i++) {
         /* By computing X[i]+faceval inside the shift, the result is guaranteed non-negative. */
         iy[i] = IMIN(K-1, (MULT32_32_Q31(SHL32(X[i]+faceval, face_shift-1), norm)) >> 15);
      }
   } else {
      OPUS_CLEAR(iy, N);
   }
#else
   norm = .5f*K/(faceval+EPSILON);
   for (i=0;i<N;i++) {
      iy[i] = IMIN(K-1, (int)floor((X[i]+faceval)*norm));
   }
#endif
   for (i=0;i<N;i++) {
      /* The face dimension is fully determined by face/sign; skip it. */
      if (i != face) ec_enc_bits(enc, iy[i], res);
   }
   if (resynth) {
      cubic_synthesis(X, iy, N, K, face, sign, gain);
   }
   RESTORE_STACK;
   return (1<<B)-1;
}
    840 
    841 unsigned cubic_unquant(celt_norm *X, int N, int res, int B, ec_dec *dec, opus_val32 gain) {
    842   int i;
    843   int face;
    844   int sign;
    845   int K;
    846   VARDECL(int, iy);
    847   SAVE_STACK;
    848   ALLOC(iy, N, int);
    849   K = 1<<res;
    850   /* Using odd K on transients to avoid adding pre-echo. */
    851   if (B!=1) K=IMAX(1, K-1);
    852   if (K==1) {
    853      OPUS_CLEAR(X, N);
    854      RESTORE_STACK;
    855      return 0;
    856   }
    857   face = ec_dec_uint(dec, N);
    858   sign = ec_dec_bits(dec, 1);
    859   for (i=0;i<N;i++) {
    860      if (i != face) iy[i] = ec_dec_bits(dec, res);
    861   }
    862   iy[face]=0;
    863   cubic_synthesis(X, iy, N, K, face, sign, gain);
    864   RESTORE_STACK;
    865   return (1<<B)-1;
    866 }
    867 #endif