tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

NSQ_del_dec_avx2.c (52645B)


      1 /***********************************************************************
      2 Copyright (c) 2021 Google Inc.
      3 Redistribution and use in source and binary forms, with or without
      4 modification, are permitted provided that the following conditions
      5 are met:
      6 - Redistributions of source code must retain the above copyright notice,
      7 this list of conditions and the following disclaimer.
      8 - Redistributions in binary form must reproduce the above copyright
      9 notice, this list of conditions and the following disclaimer in the
     10 documentation and/or other materials provided with the distribution.
     11 - Neither the name of Internet Society, IETF or IETF Trust, nor the
     12 names of specific contributors, may be used to endorse or promote
     13 products derived from this software without specific prior written
     14 permission.
     15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
     16 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     17 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     18 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
     19 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     20 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     21 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     22 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     23 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     24 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     25 POSSIBILITY OF SUCH DAMAGE.
     26 ***********************************************************************/
     27 
     28 #ifdef HAVE_CONFIG_H
     29 #include "config.h"
     30 #endif
     31 
     32 #ifdef OPUS_CHECK_ASM
     33 #include <string.h>
     34 #endif
     35 
     36 #include "opus_defines.h"
     37 #include <immintrin.h>
     38 
     39 #include "main.h"
     40 #include "stack_alloc.h"
     41 #include "NSQ.h"
     42 #include "celt/x86/x86cpu.h"
     43 
     44 /* Returns TRUE if all assumptions met */
     45 static OPUS_INLINE int verify_assumptions(const silk_encoder_state *psEncC)
     46 {
     47    /* This optimization is based on these assumptions        */
     48    /* These assumptions are fundamental and hence assert are */
     49    /* used. Should any assert triggers, we have to re-visit  */
     50    /* all related code to make sure it still functions the   */
     51    /* same as the C implementation.                          */
     52    silk_assert(MAX_DEL_DEC_STATES  <= 4      &&
     53                MAX_FRAME_LENGTH     % 4 == 0 &&
     54                MAX_SUB_FRAME_LENGTH % 4 == 0 &&
     55                LTP_MEM_LENGTH_MS    % 4 == 0 );
     56    silk_assert(psEncC->fs_kHz ==  8 ||
     57                psEncC->fs_kHz == 12 ||
     58                psEncC->fs_kHz == 16 );
     59    silk_assert(psEncC->nb_subfr <= MAX_NB_SUBFR &&
     60                psEncC->nb_subfr > 0             );
     61    silk_assert(psEncC->nStatesDelayedDecision <= MAX_DEL_DEC_STATES &&
     62                psEncC->nStatesDelayedDecision > 0                   );
     63    silk_assert(psEncC->ltp_mem_length == psEncC->fs_kHz * LTP_MEM_LENGTH_MS);
     64 
     65    /* Regressions were observed on certain AMD Zen CPUs when      */
     66    /* nStatesDelayedDecision is 1 or 2. Ideally we should detect  */
     67    /* these CPUs and enable this optimization on others; however, */
     68    /* there is no good way to do so under current OPUS framework. */
     69    return psEncC->nStatesDelayedDecision == 3 ||
     70           psEncC->nStatesDelayedDecision == 4;
     71 }
     72 
     73 /* Intrinsics not defined on MSVC */
     74 #ifdef _MSC_VER
     75 #include <intsafe.h>
     76 static inline int __builtin_sadd_overflow(opus_int32 a, opus_int32 b, opus_int32* res)
     77 {
     78    *res = a+b;
     79    return (*res ^ a) & (*res ^ b) & 0x80000000;
     80 }
     81 static inline int __builtin_ctz(unsigned int x)
     82 {
     83    DWORD res = 0;
     84    return _BitScanForward(&res, x) ? res : 32;
     85 }
     86 #endif
     87 
     88 static OPUS_INLINE __m128i silk_cvtepi64_epi32_high(__m256i num)
     89 {
     90    return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(num, _mm256_set_epi32(0, 0, 0, 0, 7, 5, 3, 1)));
     91 }
     92 
     93 static OPUS_INLINE opus_int16 silk_sat16(opus_int32 num)
     94 {
     95    num = num > silk_int16_MAX ? silk_int16_MAX : num;
     96    num = num < silk_int16_MIN ? silk_int16_MIN : num;
     97    return num;
     98 }
     99 
    100 static OPUS_INLINE opus_int32 silk_sar_round_32(opus_int32 a, int bits)
    101 {
    102    silk_assert(bits > 0 && bits < 31);
    103    a += 1 << (bits-1);
    104    return a >> bits;
    105 }
    106 
    107 static OPUS_INLINE opus_int64 silk_sar_round_smulww(opus_int32 a, opus_int32 b, int bits)
    108 {
    109 #ifndef OPUS_CHECK_ASM
    110    opus_int64 t;
    111 #endif
    112    silk_assert(bits > 0 && bits < 63);
    113 #ifdef OPUS_CHECK_ASM
    114    return silk_RSHIFT_ROUND(silk_SMULWW(a, b), bits);
    115 #else
    116    /* This code is more correct, but it won't overflow like the C code in some rare cases. */
    117    silk_assert(bits > 0 && bits < 63);
    118    t = ((opus_int64)a) * ((opus_int64)b);
    119    bits += 16;
    120    t += 1ull << (bits-1);
    121    return t >> bits;
    122 #endif
    123 }
    124 
    125 static OPUS_INLINE opus_int32 silk_add_sat32(opus_int32 a, opus_int32 b)
    126 {
    127    opus_int32 sum;
    128    if (__builtin_sadd_overflow(a, b, &sum))
    129    {
    130        return a >= 0 ? silk_int32_MAX : silk_int32_MIN;
    131    }
    132    return sum;
    133 }
    134 
    135 static OPUS_INLINE __m128i silk_mm_srai_round_epi32(__m128i a, int bits)
    136 {
    137    silk_assert(bits > 0 && bits < 31);
    138    return _mm_srai_epi32(_mm_add_epi32(a, _mm_set1_epi32(1 << (bits - 1))), bits);
    139 }
    140 
    141 /* add/subtract with output saturated */
    142 static OPUS_INLINE __m128i silk_mm_add_sat_epi32(__m128i a, __m128i b)
    143 {
    144    __m128i r = _mm_add_epi32(a, b);
    145    __m128i OF = _mm_and_si128(_mm_xor_si128(a, r), _mm_xor_si128(b, r));           /* OF = (sum ^ a) & (sum ^ b)   */
    146    __m128i SAT = _mm_add_epi32(_mm_srli_epi32(a, 31), _mm_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF */
    147    return _mm_blendv_epi8(r, SAT, _mm_srai_epi32(OF, 31));
    148 }
    149 static OPUS_INLINE __m128i silk_mm_sub_sat_epi32(__m128i a, __m128i b)
    150 {
    151    __m128i r = _mm_sub_epi32(a, b);
    152    __m128i OF = _mm_andnot_si128(_mm_xor_si128(b, r), _mm_xor_si128(a, r));        /* OF = (sum ^ a) & (sum ^ ~b) = (sum ^ a) & ~(sum ^ b) */
    153    __m128i SAT = _mm_add_epi32(_mm_srli_epi32(a, 31), _mm_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF                         */
    154    return _mm_blendv_epi8(r, SAT, _mm_srai_epi32(OF, 31));
    155 }
    156 static OPUS_INLINE __m256i silk_mm256_sub_sat_epi32(__m256i a, __m256i b)
    157 {
    158    __m256i r = _mm256_sub_epi32(a, b);
    159    __m256i OF = _mm256_andnot_si256(_mm256_xor_si256(b, r), _mm256_xor_si256(a, r));        /* OF = (sum ^ a) & (sum ^ ~b) = (sum ^ a) & ~(sum ^ b) */
    160    __m256i SAT = _mm256_add_epi32(_mm256_srli_epi32(a, 31), _mm256_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF                         */
    161    return _mm256_blendv_epi8(r, SAT, _mm256_srai_epi32(OF, 31));
    162 }
    163 
    164 static OPUS_INLINE __m128i silk_mm_limit_epi32(__m128i num, opus_int32 limit1, opus_int32 limit2)
    165 {
    166    opus_int32 lo = limit1 < limit2 ? limit1 : limit2;
    167    opus_int32 hi = limit1 > limit2 ? limit1 : limit2;
    168 
    169    num = _mm_min_epi32(num, _mm_set1_epi32(hi));
    170    num = _mm_max_epi32(num, _mm_set1_epi32(lo));
    171    return num;
    172 }
    173 
    174 /* cond < 0 ? -num : num */
    175 static OPUS_INLINE __m128i silk_mm_sign_epi32(__m128i num, __m128i cond)
    176 {
    177    return _mm_sign_epi32(num, _mm_or_si128(cond, _mm_set1_epi32(1)));
    178 }
    179 static OPUS_INLINE __m256i silk_mm256_sign_epi32(__m256i num, __m256i cond)
    180 {
    181    return _mm256_sign_epi32(num, _mm256_or_si256(cond, _mm256_set1_epi32(1)));
    182 }
    183 
    184 /* (a32 * b32) >> 16 */
    185 static OPUS_INLINE __m128i silk_mm_smulww_epi32(__m128i a, opus_int32 b)
    186 {
    187    return silk_cvtepi64_epi32_high(_mm256_slli_epi64(_mm256_mul_epi32(_mm256_cvtepi32_epi64(a), _mm256_set1_epi32(b)), 16));
    188 }
    189 
    190 /* (a32 * (opus_int32)((opus_int16)(b32))) >> 16 output have to be 32bit int */
    191 static OPUS_INLINE __m128i silk_mm_smulwb_epi32(__m128i a, opus_int32 b)
    192 {
    193    return silk_cvtepi64_epi32_high(_mm256_mul_epi32(_mm256_cvtepi32_epi64(a), _mm256_set1_epi32((opus_uint32)b<<16)));
    194 }
    195 
    196 /* (opus_int32)((opus_int16)(a3))) * (opus_int32)((opus_int16)(b32)) output have to be 32bit int */
    197 static OPUS_INLINE __m256i silk_mm256_smulbb_epi32(__m256i a, __m256i b)
    198 {
    199    const char FF = (char)0xFF;
    200    __m256i msk = _mm256_set_epi8(
    201        FF, FF, FF, FF, FF, FF, FF, FF, 13, 12, 9, 8, 5, 4, 1, 0,
    202        FF, FF, FF, FF, FF, FF, FF, FF, 13, 12, 9, 8, 5, 4, 1, 0);
    203    __m256i lo = _mm256_mullo_epi16(a, b);
    204    __m256i hi = _mm256_mulhi_epi16(a, b);
    205    lo = _mm256_shuffle_epi8(lo, msk);
    206    hi = _mm256_shuffle_epi8(hi, msk);
    207    return _mm256_unpacklo_epi16(lo, hi);
    208 }
    209 
    210 static OPUS_INLINE __m256i silk_mm256_reverse_epi32(__m256i v)
    211 {
    212    v = _mm256_shuffle_epi32(v, 0x1B);
    213    v = _mm256_permute4x64_epi64(v, 0x4E);
    214    return v;
    215 }
    216 
    217 static OPUS_INLINE opus_int32 silk_mm256_hsum_epi32(__m256i v)
    218 {
    219    __m128i sum = _mm_add_epi32(_mm256_extracti128_si256(v, 1), _mm256_extracti128_si256(v, 0));
    220    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E));
    221    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1));
    222    return _mm_cvtsi128_si32(sum);
    223 }
    224 
    225 static OPUS_INLINE __m128i silk_mm_hmin_epi32(__m128i num)
    226 {
    227    num = _mm_min_epi32(num, _mm_shuffle_epi32(num, 0x4E)); /* 0123 -> 2301 */
    228    num = _mm_min_epi32(num, _mm_shuffle_epi32(num, 0xB1)); /* 0123 -> 1032 */
    229    return num;
    230 }
    231 
    232 static OPUS_INLINE __m128i silk_mm_hmax_epi32(__m128i num)
    233 {
    234    num = _mm_max_epi32(num, _mm_shuffle_epi32(num, 0x4E)); /* 0123 -> 2310 */
    235    num = _mm_max_epi32(num, _mm_shuffle_epi32(num, 0xB1)); /* 0123 -> 1032 */
    236    return num;
    237 }
    238 
    239 static OPUS_INLINE __m128i silk_mm_mask_hmin_epi32(__m128i num, __m128i mask)
    240 {
    241    num = _mm_blendv_epi8(num, _mm_set1_epi32(silk_int32_MAX), mask);
    242    return silk_mm_hmin_epi32(num);
    243 }
    244 
    245 static OPUS_INLINE __m128i silk_mm_mask_hmax_epi32(__m128i num, __m128i mask)
    246 {
    247    num = _mm_blendv_epi8(num, _mm_set1_epi32(silk_int32_MIN), mask);
    248    return silk_mm_hmax_epi32(num);
    249 }
    250 
    251 static OPUS_INLINE __m128i silk_mm256_rand_epi32(__m128i seed)
    252 {
    253    seed = _mm_mullo_epi32(seed, _mm_set1_epi32(RAND_MULTIPLIER));
    254    seed = _mm_add_epi32(seed, _mm_set1_epi32(RAND_INCREMENT));
    255    return seed;
    256 }
    257 
    258 static OPUS_INLINE opus_int32 silk_index_of_first_equal_epi32(__m128i a, __m128i b)
    259 {
    260    unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi32(a, b)) & 0x1111;
    261    silk_assert(mask != 0);
    262    return __builtin_ctz(mask) >> 2;
    263 }
    264 
    265 static __m128i silk_index_to_selector(opus_int32 index)
    266 {
    267    silk_assert(index < 4);
    268    index <<= 2;
    269    return _mm_set_epi8(
    270        index + 3, index + 2, index + 1, index + 0,
    271        index + 3, index + 2, index + 1, index + 0,
    272        index + 3, index + 2, index + 1, index + 0,
    273        index + 3, index + 2, index + 1, index + 0);
    274 }
    275 
    276 static opus_int32 silk_select_winner(__m128i num, __m128i selector)
    277 {
    278    return _mm_cvtsi128_si32(_mm_shuffle_epi8(num, selector));
    279 }
    280 
/* One entry of the delayed-decision sample history (see Samples[] in      */
/* NSQ_del_dec_struct). Every __m128i holds four 32-bit values, one per    */
/* delayed-decision state; the winning state's lane is extracted with      */
/* silk_select_winner().                                                    */
typedef struct
{
   __m128i RandState; /* NOTE(review): presumably the random seed of each state at this sample - not used in the visible code, confirm */
   __m128i Q_Q10;     /* Winner's lane is rounded down by 10 bits and written to pulses[] */
   __m128i Xq_Q14;    /* Winner's lane is scaled by the gain, saturated and written to xq[] */
   __m128i Pred_Q15;  /* NOTE(review): presumably the prediction signal (Q15) - not used in the visible code, confirm */
   __m128i Shape_Q14; /* Winner's lane is written to NSQ->sLTP_shp_Q14[] */
} NSQ_del_dec_sample_struct;
    289 
/* Working state of all delayed-decision states, stored side by side: every */
/* __m128i holds four 32-bit values, one per state. silk_NSQ_del_dec_avx2() */
/* seeds the fields from silk_nsq_state and writes the winning state's      */
/* lanes back at the end of the frame.                                      */
typedef struct
{
   __m128i sLPC_Q14[MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH]; /* First NSQ_LPC_BUF_LENGTH entries are seeded from / written back to NSQ->sLPC_Q14 */
   __m128i LF_AR_Q14;                                           /* Seeded from / written back to NSQ->sLF_AR_shp_Q14 */
   __m128i Seed;                                                /* Initialised to (lane index + psIndices->Seed) & 3 */
   __m128i SeedInit;                                            /* Copy of the initial Seed; the winner's lane becomes psIndices->Seed */
   __m128i RD_Q10;                                              /* Starts at zero; the lane with the minimum value is the winner */
   __m128i Diff_Q14;                                            /* Seeded from / written back to NSQ->sDiff_shp_Q14 */
   __m128i sAR2_Q14[MAX_SHAPE_LPC_ORDER];                       /* Seeded from / written back to NSQ->sAR2_Q14 */
   NSQ_del_dec_sample_struct Samples[DECISION_DELAY];           /* Sample history, indexed modulo DECISION_DELAY */
} NSQ_del_dec_struct;
    301 
/* Forward declaration. silk_NSQ_del_dec_avx2() calls this once per        */
/* subframe, right before silk_noise_shape_quantizer_del_dec_avx2(), to    */
/* fill x_sc_Q10 and sLTP_Q15 for that subframe.                           */
static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2(
   const silk_encoder_state *psEncC,          /* I    Encoder State                   */
   silk_nsq_state *NSQ,                       /* I/O  NSQ state                       */
   NSQ_del_dec_struct *psDelDec,              /* I/O  Delayed decision states         */
   const opus_int16 x16[],                    /* I    Input                           */
   opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH], /* O    Input scaled with 1/Gain in Q10 */
   const opus_int16 sLTP[],                   /* I    Re-whitened LTP state in Q0     */
   opus_int32 sLTP_Q15[],                     /* O    LTP state matching scaled input */
   opus_int subfr,                            /* I    Subframe number                 */
   const opus_int LTP_scale_Q14,              /* I    LTP state scaling               */
   const opus_int32 Gains_Q16[MAX_NB_SUBFR],  /* I    Quantization step sizes         */
   const opus_int pitchL[MAX_NB_SUBFR],       /* I    Pitch lag                       */
   const opus_int signal_type,                /* I    Signal type                     */
   const opus_int decisionDelay               /* I    Decision delay                  */
);
    317 
/*******************************************/
/* LPC analysis filter                     */
/* NB! State is kept internally and the    */
/* filter always starts with zero state    */
/* first d output samples are set to zero  */
/*******************************************/
/* Forward declaration. NOTE(review): "d" above presumably means the       */
/* filter order - confirm against the definition. In this file it is used  */
/* to re-whiten the LTP state with the current A coefficients.             */
static OPUS_INLINE void silk_LPC_analysis_filter_avx2(
   opus_int16                  *out,               /* O    Output signal                           */
   const opus_int16            *in,                /* I    Input signal                            */
   const opus_int16            *B,                 /* I    MA prediction coefficients, Q12 [order] */
   const opus_int32            len,                /* I    Signal length                           */
   const opus_int32            order               /* I    Filter order                            */
);
    331 
/******************************************/
/* Noise shape quantizer for one subframe */
/******************************************/
/* Forward declaration. silk_NSQ_del_dec_avx2() calls this once per        */
/* subframe with that subframe's coefficients and gains; smpl_buf_idx is   */
/* shared across calls through the pointer.                                */
static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_avx2(
   silk_nsq_state *NSQ,                        /* I/O  NSQ state                          */
   NSQ_del_dec_struct psDelDec[],              /* I/O  Delayed decision states            */
   opus_int signalType,                        /* I    Signal type                        */
   const opus_int32 x_Q10[],                   /* I    Scaled input (x_sc_Q10 of caller)  */
   opus_int8 pulses[],                         /* O    Quantized pulse signal             */
   opus_int16 xq[],                            /* O    Quantized output signal            */
   opus_int32 sLTP_Q15[],                      /* I/O  LTP filter state                   */
   opus_int32 delayedGain_Q10[DECISION_DELAY], /* I/O  Gain delay buffer                  */
   const opus_int16 a_Q12[],                   /* I    Short term prediction coefs        */
   const opus_int16 b_Q14[],                   /* I    Long term prediction coefs         */
   const opus_int16 AR_shp_Q13[],              /* I    Noise shaping coefs                */
   opus_int lag,                               /* I    Pitch lag                          */
   opus_int32 HarmShapeFIRPacked_Q14,          /* I    Two values derived from the harmonic shaping gain, packed in the low/high 16 bits */
   opus_int Tilt_Q14,                          /* I    Spectral tilt                      */
   opus_int32 LF_shp_Q14,                      /* I    Low frequency shaping coef         */
   opus_int32 Gain_Q16,                        /* I    Quantization step size             */
   opus_int Lambda_Q10,                        /* I    Rate/distortion tradeoff           */
   opus_int offset_Q10,                        /* I    Quantization offset                */
   opus_int length,                            /* I    Input length                       */
   opus_int subfr,                             /* I    Subframe number                    */
   opus_int shapingLPCOrder,                   /* I    Shaping LPC filter order           */
   opus_int predictLPCOrder,                   /* I    Prediction filter order            */
   opus_int warping_Q16,                       /* I    Warping parameter (psEncC->warping_Q16 of caller) */
   __m128i MaskDelDec,                         /* I    Mask of states in decision tree    */
   opus_int *smpl_buf_idx,                     /* I/O  Index to newest samples in buffers */
   opus_int decisionDelay                      /* I    Decision delay                     */
);
    363 
/* AVX2 version of the delayed-decision noise shaping quantizer.            */
/* All delayed-decision states are processed side by side, one state per    */
/* 32-bit lane of a __m128i. When verify_assumptions() rejects the          */
/* configuration the call is forwarded unchanged to silk_NSQ_del_dec_c().   */
/* With OPUS_CHECK_ASM defined, the C version is additionally run on copies */
/* of the state and the results are asserted to be identical.               */
void silk_NSQ_del_dec_avx2(
   const silk_encoder_state *psEncC,                            /* I    Encoder State               */
   silk_nsq_state *NSQ,                                         /* I/O  NSQ state                   */
   SideInfoIndices *psIndices,                                  /* I/O  Quantization Indices        */
   const opus_int16 x16[],                                      /* I    Input                       */
   opus_int8 pulses[],                                          /* O    Quantized pulse signal      */
   const opus_int16 *PredCoef_Q12,                              /* I    Short term prediction coefs */
   const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR],      /* I    Long term prediction coefs  */
   const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER], /* I    Noise shaping coefs         */
   const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR],              /* I    Long term shaping coefs     */
   const opus_int Tilt_Q14[MAX_NB_SUBFR],                       /* I    Spectral tilt               */
   const opus_int32 LF_shp_Q14[MAX_NB_SUBFR],                   /* I    Low frequency shaping coefs */
   const opus_int32 Gains_Q16[MAX_NB_SUBFR],                    /* I    Quantization step sizes     */
   const opus_int32 pitchL[MAX_NB_SUBFR],                       /* I    Pitch lags                  */
   const opus_int Lambda_Q10,                                   /* I    Rate/distortion tradeoff    */
   const opus_int LTP_scale_Q14                                 /* I    LTP state scaling           */
)
{
#ifdef OPUS_CHECK_ASM
   /* Reference run: execute the C implementation on private copies so that */
   /* its results can be compared with the AVX2 results at the end.         */
   silk_nsq_state NSQ_c;
   SideInfoIndices psIndices_c;
   opus_int8 pulses_c[MAX_FRAME_LENGTH];
   const opus_int8 *const pulses_a = pulses; /* start of the output; 'pulses' itself is advanced below */

   silk_memcpy(&NSQ_c, NSQ, sizeof(NSQ_c));
   silk_memcpy(&psIndices_c, psIndices, sizeof(psIndices_c));
   silk_memcpy(pulses_c, pulses, sizeof(pulses_c));
   silk_NSQ_del_dec_c(psEncC, &NSQ_c, &psIndices_c, x16, pulses_c, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16,
                      pitchL, Lambda_Q10, LTP_scale_Q14);
#endif

   /* Configurations the AVX2 path does not handle go to the C implementation */
   if (!verify_assumptions(psEncC))
   {
       silk_NSQ_del_dec_c(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14);
       return;
   }

   opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
   opus_int last_smple_idx, smpl_buf_idx, decisionDelay;
   const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13;
   opus_int16 *pxq;
   VARDECL(opus_int32, sLTP_Q15);
   VARDECL(opus_int16, sLTP);
   opus_int32 HarmShapeFIRPacked_Q14;
   opus_int offset_Q10;
   opus_int32 Gain_Q10;
   opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH];
   opus_int32 delayedGain_Q10[DECISION_DELAY];
   NSQ_del_dec_struct psDelDec = {0};
   NSQ_del_dec_sample_struct *psSample;
   __m128i RDmin_Q10, MaskDelDec, Winner_selector;
   SAVE_STACK;

   /* Lane mask: all-ones in the lanes at index >= nStatesDelayedDecision   */
   /* (unused states) and zero in the lanes of the active states.           */
   MaskDelDec = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFFFFF00ul << ((psEncC->nStatesDelayedDecision - 1) << 3)));

   /* Set unvoiced lag to the previous one, overwrite later for voiced */
   lag = NSQ->lagPrev;

   silk_assert(NSQ->prev_gain_Q16 != 0);
   /* Lane i starts with the seed (i + psIndices->Seed) & 3; every other    */
   /* field starts from the same NSQ state in all lanes.                    */
   psDelDec.Seed = _mm_and_si128(
       _mm_add_epi32(_mm_set_epi32(3, 2, 1, 0), _mm_set1_epi32(psIndices->Seed)),
       _mm_set1_epi32(3));
   psDelDec.SeedInit = psDelDec.Seed;
   psDelDec.RD_Q10 = _mm_setzero_si128();
   psDelDec.LF_AR_Q14 = _mm_set1_epi32(NSQ->sLF_AR_shp_Q14);
   psDelDec.Diff_Q14 = _mm_set1_epi32(NSQ->sDiff_shp_Q14);
   psDelDec.Samples[0].Shape_Q14 = _mm_set1_epi32(NSQ->sLTP_shp_Q14[psEncC->ltp_mem_length - 1]);
   for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
   {
       psDelDec.sLPC_Q14[i] = _mm_set1_epi32(NSQ->sLPC_Q14[i]);
   }
   for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
   {
       psDelDec.sAR2_Q14[i] = _mm_set1_epi32(NSQ->sAR2_Q14[i]);
   }

   offset_Q10 = silk_Quantization_Offsets_Q10[psIndices->signalType >> 1][psIndices->quantOffsetType];
   smpl_buf_idx = 0; /* index of oldest samples */

   decisionDelay = silk_min_int(DECISION_DELAY, psEncC->subfr_length);

   /* For voiced frames limit the decision delay to lower than the pitch lag */
   if (psIndices->signalType == TYPE_VOICED)
   {
       for (k = 0; k < psEncC->nb_subfr; k++)
       {
           decisionDelay = silk_min_int(decisionDelay, pitchL[k] - LTP_ORDER / 2 - 1);
       }
   }
   else
   {
       if (lag > 0)
       {
           decisionDelay = silk_min_int(decisionDelay, lag - LTP_ORDER / 2 - 1);
       }
   }

   /* NLSFInterpCoef_Q2 == 4 clears the interpolation flag; any other value sets it */
   if (psIndices->NLSFInterpCoef_Q2 == 4)
   {
       LSF_interpolation_flag = 0;
   }
   else
   {
       LSF_interpolation_flag = 1;
   }

   ALLOC(sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32);
   ALLOC(sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16);
   /* Set up pointers to start of sub frame */
   pxq = &NSQ->xq[psEncC->ltp_mem_length];
   NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
   NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
   subfr = 0;
   for (k = 0; k < psEncC->nb_subfr; k++)
   {
       /* Per-subframe coefficient sets */
       A_Q12 = &PredCoef_Q12[((k >> 1) | (1 ^ LSF_interpolation_flag)) * MAX_LPC_ORDER];
       B_Q14 = &LTPCoef_Q14[k * LTP_ORDER];
       AR_shp_Q13 = &AR_Q13[k * MAX_SHAPE_LPC_ORDER];

       /* Noise shape parameters */
       silk_assert(HarmShapeGain_Q14[k] >= 0);
       /* Pack gain/4 into the low 16 bits and gain/2 into the high 16 bits */
       HarmShapeFIRPacked_Q14  =                          silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
       HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );

       NSQ->rewhite_flag = 0;
       if (psIndices->signalType == TYPE_VOICED)
       {
           /* Voiced */
           lag = pitchL[k];

           /* Re-whitening */
           /* True for k == 0 and k == 2 when the interpolation flag is set, */
           /* and only for k == 0 (within k < 4) when it is clear.           */
           if ((k & (3 ^ (LSF_interpolation_flag << 1))) == 0)
           {
               if (k == 2)
               {
                   /* RESET DELAYED DECISIONS */
                   /* Find winner */
                   RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec);
                   Winner_ind = silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10);
                   Winner_selector = silk_index_to_selector(Winner_ind);
                   /* Add silk_int32_MAX >> 4 to RD_Q10 of every lane except the winner's */
                   psDelDec.RD_Q10 = _mm_add_epi32(
                       psDelDec.RD_Q10,
                       _mm_blendv_epi8(
                           _mm_set1_epi32(silk_int32_MAX >> 4),
                           _mm_setzero_si128(),
                           _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(Winner_ind << 3)))));

                   /* Copy final part of signals from winner state to output and long-term filter states */
                   last_smple_idx = smpl_buf_idx + decisionDelay;
                   for (i = 0; i < decisionDelay; i++)
                   {
                       /* Step backwards through the sample history, modulo DECISION_DELAY */
                       last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY;
                       psSample = &psDelDec.Samples[last_smple_idx];
                       /* Negative offsets: these samples lie just before the current subframe */
                       pulses[i - decisionDelay] =
                           (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10);
                       pxq[i - decisionDelay] =
                           silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gains_Q16[1], 14));
                       NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] =
                           silk_select_winner(psSample->Shape_Q14, Winner_selector);
                   }

                   subfr = 0;
               }

               /* Rewhiten with new A coefs */
               start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
               silk_assert(start_idx > 0);

               silk_LPC_analysis_filter_avx2(&sLTP[start_idx], &NSQ->xq[start_idx + k * psEncC->subfr_length],
                                             A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder);

               NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
               NSQ->rewhite_flag = 1;
           }
       }

       silk_nsq_del_dec_scale_states_avx2(psEncC, NSQ, &psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,
                                          LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay);

       silk_noise_shape_quantizer_del_dec_avx2(NSQ, &psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
                                               delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[k], LF_shp_Q14[k],
                                               Gains_Q16[k], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
                                               psEncC->predictLPCOrder, psEncC->warping_Q16, MaskDelDec, &smpl_buf_idx, decisionDelay);

       /* Advance input and output pointers to the next subframe */
       x16 += psEncC->subfr_length;
       pulses += psEncC->subfr_length;
       pxq += psEncC->subfr_length;
   }

   /* Find winner */
   RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec);
   Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10));

   /* Copy final part of signals from winner state to output and long-term filter states */
   psIndices->Seed = silk_select_winner(psDelDec.SeedInit, Winner_selector);
   last_smple_idx = smpl_buf_idx + decisionDelay;
   Gain_Q10 = Gains_Q16[psEncC->nb_subfr - 1] >> 6;
   for (i = 0; i < decisionDelay; i++)
   {
       /* Step backwards through the sample history, modulo DECISION_DELAY */
       last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY;
       psSample = &psDelDec.Samples[last_smple_idx];

       /* pulses and pxq now point past the last subframe, hence the negative offsets */
       pulses[i - decisionDelay] =
           (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10);
       pxq[i - decisionDelay] =
           silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gain_Q10, 8));
       NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] =
           silk_select_winner(psSample->Shape_Q14, Winner_selector);
   }
   /* Write the winner's filter states back to the scalar NSQ state */
   for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
   {
       NSQ->sLPC_Q14[i] = silk_select_winner(psDelDec.sLPC_Q14[i], Winner_selector);
   }
   for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
   {
       NSQ->sAR2_Q14[i] = silk_select_winner(psDelDec.sAR2_Q14[i], Winner_selector);
   }

   /* Update states */
   NSQ->sLF_AR_shp_Q14 = silk_select_winner(psDelDec.LF_AR_Q14, Winner_selector);
   NSQ->sDiff_shp_Q14 = silk_select_winner(psDelDec.Diff_Q14, Winner_selector);
   NSQ->lagPrev = pitchL[psEncC->nb_subfr - 1];

   /* Save quantized speech signal */
   /* Shift both buffers down by one frame so that the newest ltp_mem_length samples sit at the start */
   silk_memmove(NSQ->xq, &NSQ->xq[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int16));
   silk_memmove(NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int32));

#ifdef OPUS_CHECK_ASM
   /* The AVX2 results must be bit-identical to the C reference run */
   silk_assert(!memcmp(&NSQ_c, NSQ, sizeof(NSQ_c)));
   silk_assert(!memcmp(&psIndices_c, psIndices, sizeof(psIndices_c)));
   silk_assert(!memcmp(pulses_c, pulses_a, sizeof(pulses_c)));
#endif

   RESTORE_STACK;
}
    599 
    600 static OPUS_INLINE __m128i silk_noise_shape_quantizer_short_prediction_x4(const __m128i *buf32, const opus_int16 *coef16, opus_int order)
    601 {
    602    __m256i out;
    603    silk_assert(order == 10 || order == 16);
    604 
    605    /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
    606    out = _mm256_set1_epi32(order >> 1);
    607    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-0]), _mm256_set1_epi32(silk_LSHIFT(coef16[0], 16)))); /* High DWORD */
    608    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-1]), _mm256_set1_epi32(silk_LSHIFT(coef16[1], 16)))); /* High DWORD */
    609    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-2]), _mm256_set1_epi32(silk_LSHIFT(coef16[2], 16)))); /* High DWORD */
    610    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-3]), _mm256_set1_epi32(silk_LSHIFT(coef16[3], 16)))); /* High DWORD */
    611    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-4]), _mm256_set1_epi32(silk_LSHIFT(coef16[4], 16)))); /* High DWORD */
    612    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-5]), _mm256_set1_epi32(silk_LSHIFT(coef16[5], 16)))); /* High DWORD */
    613    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-6]), _mm256_set1_epi32(silk_LSHIFT(coef16[6], 16)))); /* High DWORD */
    614    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-7]), _mm256_set1_epi32(silk_LSHIFT(coef16[7], 16)))); /* High DWORD */
    615    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-8]), _mm256_set1_epi32(silk_LSHIFT(coef16[8], 16)))); /* High DWORD */
    616    out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-9]), _mm256_set1_epi32(silk_LSHIFT(coef16[9], 16)))); /* High DWORD */
    617 
    618    if (order == 16)
    619    {
    620        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-10]), _mm256_set1_epi32(silk_LSHIFT(coef16[10], 16)))); /* High DWORD */
    621        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-11]), _mm256_set1_epi32(silk_LSHIFT(coef16[11], 16)))); /* High DWORD */
    622        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-12]), _mm256_set1_epi32(silk_LSHIFT(coef16[12], 16)))); /* High DWORD */
    623        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-13]), _mm256_set1_epi32(silk_LSHIFT(coef16[13], 16)))); /* High DWORD */
    624        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-14]), _mm256_set1_epi32(silk_LSHIFT(coef16[14], 16)))); /* High DWORD */
    625        out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-15]), _mm256_set1_epi32(silk_LSHIFT(coef16[15], 16)))); /* High DWORD */
    626    }
    627    return silk_cvtepi64_epi32_high(out);
    628 }
    629 
    630 /******************************************/
    631 /* Noise shape quantizer for one subframe */
    632 /******************************************/
        /*
         * Runs the delayed-decision quantizer over `length` samples. The parallel
         * decision states live one per 32-bit lane in the __m128i members of
         * psDelDec. For every sample the loop:
         *   1. computes the scalar long-term prediction and long-term shaping terms,
         *   2. computes short-term prediction and noise-shaping feedback per lane,
         *   3. derives two quantization candidates per lane and their RD costs,
         *      keeping the cheaper one in the low 128 bits and the other one in the
         *      high 128 bits of the SS_* vectors,
         *   4. selects the winning lane, adds a penalty to lanes whose RandState in
         *      the oldest buffered sample differs from the winner's, and, if a
         *      second-set candidate beats the worst first-set one, copies that
         *      state over the worst lane,
         *   5. when (subfr > 0 || i >= decisionDelay), writes the delayed winner
         *      sample to pulses[], xq[], NSQ->sLTP_shp_Q14[] and sLTP_Q15[],
         *   6. stores the low 128 bits of the SS_* vectors as the new lane state.
         * After the loop the last NSQ_LPC_BUF_LENGTH LPC state vectors are moved to
         * the front of psDelDec->sLPC_Q14.
         */
    633 static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_avx2(
    634    silk_nsq_state *NSQ,                        /* I/O  NSQ state                          */
    635    NSQ_del_dec_struct *psDelDec,               /* I/O  Delayed decision states            */
    636    opus_int signalType,                        /* I    Signal type                        */
    637    const opus_int32 x_Q10[],                   /* I                                       */
    638    opus_int8 pulses[],                         /* O                                       */
    639    opus_int16 xq[],                            /* O                                       */
    640    opus_int32 sLTP_Q15[],                      /* I/O  LTP filter state                   */
    641    opus_int32 delayedGain_Q10[DECISION_DELAY], /* I/O  Gain delay buffer                  */
    642    const opus_int16 a_Q12[],                   /* I    Short term prediction coefs        */
    643    const opus_int16 b_Q14[],                   /* I    Long term prediction coefs         */
    644    const opus_int16 AR_shp_Q13[],              /* I    Noise shaping coefs                */
    645    opus_int lag,                               /* I    Pitch lag                          */
    646    opus_int32 HarmShapeFIRPacked_Q14,          /* I                                       */
    647    opus_int Tilt_Q14,                          /* I    Spectral tilt                      */
    648    opus_int32 LF_shp_Q14,                      /* I                                       */
    649    opus_int32 Gain_Q16,                        /* I                                       */
    650    opus_int Lambda_Q10,                        /* I                                       */
    651    opus_int offset_Q10,                        /* I                                       */
    652    opus_int length,                            /* I    Input length                       */
    653    opus_int subfr,                             /* I    Subframe number                    */
    654    opus_int shapingLPCOrder,                   /* I    Shaping LPC filter order           */
    655    opus_int predictLPCOrder,                   /* I    Prediction filter order            */
    656    opus_int warping_Q16,                       /* I                                       */
    657    __m128i MaskDelDec,                         /* I    Mask of states in decision tree    */
    658    opus_int *smpl_buf_idx,                     /* I/O  Index to newest samples in buffers */
    659    opus_int decisionDelay                      /* I                                       */
    660 )
    661 {
    662    int i;
           /* Read pointers into the shaping / prediction history, positioned `lag` samples */
           /* behind the current write index plus half the respective FIR length.           */
    663    opus_int32 *shp_lag_ptr = &NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2];
    664    opus_int32 *pred_lag_ptr = &sLTP_Q15[NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2];
    665    opus_int32 Gain_Q10 = Gain_Q16 >> 6;
    666 
    667    for (i = 0; i < length; i++)
    668    {
    669        /* Perform common calculations used in all states */
    670        /* NSQ_sample_struct */
    671        /* Low  128 bits => 1st set */
    672        /* High 128 bits => 2nd set */
    673        int j;
    674        __m256i SS_Q_Q10;
    675        __m256i SS_RD_Q10;
    676        __m256i SS_xq_Q14;
    677        __m256i SS_LF_AR_Q14;
    678        __m256i SS_Diff_Q14;
    679        __m256i SS_sLTP_shp_Q14;
    680        __m256i SS_LPC_exc_Q14;
    681        __m256i exc_Q14;
    682        __m256i q_Q10, rr_Q10, rd_Q10;
    683        __m256i mask;
    684        __m128i LPC_pred_Q14, n_AR_Q14;
    685        __m128i RDmin_Q10, RDmax_Q10;
    686        __m128i n_LF_Q14;
    687        __m128i r_Q10, q1_Q0, q1_Q10, q2_Q10;
    688        __m128i Winner_rand_state, Winner_selector;
    689        __m128i tmp0, tmp1;
    690        NSQ_del_dec_sample_struct *psLastSample, *psSample;
    691        opus_int32 RDmin_ind, RDmax_ind, last_smple_idx;
    692        opus_int32 LTP_pred_Q14, n_LTP_Q14;
    693 
    694        /* Long-term prediction */
               /* Scalar: the same value is broadcast to every lane further down. */
    695        if (signalType == TYPE_VOICED)
    696        {
    697            /* Unrolled loop */
    698            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
    699            LTP_pred_Q14 = 2;
    700            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-0], b_Q14[0]);
    701            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-1], b_Q14[1]);
    702            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-2], b_Q14[2]);
    703            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-3], b_Q14[3]);
    704            LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-4], b_Q14[4]);
    705            LTP_pred_Q14 = silk_LSHIFT(LTP_pred_Q14, 1); /* Q13 -> Q14 */
    706            pred_lag_ptr++;
    707        }
    708        else
    709        {
    710            LTP_pred_Q14 = 0;
    711        }
    712 
    713        /* Long-term shaping */
    714        if (lag > 0)
    715        {
    716            /* Symmetric, packed FIR coefficients */
                   /* The outer taps use the bottom half (SMULWB) and the centre tap the top half (SMULWT) of the packed word. */
    717            n_LTP_Q14 = silk_add_sat32(shp_lag_ptr[0], shp_lag_ptr[-2]);
    718            n_LTP_Q14 = silk_SMULWB(n_LTP_Q14, HarmShapeFIRPacked_Q14);
    719            n_LTP_Q14 = n_LTP_Q14 + silk_SMULWT(shp_lag_ptr[-1], HarmShapeFIRPacked_Q14);
    720            n_LTP_Q14 = LTP_pred_Q14 - (silk_LSHIFT(n_LTP_Q14, 2)); /* Q12 -> Q14 */
    721            shp_lag_ptr++;
    722        }
    723        else
    724        {
    725            n_LTP_Q14 = 0;
    726        }
    727 
    728        /* BEGIN Updating Delayed Decision States */
    729 
    730        /* Generate dither */
    731        psDelDec->Seed = silk_mm256_rand_epi32(psDelDec->Seed);
    732 
    733        /* Short-term prediction */
    734        LPC_pred_Q14 = silk_noise_shape_quantizer_short_prediction_x4(&psDelDec->sLPC_Q14[NSQ_LPC_BUF_LENGTH - 1 + i], a_Q12, predictLPCOrder);
    735        LPC_pred_Q14 = _mm_slli_epi32(LPC_pred_Q14, 4); /* Q10 -> Q14 */
    736 
    737        /* Noise shape feedback */
    738        silk_assert(shapingLPCOrder > 0);
    739        silk_assert((shapingLPCOrder & 1) == 0); /* check that order is even */
    740        /* Output of lowpass section */
    741        tmp0 = _mm_add_epi32(psDelDec->Diff_Q14, silk_mm_smulwb_epi32(psDelDec->sAR2_Q14[0], warping_Q16));
               /* Accumulator seeded with order/2, as in the short-term predictor */
    742        n_AR_Q14 = _mm_set1_epi32(shapingLPCOrder >> 1);
    743        for (j = 0; j < shapingLPCOrder - 1; j++)
    744        {
    745            /* Output of allpass section */
                   /* sAR2_Q14[j] is overwritten in the same pass, so its old value is kept in tmp1 first. */
    746            tmp1 = psDelDec->sAR2_Q14[j];
    747            psDelDec->sAR2_Q14[j] = tmp0;
    748            n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(tmp0, AR_shp_Q13[j]));
    749            tmp0 = _mm_add_epi32(tmp1, silk_mm_smulwb_epi32(_mm_sub_epi32(psDelDec->sAR2_Q14[j + 1], tmp0), warping_Q16));
    750        }
    751        psDelDec->sAR2_Q14[shapingLPCOrder - 1] = tmp0;
    752        n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(tmp0, AR_shp_Q13[shapingLPCOrder - 1]));
    753 
    754        n_AR_Q14 = _mm_slli_epi32(n_AR_Q14, 1);                                                  /* Q11 -> Q12 */
    755        n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(psDelDec->LF_AR_Q14, Tilt_Q14)); /* Q12 */
    756        n_AR_Q14 = _mm_slli_epi32(n_AR_Q14, 2);                                                  /* Q12 -> Q14 */
    757 
               /* Low-frequency shaping: the two halves of LF_shp_Q14 act as separate coefficients, */
               /* one applied to the newest buffered Shape_Q14 and one (>> 16) to LF_AR_Q14.        */
    758        tmp0 = silk_mm_smulwb_epi32(psDelDec->Samples[*smpl_buf_idx].Shape_Q14, LF_shp_Q14); /* Q12 */
    759        tmp1 = silk_mm_smulwb_epi32(psDelDec->LF_AR_Q14, LF_shp_Q14 >> 16);                  /* Q12 */
    760        n_LF_Q14 = _mm_add_epi32(tmp0, tmp1);                                                /* Q12 */
    761        n_LF_Q14 = _mm_slli_epi32(n_LF_Q14, 2);                                              /* Q12 -> Q14 */
    762 
    763        /* Input minus prediction plus noise feedback                       */
    764        /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP  */
    765        tmp0 = silk_mm_add_sat_epi32(n_AR_Q14, n_LF_Q14);              /* Q14 */
    766        tmp1 = _mm_add_epi32(_mm_set1_epi32(n_LTP_Q14), LPC_pred_Q14); /* Q14 */
    767        tmp0 = silk_mm_sub_sat_epi32(tmp1, tmp0);                      /* Q14 */
    768        tmp0 = silk_mm_srai_round_epi32(tmp0, 4);                      /* Q10 */
    769 
    770        r_Q10 = _mm_sub_epi32(_mm_set1_epi32(x_Q10[i]), tmp0); /* residual error Q10 */
    771 
    772        /* Flip sign depending on dither */
    773        r_Q10 = silk_mm_sign_epi32(r_Q10, psDelDec->Seed);
    774        r_Q10 = silk_mm_limit_epi32(r_Q10, -(31 << 10), 30 << 10);
    775 
    776        /* Find two quantization level candidates and measure their rate-distortion */
    777        q1_Q10 = _mm_sub_epi32(r_Q10, _mm_set1_epi32(offset_Q10));
    778        q1_Q0 = _mm_srai_epi32(q1_Q10, 10);
    779        if (Lambda_Q10 > 2048)
    780        {
    781            /* For aggressive RDO, the bias becomes more than one pulse. */
                   /* Lanes with |q1_Q10| above the rdo offset take the sign-restored remainder >> 10; */
                   /* the other lanes take q1_Q10 >> 31, i.e. 0 or -1.                                 */
    782            tmp0 = _mm_sub_epi32(_mm_abs_epi32(q1_Q10), _mm_set1_epi32(Lambda_Q10 / 2 - 512)); /* rdo_offset */
    783            q1_Q0 = _mm_srai_epi32(q1_Q10, 31);
    784            tmp1 = _mm_cmpgt_epi32(tmp0, _mm_setzero_si128());
    785            tmp0 = _mm_srai_epi32(silk_mm_sign_epi32(tmp0, q1_Q10), 10);
    786            q1_Q0 = _mm_blendv_epi8(q1_Q0, tmp0, tmp1);
    787        }
    788 
    789        tmp0 = _mm_sign_epi32(_mm_set1_epi32(QUANT_LEVEL_ADJUST_Q10), q1_Q0);
    790        q1_Q10 = _mm_sub_epi32(_mm_slli_epi32(q1_Q0, 10), tmp0);
    791        q1_Q10 = _mm_add_epi32(q1_Q10, _mm_set1_epi32(offset_Q10));
    792 
    793        /* check if q1_Q0 is 0 or -1 */
               /* (q1_Q0 + its sign bit) is zero exactly for 0 and -1; those lanes use the reduced step. */
    794        tmp0 = _mm_add_epi32(_mm_srli_epi32(q1_Q0, 31), q1_Q0);
    795        tmp1 = _mm_cmpeq_epi32(tmp0, _mm_setzero_si128());
    796        tmp0 = _mm_blendv_epi8(_mm_set1_epi32(1024), _mm_set1_epi32(1024 - QUANT_LEVEL_ADJUST_Q10), tmp1);
    797        q2_Q10 = _mm_add_epi32(q1_Q10, tmp0);
               /* Pack both candidates: q1 in the low 128 bits, q2 in the high 128 bits. */
    798        q_Q10 = _mm256_set_m128i(q2_Q10, q1_Q10);
    799 
               /* rd = (|q| * Lambda + (r - q)^2) >> 10 for both candidates */
    800        rr_Q10 = _mm256_sub_epi32(_mm256_broadcastsi128_si256(r_Q10), q_Q10);
    801        rd_Q10 = _mm256_abs_epi32(q_Q10);
    802        rr_Q10 = silk_mm256_smulbb_epi32(rr_Q10, rr_Q10);
    803        rd_Q10 = silk_mm256_smulbb_epi32(rd_Q10, _mm256_set1_epi32(Lambda_Q10));
    804        rd_Q10 = _mm256_add_epi32(rd_Q10, rr_Q10);
    805        rd_Q10 = _mm256_srai_epi32(rd_Q10, 10);
    806 
               /* mask is set in lanes where candidate 1 is strictly cheaper than candidate 2. The   */
               /* blends keep the vector as is in those lanes and swap the 128-bit halves in the     */
               /* others, so the cheaper candidate always ends up in the low half.                   */
    807        mask = _mm256_broadcastsi128_si256(_mm_cmplt_epi32(_mm256_extracti128_si256(rd_Q10, 0), _mm256_extracti128_si256(rd_Q10, 1)));
    808        SS_RD_Q10 = _mm256_add_epi32(
    809            _mm256_broadcastsi128_si256(psDelDec->RD_Q10),
    810            _mm256_blendv_epi8(
    811                _mm256_permute2x128_si256(rd_Q10, rd_Q10, 0x1),
    812                rd_Q10,
    813                mask));
    814        SS_Q_Q10 = _mm256_blendv_epi8(
    815            _mm256_permute2x128_si256(q_Q10, q_Q10, 0x1),
    816            q_Q10,
    817            mask);
    818 
    819        /* Update states for best and second best quantization */
    820 
    821        /* Quantized excitation */
    822        exc_Q14 = silk_mm256_sign_epi32(_mm256_slli_epi32(SS_Q_Q10, 4), _mm256_broadcastsi128_si256(psDelDec->Seed));
    823 
    824        /* Add predictions */
    825        exc_Q14 = _mm256_add_epi32(exc_Q14, _mm256_set1_epi32(LTP_pred_Q14));
    826        SS_LPC_exc_Q14 = _mm256_slli_epi32(exc_Q14, 1);
    827        SS_xq_Q14 = _mm256_add_epi32(exc_Q14, _mm256_broadcastsi128_si256(LPC_pred_Q14));
    828 
    829        /* Update states */
    830        SS_Diff_Q14 = _mm256_sub_epi32(SS_xq_Q14, _mm256_set1_epi32(silk_LSHIFT(x_Q10[i], 4)));
    831        SS_LF_AR_Q14 = _mm256_sub_epi32(SS_Diff_Q14, _mm256_broadcastsi128_si256(n_AR_Q14));
    832        SS_sLTP_shp_Q14 = silk_mm256_sub_sat_epi32(SS_LF_AR_Q14, _mm256_broadcastsi128_si256(n_LF_Q14));
    833 
    834        /* END Updating Delayed Decision States */
    835 
               /* Step the circular sample index back by one (mod DECISION_DELAY); the sample */
               /* to be output sits decisionDelay slots further on.                           */
    836        *smpl_buf_idx = (*smpl_buf_idx + DECISION_DELAY - 1) % DECISION_DELAY;
    837        last_smple_idx = (*smpl_buf_idx + decisionDelay) % DECISION_DELAY;
    838        psLastSample = &psDelDec->Samples[last_smple_idx];
    839 
    840        /* Find winner */
               /* NOTE(review): silk_mm_mask_hmin_epi32 / silk_index_to_selector are defined elsewhere;  */
               /* presumably a masked horizontal minimum and a byte-shuffle control that broadcasts the  */
               /* chosen lane - confirm against their definitions.                                       */
    841        RDmin_Q10 = silk_mm_mask_hmin_epi32(_mm256_castsi256_si128(SS_RD_Q10), MaskDelDec);
    842        Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, _mm256_castsi256_si128(SS_RD_Q10)));
    843 
    844        /* Increase RD values of expired states */
               /* Lanes whose RandState in the oldest sample equals the winner's keep their RD; */
               /* all other lanes get silk_int32_MAX >> 4 added (in both halves).               */
    845        Winner_rand_state = _mm_shuffle_epi8(psLastSample->RandState, Winner_selector);
    846 
    847        SS_RD_Q10 = _mm256_blendv_epi8(
    848            _mm256_add_epi32(SS_RD_Q10, _mm256_set1_epi32(silk_int32_MAX >> 4)),
    849            SS_RD_Q10,
    850            _mm256_broadcastsi128_si256(_mm_cmpeq_epi32(psLastSample->RandState, Winner_rand_state)));
    851 
    852        /* find worst in first set */
    853        RDmax_Q10 = silk_mm_mask_hmax_epi32(_mm256_extracti128_si256(SS_RD_Q10, 0), MaskDelDec);
    854        /* find best in second set */
    855        RDmin_Q10 = silk_mm_mask_hmin_epi32(_mm256_extracti128_si256(SS_RD_Q10, 1), MaskDelDec);
    856 
    857        /* Replace a state if best from second set outperforms worst in first set */
    858        tmp0 = _mm_cmplt_epi32(RDmin_Q10, RDmax_Q10);
    859        if (!_mm_test_all_zeros(tmp0, tmp0))
    860        {
    861            int t;
    862            RDmax_ind = silk_index_of_first_equal_epi32(RDmax_Q10, _mm256_extracti128_si256(SS_RD_Q10, 0));
    863            RDmin_ind = silk_index_of_first_equal_epi32(RDmin_Q10, _mm256_extracti128_si256(SS_RD_Q10, 1));
                   /* tmp1: 32-bit lane RDmax_ind all ones (0xFF byte sign-extended), other lanes zero. */
    864            tmp1 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(RDmax_ind << 3)));
                   /* tmp0: identity byte shuffle, except that lane RDmax_ind takes the selector for lane RDmin_ind. */
    865            tmp0 = _mm_blendv_epi8(
    866                _mm_set_epi8(0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0),
    867                silk_index_to_selector(RDmin_ind),
    868                tmp1);
                   /* Apply the shuffle to the stored per-lane history (sLPC_Q14 from index i onwards). */
    869            for (t = i; t < MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH; t++)
    870            {
    871                psDelDec->sLPC_Q14[t] = _mm_shuffle_epi8(psDelDec->sLPC_Q14[t], tmp0);
    872            }
    873            psDelDec->Seed = _mm_shuffle_epi8(psDelDec->Seed, tmp0);
    874            psDelDec->SeedInit = _mm_shuffle_epi8(psDelDec->SeedInit, tmp0);
    875            for (t = 0; t < MAX_SHAPE_LPC_ORDER; t++)
    876            {
    877                psDelDec->sAR2_Q14[t] = _mm_shuffle_epi8(psDelDec->sAR2_Q14[t], tmp0);
    878            }
    879            for (t = 0; t < DECISION_DELAY; t++)
    880            {
    881                psDelDec->Samples[t].RandState = _mm_shuffle_epi8(psDelDec->Samples[t].RandState, tmp0);
    882                psDelDec->Samples[t].Q_Q10 = _mm_shuffle_epi8(psDelDec->Samples[t].Q_Q10, tmp0);
    883                psDelDec->Samples[t].Xq_Q14 = _mm_shuffle_epi8(psDelDec->Samples[t].Xq_Q14, tmp0);
    884                psDelDec->Samples[t].Pred_Q15 = _mm_shuffle_epi8(psDelDec->Samples[t].Pred_Q15, tmp0);
    885                psDelDec->Samples[t].Shape_Q14 = _mm_shuffle_epi8(psDelDec->Samples[t].Shape_Q14, tmp0);
    886            }
                   /* For the freshly computed SS_* vectors, lane RDmax_ind of the low half is taken from */
                   /* element RDmin_ind + 4 (the high half). Only the low 128 bits of the index vector    */
                   /* are defined, which suffices because only the low halves are read afterwards.        */
    887            mask = _mm256_castsi128_si256(_mm_blendv_epi8(_mm_set_epi32(0x3, 0x2, 0x1, 0x0), _mm_set1_epi32(RDmin_ind + 4), tmp1));
    888            SS_Q_Q10 = _mm256_permutevar8x32_epi32(SS_Q_Q10, mask);
    889            SS_RD_Q10 = _mm256_permutevar8x32_epi32(SS_RD_Q10, mask);
    890            SS_xq_Q14 = _mm256_permutevar8x32_epi32(SS_xq_Q14, mask);
    891            SS_LF_AR_Q14 = _mm256_permutevar8x32_epi32(SS_LF_AR_Q14, mask);
    892            SS_Diff_Q14 = _mm256_permutevar8x32_epi32(SS_Diff_Q14, mask);
    893            SS_sLTP_shp_Q14 = _mm256_permutevar8x32_epi32(SS_sLTP_shp_Q14, mask);
    894            SS_LPC_exc_Q14 = _mm256_permutevar8x32_epi32(SS_LPC_exc_Q14, mask);
    895        }
    896 
    897        /* Write samples from winner to output and long-term filter states */
               /* Skipped for the first decisionDelay samples of subframe 0. */
    898        if (subfr > 0 || i >= decisionDelay)
    899        {
    900            pulses[i - decisionDelay] =
    901                (opus_int8)silk_sar_round_32(silk_select_winner(psLastSample->Q_Q10, Winner_selector), 10);
    902            xq[i - decisionDelay] =
    903                silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psLastSample->Xq_Q14, Winner_selector), delayedGain_Q10[last_smple_idx], 8));
    904            NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay] =
    905                silk_select_winner(psLastSample->Shape_Q14, Winner_selector);
    906            sLTP_Q15[NSQ->sLTP_buf_idx - decisionDelay] =
    907                silk_select_winner(psLastSample->Pred_Q15, Winner_selector);
    908        }
    909        NSQ->sLTP_shp_buf_idx++;
    910        NSQ->sLTP_buf_idx++;
    911 
    912        /* Update states */
               /* Only the low 128 bits (first set, after any replacement) are carried forward. */
    913        psSample = &psDelDec->Samples[*smpl_buf_idx];
    914        psDelDec->Seed = _mm_add_epi32(psDelDec->Seed, silk_mm_srai_round_epi32(_mm256_castsi256_si128(SS_Q_Q10), 10));
    915        psDelDec->LF_AR_Q14 = _mm256_castsi256_si128(SS_LF_AR_Q14);
    916        psDelDec->Diff_Q14 = _mm256_castsi256_si128(SS_Diff_Q14);
    917        psDelDec->sLPC_Q14[i + NSQ_LPC_BUF_LENGTH] = _mm256_castsi256_si128(SS_xq_Q14);
    918        psDelDec->RD_Q10 = _mm256_castsi256_si128(SS_RD_Q10);
    919        psSample->Xq_Q14 = _mm256_castsi256_si128(SS_xq_Q14);
    920        psSample->Q_Q10 = _mm256_castsi256_si128(SS_Q_Q10);
    921        psSample->Pred_Q15 = _mm256_castsi256_si128(SS_LPC_exc_Q14);
    922        psSample->Shape_Q14 = _mm256_castsi256_si128(SS_sLTP_shp_Q14);
    923        psSample->RandState = psDelDec->Seed;
    924        delayedGain_Q10[*smpl_buf_idx] = Gain_Q10;
    925    }
    926    /* Update LPC states */
           /* Move the NSQ_LPC_BUF_LENGTH vectors starting at index `length` to the front of the buffer. */
    927    for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
    928    {
    929        psDelDec->sLPC_Q14[i] = (&psDelDec->sLPC_Q14[length])[i];
    930    }
    931 }
    932 
    933 static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2(
    934    const silk_encoder_state *psEncC,          /* I    Encoder State                   */
    935    silk_nsq_state *NSQ,                       /* I/O  NSQ state                       */
    936    NSQ_del_dec_struct *psDelDec,              /* I/O  Delayed decision states         */
    937    const opus_int16 x16[],                    /* I    Input                           */
    938    opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH], /* O    Input scaled with 1/Gain in Q10 */
    939    const opus_int16 sLTP[],                   /* I    Re-whitened LTP state in Q0     */
    940    opus_int32 sLTP_Q15[],                     /* O    LTP state matching scaled input */
    941    opus_int subfr,                            /* I    Subframe number                 */
    942    const opus_int LTP_scale_Q14,              /* I    LTP state scaling               */
    943    const opus_int32 Gains_Q16[MAX_NB_SUBFR],  /* I                                    */
    944    const opus_int pitchL[MAX_NB_SUBFR],       /* I    Pitch lag                       */
    945    const opus_int signal_type,                /* I    Signal type                     */
    946    const opus_int decisionDelay               /* I    Decision delay                  */
    947 )
    948 {
    949    int i;
    950    opus_int lag;
    951    opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
    952    NSQ_del_dec_sample_struct *psSample;
    953 
    954    lag = pitchL[subfr];
    955    inv_gain_Q31 = silk_INVERSE32_varQ(silk_max(Gains_Q16[subfr], 1), 47);
    956    silk_assert(inv_gain_Q31 != 0);
    957 
    958    /* Scale input */
    959    inv_gain_Q26 = silk_sar_round_32(inv_gain_Q31, 5);
    960    for (i = 0; i < psEncC->subfr_length; i+=4)
    961    {
    962        __m256i x = _mm256_cvtepi16_epi64(_mm_loadu_si64(&x16[i]));
    963        x = _mm256_slli_epi64(_mm256_mul_epi32(x, _mm256_set1_epi32(inv_gain_Q26)), 16);
    964        _mm_storeu_si128((__m128i*)(void*)&x_sc_Q10[i], silk_cvtepi64_epi32_high(x));
    965    }
    966 
    967    /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
    968    if (NSQ->rewhite_flag)
    969    {
    970        if (subfr == 0)
    971        {
    972            /* Do LTP downscaling */
    973            inv_gain_Q31 = silk_LSHIFT(silk_SMULWB(inv_gain_Q31, LTP_scale_Q14), 2);
    974        }
    975        for (i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++)
    976        {
    977            silk_assert(i < MAX_FRAME_LENGTH);
    978            sLTP_Q15[i] = silk_SMULWB(inv_gain_Q31, sLTP[i]);
    979        }
    980    }
    981 
    982    /* Adjust for changing gain */
    983    if (Gains_Q16[subfr] != NSQ->prev_gain_Q16)
    984    {
    985        gain_adj_Q16 = silk_DIV32_varQ(NSQ->prev_gain_Q16, Gains_Q16[subfr], 16);
    986 
    987        /* Scale long-term shaping state */
    988        for (i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx; i+=4)
    989        {
    990     opus_int32 *p = &NSQ->sLTP_shp_Q14[i];
    991            _mm_storeu_si128((__m128i*)(void*)p, silk_mm_smulww_epi32(_mm_loadu_si128((__m128i*)(void*)p), gain_adj_Q16));
    992        }
    993 
    994        /* Scale long-term prediction state */
    995        if (signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0)
    996        {
    997            for (i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay; i++)
    998            {
    999                sLTP_Q15[i] = ((opus_int64)sLTP_Q15[i]) * ((opus_int64)gain_adj_Q16) >> 16;
   1000            }
   1001        }
   1002 
   1003        /* Scale scalar states */
   1004        psDelDec->LF_AR_Q14 = silk_mm_smulww_epi32(psDelDec->LF_AR_Q14, gain_adj_Q16);
   1005        psDelDec->Diff_Q14 = silk_mm_smulww_epi32(psDelDec->Diff_Q14, gain_adj_Q16);
   1006 
   1007        /* Scale short-term prediction and shaping states */
   1008        for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
   1009        {
   1010            psDelDec->sLPC_Q14[i] = silk_mm_smulww_epi32(psDelDec->sLPC_Q14[i], gain_adj_Q16);
   1011        }
   1012        for (i = 0; i < DECISION_DELAY; i++)
   1013        {
   1014            psSample = &psDelDec->Samples[i];
   1015            psSample->Pred_Q15 = silk_mm_smulww_epi32(psSample->Pred_Q15, gain_adj_Q16);
   1016            psSample->Shape_Q14 = silk_mm_smulww_epi32(psSample->Shape_Q14, gain_adj_Q16);
   1017        }
   1018        for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
   1019        {
   1020            psDelDec->sAR2_Q14[i] = silk_mm_smulww_epi32(psDelDec->sAR2_Q14[i], gain_adj_Q16);
   1021        }
   1022 
   1023        /* Save inverse gain */
   1024        NSQ->prev_gain_Q16 = Gains_Q16[subfr];
   1025    }
   1026 }
   1027 
   1028 static OPUS_INLINE void silk_LPC_analysis_filter_avx2(
   1029    opus_int16                  *out,               /* O    Output signal                           */
   1030    const opus_int16            *in,                /* I    Input signal                            */
   1031    const opus_int16            *B,                 /* I    MA prediction coefficients, Q12 [order] */
   1032    const opus_int32            len,                /* I    Signal length                           */
   1033    const opus_int32            order               /* I    Filter order                            */
   1034 )
   1035 {
   1036    int i;
   1037    opus_int32       out32_Q12, out32;
   1038    silk_assert(order == 10 || order == 16);
   1039 
   1040    for(i = order; i < len; i++ )
   1041    {
   1042        const opus_int16 *in_ptr = &in[ i ];
   1043        /* Allowing wrap around so that two wraps can cancel each other. The rare
   1044           cases where the result wraps around can only be triggered by invalid streams*/
   1045 
   1046        __m256i in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)(void*)&in_ptr[-8]));
   1047        __m256i B_v  = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)(void*)&      B[0]));
   1048        __m256i sum = _mm256_mullo_epi32(in_v, silk_mm256_reverse_epi32(B_v));
   1049        if (order > 10)
   1050        {
   1051            in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)(void*)&in_ptr[-16]));
   1052            B_v  = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)(void*)&B       [8]));
   1053            B_v  = silk_mm256_reverse_epi32(B_v);
   1054        }
   1055        else
   1056        {
   1057            in_v = _mm256_cvtepi16_epi32(_mm_loadu_si32(&in_ptr[-10]));
   1058            B_v  = _mm256_cvtepi16_epi32(_mm_loadu_si32(&B       [8]));
   1059            B_v  = _mm256_shuffle_epi32(B_v, 0x01);
   1060        }
   1061        sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(in_v, B_v));
   1062 
   1063        out32_Q12 = silk_mm256_hsum_epi32(sum);
   1064 
   1065        /* Subtract prediction */
   1066        out32_Q12 = silk_SUB32_ovflw( silk_LSHIFT( (opus_int32)*in_ptr, 12 ), out32_Q12 );
   1067 
   1068        /* Scale to Q0 */
   1069        out32 = silk_sar_round_32(out32_Q12, 12);
   1070 
   1071        /* Saturate output */
   1072        out[ i ] = silk_sat16(out32);
   1073    }
   1074 
   1075    /* Set first d output samples to zero */
   1076    silk_memset( out, 0, order * sizeof( opus_int16 ) );
   1077 }