tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

NSQ_del_dec_sse4_1.c (48765B)


      1 /* Copyright (c) 2014-2020, Cisco Systems, INC
      2   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
      3 
      4   Redistribution and use in source and binary forms, with or without
      5   modification, are permitted provided that the following conditions
      6   are met:
      7 
      8   - Redistributions of source code must retain the above copyright
      9   notice, this list of conditions and the following disclaimer.
     10 
     11   - Redistributions in binary form must reproduce the above copyright
     12   notice, this list of conditions and the following disclaimer in the
     13   documentation and/or other materials provided with the distribution.
     14 
     15   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     16   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     17   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     18   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
     19   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     20   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     21   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     22   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     23   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     24   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #ifdef HAVE_CONFIG_H
     29 #include "config.h"
     30 #endif
     31 
     32 #include <xmmintrin.h>
     33 #include <emmintrin.h>
     34 #include <smmintrin.h>
     35 #include "main.h"
     36 #include "celt/x86/x86cpu.h"
     37 
     38 #include "stack_alloc.h"
     39 
     40 typedef struct {
           /* State of one delayed-decision candidate. silk_NSQ_del_dec_sse4_1()   */
           /* allocates psEncC->nStatesDelayedDecision of these and zero-fills     */
           /* them before seeding the fields below from the persistent NSQ state.  */
           /* LPC state: the first NSQ_LPC_BUF_LENGTH entries are copied from NSQ->sLPC_Q14 */
     41    opus_int32 sLPC_Q14[ MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH ];
           /* DECISION_DELAY-entry buffers. Q_Q10, Xq_Q14 and Shape_Q14 are read   */
           /* back with an index wrapped modulo DECISION_DELAY when the winning    */
           /* state is flushed to the outputs.                                     */
     42    opus_int32 RandState[ DECISION_DELAY ];
     43    opus_int32 Q_Q10[     DECISION_DELAY ];
     44    opus_int32 Xq_Q14[    DECISION_DELAY ];
     45    opus_int32 Pred_Q15[  DECISION_DELAY ];
     46    opus_int32 Shape_Q14[ DECISION_DELAY ];
           /* Copied from NSQ->sAR2_Q14 at frame start; the winner's copy is written back */
     47    opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ];
           /* Seeded from NSQ->sLF_AR_shp_Q14; the winner's value is written back */
     48    opus_int32 LF_AR_Q14;
           /* Seeded from NSQ->sDiff_shp_Q14; the winner's value is written back */
     49    opus_int32 Diff_Q14;
           /* Dither seed, advanced with silk_RAND() once per sample */
     50    opus_int32 Seed;
           /* Value of Seed at frame start; the winner's is stored in psIndices->Seed */
     51    opus_int32 SeedInit;
           /* Cost used to rank states: the state with the smallest RD_Q10 wins */
           /* (presumably an accumulated rate-distortion measure - confirm)     */
     52    opus_int32 RD_Q10;
     53 } NSQ_del_dec_struct;
     54 
     55 typedef struct {
           /* Per-sample candidate values. The subframe quantizer allocates these  */
           /* in pairs (NSQ_sample_pair), one pair per delayed-decision state.     */
           /* NOTE(review): the code that fills these fields is outside the part   */
           /* of the file reviewed here; the Qnn suffixes presumably give the      */
           /* fixed-point format of each value - confirm against the quantizer.    */
     56    opus_int32 Q_Q10;
     57    opus_int32 RD_Q10;
     58    opus_int32 xq_Q14;
     59    opus_int32 LF_AR_Q14;
     60    opus_int32 Diff_Q14;
     61    opus_int32 sLTP_shp_Q14;
     62    opus_int32 LPC_exc_Q14;
     63 } NSQ_sample_struct;
     64 
     65 typedef NSQ_sample_struct  NSQ_sample_pair[ 2 ]; /* Two sample states; the quantizer ALLOCs one pair per delayed-decision state */
     66 
     67 static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
        /* Forward declaration. silk_NSQ_del_dec_sse4_1() calls this once per    */
        /* subframe, immediately before the subframe is quantized.               */
     68    const silk_encoder_state *psEncC,               /* I    Encoder State                       */
     69    silk_nsq_state      *NSQ,                       /* I/O  NSQ state                           */
     70    NSQ_del_dec_struct  psDelDec[],                 /* I/O  Delayed decision states             */
     71    const opus_int16    x16[],                      /* I    Input                               */
     72    opus_int32          x_sc_Q10[],                 /* O    Input scaled with 1/Gain in Q10     */
     73    const opus_int16    sLTP[],                     /* I    Re-whitened LTP state in Q0         */
     74    opus_int32          sLTP_Q15[],                 /* O    LTP state matching scaled input     */
     75    opus_int            subfr,                      /* I    Subframe number                     */
     76    opus_int            nStatesDelayedDecision,     /* I    Number of del dec states            */
     77    const opus_int      LTP_scale_Q14,              /* I    LTP state scaling                   */
     78    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ],  /* I    Quantization step sizes             */
     79    const opus_int      pitchL[ MAX_NB_SUBFR ],     /* I    Pitch lag                           */
     80    const opus_int      signal_type,                /* I    Signal type                         */
     81    const opus_int      decisionDelay               /* I    Decision delay                      */
     82 );
     83 
     84 /******************************************/
     85 /* Noise shape quantizer for one subframe */
     86 /******************************************/
     87 static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
        /* Forward declaration. silk_NSQ_del_dec_sse4_1() calls this once per    */
        /* subframe with that subframe's coefficients, gain and shaping values.  */
     88    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                           */
     89    NSQ_del_dec_struct  psDelDec[],             /* I/O  Delayed decision states             */
     90    opus_int            signalType,             /* I    Signal type                         */
     91    const opus_int32    x_Q10[],                /* I    Input scaled with 1/Gain in Q10     */
     92    opus_int8           pulses[],               /* O    Quantized pulse signal              */
     93    opus_int16          xq[],                   /* O    Quantized speech signal             */
     94    opus_int32          sLTP_Q15[],             /* I/O  LTP filter state                    */
     95    opus_int32          delayedGain_Q10[],      /* I/O  Gain delay buffer                   */
     96    const opus_int16    a_Q12[],                /* I    Short term prediction coefs         */
     97    const opus_int16    b_Q14[],                /* I    Long term prediction coefs          */
     98    const opus_int16    AR_shp_Q13[],           /* I    Noise shaping coefs                 */
     99    opus_int            lag,                    /* I    Pitch lag                           */
    100    opus_int32          HarmShapeFIRPacked_Q14, /* I    Packed harmonic shaping FIR coefs   */
    101    opus_int            Tilt_Q14,               /* I    Spectral tilt                       */
    102    opus_int32          LF_shp_Q14,             /* I    Low frequency shaping coefs         */
    103    opus_int32          Gain_Q16,               /* I    Quantization step size              */
    104    opus_int            Lambda_Q10,             /* I    Rate/distortion tradeoff            */
    105    opus_int            offset_Q10,             /* I    Quantization offset                 */
    106    opus_int            length,                 /* I    Input length                        */
    107    opus_int            subfr,                  /* I    Subframe number                     */
    108    opus_int            shapingLPCOrder,        /* I    Shaping LPC filter order            */
    109    opus_int            predictLPCOrder,        /* I    Prediction filter order             */
    110    opus_int            warping_Q16,            /* I    Caller passes psEncC->warping_Q16   */
    111    opus_int            nStatesDelayedDecision, /* I    Number of states in decision tree   */
    112    opus_int            *smpl_buf_idx,          /* I/O  Index to newest samples in buffers  */
    113    opus_int            decisionDelay           /* I    Decision delay                      */
    114 );
    115 
    116 void silk_NSQ_del_dec_sse4_1(
    117    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
    118    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
    119    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
    120    const opus_int16            x16[],                                        /* I    Input                           */
    121    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
    122    const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
    123    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
    124    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
    125    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
    126    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
    127    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
    128    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
    129    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
    130    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
    131    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
    132 )
    133 {
           /* Frame-level driver of the delayed-decision noise shaping quantizer   */
           /* (the _sse4_1 variant). For each of the psEncC->nb_subfr subframes it */
           /* selects the coefficient sets, re-whitens the LTP state where needed  */
           /* (voiced frames only), rescales the states with                       */
           /* silk_nsq_del_dec_scale_states_sse4_1() and quantizes the subframe    */
           /* with silk_noise_shape_quantizer_del_dec_sse4_1(). Afterwards the     */
           /* state with the smallest RD_Q10 is chosen and its last decisionDelay  */
           /* samples are flushed to pulses[], NSQ->xq and NSQ->sLTP_shp_Q14, and  */
           /* the persistent NSQ state is updated from that state.                 */
           /* When OPUS_CHECK_ASM is defined, silk_NSQ_del_dec_c() is also run on  */
           /* copies of the state and both results are asserted to be identical.   */
    134    opus_int            i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
    135    opus_int            last_smple_idx, smpl_buf_idx, decisionDelay;
    136    const opus_int16    *A_Q12, *B_Q14, *AR_shp_Q13;
    137    opus_int16          *pxq;
    138    VARDECL( opus_int32, sLTP_Q15 );
    139    VARDECL( opus_int16, sLTP );
    140    opus_int32          HarmShapeFIRPacked_Q14;
    141    opus_int            offset_Q10;
    142    opus_int32          RDmin_Q10, Gain_Q10;
    143    VARDECL( opus_int32, x_sc_Q10 );
    144    VARDECL( opus_int32, delayedGain_Q10 );
    145    VARDECL( NSQ_del_dec_struct, psDelDec );
    146    NSQ_del_dec_struct  *psDD;
    147 #ifdef OPUS_CHECK_ASM
           /* Private copies for the reference run; pulses_a remembers the start   */
           /* of the output because pulses itself is advanced inside the loop.     */
    148    silk_nsq_state NSQ_c;
    149    SideInfoIndices psIndices_c;
    150    opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
    151    const opus_int8 *const pulses_a = pulses;
    152 #endif
    153    SAVE_STACK;
    154 
    155 #ifdef OPUS_CHECK_ASM
           /* Run the C implementation first, on the copies, so that its results   */
           /* can be compared against this implementation at the end.              */
    156    ( void )pulses_a;
    157    silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
    158    silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
    159    silk_assert( psEncC->nb_subfr * psEncC->subfr_length <= MAX_FRAME_LENGTH );
    160    silk_memcpy( pulses_c, pulses, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) );
    161    silk_NSQ_del_dec_c(
    162        psEncC,
    163        &NSQ_c,
    164        &psIndices_c,
    165        x16,
    166        pulses_c,
    167        PredCoef_Q12,
    168        LTPCoef_Q14,
    169        AR_Q13,
    170        HarmShapeGain_Q14,
    171        Tilt_Q14,
    172        LF_shp_Q14,
    173        Gains_Q16,
    174        pitchL,
    175        Lambda_Q10,
    176        LTP_scale_Q14
    177    );
    178 #endif
    179 
    180    /* Set unvoiced lag to the previous one, overwrite later for voiced */
    181    lag = NSQ->lagPrev;
    182 
    183    silk_assert( NSQ->prev_gain_Q16 != 0 );
    184 
    185    /* Initialize delayed decision states */
    186    ALLOC( psDelDec, psEncC->nStatesDelayedDecision, NSQ_del_dec_struct );
    187    silk_memset( psDelDec, 0, psEncC->nStatesDelayedDecision * sizeof( NSQ_del_dec_struct ) );
    188    for( k = 0; k < psEncC->nStatesDelayedDecision; k++ ) {
    189        psDD                 = &psDelDec[ k ];
               /* Offset the frame seed by the state index and keep the low two bits */
    190        psDD->Seed           = ( k + psIndices->Seed ) & 3;
    191        psDD->SeedInit       = psDD->Seed;
    192        psDD->RD_Q10         = 0;
    193        psDD->LF_AR_Q14      = NSQ->sLF_AR_shp_Q14;
    194        psDD->Diff_Q14       = NSQ->sDiff_shp_Q14;
               /* Start from the last sample of the stored shaping history */
    195        psDD->Shape_Q14[ 0 ] = NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ];
    196        silk_memcpy( psDD->sLPC_Q14, NSQ->sLPC_Q14, NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
    197        silk_memcpy( psDD->sAR2_Q14, NSQ->sAR2_Q14, sizeof( NSQ->sAR2_Q14 ) );
    198    }
    199 
           /* Table row is signalType >> 1, column is quantOffsetType */
    200    offset_Q10   = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];
    201    smpl_buf_idx = 0; /* index of oldest samples */
    202 
    203    decisionDelay = silk_min_int( DECISION_DELAY, psEncC->subfr_length );
    204 
    205    /* For voiced frames limit the decision delay to lower than the pitch lag */
    206    if( psIndices->signalType == TYPE_VOICED ) {
    207        for( k = 0; k < psEncC->nb_subfr; k++ ) {
    208            decisionDelay = silk_min_int( decisionDelay, pitchL[ k ] - LTP_ORDER / 2 - 1 );
    209        }
    210    } else {
               /* Not voiced: apply the same limit using the previous frame's lag, if any */
    211        if( lag > 0 ) {
    212            decisionDelay = silk_min_int( decisionDelay, lag - LTP_ORDER / 2 - 1 );
    213        }
    214    }
    215 
           /* NLSFInterpCoef_Q2 == 4 clears the flag; any other value sets it */
    216    if( psIndices->NLSFInterpCoef_Q2 == 4 ) {
    217        LSF_interpolation_flag = 0;
    218    } else {
    219        LSF_interpolation_flag = 1;
    220    }
    221 
    222    ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
    223    ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
    224    ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
    225    ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 );
    226    /* Set up pointers to start of sub frame */
    227    pxq                   = &NSQ->xq[ psEncC->ltp_mem_length ];
    228    NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
    229    NSQ->sLTP_buf_idx     = psEncC->ltp_mem_length;
           /* subfr is the subframe counter handed to the quantizer; it is reset   */
           /* to 0 again when the delayed decisions are reset at k == 2.           */
    230    subfr = 0;
    231    for( k = 0; k < psEncC->nb_subfr; k++ ) {
               /* Coefficient set index: with LSF_interpolation_flag == 0 it is      */
               /* ( k >> 1 ) | 1, i.e. always set 1 for k < 4; with the flag set it  */
               /* is k >> 1. Each set is MAX_LPC_ORDER entries long.                 */
    232        A_Q12      = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ];
    233        B_Q14      = &LTPCoef_Q14[ k * LTP_ORDER           ];
    234        AR_shp_Q13 = &AR_Q13[     k * MAX_SHAPE_LPC_ORDER ];
    235 
    236        /* Noise shape parameters */
    237        silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
               /* Pack two values into one word: gain >> 2 in the low 16 bits and    */
               /* gain >> 1 in the high 16 bits.                                     */
    238        HarmShapeFIRPacked_Q14  =                          silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
    239        HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );
    240 
    241        NSQ->rewhite_flag = 0;
    242        if( psIndices->signalType == TYPE_VOICED ) {
    243            /* Voiced */
    244            lag = pitchL[ k ];
    245 
    246            /* Re-whitening */
                   /* Mask is 1 when the interpolation flag is set (true for even k) */
                   /* and 3 otherwise (true only when k is a multiple of 4).         */
    247            if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
    248                if( k == 2 ) {
    249                    /* RESET DELAYED DECISIONS */
    250                    /* Find winner */
    251                    RDmin_Q10 = psDelDec[ 0 ].RD_Q10;
    252                    Winner_ind = 0;
    253                    for( i = 1; i < psEncC->nStatesDelayedDecision; i++ ) {
    254                        if( psDelDec[ i ].RD_Q10 < RDmin_Q10 ) {
    255                            RDmin_Q10 = psDelDec[ i ].RD_Q10;
    256                            Winner_ind = i;
    257                        }
    258                    }
                           /* Penalize every other state by a large constant */
    259                    for( i = 0; i < psEncC->nStatesDelayedDecision; i++ ) {
    260                        if( i != Winner_ind ) {
    261                            psDelDec[ i ].RD_Q10 += ( silk_int32_MAX >> 4 );
    262                            silk_assert( psDelDec[ i ].RD_Q10 >= 0 );
    263                        }
    264                    }
    265 
    266                    /* Copy final part of signals from winner state to output and long-term filter states */
    267                    psDD = &psDelDec[ Winner_ind ];
    268                    last_smple_idx = smpl_buf_idx + decisionDelay;
    269                    for( i = 0; i < decisionDelay; i++ ) {
                               /* Step backwards through the DECISION_DELAY-entry buffers, wrapping below 0 */
    270                        last_smple_idx = ( last_smple_idx - 1 ) % DECISION_DELAY;
    271                        if( last_smple_idx < 0 ) last_smple_idx += DECISION_DELAY;
                               /* pulses and pxq already point at subframe k, so the negative */
                               /* offsets address the last decisionDelay samples before it;   */
                               /* the output is scaled with the gain of subframe 1.           */
    272                        pulses[   i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
    273                        pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
    274                            silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gains_Q16[ 1 ] ), 14 ) );
    275                        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDD->Shape_Q14[ last_smple_idx ];
    276                    }
    277 
    278                    subfr = 0;
    279                }
    280 
    281                /* Rewhiten with new A coefs */
    282                start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
    283                celt_assert( start_idx > 0 );
    284 
                       /* Filter the stored output NSQ->xq, offset by k subframes, with A_Q12 */
                       /* and write the result into sLTP from start_idx onwards.              */
    285                silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
    286                    A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );
    287 
    288                NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
    289                NSQ->rewhite_flag = 1;
    290            }
    291        }
    292 
    293        silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,
    294            psEncC->nStatesDelayedDecision, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay );
    295 
    296        silk_noise_shape_quantizer_del_dec_sse4_1( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
    297            delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ],
    298            Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
    299            psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay );
    300 
               /* Advance input and output pointers to the next subframe */
    301        x16    += psEncC->subfr_length;
    302        pulses += psEncC->subfr_length;
    303        pxq    += psEncC->subfr_length;
    304    }
    305 
    306    /* Find winner */
    307    RDmin_Q10 = psDelDec[ 0 ].RD_Q10;
    308    Winner_ind = 0;
    309    for( k = 1; k < psEncC->nStatesDelayedDecision; k++ ) {
    310        if( psDelDec[ k ].RD_Q10 < RDmin_Q10 ) {
    311            RDmin_Q10 = psDelDec[ k ].RD_Q10;
    312            Winner_ind = k;
    313        }
    314    }
    315 
    316    /* Copy final part of signals from winner state to output and long-term filter states */
    317    psDD = &psDelDec[ Winner_ind ];
           /* Report the winner's starting seed back through the side information */
    318    psIndices->Seed = psDD->SeedInit;
    319    last_smple_idx = smpl_buf_idx + decisionDelay;
           /* Gain of the last subframe, shifted down by 6 bits (Q16 -> Q10) */
    320    Gain_Q10 = silk_RSHIFT32( Gains_Q16[ psEncC->nb_subfr - 1 ], 6 );
    321    for( i = 0; i < decisionDelay; i++ ) {
               /* Same backwards, wrapped walk as in the mid-frame reset above */
    322        last_smple_idx = ( last_smple_idx - 1 ) % DECISION_DELAY;
    323        if( last_smple_idx < 0 ) last_smple_idx += DECISION_DELAY;
    324 
               /* pulses and pxq now point past the last subframe, so the negative   */
               /* offsets address the final decisionDelay samples of the frame.      */
    325        pulses[   i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
    326        pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
    327            silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gain_Q10 ), 8 ) );
    328        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDD->Shape_Q14[ last_smple_idx ];
    329    }
           /* Carry the NSQ_LPC_BUF_LENGTH values starting at offset subfr_length, */
           /* and the winner's sAR2 state, into the persistent NSQ state.          */
    330    silk_memcpy( NSQ->sLPC_Q14, &psDD->sLPC_Q14[ psEncC->subfr_length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
    331    silk_memcpy( NSQ->sAR2_Q14, psDD->sAR2_Q14, sizeof( psDD->sAR2_Q14 ) );
    332 
    333    /* Update states */
    334    NSQ->sLF_AR_shp_Q14 = psDD->LF_AR_Q14;
    335    NSQ->sDiff_shp_Q14  = psDD->Diff_Q14;
    336    NSQ->lagPrev        = pitchL[ psEncC->nb_subfr - 1 ];
    337 
    338    /* Save quantized speech signal */
           /* Move the ltp_mem_length entries that start at offset frame_length to */
           /* the front of each buffer (regions may overlap, hence memmove).       */
    339    silk_memmove( NSQ->xq,           &NSQ->xq[           psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
    340    silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
    341 
    342 #ifdef OPUS_CHECK_ASM
           /* State, indices and pulses must match the C reference run byte for byte */
    343    silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );
    344    silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );
    345    silk_assert( !memcmp( pulses_c, pulses_a, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ) );
    346 #endif
    347 
    348    RESTORE_STACK;
    349 }
    350 
    351 /******************************************/
    352 /* Noise shape quantizer for one subframe */
    353 /******************************************/
    354 static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1(
    355    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                           */
    356    NSQ_del_dec_struct  psDelDec[],             /* I/O  Delayed decision states             */
    357    opus_int            signalType,             /* I    Signal type                         */
    358    const opus_int32    x_Q10[],                /* I                                        */
    359    opus_int8           pulses[],               /* O                                        */
    360    opus_int16          xq[],                   /* O                                        */
    361    opus_int32          sLTP_Q15[],             /* I/O  LTP filter state                    */
    362    opus_int32          delayedGain_Q10[],      /* I/O  Gain delay buffer                   */
    363    const opus_int16    a_Q12[],                /* I    Short term prediction coefs         */
    364    const opus_int16    b_Q14[],                /* I    Long term prediction coefs          */
    365    const opus_int16    AR_shp_Q13[],           /* I    Noise shaping coefs                 */
    366    opus_int            lag,                    /* I    Pitch lag                           */
    367    opus_int32          HarmShapeFIRPacked_Q14, /* I                                        */
    368    opus_int            Tilt_Q14,               /* I    Spectral tilt                       */
    369    opus_int32          LF_shp_Q14,             /* I                                        */
    370    opus_int32          Gain_Q16,               /* I                                        */
    371    opus_int            Lambda_Q10,             /* I                                        */
    372    opus_int            offset_Q10,             /* I                                        */
    373    opus_int            length,                 /* I    Input length                        */
    374    opus_int            subfr,                  /* I    Subframe number                     */
    375    opus_int            shapingLPCOrder,        /* I    Shaping LPC filter order            */
    376    opus_int            predictLPCOrder,        /* I    Prediction filter order             */
    377    opus_int            warping_Q16,            /* I                                        */
    378    opus_int            nStatesDelayedDecision, /* I    Number of states in decision tree   */
    379    opus_int            *smpl_buf_idx,          /* I/O  Index to newest samples in buffers  */
    380    opus_int            decisionDelay           /* I                                        */
    381 )
    382 {
    383    opus_int     i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx;
    384    opus_int32   Winner_rand_state;
    385    opus_int32   LTP_pred_Q14, LPC_pred_Q14, n_AR_Q14, n_LTP_Q14;
    386    opus_int32   n_LF_Q14, r_Q10, rr_Q10, rd1_Q10, rd2_Q10, RDmin_Q10, RDmax_Q10;
    387    opus_int32   q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
    388    opus_int32   tmp1, tmp2, sLF_AR_shp_Q14;
    389    opus_int32   *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14;
    390    int rdo_offset;
    391 
    392    VARDECL( NSQ_sample_pair, psSampleState );
    393    NSQ_del_dec_struct *psDD;
    394    NSQ_sample_struct  *psSS;
    395 
    396    __m128i a_Q12_0123, a_Q12_4567, a_Q12_89AB, a_Q12_CDEF;
    397    __m128i b_Q12_0123, b_sr_Q12_0123;
    398    SAVE_STACK;
    399 
    400    celt_assert( nStatesDelayedDecision > 0 );
    401    ALLOC( psSampleState, nStatesDelayedDecision, NSQ_sample_pair );
    402 
    403    rdo_offset = (Lambda_Q10 >> 1) - 512;
    404 
    405    shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
    406    pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
    407    Gain_Q10     = silk_RSHIFT( Gain_Q16, 6 );
    408 
    409    a_Q12_0123 = OP_CVTEPI16_EPI32_M64( a_Q12 );
    410    a_Q12_4567 = OP_CVTEPI16_EPI32_M64( a_Q12 + 4 );
    411 
    412    if( opus_likely( predictLPCOrder == 16 ) ) {
    413        a_Q12_89AB = OP_CVTEPI16_EPI32_M64( a_Q12 + 8 );
    414        a_Q12_CDEF = OP_CVTEPI16_EPI32_M64( a_Q12 + 12 );
    415    }
    416 
    417    if( signalType == TYPE_VOICED ){
    418        b_Q12_0123 = OP_CVTEPI16_EPI32_M64( b_Q14 );
    419        b_sr_Q12_0123 = _mm_shuffle_epi32( b_Q12_0123, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
    420    }
    421    for( i = 0; i < length; i++ ) {
    422        /* Perform common calculations used in all states */
    423 
    424        /* Long-term prediction */
    425        if( signalType == TYPE_VOICED ) {
    426            /* Unrolled loop */
    427            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
    428            LTP_pred_Q14 = 2;
    429            {
    430                __m128i tmpa, tmpb, pred_lag_ptr_tmp;
    431                pred_lag_ptr_tmp    = _mm_loadu_si128( (__m128i *)(void*)(&pred_lag_ptr[ -3 ] ) );
    432                pred_lag_ptr_tmp    = _mm_shuffle_epi32( pred_lag_ptr_tmp, 0x1B );
    433                tmpa                = _mm_mul_epi32( pred_lag_ptr_tmp, b_Q12_0123 );
    434                tmpa                = _mm_srli_si128( tmpa, 2 );
    435 
    436                pred_lag_ptr_tmp = _mm_shuffle_epi32( pred_lag_ptr_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) );/* equal shift right 4 bytes */
    437                pred_lag_ptr_tmp    = _mm_mul_epi32( pred_lag_ptr_tmp, b_sr_Q12_0123 );
    438                pred_lag_ptr_tmp    = _mm_srli_si128( pred_lag_ptr_tmp, 2 );
    439                pred_lag_ptr_tmp    = _mm_add_epi32( pred_lag_ptr_tmp, tmpa );
    440 
    441                tmpb = _mm_shuffle_epi32( pred_lag_ptr_tmp, _MM_SHUFFLE( 0, 0, 3, 2 ) );/* equal shift right 8 bytes */
    442                pred_lag_ptr_tmp    = _mm_add_epi32( pred_lag_ptr_tmp, tmpb );
    443                LTP_pred_Q14        += _mm_cvtsi128_si32( pred_lag_ptr_tmp );
    444 
    445                LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );
    446                LTP_pred_Q14 = silk_LSHIFT( LTP_pred_Q14, 1 );                          /* Q13 -> Q14 */
    447                pred_lag_ptr++;
    448            }
    449        } else {
    450            LTP_pred_Q14 = 0;
    451        }
    452 
    453        /* Long-term shaping */
    454        if( lag > 0 ) {
    455            /* Symmetric, packed FIR coefficients */
    456            n_LTP_Q14 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
    457            n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
    458            n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 );            /* Q12 -> Q14 */
    459            shp_lag_ptr++;
    460        } else {
    461            n_LTP_Q14 = 0;
    462        }
    463        {
    464            __m128i tmpa, tmpb, psLPC_Q14_tmp, a_Q12_tmp;
    465 
    466            for( k = 0; k < nStatesDelayedDecision; k++ ) {
    467                /* Delayed decision state */
    468                psDD = &psDelDec[ k ];
    469 
    470                /* Sample state */
    471                psSS = psSampleState[ k ];
    472 
    473                /* Generate dither */
    474                psDD->Seed = silk_RAND( psDD->Seed );
    475 
    476                /* Pointer used in short term prediction and shaping */
    477                psLPC_Q14 = &psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 + i ];
    478                /* Short-term prediction */
    479                silk_assert( predictLPCOrder == 10 || predictLPCOrder == 16 );
    480                /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
    481                LPC_pred_Q14 = silk_RSHIFT( predictLPCOrder, 1 );
    482 
    483                tmpb = _mm_setzero_si128();
    484 
    485                /* step 1 */
    486                psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -3 ] ) ); /* -3, -2 , -1, 0 */
    487                psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );      /* 0, -1, -2, -3 */
    488                tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_0123 );    /* 0, -1, -2, -3 * 0123 -> 0*0, 2*-2 */
    489 
    490                tmpa            = _mm_srli_epi64( tmpa, 16 );
    491                tmpb            = _mm_add_epi32( tmpb, tmpa );
    492 
    493                psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
    494                a_Q12_tmp = _mm_shuffle_epi32( a_Q12_0123, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
    495                psLPC_Q14_tmp   = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp ); /* 1*-1, 3*-3 */
    496                psLPC_Q14_tmp   = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
    497                tmpb            = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
    498 
    499                /* step 2 */
    500                psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -7 ] ) );
    501                psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
    502                tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_4567 );
    503                tmpa            = _mm_srli_epi64( tmpa, 16 );
    504                tmpb            = _mm_add_epi32( tmpb, tmpa );
    505 
    506                psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
    507                a_Q12_tmp = _mm_shuffle_epi32( a_Q12_4567, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
    508                psLPC_Q14_tmp   = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );
    509                psLPC_Q14_tmp   = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
    510                tmpb            = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
    511 
    512                if ( opus_likely( predictLPCOrder == 16 ) )
    513                {
    514                    /* step 3 */
    515                    psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -11 ] ) );
    516                    psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
    517                    tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_89AB );
    518                    tmpa            = _mm_srli_epi64( tmpa, 16 );
    519                    tmpb            = _mm_add_epi32( tmpb, tmpa );
    520 
    521                    psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
    522                    a_Q12_tmp = _mm_shuffle_epi32( a_Q12_89AB, _MM_SHUFFLE(0, 3, 2, 1 ) );/* equal shift right 4 bytes */
    523                    psLPC_Q14_tmp   = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );
    524                    psLPC_Q14_tmp   = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
    525                    tmpb            = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
    526 
    527                    /* step 4 */
    528                    psLPC_Q14_tmp   = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -15 ] ) );
    529                    psLPC_Q14_tmp   = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
    530                    tmpa            = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_CDEF );
    531                    tmpa            = _mm_srli_epi64( tmpa, 16 );
    532                    tmpb            = _mm_add_epi32( tmpb, tmpa );
    533 
    534                    psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
    535                    a_Q12_tmp = _mm_shuffle_epi32( a_Q12_CDEF, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */
    536                    psLPC_Q14_tmp   = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp );
    537                    psLPC_Q14_tmp   = _mm_srli_epi64( psLPC_Q14_tmp, 16 );
    538                    tmpb            = _mm_add_epi32( tmpb, psLPC_Q14_tmp );
    539 
    540                    /* add at last */
    541                    /* equal shift right 8 bytes*/
    542                    tmpa            = _mm_shuffle_epi32( tmpb, _MM_SHUFFLE( 0, 0, 3, 2 ) );
    543                    tmpb            = _mm_add_epi32( tmpb, tmpa );
    544                    LPC_pred_Q14    += _mm_cvtsi128_si32( tmpb );
    545                }
    546                else
    547                {
    548                    /* add at last */
    549                    tmpa            = _mm_shuffle_epi32( tmpb, _MM_SHUFFLE( 0, 0, 3, 2 ) ); /* equal shift right 8 bytes*/
    550                    tmpb            = _mm_add_epi32( tmpb, tmpa );
    551                    LPC_pred_Q14    += _mm_cvtsi128_si32( tmpb );
    552 
    553                    LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -8 ], a_Q12[ 8 ] );
    554                    LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -9 ], a_Q12[ 9 ] );
    555                }
    556 
    557                LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 ); /* Q10 -> Q14 */
    558 
    559                /* Noise shape feedback */
    560                celt_assert( ( shapingLPCOrder & 1 ) == 0 );   /* check that order is even */
    561                /* Output of lowpass section */
    562                tmp2 = silk_SMLAWB( psDD->Diff_Q14, psDD->sAR2_Q14[ 0 ], warping_Q16 );
    563                /* Output of allpass section */
    564                tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], silk_SUB32_ovflw(psDD->sAR2_Q14[ 1 ], tmp2), warping_Q16 );
    565                psDD->sAR2_Q14[ 0 ] = tmp2;
    566                n_AR_Q14 = silk_RSHIFT( shapingLPCOrder, 1 );
    567                n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ 0 ] );
    568                /* Loop over allpass sections */
    569                for( j = 2; j < shapingLPCOrder; j += 2 ) {
    570                    /* Output of allpass section */
    571                    tmp2 = silk_SMLAWB( psDD->sAR2_Q14[ j - 1 ], silk_SUB32_ovflw(psDD->sAR2_Q14[ j + 0 ], tmp1), warping_Q16 );
    572                    psDD->sAR2_Q14[ j - 1 ] = tmp1;
    573                    n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ j - 1 ] );
    574                    /* Output of allpass section */
    575                    tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ j + 0 ], silk_SUB32_ovflw(psDD->sAR2_Q14[ j + 1 ], tmp2), warping_Q16 );
    576                    psDD->sAR2_Q14[ j + 0 ] = tmp2;
    577                    n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ j ] );
    578                }
    579                psDD->sAR2_Q14[ shapingLPCOrder - 1 ] = tmp1;
    580                n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ shapingLPCOrder - 1 ] );
    581 
    582                n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 1 );                                      /* Q11 -> Q12 */
    583                n_AR_Q14 = silk_SMLAWB( n_AR_Q14, psDD->LF_AR_Q14, Tilt_Q14 );              /* Q12 */
    584                n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 2 );                                      /* Q12 -> Q14 */
    585 
    586                n_LF_Q14 = silk_SMULWB( psDD->Shape_Q14[ *smpl_buf_idx ], LF_shp_Q14 );     /* Q12 */
    587                n_LF_Q14 = silk_SMLAWT( n_LF_Q14, psDD->LF_AR_Q14, LF_shp_Q14 );            /* Q12 */
    588                n_LF_Q14 = silk_LSHIFT( n_LF_Q14, 2 );                                      /* Q12 -> Q14 */
    589 
    590                /* Input minus prediction plus noise feedback                       */
    591                /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP  */
    592                tmp1 = silk_ADD_SAT32( n_AR_Q14, n_LF_Q14 );                                /* Q14 */
    593                tmp2 = silk_ADD32_ovflw( n_LTP_Q14, LPC_pred_Q14 );                         /* Q13 */
    594                tmp1 = silk_SUB_SAT32( tmp2, tmp1 );                                        /* Q13 */
    595                tmp1 = silk_RSHIFT_ROUND( tmp1, 4 );                                        /* Q10 */
    596 
    597                r_Q10 = silk_SUB32( x_Q10[ i ], tmp1 );                                     /* residual error Q10 */
    598 
    599                /* Flip sign depending on dither */
    600                if ( psDD->Seed < 0 ) {
    601                    r_Q10 = -r_Q10;
    602                }
    603                r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 );
    604 
    605                /* Find two quantization level candidates and measure their rate-distortion */
    606                q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
    607                q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
    608                if (Lambda_Q10 > 2048) {
    609                    /* For aggressive RDO, the bias becomes more than one pulse. */
    610                    if (q1_Q10 > rdo_offset) {
    611                        q1_Q0 = silk_RSHIFT( q1_Q10 - rdo_offset, 10 );
    612                    } else if (q1_Q10 < -rdo_offset) {
    613                        q1_Q0 = silk_RSHIFT( q1_Q10 + rdo_offset, 10 );
    614                    } else if (q1_Q10 < 0) {
    615                        q1_Q0 = -1;
    616                    } else {
    617                        q1_Q0 = 0;
    618                    }
    619                }
    620                if( q1_Q0 > 0 ) {
    621                    q1_Q10  = silk_SUB32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );
    622                    q1_Q10  = silk_ADD32( q1_Q10, offset_Q10 );
    623                    q2_Q10  = silk_ADD32( q1_Q10, 1024 );
    624                    rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 );
    625                    rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 );
    626                } else if( q1_Q0 == 0 ) {
    627                    q1_Q10  = offset_Q10;
    628                    q2_Q10  = silk_ADD32( q1_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 );
    629                    rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 );
    630                    rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 );
    631                } else if( q1_Q0 == -1 ) {
    632                    q2_Q10  = offset_Q10;
    633                    q1_Q10  = silk_SUB32( q2_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 );
    634                    rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 );
    635                    rd2_Q10 = silk_SMULBB(  q2_Q10, Lambda_Q10 );
    636                } else {            /* q1_Q0 < -1 */
    637                    q1_Q10  = silk_ADD32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 );
    638                    q1_Q10  = silk_ADD32( q1_Q10, offset_Q10 );
    639                    q2_Q10  = silk_ADD32( q1_Q10, 1024 );
    640                    rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 );
    641                    rd2_Q10 = silk_SMULBB( -q2_Q10, Lambda_Q10 );
    642                }
    643                rr_Q10  = silk_SUB32( r_Q10, q1_Q10 );
    644                rd1_Q10 = silk_RSHIFT( silk_SMLABB( rd1_Q10, rr_Q10, rr_Q10 ), 10 );
    645                rr_Q10  = silk_SUB32( r_Q10, q2_Q10 );
    646                rd2_Q10 = silk_RSHIFT( silk_SMLABB( rd2_Q10, rr_Q10, rr_Q10 ), 10 );
    647 
    648                if( rd1_Q10 < rd2_Q10 ) {
    649                    psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 );
    650                    psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 );
    651                    psSS[ 0 ].Q_Q10  = q1_Q10;
    652                    psSS[ 1 ].Q_Q10  = q2_Q10;
    653                } else {
    654                    psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 );
    655                    psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 );
    656                    psSS[ 0 ].Q_Q10  = q2_Q10;
    657                    psSS[ 1 ].Q_Q10  = q1_Q10;
    658                }
    659 
    660                /* Update states for best quantization */
    661 
    662                /* Quantized excitation */
    663                exc_Q14 = silk_LSHIFT32( psSS[ 0 ].Q_Q10, 4 );
    664                if ( psDD->Seed < 0 ) {
    665                    exc_Q14 = -exc_Q14;
    666                }
    667 
    668                /* Add predictions */
    669                LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
    670                xq_Q14      = silk_ADD32_ovflw( LPC_exc_Q14, LPC_pred_Q14 );
    671 
    672                /* Update states */
    673                psSS[ 0 ].Diff_Q14     = silk_SUB32_ovflw( xq_Q14, silk_LSHIFT32( x_Q10[ i ], 4 ) );
    674                sLF_AR_shp_Q14         = silk_SUB32_ovflw( psSS[ 0 ].Diff_Q14, n_AR_Q14 );
    675                psSS[ 0 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 );
    676                psSS[ 0 ].LF_AR_Q14    = sLF_AR_shp_Q14;
    677                psSS[ 0 ].LPC_exc_Q14  = LPC_exc_Q14;
    678                psSS[ 0 ].xq_Q14       = xq_Q14;
    679 
    680                /* Update states for second best quantization */
    681 
    682                /* Quantized excitation */
    683                exc_Q14 = silk_LSHIFT32( psSS[ 1 ].Q_Q10, 4 );
    684                if ( psDD->Seed < 0 ) {
    685                    exc_Q14 = -exc_Q14;
    686                }
    687 
    688                /* Add predictions */
    689                LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 );
    690                xq_Q14      = silk_ADD32_ovflw( LPC_exc_Q14, LPC_pred_Q14 );
    691 
    692                /* Update states */
    693                psSS[ 1 ].Diff_Q14     = silk_SUB32_ovflw( xq_Q14, silk_LSHIFT32( x_Q10[ i ], 4 ) );
    694                sLF_AR_shp_Q14         = silk_SUB32_ovflw( psSS[ 1 ].Diff_Q14, n_AR_Q14 );
    695                psSS[ 1 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 );
    696                psSS[ 1 ].LF_AR_Q14    = sLF_AR_shp_Q14;
    697                psSS[ 1 ].LPC_exc_Q14  = LPC_exc_Q14;
    698                psSS[ 1 ].xq_Q14       = xq_Q14;
    699            }
    700        }
    701        *smpl_buf_idx  = ( *smpl_buf_idx - 1 ) % DECISION_DELAY;
    702        if( *smpl_buf_idx < 0 ) *smpl_buf_idx += DECISION_DELAY;
    703        last_smple_idx = ( *smpl_buf_idx + decisionDelay ) % DECISION_DELAY;
    704 
    705        /* Find winner */
    706        RDmin_Q10 = psSampleState[ 0 ][ 0 ].RD_Q10;
    707        Winner_ind = 0;
    708        for( k = 1; k < nStatesDelayedDecision; k++ ) {
    709            if( psSampleState[ k ][ 0 ].RD_Q10 < RDmin_Q10 ) {
    710                RDmin_Q10  = psSampleState[ k ][ 0 ].RD_Q10;
    711                Winner_ind = k;
    712            }
    713        }
    714 
    715        /* Increase RD values of expired states */
    716        Winner_rand_state = psDelDec[ Winner_ind ].RandState[ last_smple_idx ];
    717        for( k = 0; k < nStatesDelayedDecision; k++ ) {
    718            if( psDelDec[ k ].RandState[ last_smple_idx ] != Winner_rand_state ) {
    719                psSampleState[ k ][ 0 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 0 ].RD_Q10, silk_int32_MAX >> 4 );
    720                psSampleState[ k ][ 1 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 1 ].RD_Q10, silk_int32_MAX >> 4 );
    721                silk_assert( psSampleState[ k ][ 0 ].RD_Q10 >= 0 );
    722            }
    723        }
    724 
    725        /* Find worst in first set and best in second set */
    726        RDmax_Q10  = psSampleState[ 0 ][ 0 ].RD_Q10;
    727        RDmin_Q10  = psSampleState[ 0 ][ 1 ].RD_Q10;
    728        RDmax_ind = 0;
    729        RDmin_ind = 0;
    730        for( k = 1; k < nStatesDelayedDecision; k++ ) {
    731            /* find worst in first set */
    732            if( psSampleState[ k ][ 0 ].RD_Q10 > RDmax_Q10 ) {
    733                RDmax_Q10  = psSampleState[ k ][ 0 ].RD_Q10;
    734                RDmax_ind = k;
    735            }
    736            /* find best in second set */
    737            if( psSampleState[ k ][ 1 ].RD_Q10 < RDmin_Q10 ) {
    738                RDmin_Q10  = psSampleState[ k ][ 1 ].RD_Q10;
    739                RDmin_ind = k;
    740            }
    741        }
    742 
    743        /* Replace a state if best from second set outperforms worst in first set */
    744        if( RDmin_Q10 < RDmax_Q10 ) {
    745            silk_memcpy( ( (opus_int32 *)&psDelDec[ RDmax_ind ] ) + i,
    746                         ( (opus_int32 *)&psDelDec[ RDmin_ind ] ) + i, sizeof( NSQ_del_dec_struct ) - i * sizeof( opus_int32) );
    747            silk_memcpy( &psSampleState[ RDmax_ind ][ 0 ], &psSampleState[ RDmin_ind ][ 1 ], sizeof( NSQ_sample_struct ) );
    748        }
    749 
    750        /* Write samples from winner to output and long-term filter states */
    751        psDD = &psDelDec[ Winner_ind ];
    752        if( subfr > 0 || i >= decisionDelay ) {
    753            pulses[  i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 );
    754            xq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
    755                silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], delayedGain_Q10[ last_smple_idx ] ), 8 ) );
    756            NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay ] = psDD->Shape_Q14[ last_smple_idx ];
    757            sLTP_Q15[          NSQ->sLTP_buf_idx     - decisionDelay ] = psDD->Pred_Q15[  last_smple_idx ];
    758        }
    759        NSQ->sLTP_shp_buf_idx++;
    760        NSQ->sLTP_buf_idx++;
    761 
    762        /* Update states */
    763        for( k = 0; k < nStatesDelayedDecision; k++ ) {
    764            psDD                                     = &psDelDec[ k ];
    765            psSS                                     = &psSampleState[ k ][ 0 ];
    766            psDD->LF_AR_Q14                          = psSS->LF_AR_Q14;
    767            psDD->Diff_Q14                           = psSS->Diff_Q14;
    768            psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ] = psSS->xq_Q14;
    769            psDD->Xq_Q14[    *smpl_buf_idx ]         = psSS->xq_Q14;
    770            psDD->Q_Q10[     *smpl_buf_idx ]         = psSS->Q_Q10;
    771            psDD->Pred_Q15[  *smpl_buf_idx ]         = silk_LSHIFT32( psSS->LPC_exc_Q14, 1 );
    772            psDD->Shape_Q14[ *smpl_buf_idx ]         = psSS->sLTP_shp_Q14;
    773            psDD->Seed                               = silk_ADD32_ovflw( psDD->Seed, silk_RSHIFT_ROUND( psSS->Q_Q10, 10 ) );
    774            psDD->RandState[ *smpl_buf_idx ]         = psDD->Seed;
    775            psDD->RD_Q10                             = psSS->RD_Q10;
    776        }
    777        delayedGain_Q10[     *smpl_buf_idx ]         = Gain_Q10;
    778    }
    779    /* Update LPC states */
    780    for( k = 0; k < nStatesDelayedDecision; k++ ) {
    781        psDD = &psDelDec[ k ];
    782        silk_memcpy( psDD->sLPC_Q14, &psDD->sLPC_Q14[ length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
    783    }
    784    RESTORE_STACK;
    785 }
    786 
    787 static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
    788    const silk_encoder_state *psEncC,               /* I    Encoder State                       */
    789    silk_nsq_state      *NSQ,                       /* I/O  NSQ state                           */
    790    NSQ_del_dec_struct  psDelDec[],                 /* I/O  Delayed decision states             */
    791    const opus_int16    x16[],                      /* I    Input                               */
    792    opus_int32          x_sc_Q10[],                 /* O    Input scaled with 1/Gain in Q10     */
    793    const opus_int16    sLTP[],                     /* I    Re-whitened LTP state in Q0         */
    794    opus_int32          sLTP_Q15[],                 /* O    LTP state matching scaled input     */
    795    opus_int            subfr,                      /* I    Subframe number                     */
    796    opus_int            nStatesDelayedDecision,     /* I    Number of del dec states            */
    797    const opus_int      LTP_scale_Q14,              /* I    LTP state scaling                   */
    798    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ],  /* I                                        */
    799    const opus_int      pitchL[ MAX_NB_SUBFR ],     /* I    Pitch lag                           */
    800    const opus_int      signal_type,                /* I    Signal type                         */
    801    const opus_int      decisionDelay               /* I    Decision delay                      */
    802 )
    803 {
    804    opus_int            i, k, lag;
    805    opus_int32          gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
    806    NSQ_del_dec_struct  *psDD;
    807    __m128i xmm_inv_gain_Q26, xmm_x16_x2x0, xmm_x16_x3x1;
    808 
    809    lag          = pitchL[ subfr ];
    810    inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
    811    silk_assert( inv_gain_Q31 != 0 );
    812 
    813    /* Scale input */
    814    inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );
    815 
    816    /* prepare inv_gain_Q26 in packed 4 32-bits */
    817    xmm_inv_gain_Q26 = _mm_set1_epi32(inv_gain_Q26);
    818 
    819    for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
    820        xmm_x16_x2x0 = OP_CVTEPI16_EPI32_M64( &(x16[ i ] ) );
    821 
    822        /* equal shift right 4 bytes*/
    823        xmm_x16_x3x1 = _mm_shuffle_epi32( xmm_x16_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
    824 
    825        xmm_x16_x2x0 = _mm_mul_epi32( xmm_x16_x2x0, xmm_inv_gain_Q26 );
    826        xmm_x16_x3x1 = _mm_mul_epi32( xmm_x16_x3x1, xmm_inv_gain_Q26 );
    827 
    828        xmm_x16_x2x0 = _mm_srli_epi64( xmm_x16_x2x0, 16 );
    829        xmm_x16_x3x1 = _mm_slli_epi64( xmm_x16_x3x1, 16 );
    830 
    831        xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC );
    832 
    833        _mm_storeu_si128( (__m128i *)(void*)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 );
    834    }
    835 
    836    for( ; i < psEncC->subfr_length; i++ ) {
    837        x_sc_Q10[ i ] = silk_SMULWW( x16[ i ], inv_gain_Q26 );
    838    }
    839 
    840    /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
    841    if( NSQ->rewhite_flag ) {
    842        if( subfr == 0 ) {
    843            /* Do LTP downscaling */
    844            inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 );
    845        }
    846        for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) {
    847            silk_assert( i < MAX_FRAME_LENGTH );
    848            sLTP_Q15[ i ] = silk_SMULWB( inv_gain_Q31, sLTP[ i ] );
    849        }
    850    }
    851 
    852    /* Adjust for changing gain */
    853    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
    854        gain_adj_Q16 =  silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
    855 
    856        /* Scale long-term shaping state */
    857        {
    858            __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
    859 
    860            /* prepare gain_adj_Q16 in packed 4 32-bits */
    861            xmm_gain_adj_Q16 = _mm_set1_epi32( gain_adj_Q16 );
    862 
    863            for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 )
    864            {
    865                xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ i ] ) ) );
    866                /* equal shift right 4 bytes*/
    867                xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
    868 
    869                xmm_sLTP_shp_Q14_x2x0 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x2x0, xmm_gain_adj_Q16 );
    870                xmm_sLTP_shp_Q14_x3x1 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x3x1, xmm_gain_adj_Q16 );
    871 
    872                xmm_sLTP_shp_Q14_x2x0 = _mm_srli_epi64( xmm_sLTP_shp_Q14_x2x0, 16 );
    873                xmm_sLTP_shp_Q14_x3x1 = _mm_slli_epi64( xmm_sLTP_shp_Q14_x3x1, 16 );
    874 
    875                xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC );
    876 
    877                _mm_storeu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 );
    878            }
    879 
    880            for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
    881                NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
    882            }
    883 
    884            /* Scale long-term prediction state */
    885            if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
    886                for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) {
    887                    sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
    888                }
    889            }
    890 
    891            for( k = 0; k < nStatesDelayedDecision; k++ ) {
    892                psDD = &psDelDec[ k ];
    893 
    894                /* Scale scalar states */
    895                psDD->LF_AR_Q14 = silk_SMULWW( gain_adj_Q16, psDD->LF_AR_Q14 );
    896                psDD->Diff_Q14 = silk_SMULWW( gain_adj_Q16, psDD->Diff_Q14 );
    897 
    898                /* Scale short-term prediction and shaping states */
    899                for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
    900                    psDD->sLPC_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->sLPC_Q14[ i ] );
    901                }
    902                for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
    903                    psDD->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->sAR2_Q14[ i ] );
    904                }
    905                for( i = 0; i < DECISION_DELAY; i++ ) {
    906                    psDD->Pred_Q15[  i ] = silk_SMULWW( gain_adj_Q16, psDD->Pred_Q15[  i ] );
    907                    psDD->Shape_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->Shape_Q14[ i ] );
    908                }
    909            }
    910        }
    911 
    912        /* Save inverse gain */
    913        NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
    914    }
    915 }